os_setup.in 7.61 KB
Newer Older
1
2
#!/usr/bin/perl -wT
use English;
3
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
4
require 'ctime.pl';

#
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
#
# TODO: Reload disk images.
#
# usage: os_setup <pid> <eid>
#
#
# Print the command line synopsis on STDOUT and terminate the script
# with exit status -1. Does not return.
#
sub usage()
{
    my $synopsis = "Usage: os_setup <pid> <eid>\n";

    print STDOUT $synopsis;
    exit(-1);
}
# Option string handed to getopts(); empty, so no switches are accepted.
my  $optlist = "";

#
# Configure variables
#
# NOTE(review): the @...@ tokens look like placeholders that are replaced
# when this .in file is configured - confirm against the build.
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
my $TBOPS       = "@TBOPSEMAIL@";

my $nodereboot	= "$TB/bin/node_reboot";	# Run once on all the nodes.
my $ping	= "/sbin/ping";			# Used by WaitTillAlive.
my $mail        = "/usr/bin/mail";		# Used to report dead nodes.
my $dbg		= 0;				# Non-zero enables debug output.
my @nodes       = ();				# Node ids in the experiment.
my %osid        = ();				# node id -> def_boot_osid.
my %waitfor     = ();				# node id -> 1 if we wait for it.
my @row;

#
# This stuff is BOGUS! Quick hack for paper deadline to make Jay happy.
# If Frisbee works, this might be appropriate.
#
# Nothing in this file sets these; %reload is only read by WaitTillAlive
# to lengthen its timeout.
#
my $doreloading = 0;
my $forcereload = 0;
my %reload      = ();
my $osload	= "$TB/bin/os_load";

# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args. Both values are interpolated into SQL statements and mail
# text later on, so only word characters, '-' and '@' are accepted. The
# patterns are anchored with \A and \z so that a trailing newline is
# rejected too.
#
if ($pid =~ /\A([-\@\w]+)\z/) {
    $pid = $1;
}
else {
    die("Bad data in pid: $pid.");
}
if ($eid =~ /\A([-\@\w]+)\z/) {
    $eid = $1;
}
else {
    die("Bad data in eid: $eid.");
}

#
# Set up for querying the database. Give up right away if the connection
# cannot be made, since every later step calls methods on $DB.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
if (! $DB) {
    die("os_setup: Could not connect to database '$DBNAME'.\n");
}

#
# Figure out who called us. Only root, people with admin status
# in the DB, or the owner of the experiment can run this script.
#
$db_result = $DB->query("select expt_head_uid from experiments ".
			"where eid='$eid' and pid='$pid'");
if (! $db_result) {
    die("os_setup: DB error looking up experiment '$eid' ".
	"in project '$pid': " . $DB->errmsg . "\n");
}
if ($db_result->numrows < 1) {
  die("There is no experiment '$eid' in project '$pid'.\n");
}

if ($UID != 0) {
    my ($me) = getpwuid($UID)
	or die "$UID not in passwd file";

    my ($owner) = $db_result->fetchrow_array();

    # A NULL owner can never match the caller.
    if (!defined($owner) || $owner ne "$me") {
	print STDERR "Checking for admin status ...\n" if $dbg;

	$db_result = $DB->query("select admin from users where uid='$me'");
	if (! $db_result) {
	    die("os_setup: DB error looking up user '$me': " .
		$DB->errmsg . "\n");
	}

	# No row at all (unknown user) is treated the same as not admin.
	my ($is_admin) = $db_result->fetchrow_array();
	if (!defined($is_admin) || $is_admin != 1) {
	    die("os_setup: You must be root or a TB administrator\n");
	}
    }
}

#
# Get the set of nodes, as well as the nodes table information for them.
#
$db_result = $DB->query("select * from nodes left join reserved on ".
			"nodes.node_id=reserved.node_id ".
			"where reserved.pid='$pid' and reserved.eid='$eid'");
if (! $db_result) {
    die("os_setup: DB error getting nodes for experiment '$eid' ".
	"in project '$pid': " . $DB->errmsg . "\n");
}
if ($db_result->numrows < 1) {
    die("There are no nodes assigned to experiment '$eid' in project '$pid'.");
}

for (my $i = 0; $i < $db_result->numrows; $i++) {
    my %row  = $db_result->fetchhash();
    my $node = $row{'node_id'};

    push(@nodes, $node);
    $osid{$node} = $row{'def_boot_osid'};

    #
    # Make sure the files specified in the paths exist. Unset (NULL) and
    # empty paths are skipped.
    #
    foreach my $field ('def_boot_path', 'next_boot_path') {
	my $path = $row{$field};

	next
	    if (!defined($path) || $path eq "");

	if (! -e $path) {
	    die("File $path for node $node does not exist!");
	}
    }

    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought.
    #
    # A NULL column is treated as an empty list.
    #
    my $deltas = (defined($row{'deltas'}) ? $row{'deltas'} : "");
    foreach my $delta (split(":", $deltas)) {
	if (! -e $delta) {
	    die("Delta file $delta for node $node does not exist!");
	}
    }
    #
    # XXX - Ditto for RPMs.
    #
    my $rpms = (defined($row{'rpms'}) ? $row{'rpms'} : "");
    foreach my $rpm (split(":", $rpms)) {
	if (! -e $rpm) {
	    die("RPM $rpm for node $node does not exist!");
	}
    }

    #
    # If pingable, then the node is "waitable".
    #
    if (OSFeatureSupported($osid{$node}, "ping")) {
	$waitfor{$node} = 1;
    }
    else {
	$waitfor{$node} = 0;
    }
    print STDOUT "$node - $osid{$node} - $waitfor{$node}\n" if $dbg;
}

182
#
# Fire off a mass reboot. The reboot script does this in parallel, so
# no need to create any new children here. We just wait until it exits,
# which means all the nodes are actually rebooting.
#
# The list form of system() hands each node id to node_reboot as its own
# argument without going through a shell.
#
if (system($nodereboot, @nodes)) {
    die("Failed to reboot some nodes!");
}

print STDOUT "Waiting for testbed nodes to finish rebooting ...\n";

# Start of the wait; WaitTillAlive measures its timeout from this.
my $waitstart = time;

#
# Now lets wait for them to come back alive. The first node that fails
# to answer is moved to testbed/down, reported by mail, and ends the script.
#
foreach my $node ( @nodes ) {
    #
    # Don't bother to wait for nodes that are running foreign OSs since
    # we are not going to deal with them anyway later in the process.
    #
    if ($waitfor{$node} == 0) {
	print STDOUT "Not waiting for $node to come alive. Foreign OS.\n";
	next;
    }

    if (WaitTillAlive($node) == 0) {
	print STDOUT "$node is alive and well\n";
	next;
    }

    print STDOUT "$node may be down. This has been reported to testbed-ops.\n";
    print STDOUT "Please end this experiment, and try again.\n";

    # Reserve it to testbed down
    my $cmd = "update reserved set pid='testbed',eid='down' ".
      "where eid='$eid' and pid='$pid' and node_id='$node'";
    print "Using '$cmd'\n" if $dbg;

    #
    # Test the query result before calling a method on it; a failure here
    # is only a warning so that the mail below still goes out.
    #
    # NOTE(review): uses numrows like the rest of this file; confirm that
    # it reports the affected row count for an UPDATE.
    #
    $db_result = $DB->query($cmd);
    if (! $db_result) {
	print STDERR "WARNING: Couldn't change reservation:".
	    $DB->errmsg."\n";
    }
    elsif ($db_result->numrows < 1) {
	print STDERR "WARNING: Couldn't change reservation!\n";
    }

    # Send mail to testbed-ops about it
    if (open(MAIL,"| $mail -s \"TESTBED: $node down?\" $TBOPS")) {
	print MAIL "User ".getpwuid($UID)." was running expt. $eid\n";
	print MAIL "in proj. $pid using ir file /proj/$pid/exp/$eid/tbdata\n";
	print MAIL "but $node appears to be unresponsive.\n";
	print MAIL "\nPlease look into this matter. $node has been reserved to\n";
	print MAIL "the testbed/down experiment until this has been resolved.\n\n";
	print MAIL "Thanks,\nTestbed Operations\ntestbed-ops\@flux.cs.utah.edu\n";
	close(MAIL);
    }
    else {
	print STDERR "WARNING: Couldn't send mail to $TBOPS: $!\n";
    }

    die("Oops, $node did not come back alive!\n");
}

print STDOUT "OS Setup Done!\n";
exit 0;

#
# Ping a node until it answers or we run out of patience.
#
# usage: WaitTillAlive(node)
#        returns 0 as soon as a ping reply from the node is seen.
#        returns 1 if no reply was seen before giving up.
#
# Elapsed time is measured from the file level $waitstart, not from the
# time of the call. The limit is 150 seconds, plus 350 more when
# $reload{node} is set, and at most 200 rounds of pings are made.
#
sub WaitTillAlive {
    my ($pc) = @_;

    my $maxwait = 150;
    if ($reload{$pc}) {
	$maxwait += 350;
    }

    print STDERR "Waiting for $pc to come alive\n" if $dbg;
    #
    # Sigh, a long ping results in the script waiting until all the
    # packets are sent from all the pings, before it will exit. So,
    # loop doing a bunch of shorter pings.
    #
    my $lasttime = ( (time - $waitstart) > 60 ? 61 : (time - $waitstart));
    for (my $i = 0; $i < 200; $i++) {
	if (open(PING, "$ping -c 3 -t 4 $pc 2>&1 |")) {
	    #
	    # Read until a reply or the summary line shows up. Stop at end
	    # of file too; ping might exit without printing a summary, and
	    # we must not spin forever on an exhausted handle.
	    #
	    while (defined(my $line = <PING>)) {
		if ($line =~ /bytes from/) {
		    print STDERR "Yep, $pc alive and well\n" if $dbg;
		    close(PING);
		    return 0;
		}
		last
		    if ($line =~ /transmitted, (\d*) packets received/);
	    }
	    close(PING);
	}
	else {
	    # Could not start ping. Count it as a round with no reply, but
	    # pause so that the remaining rounds are not burned instantly.
	    print STDERR "Could not run $ping: $!\n";
	    sleep(1);
	}

	my $curtime = time - $waitstart;
	print "Waited ",$curtime," seconds...\n" if $dbg;

	# The seconds-within-the-minute value wrapped, so another minute
	# boundary was crossed since the last round; tell the user.
	if ( $curtime % 60 < $lasttime % 60 ) {
	  print STDERR "Still waiting for $pc - its been ",
	  (int ($curtime/60))," min.\n";
	}
	$lasttime = $curtime;

	# Always make a few attempts, even if the limit has already passed.
	if ($i > 3 && $curtime > $maxwait) { last; }
    }
    print STDERR "$pc is not responding. Better check into it.\n" if $dbg;
    return 1;
}

#
# Ask the DB whether an OS supports a feature.
#
# usage: OSFeatureSupported(osid, feature)
#        returns 1 if feature is one of the comma separated entries in the
#        osfeatures column of the os_info row for osid.
#        returns 0 otherwise, including when osid is undefined, the query
#        fails, there is no such row, or the column is NULL.
#
sub OSFeatureSupported {
    my ($os, $feature) = @_;

    if (!defined($os) || !defined($feature)) {
	return 0;
    }

    # Lexical handle, so the caller's file level $db_result is untouched.
    my $query_result =
	$DB->query("select osfeatures from os_info where osid='$os'");

    if (! $query_result || $query_result->numrows < 1) {
	return 0;
    }

    my ($osfeatures) = $query_result->fetchrow_array();
    if (!defined($osfeatures)) {
	return 0;
    }

    foreach my $osfeature (split(',', $osfeatures)) {
	if ($feature eq $osfeature) {
	    return 1;
	}
    }
    return 0;
}