os_setup.in 8.57 KB
Newer Older
1
2
#!/usr/bin/perl -wT
use English;
3
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
4
require 'ctime.pl';
5

6
#
7
8
9
10
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
11
#
12
# TODO: Reload disk images.
13
# 
14
# usage: os_setup <pid> <eid>
15
#
16
17
18
19
20
21
#
# Print the command line synopsis on STDOUT and terminate the script
# with exit(-1). Never returns to the caller.
#
sub usage()
{
    my $synopsis = "Usage: os_setup <pid> <eid>\n";

    print STDOUT $synopsis;
    exit(-1);
}
my  $optlist = "";
22
23
24
25
26
27

#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
28
my $TBOPS       = "@TBOPSEMAIL@";
29
my $TFTP	= "/tftpboot";
30

31
32
33
34
35
36
37
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

38
my $nodereboot	= "$TB/bin/node_reboot";
39
my $ping	= "/sbin/ping";
40
my $dbg		= 0;
41
my @nodes       = ();
42
my %osid        = ();
43
my %waitfor     = ();
44
my %canfail     = ();
45
my $db_result;
46
my @row;
47

48
49
#
# This stuff is BOGUS! Quick hack for paper deadline to make Jay happy.
50
# If Frisbee works, this might be appropriate.
51
#
52
my $doreloading = 0;
53
my $forcereload = 0;
54
my %reload      = ();
55
my $osload	= "$TB/bin/os_load";
56
    
57
58
59
60
61
62
#
# Untaint the environment. The script runs with -T, so PATH is pinned to
# a fixed list of directories, and the variables that can alter how a
# shell behaves are removed before any external command is started.
#
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; # Autoflush: flush the selected output handle after every write.

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
usage()
    unless (getopts($optlist, \%options));
usage()
    unless (@ARGV == 2);
my ($pid, $eid) = @ARGV;

#
# Untaint args. Each value must consist solely of word characters, '-'
# and '@'; the captured text replaces the tainted original. Anything
# else is fatal.
#
die("Bad data in pid: $pid.")
    unless ($pid =~ /^([-\@\w]+)$/);
$pid = $1;

die("Bad data in eid: $eid.")
    unless ($eid =~ /^([-\@\w]+)$/);
$eid = $1;

93
#
94
# Figure out who called us. Only root, people with admin status
95
96
# in the DB, or the owner of the experiment can run this script.
#
97
#
# Root (UID 0) and callers for whom TBAdmin() is true pass straight
# through. Anyone else is mapped to a login name, which must equal the
# value ExpLeader() returns for this pid/eid.
#
unless ($UID == 0 || TBAdmin($UID)) {
    my ($login) = getpwuid($UID);

    die "$UID not in passwd file"
	unless (defined($login));

    die("os_setup: You must be root or a TB administrator\n")
	if ($login ne ExpLeader($pid, $eid));
}

#
109
# Get the set of nodes, as well as the nodes table information for them.
110
#
111
112
113
114
115
# Build the statement first so the call itself stays on one line.
my $nodes_query =
    "select * from nodes left join reserved on ".
    "nodes.node_id=reserved.node_id ".
    "where reserved.pid='$pid' and reserved.eid='$eid'";

$db_result = DBQueryFatal($nodes_query);

# An experiment without any reserved nodes is a fatal error.
die("There are no nodes assigned to experiment '$eid' in project '$pid'.")
    unless ($db_result->numrows >= 1);

120
121
122
for ($i = 0; $i < $db_result->numrows; $i++) {
    my %row  = $db_result->fetchhash();
    my $node = $row{'node_id'};
123
124

    push(@nodes, $node);
125
    $osid{$node} = $row{'def_boot_osid'};
126

127
    #
128
129
130
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly. 
131
132
133
134
    #
    # Both boot path columns get exactly the same treatment.
    foreach my $column ('def_boot_path', 'next_boot_path') {
	my $path = $row{$column};

	# Nothing to check when the column is unset or empty.
	next
	    if (!defined($path) || $path eq "");

	# Split out IP address if it exists; only the path part is checked.
	if ($path =~ /^[0-9\.]+:(\/.*)$/) {
	    $path = $1;
	}

	#
	# Path must begin with $TFTP. $TFTP carries its own leading slash,
	# so it is matched literally (\Q..\E) right at the anchor. The
	# earlier pattern put one more slash in front of it and therefore
	# only ever matched "//tftpboot/..."; that doubled form is still
	# accepted via the optional leading slash.
	#
	if ($path !~ /^\/?\Q$TFTP\E\//) {
	    die("File $path for node $node must reside in $TFTP\n");
	}

	if (! -e $path) {
	    die("File $path for node $node does not exist!");
	}
    }

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought.
    #
    # A column can be undefined (the boot path columns are tested with
    # defined() for the same reason; presumably a NULL in the DB). Map
    # that to an empty list so split() is never handed undef under -w.
    my $deltas   = defined($row{'deltas'})   ? $row{'deltas'}   : "";
    my $rpms     = defined($row{'rpms'})     ? $row{'rpms'}     : "";
    my $tarballs = defined($row{'tarballs'}) ? $row{'tarballs'} : "";

    foreach my $delta (split(":", $deltas)) {
	if (! -e $delta) {
	    die("Delta file $delta for node $node does not exist!");
	}
    }
    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $rpms)) {
	if (! -e $rpm) {
	    die("RPM $rpm for node $node does not exist!");
	}
    }

    #
    # XXX - Ditto for tarfiles. Each entry holds two whitespace separated
    # fields; the second one is the file that must exist.
    #
    foreach my $tarspec (split(":", $tarballs)) {
	my ($dir, $tar) = split(" ", $tarspec);

	# An entry with fewer than two fields names no tarfile at all.
	if (!defined($tar)) {
	    die("Tarball spec '$tarspec' for node $node has no tarfile!");
	}
	if (! -e $tar) {
	    die("Tarfile $tar for node $node does not exist!");
	}
    }
    
208
    #
209
    # If pingable, then the node is "waitable".
210
    #
211
    # 1 when the OS reports the "ping" feature, 0 otherwise.
    $waitfor{$node} = (OSFeatureSupported($osid{$node}, "ping") ? 1 : 0);
217
218
219
220
221
222
223
224
225
226
227
228
229
230

    #
    # Set the canfail bit. Currently, sharks are always canfail=1.
    # Will come from DB at some point.
    #
    # Only nodes of type "shark" are allowed to fail.
    $canfail{$node} = (($row{'type'} eq "shark") ? 1 : 0);

    # One summary line per node when debugging is switched on.
    if ($dbg) {
	print STDOUT
	    "$node - $osid{$node} - $waitfor{$node} - $canfail{$node}\n";
    }
231
}
232

233
#
234
235
236
# Fire off a mass reboot. The reboot script does this in parallel, so
# no need to create any new children here. We just wait until it exits,
# which means all the nodes are actually rebooting.
237
#
238
239
#
# List form of system(): the node names are handed to the reboot script
# as separate arguments and never pass through a shell, so a name taken
# from the DB cannot be reinterpreted as shell syntax. A non-zero wait
# status from the script is fatal.
#
if (system($nodereboot, @nodes)) {
    die("Failed to reboot some nodes!");
}

242
243
print STDOUT "Waiting for testbed nodes to finish rebooting ...\n";

244
245
my $waitstart = time;

246
247
248
#
# Now lets wait for them to come back alive.
#
249
foreach my $node ( @nodes ) {
250
251
    my $failmesg;
    
252
253
254
255
    #
    # Don't bother to wait for nodes that are running foreign OSs since
    # we are not going to deal with them anyway later in the process.
    #
256
257
    # Nodes flagged as not waitable are recorded as UNKNOWN and skipped.
    my $skip_wait = ($waitfor{$node} == 0);

    if ($skip_wait) {
	print STDOUT "Not waiting for $node to come alive. Foreign OS.\n";
	SetNodeBootStatus($node, NODEBOOTSTATUS_UNKNOWN);
	next;
    }

    # WaitTillAlive() returns 0 once the node has come back.
    my $wait_result = WaitTillAlive($node);

    if ($wait_result == 0) {
	print STDOUT "$node is alive and well\n";
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	next;
    }

    # Still here: the node did not come back in time.
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    print STDOUT
	"$node may be down. This has been reported to testbed-ops.\n",
	"Please end this experiment, and try again.\n";
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

    #
    # A node that is not allowed to fail ends the run: it is marked
    # down, testbed-ops is mailed, and the script dies.
    #
    if (! $canfail{$node}) {
	# Reserve it to down experiment.
	MarkNodeDown($node);

	# Send mail to testbed-ops about it
	my $ops_body =
	    "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	    "Please look into this matter. $node has been reserved\n".
	    "by the Testbed until this matter has been resolved.\n\n".
	    "Thanks\n".
	    "Testbed Operations\n";

	SENDMAIL($TBOPS, "TESTBED: Node $node is down", $ops_body);

	die("*** Oops, $node did not come back alive!\n");
    }

    #
    # Nonfatal failure: mail the invoking user, copy testbed-ops, and
    # carry on with the remaining nodes.
    #
    my ($user) = getpwuid($UID);

    my $user_body =
	"Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	"Your experiment will continue to run since this failure\n".
	"is nonfatal, although you might encounter other problems\n".
	"if your experiment depends explicitly on this node.\n".
	"You should terminate this experiment if it cannot ".
	"tolerate this failure.\n\n".
	"Testbed Operations has also been notified so they can ".
	"investigate.\n\n".
	"Thanks\n".
	"Testbed Operations\n";

    SENDMAIL($user, "TESTBED: Node $node is down", $user_body,
	     0,
	     "Cc: $TBOPS");

    print STDERR "*** Oops, $node did not come back alive!\n";
306
307
308
309
310
}

print STDOUT "OS Setup Done!\n";
exit 0;

311
312
313
#
# Wait for a node to come back alive.
#
# Pings the node in short bursts until one of them succeeds or the time
# limit runs out. The limit is 4 minutes, plus 5 more when $reload{$pc}
# is set, and is measured from the file-level $waitstart rather than
# from the moment this function is entered.
#
# Argument: $pc - name of the node to ping.
# Returns:  0 when the node answered, 1 when the limit was exceeded.
# Reads the file-level $dbg, %reload, $ping and $waitstart.
#
sub WaitTillAlive {
    my ($pc) = @_;

    print STDERR "Waiting for $pc to come alive\n" if $dbg;

    #
    # Seems like a long time to wait, but it ain't!
    #
    my $maxwait = (60 * 4);
    if ($reload{$pc}) {
	$maxwait += (60 * 5);
    }

    #
    # Start a counter going, relative to the time we rebooted the first
    # node.
    #
    my $waittime  = 0;
    my $minutes   = 0;

    #
    # Sigh, a long ping results in the script waiting until all the
    # packets are sent from all the pings, before it will exit. So,
    # loop doing a bunch of shorter pings.
    #
    while (1) {
	system("$ping -q -c 4 -t 4 $pc >/dev/null 2>&1");

	#
	# Keep the whole wait status. Shifting it right by 8 keeps only
	# the exit code, so a ping that was killed by a signal (exit code
	# bits all zero) used to be taken for a successful ping.
	#
	my $status = $?;

	#
	# Ping exits 0 if any packets are returned, and 2 if pingable but
	# no packets are returned. Other non-zero exit codes indicate
	# other problems. Any non-zero wait status, including death by
	# signal or failure to run, indicates "not pingable" to us.
	#
	if ($status == 0) {
	    print STDERR "$pc alive and well\n" if $dbg;
	    return 0;
	}

	$waittime = time - $waitstart;
	if ($waittime > $maxwait) {
	    print "$pc appears dead; its been ",
	    (int ($waittime / 60))," minutes since reload started.\n";
	    return 1;
	}

	# Progress report, at most once per elapsed minute.
	if (int($waittime / 60) > $minutes) {
	    $minutes = int($waittime / 60);
	    print "Still waiting for $pc - its been $minutes minute(s)\n";
	}
    }
}