os_setup.in 18.7 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6
7
8
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

13
#
14
15
16
17
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
18
#
19
# TODO: Reload disk images.
20
# 
21
# usage: os_setup [-d] <pid> <eid>
22
#
Chad Barb's avatar
   
Chad Barb committed
23
24
25
26
27
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

28
29
#
# Print the command line synopsis on STDERR and exit with -1.
# Does not return.
#
sub usage()
{
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
# Option string handed to getopts(); -d is the only recognized switch.
my  $optlist = "d";
34

Chad Barb's avatar
   
Chad Barb committed
35
36
37
38
39
40
41
42
43
44
45
#
# Fatal exit for errors that another attempt will not cure. Writes the
# message (plus a newline) to STDERR and exits with -1 so that the caller
# (tbswap) can tell this apart from a retryable failure. Never returns.
#
sub die_noretry($)
{
    my ($text) = @_;

    print STDERR $text . "\n";
    exit(-1);
}

46
47
48
49
50
#
# Configure variables. The @...@ tokens are filled in when this template
# is configured.
#
my $TB          = "@prefix@";
my $DBNAME      = "@TBDBNAME@";
my $TBOPS       = "@TBOPSEMAIL@";
my $TESTMODE    = @TESTMODE@;
my $TFTP        = "/tftpboot";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# External programs this script runs.
my $nodereboot  = "$TB/bin/node_reboot";
my $os_load     = "$TB/bin/os_load";
my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";

my $dbg         = 0;     # Set by the -d option.
my $failed      = 0;     # Count of failures; decides the exit code.
my %nodes       = ();    # Non-virtual nodes in the experiment.
my %vnodes      = ();    # Virtual nodes; value is the "jailed" flag.
my %osids       = ();    # node => OSID it is expected to boot.
my %canfail     = ();    # node => 1 if its failure is not fatal.
my $db_result;
my @row;

#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my %reloads     = ();
# Nodes that only need a reboot (no reload), keyed by node id.
my %reboots     = ();
# NOTE(review): $doautoload is not consulted anywhere in this file.
my $doautoload  = 1;
# When set, a node whose last reservation belonged to a different project
# is reloaded even if the wanted OSID is already on its disk.
my $dolastload  = 1;
84
    
85
86
87
88
89
90
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; # Flush output after every write instead of buffering it.

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
if (defined($options{"d"})) {
    $dbg = 1;
}

my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args. Only word characters, "-" and "@" are accepted; the
# value is reassigned from the capture so that taint mode is satisfied.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die_noretry("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die_noretry("Bad data in eid: $eid.");
}

125
#
# Figure out who called us. Only root, people with admin status
# in the DB, or the owner of the experiment can run this script.
#
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
    die_noretry("*** $0:\n".
                "    You do not have permission to swap this experiment!");
}

TBDebugTimeStamp("os_setup started");

#
# Get the set of nodes, as well as the nodes table information for them.
# The extra "pid" column comes from last_reservation.
#
$db_result =
    DBQueryFatal("select n.*,l.pid from reserved as r ".
                 "left join nodes as n on n.node_id=r.node_id ".
                 "left join last_reservation as l on n.node_id=l.node_id ".
                 "where r.pid='$pid' and r.eid='$eid'");

# Nothing reserved means nothing to set up; that is not an error.
if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit 0;
}

151
#
# Walk every node reserved to the experiment. For each one: classify it
# (virtual/jailed/physical), verify that the files it refers to exist,
# and decide whether it needs a reboot, a disk reload, or just a boot
# OS selection. Fills in %nodes, %vnodes, %osids, %reboots, %canfail
# and (through SetupReload) %reloads.
#
while (my %row = $db_result->fetchhash()) {
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
    my $type     = $row{'type'};
    my $bootpath = 0;
    my $jailnode = 0;
    # Scoped per node so a value from an earlier iteration cannot leak in.
    # Presumably filled in by TBIsNodeVirtual() -- confirm against libdb.
    my $jailed;

    #
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
    #
    if (TBIsNodeVirtual($node, \$jailed)) {
        $vnodes{$node} = $jailed;
        if (! $jailed) {
            next;
        }
        $jailnode = 1;
    }
    else {
        my $nodeAllocState;
        TBGetNodeAllocState( $node, \$nodeAllocState );
        $nodes{$node}  = $node;
        $nodeAllocStates{$node} = $nodeAllocState;
        # only reboot node if assign_wrapper just pulled it into expt.
        # (e.g. it isnt ALLOCSTATE_RES_READY)
        #if (($nodeAllocState eq TBDB_ALLOCSTATE_RES_INIT_DIRTY()) ||
        #    ($nodeAllocState eq TBDB_ALLOCSTATE_RES_INIT_CLEAN())) {
        if ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
            $reboots{$node} = 1;
        }
    }
    $osids{$node} = $osid;

    #
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly. Both columns get the same checks;
    # only def_boot_path counts as "the node has a boot path".
    #
    foreach my $field ('def_boot_path', 'next_boot_path') {
        my $path = $row{$field};

        next
            if (!defined($path) || $path eq "");

        # Split out IP address if it exists.
        if ($path =~ /^([0-9\.]+):(\/.*)$/) {
            $path = $2;
        }

        # Path must begin with $TFTP. $TFTP already starts with a slash,
        # so it is matched literally with no extra leading "/".
        if ($path !~ /^\Q$TFTP\E\//) {
            die_noretry(
                "*** File $path for node $node must reside in $TFTP");
        }

        if (! -f $path) {
            die_noretry("*** File $path for node $node does not exist!");
        }
        $bootpath = 1
            if ($field eq 'def_boot_path');
    }

    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought. An unset column is treated as an empty list.
    #
    foreach my $delta (split(":", defined($row{'deltas'})
                                  ? $row{'deltas'} : "")) {
        if (! -f $delta) {
            die_noretry("*** Delta file $delta for node $node does not exist!");
        }
    }
    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", defined($row{'rpms'})
                                ? $row{'rpms'} : "")) {
        if (! -f $rpm) {
            die_noretry("*** RPM $rpm for node $node does not exist!");
        }
    }

    #
    # XXX - Ditto for tarfiles. Each entry is "<dir> <tarfile>".
    #
    foreach my $tarspec (split(":", defined($row{'tarballs'})
                                    ? $row{'tarballs'} : "")) {
        my ($dir, $tar) = split(" ", $tarspec);

        if (! -f $tar) {
            die_noretry("*** Tarfile $tar for node $node does not exist!");
        }
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && !$jailnode) {
        #
        # These checks are not necessary if the front end and web page
        # are doing the right thing, but lets be careful anyway.
        #
        if (! $osid) {
            die_noretry(
                "*** $node has no bootpath and no def_boot_osid set!");
        }

        #
        # Grab the info for this OSID. This is part of the image check.
        #
        my $osid_result =
            DBQueryFatal("select * from os_info where osid='$osid'");

        if ($osid_result->numrows == 0) {
            die_noretry("*** No such OSID $osid is defined!");
        }

        my %osid_row   = $osid_result->fetchhash();

        #
        # If there is an actual path, its an OSKit kernel not an image.
        #
        if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
            #
            # Not an OSKit kernel.
            # Make sure this OSID is actually loaded on the machine.
            #
            my $p_result =
                DBQueryFatal("select * from partitions ".
                             "where node_id='$node' and osid='$osid'");

            #
            # If not loaded, then see if the user was looking for the generic
            # name of the OS that is loaded.
            #
            if ($p_result->numrows == 0) {
                #
                # Check to see if a non specific version specified.
                #
                if (! defined($osid_row{'version'}) ||
                    $osid_row{'version'} eq "") {

                    #
                    # A non-specific version. There needs to be a way to
                    # map it to another osid.
                    #
                    if (!defined($osid_row{'nextosid'})) {
                        die_noretry(
                            "*** $0:\n".
                            "    No mapping can be made for $osid ($node)!");
                    }
                    my $nextosid = $osid_row{'nextosid'};

                    #
                    # See if the nextosid is already on the disk. If not,
                    # it needs to be loaded.
                    #
                    my $o_result =
                        DBQueryFatal("select osid from partitions as p ".
                                     "where p.node_id='$node' and ".
                                     "      p.osid='$nextosid'");

                    if (! $o_result->numrows) {
                        #
                        # The OSID the generic name maps to is not on the
                        # disk either, so it has to be loaded.
                        #
                        print "Mapping $osid on $node to $nextosid ".
                            "and setting up a reload.\n";

                        SetupReload($node, $nextosid, $type);
                        $osids{$node} = $nextosid;
                    }
                    else {
                        #
                        # Already loaded. Reload anyway if the node was
                        # last used by a different project; otherwise
                        # just point the node at the mapped OSID.
                        #
                        print "Mapping $osid on $node to $nextosid.\n";

                        if ($dolastload &&
                            defined($row{'pid'}) && $row{'pid'} ne $pid) {
                            SetupReload($node, $nextosid, $type);
                        }
                        else {
                            system("$osselect $nextosid $node") and
                                die_noretry("*** Could not set boot OS to ".
                                    "$nextosid for $node");
                        }
                        $osids{$node} = $nextosid;
                    }
                }
                else {
                    #
                    # User wants a specific version of an OS, but its not
                    # loaded on the machine.
                    #
                    SetupReload($node, $osid, $type);
                }
            }
            else {
                #
                # OSID is loaded, but might need to be cleaned.
                #
                if ($dolastload &&
                    defined($row{'pid'}) && $row{'pid'} ne $pid) {
                    SetupReload($node, $osid, $type);
                }
            }
        }
    }

    #
    # Set the canfail bit. Currently, sharks are always canfail=1.
    # Will come from DB at some point.
    #
    if ($row{'type'} eq "dnard") {
        $canfail{$node} = 1;
    }
    else {
        $canfail{$node} = 0;
    }

    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
        if $dbg;
}
402

403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
#
# Now do osid setup for jailed nodes. We waited until the physnodes were
# done above so that we can set the osid for the vnodes to the same
# as the physnode. We rely on the fact that the user is not allowed to
# set the OS for jailed nodes or for the physnodes that are hosting
# jailed nodes, and the node_types table has the right stuff. Non-jailed
# nodes do not need to be done.
#
foreach my $vnode (keys(%vnodes)) {
    my $pnode;

    # Only jailed vnodes get an OSID.
    next
        if (! $vnodes{$vnode});

    if (! TBPhysNodeID($vnode, \$pnode)) {
        die_noretry("*** $0:\n".
            "    Cannot determine phys_nodeid for $vnode!");
    }

    #
    # Typical on remote nodes; we do not allocate the underlying
    # phys node to the experiment.
    #
    next
        if (!defined($nodes{$pnode}));

    # Boot the vnode with whatever its physical host was given above.
    my $n_osid = $osids{$pnode};

    system("$osselect $n_osid $vnode") and
        die_noretry("*** Could not set boot OS to $n_osid for $vnode");

    $osids{$vnode} = $n_osid;
}

442
#
# We need to issue the reboots and the reloads in parallel. Each command
# runs in its own child (see ForkCmd); all children are reaped before
# moving on, and every nonzero wait status counts as one failure.
# Nothing is launched in test mode.
#
TBDebugTimeStamp("rebooting/reloading started");
if (!$TESTMODE) {
    my %pids  = ();      # command line => pid of the child running it

    if (keys(%reboots)) {
        # Record the transition first: INIT_CLEAN nodes become
        # REBOOT_CLEAN, everything else becomes REBOOT_DIRTY.
        foreach my $node (keys(%reboots)) {
            if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
                TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
                $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
            }
            else {
                TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
                $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
            }
        }

        # A single node_reboot invocation handles all plain reboots.
        my $cmd = "$nodereboot " . join(" ", keys(%reboots));
        $pids{$cmd} = ForkCmd($cmd);
    }

    # One os_load invocation per image, covering every node that needs it.
    foreach my $imageid ( keys(%reloads) ) {
        my @list = @{ $reloads{$imageid} };

        foreach my $node (@list) {
            TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
            $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
        }

        sleep(5);
        my $cmd = "$os_load -m $imageid @list";
        $pids{$cmd} = ForkCmd($cmd);
    }

    foreach my $cmd ( keys(%pids) ) {
        # Named $child so it does not shadow the project id in $pid.
        my $child = $pids{$cmd};

        waitpid($child, 0);
        if ($?) {
            $failed++;
            print "*** Failed: $cmd\n";
        }
    }
}
TBDebugTimeStamp("rebooting/reloading finished");
sleep(2);
491

492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
foreach my $imageid ( keys(%reloads) ) {
    my @list = @{ $reloads{$imageid} };

    foreach my $node ( @list ) {
        my $mode;

        if (!TBGetNodeOpMode($node, \$mode)) {
            print "*** Error getting operational mode for $node!\n";
            $failed++;
            delete($nodes{$node});
            # $mode is not usable here; do not test it (or count the
            # node a second time) below.
            next;
        }
        # Still in reload mode after os_load returned: the reload failed,
        # so drop the node from the set we wait on.
        if ($mode eq TBDB_NODEOPMODE_RELOAD) {
            print "*** Not waiting for $node since its reload failed!\n";
            $failed++;
            delete($nodes{$node});
        }
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

524
#
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node at least one second chance. Avoids
# pointless experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

my %retries;      # node => reboot attempts still allowed
my %waitstart;    # node => time its current wait began
foreach my $node ( @nodelist ) {
    $retries{$node} = 1;
    $waitstart{$node} = time;
}

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};

    # A false return is treated as "node reached ISUP" within 7 minutes
    # of $wstart.
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*7))) {
        print "$node is alive and well\n";
        SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
        TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
        $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
        next;
    }

    if ($retries{$node}) {
        $retries{$node} -= 1;

        print "*** Rebooting $node and waiting again ...\n";

        # Requeue the node at the back of the list with a fresh clock.
        if (system("$nodereboot $node") == 0) {
            push(@nodelist, $node);
            $waitstart{$node} = time;
            next;
        }
        # Fall through on failure.
    }

    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    print "*** WARNING: $node may be down.\n".
          "    This has been reported to testbed-ops.\n";

    if ($canfail{$node}) {
        # Send mail to testbed-ops and to the user about it.
        my ($user) = getpwuid($UID);

        SENDMAIL($user, "Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified so they can ".
                 "investigate.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

        # Not counted in $failed: this node is allowed to fail.
        print "*** Continuing with experiment setup anyway ...\n";
        next;
    }

    # Reserve it to down experiment.
    MarkNodeDown($node);
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
             "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
             "$node has been taken out of the pool until this matter ".
             "is resolved.\n");

    $failed++;
}
TBDebugTimeStamp("Local node waiting finished");
607

608
609
610
611
612
613
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
my @vnodelist = keys(%vnodes);

if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
        "failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";
    system("$vnode_setup $pid $eid");

    if ($?) {
        die_noretry("*** $0:\n".
            "    Vnode setup failed!");
    }

    foreach my $node (@vnodelist) {
        $waitstart{$node} = time;
    }
    # Two minutes plus thirty seconds per virtual node.
    my $maxwait = 120 + (30 * scalar(@vnodelist));

    print "Waiting for virtual testbed nodes to finish setting up ...\n";

    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
        my $node   = shift(@vnodelist);
        my $wstart = $waitstart{$node};

        # A false return is treated as "node reached ISUP" in time.
        if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
            print "$node is alive and well\n";
            SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
            next;
        }

        SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

        print "*** WARNING: $node may be down.\n".
              "    This has been reported to testbed-ops.\n";

        if ($canfail{$node}) {
            # Send mail to testbed-ops and to the user about it.
            my ($user) = getpwuid($UID);

            SENDMAIL($user, "Virtual Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified so they can ".
                 "investigate.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

            print "*** Continuing with experiment setup anyway ...\n";
            next;
        }

        # Reserve it to down experiment.
        # MarkNodeDown($node);

        # Send mail to testbed-ops about it
        SENDMAIL($TBOPS, "Virtual Node $node is down",
                 "Virtual node $node in pid/eid $pid/$eid appears to be ".
                 "unresponsive.\n\n");

        print "*** Experiment will be terminated automatically.\n";
        $failed++;
    }
    TBDebugTimeStamp("Virtual node waiting finished");
}

Chad Barb's avatar
   
Chad Barb committed
691
print "OS Setup Done. There were $failed failed nodes.\n";
TBDebugTimeStamp("os_setup finished");

# Exit 1 when anything failed (a retry may help), 0 otherwise.
if ($failed > 0) {
    exit 1;
}
else {
    exit 0;
}
699

700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
#
# Look up the imageid that provides an OSID on a given node type, using
# the osidtoimageid table. Returns the imageid from the first row of the
# result, or 0 when the table has no entry for the pair.
#
sub TBMapOSIDtoImageID($$)
{
    my ($osid, $type) = @_;

    my $sql    = "select imageid from osidtoimageid ".
                 "where type='$type' and osid='$osid'";
    my $result = DBQueryFatal($sql);

    return 0
        if ($result->numrows == 0);

    my ($imageid) = $result->fetchrow_array();
    return $imageid;
}

719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
#
# Setup a reload of a node if we can find an image.
#
# Maps ($osid, $type) to an imageid and appends $node to the list kept
# under that imageid in %reloads. The node is dropped from %reboots at
# the same time. Exits through die_noretry() when no image maps to the
# OSID for this node type.
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;

    my $imageid = TBMapOSIDtoImageID($osid, $type);

    if (! $imageid) {
        die_noretry("*** $0:\n".
            "    No image can be found for $osid on $node!");
    }

    # push autovivifies the list the first time an imageid is seen.
    push(@{ $reloads{$imageid} }, $node);
    delete $reboots{$node};
}

742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
#
# Fork a child that runs a command with system(). Return the child's pid
# so the caller can waitpid() on it.
#
# The child exits with the command's own exit code. If the command was
# killed by a signal the child exits with 1, and if it could not be
# started at all the child exits with 255, so that a waiting parent
# never mistakes either case for success.
#
# If fork() itself fails, nothing has been started: the error is
# reported on STDERR and the script exits with 1.
#
sub ForkCmd($) {
    my ($cmd) = @_;
    my($mypid);

    $mypid = fork();
    if (!defined($mypid)) {
        # Without this check the parent would fall through, run the
        # command itself and then exit.
        print STDERR "*** $0:\n".
            "    Could not fork to run '$cmd': $!\n";
        exit(1);
    }
    if ($mypid) {
        return $mypid;
    }

    # Child from here on.
    if ($dbg) {
        print STDERR "Forking command: $cmd\n";
    }

    system($cmd);
    my $status = $?;

    # -1 means the command never ran; otherwise the high byte is its
    # exit code and the low byte records a terminating signal.
    my $code = ($status == -1) ? 255 : ($status >> 8);
    if ($status != 0 && $code == 0) {
        $code = 1;
    }
    exit($code);
}