#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

#
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
#
# TODO: Reload disk images.
#
# usage: os_setup [-d] <pid> <eid>
#
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

28
29
sub usage()
{
    # Print the synopsis on stderr and exit with the "do not retry" status.
    print(STDERR "Usage: os_setup <pid> <eid>\n");
    exit(-1);
}
# Option letters accepted by getopts(); "d" turns on debugging output.
my  $optlist = "d";
34

Chad Barb's avatar
   
Chad Barb committed
35
36
37
38
39
40
41
42
43
44
45
#
# Print a message on stderr and exit with a -1 return code, to indicate
# to the caller (tbswap) that the failure is not likely to be fixed with
# another attempt.
#
sub die_noretry($)
{
    my ($mesg) = @_;

    print STDERR $mesg . "\n";
    exit(-1);
}

46
47
48
49
50
#
# Configure variables. The @...@ tokens are placeholders; presumably they
# are substituted when this .in template is configured.
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
my $TESTMODE = @TESTMODE@;
# Directory that every boot path must live under.
my $TFTP     = "/tftpboot";
54

55
56
57
58
59
60
61
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

62
# External helper programs.
my $nodereboot	= "$TB/bin/node_reboot";
my $os_load	= "$TB/bin/os_load";
my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";

# Debug flag (-d) and running count of failures.
my $dbg		= 0;
my $failed      = 0;

# Physical nodes in the experiment (node_id => node_id).
my %nodes       = ();
# Virtual nodes in the experiment (node_id => jailed flag).
my %vnodes      = ();
my %vnodephosts = ();
# Jailed vnode => physical node, and per physical node jailed vnode count.
my %vnode2pnode = ();
my %pnodevcount = ();
# node_id => OSID the node is expected to boot.
my %osids       = ();
# node_id => 1 if a boot failure of that node is tolerated.
my %canfail     = ();
# node_id => last allocation state fetched from or written to the DB.
# Declared here with the other per-node tables; it was previously an
# implicit package global.
my %nodeAllocStates = ();
my $db_result;
my @row;
77

78
#
# Frisbee works, so do automatic reloading for nodes that do not have
# the proper OS loaded. %reloads is a hash of lists; for each imageid,
# the list of nodes to pass to os_load for that imageid. %reboots holds
# the nodes that only need a plain reboot.
#
my %reloads    = ();
my %reboots    = ();
my $doautoload = 1;
# When set, a node last reserved by a different project gets reloaded.
my $dolastload = 1;
87
    
88
89
90
91
92
93
# The script runs with -T (taint mode): pin PATH to a fixed list of
# directories and remove the shell-related environment variables so that
# the external commands run below are not influenced by the caller's
# environment.
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; # Autoflush the selected output handle, i.e. no output buffering.

94
95
96
97
98
99
100
101
102
103
104
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the two required arguments (pid and eid).
#
# %options is a lexical rather than an implicit package global.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
# -d turns on debugging output.
if (defined($options{"d"})) {
    $dbg = 1;
}

109
110
111
112
113
114
115
116
117
118
my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args. Each must consist only of word characters, '-' and '@';
# the captured match is what gets used from here on.
#
die_noretry("Bad data in pid: $pid.")
    unless ($pid =~ /^([-\@\w]+)$/);
$pid = $1;

die_noretry("Bad data in eid: $eid.")
    unless ($eid =~ /^([-\@\w]+)$/);
$eid = $1;

128
#
# Figure out who called us. Only root, people with admin status in the
# DB, or users with modify access to the experiment can run this script.
#
unless (!$UID || TBAdmin($UID) ||
	TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
    die_noretry("*** $0:\n".
		"    You do not have permission to swap this experiment!");
}

TBDebugTimeStamp("os_setup started");

140
#
# Get the set of nodes reserved to the experiment, together with the
# nodes table information for them. The join against last_reservation
# adds that table's pid column to each row.
#
$db_result =
    DBQueryFatal("select n.*,l.pid from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");

# An experiment without nodes is not an error; there is just nothing to do.
if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit 0;
}

154
#
# Validate one boot path (def_boot_path or next_boot_path) of a node.
# We mount the user tftp directory on boss node, so the optional leading
# "IP:" is ignored and only the path portion is checked. Exits via
# die_noretry() when the path is not under $TFTP or is not an existing
# plain file.
#
# Returns 1 if a non-empty path was given (and passed), 0 otherwise.
#
sub CheckBootPath($$)
{
    my ($node, $path) = @_;

    if (!defined($path) || $path eq "") {
	return 0;
    }

    # Split out IP address if it exists.
    if ($path =~ /^[0-9\.]+:(\/.*)$/) {
	$path = $1;
    }

    #
    # Path must begin with "$TFTP/". $TFTP already carries its leading
    # slash, so the pattern must not add another one in front of it;
    # doing that demands "//tftpboot/..." and rejects every valid path.
    #
    if ($path !~ /^\Q$TFTP\E\//) {
	die_noretry(
	    "*** File $path for node $node must reside in $TFTP");
    }
    if (! -f $path) {
	die_noretry("*** File $path for node $node does not exist!");
    }
    return 1;
}

#
# Walk the nodes of the experiment: sort them into physical and virtual
# nodes, verify that the files they reference exist, and decide for each
# physical node whether it needs a plain reboot or a disk reload.
#
while (my %row = $db_result->fetchhash()) {
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
    my $type     = $row{'type'};
    my $jailnode = 0;
    # Filled in by TBIsNodeVirtual(); reset for every node so that a value
    # from a previous iteration can never leak into this one.
    my $jailed   = 0;

    #
    # The pid column of the row comes from the last_reservation join in
    # the query (l.pid). A node last used by another project gets
    # reloaded instead of just rebooted when $dolastload is set.
    #
    my $lastpid    = $row{'pid'};
    my $needsclean = ($dolastload &&
		      defined($lastpid) && $lastpid ne $pid);

    #
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
    #
    if (TBIsNodeVirtual($node, \$jailed)) {
	$vnodes{$node} = $jailed;
	if (! $jailed) {
	    next;
	}
	$jailnode = 1;
    }
    else {
	my $nodeAllocState;
	TBGetNodeAllocState($node, \$nodeAllocState);
	$nodes{$node}           = $node;
	$nodeAllocStates{$node} = $nodeAllocState;
	# only reboot node if assign_wrapper just pulled it into expt.
	# (e.g. it isnt ALLOCSTATE_RES_READY)
	if ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    $reboots{$node} = 1;
	}
    }
    $osids{$node} = $osid;

    #
    # Make sure the files specified in the boot paths exist. Only the
    # default boot path counts as "the user supplied a boot path" for
    # the OSID checks further down.
    #
    my $bootpath = CheckBootPath($node, $row{'def_boot_path'});
    CheckBootPath($node, $row{'next_boot_path'});

    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought. A NULL column is treated as an empty list.
    #
    my $deltas   = defined($row{'deltas'})   ? $row{'deltas'}   : "";
    my $rpms     = defined($row{'rpms'})     ? $row{'rpms'}     : "";
    my $tarballs = defined($row{'tarballs'}) ? $row{'tarballs'} : "";

    foreach my $delta (split(":", $deltas)) {
	if (! -f $delta) {
	    die_noretry("*** Delta file $delta for node $node does not exist!");
	}
    }
    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $rpms)) {
	if (! -f $rpm) {
	    die_noretry("*** RPM $rpm for node $node does not exist!");
	}
    }
    #
    # XXX - Ditto for tarfiles. Each entry is "<dir> <tarfile>"; only the
    # tarfile is checked.
    #
    foreach my $tarspec (split(":", $tarballs)) {
	my (undef, $tar) = split(" ", $tarspec);

	if (! -f $tar) {
	    die_noretry("*** Tarfile $tar for node $node does not exist!");
	}
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && !$jailnode) {
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but lets be careful anyway.
	#
	if (! $osid) {
	    die_noretry(
	        "*** $node has no bootpath and no def_boot_osid set!");
	}

	#
	# Grab the info for this OSID. This is part of the image check.
	#
	my $osid_result =
	    DBQueryFatal("select * from os_info where osid='$osid'");

	if ($osid_result->numrows == 0) {
	    die_noretry("*** No such OSID $osid is defined!");
	}
	my %osid_row = $osid_result->fetchhash();

	#
	# If there is an actual path, its an OSKit kernel not an image,
	# and there is nothing to check on the disk.
	#
	if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
	    #
	    # Not an OSKit kernel.
	    # Make sure this OSID is actually loaded on the machine.
	    #
	    my $p_result =
		DBQueryFatal("select * from partitions ".
			     "where node_id='$node' and osid='$osid'");

	    if ($p_result->numrows) {
		#
		# OSID is loaded, but might need to be cleaned.
		#
		if ($needsclean) {
		    SetupReload($node, $osid, $type);
		}
	    }
	    elsif (defined($osid_row{'version'}) &&
		   $osid_row{'version'} ne "") {
		#
		# User wants a specific version of an OS, but its not
		# loaded on the machine.
		#
		SetupReload($node, $osid, $type);
	    }
	    else {
		#
		# A non-specific version. There needs to be a way to
		# map it to another osid.
		#
		if (!defined($osid_row{'nextosid'})) {
		    die_noretry(
			"*** $0:\n".
			"    No mapping can be made for $osid ($node)!");
		}
		my $nextosid = $osid_row{'nextosid'};

		#
		# See if the nextosid is already on the disk. If not,
		# it needs to be loaded.
		#
		my $o_result =
		    DBQueryFatal("select osid from partitions as p ".
				 "where p.node_id='$node' and ".
				 "      p.osid='$nextosid'");

		if (! $o_result->numrows) {
		    #
		    # The OS the generic name maps to is not on the disk.
		    #
		    print "Mapping $osid on $node to $nextosid ".
			"and setting up a reload.\n";

		    SetupReload($node, $nextosid, $type);
		}
		else {
		    #
		    # Already loaded.
		    #
		    print "Mapping $osid on $node to $nextosid.\n";

		    if ($needsclean) {
			SetupReload($node, $nextosid, $type);
		    }
		    else {
			# List form: the arguments never pass through a
			# shell.
			system($osselect, $nextosid, $node) and
			    die_noretry("*** Could not set boot OS to ".
				"$nextosid for $node");
		    }
		}
		$osids{$node} = $nextosid;
	    }
	}
    }

    #
    # Set the canfail bit. Currently, only nodes of type "dnard" are
    # canfail=1. Will come from DB at some point.
    #
    $canfail{$node} = ($type eq "dnard") ? 1 : 0;

    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
	if $dbg;
}
403

404
#
# Collect some info about vnodes.
#
foreach my $vnode (keys(%vnodes)) {
    my $pnode;

    # Only jailed vnodes are of interest here.
    next
	if (! $vnodes{$vnode});

    if (! TBPhysNodeID($vnode, \$pnode)) {
	die_noretry("*** $0:\n".
	    "    Cannot determine phys_nodeid for $vnode!");
    }

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode. (Incrementing a missing entry starts at 1.)
    #
    $pnodevcount{$pnode}++;
    $vnode2pnode{$vnode} = $pnode;

    #
    # A pnode that is not in %nodes is typical on remote nodes; we do
    # not allocate the underlying phys node to the experiment. Either
    # way there is nothing else to do for jail nodes at this time ...
    #
}

440
#
# We need to issue the reboots and the reloads in parallel: each command
# is forked off, and all of the children are reaped afterwards.
#
TBDebugTimeStamp("rebooting/reloading started");
if (!$TESTMODE) {
    # Command line => pid of the child running it.
    my %children = ();

    if (keys(%reboots)) {
	# Record in the DB that these nodes are about to be rebooted.
	foreach my $node (keys(%reboots)) {
	    my $newstate =
		(($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN())
		 ? TBDB_ALLOCSTATE_RES_REBOOT_CLEAN()
		 : TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());

	    TBSetNodeAllocState($node, $newstate);
	    $nodeAllocStates{$node} = $newstate;
	}

	# One node_reboot invocation for all of the nodes.
	my $rebootcmd = "$nodereboot " . join(" ", keys(%reboots));
	$children{$rebootcmd} = ForkCmd($rebootcmd);
    }

    # One os_load invocation per image.
    foreach my $imageid (keys(%reloads)) {
	my @list = @{ $reloads{$imageid} };

	foreach my $node (@list) {
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_RELOAD());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
	}

	sleep(5);
	my $loadcmd = "$os_load -m $imageid @list";
	$children{$loadcmd} = ForkCmd($loadcmd);
    }

    # Reap the children; a non-zero wait status counts as one failure.
    foreach my $cmd (keys(%children)) {
	waitpid($children{$cmd}, 0);
	if ($?) {
	    $failed++;
	    print "*** Failed: $cmd\n";
	}
    }
}
TBDebugTimeStamp("rebooting/reloading finished");
sleep(2);
489

490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
#
# Look at every node that was handed to os_load. A node whose operational
# mode cannot be fetched, or that is still in reload mode, is counted as
# failed and removed from %nodes so that we do not wait for it below.
#
foreach my $imageid ( keys(%reloads) ) {
    foreach my $node ( @{ $reloads{$imageid} } ) {
	my $mode;

	if (!TBGetNodeOpMode($node, \$mode)) {
	    print "*** Error getting operational mode for $node!\n";
	    $failed++;
	    delete($nodes{$node});
	    # $mode is not usable here; do not fall into the check below.
	    next;
	}
	if (defined($mode) && $mode eq TBDB_NODEOPMODE_RELOAD) {
	    print "*** Not waiting for $node since its reload failed!\n";
	    $failed++;
	    delete($nodes{$node});
	}
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

522
#
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node one second chance. Avoids pointless
# experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

# Per node: reboot attempts left, and the time its current wait started.
my %retries;
my %waitstart;
foreach my $node ( @nodelist ) {
    $retries{$node}   = 1;
    $waitstart{$node} = time;
}

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node = shift(@nodelist);

    # A false return from TBNodeStateWait is treated as "node is up".
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP,
			 $waitstart{$node}, (60*7))) {
	print "$node is alive and well\n";
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	next;
    }

    # Did not come up in time. Reboot it once more and requeue it.
    if ($retries{$node}) {
	$retries{$node}--;

	print "*** Rebooting $node and waiting again ...\n";

	if (system("$nodereboot $node") == 0) {
	    push(@nodelist, $node);
	    $waitstart{$node} = time;
	    next;
	}
	# Fall through on failure.
    }

    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    print "*** WARNING: $node may be down.\n".
	  "    This has been reported to testbed-ops.\n";

    if ($canfail{$node}) {
	# Send mail to testbed-ops and to the user about it.
	my ($user) = getpwuid($UID);
	my $body   =
	    "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	    "Your experiment will continue to run since this failure\n".
	    "is nonfatal, although you might encounter other problems\n".
	    "if your experiment depends explicitly on this node.\n".
	    "You should terminate this experiment if it cannot ".
	    "tolerate this failure.\n\n".
	    "Testbed Operations has also been notified so they can ".
	    "investigate.\n\n".
	    "Thanks\n".
	    "Testbed Operations\n";

	SENDMAIL($user, "Node $node is down", $body, 0, "Cc: $TBOPS");

	print "*** Continuing with experiment setup anyway ...\n";
	next;
    }

    # Reserve it to down experiment.
    MarkNodeDown($node);
    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
	     "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	     "$node has been taken out of the pool until this matter ".
	     "is resolved.\n");

    $failed++;
}
TBDebugTimeStamp("Local node waiting finished");
605

606
607
608
609
610
611
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
my @vnodelist = keys(%vnodes);

if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
	"failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";
    system("$vnode_setup $pid $eid");

    if ($?) {
	die_noretry("*** $0:\n".
	    "    Vnode setup failed!");
    }

    foreach my $node (@vnodelist) {
	$waitstart{$node} = time;
    }
    print "Waiting for virtual testbed nodes to finish setting up ...\n";

    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
	my $node   = shift(@vnodelist);
	my $pnode  = $vnode2pnode{$node};
	my $wstart = $waitstart{$node};

	#
	# Wait longer the more jailed vnodes share the physical node.
	# Only jailed vnodes have a vnode2pnode mapping; for the others
	# the base wait is used instead of looking up an undefined key.
	#
	my $vcount = 0;
	if (defined($pnode) && defined($pnodevcount{$pnode})) {
	    $vcount = $pnodevcount{$pnode};
	}
	my $maxwait = 90 + (30 * $vcount);

	# A false return from TBNodeStateWait is treated as "node is up".
	if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
	    print "$node is alive and well\n";
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
	    SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	    next;
	}

	SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

	print "*** WARNING: $node may be down.\n".
	      "    This has been reported to testbed-ops.\n";

	if ($canfail{$node}) {
	    # Send mail to testbed-ops and to the user about it.
	    my ($user) = getpwuid($UID);

	    SENDMAIL($user, "Virtual Node $node is down",
		 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
		 "Your experiment will continue to run since this failure\n".
		 "is nonfatal, although you might encounter other problems\n".
		 "if your experiment depends explicitly on this node.\n".
		 "You should terminate this experiment if it cannot ".
		 "tolerate this failure.\n\n".
		 "Testbed Operations has also been notified so they can ".
		 "investigate.\n\n".
		 "Thanks\n".
		 "Testbed Operations\n",
		 0,
		 "Cc: $TBOPS");

	    print "*** Continuing with experiment setup anyway ...\n";
	    next;
	}

	print "*** Experiment will be terminated automatically.\n";
	$failed++;
    }
    TBDebugTimeStamp("Virtual node waiting finished");
}

Chad Barb's avatar
   
Chad Barb committed
682
print "OS Setup Done. There were $failed failed nodes.\n";
TBDebugTimeStamp("os_setup finished");

# Exit with 1 if anything failed, 0 otherwise.
exit(($failed > 0) ? 1 : 0);
690

691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
#
# Map an OSID to an imageid for a node type, using the osidtoimageid
# table. Returns the imageid, or 0 when the table has no matching row.
#
sub TBMapOSIDtoImageID($$)
{
    my ($osid, $type) = @_;

    my $query_result =
	DBQueryFatal("select imageid from osidtoimageid ".
		     "where type='$type' and osid='$osid'");

    return 0
	if ($query_result->numrows == 0);

    my ($imageid) = $query_result->fetchrow_array();
    return $imageid;
}

710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
#
# Setup a reload of a node if we can find an image for the OSID on that
# node type; otherwise exit through die_noretry(). The node is appended
# to the list kept in %reloads for the image, and removed from %reboots.
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;
    my $imageid = TBMapOSIDtoImageID($osid, $type);

    if (! $imageid) {
	die_noretry("*** $0:\n".
	    "    No image can be found for $osid on $node!");
    }

    # The per-image list springs into existence on the first push.
    push(@{ $reloads{$imageid} }, $node);
    delete $reboots{$node};
}

733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
#
# Fork a process to run a command. Returns the pid to wait on.
#
# In the child the command is run with system() and the child exits with:
#   - the command's exit status when it exited normally,
#   - 128 + signal number when it was killed by a signal,
#   - 1 when the command could not be started at all,
# so that a parent testing $? after waitpid() never mistakes an abnormal
# termination for success.
#
# If fork() itself fails, the script prints an error and exits with 1
# (retry may help) instead of running the command in the parent.
#
sub ForkCmd($) {
    my ($cmd) = @_;
    my($mypid);

    $mypid = fork();
    if (!defined($mypid)) {
	# fork() returns undef on failure; without this check the parent
	# would take the child's code path below and exit.
	print STDERR "*** $0:\n".
	    "    Could not fork to run '$cmd': $!\n";
	exit(1);
    }
    if ($mypid) {
	return $mypid;
    }

    if ($dbg) {
	print STDERR "Forking command: $cmd\n";
    }

    system($cmd);
    if ($? == -1) {
	# The command could not be started.
	exit(1);
    }
    if ($? & 127) {
	# Killed by a signal; $? >> 8 would be 0 here.
	exit(128 + ($? & 127));
    }
    exit($? >> 8);
}