os_setup.in 19.9 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6
7
8
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

13
#
14
15
16
17
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
18
#
19
# TODO: Reload disk images.
20
# 
21
# usage: os_setup <pid> <eid>
22
#
Chad Barb's avatar
   
Chad Barb committed
23
24
25
26
27
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

28
29
sub usage()
{
    # Command-line synopsis; written to STDERR before exiting with -1.
    my $synopsis = "Usage: os_setup <pid> <eid>\n";

    print STDERR $synopsis;
    exit(-1);
}
33
my  $optlist = "d";
34

Chad Barb's avatar
   
Chad Barb committed
35
36
37
38
39
40
41
42
43
44
45
#
# Used to die with a -1 return code, to indicate to caller (tbswap)
# that the failure is not likely to be fixed with another attempt.
#
sub die_noretry($)
{
    my ($reason) = @_;

    # Emit the message (newline appended) on STDERR, then exit with -1.
    print STDERR $reason . "\n";
    exit(-1);
}

46
47
48
49
50
#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
51
my $TBOPS       = "@TBOPSEMAIL@";
52
my $TESTMODE    = @TESTMODE@;
53
my $TFTP	= "/tftpboot";
54

55
56
57
58
59
60
61
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

62
my $nodereboot	= "$TB/bin/node_reboot";
63
my $os_load	= "$TB/bin/os_load";
64
my $vnode_setup = "$TB/sbin/vnode_setup";
65
my $osselect    = "$TB/bin/os_select";
66
my $dbg		= 0;
67
my $failed      = 0;
68
my $failedvnodes= 0;
69
70
my %nodes       = ();
my %vnodes      = ();
71
72
73
my %vnodephosts = ();
my %vnode2pnode = ();
my %pnodevcount = ();
74
my %osids       = ();
75
my %canfail     = ();
76
my $db_result;
77
my @row;
78

79
#
80
81
82
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid. 
83
#
84
85
my %reloads     = ();
my %reboots	= ();
86
my %rebooted    = ();
87
my $doautoload  = 1;
88
my $dolastload  = 1;
89
    
90
91
92
93
94
95
# Un-taint the environment: pin PATH to a fixed list of directories and
# remove the variables that can influence how a shell starts up. The
# script runs with -T (taint mode), see the #! line.
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; # Autoflush the currently selected output handle after every write.

96
97
98
99
100
101
102
103
104
105
106
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
107
108
109
110
if (defined($options{"d"})) {
    $dbg = 1;
}

111
112
113
114
115
116
117
118
119
120
#
# Untaint args. Each argument must consist solely of word characters,
# '-' and '@'; the value kept is the regex capture, which is what clears
# the taint. Anything else is a non-retryable error.
#
my ($pid) = ($ARGV[0] =~ /^([-\@\w]+)$/)
    or die_noretry("Bad data in pid: $ARGV[0].");

my ($eid) = ($ARGV[1] =~ /^([-\@\w]+)$/)
    or die_noretry("Bad data in eid: $ARGV[1].");

130
#
131
# Figure out who called us. Only root, people with admin status
132
133
# in the DB, or the owner of the experiment can run this script.
#
134
135
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
Chad Barb's avatar
   
Chad Barb committed
136
137
    die_noretry("*** $0:\n".
		"    You do not have permission to swap this experiment!");
138
139
}

140
141
TBDebugTimeStamp("os_setup started");

142
#
143
# Get the set of nodes, as well as the nodes table information for them.
144
#
145
$db_result =
146
147
    DBQueryFatal("select n.*,l.pid from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
148
149
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");
150

151
if ($db_result->numrows < 1) {	
152
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
Chad Barb's avatar
   
Chad Barb committed
153
    exit 0;
154
155
}

156
while (my %row = $db_result->fetchhash()) {
157
158
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
159
    my $type     = $row{'type'};
160
    my $bootpath = 0;
161
    my $jailnode = 0;
162

163
    #
164
165
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
166
    #
167
168
169
170
171
172
173
174
    if (TBIsNodeVirtual($node, \$jailed)) {
	$vnodes{$node} = $jailed;
	if (! $jailed) {
	    next;
	}
	$jailnode = 1;
    }
    else {
Chad Barb's avatar
   
Chad Barb committed
175
176
	my $nodeAllocState;
	TBGetNodeAllocState( $node, \$nodeAllocState );
177
	$nodes{$node}  = $node;
Chad Barb's avatar
   
Chad Barb committed
178
179
180
181
182
183
	$nodeAllocStates{$node} = $nodeAllocState;
	# only reboot node if assign_wrapper just pulled it into expt.
	# (e.g. it isnt ALLOCSTATE_RES_READY)
	if ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    $reboots{$node} = 1;
	}
184
    }
185
    $osids{$node} = $osid;
186

187
    #
188
189
190
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly. 
191
192
193
194
    #
    #
    # Both boot path columns get the same validation. Only a non-empty
    # def_boot_path sets $bootpath, which makes the OSID checks further
    # down unnecessary for this node.
    #
    foreach my $column ('def_boot_path', 'next_boot_path') {
	my $path = $row{$column};

	next
	    if (!defined($path) || $path eq "");

	# Strip a leading "IP:" prefix if present; only the path is checked.
	if ($path =~ /^[0-9\.]+:(\/.*)$/) {
	    $path = $1;
	}

	#
	# Path must begin with $TFTP. $TFTP already starts with a slash,
	# so the old pattern /^\/$TFTP\// demanded "//tftpboot/" and
	# rejected ordinary "/tftpboot/..." paths. The optional extra
	# leading slash keeps the doubled form working as before. \Q..\E
	# stops characters in $TFTP from being read as regex syntax.
	#
	if ($path !~ m{^/?\Q$TFTP\E/}) {
	    die_noretry(
		"*** File $path for node $node must reside in $TFTP");
	}

	if (! -f $path) {
	    die_noretry("*** File $path for node $node does not exist!");
	}

	$bootpath = 1
	    if ($column eq 'def_boot_path');
    }

240
241
242
243
244
245
246
    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought.
    #
    #
    # The deltas and rpms columns are both ":" separated lists of files,
    # each of which must exist. An undefined column is treated as an
    # empty list instead of being handed to split() as undef.
    #
    foreach my $check (['deltas', 'Delta file'], ['rpms', 'RPM']) {
	my ($column, $what) = @{ $check };
	my $filelist = defined($row{$column}) ? $row{$column} : "";

	foreach my $file (split(":", $filelist)) {
	    if (! -f $file) {
		die_noretry("*** $what $file for node $node does not exist!");
	    }
	}
    }

    #
    # XXX - Ditto for tarfiles. Each entry is "<dir> <tarfile>"; only the
    # tarfile is checked for existence.
    #
    my $tarballs = defined($row{'tarballs'}) ? $row{'tarballs'} : "";

    foreach my $tarspec (split(":", $tarballs)) {
	my (undef, $tar) = split(" ", $tarspec);

	# An entry without a tarfile part would otherwise be tested as undef.
	if (!defined($tar)) {
	    die_noretry("*** Malformed tarfile spec '$tarspec' ".
			"for node $node!");
	}
	if (! -f $tar) {
	    die_noretry("*** Tarfile $tar for node $node does not exist!");
	}
    }
270
271
272
273
274
275
276

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it. 
    #
277
    if (!$bootpath && !$jailnode) {
278
279
280
281
282
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but lets be careful anyway.
	# 
	if (! $osid) {
Chad Barb's avatar
   
Chad Barb committed
283
284
	    die_noretry(
	        "*** $node has no bootpath and no def_boot_osid set!");
285
286
287
288
289
290
291
292
293
	}

	#
	# Grab the info for this OSID. This is part of the image check.
	#
	my $osid_result =
	    DBQueryFatal("select * from os_info where osid='$osid'");
	
	if ($osid_result->numrows == 0) {
Chad Barb's avatar
   
Chad Barb committed
294
	    die_noretry("*** No such OSID $osid is defined!");
295
296
297
298
299
300
301
	}
	
	my %osid_row   = $osid_result->fetchhash();

	#
	# If there is an actual path, its an OSKit kernel not an image.
	# 
302
	if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
303
	    #
304
305
	    # Not an OSKit kernel.
	    # Make sure this OSID is actually loaded on the machine. 
306
	    #
307
308
309
	    my $p_result =
		DBQueryFatal("select * from partitions ".
			     "where node_id='$node' and osid='$osid'");
310
311

	    #
312
313
	    # If not loaded, then see if the user was looking for the generic
	    # name of the OS that is loaded. 
314
	    # 
315
316
	    if ($p_result->numrows == 0) {
		#
317
		# Check to see if a non specific version specified.
318
		#
319
320
321
322
		if (! defined($osid_row{'version'}) ||
		    $osid_row{'version'} eq "") {

		    #
323
324
325
326
		    # A non-specific version. There needs to be a way to
		    # map it to another osid. 
		    #
		    if (!defined($osid_row{'nextosid'})) {
Chad Barb's avatar
   
Chad Barb committed
327
328
329
			die_noretry(
			    "*** $0:\n".
			    "    No mapping can be made for $osid ($node)!");
330
331
332
333
334
335
		    }
		    my $nextosid = $osid_row{'nextosid'};
		    
		    #
		    # See if the nextosid is already on the disk. If not,
		    # it needs to be loaded.
336
337
		    # 
		    my $o_result =
338
339
340
341
342
343
344
345
346
347
348
349
350
351
			DBQueryFatal("select osid from partitions as p ".
				     "where p.node_id='$node' and ".
				     "      p.osid='$nextosid'");

		    if (! $o_result->numrows) {
			#
			# User wants a specific version of an OS, but its not
			# loaded on the machine. 
			#
			print "Mapping $osid on $node to $nextosid ".
			    "and setting up a reload.\n";
			
			SetupReload($node, $nextosid, $type);
			$osids{$node} = $nextosid;
352
353
		    }
		    else {
354
355
356
357
			#
			# Already loaded. 
			# 
			print "Mapping $osid on $node to $nextosid.\n";
358
359
360

			if ($dolastload &&
			    defined($row{'pid'}) && $row{'pid'} ne $pid) {
361
			    SetupReload($node, $nextosid, $type);
362
363
			}
			else {
364
			    system("$osselect $nextosid $node") and
Chad Barb's avatar
   
Chad Barb committed
365
366
				die_noretry("*** Could not set boot OS to ".
				    "$nextosid for $node");
367
			}
368
			$osids{$node} = $nextosid;
369
		    }
370
371
		}
		else {
372
373
		    #
		    # User wants a specific version of an OS, but its not
374
		    # loaded on the machine. 
375
		    #
376
377
378
379
380
381
382
383
384
385
		    SetupReload($node, $osid, $type);
		}
	    }
	    else {
		#
		# OSID is loaded, but might need to be cleaned. 
		#
		if ($dolastload &&
		    defined($row{'pid'}) && $row{'pid'} ne $pid) {
		    SetupReload($node, $osid, $type);
386
		}
387
388
389
	    }
	}
    }
390
    
391
392
393
394
    #
    # Set the canfail bit. Currently, sharks are always canfail=1.
    # Will come from DB at some point.
    #
395
    # Nodes of type "dnard" get canfail=1; every other type gets 0.
    $canfail{$node} = ($row{'type'} eq "dnard") ? 1 : 0;
    
402
    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
403
	if $dbg;
404
}
405

406
#
407
# Collect some info about vnodes. 
408
409
410
411
412
413
414
415
416
417
#
# Only jailed vnodes (true value in %vnodes) are of interest here.
foreach my $vnode (grep { $vnodes{$_} } keys(%vnodes)) {
    my $pnode;

    TBPhysNodeID($vnode, \$pnode)
	or die_noretry("*** $0:\n".
		       "    Cannot determine phys_nodeid for $vnode!");

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    $pnodevcount{$pnode} = ($pnodevcount{$pnode} || 0) + 1;
    $vnode2pnode{$vnode} = $pnode;

    #
    # When the pnode is not in %nodes (typical on remote nodes; we do not
    # allocate the underlying phys node to the experiment) there is nothing
    # more to do. Nothing else to do for local jail nodes at this time
    # either ...
    #
}

443
#
444
# We need to issue the reboots and the reloads in parallel.
445
#
446
TBDebugTimeStamp("rebooting/reloading started");
447
if (!$TESTMODE) {
448
449
450
451
    my %pids  = ();
    my $count = 0;
    my $cmd;

452
453
454
455
456
457
    foreach my $imageid ( keys(%reloads) ) {
	my @list = @{ $reloads{$imageid} };

	foreach my $node (@list) {
	    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
458
	    # No point in rebooting, obviously, but it does reboot!
459
	    delete $reboots{$node};
460
	    $rebooted{$node} = 1;
461
462
463
464
465
466
467
	}

	sleep(5);
	$pids{"$os_load -m $imageid @list"} =
	    ForkCmd("$os_load -m $imageid @list");
    }

468
    if (keys(%reboots)) {
Chad Barb's avatar
   
Chad Barb committed
469
470
	foreach my $node (keys(%reboots)) {
	    if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
Chad Barb's avatar
   
Chad Barb committed
471
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
Chad Barb's avatar
   
Chad Barb committed
472
473
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
	    } else {
Chad Barb's avatar
   
Chad Barb committed
474
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
Chad Barb's avatar
   
Chad Barb committed
475
476
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
	    }
477
478
	    # See below, needed for vnode_setup.
	    $rebooted{$node} = 1;
Chad Barb's avatar
   
Chad Barb committed
479
480
	}

481
482
483
484
485
486
487
488
489
490
	$cmd = "$nodereboot " . join(" ", keys(%reboots));
	$pids{$cmd} = ForkCmd($cmd);
    }

    foreach $cmd ( keys(%pids) ) {
	my $pid = $pids{$cmd};

	waitpid($pid, 0);
	if ($?) {
	    $failed++;
491
	    print "*** Failed: $cmd\n";
492
493
	}
    }
494
}
495
TBDebugTimeStamp("rebooting/reloading finished");
496
sleep(2);
497

498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
#
# Inspect every node that was handed to os_load. A node whose operational
# mode cannot be read, or which is still in reload mode, is counted as
# failed and removed from %nodes so that we do not wait for it below.
#
foreach my $imageid ( keys(%reloads) ) {
    my @list = @{ $reloads{$imageid} };

    foreach my $node ( @list ) {
	my $mode;

	if (!TBGetNodeOpMode($node, \$mode)) {
	    print "*** Error getting operational mode for $node!\n";
	    $failed++;
	    delete($nodes{$node});
	    # $mode was not filled in; do not fall through and compare it.
	    next;
	}
	if (defined($mode) && $mode eq TBDB_NODEOPMODE_RELOAD) {
	    print "*** Not waiting for $node since its reload failed!\n";
	    $failed++;
	    delete($nodes{$node});
	}
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

530
#
531
532
533
# Now let's wait for them to come back alive. Set up a retry list, though,
# so that each node gets at least one second chance (one reboot retry).
# This avoids pointless experiment failures.
534
#
535
if (@nodelist) {
536
537
538
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

539
540
my %retries;
my %waitstart;
541
foreach my $node ( @nodelist ) {
542
543
544
545
    $retries{$node} = 1;
    $waitstart{$node} = time;
}

546
TBDebugTimeStamp("Local node waiting started");
547
548
549
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};
550

551
552
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*7))) {
	print "$node is alive and well\n";
553
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
Chad Barb's avatar
   
Chad Barb committed
554
555
	TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();	
556
557
	next;
    }
558

559
560
561
562
563
564
    if ($retries{$node}) {
	$retries{$node} -= 1;

	print "*** Rebooting $node and waiting again ...\n";
	
	if (system("$nodereboot $node") == 0) {
565
	    push(@nodelist, $node);
566
567
568
569
570
	    $waitstart{$node} = time;
	    next;
	}
	# Fall through on failure.
    }
571

572
573
574
575
576
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
	
    print "*** WARNING: $node may be down.\n".
	  "    This has been reported to testbed-ops.\n";
	
577
578
579
580
    if ($canfail{$node}) {
	# Send mail to testbed-ops and to the user about it.
	my ($user) = getpwuid($UID);
	
581
	SENDMAIL($user, "Node $node is down",
582
583
584
585
586
587
588
589
590
591
592
593
594
		 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
		 "Your experiment will continue to run since this failure\n".
		 "is nonfatal, although you might encounter other problems\n".
		 "if your experiment depends explicitly on this node.\n".
		 "You should terminate this experiment if it cannot ".
		 "tolerate this failure.\n\n".
		 "Testbed Operations has also been notified so they can ".
		 "investigate.\n\n".
		 "Thanks\n".
		 "Testbed Operations\n",
		 0,
		 "Cc: $TBOPS");

595
596
	print "*** Continuing with experiment setup anyway ...\n";
	next;
597
598
    }

599
600
    # Reserve it to down experiment.
    MarkNodeDown($node);
Chad Barb's avatar
   
Chad Barb committed
601
602
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();	
603

604
605
606
607
608
609
    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
	     "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	     "$node has been taken out of the pool until this matter ".
	     "is resolved.\n");

610
    $failed++;
611
}
612
TBDebugTimeStamp("Local node waiting finished");
613

614
615
616
617
618
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#
my @vnodelist = keys(%vnodes);

#
# Set the allocstate for the local vnodes that were successfully rebooted
# and came to ISUP above. These do not need to be set up again! We move
# them to RES_READY, so vnode_setup will ignore them. If they fail to
# hit ISUP, we will move them to DOWN so that vnode_setup will ignore
# them again, in the teardown phase.
#
# Note, we do this even if there were failures above, since the teardown
# phase is going to happen, and we want vnode_setup to know which nodes
# came up with physnodes okay (need to be torn down) and which ones never
# had the chance (no need to tear down). Think swapmod, which does teardown
# in the ACTIVATING state.
#
foreach my $vnode (@vnodelist) {
    my $pnode  = $vnode2pnode{$vnode};

    # Remote node, always does setup. 
    next
	if (!exists($nodes{$pnode}));
641
642
643
    # Not rebooted, so leave allocstate alone for vnode_setup.
    next
	if (!exists($rebooted{$pnode}));
644
645
646
647
648
649
650

    if ($nodeAllocStates{$pnode} eq TBDB_ALLOCSTATE_RES_READY()) {
	TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$vnode} = TBDB_ALLOCSTATE_RES_READY();
    }
}

651
#
652
653
654
655
656
657
658
659
660
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
	"failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";
661

662
    system("$vnode_setup $pid $eid");
663
    if ($?) {
Chad Barb's avatar
   
Chad Barb committed
664
665
	die_noretry("*** $0:\n".
	    "    Vnode setup failed!");
666
667
    }

668
    foreach my $node (@vnodelist) {
669
670
	$waitstart{$node} = time;
    }
671
    print "Waiting for virtual testbed nodes to finish setting up ...\n";
672

673
674
675
    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
	my $node   = shift(@vnodelist);
676
	my $pnode  = $vnode2pnode{$node};
677
	my $wstart = $waitstart{$node};
678
	my $maxwait = 90 + (300 * $pnodevcount{$pnode});
679

680
	if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
681
	    print "$node is alive and well\n";
682
	    # Might have already been set above.
683
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY);
684
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY;
685
686
687
688
689
	    SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	    next;
	}

	SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
690
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
691
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN;
692
	
693
	print "*** WARNING: $node may be down.\n";
694
695
696
697
698
	
	if ($canfail{$node}) {
	    # Send mail to testbed-ops and to the user about it.
	    my ($user) = getpwuid($UID);
	
699
	    SENDMAIL($user, "Virtual Node $node is down",
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
		 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
		 "Your experiment will continue to run since this failure\n".
		 "is nonfatal, although you might encounter other problems\n".
		 "if your experiment depends explicitly on this node.\n".
		 "You should terminate this experiment if it cannot ".
		 "tolerate this failure.\n\n".
		 "Testbed Operations has also been notified so they can ".
		 "investigate.\n\n".
		 "Thanks\n".
		 "Testbed Operations\n",
		 0,
		 "Cc: $TBOPS");

	    print "*** Continuing with experiment setup anyway ...\n";
	    next;
	}
716
	$failedvnodes++;
717
    }
718
    TBDebugTimeStamp("Virtual node waiting finished");
719
720
}

721
print "OS Setup Done.\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
722
print "*** There were $failed failed nodes\n"
723
    if ($failed);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
724
print "*** There were $failedvnodes failed virtual nodes\n"
725
726
    if ($failedvnodes);

727
TBDebugTimeStamp("os_setup finished");
Chad Barb's avatar
   
Chad Barb committed
728

729
730
731
732
733
734
# No retry if vnodes failed. Indicates a fatal problem. 
exit(-1)
    if ($failedvnodes);
exit(1)
    if ($failed);
exit 0;
735

736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
#
# Map an OSID to an imageid for a node type.
#
sub TBMapOSIDtoImageID($$)
{
    my ($osid, $type) = @_;

    # Look up the image registered for this (node type, OSID) pair.
    my $sql = "select imageid from osidtoimageid ".
	      "where type='$type' and osid='$osid'";
    my $result = DBQueryFatal($sql);

    # No mapping: return 0 so callers can test the result for truth.
    return 0
	if ($result->numrows == 0);

    # Otherwise hand back the first column of the first row.
    my @fields = $result->fetchrow_array();

    return $fields[0];
}

755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
#
# Setup a reload of a node if we can find an image.
# This goo constructs a hashed array of lists.
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;
    my $imageid = TBMapOSIDtoImageID($osid, $type);

    # Without an image there is nothing to load; this does not return.
    if (! $imageid) {
	die_noretry("*** $0:\n".
	    "    No image can be found for $osid on $node!");
    }

    # Append the node to this image's list in %reloads; the list is
    # created on first use by autovivification.
    push(@{ $reloads{$imageid} }, $node);
}

777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
#
# Fork a process to exec a command. Return the pid to wait on.
# 
sub ForkCmd($) {
    my ($cmd) = @_;
    my $mypid = fork();

    #
    # fork() returns undef on failure. That used to be mistaken for the
    # child side (pid 0), so the parent ran the command itself and then
    # exited with the command's status. Report the failure and exit with
    # 1 instead.
    #
    if (!defined($mypid)) {
	print STDERR "*** $0:\n".
	    "    Could not fork to run '$cmd': $!\n";
	exit(1);
    }

    # Parent: hand back the child pid for the caller to waitpid() on.
    if ($mypid) {
	return $mypid;
    }

    # Child from here on.
    if ($dbg) {
	print STDERR "Forking command: $cmd\n";
    }

    system($cmd);
    my $status = $?;

    # Command could not be started at all.
    exit(255)
	if ($status == -1);

    #
    # Command was killed by a signal. The exit code lives in the high
    # byte, so "$status >> 8" would be 0 here and the caller would take
    # the command as having succeeded.
    #
    exit(128 + ($status & 127))
	if ($status & 127);

    exit($status >> 8);
}