os_setup.in 19.7 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6
7
8
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

13
#
14
15
16
17
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
18
#
19
# TODO: Reload disk images.
20
# 
21
# usage: os_setup <pid> <eid>
22
#
Chad Barb's avatar
   
Chad Barb committed
23
24
25
26
27
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

28
29
#
# Print a usage summary on STDERR and exit with -1, the "retry is
# inappropriate" code described in the header comment. The -d flag
# (debug output) is accepted by the option parser via $optlist, so it
# is listed here too.
#
sub usage()
{
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
33
my  $optlist = "d";
34

Chad Barb's avatar
   
Chad Barb committed
35
36
37
38
39
40
41
42
43
44
45
#
# Report a fatal error and exit with -1. That exit code tells the caller
# (tbswap) that the failure is not likely to be fixed by simply trying
# again.
#
# Arguments: the message to print on STDERR (a newline is appended).
# Does not return.
#
sub die_noretry($)
{
    my ($text) = @_;

    print STDERR $text, "\n";
    exit(-1);
}

46
47
48
49
50
#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
51
my $TBOPS       = "@TBOPSEMAIL@";
52
my $TESTMODE    = @TESTMODE@;
53
my $TFTP	= "/tftpboot";
54

55
56
57
58
59
60
61
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

62
my $nodereboot	= "$TB/bin/node_reboot";
63
my $os_load	= "$TB/bin/os_load";
64
my $vnode_setup = "$TB/sbin/vnode_setup";
65
my $osselect    = "$TB/bin/os_select";
66
my $dbg		= 0;
67
my $failed      = 0;
68
my $failedvnodes= 0;
69
70
my %nodes       = ();
my %vnodes      = ();
71
72
73
my %vnodephosts = ();
my %vnode2pnode = ();
my %pnodevcount = ();
74
my %osids       = ();
75
my %canfail     = ();
76
my $db_result;
77
my @row;
78

79
#
80
81
82
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid. 
83
#
84
85
my %reloads     = ();
my %reboots	= ();
86
my %willreboot  = ();
87
my $doautoload  = 1;
88
my $dolastload  = 1;
89
    
90
91
92
93
94
95
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

96
97
98
99
100
101
102
103
104
105
106
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
# Keep the option hash lexical rather than creating a package global.
my %options = ();

# Bail out on an unknown option, or if anything other than exactly the
# two required arguments (pid, eid) is left after option processing.
if (! getopts($optlist, \%options) || @ARGV != 2) {
    usage();
}

# -d turns on debugging output.
$dbg = 1
    if (defined($options{"d"}));

111
112
113
114
115
116
117
118
119
120
my ($pid, $eid) = @ARGV;

#
# Untaint args. Each must consist solely of word characters, '-' and '@';
# the value is then reassigned from the capture so that it is no longer
# considered tainted.
#
($pid =~ /^([-\@\w]+)$/)
    or die_noretry("Bad data in pid: $pid.");
$pid = $1;

($eid =~ /^([-\@\w]+)$/)
    or die_noretry("Bad data in eid: $eid.");
$eid = $1;

130
#
131
# Figure out who called us. Only root, people with admin status
132
133
# in the DB, or the owner of the experiment can run this script.
#
134
135
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
Chad Barb's avatar
   
Chad Barb committed
136
137
    die_noretry("*** $0:\n".
		"    You do not have permission to swap this experiment!");
138
139
}

140
141
TBDebugTimeStamp("os_setup started");

142
#
143
# Get the set of nodes, as well as the nodes table information for them.
144
#
145
$db_result =
146
147
    DBQueryFatal("select n.*,l.pid from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
148
149
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");
150

151
if ($db_result->numrows < 1) {	
152
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
Chad Barb's avatar
   
Chad Barb committed
153
    exit 0;
154
155
}

156
while (my %row = $db_result->fetchhash()) {
157
158
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
159
    my $type     = $row{'type'};
160
    my $bootpath = 0;
161
    my $jailnode = 0;
162

163
    #
164
165
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
166
    #
167
168
169
170
171
172
173
174
    if (TBIsNodeVirtual($node, \$jailed)) {
	$vnodes{$node} = $jailed;
	if (! $jailed) {
	    next;
	}
	$jailnode = 1;
    }
    else {
Chad Barb's avatar
   
Chad Barb committed
175
176
	my $nodeAllocState;
	TBGetNodeAllocState( $node, \$nodeAllocState );
177
	$nodes{$node}  = $node;
Chad Barb's avatar
   
Chad Barb committed
178
179
180
181
182
183
	$nodeAllocStates{$node} = $nodeAllocState;
	# only reboot node if assign_wrapper just pulled it into expt.
	# (e.g. it isnt ALLOCSTATE_RES_READY)
	if ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    $reboots{$node} = 1;
	}
184
    }
185
    $osids{$node} = $osid;
186

187
    #
188
189
190
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly. 
191
192
193
194
    #
    # Both boot path columns get exactly the same validation; only a
    # non-empty def_boot_path marks the node as having an explicit
    # boot path.
    foreach my $column ('def_boot_path', 'next_boot_path') {
	my $path = $row{$column};

	next
	    if (!defined($path) || $path eq "");

	# Strip off a leading "IP:" if present; the IP itself is ignored.
	if ($path =~ /^[0-9\.]+:(\/.*)$/) {
	    $path = $1;
	}

	#
	# Path must begin with $TFTP. $TFTP already starts with a slash,
	# so it is matched literally (\Q..\E) rather than being prefixed
	# with another slash, which would have demanded "//tftpboot/".
	# Extra leading slashes are still tolerated so that any path
	# which passed the old check continues to pass.
	#
	if ($path !~ m{^/*\Q$TFTP\E/}) {
	    die_noretry(
		"*** File $path for node $node must reside in $TFTP");
	}

	if (! -f $path) {
	    die_noretry("*** File $path for node $node does not exist!");
	}

	$bootpath = 1
	    if ($column eq 'def_boot_path');
    }

240
241
242
243
244
245
246
    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought.
    #
    foreach my $delta (split(":", $row{'deltas'})) {
247
	if (! -f $delta) {
Chad Barb's avatar
   
Chad Barb committed
248
	    die_noretry("*** Delta file $delta for node $node does not exist!");
249
250
251
252
253
254
	}
    }
    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $row{'rpms'})) {
255
	if (! -f $rpm) {
Chad Barb's avatar
   
Chad Barb committed
256
	    die_noretry("*** RPM $rpm for node $node does not exist!");
257
258
259
	}
    }
    
260
261
262
263
264
265
    #
    # XXX - Ditto for tarfiles.
    #
    foreach my $tarspec (split(":", $row{'tarballs'})) {
	my ($dir, $tar) = split(" ", $tarspec);
	
266
	if (! -f $tar) {
Chad Barb's avatar
   
Chad Barb committed
267
	    die_noretry("*** Tarfile $tar for node $node does not exist!");
268
269
	}
    }
270
271
272
273
274
275
276

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it. 
    #
277
    if (!$bootpath && !$jailnode) {
278
279
280
281
282
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but lets be careful anyway.
	# 
	if (! $osid) {
Chad Barb's avatar
   
Chad Barb committed
283
284
	    die_noretry(
	        "*** $node has no bootpath and no def_boot_osid set!");
285
286
287
288
289
290
291
292
293
	}

	#
	# Grab the info for this OSID. This is part of the image check.
	#
	my $osid_result =
	    DBQueryFatal("select * from os_info where osid='$osid'");
	
	if ($osid_result->numrows == 0) {
Chad Barb's avatar
   
Chad Barb committed
294
	    die_noretry("*** No such OSID $osid is defined!");
295
296
297
298
299
300
301
	}
	
	my %osid_row   = $osid_result->fetchhash();

	#
	# If there is an actual path, its an OSKit kernel not an image.
	# 
302
	if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
303
	    #
304
305
	    # Not an OSKit kernel.
	    # Make sure this OSID is actually loaded on the machine. 
306
	    #
307
308
309
	    my $p_result =
		DBQueryFatal("select * from partitions ".
			     "where node_id='$node' and osid='$osid'");
310
311

	    #
312
313
	    # If not loaded, then see if the user was looking for the generic
	    # name of the OS that is loaded. 
314
	    # 
315
316
	    if ($p_result->numrows == 0) {
		#
317
		# Check to see if a non specific version specified.
318
		#
319
320
321
322
		if (! defined($osid_row{'version'}) ||
		    $osid_row{'version'} eq "") {

		    #
323
324
325
326
		    # A non-specific version. There needs to be a way to
		    # map it to another osid. 
		    #
		    if (!defined($osid_row{'nextosid'})) {
Chad Barb's avatar
   
Chad Barb committed
327
328
329
			die_noretry(
			    "*** $0:\n".
			    "    No mapping can be made for $osid ($node)!");
330
331
332
333
334
335
		    }
		    my $nextosid = $osid_row{'nextosid'};
		    
		    #
		    # See if the nextosid is already on the disk. If not,
		    # it needs to be loaded.
336
337
		    # 
		    my $o_result =
338
339
340
341
342
343
344
345
346
347
348
349
350
351
			DBQueryFatal("select osid from partitions as p ".
				     "where p.node_id='$node' and ".
				     "      p.osid='$nextosid'");

		    if (! $o_result->numrows) {
			#
			# User wants a specific version of an OS, but its not
			# loaded on the machine. 
			#
			print "Mapping $osid on $node to $nextosid ".
			    "and setting up a reload.\n";
			
			SetupReload($node, $nextosid, $type);
			$osids{$node} = $nextosid;
352
353
		    }
		    else {
354
355
356
357
			#
			# Already loaded. 
			# 
			print "Mapping $osid on $node to $nextosid.\n";
358
359
360

			if ($dolastload &&
			    defined($row{'pid'}) && $row{'pid'} ne $pid) {
361
			    SetupReload($node, $nextosid, $type);
362
363
			}
			else {
364
			    system("$osselect $nextosid $node") and
Chad Barb's avatar
   
Chad Barb committed
365
366
				die_noretry("*** Could not set boot OS to ".
				    "$nextosid for $node");
367
			}
368
			$osids{$node} = $nextosid;
369
		    }
370
371
		}
		else {
372
373
		    #
		    # User wants a specific version of an OS, but its not
374
		    # loaded on the machine. 
375
		    #
376
377
378
379
380
381
382
383
384
385
		    SetupReload($node, $osid, $type);
		}
	    }
	    else {
		#
		# OSID is loaded, but might need to be cleaned. 
		#
		if ($dolastload &&
		    defined($row{'pid'}) && $row{'pid'} ne $pid) {
		    SetupReload($node, $osid, $type);
386
		}
387
388
389
	    }
	}
    }
390
    
391
392
393
394
    #
    # Set the canfail bit. Currently, sharks are always canfail=1.
    # Will come from DB at some point.
    #
395
    if ($row{'type'} eq "dnard") {
396
397
398
399
400
401
	$canfail{$node} = 1;
    }
    else {
	$canfail{$node} = 0;
    }
    
402
    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
403
	if $dbg;
404
}
405

406
#
407
# Collect some info about vnodes. 
408
409
410
411
412
413
414
415
416
417
#
foreach my $vnode (keys(%vnodes)) {
    my $jailed = $vnodes{$vnode};
    my $pnode;

    if (! $jailed) {
	next;
    }

    if (! TBPhysNodeID($vnode, \$pnode)) {
Chad Barb's avatar
   
Chad Barb committed
418
419
	die_noretry("*** $0:\n".
	    "    Cannot determine phys_nodeid for $vnode!");
420
    }
421
422
423
424
425
426
427
428
429
430

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    $pnodevcount{$pnode} = 0
	if (!defined($pnodevcount{$pnode}));
    $pnodevcount{$pnode}++;
    $vnode2pnode{$vnode} = $pnode;
431
432

    if (!exists($nodes{$pnode})) {
433
434
435
436
437
438
	#
	# Typical on remote nodes; we do not allocate the underlying
	# phys node to the experiment.
	#
	next;
    }
439
    
440
    # Nothing else to do for local jail nodes at this time ...
441
442
}

443
#
444
# We need to issue the reboots and the reloads in parallel.
445
#
446
TBDebugTimeStamp("rebooting/reloading started");
447
if (!$TESTMODE) {
448
449
450
451
    my %pids  = ();
    my $count = 0;
    my $cmd;

452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
    foreach my $imageid ( keys(%reloads) ) {
	my @list = @{ $reloads{$imageid} };

	foreach my $node (@list) {
	    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
	    # No point in rebooting, obviously.
	    delete $reboots{$node};
	}

	sleep(5);
	$pids{"$os_load -m $imageid @list"} =
	    ForkCmd("$os_load -m $imageid @list");
    }

467
    if (keys(%reboots)) {
Chad Barb's avatar
   
Chad Barb committed
468
469
	foreach my $node (keys(%reboots)) {
	    if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
Chad Barb's avatar
   
Chad Barb committed
470
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
Chad Barb's avatar
   
Chad Barb committed
471
472
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
	    } else {
Chad Barb's avatar
   
Chad Barb committed
473
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
Chad Barb's avatar
   
Chad Barb committed
474
475
476
477
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
	    }
	}

478
479
480
481
482
483
484
485
486
487
	$cmd = "$nodereboot " . join(" ", keys(%reboots));
	$pids{$cmd} = ForkCmd($cmd);
    }

    foreach $cmd ( keys(%pids) ) {
	my $pid = $pids{$cmd};

	waitpid($pid, 0);
	if ($?) {
	    $failed++;
488
	    print "*** Failed: $cmd\n";
489
490
	}
    }
491
}
492
TBDebugTimeStamp("rebooting/reloading finished");
493
sleep(2);
494

495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
foreach my $imageid ( keys(%reloads) ) {
    my @list = @{ $reloads{$imageid} };

    foreach my $node ( @list ) {
	my $mode;

	if (!TBGetNodeOpMode($node, \$mode)) {
	    print "*** Error getting operational mode for $node!\n";
	    $failed++;
	    delete($nodes{$node});
	    # Already counted as failed; $mode cannot be trusted, so do
	    # not fall into the comparison below.
	    next;
	}

	# Still in reload mode after os_load returned: the reload failed,
	# so drop the node from the set we wait on below.
	if (defined($mode) && $mode eq TBDB_NODEOPMODE_RELOAD) {
	    print "*** Not waiting for $node since its reload failed!\n";
	    $failed++;
	    delete($nodes{$node});
	}
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

527
#
528
529
530
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node at least 1 second chance. Avoids pointless
# experiment failures.
531
#
532
if (@nodelist) {
533
534
535
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

536
537
my %retries;
my %waitstart;
538
foreach my $node ( @nodelist ) {
539
540
541
542
    $retries{$node} = 1;
    $waitstart{$node} = time;
}

543
TBDebugTimeStamp("Local node waiting started");
544
545
546
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};
547

548
549
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*7))) {
	print "$node is alive and well\n";
550
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
Chad Barb's avatar
   
Chad Barb committed
551
552
	TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();	
553
554
	next;
    }
555

556
557
558
559
560
561
    if ($retries{$node}) {
	$retries{$node} -= 1;

	print "*** Rebooting $node and waiting again ...\n";
	
	if (system("$nodereboot $node") == 0) {
562
	    push(@nodelist, $node);
563
564
565
566
567
	    $waitstart{$node} = time;
	    next;
	}
	# Fall through on failure.
    }
568

569
570
571
572
573
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
	
    print "*** WARNING: $node may be down.\n".
	  "    This has been reported to testbed-ops.\n";
	
574
575
576
577
    if ($canfail{$node}) {
	# Send mail to testbed-ops and to the user about it.
	my ($user) = getpwuid($UID);
	
578
	SENDMAIL($user, "Node $node is down",
579
580
581
582
583
584
585
586
587
588
589
590
591
		 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
		 "Your experiment will continue to run since this failure\n".
		 "is nonfatal, although you might encounter other problems\n".
		 "if your experiment depends explicitly on this node.\n".
		 "You should terminate this experiment if it cannot ".
		 "tolerate this failure.\n\n".
		 "Testbed Operations has also been notified so they can ".
		 "investigate.\n\n".
		 "Thanks\n".
		 "Testbed Operations\n",
		 0,
		 "Cc: $TBOPS");

592
593
	print "*** Continuing with experiment setup anyway ...\n";
	next;
594
595
    }

596
597
    # Reserve it to down experiment.
    MarkNodeDown($node);
Chad Barb's avatar
   
Chad Barb committed
598
599
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();	
600

601
602
603
604
605
606
    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
	     "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
	     "$node has been taken out of the pool until this matter ".
	     "is resolved.\n");

607
    $failed++;
608
}
609
TBDebugTimeStamp("Local node waiting finished");
610

611
612
613
614
615
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
#
my @vnodelist = keys(%vnodes);

#
# Set the allocstate for the local vnodes that were successfully rebooted
# and came to ISUP above. These do not need to be setup again! We move
# them to RES_READY, so vnode_setup will ignore them. If they fail to
# hit ISUP, we will move them to DOWN so that vnode_setup will ignore
# them again, in the teardown phase.
#
# Note, we do this even if there were failures above, since the teardown
# phase is going to happen, and we want vnode_setup to know which nodes
# came up with phynodes okay (need to be torndown) and which ones never
# had the chance (no need to teardown). Think swapmod, which does teardown
# in the ACTIVATING state.
#
foreach my $vnode (@vnodelist) {
    my $pnode = $vnode2pnode{$vnode};

    # Non-jailed vnodes are skipped by the vnode info loop before a
    # vnode-to-pnode mapping is recorded, so there is nothing to look up.
    next
	if (!defined($pnode));

    # Remote node, always does setup.
    next
	if (!exists($nodes{$pnode}));

    # The physical host came up okay, so the vnode on it is ready too.
    if ($nodeAllocStates{$pnode} eq TBDB_ALLOCSTATE_RES_READY()) {
	TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$vnode} = TBDB_ALLOCSTATE_RES_READY();
    }
}

645
#
646
647
648
649
650
651
652
653
654
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
	"failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";
655

656
    system("$vnode_setup $pid $eid");
657
    if ($?) {
Chad Barb's avatar
   
Chad Barb committed
658
659
	die_noretry("*** $0:\n".
	    "    Vnode setup failed!");
660
661
    }

662
    foreach my $node (@vnodelist) {
663
664
	$waitstart{$node} = time;
    }
665
    print "Waiting for virtual testbed nodes to finish setting up ...\n";
666

667
668
669
    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
	my $node   = shift(@vnodelist);
670
	my $pnode  = $vnode2pnode{$node};
671
	my $wstart = $waitstart{$node};
672
	my $maxwait = 90 + (30 * $pnodevcount{$pnode});
673

674
	if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
675
	    print "$node is alive and well\n";
676
	    # Might have already been set above.
677
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY);
678
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY;
679
680
681
682
683
	    SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	    next;
	}

	SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
684
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
685
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN;
686
	
687
	print "*** WARNING: $node may be down.\n";
688
689
690
691
692
	
	if ($canfail{$node}) {
	    # Send mail to testbed-ops and to the user about it.
	    my ($user) = getpwuid($UID);
	
693
	    SENDMAIL($user, "Virtual Node $node is down",
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
		 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
		 "Your experiment will continue to run since this failure\n".
		 "is nonfatal, although you might encounter other problems\n".
		 "if your experiment depends explicitly on this node.\n".
		 "You should terminate this experiment if it cannot ".
		 "tolerate this failure.\n\n".
		 "Testbed Operations has also been notified so they can ".
		 "investigate.\n\n".
		 "Thanks\n".
		 "Testbed Operations\n",
		 0,
		 "Cc: $TBOPS");

	    print "*** Continuing with experiment setup anyway ...\n";
	    next;
	}
710
	$failedvnodes++;
711
    }
712
    TBDebugTimeStamp("Virtual node waiting finished");
713
714
}

715
print "OS Setup Done.\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
716
print "*** There were $failed failed nodes\n"
717
    if ($failed);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
718
print "*** There were $failedvnodes failed virtual nodes\n"
719
720
    if ($failedvnodes);

721
TBDebugTimeStamp("os_setup finished");
Chad Barb's avatar
   
Chad Barb committed
722

723
724
725
726
727
728
# No retry if vnodes failed. Indicates a fatal problem. 
exit(-1)
    if ($failedvnodes);
exit(1)
    if ($failed);
exit 0;
729

730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
#
# Map an OSID to an imageid for a node type.
#
# Arguments: the osid and the node type to look up in the osidtoimageid
# table. Returns the imageid from the first matching row, or 0 if the
# query matched no rows.
#
sub TBMapOSIDtoImageID($$)
{
    my $osid = shift;
    my $type = shift;

    my $sql  = "select imageid from osidtoimageid ".
	       "where type='$type' and osid='$osid'";
    my $result = DBQueryFatal($sql);

    return 0
	if (! $result->numrows);

    my @fields = $result->fetchrow_array();
    return $fields[0];
}

749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
#
# Setup a reload of a node if we can find an image.
#
# The node is appended to the list kept in %reloads under the imageid
# that TBMapOSIDtoImageID() returns for ($osid, $type); the list is
# created on first use. If no imageid can be found, the script exits via
# die_noretry().
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;
    my $imageid = TBMapOSIDtoImageID($osid, $type);

    if (! $imageid) {
	die_noretry("*** $0:\n".
	    "    No image can be found for $osid on $node!");
    }

    # Autovivifies the per-image list the first time an imageid is seen.
    push(@{ $reloads{$imageid} }, $node);
}

771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
#
# Fork a process to run a command. Return the pid to wait on.
#
# Arguments: the command line, which is handed to system() in the child.
# Returns:   (parent only) the pid of the child. The child never returns;
#            it exits with the command's exit code, or with 1 if the
#            command could not be started or was killed by a signal, so
#            that the parent's check of $? after waitpid() always sees
#            a failure as nonzero.
#
# If the fork itself fails, the script exits via die_noretry() rather
# than letting the parent fall into the child-only code below.
#
sub ForkCmd($) {
    my ($cmd) = @_;
    my $mypid = fork();

    if (!defined($mypid)) {
	die_noretry("*** $0:\n".
	    "    Could not fork to run '$cmd': $!");
    }
    if ($mypid) {
	return $mypid;
    }

    # Child from here on.
    if ($dbg) {
	print STDERR "Forking command: $cmd\n";
    }

    system($cmd);

    # $? is -1 when the command could not be run at all, and has its low
    # seven bits set when the command died from a signal. In the signal
    # case ($? >> 8) is zero and would look like success.
    if ($? == -1 || ($? & 127)) {
	exit(1);
    }
    exit($? >> 8);
}