#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#

9
use English;
10
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11
require 'ctime.pl';
12

13
#
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
#
# TODO: Reload disk images.
#
# usage: os_setup [-d] <pid> <eid>
#
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

28
29
sub usage()
{
    # Print the command line synopsis on STDERR and exit with -1.
    # The synopsis lists -d because the getopts option list ("d")
    # accepts it.
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
33
# Option letters handed to getopts(); -d switches on debug output ($dbg).
my  $optlist = "d";
34

Chad Barb's avatar
   
Chad Barb committed
35
36
37
38
39
40
41
42
43
44
45
#
# Fatal error exit for problems that another attempt is unlikely to fix.
# Writes the message (plus a newline) to STDERR and exits with -1, the
# code that tells the caller (tbswap) not to retry.
#
sub die_noretry($)
{
    my ($text) = @_;

    print STDERR $text . "\n";
    exit(-1);
}

46
47
48
49
50
#
# Configure variables. The @...@ tokens are placeholders in this .in
# template; they are not Perl arrays.
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
my $TBOPS       = "@TBOPSEMAIL@";
my $TESTMODE    = @TESTMODE@;
# Directory that boot files named in the nodes table must live under.
my $TFTP	= "/tftpboot";
54

55
56
57
58
59
60
61
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

62
# External commands run by this script.
my $nodereboot	= "$TB/bin/node_reboot";
my $os_load	= "$TB/bin/os_load";
my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";

# Debug flag (set by -d) and failure counters reported at exit.
my $dbg		= 0;
my $failed      = 0;
my $failedvnodes= 0;
my $failedplab  = 0;

# Per-node bookkeeping, all keyed by node id.
my %nodes       = ();	# Physical nodes we reboot and/or wait for.
my %vnodes      = ();	# Virtual nodes; value is true for jail/plab vnodes.
my %vnodephosts = ();
my %vnode2pnode = ();	# vnode -> physical node hosting it.
my %pnodevcount = ();	# Physical node -> number of jailed vnodes on it.
my %plabvnodes  = ();	# vnode -> plab flag from the DB row.
my %osids       = ();	# Node -> OSID it will boot.
my %canfail     = ();	# Node -> true if its failure is not fatal.
# Last allocstate this script read or set for each node. The rest of the
# script uses this hash; declaring it here keeps it lexical like the others.
my %nodeAllocStates = ();
my $db_result;
my @row;

#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my %reloads     = ();
my %reboots	= ();	# Nodes that need a plain reboot.
my %rebooted    = ();	# Nodes that were reloaded or rebooted by this run.
my $doautoload  = 1;
my $dolastload  = 1;	# Reload a node whose last reservation was another pid.
91
    
92
93
94
95
96
97
# Un-taint the environment (the script runs under -T): pin PATH to a fixed
# value and drop the variables that influence how a shell starts up.
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Setting $| to 1 turns on autoflush for the currently selected output
# handle, so output is written immediately rather than held in a buffer.
$| = 1;

98
99
100
101
102
103
104
105
106
107
108
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
# Exactly two positional arguments are required: <pid> <eid>.
if (@ARGV != 2) {
    usage();
}
if (defined($options{"d"})) {
    $dbg = 1;
}

113
114
115
116
117
118
119
120
121
122
my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args. Only word characters, '-' and '@' are accepted; the value
# is reassigned from the regex capture so taint mode treats it as clean.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die_noretry("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die_noretry("Bad data in eid: $eid.");
}

132
#
# Figure out who called us. Only root, people with admin status
# in the DB, or the owner of the experiment can run this script.
#
# $UID is 0 (false) for root, which short-circuits the library checks.
#
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
    die_noretry("*** $0:\n".
		"    You do not have permission to swap this experiment!");
}

142
143
TBDebugTimeStamp("os_setup started");

#
# Get the set of nodes, as well as the nodes table information for them.
# Each reserved node is joined with its nodes row, its last_reservation
# row (for the previous pid) and its node_types row.
#
$db_result =
    DBQueryFatal("select n.*,l.pid,nt.* from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "left join node_types as nt on nt.type=n.type ".
		 "where r.pid='$pid' and r.eid='$eid'");

# An experiment without nodes is not an error; there is just nothing to do.
if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit 0;
}

159
#
# Walk the rows returned by the node query. For each node: classify it
# (virtual, subnode, physical), sanity check the boot paths and the
# delta/RPM/tarball files, decide whether its OSID needs a reload or a
# remap, and record the canfail bit.
#
while (my %row = $db_result->fetchhash()) {
    my $node     = $row{'node_id'};
    my $osid     = $row{'def_boot_osid'};
    my $type     = $row{'type'};
    my $subnode  = $row{'issubnode'};
    my $virtnode = $row{'isvirtnode'};
    my $jailnode = $row{'jailflag'};
    my $plabnode = $row{'isplabdslice'};
    my $bootpath = 0;

    #
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
    #
    if ($virtnode) {
        $vnodes{$node}     = ($jailnode || $plabnode);
        $plabvnodes{$node} = $plabnode;
        # Plain (non-jail, non-plab) vnodes need none of the checks below.
        next
            if (! $jailnode && ! $plabnode);
    }
    elsif ($subnode) {
        # Not added to %nodes, so it is neither rebooted nor waited for,
        # but the file and OSID checks below still run for it.
        print "Skipping subnode $node\n";
    }
    else {
        my $nodeAllocState;
        TBGetNodeAllocState($node, \$nodeAllocState);
        $nodes{$node}           = $node;
        $nodeAllocStates{$node} = $nodeAllocState;
        # only reboot node if assign_wrapper just pulled it into expt.
        # (e.g. it isnt ALLOCSTATE_RES_READY)
        if ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
            $reboots{$node} = 1;
        }
    }
    $osids{$node} = $osid;

    #
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly.
    #
    foreach my $field ('def_boot_path', 'next_boot_path') {
        my $path = $row{$field};

        next
            if (!defined($path) || $path eq "");

        # Split out IP address if it exists; only the path part is checked.
        if ($path =~ /^([0-9\.]+):(\/.*)$/) {
            $path = $2;
        }

        #
        # Path must begin with $TFTP. $TFTP already starts with a slash,
        # so the old pattern (an escaped slash followed by $TFTP) only
        # matched paths starting with two slashes. Accept the normal
        # single-slash form, and keep accepting extra leading slashes so
        # entries that satisfied the old check still pass.
        #
        if ($path !~ m{^/*\Q$TFTP\E/}) {
            die_noretry(
                "*** File $path for node $node must reside in $TFTP");
        }

        if (! -f $path) {
            die_noretry("*** File $path for node $node does not exist!");
        }

        # Only an explicit default boot path bypasses the OSID checks.
        $bootpath = 1
            if ($field eq 'def_boot_path');
    }

    #
    # XXX - Check for existence of the delta files. We do this here
    # cause its easier than looking for a failure later, when the node
    # tries to install the delta. Not a general solution though. Needs
    # more thought.
    #
    # The columns may be NULL (undef); treat that as an empty list.
    #
    my $deltas   = defined($row{'deltas'})   ? $row{'deltas'}   : "";
    my $rpms     = defined($row{'rpms'})     ? $row{'rpms'}     : "";
    my $tarballs = defined($row{'tarballs'}) ? $row{'tarballs'} : "";

    foreach my $delta (split(":", $deltas)) {
        if (! -f $delta) {
            die_noretry("*** Delta file $delta for node $node does not exist!");
        }
    }

    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $rpms)) {
        if (! -f $rpm) {
            die_noretry("*** RPM $rpm for node $node does not exist!");
        }
    }

    #
    # XXX - Ditto for tarfiles. Each entry is "<dir> <tarfile>".
    #
    foreach my $tarspec (split(":", $tarballs)) {
        my ($dir, $tar) = split(" ", $tarspec);

        if (! -f $tar) {
            die_noretry("*** Tarfile $tar for node $node does not exist!");
        }
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && !$jailnode && !$plabnode) {
        #
        # These checks are not necessary if the front end and web page
        # are doing the right thing, but lets be careful anyway.
        #
        if (! $osid) {
            die_noretry(
                "*** $node has no bootpath and no def_boot_osid set!");
        }

        #
        # Grab the info for this OSID. This is part of the image check.
        #
        my $osid_result =
            DBQueryFatal("select * from os_info where osid='$osid'");

        if ($osid_result->numrows == 0) {
            die_noretry("*** No such OSID $osid is defined!");
        }

        my %osid_row = $osid_result->fetchhash();

        #
        # If there is an actual path, its an OSKit kernel not an image.
        #
        if (! defined($osid_row{'path'}) || $osid_row{'path'} eq "") {
            #
            # Not an OSKit kernel.
            # Make sure this OSID is actually loaded on the machine.
            #
            my $p_result =
                DBQueryFatal("select * from partitions ".
                             "where node_id='$node' and osid='$osid'");

            #
            # If not loaded, then see if the user was looking for the generic
            # name of the OS that is loaded.
            #
            if ($p_result->numrows == 0) {
                #
                # Check to see if a non specific version specified.
                #
                if (! defined($osid_row{'version'}) ||
                    $osid_row{'version'} eq "") {

                    #
                    # A non-specific version. There needs to be a way to
                    # map it to another osid.
                    #
                    if (!defined($osid_row{'nextosid'})) {
                        die_noretry(
                            "*** $0:\n".
                            "    No mapping can be made for $osid ($node)!");
                    }
                    my $nextosid = $osid_row{'nextosid'};

                    #
                    # See if the nextosid is already on the disk. If not,
                    # it needs to be loaded.
                    #
                    my $o_result =
                        DBQueryFatal("select osid from partitions as p ".
                                     "where p.node_id='$node' and ".
                                     "      p.osid='$nextosid'");

                    if (! $o_result->numrows) {
                        #
                        # The OSID the generic name maps to is not on the
                        # disk either, so schedule a reload of it.
                        #
                        print "Mapping $osid on $node to $nextosid ".
                            "and setting up a reload.\n";

                        SetupReload($node, $nextosid, $type);
                        $osids{$node} = $nextosid;
                    }
                    else {
                        #
                        # Already loaded.
                        #
                        print "Mapping $osid on $node to $nextosid.\n";

                        # Reload anyway if the node was last reserved to
                        # another project; otherwise just select the OS.
                        if ($dolastload &&
                            defined($row{'pid'}) && $row{'pid'} ne $pid) {
                            SetupReload($node, $nextosid, $type);
                        }
                        else {
                            system("$osselect $nextosid $node") and
                                die_noretry("*** Could not set boot OS to ".
                                    "$nextosid for $node");
                        }
                        $osids{$node} = $nextosid;
                    }
                }
                else {
                    #
                    # User wants a specific version of an OS, but its not
                    # loaded on the machine.
                    #
                    SetupReload($node, $osid, $type);
                }
            }
            else {
                #
                # OSID is loaded, but might need to be cleaned.
                #
                if ($dolastload &&
                    defined($row{'pid'}) && $row{'pid'} ne $pid) {
                    SetupReload($node, $osid, $type);
                }
            }
        }
    }

    #
    # Set the canfail bit. Currently only plab nodes are canfail=1.
    # Will come from DB at some point.
    #
    if ($plabnode) {
        $canfail{$node} = 1;
    }
    else {
        $canfail{$node} = 0;
    }

    print STDERR "$node - $osids{$node} - $canfail{$node}\n"
        if $dbg;
}
414

415
#
# Collect some info about vnodes. Only vnodes flagged true in %vnodes
# (jail or plab) are examined.
#
foreach my $vnode (keys(%vnodes)) {
    my $jailed = $vnodes{$vnode};
    my $pnode;

    if (! $jailed) {
        next;
    }

    if (! TBPhysNodeID($vnode, \$pnode)) {
        die_noretry("*** $0:\n".
            "    Cannot determine phys_nodeid for $vnode!");
    }

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    $pnodevcount{$pnode} = 0
        if (!defined($pnodevcount{$pnode}));
    $pnodevcount{$pnode}++;
    $vnode2pnode{$vnode} = $pnode;

    if (!exists($nodes{$pnode})) {
        #
        # Typical on remote nodes; we do not allocate the underlying
        # phys node to the experiment.
        #
        next;
    }

    # Nothing else to do for local jail nodes at this time ...
}

452
#
# We need to issue the reboots and the reloads in parallel.
# One child is forked per image (os_load) and one for all plain reboots
# (node_reboot); then we wait for every child and count the failures.
#
TBDebugTimeStamp("rebooting/reloading started");
if (!$TESTMODE) {
    my %pids = ();	# Command line -> pid of the child running it.
    my $cmd;

    foreach my $imageid ( keys(%reloads) ) {
        my @list = @{ $reloads{$imageid} };

        foreach my $node (@list) {
            TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
            $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
            # No point in rebooting, obviously, but it does reboot!
            delete $reboots{$node};
            $rebooted{$node} = 1;
        }

        sleep(5);
        $cmd = "$os_load -m $imageid @list";
        $pids{$cmd} = ForkCmd($cmd);
    }

    if (keys(%reboots)) {
        foreach my $node (keys(%reboots)) {
            # Record whether the node is being rebooted from a clean state.
            if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
                TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
                $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
            }
            else {
                TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
                $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
            }
            # See below, needed for vnode_setup.
            $rebooted{$node} = 1;
        }

        $cmd = "$nodereboot " . join(" ", keys(%reboots));
        $pids{$cmd} = ForkCmd($cmd);
    }

    foreach $cmd ( keys(%pids) ) {
        # Named $childpid so it does not shadow the project id in $pid.
        my $childpid = $pids{$cmd};

        waitpid($childpid, 0);
        if ($?) {
            $failed++;
            print "*** Failed: $cmd\n";
        }
    }
}
TBDebugTimeStamp("rebooting/reloading finished");
sleep(2);
506

507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
#
# XXX What happens if something above fails? We could exit, but some nodes
# that *are* rebooting would be caught in the middle. For the nodes that
# were reloaded, we can check the state right away (and avoid the wait
# below as well); they should be in the ISUP state when os_load is
# finished.  If not, thats a failure and we can save some time below.  For
# plain reboot failures, nothing to do but find out below after the wait.
# I do not want to exit right away cause we might end up with a lot more
# power cycles since the nodes are very likely to be in a non responsive
# state if just rebooted!
#
# Check every node that was handed to os_load. A node whose operational
# mode cannot be read, or that is still in the reload mode, counts as a
# failure and is removed from %nodes so we do not wait for it below.
foreach my $imageid ( keys(%reloads) ) {
    my @list = @{ $reloads{$imageid} };

    foreach my $node ( @list ) {
        my $mode;

        if (!TBGetNodeOpMode($node, \$mode)) {
            print "*** Error getting operational mode for $node!\n";
            $failed++;
            delete($nodes{$node});
            # Without a mode the comparison below is meaningless, and the
            # node must not be counted as failed a second time.
            next;
        }
        if (defined($mode) && $mode eq TBDB_NODEOPMODE_RELOAD) {
            print "*** Not waiting for $node since its reload failed!\n";
            $failed++;
            delete($nodes{$node});
        }
    }
}
# Remaining nodes we need to wait for.
my @nodelist = keys(%nodes);

539
#
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node at least one second chance. Avoids
# pointless experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

my %retries;	# Node -> number of extra reboot attempts still allowed.
my %waitstart;	# Node -> time at which we started waiting for it.
foreach my $node ( @nodelist ) {
    $retries{$node}   = 1;
    $waitstart{$node} = time;
}

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};

    # A false return from TBNodeStateWait is treated as "reached ISUP".
    if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, (60*7))) {
        print "$node is alive and well\n";
        SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
        TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
        $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
        next;
    }

    if ($retries{$node}) {
        $retries{$node} -= 1;

        print "*** Rebooting $node and waiting again ...\n";

        # Requeue the node at the end of the list with a fresh start time.
        if (system("$nodereboot $node") == 0) {
            push(@nodelist, $node);
            $waitstart{$node} = time;
            next;
        }
        # Fall through on failure.
    }

    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    print "*** WARNING: $node may be down.\n".
          "    This has been reported to testbed-ops.\n";

    if ($canfail{$node}) {
        # Send mail to testbed-ops and to the user about it.
        my ($user) = getpwuid($UID);

        SENDMAIL($user, "Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

        # A canfail node does not count towards $failed.
        print "*** Continuing with experiment setup anyway ...\n";
        next;
    }

    # Reserve it to down experiment.
    MarkNodeDown($node);
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

    # Send mail to testbed-ops about it
    SENDMAIL($TBOPS, "Node $node is down",
             "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
             "$node has been taken out of the pool until this matter ".
             "is resolved.\n");

    $failed++;
}
TBDebugTimeStamp("Local node waiting finished");
621

622
623
624
625
626
#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
#
my @vnodelist = keys(%vnodes);

#
# Set the allocstate for the local vnodes that were sucessfully rebooted
# and came to ISUP above. These do not need to be setup again! We move
# them to RES_READY, so vnode_setup will ignore them. If they fail to
# hit ISUP, we will move them to DOWN so that vnode_setup will ignore
# them again, in the teardown phase.
#
# Note, we do this even if there were failures above, since the teardown
# phase is going to happen, and we want vnode_setup to know which nodes
# came up with phynodes okay (need to be torndown) and which ones never
# had the chance (no need to teardown). Think swapmod, which does teardown
# in the ACTIVATING state.
#
foreach my $vnode (@vnodelist) {
    my $pnode = $vnode2pnode{$vnode};

    # Vnodes that are neither jail nor plab never get a pnode mapping;
    # skip them rather than using undef as a hash key.
    next
        if (!defined($pnode));

    # Remote node, always does setup.
    next
        if (!exists($nodes{$pnode}));

    # Not rebooted, so leave allocstate alone for vnode_setup.
    next
        if (!exists($rebooted{$pnode}));

    if ($nodeAllocStates{$pnode} eq TBDB_ALLOCSTATE_RES_READY()) {
        TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
        $nodeAllocStates{$vnode} = TBDB_ALLOCSTATE_RES_READY();
    }
}

659
#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
if ($failed && @vnodelist) {
    print "*** Skipping virtual node setup since there were previous ".
        "failures!\n";
}
elsif (@vnodelist) {
    print "Setting up virtual testbed nodes ...\n";

    system("$vnode_setup $pid $eid");
    if ($?) {
        die_noretry("*** $0:\n".
            "    Vnode setup failed!");
    }

    foreach my $node (@vnodelist) {
        $waitstart{$node} = time;
    }
    print "Waiting for virtual testbed nodes to finish setting up ...\n";

    TBDebugTimeStamp("Virtual node waiting started");
    while ( @vnodelist ) {
        my $node   = shift(@vnodelist);
        my $pnode  = $vnode2pnode{$node};
        my $wstart = $waitstart{$node};

        # Wait longer the more jailed vnodes share the physical node.
        # Vnodes without a pnode mapping just get the base timeout.
        my $vcount = 0;
        if (defined($pnode) && defined($pnodevcount{$pnode})) {
            $vcount = $pnodevcount{$pnode};
        }
        my $maxwait = 90 + (100 * $vcount);

        if (!TBNodeStateWait($node, TBDB_NODESTATE_ISUP, $wstart, $maxwait)) {
            print "$node is alive and well\n";
            # Might have already been set above.
            TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
            $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
            SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
            next;
        }

        SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);
        TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
        $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

        print "*** WARNING: $node may be down.\n";

        if ($canfail{$node}) {
            # Send mail to testbed-ops and to the user about it.
            my ($user) = getpwuid($UID);

            SENDMAIL($user, "Virtual Node $node is down",
                 "Node $node in pid/eid $pid/$eid appears to be dead.\n\n".
                 "Your experiment will continue to run since this failure\n".
                 "is nonfatal, although you might encounter other problems\n".
                 "if your experiment depends explicitly on this node.\n".
                 "You should terminate this experiment if it cannot ".
                 "tolerate this failure.\n\n".
                 "Testbed Operations has also been notified so they can ".
                 "investigate.\n\n".
                 "Thanks\n".
                 "Testbed Operations\n",
                 0,
                 "Cc: $TBOPS");

            # A canfail vnode is not added to either failure counter.
            print "*** Continuing with experiment setup anyway ...\n";
            next;
        }

        # Plab and other vnode failures are counted separately because
        # they lead to different exit codes.
        if ($plabvnodes{$node}) {
            $failedplab++;
        }
        else {
            $failedvnodes++;
        }
    }
    TBDebugTimeStamp("Virtual node waiting finished");
}

734
# Summarize the failure counters and turn them into the exit code.
print "OS Setup Done.\n";
print "*** There were $failed failed nodes\n"
    if ($failed);
print "*** There were $failedvnodes failed virtual nodes\n"
    if ($failedvnodes);
print "*** There were $failedplab failed plab nodes\n"
    if ($failedplab);

TBDebugTimeStamp("os_setup finished");

# No retry if vnodes failed. Indicates a fatal problem.
exit(-1)
    if ($failedvnodes);
# Physical node or plab failures exit with 1.
exit(1)
    if ($failed || $failedplab);
exit 0;
750

751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
#
# Look up the imageid that the osidtoimageid table associates with an
# OSID on a given node type. Returns the imageid from the first row,
# or 0 when the table has no matching row.
#
sub TBMapOSIDtoImageID($$)
{
    my ($osid, $type) = @_;

    my $sql = "select imageid from osidtoimageid ".
              "where type='$type' and osid='$osid'";
    my $res = DBQueryFatal($sql);

    return 0
        unless ($res->numrows);

    my @fields = $res->fetchrow_array();
    return $fields[0];
}

770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
#
# Setup a reload of a node if we can find an image.
#
# Maps ($osid, $type) to an imageid and appends $node to the list kept
# for that imageid in %reloads (a hash of array refs). Exits through
# die_noretry when no image maps to the OSID.
#
sub SetupReload($$$)
{
    my ($node, $osid, $type) = @_;

    my $imageid = TBMapOSIDtoImageID($osid, $type);

    if (!$imageid) {
        die_noretry("*** $0:\n".
            "    No image can be found for $osid on $node!");
    }

    # push autovivifies the list the first time an imageid is seen.
    push(@{ $reloads{$imageid} }, $node);
}

792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
#
# Fork a process to exec a command. Return the pid to wait on.
#
# In the parent this returns the child's pid. The child runs $cmd through
# system() and exits; it never returns to the caller. The child's exit
# status is 0 only if the command exited with status 0, so a waitpid()
# followed by a test of $? in the parent reliably detects failure.
#
# If fork() itself fails the script exits through die_noretry.
# NOTE(review): treating a fork failure as "no retry" is a conservative
# choice; confirm that it suits the caller.
#
sub ForkCmd($) {
    my ($cmd) = @_;
    my $mypid = fork();

    #
    # fork() returns undef on failure. That must not be confused with the
    # 0 returned in the child, or the parent would run the command itself
    # and then exit.
    #
    if (!defined($mypid)) {
        die_noretry("*** $0:\n".
                    "    Could not fork to run '$cmd': $!");
    }
    if ($mypid) {
        return $mypid;
    }

    # Child from here on.
    if ($dbg) {
        print STDERR "Forking command: $cmd\n";
    }

    system($cmd);
    my $status = $?;

    exit(0)
        if ($status == 0);
    # -1 means the command could not be started, and a nonzero low seven
    # bits mean it died from a signal. In both cases the high byte may be
    # zero, so force a nonzero exit code.
    exit(1)
        if ($status == -1 || ($status & 127));
    exit($status >> 8);
}