os_setup.in 50.8 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5 6
# All rights reserved.
#
7
use English;
8
use Getopt::Std;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
9
require 'ctime.pl';
10
use POSIX ":sys_wait_h";
11

12
#
13 14 15 16
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
17
#
18
# TODO: Reload disk images.
19
#
20
# usage: os_setup <pid> <eid>
21
#
Chad Barb's avatar
 
Chad Barb committed
22 23 24 25 26
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#

27 28
#
# Print the command line synopsis on STDERR and exit with status -1.
# Never returns.
#
sub usage()
{
    my $synopsis = "Usage: os_setup [-d] <pid> <eid>\n";

    print STDERR $synopsis;
    exit(-1);
}
32
my  $optlist = "d";
33 34 35 36 37 38

#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
39
my $TBOPS       = "@TBOPSEMAIL@";
40
my $TESTMODE    = @TESTMODE@;
41
my $TFTP	= "/tftpboot";
42
my $PGENISUPPORT= @PROTOGENI_SUPPORT@;
43

44 45 46 47 48
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
49 50
use libreboot;
use libosload;
51
use libtestbed;
Kevin Atkinson's avatar
Kevin Atkinson committed
52
use libtblog;
53
use libArchive;
54
use Template;
55
use NodeType;
56
use Experiment;
57
use Image;
58
use OSinfo;
59
use User;
60
use Node;
61 62 63
if ($PGENISUPPORT) {
    require libGeni;
}
64

Leigh B. Stoller's avatar
Leigh B. Stoller committed
65 66
TBDebugTimeStampsOn();

67
my $vnode_setup = "$TB/sbin/vnode_setup";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
68
my $nodereboot  = "$TB/bin/node_reboot";
69
my $elab_setup  = "$TB/sbin/elabinelab";
70
my $dbg		= 0;
71
my $failed      = 0;
72
my $noretry     = 0;
73
my $failedvnodes= 0;
74
my $failedplab  = 0;
75
my $failedgeni  = 0;
76
my $canceled    = 0;
77
my %nodeobjs    = ();
78 79
my %nodes       = ();
my %vnodes      = ();
80
my %sharednodes = ();
81 82 83
my %vnodephosts = ();
my %vnode2pnode = ();
my %pnodevcount = ();
84
my %plabvnodes  = ();
85
my %geninodes   = ();
86
my %einenodes	= ();
87
my %osids       = ();
88
my %osmap       = ();
89
my %canfail     = ();
90 91 92
my %bios_waittime   = ();	# Indexed by node_type.
my %reboot_waittime = ();	# Indexed by osid.
my %node_types  = ();		# Indexed by node_id.
Kevin Atkinson's avatar
 
Kevin Atkinson committed
93
my %vname = ();                 # Indexed by node_id.
94 95
my $plab_setup_pid;		# Run plab setup in parallel.
my $geni_setup_pid;		# Run geni setup in parallel.
96

Kevin Atkinson's avatar
 
Kevin Atkinson committed
97 98
#
# This variable keeps track of the failed nodes of all types.
Kevin Atkinson's avatar
 
Kevin Atkinson committed
99 100 101 102 103 104 105
#   Indexed by node_id.
#   values = ['boot'|'reload', 'fatal'|'nonfatal']
#   NOTE(review): this comment used to list 'osload' as the second tag,
#   but the helper below stores 'reload'; confirm what the consumers of
#   this hash compare against.
my %failed_nodes = ();

# Record a node that failed to boot; the failure is fatal.
sub add_failed_node_fatal($)
{
    my ($node) = @_;

    $failed_nodes{$node} = ['boot', 'fatal'];
}

# Record a node that failed to boot; the failure is not fatal.
sub add_failed_node_nonfatal($)
{
    my ($node) = @_;

    $failed_nodes{$node} = ['boot', 'nonfatal'];
}

# Record a node whose reload/reboot request failed; always fatal.
sub add_failed_node_reload($)
{
    my ($node) = @_;

    $failed_nodes{$node} = ['reload', 'fatal'];
}

my @all_nodes; # list of all nodes before any are deleted from %nodes
Kevin Atkinson's avatar
 
Kevin Atkinson committed
106

107
#
108 109
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
110
# imageid, a list of the nodes to pass to os_load for that imageid.
111
#
112 113
my %reloads     = ();
my %reboots	= ();
114
my %reconfigs	= ();
115
my %rebooted    = ();
116
my $doautoload  = 1;
117
my $dolastload  = 1;
118

119 120 121
# Protos
sub SetupReload($$$);
sub FirewallSetup($);
122
sub os_setup_one($$$;$);
123
sub KillChildren();
124
				  
125 126 127 128 129 130
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

Kevin Atkinson's avatar
 
Kevin Atkinson committed
131 132 133 134 135

#
# Used to die with a -1 return code, to indicate to caller (tbswap)
# that the failure is not likely to be fixed with another attempt.
#
136
#
# die_noretry([$parms,] $mesg)
#
# Report $mesg through tberror(), kill any children, and exit with -1.
# An optional leading hash reference is handed to tberror() as its
# parameter hash; when absent an empty hash is passed instead.
# Never returns.
#
sub die_noretry($;$)
{
    my @args  = @_;
    my $parms = ((ref($args[0]) eq 'HASH') ? shift(@args) : {});
    my $mesg  = shift(@args);

    tberror($parms, $mesg);
    KillChildren();
    exit(-1);
}

146 147 148 149 150 151 152 153 154 155 156
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
157 158 159 160
if (defined($options{"d"})) {
    $dbg = 1;
}

161 162 163 164 165 166 167 168 169 170
my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
Chad Barb's avatar
 
Chad Barb committed
171
    die_noretry("Bad data in pid: $pid.");
172 173 174 175 176
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
Chad Barb's avatar
 
Chad Barb committed
177
    die_noretry("Bad data in eid: $eid.");
178 179
}

180
#
181
# Verify user and get his DB uid and other info for later.
182
#
183 184 185
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    die_noretry("You ($UID) do not exist!");
186
}
187 188 189
my $user_uid      = $this_user->uid();
my $user_name     = $this_user->name();
my $user_email    = $this_user->email();
190 191
my $user_email_to = "$user_name <$user_email>";

192 193 194 195 196 197 198 199 200 201
#
# Check permission.
#
my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    die_noretry("Could not find experiment object for $pid/$eid!");    
}
if (!$experiment->AccessCheck($this_user, TB_EXPT_MODIFY)) {
    die_noretry("You do not have permission to swap this experiment!");
}
202 203
TBDebugTimeStamp("os_setup started");

204 205 206 207
#
# See if the experiment is firewalled
#
my $firewall;
208
my $firewalled = $experiment->IsFirewalled(\$firewall);
209
my $firewallimage;
210 211 212 213

#
# Ditto ElabinElab.
#
214
my $elabinelab = $experiment->elabinelab();
215

216 217 218 219 220
#
# Ditto PlabinElab.
#
my $plabinelab = 0;
my $plcnode;
221
my $plcimage;
222 223 224 225
if (TBExptPlabInElabPLC($pid, $eid, \$plcnode)) {
    $plabinelab = 1;
}

226
#
227
# Get the set of nodes, as well as the nodes table information for them.
228
#
229
my $db_result =
230 231
    DBQueryFatal("select n.*,l.pid,r.vname,r.sharing_mode,r.inner_elab_role ".
		 "from reserved as r ".
232
		 "left join nodes as n on n.node_id=r.node_id ".
233 234
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");
235

236
if ($db_result->numrows < 1) {
237
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
Chad Barb's avatar
 
Chad Barb committed
238
    exit 0;
239 240
}

241
while (my %row = $db_result->fetchhash()) {
242 243 244 245 246
    my $node      = $row{'node_id'};
    my $osid      = $row{'def_boot_osid'};
    my $type      = $row{'type'};
    my $jailnode  = $row{'jailflag'};
    my $failmode  = $row{'failureaction'};
Kevin Atkinson's avatar
 
Kevin Atkinson committed
247
    my $vname     = $row{'vname'};
248 249 250 251
    my $typeinfo  = NodeType->Lookup($type);
    my $class     = $typeinfo->class();
    my $subnode   = $typeinfo->issubnode();
    my $virtnode  = $typeinfo->isvirtnode();
252 253
    my $sharednode = defined($row{'sharing_mode'}) 
	&& $row{'sharing_mode'} eq 'using_shared_local';
254 255
    my $iseinenode= $elabinelab && defined($row{'inner_elab_role'})
	&& $row{'inner_elab_role'} eq 'node';
256
    my $isremote  = $typeinfo->isremotenode();
257
    my $isgeninode= $typeinfo->isfednode();
258 259 260
    my $imageable = $typeinfo->imageable();
    my $plabnode  = $typeinfo->isplabdslice();
    my $bios_wait = $typeinfo->bios_waittime();
261
    my $bootpath  = 0;
262
    my $osinfo    = undef;
263

264 265 266 267 268 269
    my $nodeobj = Node->Lookup($node);
    if (!defined($nodeobj)) {
	die_noretry("Cannot lookup object for $node!");
    }
    $nodeobjs{$node} = $nodeobj;

270 271 272 273 274 275 276
    if ($isgeninode) {
	#
	# Geni nodes are currently a lot like plab nodes, but that will
	# change later.
	#
	if ($virtnode) {
	    $vnodes{$node} = $virtnode;
277
	    $sharednodes{$node} = $sharednode;
278 279 280 281 282 283 284 285 286 287 288
	}
	else {
	    $nodes{$node}  = $node;
	}
	$geninodes{$node}  = 1;
    }
    elsif ($virtnode) {
	#
	# Virtual nodes are special. Jailed vnodes can do quite a bit,
	# and so run them through the checks below.
	#
289
	$vnodes{$node} = ($jailnode || $plabnode || $isremote);
290
	$sharednodes{$node} = $sharednode;
291
	$plabvnodes{$node} = $plabnode;
292
	if (! $jailnode && ! $plabnode && !$isremote) {
293 294 295
	    next;
	}
    }
296 297 298 299 300
    elsif ($iseinenode) {
	print "Will skip reload/reboot of inner elab node $node.\n";
	$einenodes{$node} = 1;
	next;
    }
Timothy Stack's avatar
 
Timothy Stack committed
301
    elsif ($subnode && !$imageable) {
302
	print "Will skip subnode $node ISUP wait.\n";
303
    }
304
    else {
Chad Barb's avatar
 
Chad Barb committed
305
	my $nodeAllocState;
306
	$nodeobj->GetAllocState(\$nodeAllocState);
307
	$nodes{$node}  = $node;
Chad Barb's avatar
 
Chad Barb committed
308
	$nodeAllocStates{$node} = $nodeAllocState;
309 310 311 312 313 314 315
	if ($nodeAllocState eq TBDB_ALLOCSTATE_RES_RECONFIG()) {
	    # Terrible use of state machine.
	    $reconfigs{$node} = 1;
	}
	elsif ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    # only reboot node if assign_wrapper just pulled it into expt.
	    # (e.g. it isnt ALLOCSTATE_RES_READY)
Chad Barb's avatar
 
Chad Barb committed
316 317
	    $reboots{$node} = 1;
	}
318
    }
319

320
    $osids{$node} = $osid;
321 322 323 324 325 326 327
    if ($osid) {
	$osinfo = OSinfo->Lookup($osid);

	die_noretry("Could not map $osid to its object!")
	    if (!defined($osinfo));
    }
    $osmap{$node}         = $osinfo;
328
    $bios_waittime{$type} = (defined($bios_wait) ? $bios_wait : 0);
329
    $node_types{$node}    = $type;
Kevin Atkinson's avatar
 
Kevin Atkinson committed
330
    $vname{$node}         = $vname;
331

332
    #
333 334
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
335
    # and just check the path directly.
336 337 338 339
    #
    #
    # def_boot_path and next_boot_path get exactly the same validation:
    # an optional "IP:" prefix is stripped, the remainder must live under
    # $TFTP and must be an existing plain file. Only def_boot_path sets
    # $bootpath, which turns off the OSID checks further down.
    #
    foreach my $pathfield ('def_boot_path', 'next_boot_path') {
	my $path = $row{$pathfield};

	next
	    if (!defined($path) || $path eq "");

	# Split off the IP address if it exists; only the path is checked.
	if ($path =~ /^[0-9\.]+:(\/.*)$/) {
	    $path = $1;
	}

	#
	# Path must begin with $TFTP. $TFTP already starts with a slash,
	# so the old pattern /^\/$TFTP\// only accepted "//tftpboot/..."
	# and rejected every normal path. The optional leading slash keeps
	# accepting whatever the old pattern did.
	#
	if ($path !~ /^\/?\Q$TFTP\E\//) {
	    die_noretry("File $path for node $node must reside in $TFTP");
	}

	if (! -f $path) {
	    die_noretry("File $path for node $node does not exist!");
	}

	$bootpath = 1
	    if ($pathfield eq 'def_boot_path');
    }

383 384 385 386
    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $row{'rpms'})) {
387
	if (! -f $rpm) {
388 389 390
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'rpm', $rpm, $node]},
			"RPM $rpm for node $node does not exist!");
391 392
	}
    }
393

394 395 396 397 398
    #
    # XXX - Ditto for tarfiles.
    #
    foreach my $tarspec (split(":", $row{'tarballs'})) {
	my ($dir, $tar) = split(" ", $tarspec);
399

400
	if (! -f $tar) {
401 402 403
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'tar', $tar, $node]},
			"Tarfile $tar for node $node does not exist!");
404 405
	}
    }
406

407 408 409 410 411 412 413 414 415
    #
    # If the virtnode is running a subOS, we set $imageable because it 
    # really is going to be reloaded... even though virtnode types are not
    # typically imageable.
    #
    if ($virtnode && defined($osinfo) && $osinfo->def_parentosid()) {
	$imageable = 1;
    }

416 417 418 419
    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
420
    # it.
421
    #
422 423
    if (!$bootpath && (!$virtnode || ($virtnode && $imageable)) 
	&& !$isgeninode && $imageable) {
424 425 426
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but lets be careful anyway.
427
	#
428
	if (! $osinfo) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
429
	    die_noretry("$node has no bootpath and no def_boot_osid set!");
430 431 432
	}
	#
	# If there is an actual path, its an OSKit kernel not an image.
433
	#
434
	if (! defined($osinfo->path()) || $osinfo->path() eq "") {
435 436
	    my $nextosinfo;
	    
437
	    #
438
	    # Not an OSKit kernel.
439
	    #
440 441 442 443 444 445 446 447 448 449 450
	    if ($osinfo->IsGeneric()) {
		#
		# Map generic OSID to the specific one.
		#
		$nextosinfo = $osinfo->ResolveNextOSID($experiment);
		if (!defined($nextosinfo)) {
		    die_noretry("No next mapping for $osinfo on $node!\n");
		}
		print "Mapping $osinfo on $node to $nextosinfo\n";
		$osinfo = $nextosinfo;
	    }
451
	    #
452
	    # Make sure this OSID is actually loaded on the machine.
453
	    #
454 455 456 457 458 459
	    my $isloaded = $nodeobj->IsOSLoaded($osinfo);
	    if ($isloaded < 0) {
		die_noretry("Error determining if $osinfo ".
			    "is loaded on $node\n");
	    }
	    if ($isloaded) {
460
		#
461
		# OSID is loaded, but might need to be cleaned.
462
		#
463 464 465
		if ($dolastload &&
		    defined($row{'pid'}) && $row{'pid'} ne $pid) {
		    SetupReload($node, $osinfo, $type);
466
		}
467
		elsif ($nextosinfo) {
468
		    #
469 470 471
		    # Seems like a bad place for this; if the OS was
		    # mapped to something else that is already on the
		    # disk, need to reset def_boot_osid.
472
		    #
473 474 475 476
		    if ($nodeobj->OSSelect($osinfo, "def_boot_osid", 0)) {
			die_noretry("Could not set boot OS to ".
				    "$osinfo for $node");
		    }
477 478 479 480
		}
	    }
	    else {
		#
481
		# OS not loaded.
482
		#
483
		SetupReload($node, $osinfo, $type);
484
	    }
485 486
	    $osmap{$node} = $osinfo;
	    $osids{$node} = $osinfo->osid();
487 488
	}
    }
489

490
    #
491
    # Set the canfail bit.
492
    #
493
    $canfail{$node} = (($failmode eq NODEFAILMODE_FATAL()) ? 0 : 1);
494

495 496 497 498 499 500
    #
    # Set the reboot waittime from the osid now that we have it
    # finalized.
    #
    # A node with no def_boot_osid (for example one that boots from a
    # path) has no OSinfo object in %osmap, so there is nothing to ask
    # for a waittime; calling the method on undef would abort the script.
    # The wait loop later falls back to its default when no entry exists.
    #
    $osid = $osids{$node};
    if (defined($osmap{$node}) &&
	!exists($reboot_waittime{$osid})) {
	$reboot_waittime{$osid} = $osmap{$node}->reboot_waittime();
    }

504
    print STDERR "$node - $osmap{$node} - $canfail{$node}\n"
505
	if $dbg;
506
}
507

508 509 510 511 512 513 514 515 516 517 518 519 520
#
# XXX Inner elab nodes should never report in to us.
# If they do, make sure they wind up in PXEWAIT.
#
if (keys(%einenodes)) {
    DBQueryFatal("update nodes set ".
		 "  def_boot_osid=NULL,".
		 "  next_boot_osid=NULL,".
		 "  temp_boot_osid=NULL ".
		 "where node_id in (".
		 join(",", map("'$_'", keys %einenodes)). ")");
}    

Kevin Atkinson's avatar
 
Kevin Atkinson committed
521
@all_nodes = (keys %nodes, keys %vnodes);
Kevin Atkinson's avatar
 
Kevin Atkinson committed
522 523 524 525 526 527 528 529 530 531

#
# Perform some prechecks on the images.  This will also have the
# effect of caching the info for the images for later use
#
# FIXME: WRITEME
#    Maybe this isn't a good idea since it will also attempt to fetch
#    the image from the real boss in an inner-emulab.  This should
#    really be done in parallel.

532
#
533
# Collect some info about vnodes.
534 535
#
foreach my $vnode (keys(%vnodes)) {
    # Only vnodes flagged in %vnodes (jailed/plab/remote) are counted.
    next
	unless ($vnodes{$vnode});

    my $pnode = $nodeobjs{$vnode}->phys_nodeid();

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    if (!defined($pnodevcount{$pnode})) {
	$pnodevcount{$pnode} = 0;
    }
    $pnodevcount{$pnode} += 1;
    $vnode2pnode{$vnode}  = $pnode;

    # Remember the physical host's node object alongside the others.
    my $pnodeobj = Node->Lookup($pnode);

    die_noretry("Cannot lookup object for $pnode!")
	if (!defined($pnodeobj));

    $nodeobjs{$pnode} = $pnodeobj;
}

564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
#
# Setup the firewall first.  Once it is up we can continue with the
# remaining nodes.
#
# There is very little point in setting up the other nodes at the same time
# as they will not be able to PXE boot until the firewall is up.  We could
# fire them off a little early in hopes of overlapping any BIOS boot time
# with the last stages of the firewall setup, but it probably isn't worth
# the complexity (and would not work with nodes for which "reboot" means
# "fall out of PXEWAIT and boot").
#
# Note that we formerly did just do them all at once and let the nodes
# continually PXE-timeout and reboot until the firewall came up.  But that
# can actually take longer than what we do now, if a node happened to
# timeout and reboot just as the firewall came up (i.e., we would have to
# wait an extra BIOS-reboot cycle, which can be 90 seconds or more).
#
if ($firewalled) {
    my $node    = $firewall;
    my $nodeobj = $nodeobjs{$node};

    TBDebugTimeStamp("rebooting/reloading firewall");
    if (!FirewallSetup($node)) {
	# Keep a space between the two sentences of the warning.
	tbwarn("Firewall node $node failed to boot. ".
	       "This has been reported to testbed-ops.");

	# XXX do we need to set NODEBOOTSTATUS_FAILED here?

	#
	# We assume that firewall node images are "standard" here,
	# and whine to tbops.
	#
	$nodeobj->MarkAsDown();
	$nodeobj->InsertNodeLogEntry($this_user, TB_DEFAULT_NODELOGTYPE(),
			  "'Moved to hwdown by os_setup; ".
			  "failed to boot image for osid " . $osmap{$node} .
			  " in $pid/$eid'");
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid appears to be dead.\n\n".
		 "The node has been taken out of the pool until this matter ".
		 "is resolved.\n",
		 $user_email_to);

	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  Firewall setup may have taken awhile.
    #
    if (!$canceled) {
	$canceled = $experiment->canceled();
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # The firewall is up; remove it from the nodelist so it is not
    # handled again with the remaining nodes.
    #
    delete $nodes{$node};
}

633 634 635 636 637 638 639 640 641
#
# Likewise, setup a PLC node before other plabinelab nodes.
# XXX right now, we setup PLC before ANY other node, whether it is
# part of the inner plab or not.
#
if ($plabinelab) {
    my $node = $plcnode;

    TBDebugTimeStamp("rebooting/reloading PLC node");
    if (!os_setup_one($node, $plcimage, "PLC", 10*60)) {
	# The first sentence needs its period and a separating space.
	tbwarn("PLC node $node failed to boot. ".
	       "This has been reported to testbed-ops.");
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
		 "The nodes have been freed.\n",
		 $user_email_to);
	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  PLC setup may have taken awhile.
    #
    if (!$canceled) {
	$canceled = $experiment->canceled();
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # The PLC node is up; remove it from the nodelist so it is not
    # handled again with the remaining nodes.
    #
    delete $nodes{$node};
}

Kirk Webb's avatar
 
Kirk Webb committed
675 676 677 678 679 680 681 682 683 684 685 686
#
# Start up plab vnode setup now since it doesn't depend on
# physical node readiness.
#
if (grep($_, values(%plabvnodes))) {
    my $plabnumbatch = TBGetSiteVar("plab/setup/vnode_batch_size");
    my $plabwait     = TBGetSiteVar("plab/setup/vnode_wait_time");
    TBDebugTimeStamp("Starting PlanetLab vnode setup.");

    #
    # fork() returns undef on failure, never -1. Testing the result for
    # falseness first would send the parent down the child branch on a
    # failed fork and exec vnode_setup in place of this script.
    #
    $plab_setup_pid = fork();
    if (!defined($plab_setup_pid)) {
        die_noretry("Plab fork failed.");
    }
    if ($plab_setup_pid == 0) {
        # Child: become vnode_setup. The parent keeps going.
        exec("$vnode_setup -p -n $plabnumbatch -w $plabwait $pid $eid")
            or die_noretry("Exec failed.");
    }
}

#
# Ditto for Geni nodes. Parent keeps going.
#
if (keys(%geninodes)) {
    TBDebugTimeStamp("Starting Geni setup.");

    #
    # fork() returns undef on failure, never -1. Check for that before
    # anything else; otherwise a failed fork makes the parent run the
    # child code below and exit.
    #
    $geni_setup_pid = fork();
    if (!defined($geni_setup_pid)) {
	die_noretry("Geni fork failed.");
    }
    if ($geni_setup_pid == 0) {
	# Child: start the slivers and exit with the result.
	TBdbfork();	# So we get the event system fork too ...

	if (libGeni::StartSlivers($experiment, $this_user, $dbg)) {
	    print STDERR "*** Could not start Geni slivers\n";
	    exit(-1);
	}
	TBDebugTimeStamp("Geni slivers have been started.");
	exit(0);
    }
    # Parent: give the child a chance to get going.
    sleep(1);
}

715
#
716
# We need to issue the reboots and the reloads in parallel.
717
#
718
TBDebugTimeStamp("rebooting/reloading nodes started");
719
if (!$TESTMODE) {
720
    my @children = ();
721

722
    foreach my $imageid ( keys(%reloads) ) {
723
	my @nodelist = @{ $reloads{$imageid} };
724

725 726
	my %nodeflags = ();

727
	foreach my $node (@nodelist) {
728 729
	    my $nodeobj = $nodeobjs{$node};
	    
730 731 732 733 734 735 736
	    #
	    # vnodes only get rebooted if this is a modify and we need to
	    # reload them (otherwise they will get rebooted because of presence
	    # in %reboots).
	    #
	    if (defined($vnodes{$node})) {
		my $vstate;
737
		$nodeobj->GetAllocState(\$vstate);
738 739 740 741 742
		if ($vstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
		    $nodeflags{$node}{'noreboot'} = 1;
		}
	    }

743 744 745 746 747 748 749 750 751 752
	    #
	    # osload should not wait for shared vnodes.  We need vnode_setup
	    # to boot/reboot them since the underlying pnode won't be booting.
	    # So for them, osload just sets up the reload and finishes.
	    #
	    if (defined($vnodes{$node}) && $sharednodes{$node} == 1) {
		$nodeflags{$node}{'noreboot'} = 1;
		$nodeflags{$node}{'nowait'} = 1;
	    }

753
	    $nodeobj->SetAllocState(TBDB_ALLOCSTATE_RES_RELOAD());
754
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
755
	    # No point in reboot/reconfig obviously, since node will reboot!
756
	    delete $reboots{$node};
757
	    delete $reconfigs{$node};
758
	    $rebooted{$node} = 1;
759 760
	}

761 762 763 764 765 766 767
	my %reload_args     = ();
	my $reload_failures = {};

	$reload_args{'debug'}     = $dbg;
	$reload_args{'asyncmode'} = 1;
	$reload_args{'imageid'}   = $imageid;
	$reload_args{'nodelist'}  = [ @nodelist ];
768
	$reload_args{'nodeflags'} = \%nodeflags;
769 770 771 772

	my $pid = osload(\%reload_args, $reload_failures);
	push(@children, [ $pid, \&osload_wait,
			  [ @nodelist ], $reload_failures ]);
773 774 775
	sleep(5);
    }

776 777 778
    #
    # Fire off the reboots.
    # 
779
    if (keys(%reboots)) {
Chad Barb's avatar
 
Chad Barb committed
780
	foreach my $node (keys(%reboots)) {
781 782
	    my $nodeobj = $nodeobjs{$node};

Chad Barb's avatar
 
Chad Barb committed
783
	    if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
784
		$nodeobj->SetAllocState(TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
Chad Barb's avatar
 
Chad Barb committed
785 786
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
	    } else {
787
		$nodeobj->SetAllocState(TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
Chad Barb's avatar
 
Chad Barb committed
788 789
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
	    }
790 791
	    # See below, needed for vnode_setup.
	    $rebooted{$node} = 1;
Chad Barb's avatar
 
Chad Barb committed
792 793
	}

794 795 796 797 798 799 800 801 802 803 804 805 806
	my @nodelist        = keys(%reboots);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
	sleep(2);
807 808
    }

809
    #
810
    # Fire off the reconfigs.
811 812
    #
    if (keys(%reconfigs)) {
813 814 815 816 817 818 819 820 821 822 823 824 825
	my @nodelist        = keys(%reconfigs);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'reconfig'}  = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
826 827
    }

828 829 830 831 832 833 834 835
    #
    # Wait for all of the children to exit. We look at the $pid to know if
    # command failed/ended immediately; otherwise we need to wait on it.
    # For any failures, record the node failures for later so that we do
    # not wait for them needlessly.
    #
    while (my $child = pop(@children)) {
	my ($childpid, $waiter, $allnodes, $failures) = @{ $child };

	# A zero pid is not likely to happen; nothing to do for it.
	next
	    if ($childpid == 0);

	# A started child whose wait function returns zero had no failures.
	next
	    if ($childpid > 0 && !$waiter->($childpid));

	#
	# Failure. Record the failures for later. If the pid is negative
	# then the entire list failed. Otherwise, scan the return hash to
	# find the failures.
	#
	my @failednodes = ($childpid < 0
			   ? @{ $allnodes }
			   : grep { $failures->{$_} } keys(%{ $failures }));

	#
	# These errors are unusual enough that we do not want to retry
	# or keep going even if canfail is set. Better to stop and let
	# someone look at what happened.
	#
	$noretry = 1;

	foreach my $node (@failednodes) {
	    tbnotice("Not waiting for $node since its reload/reboot failed!");
	    $failed++;
	    add_failed_node_reload($node);
	    delete($nodes{$node});

	    # Mark the node down both in the DB and in our cached copy.
	    $nodeobjs{$node}->SetAllocState(TBDB_ALLOCSTATE_DOWN());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();
	}
    }
}
881 882
TBDebugTimeStamp("rebooting/reloading finished");

883 884 885 886 887 888 889 890 891 892
#
# XXX declare the inner plab nodes as UP since we won't be hearing from
# them again (they are talking only to their PLC).
#
if ($plabinelab) {
    my @plabnodes = ();

    TBExptPlabInElabNodes($pid, $eid, \@plabnodes);
    foreach my $node (@plabnodes) {
	# Only nodes still on the wait list need to be declared up.
	next
	    unless (exists($nodes{$node}));

	my $nodeobj = $nodeobjs{$node};

	tbnotice("Not waiting for emulated plab node $node");
	$nodeobj->SetBootStatus(NODEBOOTSTATUS_OKAY);
	$nodeobj->SetAllocState(TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	$nodeobj->SetEventState(TBDB_NODESTATE_ISUP());

	# Drop it from the wait list.
	delete($nodes{$node});
    }
}

902 903 904 905 906
#
# Remaining nodes we need to wait for. Why do we wait in the face of errors
# above? So that they enter a reasonably known state before we try to tear
# things down. Otherwise we could end up power cycling nodes a lot more often.
# This should probably be handled in other ways, say via stated or the alloc
907
# state machine.
908
#
909 910
my @nodelist = keys(%nodes);

911
#
912 913 914
# Now lets wait for them to come back alive. Set up a retry list though
# so that we can give each node at least 1 second chance. Avoids pointless
# experiment failures.
915
#
916
if (@nodelist) {
917
    print "Waiting for local testbed nodes to finish rebooting ...\n";
918
}
919

920 921 922
my %retries;
my %waitstart;
foreach my $node ( @nodelist ) {
923
    $retries{$node} = (exists($geninodes{$node}) ? 0 : 1);
924 925 926
    $waitstart{$node} = time;
}

927 928 929 930 931 932 933 934
#
# List of nodes to inform the user and testbed-ops about in the event
# of failures.  We coalesce the nodes here so we only sent one message.
#
my @informuser = ();
my @informtbopswarn = ();
my @informtbopsfatal = ();

935 936
TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
937 938 939
    my $node    = shift(@nodelist);
    my $nodeobj = $nodeobjs{$node};
    my $wstart  = $waitstart{$node};
940
    my $actual_state;
941 942 943 944 945 946 947 948
    my $waittime = (60 * 7);	# The default.

    # Compute actual waittime.
    if (defined($bios_waittime{$node_types{$node}}) &&
	defined($reboot_waittime{$osids{$node}})) {
	$waittime = ($bios_waittime{$node_types{$node}} +
		     $reboot_waittime{$osids{$node}}) * 2;
    }