#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;
require 'ctime.pl';

#
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
#
# TODO: Reload disk images.
#
# usage: os_setup [-d] <pid> <eid>
#
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
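#
# Example invocation (the project and experiment names are illustrative):
#
#	os_setup -d testbed myexpt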
#

sub usage()
{
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
my  $optlist = "d";

#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
my $TBOPS       = "@TBOPSEMAIL@";
my $TESTMODE    = @TESTMODE@;
my $TFTP	= "/tftpboot";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libreboot;
use libosload;
use libtestbed;
use libtblog;
use libArchive;
use Template;
use NodeType;
use OSinfo;
use User;

TBDebugTimeStampsOn();

my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";
my $nodereboot  = "$TB/bin/node_reboot";
my $elab_setup  = "$TB/sbin/elabinelab";
my $dbg		= 0;
my $failed      = 0;
my $noretry     = 0;
my $failedvnodes= 0;
my $failedplab  = 0;
my $canceled    = 0;
my %nodes       = ();
my %vnodes      = ();
my %vnodephosts = ();
my %vnode2pnode = ();
my %pnodevcount = ();
my %plabvnodes  = ();
my %osids       = ();
my %osmap       = ();
my %canfail     = ();
my %bios_waittime   = ();	# Indexed by node_type.
my %reboot_waittime = ();	# Indexed by osid.
my %node_types  = ();		# Indexed by node_id.
my %vname = ();                 # Indexed by node_id.

#
# This variable keeps track of the failed nodes of all types.
#   values = ['boot'|'osload', 'fatal'|'nonfatal']
my %failed_nodes = ();
sub add_failed_node_fatal($)     {$failed_nodes{$_[0]} = ['boot', 'fatal']}
sub add_failed_node_nonfatal($)  {$failed_nodes{$_[0]} = ['boot', 'nonfatal']}
sub add_failed_node_reload($)    {$failed_nodes{$_[0]} = ['reload', 'fatal']}

my @all_nodes; # list of all nodes before any are deleted from %nodes

#
# Ah, Frisbee works, so let's do auto reloading for nodes that do not have
# the proper OS loaded on them. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
#
my %reloads     = ();
my %reboots	= ();
my %reconfigs	= ();
my %rebooted    = ();
my $doautoload  = 1;
my $dolastload  = 1;

# Protos
sub SetupReload($$$);
sub FirewallSetup($);
sub os_setup_one($$$);
				  
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output


#
# Used to die with a -1 return code, to indicate to caller (tbswap)
# that the failure is not likely to be fixed with another attempt.
#
sub die_noretry($;$)
{
    my $parms = {};
    $parms = shift if ref $_[0] eq 'HASH';
    my ($mesg) = shift;
    tberror($parms, $mesg);
    exit(-1);
}

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
if (defined($options{"d"})) {
    $dbg = 1;
}

my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die_noretry("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die_noretry("Bad data in eid: $eid.");
}

#
# Figure out who called us. Only root, people with admin status
# in the DB, or the owner of the experiment can run this script.
#
if ($UID && !TBAdmin($UID) &&
    !TBExptAccessCheck($UID, $pid, $eid, TB_EXPT_MODIFY)) {
    die_noretry("You do not have permission to swap this experiment!");
}

#
# Verify user and get his DB uid and other info for later.
#
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    die_noretry("You ($UID) do not exist!");
}
my $user_uid      = $this_user->uid();
my $user_name     = $this_user->name();
my $user_email    = $this_user->email();
my $user_email_to = "$user_name <$user_email>";

TBDebugTimeStamp("os_setup started");

#
# See if the experiment is firewalled
#
my $firewall;
my $firewalled = TBExptFirewall($pid, $eid, \$firewall);
my $firewallimage;

#
# Ditto ElabinElab.
#
my $elabinelab;
if (! TBExptIsElabInElab($pid, $eid, \$elabinelab)) {
    die("*** $0:\n".
	"    Could not get elabinelab status for experiment $pid/$eid\n");
}

#
# Ditto PlabinElab.
#
my $plabinelab = 0;
my $plcnode;
my $plcimage;
if (TBExptPlabInElabPLC($pid, $eid, \$plcnode)) {
    $plabinelab = 1;
}

#
# Get the set of nodes, as well as the nodes table information for them.
#
my $db_result =
    DBQueryFatal("select n.*,l.pid,r.vname from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");

if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit 0;
}

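#
# Walk the reserved nodes, deciding for each one whether it needs a full
# disk reload, a reboot, or just a reconfig, and sanity-check boot paths,
# RPMs, and tarballs along the way.
#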
while (my %row = $db_result->fetchhash()) {
    my $node      = $row{'node_id'};
    my $osid      = $row{'def_boot_osid'};
    my $type      = $row{'type'};
    my $jailnode  = $row{'jailflag'};
    my $failmode  = $row{'failureaction'};
    my $vname     = $row{'vname'};
    my $typeinfo  = NodeType->Lookup($type);
    my $class     = $typeinfo->class();
    my $subnode   = $typeinfo->issubnode();
    my $virtnode  = $typeinfo->isvirtnode();
    my $isremote  = $typeinfo->isremotenode();
    my $imageable = $typeinfo->imageable();
    my $plabnode  = $typeinfo->isplabdslice();
    my $bios_wait = $typeinfo->bios_waittime();
    my $bootpath  = 0;
    my $osinfo    = undef;

    #
    # VIRTNODE HACK: Virtual nodes are special. Jailed vnodes can do quite
    # a bit, and so run them through the checks below.
    #
    if ($virtnode) {
	$vnodes{$node} = ($jailnode || $plabnode || $isremote);
	$plabvnodes{$node} = $plabnode;
	if (! $jailnode && ! $plabnode && !$isremote) {
	    next;
	}
    }
    elsif ($subnode && !$imageable) {
	print "Will skip subnode $node ISUP wait.\n";
    }
    else {
	my $nodeAllocState;
	TBGetNodeAllocState( $node, \$nodeAllocState );
	$nodes{$node}  = $node;
	$nodeAllocStates{$node} = $nodeAllocState;
	if ($nodeAllocState eq TBDB_ALLOCSTATE_RES_RECONFIG()) {
	    # Terrible use of state machine.
	    $reconfigs{$node} = 1;
	}
	elsif ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    # only reboot node if assign_wrapper just pulled it into expt.
	    # (e.g., it isn't ALLOCSTATE_RES_READY)
	    $reboots{$node} = 1;
	}
    }

    $osids{$node} = $osid;
    if ($osid) {
	$osinfo = OSinfo->Lookup($osid);

	die_noretry("Could not map $osid to its object!")
	    if (!defined($osinfo));
    }
    $osmap{$node}         = $osinfo;
    $bios_waittime{$type} = (defined($bios_wait) ? $bios_wait : 0);
    $node_types{$node}    = $type;
    $vname{$node}         = $vname;

    #
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on boss node, so we can ignore the IP address,
    # and just check the path directly.
    #
    if (defined($row{'def_boot_path'})) {
	my $path = $row{'def_boot_path'};

	if ($path ne "") {
	    my $ip   = 0;

	    # Split out IP address if it exists.
	    if ($path =~ /^([0-9\.]+):(\/.*)$/) {
		$ip   = $1;
		$path = $2;
	    }

	    # Path must begin with $TFTP
	    if (! ($path =~ /^\/$TFTP\//)) {
		die_noretry("File $path for node $node must reside in $TFTP");
	    }

	    if (! -f $path) {
		die_noretry("File $path for node $node does not exist!");
	    }
	    $bootpath = 1;
	}
    }
    if (defined($row{'next_boot_path'})) {
	my $path = $row{'next_boot_path'};

	if ($path ne "") {
	    my $ip   = 0;

	    # Split out IP address if it exists.
	    if ($path =~ /^([0-9\.]+):(\/.*)$/) {
		$ip   = $1;
		$path = $2;
	    }

	    # Path must begin with $TFTP
	    if (! ($path =~ /^\/$TFTP\//)) {
		die_noretry("File $path for node $node must reside in $TFTP");
	    }

	    if (! -f $path) {
		die_noretry("File $path for node $node does not exist!");
	    }
	}
    }

    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $row{'rpms'})) {
	if (! -f $rpm) {
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'rpm', $rpm, $node]},
			"RPM $rpm for node $node does not exist!");
	}
    }

    #
    # XXX - Ditto for tarfiles.
    #
    foreach my $tarspec (split(":", $row{'tarballs'})) {
	my ($dir, $tar) = split(" ", $tarspec);

	if (! -f $tar) {
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'tar', $tar, $node]},
			"Tarfile $tar for node $node does not exist!");
	}
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && !$virtnode && $imageable) {
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but let's be careful anyway.
	#
	if (! $osinfo) {
	    die_noretry("$node has no bootpath and no def_boot_osid set!");
	}

	#
	# If there is an actual path, it's an OSKit kernel, not an image.
	#
	if (! defined($osinfo->path()) || $osinfo->path() eq "") {
	    #
	    # Not an OSKit kernel.
	    # Make sure this OSID is actually loaded on the machine.
	    #
	    my $p_result =
		DBQueryFatal("select * from partitions ".
			     "where node_id='$node' and osid='$osid'".
			     "order by partition");

	    #
	    # If not loaded, then see if the user was looking for the generic
	    # name of the OS that is loaded.
	    #
	    if ($p_result->numrows == 0) {
		#
		# Check to see if a non-specific version was specified.
		#
		if (!defined($osinfo->version()) || $osinfo->version() eq "") {
		    #
		    # A non-specific version. There needs to be a way to
		    # map it to another osid.
		    #
		    if (! defined($osinfo->nextosid())) {
			die_noretry("No mapping for $osinfo ($node)!");
		    }

		    my $nextosid = TBResolveNextOSID($osid, $pid, $eid);
		    if (!defined($nextosid)) {
			die_noretry("No mapping for $osinfo ($node)!");
		    }
		    $nextosinfo = OSinfo->Lookup($nextosid);

		    die_noretry("Could not map $osid to its object!")
			if (!defined($nextosinfo));

		    #
		    # See if the nextosid is already on the disk. If not,
		    # it needs to be loaded.
		    #
		    my $o_result =
			DBQueryFatal("select osid from partitions as p ".
				     "where p.node_id='$node' and ".
				     "      p.osid='$nextosid'");

		    if (! $o_result->numrows) {
			#
			# User wants a specific version of an OS, but it's not
			# loaded on the machine.
			#
			print "Mapping $osinfo on $node to $nextosinfo ".
			    "and setting up a reload.\n";

			SetupReload($node, $nextosinfo, $type);
			$osids{$node} = $nextosid;
			$osmap{$node} = $nextosinfo;
		    }
		    else {
			#
			# Already loaded.
			#
			print "Mapping $osinfo on $node to $nextosinfo.\n";

			if ($dolastload &&
			    defined($row{'pid'}) && $row{'pid'} ne $pid) {
			    SetupReload($node, $nextosinfo, $type);
			}
			else {
			    system("$osselect $nextosid $node") and
				die_noretry("Could not set boot OS to ".
					    "$nextosid for $node");
			}
			$osids{$node} = $nextosid;
			$osmap{$node} = $nextosinfo;
		    }
		}
		else {
		    #
		    # User wants a specific version of an OS, but it's not
		    # loaded on the machine.
		    #
		    SetupReload($node, $osinfo, $type);
		}
	    }
	    else {
		#
		# OSID is loaded, but might need to be cleaned.
		#
		if ($dolastload &&
		    defined($row{'pid'}) && $row{'pid'} ne $pid) {
		    SetupReload($node, $osinfo, $type);
		}
	    }
	}
    }

    #
    # Set the canfail bit.
    #
    $canfail{$node} = (($failmode eq NODEFAILMODE_FATAL()) ? 0 : 1);

    #
    # Set the reboot waittime from the osid now that we have it
    # finalized.
    #
    $osid = $osids{$node};
    if (!exists($reboot_waittime{$osid})) {
	$reboot_waittime{$osid} = TBOSIDRebootWaittime($osid);
    }

    print STDERR "$node - $osmap{$node} - $canfail{$node}\n"
	if $dbg;
}

@all_nodes = (keys %nodes, keys %vnodes);

#
# Perform some prechecks on the images.  This will also have the
# effect of catching the info for the images for later use
#
# FIXME: WRITEME
#    Maybe this isn't a good idea since it will also attempt to fetch
#    the image from the real boss in an inner-emulab.  This should
#    really be done in parallel.

#
# Collect some info about vnodes.
#
foreach my $vnode (keys(%vnodes)) {
    my $jailed = $vnodes{$vnode};
    my $pnode;

    if (! $jailed) {
	next;
    }

    if (! TBPhysNodeID($vnode, \$pnode)) {
	die_noretry("Cannot determine phys_nodeid for $vnode!");
    }

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    $pnodevcount{$pnode} = 0
	if (!defined($pnodevcount{$pnode}));
    $pnodevcount{$pnode}++;
    $vnode2pnode{$vnode} = $pnode;

    if (!exists($nodes{$pnode})) {
	#
	# Typical on remote nodes; we do not allocate the underlying
	# phys node to the experiment.
	#
	next;
    }

    # Nothing else to do for local jail nodes at this time ...
}

#
# Setup the firewall first.  Once it is up we can continue with the
# remaining nodes.
#
# There is very little point in setting up the other nodes at the same time
# as they will not be able to PXE boot until the firewall is up.  We could
# fire them off a little early in hopes of overlapping any BIOS boot time
# with the last stages of the firewall setup, but it probably isn't worth
# the complexity (and would not work with nodes for which "reboot" means
# "fall out of PXEWAIT and boot".
# "fall out of PXEWAIT and boot").
# Note that we formerly did just do them all at once and let the nodes
# continually PXE-timeout and reboot until the firewall came up.  But that
# can actually take longer than what we do now, if a node happened to
# timeout and reboot just as the firewall came up (i.e., we would have to
# wait an extra BIOS-reboot cycle, which can be 90 seconds or more.
#
if ($firewalled) {
    my $node = $firewall;

    TBDebugTimeStamp("rebooting/reloading firewall");
    if (!FirewallSetup($node)) {
	tbwarn "Firewall node $node failed to boot. ".
	    "This has been reported to testbed-ops.";

	# XXX do we need to set NODEBOOTSTATUS_FAILED here?

	#
	# We assume that firewall node images are "standard" here,
	# and whine to tbops.
	#
	MarkNodeDown($node);
	TBSetNodeLogEntry($node, $user_uid, TB_DEFAULT_NODELOGTYPE(),
			  "'Moved to hwdown by os_setup; ".
			  "failed to boot image for osid " . $osmap{$node} .
			  " in $pid/$eid'");
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid appears to be dead.\n\n".
		 "The node has been taken out of the pool until this matter ".
		 "is resolved.\n");

	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  Firewall setup may have taken awhile.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # remove it from the nodelist
    #
    delete $nodes{$node};
}

#
# Likewise, setup a PLC node before other plabinelab nodes.
# XXX right now, we setup PLC before ANY other node, whether it is
# part of the inner plab or not.
#
if ($plabinelab) {
    my $node = $plcnode;

    TBDebugTimeStamp("rebooting/reloading PLC node");
    if (!os_setup_one($node, $plcimage, "PLC")) {
	tbwarn "PLC node $node failed to boot. ".
	    "This has been reported to testbed-ops.";
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
		 "The nodes have been freed.\n");
	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  PLC setup may have taken awhile.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # remove it from the nodelist
    #
    delete $nodes{$node};
}

#
# Start up plab vnode setup now since it doesn't depend on
# physical node readiness.
#
my $plab_setup_pid = -1;
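# Fire off plab vnode setup in the background; the child is reaped later,
# just before the local (jailed) vnode setup is started below.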
if (grep($_, values(%plabvnodes))) {
    my $plabnumbatch = TBGetSiteVar("plab/setup/vnode_batch_size");
    my $plabwait     = TBGetSiteVar("plab/setup/vnode_wait_time");
    TBDebugTimeStamp("Starting PlanetLab vnode setup.");
    if (!($plab_setup_pid = fork())) { 
        exec("$vnode_setup -p -n $plabnumbatch -w $plabwait $pid $eid") 
            or die_noretry("Exec failed.");
    } elsif ($plab_setup_pid == -1) {
        die_noretry("Fork failed.");
    }
}

#
# We need to issue the reboots and the reloads in parallel.
#
TBDebugTimeStamp("rebooting/reloading nodes started");
if (!$TESTMODE) {
    my @children = ();

    foreach my $imageid ( keys(%reloads) ) {
	my @nodelist = @{ $reloads{$imageid} };

	foreach my $node (@nodelist) {
	    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
	    # No point in reboot/reconfig obviously, since node will reboot!
	    delete $reboots{$node};
	    delete $reconfigs{$node};
	    $rebooted{$node} = 1;
	}

	my %reload_args     = ();
	my $reload_failures = {};

	$reload_args{'debug'}     = $dbg;
	$reload_args{'asyncmode'} = 1;
	$reload_args{'imageid'}   = $imageid;
	$reload_args{'nodelist'}  = [ @nodelist ];

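	# With asyncmode set, osload() returns a child process id right away;
	# per-node failures land in $reload_failures and are collected via
	# osload_wait() when the children are reaped below.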
	my $pid = osload(\%reload_args, $reload_failures);
	push(@children, [ $pid, \&osload_wait,
			  [ @nodelist ], $reload_failures ]);
	sleep(5);
    }

    #
    # Fire off the reboots.
    #
    if (keys(%reboots)) {
	foreach my $node (keys(%reboots)) {
	    if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
	    } else {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
	    }
	    # See below, needed for vnode_setup.
	    $rebooted{$node} = 1;
	}

	my @nodelist        = keys(%reboots);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
	sleep(2);
    }

    #
    # Fire off the reconfigs.
    #
    if (keys(%reconfigs)) {
	my @nodelist        = keys(%reconfigs);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'reconfig'}  = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
    }

    #
    # Wait for all of the children to exit. We look at the $pid to know if
    # command failed/ended immediately; otherwise we need to wait on it.
    # For any failures, record the node failures for later so that we do
    # not wait for them needlessly.
    #
    while (@children) {
	my ($pid, $waitfunc, $listref, $hashref) = @{ pop(@children) };

	# This is not likely to happen.
	next
	    if ($pid == 0);

	if ($pid > 0) {
	    next
		if (! &$waitfunc($pid));
	}
	
	#
	# Failure. Record the failures for later. If the $pid<0 then the
	# entire list failed. Otherwise, have to scan the return hash to
	# find the failures.
	#
	my @nodelist = ();
	
	if ($pid < 0) {
	    @nodelist = @{ $listref };
	}
	else {
	    foreach my $node (keys(%{ $hashref })) {
		push(@nodelist, $node)
		    if ($hashref->{$node});
	    }
	}

	#
	# These errors are unusual enough that we do not want to retry
	# or keep going even if canfail is set. Better to stop and let
	# someone look at what happened.
	#
	$noretry = 1;

	foreach my $node (@nodelist) {
	    tbnotice "Not waiting for $node since its reload/reboot failed!";
	    $failed++;
	    add_failed_node_reload($node);
	    delete($nodes{$node});

	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();
	}
    }
}
TBDebugTimeStamp("rebooting/reloading finished");

#
# XXX declare the inner plab nodes as UP since we won't be hearing from
# them again (they are talking only to their PLC).
#
if ($plabinelab) {
    my @plabnodes = ();
    TBExptPlabInElabNodes($pid, $eid, \@plabnodes);
    foreach my $node (@plabnodes) {
	if (exists($nodes{$node})) {
	    tbnotice "Not waiting for emulated plab node $node";
	    SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	    TBSetNodeEventState($node, TBDB_NODESTATE_ISUP());
	    delete($nodes{$node});
	}
    }
}

#
# Remaining nodes we need to wait for. Why do we wait in the face of errors
# above? So that they enter a reasonably known state before we try to tear
# things down. Otherwise we could end up power cycling nodes a lot more often.
# This should probably be handled in other ways, say via stated or the alloc
# state machine.
#
my @nodelist = keys(%nodes);

#
# Now let's wait for them to come back alive. Set up a retry list, though,
# so that we can give each node at least one second chance. This avoids
# pointless experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

my %retries;
my %waitstart;
foreach my $node ( @nodelist ) {
    $retries{$node} = 1;
    $waitstart{$node} = time;
}

#
# List of nodes to inform the user and testbed-ops about in the event
# of failures.  We coalesce the nodes here so we only sent one message.
#
my @informuser = ();
my @informtbopswarn = ();
my @informtbopsfatal = ();

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};
    my $actual_state;
    my $waittime = (60 * 7);	# The default.

    # Compute actual waittime.
    if (defined($bios_waittime{$node_types{$node}}) &&
	defined($reboot_waittime{$osids{$node}})) {
	$waittime = ($bios_waittime{$node_types{$node}} +
		     $reboot_waittime{$osids{$node}}) * 2;
    }

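    # A zero return from TBNodeStateWait() means the node reached one of the
    # listed states within $waittime; $actual_state records which one.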
    if (!TBNodeStateWait($node, $wstart, $waittime, \$actual_state,
			 (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_ISUP))) {
	if ($actual_state eq TBDB_NODESTATE_TBFAILED) {
	    tbwarn "$node reported a TBFAILED event; not retrying";
	    $retries{$node} = 0;
	    goto tbfailed;
	}
	print "$node is alive and well\n";
	SetNodeBootStatus($node, NODEBOOTSTATUS_OKAY);
	TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	next;
    }

    #
    # Check for cancelation. Do not want to retry the reboots if the
    # swap was canceled.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);

	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	}
    }

    if ($retries{$node} && !($canceled || $noretry)) {
	$retries{$node} -= 1;

	tbnotice "Rebooting $node and waiting again ...";

	if (system("$nodereboot $node") == 0) {
	    push(@nodelist, $node);
	    $waitstart{$node} = time;
	    next;
	}
	# Fall through on failure.
    }

    tbwarn "$node may be down. This has been reported to testbed-ops.";

  tbfailed:
    SetNodeBootStatus($node, NODEBOOTSTATUS_FAILED);

    if ($canfail{$node} && !($canceled || $noretry)) {
	push(@informuser, $node);
	add_failed_node_nonfatal($node);
	tbnotice "Continuing with experiment setup anyway ...";
	next;
    }

    #
    # If the user has picked a standard image and it fails to boot,
    # something is wrong, so reserve it to hwdown experiment. If the
    # image belongs to the user, then we assume its the image at fault,
    # and allow it to be returned to the pool (caller, tbswap will end
    # doing the nfree on nodes with a DOWN allocstate).
    #
    my $pidofosid;
    if (! TBOsidToPid($osids{$node}, \$pidofosid) ||
	$pidofosid eq TBOPSPID()) {
	MarkNodeDown($node);
	TBSetNodeLogEntry($node, $user_uid, TB_DEFAULT_NODELOGTYPE(),
			  "'Moved to hwdown by os_setup; ".
			  "failed to boot image for osid " . $osmap{$node} .
			  " in $pid/$eid'");
	push(@informtbopsfatal, $node);
    } else {
	push(@informtbopswarn, $node);
    }
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

    $failed++;
    add_failed_node_fatal($node);
}

#
# Spam time!  Send mail to the user and testbed-ops about failures.
#
my $count = scalar(@informuser);
if ($count > 0) {
    SENDMAIL($user_email_to, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informuser) . "\n".
	     "in pid/eid $pid/$eid appear to be dead.\n\n".
	     "Your experiment will continue to run since these failures\n".
	     "are nonfatal, although you might encounter other problems\n".
	     "if your experiment depends explicitly on these nodes.\n".
	     "You should terminate this experiment if it cannot ".
	     "tolerate these failures.\n\n".
	     "Testbed Operations has also been notified.\n\n".
	     "Thanks\n".
	     "Testbed Operations\n",
	     0,
	     "Cc: $TBOPS");
}
$count = scalar(@informtbopsfatal);
if ($count > 0) {
    SENDMAIL($TBOPS, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informtbopsfatal) . "\n".
	     "in pid/eid $pid/$eid appear to be dead.\n\n".
	     "The nodes have been taken out of the pool until this matter ".
	     "is resolved.\n");
}
$count = scalar(@informtbopswarn);
if ($count > 0) {
    SENDMAIL($TBOPS, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informtbopswarn) . "\n".
	     "in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
	     "The nodes have been freed.\n");
}

TBDebugTimeStamp("Local node waiting finished");

#
# Now deal with virtual nodes.
#
# We do this in a sub script since nodes are not owned by the user
# and so must be setuid root so that ssh will work.
#
my @vnodelist = keys(%vnodes);

#
# Set the allocstate for the local vnodes that were successfully rebooted
# and came to ISUP above. These do not need to be setup again! We move
# them to RES_READY, so vnode_setup will ignore them. If they fail to
# hit ISUP, we will move them to DOWN so that vnode_setup will ignore
# them again, in the teardown phase.
#
# Note, we do this even if there were failures above, since the teardown
# phase is going to happen, and we want vnode_setup to know which nodes
# came up with phynodes okay (need to be torndown) and which ones never
# had the chance (no need to teardown). Think swapmod, which does teardown
# in the ACTIVATING state.
#
foreach my $vnode (@vnodelist) {
    my $pnode  = $vnode2pnode{$vnode};

    # Default retry count.
    $retries{$vnode} = 0;

    # Remote node, always does setup.
    next
	if (!exists($nodes{$pnode}));
    
    # Pnode was neither rebooted nor reconfigured, so leave allocstate alone
    # for vnode_setup (has to be done).
    next
	if (!exists($rebooted{$pnode}) && !exists($reconfigs{$pnode}));

    if ($nodeAllocStates{$pnode} eq TBDB_ALLOCSTATE_RES_READY()) {
	TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$vnode} = TBDB_ALLOCSTATE_RES_READY();
    }
}

#
# Reset the failure lists. See above.
#
@informuser = ();
@informtbopswarn = ();
@informtbopsfatal = ();

#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
if ($canceled && @vnodelist) {
    tbnotice "Skipping virtual node setup since swapin was canceled!";
}
elsif ($failed && @vnodelist) {
    tbnotice "Skipping virtual node setup since there were previous ".
	"failures!";
}
elsif (@vnodelist) {
    my @retry_list = ();

    TBDebugTimeStamp("Setting up virtual nodes");
    print "Setting up virtual testbed nodes ...\n";

    # Wait for plab vnode setup to finish if it's running.
    if ($plab_setup_pid > 0) {
        my $kid = waitpid($plab_setup_pid,0);
        if ($kid == $plab_setup_pid) {
            if ($?) {
                die_noretry("Failed to setup plab vnodes.");
            }
        } else {
            die_noretry("Error waiting for plab vnode to finish.");
        }
    }

  retry:
    TBDebugTimeStamp("Setting up virtual nodes");
    # Only fire off local (jailed) nodes here.  Plab vnode setup has
    # already been started at this point.
    system("$vnode_setup -j $pid $eid");
    if ($?) {
	die_noretry("Vnode setup failed!");
    }
    print "Waiting for virtual testbed nodes to finish setting up ...\n";
    TBDebugTimeStamp("Virtual node waiting started");

    foreach my $node (@vnodelist) {
	$waitstart{$node} = time;
    }
    @vnodelist = sort(@vnodelist);

    while ( @vnodelist ) {
	my $node   = shift(@vnodelist);
	my $pnode  = $vnode2pnode{$node};
	my $islocal= exists($nodes{$pnode});
	my $wstart = $waitstart{$node};
	my $curallocstate;
	my $actual_state;
	my $maxwait;