#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;
require 'ctime.pl';
use POSIX ":sys_wait_h";

#
# Reboot the nodes in an experiment. The nodes table will already contain
# all the information. This script deals with possible disk reloading,
# rebooting, and waiting for nodes to come back alive before allowing
# experiment creation to continue.
#
# TODO: Reload disk images.
#
# usage: os_setup [-d] <pid> <eid>
#
# errorcode:  0 - all reboots succeeded.
#             1 - some/all reboots failed; retry may help.
#            -1 - failure; retry is inappropriate.
#
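# Example invocation (the project and experiment names here are hypothetical):
#
#   os_setup -d myproj myexpt
#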

sub usage()
{
    print STDERR "Usage: os_setup [-d] <pid> <eid>\n";
    exit(-1);
}
my  $optlist = "d";

#
# Configure variables
#
my $TB		= "@prefix@";
my $DBNAME	= "@TBDBNAME@";
my $TBOPS       = "@TBOPSEMAIL@";
my $TESTMODE    = @TESTMODE@;
my $TFTP	= "/tftpboot";
my $PGENISUPPORT= @PROTOGENI_SUPPORT@;

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libreboot;
use libosload;
use libtestbed;
use libtblog;
use libArchive;
use Template;
use NodeType;
use Experiment;
use OSinfo;
use User;
if ($PGENISUPPORT) {
    require libGeni;
}

TBDebugTimeStampsOn();

my $vnode_setup = "$TB/sbin/vnode_setup";
my $osselect    = "$TB/bin/os_select";
my $nodereboot  = "$TB/bin/node_reboot";
my $elab_setup  = "$TB/sbin/elabinelab";
my $dbg		= 0;
my $failed      = 0;
my $noretry     = 0;
my $failedvnodes= 0;
my $failedplab  = 0;
my $failedgeni  = 0;
my $canceled    = 0;
my %nodes       = ();
my %vnodes      = ();
my %sharednodes = ();
my %vnodephosts = ();
my %vnode2pnode = ();
my %pnodevcount = ();
my %plabvnodes  = ();
my %geninodes   = ();
my %osids       = ();
my %osmap       = ();
my %canfail     = ();
my %bios_waittime   = ();	# Indexed by node_type.
my %reboot_waittime = ();	# Indexed by osid.
my %node_types  = ();		# Indexed by node_id.
my %nodeAllocStates = ();	# Indexed by node_id.
my %vname = ();                 # Indexed by node_id.
my $plab_setup_pid;		# Run plab setup in parallel.
my $geni_setup_pid;		# Run geni setup in parallel.

#
# This variable keeps track of the failed nodes of all types.
#   values = ['boot'|'reload', 'fatal'|'nonfatal']
my %failed_nodes = ();
sub add_failed_node_fatal($)     {$failed_nodes{$_[0]} = ['boot', 'fatal']}
sub add_failed_node_nonfatal($)  {$failed_nodes{$_[0]} = ['boot', 'nonfatal']}
sub add_failed_node_reload($)    {$failed_nodes{$_[0]} = ['reload', 'fatal']}
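# For example (node name hypothetical), add_failed_node_reload('pc17')
# records $failed_nodes{'pc17'} = ['reload', 'fatal'].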

my @all_nodes; # list of all nodes before any are deleted from %nodes

#
# Ah, Frisbee works, so let's do auto reloading for nodes that do not have
# the proper OS loaded on them. This will be a hash of lists; for each
# imageid, a list of the nodes to pass to os_load for that imageid.
#
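# As a sketch (the imageid and node names below are hypothetical):
#
#   $reloads{'FBSD-STD'} = [ 'pc1', 'pc7' ];
#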
my %reloads     = ();
my %reboots	= ();
my %reconfigs	= ();
my %rebooted    = ();
my $doautoload  = 1;
my $dolastload  = 1;

# Protos
sub SetupReload($$$);
sub FirewallSetup($);
sub os_setup_one($$$;$);
sub KillChildren();
				  
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output


#
# Used to die with a -1 return code, to indicate to caller (tbswap)
# that the failure is not likely to be fixed with another attempt.
#
sub die_noretry($;$)
{
    my $parms = {};
    $parms = shift if ref $_[0] eq 'HASH';
    my ($mesg) = shift;
    tberror($parms, $mesg);
    KillChildren();
    exit(-1);
}
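
# Callers may also pass a tblog attribute hash as the optional first argument,
# for example (attribute values shown are illustrative):
#
#   die_noretry({type => 'primary', severity => SEV_ERROR}, "some message");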

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 2) {
    usage();
}
if (defined($options{"d"})) {
    $dbg = 1;
}

my $pid = $ARGV[0];
my $eid = $ARGV[1];

#
# Untaint args.
#
if ($pid =~ /^([-\@\w]+)$/) {
    $pid = $1;
}
else {
    die_noretry("Bad data in pid: $pid.");
}
if ($eid =~ /^([-\@\w]+)$/) {
    $eid = $1;
}
else {
    die_noretry("Bad data in eid: $eid.");
}

#
# Verify user and get his DB uid and other info for later.
#
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    die_noretry("You ($UID) do not exist!");
}
my $user_uid      = $this_user->uid();
my $user_name     = $this_user->name();
my $user_email    = $this_user->email();
my $user_email_to = "$user_name <$user_email>";

#
# Check permission.
#
my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    die_noretry("Could not find experiment object for $pid/$eid!");    
}
if (!$experiment->AccessCheck($this_user, TB_EXPT_MODIFY)) {
    die_noretry("You do not have permission to swap this experiment!");
}
TBDebugTimeStamp("os_setup started");

#
# See if the experiment is firewalled
#
my $firewall;
my $firewalled = $experiment->IsFirewalled(\$firewall);
my $firewallimage;

#
# Ditto ElabinElab.
#
my $elabinelab = $experiment->elabinelab();

#
# Ditto PlabinElab.
#
my $plabinelab = 0;
my $plcnode;
my $plcimage;
if (TBExptPlabInElabPLC($pid, $eid, \$plcnode)) {
    $plabinelab = 1;
}

#
# Get the set of nodes, as well as the nodes table information for them.
#
my $db_result =
    DBQueryFatal("select n.*,l.pid,r.vname,r.sharing_mode from reserved as r ".
		 "left join nodes as n on n.node_id=r.node_id ".
		 "left join last_reservation as l on n.node_id=l.node_id ".
		 "where r.pid='$pid' and r.eid='$eid'");

if ($db_result->numrows < 1) {
    print "There are no nodes in experiment '$eid' in project '$pid'.\n";
    exit 0;
}

while (my %row = $db_result->fetchhash()) {
    my $node      = $row{'node_id'};
    my $osid      = $row{'def_boot_osid'};
    my $type      = $row{'type'};
    my $jailnode  = $row{'jailflag'};
    my $failmode  = $row{'failureaction'};
    my $vname     = $row{'vname'};
    my $typeinfo  = NodeType->Lookup($type);
    my $class     = $typeinfo->class();
    my $subnode   = $typeinfo->issubnode();
    my $virtnode  = $typeinfo->isvirtnode();
    my $sharednode = defined($row{'sharing_mode'}) 
	&& $row{'sharing_mode'} eq 'using_shared_local';
    my $isremote  = $typeinfo->isremotenode();
    my $isgeninode= $typeinfo->isfednode();
    my $imageable = $typeinfo->imageable();
    my $plabnode  = $typeinfo->isplabdslice();
    my $bios_wait = $typeinfo->bios_waittime();
    my $bootpath  = 0;
    my $osinfo    = undef;

    if ($isgeninode) {
	#
	# Geni nodes are currently a lot like plab nodes, but that will
	# change later.
	#
	if ($virtnode) {
	    $vnodes{$node} = $virtnode;
	    $sharednodes{$node} = $sharednode;
	}
	else {
	    $nodes{$node}  = $node;
	}
	$geninodes{$node}  = 1;
    }
    elsif ($virtnode) {
	#
	# Virtual nodes are special. Jailed vnodes can do quite a bit,
	# and so run them through the checks below.
	#
	$vnodes{$node} = ($jailnode || $plabnode || $isremote);
	$sharednodes{$node} = $sharednode;
	$plabvnodes{$node} = $plabnode;
	if (! $jailnode && ! $plabnode && !$isremote) {
	    next;
	}
    }
    elsif ($subnode && !$imageable) {
	print "Will skip subnode $node ISUP wait.\n";
    }
    else {
	my $nodeAllocState;
	TBGetNodeAllocState( $node, \$nodeAllocState );
	$nodes{$node}  = $node;
	$nodeAllocStates{$node} = $nodeAllocState;
	if ($nodeAllocState eq TBDB_ALLOCSTATE_RES_RECONFIG()) {
	    # Terrible use of state machine.
	    $reconfigs{$node} = 1;
	}
	elsif ($nodeAllocState ne TBDB_ALLOCSTATE_RES_READY()) {
	    # only reboot node if assign_wrapper just pulled it into expt.
	    # (e.g. it isnt ALLOCSTATE_RES_READY)
	    $reboots{$node} = 1;
	}
    }

    $osids{$node} = $osid;
    if ($osid) {
	$osinfo = OSinfo->Lookup($osid);

	die_noretry("Could not map $osid to its object!")
	    if (!defined($osinfo));
    }
    $osmap{$node}         = $osinfo;
    $bios_waittime{$type} = (defined($bios_wait) ? $bios_wait : 0);
    $node_types{$node}    = $type;
    $vname{$node}         = $vname;

    #
    # Make sure the files specified in the paths exist. We mount the
    # user tftp directory on the boss node, so we can ignore the IP address,
    # and just check the path directly.
    #
    if (defined($row{'def_boot_path'})) {
	my $path = $row{'def_boot_path'};

	if ($path ne "") {
	    my $ip   = 0;

	    # Split out IP address if it exists.
	    if ($path =~ /^([0-9\.]+):(\/.*)$/) {
		$ip   = $1;
		$path = $2;
	    }
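	    # For example (address and file name illustrative), a path of
	    # "198.51.100.5:/tftpboot/oskit.gz" splits into $ip = "198.51.100.5"
	    # and $path = "/tftpboot/oskit.gz".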

	    # Path must begin with $TFTP
	    if (! ($path =~ /^$TFTP\//)) {
		die_noretry("File $path for node $node must reside in $TFTP");
	    }

	    if (! -f $path) {
		die_noretry("File $path for node $node does not exist!");
	    }
	    $bootpath = 1;
	}
    }
    if (defined($row{'next_boot_path'})) {
	my $path = $row{'next_boot_path'};

	if ($path ne "") {
	    my $ip   = 0;

	    # Split out IP address if it exists.
	    if ($path =~ /^([0-9\.]+):(\/.*)$/) {
		$ip   = $1;
		$path = $2;
	    }

	    # Path must begin with $TFTP
	    if (! ($path =~ /^$TFTP\//)) {
		die_noretry("File $path for node $node must reside in $TFTP");
	    }

	    if (! -f $path) {
		die_noretry("File $path for node $node does not exist!");
	    }
	}
    }

    #
    # XXX - Ditto for RPMs.
    #
    foreach my $rpm (split(":", $row{'rpms'})) {
	if (! -f $rpm) {
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'rpm', $rpm, $node]},
			"RPM $rpm for node $node does not exist!");
	}
    }

    #
    # XXX - Ditto for tarfiles.
    #
    foreach my $tarspec (split(":", $row{'tarballs'})) {
	my ($dir, $tar) = split(" ", $tarspec);

	if (! -f $tar) {
	    die_noretry({type => 'primary', severity => SEV_ERROR,
			 error => ['file_not_found', 'tar', $tar, $node]},
			"Tarfile $tar for node $node does not exist!");
	}
    }

    #
    # If the virtnode is running a subOS, we set $imageable because it 
    # really is going to be reloaded... even though virtnode types are not
    # typically imageable.
    #
    if ($virtnode && defined($osinfo) && $osinfo->def_parentosid()) {
	$imageable = 1;
    }

    #
    # If there is a path specified, then we don't worry anymore about it.
    # The user must know what is going on. The OSID might have a path
    # associated with it, which means the same thing; we don't worry about
    # it.
    #
    if (!$bootpath && (!$virtnode || ($virtnode && $imageable)) 
	&& !$isgeninode && $imageable) {
	#
	# These checks are not necessary if the front end and web page
	# are doing the right thing, but let's be careful anyway.
	#
	if (! $osinfo) {
	    die_noretry("$node has no bootpath and no def_boot_osid set!");
	}

	#
	# If there is an actual path, it's an OSKit kernel, not an image.
	#
	if (! defined($osinfo->path()) || $osinfo->path() eq "") {
	    #
	    # Not an OSKit kernel.
	    # Make sure this OSID is actually loaded on the machine.
	    #
	    my $p_result =
		DBQueryFatal("select * from partitions ".
			     "where node_id='$node' and osid='$osid' ".
			     "order by partition");

	    #
	    # If not loaded, then see if the user was looking for the generic
	    # name of the OS that is loaded.
	    #
	    if ($p_result->numrows == 0) {
		#
		# Check to see if a non-specific version was specified.
		#
		if (!defined($osinfo->version()) || $osinfo->version() eq "") {
		    #
		    # A non-specific version. There needs to be a way to
		    # map it to another osid.
		    #
		    if (! defined($osinfo->nextosid())) {
			die_noretry("No mapping for $osinfo ($node)!");
		    }

		    my $nextosid = TBResolveNextOSID($osid, $pid, $eid);
		    if (!defined($nextosid)) {
			die_noretry("No mapping for $osinfo ($node)!");
		    }
		    my $nextosinfo = OSinfo->Lookup($nextosid);

		    die_noretry("Could not map $osid to its object!")
			if (!defined($nextosinfo));
		
		    #
		    # See if the nextosid is already on the disk. If not,
		    # it needs to be loaded.
		    #
		    my $o_result =
			DBQueryFatal("select osid from partitions as p ".
				     "where p.node_id='$node' and ".
				     "      p.osid='$nextosid'");

		    if (! $o_result->numrows) {
			#
			# User wants a specific version of an OS, but it's not
			# loaded on the machine.
			#
			print "Mapping $osinfo on $node to $nextosinfo ".
			    "and setting up a reload.\n";

			SetupReload($node, $nextosinfo, $type);
			$osids{$node} = $nextosid;
			$osmap{$node} = $nextosinfo;
		    }
		    else {
			#
			# Already loaded.
			#
			print "Mapping $osinfo on $node to $nextosinfo.\n";

			if ($dolastload &&
			    defined($row{'pid'}) && $row{'pid'} ne $pid) {
			    SetupReload($node, $nextosinfo, $type);
			}
			else {
			    system("$osselect $nextosid $node") and
				die_noretry("Could not set boot OS to ".
					    "$nextosid for $node");
			}
			$osids{$node} = $nextosid;
			$osmap{$node} = $nextosinfo;
		    }
		}
		else {
		    #
		    # User wants a specific version of an OS, but it's not
		    # loaded on the machine.
		    #
		    SetupReload($node, $osinfo, $type);
		}
	    }
	    else {
		#
		# OSID is loaded, but might need to be cleaned.
		#
		if ($dolastload &&
		    defined($row{'pid'}) && $row{'pid'} ne $pid) {
		    SetupReload($node, $osinfo, $type);
		}
	    }
	}
    }

    #
    # Set the canfail bit.
    #
    $canfail{$node} = (($failmode eq NODEFAILMODE_FATAL()) ? 0 : 1);
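    # That is, canfail is true unless the node's failureaction is fatal.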

    #
    # Set the reboot waittime from the osid now that we have it
    # finalized.
    #
    $osid = $osids{$node};
    if (!exists($reboot_waittime{$osid})) {
	$reboot_waittime{$osid} = TBOSIDRebootWaittime($osid);
    }

    print STDERR "$node - $osmap{$node} - $canfail{$node}\n"
	if $dbg;
}

@all_nodes = (keys %nodes, keys %vnodes);

#
# Perform some prechecks on the images.  This will also have the
# effect of caching the info for the images for later use
#
# FIXME: WRITEME
#    Maybe this isn't a good idea since it will also attempt to fetch
#    the image from the real boss in an inner-emulab.  This should
#    really be done in parallel.

#
# Collect some info about vnodes.
#
foreach my $vnode (keys(%vnodes)) {
    my $jailed = $vnodes{$vnode};
    my $pnode;

#    print "$vnode, $jailed\n";

    if (! $jailed) {
	next;
    }

    if (! TBPhysNodeID($vnode, \$pnode)) {
	die_noretry("Cannot determine phys_nodeid for $vnode!");
    }

#    print "$vnode, $jailed, $pnode\n";

    #
    # Count up the number of jailed nodes on this pnode, and add the
    # mapping. We use this below for determining how long to wait for
    # a particular vnode.
    #
    $pnodevcount{$pnode} = 0
	if (!defined($pnodevcount{$pnode}));
    $pnodevcount{$pnode}++;
    $vnode2pnode{$vnode} = $pnode;

    if (!exists($nodes{$pnode})) {
	#
	# Typical on remote nodes; we do not allocate the underlying
	# phys node to the experiment.
	#
	next;
    }

    # Nothing else to do for local jail nodes at this time ...
}

#
# Setup the firewall first.  Once it is up we can continue with the
# remaining nodes.
#
# There is very little point in setting up the other nodes at the same time
# as they will not be able to PXE boot until the firewall is up.  We could
# fire them off a little early in hopes of overlapping any BIOS boot time
# with the last stages of the firewall setup, but it probably isn't worth
# the complexity (and would not work with nodes for which "reboot" means
# "fall out of PXEWAIT and boot").
#
# Note that we formerly did just do them all at once and let the nodes
# continually PXE-timeout and reboot until the firewall came up.  But that
# can actually take longer than what we do now, if a node happened to
# timeout and reboot just as the firewall came up (i.e., we would have to
# wait an extra BIOS-reboot cycle, which can be 90 seconds or more).
#
if ($firewalled) {
    my $node = $firewall;

    TBDebugTimeStamp("rebooting/reloading firewall");
    if (!FirewallSetup($node)) {
	tbwarn "Firewall node $node failed to boot. ".
	    "This has been reported to testbed-ops.";

	# XXX do we need to set NODEBOOTSTATUS_FAILED here?

	#
	# We assume that firewall node images are "standard" here,
	# and whine to tbops.
	#
	Node::MarkAsDown($node);
	TBSetNodeLogEntry($node, $user_uid, TB_DEFAULT_NODELOGTYPE(),
			  "'Moved to hwdown by os_setup; ".
			  "failed to boot image for osid " . $osmap{$node} .
			  " in $pid/$eid'");
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid appears to be dead.\n\n".
		 "The node has been taken out of the pool until this matter ".
		 "is resolved.\n",
		 $user_email_to);

	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  Firewall setup may have taken awhile.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # remove it from the nodelist
    #
    delete $nodes{$node};
}

#
# Likewise, setup a PLC node before other plabinelab nodes.
# XXX right now, we setup PLC before ANY other node, whether it is
# part of the inner plab or not.
#
if ($plabinelab) {
    my $node = $plcnode;

    TBDebugTimeStamp("rebooting/reloading PLC node");
    if (!os_setup_one($node, $plcimage, "PLC", 10*60)) {
	tbwarn "PLC node $node failed to boot. ".
	    "This has been reported to testbed-ops.";
	SENDMAIL($TBOPS, "1 node is down",
		 "Node:\n".
		 "  $node\n".
		 "in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
		 "The nodes have been freed.\n",
		 $user_email_to);
	$failed++;
	add_failed_node_fatal($node);
	goto tballdone;
    }

    #
    # Check for cancelation.  PLC setup may have taken awhile.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);
	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	    goto tballdone;
	}
    }

    #
    # remove it from the nodelist
    #
    delete $nodes{$node};
}

#
# Start up plab vnode setup now since it doesn't depend on
# physical node readiness.
#
if (grep($_, values(%plabvnodes))) {
    my $plabnumbatch = TBGetSiteVar("plab/setup/vnode_batch_size");
    my $plabwait     = TBGetSiteVar("plab/setup/vnode_wait_time");
    TBDebugTimeStamp("Starting PlanetLab vnode setup.");
    if (!($plab_setup_pid = fork())) { 
        exec("$vnode_setup -p -n $plabnumbatch -w $plabwait $pid $eid") 
            or die_noretry("Exec failed.");
    } elsif ($plab_setup_pid == -1) {
        die_noretry("Plab fork failed.");
    }
}

#
# Ditto for Geni nodes. Parent keeps going.
#
if (keys(%geninodes)) {
    TBDebugTimeStamp("Starting Geni setup.");

    $geni_setup_pid = fork();
    if (! $geni_setup_pid) {
	TBdbfork();	# So we get the event system fork too ...

	if (libGeni::StartSlivers($experiment, $this_user, 0, $dbg)) {
	    print STDERR "*** Could not start Geni slivers\n";
	    exit(-1);
	}
	TBDebugTimeStamp("Geni slivers have been started.");
	exit(0);
    }
    elsif ($geni_setup_pid == -1) {
        die_noretry("Geni fork failed.");
    }
    # Give it a chance to get going.
    sleep(1);
}

#
# We need to issue the reboots and the reloads in parallel.
#
TBDebugTimeStamp("rebooting/reloading nodes started");
if (!$TESTMODE) {
    my @children = ();

    foreach my $imageid ( keys(%reloads) ) {
	my @nodelist = @{ $reloads{$imageid} };

	my %nodeflags = ();

	foreach my $node (@nodelist) {
	    #
	    # vnodes only get rebooted if this is a modify and we need to
	    # reload them (otherwise they will get rebooted because of presence
	    # in %reboots).
	    #
	    if (defined($vnodes{$node})) {
		my $vstate;
		TBGetNodeAllocState($node,\$vstate);
		if ($vstate eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
		    $nodeflags{$node}{'noreboot'} = 1;
		}
	    }

	    #
	    # osload should not wait for shared vnodes.  We need vnode_setup
	    # to boot/reboot them since the underlying pnode won't be booting.
	    # So for them, osload just sets up the reload and finishes.
	    #
	    if (defined($vnodes{$node}) && $sharednodes{$node} == 1) {
		$nodeflags{$node}{'noreboot'} = 1;
		$nodeflags{$node}{'nowait'} = 1;
	    }

	    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_RELOAD() );
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_RELOAD();
	    # No point in reboot/reconfig obviously, since node will reboot!
	    delete $reboots{$node};
	    delete $reconfigs{$node};
	    $rebooted{$node} = 1;
	}

	my %reload_args     = ();
	my $reload_failures = {};

	$reload_args{'debug'}     = $dbg;
	$reload_args{'asyncmode'} = 1;
	$reload_args{'imageid'}   = $imageid;
	$reload_args{'nodelist'}  = [ @nodelist ];
	$reload_args{'nodeflags'} = \%nodeflags;

	my $pid = osload(\%reload_args, $reload_failures);
	push(@children, [ $pid, \&osload_wait,
			  [ @nodelist ], $reload_failures ]);
	sleep(5);
    }

    #
    # Fire off the reboots.
    # 
    if (keys(%reboots)) {
	foreach my $node (keys(%reboots)) {
	    if ($nodeAllocStates{$node} eq TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_CLEAN());
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_CLEAN();
	    } else {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_REBOOT_DIRTY());
		$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_REBOOT_DIRTY();
	    }
	    # See below, needed for vnode_setup.
	    $rebooted{$node} = 1;
	}

	my @nodelist        = keys(%reboots);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
	sleep(2);
    }

    #
    # Fire off the reconfigs.
    #
    if (keys(%reconfigs)) {
	my @nodelist        = keys(%reconfigs);
	my %reboot_args     = ();
	my $reboot_failures = {};

	$reboot_args{'debug'}     = $dbg;
	$reboot_args{'waitmode'}  = 0;
	$reboot_args{'asyncmode'} = 1;
	$reboot_args{'reconfig'}  = 1;
	$reboot_args{'nodelist'}  = [ @nodelist ];

	my $pid = nodereboot(\%reboot_args, $reboot_failures);
	push(@children, [ $pid, \&nodereboot_wait,
			  [ @nodelist ], $reboot_failures ]);
    }

    #
    # Wait for all of the children to exit. We look at the $pid to know if
    # command failed/ended immediately; otherwise we need to wait on it.
    # For any failures, record the node failures for later so that we do
    # not wait for them needlessly.
    #
    while (@children) {
	my ($pid, $waitfunc, $listref, $hashref) = @{ pop(@children) };

	# This is not likely to happen.
	next
	    if ($pid == 0);

	if ($pid > 0) {
	    next
		if (! &$waitfunc($pid));
	}
	
	#
	# Failure. Record the failures for later. If the $pid<0 then the
	# entire list failed. Otherwise, have to scan the return hash to
	# find the failures.
	#
	my @nodelist = ();
	
	if ($pid < 0) {
	    @nodelist = @{ $listref };
	}
	else {
	    foreach my $node (keys(%{ $hashref })) {
		push(@nodelist, $node)
		    if ($hashref->{$node});
	    }
	}

	#
	# These errors are unusual enough that we do not want to retry
	# or keep going even if canfail is set. Better to stop and let
	# someone look at what happened.
	#
	$noretry = 1;

	foreach my $node (@nodelist) {
	    tbnotice "Not waiting for $node since its reload/reboot failed!";
	    $failed++;
	    add_failed_node_reload($node);
	    delete($nodes{$node});

	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();
	}
    }
}
TBDebugTimeStamp("rebooting/reloading finished");

#
# XXX declare the inner plab nodes as UP since we won't be hearing from
# them again (they are talking only to their PLC).
#
if ($plabinelab) {
    my @plabnodes = ();
    TBExptPlabInElabNodes($pid, $eid, \@plabnodes);
    foreach my $node (@plabnodes) {
	if (exists($nodes{$node})) {
	    tbnotice "Not waiting for emulated plab node $node";
	    Node::SetBootStatus($node, NODEBOOTSTATUS_OKAY);
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
	    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	    TBSetNodeEventState($node, TBDB_NODESTATE_ISUP());
	    delete($nodes{$node});
	}
    }
}

#
# Remaining nodes we need to wait for. Why do we wait in the face of errors
# above? So that they enter a reasonably known state before we try to tear
# things down. Otherwise we could end up power cycling nodes a lot more often.
# This should probably be handled in other ways, say via stated or the alloc
# state machine.
#
my @nodelist = keys(%nodes);

#
# Now let's wait for them to come back alive. Set up a retry list though,
# so that we can give each node at least a second chance. Avoids pointless
# experiment failures.
#
if (@nodelist) {
    print "Waiting for local testbed nodes to finish rebooting ...\n";
}

my %retries;
my %waitstart;
foreach my $node ( @nodelist ) {
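    # Geni nodes get no local reboot retry; everything else gets one more try.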
    $retries{$node} = (exists($geninodes{$node}) ? 0 : 1);
    $waitstart{$node} = time;
}

#
# List of nodes to inform the user and testbed-ops about in the event
# of failures.  We coalesce the nodes here so we only send one message.
#
my @informuser = ();
my @informtbopswarn = ();
my @informtbopsfatal = ();

TBDebugTimeStamp("Local node waiting started");
while ( @nodelist ) {
    my $node   = shift(@nodelist);
    my $wstart = $waitstart{$node};
    my $actual_state;
    my $waittime = (60 * 7);	# The default.

    # Compute actual waittime.
    if (defined($bios_waittime{$node_types{$node}}) &&
	defined($reboot_waittime{$osids{$node}})) {
	$waittime = ($bios_waittime{$node_types{$node}} +
		     $reboot_waittime{$osids{$node}}) * 2;
    }
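    # For instance (numbers purely illustrative): a node type with a 60 second
    # BIOS wait and an OS with a 180 second reboot_waittime would be given
    # (60 + 180) * 2 = 480 seconds before we give up on it.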

    if (!TBNodeStateWait($node, $wstart, $waittime, \$actual_state,
			 (TBDB_NODESTATE_TBFAILED, TBDB_NODESTATE_ISUP))) {
	if ($actual_state eq TBDB_NODESTATE_TBFAILED) {
	    tbwarn "$node reported a TBFAILED event; not retrying";
	    $retries{$node} = 0;
	    goto tbfailed;
	}
	print "$node is alive and well\n";
	Node::SetBootStatus($node, NODEBOOTSTATUS_OKAY);
	TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_RES_READY() );
	$nodeAllocStates{$node} = TBDB_ALLOCSTATE_RES_READY();
	next;
    }

    #
    # Check for cancelation. Do not want to retry the reboots if the
    # swap was canceled.
    #
    if (!$canceled) {
	TBGetCancelFlag($pid, $eid, \$canceled);

	if ($canceled) {
	    tbnotice({cause => 'canceled', severity => SEV_IMMEDIATE,
		      error => ['cancel_flag']},
		     "Swap canceled; will terminate os_setup early!");
	}
    }

    if ($retries{$node} && !($canceled || $noretry)) {
	$retries{$node} -= 1;

	tbnotice "Rebooting $node and waiting again ...";

	if (system("$nodereboot $node") == 0) {
	    push(@nodelist, $node);
	    $waitstart{$node} = time;
	    next;
	}
	# Fall through on failure.
    }

    tbwarn "$node may be down. This has been reported to testbed-ops.";

  tbfailed:
    Node::SetBootStatus($node, NODEBOOTSTATUS_FAILED);

    if ($canfail{$node} && !($canceled || $noretry)) {
	push(@informuser, $node);
	add_failed_node_nonfatal($node);
	tbnotice "Continuing with experiment setup anyway ...";
	next;
    }

    #
    # If the user has picked a standard image and it fails to boot,
    # something is wrong, so reserve it to the hwdown experiment. If the
    # image belongs to the user, then we assume it's the image at fault,
    # and allow it to be returned to the pool (the caller, tbswap, will end
    # up doing the nfree on nodes with a DOWN allocstate).
    #
    my $pidofosid;
    if (!exists($geninodes{$node}) &&
	(! TBOsidToPid($osids{$node}, \$pidofosid) ||
	 $pidofosid eq TBOPSPID())) {
	Node::MarkAsIll($node);
	TBSetNodeLogEntry($node, $user_uid, TB_DEFAULT_NODELOGTYPE(),
			  "'Moved to hwcheckup by os_setup; ".
			  "failed to boot image for osid " . $osmap{$node} .
			  " in $pid/$eid'");
	push(@informtbopsfatal, $node);
    } else {
	push(@informtbopswarn, $node);
    }
    TBSetNodeAllocState( $node, TBDB_ALLOCSTATE_DOWN() );
    $nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();

    $failed++;
    add_failed_node_fatal($node);
}

#
# Spam time!  Send mail to the user and testbed-ops about failures.
#
my $count = scalar(@informuser);
if ($count > 0) {
    SENDMAIL($user_email_to, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informuser) . "\n".
	     "in pid/eid $pid/$eid appear to be dead.\n\n".
	     "Your experiment will continue to run since these failures\n".
	     "are nonfatal, although you might encounter other problems\n".
	     "if your experiment depends explicitly on these nodes.\n".
	     "You should terminate this experiment if it cannot ".
	     "tolerate these failures.\n\n".
	     "Testbed Operations has also been notified.\n\n".
	     "Thanks\n".
	     "Testbed Operations\n",
	     0,
	     "Cc: $TBOPS");
}
$count = scalar(@informtbopsfatal);
if ($count > 0) {
    SENDMAIL($TBOPS, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informtbopsfatal) . "\n".
	     "in pid/eid $pid/$eid appear to be dead.\n\n".
	     "The nodes have been moved into hardware checkup.\n",
	     $user_email_to);
}
$count = scalar(@informtbopswarn);
if ($count > 0) {
    SENDMAIL($TBOPS, "$count nodes are down",
	     "Nodes:\n".
	     "  " . join(" ", @informtbopswarn) . "\n".
	     "in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
	     "The nodes have been freed.\n",
	     $user_email_to);
}

TBDebugTimeStamp("Local node waiting finished");

#
# Now deal with virtual nodes.
#
# We do this in a sub script since the nodes are not owned by the user,
# and so it must be setuid root so that ssh will work.
#
my @vnodelist = keys(%vnodes);

#
# Set the allocstate for the local vnodes that were successfully rebooted
# and came to ISUP above. These do not need to be setup again! We move
# them to RES_READY, so vnode_setup will ignore them. If they fail to
# hit ISUP, we will move them to DOWN so that vnode_setup will ignore
# them again, in the teardown phase.
#
# Note, we do this even if there were failures above, since the teardown
# phase is going to happen, and we want vnode_setup to know which nodes
# came up with phynodes okay (need to be torndown) and which ones never
# had the chance (no need to teardown). Think swapmod, which does teardown
# in the ACTIVATING state.
#
foreach my $vnode (@vnodelist) {
    my $pnode  = $vnode2pnode{$vnode};

    # Default retry count.
    $retries{$vnode} = 0;

    # Remote or shared node, always does setup.
    next
	if (!exists($nodes{$pnode}));
    
    # Pnode was neither rebooted nor reconfigured, so leave allocstate alone
    # for vnode_setup (has to be done).
    next
	if (!exists($rebooted{$pnode}) && !exists($reconfigs{$pnode}));

    if ($nodeAllocStates{$pnode} eq TBDB_ALLOCSTATE_RES_READY()) {
	TBSetNodeAllocState($vnode, TBDB_ALLOCSTATE_RES_READY());
	$nodeAllocStates{$vnode} = TBDB_ALLOCSTATE_RES_READY();
    }
}

#
# Reset the failure lists. See above.
#
@informuser = ();
@informtbopswarn = ();
@informtbopsfatal = ();

#
# XXX - Don't bother if something above failed. A waste of time and
# usually leads to cascading errors.
#
if ($canceled && @vnodelist) {
    tbnotice "Skipping virtual node setup since swapin was canceled!";
}
elsif ($failed && @vnodelist) {
    tbnotice "Skipping virtual node setup since there were previous ".
	"failures!";
}
elsif (@vnodelist) {
    my @retry_list = ();

    TBDebugTimeStamp("Setting up virtual nodes");
    print "Setting up virtual testbed nodes ...\n";

    # Wait for plab vnode setup to finish if it's running.
    if (defined($plab_setup_pid) && $plab_setup_pid > 0) {
        my $kid = waitpid($plab_setup_pid,0);
        if ($kid == $plab_setup_pid) {
	    $plab_setup_pid = undef;
            if ($?) {
                die_noretry("Failed to setup plab vnodes.");
            }
        }
	else {
            die_noretry("Error waiting for plab vnode to finish.");
        }
    }

  retry:
    TBDebugTimeStamp("Setting up virtual nodes");
    # Only fire off local (jailed) nodes here. Plab/Geni vnode setup has
    # already been started at this point.
    system("$vnode_setup -j $pid $eid");
    if ($?) {
	die_noretry("Vnode setup failed!");
    }
    print "Waiting for virtual testbed nodes to finish setting up ...\n";
    TBDebugTimeStamp("Virtual node waiting started");

    foreach my $node (@vnodelist) {
	$waitstart{$node} = time;
    }
    @vnodelist = sort(@vnodelist);

    while ( @vnodelist ) {
	my $node   = shift(@vnodelist);
	my $pnode  = $vnode2pnode{$node};
	my $islocal= exists($nodes{$pnode});
	my $wstart = $waitstart{$node};
	my $curallocstate;
	my $actual_state;
	my $maxwait;

        #
        # Base the maxwait for vnodes on the reboot_waittime field for
        # their respective OSIDs, with some slop time that scales up
        # as a function of the number of vnodes on the parent pnode.
        #
	my $reboot_time = 0;
	my $osinfo = $osmap{$node};
	if (defined($osinfo)) {
	    my $osid = $osinfo->osid();
	    if (defined($reboot_waittime{$osid})) {
		$reboot_time = $reboot_waittime{$osid};
	    }
	}
	if ($islocal) {
	    $maxwait = $reboot_time + (40 * $pnodevcount{$pnode});
	}
	else {
	    #
	    # A remote node is supposed to be up and running, but no idea
	    # how long is reasonable.
	    #
	    $maxwait = $reboot_time + 60 * $pnodevcount{$pnode};
	}
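	# E.g. (illustrative numbers): a local vnode whose pnode hosts 10 vnodes
	# and whose OS has a 120 second reboot_waittime gets
	# 120 + 40 * 10 = 520 seconds.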

	TBGetNodeAllocState($node, \$curallocstate);

	#
	# See if vnode_setup already determined the node was dead.
	#
	if ($curallocstate ne TBDB_ALLOCSTATE_DOWN() &&
	    $curallocstate ne TBDB_ALLOCSTATE_DEAD()) {

	    if (!TBNodeStateWait($node, $wstart, $maxwait, \$actual_state,
				 (TBDB_NODESTATE_TBFAILED,
				  TBDB_NODESTATE_ISUP))) {

		if ($actual_state eq TBDB_NODESTATE_TBFAILED) {
		    tbwarn "$node reported a TBFAILED event.";
		    goto vtbfailed;
		}
		print "$node is alive and well\n";
		TBDebugTimeStamp("Virtual node $node setup ISUP");
		
		# Might have already been set above.
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_READY());
		Node::SetBootStatus($node, NODEBOOTSTATUS_OKAY);
		next;
	    }

	  vtbfailed:
	    TBDebugTimeStamp("Virtual node $node setup FAILED");
	    Node::SetBootStatus($node, NODEBOOTSTATUS_FAILED);
	    TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());

	    #
	    # If a local node, let's retry since jail setup appears to be
	    # rather flaky.
	    # 
	    if ($islocal && $retries{$node}) {
		$retries{$node} -= 1;

		tbwarn "$node did not boot; will retry setup ...";
		push(@retry_list, $node);
		next;
	    }

	    # Otherwise, fall through ...
	}

	tbwarn "$node did not boot!";

	if ($plabvnodes{$node}) {
	    #
	    # We move the pnode into hwdown so that it will not be considered
	    # again, until the plab monitor daemon determines that it is
	    # really working again.
	    #