libosload.pm.in 31.4 KB
Newer Older
1 2 3
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2008 University of Utah and the Flux Group.
5 6 7 8 9 10 11 12 13 14 15 16
# All rights reserved.
#
# Osload library. Basically the backend to the osload script, but also used
# where we need finer control of loading of nodes.
#
package libosload;

use strict;
use Exporter;
use vars qw(@ISA @EXPORT);

@ISA    = "Exporter";
17
@EXPORT = qw ( osload osload_wait osload_setupswapinfo );
18 19 20

# Must come after package declaration!
use lib '@prefix@/lib';
21
use libtestbed; # for TBGenSecretKey();
22 23
use libdb;
use libreboot;
24
use libtblog;
25 26
use Node;
use NodeType;
27
use Image;
28
use User;
29
use OSinfo;
30 31 32 33 34 35 36 37
use English;
use File::stat;
use IO::Handle;

# Configure variables
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
38
my $ELABINELAB  = @ELABINELAB@;
39
my $PROJROOT    = "@PROJROOT_DIR@";
40 41 42 43 44 45 46

# Max number of retries (per node) before its deemed fatal. This allows
# for the occasional pxeboot failure.
my $MAXRETRIES  = 1;

my $FRISBEELAUNCHER = "$TB/sbin/frisbeelauncher";
my $osselect	    = "$TB/bin/os_select";
47
my $TBUISP	    = "$TB/bin/tbuisp";
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62

# Locals
my %imageinfo       = ();	# Per imageid DB info.
my $debug           = 0;
my %children        = ();	# Child pids in when asyncmode=1

sub osload ($$) {
    my ($args, $result) = @_;

    # These come in from the caller.
    my $imageid;
    my $waitmode    = 1;
    my @nodes       = ();
    my $noreboot    = 0;
    my $asyncmode   = 0;
63
    my $zerofree    = 0;
64
    my $swapinfo    = 0;
65 66 67 68 69 70 71

    # Locals
    my %retries	    = ();
    my $failures    = 0;
    my $usedefault  = 1;
    my $mereuser    = 0;
    my $rowref;
72
    my $image;
73
    my $this_user;
74 75

    if (!defined($args->{'nodelist'})) {
76
	tberror "Must supply a node list!"; # INTERNAL
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
	return -1;
    }
    @nodes = sort(@{ $args->{'nodelist'} });
    
    if (defined($args->{'waitmode'})) {
	$waitmode = $args->{'waitmode'};
    }
    if (defined($args->{'noreboot'})) {
	$noreboot = $args->{'noreboot'};
    }
    if (defined($args->{'debug'})) {
	$debug = $args->{'debug'};
    }
    if (defined($args->{'imageid'})) {
	$imageid    = $args->{'imageid'};
	$usedefault = 0;
    }
    if (defined($args->{'asyncmode'})) {
	$asyncmode = $args->{'asyncmode'};
    }
97 98 99
    if (defined($args->{'zerofree'})) {
	$zerofree = $args->{'zerofree'};
    }
100

101 102 103
    if (defined($args->{'swapinfo'})) {
	$swapinfo = $args->{'swapinfo'};
    }
104 105 106 107 108 109 110 111 112

    #
    # Start a new logging sub-process
    #
    my $old_env = \%ENV;
    local %ENV;
    copy_hash %ENV, $old_env;
    tblog_sub_process("osload", @nodes);

113 114 115 116 117
    #
    # Figure out who called us. Root and admin types can do whatever they
    # want. Normal users can only change nodes in experiments in their
    # own projects.
    #
118 119 120 121 122 123 124 125 126 127 128 129 130
    if ($UID) {
	$this_user = User->ThisUser();
	return -1
	    if (!defined($this_user));

	if (!$this_user->IsAdmin()) {
	    $mereuser = 1;

	    if (! TBNodeAccessCheck($UID, TB_NODEACCESS_LOADIMAGE, @nodes)) {
		tberror("Not enough permission to load images on one or ".
			"more nodes!");
		return -1;
	    }
131 132 133
	}
    }

134 135 136 137 138 139 140 141
    if (defined($imageid)) {
	$image = Image->Lookup($imageid);
	if (!defined($image)) {
	    tberror("Could not map $imageid to its object!");
	    return -1;
	}
    }

142 143 144
    #
    # Check permission to use the imageid.
    # 
145
    if (defined($image) && $mereuser &&
146
	! $image->AccessCheck($this_user, TB_IMAGEID_READINFO)) {
147
	tberror "You do not have permission to load $image";
148 149 150
	return -1;
    }

151 152 153 154 155 156 157
    #
    # If there's a maxiumum number of concurrent loads listed for the image,
    # check to see if we'll go over the limit, by checking to see how many
    # other nodes are currently booting thie image's default_osid. This is
    # NOT intended to be strong enforcement of license restrictions, just a
    # way to catch mistakes.
    #
158 159
    if (defined($image) &&
	!TBImageLoadMaxOkay($image->imageid(), scalar(@nodes), @nodes)) {
160 161
	tberror 
	    "Would exceed maxiumum concurrent instances ".
162
		"limitation for $image";
163 164 165
	return -1;
    }

166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
    #
    # This is somewhat hackish. To promote parallelism during os_setup, we
    # want to fork off the osload from the parent so it can do other things.
    # The problem is how to return status via the results vector. Well,
    # lets do it with some simple IPC. Since the results vector is simply
    # a hash of node name to an integer value, its easy to pass that back.
    #
    # We return the pid to the caller, which it can wait on either directly
    # or by calling back into this library if it wants to actually get the
    # results from the child!
    # 
    if ($asyncmode) {
	#
	# Create a pipe to read back results from the child we will create.
	#
181 182
	my $PARENT_READER = new IO::Handle; # Need a new handle for each child
	if (! pipe($PARENT_READER, CHILD_WRITER)) {
183
	    tberror "creating pipe: $!";
184 185 186 187 188 189
	    return -1;
	}
	CHILD_WRITER->autoflush(1);

	if (my $childpid = fork()) {
	    close(CHILD_WRITER);
190
	    $children{$childpid} = [ $PARENT_READER, $result ];
191 192 193 194 195
	    return $childpid;
	}
	#
	# Child keeps going. 
	#
196
	close($PARENT_READER);
197 198 199
	TBdbfork();
    }

200 201 202 203
    # This will store information about each node, so that if we have to try
    # again later, we'll have it all.
    my %reload_info;

204 205 206 207
    # Loop for each node.
    foreach my $node (@nodes) {
	# All nodes start out as being successful; altered later as needed.
	$result->{$node} = 0;
208 209 210 211 212 213

	my $nodeobject = Node->Lookup($node);
	if (!defined($nodeobject)) {
	    tberror "$node: Could not map to object!";
	    goto failednode;
	}
214 215
	
	# Get default imageid for this node.
216 217
	my $default_imageid = $nodeobject->default_imageid();
	if (! defined($default_imageid)) {
218
	    tberror "$node: No default imageid defined!";
219 220
	    goto failednode;
	}
Leigh Stoller's avatar
Leigh Stoller committed
221 222 223 224 225 226
	my $default_image = Image->Lookup($default_imageid);
	if (!defined($default_image)) {
	    tberror("Could not map $default_imageid to its object!");
	    return -1;
	}
	
227 228
	if ($usedefault) {
	    $imageid = $default_imageid;
Leigh Stoller's avatar
Leigh Stoller committed
229
	    $image   = $default_image;
230 231
	}

232
	print STDERR "osload: Using $image for $node\n"
233 234 235
	    if $debug;

	#
236 237 238
	# We try to avoid repeated queries to DB for info that does not change
	# by caching the image info on the first use.  GetImageInfo() will
	# perform various one-time checks as well.
239
	# 
240
	if (!exists($imageinfo{$imageid}) && !GetImageInfo($image, $node)) {
241
	    goto failednode;
242
	}
243
	$rowref = $imageinfo{$imageid};
244 245 246
	if ($rowref eq 'BADIMAGE') {
	    goto failednode;
	}
247 248 249 250 251

	my $loadpart       = $rowref->{'loadpart'};
	my $loadlen        = $rowref->{'loadlength'};
	my $imagepath      = $rowref->{'path'};
	my $defosid        = $rowref->{'default_osid'};
252
	my $maxwait	   = $rowref->{'maxloadwait'};
253
	my $imagepid	   = $rowref->{'pid'};
254
	my $access_key	   = $rowref->{'access_key'};
255

256 257 258 259 260 261 262
	my $osinfo = OSinfo->Lookup($defosid);
	if (!defined($osinfo)) {
	    tberror("Could map map OSID $defosid to its object!");
	    goto failednode;
	}

	print "osload ($node): Changing default OS to $osinfo\n";
263 264 265
	if (!$TESTMODE) {
	    system("$osselect $defosid $node");
	    if ($?) {
266
		tberror "$node: os_select $defosid failed!";
267 268 269 270 271 272 273 274 275 276 277
		goto failednode;
	    }
	}

	#
	# If loading an image (which is not the default) then schedule
	# a reload for it so that when the experiment is terminated it
	# will get a fresh default image before getting reallocated to
	# another experiment.
	#
	if ($imageid ne $default_imageid &&
Leigh Stoller's avatar
Leigh Stoller committed
278
	    $nodeobject->SetSchedReload($default_image) != 0) {
279 280
	    tberror
		"$node: Could not schedule default reload";
281 282 283 284
	    goto failednode;
	}

	#
285 286 287 288 289 290 291 292 293 294
	# XXX assumes a DOS MBR, but this is ingrained in the DB schema
	# as well (i.e., the images.part[1234]_osid fields).
	#
	my $MINPART = 1;
	my $MAXPART = 4;

	#
	# Assign partition table entries for each partition in the image
	# that has an OSID associated with it.
	#
295
	# This is complicated by the fact that an image that covers only
296 297
	# part of the slices should only change the partition table entries
	# for the subset of slices that are written to disk...
298
	#
299 300 301 302 303
	# ...UNLESS, the new image requires a different version of the MBR
	# in which case we must invalidate all partitions except the ones we
	# are loading since the partition boundaries may have changed.
	#
	my $startpart = $loadpart == 0 ? $MINPART : $loadpart;
304 305
	my $endpart   = $startpart + $loadlen;
	
306
	for (my $i = $MINPART; $i <= $MAXPART; $i++) {
307 308 309
	    my $partname = "part${i}_osid";
	    my $dbresult;
	    
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
	    #
	    # Partition is outside the range affected by this image.
	    # Normally, we just leave it alone, unless a change of MBR
	    # is implied by the current image.
	    #
	    if ($i < $startpart || $i >= $endpart) {
		if (defined($rowref->{'mbr_version'})) {
		    $dbresult =
			DBQueryWarn("select mbr_version ".
				    " from partitions as p,images as i ".
				    " where p.imageid=i.imageid ".
				    " and node_id='$node' and partition='$i'");
		    if ($dbresult && $dbresult->numrows) {
			my ($pmbr) = $dbresult->fetchrow_array();
			if ($pmbr != $rowref->{'mbr_version'}) {
			    tbwarn("$node: Existing partition $i inconsistent".
				   " with new image partitioning,".
				   " invalidating existing partition");
			    $dbresult =
				DBQueryWarn("delete from partitions ".
					    "where node_id='$node' and ".
					    "partition='$i'");
			    if (!$dbresult) {
				tberror("$node: Could not update ".
					"partition table");
				goto failednode;
			    }
			}
		    }
		}
		next;
	    }

	    #
	    # This image has an OSID in the current partition,
	    # replace the partition table info.
	    #
347 348 349 350 351
	    if (defined($rowref->{$partname})) {
		my $osid = $rowref->{$partname};
		
		$dbresult =
		    DBQueryWarn("replace into partitions ".
352 353 354
				"(node_id,partition,osid,imageid,imagepid) ".
				"values ".
				"('$node','$i','$osid','$imageid','$imagepid')");
355
	    }
356 357 358 359
	    #
	    # Otherwise, if there is no OS for a particular image partition,
	    # clear any current partitions table info.
	    #
360 361 362 363 364 365
	    else {
		$dbresult =
		    DBQueryWarn("delete from partitions ".
				"where node_id='$node' and partition='$i'");
	    }
	    if (!$dbresult) {
366
		tberror "$node: Could not update partition table";
367 368 369 370
		goto failednode;
	    }
	}

371 372 373 374 375 376 377 378 379
	#
	# Setup swapinfo now after partitions have initialized but before
	# we setup the one-shot frisbee load.
	#
	if ($swapinfo) {
	    print "osload: Updating image signature.\n";
	    osload_setupswapinfo(undef, undef, $node);
	}

380 381 382 383 384
	#
	# Determine which mode to use for reloading this node (note: this may
	# become an entry in node_capabilities or something like that in the
	# future - that would be cleaner)
	#
385 386
	my $type  = $nodeobject->type();
	my $class = $nodeobject->class();
387 388 389 390 391 392 393
	my $reload_mode;
	my $reload_func;
	my $reboot_required;
	if ($class eq "mote") {
	    $reload_mode = "UISP";
	    $reload_func = \&SetupReloadUISP;
	    $reboot_required = 0; # We don't reboot motes to reload them
394
	    $zerofree = 0; # and we don't zero "the disk"
395 396 397 398
	} else {
	    $reload_mode = "Frisbee";
	    $reload_func = \&SetupReloadFrisbee;
	    $reboot_required = !$noreboot; # Reboot unless $noreboot flag set
399 400 401

	    # This is passed along so that remote node can request the file.
	    # Make sure the image object has an access key defined.
402
	    if ($nodeobject->isremotenode() && !defined($access_key)) {
403 404 405 406 407 408 409 410
		$access_key = TBGenSecretKey();

		$rowref->{'access_key'} = $access_key;
		if ($image->Update({'access_key' => $access_key}) != 0) {
		    tberror "$node: Could not initialize image access key";
		    goto failednode;
		}
	    }
411 412 413 414 415 416 417
	}

	#
	# Remember this stuff so that if we have to retry this node again
	# later, we'll know how to handle it
	#
	$reload_info{$node} = {
418
	    'node'    => $node,
419 420 421 422
	    'mode'    => $reload_mode,
	    'func'    => $reload_func,
	    'imageid' => $imageid,
	    'osid'    => $defosid,
423
	    'reboot'  => $reboot_required,
424
	    'zerofree'=> $zerofree,
425
	    'maxwait' => $maxwait
426 427 428 429
	};

	print "Setting up reload for $node (mode: $reload_mode)\n";

430
	if (!$TESTMODE) {
431
	    if (&$reload_func($reload_info{$node}) < 0) {
432
		tberror("$node: Could not set up reload. Skipping.");
433 434 435 436 437 438 439 440 441
		goto failednode;
	    }
	}
	next;
	
      failednode:
	$result->{$node} = -1;
	$failures++;
    }
442
    
443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
    #
    # Remove any failed nodes from the list we are going to operate on.
    #
    my @temp = ();
    foreach my $node (@nodes) {
	push(@temp, $node)
	    if (! $result->{$node});
    }
    @nodes = @temp;

    # Exit if not doing an actual reload.
    if ($TESTMODE) {
	print "osload: Stopping in Testmode!\n";
	goto done;
    }

    if (! @nodes) {
460
	tbnotice "Stopping because of previous failures!";
461 462 463 464 465
	goto done;
    }

    # Fire off a mass reboot and quit if not in waitmode.
    if (! $waitmode) {
466
	my ($reboot_nodes, $noreboot_nodes)
467
	    = GetNodesRequiringReboot(\%reload_info, keys(%reload_info));
468
	if (@$reboot_nodes) {
469 470 471 472 473 474 475
	    print "osload: Rebooting nodes.\n";

	    my %reboot_args     = ();
	    my %reboot_failures = ();

	    $reboot_args{'debug'}    = $debug;
	    $reboot_args{'waitmode'} = 0;
476
	    $reboot_args{'nodelist'} = [ @$reboot_nodes ];
477 478

	    if (nodereboot(\%reboot_args, \%reboot_failures)) {
479
		foreach my $node (@$reboot_nodes) {
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
		    if ($reboot_failures{$node}) {
			$result->{$node} = $reboot_failures{$node};
			$failures++;
		    }
		}
	    }
	}
	goto done;
    }

    #
    # The retry vector is initialized to the number of retries we allow per
    # node, afterwhich its a fatal error.
    #
    foreach my $node (@nodes) {
	$retries{$node} = $MAXRETRIES;
    }
    
    while (@nodes) {
499 500 501
	my ($reboot_nodes, $noreboot_nodes)
	    = GetNodesRequiringReboot(\%reload_info, @nodes);
	if (@$reboot_nodes) {
502
	    # Reboot them all.
503
	    print "osload: Issuing reboot for @$reboot_nodes and then waiting ...\n";
504 505 506 507 508 509
	    
	    my %reboot_args     = ();
	    my %reboot_failures = ();

	    $reboot_args{'debug'}    = $debug;
	    $reboot_args{'waitmode'} = 0;
510
	    $reboot_args{'nodelist'} = [ @$reboot_nodes ];
511 512 513 514 515 516 517 518

	    if (nodereboot(\%reboot_args, \%reboot_failures)) {
		#
		# If we get any failures in the reboot, we want to
		# alter the list of nodes accordingly for the next phase.
		# 
		my @temp = ();
		
519
		foreach my $node (@$reboot_nodes) {
520 521 522 523 524 525 526 527
		    if ($reboot_failures{$node}) {
			$result->{$node} = $reboot_failures{$node};
			$failures++;
		    }
		    else {
			push(@temp, $node);
		    }
		}
528
		@nodes = (@temp,@$noreboot_nodes);
529 530 531 532 533
	    }
	}

	# Now wait for them.
	my $startwait   = time;
534 535 536 537
	my @failednodes = WaitTillReloadDone($startwait,
					     $waitmode,
					     \%reload_info, 
					     @nodes);
538 539 540 541 542 543
	@nodes=();
    
	while (@failednodes) {
	    my $node = shift(@failednodes);

	    if ($retries{$node}) {
544
		tbnotice "$node: Trying again ...";
545

546 547
		my $reload_info = $reload_info{$node};

548
		# Possible race with reboot?
549
		if (&{$reload_info->{'func'}}($reload_info) < 0) {
550
		    tberror("$node: Could not set up reload. Skipping.");
551 552 553 554 555 556 557 558 559 560
		    $result->{$node} = -1;
		    $failures++;
		    next;
		}
		push(@nodes, $node);

		# Retry until count hits zero.
		$retries{$node} -= 1;
	    }
	    else {
561 562
		tberror ({sublevel => -1}, 
			 "$node failed to boot too many times. Skipping!");
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
		$result->{$node} = -1;
		$failures++;
	    }
	}
    }
  done:
    print "osload: Done! There were $failures failures.\n";

    if ($asyncmode) {
	#
	# We are a child. Send back the results to the parent side
	# and *exit* with status instead of returning it.
	# 
	foreach my $node (keys(%{ $result })) {
	    my $status = $result->{$node};
	    
	    print CHILD_WRITER "$node,$status\n";
	}
	close(CHILD_WRITER);
	exit($failures);
    }
    return $failures;
}

587 588 589 590 591
#
# Fetch information for a specified image the first time it is used
# (for the indicated node).  This info is cached for use by all other
# nodes that require the image.  Returns 1 on success, 0 on failure.
#
592
sub GetImageInfo($;$)
593
{
594
    my ($image, $node) = @_;
595

596 597 598 599
    my $imageid = $image->imageid();
    my $rowref  = $image->DBData();
    if (!defined($rowref)) {
	tberror("No DBData for $image!");
600
	$imageinfo{$imageid} = 'BADIMAGE';
601 602
	return 0;
    }
603
    $imageinfo{$imageid} = $rowref;
604 605 606 607 608 609 610 611

    my $imagepath = $rowref->{'path'};

    #
    # Perform a few validity checks: imageid should have a file name
    # and that file should exist.
    #
    if (!defined($imagepath)) {
612
	tberror "No filename associated with $image!";
613
	$imageinfo{$imageid} = 'BADIMAGE';
614 615 616 617 618 619 620 621 622 623
	return 0;
    }

    if (! -R $imagepath) {
	if ($ELABINELAB) {
	    #
	    # Yuck. See if we can get it via frisbeelauncher before giving up.
	    #
	    system("$FRISBEELAUNCHER " . ($debug ? "-d ": "") . "$imageid");
	    if ($?) {
624
		tberror "Frisbeelauncher ($image) failed!";
625
		$imageinfo{$imageid} = 'BADIMAGE';
626 627 628
		return 0;
	    }
	    if (! -R $imagepath) {
629
		tberror	"Frisbeelauncher could not fetch $imagepath ($image)!";
630
		$imageinfo{$imageid} = 'BADIMAGE';
631 632 633 634
		return 0;
	    }
	}
	else {
635
	    tberror "$imagepath does not exist or cannot be read!";
636
	    $imageinfo{$imageid} = 'BADIMAGE';
637 638 639 640
	    return 0;
	}
    }

641 642 643 644 645 646 647 648 649 650 651 652 653
    my $sb = stat($imagepath);

    #
    # A zero-length image cannot be right and will result in much confusion
    # if allowed to pass: the image load will succeed, but the disk will be
    # unchanged, making it appear that os_load loaded the default image.
    #
    if ($sb->size == 0) {
	tberror "$imagepath is empty!";
	$imageinfo{$imageid} = 'BADIMAGE';
	return 0;
    }

654 655 656 657 658 659 660 661 662
    #
    # Compute a maxwait time based on the image size plus a constant
    # factor for the reboot cycle.  This is used later in
    # WaitTillReloadDone().  Arguably, this should be part of the
    # image DB state, so we store it in the imageinfo array too.
    #
    if (!defined($rowref->{'maxloadwait'})) {
	my $chunks = $sb->size >> 20; # size may be > 2^31, shift is unsigned

663
	$rowref->{'maxloadwait'} = int((($chunks / 100.0) * 65) + (5 * 60));
664 665 666
    }

    print STDERR
667
	"$image: loadpart=", $rowref->{'loadpart'},
668 669 670 671 672 673 674 675 676
	", loadlen=", $rowref->{'loadlength'},
	", imagepath=", $rowref->{'path'},
	", defosid=", $rowref->{'default_osid'},
	", maxloadwait=", $rowref->{'maxloadwait'}, "\n"
	    if ($debug);

    return 1;
}

677
# Wait for a reload to finish by watching its state
678
sub WaitTillReloadDone($$$@)
679
{
680
    my ($startwait, $waitmode, $reload_info, @nodes) = @_;
681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
    my %done	= ();
    my $count   = @nodes;
    my @failed  = ();

    foreach my $node ( @nodes ) { $done{$node}  = 0; }

    print STDERR "Waiting for @nodes to finish reloading\n".`date` if $debug;

    # Start a counter going, relative to the time we rebooted the first
    # node.
    my $waittime  = 0;
    my $minutes   = 0;

    while ($count) {
	# Wait first to make sure reboot is done, and so that we don't
	# wait one more time after everyone is up.
	sleep(5);
	foreach my $node (@nodes) {
	    if (! $done{$node}) {
700 701 702 703 704 705 706 707 708 709 710 711
		my $maxwait;

		#
		# If we have to zero fill free space, then the
		# wait time has to be proportional to the disk
		# size.  In other words, a really, really, really
		# long time.  Lets assume 20MB/sec to blast zeros,
		# so 50 seconds/GB.  What the heck, lets call it
		# 1GB/minute.  Did I mention how this would take
		# a really long time?
		#
		if ($reload_info->{$node}{'zerofree'}) {
712 713 714
		    my $nodeobject = Node->Lookup($node);
		    my $disksize   = $nodeobject->disksize();

715 716 717 718
		    $disksize = 20
			if (!$disksize);
		    $maxwait = ($disksize * 60);
		} else {
719
		    $maxwait = $reload_info->{$node}{'maxwait'};
720
		}
721 722 723 724 725 726 727 728 729 730
		
		my $query_result =
		    DBQueryWarn("select * from current_reloads ".
				"where node_id='$node'");

		#
		# There is no point in quitting if this query fails. Just
		# try again in a little bit.
		# 
		if (!$query_result) {
731
		    tbwarn "$node: Query failed; waiting a bit.";
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
		    next;
		}

		#
		# We simply wait for stated to clear the current_reloads entry.
		#
		if (!$query_result->numrows) {
		    print STDERR "osload ($node): left reloading mode at ".`date`
			if ($debug);
		    
		    $count--;
		    $done{$node} = 1;
		    next;
		}
	
		# Soon we will have stated's timeouts take care of
		# rebooting once or twice if we get stuck during
		# reloading.
		$waittime = time - $startwait;
		if ($waittime > $maxwait) {
		    my $t = (int ($waittime / 60));
753 754
		    tbnotice "$node appears wedged; ".
			"it has been $t minutes since it was rebooted.";
755
		    TBNodeConsoleTail($node, *STDERR);
756 757

		    $count--;
758
		    $done{$node} = $waitmode;
759 760 761 762 763 764 765 766 767 768 769
		    push(@failed, $node);
		    next;
		}
		if (int($waittime / 60) > $minutes) {
		    $minutes = int($waittime / 60);
		    print STDERR "osload ($node): still waiting; ".
			"it has been $minutes minute(s)\n";
		}
	    }
	}
    }
770 771 772 773 774 775 776

    if ($waitmode > 1) {
	$count = @nodes - @failed;
	$startwait = time;
	while ($count) {
	    foreach my $node (@nodes) {
		print STDERR "osload ($node): waiting for node to finish ".
777
		    "booting\n";
778
		if ($done{$node} < $waitmode) {
779 780
		    my $actual_state;

781 782
		    if (!TBNodeStateWait($node,
					 $startwait,
783 784 785 786
					 (60*6),
					 \$actual_state,
					 (TBDB_NODESTATE_TBFAILED,
					  TBDB_NODESTATE_ISUP))) {
787 788 789 790 791 792 793 794
			$count--;
			$done{$node} = $waitmode;
		    }
		}
	    }
	}
    }

795 796 797 798
    return @failed;
}

# Setup a reload. 
799
sub SetupReloadFrisbee($)
800
{
801 802 803 804 805
    my $reload_info   = $_[0];
    my $node          = $reload_info->{'node'};
    my $imageid       = $reload_info->{'imageid'};
    my $zerofree      = $reload_info->{'zerofree'};
    my $osid          = TBNodeDiskloadOSID($node);
806

807 808 809 810
    #
    # Put it in the current_reloads table so that nodes can find out which
    # OS to load. See tmcd. 
    #
811
    my $query_result = 
812
	DBQueryWarn("replace into current_reloads ".
813 814
		    "(node_id, image_id, mustwipe) values ".
		    "('$node', '$imageid', $zerofree)");
815 816 817
    return -1
	if (!$query_result);

818
    system("$osselect -1 $osid $node");
819
    if ($?) {
820
	tberror "os_select $osid failed!";
821 822 823 824
	return -1;
    }
    system("$FRISBEELAUNCHER " . ($debug ? "-d ": "") . "$imageid");
    if ($?) {
825
	tberror "Frisbee Launcher ($imageid) failed!";
826 827 828 829 830
	return -1;
    }
    return 0;
}

831 832 833 834 835
#
# Setup a reload, using USIP (for motes), rather than Frisbee. Note that
# this differs from a Frisbee reload in one key way - it does the reload
# right here in this code, rather than setting up a reload for later.
#
836
sub SetupReloadUISP($)
837
{
838 839 840 841
    my $reload_info   = $_[0];
    my $node          = $reload_info->{'node'};
    my $imageid       = $reload_info->{'imageid'};
    my $osid          = $reload_info->{'osid'};
842 843 844 845 846 847 848

    #
    # Get the path to the image
    #
    my $query_result = DBQueryFatal("select path from images " .
	"where imageid='$imageid'");
    if ($query_result->num_rows() != 1) {
849
	tberror "Failed to get path for $imageid!";
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869
	return -1;
    }
    my ($path) = $query_result->fetchrow();

    #
    # Tell stated that we're about to start reloading
    #
    TBSetNodeNextOpMode($node,TBDB_NODEOPMODE_RELOADMOTE);

    #
    # The mote goes 'down', then starts to reload
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_SHUTDOWN);
    TBSetNodeEventState($node,TBDB_NODESTATE_RELOADING);

    #
    # Okay, just run tbuisp with that path
    #
    my $rv = system("$TBUISP upload $path $node");
    if ($rv) {
870
	tberror "$node: tbuisp failed";
871 872 873 874 875 876 877 878 879 880
	return -1;
    }

    #
    # Tell stated that we've finished reloading the node
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_RELOADDONE);

    system("$osselect $osid $node");
    if ($?) {
881
	tberror "os_select $osid failed!";
882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897
	goto failednode;
    }

    #
    # 'Reboot' the node (from stated's perspective, anyway)
    # has been shutdown, so that the os_select will take effect
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_SHUTDOWN);

    return 0;
}

#
# Return two array references (possbily empty) of:
# [all nodes requiring reboot, all nodes not requiring reboot]
#
898 899
sub GetNodesRequiringReboot($@) {
    my ($reload_info, @nodes) = @_;
900
    my (@reboot, @noreboot);
901
    foreach my $node (@nodes) {
902 903 904 905 906 907 908 909 910
	if ($reload_info->{$node}{'reboot'}) {
	    push @reboot, $node;
	} else {
	    push @noreboot, $node;
	}
    }
    return (\@reboot, \@noreboot);
}

911 912 913 914 915 916 917 918 919 920 921
#
# This gets called in the parent, to wait for an async osload that was
# launched earlier (asyncmode). The child will print the results back
# on the the pipe that was opened between the parent and child. They
# are stuffed into the original results array.
# 
sub osload_wait($)
{
    my ($childpid) = @_;

    if (!exists($children{$childpid})) {
922
	tberror "No such child pid $childpid!"; # INTERNAL
923 924 925 926 927 928 929 930 931 932 933 934 935 936 937
	return -1;
    }
    my ($PARENT_READER, $result) = @{ $children{$childpid}};

    #
    # Read back the results.
    # 
    while (<$PARENT_READER>) {
	chomp($_);

	if ($_ =~ /^([-\w]+),([-\d])+$/) {
	    $result->{$1} = $2;
	    print STDERR "reload ($1): child returned $2 status.\n";
	}
	else {
938
	    tberror "Improper response from child: $_"; # INTERNAL
939 940 941 942 943 944 945 946 947 948
	}
    }
    
    #
    # And get the actual exit status.
    # 
    waitpid($childpid, 0);
    return $? >> 8;
}

949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964
#
# Save signature files and boot partition info for all nodes in an experiment
# (or just the listed nodes).  We call this when swapping in an experiment or
# when reloading nodes in an experiment.
#
# Note that this is not strictly an os loading function, we do it on swapins
# of nodes which already have the correct OS as well.  But we stick it here
# because it is about os loading in principle.
#
sub osload_setupswapinfo($$;@)
{
    my ($pid, $eid, @nodelist) = @_;
    my %nodeinfo = ();
    my $allnodes;
    my $clause = "";

965
    if (!@nodelist) {
966
	@nodelist = ExpNodes($pid, $eid, 1, 0);
967 968 969 970 971 972 973 974 975
	$clause .= "r.pid='$pid' and r.eid='$eid'";
	$allnodes = 1;
    } else {
	$clause .= "r.node_id in (" . join(",", map("'$_'", @nodelist)) . ")";
	$allnodes = 0;
    }
    map { $nodeinfo{$_} = 0 } @nodelist;

    # XXX only know how to do this for local PCs right now
976
    $clause .= " and nt.class='pc' and nt.isremotenode=0";
977 978 979 980 981 982 983 984

    #
    # Note that we are using the def_boot_osid from the nodes table to identify
    # the image of interest.  This is because the osid field is set by stated
    # after a node has reached the BOOTING state the first time, and may be
    # set to an MFS at other times.
    #
    my $query_result = DBQueryWarn(
985
	"select r.node_id,r.vname,r.pid,r.eid,r.erole,n.osid,p.partition,p.imageid,p.imagepid,i.imagename,i.loadpart,e.savedisk ".
986 987 988 989 990 991 992 993 994 995 996
	"from reserved as r ".
	"left join nodes as n on n.node_id=r.node_id ".
	"left join node_types as nt on nt.type=n.type ".
	"left join partitions as p on p.node_id=n.node_id and p.osid=n.def_boot_osid ".
        "left join images as i on i.imageid=p.imageid ".
        "left join experiments as e on e.pid=r.pid and e.eid=r.eid ".
	"where $clause");
    if (!$query_result) {
	return 1;
    }

997
    while (my ($node, $vname, $rpid, $reid, $erole, $osid, $part, $imageid,
998
	       $imagepid, $imagename, $lpart, $savedisk) =
999 1000
	   $query_result->fetchrow_array()) {

1001 1002 1003 1004 1005 1006 1007 1008 1009
	my $nodeobject = Node->Lookup($node);

	# If the node is not imageable, skip it.
	next
	    if (! $nodeobject->imageable());
	
	my $dtype = $nodeobject->disktype();
	my $dunit = $nodeobject->bootdisk_unit();

1010 1011 1012 1013
	#
	# XXX not a disk-based OSID.  This can happen during frisbee loads
	#
	if (!defined($imageid)) {
1014
	    print "*** swapinfo: OS $osid is not disk-based!?\n";
1015 1016 1017 1018 1019
	    next
		if (!$allnodes);
	    return 1;
	}

1020 1021 1022
	#
	# Weed out otherwise ineligible nodes:
	#	- from experiments that are not saving disk state
1023
	#	- non-'node' role machines (i.e., delaynodes, virthosts)
1024 1025 1026 1027
	# They are removed from nodeinfo entirely so we do not complain about
	# them below.  This is the only reason we are doing this here rather
	# than as part of the above query.
	#
1028
	if (!defined($savedisk) || $savedisk == 0 || $erole ne "node") {
1029 1030 1031 1032
	    delete $nodeinfo{$node};
	    next;
	}

1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
	# Sanity checks
	if (!defined($nodeinfo{$node})) {
	    next
		if (!$allnodes);
	    print "*** swapinfo: Got partition info for invalid node $node!?\n";
	    return 1;
	}
	if ($nodeinfo{$node} != 0) {
	    print "*** swapinfo: Got redundant partition info for $node!?\n";
	    return 1;
	}
1044

1045 1046
	my $disk = "$dtype$dunit";
	$nodeinfo{$node} =
1047
	    [$vname, $rpid, $reid, $disk, $part, $imagepid, $imagename, $lpart];
1048 1049 1050 1051 1052 1053 1054
    }

    #
    # Copy over the signature file for the image used on every node under
    # the name <vname>.sig.  Likewise, we record the partition that the
    # image resides in under <vname>.part.
    #
1055
    # Note that we actually copy the signature over as <imagename>.sig and
1056 1057 1058
    # then symlink the <vname>.sig's to it.  This not only saves space,
    # but makes it easier to determine what is loaded on each node.
    #
1059 1060 1061 1062 1063
    # Finally note that we are using imagename rather than imageid (which
    # is a numeric UUID).  The latter is really closer to what we want, but
    # was added later and needs to be reconciled with our idea of 'unique'
    # (the signature).
    #
1064
    my %gotsig = ();
1065 1066 1067 1068 1069 1070
    for my $node (keys(%nodeinfo)) {
	my $infop = $nodeinfo{$node};
	if ($infop == 0) {
	    print "*** swapinfo: WARNING: got no partition info for $node!\n";
	    next;
	}
1071
	my ($vname, $rpid, $reid, $disk, $part, $imagepid, $imagename, $lpart) = @{$infop};
1072 1073 1074 1075 1076

	#
	# If imageid is not "fully qualified" with the project name,
	# generate a name that is.
	#
1077 1078 1079
	my $rimagename = $imagename;
	if ($rimagename !~ /^$imagepid-/) {
	    $rimagename = "$imagepid-$imagename";
1080 1081 1082
	}

	# XXX backward compat
1083
	my $infodir = "/$PROJROOT/$rpid/exp/$reid/swapinfo";
1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
	if (! -d "$infodir" && !mkdir($infodir, 0770)) {
	    print "*** swapinfo: no swap info directory $infodir!\n";
	    next
		if (!$allnodes);
	    return 1;
	}

	#
	# First make sure we get rid of any old signature for the node
	# in case any of the following steps fail.
	#
	unlink("$infodir/$vname.sig", "$infodir/$vname.part");

1097 1098 1099 1100 1101
	#
	# Now copy over the base signature if needed, either because
	# it doesn't exist in the swapinfo directory or is out of date.
	#
	my $mustcopy = 0;
1102 1103 1104 1105
	my ($sigdir, $signame);
	if ($imagepid eq TBOPSPID()) {
	    $sigdir = "$TB/images/sigs";
	} else {
1106
	    $sigdir = "/$PROJROOT/$imagepid/images/sigs";
1107
	}
1108
	$signame = "$imagename.ndz.sig";
1109 1110
	$signame =~ s/^$imagepid-//;
	if (! -d $sigdir || ! -f "$sigdir/$signame") {
1111
	    print "*** swapinfo: WARNING: ".
1112
		"no image signature for $rimagename, ".
1113 1114 1115
		"cannot save swapout state!\n";
	    next;
	}
1116
	my $basesig = "$infodir/$rimagename.sig";
1117
	if (! -r $basesig) {
1118 1119 1120 1121 1122 1123
	    $mustcopy = 1;
	} elsif (!defined($gotsig{$basesig})) {
	    my $fromtime = stat("$sigdir/$signame")->mtime;
	    my $totime = stat($basesig)->mtime;
	    if ($fromtime > $totime) {
		print "*** swapinfo: WARNING: ".
1124
		    "$rimagename.sig out of date, updating...\n";
1125 1126 1127
		$mustcopy = 1;
	    } elsif ($fromtime < $totime) {
		print "*** swapinfo: WARNING: ".
1128
		    "$rimagename.sig newer than source $sigdir/$signame!\n";
1129 1130 1131 1132
	    }
	}
	if ($mustcopy) {
	    unlink($basesig);
1133 1134 1135 1136 1137 1138 1139
	    if (system("/bin/cp -p $sigdir/$signame $basesig")) {
		print "*** swapinfo: WARNING: ".
		      "could not create signature $basesig, ".
		      "cannot save swapout state!\n";
		next;
	    }
	}
1140 1141
	$gotsig{$basesig} = 1;

1142
	if (system("/bin/ln -s $rimagename.sig $infodir/$vname.sig")) {
1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160
	    print "*** swapinfo: WARNING: ".
		"could not create signature $infodir/$vname.sig, ".
		    "cannot save swapout state!\n";
	    next;
	}

	if (!open(FD, "> $infodir/$vname.part")) {
		print "*** swapinfo: WARNING: ".
		      "could not create partition file $infodir/$vname.part, ".
		      "cannot save swapout state!\n";
		unlink("$infodir/$vname.sig");
		next;
	}
	print FD "DISK=$disk ";
	print FD "LOADPART=$lpart ";
	print FD "BOOTPART=$part\n";
	close(FD);
    }
1161 1162 1163 1164 1165 1166 1167

    #
    # Now get rid of usused signature files
    # Note that we can only use the gotsig hash if we are loading all nodes
    # in an experiment (else we don't know whether a sig is used or not).
    #
    if ($allnodes) {
1168
	my $infodir = "/$PROJROOT/$pid/exp/$eid/swapinfo";
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181
	my @allsigs = `ls $infodir/*.sig`;
	chomp(@allsigs);
	for my $sig (@allsigs) {
	    if (! -l $sig && !defined($gotsig{$sig})) {
		# untaint the file name
		if ($sig =~ /^($infodir\/[-\w\.\+]+\.sig)$/) {
		    $sig = $1;
		    print "removing unused signature file $sig ...\n";
		    unlink($sig);
		}
	    }
	}
    }
1182 1183
}

1184 1185
# _Always_ make sure that this 1 is at the end of the file...
1;