libosload.pm.in 28.1 KB
Newer Older
1 2 3
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
5 6 7 8 9 10 11 12 13 14 15 16
# All rights reserved.
#
# Osload library. Basically the backend to the osload script, but also used
# where we need finer control of loading of nodes.
#
package libosload;

use strict;
use Exporter;
use vars qw(@ISA @EXPORT);

@ISA    = "Exporter";
17
@EXPORT = qw ( osload osload_wait osload_setupswapinfo );
18 19 20 21 22

# Must come after package declaration!
use lib '@prefix@/lib';
use libdb;
use libreboot;
23
use libtblog;
24 25
use Node;
use NodeType;
26 27 28 29 30 31 32 33
use English;
use File::stat;
use IO::Handle;

# Configure variables
my $TB		= "@prefix@";
my $TESTMODE    = @TESTMODE@;
my $TBOPS       = "@TBOPSEMAIL@";
34
my $ELABINELAB  = @ELABINELAB@;
35
my $PROJROOT    = "@PROJROOT_DIR@";
36 37 38 39 40 41 42

# Max number of retries (per node) before its deemed fatal. This allows
# for the occasional pxeboot failure.
my $MAXRETRIES  = 1;

my $FRISBEELAUNCHER = "$TB/sbin/frisbeelauncher";
my $osselect	    = "$TB/bin/os_select";
43
my $TBUISP	    = "$TB/bin/tbuisp";
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58

# Locals
my %imageinfo       = ();	# Per imageid DB info.
my $debug           = 0;
my %children        = ();	# Child pids in when asyncmode=1

sub osload ($$) {
    my ($args, $result) = @_;

    # These come in from the caller.
    my $imageid;
    my $waitmode    = 1;
    my @nodes       = ();
    my $noreboot    = 0;
    my $asyncmode   = 0;
59
    my $zerofree    = 0;
60
    my $swapinfo    = 0;
61 62 63 64 65 66 67 68 69

    # Locals
    my %retries	    = ();
    my $failures    = 0;
    my $usedefault  = 1;
    my $mereuser    = 0;
    my $rowref;

    if (!defined($args->{'nodelist'})) {
70
	tberror "Must supply a node list!"; # INTERNAL
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
	return -1;
    }
    @nodes = sort(@{ $args->{'nodelist'} });
    
    if (defined($args->{'waitmode'})) {
	$waitmode = $args->{'waitmode'};
    }
    if (defined($args->{'noreboot'})) {
	$noreboot = $args->{'noreboot'};
    }
    if (defined($args->{'debug'})) {
	$debug = $args->{'debug'};
    }
    if (defined($args->{'imageid'})) {
	$imageid    = $args->{'imageid'};
	$usedefault = 0;
    }
    if (defined($args->{'asyncmode'})) {
	$asyncmode = $args->{'asyncmode'};
    }
91 92 93
    if (defined($args->{'zerofree'})) {
	$zerofree = $args->{'zerofree'};
    }
94

95 96 97
    if (defined($args->{'swapinfo'})) {
	$swapinfo = $args->{'swapinfo'};
    }
98 99 100 101 102 103 104 105 106

    #
    # Start a new logging sub-process
    #
    my $old_env = \%ENV;
    local %ENV;
    copy_hash %ENV, $old_env;
    tblog_sub_process("osload", @nodes);

107 108 109 110 111 112 113 114 115
    #
    # Figure out who called us. Root and admin types can do whatever they
    # want. Normal users can only change nodes in experiments in their
    # own projects.
    #
    if ($UID && !TBAdmin($UID)) {
	$mereuser = 1;

	if (! TBNodeAccessCheck($UID, TB_NODEACCESS_LOADIMAGE, @nodes)) {
116 117 118
	    tberror
		"Not enough permission to load images on one or ".
		    "more nodes!";
119 120 121 122 123 124 125 126 127
	    return -1;
	}
    }

    #
    # Check permission to use the imageid.
    # 
    if (defined($imageid) && $mereuser &&
	! TBImageIDAccessCheck($UID, $imageid, TB_IMAGEID_READINFO)) {
128
	tberror "You do not have permission to load '$imageid'!";
129 130 131
	return -1;
    }

132 133 134 135 136 137 138 139 140
    #
    # If there's a maxiumum number of concurrent loads listed for the image,
    # check to see if we'll go over the limit, by checking to see how many
    # other nodes are currently booting thie image's default_osid. This is
    # NOT intended to be strong enforcement of license restrictions, just a
    # way to catch mistakes.
    #
    if (defined($imageid) &&
	!TBImageLoadMaxOkay($imageid, scalar(@nodes), @nodes)) {
141 142 143
	tberror 
	    "Would exceed maxiumum concurrent instances ".
		"limitation for $imageid";
144 145 146
	return -1;
    }

147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
    #
    # This is somewhat hackish. To promote parallelism during os_setup, we
    # want to fork off the osload from the parent so it can do other things.
    # The problem is how to return status via the results vector. Well,
    # lets do it with some simple IPC. Since the results vector is simply
    # a hash of node name to an integer value, its easy to pass that back.
    #
    # We return the pid to the caller, which it can wait on either directly
    # or by calling back into this library if it wants to actually get the
    # results from the child!
    # 
    if ($asyncmode) {
	#
	# Create a pipe to read back results from the child we will create.
	#
162 163
	my $PARENT_READER = new IO::Handle; # Need a new handle for each child
	if (! pipe($PARENT_READER, CHILD_WRITER)) {
164
	    tberror "creating pipe: $!";
165 166 167 168 169 170
	    return -1;
	}
	CHILD_WRITER->autoflush(1);

	if (my $childpid = fork()) {
	    close(CHILD_WRITER);
171
	    $children{$childpid} = [ $PARENT_READER, $result ];
172 173 174 175 176
	    return $childpid;
	}
	#
	# Child keeps going. 
	#
177
	close($PARENT_READER);
178 179 180
	TBdbfork();
    }

181 182 183 184
    # This will store information about each node, so that if we have to try
    # again later, we'll have it all.
    my %reload_info;

185 186 187 188 189 190 191
    # Loop for each node.
    foreach my $node (@nodes) {
	# All nodes start out as being successful; altered later as needed.
	$result->{$node} = 0;
	
	# Get default imageid for this node.
	my $default_imageid;
192 193
	if (! DefaultImageID($node, \$default_imageid) ||
	    ! defined($default_imageid)) {
194
	    tberror "$node: No default imageid defined!";
195 196 197 198 199 200 201 202 203 204 205
	    goto failednode;
	}

	if ($usedefault) {
	    $imageid = $default_imageid;
	}

	print STDERR "osload: Using $imageid for $node\n"
	    if $debug;

	#
206 207 208
	# We try to avoid repeated queries to DB for info that does not change
	# by caching the image info on the first use.  GetImageInfo() will
	# perform various one-time checks as well.
209
	# 
210 211
	if (!exists($imageinfo{$imageid}) && !GetImageInfo($imageid, $node)) {
	    goto failednode;
212
	}
213
	$rowref = $imageinfo{$imageid};
214 215 216
	if ($rowref eq 'BADIMAGE') {
	    goto failednode;
	}
217 218 219 220 221

	my $loadpart       = $rowref->{'loadpart'};
	my $loadlen        = $rowref->{'loadlength'};
	my $imagepath      = $rowref->{'path'};
	my $defosid        = $rowref->{'default_osid'};
222
	my $maxwait	   = $rowref->{'maxloadwait'};
223
	my $imagepid	   = $rowref->{'pid'};
224 225 226 227 228

	print "osload ($node): Changing default OS to $defosid\n";
	if (!$TESTMODE) {
	    system("$osselect $defosid $node");
	    if ($?) {
229
		tberror "$node: os_select $defosid failed!";
230 231 232 233 234 235 236 237 238 239 240 241
		goto failednode;
	    }
	}

	#
	# If loading an image (which is not the default) then schedule
	# a reload for it so that when the experiment is terminated it
	# will get a fresh default image before getting reallocated to
	# another experiment.
	#
	if ($imageid ne $default_imageid &&
	    !TBSetSchedReload($node, $default_imageid)) {
242 243
	    tberror
		"$node: Could not schedule default reload";
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
	    goto failednode;
	}

	#
	# Assign partition table entries for each partition in the image.
	# This is complicated by the fact that an image that covers only
	# part of the slices, should only change the partition table entries
	# for the subset of slices that are written to disk.
	#
	my $startpart = $loadpart == 0 ? 1 : $loadpart;
	my $endpart   = $startpart + $loadlen;
	
	for (my $i = $startpart; $i < $endpart; $i++) {
	    my $partname = "part${i}_osid";
	    my $dbresult;
	    
	    if (defined($rowref->{$partname})) {
		my $osid = $rowref->{$partname};
		
		$dbresult =
		    DBQueryWarn("replace into partitions ".
265 266 267
				"(node_id,partition,osid,imageid,imagepid) ".
				"values ".
				"('$node','$i','$osid','$imageid','$imagepid')");
268 269 270 271 272 273 274
	    }
	    else {
		$dbresult =
		    DBQueryWarn("delete from partitions ".
				"where node_id='$node' and partition='$i'");
	    }
	    if (!$dbresult) {
275
		tberror "$node: Could not update partition table";
276 277 278 279
		goto failednode;
	    }
	}

280 281 282 283 284 285 286 287 288
	#
	# Setup swapinfo now after partitions have initialized but before
	# we setup the one-shot frisbee load.
	#
	if ($swapinfo) {
	    print "osload: Updating image signature.\n";
	    osload_setupswapinfo(undef, undef, $node);
	}

289 290 291 292 293 294 295 296 297 298 299 300 301
	#
	# Determine which mode to use for reloading this node (note: this may
	# become an entry in node_capabilities or something like that in the
	# future - that would be cleaner)
	#
	my ($type, $class) = TBNodeType($node);
	my $reload_mode;
	my $reload_func;
	my $reboot_required;
	if ($class eq "mote") {
	    $reload_mode = "UISP";
	    $reload_func = \&SetupReloadUISP;
	    $reboot_required = 0; # We don't reboot motes to reload them
302
	    $zerofree = 0; # and we don't zero "the disk"
303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
	} else {
	    $reload_mode = "Frisbee";
	    $reload_func = \&SetupReloadFrisbee;
	    $reboot_required = !$noreboot; # Reboot unless $noreboot flag set
	}

	#
	# Remember this stuff so that if we have to retry this node again
	# later, we'll know how to handle it
	#
	$reload_info{$node} = {
	    'mode'    => $reload_mode,
	    'func'    => $reload_func,
	    'imageid' => $imageid,
	    'osid'    => $defosid,
318
	    'reboot'  => $reboot_required,
319 320
	    'zerofree'=> $zerofree,
	    'maxwait' => $maxwait
321 322 323 324
	};

	print "Setting up reload for $node (mode: $reload_mode)\n";

325
	if (!$TESTMODE) {
326
	    if (&$reload_func($node, $imageid, $defosid, $zerofree) < 0) {
327
		tberror("$node: Could not set up reload. Skipping.");
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
		goto failednode;
	    }
	}
	next;
	
      failednode:
	$result->{$node} = -1;
	$failures++;
    }

    #
    # Remove any failed nodes from the list we are going to operate on.
    #
    my @temp = ();
    foreach my $node (@nodes) {
	push(@temp, $node)
	    if (! $result->{$node});
    }
    @nodes = @temp;

    # Exit if not doing an actual reload.
    if ($TESTMODE) {
	print "osload: Stopping in Testmode!\n";
	goto done;
    }

    if (! @nodes) {
355
	tbnotice "Stopping because of previous failures!";
356 357 358 359 360
	goto done;
    }

    # Fire off a mass reboot and quit if not in waitmode.
    if (! $waitmode) {
361
	my ($reboot_nodes, $noreboot_nodes)
362
	    = GetNodesRequiringReboot(\%reload_info, keys(%reload_info));
363
	if (@$reboot_nodes) {
364 365 366 367 368 369 370
	    print "osload: Rebooting nodes.\n";

	    my %reboot_args     = ();
	    my %reboot_failures = ();

	    $reboot_args{'debug'}    = $debug;
	    $reboot_args{'waitmode'} = 0;
371
	    $reboot_args{'nodelist'} = [ @$reboot_nodes ];
372 373

	    if (nodereboot(\%reboot_args, \%reboot_failures)) {
374
		foreach my $node (@$reboot_nodes) {
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
		    if ($reboot_failures{$node}) {
			$result->{$node} = $reboot_failures{$node};
			$failures++;
		    }
		}
	    }
	}
	goto done;
    }

    #
    # The retry vector is initialized to the number of retries we allow per
    # node, afterwhich its a fatal error.
    #
    foreach my $node (@nodes) {
	$retries{$node} = $MAXRETRIES;
    }
    
    while (@nodes) {
394 395 396
	my ($reboot_nodes, $noreboot_nodes)
	    = GetNodesRequiringReboot(\%reload_info, @nodes);
	if (@$reboot_nodes) {
397
	    # Reboot them all.
398
	    print "osload: Issuing reboot for @$reboot_nodes and then waiting ...\n";
399 400 401 402 403 404
	    
	    my %reboot_args     = ();
	    my %reboot_failures = ();

	    $reboot_args{'debug'}    = $debug;
	    $reboot_args{'waitmode'} = 0;
405
	    $reboot_args{'nodelist'} = [ @$reboot_nodes ];
406 407 408 409 410 411 412 413

	    if (nodereboot(\%reboot_args, \%reboot_failures)) {
		#
		# If we get any failures in the reboot, we want to
		# alter the list of nodes accordingly for the next phase.
		# 
		my @temp = ();
		
414
		foreach my $node (@$reboot_nodes) {
415 416 417 418 419 420 421 422
		    if ($reboot_failures{$node}) {
			$result->{$node} = $reboot_failures{$node};
			$failures++;
		    }
		    else {
			push(@temp, $node);
		    }
		}
423
		@nodes = (@temp,@$noreboot_nodes);
424 425 426 427 428
	    }
	}

	# Now wait for them.
	my $startwait   = time;
429 430 431 432
	my @failednodes = WaitTillReloadDone($startwait,
					     $waitmode,
					     \%reload_info, 
					     @nodes);
433 434 435 436 437 438
	@nodes=();
    
	while (@failednodes) {
	    my $node = shift(@failednodes);

	    if ($retries{$node}) {
439
		tbnotice "$node: Trying again ...";
440

441 442
		my $reload_info = $reload_info{$node};

443
		# Possible race with reboot?
444
		if (&{$reload_info->{'func'}}($node, $reload_info->{'imageid'},
445
		    $reload_info->{'osid'}, $reload_info->{'zerofree'}) < 0) {
446
		    tberror("$node: Could not set up reload. Skipping.");
447 448 449 450 451 452 453 454 455 456
		    $result->{$node} = -1;
		    $failures++;
		    next;
		}
		push(@nodes, $node);

		# Retry until count hits zero.
		$retries{$node} -= 1;
	    }
	    else {
457 458
		tberror ({sublevel => -1}, 
			 "$node failed to boot too many times. Skipping!");
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
		$result->{$node} = -1;
		$failures++;
	    }
	}
    }
  done:
    print "osload: Done! There were $failures failures.\n";

    if ($asyncmode) {
	#
	# We are a child. Send back the results to the parent side
	# and *exit* with status instead of returning it.
	# 
	foreach my $node (keys(%{ $result })) {
	    my $status = $result->{$node};
	    
	    print CHILD_WRITER "$node,$status\n";
	}
	close(CHILD_WRITER);
	exit($failures);
    }
    
    return $failures;
}

484 485 486 487 488
#
# Fetch information for a specified image the first time it is used
# (for the indicated node).  This info is cached for use by all other
# nodes that require the image.  Returns 1 on success, 0 on failure.
#
489
sub GetImageInfo($;$)
490 491 492 493 494 495
{
    my ($imageid, $node) = @_;

    my $query_result =
	DBQueryWarn("select * from images where imageid='$imageid'");
    if (! $query_result || $query_result->numrows < 1) {
496
	tberror "Imageid $imageid is not defined!";
497
	$imageinfo{$imageid} = 'BADIMAGE';
498 499 500 501 502 503 504 505 506 507 508 509
	return 0;
    }
    $imageinfo{$imageid} = $query_result->fetchrow_hashref();

    my $rowref = $imageinfo{$imageid};
    my $imagepath = $rowref->{'path'};

    #
    # Perform a few validity checks: imageid should have a file name
    # and that file should exist.
    #
    if (!defined($imagepath)) {
510
	tberror "No filename associated with $imageid!";
511
	$imageinfo{$imageid} = 'BADIMAGE';
512 513 514 515 516 517 518 519 520 521
	return 0;
    }

    if (! -R $imagepath) {
	if ($ELABINELAB) {
	    #
	    # Yuck. See if we can get it via frisbeelauncher before giving up.
	    #
	    system("$FRISBEELAUNCHER " . ($debug ? "-d ": "") . "$imageid");
	    if ($?) {
522
		tberror "Frisbeelauncher ($imageid) failed!";
523
		$imageinfo{$imageid} = 'BADIMAGE';
524 525 526
		return 0;
	    }
	    if (! -R $imagepath) {
527
		tberror	"Frisbeelauncher could not fetch $imagepath ($imageid)!";
528
		$imageinfo{$imageid} = 'BADIMAGE';
529 530 531 532
		return 0;
	    }
	}
	else {
533
	    tberror "$imagepath does not exist or cannot be read!";
534
	    $imageinfo{$imageid} = 'BADIMAGE';
535 536 537 538
	    return 0;
	}
    }

539 540 541 542 543 544 545 546 547 548 549 550 551
    my $sb = stat($imagepath);

    #
    # A zero-length image cannot be right and will result in much confusion
    # if allowed to pass: the image load will succeed, but the disk will be
    # unchanged, making it appear that os_load loaded the default image.
    #
    if ($sb->size == 0) {
	tberror "$imagepath is empty!";
	$imageinfo{$imageid} = 'BADIMAGE';
	return 0;
    }

552 553 554 555 556 557 558 559 560
    #
    # Compute a maxwait time based on the image size plus a constant
    # factor for the reboot cycle.  This is used later in
    # WaitTillReloadDone().  Arguably, this should be part of the
    # image DB state, so we store it in the imageinfo array too.
    #
    if (!defined($rowref->{'maxloadwait'})) {
	my $chunks = $sb->size >> 20; # size may be > 2^31, shift is unsigned

561
	$rowref->{'maxloadwait'} = int((($chunks / 100.0) * 65) + (5 * 60));
562 563 564 565 566 567 568 569 570 571 572 573 574
    }

    print STDERR
	"$imageid: loadpart=", $rowref->{'loadpart'},
	", loadlen=", $rowref->{'loadlength'},
	", imagepath=", $rowref->{'path'},
	", defosid=", $rowref->{'default_osid'},
	", maxloadwait=", $rowref->{'maxloadwait'}, "\n"
	    if ($debug);

    return 1;
}

575
# Wait for a reload to finish by watching its state
576
sub WaitTillReloadDone($$$@)
577
{
578
    my ($startwait, $waitmode, $reload_info, @nodes) = @_;
579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
    my %done	= ();
    my $count   = @nodes;
    my @failed  = ();

    foreach my $node ( @nodes ) { $done{$node}  = 0; }

    print STDERR "Waiting for @nodes to finish reloading\n".`date` if $debug;

    # Start a counter going, relative to the time we rebooted the first
    # node.
    my $waittime  = 0;
    my $minutes   = 0;

    while ($count) {
	# Wait first to make sure reboot is done, and so that we don't
	# wait one more time after everyone is up.
	sleep(5);
	foreach my $node (@nodes) {
	    if (! $done{$node}) {
598 599 600 601 602 603 604 605 606 607 608 609
		my $maxwait;

		#
		# If we have to zero fill free space, then the
		# wait time has to be proportional to the disk
		# size.  In other words, a really, really, really
		# long time.  Lets assume 20MB/sec to blast zeros,
		# so 50 seconds/GB.  What the heck, lets call it
		# 1GB/minute.  Did I mention how this would take
		# a really long time?
		#
		if ($reload_info->{$node}{'zerofree'}) {
610 611 612
		    my $nodeobject = Node->Lookup($node);
		    my $disksize   = $nodeobject->disksize();

613 614 615 616
		    $disksize = 20
			if (!$disksize);
		    $maxwait = ($disksize * 60);
		} else {
617
		    $maxwait = $reload_info->{$node}{'maxwait'};
618
		}
619 620 621 622 623 624 625 626 627 628
		
		my $query_result =
		    DBQueryWarn("select * from current_reloads ".
				"where node_id='$node'");

		#
		# There is no point in quitting if this query fails. Just
		# try again in a little bit.
		# 
		if (!$query_result) {
629
		    tbwarn "$node: Query failed; waiting a bit.";
630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
		    next;
		}

		#
		# We simply wait for stated to clear the current_reloads entry.
		#
		if (!$query_result->numrows) {
		    print STDERR "osload ($node): left reloading mode at ".`date`
			if ($debug);
		    
		    $count--;
		    $done{$node} = 1;
		    next;
		}
	
		# Soon we will have stated's timeouts take care of
		# rebooting once or twice if we get stuck during
		# reloading.
		$waittime = time - $startwait;
		if ($waittime > $maxwait) {
		    my $t = (int ($waittime / 60));
651 652
		    tbnotice "$node appears wedged; ".
			"it has been $t minutes since it was rebooted.";
653
		    TBNodeConsoleTail($node, *STDERR);
654 655

		    $count--;
656
		    $done{$node} = $waitmode;
657 658 659 660 661 662 663 664 665 666 667
		    push(@failed, $node);
		    next;
		}
		if (int($waittime / 60) > $minutes) {
		    $minutes = int($waittime / 60);
		    print STDERR "osload ($node): still waiting; ".
			"it has been $minutes minute(s)\n";
		}
	    }
	}
    }
668 669 670 671 672 673 674

    if ($waitmode > 1) {
	$count = @nodes - @failed;
	$startwait = time;
	while ($count) {
	    foreach my $node (@nodes) {
		print STDERR "osload ($node): waiting for node to finish ".
675
		    "booting\n";
676
		if ($done{$node} < $waitmode) {
677 678
		    my $actual_state;

679 680
		    if (!TBNodeStateWait($node,
					 $startwait,
681 682 683 684
					 (60*6),
					 \$actual_state,
					 (TBDB_NODESTATE_TBFAILED,
					  TBDB_NODESTATE_ISUP))) {
685 686 687 688 689 690 691 692
			$count--;
			$done{$node} = $waitmode;
		    }
		}
	    }
	}
    }

693 694 695 696
    return @failed;
}

# Setup a reload. 
697
sub SetupReloadFrisbee($$$$)
698
{
699
    my ($node, $imageid, $osid_notused, $zerofree) = @_;
700
    my $osid = TBNodeDiskloadOSID($node);
701

702 703 704 705
    #
    # Put it in the current_reloads table so that nodes can find out which
    # OS to load. See tmcd. 
    #
706
    my $query_result = 
707
	DBQueryWarn("replace into current_reloads ".
708 709
		    "(node_id, image_id, mustwipe) values ".
		    "('$node', '$imageid', $zerofree)");
710 711 712
    return -1
	if (!$query_result);

713
    system("$osselect -1 $osid $node");
714
    if ($?) {
715
	tberror "os_select $osid failed!";
716 717 718 719
	return -1;
    }
    system("$FRISBEELAUNCHER " . ($debug ? "-d ": "") . "$imageid");
    if ($?) {
720
	tberror "Frisbee Launcher ($imageid) failed!";
721 722 723 724 725
	return -1;
    }
    return 0;
}

726 727 728 729 730
#
# Setup a reload, using USIP (for motes), rather than Frisbee. Note that
# this differs from a Frisbee reload in one key way - it does the reload
# right here in this code, rather than setting up a reload for later.
#
731
sub SetupReloadUISP($$$$)
732
{
733
    my ($node, $imageid, $osid, $zerofree_unused) = @_;
734 735 736 737 738 739 740

    #
    # Get the path to the image
    #
    my $query_result = DBQueryFatal("select path from images " .
	"where imageid='$imageid'");
    if ($query_result->num_rows() != 1) {
741
	tberror "Failed to get path for $imageid!";
742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761
	return -1;
    }
    my ($path) = $query_result->fetchrow();

    #
    # Tell stated that we're about to start reloading
    #
    TBSetNodeNextOpMode($node,TBDB_NODEOPMODE_RELOADMOTE);

    #
    # The mote goes 'down', then starts to reload
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_SHUTDOWN);
    TBSetNodeEventState($node,TBDB_NODESTATE_RELOADING);

    #
    # Okay, just run tbuisp with that path
    #
    my $rv = system("$TBUISP upload $path $node");
    if ($rv) {
762
	tberror "$node: tbuisp failed";
763 764 765 766 767 768 769 770 771 772
	return -1;
    }

    #
    # Tell stated that we've finished reloading the node
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_RELOADDONE);

    system("$osselect $osid $node");
    if ($?) {
773
	tberror "os_select $osid failed!";
774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789
	goto failednode;
    }

    #
    # 'Reboot' the node (from stated's perspective, anyway)
    # has been shutdown, so that the os_select will take effect
    #
    TBSetNodeEventState($node,TBDB_NODESTATE_SHUTDOWN);

    return 0;
}

#
# Return two array references (possbily empty) of:
# [all nodes requiring reboot, all nodes not requiring reboot]
#
790 791
sub GetNodesRequiringReboot($@) {
    my ($reload_info, @nodes) = @_;
792
    my (@reboot, @noreboot);
793
    foreach my $node (@nodes) {
794 795 796 797 798 799 800 801 802
	if ($reload_info->{$node}{'reboot'}) {
	    push @reboot, $node;
	} else {
	    push @noreboot, $node;
	}
    }
    return (\@reboot, \@noreboot);
}

803 804 805 806 807 808 809 810 811 812 813
#
# This gets called in the parent, to wait for an async osload that was
# launched earlier (asyncmode). The child will print the results back
# on the the pipe that was opened between the parent and child. They
# are stuffed into the original results array.
# 
sub osload_wait($)
{
    my ($childpid) = @_;

    if (!exists($children{$childpid})) {
814
	tberror "No such child pid $childpid!"; # INTERNAL
815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
	return -1;
    }
    my ($PARENT_READER, $result) = @{ $children{$childpid}};

    #
    # Read back the results.
    # 
    while (<$PARENT_READER>) {
	chomp($_);

	if ($_ =~ /^([-\w]+),([-\d])+$/) {
	    $result->{$1} = $2;
	    print STDERR "reload ($1): child returned $2 status.\n";
	}
	else {
830
	    tberror "Improper response from child: $_"; # INTERNAL
831 832 833 834 835 836 837 838 839 840
	}
    }
    
    #
    # And get the actual exit status.
    # 
    waitpid($childpid, 0);
    return $? >> 8;
}

841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
#
# Save signature files and boot partition info for all nodes in an experiment
# (or just the listed nodes).  We call this when swapping in an experiment or
# when reloading nodes in an experiment.
#
# Note that this is not strictly an os loading function, we do it on swapins
# of nodes which already have the correct OS as well.  But we stick it here
# because it is about os loading in principle.
#
sub osload_setupswapinfo($$;@)
{
    my ($pid, $eid, @nodelist) = @_;
    my %nodeinfo = ();
    my $allnodes;
    my $clause = "";

857
    if (!@nodelist) {
858
	@nodelist = ExpNodes($pid, $eid, 1, 0);
859 860 861 862 863 864 865 866 867
	$clause .= "r.pid='$pid' and r.eid='$eid'";
	$allnodes = 1;
    } else {
	$clause .= "r.node_id in (" . join(",", map("'$_'", @nodelist)) . ")";
	$allnodes = 0;
    }
    map { $nodeinfo{$_} = 0 } @nodelist;

    # XXX only know how to do this for local PCs right now
868
    $clause .= " and nt.class='pc' and nt.isremotenode=0";
869 870 871 872 873 874 875 876

    #
    # Note that we are using the def_boot_osid from the nodes table to identify
    # the image of interest.  This is because the osid field is set by stated
    # after a node has reached the BOOTING state the first time, and may be
    # set to an MFS at other times.
    #
    my $query_result = DBQueryWarn(
877
	"select r.node_id,r.vname,r.pid,r.eid,r.erole,n.osid,p.partition,p.imageid,p.imagepid,i.loadpart,e.savedisk ".
878 879 880 881 882 883 884 885 886 887 888
	"from reserved as r ".
	"left join nodes as n on n.node_id=r.node_id ".
	"left join node_types as nt on nt.type=n.type ".
	"left join partitions as p on p.node_id=n.node_id and p.osid=n.def_boot_osid ".
        "left join images as i on i.imageid=p.imageid ".
        "left join experiments as e on e.pid=r.pid and e.eid=r.eid ".
	"where $clause");
    if (!$query_result) {
	return 1;
    }

889 890
    while (my ($node, $vname, $rpid, $reid, $erole, $osid, $part, $imageid,
	       $imagepid, $lpart, $savedisk) =
891 892
	   $query_result->fetchrow_array()) {

893 894 895 896 897 898 899 900 901
	my $nodeobject = Node->Lookup($node);

	# If the node is not imageable, skip it.
	next
	    if (! $nodeobject->imageable());
	
	my $dtype = $nodeobject->disktype();
	my $dunit = $nodeobject->bootdisk_unit();

902 903 904 905 906 907 908 909 910 911
	#
	# XXX not a disk-based OSID.  This can happen during frisbee loads
	#
	if (!defined($imageid)) {
	    print "*** swapinfo: $osid is not disk-based!?\n";
	    next
		if (!$allnodes);
	    return 1;
	}

912 913 914
	#
	# Weed out otherwise ineligible nodes:
	#	- from experiments that are not saving disk state
915
	#	- non-'node' role machines (i.e., delaynodes, virthosts)
916 917 918 919 920 921 922 923 924
	# They are removed from nodeinfo entirely so we do not complain about
	# them below.  This is the only reason we are doing this here rather
	# than as part of the above query.
	#
	if ($savedisk == 0 || $erole ne "node") {
	    delete $nodeinfo{$node};
	    next;
	}

925 926 927 928 929 930 931 932 933 934 935
	# Sanity checks
	if (!defined($nodeinfo{$node})) {
	    next
		if (!$allnodes);
	    print "*** swapinfo: Got partition info for invalid node $node!?\n";
	    return 1;
	}
	if ($nodeinfo{$node} != 0) {
	    print "*** swapinfo: Got redundant partition info for $node!?\n";
	    return 1;
	}
936

937 938 939 940 941 942 943 944 945 946 947 948 949 950
	my $disk = "$dtype$dunit";
	$nodeinfo{$node} =
	    [$vname, $rpid, $reid, $osid, $disk, $part, $imageid, $imagepid, $lpart];
    }

    #
    # Copy over the signature file for the image used on every node under
    # the name <vname>.sig.  Likewise, we record the partition that the
    # image resides in under <vname>.part.
    #
    # Note that we actually copy the signature over as <imageid>.sig and
    # then symlink the <vname>.sig's to it.  This not only saves space,
    # but makes it easier to determine what is loaded on each node.
    #
951
    my %gotsig = ();
952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969
    for my $node (keys(%nodeinfo)) {
	my $infop = $nodeinfo{$node};
	if ($infop == 0) {
	    print "*** swapinfo: WARNING: got no partition info for $node!\n";
	    next;
	}
	my ($vname, $rpid, $reid, $osid, $disk, $part, $imageid, $imagepid, $lpart) = @{$infop};

	#
	# If imageid is not "fully qualified" with the project name,
	# generate a name that is.
	#
	my $rimageid = $imageid;
	if ($rimageid !~ /^$imagepid-/) {
	    $rimageid = "$imagepid-$imageid";
	}

	# XXX backward compat
970
	my $infodir = "/$PROJROOT/$rpid/exp/$reid/swapinfo";
971 972 973 974 975 976 977 978 979 980 981 982 983
	if (! -d "$infodir" && !mkdir($infodir, 0770)) {
	    print "*** swapinfo: no swap info directory $infodir!\n";
	    next
		if (!$allnodes);
	    return 1;
	}

	#
	# First make sure we get rid of any old signature for the node
	# in case any of the following steps fail.
	#
	unlink("$infodir/$vname.sig", "$infodir/$vname.part");

984 985 986 987 988
	#
	# Now copy over the base signature if needed, either because
	# it doesn't exist in the swapinfo directory or is out of date.
	#
	my $mustcopy = 0;
989 990 991 992
	my ($sigdir, $signame);
	if ($imagepid eq TBOPSPID()) {
	    $sigdir = "$TB/images/sigs";
	} else {
993
	    $sigdir = "/$PROJROOT/$imagepid/images/sigs";
994 995 996 997
	}
	$signame = "$imageid.ndz.sig";
	$signame =~ s/^$imagepid-//;
	if (! -d $sigdir || ! -f "$sigdir/$signame") {
998 999
	    print "*** swapinfo: WARNING: ".
		"no image signature for $rimageid, ".
1000 1001 1002
		"cannot save swapout state!\n";
	    next;
	}
1003
	my $basesig = "$infodir/$rimageid.sig";
1004
	if (! -r $basesig) {
1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019
	    $mustcopy = 1;
	} elsif (!defined($gotsig{$basesig})) {
	    my $fromtime = stat("$sigdir/$signame")->mtime;
	    my $totime = stat($basesig)->mtime;
	    if ($fromtime > $totime) {
		print "*** swapinfo: WARNING: ".
		    "$rimageid.sig out of date, updating...\n";
		$mustcopy = 1;
	    } elsif ($fromtime < $totime) {
		print "*** swapinfo: WARNING: ".
		    "$rimageid.sig newer than source $sigdir/$signame!\n";
	    }
	}
	if ($mustcopy) {
	    unlink($basesig);
1020 1021 1022 1023 1024 1025 1026
	    if (system("/bin/cp -p $sigdir/$signame $basesig")) {
		print "*** swapinfo: WARNING: ".
		      "could not create signature $basesig, ".
		      "cannot save swapout state!\n";
		next;
	    }
	}
1027 1028 1029
	$gotsig{$basesig} = 1;

	if (system("/bin/ln -s $rimageid.sig $infodir/$vname.sig")) {
1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047
	    print "*** swapinfo: WARNING: ".
		"could not create signature $infodir/$vname.sig, ".
		    "cannot save swapout state!\n";
	    next;
	}

	if (!open(FD, "> $infodir/$vname.part")) {
		print "*** swapinfo: WARNING: ".
		      "could not create partition file $infodir/$vname.part, ".
		      "cannot save swapout state!\n";
		unlink("$infodir/$vname.sig");
		next;
	}
	print FD "DISK=$disk ";
	print FD "LOADPART=$lpart ";
	print FD "BOOTPART=$part\n";
	close(FD);
    }
1048 1049 1050 1051 1052 1053 1054

    #
    # Now get rid of usused signature files
    # Note that we can only use the gotsig hash if we are loading all nodes
    # in an experiment (else we don't know whether a sig is used or not).
    #
    if ($allnodes) {
1055
	my $infodir = "/$PROJROOT/$pid/exp/$eid/swapinfo";
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
	my @allsigs = `ls $infodir/*.sig`;
	chomp(@allsigs);
	for my $sig (@allsigs) {
	    if (! -l $sig && !defined($gotsig{$sig})) {
		# untaint the file name
		if ($sig =~ /^($infodir\/[-\w\.\+]+\.sig)$/) {
		    $sig = $1;
		    print "removing unused signature file $sig ...\n";
		    unlink($sig);
		}
	    }
	}
    }
1069 1070
}

1071 1072
# _Always_ make sure that this 1 is at the end of the file...
1;