#!/usr/bin/perl -w

#
# Copyright (c) 2000-2016, 2018 University of Utah and the Flux Group.
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
#

use English;
use Getopt::Std;

#
# Look for nodes to reload.
#
#	usage: reload_daemon [-d] [-t tag]
#
# TODO: Use "logger" instead of writing a log file.
#
sub usage()
{
38 39 40 41 42 43 44 45
    print STDOUT "Usage: reload_daemon [-d] [-t tag]\n" .
	"    -d     Prevent daemonization\n" . 
	"    -t tag Only manage reloads for nodes or node types\n" . 
	"           that have the value of <tag> for a node_type_attribute\n" . 
	"           or a node_attribute named 'reload_daemon_tag'.\n" . 
	"           IF this tag is not set, the reload_daemon picks only\n" . 
	"           those nodes that DO NOT have this type or node\n" . 
	"           attribute set!\n";
46 47
    exit(-1);
}
48
my  $optlist = "dt:";
49 50 51 52 53 54 55 56

#
# Configure variables
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";

57 58 59
# Set this to turn off tblog in libraries.
$ENV{'TBLOG_OFF'} = "yep";

60 61 62
# Testbed Support library
use lib "@prefix@/lib";
use libdb;
63
use libosload;
64
use libtestbed;
65 66
use Experiment;
use Node;
67
use NodeType;
68 69
use EmulabFeatures;
use User;
70 71 72 73 74 75 76

#
# These come from the library.
# 
my $RELOADPID	= NODERELOADING_PID;
my $RELOADEID	= NODERELOADING_EID;
my $PENDINGEID	= NODERELOADPENDING_EID;
77 78
my $REPOSPID	= NODEREPOSITIONING_PID;
my $RPPENDINGEID= NODEREPOSPENDING_EID;
79 80
my $NODEDEAD_PID= NODEDEAD_PID;
my $NODEDEAD_EID= NODEDEAD_EID;
81

82 83
sub myosload($$$$);
sub logit($);
84 85
sub fatal($);
sub notify($);
86
sub freefromreloading($);
87 88
sub getsitevars();
sub tohwdown($$$);
89
		      
90 91
my $sched_reload= "$TB/sbin/sched_reload";
my $reboot	= "$TB/bin/node_reboot";
92
my $tbrsync     = "$TB/bin/tbrsync";
93 94
my $power	= "$TB/bin/power";
my $nodeadmin	= "$TB/bin/node_admin";
95 96
my $logfile	= "$TB/log/reloadlog";
my $debug	= 0;
97
my $tag;
98
my $retry_time  = 20;              # in minutes
99
my $fail_time	= 0;		   # in minutes
100 101
my $widearea_multiplier = 2;       # widearea nodes get (mult+1)x longer, but
                                   #  possibly not quite true cause of mustwipe)
102 103
my $warnonretry = 1;
my $hwdownaction= "nothing";
104 105
my %retried     = ();
my %failed	= ();
106
my @retry_list  = ();
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132

#
# Turn off line buffering on output (dots ...).
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
if (defined($options{"d"})) {
    $debug = $options{"d"};
}
133 134 135 136 137
if (defined($options{"t"})) {
    $tag = $options{"t"};
    # rename the logfile too
    $logfile = "$logfile-$tag";
}
138

139 140 141 142 143 144 145 146 147 148
#
# This should run as root to make sure that it has permission to reboot nodes
# (since only root is allowed to power cycle nodes at any time - it's time-
# limited for anyone else)
#
if ($UID != 0 && !defined($tag)) {
    die("*** $0:\n".
	"    Only root can run this script!\n");
}

149
#
150 151
# Only one please (for the default reload_daemon).  If you specified
# a tag, it's your problem.
152
#
153
if (!defined($tag) && CheckDaemonRunning("reload_daemon")) {
154 155 156
    fatal("Not starting another reload daemon!");
}

157 158
# Go to ground.
if (! $debug) {
159 160 161 162
    if (TBBackGround($logfile)) {
	exit(0);
    }
}
163
if (!defined($tag) && MarkDaemonRunning("reload_daemon")) {
164
    fatal("Could not mark daemon as running!");
165
}
166 167 168 169 170 171 172 173 174
#
# Setup a signal handler for newsyslog.
#
sub handler()
{
    ReOpenLog($logfile);
}
$SIG{HUP} = \&handler
    if (!$debug);
175

176
logit("Reload Daemon starting... pid $$");
177

178 179 180 181 182 183 184
# We use this a lot.
my $reloading_experiment = Experiment->Lookup($RELOADPID, $RELOADEID);
if (!defined($reloading_experiment)) {
    Fatal("Could not locate experiment object for $RELOADEID");
    return;
}

185 186 187
#
# Loop, looking for nodes to reload.
# 
188
my $idle=0;
189
my $lastvartime = 0;
190
while (1) {
191
    my($count, $which, @row, %hrow, $imageid, $node, $retry, $stamp);
192
    my($pid, $eid);
193 194

    # Partial delay between loops in case of an error.
195 196 197 198 199 200 201
    if ($idle) {
	sleep(10);
    }
    # Wait longer if we're not doing anything
    else {
	sleep(1);
    }
202

203 204 205 206 207 208
    #
    # We use this to figure out when to delete nodes from the retried and
    # warned hashes
    #
    my $time = time();

209 210 211 212 213 214 215 216
    # Re-read sitevars periodically
    if ($time - $lastvartime > 60) {
	getsitevars();
	$lastvartime = $time;
    }

    $idle=1; # Assume we're going to be idle this iteration

217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
    #
    # If we are the default reload daemon (i.e., have no tag for our 
    # reload_pool), only look for nodes that have neither a reload_pool
    # node_type_attribute nor a node_attribute.
    #
    # If we have a reload_pool tag, only pick up nodes that 
    #  * have our tag for the node_type_attribute, and our tag or NULL
    #    for the node_attribute, OR
    #  * have our tag for the node attribute.
    #
    my $tag_query = '';
    if (!defined($tag)) {
	$tag_query = 'and nta_reload_pool.attrvalue is NULL' . 
	    ' and na_reload_pool.attrvalue is NULL';
    }
    else {
	$tag_query = "" . 
	    " and ((nta_reload_pool.attrvalue='$tag' and" . 
	    "       (na_reload_pool.attrvalue='$tag'" . 
	    "        or na_reload_pool.attrvalue is NULL))" . 
	    "      or na_reload_pool.attrvalue='$tag')";
    }

240
    #
241
    # Find all nodes in emulab-ops/reloading
242
    #
243
    $query_result =
244 245
	DBQueryWarn("select r.node_id,r.mustwipe,UNIX_TIMESTAMP(r.rsrv_time),nt.isremotenode" .
		    " from reserved as r" . 
246 247
		    " left join nodes as n on r.node_id=n.node_id" . 
		    " left join node_types as nt on n.type=nt.type " . 
248 249 250 251 252 253
		    " left outer join (select type,attrvalue from node_type_attributes" . 
		    "   where attrkey='reload_daemon_pool') as nta_reload_pool" . 
		    "   on n.type=nta_reload_pool.type" . 
		    " left outer join (select node_id,attrvalue from node_attributes" . 
		    "   where attrkey='reload_daemon_pool') as na_reload_pool" . 
		    "   on r.node_id=na_reload_pool.node_id" . 
254 255
		    " where r.pid='$RELOADPID' and r.eid='$RELOADEID'" .
		    " $tag_query order by r.rsrv_time");
256
    if (! $query_result) {
257
	logit("DB Error. Waiting a bit.");
258 259 260
	next;
    }

261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
    #
    # Build up a list of current nodes in reloading.
    #
    my %found = ();
    my @curnodes = ();
    while (($node, $mustwipe, $rtime, $isremote) = $query_result->fetchrow) {
	$found{$node} = $rtime;
	push(@curnodes, [ 0, $node, $mustwipe, $rtime, $isremote ]);
    }

    #
    # Remove nodes from retried/warned/failed that are no longer in reloading
    #
    foreach $node (keys %retried) {
	if (!exists($found{$node})) {
	    delete $retried{$node};
	}
    }
    foreach $node (keys %failed) {
	if (!exists($found{$node})) {
281 282
	    delete $failed{$node};
	}
283
    }
284

285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
    if ($debug && @curnodes > 0) {
	logit("Found nodes in reloading:");
	foreach my $aref (@curnodes) {
	    my ($handled, $node, $mustwipe, $rtime, $isremote) = @$aref;
	    my $e = $time - $rtime;
	    print "  $node: restime=$rtime (elapsed=$e), wipe=$mustwipe, rem=$isremote\n";
	}
	if (keys(%retried) > 0) {
	    print "  Retried:\n";
	    foreach $node (keys %retried) {
		my $t = $retried{$node};
		my $e = $time - $t;
		print "    $node: time=$t (elapsed=$e)\n";
	    }
	}
	if (keys(%failed) > 0) {
	    print "  Failed:\n";
	    foreach $node (keys %failed) {
		my $t = $failed{$node};
		my $e = $time - $t;
		print "    $node: time=$t (elapsed=$e)\n";
306
	    }
307 308 309 310
	}
    }

    #
311 312 313 314
    # Send to hwdown any nodes that have been in reloading too long.
    # Note that this is a hard limit, i.e., no compensating for the size of
    # the image being reloaded. So make sure the fail time is set to a
    # sufficiently large value!
315
    #
316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
    foreach my $aref (@curnodes) {
	my ($handled, $node, $mustwipe, $rtime, $isremote) = @$aref;

	if ($handled || $fail_time <= 0) {
	    next;
	}

	my $interval = $fail_time * 60;
	if (($time - $rtime) >= $interval) {
	    my $elapsed = int(($time - $rtime) / 60);

	    tohwdown($node, "in reloading for $elapsed minutes", 1);

	    # mark as handled
	    $aref->[0] = 1;

	    # note that we did something
	    $idle = 0;
334 335 336 337
	}
    }

    #
338 339
    # Now look for nodes that have been in the reloading experiment for
    # longer than $retry_time, and try rebooting them or re-osloading them.
340
    #
341 342 343
    # XXX we count on mustwipe having the value 0, 1, 2 to represent
    # ever slower forms of wipeage.  For retry_time of 20 minutes that
    # yields waits of 20, 40 and 60 minutes.
344
    #
345 346
    foreach my $aref (@curnodes) {
	my ($handled, $node, $mustwipe, $rtime, $isremote) = @$aref;
347
	my $multiplier = 0;
348

349 350
	if ($handled || $retry_time <= 0) {
	    next;
351
	}
352 353 354 355 356 357 358 359
	my $nodeobj = Node->Lookup($node);
	if (defined($nodeobj) && $nodeobj->isswitch()) {
	    $multiplier = 2;
	    $nodeobj->Flush();
	}
	elsif ($isremote) {
	    $multiplier = $widearea_multiplier;
	}
360 361
	my $interval =
	    ($retry_time * ($mustwipe + 1) + 
362
	     ($retry_time * $multiplier)) * 60;
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431

	# XXX this is a relative interval
	my $stime = $retried{$node} ? $retried{$node} : $rtime;
	if (($time - $stime) < $interval) {
	    next;
	}

	my $elapsed = int(($time - $rtime) / 60);

	# note that we did something
	$idle = 0;

	#
	# If we have already attempted a reboot or re-osload, and we are
	# still here, move the node to hwdown.
	#
	if ($retried{$node}) {
	    tohwdown($node,
		     "failed two attempts to reload after $elapsed minutes",
		     1);

	    # mark as handled
	    $aref->[0] = 1;

	    next;
	}

	#
	# Let admins know we are attempting a corrective action.
	#
	if ($warnonretry) {
	    my $act = $failed{$node} ? "reload of OS" : "reboot";

	    if ($debug) {
		logit("$node: has been in reloading for $elapsed minutes.");
	    } else {
		notify("$node has been in $RELOADPID/$RELOADEID for " .
		       "$elapsed minutes, attempting $act.");
	    }
	}

	#
	# If this node failed its os_load, try os_load again.
	# Note that we will not get here a second time, we will move
	# the node to hwdown if this second reload attempt fails below.
	# 
	if ($failed{$node}) {
	    logit("$node: failed an earlier os_load, retrying once.");
	    push(@retry_list, [$node, $mustwipe, 1]);

	    # mark as handled
	    $aref->[0] = 1;
	}
	#
	# os_load succeeded but it hasn't finished within a reasonable time,
	# try a power cycle.
	#
	else {
	    logit("$node: reload appears wedged, ".
		  "power cycling and trying again.");
		
	    if (system("$reboot -f $node")) {
		tohwdown($node, "attempt to unwedge with reboot failed", 1);

		# mark as handled
		$aref->[0] = 1;

		next;
	    }
432
	}
433 434

	$retried{$node} = $time;
435 436
    }

437 438
    @curnodes = ();

439
    #
440
    # Find all of the free nodes that have not been reloaded (no pid entry
441 442 443
    # in last_reservation, which is reset anytime a node is reloaded by
    # the system).
    #
444 445 446 447
    # XXX - This should not be hardwired in.
    # 
    my $CLASSCLAUSE = "(n.class='pc' or n.class='pct')";
    
448
    $query_result =
449
	DBQueryWarn("select a.node_id,b.pid,b.eid,b.mustwipe,a.type ".
450
		    "from reserved as b ".
451
		    "left join nodes as a on a.node_id=b.node_id ".
452
		    "left join last_reservation as l on l.node_id=a.node_id ".
453 454 455 456 457 458 459 460 461 462
		    "left join node_types as n on n.type=a.type ".
		    " left outer join (select type,attrvalue from node_type_attributes" . 
		    "   where attrkey='reload_daemon_pool') as nta_reload_pool" . 
		    "   on n.type=nta_reload_pool.type" . 
		    " left outer join (select node_id,attrvalue from node_attributes" . 
		    "   where attrkey='reload_daemon_pool') as na_reload_pool" . 
		    "   on b.node_id=na_reload_pool.node_id" .
		    " where ((b.node_id is null and $CLASSCLAUSE and l.pid!='') ".
		    "or (b.pid='$RELOADPID' and b.eid='$PENDINGEID')) ". 
		    " $tag_query " . 
463 464
		    "order by a.node_id");

465
    if (! $query_result) {
466
	logit("DB Error. Waiting a bit.");
467 468 469 470
	next;
    }
    $count = $query_result->numrows;

471
    if (!$count && !scalar(@retry_list)) {
472 473 474
	next;
    }

475 476 477 478 479 480 481 482 483 484 485 486
    if ($debug) {
	if ($count) {
	    logit("Found $count nodes in reloadpending or other free state.");
	}
	if (@retry_list > 0) {
	    logit("Found " . scalar(@retry_list) . " nodes in retry list.");
	}
    }

    # note that we did something
    $idle = 0;

487
    # Grab all the nodes that match
488
    my @pending_list = @retry_list;
489
    while (%hrow = $query_result->fetchhash()) {
490 491 492
	$node = $hrow{'node_id'};
	$pid  = $hrow{'pid'};
	$eid  = $hrow{'eid'};
493
	$mustwipe = $hrow{'mustwipe'};
494 495
	$type = $hrow{'type'};
	$imageable = NodeType->LookupSync($type)->imageable();
496 497 498 499 500

	#
	# If any non-imageable nodes made it this far, just free them now
	#
	if (!$imageable) {
501
	    logit("$node: non-imageable, skipping reload.");
502 503 504
	    freefromreloading($node);
	    next;
	}
505
	if ($pid eq $RELOADPID && $eid eq $PENDINGEID) {
506 507 508 509
	    if ($debug) {
		logit("$node: in reloadpending.");
	    }
	    push(@pending_list, [$node,$mustwipe,0]);
510
	} else {
511 512 513 514
	    if ($debug) {
		logit("$node: otherwise needs reloading.");
	    }
	    push(@other_list, [$node,$mustwipe,0]);
515 516
	}
    }
517
    my $nodes = join(" ", map { $_->[0] } @pending_list, @other_list);
518 519 520 521
    if (!$nodes) {
	next;
    }

522 523
    logit("Trying to reload $nodes.");
    $nodes = "";
524 525

    #
526 527
    # What we do depends on whether its a free node or a node reserved
    # into the reload pending experiment.
528
    #
529
    if (@pending_list > 0) {
530
	#
531
	# Query for the imageid from the reloads table.
532
	#
533 534
	my %images = ();
	my %imagenodes = ();
535
	my %nodeobjs = ();
536
	foreach $ref (@pending_list) {
537
	    ($node, $mustwipe, undef) = @{$ref};
538 539
	    my $nodeobj = Node->Lookup($node);
	    if (!defined($nodeobj)) {
540
		notify("Could not local node object for $node.");
541 542 543 544
		next;
	    }
	    $nodeobjs{$node} = $nodeobj;
	    
545 546
	    ($imageid, undef) = $nodeobj->GetSchedReload();
	    if (!defined($imageid)) {
547 548 549
		#
		# If this node didn't make it into the scheduled_reloads table
		# for some reason, then we load it with the default image and
550
		# type.
551 552 553
		#
		$imageid = "";
	    }
554

555 556 557 558 559 560 561 562 563 564 565
	    #
	    # We need to divide up nodes not only by the image they are
	    # to load (imageid) but also by if and how the disk should be
	    # zeroed (mustzero).  So we really have a hash of hashes each
	    # of which is an array of nodes.  However, my perl skilz are
	    # not up to that so just combine the imageid and mustwipe into
	    # a single hash key ('/' is illegal in both, so we use it as
	    # the separator).
	    #
	    my $idid = "$imageid/$mustwipe";

566
	    $images{$node} = $imageid;
Mike Hibler's avatar
Mike Hibler committed
567
	    if (exists($imagenodes{$idid})) {
568
		push(@{$imagenodes{$idid}},$node);
569
	    } else {
570
		$imagenodes{$idid} = [$node];
571 572
	    }
	    if ($debug) {
573 574
		logit("$node ($mustwipe) => $images{$node} == $imageid (".
		      join(",",@{$imagenodes{$idid}}).")\n");
575
	    }
576 577 578 579 580
	}
	
	#
	# The node is reserved into the special pid/eid, as the result
	# of a sched_reload while it was still allocated to an experiment.
581
	# We change the reservation EID over and fire up an osload
582 583
	# directly.
	#
584
	foreach $ref (@pending_list) {
585
	    ($node, $mustwipe, $isretry) = @{$ref};
586 587 588
	    my $nodeobj = $nodeobjs{$node};
	    next
		if (!defined($nodeobj));
589

590 591 592 593 594 595 596 597 598 599 600 601 602
	    # XXX sanity check
	    if ($nodeobj->ReservationID() == $reloading_experiment->idx()) {
		if (!$isretry) {
		    logit("$node: WARNING: in reloading but not a retry!");
		}
	    } else {
		if ($isretry) {
		    logit("$node: WARNING: is a retry but not in reloading!");
		}
	    }

	    if (!$isretry &&
		$nodeobj->MoveReservation($reloading_experiment) == 0) {
603 604
		$nodeobj->SetNodeHistory(TB_NODEHISTORY_OP_MOVE, undef,
					 $reloading_experiment);
605
	    }
606
	}
607 608
	# It is now safe to clear this.
	@retry_list = ();
609

610 611 612 613 614 615
	#
	# Now run an OS load for each image.
	# We invoke libosload directly rather than calling os_load,
	# not so much for efficiency but because it gives us more
	# precise knowledge about failures.
	#
616
	foreach my $idid (keys %imagenodes) {
617

618 619
	    my @nodelist = @{$imagenodes{$idid}};
	    my $nodestr = join(' ', @nodelist);
620

621 622
	    ($imageid, $mustzero) = split("/", $idid);

623
	    logit("Invoking osload on $nodestr.");
624

625 626
	    my @failedload = ();
	    if (myosload($imageid, $mustzero, \@nodelist, \@failedload)) {
627 628 629 630 631
		#
		# For nodes that have failed already, put them in hwdown.
		#
		my $failstr = "";
		my $retrystr = "";
632
		foreach my $node (@failedload) {
633 634 635 636 637 638 639 640 641 642 643 644 645
		    if ($failed{$node}) {
			tohwdown($node, "failed second OS load", 0);
			$failstr .= "$node ";
		    } else {
			$retrystr .= "$node ";
		    }
		    $failed{$node} = $time;
		}
		
		if ($retrystr ne "") {
		    notify("OS load failed on $retrystr.\n".
			   "That is not supposed to happen. ".
			   "Will attempt another reload in $retry_time minutes.");
646
		}
647 648 649 650 651 652
		if ($failstr ne "") {
		    notify("OS load failed twice on $failstr.\n".
			   "That is not supposed to happen. ".
			   "Nodes sent to hwdown.");
		}

653 654 655 656 657
		foreach my $node (@nodelist) {
		    if (!$failed{$node}) {
			$nodes .= "$node ";
		    }
		}
658 659
	    }
	    else {
660 661
		$nodes .= "$nodestr ";
		logit("osload done.");
662
	    }
663
	}
664
    }
665 666
	
    if (@other_list > 0 ) {
667
	my $nodestr = join(" ", map { $_->[0] } @other_list);
668

669
	#
670 671 672 673
	# Call sched_reload with the "force" option, which says that if
	# sched_reload cannot reserve the node (cause someone just got it)
	# then don't schedule a reload for later. Just fail outright.
	# We will try again in a bit.
674
	#
675 676 677 678
	# We do not need to specify an imageid, since we want the node
	# default, and sched_reload will pick that up from the database
	# in the absence of a -i option. 
	#
679 680
	logit("Invoking sched_reload on $nodestr.");
	if (system("$sched_reload -f $nodestr")) {
681 682 683
	    #
	    # Could not get it. Wait and go around again.
	    #
684
	    logit("$sched_reload failed on $nodestr. Waiting a bit.");
685 686
	    next;
	}
687
	$nodes .= "$nodestr ";
688
    }
689

Mike Hibler's avatar
Mike Hibler committed
690 691 692 693 694 695 696 697 698 699
    if ($nodes) {
	logit("Reload of $nodes has started.");
	#
	# For Frisbee reloads, we don't wait for the node to finish reloading,
	# since the whole point is to let many nodes load at once.
	#
	logit("Not waiting for frisbee reload of $nodes.");
    } else {
	logit("No nodes eligible for reload.");
    }
700 701
}

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752
sub myosload($$$$)
{
    my ($imageid, $mustzero, $nlist, $failedp) = @_;

    my %osloadargs  = ();
    my %nodestatus = ();
    my $failed = 0;

    $osloadargs{'waitmode'} = 0;
    $osloadargs{'zerofree'} = $mustzero;
    # XXX we don't set prepare?
    #$osloadargs{'prepare'}  = 1;
    $osloadargs{'nodelist'} = [ @{$nlist} ];
    # No imageid means to load the default image.
    $osloadargs{'imageids'} = [ $imageid ]
	if ($imageid);

    # XXX replicate what os_load does
    my $oquerymax = $libdb::DBQUERY_MAXTRIES;
    $libdb::DBQUERY_MAXTRIES = 30;

    my $user = User->ThisUser();
    my $experiment = $reloading_experiment;
    my $group = $experiment->GetGroup();
    if (EmulabFeatures->FeatureEnabled("NewOsload",$user,$group,$experiment)) {
	require libosload_new;

	my $loadobj = libosload_new->New();
	$loadobj->debug($debug);
	#
	# XXX basically, tell devices that might be reconfig'd via push
	# from us (like switches) that a reconfig should follow the reload!
	#
	$osloadargs{'reconfig'} = 1;

	# add a few more things for feature checks down the line:
	$osloadargs{'user'} = $user;
	$osloadargs{'experiment'} = $experiment;
	$osloadargs{'group'} = $group;
	$failed = $loadobj->osload(\%osloadargs, \%nodestatus);
    } else {
	$failed = osload(\%osloadargs, \%nodestatus);
    }

    if ($failed) {
	my @list = ();
	foreach my $node (keys %nodestatus) {
	    if ($nodestatus{$node}) {
		push @list, $node;
	    }
	}
753 754 755 756 757 758 759 760 761 762

	#
	# XXX if no status returned, assume a general failure affecting
	# all nodes.
	#
	if (@list == 0) {
	    @$failedp = @$nlist;
	} else {
	    @{$failedp} = @list;
	}
763 764 765 766 767 768
    }

    $libdb::DBQUERY_MAXTRIES = $oquerymax;

    return $failed;
}
769 770 771 772 773 774

#
# free up the node and clear any assocaited reload DB state.
# (code stolen from stated).
#
sub freefromreloading($) {
775 776 777
    my $nodeid = shift;
    my $node = Node->Lookup($nodeid);
    if (!defined($node)) {
778
	notify("Could not get node object for $nodeid.");
779 780 781 782 783 784 785 786 787 788
	return;
    }
    $node->FlushReserved();
    $node->ClearCurrentReload();
    my $experiment = $node->Reservation();
    if (defined($experiment) &&
	$experiment->pid() eq $RELOADPID &&
	($experiment->eid() eq $RELOADEID ||
	 $experiment->eid() eq $PENDINGEID)) {
	$node->ClearSchedReload();
789 790 791 792 793

	# Check if the robot is back in its pen, otherwise we have to throw it
	# back to repositionpending.
	my $loc_result =
	    DBQueryWarn("SELECT * FROM reposition_status ".
794
			"WHERE node_id='$nodeid'");
795 796

	if ($loc_result->numrows) {
797 798 799
	    my $target_experiment =
		Experiment->Lookup($RELOADPID, $RPPENDINGEID);
	    if (!defined($target_experiment)) {
800
		notify("Could not locate experiment object for $RPPENDINGEID.");
801 802 803
		return;
	    }
	    if ($node->MoveReservation($target_experiment) == 0) {
804 805
		logit("Reposition pending nodes moved to $RPPENDINGEID.");

806 807
		$node->SetNodeHistory(TB_NODEHISTORY_OP_MOVE, undef,
				      $target_experiment);
808 809 810
	    }
	}
	else {
811 812
	    $node->ClearReservation();
	    $node->SetNodeHistory(TB_NODEHISTORY_OP_FREE, undef, $experiment);
813
	}
814 815 816
    }
}

817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913
sub tohwdown($$$)
{
    my ($node, $msg, $mailit) = @_;
    my $actstr = "";
    
    if ($hwdownaction eq "poweroff") {
	$actstr = " and powering off";
    } elsif ($hwdownaction eq "nodeadmin") {
	$actstr = " and booting in admin MFS";
    }
    if ($debug) {
	logit("$node: $msg, sending to hwdown$actstr.");
    } elsif ($mailit) {
	notify("$node $msg.\n".
	       "Moved to $NODEDEAD_PID/$NODEDEAD_EID$actstr.");
    }

    MarkPhysNodeDown($node);
    TBSetNodeLogEntry($node, "daemon",
		      TB_DEFAULT_NODELOGTYPE(),
		      "'Moved to hwdown; $msg'");

    if ($hwdownaction eq "poweroff") {
	if (system("$power off $node")) {
	    logit("'$power off $node' failed!");
	}
    }
    if ($hwdownaction eq "adminmode") {
	if (system("$nodeadmin on $node")) {
	    logit("'$nodeadmin on $node' failed!");
	}
    }
}

#
# Read site variables for global defaults:
#
# reload/retrytime:
#	If a node has been in reloading for longer than this period (minutes),
#	try rebooting/reloading it. If zero, never try reboot/reload.
# reload/failtime:
#	If a node has been in reloading for longer than this period (minutes),
#	send it to hwdown. If zero, leave nodes in reloading.
# reload/warnonretry:
#	If non-zero send e-mail to testbed-ops when a retry is attempted.
# reload/hwdownaction:
#	What to do when nodes are moved to hwdown.
#	'poweroff' to power nodes off,
#	'adminmode' to put nodes in admin MFS,
#	'nothing' to just move them (default).
#
sub getsitevars()
{
    my ($val,$nfail,$nretry,$nwarn,$nhwdown);

    $nfail = $fail_time;
    $nretry = $retry_time;
    $nwarn = $warnonretry;
    $naction = $hwdownaction;
    if (TBGetSiteVar("reload/retrytime", \$val)) {
	$nretry = int($val);
    }
    if (TBGetSiteVar("reload/failtime", \$val)) {
	$nfail = int($val);
    }
    if ($nfail > 0) {
	$nretry = $nfail - 1
	    if ($nretry > $nfail);
    }
    if (TBGetSiteVar("reload/warnonretry", \$val)) {
	$nwarn = int($val);
	if ($nwarn != 0) {
	    $nwarn = 1;
	}
    }
    if (TBGetSiteVar("reload/hwdownaction", \$val)) {
	if ($val =~ /^(nothing|poweroff|adminmode)$/) {
	    $naction = $1;
	} else {
	    notify("bogus 'reload/hwdownaction' sitevar value ignored.");
	}
    }
    
    if ($nfail != $fail_time ||
	$nretry != $retry_time ||
	$nwarn != $warnonretry ||
	$naction ne $hwdownaction) {
	logit("Changing fail/retry/warn/hwdown values: ".
	      "$fail_time/$retry_time/$warnonretry/$hwdownaction => ".
	      "$nfail/$nretry/$nwarn/$naction");
	$fail_time = $nfail;
	$retry_time = $nretry;
	$warnonretry = $nwarn;
	$hwdownaction = $naction;
    }
}

914 915 916 917 918 919 920 921
sub logit($)
{
    my ($msg) = @_;
    my $stamp = localtime();

    print "$stamp: $msg\n";
}

922 923
sub fatal($)
{
924 925
    local($msg) = $_[0];

926
    SENDMAIL($TBOPS, "Reload Daemon Died", $msg, $TBOPS);
927
    MarkDaemonStopped("reload_daemon");
928 929 930
    die($msg);
}

931 932
sub notify($)
{
933
    my($msg) = $_[0];
934

935 936
    logit($msg);
    SENDMAIL($TBOPS, "Reload Daemon Message", "$msg\n", $TBOPS);
937
}