reload_daemon.in 18.2 KB
Newer Older
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3 4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6 7 8
# All rights reserved.
#

9 10 11
use English;
use Getopt::Std;

12 13 14 15 16 17
#
# Refuse to run unless we are root: rebooting / power cycling nodes at any
# time is only permitted for root (it is time-limited for anyone else).
#
die("*** $0:\n" .
    "    Only root can run this script!\n")
    unless ($UID == 0);

22 23 24 25 26 27 28 29 30
#
# Look for nodes to reload.
#
#	usage: reload_daemon [-d] [-t tag]
#
# TODO: Use "logger" instead of writing a log file.
#
# usage() prints the option summary on STDOUT and then exits with status -1;
# it never returns to the caller.
#
sub usage()
{
    # The attribute named here must match the attrkey the daemon's queries
    # actually select on ('reload_daemon_pool').
    print STDOUT "Usage: reload_daemon [-d] [-t tag]\n" .
	"    -d     Prevent daemonization\n" . 
	"    -t tag Only manage reloads for nodes or node types\n" . 
	"           that have the value of <tag> for a node_type_attribute\n" . 
	"           or a node_attribute named 'reload_daemon_pool'.\n" . 
	"           IF this tag is not set, the reload_daemon picks only\n" . 
	"           those nodes that DO NOT have this type or node\n" . 
	"           attribute set!\n";
    exit(-1);
}
41
# Option letters handed to getopts(): -d (flag) and -t <tag>.
my $optlist = "dt:";

#
# Configure variables
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS  = "@TBOPSEMAIL@";

# XXX
my ($BUILDING, $FLOOR) = ("MEB-ROBOTS", 4);

# Testbed Support library
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use NodeType;

#
# Project/experiment names for the holding experiments.
# These come from the library.
#
my $RELOADPID    = NODERELOADING_PID;
my $RELOADEID    = NODERELOADING_EID;
my $PENDINGEID   = NODERELOADPENDING_EID;
my $REPOSPID     = NODEREPOSITIONING_PID;
my $RPPENDINGEID = NODEREPOSPENDING_EID;
my $NODEDEAD_PID = NODEDEAD_PID;
my $NODEDEAD_EID = NODEDEAD_EID;

# Forward declarations for the subroutines defined at the end of the file.
sub fatal($);
sub notify($);
sub freefromreloading($);

# External commands run by the daemon.
my $os_load      = "$TB/bin/os_load -s";
my $sched_reload = "$TB/sbin/sched_reload";
my $reboot       = "$TB/bin/node_reboot";
my $tbrsync      = "$TB/bin/tbrsync";

# Log file, debug flag (-d) and optional pool tag (-t).
my $logfile = "$TB/log/reloadlog";
my $debug   = 0;
my $tag;

# Timeouts, in minutes.
my $retry_time = 20;
my $warn_time  = $retry_time * 2;

# Widearea nodes get (mult+1)x longer (but possibly not quite true
# because of mustwipe).
my $widearea_multiplier = 2;

# Per-node bookkeeping kept across passes of the main loop; the values
# stored are the time() of the pass that last touched the node.
my (%retried, %warned, %failed);

# [node, mustwipe] pairs waiting to be sent back through os_load.
my @retry_list;

# XXX: Garcia hack vars
my $gimageid   = "GARCIA-STARGATE";
my $gimagepath = "/usr/testbed/images/garcia";

95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
#
# Make output unbuffered so progress (dots ...) shows up immediately.
#
$| = 1;

#
# Untaint the environment: drop the shell-influencing variables and pin
# the path.
#
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
$ENV{'PATH'} = "/bin:/usr/bin:";

#
# Parse command arguments. Nothing may be left over once getopts is done;
# any parse error or stray argument prints the usage message and exits.
#
%options = ();
usage()
    unless (getopts($optlist, \%options));
usage()
    if (@ARGV != 0);

$debug = $options{"d"}
    if (defined($options{"d"}));

if (defined($options{"t"})) {
    $tag = $options{"t"};
    # A tagged daemon logs to its own file.
    $logfile .= "-$tag";
}
125

126
#
# Only one default (untagged) reload_daemon at a time. No such check is
# made for tagged daemons; that is left to whoever started them.
#
# NOTE(review): fatal() also calls MarkDaemonStopped("reload_daemon");
# confirm that cannot disturb the daemon that is already running.
#
fatal("Not starting another reload daemon!")
    if (!defined($tag) && CheckDaemonRunning("reload_daemon"));

# Go to ground, unless debugging; exit when TBBackGround() returns true.
exit(0)
    if (!$debug && TBBackGround($logfile));

# Only the default daemon records itself as running.
fatal("Could not mark daemon as running!")
    if (!defined($tag) && MarkDaemonRunning("reload_daemon"));
143 144 145 146 147 148 149 150 151
#
# SIGHUP handler for newsyslog: reopen the log file.
#
sub handler()
{
    return ReOpenLog($logfile);
}

# The handler is only installed when not running in debug mode.
unless ($debug) {
    $SIG{HUP} = \&handler;
}

print("Reload Daemon starting... pid $$, at " . `date`);
154

155 156 157
#
# Loop, looking for nodes to reload.
#
# Each pass:
#   1. Reboots (or queues for another os_load) nodes that have sat in the
#      reloading experiment longer than $retry_time.
#   2. Mails the admins about nodes that have sat there longer than
#      $warn_time.
#   3. Collects nodes that need a reload, moves the pending ones into the
#      reloading experiment and runs os_load on them, and hands the rest
#      to sched_reload.
#
my $idle = 0;
while (1) {
    my ($count, @row, %hrow, $imageid, $node, $stamp);
    my ($pid, $eid, $mustwipe, $mustzero, $type, $imageable);
    my $query_result;

    #
    # Nodes picked up by the third query that are not in the reload-pending
    # experiment. This must start out empty on every pass; otherwise nodes
    # from earlier passes would be handed to sched_reload again and again.
    #
    my @other_list = ();

    # Partial delay between loops in case of an error.
    # Wait longer if we're not doing anything.
    if ($idle) {
        sleep(10);
    }
    else {
        sleep(1);
    }

    # Assume we're going to be idle this iteration
    $idle = 1;

    #
    # We use this to figure out when to delete nodes from the retried and
    # warned hashes
    #
    my $time = time();

    #
    # If we are the default reload daemon (i.e., have no tag for our
    # reload_pool), only look for nodes that have neither a reload_pool
    # node_type_attribute nor a node_attribute.
    #
    # If we have a reload_pool tag, only pick up nodes that
    #  * have our tag for the node_type_attribute, and our tag or NULL
    #    for the node_attribute, OR
    #  * have our tag for the node attribute.
    #
    my $tag_query = '';
    if (!defined($tag)) {
        $tag_query = 'and nta_reload_pool.attrvalue is NULL' .
            ' and na_reload_pool.attrvalue is NULL';
    }
    else {
        $tag_query = "" .
            " and ((nta_reload_pool.attrvalue='$tag' and" .
            "       (na_reload_pool.attrvalue='$tag'" .
            "        or na_reload_pool.attrvalue is NULL))" .
            "      or na_reload_pool.attrvalue='$tag')";
    }

    #
    # First, look for nodes that have been in the reloading experiment for
    # longer than $retry_time, and try rebooting them
    #
    # XXX we count on mustwipe having the value 0, 1, 2 to represent
    # ever slower forms of wipeage.  For retry_time of 20 minutes that
    # yields waits of 20, 40 and 60 minutes.
    #
    $query_result =
        DBQueryWarn("select r.node_id,r.mustwipe from reserved as r" .
                    " left join nodes as n on r.node_id=n.node_id" .
                    " left join node_types as nt on n.type=nt.type " .
                    " left outer join (select type,attrvalue from node_type_attributes" .
                    "   where attrkey='reload_daemon_pool') as nta_reload_pool" .
                    "   on n.type=nta_reload_pool.type" .
                    " left outer join (select node_id,attrvalue from node_attributes" .
                    "   where attrkey='reload_daemon_pool') as na_reload_pool" .
                    "   on r.node_id=na_reload_pool.node_id" .
                    " where r.pid='$RELOADPID' and r.eid='$RELOADEID' and" .
                    " (CURRENT_TIMESTAMP - INTERVAL ($retry_time * (r.mustwipe + 1) + (nt.isremotenode * $retry_time * $widearea_multiplier)) MINUTE)".
                    "  > rsrv_time" .
                    " $tag_query");

    if (! $query_result) {
        print "DB Error. Waiting a bit.\n";
        next;
    }

    while (($node, $mustwipe) = $query_result->fetchrow) {
        $idle = 0;

        #
        # If this was a node that failed os_load, then instead of rebooting,
        # send it back through os_load.
        #
        if ($failed{$node}) {
            print "$node failed an earlier os_load. Trying again\n";
            push(@retry_list, [$node, $mustwipe]);
            delete $failed{$node};
            # Skip any reboots.
            $retried{$node} = $time;
            next;
        }

        # Only power cycle a node the first time it shows up as wedged.
        if (!$retried{$node}) {
            print "\nReload appears wedged at ".`date`.
                "Power cycling and trying once more!\n";

            if (system("$reboot -f $node")) {
                notify("$node was wedged, but could not be rebooted.\n".
                       "Moved to $NODEDEAD_PID/$NODEDEAD_EID\n");

                MarkPhysNodeDown($node);
                TBSetNodeLogEntry($node, "daemon",
                                  TB_DEFAULT_NODELOGTYPE(),
                                  "'Moved to hwdown; reload reboot failed'");
            }
        }
        $retried{$node} = $time;
    }

    #
    # We can pull out all nodes that were not 'touched' (matched by the
    # select above) during this pass
    #
    foreach my $stale (keys %retried) {
        if ($retried{$stale} != $time) {
            delete $retried{$stale};
        }
    }

    #
    # Next, we do the same thing for nodes in the reloading experiment for
    # longer than $warn_time, and warn the admins.
    #
    # XXX again, we scale by the value of mustwipe.
    #
    $query_result =
        DBQueryWarn("select r.node_id,r.mustwipe from reserved as r" .
                    " left join nodes as n on r.node_id=n.node_id" .
                    " left join node_types as nt on n.type=nt.type " .
                    " left outer join (select type,attrvalue from node_type_attributes" .
                    "   where attrkey='reload_daemon_pool') as nta_reload_pool" .
                    "   on n.type=nta_reload_pool.type" .
                    " left outer join (select node_id,attrvalue from node_attributes" .
                    "   where attrkey='reload_daemon_pool') as na_reload_pool" .
                    "   on r.node_id=na_reload_pool.node_id" .
                    " where r.pid='$RELOADPID' and r.eid='$RELOADEID' and " .
                    " (CURRENT_TIMESTAMP - INTERVAL ($warn_time * (mustwipe + 1) + (nt.isremotenode * $warn_time * $widearea_multiplier)) MINUTE)".
                    "  > rsrv_time" .
                    " $tag_query");

    if (! $query_result) {
        print "DB Error. Waiting a bit.\n";
        next;
    }

    while (($node, $mustwipe) = $query_result->fetchrow) {
        $idle = 0;

        # Only send mail the first time a node shows up here.
        if (!$warned{$node}) {
            my $toolong = $warn_time * ($mustwipe + 1);
            notify("Node $node has been in $RELOADPID/$RELOADEID for " .
                   "more than $toolong minutes");
        }
        $warned{$node} = $time;
    }

    #
    # We can pull out all nodes that were not 'touched' (matched by the
    # select above) during this pass
    #
    foreach my $stale (keys %warned) {
        if ($warned{$stale} != $time) {
            delete $warned{$stale};
        }
    }

    #
    # Find all of the free nodes that have not been reloaded (no pid entry
    # in last_reservation, which is reset anytime a node is reloaded by
    # the system).
    #
    # XXX - This should not be hardwired in.
    #
    my $CLASSCLAUSE = "(n.class='pc' or n.class='pct')";

    $query_result =
        DBQueryWarn("select a.node_id,b.pid,b.eid,b.mustwipe,a.type ".
                    "from reserved as b ".
                    "left join nodes as a on a.node_id=b.node_id ".
                    "left join last_reservation as l on l.node_id=a.node_id ".
                    "left join node_types as n on n.type=a.type ".
                    " left outer join (select type,attrvalue from node_type_attributes" .
                    "   where attrkey='reload_daemon_pool') as nta_reload_pool" .
                    "   on n.type=nta_reload_pool.type" .
                    " left outer join (select node_id,attrvalue from node_attributes" .
                    "   where attrkey='reload_daemon_pool') as na_reload_pool" .
                    "   on b.node_id=na_reload_pool.node_id" .
                    " where ((b.node_id is null and $CLASSCLAUSE and l.pid!='') ".
                    "or (b.pid='$RELOADPID' and b.eid='$PENDINGEID')) ".
                    " $tag_query " .
                    "order by a.node_id");

    if (! $query_result) {
        print "DB Error. Waiting a bit.\n";
        next;
    }
    $count = $query_result->numrows;

    # Nothing new and nothing queued for retry: go around again.
    if (!$count && !scalar(@retry_list)) {
        next;
    }
    $idle = 0;

    # Grab all the nodes that match; queued retries go first.
    my @pending_list = @retry_list;
    while (%hrow = $query_result->fetchhash()) {
        $node      = $hrow{'node_id'};
        $pid       = $hrow{'pid'};
        $eid       = $hrow{'eid'};
        $mustwipe  = $hrow{'mustwipe'};
        $type      = $hrow{'type'};
        $imageable = NodeType->LookupSync($type)->imageable();

        # XXX Garcia Hack
        if (!$imageable && $type eq "garcia") {
            $imageable = 1;
        }
        # XXX End Garcia Hack

        #
        # If any non-imageable nodes made it this far, just free them now
        #
        if (!$imageable) {
            print "Skipping non-imageable node $node\n";
            freefromreloading($node);
            next;
        }

        if ($pid eq $RELOADPID && $eid eq $PENDINGEID) {
            push(@pending_list, [$node, $mustwipe]);
        }
        else {
            push(@other_list, [$node, $mustwipe]);
        }
    }

    my $nodes = join(" ", map { $_->[0] } @pending_list, @other_list);
    if (!$nodes) {
        next;
    }

    print "Trying to reload $nodes at ".`date`;

    #
    # What we do depends on whether its a free node or a node reserved
    # into the reload pending experiment.
    #
    if (@pending_list > 0) {
        #
        # Query for the imageid from the reloads table.
        #
        my %images     = ();
        my %imagenodes = ();

        foreach my $ref (@pending_list) {
            ($node, $mustwipe) = @{$ref};

            $query_result =
                DBQueryWarn("select image_id from scheduled_reloads " .
                            "where node_id='$node'");

            if ((! $query_result) || (!$query_result->numrows())) {
                #
                # If this node didn't make it into the scheduled_reloads table
                # for some reason, then we load it with the default image and
                # type
                #
                $imageid = "";
            }
            else {
                @row     = $query_result->fetchrow_array();
                $imageid = $row[0];
            }

            # XXX Garcia Hack
            if (TBNodeType($node) eq "garcia") {
                $imageid = $gimageid;
            }
            # XXX End Garcia Hack

            #
            # We need to divide up nodes not only by the image they are
            # to load (imageid) but also by if and how the disk should be
            # zeroed (mustwipe).  Rather than a hash of hashes we combine
            # the imageid and mustwipe into a single hash key ('/' is
            # illegal in both, so we use it as the separator).
            #
            my $idid = "$imageid/$mustwipe";

            $images{$node} = $imageid;

            # push() creates the list for a new key on first use.
            push(@{$imagenodes{$idid}}, $node);

            if ($debug) {
                print "$node ($mustwipe) => $images{$node} == $imageid (".
                    join(",", @{$imagenodes{$idid}}).")\n";
            }
        }

        #
        # The node is reserved into the special pid/eid, as the result
        # of a sched_reload while it was still allocated to an experiment.
        # We change the reservation EID over and fire up an os_load
        # directly.
        #
        my $cond = "node_id in (" .
            join(",", map("'$_->[0]'", @pending_list)) . ")";

        if (! DBQueryWarn("update reserved set ".
                          "rsrv_time=now(),eid='$RELOADEID' ".
                          "where $cond")) {
            print "Could not update EID for " .
                join(" ", map("$_->[0]", @pending_list)) .
                ". Waiting a bit.\n";
            next;
        }

        print "Pending nodes moved to $RELOADEID at ".`date`;

        foreach my $n (map("$_->[0]", @pending_list)) {
            TBSetNodeHistory($n, TB_NODEHISTORY_OP_MOVE, $UID,
                             $RELOADPID, $RELOADEID);
        }

        # It is now safe to clear this.
        @retry_list = ();

        # Now run an os_load for each image
        foreach my $idid (keys %imagenodes) {
            my $nodelist      = join(" ", @{$imagenodes{$idid}});
            my $os_load_flags = "";

            ($imageid, $mustzero) = split("/", $idid);

            # XXX Garcia Hack - gross..
            # We special-case garcia loading for now until the subnode->node
            # dependencies are worked out inside os_load.
            if ($imageid eq $gimageid) {
                print "Synching garcia nodes: '$nodelist' at ".`date`;
                # path to directory tree "image" hardcoded for now since
                # users have no choice over OS selection when reloading
                # isn't performed during swapin.
                if (system("$tbrsync upload $gimagepath $nodelist") == 0) {
                    if (system("$reboot $nodelist") == 0) {
                        # rsync and reboot succeeded, so free 'em up.
                        foreach my $gnode (@{$imagenodes{$idid}}) {
                            freefromreloading($gnode);
                        }
                        print "garcia reload done at ".`date`;
                        next;
                    }
                    else {
                        notify("Failed to reboot garcias after rsync: ".
                               "$nodelist.\n");
                    }
                }
                else {
                    notify("Failed to rsync garcia nodes: $nodelist.\n");
                }

                # Either rsync or reboot failed on the robots if we get here.
                # %imagenodes is keyed by "imageid/mustwipe", so the node
                # list has to be looked up with $idid, not the bare imageid.
                foreach my $fnode (@{$imagenodes{$idid}}) {
                    $failed{$fnode} = $time;
                }
                next;
            }
            # XXX End Garcia Hack

            #
            # We only add the -m flag to os_load if we found a specific image
            # above. Omitting it causes os_load to pick the default image for
            # the node's type
            #
            if ($imageid) {
                $os_load_flags .= " -m $imageid";
            }

            #
            # Handle optional zeroing of the disk
            #
            if ($mustzero) {
                $os_load_flags .= " -z $mustzero";
            }

            print "Running '$os_load $os_load_flags $nodelist' at ".`date`;

            if (system("$os_load $os_load_flags $nodelist")) {
                #
                # This should not fail, but it does when the DB gets busy.
                #
                notify("$os_load $os_load_flags failed on $nodelist. ".
                       "That is not supposed to happen.\n".
                       "Attempting to recover from this unfortunate ".
                       "situation!\n");

                # Record the failure list. When these nodes later show up
                # in the $retry_time check, os_load is called again
                # instead of rebooting.
                foreach my $fnode (@{$imagenodes{$idid}}) {
                    $failed{$fnode} = $time;
                }
            }
            else {
                print "os_load done at ".`date`;
            }
        }
    }

    if (@other_list > 0) {
        my $free_nodes = join(" ", map { $_->[0] } @other_list);

        #
        # Call sched_reload with the "force" option, which says that if
        # sched_reload cannot reserve the node (cause someone just got it)
        # then don't schedule a reload for later. Just fail outright.
        # We will try again in a bit.
        #
        # We do not need to specify an imageid, since we want the node
        # default, and sched_reload will pick that up from the database
        # in the absence of a -i option.
        #
        if (system("$sched_reload -f $free_nodes")) {
            #
            # Could not get it. Wait and go around again.
            #
            print "$sched_reload failed on $free_nodes. Waiting a bit.\n";
            next;
        }
    }

    $stamp = DBDateTime();

    print "Reload of $nodes has started at $stamp.\n";

    #
    # For Frisbee reloads, we don't wait for the node to finish reloading,
    # since the whole point is to let many nodes load at once.
    #
    print "Not waiting for frisbee reload of $nodes.\n";
}

Kirk Webb's avatar
 
Kirk Webb committed
590 591 592 593 594 595 596 597

#
# Free up the node and clear any associated reload DB state.
# (code stolen from stated).
#
# Takes a single node id. The node's current_reloads row is always deleted.
# If the node is reserved to the reloading or reload-pending experiment, its
# scheduled_reloads row is deleted too, and then either
#   * the node has a reposition_status row: its reservation is moved to the
#     reposition-pending experiment, or
#   * otherwise: its reservation is deleted (the node becomes free).
# Nodes reserved anywhere else are left alone.
#
sub freefromreloading($) {
    my $node = shift;

    DBQueryFatal("delete from current_reloads where node_id='$node'");

    my ($pid, $eid);
    NodeidToExp($node, \$pid, \$eid);

    # Nothing more to do unless the node sits in one of our experiments.
    # (The defined() checks only avoid "uninitialized" warnings under -w.)
    return
        unless (defined($pid) && defined($eid));
    return
        unless ($pid eq $RELOADPID &&
                ($eid eq $RELOADEID || $eid eq $PENDINGEID));

    DBQueryFatal("delete from scheduled_reloads where node_id='$node'");

    # Check if the robot is back in its pen, otherwise we have to throw it
    # back to repositionpending.
    my $loc_result =
        DBQueryWarn("SELECT * FROM reposition_status ".
                    "WHERE node_id='$node'");

    #
    # DBQueryWarn returns false on a DB error. Do not dereference it in that
    # case (that would kill the daemon); leave the reservation untouched
    # since we cannot tell whether the node needs repositioning.
    #
    if (!$loc_result) {
        print "DB Error checking reposition status for $node. ".
            "Leaving it reserved.\n";
        return;
    }

    if ($loc_result->numrows) {
        if (!DBQueryWarn("update reserved set ".
                         "rsrv_time=now(),eid='$RPPENDINGEID' ".
                         "where node_id='$node'")) {
            print "Could not update EID for $node. Waiting a bit.\n";
        }
        else {
            print "Reposition pending nodes moved to $RPPENDINGEID at ".
                `date`;

            TBSetNodeHistory($node, TB_NODEHISTORY_OP_MOVE, $UID,
                             $REPOSPID, $RPPENDINGEID);
        }
    }
    else {
        DBQueryFatal("delete from reserved where node_id='$node'");
        TBSetNodeHistory($node, TB_NODEHISTORY_OP_FREE,
                         $UID, $pid, $eid);
    }
}

631 632
#
# Mail the message to the testbed operators and die with it.
#
# Only the default (untagged) daemon marks itself as running at startup, so
# only the default daemon clears that mark here; a tagged daemon must not
# touch the default daemon's "running" state.
#
sub fatal($)
{
    my ($msg) = @_;

    SENDMAIL($TBOPS, "Reload Daemon Died", $msg, $TBOPS);
    MarkDaemonStopped("reload_daemon")
        if (!defined($tag));
    die($msg);
}

640 641 642
#
# Report a non-fatal problem: print the message (newline appended) to
# standard output and mail it to the testbed operators.
#
sub notify($)
{
    my $mesg = shift;

    print($mesg . "\n");
    SENDMAIL($TBOPS, "Reload Daemon Message", $mesg, $TBOPS);
}