#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#

9 10 11
use English;
use Getopt::Std;

12 13 14 15 16 17
#
# Refuse to start unless we are root. Only root is allowed to power cycle
# nodes at any time (it is time-limited for anyone else), and this daemon
# needs to be able to reboot nodes whenever a reload gets stuck.
#
die("*** $0:\n".
    "    Only root can run this script!\n")
    unless ($UID == 0);

22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
#
# Look for nodes to reload.
#
#	usage: reload_daemon [-d]
#
# TODO: Use "logger" instead of writing a log file.
#
sub usage()
{
    # Print the command line synopsis on STDOUT, then terminate the
    # program with exit status -1. Never returns.
    my @synopsis = (
	"Usage: reload_daemon [-d]\n",
	"Use the -d option to prevent daemonization\n",
    );

    print STDOUT join("", @synopsis);
    exit(-1);
}
# Option letters handed to getopts().
my $optlist = "d";

#
# Configure variables (the @...@ tokens are filled in at configure time).
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS  = "@TBOPSEMAIL@";

# Testbed Support library
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;

#
# Experiment names; these come from the library.
#
my $RELOADPID    = NODERELOADING_PID();
my $RELOADEID    = NODERELOADING_EID();
my $PENDINGEID   = NODERELOADPENDING_EID();
my $NODEDEAD_PID = NODEDEAD_PID();
my $NODEDEAD_EID = NODEDEAD_EID();

#
# External commands we run, and where the daemon logs to.
#
my $os_load      = $TB . "/bin/os_load -s";
my $sched_reload = $TB . "/sbin/sched_reload";
my $reboot       = $TB . "/bin/node_reboot";
my $logfile      = $TB . "/log/reloadlog";

# Set from the -d option; when set we do not daemonize.
my $debug = 0;

# How long (in minutes) a node may sit in the reloading experiment before
# we try to recover it, and before we warn the admins about it.
my $retry_time = 15;
my $warn_time  = 30;

#
# Per-node state carried between passes of the main loop. Each hash maps a
# node id to the time() of the pass that last recorded it.
#
my %retried;		# Nodes already handled by the retry check.
my %warned;		# Nodes the admins have already been warned about.
my %failed;		# Nodes whose last os_load failed.

# Nodes to send back through os_load on the next reload pass.
my @retry_list;
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103

#
# Turn off line buffering on output (dots ...).
#
$| = 1;

#
# Untaint the path
#
# NOTE(review): the trailing ':' leaves an empty PATH element, which shells
# conventionally treat as the current directory - confirm that is intended
# for a daemon running as root.
#
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments (there are none, so anything left over is
# a usage error).
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
if (defined($options{"d"})) {
    $debug = $options{"d"};
}

# Go to ground, unless -d was given.
if (! $debug) {
    daemonize();
}

# After daemonize() this goes to the log file rather than the terminal.
print "Reload Daemon starting... pid $$, at ".`date`;
105

106 107 108
#
# Loop, looking for nodes to reload.
#
# Each pass does, in order:
#   1. Power cycle (or queue for another os_load) nodes that have been in
#      the reloading experiment for more than $retry_time minutes.
#   2. Mail the admins about nodes that have been in the reloading
#      experiment for more than $warn_time minutes.
#   3. Collect the nodes waiting for a reload and start os_load and/or
#      sched_reload on them.
# Every "next MAINLOOP" goes back to the sleep at the top of the loop.
#
my $idle=0;
MAINLOOP:
while (1) {
    my($count, @row, %hrow, $imageid, $stamp);
    my($pid, $eid);
    my $query_result;

    #
    # Nodes from the reload query that are not in the reload pending
    # experiment. This has to start out empty on every pass; otherwise
    # nodes collected on earlier passes would be handed to sched_reload
    # again and again.
    #
    my @other_list = ();

    # Partial delay between loops in case of an error.
    # Wait longer if we're not doing anything.
    sleep($idle ? 10 : 1);

    $idle=1; # Assume we're going to be idle this iteration

    #
    # We use this to figure out when to delete nodes from the retried and
    # warned hashes
    #
    my $time = time();

    #
    # First, look for nodes that have been in the reloading experiment for
    # longer than $retry_time, and try rebooting them
    #
    $query_result =
	DBQueryWarn("select node_id from reserved where pid='$RELOADPID' " .
		    "and eid='$RELOADEID' and " .
		    "(CURRENT_TIMESTAMP - INTERVAL $retry_time MINUTE) ".
		    "  > rsrv_time");

    if (! $query_result) {
	print "DB Error. Waiting a bit.\n";
	next MAINLOOP;
    }

  RETRYNODE:
    while (my ($node) = $query_result->fetchrow) {
	$idle=0;
	#
	# If this was a node that failed os_load, then instead of rebooting,
	# send it back through os_load.
	#
	if ($failed{$node}) {
	    print "$node failed an earlier os_load. Trying again\n";
	    push(@retry_list, $node);
	    delete $failed{$node};
	    # Skip any reboots.
	    $retried{$node} = $time;
	    next RETRYNODE;
	}
	# Only power cycle a node on the first pass that finds it stuck.
	if (!$retried{$node}) {
	    print "\nReload appears wedged at ".`date`.
		"Power cycling and trying once more!\n";

	    if (system("$reboot -f $node")) {
		notify("$node was wedged, but could not be rebooted.\n".
		       "Moved to $NODEDEAD_PID/$NODEDEAD_EID\n");

		MarkPhysNodeDown($node);
		TBSetNodeLogEntry($node, "daemon",
				  TB_DEFAULT_NODELOGTYPE(),
				  "'Moved to hwdown; reload reboot failed'");
	    }
	}
	$retried{$node} = $time;
    }

    #
    # We can pull out all nodes that were not 'touched' (matched by the
    # select above) during this pass
    #
    foreach my $node (keys %retried) {
	if ($retried{$node} != $time) {
	    delete $retried{$node};
	}
    }

    #
    # Next, we do the same thing for nodes in the reloading experiment for
    # longer than $warn_time, and warn the admins.
    #
    $query_result =
	DBQueryWarn("select node_id from reserved where pid='$RELOADPID' " .
		    "and eid='$RELOADEID' and " .
		    "(CURRENT_TIMESTAMP - INTERVAL $warn_time MINUTE) > ".
		    "   rsrv_time");

    if (! $query_result) {
	print "DB Error. Waiting a bit.\n";
	next MAINLOOP;
    }

    while (my ($node) = $query_result->fetchrow) {
	$idle=0;
	# Only send mail on the first pass that finds the node overdue.
	if (!$warned{$node}) {
	    notify("Node $node has been in $RELOADPID/$RELOADEID for " .
	    "more than $warn_time minutes");
	}
	$warned{$node} = $time;
    }

    #
    # We can pull out all nodes that were not 'touched' (matched by the
    # select above) during this pass
    #
    foreach my $node (keys %warned) {
	if ($warned{$node} != $time) {
	    delete $warned{$node};
	}
    }

    #
    # Find all of the free nodes that have not been reloaded (no pid entry
    # in last_reservation, which is reset anytime a node is reloaded by
    # the system).
    #
    # XXX - This should not be hardwired in.
    #
    # NOTE(review): reserved is the left-hand table of these joins, so the
    # "b.node_id is null" (free node) branch looks like it can never match;
    # confirm against the intended query.
    #
    my $CLASSCLAUSE = "(n.class='pc' or n.class='pct')";

    $query_result =
	DBQueryWarn("select a.node_id,b.pid,b.eid from reserved as b ".
		    "left join nodes as a on a.node_id=b.node_id ".
		    "left join last_reservation as l on l.node_id=a.node_id ".
		    "left join node_types as n on n.type=a.type where ".
		    "(b.node_id is null and $CLASSCLAUSE and l.pid!='') ".
		    "or (b.pid='$RELOADPID' and b.eid='$PENDINGEID') ".
		    "order by a.node_id");

    if (! $query_result) {
	print "DB Error. Waiting a bit.\n";
	next MAINLOOP;
    }
    $count = $query_result->numrows;

    # Nothing new and nothing queued for a retry; go back to sleep.
    if (!$count && !scalar(@retry_list)) {
	next MAINLOOP;
    }
    $idle=0;

    #
    # Grab all the nodes that match. Nodes queued for another os_load are
    # treated like pending nodes.
    #
    my @pending_list = @retry_list;
    while (%hrow  = $query_result->fetchhash()) {
	my $node = $hrow{'node_id'};
	$pid  = $hrow{'pid'};
	$eid  = $hrow{'eid'};
	# pid/eid are undefined for rows without a reservation.
	if (defined($pid) && defined($eid) &&
	    $pid eq $RELOADPID && $eid eq $PENDINGEID) {
	    push(@pending_list,$node);
	} else {
	    push(@other_list,$node);
	}
    }

    my $nodes = join(" ", (@pending_list, @other_list));
    print "Trying to reload $nodes at ".`date`;

    #
    # What we do depends on whether its a free node or a node reserved
    # into the reload pending experiment.
    #
    if (@pending_list > 0) {
	#
	# Query for the imageid from the reloads table, and group the
	# nodes by image so each image needs only one os_load.
	#
	my %images = ();
	my %imagenodes = ();
	foreach my $node (@pending_list) {
	    $query_result =
	      DBQueryWarn("select image_id from scheduled_reloads " .
			  "where node_id='$node'");

	    if ((! $query_result) || (!$query_result->numrows())) {
		#
		# If this node didn't make it into the scheduled_reloads table
		# for some reason, then we load it with the default image and
		# type
		#
		$imageid = "";

	    } else {
		@row     = $query_result->fetchrow_array();
		# A NULL image_id also means "use the default image".
		$imageid = (defined($row[0]) ? $row[0] : "");
	    }
	    $images{$node} = $imageid;
	    # push creates the per-image list the first time it is needed.
	    push(@{$imagenodes{$imageid}},$node);
	    if ($debug) {
		print "$node => $images{$node} == $imageid (".
		  join(",",@{$imagenodes{$imageid}}).")\n";
	    }
	}

	#
	# The node is reserved into the special pid/eid, as the result
	# of a sched_reload while it was still allocated to an experiment.
	# We change the reservation EID over and fire up an os_load
	# directly.
	#
	my $cond = join(" or ",map("node_id='$_'",@pending_list));
	if (! DBQueryWarn("update reserved set ".
			  "rsrv_time=now(),eid='$RELOADEID' ".
			  "where $cond")) {
	    print "Could not update EID for ".join(" ",@pending_list).
	      ". Waiting a bit.\n";
	    # @retry_list is deliberately left alone so we try again.
	    next MAINLOOP;
	}
	print "Pending nodes moved to $RELOADEID at ".`date`;

	# It is now safe to clear this.
	@retry_list = ();

	# Now run an os_load for each image
	foreach my $image (keys %imagenodes) {
	    my $nodelist = join(" ",@{$imagenodes{$image}});
	    my $os_load_flags = "";

	    #
	    # We only add the -m flag to os_load if we found a specific image
	    # above. Omitting it causes os_load to pick the default image for
	    # the node's type
	    #
	    if ($image) {
		$os_load_flags .= " -m $image ";
	    }

	    print "Running '$os_load $os_load_flags $nodelist' at ".`date`;

	    if (system("$os_load $os_load_flags $nodelist")) {
		#
		# This should not fail, but it does when the DB gets busy.
		#
		notify("$os_load failed on $nodelist. ".
		       "That is not supposed to happen.\n".
		       "Attempting to recover from this unfortunate ".
		       "situation!\n");

		# Record the failure list. If we get to the 15 minute
		# retry, call os_load again instead of rebooting.
		foreach my $failednode (@{$imagenodes{$image}}) {
		    $failed{$failednode} = $time;
		}
	    }
	    else {
		print "os_load done at ".`date`;
	    }
	}
    }

    if (@other_list > 0 ) {
	#
	# Call sched_reload with the "force" option, which says that if
	# sched_reload cannot reserve the node (cause someone just got it)
	# then don't schedule a reload for later. Just fail outright.
	# We will try again in a bit.
	#
	# We do not need to specify an imageid, since we want the node
	# default, and sched_reload will pick that up from the database
	# in the absence of a -i option.
	#
	if (system("$sched_reload -f @other_list")) {
	    #
	    # Could not get it. Wait and go around again.
	    #
	    print "$sched_reload failed on @other_list. Waiting a bit.\n";
	    next MAINLOOP;
	}
    }
    $stamp = DBDateTime();

    print "Reload of $nodes has started at $stamp.\n";

    #
    # For Frisbee reloads, we don't wait for the node to finish reloading,
    # since the whole point is to let many nodes load at once.
    #
    print "Not waiting for frisbee reload of $nodes.\n";
}

392 393
sub fatal($)
{
    # Report a fatal error: mail the message to $TBOPS with the subject
    # "Reload Daemon Died", then die with the same message. Never returns.
    #
    # A lexical is used here; local() would temporarily overwrite the
    # package global $msg instead of creating a private variable.
    my ($msg) = @_;

    # presumably SENDMAIL takes (to, subject, body, from); verify against
    # libtestbed.
    SENDMAIL($TBOPS, "Reload Daemon Died", $msg, $TBOPS);
    die($msg);
}

400 401 402
sub notify($)
{
    # Report a non-fatal problem: write the message (plus a newline) to
    # STDOUT, then mail it to $TBOPS with the subject
    # "Reload Daemon Message".
    my ($text) = @_;

    print($text . "\n");
    SENDMAIL($TBOPS, "Reload Daemon Message", $text, $TBOPS);
}

408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
#
# Become a daemon.
#
# Forks; the parent exits with status 0 and the child carries on with STDIN
# reading from /dev/null and STDOUT/STDERR appended to $logfile. Dies if the
# fork or any of the redirections fail. Returns 0 (in the child).
#
sub daemonize()
{
    my $mypid = fork();

    # fork returns undef on failure; without this check the parent would
    # carry on as if it were the child.
    if (!defined($mypid)) {
	die("*** $0:\n".
	    "    Could not fork: $!\n");
    }
    if ($mypid) {
	exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited.
    #
    open(STDIN, "<", "/dev/null") or
	die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it. The three argument form
    # of open keeps the file name from being parsed for a mode.
    #
    open(STDERR, ">>", $logfile) or die("opening $logfile for STDERR: $!");
    open(STDOUT, ">>", $logfile) or die("opening $logfile for STDOUT: $!");

    return 0;
}