#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2005, 2006, 2007 University of Utah and the Flux Group.
# All rights reserved.
#

use English;
use Getopt::Std;

# Refuse to start for anybody but root.
die("*** $0:\n".
    "    Only root can run this script!\n")
    if ($UID != 0);

#
# Look for objects to check.
#
#	usage: checkup_daemon [-d]
#
# TODO: Use "logger" instead of writing a log file.
#
#
# Print the usage summary on STDOUT and exit with status -1.
# Never returns.
#
sub usage()
{
    my @summary = ("Usage: checkup_daemon [-d]\n",
                   "Use the -d option to prevent daemonization\n");

    print STDOUT join("", @summary);
    exit(-1);
}
# Option letters handed to getopts; -d keeps the daemon in the foreground.
my  $optlist = "d";

#
# Configure variables
#
# NOTE(review): the @...@ tokens look like placeholders that are filled in
# when this template is installed -- confirm against the build system.
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";

# Testbed Support library
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use User;
use Project;

#
# These come from the library.
#
my $NODEDEAD_PID= NODEDEAD_PID;
my $NODEDEAD_EID= NODEDEAD_EID;
my $TBOPSPID= TBOPSPID;
# Scratch area: generated NS files and the per-object working directories
# of the checkup handlers are created underneath this directory.
my $HOME= USERROOT() . "/elabckup";

# Forward declarations for the subroutines defined after the main loop.
sub fatal($);
sub daemonize();
sub misconfig($$);
sub IsNodeFree($);
sub UpdateCheckupState($$$$$);
sub ExptExists($$);

# Where STDOUT and STDERR are appended once the daemon has detached.
my $logfile	= "$TB/log/checkuplog";
# Set from the -d option; a true value prevents daemonization.
my $debug	= 0;
# The main loop stops activating checkups once this many are active.
my $parallelization = 5;
# Delay before a checkup that ended in the "failed" state becomes due again.
my $retry_interval = 10 * 60; # seconds

#
# Flush STDOUT after every write so progress output (dots ...) shows up
# right away.
#
$| = 1;

#
# Untaint the environment: pin the PATH and drop the variables that could
# influence how a shell behaves.
#
$ENV{'PATH'} = "/bin:/usr/bin:/usr/local/bin:$TB/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse the command line. The daemon takes no positional arguments, so
# anything left in @ARGV after getopts is a usage error.
#
%options = ();
usage()
    if (! getopts($optlist, \%options));
usage()
    if (@ARGV != 0);
$debug = $options{"d"}
    if (defined($options{"d"}));

# Go to ground, unless -d asked us to stay in the foreground.
daemonize()
    if (! $debug);

print "Checkup daemon starting... pid $$, at ".`date`;

# Need the unix uid for the backup user.
my $user = User->Lookup('elabckup');
fatal("Could not get object for backup user")
    if (!defined($user));
my $ev_uid   = $user->unix_uid();

# and need the unix gid for the group.
my $project = Project->Lookup($TBOPSPID);
fatal("Could not get object for $TBOPSPID project")
    if (!defined($project));
my $unix_gid = $project->unix_gid();

print "Experiment head: $ev_uid\n"
    if ($debug);

# Switch to the elabckup user. The group ids go first, while we still
# have the privilege to change them.
$GID = $unix_gid;
$EGID = "$unix_gid";
$EUID = $UID = $ev_uid;

# Assigning to $UID/$EUID does not raise an error when the underlying call
# fails, so confirm the switch really happened instead of carrying on with
# root privileges.
fatal("Could not switch to uid $ev_uid")
    if ($UID != $ev_uid || $EUID != $ev_uid);

# Number of completed passes through the main loop.
my $iter = 0;

#
# Main loop. Each pass:
#
#   1. does nothing while the web/nologins sitevar is set;
#   2. advances every row of active_checkups by one step;
#   3. on every fifth pass, adds checkups rows for the test nodes that do
#      not have one yet;
#   4. activates checkups that are due, as long as fewer than
#      $parallelization are active.
#
# Every pass, including the ones cut short with "next MAINLOOP", finishes
# in the continue block at the bottom, which sleeps and bumps $iter.
#
MAINLOOP: while(1) {
    my $disabled;

    # Avoid doing anything while the testbed is down for maintenance.
    if (! TBGetSiteVar("web/nologins", \$disabled)) {
        print "Error getting sitevar 'web/nologins'. Waiting a bit ...\n";
        next MAINLOOP;
    }
    if ($disabled) {
        print "Web interface is disabled; waiting a bit ...\n";
        sleep(60);
        next MAINLOOP;
    }

    # Get the checkups that are currently running.
    my $query_result =
        DBQueryWarn("select ac.object,ac.state,ct.object_type, ".
                    "  ct.checkup_type,ct.major_type,ct.expiration ".
                    "from active_checkups as ac ".
                    "left join checkup_types as ct on ".
                    "  ac.type=ct.checkup_type and ".
                    "  ac.object_type=ct.object_type");
    if (!$query_result) {
        # A failed query gives us nothing to iterate over; try again on
        # the next pass instead of dying on the method call below.
        print "error: could not get the active checkups. ".
            "Waiting a bit ...\n";
        next MAINLOOP;
    }

    # Number of active_checkups rows seen on this pass.
    my $count = 0;
    while (my ($object, $state, $otype, $ctype, $mtype, $exp) =
           $query_result->fetchrow_array()) {

        # Checkup types whose name contains ".ns" (after at least one
        # other character) are run as experiments built from an NS file;
        # everything else is run as a handler program.
        if ($ctype =~ /.\.ns/) {
            my $eid = "ckup-$object"; # XXX check length
            my $fn = "$HOME/$object-$ctype";

            if ($state eq "new") {
                # Create a new experiment with the NS file.
                my $src = "$TB/lib/checkup/$ctype";
                my $error = "";
                my $nsfile;

                print "Creating $eid to run $ctype\n";

                if (! -f $src) {
                    # The NS template for this checkup type is missing.
                    misconfig($ctype, "$src does not exist");
                    $state = "finished";
                }
                elsif (!open($nsfile, ">", $fn)) {
                    $error = "cannot create '$fn': $!";
                }
                else {
                    # The generated file starts with the per-object
                    # settings; the template is appended after them.
                    print $nsfile "set opt(CHECKUP_OBJECT) $object\n";
                    if (defined($otype) && $otype ne "") {
                        print $nsfile "set opt(OBJECT_TYPE) $otype\n";
                    }
                    if (defined($mtype) && $mtype ne "") {
                        print $nsfile "set opt(MAJOR_TYPE) $mtype\n";
                    }

                    if (!close($nsfile)) {
                        $error = "problem writing '$fn': $!";
                    }
                    elsif (system("cat $src >> $fn")) {
                        $error = "problem appending NS to '$fn'";
                    }
                    elsif (system("$TB/bin/batchexp ".
                                  "-n ".
                                  "-L \"Should not idle unless checkup failed.\" ".
                                  "-p $TBOPSPID ".
                                  "-g $TBOPSPID ".
                                  "-e $eid ".
                                  "$fn")) {
                        # Dump the NS file into the log to help debugging.
                        system("cat $fn");
                        $error = "$ctype - cannot create experiment for $object";
                    }
                }

                unlink($fn);

                # $state is still "new" unless misconfig() retired the type.
                if ($state eq "new") {
                    if ($error eq "") {
                        $state = "running";
                    }
                    else {
                        print "error: $error\n";
                        $state = "failed";
                    }
                }
                UpdateCheckupState($object, $ctype, $mtype, $state, $exp);
            }
            elsif ($state eq "running") {
                # Check the experiment's state.
                my $report = PROJROOT() .
                    "/$TBOPSPID/exp/$eid/logs/report.mail";

                if (-e $report) {
                    # Failed...
                    SENDMAIL($TBOPS,
                             "Checkup failure - $object/$ctype",
                             "[Swapout $TBOPSPID/$eid when you're finished]\n\n".
                             `cat $report`,
                             $TBOPS);
                    UpdateCheckupState($object, $ctype, $mtype,
                                       "locked", $exp);
                }
                elsif (ExptExists($TBOPSPID, $eid)) {
                    # Wait...
                }
                else {
                    # Finished...
                    UpdateCheckupState($object, $ctype, $mtype,
                                       "finished", $exp);
                }
            }
            elsif ($state eq "locked") {
                # Locked waiting for human intervention...
                if (ExpState($TBOPSPID, $eid) eq EXPTSTATE_SWAPPED) {
                    # Swapped out...
                    system("$TB/bin/endexp $TBOPSPID $eid");
                }
                elsif (!ExptExists($TBOPSPID, $eid)) {
                    UpdateCheckupState($object, $ctype, $mtype,
                                       "finished", $exp);
                }
            }
            else {
                print "error: $object/$ctype is in an unknown state!\n";
            }
        }
        else {
            my $fn = "$TB/libexec/checkup/$ctype";
            my $newstate = $state;

            if (! -x $fn) {
                misconfig($ctype, "$fn is not executable");
                $newstate = "finished";
            }
            else {
                my $workdir = "$HOME/ckup-$object";
                my $pid;
                my $handler;

                if ($state eq "new") {
                    mkdir($workdir, 0770);
                    chmod(0770, $workdir);
                }
                if (!chdir($workdir)) {
                    # Do not run the handler from whatever directory we
                    # happen to be in; leave the state alone and retry.
                    print "error: could not chdir to $workdir: $!\n";
                }
                elsif (!($pid = open($handler,
                                     "$fn $object $state 2>&1 |"))) {
                    print "error: could not run $fn\n";
                }
                else {
                    my $output = "";
                    my $exit_code;

                    # Copy the handler's output to the log and keep it
                    # for the failure mail.
                    while (my $line = <$handler>) {
                        print "$ctype\[$pid\] $line";
                        $output .= $line;
                    }
                    # close() waits for the handler and sets $?.
                    close($handler);
                    $exit_code = $? >> 8;
                    if ($exit_code == 0) {
                        $newstate = "finished";
                    }
                    elsif ($state eq "locked") {
                        # Stay locked until the handler exits with zero.
                    }
                    elsif ($exit_code == 10) {
                        $newstate = "running";
                    }
                    else {
                        SENDMAIL($TBOPS,
                                 "Checkup failure - $object/$ctype",
                                 "[Working directory is $workdir]\n\n".
                                 $output,
                                 $TBOPS);
                        $newstate = "locked";
                    }
                }
                chdir $HOME;
                if ($newstate eq "finished") {
                    system("rm -rf $workdir");
                }
            }
            if ($newstate ne $state) {
                UpdateCheckupState($object, $ctype, $mtype, $newstate, $exp);
            }
        }

        $count += 1;
    }

    if ($iter % 5 == 0) {
        # Automatically fill the checkups table.
        # XXX Just do nodes for now.
        DBQueryWarn("INSERT INTO checkups_temp (object, object_type, type, next) ".
                    "SELECT n.node_id,n.type,ct.checkup_type,NOW() FROM nodes as n ".
                    "INNER JOIN checkup_types as ct on n.type=ct.object_type ".
                    "LEFT JOIN checkups as c on c.object=n.node_id ".
                    "WHERE c.object is null and n.role='testnode'");
        DBQueryWarn("INSERT INTO checkups (object, object_type, type, next) ".
                    "SELECT * FROM checkups_temp");
        DBQueryWarn("DELETE FROM checkups_temp");
    }

    if ($count < $parallelization) {
        # Look for any objects ready for their next checkup.
        $query_result =
            DBQueryWarn("SELECT c.object,ct.object_type,ct.checkup_type, ".
                        "  ct.major_type,ct.expiration ".
                        "FROM checkups as c ".
                        "INNER JOIN checkup_types as ct on ".
                        "  c.type=ct.checkup_type and ".
                        "  c.object_type=ct.object_type ".
                        "LEFT JOIN active_checkups as ac on ".
                        "  ac.object=c.object ".
                        "WHERE NOW() >= c.next and ac.object is null ".
                        "ORDER BY c.next DESC");
        if (!$query_result) {
            print "error: could not get the checkups that are due. ".
                "Waiting a bit ...\n";
            next MAINLOOP;
        }

        while (my ($object, $otype, $ctype, $mtype, $exp) =
               $query_result->fetchrow_array()) {
            if (!defined($ctype) || $ctype eq "") {
                die "Empty checkup type";
            }

            if ($mtype eq "node") {
                # Claim the node for the checkup project, then skip it if
                # it turns out to have a row in the reserved table.
                DBQueryWarn("UPDATE nodes SET reserved_pid='$TBOPSPID' ".
                            "WHERE node_id='$object' and ".
                            "reserved_pid is null");

                if (!IsNodeFree($object)) {
                    next;
                }
            }

            print "Adding $object/$ctype to active_checkups\n";

            # Make the checkups active.
            DBQueryWarn("INSERT INTO active_checkups SET ".
                        "object='$object',object_type='$otype',".
                        "type='$ctype',start=NOW()");

            # Clear the due time; UpdateCheckupState() sets a new one when
            # the checkup finishes or fails.
            DBQueryWarn("UPDATE checkups SET next=NULL ".
                        "WHERE object='$object' and type='$ctype'");

            $count += 1;

            if ($count >= $parallelization) {
                last;
            }
        }
    }
}
continue {
    # Pause between passes; also reached through "next MAINLOOP".
    sleep(5);

    $iter += 1;
}

#
# Tell whether a node is free, meaning it has no row in the reserved table.
#
#   $pnode - node_id to look up.
#
# Returns true when no row exists. Returns 0 when the query itself fails,
# so that a database problem is never mistaken for a free node.
#
sub IsNodeFree($)
{
    my($pnode) = @_;

    my $query_result = DBQueryWarn("select 1 from reserved " .
                                   "where node_id='$pnode'");

    # No result object means the query failed; calling numrows on it
    # would be fatal.
    return 0
        if (!$query_result);

    return ($query_result->numrows == 0);
}

#
# Record a state change for an active checkup.
#
#   $object   - object the checkup runs against.
#   $ctype    - checkup type.
#   $mtype    - major type of the object; "node" gets extra cleanup.
#   $newstate - state to move to.
#   $exp      - seconds until the checkup is due again after finishing.
#
# "finished" and "failed" remove the object from active_checkups, set the
# time of the next run in checkups and, for nodes, clear the reserved_pid
# that was set in the nodes table. Any other state is just stored in
# active_checkups.
#
sub UpdateCheckupState($$$$$)
{
    my ($object, $ctype, $mtype, $newstate, $exp) = @_;

    print "  Updating $object-$ctype -> $newstate\n";
    if ($newstate eq "finished" || $newstate eq "failed") {
        DBQueryWarn("delete from active_checkups where object='$object'");
        if ($newstate eq "failed") {
            # Failed checkups are retried on a fixed schedule.
            $exp = $retry_interval;
        }
        elsif (!$exp) {
            # A missing interval would produce invalid SQL and leave
            # "next" unset, and a zero interval would make the checkup due
            # again immediately. Fall back to a short delay instead.
            $exp = 10;
        }
        DBQueryWarn("UPDATE checkups SET next=DATE_ADD(NOW(), ".
                    "INTERVAL $exp SECOND) ".
                    "WHERE object='$object' and type='$ctype'");
        if (defined($mtype) && $mtype eq "node") {
            DBQueryWarn("UPDATE nodes SET reserved_pid=NULL ".
                        "WHERE node_id='$object'");
        }
    }
    else {
        DBQueryWarn("update active_checkups SET state='$newstate' ".
                    "WHERE object='$object'");
    }
}

#
# Tell whether the experiments table has a row for the given pid/eid pair.
#
#   $pid - project id.
#   $eid - experiment id.
#
# Returns true when at least one row matches.
#
sub ExptExists($$)
{
    my ($pid, $eid) = @_;
    my $where = "where pid='$pid' and eid='$eid'";

    my $matches = DBQueryFatal("select 1 from experiments $where");

    return ($matches->numrows != 0);
}

#
# Report a misconfigured checkup type and retire it.
#
#   $ctype - checkup type that cannot be run.
#   $msg   - description of the problem.
#
# The message is printed and mailed to $TBOPS, then the type is deleted
# from checkup_types and its rows are deleted from checkups.
#
sub misconfig($$)
{
    my ($bad_type, $reason) = @_;
    my @cleanup = (
        "delete from checkup_types where checkup_type='$bad_type'",
        "delete from checkups where type='$bad_type'",
    );

    print "error: $reason\n";
    SENDMAIL($TBOPS, "Checkup misconfiguration", $reason, $TBOPS);
    foreach my $statement (@cleanup) {
        DBQueryWarn($statement);
    }
}

#
# Become a daemon.
#
# Forks; the parent exits with status 0 and the child carries on with STDIN
# reading from /dev/null and STDERR/STDOUT appended to $logfile. Dies if
# the fork or any of the redirections fail.
#
# Returns 0 (in the child).
#
sub daemonize()
{
    my $mypid = fork();
    if (!defined($mypid)) {
        # fork() returns undef on failure. Without this check the parent
        # would fall through and keep running as if it were the child.
        die("*** $0:\n".
            "    Could not fork: $!\n");
    }
    if ($mypid) {
        exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited.
    #
    open(STDIN, "<", "/dev/null") or
        die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it.
    #
    open(STDERR, ">>", $logfile) or die("opening $logfile for STDERR: $!");
    open(STDOUT, ">>", $logfile) or die("opening $logfile for STDOUT: $!");

    return 0;
}

#
# Die with the given message, prefixed with the name of the script.
# Never returns.
#
sub fatal($) {
    my ($complaint) = @_;
    my $banner = "*** $0:\n";

    die($banner . "    $complaint\n");
}