batch_daemon.in 14.5 KB
Newer Older
1 2 3 4 5 6 7 8 9
#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Batch daemon: loops forever looking for batch experiments that want to
# run, and starts, ends, or cancels them on behalf of their creators.
#
# usage: batch_daemon [-d]
#
# TODO: Use "logger" instead of writing a log file.
#
12 13
sub usage()
{
    # Print the synopsis and the -d hint on STDOUT, then exit with -1.
    my $synopsis = "Usage: batch_daemon [-d]\n";
    my $hint     = "Use the -d option to prevent daemonization\n";

    print STDOUT $synopsis . $hint;
    exit(-1);
}
18
# Option letters handed to getopts(); only -d is accepted.
my $optlist = "d";

#
# Configure variables (the @...@ placeholders are presumably filled in
# when the script is generated from its .in template).
#
my ($TB, $DBNAME, $TBOPS, $TBLOGS) =
    ("@prefix@", "@TBDBNAME@", "@TBOPSEMAIL@", "@TBLOGSEMAIL@");

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

#
# Ug, exit value from startexp when not enough nodes.
#
my $TOOFEWNODES = 2;

# Directories.
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $projroot = "/proj";

# Helper programs run on behalf of the experiment.
my $startexp = "$TB/bin/startexp";
my $endexp   = "$TB/bin/endexp";
my $savelogs = "$TB/bin/savelogs";
my $avail    = "$TB/sbin/avail";

# Daemon log file, debug flag (set by -d), and the experiment directory.
my $batchlog = "$TB/log/batchlog";
my $debug    = 0;
my $dirname;

# Local copies of the batch state constants, so they can be interpolated
# into query strings.
my $BSTATE_POSTED	= BATCHSTATE_POSTED;
my $BSTATE_ACTIVATING	= BATCHSTATE_ACTIVATING;
my $BSTATE_RUNNING	= BATCHSTATE_RUNNING;
my $BSTATE_TERMINATING	= BATCHSTATE_TERMINATING;

#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my ($eid, $pid, $gid, $logname, $nsfile);

# Notification recipient; defaults to Testbed Operations until the
# experiment creator's details are looked up.
my $user_name  = "Testbed Operations";
my $user_email = $TBOPS;

#
# Unbuffer output: setting $| makes the currently selected handle
# (STDOUT by default) flush after every write.
#
$| = 1;

#
# Untaint the path. Note there is deliberately no trailing colon: an empty
# PATH element means "current directory", and the children of this daemon
# chdir into an experiment directory before running commands.
#
$ENV{'PATH'} = "/bin:/usr/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments (there are none for this script).
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
if (defined($options{"d"})) {
    $debug = $options{"d"};
}

#
# Go to ground, unless -d was given. The process in which TBBackGround()
# returns true exits here (presumably the original foreground parent;
# TBBackGround comes from the testbed library).
#
if (! $debug) {
    if (TBBackGround($batchlog)) {
        exit(0);
    }
}
100 101 102 103 104

#
# Loop, looking for batch experiments that want to run. Every pass, however
# it ends, finishes with the 30 second sleep in the continue block.
#
BATCH:
while (1) {
    my($query_result, $pending_result, $running_result);
    my(%row, %pending_row);

    #
    # Need to lock the table here because of cancelation in endexp.
    # See the comments in there. We need to atomically grab the next
    # batch experiment we want to try, and then change its state from
    # posted to activating. We want to grab just one experiment, since
    # it takes a while to configure an experiment, and grabbing a bunch and
    # locking them up might result in having to wait a really long time
    # to cancel a batch experiment that hasn't really tried to start yet!
    # That would be annoying to users.
    #
    # The selection criteria is simplistic: expt_start is set on each
    # attempt, and the candidate with the oldest expt_start is picked,
    # thereby cycling through in a "least recently attempted" manner.
    #
    if (! DBQuery("lock tables experiments write")) {
        print "DB Error locking tables. Waiting a bit ...\n";
        next BATCH;
    }

    # At most one candidate: never attempted, or last attempted more than
    # 15 minutes ago.
    $pending_result =
        DBQueryWarn("SELECT * FROM experiments ".
                    "WHERE batchmode=1 and canceled=0 and ".
                    "      batchstate='$BSTATE_POSTED' and ".
                    "      (attempts=0 or ".
                    "       ((UNIX_TIMESTAMP() - ".
                    "         UNIX_TIMESTAMP(expt_start) > (60 * 15)))) ".
                    "ORDER BY expt_start LIMIT 1");

    # Every batch experiment currently marked as running.
    $running_result =
        DBQuery("select * from experiments ".
                "where batchmode=1 and batchstate='$BSTATE_RUNNING' ".
                "ORDER BY expt_start");

    unless ($pending_result && $running_result) {
        print "DB Error getting batch info. Waiting a bit ...\n";
        DBQuery("unlock tables");
        next BATCH;
    }

    # Nothing pending and nothing running: nothing to do this pass.
    unless ($pending_result->numrows || $running_result->numrows) {
        DBQuery("unlock tables");
        next BATCH;
    }

    #
    # If we have a pending experiment to run, set its state to activating
    # right away, while we have the tables locked. This prevents endexp
    # from seeing it as something it can cancel.
    #
    if ($pending_result->numrows) {
        %pending_row = $pending_result->fetchhash();

        # Lexicals on purpose: these shadow the file-level $eid/$pid.
        my ($eid, $pid) = @pending_row{'eid', 'pid'};

        $query_result =
            DBQuery("update experiments set expt_start=now(), ".
                    "batchstate='$BSTATE_ACTIVATING' ".
                    "where eid='$eid' and pid='$pid'");

        if (! $query_result) {
            print "DB error setting batch $pid/$eid to configuring.\n";
            DBQuery("unlock tables");
            next BATCH;
        }
    }
    DBQueryWarn("unlock tables");

    #
    # Okay, first we check the status of running batch mode experiments
    # since we want to end those before trying to start any new ones, cause
    # it would be nice to have as many nodes available as possible before
    # trying to add a new one. This can potentially delay startup, but
    # that is okay. It is a batch system.
    #
    # Finished experiments are checked for here in the main loop, rather
    # than in the child that started the experiment, so that we fire up
    # again and look for them in the event that paper goes down.
    #
    while (%row = $running_result->fetchhash()) {
        if ($row{'canceled'}) {
            dosomething("cancel", %row);
        }
        elsif (isexpdone(%row)) {
            dosomething("end", %row);
        }
    }

    #
    # Finally start an actual experiment!
    #
    if ($pending_result->numrows) {
        dosomething("start", %pending_row);
    }
}
continue {
    sleep(30);
}

#
# Do something as the user. Either, start, end, or cancel an experiment.
#
# Arguments: the action ("start", "end" or "cancel"), followed by the
# experiment's row flattened into a key/value list. The keys read here are
# eid, pid, gid, path and expt_head_uid.
#
# In the parent: waits for the child and returns its exit status (the high
# byte of $?), or -1 if no child could be forked. The child never returns;
# it switches to the experiment creator's uid/gid and exits when done.
#
# NOTE(review): the ($$) prototype does not describe the real argument list
# (a scalar followed by a flattened hash); it is kept only so the declared
# interface is unchanged.
#
sub dosomething($$)
{
    my($dowhat)   = shift;
    my(%exphash)  = @_;
    my($unix_uid, $unix_gid);

    # Global vars
    $eid     = $exphash{'eid'};
    $pid     = $exphash{'pid'};
    $gid     = $exphash{'gid'};
    $dirname = $exphash{'path'};
    $nsfile  = "$eid.ns";

    # Locals
    my $creator = $exphash{'expt_head_uid'};

    print "Doing a '$dowhat' to batch experiment $pid/$eid\n";

    #
    # Create a temporary name for a log file. We do this in the parent so
    # we can remove it when the child ends. The child could remove it, but
    # since it is open in the child, it has the tendency to stick around.
    #
    $logname = `mktemp /tmp/$dowhat-batch-$pid-$eid.XXXXXX`;

    # Note different taint check (allow /). The $ anchor matches before the
    # trailing newline of the command output, so the capture drops it.
    if ($logname =~ /^([-\@\w.\/]+)$/) {
        $logname = $1;
    } else {
        die "Bad data in $logname";
    }

    #
    # Start up a child to run the guts. The parent waits. If the
    # experiment configures okay, the parent can return to try something
    # else.
    #
    $childpid = fork();
    if (!defined($childpid)) {
        #
        # fork() failed. We are still the daemon, so we must not fall into
        # the child code below (it changes uid and exits). Give up on this
        # request; the main loop will come around again.
        #
        print "*** Could not fork a child to $dowhat $pid/$eid: $!\n";
        unlink($logname);
        return -1;
    }
    if ($childpid) {
        print "Child PID $childpid started to $dowhat $pid/$eid\n";

        waitpid($childpid, 0);
        my $status = $? >> 8;

        print "Child PID $childpid exited with exit status $status\n";

        sleep(5);

        unlink($logname);
        return $status;
    }

    #
    # Child from here on. Point our descriptors at the log file.
    #
    openlog($logname);

    #
    # Get some user information.
    #
    if (!UserDBInfo($creator, \$user_name, \$user_email)) {
        fatal("DB Error getting user information for uid $creator");
    }

    chdir("$dirname/tbdata") or
        fatal("Could not cd into $dirname/tbdata!");

    #
    # Figure out the unix uid/gid that the experiment configuration is
    # going to run as. (The list assignment yields the number of values
    # returned by the lookup, so "or" fires when the lookup finds nothing.)
    #
    (undef,undef,$unix_uid) = getpwnam($creator) or
        fatal("No such user $creator");
    (undef,undef,$unix_gid) = getgrnam($gid) or
        fatal("No such group $gid");

    #
    # Change the ownership of the log file before we flip.
    #
    chown($unix_uid, $unix_gid, $logname) or
        fatal("Could not chown $logname to $unix_uid/$unix_gid!");

    # Flip to the user. We never flip back.
    $EGID = $GID = $unix_gid;
    $EUID = $UID = $unix_uid;
    $ENV{'USER'} = $creator;

    if ($dowhat eq "start") {
        startexp(%exphash);
    }
    elsif ($dowhat eq "end") {
        endexp(%exphash);
    }
    elsif ($dowhat eq "cancel") {
        cancelexp(1, %exphash);
    }
    exit(0);
}

#
# Try to start an experiment. Never returns.
#
# Takes the experiment row as a flattened key/value list; only 'attempts'
# is read from it. Everything else comes from the file-level variables
# ($pid, $eid, $gid, $logname, $nsfile) set up by the caller.
#
# Exits 0 when the experiment configured and was marked running; otherwise
# exits with the failure status of the startexp program (-1 if it was
# killed by a signal or could not be run at all).
#
sub startexp($)
{
    my(%exphash)  = @_;
    my($exit_status, $running, $query_result);

    my $attempts  = $exphash{'attempts'};

    #
    # Try to start the experiment.
    #
    system("$startexp -b $logname -g $gid $pid $eid $nsfile");
    my $wait_status = $?;

    #
    # The exit code is the high byte of the wait status. That byte is zero
    # when the program died from a signal, so check for abnormal
    # termination explicitly rather than mistaking it for success.
    #
    $exit_status = $wait_status >> 8;
    if ($wait_status == -1 || ($wait_status & 127)) {
        $exit_status = -1;
    }
    $running = ($exit_status ? 0 : 1);

    #
    # Look for cancelation. If we get a DB error on this, just continue cause
    # we can pick up the cancelation later.
    #
    $query_result =
        DBQueryWarn("select canceled from experiments ".
                    "where eid='$eid' and pid='$pid'");

    if ($query_result) {
        my ($canceled) = $query_result->fetchrow_array();

        if ($canceled) {
            cancelexp($running);
            #
            # Never returns, but just to be safe ...
            #
            exit(0);
        }
    }

    #
    # If the configuration failed for lack of nodes, then don't send
    # email unless the number of attempts starts to get big.
    #
    # If the configuration failed for some other reason, then send email
    # and remove the batch from the system.
    #
    if (! $running) {
        #
        # XXX - What if this update fails?
        #
        $query_result =
            DBQueryWarn("update experiments set attempts=attempts+1 ".
                        "where eid='$eid' and pid='$pid'");

        if ($exit_status == $TOOFEWNODES) {
            # Only nag the user on every 30th prior attempt.
            if ($attempts && (($attempts % 30) == 0)) {
                # Count the attempt that just failed in the message.
                $attempts++;

                my $msg =
                    "Could not configure Batch Mode experiment $pid/$eid.\n".
                    "\n".
                    "There are not enough free nodes at this time.\n".
                    "Another attempt will be made in a little while.\n".
                    "\n".
                    "There have been $attempts attempts to start this batch.";

                email_status($msg);
            }

            #
            # There is some state that needs to be reset so that another
            # attempt can be made.
            #
            SetExpState($pid, $eid, EXPTSTATE_NEW);
            TBSetBatchState($pid, $eid, $BSTATE_POSTED);

            exit($exit_status);
        }
        email_status("Experiment startup exited with error code $exit_status.".
                     "\n".
                     "Batch has been removed from the system.");
        ExptCleanup();
        exit($exit_status);
    }

    #
    # Well, it configured! Lets set it state to running.
    #
    TBSetBatchState($pid, $eid, $BSTATE_RUNNING);

    email_status("Batch Mode experiment $pid/$eid is now running!\n".
                 "Please consult the Web interface to see how it is doing.");

    #
    # Done with this phase. Must exit.
    #
    exit(0);
}
411

412 413 414 415 416 417
#
# End an experiment. Never returns.
#
# The argument list (the experiment row) is accepted but not consulted;
# the experiment is identified by the file-level $pid and $eid.
#
sub endexp($)
{
    my(%exphash)  = @_;

    #
    # Save tiplogs
    #
    system("$savelogs $pid $eid");

    #
    # Have to set the state to terminating or else endexp will not accept it.
    #
    TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);

    system("$endexp -b $pid $eid");

    #
    # Test the whole wait status, not just the exit-code byte: that byte is
    # zero when the program was killed by a signal, and we must not wipe
    # the experiment's records unless teardown really succeeded.
    #
    if ($? != 0) {
        #
        # TB admin is going to have to clean up.
        #
        fatal("Terminating Batch Mode experiment $pid/$eid");
    }

    ExptCleanup();
    email_status("Batch Mode experiment $pid/$eid has finished!");

    #
    # Child must exit!
    #
    exit(0);
}

#
# Cancel an experiment. Never returns.
#
# The first argument says whether the experiment is running, in which case
# the endexp program is run before cleanup. Any further arguments (the
# experiment row) are accepted but not consulted.
#
sub cancelexp($$)
{
    my($running, %exphash) = @_;

    TBSetBatchState($pid, $eid, $BSTATE_TERMINATING);

    system("$endexp -b $pid $eid")
        if ($running);

    ExptCleanup();
    donotify("Your Batch Mode experiment has been canceled!", "Canceled", 0);

    #
    # Child must exit!
    #
    exit(0);
}

471 472 473 474 475 476 477
#
# Check experiment status. Looks to see if all of the nodes in an
# experiment have reported in. Returns 1 if so, 0 if some node has yet to
# report (or if the query failed). Sets the file-level $eid and $pid from
# the row it is given.
#
sub isexpdone($)
{
    my(%exphash)  = @_;

    # Global vars
    ($eid, $pid) = @exphash{'eid', 'pid'};

    print "Checking to see if $pid/$eid has finished up yet\n";

    #
    # Look to see if any nodes yet to report status. If so, spin again.
    #
    my $query_result =
        DBQueryWarn("SELECT startstatus,bootstatus FROM nodes ".
                    "LEFT JOIN reserved ON nodes.node_id=reserved.node_id ".
                    "WHERE reserved.eid='$eid' and reserved.pid='$pid'");

    return 0
        if (! $query_result);

    #
    # Well, right now a node is considered finished up only if its
    # boot did not fail, and it has reported start command status.
    # The idea being that if the boot failed, then its status will
    # never be reported anyway, and we might as well consider the node
    # done (else the experiment would never end).
    #
    while (my ($startstatus, $bootstatus) = $query_result->fetchrow_array()) {
        next
            if ($bootstatus eq NODEBOOTSTATUS_FAILED);
        return 0
            if ($startstatus eq NODESTARTSTATUS_NOSTATUS);
    }
    return 1;
}

516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538
#
# Remove all trace: the experiment directory (file-level $dirname) and the
# experiment's rows in the nsfiles, exppid_access and experiments tables
# (keyed by the file-level $eid and $pid).
#
sub ExptCleanup()
{
    #
    # Run rm directly (list form, no shell) so the directory name is passed
    # as exactly one argument whatever characters it contains, and refuse
    # to run it at all without a directory name.
    #
    if (!defined($dirname) || $dirname eq "" ||
        system("rm", "-rf", $dirname)) {
        print "*** WARNING: Not able to remove experiment directory.\n";
        print "             Someone will need to do this by hand.\n";
    }

    #
    # Remove all trace from the DB.
    #
    DBQueryWarn("DELETE from nsfiles ".
                "WHERE eid='$eid' and pid='$pid'");

    DBQueryWarn("DELETE from exppid_access ".
                "WHERE exp_eid='$eid' and exp_pid='$pid'");

    DBQueryWarn("DELETE from experiments ".
                "WHERE eid='$eid' and pid='$pid'");
}

539 540
#
# Point the current process's standard descriptors at a log file: STDIN is
# reopened on /dev/null, STDERR and STDOUT are reopened in append mode on
# the named file. Returns 0; any failure goes through fatal(), which exits.
#
sub openlog($)
{
    my($logname) = $_[0];

    #
    # We have to disconnect from the caller by redirecting the standard
    # descriptors away from whatever they were inherited from. Otherwise
    # the caller will continue to wait even though the parent has exited.
    #
    # Three-argument open keeps the mode separate from the file name, so
    # nothing in the name can be taken for a mode or a command.
    #
    open(STDIN, "<", "/dev/null") or
        fatal("opening /dev/null for STDIN: $!");

    open(STDERR, ">>", $logname) or
        fatal("opening $logname for STDERR: $!");
    open(STDOUT, ">>", $logname) or
        fatal("opening $logname for STDOUT: $!");

    return 0;
}

563 564 565 566 567
#
# A fatal error is something that the user does not need to know about.
# Caused by a breakdown in the TB system. Generally speaking, once the
# experiment is running, this should not be used.
#
# Sends the message as a "Failure" notification flagged as an error, then
# exits with -1. Never returns.
#
sub fatal($)
{
    my $mesg = shift;

    donotify($mesg, "Failure", 1);
    exit(-1);
}

577 578 579
#
# Something the user cares about. Sends the message as a non-error
# "Status" notification.
#
sub email_status($)
{
    my $mesg = shift;

    donotify($mesg, "Status", 0);
}

587
#
# Print a message on STDOUT and mail it out via SENDMAIL.
#
# Arguments: the message text, a word for the subject line (e.g. "Status"),
# and a flag saying whether this is an error report.
#
sub donotify($$$)
{
    my($mesg, $subtext, $iserr) = @_;

    $mesg .= "\n";
    print STDOUT $mesg;

    my $subject = "TESTBED: Batch Mode Experiment $subtext $pid/$eid";
    my $from    = $TBOPS;

    #
    # An error goes just to Testbed Operations. Normal status messages go
    # to the user and to the Testbed Logs address.
    #
    my $to   = "$TBOPS";
    my $hdrs = "Reply-To: $TBOPS";

    if (! $iserr) {
        $to   = "$user_name <$user_email>";
        $hdrs = "Bcc: $TBLOGS\n" . $hdrs;
    }

    # The trailing arguments are file names (the log file, "assign.log" and
    # the NS file); presumably SENDMAIL attaches or appends them - confirm
    # against the testbed library.
    SENDMAIL($to, $subject, $mesg, $from, $hdrs,
             $logname, "assign.log", $nsfile);
}