batch_daemon.in 20.8 KB
Newer Older
1
#!/usr/bin/perl -w
Leigh Stoller's avatar
Leigh Stoller committed
2 3

#
4
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh Stoller's avatar
Leigh Stoller committed
24 25
#

26 27
use English;
use Getopt::Std;
28 29
use Fcntl;
use IO::Handle;
30 31 32 33 34 35

#
# Create a batch experiment.
#
# usage: batch_daemon
#
Leigh Stoller's avatar
Leigh Stoller committed
36 37
# TODO: Use "logger" instead of writing a log file.
#
38 39
sub usage()
{
40 41
    print STDOUT "Usage: batch_daemon [-d]\n" .
	"Use the -d option to prevent daemonization\n";
42 43
    exit(-1);
}
44 45 46 47

sub dosomething($$);
sub isexpdone($);

48
my  $optlist = "d";
49 50 51 52 53 54 55

#
# Configure variables
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
56
my $TBLOGS   = "@TBLOGSEMAIL@";
57

58 59 60 61 62 63 64 65
#
# Only root can run this script.
#
if ($UID) {
    die("*** $0:\n".
	"    Only root can run this script!\n");
}

66 67 68 69 70 71 72 73 74 75 76
#
# Turn off line buffering on output
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

77 78 79 80 81 82
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
83
use Template;
84
use Experiment;
85

86 87
# Be careful not to exit on transient error; 0 means infinite retry.
$libdb::DBQUERY_MAXTRIES = 0;
88

89
my $tbbindir = "$TB/bin/";
90
my $swapexp  = "$TB/bin/swapexp";
91
my $endexp   = "$TB/bin/endexp";
92
my $savelogs = "$TB/bin/savelogs";
93
my $avail    = "$TB/sbin/avail";
94
my $batchlog = "$TB/log/batchlog";
95
my $debug    = 0;
96

97 98 99
# New template stuff.
my $template_swapout  = "$TB/bin/template_swapout";

100 101 102 103 104 105 106
my $BSTATE_POSTED	= EXPTSTATE_QUEUED;
my $BSTATE_ACTIVATING	= EXPTSTATE_ACTIVATING;
my $BSTATE_RUNNING	= EXPTSTATE_ACTIVE;
my $BSTATE_TERMINATING	= EXPTSTATE_TERMINATING;
my $BSTATE_PAUSED	= EXPTSTATE_SWAPPED;
my $BSTATE_LOCKED       = BATCHSTATE_LOCKED;
my $BSTATE_UNLOCKED     = BATCHSTATE_UNLOCKED;
107
my $TBOPSPID            = TBOPSPID;
108

109 110 111 112 113 114
#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my $eid;
my $pid;
Leigh Stoller's avatar
Leigh Stoller committed
115
my $gid;
116
my $logname;
117
my $nsfile;
118 119
my $userdir;
my $workdir;
120
my $user_name  = "Testbed Operations";
121
my $user_email = "$TBOPS";
122
my $template;
123 124 125 126 127 128 129 130 131 132 133 134

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
135
if (defined($options{"d"})) {
136
    $debug = 1;
137
}
138 139

# Go to ground.
140
if (! $debug) {
141 142 143
    if (TBBackGround($batchlog)) {
	exit(0);
    }
144
}
145 146 147 148 149

#
# Loop, looking for batch experiments that want to run.
# 
while (1) {
150
    my($count, $i, $query_result, $pending_result, $running_result);
151
    my(%pending_row);
152
    my $retry_wait = 60;
153
    my $disabled;
154

155 156 157 158 159 160 161 162 163
    if (! TBGetSiteVar("web/nologins", \$disabled)) {
	print "Error getting sitevar 'web/nologins'. Waiting a bit ...\n";
	goto pause;
    }
    if ($disabled) {
	print "Web interface is disabled; waiting a bit ...\n";
	sleep(60);
	goto pause;
    }
164 165 166 167
    if (! TBGetSiteVar("batch/retry_wait", \$retry_wait)) {
	print "Error getting sitevar 'batch/retry_wait'. Waiting a bit ...\n";
	goto pause;
    }
168 169 170
    # Do not allow zero!
    $retry_wait = 60
	if ($retry_wait == 0);
Leigh Stoller's avatar
Leigh Stoller committed
171

172
    #
173
    # Need to lock the table here because of cancelation in endexp.
174 175 176 177 178 179
    # See the comments in there. We need to atomically grab the next
    # batch experiment we want to try, and then change its state from
    # new to configuring. We want to grab just one experiment, since
    # it takes a while to configure an experiment, and grabbing a bunch and
    # locking them up might result in having to wait a really long time
    # to cancel a batch experiment that hasn't really tried to start yet!
180
    # Thats would ne annoying to users.
181
    #
182
    # So, now you're wondering what my selection criteria is? Well, its
183 184
    # damn simplistic. I set the "started" datetime field each attempt,
    # and I pick the batch_experiment with the oldest time, thereby cycling
185 186 187 188
    # through in a "least recently attempted" manner. In addition, we want
    # to throttle the number simultaneous batches that one person can
    # can have running at a time (curently to just one batch), so check to
    # to see if the person has another batch active (thats e2 below).
189 190
    #
    $query_result =
191 192
	DBQuery("lock tables experiments write, experiments as e1 write, ".
		"experiments as e2 write");
193 194
    if (! $query_result) {
	print "DB Error locking tables. Waiting a bit ...\n";
Leigh Stoller's avatar
Leigh Stoller committed
195
	goto pause;
196 197 198
    }
    
    $pending_result =
199 200 201
	DBQueryWarn("SELECT e1.* FROM experiments as e1 ".
		    "left join experiments as e2 on ".
		    " e2.expt_head_uid=e1.expt_head_uid and ".
202
		    " e2.batchmode=1 and e2.state='$BSTATE_RUNNING' and ".
203 204
		    " e1.pid=e2.pid and e1.eid!=e2.eid and ".
		    " e2.pid!='$TBOPSPID' ".
205 206
		    "WHERE e2.eid is null and ".
		    " e1.batchmode=1 and e1.canceled=0 and ".
207
		    " e1.expt_locked is null and ".
208
		    " e1.batchstate!='$BSTATE_LOCKED' and ".
209
		    " e1.state='$BSTATE_POSTED' and ".
210 211
		    "  (e1.attempts=0 or ".
		    "    ((UNIX_TIMESTAMP() - ".
212
		    "      UNIX_TIMESTAMP(e1.expt_start) > ($retry_wait)))) ".
213
		    "ORDER BY e1.expt_start LIMIT 1");
214

215
    $running_result =
216
	DBQuery("select * from experiments ".
217 218
		"where batchmode=1 and state='$BSTATE_RUNNING' and ".
		"      batchstate!='$BSTATE_LOCKED' ".
219
		"ORDER BY expt_start");
220 221 222

    if (!$pending_result || !$running_result) {
	print "DB Error getting batch info. Waiting a bit ...\n";
223
	DBQuery("unlock tables");
Leigh Stoller's avatar
Leigh Stoller committed
224
	goto pause;
225 226 227
    }

    if (!$pending_result->numrows && !$running_result->numrows) {
228
	DBQuery("unlock tables");
Leigh Stoller's avatar
Leigh Stoller committed
229
	goto pause;
230 231 232
    }

    #
233 234
    # If we have a pending experiment to run, the lock it right away,
    # while we have the tables locked. 
235
    #
236 237 238 239 240 241 242 243
    if ($pending_result->numrows) {
	%pending_row = $pending_result->fetchhash();

	# Local vars!
	my $eid = $pending_row{'eid'};
	my $pid = $pending_row{'pid'};

	$query_result = 
244
	    DBQuery("update experiments set ".
245
		    "       expt_locked=now(),expt_start=now(), ".
246
		    "       batchstate='$BSTATE_LOCKED' ".
247
		    "where eid='$eid' and pid='$pid'");
248 249 250

	if (! $query_result) {
	    print "DB error setting batch $pid/$eid to configuring.\n";
251
	    DBQuery("unlock tables");
Leigh Stoller's avatar
Leigh Stoller committed
252
	    goto pause;
253 254
	}
    }
255

256 257 258 259 260 261 262 263 264
    #
    # Okay, first we check the status of running batch mode experiments
    # since we want to end those before trying to start any new ones, cause
    # it would be nice to have as many nodes available as possible before
    # trying to add a new one. This can potentially delay startup, but thats
    # okay. Its a batch system.
    #
    # If you are wondering why I check for finished experiments in the main
    # loop instead of in the child that started the experiment, its so that
265
    # we fire up again and look for them in the event that boss goes down.
266
    #
267 268 269 270
    if (!$running_result->numrows) {
	DBQueryWarn("unlock tables");
    }
    else {
271
	my %canterm = ();
272 273 274

        # Need to do this when we want to seek around inside the results.
	$running_result = $running_result->WrapForSeek();
275 276 277 278 279
	
	while (my %running_row = $running_result->fetchhash()) {
	    # Local vars!
	    my $eid         = $running_row{'eid'};
	    my $pid         = $running_row{'pid'};
280

281 282 283 284 285 286 287 288 289 290 291
	    #
	    # Lock so user cannot mess with it.
	    # 
	    $query_result = 
		DBQuery("update experiments set ".
			"       expt_locked=now(), ".
			"       batchstate='$BSTATE_LOCKED' ".
			"where eid='$eid' and pid='$pid'");

	    $canterm{"$pid:$eid"} = ($query_result ? 1 : 0);
	}
292 293
	DBQueryWarn("unlock tables");

294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
	#
	# Reset and go through again.
	#
	$running_result->dataseek(0);
	
	while (my %running_row = $running_result->fetchhash()) {
	    my $canceled    = $running_row{'canceled'};
	    # Local vars!
	    my $eid         = $running_row{'eid'};
	    my $pid         = $running_row{'pid'};
	    my $canterm     = $canterm{"$pid:$eid"};

	    next
		if (!$canterm);

309 310 311
	    if ($canceled) {
		# Look at the cancel flag.
		if ($canceled == EXPTCANCEL_TERM) {
312
		    dosomething("cancel", \%running_row);
313 314
		}
		elsif ($canceled == EXPTCANCEL_SWAP) {
315
		    dosomething("swap", \%running_row);
316 317 318 319
		}
		else {
		    print "Improper cancel flag: $canceled\n";
		}
320 321
	    }
	    else {
322
		if (isexpdone(\%running_row)) {
323 324 325
		    #
		    # Terminate the experiment. 
		    # 
326
		    dosomething("swap", \%running_row);
327 328 329 330 331 332 333
		}
		else {
		    #
		    # Unlock.
		    # 
		    TBBatchUnLockExp($pid, $eid);
		}
334
	    }
335 336 337 338 339 340
	}
    }
    #
    # Finally start an actual experiment!
    #
    if ($pending_result->numrows) {
341
	dosomething("start", \%pending_row);
342
    }
Leigh Stoller's avatar
Leigh Stoller committed
343
  pause:
344
    sleep(15);
345 346 347
}

#
348
# Do something as the user. Either, start, end, or cancel an experiment.
349
#
350
sub dosomething($$)
351
{
352
    my($dowhat,$exphash)   = @_;
Leigh Stoller's avatar
Leigh Stoller committed
353
    my($unix_uid, $unix_gid, $row, $query_result);
354 355

    # Global vars
356 357 358 359
    $eid     = $exphash->{'eid'};
    $pid     = $exphash->{'pid'};
    $gid     = $exphash->{'gid'};
    $userdir = $exphash->{'path'};
360
    $workdir = TBExptWorkDir($pid, $eid);
361
    $nsfile  = "$eid.ns";
362
    $template= undef;
363 364
    
    # Locals
365 366 367 368 369
    my $experiment = Experiment->Lookup($pid, $eid);
    if (! defined($experiment)) {
	print "Could not find experiment object! Skipping ...\n";
	return -1;
    }
370 371 372 373 374 375 376
    my $creator = $experiment->GetCreator();
    if (! defined($creator)) {
	print "Could not find creator object! Skipping ...\n";
	return -1;
    }
    my $creator_uid = $creator->uid();
    
377 378
    print "Doing a '$dowhat' to batch experiment $pid/$eid\n";

379 380 381
    # Clean before starting.
    $experiment->CleanLogFiles();

382 383 384 385 386
    #
    # Create a temporary name for a log file. We do this in the parent so
    # we can remove it when the child ends. The child could remove it, but
    # since it is open in the child, it has the tendency to stick around.
    #
387 388 389 390 391 392
    my $logfile = $experiment->CreateLogFile("${dowhat}-batch");
    if (!defined($logfile)) {
	print "Could not create logfile!\n";
	return -1;
    }
    $logname = $logfile->filename();
393

394 395 396 397
    my $exptidx  = $exphash->{'idx'};
    my $instance = Template::Instance->LookupByExptidx($exptidx);
    if (defined($instance)) {
	$template = $instance->template();
398 399
    }

400 401 402
    #
    # Start up a child to run the guts. The parent waits. If the
    # experiment configures okay, the parent can return to try something
403
    # else.
404 405 406
    #
    $childpid = fork();
    if ($childpid) {
407

408
	print TBDateTimeFSSafe() . "\n";
409 410
	print "Child PID $childpid started to $dowhat $pid/$eid\n";

411
	waitpid($childpid, 0);
Leigh Stoller's avatar
Leigh Stoller committed
412
	my $status = $? >> 8;
413

414
	print TBDateTimeFSSafe() . "\n";
Leigh Stoller's avatar
Leigh Stoller committed
415
	print "Child PID $childpid exited with exit status $status\n";
416

417
        #
418
        # Close the log file.
419 420
	# The exp dir might be gone if the batch was killed/canceled.
        #
421
	if (-e $userdir) {
422
	    $experiment->CloseLogFile();
423
	}
Leigh Stoller's avatar
Leigh Stoller committed
424
	return $status;
425
    }
426
    openlog($logname);
427 428 429 430
    # We want it to spew to the web.
    $experiment->SetLogFile($logfile);
    # And mark it as spewing.
    $experiment->OpenLogFile();
431

432 433 434
    #
    # Get some user information. 
    #
435 436
    $user_name  = $creator->name();
    $user_email = $creator->email();
437
    
438 439
    chdir("$workdir") or
	fatal("Could not cd into $workdir!");
440 441 442 443 444

    #
    # Figure out the unix uid/gid that the experiment configuration is
    # going to run as. 
    #
445 446
    (undef,undef,$unix_uid) = getpwnam($creator_uid) or
	fatal("No such user $creator_uid");
Leigh Stoller's avatar
Leigh Stoller committed
447 448 449 450 451
    my $group = $experiment->GetGroup();
    if (!defined($group)) {
	fatal("Could not get group object for $experiment");
    }
    $unix_gid = $group->unix_gid();
452

453 454 455 456 457
    #
    # Need the entire group list for the user, cause of subgroups, and cause
    # thats the correct thing to do. Too bad perl does not have a getgrouplist
    # function like the C library.
    #
458
    my $glist = `id -G $creator_uid`;
459 460 461 462
    if ($glist =~ /^([\d ]*)$/) {
	$glist = $1;
    }
    else {
463
	fatal("Unexpected results from 'id -G $creator_uid': $glist");
464 465 466 467 468 469 470 471 472
    }
    #
    # Remove current group from glist, then add gid twice at the front
    # of the list. Order matters here, or we won't pick up all the
    # groups we need.
    #
    $glist =~ s/ ?\b$unix_gid\b ?//;
    $glist = $unix_gid . " " . $unix_gid . " " . $glist;

473 474 475
    #
    # Change the ownership of the log file before we flip.
    #
476 477
    chown($unix_uid, $unix_gid, $logname) or
	fatal("Could not chown $logname to $unix_uid/$unix_gid!");
478 479

    # Flip to the user. We never flip back.
480 481 482
    $GID            = $unix_gid;
    $EGID           = $glist;
    $EUID = $UID    = $unix_uid;
483 484
    $ENV{'USER'}    = $creator_uid;
    $ENV{'LOGNAME'} = $creator_uid;
485
    
486
    if ($dowhat eq "start") {
487
	startexp($exphash);
488
    }
489
    elsif ($dowhat eq "swap") {
490
	swapexp($exphash);
491 492
    }
    elsif ($dowhat eq "cancel") {
493
	cancelexp($exphash);
494 495 496 497 498 499 500 501 502
    }
    exit(0);
}

#
# Try to start an experiment. Never returns.
# 
sub startexp($)
{
503
    my($exphash)  = @_;
504
    my($exit_status, $running, $query_result);
505

506
    my $attempts  = $exphash->{'attempts'};
507

508
    #
509
    # Try to swap the experiment in. 
510
    #
511
    system("$swapexp -b -s in $pid $eid");
512 513
    $exit_status = $? >> 8;
    $running     = 1;
514 515 516 517 518
    if ($exit_status) {
	$running = 0;
    }
    
    #
519 520
    # Look for cancelation. If we get a DB error on this, just continue cause
    # we can pick up the cancelation later.
521 522
    #
    $query_result =
523
	DBQueryWarn("select canceled from experiments ".
524
		    "where eid='$eid' and pid='$pid'");
525

526
    if ($query_result) {
527
	my ($canceled) = $query_result->fetchrow_array();
528
	$exphash->{'canceled'} = $canceled;
529

530
	# Yuck: This is strictly for the benefit of swapexp() below.
531
	$exphash->{'state'} = EXPTSTATE_ACTIVE
532 533 534 535
	    if ($running);
	
	if ($canceled) {
	    # Look at the cancel flag.
536
	    if ($canceled == EXPTCANCEL_TERM) {
537
		cancelexp($exphash);
538
	    }
539 540
	    elsif ($canceled == EXPTCANCEL_SWAP ||
		   $canceled == EXPTCANCEL_DEQUEUE) {
541
		swapexp($exphash);
542 543 544 545
	    }
	    else {
		print "Improper cancel flag: $canceled\n";
	    }
546 547 548
	    #
	    # Never returns, but just to be safe ...
	    #
549
	    exit(-1);
550 551
	}
    }
552 553 554 555

    #
    # If the configuration failed for lack of nodes, then don't send
    # email unless the number of attempts starts to get big.
556
    #
557 558 559
    # If the configuration failed for some other reason, then send email.
    # We have to reset the state to "new" so that it will be retried again
    # later. 
560 561
    #
    if (! $running) {
562 563 564 565
	#
	# XXX - What if this update fails?
	# 
	$query_result = 
566
	    DBQueryWarn("update experiments set attempts=attempts+1 ".
567
			"where eid='$eid' and pid='$pid'");
568

569
	#
570 571 572 573
	# The exit status does not tell us if the experiment can ever be
	# mapped. In fact, that is really hard to know at this level;
	# it depends on what resources the testbed actually has. So,
	# unless status is -1 (really really fatal) just keep going.
574 575
	# We leave it up the user to kill the batch if it looks like its
	# never going to work.
576
	#
577
	if ($exit_status == 255) {
578
	    TBBatchUnLockExp($pid, $eid, EXPTSTATE_SWAPPED());
579
	    
580
	    email_status("Experiment startup has failed with a fatal error!\n".
581 582
			 "Batch has been dequeued so that you may check it.");

583
	    exit($exit_status);
584
	}
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
	
	if (($attempts % 30) == 0) {
	    $attempts++;

	    my $msg =
		"Could not configure Batch Mode experiment $pid/$eid.\n".
		"\n".
		"There was an assignment violation (please check the log)\n".
		"that prevented it from being scheduled. The violation\n".
		"might result from not enough nodes or not enough link\n".
		"bandwidth. If you feel that the violation is in error,\n".
		"please cancel the batch and notify $TBOPS\n".
		"Otherwise, another attempt will be made in a little while.\n".
		"\n". 
		"There have been $attempts attempts to start this batch.";

	    email_status($msg);
	}
603
	TBBatchUnLockExp($pid, $eid, EXPTSTATE_QUEUED());
604
	exit($exit_status);
605 606 607
    }

    #
608
    # Well, it configured! We can now unlock it.
609
    #
610
    TBBatchUnLockExp($pid, $eid);
611
    
612
    email_status("Batch Mode experiment $pid/$eid is now running!\n".
613
		 "Please consult the Web interface to see how it is doing.");
Leigh Stoller's avatar
Leigh Stoller committed
614

615
    #
616
    # Done with this phase. Must exit.
617
    #
618 619
    exit(0);
}
620

621
#
622
# A batch has completed. Swap it out. 
623
#
624
sub swapexp($;$)
625
{
626 627 628
    my($exphash)  = @_;
    my $canceled  = $exphash->{'canceled'};
    my $running   = ($exphash->{'state'} eq EXPTSTATE_ACTIVE);
629

630
    if ($running) {
631 632 633 634 635
	if (defined($template)) {
	    my $guid = $template->guid();
	    my $vers = $template->vers();
	    
	    system("$template_swapout -b -e $eid $guid/$vers");
636 637 638 639
	}
	else {
	    system("$swapexp -b -s out $pid $eid");
	}
640 641 642 643 644 645
	if ($?) {
	    #
	    # TB admin is going to have to clean up. 
	    # 
	    fatal("Swapping out Batch Mode experiment $pid/$eid");
	}
646 647 648 649 650
    }
    #
    # Set the state to paused to ensure that it is not run again until
    # the user wants to.
    # 
651 652
    TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR);
    TBBatchUnLockExp($pid, $eid);
653
    
654 655 656 657 658
    if ($canceled) {
	email_status("Batch Mode experiment $pid/$eid has been stopped!");
    }
    else {
	email_status("Batch Mode experiment $pid/$eid has finished!");
659
    }
660
   
661
    #
662
    # Child must exit!
663
    #
664 665 666 667 668 669
    exit(0);
}

#
# Cancel an experiment. Never returns.
#
670
sub cancelexp($)
671
{
672
    my($exphash) = @_;
673
    
674 675 676 677
    #
    # It does not matter if the experiment is running; endexp does the
    # right thing.
    # 
678 679 680 681 682
    if (defined($template)) {
	my $guid = $template->guid();
	my $vers = $template->vers();
	
	system("$template_swapout -b -e $eid $guid/$vers");
683 684
    }
    else {
685
	system("$endexp -b $pid,$eid");
686
    }
687 688 689 690 691
    if ($?) {
	#
	# TB admin is going to have to clean up. 
	# 
	fatal("Terminating Batch Mode experiment $pid/$eid");
692
    }
693
    donotify("Your Batch Mode experiment has been canceled!", "Canceled", 0);
694 695 696 697 698 699 700
   
    #
    # Child must exit!
    #
    exit(0);
}

701 702 703 704 705 706
#
# Check experiment status. Looks to see if all of the nodes in an
# experiment have reported in.
#
sub isexpdone($)
{
707
    my($exphash)  = @_;
708
    my($query_result, @row);
709 710
    
    # Global vars
711 712
    $eid = $exphash->{'eid'};
    $pid = $exphash->{'pid'};
713

714
    print TBDateTimeFSSafe() . "\n";
715 716 717 718 719 720
    print "Checking to see if $pid/$eid has finished up yet\n";

    #
    # Look to see if any nodes yet to report status. If so, spin again.
    #
    $query_result =
721 722 723
	DBQueryWarn("SELECT startstatus,bootstatus FROM nodes ".
		    "LEFT JOIN reserved ON nodes.node_id=reserved.node_id ".
		    "WHERE reserved.eid='$eid' and reserved.pid='$pid'");
724 725 726 727 728

    if (! $query_result) {
	return 0;
    }

729 730 731 732 733 734 735 736 737 738 739 740 741
    #
    # Well, right now a node is considered finished up only if its
    # boot did not fail, and it has reported start command status.
    # The idea being that if the boot failed, then its status will
    # never be reported anyway, and we might as well consider the node
    # done (else the experiment would never end).
    # 
    while (@row = $query_result->fetchrow_array()) {
	if ($row[1] eq NODEBOOTSTATUS_FAILED) {
	    next;
	}
	if ($row[0] eq NODESTARTSTATUS_NOSTATUS) {
	    return 0;
742 743
	}
    }
744
    return 1;
745 746 747 748
}

#
# Start up a child, and set its descriptors talking to a log file.
Leigh Stoller's avatar
Leigh Stoller committed
749
# The log file already exists, created with mktemp above.
750 751 752 753 754 755 756
# 
sub openlog($)
{
    my($logname) = $_[0];
	
    #
    # We have to disconnect from the caller by redirecting both STDIN and
Leigh Stoller's avatar
Leigh Stoller committed
757 758
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited. 
759 760
    #
    open(STDIN, "< /dev/null") or
761
	fatal("opening /dev/null for STDIN: $!");
762 763 764 765 766 767

    open(STDERR, ">> $logname") or
	fatal("opening $logname for STDERR: $!");
    open(STDOUT, ">> $logname") or
	fatal("opening $logname for STDOUT: $!");

768 769 770
    STDOUT->autoflush(1);
    STDERR->autoflush(1);

771 772 773
    return 0;
}

774 775 776 777 778
#
# A fatal error is something that the user does not need to know about.
# Caused by a breakdown in the TB system. Generally speaking, once the
# experiment is running, this should not be used.
# 
779
sub fatal($)
780 781 782
{
    my($mesg) = $_[0];

783
    donotify($mesg, "Failure", 1);
784 785 786 787

    exit(-1);
}

788 789 790
#
# Something the user cares about. 
# 
791
sub email_status($)
792
{
793
    my($mesg) = $_[0];
794

795
    donotify($mesg, "Status", 0);
796 797
}

798
sub donotify($$$)
799
{
800
    my($mesg, $subtext, $iserr) = @_;
801
    my($subject, $from, $to, $hdrs);
802
    my $MAIL;
803

804 805 806
    $mesg = "$mesg\n";

    print STDOUT "$mesg";
807

808
    $subject = "Batch Mode Experiment $subtext $pid/$eid";
809
    $from    = $TBOPS;
810 811 812 813 814 815
    $hdrs    = "Reply-To: $TBOPS";
    
    #
    # An error goes just to Testbed Operations. Normal status messages go
    # to the user and to the Testbed Logs address.
    # 
816
    if ($iserr) {
817
	$to = "$TBOPS";
818 819
    }
    else {
820 821 822
	$to   = "$user_name <$user_email>";
	$hdrs = "Bcc: $TBLOGS\n".
	        "$hdrs";
823 824
    }

825
    SENDMAIL($to, $subject, $mesg, $from, $hdrs,
826
	     ($logname, "assign.log", $nsfile));
827
}
828