batch_daemon.in 20.1 KB
Newer Older
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3 4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6 7 8
# All rights reserved.
#

9 10
use English;
use Getopt::Std;
11 12
use Fcntl;
use IO::Handle;
13 14 15 16 17 18

#
# Create a batch experiment.
#
# usage: batch_daemon
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
19 20
# TODO: Use "logger" instead of writing a log file.
#
21 22
sub usage()
{
23 24
    print STDOUT "Usage: batch_daemon [-d]\n" .
	"Use the -d option to prevent daemonization\n";
25 26
    exit(-1);
}
27 28 29 30

sub dosomething($$);
sub isexpdone($);

31
my  $optlist = "d";
32 33 34 35 36 37 38

#
# Configure variables
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
39
my $TBLOGS   = "@TBLOGSEMAIL@";
40

41 42 43 44 45 46 47 48
#
# Only root can run this script.
#
if ($UID) {
    die("*** $0:\n".
	"    Only root can run this script!\n");
}

49 50 51 52 53 54 55 56 57 58 59
#
# Turn off line buffering on output
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

60 61 62 63 64 65
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
66
use Template;
67
use Experiment;
68

69 70
# Be careful not to exit on transient error; 0 means infinite retry.
$libdb::DBQUERY_MAXTRIES = 0;
71

72
my $tbbindir = "$TB/bin/";
73
my $swapexp  = "$TB/bin/swapexp";
74
my $endexp   = "$TB/bin/endexp";
75
my $savelogs = "$TB/bin/savelogs";
76
my $avail    = "$TB/sbin/avail";
77
my $batchlog = "$TB/log/batchlog";
78
my $debug    = 0;
79

80 81 82
# New template stuff.
my $template_swapout  = "$TB/bin/template_swapout";

83 84 85 86 87 88 89
my $BSTATE_POSTED	= EXPTSTATE_QUEUED;
my $BSTATE_ACTIVATING	= EXPTSTATE_ACTIVATING;
my $BSTATE_RUNNING	= EXPTSTATE_ACTIVE;
my $BSTATE_TERMINATING	= EXPTSTATE_TERMINATING;
my $BSTATE_PAUSED	= EXPTSTATE_SWAPPED;
my $BSTATE_LOCKED       = BATCHSTATE_LOCKED;
my $BSTATE_UNLOCKED     = BATCHSTATE_UNLOCKED;
90
my $TBOPSPID            = TBOPSPID;
91

92 93 94 95 96 97
#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my $eid;
my $pid;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
98
my $gid;
99
my $logname;
100
my $nsfile;
101 102
my $userdir;
my $workdir;
103
my $user_name  = "Testbed Operations";
104
my $user_email = "$TBOPS";
105
my $template;
106 107 108 109 110 111 112 113 114 115 116 117

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
118
if (defined($options{"d"})) {
119
    $debug = 1;
120
}
121 122

# Go to ground.
123
if (! $debug) {
124 125 126
    if (TBBackGround($batchlog)) {
	exit(0);
    }
127
}
128 129 130 131 132

#
# Loop, looking for batch experiments that want to run.
# 
while (1) {
133
    my($count, $i, $query_result, $pending_result, $running_result);
134
    my(%pending_row);
135
    my $retry_wait = 60;
136
    my $disabled;
137

138 139 140 141 142 143 144 145 146
    if (! TBGetSiteVar("web/nologins", \$disabled)) {
	print "Error getting sitevar 'web/nologins'. Waiting a bit ...\n";
	goto pause;
    }
    if ($disabled) {
	print "Web interface is disabled; waiting a bit ...\n";
	sleep(60);
	goto pause;
    }
147 148 149 150
    if (! TBGetSiteVar("batch/retry_wait", \$retry_wait)) {
	print "Error getting sitevar 'batch/retry_wait'. Waiting a bit ...\n";
	goto pause;
    }
151 152 153
    # Do not allow zero!
    $retry_wait = 60
	if ($retry_wait == 0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
154

155
    #
156
    # Need to lock the table here because of cancelation in endexp.
157 158 159 160 161 162
    # See the comments in there. We need to atomically grab the next
    # batch experiment we want to try, and then change its state from
    # new to configuring. We want to grab just one experiment, since
    # it takes a while to configure an experiment, and grabbing a bunch and
    # locking them up might result in having to wait a really long time
    # to cancel a batch experiment that hasn't really tried to start yet!
163
    # Thats would ne annoying to users.
164
    #
165
    # So, now you're wondering what my selection criteria is? Well, its
166 167
    # damn simplistic. I set the "started" datetime field each attempt,
    # and I pick the batch_experiment with the oldest time, thereby cycling
168 169 170 171
    # through in a "least recently attempted" manner. In addition, we want
    # to throttle the number simultaneous batches that one person can
    # can have running at a time (curently to just one batch), so check to
    # to see if the person has another batch active (thats e2 below).
172 173
    #
    $query_result =
174 175
	DBQuery("lock tables experiments write, experiments as e1 write, ".
		"experiments as e2 write");
176 177
    if (! $query_result) {
	print "DB Error locking tables. Waiting a bit ...\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
178
	goto pause;
179 180 181
    }
    
    $pending_result =
182 183 184
	DBQueryWarn("SELECT e1.* FROM experiments as e1 ".
		    "left join experiments as e2 on ".
		    " e2.expt_head_uid=e1.expt_head_uid and ".
185
		    " e2.batchmode=1 and e2.state='$BSTATE_RUNNING' and ".
186 187
		    " e1.pid=e2.pid and e1.eid!=e2.eid and ".
		    " e2.pid!='$TBOPSPID' ".
188 189
		    "WHERE e2.eid is null and ".
		    " e1.batchmode=1 and e1.canceled=0 and ".
190
		    " e1.expt_locked is null and ".
191
		    " e1.batchstate!='$BSTATE_LOCKED' and ".
192
		    " e1.state='$BSTATE_POSTED' and ".
193 194
		    "  (e1.attempts=0 or ".
		    "    ((UNIX_TIMESTAMP() - ".
195
		    "      UNIX_TIMESTAMP(e1.expt_start) > ($retry_wait)))) ".
196
		    "ORDER BY e1.expt_start LIMIT 1");
197

198
    $running_result =
199
	DBQuery("select * from experiments ".
200 201
		"where batchmode=1 and state='$BSTATE_RUNNING' and ".
		"      batchstate!='$BSTATE_LOCKED' ".
202
		"ORDER BY expt_start");
203 204 205

    if (!$pending_result || !$running_result) {
	print "DB Error getting batch info. Waiting a bit ...\n";
206
	DBQuery("unlock tables");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
207
	goto pause;
208 209 210
    }

    if (!$pending_result->numrows && !$running_result->numrows) {
211
	DBQuery("unlock tables");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
212
	goto pause;
213 214 215
    }

    #
216 217
    # If we have a pending experiment to run, the lock it right away,
    # while we have the tables locked. 
218
    #
219 220 221 222 223 224 225 226
    if ($pending_result->numrows) {
	%pending_row = $pending_result->fetchhash();

	# Local vars!
	my $eid = $pending_row{'eid'};
	my $pid = $pending_row{'pid'};

	$query_result = 
227
	    DBQuery("update experiments set ".
228
		    "       expt_locked=now(),expt_start=now(), ".
229
		    "       batchstate='$BSTATE_LOCKED' ".
230
		    "where eid='$eid' and pid='$pid'");
231 232 233

	if (! $query_result) {
	    print "DB error setting batch $pid/$eid to configuring.\n";
234
	    DBQuery("unlock tables");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
235
	    goto pause;
236 237
	}
    }
238

239 240 241 242 243 244 245 246 247
    #
    # Okay, first we check the status of running batch mode experiments
    # since we want to end those before trying to start any new ones, cause
    # it would be nice to have as many nodes available as possible before
    # trying to add a new one. This can potentially delay startup, but thats
    # okay. Its a batch system.
    #
    # If you are wondering why I check for finished experiments in the main
    # loop instead of in the child that started the experiment, its so that
248
    # we fire up again and look for them in the event that boss goes down.
249
    #
250 251 252 253
    if (!$running_result->numrows) {
	DBQueryWarn("unlock tables");
    }
    else {
254
	my %canterm = ();
255 256 257

        # Need to do this when we want to seek around inside the results.
	$running_result = $running_result->WrapForSeek();
258 259 260 261 262
	
	while (my %running_row = $running_result->fetchhash()) {
	    # Local vars!
	    my $eid         = $running_row{'eid'};
	    my $pid         = $running_row{'pid'};
263

264 265 266 267 268 269 270 271 272 273 274
	    #
	    # Lock so user cannot mess with it.
	    # 
	    $query_result = 
		DBQuery("update experiments set ".
			"       expt_locked=now(), ".
			"       batchstate='$BSTATE_LOCKED' ".
			"where eid='$eid' and pid='$pid'");

	    $canterm{"$pid:$eid"} = ($query_result ? 1 : 0);
	}
275 276
	DBQueryWarn("unlock tables");

277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
	#
	# Reset and go through again.
	#
	$running_result->dataseek(0);
	
	while (my %running_row = $running_result->fetchhash()) {
	    my $canceled    = $running_row{'canceled'};
	    # Local vars!
	    my $eid         = $running_row{'eid'};
	    my $pid         = $running_row{'pid'};
	    my $canterm     = $canterm{"$pid:$eid"};

	    next
		if (!$canterm);

292 293 294
	    if ($canceled) {
		# Look at the cancel flag.
		if ($canceled == EXPTCANCEL_TERM) {
295
		    dosomething("cancel", \%running_row);
296 297
		}
		elsif ($canceled == EXPTCANCEL_SWAP) {
298
		    dosomething("swap", \%running_row);
299 300 301 302
		}
		else {
		    print "Improper cancel flag: $canceled\n";
		}
303 304
	    }
	    else {
305
		if (isexpdone(\%running_row)) {
306 307 308
		    #
		    # Terminate the experiment. 
		    # 
309
		    dosomething("swap", \%running_row);
310 311 312 313 314 315 316
		}
		else {
		    #
		    # Unlock.
		    # 
		    TBBatchUnLockExp($pid, $eid);
		}
317
	    }
318 319 320 321 322 323
	}
    }
    #
    # Finally start an actual experiment!
    #
    if ($pending_result->numrows) {
324
	dosomething("start", \%pending_row);
325
    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
326
  pause:
327
    sleep(15);
328 329 330
}

#
331
# Do something as the user. Either, start, end, or cancel an experiment.
332
#
333
sub dosomething($$)
334
{
335
    my($dowhat,$exphash)   = @_;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
336
    my($unix_uid, $unix_gid, $row, $query_result);
337 338

    # Global vars
339 340 341 342
    $eid     = $exphash->{'eid'};
    $pid     = $exphash->{'pid'};
    $gid     = $exphash->{'gid'};
    $userdir = $exphash->{'path'};
343
    $workdir = TBExptWorkDir($pid, $eid);
344
    $nsfile  = "$eid.ns";
345
    $template= undef;
346 347
    
    # Locals
348 349 350 351 352
    my $experiment = Experiment->Lookup($pid, $eid);
    if (! defined($experiment)) {
	print "Could not find experiment object! Skipping ...\n";
	return -1;
    }
353 354 355 356 357 358 359
    my $creator = $experiment->GetCreator();
    if (! defined($creator)) {
	print "Could not find creator object! Skipping ...\n";
	return -1;
    }
    my $creator_uid = $creator->uid();
    
360 361
    print "Doing a '$dowhat' to batch experiment $pid/$eid\n";

362 363 364
    # Clean before starting.
    $experiment->CleanLogFiles();

365 366 367 368 369
    #
    # Create a temporary name for a log file. We do this in the parent so
    # we can remove it when the child ends. The child could remove it, but
    # since it is open in the child, it has the tendency to stick around.
    #
370 371 372 373 374 375
    my $logfile = $experiment->CreateLogFile("${dowhat}-batch");
    if (!defined($logfile)) {
	print "Could not create logfile!\n";
	return -1;
    }
    $logname = $logfile->filename();
376

377 378 379 380
    my $exptidx  = $exphash->{'idx'};
    my $instance = Template::Instance->LookupByExptidx($exptidx);
    if (defined($instance)) {
	$template = $instance->template();
381 382
    }

383 384 385
    #
    # Start up a child to run the guts. The parent waits. If the
    # experiment configures okay, the parent can return to try something
386
    # else.
387 388 389
    #
    $childpid = fork();
    if ($childpid) {
390

391
	print TBDateTimeFSSafe() . "\n";
392 393
	print "Child PID $childpid started to $dowhat $pid/$eid\n";

394
	waitpid($childpid, 0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
395
	my $status = $? >> 8;
396

397
	print TBDateTimeFSSafe() . "\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
398
	print "Child PID $childpid exited with exit status $status\n";
399

400
        #
401
        # Close the log file.
402 403
	# The exp dir might be gone if the batch was killed/canceled.
        #
404
	if (-e $userdir) {
405
	    $experiment->CloseLogFile();
406
	}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
407
	return $status;
408
    }
409
    openlog($logname);
410 411 412 413
    # We want it to spew to the web.
    $experiment->SetLogFile($logfile);
    # And mark it as spewing.
    $experiment->OpenLogFile();
414

415 416 417
    #
    # Get some user information. 
    #
418 419
    $user_name  = $creator->name();
    $user_email = $creator->email();
420
    
421 422
    chdir("$workdir") or
	fatal("Could not cd into $workdir!");
423 424 425 426 427

    #
    # Figure out the unix uid/gid that the experiment configuration is
    # going to run as. 
    #
428 429
    (undef,undef,$unix_uid) = getpwnam($creator_uid) or
	fatal("No such user $creator_uid");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
430 431 432 433 434
    my $group = $experiment->GetGroup();
    if (!defined($group)) {
	fatal("Could not get group object for $experiment");
    }
    $unix_gid = $group->unix_gid();
435

436 437 438 439 440
    #
    # Need the entire group list for the user, cause of subgroups, and cause
    # thats the correct thing to do. Too bad perl does not have a getgrouplist
    # function like the C library.
    #
441
    my $glist = `id -G $creator_uid`;
442 443 444 445
    if ($glist =~ /^([\d ]*)$/) {
	$glist = $1;
    }
    else {
446
	fatal("Unexpected results from 'id -G $creator_uid': $glist");
447 448 449 450 451 452 453 454 455
    }
    #
    # Remove current group from glist, then add gid twice at the front
    # of the list. Order matters here, or we won't pick up all the
    # groups we need.
    #
    $glist =~ s/ ?\b$unix_gid\b ?//;
    $glist = $unix_gid . " " . $unix_gid . " " . $glist;

456 457 458
    #
    # Change the ownership of the log file before we flip.
    #
459 460
    chown($unix_uid, $unix_gid, $logname) or
	fatal("Could not chown $logname to $unix_uid/$unix_gid!");
461 462

    # Flip to the user. We never flip back.
463 464 465
    $GID            = $unix_gid;
    $EGID           = $glist;
    $EUID = $UID    = $unix_uid;
466 467
    $ENV{'USER'}    = $creator_uid;
    $ENV{'LOGNAME'} = $creator_uid;
468
    
469
    if ($dowhat eq "start") {
470
	startexp($exphash);
471
    }
472
    elsif ($dowhat eq "swap") {
473
	swapexp($exphash);
474 475
    }
    elsif ($dowhat eq "cancel") {
476
	cancelexp($exphash);
477 478 479 480 481 482 483 484 485
    }
    exit(0);
}

#
# Try to start an experiment. Never returns.
# 
sub startexp($)
{
486
    my($exphash)  = @_;
487
    my($exit_status, $running, $query_result);
488

489
    my $attempts  = $exphash->{'attempts'};
490

491
    #
492
    # Try to swap the experiment in. 
493
    #
494
    system("$swapexp -b -s in $pid $eid");
495 496
    $exit_status = $? >> 8;
    $running     = 1;
497 498 499 500 501
    if ($exit_status) {
	$running = 0;
    }
    
    #
502 503
    # Look for cancelation. If we get a DB error on this, just continue cause
    # we can pick up the cancelation later.
504 505
    #
    $query_result =
506
	DBQueryWarn("select canceled from experiments ".
507
		    "where eid='$eid' and pid='$pid'");
508

509
    if ($query_result) {
510
	my ($canceled) = $query_result->fetchrow_array();
511
	$exphash->{'canceled'} = $canceled;
512

513
	# Yuck: This is strictly for the benefit of swapexp() below.
514
	$exphash->{'state'} = EXPTSTATE_ACTIVE
515 516 517 518
	    if ($running);
	
	if ($canceled) {
	    # Look at the cancel flag.
519
	    if ($canceled == EXPTCANCEL_TERM) {
520
		cancelexp($exphash);
521
	    }
522 523
	    elsif ($canceled == EXPTCANCEL_SWAP ||
		   $canceled == EXPTCANCEL_DEQUEUE) {
524
		swapexp($exphash);
525 526 527 528
	    }
	    else {
		print "Improper cancel flag: $canceled\n";
	    }
529 530 531
	    #
	    # Never returns, but just to be safe ...
	    #
532
	    exit(-1);
533 534
	}
    }
535 536 537 538

    #
    # If the configuration failed for lack of nodes, then don't send
    # email unless the number of attempts starts to get big.
539
    #
540 541 542
    # If the configuration failed for some other reason, then send email.
    # We have to reset the state to "new" so that it will be retried again
    # later. 
543 544
    #
    if (! $running) {
545 546 547 548
	#
	# XXX - What if this update fails?
	# 
	$query_result = 
549
	    DBQueryWarn("update experiments set attempts=attempts+1 ".
550
			"where eid='$eid' and pid='$pid'");
551

552
	#
553 554 555 556
	# The exit status does not tell us if the experiment can ever be
	# mapped. In fact, that is really hard to know at this level;
	# it depends on what resources the testbed actually has. So,
	# unless status is -1 (really really fatal) just keep going.
557 558
	# We leave it up the user to kill the batch if it looks like its
	# never going to work.
559
	#
560
	if ($exit_status == 255) {
561
	    TBBatchUnLockExp($pid, $eid, EXPTSTATE_SWAPPED());
562
	    
563
	    email_status("Experiment startup has failed with a fatal error!\n".
564 565
			 "Batch has been dequeued so that you may check it.");

566
	    exit($exit_status);
567
	}
568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585
	
	if (($attempts % 30) == 0) {
	    $attempts++;

	    my $msg =
		"Could not configure Batch Mode experiment $pid/$eid.\n".
		"\n".
		"There was an assignment violation (please check the log)\n".
		"that prevented it from being scheduled. The violation\n".
		"might result from not enough nodes or not enough link\n".
		"bandwidth. If you feel that the violation is in error,\n".
		"please cancel the batch and notify $TBOPS\n".
		"Otherwise, another attempt will be made in a little while.\n".
		"\n". 
		"There have been $attempts attempts to start this batch.";

	    email_status($msg);
	}
586
	TBBatchUnLockExp($pid, $eid, EXPTSTATE_QUEUED());
587
	exit($exit_status);
588 589 590
    }

    #
591
    # Well, it configured! We can now unlock it.
592
    #
593
    TBBatchUnLockExp($pid, $eid);
594
    
595
    email_status("Batch Mode experiment $pid/$eid is now running!\n".
596
		 "Please consult the Web interface to see how it is doing.");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
597

598
    #
599
    # Done with this phase. Must exit.
600
    #
601 602
    exit(0);
}
603

604
#
605
# A batch has completed. Swap it out. 
606
#
607
sub swapexp($;$)
608
{
609 610 611
    my($exphash)  = @_;
    my $canceled  = $exphash->{'canceled'};
    my $running   = ($exphash->{'state'} eq EXPTSTATE_ACTIVE);
612

613
    if ($running) {
614 615 616 617 618
	if (defined($template)) {
	    my $guid = $template->guid();
	    my $vers = $template->vers();
	    
	    system("$template_swapout -b -e $eid $guid/$vers");
619 620 621 622
	}
	else {
	    system("$swapexp -b -s out $pid $eid");
	}
623 624 625 626 627 628
	if ($?) {
	    #
	    # TB admin is going to have to clean up. 
	    # 
	    fatal("Swapping out Batch Mode experiment $pid/$eid");
	}
629 630 631 632 633
    }
    #
    # Set the state to paused to ensure that it is not run again until
    # the user wants to.
    # 
634 635
    TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR);
    TBBatchUnLockExp($pid, $eid);
636
    
637 638 639 640 641
    if ($canceled) {
	email_status("Batch Mode experiment $pid/$eid has been stopped!");
    }
    else {
	email_status("Batch Mode experiment $pid/$eid has finished!");
642
    }
643
   
644
    #
645
    # Child must exit!
646
    #
647 648 649 650 651 652
    exit(0);
}

#
# Cancel an experiment. Never returns.
#
653
sub cancelexp($)
654
{
655
    my($exphash) = @_;
656
    
657 658 659 660
    #
    # It does not matter if the experiment is running; endexp does the
    # right thing.
    # 
661 662 663 664 665
    if (defined($template)) {
	my $guid = $template->guid();
	my $vers = $template->vers();
	
	system("$template_swapout -b -e $eid $guid/$vers");
666 667
    }
    else {
668
	system("$endexp -b $pid,$eid");
669
    }
670 671 672 673 674
    if ($?) {
	#
	# TB admin is going to have to clean up. 
	# 
	fatal("Terminating Batch Mode experiment $pid/$eid");
675
    }
676
    donotify("Your Batch Mode experiment has been canceled!", "Canceled", 0);
677 678 679 680 681 682 683
   
    #
    # Child must exit!
    #
    exit(0);
}

684 685 686 687 688 689
#
# Check experiment status. Looks to see if all of the nodes in an
# experiment have reported in.
#
sub isexpdone($)
{
690
    my($exphash)  = @_;
691
    my($query_result, @row);
692 693
    
    # Global vars
694 695
    $eid = $exphash->{'eid'};
    $pid = $exphash->{'pid'};
696

697
    print TBDateTimeFSSafe() . "\n";
698 699 700 701 702 703
    print "Checking to see if $pid/$eid has finished up yet\n";

    #
    # Look to see if any nodes yet to report status. If so, spin again.
    #
    $query_result =
704 705 706
	DBQueryWarn("SELECT startstatus,bootstatus FROM nodes ".
		    "LEFT JOIN reserved ON nodes.node_id=reserved.node_id ".
		    "WHERE reserved.eid='$eid' and reserved.pid='$pid'");
707 708 709 710 711

    if (! $query_result) {
	return 0;
    }

712 713 714 715 716 717 718 719 720 721 722 723 724
    #
    # Well, right now a node is considered finished up only if its
    # boot did not fail, and it has reported start command status.
    # The idea being that if the boot failed, then its status will
    # never be reported anyway, and we might as well consider the node
    # done (else the experiment would never end).
    # 
    while (@row = $query_result->fetchrow_array()) {
	if ($row[1] eq NODEBOOTSTATUS_FAILED) {
	    next;
	}
	if ($row[0] eq NODESTARTSTATUS_NOSTATUS) {
	    return 0;
725 726
	}
    }
727
    return 1;
728 729 730 731
}

#
# Start up a child, and set its descriptors talking to a log file.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
732
# The log file already exists, created with mktemp above.
733 734 735 736 737 738 739
# 
sub openlog($)
{
    my($logname) = $_[0];
	
    #
    # We have to disconnect from the caller by redirecting both STDIN and
Leigh B. Stoller's avatar
Leigh B. Stoller committed
740 741
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited. 
742 743
    #
    open(STDIN, "< /dev/null") or
744
	fatal("opening /dev/null for STDIN: $!");
745 746 747 748 749 750

    open(STDERR, ">> $logname") or
	fatal("opening $logname for STDERR: $!");
    open(STDOUT, ">> $logname") or
	fatal("opening $logname for STDOUT: $!");

751 752 753
    STDOUT->autoflush(1);
    STDERR->autoflush(1);

754 755 756
    return 0;
}

757 758 759 760 761
#
# A fatal error is something that the user does not need to know about.
# Caused by a breakdown in the TB system. Generally speaking, once the
# experiment is running, this should not be used.
# 
762
sub fatal($)
763 764 765
{
    my($mesg) = $_[0];

766
    donotify($mesg, "Failure", 1);
767 768 769 770

    exit(-1);
}

771 772 773
#
# Something the user cares about. 
# 
774
sub email_status($)
775
{
776
    my($mesg) = $_[0];
777

778
    donotify($mesg, "Status", 0);
779 780
}

781
sub donotify($$$)
782
{
783
    my($mesg, $subtext, $iserr) = @_;
784
    my($subject, $from, $to, $hdrs);
785
    my $MAIL;
786

787 788 789
    $mesg = "$mesg\n";

    print STDOUT "$mesg";
790

791
    $subject = "Batch Mode Experiment $subtext $pid/$eid";
792
    $from    = $TBOPS;
793 794 795 796 797 798
    $hdrs    = "Reply-To: $TBOPS";
    
    #
    # An error goes just to Testbed Operations. Normal status messages go
    # to the user and to the Testbed Logs address.
    # 
799
    if ($iserr) {
800
	$to = "$TBOPS";
801 802
    }
    else {
803 804 805
	$to   = "$user_name <$user_email>";
	$hdrs = "Bcc: $TBLOGS\n".
	        "$hdrs";
806 807
    }

808
    SENDMAIL($to, $subject, $mesg, $from, $hdrs,
809
	     ($logname, "assign.log", $nsfile));
810
}
811