batch_daemon.in 15 KB
Newer Older
1
2
3
4
5
6
7
8
9
#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Create a batch experiment.
#
# usage: batch_daemon
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
10
11
# TODO: Use "logger" instead of writing a log file.
#
12
13
sub usage()
{
14
15
    print STDOUT "Usage: batch_daemon [-d]\n" .
	"Use the -d option to prevent daemonization\n";
16
17
    exit(-1);
}
18
my  $optlist = "d";
19
20
21
22
23
24
25

#
# Configure variables
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
26
my $TBLOGS   = "@TBLOGSEMAIL@";
27

28
29
30
31
32
33
34
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

Leigh B. Stoller's avatar
Leigh B. Stoller committed
35
36
37
38
39
#
# Ug, exit value from startexp when not enough nodes.
# 
my $TOOFEWNODES = 2;

40
41
42
43
my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $startexp = "$TB/bin/startexp";
my $endexp   = "$TB/bin/endexp";
44
my $savelogs = "$TB/bin/savelogs";
45
my $avail    = "$TB/sbin/avail";
46
47
my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj";
48
my $debug    = 0;
49
50
51
52
53
54
55
56
my $dirname;

#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my $eid;
my $pid;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
57
my $gid;
58
my $logname;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
59
my $nsfile;
60
my $user_name  = "Testbed Operations";
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
my $user_email = "$TBOPS";

#
# Turn off line buffering on output
#
$| = 1;

#
# Untaint the path
# 
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
85
86
87
if (defined($options{"d"})) {
    $debug = $options{"d"};
}
88
89

# Go to ground.
90
91
92
if (! $debug) {
    daemonize();
}
93
94
95
96
97

#
# Loop, looking for batch experiments that want to run.
# 
while (1) {
98
    my($count, $i, $query_result, $pending_result, $running_result);
99
100
    my(%row, %pending_row);
    
101
102
103
104
105
106
107
108
109
110
    #
    # Need to lock the table here because of cancelation in killbatchexp.
    # See the comments in there. We need to atomically grab the next
    # batch experiment we want to try, and then change its state from
    # new to configuring. We want to grab just one experiment, since
    # it takes a while to configure an experiment, and grabbing a bunch and
    # locking them up might result in having to wait a really long time
    # to cancel a batch experiment that hasn't really tried to start yet!
    # Thats would ne annoying to users, and we love our users, right?
    #
111
    # So, now you're wondering what my selection criteria is? Well, its
112
113
114
115
116
    # damn simplistic. I set the "started" datetime field each attempt,
    # and I pick the batch_experiment with the oldest time, thereby cycling
    # through in a "least recently attempted" manner. 
    #
    $query_result =
117
	DBQuery("lock tables batch_experiments write");
118
119
120
121
122
123
124
    if (! $query_result) {
	print "DB Error locking tables. Waiting a bit ...\n";
	sleep(10);
	next;
    }
    
    $pending_result =
125
	DBQuery("SELECT * FROM batch_experiments ".
126
127
128
		"WHERE status='new' and canceled=0 and (attempts=0 or ".
		"((UNIX_TIMESTAMP() - UNIX_TIMESTAMP(started) > (60 * 10)))) ".
		"ORDER BY started LIMIT 1");
129

130
    $running_result =
131
	DBQuery("SELECT * FROM batch_experiments ".
132
133
134
135
		"WHERE status='running' ORDER BY started");

    if (!$pending_result || !$running_result) {
	print "DB Error getting batch info. Waiting a bit ...\n";
136
	DBQuery("unlock tables");
137
138
139
140
141
142
	sleep(10);
	next;

    }

    if (!$pending_result->numrows && !$running_result->numrows) {
143
	DBQuery("unlock tables");
144
145
146
147
148
	sleep(10);
	next;
    }

    #
149
150
151
    # If we have a pending experiment to run, set its state to configuring
    # right away, while we have the tables locked. This prevents killbatchexp
    # from seeing it as something it can cancel.
152
    #
153
154
155
156
157
158
    if ($pending_result->numrows) {
	%pending_row = $pending_result->fetchhash();

	# Local vars!
	my $eid = $pending_row{'eid'};
	my $pid = $pending_row{'pid'};
159
	my $now = DBDateTime();
160
161

	$query_result = 
162
	    DBQuery("update batch_experiments set status='configuring', ".
163
164
165
166
		    "started='$now' where eid='$eid' and pid='$pid'");

	if (! $query_result) {
	    print "DB error setting batch $pid/$eid to configuring.\n";
167
	    DBQuery("unlock tables");
168
169
170
171
	    sleep(10);
	    next;
	}
    }
172
    DBQueryWarn("unlock tables");
173

174
175
176
177
178
179
180
181
182
183
184
    #
    # Okay, first we check the status of running batch mode experiments
    # since we want to end those before trying to start any new ones, cause
    # it would be nice to have as many nodes available as possible before
    # trying to add a new one. This can potentially delay startup, but thats
    # okay. Its a batch system.
    #
    # If you are wondering why I check for finished experiments in the main
    # loop instead of in the child that started the experiment, its so that
    # we fire up again and look for them in the event that paper goes down.
    #
185
    while (%row = $running_result->fetchhash()) {
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
	my $canceled = $row{'canceled'};
	if ($canceled) {
	    dosomething("cancel", %row);
	    next;
	}
	if (isexpdone(%row)) {
	    dosomething("end", %row);
	    next;
	}
    }

    #
    # Finally start an actual experiment!
    #
    if ($pending_result->numrows) {
	dosomething("start", %pending_row);
    }
203
    sleep(15);
204
205
206
}

#
207
# Do something as the user. Either, start, end, or cancel an experiment.
208
#
209
sub dosomething($$)
210
{
211
212
    my($dowhat)   = shift;
    my(%exphash)  = @_;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
213
    my($unix_uid, $unix_gid, $row, $query_result);
214
215
216
217

    # Global vars
    $eid = $exphash{'eid'};
    $pid = $exphash{'pid'};
Leigh B. Stoller's avatar
Leigh B. Stoller committed
218
    $gid = $exphash{'gid'};
219

220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
    print "Doing a '$dowhat' to batch experiment $pid/$eid\n";

    #
    # Create a temporary name for a log file. We do this in the parent so
    # we can remove it when the child ends. The child could remove it, but
    # since it is open in the child, it has the tendency to stick around.
    #
    $logname = `mktemp /tmp/$dowhat-batch-$pid-$eid.XXXXXX`;

    # Note different taint check (allow /).
    if ($logname =~ /^([-\@\w.\/]+)$/) {
	$logname = $1;
    } else {
	die "Bad data in $logname";
    }

236
237
238
    #
    # Start up a child to run the guts. The parent waits. If the
    # experiment configures okay, the parent can return to try something
239
    # else.
240
241
242
    #
    $childpid = fork();
    if ($childpid) {
243
244
	print "Child PID $childpid started to $dowhat $pid/$eid\n";

245
	waitpid($childpid, 0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
246
	my $status = $? >> 8;
247

Leigh B. Stoller's avatar
Leigh B. Stoller committed
248
	print "Child PID $childpid exited with exit status $status\n";
249

Leigh B. Stoller's avatar
Leigh B. Stoller committed
250
	sleep(5);
251

252
	unlink($logname);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
253
	return $status;
254
    }
255
256
257
258
259
    openlog($logname);

    my $creator  = $exphash{'creator_uid'};
    my $longname = $exphash{'name'};
    
260
    # Global vars
261
    $dirname  = "$batchdir/$pid-$eid";
262
    $nsfile   = "$dirname/$eid.ns";
263
264
265
266
267

    #
    # Get some user information. 
    #
    $query_result =
268
269
	DBQueryFatal("SELECT usr_name,usr_email from users ".
		     "WHERE uid='$creator'");
270

271
    if ($query_result->numrows != 1) {
272
273
274
275
276
277
278
279
280
281
	fatal("DB Error getting user information for uid $creator\n");
    }
    @row = $query_result->fetchrow_array();
    $user_name  = $row[0];
    $user_email = $row[1];

    #
    # Figure out the unix uid/gid that the experiment configuration is
    # going to run as. 
    #
Leigh B. Stoller's avatar
Leigh B. Stoller committed
282
    (undef,undef,$unix_uid) = getpwnam($creator) or
283
	fatal("No such user $creator");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
284
285
    (undef,undef,$unix_gid) = getgrnam($gid) or
	fatal("No such group $gid");
286

287
288
289
    #
    # Change the ownership of the log file before we flip.
    #
Leigh B. Stoller's avatar
Leigh B. Stoller committed
290
    chown($unix_uid, $unix_gid, $logname);
291
292

    # Flip to the user. We never flip back.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
293
294
    $EGID = $GID = $unix_gid;
    $EUID = $UID = $unix_uid;
295
    $ENV{'USER'} = $creator;
296
    
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
    if ($dowhat eq "start") {
	startexp(%exphash);
    }
    elsif ($dowhat eq "end") {
	endexp(%exphash);
    }
    elsif ($dowhat eq "cancel") {
	cancelexp(1, %exphash);
    }
    exit(0);
}

#
# Try to start an experiment. Never returns.
# 
sub startexp($)
{
    my(%exphash)  = @_;
315
    my($exit_status, $running, $query_result);
316
317
318
319

    my $creator   = $exphash{'creator_uid'};
    my $longname  = $exphash{'name'};
    my $attempts  = $exphash{'attempts'};
320
    my $expires   = $exphash{'expires'};
321
    my $rightnow  = DBDateTime();
322

323
324
325
    #
    # Insert an experiment record for startexp.
    #
326
    $query_result =
327
	DBQueryFatal("insert into experiments ".
Leigh B. Stoller's avatar
Leigh B. Stoller committed
328
		     "(eid, pid, gid, expt_created, expt_name, ".
329
		     "expt_head_uid, expt_expires, state, batchmode) ".
Leigh B. Stoller's avatar
Leigh B. Stoller committed
330
331
332
		     "VALUES ('$eid', '$pid', '$gid', '$rightnow', ".
		     "        '$longname', ".
		     "        '$creator', '$expires', 'new', 1)");
333
334
335
336

    #
    # Try to start the experiment. If it fails, the experiment is gone.
    #
Leigh B. Stoller's avatar
Leigh B. Stoller committed
337
    system("$startexp -b $logname -g $gid $pid $eid $nsfile");
338
339
    $exit_status = $? >> 8;
    $running     = 1;
340
341
342
343
344
    if ($exit_status) {
	$running = 0;
    }
    
    #
345
346
    # Look for cancelation. If we get a DB error on this, just continue cause
    # we can pick up the cancelation later.
347
348
    #
    $query_result =
349
350
	DBQueryWarn("select canceled from batch_experiments ".
		    "where eid='$eid' and pid='$pid'");
351

352
353
    if ($query_result) {
	@row = $query_result->fetchrow_array();
354

355
356
357
358
359
360
361
362
	if ($row[0]) {
	    cancelexp($running);
	    #
	    # Never returns, but just to be safe ...
	    #
	    exit(0);
	}
    }
363
364
365
366

    #
    # If the configuration failed for lack of nodes, then don't send
    # email unless the number of attempts starts to get big.
367
    #
368
369
370
    # If the configuration failed for some other reason, then send email.
    # We have to reset the state to "new" so that it will be retried again
    # later. 
371
372
    #
    if (! $running) {
373
374
375
376
	#
	# XXX - What if this update fails?
	# 
	$query_result = 
377
378
	    DBQueryWarn("update batch_experiments set status='new', ".
			"attempts=attempts+1 where eid='$eid' and pid='$pid'");
379
	$attempts++;
380

Leigh B. Stoller's avatar
Leigh B. Stoller committed
381
382
383
	if (($exit_status == $TOOFEWNODES && $attempts >= 9 &&
	     (($attempts % 9) == 0)) ||
	    (($exit_status != $TOOFEWNODES) && ($attempts % 5) == 0) ||
384
	    ($attempts == 0)) {
385
	    
386
387
388
389
	    email_status("Could not configure Batch Mode experiment ".
			 "$pid/$eid\n".
			 "There have been $attempts attempts made to start ".
			 "this batch\n");
390
	}
391
	exit($exit_status);
392
393
394
395
396
    }

    #
    # Well, it configured! Lets set it state to running.
    #
397
    # XXX - What if this update fails?
Leigh B. Stoller's avatar
Leigh B. Stoller committed
398
    #
399
    $query_result = 
400
401
	DBQueryWarn("update batch_experiments set status='running' ".
		    "where eid='$eid' and pid='$pid'");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
402

403
404
    email_status("Batch Mode experiment $pid/$eid is now running!\n".
		 "Please consult the Web interface to see how it is doing\n");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
405

406
    #
407
    # Done with this phase. Must exit.
408
    #
409
410
    exit(0);
}
411

412
413
414
415
416
417
#
# End an experiment. Never returns.
#
sub endexp($)
{
    my(%exphash)  = @_;
418
419
420
421
422
423

	#
	# Save tiplogs
	#
    system("$savelogs $pid $eid");

424
    system("$endexp -b $pid $eid");
425
426
427
428
429
430
431
432
433
    my $exit_status = $? >> 8;

    if ($exit_status) {
	#
	# TB admin is going to have to clean up. 
	# 
	fatal("Terminating Batch Mode experiment $pid/$eid");
    }
    
434
435
    DBQueryWarn("DELETE from batch_experiments ".
		"WHERE eid='$eid' and pid='$pid'");
436
437
438
    email_status("Batch Mode experiment $pid/$eid has finished!\n");
    system("rm -rf $dirname");
   
439
    #
440
    # Child must exit!
441
    #
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
    exit(0);
}

#
# Cancel an experiment. Never returns.
#
sub cancelexp($$)
{
    my($running) = shift;
    my(%exphash) = @_;
    
    if ($running) {
	system("$endexp -b $pid $eid");
    }

457
458
459
    DBQueryWarn("DELETE from batch_experiments ".
		"WHERE eid='$eid' and pid='$pid'");
    donotify("Your Batch Mode experiment has been canceled. You may now\n".
460
		"reuse the experiment name\n", "Canceled", 0);
461
462
463
464
465
466
467
468
    system("rm -rf $dirname");
   
    #
    # Child must exit!
    #
    exit(0);
}

469
470
471
472
473
474
475
#
# Check experiment status. Looks to see if all of the nodes in an
# experiment have reported in.
#
sub isexpdone($)
{
    my(%exphash)  = @_;
476
    my($query_result, @row);
477
478
479
480
481
482
483
484
485
486
487
    
    # Global vars
    $eid = $exphash{'eid'};
    $pid = $exphash{'pid'};

    print "Checking to see if $pid/$eid has finished up yet\n";

    #
    # Look to see if any nodes yet to report status. If so, spin again.
    #
    $query_result =
488
489
490
	DBQueryWarn("SELECT startstatus,bootstatus FROM nodes ".
		    "LEFT JOIN reserved ON nodes.node_id=reserved.node_id ".
		    "WHERE reserved.eid='$eid' and reserved.pid='$pid'");
491
492
493
494
495

    if (! $query_result) {
	return 0;
    }

496
497
498
499
500
501
502
503
504
505
506
507
508
    #
    # Well, right now a node is considered finished up only if its
    # boot did not fail, and it has reported start command status.
    # The idea being that if the boot failed, then its status will
    # never be reported anyway, and we might as well consider the node
    # done (else the experiment would never end).
    # 
    while (@row = $query_result->fetchrow_array()) {
	if ($row[1] eq NODEBOOTSTATUS_FAILED) {
	    next;
	}
	if ($row[0] eq NODESTARTSTATUS_NOSTATUS) {
	    return 0;
509
510
	}
    }
511
    return 1;
512
513
514
515
}

#
# Start up a child, and set its descriptors talking to a log file.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
516
# The log file already exists, created with mktemp above.
517
518
519
520
521
522
523
# 
sub openlog($)
{
    my($logname) = $_[0];
	
    #
    # We have to disconnect from the caller by redirecting both STDIN and
Leigh B. Stoller's avatar
Leigh B. Stoller committed
524
525
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited. 
526
527
    #
    open(STDIN, "< /dev/null") or
528
	fatal("opening /dev/null for STDIN: $!");
529
530
531
532
533
534
535
536
537

    open(STDERR, ">> $logname") or
	fatal("opening $logname for STDERR: $!");
    open(STDOUT, ">> $logname") or
	fatal("opening $logname for STDOUT: $!");

    return 0;
}

538
539
540
541
542
#
# A fatal error is something that the user does not need to know about.
# Caused by a breakdown in the TB system. Generally speaking, once the
# experiment is running, this should not be used.
# 
543
sub fatal($)
544
545
546
{
    my($mesg) = $_[0];

547
    donotify($mesg, "Failure", 1);
548
549
550
551

    exit(-1);
}

552
553
554
#
# Something the user cares about. 
# 
555
sub email_status($)
556
{
557
    my($mesg) = $_[0];
558

559
    donotify($mesg, "Status", 0);
560
561
}

562
sub donotify($$$)
563
{
564
    my($mesg, $subtext, $iserr) = @_;
565
    my($subject, $from, $to, $hdrs);
566
    my $MAIL;
567
568
569

    print STDOUT "$mesg\n";

570
571
    $subject = "TESTBED: Batch Mode Experiment $subtext $pid/$eid";
    $from    = $TBOPS;
572
573
574
575
576
577
    $hdrs    = "Reply-To: $TBOPS";
    
    #
    # An error goes just to Testbed Operations. Normal status messages go
    # to the user and to the Testbed Logs address.
    # 
578
    if ($iserr) {
579
	$to = "$TBOPS";
580
581
    }
    else {
582
583
584
	$to   = "$user_name <$user_email>";
	$hdrs = "Bcc: $TBLOGS\n".
	        "$hdrs";
585
586
    }

587
    if (! ($MAIL = OPENMAIL($to, $subject, $from, $hdrs))) {
588
589
	die("Cannot start mail program!");
    }
590

591
    print $MAIL $mesg;
592
593

    if (defined($logname) && open(IN, "$logname")) {
594
	print $MAIL "\n\n---------\n\n";
595
596
	
	while (<IN>) {
597
	    print $MAIL "$_";
598
599
600
	}
	close(IN);
    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
601
602

    if (defined($nsfile) && open(IN, "$nsfile")) {
603
	print $MAIL "\n\n---------\n\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
604
605
	
	while (<IN>) {
606
	    print $MAIL "$_";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
607
608
609
610
	}
	close(IN);
    }
    
611
    close($MAIL);
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
}

#
# Become a daemon.
# 
sub daemonize()
{
    my $mypid = fork();
    if ($mypid) {
	exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited. 
    #
    open(STDIN, "< /dev/null") or
	die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it. 
    #
    open(STDERR, ">> $batchlog") or die("opening $batchlog for STDERR: $!");
    open(STDOUT, ">> $batchlog") or die("opening $batchlog for STDOUT: $!");

    return 0;
}