#!/usr/bin/perl -wT
use English;
use Getopt::Std;

#
# Create a batch experiment.
#
# usage: batch_daemon
#
# TODO: Use "logger" instead of writing a log file.
#
#
# Print the command line synopsis on STDOUT and terminate the program
# with exit status -1. Never returns.
#
sub usage()
{
    my @help = ("Usage: batch_daemon [-d]\n",
		"Use the -d option to prevent daemonization\n");

    print STDOUT join("", @help);
    exit(-1);
}
#
# Option letters handed to getopts(): -d keeps the program in the
# foreground instead of daemonizing.
#
my  $optlist = "d";

#
# Configure variables
# (presumably the @...@ tokens are filled in when this .in file is
# configured -- confirm against the build.)
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";
my $TBLOGS   = "@TBLOGSEMAIL@";

#
# Ug, exit value from startexp when not enough nodes.
#
my $TOOFEWNODES = 2;

my $tbbindir = "$TB/bin/";
my $batchdir = "$TB/batch";
my $startexp = "$TB/bin/startexp";
my $endexp   = "$TB/bin/endexp";
my $avail    = "$TB/sbin/avail";
my $batchlog = "$TB/log/batchlog";
my $projroot = "/proj";
my $debug    = 0;
my $dirname;

#
# These are valid in the children, not the parent. I suppose I could use
# dynamically scoped variables, but hardly worth it.
#
my $eid;
my $pid;
my $logname;
my $nsfile;
my $user_name  = "Testbed Operations";
my $user_email = "$TBOPS";

#
# Turn off output buffering (autoflush the selected handle after each write).
#
$| = 1;

#
# Testbed Support library
#
push(@INC, "$TB/lib");
require libtestbed;

#
# Untaint the path. No trailing colon: an empty PATH element means the
# current directory, which must not be searched by a privileged daemon.
#
$ENV{'PATH'} = "/bin:/usr/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments (there are none).
#
my %options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
if (@ARGV != 0) {
    usage();
}
if (defined($options{"d"})) {
    $debug = $options{"d"};
}

# Go to ground.
if (! $debug) {
    daemonize();
}

#
# Set up for querying the database. Fail right away with a readable
# message if the connection cannot be made, rather than tripping over an
# undefined handle at the first query.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none")
    or die("Cannot connect to database $DBNAME\n");

#
# Loop, looking for batch experiments that want to run.
#
while (1) {
    my($count, $i);
    my(%row, %pending_row);
    my($query_result, $pending_result, $running_result);

    #
    # Need to lock the table here because of cancelation in killbatchexp.
    # See the comments in there. We need to atomically grab the next
    # batch experiment we want to try, and then change its state from
    # new to configuring. We want to grab just one experiment, since
    # it takes a while to configure an experiment, and grabbing a bunch and
    # locking them up might result in having to wait a really long time
    # to cancel a batch experiment that hasn't really tried to start yet!
    # That would be annoying to users, and we love our users, right?
    #
    # So, now you're wondering what my selection criteria are? Well, they are
    # damn simplistic. I set the "started" datetime field each attempt,
    # and I pick the batch_experiment with the oldest time, thereby cycling
    # through in a "least recently attempted" manner.
    #
    $query_result =
	DBquery("lock tables batch_experiments write");
    if (! $query_result) {
	print "DB Error locking tables. Waiting a bit ...\n";
	sleep(10);
	next;
    }

    # At most one candidate: never attempted, or last attempted more than
    # ten minutes ago.
    $pending_result =
	DBquery("SELECT * FROM batch_experiments ".
		"WHERE status='new' and canceled=0 and (attempts=0 or ".
		"((UNIX_TIMESTAMP() - UNIX_TIMESTAMP(started) > (60 * 10)))) ".
		"ORDER BY started LIMIT 1");

    $running_result =
	DBquery("SELECT * FROM batch_experiments ".
		"WHERE status='running' ORDER BY started");

    if (!$pending_result || !$running_result) {
	print "DB Error getting batch info. Waiting a bit ...\n";
	DBquery("unlock tables");
	sleep(10);
	next;
    }

    # Nothing pending and nothing running; go back to sleep.
    if (!$pending_result->numrows && !$running_result->numrows) {
	DBquery("unlock tables");
	sleep(10);
	next;
    }

    #
    # If we have a pending experiment to run, set its state to configuring
    # right away, while we have the tables locked. This prevents killbatchexp
    # from seeing it as something it can cancel.
    #
    if ($pending_result->numrows) {
	%pending_row = $pending_result->fetchhash();

	# Local vars! (The file-level $eid/$pid belong to the children.)
	my $pending_eid = $pending_row{'eid'};
	my $pending_pid = $pending_row{'pid'};

	# Build the timestamp in-process: no shell is forked, there is no
	# trailing newline to leak into the SQL, and the year is the real
	# four digit year rather than a hard-coded "20" prefix.
	require POSIX;
	my $now = POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime());

	$query_result =
	    DBquery("update batch_experiments set status='configuring', ".
		    "started='$now' ".
		    "where eid='$pending_eid' and pid='$pending_pid'");

	if (! $query_result) {
	    print "DB error setting batch $pending_pid/$pending_eid ".
		"to configuring.\n";
	    DBquery("unlock tables");
	    sleep(10);
	    next;
	}
    }
    DBquery("unlock tables");

    #
    # Okay, first we check the status of running batch mode experiments
    # since we want to end those before trying to start any new ones, cause
    # it would be nice to have as many nodes available as possible before
    # trying to add a new one. This can potentially delay startup, but that's
    # okay. It's a batch system.
    #
    # If you are wondering why I check for finished experiments in the main
    # loop instead of in the child that started the experiment, it's so that
    # we fire up again and look for them in the event that paper goes down.
    #
    $count = $running_result->numrows;
    for ($i = 0; $i < $count; $i++) {
	%row = $running_result->fetchhash();

	if ($row{'canceled'}) {
	    dosomething("cancel", %row);
	    next;
	}
	if (isexpdone(%row)) {
	    dosomething("end", %row);
	    next;
	}
    }

    #
    # Finally start an actual experiment!
    #
    if ($pending_result->numrows) {
	dosomething("start", %pending_row);
    }
    sleep(15);
}

#
# Do something as the user. Either, start, end, or cancel an experiment.
#
# Arguments: the action ("start", "end" or "cancel") followed by the
#            batch_experiments row flattened into a key/value list.
#
# In the parent: forks a child, waits for it, removes the temporary log
# file, and returns the child's exit status. Returns -1 without running
# anything if the request cannot be set up (bad names, no log file, or
# fork failure), so that a single bad row cannot take the daemon down.
#
# In the child: never returns. It redirects its output to the log file,
# reconnects to the DB, becomes the experiment creator, and dispatches to
# startexp/endexp/cancelexp.
#
# No prototype on purpose: the second argument is a flattened hash, which
# a ($$) prototype would force into scalar context if it were ever applied.
#
sub dosomething
{
    my($dowhat)   = shift;
    my(%exphash)  = @_;
    my($uid, $gid);

    # Global vars
    $eid = $exphash{'eid'};
    $pid = $exphash{'pid'};

    print "Doing a '$dowhat' to batch experiment $pid/$eid\n";

    #
    # The names are about to be interpolated into a shell command, so check
    # them first. Same character class as the log name check below, minus /.
    #
    foreach my $token ($dowhat, $pid, $eid) {
	if (!defined($token) || $token !~ /^[-\@\w.]+$/) {
	    print "Bad data in '$dowhat' request; skipping it\n";
	    return -1;
	}
    }

    #
    # Create a temporary name for a log file. We do this in the parent so
    # we can remove it when the child ends. The child could remove it, but
    # since it is open in the child, it has the tendency to stick around.
    #
    $logname = `mktemp /tmp/$dowhat-batch-$pid-$eid.XXXXXX`;

    # Note different taint check (allow /). The $ anchor also drops the
    # trailing newline that mktemp prints.
    if (defined($logname) && $logname =~ /^([-\@\w.\/]+)$/) {
	$logname = $1;
    } else {
	print "Could not create a log file to $dowhat $pid/$eid\n";
	undef($logname);
	return -1;
    }

    #
    # Start up a child to run the guts. The parent waits. If the
    # experiment configures okay, the parent can return to try something
    # else.
    #
    my $childpid = fork();
    if (! defined($childpid)) {
	# Without this check a failed fork would send the daemon itself
	# down the child path below, where it would drop privileges and exit.
	print "Could not fork a child to $dowhat $pid/$eid: $!\n";
	unlink($logname);
	return -1;
    }
    if ($childpid) {
	print "Child PID $childpid started to $dowhat $pid/$eid\n";

	waitpid($childpid, 0);
	my $status = $? >> 8;

	print "Child PID $childpid exited with exit status $status\n";

	sleep(5);

	unlink($logname);
	return $status;
    }

    #
    # Child from here on.
    #
    openlog($logname);

    #
    # Form a new connection to the DB since we are in the child. Not sure
    # if this happens as a result of the fork, but lets be sure.
    #
    undef($DB);
    $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
    if (! $DB) {
	fatal("Cannot connect to database $DBNAME to $dowhat $pid/$eid\n");
    }

    my $creator  = $exphash{'creator_uid'};

    # Global vars
    $dirname  = "$batchdir/$pid-$eid";
    $nsfile   = "$dirname/$eid.ns";

    #
    # Get some user information.
    #
    my $query_result =
	$DB->query("SELECT usr_name,usr_email from users ".
		   "WHERE uid='$creator'");

    if (! $query_result ||
	$query_result->numrows != 1) {
	fatal("DB Error getting user information for uid $creator\n");
    }
    my @row = $query_result->fetchrow_array();
    $user_name  = $row[0];
    $user_email = $row[1];

    #
    # Figure out the unix uid/gid that the experiment configuration is
    # going to run as. The list assignment yields 0 in scalar context
    # when the lookup returns nothing, which triggers fatal().
    #
    (undef,undef,$uid) = getpwnam($creator) or
	fatal("No such user $creator");
    (undef,undef,$gid) = getgrnam($pid) or
	fatal("No such group $pid");

    #
    # Change the ownership of the log file before we flip.
    #
    chown($uid, $gid, $logname);

    #
    # Flip to the user. We never flip back. Assigning "gid gid" to $EGID
    # also replaces the supplementary group list (see perlvar), so the
    # child does not keep the groups of the account the daemon runs as.
    #
    $EGID = "$gid $gid";
    $GID  = $gid;
    $EUID = $UID = $uid;
    if ($UID != $uid || $EUID != $uid) {
	fatal("Could not switch to user $creator to $dowhat $pid/$eid\n");
    }
    $ENV{'USER'} = $creator;

    if ($dowhat eq "start") {
	startexp(%exphash);
    }
    elsif ($dowhat eq "end") {
	endexp(%exphash);
    }
    elsif ($dowhat eq "cancel") {
	cancelexp(1, %exphash);
    }
    exit(0);
}

#
# Try to start an experiment. Never returns.
#
# Argument: the batch_experiments row flattened into a key/value list.
# Reads the file-level $eid, $pid, $logname and $nsfile.
#
# Exit status: 0 when the experiment is running (or was canceled), the exit
# status of the startexp program when configuration failed, or whatever
# fatal() exits with when a failure is reported by email.
#
# No prototype on purpose: the argument is a flattened hash, not a scalar.
#
sub startexp
{
    my(%exphash)  = @_;
    my($exit_status, $running);

    my $creator   = $exphash{'creator_uid'};
    my $longname  = $exphash{'name'};
    my $attempts  = $exphash{'attempts'};

    #
    # Insert an experiment record for startexp.
    #
    # The timestamp is built in-process (no shell, no trailing newline, real
    # four digit year). The experiment name is free text, so escape
    # backslashes and single quotes before putting it in the SQL string.
    #
    require POSIX;
    my $rightnow = POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime());

    my $quoted_name = defined($longname) ? $longname : "";
    $quoted_name =~ s/([\\'])/\\$1/g;

    my $query_result =
	DBquery("insert into experiments ".
		"(eid, pid, expt_created, expt_name, ".
		"expt_head_uid, expt_start, state, batchmode) ".
		"VALUES ('$eid', '$pid', '$rightnow', '$quoted_name', ".
		"'$creator', '$rightnow', 'new', 1)");
    if (! $query_result) {
	fatal("DB error inserting experiment record. Quitting ...\n");
    }

    #
    # Try to start the experiment. If it fails, the experiment is gone.
    # List form of system: the arguments never pass through a shell.
    #
    system($startexp, "-b", $logname, $pid, $eid, $nsfile);
    $exit_status = $? >> 8;
    $running     = ($exit_status ? 0 : 1);

    #
    # Look for cancelation. If we get a DB error on this, just continue cause
    # we can pick up the cancelation later.
    #
    $query_result =
	DBquery("select canceled from batch_experiments ".
		"where eid='$eid' and pid='$pid'");

    if ($query_result) {
	my @row = $query_result->fetchrow_array();

	if ($row[0]) {
	    cancelexp($running);
	    #
	    # Never returns, but just to be safe ...
	    #
	    exit(0);
	}
    }

    #
    # If the configuration failed for lack of nodes, then don't send
    # email unless the number of attempts starts to get big.
    #
    # If the configuration failed for some other reason, then send email.
    # We have to reset the state to "new" so that it will be retried again
    # later.
    #
    if (! $running) {
	#
	# XXX - What if this update fails?
	#
	$query_result =
	    DBquery("update batch_experiments set status='new', ".
		    "attempts=attempts+1 where eid='$eid' and pid='$pid'");
	$attempts++;

	#
	# Email on every 9th attempt when short of nodes, and on every 5th
	# attempt for any other failure.
	# NOTE(review): $attempts was just incremented, so the == 0 test looks
	# unreachable unless the DB held a negative count; kept as is. It also
	# means "other" failures are not mailed on the first attempt --
	# confirm that is intended.
	#
	if (($exit_status == $TOOFEWNODES && $attempts >= 9 &&
	     (($attempts % 9) == 0)) ||
	    (($exit_status != $TOOFEWNODES) && ($attempts % 5) == 0) ||
	    ($attempts == 0)) {

	    fatal("Could not configure Batch Mode experiment $pid/$eid\n".
		  "There have been $attempts attempts made to start this ".
		  "batch\n");
	}
	exit($exit_status);
    }

    #
    # Well, it configured! Lets set its state to running.
    #
    # XXX - What if this update fails?
    #
    $query_result =
	DBquery("update batch_experiments set status='running' ".
		"where eid='$eid' and pid='$pid'");

    email_status("Batch Mode experiment $pid/$eid is now running!\n".
		 "Please consult the Web interface to see how it is doing\n");

    #
    # Done with this phase. Must exit.
    #
    exit(0);
}
#
# End an experiment. Never returns.
#
# Argument: the batch_experiments row flattened into a key/value list (not
# consulted here; the file-level $eid, $pid and $dirname are used instead).
# Runs the endexp program, deletes the batch record, mails the user and
# removes the batch directory, then exits with status 0.
#
# No prototype on purpose: the argument is a flattened hash, not a scalar.
#
sub endexp
{
    my(%exphash)  = @_;

    # List form of system: the names never pass through a shell.
    system($endexp, "-b", $pid, $eid);
    DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
    email_status("Batch Mode experiment $pid/$eid has finished!\n");

    # Only remove a directory we actually have a name for.
    if (defined($dirname) && $dirname ne "") {
	system("rm", "-rf", $dirname);
    }

    #
    # Child must exit!
    #
    exit(0);
}

#
# Cancel an experiment. Never returns.
#
# Arguments: a flag saying whether the experiment is currently running (in
#            which case the endexp program is run first), optionally
#            followed by the batch_experiments row as a key/value list (not
#            consulted here; the file-level $eid, $pid and $dirname are used).
# Deletes the batch record, mails the user and removes the batch directory,
# then exits with status 0.
#
# No prototype on purpose: callers pass either one argument or a flag plus
# a flattened hash.
#
sub cancelexp
{
    my($running) = shift;
    my(%exphash) = @_;

    if ($running) {
	# List form of system: the names never pass through a shell.
	system($endexp, "-b", $pid, $eid);
    }

    DBquery("DELETE from batch_experiments WHERE eid='$eid' and pid='$pid'");
    notify_user("Your Batch Mode experiment has been canceled. You may now\n".
		"reuse the experiment name\n", "Canceled", 0);

    # Only remove a directory we actually have a name for.
    if (defined($dirname) && $dirname ne "") {
	system("rm", "-rf", $dirname);
    }

    #
    # Child must exit!
    #
    exit(0);
}
#
# Check experiment status: has every node reserved to the experiment
# reported a start status yet?
#
# Takes the batch_experiments row as a key/value list and sets the
# file-level $eid/$pid from it. Returns 1 when no node row still shows a
# startstatus of "none", and 0 otherwise (including when the query fails).
#
sub isexpdone($)
{
    my(%exphash)  = @_;

    # Global vars
    ($eid, $pid) = @exphash{'eid', 'pid'};

    print "Checking to see if $pid/$eid has finished up yet\n";

    #
    # Fetch the start status of every node reserved to this experiment.
    #
    $query_result =
	DBquery("SELECT startstatus FROM nodes LEFT JOIN reserved ".
		"ON nodes.node_id=reserved.node_id ".
		"WHERE reserved.eid='$eid' and reserved.pid='$pid'");

    return 0
	unless ($query_result);

    #
    # Walk every row; any node still at "none" means we spin again.
    #
    my $unreported = 0;
    my $remaining  = $query_result->numrows;

    while ($remaining-- > 0) {
	@row = $query_result->fetchrow_array();

	$unreported++
	    if ($row[0] eq "none");
    }
    return ($unreported ? 0 : 1);
}
#
# Run one SQL statement on the file-level $DB handle. A failed query is
# reported on STDOUT; the (possibly false) result is handed back to the
# caller either way.
#
sub DBquery($)
{
    my($sql) = @_;

    my $answer = $DB->query($sql);

    print "DB Query failed: $sql\n"
	unless ($answer);

    return $answer;
}

#
# Point the standard descriptors of the current process at a log file.
# The log file already exists, created with mktemp in dosomething.
#
# Argument: the log file name.
# Returns 0. Any open failure is reported through fatal(), which exits.
#
sub openlog($)
{
    my($logname) = $_[0];

    #
    # Detach STDIN, then append both STDERR and STDOUT to the log file.
    # Three-argument open keeps the file name from being parsed as part
    # of the open mode.
    #
    open(STDIN, "<", "/dev/null") or
	fatal("opening /dev/null for STDIN: $!");

    open(STDERR, ">>", $logname) or
	fatal("opening $logname for STDERR: $!");
    open(STDOUT, ">>", $logname) or
	fatal("opening $logname for STDOUT: $!");

    return 0;
}
#
# Report a failure through notify_user() (subject tag "Failure", error
# flag set) and then terminate with exit status -1. Never returns.
#
sub fatal($)
{
    my($complaint) = @_;

    notify_user($complaint, "Failure", 1);
    exit(-1);
}
#
# Send a plain status message through notify_user() (subject tag "Status",
# error flag clear). Hands back whatever notify_user() returns.
#
sub email_status($)
{
    my($news) = @_;

    return notify_user($news, "Status", 0);
}
#
# Print a message on STDOUT and mail it to the current user.
#
# Arguments: the message text, a short tag for the subject line, and a flag
#            saying whether this is an error (errors are Cc'ed to $TBOPS,
#            everything else is Bcc'ed to $TBLOGS).
#
# The recipient comes from the file-level $user_name/$user_email. When the
# file-level $logname and $nsfile are set and readable, their contents are
# appended to the mail, each after a separator line.
#
# Dies if the mail program cannot be started. Returns the result of closing
# the mail handle.
#
sub notify_user($$$)
{
    my($mesg, $subtext, $iserr) = @_;
    my($subject, $from, $to, $cc);
    my $MAIL;

    print STDOUT "$mesg\n";

    $subject = "TESTBED: Batch Mode Experiment $subtext $pid/$eid";
    $from    = $TBOPS;
    $to      = "$user_name <$user_email>";
    if ($iserr) {
	$cc = "Cc: $TBOPS";
    }
    else {
	$cc = "Bcc: $TBLOGS";
    }

    if (! ($MAIL = OPENMAIL($to, $subject, $from, $cc))) {
	die("Cannot start mail program!");
    }

    print $MAIL $mesg;

    #
    # Append the log file and the NS file, in that order. A file that is
    # not set or cannot be opened is silently skipped. Three-argument open
    # on a lexical handle: the name cannot be taken for an open mode, and
    # no global bareword handle is shared between the two files.
    #
    foreach my $attachment ($logname, $nsfile) {
	next
	    if (!defined($attachment));

	my $in;
	open($in, "<", $attachment)
	    or next;

	print $MAIL "\n\n---------\n\n";
	while (my $line = <$in>) {
	    print $MAIL $line;
	}
	close($in);
    }

    return close($MAIL);
}

#
# Become a daemon.
#
# Forks; the parent exits with status 0 and the child carries on with its
# STDIN reading from /dev/null and its STDOUT/STDERR appended to $batchlog.
# Returns 0 in the child. Dies if the fork or any of the opens fails.
#
sub daemonize()
{
    my $mypid = fork();
    if (! defined($mypid)) {
	# A failed fork must not be mistaken for "we are the child".
	die("Could not fork to become a daemon: $!");
    }
    if ($mypid) {
	exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited.
    #
    open(STDIN, "<", "/dev/null") or
	die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it.
    #
    open(STDERR, ">>", $batchlog) or die("opening $batchlog for STDERR: $!");
    open(STDOUT, ">>", $batchlog) or die("opening $batchlog for STDOUT: $!");

    return 0;
}