batchexp.in 28.4 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5
6
# All rights reserved.
#
7
8
use English;
use Getopt::Std;
9
use POSIX qw(isatty setsid);
10
use POSIX qw(strftime);
11
use Errno qw(EDQUOT);
12
13

#
14
15
16
17
# Create an experiment. The experiment is either run immediately, or
# placed into the batch system to be run later. If no NS file is
# supplied, an experiment shell is created (this is currently an admin
# only option).
18
#
19
# TODO: Remove expt_expires and priority.
20
#       Add calls to check_slot() to verify inputs.
21
# 
Leigh B. Stoller's avatar
Leigh B. Stoller committed
22
23
24
25
26
27
28
29
30
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Everything okay.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
31
32
sub usage()
{
33
    print(STDERR
34
	  "Usage: batchexp [-q] [-i [-w]] [-n] [-f] [-E description] [-g gid]\n".
35
	  "                [-S reason] [-L reason] [-a <time>] [-l <time>]\n".
36
	  "                -p <pid> -e <eid> <nsfile>\n".
37
	  "switches and arguments:\n".
38
39
40
	  "-i       - swapin immediately; by default experiment is batched\n".
	  "-w       - wait for non-batchmode experiment to preload or swapin\n".
	  "-f       - preload experiment (do not swapin or queue yet)\n".
41
	  "-q       - be less chatty\n".
42
43
	  "-S <str> - Experiment cannot be swapped; must provide reason\n".
	  "-L <str> - Experiment cannot be IDLE swapped; must provide reason\n".
44
	  "-n       - Do not send idle email (internal option only)\n".
45
46
47
48
49
50
51
52
53
54
	  "-a <nnn> - Auto swapout nnn minutes after experiment is swapped in\n".
	  "-l <nnn> - Auto swapout nnn minutes after experiment goes idle\n".
	  "-E <str> - A pithy sentence describing your experiment\n".
	  "-p <pid> - The project in which to create the experiment\n".
	  "-g <gid> - The group in which to create the experiment\n".
	  "-e <eid> - The experiment name (unique, alphanumeric, no blanks)\n".
	  "<nsfile> - NS file to parse for experiment.\n");
    exit(-1);
}

55
56
57
sub ParseArgs();
sub fatal($);

58
my $optlist = "iE:g:e:p:S:L:a:l:fwqt:nz";
59
my $batchmode= 1;
60
61
my $frontend = 0;
my $waitmode = 0;
62
my $quiet    = 0;
63
my $linktest = 0;	# non-zero means level to run at.
64
65
my $zeemode  = 0;	# Hey, out of options.
my $zeeopt   = "";	# To pass along.
66
67
68
69
70

#
# Configure variables
#
my $TB       = "@prefix@";
71
my $PROJROOT = "/proj";
72
my $EVENTSYS = @EVENTSYS@;
73
74
my $TBOPS    = "@TBOPSEMAIL@";
my $TBLOGS   = "@TBLOGSEMAIL@";
75
76
my $TBDOCBASE = "@TBDOCBASE@";
my $TBBASE   = "@TBBASE@";
77
my $CONTROL  = "@USERNODE@";
78

79
80
81
82
83
84
85
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

86
my $parser   = "$TB/libexec/parse-ns";
87
my $mkexpdir = "$TB/libexec/mkexpdir";
88
my $checkquota = "$TB/sbin/checkquota";
89
my $tbbindir = "$TB/bin/";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
90
my $errorstat=-1;
91
92
my $user_name;
my $user_email;
93
my $dbuid;
94

95
96
97
98
99
100
101
# Be careful not to exit on transient error; 0 means infinite retry.
$libdb::DBQUERY_MAXTRIES = 0;

# For the END block below.
my $cleaning = 0;
my $justexit = 1;
my $signaled = 0;
102
103
104
105
106
107

#
# Turn off line buffering on output
#
$| = 1;

108
109
110
111
112
113
114
#
# Set umask for start/swap. We want other members in the project to be
# able to swap/end experiments, so the log and intermediate files need
# to be 664 since some are opened for append.
#
umask(0002);

115
116
#
# Untaint the path
117
#
118
119
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
120
121
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

122
123
124
125
126
my $eid;
my $pid;
my $gid;
my $description;
my $tempnsfile;
127
128
129
my $swappable    = 1;
my $noswap_reason;
my $idleswap     = 1;
130
my $idleswaptime = 60 * TBGetSiteVar("idle/threshold");
131
my $noidleswap_reason;
132
my $autoswap     = 0;
133
my $autoswaptime = 10 * 60;
134
135
136
my $idleignore   = 0;
my $priority     = TB_EXPTPRIORITY_LOW;
my $exptstate    = EXPTSTATE_NEW();
137
my $batchstate   = BATCHSTATE_UNLOCKED();
138
139
my $now          = localtime();
my $committed    = 0;
140

141
#
142
# Verify user and get his DB uid.
143
#
144
145
146
if (! UNIX2DBUID($UID, \$dbuid)) {
    die("*** $0:\n".
        "    You do not exist in the Emulab Database!\n");
147
148
}

149
150
151
152
153
154
155
156
#
# Get email info for user.
#
if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
    die("*** $0:\n".
        "    Cannot determine your name and email address.\n");
}

157
158
159
160
161
162
163
164
165
166
167
#
# Before doing anything else, check for overquota ... lets not waste
# our time. Make sure user sees the error by exiting with 1.
#
if (system("$checkquota $dbuid") != 0) {
    print STDERR
	"*** $0:\n".
	"    You are over your disk quota on $CONTROL; please cleanup!\n";
    exit(1);
}

168
#
169
# Parse command arguments.
170
#
171
172
173
174
ParseArgs();

#
# Sanity check them.
175
#
176
177
178
179
180
181
if (!defined($pid) || !defined($eid)) {
    usage();
}
if (!defined($gid)) {
    $gid = $pid;
}
182
if (!defined($description)) {
183
184
    $description = "'Created by $dbuid'";
}
185
186
187
188
189
190
if (! $swappable && (!defined($noswap_reason) || $noswap_reason eq "")) {
    die("Must provide a reason with -S option (not swappable reason)!\n");
}
if (! $idleswap && (!defined($noidleswap_reason) || $noidleswap_reason eq "")) {
    die("Must provide a reason with -L option (no idleswap reason)!\n");
}
191
192
193
194
if (!defined($tempnsfile) && !TBAdmin($dbuid)) {
    die("*** $0:\n".
	"    Only admins can create experiments with no NS file\n");
}
195
196
my $nsfile  = "$eid.ns";
my $repfile = "$eid.report";
197

198
199
200
201
202
203
# Defaults for the DB and for the email message. 
$noswap_reason = "'None Given'"
    if (!defined($noswap_reason));
$noidleswap_reason = "'None Given'"
    if (!defined($noidleswap_reason));

204
#
205
# Make sure UID is allowed to create experiments in this project.
206
#
207
208
209
if (! TBProjAccessCheck($dbuid, $pid, $gid, TB_PROJECT_CREATEEXPT)) {
    die("*** $0:\n".
	"    You do not have permission to create experiments in $pid/$gid\n");
210
211
}

212
213
214
215
216
217
218
219
#
# If no NS file, then override swap/idle stuff. Again, might change
# when new forms installed
#
if (!defined($tempnsfile)) {
    $swappable     = 0;
    $idleswap      = 0;
}
220
elsif (! -f $tempnsfile || ! -r $tempnsfile || -z $tempnsfile) {
221
222
223
224
225
    # Exit so that user sees the error, not us.
    print STDERR "*** $0:\n".
	         "    $tempnsfile does not exist or is not a readable file!\n";
    exit(1);
}
226

227
228
229
230
#
# Batch jobs get a shorter idle time
#
my $swaptime = $idleswaptime;
231
if ($batchmode && TBSiteVarExists("idle/batch_threshold")) {
232
233
234
235
236
237
    my $batchidleswaptime = TBGetSiteVar("idle/batch_threshold");
    if ($swaptime > $batchidleswaptime) {
	$swaptime = $batchidleswaptime;
    }
}

238
#
239
240
# Grab me a secret key for accessing tar/rpm files via the web interface.
# Grab another secret key for the event system HMACs.
241
#
242
243
my $webkey   = TBGenSecretKey();
my $eventkey = TBGenSecretKey();
244

245
246
247
248
249
250
251
252
253
#
# In wait mode, block SIGINT until we spin off the background process.
#
if ($waitmode) {
    $SIG{QUIT} = 'IGNORE';
    $SIG{TERM} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}

254
#
255
256
# Create an experiment record. The pid/eid has to be unique, so lock the
# table for the check/insert.
257
#
258
259
260
261
DBQueryFatal("lock tables experiments write, ".
	     "            experiment_stats write, ".
	     "            experiment_resources write, ".
	     "            testbed_stats read");
262

263
264
265
$query_result =
    DBQueryFatal("SELECT pid,eid FROM experiments ".
		 "WHERE eid='$eid' and pid='$pid'");
266

267
268
269
270
271
if ($query_result->numrows) {
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
        "    Experiment $eid in project $pid already exists!\n");
}
272

273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#
# Grab the next highest index to use. We used to use an auto_increment
# field in the table, but if the DB is ever "dropped" and recreated,
# it will reuse indicies that are crossed referenced in the other two
# tables.
#
$query_result =
    DBQueryFatal("select MAX(exptidx) from experiment_stats");
my ($exptidx) = $query_result->fetchrow_array();
$exptidx++;

#
# Lets be really sure!
#
foreach my $table ("experiments", "experiment_stats", "experiment_resources",
		   "testbed_stats") {

    my $slot = (($table eq "experiments") ? "idx" : "exptidx");
	
    $query_result =
	DBQueryFatal("select * from $table where ${slot}=$exptidx");
    
    if ($query_result->numrows) {
	DBQueryWarn("unlock tables");
	die("*** $0:\n".
	    "    Experiment index $exptidx exists in $table; this is bad!\n");
    }
}

302
#
303
304
# Insert the record. This reserves the pid/eid for us. If its a batchmode
# experiment, we will update the record later so that the batch daemon
305
306
# will recognize it. We insert the record as locked and ACTIVATING so that
# no one can mess with the experiment until later. 
307
#
308
if (! DBQueryWarn("INSERT INTO experiments ".
309
		  "(idx, eid, pid, gid, expt_created, expt_name,".
310
		  " expt_head_uid,expt_swap_uid, state, priority, swappable,".
311
		  " idleswap, idleswap_timeout, autoswap, autoswap_timeout,".
312
		  " idle_ignore, keyhash, expt_locked, eventkey,".
313
		  " noswap_reason, noidleswap_reason, batchmode, ".
314
		  " batchstate, linktest_level) ".
315
		  "VALUES ($exptidx, '$eid', '$pid', '$gid', now(), ".
316
		  "$description,'$dbuid', '$dbuid', '$exptstate', $priority, ".
317
		  "$swappable, $idleswap, '$swaptime', $autoswap, ".
318
		  "'$autoswaptime', $idleignore, '$webkey', ".
319
		  "now(), '$eventkey', $noswap_reason, ".
320
321
		  "$noidleswap_reason, $batchmode, '$batchstate', ".
		  "$linktest)")) {
322
323
    DBQueryWarn("unlock tables");
    die("*** $0:\n".
324
	"    DB error inserting experiment record for $pid/$eid!\n");
325
326
}

327
#
328
# Create an experiment_resources record for the above record.
329
#
330
331
332
333
334
335
$query_result =
    DBQueryWarn("insert into experiment_resources (tstamp, exptidx) ".
		"values (now(), $exptidx)");

if (!$query_result) {
    DBQueryWarn("delete from experiments where pid='$pid' and eid='$eid'");
336
    DBQueryWarn("unlock tables");
337
338
    die("*** $0:\n".
	"    DB error inserting experiment resources record for $pid/$eid!");
339
}
340
my $rsrcidx = $query_result->insertid;
341
342
343

#
# Now create an experiment_stats record to match.
344
#
345
if (! DBQueryWarn("insert into experiment_stats ".
346
347
348
349
350
		  "(eid, pid, creator, gid, created, batch, exptidx, rsrcidx) ".
		  "values('$eid', '$pid', '$dbuid', '$gid', now(), ".
		  "$batchmode, $exptidx, $rsrcidx)")) {
    DBQueryWarn("delete from experiments where pid='$pid' and eid='$eid'");
    DBQueryWarn("delete from experiment_resources where idx=$rsrcidx");
351
    DBQueryWarn("unlock tables");
352
353
354
355
356
357
358
359
360
361
    die("*** $0:\n".
	"    DB error inserting experiment stats record for $pid/$eid!");
}

if (! DBQueryWarn("unlock tables")) {
    DBQueryWarn("delete from experiments where pid='$pid' and eid='$eid'");
    DBQueryWarn("delete from experiment_resources where idx=$rsrcidx");
    DBQueryWarn("delete from experiment_stats where exptidx=$exptidx");
    die("*** $0:\n".
	"    DB error unlocking tables!");
362
363
}

364
365
366
367
368
369
#
# At this point, we need to force a cleanup no matter how we exit.
# See the END block below.
#
$justexit = 0;

370
#
371
# Create a directory structure for the experiment.
372
#
373
if (system("$mkexpdir $pid $gid $eid") != 0) {
374
375
376
377
    if (($? >> 8) == EDQUOT()) {
	# Obey exit status protocol for web page; User should see this.
	$errorstat = 1;
    }
378
    fatal("$mkexpdir failed");
379
380
}

381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#
# Grab the working directory path, and thats where we work.
# The user's experiment directory is off in /proj space.
#
my $workdir = TBExptWorkDir($pid, $eid);
my $userdir = TBExptUserDir($pid, $eid);

chdir("$workdir") or
    fatal("Could not chdir to $workdir: $!");

#
# It would be nice to check for overquota during the rest of this
# setup, but thats going to be a pain. Besides, its very unlikely
# that someone goes overquota at this point. Typically, the person
# is already overquota at this point. If this turns out to be wrong,
# it will be easy to add a little test to make sure there is at least
# a reasonable amount of room to proceed (create temp file and stick
# some data in it).
# 

401
402
403
404
405
406
407
408
409
410
#
# Dump the eventkey into a file in the experiment directory. 
#
if ($EVENTSYS) {
    open(KEY, ">" . TBDB_EVENTKEY($pid, $eid)) or
	fatal("Could not create eventkey file: $!");
    print KEY $eventkey;
    close(KEY);
}

411
412
413
414
415
416
# And dump the web key too.
open(KEY, ">" . TBDB_WEBKEY($pid, $eid)) or
    fatal("Could not create webkey file: $!");
print KEY $webkey;
close(KEY);

417
#
418
419
420
# If no NS file, we are done. We must unlock it and reset its state
# appropriately. We leave the experiment in the "new" state so that
# the user is forced to do a modify first (to give it a topology). 
421
#
422
if (!defined($tempnsfile)) {
423
    TBUnLockExp($pid, $eid, EXPTSTATE_NEW());
424
425
    exit(0);
}
426

427
#
428
# Now we can get the NS file!
429
#
430
if (system("/bin/cp", "$tempnsfile", "$nsfile")) {
431
    fatal("Could not copy $tempnsfile to $workdir/$nsfile");
432
}
433
chmod(0664, "$nsfile");
434

435
#
436
437
# Run parse in impotent mode on the NS file.  This has no effect but
# will display any errors.
438
#
439
if (system("$parser -n $zeeopt $pid $gid $eid $nsfile") != 0) {
Leigh B. Stoller's avatar
Leigh B. Stoller committed
440
441
    # Obey exit status protocol for web page.
    $errorstat = 1;
442
    fatal("NS Parse failed!");
443
444
}

445
446
447
#
# Gather statistics; start the clock ticking.
#
448
if ($frontend || $batchmode) {
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_PRELOAD, 0,
		    TBDB_STATS_FLAGS_START);
}
else {
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_START, 0,
		    TBDB_STATS_FLAGS_START);
}

#
# The rest of this goes into the background so that the user sees
# immediate response. We will send email later when the experiment
# is ready. In waitmode, we hold the parent waiting so that the user
# can script it. Must protect against async (^C) termination though.
#
my $logname = TBExptCreateLogFile($pid, $eid, "startexp");
TBExptSetLogFile($pid, $eid, $logname);
TBExptOpenLogFile($pid, $eid);
    
if (my $childpid = TBBackGround($logname)) {
    #
469
470
    # Parent exits normally, unless in waitmode. We have to set
    # justexit to make sure the END block below does not run.
471
    #
472
473
    $justexit = 1;
    
474
475
476
    if (!$waitmode) {
	print("Experiment $pid/$eid is now configuring\n".
 	      "You will be notified via email when the experiment is ".
477
478
	      "ready to use\n")
	    if (! $quiet);
479
480
	exit(0);
    }
481
    print("Waiting for " . ($batchmode ? "batch " : "") . "experiment $eid ".
482
483
484
485
486
487
488
	  "to finish " . ($frontend ? "preloading." : "swapping in.") . "\n")
	if (! $quiet);

    if (isatty(STDIN) && !$quiet) {
	print("You may type ^C at anytime; you will be notified via email.\n".
	      "You will not actually interrupt the experiment itself.\n");
    }
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

    # Give child a chance to run.
    select(undef, undef, undef, 0.25);
	
    #
    # Reset signal handlers. User can now kill this process, without
    # stopping the child.
    #
    $SIG{TERM} = 'DEFAULT';
    $SIG{QUIT} = 'DEFAULT';
    $SIG{INT}  = 'DEFAULT';

    #
    # Wait until child exits or until user gets bored and types ^C.
    #
    waitpid($childpid, 0);

506
507
    print("Done. Exited with status: $?\n")
	if (! $quiet);
508
509
    exit($? >> 8);
}
510
TBdbfork();
511
512
513
514
515
516
517
518
519
520
521
522

# We are committed now. Affects how fatal() operates.
$committed = 1;

#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
#
# We need to catch TERM cause sometimes shit happens and we have to kill
# an experiment setup that is hung or otherwise scrogged. Rather then 
# trying to kill off the children one by one, lets arrange to catch it
# here and send a killpg to the children. This is not to be done lightly,
# cause it can leave things worse then they were before!
#
sub handler ($) {
    my ($signame) = @_;
    
    $SIG{TERM} = 'IGNORE';
    my $pgrp = getpgrp(0);
    kill('TERM', -$pgrp);
    sleep(1);
    $signaled = 1;
    fatal("Caught SIG${signame}! Killing experiment setup ...");
}
$SIG{TERM} = \&handler;
$SIG{QUIT} = 'DEFAULT';

543
544
#
# The guts of starting an experiment!
545
#
546
547
# A batch experiment is essentially preloaded (frontend mode) and then
# dropped into the batch queue, unless the user requested only preload.
548
549
#

550
#
551
# Run the various scripts. We want to propagate the error from tbprerun
552
553
# and tbrun back out, hence the bogus looking errorstat variable.
#
554
555
SetExpState($pid, $eid, EXPTSTATE_PRERUN)
    or fatal("Failed to set experiment state to " . EXPTSTATE_PRERUN());
556
557
558
559
$zeeopt = "-z"
    if ($zeemode);
print "Running 'tbprerun $zeeopt $pid $eid $nsfile'\n";
if (system("$tbbindir/tbprerun $zeeopt $pid $eid $nsfile") != 0) {
560
    $errorstat = $? >> 8;
561
    fatal("tbprerun failed!");
562
}
563
564
SetExpState($pid, $eid, EXPTSTATE_SWAPPED)
    or fatal("Failed to set experiment state to " . EXPTSTATE_SWAPPED());
565
566
567
568

#
# If not in frontend mode (preload only) continue to swapping exp in.
# 
569
if (! ($frontend || $batchmode)) {
570
571
    SetExpState($pid, $eid, EXPTSTATE_ACTIVATING)
	or fatal("Failed to set experiment state to ". EXPTSTATE_ACTIVATING());
572
573
574
575

    print "Running 'tbswap in $pid $eid'\n";
    if (system("$tbbindir/tbswap in $pid $eid") != 0) {
	$errorstat = $? >> 8;
576
	fatal("tbswap in failed!");
577
    }
578
579
    SetExpState($pid, $eid, EXPTSTATE_ACTIVE)
	or fatal("Failed to set experiment state to " . EXPTSTATE_ACTIVE());
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605

    #
    # Look for the unsual case of more than 2 nodes and no vlans. Send a
    # warning message.
    #
    my @localnodes = ExpNodes($pid, $eid, 1);
    
    if (defined(@localnodes) && scalar(@localnodes) > 2) {
	my $vlans_result =
	    DBQueryFatal("select pid from virt_lans ".
			 "where pid='$pid' and eid='$eid'");
    
	if (!$vlans_result->numrows) {
	    SENDMAIL("$user_name <$user_email>",
		     "WARNING: Experiment Configuration: $pid/$eid",
		     "This experiment has zero network links defined.\n".
		     "Please check your NS file to verify this is what you ".
		     "want!\n",
		     "$user_name <$user_email>",
		     "Cc: $TBOPS", ($nsfile));
	}
    }
}

# We append this report in the email message below.
if (system("$tbbindir/tbreport -b $pid $eid 2>&1 > $repfile") != 0) {
606
    fatal("tbreport failed!");
607
608
609
610
611
612
613
614
615
616
617
618
619
}

#
# Try to copy off the files for testbed information gathering.
#
TBSaveExpLogFiles($pid, $eid);

#
# Make a copy of the work dir in the user visible space so the user
# can see the log files.
#
system("cp -Rfp $workdir/ $userdir/tbdata");

620
621
622
623
624
625
626
627
628
629
#
# Gather statistics.
#
if ($frontend || $batchmode) {
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_PRELOAD, 0);
}
else {
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_START, 0);
}

630
631
632
633
634
635
636
637
638
639
#
# Old accounting info (still used in web interface and autoswap).
#
TBSetExpSwapTime($pid, $eid);

#
# Set the swapper uid on success only, and *after* gathering swap stats!
#
TBExptSetSwapUID($pid, $eid, $dbuid);

640
641
642
643
644
645
646
647
#
# Close up the log file so the webpage stops.
#
TBExptCloseLogFile($pid, $eid);

#
# Must unlock and drop batch experiments into the queue before exit.
#
648
if ($batchmode && !$frontend) {
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
    TBUnLockExp($pid, $eid, EXPTSTATE_QUEUED());
}
else {
    TBUnLockExp($pid, $eid);
}

#
# Clear the cancel flag now that the operation is complete. Must be
# done after we change the experiment state (above).
#
TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR());

#
# Dump the report file and the log file to the user via email. 
#
664
# Yuck, we want the created time that mysql assigned at the insertion.
665
666
#
$query_result =
667
    DBQueryFatal("select expt_created from experiments ".
668
		 "where pid='$pid' and eid='$eid'");
669
my ($expt_created) = $query_result->fetchrow_array();
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
my $message;

if ($frontend) {
    $message =
	"Your experiment `$eid' in project `$pid' has been created.\n" .
	"You can check the web interface to see if it looks the way\n" .
	"you expected it to. If so, you may swap the experiment in,\n" .
	"or terminate it, at any time.\n" .
        "\n";
}
else {
    $message =
	"Your experiment `$eid' in project `$pid' has been started.\n" .
	"Here is the experiment summary detailing the nodes that were\n" .
	"allocated to you. You may use the `Qualified Name' to log on\n" .
	"to your nodes. See /etc/hosts on your nodes (when running\n" .
	"FreeBSD, Linux, or NetBSD) for the IP mapping on each node.\n" .
        "\n";
688
}
689
690
691
692
693
694
695
696
$message .=
    "User:        $user_name\n" .
    "EID:         $eid\n" .
    "PID:         $pid\n" .
    "GID:         $gid\n" .
    "Description: $description\n" .
    "Swappable:   " . ($swappable ? "Yes\n" :
		                    "No  (Reason: $noswap_reason)\n") .
697
    "Idle-Swap:   " . ($idleswap  ? "Yes, at " . $idleswaptime/60.0 . " hours\n":
698
		                    "No  (Reason: $noidleswap_reason)\n") .
699
    "Auto-Swap:   " . ($autoswap  ? "Yes, at " . $autoswaptime/60.0 . " hours\n":
700
701
702
703
704
705
706
707
708
709
710
711
712
713
		                    "No\n") .
    "Created:     $expt_created\n".
    "Directory:   $userdir\n".
    "\n".
    "Appended at the end is the output of the experiment setup. If you\n" .
    "have any questions or comments, please include the output below\n" .
    "in your message to $TBOPS";

SENDMAIL("$user_name <$user_email>",
	 "New Experiment " . (($frontend == 0) ? "Started" : "Created") .
	 ": $pid/$eid",
	 $message,
	 "$user_name <$user_email>",
	 "Bcc: $TBLOGS",
714
715
	 ($repfile, $logname, $nsfile))
    if (!$zeemode);
716
717

# Done!
718
exit(0);
719

720
721
722
#
#
#
723
sub cleanup()
724
{
725
    #
726
727
728
729
730
731
732
    # Failed early (say, in parsing). No point in keeping any of the
    # stats or resource records. Just a waste of space since the
    # testbed_stats log indicates there was a failure and why (sorta,
    # via the exit code).
    # 
    if (!$committed) {
	#
733
	# Clear the experiment record and cleanup directories
734
735
	#
	TBExptDestroy($pid, $eid);
736
737
738
739
740

	#
	# Now we can clean up the stats records. 
	#
	DBQueryWarn("DELETE from experiment_resources ".
741
		    "WHERE idx=$rsrcidx");
742
743
744
745
746
747
748
749
	
	DBQueryWarn("DELETE from testbed_stats ".
		    "WHERE exptidx=$exptidx");

	# This must be last cause it provides the unique exptidx above.
	DBQueryWarn("DELETE from experiment_stats ".
		    "WHERE eid='$eid' and pid='$pid' and exptidx=$exptidx");

750
	return;
751
    }
752

753
754
755
756
757
758
759
760
761
    #
    # Gather statistics.
    #
    if ($frontend) {
	GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_PRELOAD, $errorstat);
    }
    else {
	GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_START, $errorstat);
    }
762

763
764
765
766
    #
    # Must clean up the experiment if it made it our of NEW state.
    #
    my $estate = ExpState($pid, $eid);
767
768
769
770
771
772
773
774
    if ($estate ne EXPTSTATE_NEW) {
	#
	# We do not know exactly where things stopped, so if the
	# experiment was activating when the signal was delivered,
	# run tbswap on it. 
	# 
	if ($estate eq EXPTSTATE_ACTIVE ||
	    ($estate eq EXPTSTATE_ACTIVATING && $signaled)) {
775
	    print "Running 'tbswap out -force $pid $eid'\n";
776
	    if (system("$tbbindir/tbswap out -force $pid $eid") != 0) {
777
778
		print "tbswap out failed!\n";
	    }
779
	    SetExpState($pid, $eid, EXPTSTATE_SWAPPED);
780
	}
781
	
782
783
784
	print "Running 'tbend -force $pid $eid'\n";
	if (system("$tbbindir/tbend -force $pid $eid") != 0) {
	    print "tbend failed!\n";
785
786
	}
    }
787
    SetExpState($pid, $eid, EXPTSTATE_TERMINATED);
788

789
    #
790
791
792
793
794
795
796
797
798
799
800
    # Okay, we *are* going to terminate the experiment.
    # 
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_TERMINATE, 0);

    # Clear the logfile so the webpage stops. 
    TBExptClearLogFile($pid, $eid);

    #
    # Send a message to the testbed list. 
    #
    SENDMAIL("$user_name <$user_email>",
801
802
803
804
805
806
807
808
809
810
811
812
813
814
	 "Experiment Configure Failure: $pid/$eid",
	 "Please look at the log below to see what happened. If the error\n".
	 "resulted from a lack of free nodes, you can use this web page to\n".
	 "get a summary of free nodes:\n\n".
	 "  ${TBBASE}/nodecontrol_list.php3\n".
	 "\n".
	 "Please do not try again until you see enough nodes free. Or, you\n".
	 "can use the batch system to swap your experiment in when enough\n".
	 "nodes are free:\n\n".
	 "  ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html".
	     "#BatchMode\n",
	 "$user_name <$user_email>",
	 "Cc: $TBOPS",
	 ($logname, "assign.log", "wanassign.log", $nsfile));
815

816
    #
817
818
819
820
    # Back up the work dir for post-mortem debugging. 
    #
    system("/bin/rm -rf  ${workdir}-failed");
    system("/bin/mv -f   $workdir ${workdir}-failed");
821

822
823
824
825
    #
    # Clear the record and cleanup.
    # 
    TBExptDestroy($pid, $eid);    
826
827
828
}

#
829
830
# Parse command arguments. Once we return from getopts, all that should
# left are the required arguments.
831
#
832
sub ParseArgs()
833
{
834
835
836
837
    my %options = ();
    if (! getopts($optlist, \%options)) {
	usage();
    }
838

839
    if (@ARGV > 1) {
840
841
	usage();
    }
842
843
    if (@ARGV == 1) {
	$tempnsfile = $ARGV[0];
844

845
	# Note different taint check (allow /).
846
	if ($tempnsfile =~ /^([-\w\.\/]+)$/) {
847
848
849
	    $tempnsfile = $1;
	}
	else {
850
851
	    die("*** $0:\n".
		"    Bad data in nsfile: $tempnsfile\n");
852
	}
853

854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
	#
	# Called from ops interactively. Make sure NS file in /proj or /users.
	#
	# Use realpath to resolve any symlinks.
	#
	my $translated = `realpath $tempnsfile`;
	if ($translated =~ /^([-\w\.\/]+)$/) {
	    $tempnsfile = $1;
	}
	else {
	    die("*** $0:\n".
		"    Bad data returned by realpath: $translated\n");
	}

	#
	# The file must reside in /proj, /groups, or /users. Since this script
	# runs as the caller, regular file permission checks ensure its a file
	# the user is allowed to use. /tmp/$pid-$eid.nsfile.XXXXX also allowed
	# since this script is invoked directly from web interface.
	#
	if (! ($tempnsfile =~ /^\/tmp\/[-\w]+-\d+\.nsfile/) &&
875
	    ! ($tempnsfile =~ /^\/tmp\/\d+\.ns/) &&
876
877
878
879
880
881
882
883
884
885
886
887
	    ! ($tempnsfile =~ /^\/var\/tmp\/php\w+/) &&
	    ! ($tempnsfile =~ /^\/proj/) &&
	    ! ($tempnsfile =~ /^\/groups/) &&
	    ! ($tempnsfile =~ /^\/users/)) {
	    print STDERR
		"*** $0:\n".
		"    $tempnsfile does not resolve to an allowed directory!\n";
	    # Note positive status; so error goes to user not tbops.
	    exit(1);
	}
    }
    
888
    if (defined($options{"i"})) {
889
	$batchmode = 0;
890
    }
891
892
893
    if (defined($options{"f"})) {
	$frontend = 1;
    }
894
895
896
    if (defined($options{"q"})) {
	$quiet = 1;
    }
897
898
899
900
    if (defined($options{"z"})) {
	$zeemode = 1;
	$zeeopt  = "-p";
    }
901
902
903
904
    # This option should not be exported via the XMLRPC server. 
    if (defined($options{"n"})) {
	$idleignore = 1;
    }
905
906
907
908

    #
    # pid,eid,gid get passed along as shell commands args; must taint check.
    # 
909
910
    if (defined($options{"p"})) {
	$pid = $options{"p"};
911

912
	if ($pid =~ /^([-\w]+)$/) {
913
914
	    $pid = $1;
	}
915
916
	else {
	    die("Bad data in argument: $pid.");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
917
	}
918
919
920
921
    }
    if (defined($options{"e"})) {
	$eid = $options{"e"};

922
	if ($eid =~ /^([-\w]+)$/) {
923
924
925
926
	    $eid = $1;
	}
	else {
	    die("Bad data in argument: $eid.");
927
	}
928
929
930
931
	if (! TBcheck_dbslot($eid, "experiments", "eid",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper experiment name (id)!\n");
	}
932
933
934
935
    }
    if (defined($options{"g"})) {
	$gid = $options{"g"};

936
	if ($gid =~ /^([-\w]+)$/) {
937
	    $gid = $1;
938
	}
939
940
	else {
	    die("Bad data in argument: $gid.");
941
942
	}
    }
943
    if (defined($options{"E"})) {
944
945
946
947
948
	if (! TBcheck_dbslot($options{"E"}, "experiments", "expt_name",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper experiment description!\n");
	}
	$description = DBQuoteSpecial($options{"E"});
949
    }
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
    if (defined($options{"S"})) {
	if (! TBcheck_dbslot($options{"S"}, "experiments", "noswap_reason",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper noswap reason!\n");
	}
	$swappable     = 0;
	$noswap_reason = DBQuoteSpecial($options{"S"});
    }
    if (defined($options{"L"})) {
	if (! TBcheck_dbslot($options{"L"}, "experiments", "noidleswap_reason",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper noidleswap reason!\n");
	}
	$idleswap          = 0;
	$noidleswap_reason = DBQuoteSpecial($options{"L"});
965
    }
966
    if (defined($options{"l"})) {
967
968
969
970
971
	if (! TBcheck_dbslot($options{"l"}, "experiments", "idleswap_timeout",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper idleswap timeout!\n");
	}
        $idleswap     = 1;
972
973
974
	$idleswaptime = $options{"l"};
    }
    if (defined($options{"a"})) {
975
976
977
978
979
	if (! TBcheck_dbslot($options{"a"}, "experiments", "autoswap_timeout",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper autoswap timeout!\n");
	}
        $autoswap     = 1;
980
981
	$autoswaptime = $options{"a"};
    }
982
983
984
985
986
987
988
    if (defined($options{"t"})) {
	if (! TBcheck_dbslot($options{"t"}, "experiments", "linktest_level",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    die("Improper linktest level!\n");
	}
	$linktest = $options{"t"};
    }
989
990
    if (defined($options{"w"})) {
	$waitmode = 1;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
991
    }
992
}
993
994
995
996
997
998
999
1000

#
# We need this END block to make sure that we clean up after a fatal
# exit in the library. This is problematic, cause we could be exiting
# cause the mysql server has gone whacky again. 
#
sub fatal($)
{