batchexp.in 40.5 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh Stoller's avatar
Leigh Stoller committed
2
#
Leigh Stoller's avatar
Leigh Stoller committed
3
# Copyright (c) 2000-2018 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh Stoller's avatar
Leigh Stoller committed
23
#
24 25
use English;
use Getopt::Std;
26
use POSIX qw(isatty setsid);
27
use POSIX qw(strftime);
28
use Errno qw(EDQUOT ENOSPC);
29
use RPC::XML;
30
use Cwd qw(realpath);
31 32

#
33 34 35 36
# Create an experiment. The experiment is either run immediately, or
# placed into the batch system to be run later. If no NS file is
# supplied, an experiment shell is created (this is currently an admin
# only option).
37
#
38
# TODO: Remove expt_expires and priority.
39
#       Add calls to check_slot() to verify inputs.
40
# 
Leigh Stoller's avatar
Leigh Stoller committed
41 42 43 44 45 46 47 48 49
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Everything okay.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
50 51
sub usage()
{
52
    print(STDERR
53
	  "Usage: batchexp [-q] [-i [-w]] [-n] [-f] [-N] [-E description] [-g gid]\n".
54
	  "                [-S reason] [-L reason] [-a <time>] [-l <time>]\n".
55
	  "                -p <pid> -e <eid> <nsfile>\n".
56
	  "switches and arguments:\n".
57 58 59
	  "-i       - swapin immediately; by default experiment is batched\n".
	  "-w       - wait for non-batchmode experiment to preload or swapin\n".
	  "-f       - preload experiment (do not swapin or queue yet)\n".
60
	  "-q       - be less chatty\n".
61 62
	  "-S <str> - Experiment cannot be swapped; must provide reason\n".
	  "-L <str> - Experiment cannot be IDLE swapped; must provide reason\n".
63
	  "-n       - Do not send idle email (internal option only)\n".
64 65
	  "-a <nnn> - Auto swapout nnn minutes after experiment is swapped in\n".
	  "-l <nnn> - Auto swapout nnn minutes after experiment goes idle\n".
66
	  "-s       - Save disk state on swapout\n".
67 68 69 70
	  "-E <str> - A pithy sentence describing your experiment\n".
	  "-p <pid> - The project in which to create the experiment\n".
	  "-g <gid> - The group in which to create the experiment\n".
	  "-e <eid> - The experiment name (unique, alphanumeric, no blanks)\n".
71
	  "-N       - Suppress most email to the user and testbed-ops\n".
72 73 74 75
	  "<nsfile> - NS file to parse for experiment.\n");
    exit(-1);
}

76
sub ParseArgs();
77 78
sub CheckCopyArgs();
sub CopyInArchive();
79
sub fatal($;$);
80

81
my $optlist = "iE:g:e:p:S:L:a:l:sfwqt:nzc:bx:y:h:jkNXRC:"; # Enough options?
82
my $batchmode= 1;
83 84
my $frontend = 0;
my $waitmode = 0;
85
my $quiet    = 0;
86
my $lockdown = 0;
87
my $linktest = 0;	# non-zero means level to run at.
88 89
my $zeemode  = 0;	# Hey, out of options.
my $zeeopt   = "";	# To pass along.
90
my $fromrpc  = 0;	# Invoked from XMLRPC server.
91
my $genimode;           # Allow creation with no NS file, uuid given.
92
my $savestate= 0;
93
my $nonsfile = 0;	# Admin only option to activate experiment anyway.
94 95 96
my $template;		# New stuff; experiment templates.
my $branch_template;    # New stuff; experiment templates.
my $instance;		# New stuff; experiment templates.
97
my $creator;		# Create experiment record as this user.
98 99 100 101 102 103 104 105 106
# All of these are for experiment dup and branch. Really mucks things up.
# These globals are set when we parse the -c argument, but used later
my $copyfrom;		# Copy from where, archive or current experiment.
my $copyarg;            # The -c argument.
my $copyidx;            # The index of the experiment copied.
my $copypid;		# The pid of the experiment copied.
my $copyeid;		# The eid of the experiment copied.
my $copytag;		# The archive tag to us.
my $copydir;		# Directory extracted from archive, to delete.
107 108 109 110 111

#
# Configure variables
#
my $TB       = "@prefix@";
112
my $EVENTSYS = @EVENTSYS@;
113 114
my $TBOPS    = "@TBOPSEMAIL@";
my $TBLOGS   = "@TBLOGSEMAIL@";
115 116
my $TBDOCBASE = "@TBDOCBASE@";
my $TBBASE   = "@TBBASE@";
117
my $CONTROL  = "@USERNODE@";
118
my $ISFS     = ("@BOSSNODE_IP@" eq "@FSNODE_IP@") ? 1 : 0;
119

120 121 122 123 124 125
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
126
use libtblog;
127
use libArchive;
128
use Experiment;
129
use Template;
130
use User;
131
use Project;
132
use Group;
133

134 135 136 137
my $parser      = "$TB/libexec/parse-ns";
my $checkquota  = "$TB/sbin/checkquota";
my $tbbindir    = "$TB/bin/";
my $RSYNC	= "/usr/local/bin/rsync";
Leigh Stoller's avatar
Leigh Stoller committed
138
my $errorstat=-1;
139
my $exptidx;
140
my $logfile;
141
my $logname;
142

143 144 145 146
# For the END block below.
my $cleaning = 0;
my $justexit = 1;
my $signaled = 0;
147 148 149 150 151 152

#
# Turn off line buffering on output
#
$| = 1;

153 154 155 156 157 158 159
#
# Set umask for start/swap. We want other members in the project to be
# able to swap/end experiments, so the log and intermediate files need
# to be 664 since some are opened for append.
#
umask(0002);

160 161
#
# Untaint the path
162
#
163 164
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
165 166
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

167 168 169 170 171
my $eid;
my $pid;
my $gid;
my $description;
my $tempnsfile;
172 173 174
my $swappable    = 1;
my $noswap_reason;
my $idleswap     = 1;
175
my $idleswaptime = 60 * TBGetSiteVar("idle/threshold");
176
my $noidleswap_reason;
177 178 179
my $autoswap     = TBGetSiteVar("general/autoswap_mode") ? 1 : 0;
my $autoswaptime = 60 * TBGetSiteVar("general/autoswap_threshold");
my $AUTOSWAPMAX  = 60 * TBGetSiteVar("general/autoswap_max");
180 181 182
my $idleignore   = 0;
my $priority     = TB_EXPTPRIORITY_LOW;
my $exptstate    = EXPTSTATE_NEW();
183
my $batchstate   = BATCHSTATE_UNLOCKED();
184 185
my $now          = localtime();
my $committed    = 0;
186 187
my $experiment;
my $copy_experiment;
188
my $noemail      = 0;
189
my $xmlout       = 0;
190

191
#
192
# Verify user and get his DB uid and other info for later.
193
#
194 195 196
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
197
}
198 199 200 201
my $user_dbid  = $this_user->dbid();
my $user_uid   = $this_user->uid();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
202

203 204 205 206 207 208 209 210 211 212 213 214 215 216
#
# Lets check to make sure user did not delete their home dir.
# Maybe in the future, when exports_setup does not cause all
# mounts to hiccup.
#
if (0) {
    my $homdirerror;
    if ($this_user->HomeDirOkay(\$homdirerror) != 0) {
	tberror({cause => 'user', type => 'primary', severity => SEV_ERROR},
		$homdirerror);
	exit(1);
    }
}

217
#
218
# Parse command arguments.
219
#
220
ParseArgs();
221
CheckCopyArgs();
222 223 224

#
# Sanity check them.
225
#
226 227 228
usage()
    if (!defined($pid) || !defined($eid));
usage()
229
    if (defined($tempnsfile) && defined($copyarg));
230

231 232 233 234 235 236 237 238 239
#
# Before doing anything else, check for overquota ... lets not waste
# our time. Make sure user sees the error by exiting with 1.
#
if (system("$checkquota -p $pid $user_uid") != 0) {
    tberror({cause => 'user', type => 'primary', severity => SEV_ERROR,
	     error => ['over_disk_quota', $CONTROL]},
	    "You are over your disk quota on $CONTROL; ".
	    "please login there and cleanup!");
240
    exit(ENOSPC);
241 242
}

243 244 245 246 247
if (! -e "$TB/expinfo/$pid") {
    die("*** $0:\n".
	"    $TB/expinfo/$pid has not been created yet!\n".
	"    Did you run 'fixexpinfo' like you were supposed to?\n");
}
248 249 250
if (!defined($creator)) {
    $creator = $this_user;
}
251 252 253
if (!defined($gid)) {
    $gid = $pid;
}
254
if (!defined($description)) {
255
    $description = "Created by $user_uid";
256
}
257
if (!$swappable && !(TBAdmin() || $genimode)) {
258 259
    tbdie("Only testbed admins can disable swapping!");
}
260
if (! $swappable && (!defined($noswap_reason) || $noswap_reason eq "")) {
261
    tbdie("Must provide a reason with -S option (not swappable reason)!");
262
}
263
if (! $idleswap && (!defined($noidleswap_reason) || $noidleswap_reason eq "")){
264
    tbdie("Must provide a reason with -L option (no idleswap reason)!");
265
}
266 267
if (!defined($tempnsfile) && !defined($copyarg)
    && !TBAdmin() && !defined($genimode)) {
268
    tbdie("Only admins can create experiments with no NS file");
269
}
270
my $nsfile  = "$eid.ns";
271
my $repfile = "report";
272

273
# Defaults for the DB and for the email message. 
274
$noswap_reason = "None Given"
275
    if (!defined($noswap_reason));
276
$noidleswap_reason = "None Given"
277 278
    if (!defined($noidleswap_reason));

279 280 281 282 283 284
# Need the group
my $group = Group->Lookup($pid, $gid);
if (!defined($group)) {
    die("No such group $pid/$gid!");
}

285
#
286
# Make sure UID is allowed to create experiments in this project.
287
#
288
if (! $group->AccessCheck($this_user, TB_PROJECT_CREATEEXPT)) {
289
    die("You do not have permission to create experiments in $pid/$gid");
290 291
}

292 293 294
if (defined($copyarg)) {
    # This will be handled below.
    ;
295 296
}
elsif (!defined($tempnsfile)) {
297 298 299 300 301
    if (! $genimode) {
	# If no NS file, then override swap/idle stuff. 
	$swappable     = 0;
	$idleswap      = 0;
    }
302
}
303
elsif (! -f $tempnsfile || ! -r $tempnsfile || -z $tempnsfile) {
304
    # Exit so that user sees the error, not us.
305 306 307
    tberror({type => 'primary', severity => SEV_ERROR,
	     error => ['bogus_ns_file', $tempnsfile]},
	    "$tempnsfile does not exist or is not a readable file!");
308 309
    exit(1);
}
310

311 312 313 314
#
# Batch jobs get a shorter idle time
#
my $swaptime = $idleswaptime;
315
if ($batchmode && TBSiteVarExists("idle/batch_threshold")) {
316 317 318 319 320 321
    my $batchidleswaptime = TBGetSiteVar("idle/batch_threshold");
    if ($swaptime > $batchidleswaptime) {
	$swaptime = $batchidleswaptime;
    }
}

322
#
323 324
# Grab me a secret key for accessing tar/rpm files via the web interface.
# Grab another secret key for the event system HMACs.
325
#
326 327
my $webkey   = TBGenSecretKey();
my $eventkey = TBGenSecretKey();
328

329 330 331 332 333 334 335 336 337
#
# In wait mode, block SIGINT until we spin off the background process.
#
if ($waitmode) {
    $SIG{QUIT} = 'IGNORE';
    $SIG{TERM} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}

338
#
339 340 341 342
# Create an arg array of parameters.
#
my %args = ();

343 344 345 346
$args{'expt_head_uid'}	   = $creator->uid();
$args{'expt_swap_uid'}	   = $creator->uid();
$args{'creator_idx'}	   = $creator->dbid();
$args{'swapper_idx'}	   = $creator->dbid();
347 348 349 350
$args{'state'}		   = $exptstate;
$args{'priority'}	   = $priority;
$args{'swappable'}	   = $swappable;
$args{'idleswap'}	   = $idleswap;
351
$args{'idleswap_timeout'}  = ($genimode ? 2 * $swaptime : $swaptime);
352 353 354 355
$args{'autoswap'}	   = $autoswap;
$args{'autoswap_timeout'}  = $autoswaptime;
$args{'idle_ignore'}	   = $idleignore;
$args{'keyhash'}	   = $webkey;
356
$args{'lockdown'}	   = $lockdown;
357 358 359 360 361
$args{'eventkey'}	   = $eventkey;
$args{'batchmode'}	   = $batchmode;
$args{'batchstate'}	   = $batchstate;
$args{'linktest_level'}    = $linktest;
$args{'savedisk'}	   = $savestate;
362
$args{'instance_idx'}	   = (defined($instance) ? $instance->idx() : 0);
363 364 365 366
# These are special; the library will DBQuote them. 
$args{'expt_name'}	   = $description;
$args{'noswap_reason'}	   = $noswap_reason;
$args{'noidleswap_reason'} = $noidleswap_reason;
367 368
$args{'eid_uuid'}          = $genimode
    if (defined($genimode));
369 370

# Now create the experiment; we get back a perl class instance.
371
if (! ($experiment = Experiment->Create($group, $eid, \%args))) {
372 373 374
    tbdie({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['create_experiment_record_failed']},
	  "Could not create a new experiment record!");
375 376
}

377 378 379 380 381 382
#
# At this point, we need to force a cleanup no matter how we exit.
# See the END block below.
#
$justexit = 0;

383 384 385 386 387
#
# Set error reporting info
# 
tblog_set_info($pid,$eid,$UID);

Leigh Stoller's avatar
Leigh Stoller committed
388 389 390 391 392 393 394 395 396
#
# Create the per-experiment RSA key pair and derived ssh pubkey
#
if ($experiment->GenerateKeys() != 0) {
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['create_experiment_keys_failed']},
	  "Failed to create experiment RSA/SSH keys");
}

397
#
398
# Create a directory structure for the experiment.
399 400 401 402 403
#
if ($experiment->CreateDirectory() != 0) {
    if (($? >> 8) == EDQUOT()) {
	# Obey exit status protocol for web page; User should see this.
	$errorstat = 1;
404
    }
405 406 407
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['create_experiment_directory_failed']},
	  "Failed to created experiment directory");
408
}
409 410 411
if (defined($instance)) {
    # Need to cross-mark the instance right away so that it is flagged.
    # Would be better to do this with a plain flag.
412
    my %args = ();
413
    $args{'exptidx'} = $experiment->idx();
414
    
415 416 417 418 419
    $instance->Update(0, \%args) == 0
	or fatal("Could not update experiment instance record!");
}
elsif (defined($template)) {
    # Tell the template with the new experiment index.
420 421 422 423 424
    %args = ();
    $args{'exptidx'} = $experiment->idx();

    $template->Update(\%args) == 0
	or fatal("Could not update template record!");
425 426
}

427 428 429 430
#
# Grab the working directory path, and thats where we work.
# The user's experiment directory is off in /proj space.
#
431 432
my $workdir = $experiment->WorkDir();
my $userdir = $experiment->UserDir();
433 434 435 436

chdir("$workdir") or
    fatal("Could not chdir to $workdir: $!");

437 438 439 440
#
# Create a new archive, which might actually be a branch of an existing one
# when doing an experiment branch (fork).
#
441 442 443
if (defined($branch_template)) {
    my $archive_eid = $branch_template->eid();
    
444 445 446
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['archive_op_failed', 'create', undef, undef]},
	  "Could not create experiment archive!")
447 448 449 450
	if (libArchive::TBForkExperimentArchive($pid, $eid,
						$pid, $archive_eid, undef)
	    < 0);
}
451
elsif (libArchive::TBCreateExperimentArchive($pid, $eid) < 0) {
452 453 454
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['archive_op_failed', 'create', undef, undef]},
	  "Could not create experiment archive!");
455 456 457 458 459 460 461 462 463 464 465 466
}

#
# Okay, if copying/branching an experiment, we have to go find the
# NS file, extracting the special (currently by convention) archive
# directory into the new experiment. This will set the tempnsfile
# variable needed below.
#
if ($copyarg) {
    CopyInArchive();
}

467 468 469 470 471 472 473 474 475 476
#
# It would be nice to check for overquota during the rest of this
# setup, but thats going to be a pain. Besides, its very unlikely
# that someone goes overquota at this point. Typically, the person
# is already overquota at this point. If this turns out to be wrong,
# it will be easy to add a little test to make sure there is at least
# a reasonable amount of room to proceed (create temp file and stick
# some data in it).
# 

477 478 479 480
#
# Dump the eventkey into a file in the experiment directory. 
#
if ($EVENTSYS) {
481
    open(KEY, ">" . $experiment->EventKeyPath()) or
482 483 484 485 486
	fatal("Could not create eventkey file: $!");
    print KEY $eventkey;
    close(KEY);
}

487
# And dump the web key too.
488
open(KEY, ">" . $experiment->WebKeyPath()) or
489 490 491 492
    fatal("Could not create webkey file: $!");
print KEY $webkey;
close(KEY);

493
#
494 495 496
# If no NS file, we are done. We must unlock it and reset its state
# appropriately. We leave the experiment in the "new" state so that
# the user is forced to do a modify first (to give it a topology). 
497
#
498
if (!defined($tempnsfile)) {
499 500 501 502 503 504 505 506 507 508
    #
    # In GeniMode, we skip past the nsfile stuff.
    #
    if (defined($genimode)) {
	$nonsfile = 1;
    }
    if (!$nonsfile) {
	$experiment->Unlock(EXPTSTATE_NEW());
	exit(0);
    }
509
}
510

511 512 513 514 515 516 517 518 519 520
if (!$nonsfile) {
    #
    # Now we can get the NS file!
    #
    if (system("/bin/cp", "$tempnsfile", "$nsfile")) {
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['copy_ns_file_failed', $tempnsfile, $nsfile]},
	      "Could not copy $tempnsfile to $workdir/$nsfile");
    }
    chmod(0664, "$nsfile");
521

522 523 524 525 526 527
    # Future; do not name the ns as above, but use generic name.
    if (system("/bin/cp", "$tempnsfile", "nsfile.ns")) {
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['copy_ns_file_failed', $tempnsfile, "nsfile.ns"]},
	      "Could not copy $tempnsfile to $workdir/$nsfile");
    }
528 529 530 531 532 533 534 535 536 537
}
# We created this file below so kill it.
if ($copyarg) {
    # This is tainted for reasons I do not understand.
    if ($tempnsfile =~ /^([-\w\.\/]+)$/) {
	$tempnsfile = $1;
    }
    unlink($tempnsfile);
}

538
#
539 540
# Run parse in impotent mode on the NS file.  This has no effect but
# will display any errors.
541
#
542 543
if (!$nonsfile &&
    system("$parser -n $zeeopt $pid $gid $eid $nsfile") != 0) {
Leigh Stoller's avatar
Leigh Stoller committed
544 545
    # Obey exit status protocol for web page.
    $errorstat = 1;
546 547 548
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['ns_parse_failed']},
	  "NS Parse failed!");
549 550
}

551 552 553
#
# Gather statistics; start the clock ticking.
#
554
if ($frontend || $batchmode || $genimode) {
Leigh Stoller's avatar
Leigh Stoller committed
555
    $experiment->PreSwap($this_user, TBDB_STATS_PRELOAD, $exptstate) == 0 or
556
	fatal("Preswap failed!");
557 558
}
else {
Leigh Stoller's avatar
Leigh Stoller committed
559
    $experiment->PreSwap($this_user, TBDB_STATS_START, $exptstate) == 0 or
560
	fatal("Preswap failed!");
561 562
}

563
goto skiplog
564
    if (defined($template));
565

566 567 568 569 570 571
#
# The rest of this goes into the background so that the user sees
# immediate response. We will send email later when the experiment
# is ready. In waitmode, we hold the parent waiting so that the user
# can script it. Must protect against async (^C) termination though.
#
572 573
$logfile = $experiment->CreateLogFile("startexp");
if (!defined($logfile)) {
574 575
    fatal("Could not create logfile!");
}
576 577 578 579 580
$logname = $logfile->filename();
# We want it to spew to the web.
$experiment->SetLogFile($logfile);
# Mark it open since we are going to start using it right away.
$logfile->Open();
581

582 583
if (my $childpid = TBBackGround($logname)) {
    #
584 585
    # Parent exits normally, unless in waitmode. We have to set
    # justexit to make sure the END block below does not run.
586
    #
587 588
    $justexit = 1;
    
589 590 591
    if (!$waitmode) {
	print("Experiment $pid/$eid is now configuring\n".
 	      "You will be notified via email when the experiment is ".
592 593
	      "ready to use\n")
	    if (! $quiet);
594 595
	exit(0);
    }
596
    print("Waiting for " . ($batchmode ? "batch " : "") . "experiment $eid ".
597 598 599 600 601 602 603
	  "to finish " . ($frontend ? "preloading." : "swapping in.") . "\n")
	if (! $quiet);

    if (isatty(STDIN) && !$quiet) {
	print("You may type ^C at anytime; you will be notified via email.\n".
	      "You will not actually interrupt the experiment itself.\n");
    }
604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620

    # Give child a chance to run.
    select(undef, undef, undef, 0.25);
	
    #
    # Reset signal handlers. User can now kill this process, without
    # stopping the child.
    #
    $SIG{TERM} = 'DEFAULT';
    $SIG{QUIT} = 'DEFAULT';
    $SIG{INT}  = 'DEFAULT';

    #
    # Wait until child exits or until user gets bored and types ^C.
    #
    waitpid($childpid, 0);

621 622
    print("Done. Exited with status: $?\n")
	if (! $quiet);
623 624 625 626 627

    my $exit_code = $? >> 8;
    
    if ($exit_code != 0) {
	my $d = tblog_lookup_error();
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
	my $output = tblog_format_error($d);
	if ($xmlout) {
	    use libtblog '*SOUT'; # to avoid an unnecessary, and large, 
                                  # log entry
	    if (open(IN, "$logname")) {
		$d->{log} = '';
		while (<IN>) {
		    $d->{log} .= $_;
		}
		close IN;
	    }
	    $d->{output} = $output;
	    print SOUT RPC::XML::response->new($d)->as_string(), "\n";
	} else {
	    print $output;
	}
644 645 646
    }
    
    exit $exit_code;
647
}
648
TBdbfork();
649 650 651 652 653 654 655 656

#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}
657 658 659 660
skiplog:

# We are committed now. Affects how fatal() operates.
$committed = 1;
661

662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
#
# We need to catch TERM cause sometimes shit happens and we have to kill
# an experiment setup that is hung or otherwise scrogged. Rather then 
# trying to kill off the children one by one, lets arrange to catch it
# here and send a killpg to the children. This is not to be done lightly,
# cause it can leave things worse then they were before!
#
sub handler ($) {
    my ($signame) = @_;
    
    $SIG{TERM} = 'IGNORE';
    my $pgrp = getpgrp(0);
    kill('TERM', -$pgrp);
    sleep(1);
    $signaled = 1;
    fatal("Caught SIG${signame}! Killing experiment setup ...");
}
$SIG{TERM} = \&handler;
$SIG{QUIT} = 'DEFAULT';

682 683
#
# The guts of starting an experiment!
684
#
685 686
# A batch experiment is essentially preloaded (frontend mode) and then
# dropped into the batch queue, unless the user requested only preload.
687 688
#

689
#
690
# Run the various scripts. We want to propagate the error from tbprerun
691 692
# and tbrun back out, hence the bogus looking errorstat variable.
#
693
$experiment->SetState(EXPTSTATE_PRERUN) == 0
694
    or fatal("Failed to set experiment state to " . EXPTSTATE_PRERUN());
695

696 697
if (!$nonsfile &&
    $experiment->PreRun($nsfile, ($zeeopt ? "-z" : ""))) {
698
    $errorstat = $? >> 8;
Leigh Stoller's avatar
Leigh Stoller committed
699
    $errorstat = -1 if ($errorstat == 255);
700 701 702
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['tbprerun_failed']},
	  "tbprerun failed!");
703
}
704 705

$experiment->SetState(EXPTSTATE_SWAPPED) == 0
706
    or fatal("Failed to set experiment state to " . EXPTSTATE_SWAPPED());
707 708 709 710

#
# If not in frontend mode (preload only) continue to swapping exp in.
# 
711
if (! ($frontend || $batchmode)) {
712
    $experiment->SetState(EXPTSTATE_ACTIVATING) == 0
713
	or fatal("Failed to set experiment state to ". EXPTSTATE_ACTIVATING());
714

715 716
    if (!$nonsfile &&
	$experiment->Swap($Experiment::EXPT_SWAPIN) != 0) {
717
	$errorstat = $? >> 8;
Leigh Stoller's avatar
Leigh Stoller committed
718
	$errorstat = -1 if ($errorstat == 255);
719 720 721
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['tbswap_in_failed']},
	      "tbswap in failed!");
722
    }
723 724
    
    $experiment->SetState(EXPTSTATE_ACTIVE) == 0
725
	or fatal("Failed to set experiment state to " . EXPTSTATE_ACTIVE());
726 727 728 729 730

    #
    # Look for the unsual case of more than 2 nodes and no vlans. Send a
    # warning message.
    #
731 732
    my @localnodes = ();
    fatal("Could not get local node list for $pid/$eid")
733
	if ($experiment->LocalNodeListNames(\@localnodes));
734

Leigh Stoller's avatar
Leigh Stoller committed
735
    if (@localnodes && scalar(@localnodes) > 2) {
736 737 738 739
	my $vlans_result =
	    DBQueryFatal("select pid from virt_lans ".
			 "where pid='$pid' and eid='$eid'");
    
740
	if (!$vlans_result->numrows && !$noemail) {
741 742 743 744 745 746 747 748 749 750 751 752
	    SENDMAIL("$user_name <$user_email>",
		     "WARNING: Experiment Configuration: $pid/$eid",
		     "This experiment has zero network links defined.\n".
		     "Please check your NS file to verify this is what you ".
		     "want!\n",
		     "$user_name <$user_email>",
		     "Cc: $TBOPS", ($nsfile));
	}
    }
}

# We append this report in the email message below.
753
if ($experiment->Report($repfile, "-b") != 0) {
754 755 756
    fatal({type => 'secondary', severity => SEV_SECONDARY,
	   error => ['tbreport_failed']},
	  "tbreport failed!");
757 758
}

759 760 761 762 763
# Latest log is always called the same thing.
if (defined($logname)) {
    system("cp -fp $logname $workdir/" . EXPTLOGNAME());
}

764
#
765 766 767
# Save the final experiment state so that we get a running record of
# the state on disk, for post-mortem debugging. This will get copied
# to the expinfo directory in the next line. 
768
#
769
$experiment->SaveExperimentState();
770

771 772 773
#
# Save a copy of the files for testbed information gathering (long term).
#
774
$experiment->SaveLogFiles();
775 776 777 778 779

#
# Make a copy of the work dir in the user visible space so the user
# can see the log files.
#
780
$experiment->CopyLogFiles();
781

782
# Tell the archive library to add all files to the archive. 
783 784
libArchive::TBExperimentArchiveAddUserFiles($pid, $eid) == 0
    or fatal("Failed to add user archive files to the archive!");
785

786
#
787 788
# Do a SavePoint on the experiment files. In template mode, let the wrapper
# deal with this. Avoids duplication of work.
789
#
790
if (! defined($template)) {
791 792 793 794 795 796
    print "Doing a savepoint on the experiment archive ...\n";
    if (libArchive::TBExperimentArchiveSavePoint($pid, $eid, "startexp") < 0) {
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['archive_op_failed', 'savepoint', undef, undef]},
	      "Failed to do a savepoint on the experiment archive!");
    }
797 798
}

799
#
800 801
# Gather statistics. This is not likely to fail, but if it does I want to
# bail cause the inconsistent records are a pain in the ass to deal with!
802 803
#
if ($frontend || $batchmode) {
804 805
    $experiment->PostSwap($this_user, TBDB_STATS_PRELOAD) == 0 or
	fatal("Postswap failed!");
806 807
}
else {
808 809
    $experiment->PostSwap($this_user, TBDB_STATS_START) == 0 or
	fatal("Postswap failed!");
810 811
}

812
#
813
# Set accounting stuff, but on success only, and *after* gathering swap stats!
814
#
815
$experiment->SetSwapInfo($this_user);
816

817 818 819
#
# Close up the log file so the webpage stops.
#
820
if (!defined($template)) {
821
    print "Experiment $pid/$eid has been successfully created!\n";
822
    $experiment->CloseLogFile();
823
}
824 825 826 827

#
# Must unlock and drop batch experiments into the queue before exit.
#
828
if ($batchmode && !$frontend) {
829
    $experiment->Unlock(EXPTSTATE_QUEUED());
830 831
}
else {
832
    $experiment->Unlock();
833 834 835 836 837 838
}

#
# Clear the cancel flag now that the operation is complete. Must be
# done after we change the experiment state (above).
#
839
$experiment->SetCancelFlag(EXPTCANCEL_CLEAR());
840

841 842 843 844
#
# In template_mode we are done; the caller finishes up.
#
exit(0)
845
    if (defined($template));
846

847 848 849
#
# Dump the report file and the log file to the user via email. 
#
850
my ($expt_created) = $experiment->created();
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
my $message;

if ($frontend) {
    $message =
	"Your experiment `$eid' in project `$pid' has been created.\n" .
	"You can check the web interface to see if it looks the way\n" .
	"you expected it to. If so, you may swap the experiment in,\n" .
	"or terminate it, at any time.\n" .
        "\n";
}
else {
    $message =
	"Your experiment `$eid' in project `$pid' has been started.\n" .
	"Here is the experiment summary detailing the nodes that were\n" .
	"allocated to you. You may use the `Qualified Name' to log on\n" .
	"to your nodes. See /etc/hosts on your nodes (when running\n" .
	"FreeBSD, Linux, or NetBSD) for the IP mapping on each node.\n" .
        "\n";
869
}
870 871 872 873 874 875 876 877
$message .=
    "User:        $user_name\n" .
    "EID:         $eid\n" .
    "PID:         $pid\n" .
    "GID:         $gid\n" .
    "Description: $description\n" .
    "Swappable:   " . ($swappable ? "Yes\n" :
		                    "No  (Reason: $noswap_reason)\n") .
878
    "Idle-Swap:   " . ($idleswap  ? "Yes, at " . $idleswaptime/60.0 . " hours\n":
879
		                    "No  (Reason: $noidleswap_reason)\n") .
880
    "Auto-Swap:   " . ($autoswap  ? "Yes, at " . $autoswaptime/60.0 . " hours\n":
881 882 883 884 885 886 887 888
		                    "No\n") .
    "Created:     $expt_created\n".
    "Directory:   $userdir\n".
    "\n".
    "Appended at the end is the output of the experiment setup. If you\n" .
    "have any questions or comments, please include the output below\n" .
    "in your message to $TBOPS";

889
SENDMAIL(($noemail ? $TBLOGS : "$user_name <$user_email>"),
890 891 892 893
	 "New Experiment " . (($frontend == 0) ? "Started" : "Created") .
	 ": $pid/$eid",
	 $message,
	 "$user_name <$user_email>",
894
	 ($noemail ? "" : "Bcc: $TBLOGS"),
895
	 ($repfile, $logname, $nsfile))
896
    if (! ($zeemode || $genimode));
897 898

# Done!
899
exit(0);
900

901 902 903
#
#
#
904
sub cleanup()
905
{
906
    #
907 908 909 910 911 912
    # Failed early (say, in parsing). No point in keeping any of the
    # stats or resource records. Just a waste of space since the
    # testbed_stats log indicates there was a failure and why (sorta,
    # via the exit code).
    # 
    if (!$committed) {
913 914 915
	# Completely remove all trace of the archive.
        libArchive::TBDestroyExperimentArchive($pid, $eid);

916
	#
917
	# Clear the experiment record and cleanup directories
918
	#
919 920
	$experiment->Delete(1)
	    if (defined($experiment));
921

922
	return;
923
    }
924

925 926 927 928
    #
    # Gather statistics.
    #
    if ($frontend) {
929
	$experiment->SwapFail($this_user, TBDB_STATS_PRELOAD, $errorstat);
930 931
    }
    else {
932
	$experiment->SwapFail($this_user, TBDB_STATS_START, $errorstat);
933
    }
934

935 936 937
    #
    # Must clean up the experiment if it made it our of NEW state.
    #
938
    my $estate = $experiment->state();
939 940 941 942 943 944 945 946
    if ($estate ne EXPTSTATE_NEW) {
	#
	# We do not know exactly where things stopped, so if the
	# experiment was activating when the signal was delivered,
	# run tbswap on it. 
	# 
	if ($estate eq EXPTSTATE_ACTIVE ||
	    ($estate eq EXPTSTATE_ACTIVATING && $signaled)) {
947 948
	    if ($experiment->Swap("out", "-force") != 0) {
		print "tbswap out -force failed!\n";
949
	    }
950
	    $experiment->SetState(EXPTSTATE_SWAPPED);
951
	}
952
	
953
	if ($experiment->End("-f") != 0) {
954
	    print "tbend failed!\n";
955 956
	}
    }
957
    $experiment->SetState(EXPTSTATE_TERMINATED);
958

959 960
    # Old swap gathering stuff.
    $experiment->GatherSwapStats($this_user, TBDB_STATS_TERMINATE, 0);
961

962 963
    # Clear the logfile so the webpage stops.
    $experiment->CloseLogFile();
964

965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980
    if (!$ENV{'TBAUDITON'}) {
	#
	# Figure out the error if possible
	#
	my $error_data = tblog_find_error();
	
	#
	# Send a message to the testbed list. 
	#
	tblog_email_error($error_data,
			  "$user_name <$user_email>",
			  "Config Failure", "$pid/$eid",
			  "$user_name <$user_email>",
			  "",
			  "Cc: $TBOPS",
			  "",
981 982
			  ($logname, "assign.log", "wanassign.log", $nsfile))
	    unless $noemail;
983 984
    } 	
	
985
    #
986 987 988 989
    # Back up the work dir for post-mortem debugging. 
    #
    system("/bin/rm -rf  ${workdir}-failed");
    system("/bin/mv -f   $workdir ${workdir}-failed");
990

991 992
    #
    # Clear the record and cleanup.
993 994
    #
    $experiment->Delete();
995 996 997
}

#
998 999
# Parse command arguments. Once we return from getopts, all that should
# left are the required arguments.
1000
#
1001
sub ParseArgs()
1002
{
1003 1004 1005 1006
    my %options = ();
    if (! getopts($optlist, \%options)) {
	usage();
    }
1007

1008
    if (@ARGV > 1) {
1009 1010
	usage();
    }
1011 1012
    if (@ARGV == 1) {
	$tempnsfile = $ARGV[0];
1013

1014
	# Note different taint check (allow /).
1015
	if ($tempnsfile =~ /^([-\w\.\/]+)$/) {
1016 1017 1018
	    $tempnsfile = $1;
	}
	else {
1019
	    tbdie("Bad data in nsfile: $tempnsfile");
1020
	}
1021

1022
	#
1023 1024
	# Called from ops interactively. Make sure NS file resides in an
	# appropriate location.
1025 1026 1027
	#
	# Use realpath to resolve any symlinks.
	#
1028
	my $translated = realpath($tempnsfile);
1029
	if (defined($translated) && $translated =~ /^([-\w\.\/]+)$/) {
1030 1031 1032
	    $tempnsfile = $1;
	}
	else {
1033
	    tbdie({type => 'primary', severity => SEV_ERROR,
1034 1035
		   error => ['bad_data', 'realpath', $tempnsfile]},
		  "Bad data returned by realpath for: $tempnsfile");
1036 1037 1038
	}

	#
1039 1040 1041 1042 1043 1044 1045
	# The file must reside in an acceptible location. Since this script
	# runs as the caller, regular file permission checks ensure it is a
	# file the user is allowed to use.  So we don't have to be too tight
	# with the RE matching /tmp and /var/tmp files.  Note that
	# /tmp/$guid-$nsref.nsfile is also allowed since this script is
	# invoked directly from web interface which generates a name that
	# should not be guessable.
1046 1047
	#
	if (! ($tempnsfile =~ /^\/tmp\/[-\w]+-\d+\.nsfile/) &&
1048
	    ! ($tempnsfile =~ /^\/tmp\/\d+\.ns/) &&
1049
	    ! ($tempnsfile =~ /^\/tmp\/php[-\w]+/) &&
1050
	    ! ($tempnsfile =~ /^\/var\/tmp\/php[-\w]+/) &&
1051
	    ! TBValidUserDir($tempnsfile, $ISFS)) {
1052 1053 1054
	    tberror({type => 'primary', severity => SEV_ERROR,
		     error => ['disallowed_directory', $tempnsfile]},
		    "$tempnsfile does not resolve to an allowed directory!");
1055 1056 1057 1058 1059
	    # Note positive status; so error goes to user not tbops.
	    exit(1);
	}
    }
    
1060
    if (defined($options{"i"})) {
1061
	$batchmode = 0;
1062
    }
1063 1064 1065
    if (defined($options{"f"})) {
	$frontend = 1;
    }
1066 1067 1068
    if (defined($options{"q"})) {
	$quiet = 1;
    }
1069 1070 1071 1072
    if (defined($options{"z"})) {
	$zeemode = 1;
	$zeeopt  = "-p";
    }
1073 1074 1075 1076
    # This option should not be exported via the XMLRPC server. 
    if (defined($options{"n"})) {
	$idleignore = 1;
    }
1077

1078 1079 1080
    if (defined($options{"s"})) {
	$savestate = 1;
    }
1081 1082 1083 1084 1085 1086 1087 1088
    if (defined($options{"h"})) {
	$genimode = $options{"h"};
	if (! ($genimode =~ /^[-\w]+$/)) {
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $genimode]},
		  "Bad data in argument: $genimode.");
	}
    }
1089 1090 1091 1092 1093 1094 1095 1096 1097
    if (defined($options{"C"})) {
	my $uid = $options{"C"};
	$creator = User->Lookup($uid);
	if (!defined($creator)) {
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $uid]},
		  "Bad data in argument: $uid");
	}
    }
1098 1099 1100
    if (defined($options{"j"})) {
	$nonsfile = 1;
    }
1101 1102 1103
    if (defined($options{"k"})) {
	$lockdown = 1;
    }
1104

1105 1106 1107 1108 1109
    #
    # Clone an experiment, either an existing experiment or an old one
    # (using the archive). 
    #
    if (defined($options{"c"})) {
1110
	$copyarg = $options{"c"};
1111

1112 1113
	if (! (($copyarg =~ /^([-\w]+),([-\w]+)(?::[-\w]*)?$/) ||
	       ($copyarg =~ /^(\d+)(?::[-\w]*)?$/))) {
1114 1115 1116
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $copyarg]},
		  "Bad data in argument: $copyarg");
1117
	}
1118 1119
    }

1120 1121 1122
    #
    # pid,eid,gid get passed along as shell commands args; must taint check.
    # 
1123 1124
    if (defined($options{"p"})) {
	$pid = $options{"p"};
1125

1126
	if ($pid =~ /^([-\w]+)$/) {
1127 1128
	    $pid = $1;
	}
1129
	else {
1130 1131 1132
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $pid]},
		  "Bad data in argument: $pid.");
Leigh Stoller's avatar
Leigh Stoller committed
1133
	}
1134 1135 1136 1137
    }
    if (defined($options{"e"})) {
	$eid = $options{"e"};

1138
	if ($eid =~ /^([-\w]+)$/) {
1139 1140 1141
	    $eid = $1;
	}
	else {
1142 1143 1144
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $eid]},
		  "Bad data in argument: $eid.");
1145
	}
1146 1147
	if (! TBcheck_dbslot($eid, "experiments", "eid",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1148 1149 1150
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'eid', $eid]},
		  "Improper experiment name (id)!");
1151
	}
1152 1153 1154 1155
    }
    if (defined($options{"g"})) {
	$gid = $options{"g"};

1156
	if ($gid =~ /^([-\w]+)$/) {
1157
	    $gid = $1;
1158
	}
1159
	else {
1160 1161 1162
	    tbdie({type => 'primary', severity => SEV_ERROR,
		   error => ['bad_data', 'argument', $gid]},
		  "Bad data in argument: $gid.");
1163 1164
	}
    }
1165
    if (defined($options{"E"})) {
1166 1167
	if (! TBcheck_dbslot($options{"E"}, "experiments", "expt_name",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1168
	    tbdie("Improper experiment description!");
1169
	}
1170
	$description = $options{"E"};
1171
    }
1172 1173 1174
    if (defined($options{"S"})) {
	if (! TBcheck_dbslot($options{"S"}, "experiments", "noswap_reason",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1175
	    tbdie("Improper noswap reason!");
1176 1177
	}
	$swappable     = 0;
1178
	$autoswap      = 0;
1179
	$noswap_reason = $options{"S"};
1180 1181 1182 1183
    }
    if (defined($options{"L"})) {
	if (! TBcheck_dbslot($options{"L"}, "experiments", "noidleswap_reason",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1184
	    tbdie("Improper noidleswap reason!");
1185 1186
	}
	$idleswap          = 0;
1187
	$noidleswap_reason = $options{"L"};
1188
    }
1189
    if (defined($options{"l"})) {
1190 1191
	if (! TBcheck_dbslot($options{"l"}, "experiments", "idleswap_timeout",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1192
	    tbdie("Improper idleswap timeout!");
1193
	}
1194 1195 1196 1197 1198 1199
	# Don't allow caller to increase idleswap time beyond the
	# sitevar-enforced limit.  $idleswaptime was previously set to
	# the maximum (via the sitevar).
	if ($options{"l"} > $idleswaptime && !TBAdmin()) {
	    tbdie("Idle-swap time provided is larger than site-imposed maximum!");
	}
1200
        $idleswap     = 1;
1201 1202 1203
	$idleswaptime = $options{"l"};
    }
    if (defined($options{"a"})) {
1204 1205
	if (! TBcheck_dbslot($options{"a"}, "experiments", "autoswap_timeout",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1206
	    tbdie("Improper autoswap timeout!");
1207
	}
1208 1209 1210 1211 1212
	# Don't allow caller to increase autoswap time beyond the
	# sitevar-enforced limit.
	if ($options{"a"} > $AUTOSWAPMAX && !TBAdmin()) {
	    tbdie("Auto-swap time provided is larger than site-imposed maximum!");
	}
1213
        $autoswap     = 1;
1214 1215
	$autoswaptime = $options{"a"};
    }
1216 1217 1218
    if (defined($options{"t"})) {
	if (! TBcheck_dbslot($options{"t"}, "experiments", "linktest_level",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
1219
	    tbdie("Improper linktest level!");
1220 1221 1222
	}
	$linktest = $options{"t"};
    }
1223 1224
    if (defined($options{"w"})) {
	$waitmode = 1;
1225
    }
1226
    if (defined($options{"x"})) {
1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244
	my $guid;
	my $vers;
	my $branch_guid;
	my $branch_vers;
	
	if ($options{"x"} =~ /^(\d*)\/(\d*)$/) {
	    $guid = $1;
	    $vers = $2;
	}
	elsif ($options{"x"} =~ /^(\d*)\/(\d*),(\d*)\/(\d*)$/) {
	    $guid = $1;
	    $vers = $2;
	    $branch_guid = $3;
	    $branch_vers = $4;
	}
	else {
	    tbdie("Bad arguments for -x option");
	}
1245

1246 1247 1248 1249 1250 1251 1252 1253 1254
	$template = Template->Lookup($guid, $vers);
	if (!defined($template)) {
	    tbdie("No such template $guid/$vers");
	}

	if (defined($branch_guid)) {
	    $branch_template = Template->Lookup($branch_guid, $branch_vers);
	    if (!defined($branch_template)) {
		tbdie("No such template $branch_guid/$branch_vers");
1255 1256
	    }
	}
1257 1258
	
	if (defined($options{"y"})) {
1259
	    my $instance_idx = $options{"y"};
1260 1261 1262 1263 1264
	    
	    if ($instance_idx =~ /^([\d]+)$/) {
		$instance_idx = $1;
	    }
	    else {
1265 1266 1267
		tbdie({type => 'primary', severity => SEV_ERROR,
		       error => ['bad_data', 'argument', $instance_idx]},
		      "Bad data in argument: $instance_idx.");