template_exprun.in 33.2 KB
Newer Older
1 2
#!/usr/bin/perl -wT
#
3
# Copyright (c) 2006, 2007 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
23 24
#
use English;
25
use strict;
26 27 28
use Getopt::Std;
use POSIX qw(isatty setsid);
use POSIX qw(strftime);
29
use Errno qw(EDQUOT ETIMEDOUT);
30
use XML::Simple;
31 32
use File::Find;
use File::stat;
33
use Data::Dumper;
34
use Cwd qw(realpath);
35 36

#
37
# Start/Stop/Pause experiment runs ...
38 39 40 41 42 43 44 45 46 47 48 49 50 51
#
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Everything okay.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
sub usage()
{
    print(STDERR
	  "Usage: template_exprun [-q] [-w] [-r <runid>] ".
52
	  "-a <action> -e <eid> [-p <pid> | <guid/vers>]\n".
53
	  "switches and arguments:\n".
54
	  "-a <action> - pause, continue, start or stop\n".
55
	  "-w          - wait for run to start\n".
56
	  "-s          - save DB contents at end of run; default is clean\n".
57
	  "-q          - be less chatty\n".
58
	  "-E <str>    - A pithy sentence describing the run\n".
59
	  "-r <runid>  - A token ... we will make on up for you\n".
60
	  "-x <file>   - XML file of parameter bindings\n".
61
	  "-e <eid>    - The instance name\n".
62
          "-p <pid>    - Use the pid/eid to find the template GUID\n".
63 64 65
	  "<guid/vers> - GUID and version to swapin\n");
    exit(-1);
}
66
my $optlist	 = "qwx:p:E:a:r:e:dscft:iy:m";
67 68 69
my %options      = ();
my $quiet        = 0;
my $waitmode     = 0;
70
my $debug        = 0;
71
my $foreground   = 0;
72
my $ignoreerrors = 0;
73
my $clean        = 0;
74
my $doswapmod    = 0;
75
my $paramwhich;
76 77
my $paramfile;
my %parameters   = ();
78
my %deadnodes    = ();
79 80
my $action;
my $description;
81
my $runid;
82
my $pid;
83 84 85 86
my $eid;
my $guid;
my $version;
my $inputfile;
87
my $handle;
88
my $ctoken;
89 90 91 92 93 94 95 96 97 98 99

#
# Configure variables
#
my $TB		= "@prefix@";
my $EVENTSYS	= @EVENTSYS@;
my $TBOPS	= "@TBOPSEMAIL@";
my $TBLOGS	= "@TBLOGSEMAIL@";
my $TBDOCBASE	= "@TBDOCBASE@";
my $TBBASE	= "@TBBASE@";
my $CONTROL	= "@USERNODE@";
100
my $PARAMS      = "parameters.xml";
101
my $STAMPS      = @STAMPS@;
102 103

# Locals
104
my $logfile;
105
my $logname;
106 107
my $exptidx;
my $template;
108
my $newrun;
109
my $oldrun;
110
my $instance;
111
my $locktoken;
112 113
# For the END block below.
my $cleaning    = 0;
114
my $justexit    = 0;
115 116

# Programs we need
117
my $swapexp     = "$TB/bin/swapexp";
118
my $checkquota  = "$TB/sbin/checkquota";
119
my $archcontrol = "$TB/bin/archive_control";
120
my $eventcontrol= "$TB/bin/eventsys_control";
121
my $tevc        = "$TB/bin/tevc";
122
my $CVSBIN      = "/usr/bin/cvs";
123
my $CVSCTRL     = "/usr/testbed/sbin/cvs_ctrl";
124 125
my $FIND        = "/usr/bin/find";
my $RCS         = "/usr/bin/rcs";
126
my $SSH		= "$TB/bin/sshtb";
127 128 129 130 131

# Protos
sub ParseArgs();
sub fatal($$);
sub sighandler($);
132 133
sub SignalProgAgents($);
sub SendCompletionEvent();
134
sub CheckForDeadNodes($);
135
sub GenXML($$$);
136

137 138 139 140 141 142 143
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
use libtblog;
144
use User;
145
use Template;
146
use Experiment;
147
use Archive;
148
use event;
149

150 151 152
# In libdb
my $projroot = PROJROOT();

153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
#
# Turn off line buffering on output
#
$| = 1;

#
# Set umask for start/swap. We want other members in the project to be
# able to swap/end experiments, so the log and intermediate files need
# to be 664 since some are opened for append.
#
umask(0002);

#
# Untaint the path
#
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Verify user and get his DB uid.
#
175 176 177
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
178
}
179 180 181
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
my $user_uid   = $this_user->uid();
182

183 184
$libtestbed::SYSTEM_DEBUG = 1;

185 186 187 188
#
# Before doing anything else, check for overquota ... lets not waste
# our time. Make sure user sees the error by exiting with 1.
#
189
if (system("$checkquota $user_uid") != 0) {
190 191
    tberror("You are over your disk quota on $CONTROL; ".
	    "please login there and cleanup!");
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    exit(1);
}

# Now parse arguments.
ParseArgs();

#
# In wait mode, block SIGINT until we spin off the background process.
#
if ($waitmode) {
    $SIG{QUIT} = 'IGNORE';
    $SIG{TERM} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}

#
208
# This script allows pid/eid to be used to find the template.
209
#
210 211
if (defined($guid)) {
    $template = Template->Lookup($guid, $version);
212

213 214 215 216
    if (!defined($template)) {
	tbdie("Experiment template $guid/$version does not exist!");
    }
    $pid = $template->pid();    
217 218 219
}

#
220
# Find the experiment ...
221
#
222 223 224 225
my $experiment = Experiment->Lookup($pid, $eid);
    
if (! defined($experiment)) {
    tbdie("Experiment $pid/$eid does not exist!");
226 227
}

228 229 230 231
#
# And then the template instance from that ...
#
$instance = Template::Instance->LookupByExptidx($experiment->idx());
232

233
if (!defined($instance)) {
234 235 236 237 238 239 240 241 242 243 244 245
    tbdie("Experiment instance $eid in project $pid does not exist!");
}

# In case we got here by the pid instead of guid.
if (!defined($template)) {
    $template = $instance->template();

    if (!defined($template)) {
	tbdie("Cannot find template for $instance!");
    }
    $guid    = $template->guid();
    $version = $template->vers();
246
}
247 248 249 250
my $archive = $template->GetArchive();
if (!defined($archive)) {
    fatal(-1, "Could net archive object for $template");
}
251
   
252
#
253
# Check permission.
254
#
255 256
if (! $template->AccessCheck($this_user, TB_EXPT_MODIFY)) {
    tberror("You do not have permission to start/stop runs in $instance!");
257 258 259 260
    exit(1);
}

if ($experiment->state() ne EXPTSTATE_ACTIVE()) {
261
    tberror("Template instance experiment $pid/$eid in not active!");
262 263 264
    exit(1);
}

265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
#
# Lets use a lock to prevent confusion; it appears to happen more then I
# thought it would.
#
if ($instance->TryLock(\$locktoken) != 0) {
    if ($instance->locked()) {
	tberror("$instance is locked!\n".
		"Another operation started at ". $instance->locked() . "\n");
	exit(1);
    }
    else {
	tberror("Could not lock the instance!");
	exit(-1);
    }
}

281 282 283 284 285 286 287 288 289 290
#
# Pause and Continue are easy
#
if ($action eq "pause") {
    exit($instance->PauseTime());
}
elsif ($action eq "continue") {
    exit($instance->ContinueTime());
}
elsif ($action eq "start" && !defined($runid)) {
291 292 293 294 295 296 297
    if ($instance->NewRunID(\$runid) < 0) {
	tbdie("Could not determine a new runid; please use the -r option!");
    }
    else {
	print "Using new run ID '$runid' ...\n";
    }
}
298 299 300 301 302 303 304
elsif ($action eq "modify") {
    # Better get a new runid if currently between runs since we want to
    # "speculatively start a new run.
    if (!defined($instance->runidx()) && !defined($runid)) {
	tbdie("Must provide a runid; please use the -r option!");
    }
}
305

306 307 308 309
if ($STAMPS) {
    $instance->Stamp("template_exprun", "starting", "action", $action);
}

310 311
#
# If we have a parameter file, we need to copyin the values and store
312 313
# them in the DB for this experiment. Note that these override existing
# values, so we start with those first.
314
#
315 316
# Start with the requested set of params, and then let the XML file override
# them as needed.
317
#
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
if (defined($paramwhich)) {
    if ($paramwhich eq "template") {
	$template->FormalParameterList(\%parameters) == 0
	    or tbdie("Could not get formal parameters for $template");
    }
    elsif ($paramwhich eq "instance") {
	$instance->BindingList(\%parameters) == 0
	    or tbdie("Could not get binding list for $instance");
    }
    elsif ($paramwhich eq "lastrun") {
	my $lastrun;

	if (defined($instance->runidx())) {
	    $lastrun =
		Template::Instance::Run->LookupByID($instance->exptidx(),
						    $instance->runidx());
	}
	else {
	    $lastrun = $instance->LastRun();
	}
	defined($lastrun) 
	    or tbdie("Could not get run for $instance");

	$lastrun->BindingList(\%parameters) == 0
342
	    or tbdie("Could not get binding list for $lastrun");
343 344 345 346 347 348 349
    }
}
else {
    # Default to the instance bindings. 
    $instance->BindingList(\%parameters) == 0
	or tbdie("Could not get binding list for $instance");
}
350

351 352 353 354 355
if (defined($paramfile)) {
    my $donebad = 0;
    
    my $parse  = XMLin($paramfile,
		       VarAttr => 'name',
356
		       ForceArray => ['parameter'],
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
		       ContentKey => '-content',
		       SuppressEmpty => undef);

    foreach my $name (keys(%{ $parse->{'parameter'} })) {
	my $value = $parse->{'parameter'}->{$name}->{'value'};

	if (! TBcheck_dbslot($name,
			     "experiment_template_instance_bindings", "name",
			     TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tberror("Illegal characters in parameter name: $name");
	    $donebad++;
	}
	if (defined($value) &&
	    ! TBcheck_dbslot($value,
			     "experiment_template_instance_bindings", "value",
			     TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tberror("Illegal characters in parameter value: $value");
	    $donebad++;
	}
	# DB records inserted below, once experiment is pre-loaded.
377 378 379
	# Watch for unwanted parameters.
	$parameters{$name} = $value
	    if (exists($parameters{$name}));
380 381 382 383
    }
    # User sees this error.
    exit(1)
	if ($donebad);
384 385 386 387
    
    # Save away for later since returning to web interface deletes it.
    system("/bin/cp", $paramfile, $instance->workdir() . "/$PARAMS") == 0
	or fatal(-1, "Could not save $paramfile to workdir");
388 389
}

390 391 392 393 394 395 396 397
#
# Catch this so we can clean up.
#
$SIG{TERM} = \&sighandler;

#
# If not in batch mode, go into the background. Parent exits.
#
398
if (! ($debug || $foreground)) {
399 400 401 402 403 404 405 406 407
    $logfile = $experiment->CreateLogFile("run");
    if (!defined($logfile)) {
	fatal(-1, "Could not create logfile!");
    }
    $logname = $logfile->filename();
    # We want it to spew to the web.
    $experiment->SetLogFile($logfile);
    # Mark it open since we are going to start using it right away.
    $logfile->Open();
408 409 410 411 412 413 414 415 416

    if (my $childpid = TBBackGround($logname)) {
	#
	# Parent exits normally, unless in waitmode. We have to set
	# justexit to make sure the END block below does not run.
	#
	$justexit = 1;

	if (!$waitmode) {
417 418 419
	    print((($action eq "start") ?
		   "A new run is being started for $pid/$eid.\n" :
		   "Stopping current run in $pid/$eid.\n"))
420 421 422
		if (! $quiet);
	    exit(0);
	}
423
	print("Waiting for run to $action ...\n")
424 425 426 427 428
	    if (! $quiet);
	    
	if (-t STDIN && !$quiet) {
	    print("You may type ^C at anytime.".
		  "\n".
429
		  "You will not interrupt the operation.\n");
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
	}
	
	# Give child a chance to run.
	select(undef, undef, undef, 0.25);
	
	#
	# Reset signal handlers. User can now kill this process, without
	# stopping the child.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{QUIT} = 'DEFAULT';

	#
	# Wait until child exits or until user gets bored and types ^C.
	#
	waitpid($childpid, 0);
	
	print("Done. Exited with status: $?\n")
	    if (! $quiet);
	exit($? >> 8);
    }
    TBdbfork();
}

#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

463 464 465 466
#
# Might not be a current run, which is okay.
#
if (defined($instance->runidx())) {
467
    $oldrun = $instance->CurrentRun();
468

469
    if (!defined($oldrun)) {
470 471
	tbdie("Cannot get current run object for $instance!");
    }
472 473 474 475 476 477 478 479 480 481 482 483 484

    if ($experiment->HaveProgramAgents()) {
	if ($STAMPS) {
	    $instance->Stamp("template_exprun", "stopping agents");
	}
	print "Asking program agents to stop ... this will take a moment.\n";
	SignalProgAgents("HALT") == 0
	    or $ignoreerrors
	    or CheckForDeadNodes($oldrun);

	if ($STAMPS) {
	    $instance->Stamp("template_exprun", "agents stopped");
	}
485
    }
486

487 488 489 490 491 492
    #
    # Modify is going to reuse the current run, so do not do anything else
    # at this point. 
    #
    if ($action eq "modify") {
	goto domodify;
493 494
    }

495 496
    # This sets the stop time.
    $instance->StopCurrentRun() == 0
497
	or fatal(-1, "Could not stop run for $instance!");
498

Leigh Stoller's avatar
Leigh Stoller committed
499 500 501
    if ($action eq "abort") {
	# XXX What about the tag?
	$instance->FinalizeCurrentRun() == 0
502
	    or fatal(-1, "Could not finalize run for $instance!");
Leigh Stoller's avatar
Leigh Stoller committed
503

504
	print "Run has been aborted!\n";    
Leigh Stoller's avatar
Leigh Stoller committed
505 506 507 508 509 510
	
	# Send completion event only on "stop/abort"; otherwise it gets lost.
	SendCompletionEvent();
	goto done;
    }

511 512 513
    if ($STAMPS) {
	$instance->Stamp("template_exprun", "loghole starting");
    }
514 515 516
    # This runs loghole.
    print "Asking loghole to sync the logfiles ... this will take a minute.\n";
    $instance->LogHole() == 0
517
	or $ignoreerrors
518
        or CheckForDeadNodes($oldrun);
519

520 521 522 523 524 525 526
    if ($STAMPS) {
	my $du = 0;
	$experiment->DU(\$du);
	$instance->Stamp("template_exprun", "loghole done", "userdu", $du);
	$instance->Stamp("template_exprun", "dumpdb starting");
    }

527 528
    print "Dumping the instance database ... this will take a minute.\n";
    $instance->DumpDB() == 0
529
	or $ignoreerrors
530
	or fatal(-1, "Dump Database failed");
531

532 533 534 535
    if ($STAMPS) {
	$instance->Stamp("template_exprun", "dumpdb done");
    }

536 537 538
    #
    # Commit the archive. 
    #
539
    my $this_runid = $oldrun->runid();
540
    
541 542 543
    if ($STAMPS) {
	$instance->Stamp("template_exprun", "commit starting");
    }
544
    system("$archcontrol -d -t stoprun_${this_runid} ".
545 546 547
	   "-c 'Stopping Run $this_runid' commit $pid $eid")
	== 0 or	fatal(-1, "Aborting the stoprun cause the commit failed");
	   
548 549 550
    if ($STAMPS) {
	$instance->Stamp("template_exprun", "commit done");
    }
551 552 553 554 555 556 557 558

    #
    # Do the CVS stuff.
    #
    my $runidx       = $oldrun->idx();
    my $exptidx      = $instance->exptidx();
    my $instance_dir = $instance->path();
    my $cvsdir       = "$projroot/$pid/templates/$guid/cvsrepo";
559
    my $cvssubdir    = "records/$exptidx,$runidx";
560 561
    my $tag          = "R${exptidx}-${runidx}_import";
    my $instance_tag = "I${exptidx}";
562
    my $xmlfile      = "$instance_dir/info.xml";
563
    my @symlinklist  = ();
564

565
    if (-e $cvsdir) {
566 567 568
	if ($STAMPS) {
	    $instance->Stamp("template_exprun", "cvs stuff starting");
	}
569 570 571 572 573 574
	# If the repo subdir exists, it is likely left over from a previous
	# attempt to stop the run, that failed for some reason. Kill it.
	if (-e "$cvsdir/$cvssubdir") {
	    fatal(-1, "Could not remove old $cvsdir/$cvssubdir")
		if (System("/bin/rm -rf $cvsdir/$cvssubdir"));
	}
575

576 577
	fatal(-1, "Could not generate XML description for $oldrun")
	    if (GenXML($instance, $xmlfile, \@symlinklist) != 0);
578

579 580 581 582 583 584
	# Remove all the symlinks before the import. Bogus!
	foreach my $ref (@symlinklist) {
	    unlink($instance_dir . "/" . $ref->{'pathname'});
	}

	# This imports the experiment directory into the records subdir.
585 586 587
	System("$CVSCTRL -w $instance_dir -- ".
	       "  -d $cvsdir import -kb -I \"datastore\" ".
	       "  -m \"Import record for run $this_runid\" ".
588
	       "  $cvssubdir ${tag}_branch $tag")
589
	    == 0 or fatal(-1, "Could not import new record into $cvsdir");
590

591
	unlink($xmlfile);
592

593 594 595 596 597 598 599 600 601
	# Restore all the symlinks after the import. Bogus!
	foreach my $ref (@symlinklist) {
	    my $pathname  = $ref->{'pathname'};
	    my $linkvalue = $ref->{'linkvalue'};
	    symlink($linkvalue, "$instance_dir/$pathname");
	}

	# Now tag it with the same tag that was applied for the instance.
	# Apply to the branch tag directly instead of symbolically since that
602 603 604
	# does not work properly for some reason.
	System("$CVSCTRL -- ".
	       " -d $cvsdir rtag -n -r 1.1.1 $instance_tag $cvssubdir")
605
	    == 0 or fatal(-1, "Could not rtag new record in $cvsdir");
606 607 608 609

	if ($STAMPS) {
	    $instance->Stamp("template_exprun", "cvs stuff done");
	}
610
    }
611

612 613
    # This has to be done after the archive commit, so we can find the tag.
    $instance->FinalizeCurrentRun() == 0
614
	or fatal(-1, "Could not finalize run for $instance!");
615

616
    print "Run '$this_runid' has been stopped.\n";
617 618 619 620

    if ($STAMPS) {
	$instance->Stamp("template_exprun", "run stopped");
    }
621
}
622

623 624 625 626 627 628
if ($action eq "stop") {
    # Send completion event only on "stop"; otherwise it gets lost.
    SendCompletionEvent();
    goto done;
}

629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
#
# If this is a startrun, lets see if the record changed. If it did,
# commit a new version of the previous stoprun so that these changes
# are recorded as being part of the previous run, not the current run.
#
if (!defined($oldrun)) {
    my $diff = $archive->Diff($instance->path(), $instance->exptidx(),
			      ("run.log"));

    fatal(-1, "Failed to diff archive!")
	if ($diff < 0);

    if ($diff) {
	my $lastrun = $instance->LastRun();
	fatal(-1, "Could not lookup previous run for $instance")
	    if (!defined($lastrun));

	print "Files for previous run $lastrun changed; revising archive.\n";
	
        # Commit the archive for the run.
	$lastrun->ArchiveCommit("stoprun_revised") == 0 or
	    fatal(-1, "Could not commit revised archive for previous run!");
    }
}

domodify:
655 656 657 658 659 660 661 662 663 664 665 666 667
#
# Clean/Clear if requested before generating the new run, in case there
# is a problem.
#
if ($clean) {
    # This runs loghole.
    print "Asking loghole to clean the logs ... this will take a moment.\n";
    $instance->LogClean() == 0
	or fatal(-1, "Loghole failed");

    print "Cleaning the instance database ... this will take a moment.\n";
    $instance->CleanDB() == 0
	or fatal(-1, "Dump Database failed");
668 669 670 671

    if ($STAMPS) {
	$instance->Stamp("template_exprun", "cleaned");
    }
672 673
}

674
#
675
# Generate a new run. (modify uses existing run if there is one).
676
#
677 678 679 680 681 682 683 684 685 686
if ($action eq "modify" && defined($oldrun)) {
    $newrun = $oldrun;
    $runid  = $oldrun->runid();
}
else {
    $newrun = $instance->NewRun($runid, $this_user, $description);

    if (!defined($newrun)) {
	fatal(-1, "Could not create new run for $instance!");
    }
687
}
688

689
# Mark the start time of the run.
690
$instance->StartRun(($action eq "modify" || $doswapmod ?
691 692
		     Template::STARTRUN_FLAGS_SWAPMOD() : 0)) == 0
    or fatal(-1, "Could not mark start of run for $instance!");
693

694
#
695 696
# And the bindings for the run ... This actually uses a "replace" so its okay
# to do this for a modify operation.
697
#
698 699
foreach my $name (keys(%parameters)) {
    my $value = $parameters{$name};
700

701
    $instance->NewRunBinding($name, $value) == 0
702
	or fatal(-1, "Could not create run binding for $instance!");
703
}
704

705
if ($action eq "modify" || $doswapmod) {
706
    #
707 708
    # Now do the swapmod, using the original NS file for now. The environ
    # variables will be passed to the NS reparse by parse-ns wrapper script.
709
    #
710
    my $oldtag     = $experiment->archive_tag();
711 712
    my $userdir    = $instance->path();
    my $nsfile     = "$userdir/tbdata/nsfile.ns";
713
    my @arguments  = ($swapexp, "-q", "-x", "-s", "modify",
714 715 716 717 718 719
		      $pid, $eid, $nsfile);

    print "Starting a swap modify ...\n";
    system(@arguments);
    fatal($? >> 8, "Swap modify failed!")
	if ($?);
720

721
    $instance->Refresh();
722 723 724 725 726 727 728
    $experiment->Refresh();

    # XXX - Need to move the tag out of the resources table. Dumb idea.
    my $rsrcidx = $experiment->rsrcidx();
    $experiment->TableUpdate("experiment_resources",
			     "archive_tag='$oldtag'", "idx='$rsrcidx'") == 0
	or return -1;
729 730 731 732 733 734

    # This has to be redone since the batchexp will have written
    # incomplete data.
    print "Writing program agent info ...\n";
    $instance->WriteProgramAgents() == 0
	or fatal(-1, "Could not write program agent info");
735
}
736 737 738 739 740 741 742 743 744 745 746 747
else {
    #
    # The swapmod would have rewritten the environment strings, but otherwise
    # it has to be done by hand.
    #
    $instance->InitializeEnvVariables() == 0
	or fatal(-1, "Could not update environment variables for $instance");
    
    print "Writing new environment strings ...\n";
    $instance->WriteEnvVariables() == 0
	or fatal(-1, "Could not rewrite environment strings for $instance");
}
748

749 750 751 752 753
#
# Now we stash the parameter file for the next version of the archive.
#
if (defined($paramfile)) {
    my $source = $instance->workdir() . "/$PARAMS";
754
    my $target = $instance->path() . "/$PARAMS";
755 756 757 758 759 760 761 762

    unlink($target)
	if (-e $target);
    
    system("/bin/cp", $source, $target) == 0
	or fatal(-1, "Could not copy $source to $target");
}

763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
if ($experiment->HaveProgramAgents()) {
    print "Asking program agents to reload ... this will take a moment.\n";
    SignalProgAgents("RELOAD") == 0
	or $ignoreerrors
	or CheckForDeadNodes($newrun);
}

if ($STAMPS) {
    $instance->Stamp("template_exprun", "starting commit");
}

# Commit the archive for the run.
if ($action eq "modify" && defined($oldrun)) {
    $newrun->ArchiveReplace($newrun->start_tag()) == 0 or
	fatal(-1, "Could not replace archive for run!");
}
else {
    $newrun->ArchiveCommit("startrun") == 0 or
	fatal(-1, "Could not commit archive for run!");
}

if ($STAMPS) {
    $instance->Stamp("template_exprun", "starting event system");
}
787 788 789 790 791

#
# Restart the event stream from the beginning.
#
print "Asking the event system to replay events ...\n";
792
system("$eventcontrol replay $pid,$eid") == 0
793
    or fatal(-1, "Could not restart the event system!");
794

795 796 797 798 799
# We lose the log info for this commit if we do not copy it out to the
# user directory.
$experiment->CopyLogFiles()
    if (defined($logname));

800 801 802
if ($STAMPS) {
    $instance->Stamp("template_exprun", "run started");
}
803 804
print "Run '$runid' has been " .
    ($action eq "modify" && defined($oldrun) ? "modified" : "started") . "\n";
805
done:
806

807 808 809 810 811 812 813 814 815 816 817 818
if (keys(%deadnodes)) {
    my $subject;
    my $message = "";

    foreach my $node_id (keys(%deadnodes)) {
	$message .= "$node_id appears to be unresponsive\n";
    }

    if ($action eq "stop") {
	$subject = "Node failures during Stop Run";
    }
    else {
819
	$subject = "Node failures during Start New Run ($runid)";
820
    }
821
    SENDMAIL($user_uid, $subject, $message, $TBOPS, "CC: $TBOPS");
822 823
}

824 825 826 827
# log file gets copied out to the user directory.
$experiment->CopyLogFiles()
    if (defined($logname));

828
# Stop the web interface from spewing.
829
$experiment->CloseLogFile()
830 831
    if (defined($logname));

832 833 834 835 836 837 838 839 840 841 842
exit(0);

#
# Parse command arguments. Once we return from getopts, all that are
# left are the required arguments.
#
sub ParseArgs()
{
    if (! getopts($optlist, \%options)) {
	usage();
    }
843
    
844
    #
845
    # Allow pid to be used instead of GUID.
846
    #
847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862
    if (@ARGV == 1) {
	#
	# Pick up guid/version first and untaint.
	#
	my $tmp = shift(@ARGV);

	if ($tmp =~ /^([\w]*)\/([\d]*)$/) {
	    $guid = $1;
	    $version = $2;
	}
	else {
	    tbdie("Bad data in argument: $tmp");
	}
    }
    elsif (defined($options{"p"})) {
	$pid = $options{"p"};
863

864 865 866 867 868 869 870 871 872 873
	if ($pid =~ /^([-\w]+)$/) {
	    $pid = $1;
	}
	else {
	    tbdie("Bad data in argument: $pid.");
	}
	if (! TBcheck_dbslot($pid, "projects", "pid",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tbdie("Improper project name (pid)!");
	}
874 875
    }
    else {
876 877
	tberror("Must provide GUID or -p option!");
	exit(1);
878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
    }
    
    if (defined($options{"e"})) {
	$eid = $options{"e"};

	if ($eid =~ /^([-\w]+)$/) {
	    $eid = $1;
	}
	else {
	    tbdie("Bad data in argument: $eid.");
	}
	if (! TBcheck_dbslot($eid, "experiments", "eid",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tbdie("Improper experiment name (id)!");
	}
    }
    else {
	tberror("Must provide an experiment ID (-e option)!");
	exit(1);
    }

    if (defined($options{"r"})) {
	$runid = $options{"r"};

	if ($runid =~ /^([-\w]+)$/) {
	    $runid = $1;
	}
	else {
	    tbdie("Bad data in argument: $runid.");
	}
	if (! TBcheck_dbslot($runid, "experiments", "eid",
			   TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tbdie("Improper experiment name (id)!");
	}
    }
    
    if (defined($options{"a"})) {
	$action = $options{"a"};

917
	if ($action ne "start" && $action ne "stop" &&
Leigh Stoller's avatar
Leigh Stoller committed
918
	    $action ne "pause" && $action ne "continue" &&
919
	    $action ne "abort" && $action ne "modify") {
920 921
	    tbdie("Improper -a argument: $action.");
	}
922 923 924 925
	# Need the equiv of a taint check.
	if ($action =~ /^([\w]+)$/) {
	    $action = $1;
	}
926 927 928 929 930 931 932 933 934 935 936
    }
    else {
	tbdie("Must provide an action (-a argument)!");
    }

    if (defined($options{"q"})) {
	$quiet = 1;
    }
    if (defined($options{"w"})) {
	$waitmode = 1;
    }
937 938 939
    if (defined($options{"d"})) {
	$debug = 1;
    }
940 941 942
    if (defined($options{"f"})) {
	$foreground = 1;
    }
943 944 945
    if (defined($options{"c"})) {
	$clean = 1;
    }
946 947 948
    if (defined($options{"i"})) {
	$ignoreerrors = 1;
    }
949 950 951
    if (defined($options{"m"})) {
	$doswapmod = 1;
    }
952 953 954 955 956 957 958 959 960 961
    if (defined($options{"t"})) {
	$ctoken = $options{"t"};

	if ($ctoken =~ /^([-\w]+)$/) {
	    $ctoken = $1;
	}
	else {
	    tbdie("Bad data in argument: $ctoken.");
	}
    }
962 963 964 965 966 967 968 969
    if (defined($options{"E"})) {
	if (! TBcheck_dbslot($options{"E"},
			     "experiment_templates", "description",
			     TBDB_CHECKDBSLOT_WARN|TBDB_CHECKDBSLOT_ERROR)) {
	    tbdie("Improper template description!");
	}
	$description = $options{"E"};
    }
970 971 972 973 974 975 976 977 978 979 980 981
    if (defined($options{"y"})) {
	$paramwhich = $options{"y"};

	if ($paramwhich ne "template" && $paramwhich ne "instance" &&
	    $paramwhich ne "lastrun") {
	    tbdie("Improper -y argument: $paramwhich");
	}
	# Need the equiv of a taint check.
	if ($paramwhich =~ /^([\w]+)$/) {
	    $paramwhich = $1;
	}
    }
982

983 984
    if (defined($options{"x"})) {
	my $inputfile = $options{"x"};
985 986 987 988 989 990 991 992 993 994 995 996 997 998

	# Note different taint check (allow /).
	if ($inputfile =~ /^([-\w\.\/]+)$/) {
	    $inputfile = $1;
	}
	else {
	    tbdie("Bad data in parameter file: $inputfile");
	}

	#
	# Called from ops interactively. Make sure NS file in /proj or /users.
	#
	# Use realpath to resolve any symlinks.
	#
999
	my $translated = realpath($inputfile);
1000 1001 1002 1003 1004 1005 1006 1007
	if ($translated =~ /^([-\w\.\/]+)$/) {
	    $inputfile = $1;
	}
	else {
	    tbdie("Bad data returned by realpath: $translated");
	}

	#
1008 1009 1010 1011 1012 1013
	# The file must reside in an acceptible location. Since this script
	# runs as the caller, regular file permission checks ensure it is a
	# file the user is allowed to use.  So we don't have to be too tight
	# with the RE matching /tmp and /var/tmp files.  These .xml files
	# are allowed since this script is invoked directly from web interface
	# which generates a name that should not be guessable.
1014 1015 1016 1017
	#
	if (! ($inputfile =~ /^\/tmp\/[-\w]+-\d+\.xml/) &&
	    ! ($inputfile =~ /^\/tmp\/\d+\.xml/) &&
	    ! ($inputfile =~ /^\/var\/tmp\/php\w+/) &&
1018
	    ! TBValidUserDir($inputfile, 0)) {
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
	    tberror("$inputfile does not resolve to an allowed directory!");
	    # Note positive status; so error goes to user not tbops.
	    exit(1);
	}
	$paramfile = $inputfile;
    }
}

#
# Cleanup the mess.
#
sub cleanup()
{
1032
    # only for start new run; stop run failures do not do this!
1033
    # Do not kill the oldrun on a modify either.
1034
    $instance->DeleteCurrentRun()
1035
	if (defined($instance) && defined($newrun) && !defined($oldrun));
1036 1037 1038

    $instance->UnLock($locktoken)
	if (defined($instance) && defined($locktoken));
1039 1040 1041 1042

    # log file gets copied out to the user directory.
    $experiment->CopyLogFiles()
	if (defined($experiment) && defined($logname));
1043 1044 1045 1046 1047 1048 1049 1050 1051
}

sub fatal($$)
{
    my ($errorstat, $msg) = @_;
    
    tberror $msg;
    tbinfo "Cleaning up and exiting with status $errorstat ...";

1052
    # Stop the web interface from spewing.
1053
    $experiment->CloseLogFile()
1054 1055
	if (defined($logname));

1056 1057 1058 1059 1060 1061
    #
    # This exit will drop into the END block below.
    # 
    exit($errorstat);
}

1062
sub sighandler($) {
1063 1064 1065 1066 1067 1068
    my ($signame) = @_;
    
    $SIG{TERM} = 'IGNORE';
    my $pgrp = getpgrp(0);
    kill('TERM', -$pgrp);
    sleep(1);
1069
    fatal(-1, "Caught SIG${signame}!");
1070 1071
}

1072
#
1073
# Use tevc to send an event and wait for completion.
1074
#
1075
sub SignalProgAgents($)
1076
{
1077
    my ($action) = @_;
1078
    my $agent;
1079

1080
    if ($action eq "HALT") {
1081
	$agent = "__all_programs";
1082
    }
1083 1084
    else {
	$agent = "__all_program-agents";
1085
    }
1086

1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097
    system("$tevc -w -t 30 -e $pid/$eid now $agent $action");
    if ($?) {
	#
	# Timeout is important; other errors are real errors.
	#
	return ETIMEDOUT
	    if ($? >> 8 == ETIMEDOUT);

	fatal(-1, "Could not send event notification!");
    }
    return 0;
1098 1099
}

1100 1101
sub SendCompletionEvent()
{
1102 1103 1104
    return
	if (!defined($ctoken));
    
1105
    my $arguments = " CTOKEN=$ctoken ERROR=0 ";
1106

1107 1108 1109 1110 1111 1112
    #
    # Easier to use tevc, and besides the perl swig wrappers are broken
    # in some way cause event_notificaton_set_arguments does not work.
    #
    system("$tevc -e $pid/$eid now ns COMPLETE $arguments") == 0
	or fatal(-1, "Could not send completion event notification!");
1113 1114
}

1115 1116 1117 1118 1119 1120 1121
#
# Look to see if any nodes have died. This is currently our best way to
# determine likely non-responders to the events and loghole operations,
# since right now there is no information from the event scheduler about
# it.  Will probably need to add that, but lets try this for now. The main
# problem is plab nodes.
#
1122
sub CheckForDeadNodes($)
1123
{
1124 1125
    my ($thisrun) = @_;
    
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
    my %nodestatuslist;
    
    $experiment->NodeStatusList(\%nodestatuslist) == 0
	or fatal(-1, "Could not get node status list");

    foreach my $node_id (keys(%nodestatuslist)) {
	next
	    if ($nodestatuslist{$node_id});

	#
	# Node is dead. Need to record this as part of the template record.
	# This hash is for later, to send a summary report to the user.
	#
1139 1140
	$thisrun->MarkNodeDead($node_id) == 0
	    or fatal(-1, "Could not mark node as dead in $thisrun");
1141 1142

	tbwarn("$node_id appears to be dead during start/stop run");
1143 1144 1145 1146 1147

	#
	# This is for email message at the end.
	#
	$deadnodes{$node_id} = $node_id;
1148 1149 1150
    }
}

1151 1152 1153 1154
#
# Callback to generate the file listing.
#
my @allfiles = ();
1155
my @symlinks = ();
1156 1157 1158

sub FindCallBack
{
1159 1160
    my $name    = $_;
    my $linkval = "";
1161 1162 1163 1164 1165 1166 1167

    return
	if ($name eq "." || $name eq "..");

    my $st = stat($name) or
	fatal(-1, "Failed to stat $name!");

1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182
    # Force taint check
    my $fullpath = "$File::Find::dir/$name";
    if ($fullpath =~ /^(.*)$/) {
	$fullpath = $1;
    }
    if (-l $name) {
	$linkval = readlink($name);
	if ($linkval =~ /^(.*)$/) {
	    $linkval = $1;
	}
	push(@symlinks, {'pathname'  => $fullpath,
			 'linkvalue' => $linkval});
    }

    push(@allfiles, {'pathname' => $fullpath,
1183
		     'mode'     => $st->mode,
1184
		     'linkval'  => $linkval,
1185 1186 1187 1188 1189 1190 1191
		     'uid'      => $st->uid,
		     'gid'      => $st->gid,
		     'atime'    => $st->atime,
		     'mtime'    => $st->mtime,
		     'ctime'    => $st->ctime});
}

1192 1193 1194
#
# Generate an XML file describing the just finished run.
#
1195
sub GenXML($$$)
1196
{
1197
    my ($instance, $outputfile, $psymlinks) = @_;
1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244
    my $root     = {};
    my $template = $instance->GetTemplate();
    my $run      = $instance->CurrentRun();
    my %bindings;
    my %mlist;

    $root->{"infoversion"}= "1.0";
    $root->{"guid"}       = $instance->guid() . "/" . $instance->vers();
    $root->{"pid"}        = $template->pid();
    $root->{"pid_idx"}    = $template->pid_idx();
    $root->{"eid"}        = $instance->eid();
    $root->{"exptidx"}    = $instance->exptidx();
    $root->{"uid"}        = $instance->uid();
    $root->{"uid_idx"}    = $instance->uid_idx();
    $root->{"runid"}      = $run->runid();
    $root->{"run_idx"}    = $run->idx();
    $root->{"start_time"} = $run->start_time();
    $root->{"stop_time"}  = $run->stop_time();

    # Fill in the bindings
    $run->BindingList(\%bindings) == 0
	or fatal(-1, "Could not get bindings for $run");
    if (keys(%bindings)) {
	$root->{"bindings"} = {};
	foreach my $name (keys(%bindings)) {
	    my $value = $bindings{$name};
	    my $description;

	    $template->FormalParameterDescription($name, \$description);
	
	    $root->{"bindings"}->{$name} = {'value' => $value,
					    'description' => $description};
	}
    }

    # Fill in the metadata
    $template->MetadataList(\%mlist) == 0
	or fatal(-1, "Could not get Metadata list for $instance");
    if (keys(%mlist)) {
	$root->{"metadata"} = {};
	foreach my $name (keys(%mlist)) {
	    my $value = $mlist{$name};
	
	    $root->{"metadata"}->{$name} = $value;
	}
    }

1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258
    # File listing to preserve attributes, just in case.
    my $cwd = `/bin/pwd`;
    # Must do the taint check too.
    if ($cwd =~ /^(.*)$/) {
	$cwd = $1;
    }
    if (! chdir($instance->path())) {
	print STDERR "*** Could not chdir to instance path\n";
	return -1;
    }
    find({'wanted' => \&FindCallBack, 'untaint' => 1}, ".");
    chdir($cwd);
    $root->{"files"} = {"file" => [@allfiles]};

1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288
    if ($debug) {
	print Dumper($root);
	print "\n";
    }
    my %xmloptions = ('RootName' => 'run',
		      'XMLDecl' => 1,
		      'SuppressEmpty' => undef,
		      'NoAttr' => 1);

    my $xml = XMLout($root, %xmloptions);

    if ($debug) {
	print "$xml\n";

	$root = XMLin($xml, KeyAttr => "name",
		      "ForceArray" => ["metadata", "runs", "bindings"]);
	print Dumper($root);
	print "\n";
    }

    unlink($outputfile)
	if (-e $outputfile);
    if (open(DUMP, ">$outputfile")) {
	print DUMP "$xml\n";
	close(DUMP);
    }
    else {
	print STDERR "*** Could not open $outputfile for writing";
	return -1;
    }
1289
    @$psymlinks = @symlinks;
1290 1291 1292
    return 0;
}

1293 1294
END {
    # Normal exit, nothing to do.
1295 1296 1297 1298 1299 1300 1301 1302 1303 1304
    return
	if ($justexit);
    
    if (!$?) {
        #
        # Unlock ... safe cause the unlock routine will only let the original
        # process do the unlock, and only if its locked.
	#
	$instance->UnLock($locktoken)
	    if (defined($instance) && defined($locktoken));
1305 1306 1307 1308 1309 1310 1311 1312 1313
	return;
    }
    my $saved_exitcode = $?;
    
    if ($cleaning) {
	#
	# We are screwed; a recursive error. Someone will have to clean
	# up by hand. 
	#
1314 1315
	SENDMAIL($TBOPS, 
		 "Template Creation Failure: $pid/$eid",
1316 1317 1318 1319 1320 1321 1322 1323 1324
		 "Recursive error in cleanup! This is very bad.");
	$? = $saved_exitcode;
	return;
    }
    $cleaning = 1;
    cleanup();
    $? = $saved_exitcode;
}