#!/usr/bin/perl -wT

#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# All rights reserved.
#

9 10
use English;
use Getopt::Std;
11
use POSIX qw(isatty setsid);
12 13

#
# This gets invoked from the Web interface.
# Swap an experiment in, swap it out, restart or modify.
#
Chad Barb's avatar
Chad Barb committed
17

18 19
#
# Print the command line synopsis on STDERR and terminate the script
# with exit(-1). Takes no arguments and never returns.
#
sub usage()
{
    print(STDERR
	  "Usage: swapexp [-q] [-b | -w] [-i | -a | -f] [-r] [-e]\n".
	  "               <-s in | out | restart | modify | pause>\n".
	  "               <pid> <eid> [<nsfile>]\n".
	  "switches and arguments:\n".
	  "-w       - wait for non-batchmode experiment swap/modify\n".
	  "-q       - be less chatty\n".
	  "-r       - reboot nodes when doing a modify experiment\n".
	  "-e       - restart event scheduler when doing a modify experiment\n".
	  "-s <op>  - Operation to perform; one of those listed above\n".
	  "<pid>    - The project the experiment belongs to\n".
	  "<eid>    - The experiment name (id)\n".
	  "<nsfile> - Optional NS file to parse for experiment modify\n");
    exit(-1);
}
35 36 37

sub fatal($);

38
my  $optlist = "biafres:wq";
39

40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
#
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Termination is proceeding in the background. Notified later.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
sub ExitWithStatus($$)
{
    my ($status, $message) = @_;

    # Negative status: raise the message through die() with the
    # "*** progname:" banner.
    die("*** $0:\n".
	"    $message\n")
	if ($status < 0);

    # Zero or positive status: report on STDERR and leave with that
    # status as the exit code.
    print STDERR "$message\n";
    exit($status);
}

64 65 66 67 68 69
#
# Configure variables
#
my $TB     = "@prefix@";
my $TBOPS  = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
70
my $TBINFO = "$TB/expinfo";
71
my $TBDOCBASE = "@TBDOCBASE@";
72
my $TBBASE = "@TBBASE@";
73
my $CONTROL  = "@USERNODE@";
74 75 76 77 78 79 80

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
Kevin Atkinson's avatar
 
Kevin Atkinson committed
81
use libtblog;
82
use libArchive;
83

84 85 86 87 88 89 90 91 92
# Be careful not to exit on transient error; 0 means infinite retry.
$libdb::DBQUERY_MAXTRIES = 0;

# For the END block below.
my $cleaning = 0;
my $justexit = 1;
my $signaled = 0;

my $tbdir    = "$TB/bin";
93
my $tbdata   = "tbdata";
94
my $checkquota = "$TB/sbin/checkquota";
95
my $batch    = 0;
96
my $idleswap = 0;
97 98
my $autoswap = 0;
my $force    = 0;
Chad Barb's avatar
Chad Barb committed
99
my $reboot   = 0;
100
my $waitmode = 0;
101
my $quiet    = 0;
102
my $eventsys_restart   = 0;
103
my $errorstat= -1;
104 105
my $modifyHosed   = 0;
my $modifySwapped = 0;
Chad Barb's avatar
 
Chad Barb committed
106

107 108 109 110 111
my $inout;
my $logname;
my $dbuid;
my $user_name;
my $user_email;
112
my @allnodes;
113
my @row;
114
my $action;
115
my $tag;
116
my $nextswapstate;
117
my $termswapstate;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
118
my $isadmin  = 0;
Chad Barb's avatar
 
Chad Barb committed
119

120 121 122
#
# Untaint the path
# 
123
$ENV{'PATH'} = "/bin:/usr/bin:$TB/libexec/vis";
124 125 126 127 128 129 130
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn off line buffering on output
#
$| = 1;

131 132 133 134 135 136 137
#
# Set umask for start/swap. We want other members in the project to be
# able to swap/end experiments, so the log and intermediate files need
# to be 664 since some are opened for append.
#
umask(0002);

138 139 140 141 142 143 144 145
#
# Parse command arguments. Once we return from getopts, all that should
# be left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}

# Plain on/off switches; each one just raises its flag variable.
$idleswap         = 1 if (defined($options{"i"}));
$waitmode         = 1 if (defined($options{"w"}));
$autoswap         = 1 if (defined($options{"a"}));
$force            = 1 if (defined($options{"f"}));
$batch            = 1 if (defined($options{"b"}));
$reboot           = 1 if (defined($options{"r"}));
$eventsys_restart = 1 if (defined($options{"e"}));
$quiet            = 1 if (defined($options{"q"}));

# -s <op> is mandatory and must name one of the known operations.
usage()
    if (!defined($options{"s"}));
$inout = $options{"s"};
usage()
    if (! grep { $inout eq $_ } ("out", "in", "restart", "pause", "modify"));

#
# Reject inconsistent combinations:
#  * -w and -b together,
#  * fewer than the two required positional arguments (pid, eid); without
#    this a "modify" with no arguments would reach the untaint code below
#    with undefined values,
#  * anything but exactly pid and eid for operations other than modify,
#  * -w or -b combined with -i, -a or -f.
#
usage()
    if (($waitmode && $batch) ||
	(@ARGV < 2) ||
	($inout ne "modify" && @ARGV != 2) ||
	(($waitmode || $batch) && ($idleswap || $autoswap || $force)));

# -e only makes sense together with -s modify.
if ($eventsys_restart && $inout ne "modify") {
    print STDOUT "Usage: swapexp: -e (eventsys_restart) can be used ".
                 "only with -s modify\n";
    usage();
}
Chad Barb's avatar
 
Chad Barb committed
195 196 197
my $pid   = $ARGV[0];
my $eid   = $ARGV[1];

198 199 200
#
# Untaint the arguments.
#
201
if ($pid =~ /^([-\w\.]+)$/) {
202 203 204 205 206
    $pid = $1;
}
else {
    die("Tainted argument $pid!\n");
}
207
if ($eid =~ /^([-\w\.]+)$/) {
208 209 210 211 212
    $eid = $1;
}
else {
    die("Tainted argument $eid!\n");
}
213
my $repfile = "$eid.report";
214 215
my $workdir = TBExptWorkDir($pid, $eid);
my $userdir = TBExptUserDir($pid, $eid);
216 217
my $tempnsfile;
my $modnsfile;
218
my $nsfile;
219

Leigh B. Stoller's avatar
Leigh B. Stoller committed
220
#
# A third positional argument on a modify is the new NS file. Untaint it,
# resolve it, and make sure it lives somewhere acceptable before
# recording the names used later ($nsfile, $modnsfile).
#
if ($inout eq "modify" && @ARGV > 2) {
    $tempnsfile = $ARGV[2];

    #
    # Untaint nsfile argument; Allow slash.
    #
    if ($tempnsfile =~ /^([-\w\.\/]+)$/) {
	$tempnsfile = $1;
    }
    else {
	die("Tainted nsfile name: $tempnsfile\n");
    }
    #
    # Called from ops interactively. Make sure NS file in /proj or /users.
    #
    # Use realpath to resolve any symlinks. The output is checked against
    # the same character set before it is trusted.
    #
    my $translated = `realpath $tempnsfile`;
    if ($translated =~ /^([-\w\.\/]+)$/) {
	$tempnsfile = $1;
    }
    else {
	die("Tainted nsfile returned by realpath: $translated\n");
    }

    #
    # The file must reside in /proj, /groups, or /users. Since this script
    # runs as the caller, regular file permission checks ensure its a file
    # the user is allowed to use. /tmp/$guid-$nsref.nsfile also allowed
    # since this script is invoked directly from web interface, which generates
    # a name that should not be guessable, so as long as it looks to be in
    # proper format, we accept it.
    #
    # The three directory patterns require the trailing slash so that a
    # sibling such as /projX/... is not mistaken for /proj/... .
    #
    if (! ($tempnsfile =~ /^\/tmp\/[-\w]+-\d+\.nsfile/) &&
	! ($tempnsfile =~ /^\/var\/tmp\/php\w+/) &&
	! ($tempnsfile =~ /^\/proj\//) &&
	! ($tempnsfile =~ /^\/groups\//) &&
	! ($tempnsfile =~ /^\/users\//)) {
	die("$tempnsfile does not resolve to an appropriate directory!\n");
    }

    # Must be a readable, non-empty regular file.
    if (! -f $tempnsfile || -z $tempnsfile || ! -r $tempnsfile) {
	die("*** $0:\n".
	    "    $tempnsfile does not look like an NS file!\n");
    }
    $nsfile    = "$eid.ns";
    $modnsfile = "${eid}-modify.ns";
}
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283

#
# Verify user and get his DB uid.
#
if (! UNIX2DBUID($UID, \$dbuid)) {
    die("*** $0:\n".
	"    You do not exist in the Emulab Database.\n");
}

#
# Get email info for user.
#
if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
    die("*** $0:\n".
	"    Cannot determine your name and email address.\n");
}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
284
$isadmin = TBAdmin($UID);
285

Kevin Atkinson's avatar
 
Kevin Atkinson committed
286 287 288 289 290
#
# Set error reporting info
#
tblog_set_info($pid,$eid,$UID);

291
#
Chad Barb's avatar
 
Chad Barb committed
292
# Verify that this person can muck with the experiment.
293 294
# Note that any script down the line has to do an admin check also. 
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
295
if ($UID && !$isadmin &&
296 297
    !TBExptAccessCheck($dbuid, $pid, $eid, TB_EXPT_DESTROY)) {
    die("*** $0:\n".
Chad Barb's avatar
 
Chad Barb committed
298
	"    You do not have permission to swap or modify this experiment!\n");
299 300
}

301 302 303 304
# Must do this before lock tables!
# idleswap is in minutes, threshold is in hours
$idleswap_time = 60 * TBGetSiteVar("idle/threshold");

305 306 307 308 309 310 311 312 313
#
# In wait mode, block interrupt until we spin off the background process.
#
if ($waitmode) {
    $SIG{TERM} = 'IGNORE';
    $SIG{QUIT} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}

314 315 316 317 318 319 320 321 322 323 324
#
# Check for overquota; we deal with it below, cause of the batch system.
#
my $overquota = system("$checkquota $dbuid");

if ($overquota) {
    print STDERR
	"*** $0:\n".
	"    You are over your disk quota on $CONTROL; please cleanup!\n";
}

325 326 327 328 329 330
#
# Temp fix; Disallow swapmod to firewalled experiments. This will come
# out later.
#
my $firewalled = TBExptFirewall($pid, $eid);

331 332 333 334 335
#
# We have to protect against trying to end an experiment that is currently
# in the process of being terminated. We use a "wrapper" state (actually
# a timestamp so we can say when termination was requested) since
# terminating consists of a couple of different experiment states down inside
Chad Barb's avatar
Chad Barb committed
336
# the tb scripts.
337 338 339 340 341 342 343 344 345 346 347 348
#
DBQueryFatal("lock tables experiments write");

$query_result =
    DBQueryFatal("SELECT * FROM experiments WHERE eid='$eid' and pid='$pid'");

if (! $query_result->numrows) {
    die("*** $0:\n".
	"    No such experiment $pid/$eid exists!\n");
}
my %hashrow = $query_result->fetchhash();
my $expt_head_login = $hashrow{'expt_head_uid'};
349
my $last_swap_uid   = $hashrow{'expt_swap_uid'};
350
my $estate          = $hashrow{'state'};
351
my $batchstate      = $hashrow{'batchstate'};
352
my $expt_path       = $hashrow{'path'};
353
my $expt_locked     = $hashrow{'expt_locked'};
354
my $isbatchexpt     = $hashrow{'batchmode'};
355
my $canceled        = $hashrow{'canceled'};
356 357 358 359 360 361 362 363 364 365
my $swappablebit= $hashrow{'swappable'};
my $idleswapbit = $hashrow{'idleswap'};
my $autoswapbit = $hashrow{'autoswap'};
my $swappablestr= ( $swappablebit ? "Yes" : "No" );
my $idleswapstr = ( $idleswapbit ? "Yes" : "No" );
my $autoswapstr = ( $autoswapbit ? "Yes" : "No" );
my $noswap      = $hashrow{'noswap_reason'};
my $noidleswap  = $hashrow{'noidleswap_reason'};
my $idleswaptime= $hashrow{'idleswap_timeout'} / 60.0;
my $autoswaptime= $hashrow{'autoswap_timeout'} / 60.0;
366
my $rendering   = $hashrow{'prerender_pid'};
367
my $elabinelab  = $hashrow{'elab_in_elab'};
368
my $lockdown    = $hashrow{'lockdown'};
369

370 371
if ($inout ne "out") {
    # I'm going to update this below, so fix the value before I use it.
372
    $idleswap_time = min($idleswaptime * 60, $idleswap_time);
373 374 375
    $idleswaptime = $idleswap_time / 60.0;
}

376 377
my $swapsettings = 
  "Idle-Swap:   $idleswapstr".
378
  ($idleswapbit ? ", at $idleswaptime hours\n" : " (Reason: $noidleswap)\n").
379 380
  "Auto-Swap:   $autoswapstr".
  ($autoswapbit ? ", at $autoswaptime hours\n" : "\n");
381

382
if (! chdir($workdir)) {
383
    die("*** $0:\n".
384
	"    Could not chdir to $workdir: $!\n");
385 386
}

387
#
388 389 390
# This script is called from the batch daemon.
# 
if ($batch) {
391
    #
392 393 394
    # Sanity Check. If called from the daemon, must already be locked,
    # must be a batch experiment, and must be in proper state for the
    # operation requested. 
395
    #
396 397 398 399 400 401 402 403 404
    die("*** $0:\n".
	"    Experiment $pid/$eid is supposed to be a batch experiment!\n")
	if (!$isbatchexpt);
    
    die("*** $0:\n".
	"    Batch experiment $pid/$eid should be locked!\n")
	if (!defined($expt_locked) ||
	    $batchstate ne BATCHSTATE_LOCKED());

405 406 407 408
    die("*** $0:\n".
	"    Batch experiment $pid/$eid is locked down; cannot be swapped!\n")
	if ($lockdown);

409 410 411 412 413 414 415 416 417
    if ($inout eq "in") {
	die("*** $0:\n".
	    "    Batch experiment $pid/$eid is not in the proper state!\n".
	    "    Currently $estate, but should be QUEUED.\n")
	    if ($estate ne EXPTSTATE_QUEUED);
	
	die("*** $0:\n".
	    "    Batch experiment $pid/$eid has been canceled! Aborting.\n")
	    if ($canceled);
418 419 420 421 422

	# Do not allow it to swap in. What about swapout? 
	die("*** $0:\n".
	    "    Batch experiment cannot swap in when over quota! Aborting.\n")
	    if ($overquota);
423 424 425 426 427 428
    }
    elsif ($inout eq "out") {
	die("*** $0:\n".
	    "    Batch experiment $pid/$eid is not in the proper state!\n".
	    "    Currently $estate, but should be ACTIVE.\n")
	    if ($estate ne EXPTSTATE_ACTIVE);
429 430
    }
    else {
431 432 433 434 435 436
	die("*** $0:\n".
	    "    Improper request from batch daemon for $pid/$eid!\n");
    }
}
else {
    if ($isbatchexpt) {
437 438 439 440
	#
	# User is requesting that a batch either be injected or paused.
	# Sanity check the state, but otherwise let the batch daemon
	# handle it.
441 442
	#
	ExitWithStatus(1, "Batch experiment $pid/$eid is still canceling!")
443
	    if ($canceled);
444

445 446 447
	ExitWithStatus(1, "Batch experiment $pid/$eid is locked down!")
	    if ($lockdown);

448
	if ($inout eq "in") {
449
	    ExitWithStatus(1,
450 451 452
			   "Batch experiment $pid/$eid must be SWAPPED to\n".
			   "QUEUE. Currently $estate.")
		if ($estate ne EXPTSTATE_SWAPPED);
453 454 455 456 457 458

	    ExitWithStatus(1,
			   "Batch experiment $pid/$eid cannot swap in when ".
			   "over quota!\n")
		if ($overquota);
	    
459
	    SetExpState($pid, $eid, EXPTSTATE_QUEUED);
460 461
	}
	elsif ($inout eq "out") {
462
	    ExitWithStatus(1,
463 464 465 466
			   "Batch experiment $pid/$eid must be ACTIVE or\n".
			   "ACTIVATING to swap out. Currently $estate.")
		if ($estate ne EXPTSTATE_ACTIVE &&
		    $estate ne EXPTSTATE_ACTIVATING);
467 468 469 470 471

	    #
	    # Since the batch daemon has control, all we can do is set
	    # the cancel bit.
	    # 
472
	    TBSetCancelFlag($pid, $eid, EXPTCANCEL_SWAP);
473 474
	}
	elsif ($inout eq "pause") {
475
	    ExitWithStatus(1,
476 477 478
			   "Batch experiment $pid/$eid must be QUEUED to\n".
			   "DEQUEUE. Currently $estate.")
		if ($estate ne EXPTSTATE_QUEUED);
479 480

	    #
481 482 483 484
	    # XXX. The batch daemon might already have the experiment, but
	    # not have shipped it off to startexp. Change the state
	    # anyway. The error will be noticed later when startexp dies,
	    # and the batch daemon gets the error back. This sucks.
485
	    #
486
	    SetExpState($pid, $eid, EXPTSTATE_SWAPPED);
487
	}
488
	elsif ($inout eq "modify") {
489
	    ExitWithStatus(1,
490 491 492 493
			   "Batch experiment $pid/$eid must be SWAPPED or\n".
			   "ACTIVE to modify. Currently $estate.")
		if (($estate ne EXPTSTATE_SWAPPED &&
		     $estate ne EXPTSTATE_ACTIVATING) ||
494
		    $batchstate ne BATCHSTATE_UNLOCKED());
495

496 497 498
	    ExitWithStatus(1,
			"Cannot modify an active firewalled experiment (yet).")
		if ($firewalled && $estate ne EXPTSTATE_SWAPPED && !$isadmin);
499

500 501 502
	    ExitWithStatus(1,
			"Cannot modify an active ElabInElab experiment (yet).")
		if ($elabinelab && $estate ne EXPTSTATE_SWAPPED && !$isadmin);
503 504 505 506 507 508

	    ExitWithStatus(1,
			   "Cannot modify batch experiment $pid/$eid when ".
			   "over quota!\n")
		if ($overquota);
	    
509
	    #
510
	    # Otherwise, proceed with the modify. The experiment will be
511 512
	    # locked below, and so it cannot be injected or otherwise messed
	    # with since its state is going to be changed before we unlock
513 514 515 516
	    # the experiments table. The batch daemon will leave it alone
	    # until the modify is done. If the modify fails and cannot recover
	    # it is going to get swapped out; that is okay since the batch
	    # daemon does not keep state internally. 
517
	    #
518 519
	    goto doit;
	}
520 521
	else {
	    die("*** $0:\n",
522
		"    Operation $inout not allowed on a batch experiment!\n");
523
	}
524 525
	ExitWithStatus(0, 
		       "Batch experiment $pid/$eid state has been changed.\n");
526
      doit:
527
    }
528 529 530 531 532 533 534 535 536 537
    else {
	#
	# If the cancel flag is set, then user must wait for that to
	# clear before we can do anything else.
	#
	ExitWithStatus(1,
		       "Experiment $pid/$eid has its cancel flag set!.\n".
		       "You must wait for that to clear before you can swap\n".
		       "or modify the experiment.\n")
	    if ($canceled);
538

539 540 541 542
 	ExitWithStatus(1,
		       "Experiment $pid/$eid is locked down; cannot swap!\n")
	    if ($lockdown);

543 544 545 546 547 548 549 550 551 552
	#
	# Check the state for the various operations.
	#
	# Skipped entirely when -f (force) was given. Each arm leaves via
	# ExitWithStatus() when the current state ($estate) does not allow
	# the requested operation ($inout).
	if (!$force) {
	  SWITCH: for ($inout) {
	      /^in$/i && do {
		  if ($estate ne EXPTSTATE_SWAPPED()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped out!");
		  }
		  ExitWithStatus(1,
				 "Experiment $pid/$eid cannot swap in when ".
				 "over quota!\n")
		      if ($overquota);

		  last SWITCH;
	      };
	      /^out$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE() &&
		      $estate ne EXPTSTATE_PANICED() &&
		      $estate ne EXPTSTATE_ACTIVATING()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped in ".
				     "or activating!\n");
		  }

		  #
		  # Must be an admin person to swap out an experiment that
		  # has had its panic button pressed.
		  #
		  if ($estate eq EXPTSTATE_PANICED() && !$isadmin) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid had its panic ".
				     "button pressed!\n".
				     "Only a testbed administrator can swap ".
				     "this experiment out.");
		  }

		  if ($estate eq EXPTSTATE_ACTIVATING()) {
		      #
		      # All we can do is set the cancel flag and hope that
		      # it gets noticed. We do not wait.
		      #
		      TBSetCancelFlag($pid, $eid, EXPTCANCEL_SWAP);

		      ExitWithStatus(0,
				     "Experiment $pid/$eid swapin has been  ".
				     "marked for cancelation.\n".
				     "You will receive email when the original ".
				     "swap request has finished.");
		  }
		  last SWITCH;
	      };
	      /^restart$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped in!");
		  }
		  last SWITCH;
	      };
	      /^modify$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE() &&
		      $estate ne EXPTSTATE_SWAPPED()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid must be ACTIVE or\n".
				     "SWAPPED to modify!\n");
		  }
		  # Firewalled and ElabInElab experiments may only be
		  # modified while swapped out, unless the caller is an admin.
		  ExitWithStatus(1,
			"Cannot modify an active firewalled experiment (yet).")
		      if ($firewalled &&
			  $estate ne EXPTSTATE_SWAPPED() && !$isadmin);

		  ExitWithStatus(1,
			"Cannot modify an active ElabInElab experiment (yet).")
		      if ($elabinelab &&
			  $estate ne EXPTSTATE_SWAPPED() && !$isadmin);

		  ExitWithStatus(1,
				 "Experiment $pid/$eid cannot be modified ".
				 "when over quota!\n")
		      if ($overquota);

		  last SWITCH;
	      };
	      # No arm matched. Report the requested operation; $action is
	      # not assigned until later in the script.
	      die("*** $0:\n".
		  "    Missing state check for action: $inout\n");
	  }
	}
    }
632 633
}

634 635 636 637 638 639 640
#
# Determine the temporary and next state for experiment. If the experiment
# is a batch experiment, then the next state is actually handled by the
# batch daemon, but we still have to deal with the temporary state. 
#
# Map the requested operation onto the transitional state stored in
# $nextswapstate. A modify starts in MODIFY_PARSE when the experiment is
# currently swapped out and in MODIFY_REPARSE otherwise.
SWITCH: for ($inout) {
    /^in$/i && do {
	$nextswapstate = EXPTSTATE_ACTIVATING();
	last SWITCH;
    };
    /^out$/i && do {
	$nextswapstate = EXPTSTATE_SWAPPING();
	last SWITCH;
    };
    /^restart$/i && do {
	$nextswapstate = EXPTSTATE_RESTARTING();
	last SWITCH;
    };
    /^modify$/i && do {
	$nextswapstate = (($estate eq EXPTSTATE_SWAPPED()) ?
			  EXPTSTATE_MODIFY_PARSE() : EXPTSTATE_MODIFY_REPARSE());
	last SWITCH;
    };
    # No arm matched. Report the requested operation; $action is not
    # assigned until later in the script.
    die("*** $0:\n".
	"    Missing state check for action: $inout\n");
}
660 661
 
# Update idleswap_timeout to whatever the current value is.
662
if ($inout ne "out") {
663 664 665
    DBQueryFatal("update experiments set idleswap_timeout='$idleswap_time' ".
		 "where eid='$eid' and pid='$pid'");
}
666

667 668 669 670 671 672
#
# On a failure, we go back to this swapstate. Might be modified below.
# 
$termswapstate = $estate;

# Lock the record, set the nextstate, and unlock the table.
673 674 675 676 677 678 679 680 681 682
TBLockExp($pid, $eid, $nextswapstate)
    or die("*** $0:\n".
	   "Failed to set experiment state to $nextswapstate\n");

#
# At this point, we need to force a cleanup no matter how we exit.
# See the END block below.
#
$justexit = 0;

683 684 685 686
DBQueryFatal("unlock tables");

#
# XXX - At this point a failure is going to leave things in an
687 688 689 690
# inconsistent state. Be sure to call fatal() only since we are
# going into the background, and we have to send email since no
# one is going to see printed error messages (output goes into the
# log file, which will be sent along in the email). 
691 692
#

693 694
if ($inout eq "in") {
    $action = "swapped in";
695
    $tag    = "swapin";
696 697 698
}
if ($inout eq "out") {
    $action = "swapped out";
699
    $tag    = "swapout";
700 701 702 703
}
if ($inout eq "restart") {
    $action = "restarted";
}
Chad Barb's avatar
 
Chad Barb committed
704 705
if ($inout eq "modify") {
    $action = "modified";
706
    $tag    = "swapmod";
Chad Barb's avatar
 
Chad Barb committed
707
}
708

709 710 711 712 713 714 715 716 717 718 719 720 721 722 723
#
# Get email address of the experiment head, which may be different than
# the person who is actually terminating the experiment, since its polite
# to let the original creator know whats going on. 
#
my $expt_head_name;
my $expt_head_email;

if (! UserDBInfo($expt_head_login, \$expt_head_name, \$expt_head_email)) {
    print STDERR "*** WARNING: ".
	         "Could not determine name/email for $expt_head_login.\n";
    $expt_head_name  = "TBOPS";
    $expt_head_email = $TBOPS;
}

724 725 726
#
# Before going to background, we have to copy out the NS file!
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
727
if ($inout eq "modify" && defined($modnsfile)) {
728 729
    unlink($modnsfile);
    if (system("/bin/cp", "$tempnsfile", "$modnsfile")) {
730
	fatal("Could not copy $tempnsfile to $modnsfile");
731 732 733 734
    }
    chmod(0664, "$modnsfile");
}

735 736 737 738
#
# If not in batch mode, go into the background. Parent exits.
#
# Outside batch mode: create and register the experiment log file, then
# split via TBBackGround(). The branch taken on a true return value is the
# parent; it either exits at once or, with -w, waits for the child.
if (! $batch) {
    $logname = TBExptCreateLogFile($pid, $eid, "swapexp");
    TBExptSetLogFile($pid, $eid, $logname);
    TBExptOpenLogFile($pid, $eid);

    if (my $childpid = TBBackGround($logname)) {
	#
	# Parent exits normally, unless in waitmode. We have to set
	# justexit to make sure the END block below does not run.
	#
	$justexit = 1;

	if (!$waitmode) {
	    print("Experiment $pid/$eid is now being $action.\n".
		  "You will be notified via email when this is done.\n")
		if (! $quiet);
	    exit(0);
	}
	# Name the operation itself (swapin, swapout, ...); $action holds a
	# past-tense phrase such as "swapped in" and reads wrongly here.
	print("Waiting for experiment $eid to finish its swap${inout}\n")
	    if (! $quiet);

	if (isatty(STDIN) && !$quiet) {
	    print("You may type ^C at anytime; you will be notified via email.".
		  "\n".
		  "You will not actually interrupt the experiment itself.\n");
	}

	# Give child a chance to run.
	select(undef, undef, undef, 0.25);

	#
	# Reset signal handlers. User can now kill this process, without
	# stopping the child.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{QUIT} = 'DEFAULT';

	#
	# Wait until child exits or until user gets bored and types ^C.
	#
	waitpid($childpid, 0);

	print("Done. Exited with status: $?\n")
	    if (! $quiet);
	# Pass the child's exit code (high byte of $?) on to our caller.
	exit($? >> 8);
    }
    TBdbfork();
}

788 789 790 791 792 793 794 795
#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815
#
# We need to catch TERM because sometimes things go wrong and we have to
# kill an experiment swap that is hung or otherwise scrogged. Rather than
# trying to kill off the children one by one, let's arrange to catch it
# here and send a killpg to the children. This is not to be done lightly,
# because it can leave things worse than they were before!
#
sub handler ($) {
    my $signame = shift;

    # Ignore further TERMs ourselves, then deliver TERM to our whole
    # process group (negative pid) and pause for a second.
    $SIG{TERM} = 'IGNORE';
    my $mygroup = getpgrp(0);
    kill(TERM => -$mygroup);
    sleep(1);

    # Record that we were signaled before handing off to fatal().
    $signaled = 1;
    fatal("Caught SIG${signame}! Killing experiment setup ...");
}
$SIG{TERM} = \&handler;
$SIG{QUIT} = 'DEFAULT';

816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
#
# Gather stats; start clock ticking
#
{
    # Stats record type for each operation that is tracked; any other
    # operation (e.g. restart) records nothing here.
    my %swapstat_for = (
	"in"     => TBDB_STATS_SWAPIN(),
	"out"    => TBDB_STATS_SWAPOUT(),
	"modify" => TBDB_STATS_SWAPMODIFY(),
    );

    GatherSwapStats($pid, $eid, $dbuid, $swapstat_for{$inout}, 0,
		    TBDB_STATS_FLAGS_START())
	if (exists($swapstat_for{$inout}));
}

832 833 834
#
# Remove old report file since its contents are going to be invalid.
#
835
if ($inout ne "restart" && -e $repfile) {
836 837 838
    unlink("$repfile");
}

839 840 841 842
#
# Sanity check states in case someone changes something.
#
if ($inout eq "out") {
843
    my $optarg = (($force || $idleswap) ? "-force" : "");
844

845 846
    print STDOUT "Running 'tbswap out $optarg $pid $eid'\n";
    if (system("$tbdir/tbswap out $optarg $pid $eid") != 0) {
847
	$errorstat = $? >> 8;
848
	fatal("tbswap out failed!");
849
    }
850 851 852 853 854 855 856 857

    #
    # Add the files that have been detected by tracing to the archive.
    #
    if (libArchive::TBExperimentArchiveAddTracedFiles($pid, $eid) < 0) {
	fatal("Failed to add traced files to the experiment archive!");
    }

858 859
    SetExpState($pid, $eid, EXPTSTATE_SWAPPED)
	or fatal("Failed to set experiment state to " . EXPTSTATE_SWAPPED());
Leigh B. Stoller's avatar
Leigh B. Stoller committed
860
    TBExptClearPanicBit($pid, $eid);
861
}
862
elsif ($inout eq "in") {
863 864
    GatherSwapStats($pid, $eid, $dbuid,
		    TBDB_STATS_SWAPIN, 0, TBDB_STATS_FLAGS_PRESWAPIN);
865 866 867 868

    # Set the swapper now so that nodes use the proper uid. If the swapin
    # fails, we leave the swapper as is, since its harmless and informative.
    TBExptSetSwapUID($pid, $eid, $dbuid);
869
    
870
    print STDOUT "Running 'tbswap in $pid $eid'\n";
Chad Barb's avatar
 
Chad Barb committed
871
    if (system("$tbdir/tbswap in $pid $eid") != 0) {
872
	$errorstat = $? >> 8;
873
	fatal("tbswap in failed!");
874
    }
875 876
    SetExpState($pid, $eid, EXPTSTATE_ACTIVE)
	or fatal("Failed to set experiment state to " . EXPTSTATE_ACTIVE());
877

878
    system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
Chad Barb's avatar
Chad Barb committed
879
}
Chad Barb's avatar
 
Chad Barb committed
880
elsif ($inout eq "modify") {
881
    my $modifyError;
Chad Barb's avatar
Chad Barb committed
882

883 884 885 886 887 888 889
    #
    # Prepare the Archive for the swapmod, in case we have to "roll back".
    #
    if (libArchive::TBExperimentArchivePreSwapMod($pid, $eid) < 0) {
	fatal("Failed to do a preswapmod on the experiment archive!");
    }

890 891 892
    GatherSwapStats($pid, $eid, $dbuid,
		    TBDB_STATS_SWAPMODIFY, 0, TBDB_STATS_FLAGS_PREMODIFY);

893 894 895 896 897
    # Gather up some firewall state for later comparison.
    if (GatherFWinfo() < 0) {
	fatal("Could not gather firewall info; cannot safely continue!");
    }

Chad Barb's avatar
Chad Barb committed
898
    print "Backing up old experiment state ... " . TBTimeStamp() . "\n";
899
    if (TBExptBackupVirtualState($pid, $eid)) {
900
	fatal("Could not backup experiment state; cannot safely continue!");
Chad Barb's avatar
Chad Barb committed
901 902 903
    }

    #
Leigh B. Stoller's avatar
Leigh B. Stoller committed
904
    # Rerun tbprerun if modifying, but only if new NS file provided.
905 906
    # Yep, we allow reswap without changing the NS file. For Shashi and SIM.
    # Note that tbprerun kills the renderer if its running.
Chad Barb's avatar
Chad Barb committed
907
    #
Leigh B. Stoller's avatar
Leigh B. Stoller committed
908 909 910
    if (defined($modnsfile)) {
	print STDOUT "Running 'tbprerun $pid $eid $modnsfile'\n";
	if (system("$tbdir/tbprerun $pid $eid $modnsfile") != 0) {
911
	    print STDOUT "Modify Error: tbprerun failed.\n";
912
	  FWHOSED:
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930
	    print STDOUT "Recovering experiment state...\n";

	    if (TBExptRemoveVirtualState($pid, $eid) ||
		TBExptRestoreVirtualState($pid, $eid)) {
		$modifyHosed = 1;
		fatal("Experiment state could not be restored!");
		# Never returns;
	    }
	    #
	    # If the renderer was running when we started the swapmod, then we
	    # want to restart it. If it was stopped, then the renderer info
	    # was captured with the rest of the virtual state (restored above).
	    #
	    system("prerender -t $pid $eid")
		if ($rendering);

	    fatal("Update aborted; old virtual state restored.");
	    # Never returns;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
931
	}
932 933 934 935 936 937 938 939 940
	#
	# Okay, whenever a new NS file is presented, we need to do some
	# checks on the firewall to make sure the user is not trying to
	# do something "unsafe". 
	#
	if (CheckFWinfo($estate) != 0) {
	    # All the stuff for recovering is right above, so go there. 
	    goto FWHOSED;
	}
Chad Barb's avatar
Chad Barb committed
941 942
    }

Chad Barb's avatar
 
Chad Barb committed
943
    #
944
    # Our next state depends on whether the experiment was active or swapped.
Chad Barb's avatar
 
Chad Barb committed
945
    #
946 947 948 949 950
    if ($estate eq EXPTSTATE_SWAPPED) {
	SetExpState($pid, $eid, EXPTSTATE_SWAPPED);
    }
    else {
	SetExpState($pid, $eid, EXPTSTATE_MODIFY_RESWAP);
951

952 953 954 955 956
	# Set the swapper now so that nodes use the proper uid. If the
	# swapin fails, we need to reset the swapper back so that he
	# is charged appropriately.
	TBExptSetSwapUID($pid, $eid, $dbuid);

957 958 959 960 961 962 963 964 965 966 967 968
	my $optarg = "";
	#
	# For elabinelab experiments; ignore reboot/eventsys_restart,
	# and force noreconfig; none of it will work or make sense. 
	#
	if ($elabinelab) {
	    $optarg = "-noreconfig";
	}
	else {
	    $optarg  = ($reboot ? "-reboot" : "");
	    $optarg .= ($eventsys_restart ? " -eventsys_restart" : "");
	}
969

970 971 972
	print STDOUT "Running 'tbswap update $optarg $pid $eid'\n";
	if (system("$tbdir/tbswap update $optarg $pid $eid") == 0) {
	    #
973 974
	    # Success. Set the state back to active because that's where it
	    # started.
975 976 977
	    # 
	    SetExpState($pid, $eid, EXPTSTATE_ACTIVE);
	    $estate = EXPTSTATE_ACTIVE;
978 979 980 981 982 983 984 985

	    #
	    # Add the files that have been detected by tracing to the archive.
	    #
	    if (libArchive::TBExperimentArchiveAddTracedFiles($pid, 
							      $eid) < 0) {
		fatal("Failed to add traced files to the experiment archive!");
	    }
986 987 988 989
	}
	else {
	    $modifyError = $errorstat = $? >> 8;
	    print STDOUT "Modify Error: tbswap update failed.\n";
Chad Barb's avatar
 
Chad Barb committed
990

991
	    #
992 993 994
	    # tbswap either restored the experiment to the way it was,
	    # or it swapped it out completely. In either case, it has
	    # also restored the virtual state. 
995
	    # 
996 997 998 999 1000 1001 1002
	    # Icky. Magic return code that says tbswap swapped it out.
	    # We do not want tbswap to muck with states anymore, so
	    # need to know what it did. At some point we should clean
	    # up the exit reporting! Anyway, fatal() needs to know the
	    # right state to go back to (no longer ACTIVE).
	    #
	    if ($errorstat & 0x40) {
1003 1004
		$estate = EXPTSTATE_SWAPPED;
		$termswapstate = EXPTSTATE_SWAPPED;
1005
		$modifySwapped = 1;
1006 1007
                # Old accounting info.
		TBSetExpSwapTime($pid, $eid);
1008 1009 1010 1011
		$modifyError = "Update aborted; experiment swapped out.";
	    }
	    else {
		$modifyError = "Update aborted; old state restored.";
1012 1013 1014

		# Reset the swapper since the experiment is still running.
		TBExptSetSwapUID($pid, $eid, $last_swap_uid);
Chad Barb's avatar
 
Chad Barb committed
1015
	    }
1016
	}
Chad Barb's avatar
Chad Barb committed
1017 1018
    }

1019 1020 1021 1022 1023 1024 1025 1026
    #
    # We need to rerender only if the NS file was changed (ran tbprerun).
    # If the swapmod succeeded, then unconditionally run the renderer. If
    # the swap failed, then we need to run the renderer only if we stopped
    # one in progress.
    #
    if (defined($modnsfile)) {
	system("prerender -t $pid $eid")	
1027
	    if (!defined($modifyError) || $rendering);
Chad Barb's avatar
 
Chad Barb committed
1028
    }
1029 1030 1031 1032 1033 1034 1035

    #
    # Swapmod failed ...
    #
    fatal($modifyError)
	if (defined($modifyError));

1036 1037 1038 1039 1040 1041 1042 1043 1044 1045
    #
    # Move the temporary ns file to its real name.
    #
    if (defined($modnsfile)) {
	unlink($nsfile);
	if (system("/bin/mv", "$modnsfile", "$nsfile")) {
	    fatal("Could not mv $modnsfile to $nsfile");
	}
    }

1046
    TBExptClearBackupState($pid, $eid);
1047
    system("$tbdir/tbreport -b $pid $eid 2>&1 > $repfile");
1048
}
Chad Barb's avatar
 
Chad Barb committed
1049
else { # $inout eq "restart" assumed.
1050
    print STDOUT "Running 'tbrestart $pid $eid'\n";
1051
    if (system("$tbdir/tbrestart $pid $eid") != 0) {
1052
	fatal("tbrestart failed!");
1053
    }
1054
    SetExpState($pid, $eid, EXPTSTATE_ACTIVE);
1055
}
1056

1057 1058 1059 1060 1061 1062 1063 1064 1065 1066
#
# Try to copy off the files for testbed information gathering.
# The return value of the call is not checked here.
#
TBSaveExpLogFiles($pid, $eid);

#
# Make a copy of the work dir in the user visible space so the user
# can see the log files. This overwrites existing files of course,
# but that's okay. The command is run in list form so that the paths
# are handed to cp directly rather than being parsed by a shell; the
# exit status is not checked.
#
system("cp", "-Rfp", "$workdir/", "$userdir/tbdata/");
1068

1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101
#
# The archive gets different treatment when doing a swapmod.
#
if ($inout eq "modify") {
    #
    # Get the new NS file into the new swapdir and add it to the archive.
    # The copy uses list-form system() so the paths never pass through a
    # shell, and its exit status is checked so that a failed copy is
    # reported instead of archiving a stale or missing file.
    #
    if (defined($nsfile)) {
	if (system("cp", "-p", "$workdir/$nsfile", "$userdir/$nsfile")) {
	    fatal("Could not copy $workdir/$nsfile to $userdir/$nsfile");
	}

	if (libArchive::TBExperimentArchiveAddFile($pid, $eid,
						   "$userdir/$nsfile") < 0) {
	    fatal("Failed to add $userdir/$nsfile to the archive!");
	}
    }

    # Commit the archive for the swapmod; any nonzero return is fatal.
    print "Doing a commit on the previous experiment archive ...\n";
    libArchive::TBExperimentArchiveSwapModCommit($pid, $eid) == 0 or
	fatal("Failed to commit experiment archive!");
}

#
# Record a SavePoint on the experiment files in the archive; a negative
# return from the archive library is treated as fatal.
#
fatal("Failed to do a savepoint on the experiment archive!")
    if (libArchive::TBExperimentArchiveSavePoint($pid, $eid, $tag) < 0);

#
# After a swapout, commit the experiment archive. Any nonzero return
# from the commit is fatal.
#
if ($inout eq "out") {
    print "Doing a commit on the experiment archive ...\n";
    if (libArchive::TBCommitExperimentArchive($pid, $eid, $tag) != 0) {
	fatal("Failed to commit experiment archive!");
    }
}

1102 1103 1104 1105
#
# Gather stats. 
#
if ($inout eq "in") {
1106
    GatherSwapStats($pid, $eid, $dbuid, TBDB_STATS_SWAPIN, 0);