swapexp.in 44.8 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3 4

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
6 7 8
# All rights reserved.
#

9 10
use English;
use Getopt::Std;
11
use POSIX qw(isatty setsid);
12 13

#
14
# This gets invoked from the Web interface.
15
# Swap an experiment in, swap it out, restart or modify.
16
#
17

18 19
sub usage()
{
20
    print(STDERR
21
	  "Usage: swapexp [-q] [-b | -w] [-i | -a | -f] [-r] [-e]\n".
22 23 24 25
	  "               <-s in | out | restart | modify | pause>\n".
	  "               <pid> <eid> [<nsfile>]\n".
	  "switches and arguments:\n".
	  "-w       - wait for non-batchmode experiment swap/modify\n".
26
	  "-q       - be less chatty\n".
27 28 29 30 31 32
	  "-r       - reboot nodes when doing a modify experiment\n".
	  "-e       - restart event scheduler when doing a modify experiment\n".
	  "-s <op>  - Operation to perform; one of those listed above\n".
	  "<pid>    - The project the experiment belongs to\n".
	  "<eid>    - The experiment name (id)\n".
	  "<nsfile> - Optional NS file to parse for experiment modify\n");
33 34
    exit(-1);
}
35
my  $optlist = "biafres:wqx";
36

37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
#
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Termination is proceeding in the background. Notified later.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
sub ExitWithStatus($$)
{
    my ($status, $message) = @_;
    
    if ($status < 0) {
52
	tbdie($message);
53
    }
54
    elsif ($status > 0) {
55
	tbnotice($message);
56
    }
57 58 59
    else {
	tbinfo($message);
    }
60 61 62
    exit($status);
}

63 64 65 66 67
#
# Configure variables
#
my $TB     = "@prefix@";
my $TBOPS  = "@TBOPSEMAIL@";
68
my $TBROBOCOPS = "@TBROBOCOPSEMAIL@";
69
my $TBLOGS = "@TBLOGSEMAIL@";
70
my $TBINFO = "$TB/expinfo";
71
my $TBDOCBASE = "@TBDOCBASE@";
72
my $TBBASE = "@TBBASE@";
73
my $CONTROL  = "@USERNODE@";
74 75 76 77 78 79 80

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
81
use libtblog;
82
use libArchive;
83
use Template;
84
use Experiment;
85
use User;
86

87 88 89 90 91 92 93 94 95
# Be careful not to exit on transient error; 0 means infinite retry.
$libdb::DBQUERY_MAXTRIES = 0;

# For the END block below.
my $cleaning = 0;
my $justexit = 1;
my $signaled = 0;

my $tbdir    = "$TB/bin";
96
my $tbdata   = "tbdata";
97
my $checkquota = "$TB/sbin/checkquota";
98
my $batch    = 0;
99
my $idleswap = 0;
100 101
my $autoswap = 0;
my $force    = 0;
102
my $reboot   = 0;
103
my $waitmode = 0;
104
my $quiet    = 0;
105
my $eventsys_restart   = 0;
106
my $errorstat= -1;
107 108
my $modifyHosed   = 0;
my $modifySwapped = 0;
109
my $robotexp = 0;
110
my $template_node = 0;
111

112 113
my $inout;
my $logname;
114
my @allnodes;
115
my @row;
116
my $action;
117
my $tag;
118
my $nextswapstate;
119
my $termswapstate;
120

Kevin Atkinson's avatar
Kevin Atkinson committed
121 122
my $modifyError; # needed when emailing error

123
# Protos
124
sub fatal($;$);
125 126 127
sub CheckFWinfo($);
sub GatherFWinfo();
		
128 129 130
#
# Untaint the path
# 
131
$ENV{'PATH'} = "/bin:/usr/bin:$TB/libexec/vis";
132 133 134 135 136 137 138
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn off line buffering on output
#
$| = 1;

139 140 141 142 143 144 145
#
# Set umask for start/swap. We want other members in the project to be
# able to swap/end experiments, so the log and intermediate files need
# to be 664 since some are opened for append.
#
umask(0002);

146 147 148 149 150 151 152 153
#
# Parse command arguments. Once we return from getopts, all that should
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
}
154 155 156
if (defined($options{"i"})) {
    $idleswap = 1;
}
157 158 159
if (defined($options{"w"})) {
    $waitmode = 1;
}
160 161 162 163 164 165
if (defined($options{"a"})) {
    $autoswap = 1;
}
if (defined($options{"f"})) {
    $force = 1;
}
166 167 168
if (defined($options{"b"})) {
    $batch = 1;
}
169 170 171
if (defined($options{"r"})) {
    $reboot = 1;
}
172 173 174
if (defined($options{"e"})) {
    $eventsys_restart = 1;
}
175 176 177
if (defined($options{"q"})) {
    $quiet = 1;
}
178 179 180
if (defined($options{"x"})) {
    $template_mode = 1;
}
181 182 183
if (defined($options{"s"})) {
    $inout = $options{"s"};

184 185 186
    if ($inout ne "out"     &&
	$inout ne "in"      &&
	$inout ne "restart" &&
187
	$inout ne "pause"   &&
188
	$inout ne "modify") {
189 190 191 192 193 194 195
	usage();
    }
}
else {
    usage();
}

196 197 198 199 200
usage()
    if (($waitmode && $batch) ||
	($inout ne "modify" && @ARGV != 2) ||
	(($waitmode || $batch) && ($idleswap || $autoswap || $force)));

201 202 203 204 205
if ($eventsys_restart && $inout ne "modify") {
    print STDOUT "Usage: swapexp: -e (eventsys_restart) can be used ".
                 "only with -s modify\n";
    usage();
}
206 207 208
my $pid   = $ARGV[0];
my $eid   = $ARGV[1];

209 210 211
#
# Untaint the arguments.
#
212
if ($pid =~ /^([-\w\.]+)$/) {
213 214 215
    $pid = $1;
}
else {
216
    tbdie("Tainted argument $pid!");
217
}
218
if ($eid =~ /^([-\w\.]+)$/) {
219 220 221
    $eid = $1;
}
else {
222
    tbdie("Tainted argument $eid!");
223
}
224
my $repfile = "tbreport.log";
225 226
my $tempnsfile;
my $modnsfile;
227
my $nsfile;
228

229
if ($inout eq "modify" && @ARGV > 2) {
230 231 232 233 234
    $tempnsfile = $ARGV[2];

    #
    # Untaint nsfile argument; Allow slash.
    #
235
    if ($tempnsfile =~ /^([-\w\.\/]+)$/) {
236
	$tempnsfile = $1;
237 238
    }
    else {
239
	tbdie("Tainted nsfile name: $tempnsfile");
240 241 242 243 244 245 246 247 248
    }
    #
    # Called from ops interactively. Make sure NS file in /proj or /users.
    #
    # Use realpath to resolve any symlinks.
    #
    my $translated = `realpath $tempnsfile`;
    if ($translated =~ /^([-\w\.\/]+)$/) {
	$tempnsfile = $1;
249
    }
250
    else {
251
	tbdie("Tainted nsfile returned by realpath: $translated");
252 253 254
    }

    #
255 256 257 258 259 260 261
    # The file must reside in an acceptible location. Since this script
    # runs as the caller, regular file permission checks ensure it is a
    # file the user is allowed to use.   So we don't have to be too tight
    # with the RE matching /tmp and /var/tmp files.  Note that
    # /tmp/$guid-$nsref.nsfile is also allowed since this script is
    # invoked directly from web interface which generates a name that
    # should not be guessable.
262 263 264
    #
    if (! ($tempnsfile =~ /^\/tmp\/[-\w]+-\d+\.nsfile/) &&
	! ($tempnsfile =~ /^\/var\/tmp\/php\w+/) &&
265 266
	! TBValidUserDir($tempnsfile, 0)) {
	tbdie("$tempnsfile does not resolve to an allowed directory!");
267 268 269
    }

    if (! -f $tempnsfile || -z $tempnsfile || ! -r $tempnsfile) {
270
	tbdie("$tempnsfile does not look like an NS file!");
271
    }
272 273
    $nsfile    = "$eid.ns";
    $modnsfile = "${eid}-modify.ns";
274
}
275

276 277 278 279 280 281 282 283 284
my $experiment = Experiment->Lookup($pid, $eid);
if (! $experiment) {
    die("*** $0:\n".
	"    No such experiment $pid/$eid in the Emulab Database.\n");
}

my $workdir = $experiment->WorkDir();
my $userdir = $experiment->UserDir();

285 286 287 288
#
# See if this is a template instance; error if the -x option not provided,
# since that means user is trying to self-terminate; not in the program.
#
289
if (my $instance = Template::Instance->LookupByExptidx($experiment->idx())) {
290
    if ($inout ne "in" && !$template_mode) {
291 292 293 294 295
	die("*** $0:\n".
	    "    $pid/$eid is a template instance; use another command\n");
    }
}

296
#
297
# Verify user and get his DB uid and other info for later.
298
#
299 300 301
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
302
}
303 304 305 306 307
my $user_dbid  = $this_user->dbid();
my $user_uid   = $this_user->uid();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
my $isadmin    = TBAdmin();
308

309 310 311 312 313
#
# Set error reporting info
#
tblog_set_info($pid,$eid,$UID);

314
#
315
# Verify that this person can muck with the experiment.
316 317
# Note that any script down the line has to do an admin check also. 
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
318
if ($UID && !$isadmin &&
319
    ! $experiment->AccessCheck($user_uid, TB_EXPT_DESTROY)) {
320
    tbdie("You do not have permission to swap or modify this experiment!");
321 322
}

323 324 325 326
# Must do this before lock tables!
# idleswap is in minutes, threshold is in hours
$idleswap_time = 60 * TBGetSiteVar("idle/threshold");

327 328 329 330 331 332 333 334 335
#
# In wait mode, block interrupt until we spin off the background process.
#
if ($waitmode) {
    $SIG{TERM} = 'IGNORE';
    $SIG{QUIT} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}

336 337 338
#
# Check for overquota; we deal with it below, cause of the batch system.
#
339
my $overquota = system("$checkquota $user_uid");
340 341

if ($overquota) {
342 343
    tberror({cause => 'user', severity => SEV_WARNING,
	     error => ['over_disk_quota', $CONTROL]},
344 345
	    "You are over your disk quota on $CONTROL; ".
	    "please login there and cleanup!");
346 347
}

348 349 350 351
#
# Temp fix; Disallow swapmod to firewalled experiments. This will come
# out later.
#
352
my $firewalled = $experiment->IsFirewalled();
353

354 355 356 357 358 359 360 361 362 363
#
# see if we've got a robot exp (this isn't the only check; if this is a
# swapmod, we've got to check tbprerun as well...
#
$robotexp = 
    TBExptContainsNodeCT($pid,$eid,'robot') || 
    TBExptContainsNodeCT($pid,$eid,'mote') || 
    TBExptContainsNodeCT($pid,$eid,'motehost') || 
    TBExptContainsNodeCT($pid,$eid,'powermon');

364 365 366 367 368
#
# We have to protect against trying to end an experiment that is currently
# in the process of being terminated. We use a "wrapper" state (actually
# a timestamp so we can say when termination was requested) since
# terminating consists of a couple of different experiment states down inside
369
# the tb scripts.
370
#
371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
$experiment->LockTables("experiments write") == 0
    or die("*** $0:\n".
	   "    Could not lock experiment tables for $pid/$eid!\n");

my $expt_head_login = $experiment->creator();
my $estate          = $experiment->state();
my $batchstate      = $experiment->batchstate();
my $expt_path       = $experiment->path();
my $isbatchexpt     = $experiment->batchmode();
my $cancelflag      = $experiment->canceled();
my $expt_locked     = $experiment->locked();
my $elabinelab      = $experiment->elabinelab();
my $lockdown        = $experiment->lockdown();
my $swappablebit    = $experiment->swappable();
my $idleswapbit     = $experiment->idleswap();
my $autoswapbit     = $experiment->autoswap();
my $swappablestr    = ( $swappablebit ? "Yes" : "No" );
my $idleswapstr     = ( $idleswapbit ? "Yes" : "No" );
my $autoswapstr     = ( $autoswapbit ? "Yes" : "No" );
my $noswap          = $experiment->noswap_reason();
my $noidleswap      = $experiment->noidleswap_reason();
my $idleswaptime    = $experiment->idleswap_timeout() / 60.0;
my $autoswaptime    = $experiment->autoswap_timeout() / 60.0;
my $rendering       = $experiment->prerender_pid();
395

396 397
if ($inout ne "out") {
    # I'm going to update this below, so fix the value before I use it.
398
    $idleswap_time = min($idleswaptime * 60, $idleswap_time);
399 400 401
    $idleswaptime = $idleswap_time / 60.0;
}

402 403
my $swapsettings = 
  "Idle-Swap:   $idleswapstr".
404
  ($idleswapbit ? ", at $idleswaptime hours\n" : " (Reason: $noidleswap)\n").
405 406
  "Auto-Swap:   $autoswapstr".
  ($autoswapbit ? ", at $autoswaptime hours\n" : "\n");
407

408
if (! chdir($workdir)) {
409
    tbdie("Could not chdir to $workdir: $!");
410 411
}

412
#
413 414 415
# This script is called from the batch daemon.
# 
if ($batch) {
416
    #
417 418 419
    # Sanity Check. If called from the daemon, must already be locked,
    # must be a batch experiment, and must be in proper state for the
    # operation requested. 
420
    #
421
    tbdie("Experiment $pid/$eid is supposed to be a batch experiment!")
422 423
	if (!$isbatchexpt);
    
424
    tbdie("Batch experiment $pid/$eid should be locked!")
425 426
	if (!defined($expt_locked) ||
	    $batchstate ne BATCHSTATE_LOCKED());
427 428
    
    tbdie("Batch experiment $pid/$eid is locked down; cannot be swapped!")
429 430
	if ($lockdown);

431
    if ($inout eq "in") {
432 433
	tbdie("Batch experiment $pid/$eid is not in the proper state!\n".
	      "Currently $estate, but should be QUEUED.")
434 435
	    if ($estate ne EXPTSTATE_QUEUED);
	
436 437 438
	tbdie({cause => 'canceled', severity => SEV_IMMEDIATE,
	       error => ['cancel_flag']},
	      "Batch experiment $pid/$eid has been canceled! Aborting.")
439
	    if ($canceled);
440 441

	# Do not allow it to swap in. What about swapout? 
442 443 444
	tbdie({type => 'primary', severity => SEV_ERROR,
	       error => ['over_disk_quota', $CONTROL]},
	      "Batch experiment cannot swap in when over quota! Aborting.")
445
	    if ($overquota);
446 447
    }
    elsif ($inout eq "out") {
448 449
	tbdie("Batch experiment $pid/$eid is not in the proper state!\n".
	      "Currently $estate, but should be ACTIVE.")
450
	    if ($estate ne EXPTSTATE_ACTIVE);
451 452
    }
    else {
453
	tbdie("Improper request from batch daemon for $pid/$eid!\n");
454 455 456 457
    }
}
else {
    if ($isbatchexpt) {
458 459 460 461
	#
	# User is requesting that a batch either be injected or paused.
	# Sanity check the state, but otherwise let the batch daemon
	# handle it.
462 463
	#
	ExitWithStatus(1, "Batch experiment $pid/$eid is still canceling!")
464
	    if ($canceled);
465

466 467 468
	ExitWithStatus(1, "Batch experiment $pid/$eid is locked down!")
	    if ($lockdown);

469
	if ($inout eq "in") {
470
	    ExitWithStatus(1,
471 472 473
			   "Batch experiment $pid/$eid must be SWAPPED to\n".
			   "QUEUE. Currently $estate.")
		if ($estate ne EXPTSTATE_SWAPPED);
474

475 476 477 478 479 480 481 482
	    #
	    # For the moment, only the creator of the batch can queue it. This
	    # avoids all kinds of problems inside the batch daemon, with it
	    # not knowing who to swap the experiment in as. This is a temporary
	    # solution, at least until I have time to decide what is correct.
	    #
	    ExitWithStatus(1,
			   "Only the creator of a batch experiment can queue it!")
483
		if ($experiment->creator() ne $user_uid);
484

485 486 487 488 489 490
	    if ($overquota) {
		tbreport(SEV_ERROR, 'over_disk_quota', $CONTROL);
		ExitWithStatus(1,
			       "Batch experiment $pid/$eid cannot swap in when ".
			       "over quota!\n")
	    }
491
	    
492
	    $experiment->SetState(EXPTSTATE_QUEUED);
493 494
	}
	elsif ($inout eq "out") {
495
	    ExitWithStatus(1,
496 497 498 499
			   "Batch experiment $pid/$eid must be ACTIVE or\n".
			   "ACTIVATING to swap out. Currently $estate.")
		if ($estate ne EXPTSTATE_ACTIVE &&
		    $estate ne EXPTSTATE_ACTIVATING);
500 501 502 503 504

	    #
	    # Since the batch daemon has control, all we can do is set
	    # the cancel bit.
	    # 
505
	    $experiment->SetCancelFlag(EXPTCANCEL_SWAP);
506 507
	}
	elsif ($inout eq "pause") {
508
	    ExitWithStatus(1,
509 510 511
			   "Batch experiment $pid/$eid must be QUEUED to\n".
			   "DEQUEUE. Currently $estate.")
		if ($estate ne EXPTSTATE_QUEUED);
512 513

	    #
514 515 516 517
	    # XXX. The batch daemon might already have the experiment, but
	    # not have shipped it off to startexp. Change the state
	    # anyway. The error will be noticed later when startexp dies,
	    # and the batch daemon gets the error back. This sucks.
518
	    #
519
	    $experiment->SetState(EXPTSTATE_SWAPPED);
520
	}
521
	elsif ($inout eq "modify") {
522
	    ExitWithStatus(1,
523 524 525 526
			   "Batch experiment $pid/$eid must be SWAPPED or\n".
			   "ACTIVE to modify. Currently $estate.")
		if (($estate ne EXPTSTATE_SWAPPED &&
		     $estate ne EXPTSTATE_ACTIVATING) ||
527
		    $batchstate ne BATCHSTATE_UNLOCKED());
528

529 530 531
	    ExitWithStatus(1,
			"Cannot modify an active firewalled experiment (yet).")
		if ($firewalled && $estate ne EXPTSTATE_SWAPPED && !$isadmin);
532

533 534 535
	    ExitWithStatus(1,
			"Cannot modify an active ElabInElab experiment (yet).")
		if ($elabinelab && $estate ne EXPTSTATE_SWAPPED && !$isadmin);
536

537 538 539 540 541 542
	    if ($overquota) {
		tbreport(SEV_ERROR, 'over_disk_quota', $CONTROL);
		ExitWithStatus(1,
			       "Cannot modify batch experiment $pid/$eid when ".
			       "over quota!\n")
	    }
543
	    
544
	    #
545
	    # Otherwise, proceed with the modify. The experiment will be
546 547
	    # locked below, and so it cannot be injected or otherwise messed
	    # with since its state is going to be changed before we unlock
548 549 550 551
	    # the experiments table. The batch daemon will leave it alone
	    # until the modify is done. If the modify fails and cannot recover
	    # it is going to get swapped out; that is okay since the batch
	    # daemon does not keep state internally. 
552
	    #
553 554
	    goto doit;
	}
555
	else {
556
	    tbdie("Operation $inout not allowed on a batch experiment!");
557
	}
558 559
	ExitWithStatus(0, 
		       "Batch experiment $pid/$eid state has been changed.\n");
560
      doit:
561
    }
562 563 564 565 566 567 568 569 570 571
    else {
	#
	# If the cancel flag is set, then user must wait for that to
	# clear before we can do anything else.
	#
	ExitWithStatus(1,
		       "Experiment $pid/$eid has its cancel flag set!.\n".
		       "You must wait for that to clear before you can swap\n".
		       "or modify the experiment.\n")
	    if ($canceled);
572

573 574 575 576
 	ExitWithStatus(1,
		       "Experiment $pid/$eid is locked down; cannot swap!\n")
	    if ($lockdown);

577 578 579 580 581 582 583 584 585 586
	#
	# Check the state for the various operations.
	#
	if (!$force) {
	  SWITCH: for ($inout) {
	      /^in$/i && do {
		  if ($estate ne EXPTSTATE_SWAPPED()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped out!");
		  }
587 588 589 590 591 592
		  if ($overquota) {
		      tbreport(SEV_ERROR, 'over_disk_quota', $CONTROL);
		      ExitWithStatus(1,
				     "Experiment $pid/$eid cannot swap in when ".
				     "over quota!\n")
		  }
593
		  
594 595 596 597
		  last SWITCH;
	      };
	      /^out$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE() &&
Leigh B. Stoller's avatar
Leigh B. Stoller committed
598
 		      $estate ne EXPTSTATE_PANICED() &&
599 600 601 602 603 604
		      $estate ne EXPTSTATE_ACTIVATING()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped in ".
				     "or activating!\n");
		  }
		  
Leigh B. Stoller's avatar
Leigh B. Stoller committed
605 606 607 608 609 610 611 612 613 614 615 616
 		  #
 		  # Must be an admin person to swap out an experiment that
 		  # has had its panic button pressed.
 		  #
 		  if ($estate eq EXPTSTATE_PANICED() && !$isadmin) {
 		      ExitWithStatus(1,
 				     "Experiment $pid/$eid had its panic ".
 				     "button pressed!\n".
 				     "Only a testbed administrator can swap ".
 				     "this experiment out.");
 		  }

617 618 619 620 621
		  if ($estate eq EXPTSTATE_ACTIVATING()) {
		      #
		      # All we can do is set the cancel flag and hope that
		      # it gets noticed. We do not wait. 
		      # 
622
		      $experiment->SetCancelFlag(EXPTCANCEL_SWAP);
623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645
		      
		      ExitWithStatus(0,
				     "Experiment $pid/$eid swapin has been  ".
				     "marked for cancelation.\n".
				     "You will receive email when the original ".
				     "swap request has finished.");
		  }
		  last SWITCH;
	      };
	      /^restart$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid is not swapped in!");
		  }
		  last SWITCH;
	      };
	      /^modify$/i && do {
		  if ($estate ne EXPTSTATE_ACTIVE() &&
		      $estate ne EXPTSTATE_SWAPPED()) {
		      ExitWithStatus(1,
				     "Experiment $pid/$eid must be ACTIVE or\n".
				     "SWAPPED to modify!\n");
		  }
646
		  ExitWithStatus(1,
647 648 649 650
			"Cannot modify an active firewalled experiment (yet).")
		      if ($firewalled &&
			  $estate ne EXPTSTATE_SWAPPED && !$isadmin);

651
		  ExitWithStatus(1,
652 653 654
			"Cannot modify an active ElabInElab experiment (yet).")
		      if ($elabinelab &&
			  $estate ne EXPTSTATE_SWAPPED && !$isadmin);
655

656 657 658 659 660 661
		  if ($overquota) {
		      tbreport(SEV_ERROR, 'over_disk_quota', $CONTROL);
		      ExitWithStatus(1,
				     "Experiment $pid/$eid cannot be modified ".
				     "when over quota!\n")
		  }
662
		  
663 664
		  last SWITCH;
	      };
665
	      tbdie("Missing state check for action: $action");
666
	  }
667 668
	}
    }
669 670
}

671 672 673 674 675 676 677
#
# Determine the temporary and next state for experiment. If the experiment
# is a batch experiment, then the next state is actually handled by the
# batch daemon, but we still have to deal with the temporary state. 
#
SWITCH: for ($inout) {
    /^in$/i && do {
678
	$nextswapstate = EXPTSTATE_ACTIVATING();
679 680 681
	last SWITCH;
    };
    /^out$/i && do {
682
	$nextswapstate = EXPTSTATE_SWAPPING();
683 684 685
	last SWITCH;
    };
    /^restart$/i && do {
686
	$nextswapstate = EXPTSTATE_RESTARTING();
687 688 689
	last SWITCH;
    };
    /^modify$/i && do {
690 691
	$nextswapstate = (($estate eq EXPTSTATE_SWAPPED()) ?
			  EXPTSTATE_MODIFY_PARSE() : EXPTSTATE_MODIFY_REPARSE());
692 693
	last SWITCH;
    };
694
    tbdie("Missing state check for action: $action");
695
}
696 697
 
# Update idleswap_timeout to whatever the current value is.
698
if ($inout ne "out") {
699 700
    $experiment->UpdateIdleSwapTime($idleswap_time) == 0
	or tbdie("Could not update idleswap timeout for $pid/$eid");
701
}
702

703 704 705 706 707 708
#
# On a failure, we go back to this swapstate. Might be modified below.
# 
$termswapstate = $estate;

# Lock the record, set the nextstate, and unlock the table.
709
$experiment->Lock($nextswapstate) == 0
710 711 712
    or tbdie({type => 'secondary', severity => SEV_SECONDARY,
	      error => ['set_experiment_state_failed', $nextswapstate]},
	     "Failed to set experiment state to $nextswapstate");
713 714 715 716 717 718 719

#
# At this point, we need to force a cleanup no matter how we exit.
# See the END block below.
#
$justexit = 0;

720
$experiment->UnLockTables();
721

722 723 724 725 726 727
# Need the previous swapper for rollback below. Safe now that tables unlocked.
my $last_swapper = User->Lookup($experiment->swapper_idx());
if (! defined($last_swapper)) {
    tbdie("Error looking up object for last swapper!");
}

728 729
#
# XXX - At this point a failure is going to leave things in an
730 731 732 733
# inconsistent state. Be sure to call fatal() only since we are
# going into the background, and we have to send email since no
# one is going to see printed error messages (output goes into the
# log file, which will be sent along in the email). 
734 735
#

736 737
if ($inout eq "in") {
    $action = "swapped in";
738
    $tag    = "swapin";
739 740 741
}
if ($inout eq "out") {
    $action = "swapped out";
742
    $tag    = "swapout";
743 744 745 746
}
if ($inout eq "restart") {
    $action = "restarted";
}
747 748
if ($inout eq "modify") {
    $action = "modified";
749
    $tag    = "swapmod";
750
}
751

752 753 754 755 756 757 758 759 760
#
# Get email address of the experiment head, which may be different than
# the person who is actually terminating the experiment, since its polite
# to let the original creator know whats going on. 
#
my $expt_head_name;
my $expt_head_email;

if (! UserDBInfo($expt_head_login, \$expt_head_name, \$expt_head_email)) {
761
    tbwarn("Could not determine name/email for $expt_head_login.");
762 763 764 765
    $expt_head_name  = "TBOPS";
    $expt_head_email = $TBOPS;
}

766 767 768
#
# Before going to background, we have to copy out the NS file!
#
769
if ($inout eq "modify" && defined($modnsfile)) {
770 771
    unlink($modnsfile);
    if (system("/bin/cp", "$tempnsfile", "$modnsfile")) {
772
	fatal("Could not copy $tempnsfile to $modnsfile");
773 774 775 776
    }
    chmod(0664, "$modnsfile");
}

777 778 779
#
# If not in batch mode, go into the background. Parent exits.
#
780
if (! $batch && ! $template_mode) {
781 782 783 784
    # Cleanup
    $experiment->CleanLogFiles() == 0
	or fatal("Could not clean up logfiles!");

785 786 787 788 789
    if ($experiment->CreateLogFile("swapexp", \$logname) != 0) {
	fatal("Could not create logfile!");
    }
    $experiment->SetLogFile($logname);
    $experiment->OpenLogFile($logname);
790

791 792
    if (my $childpid = TBBackGround($logname)) {
	#
793 794
	# Parent exits normally, unless in waitmode. We have to set
	# justexit to make sure the END block below does not run.
795
	#
796 797
	$justexit = 1;

798
	if (!$waitmode) {
799 800 801
	    print("Experiment $pid/$eid is now being $action.\n".
		  "You will be notified via email when the this is done.\n")
		if (! $quiet);
802 803
	    exit(0);
	}
804 805 806 807 808 809 810 811
	print("Waiting for experiment $eid to finish its swap${action}\n")
	    if (! $quiet);
	    
	if (isatty(STDIN) && !$quiet) {
	    print("You may type ^C at anytime; you will be notified via email.".
		  "\n".
		  "You will not actually interrupt the experiment itself.\n");
	}
812 813 814 815 816 817 818 819 820 821 822 823
	
	# Give child a chance to run.
	select(undef, undef, undef, 0.25);
	
	#
	# Reset signal handlers. User can now kill this process, without
	# stopping the child.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{QUIT} = 'DEFAULT';

824
	#
825
	# Wait until child exits or until user gets bored and types ^C.
826
	#
827 828
	waitpid($childpid, 0);
	
829 830
	print("Done. Exited with status: $?\n")
	    if (! $quiet);
831 832 833 834 835 836 837 838 839 840

	my $exit_code = $? >> 8;

	if ($exit_code != 0) {
	    my $d = tblog_lookup_error();
	    print tblog_format_error($d);
	}

	exit $exit_code;

841
    }
842
    TBdbfork();
843 844
}

845 846 847 848 849 850 851 852
#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872
#
# We need to catch TERM cause sometimes shit happens and we have to kill
# an experiment swap that is hung or otherwise scrogged. Rather then 
# trying to kill off the children one by one, lets arrange to catch it
# here and send a killpg to the children. This is not to be done lightly,
# cause it can leave things worse then they were before!
#
sub handler ($) {
    my ($signame) = @_;
    
    $SIG{TERM} = 'IGNORE';
    my $pgrp = getpgrp(0);
    kill('TERM', -$pgrp);
    sleep(1);
    $signaled = 1;
    fatal("Caught SIG${signame}! Killing experiment setup ...");
}
$SIG{TERM} = \&handler;
$SIG{QUIT} = 'DEFAULT';

873 874 875 876
#
# Gather stats; start clock ticking
#
if ($inout eq "in") {
877
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPIN, 0,
878 879 880
		    TBDB_STATS_FLAGS_START);
}
elsif ($inout eq "out") {
881
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPOUT, 0,
882 883 884
		    TBDB_STATS_FLAGS_START);
}
elsif ($inout eq "modify") {
885
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPMODIFY, 0,
886 887 888
		    TBDB_STATS_FLAGS_START);
}

889 890 891
#
# Remove old report file since its contents are going to be invalid.
#
892
if ($inout ne "restart" && -e $repfile) {
893 894 895
    unlink("$repfile");
}

896 897 898 899
#
# Sanity check states in case someone changes something.
#
if ($inout eq "out") {
900
    my $optarg = (($force || $idleswap) ? "-force" : "");
901

902
    if ($experiment->Swap("out", $optarg) != 0) {
903
	$errorstat = $? >> 8;
904 905 906
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['tbswap_out_failed']},
	      "tbswap out failed!");
907
    }
908 909 910 911 912

    #
    # Add the files that have been detected by tracing to the archive.
    #
    if (libArchive::TBExperimentArchiveAddTracedFiles($pid, $eid) < 0) {
913 914 915
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['archive_op_failed', 'add_traced_files', undef, undef]},
	      "Failed to add traced files to the experiment archive!");
916 917
    }

918
    #
919
    # Add the experiment directory.
920 921
    #
    if (libArchive::TBExperimentArchiveAddUserFiles($pid, $eid) < 0) {
922 923 924
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['archive_op_failed', 'add_user_files', undef, undef]},
	      "Failed to add user specified files to the experiment archive!");
925 926
    }

927
    $experiment->SetState(EXPTSTATE_SWAPPED) == 0
928 929 930
	or fatal({type => 'secondary', severity => SEV_SECONDARY,
		  error => ['set_experiment_state_failed', EXPTSTATE_SWAPPED()]},
	         "Failed to set experiment state to " . EXPTSTATE_SWAPPED());
931 932 933
    
    $experiment->ClearPanicBit() == 0
	or fatal("Failed to clear the panic bit!");
934
}
935
elsif ($inout eq "in") {
936
    GatherSwapStats($pid, $eid, $user_uid,
937
		    TBDB_STATS_SWAPIN, 0, TBDB_STATS_FLAGS_PRESWAPIN);
938 939 940

    # Set the swapper now so that nodes use the proper uid. If the swapin
    # fails, we leave the swapper as is, since its harmless and informative.
941
    $experiment->SetSwapper($this_user);
942 943

    if ($experiment->Swap("in") != 0) {
944
	$errorstat = $? >> 8;
945 946 947
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['tbswap_in_failed']},
	      "tbswap in failed!");
948
    }
949 950
    
    $experiment->SetState(EXPTSTATE_ACTIVE) == 0
951 952 953
	or fatal({type => 'secondary', severity => SEV_SECONDARY,
		  error => ['set_experiment_state_failed', EXPTSTATE_ACTIVE()]},
	         "Failed to set experiment state to " . EXPTSTATE_ACTIVE());
954 955
    
    $experiment->Report($repfile, "-b");
956
}
957
elsif ($inout eq "modify") {
958 959 960
    #
    # Prepare the Archive for the swapmod, in case we have to "roll back".
    #
961
    print "Doing a preswapmod on the experiment archive ...\n";
962 963 964 965
    if (libArchive::TBExperimentArchivePreSwapMod($pid, $eid) < 0) {
	fatal("Failed to do a preswapmod on the experiment archive!");
    }

966
    GatherSwapStats($pid, $eid, $user_uid,
967 968
		    TBDB_STATS_SWAPMODIFY, 0, TBDB_STATS_FLAGS_PREMODIFY);

969 970 971 972 973
    # Gather up some firewall state for later comparison.
    if (GatherFWinfo() < 0) {
	fatal("Could not gather firewall info; cannot safely continue!");
    }

974
    print "Backing up old experiment state ... " . TBTimeStamp() . "\n";
975 976
    $experiment->BackupVirtualState() == 0
	or fatal("Could not backup experiment state; cannot safely continue!");
977 978

    #
979
    # Rerun tbprerun if modifying, but only if new NS file provided.
980 981
    # Yep, we allow reswap without changing the NS file. For Shashi and SIM.
    # Note that tbprerun kills the renderer if its running.
982
    #
983
    if (defined($modnsfile)) {
984
	if ($experiment->PreRun($modnsfile) != 0) {
985
	    print STDOUT "Modify Error: tbprerun failed.\n";
986
	  FWHOSED:
987 988
	    print STDOUT "Recovering experiment state...\n";

989 990
	    if ($experiment->RemoveVirtualState() ||
		$experiment->RestoreVirtualState()) {
991 992 993 994 995 996 997 998 999 1000 1001 1002
		$modifyHosed = 1;
		fatal("Experiment state could not be restored!");
		# Never returns;
	    }
	    #
	    # If the renderer was running when we started the swapmod, then we
	    # want to restart it. If it was stopped, then the renderer info
	    # was captured with the rest of the virtual state (restored above).
	    #
	    system("prerender -t $pid $eid")
		if ($rendering);

1003
	    $modifyError = "Update aborted; old virtual state restored.";
1004 1005 1006
	    fatal({type => 'secondary', severity => SEV_SECONDARY,
		   error => ['update_aborted', 'virtual_state_restored']},
		  $modifyError);
1007
	    # Never returns;
1008
	}
1009 1010 1011 1012 1013 1014 1015 1016 1017
	#
	# Okay, whenever a new NS file is presented, we need to do some
	# checks on the firewall to make sure the user is not trying to
	# do something "unsafe". 
	#
	if (CheckFWinfo($estate) != 0) {
	    # All the stuff for recovering is right above, so go there. 
	    goto FWHOSED;
	}
1018 1019
    }

1020
    #
1021
    # Our next state depends on whether the experiment was active or swapped.
1022
    #
1023
    if ($estate eq EXPTSTATE_SWAPPED) {
1024 1025
	$experiment->SetState(EXPTSTATE_SWAPPED) == 0
	    or fatal("Failed to set experiment state to ".EXPTSTATE_SWAPPED());
1026 1027
    }
    else {
1028 1029 1030
	$experiment->SetState(EXPTSTATE_MODIFY_RESWAP) == 0
	    or fatal("Failed to set experiment state to " .
		     EXPTSTATE_MODIFY_RESWAP());
1031

1032 1033 1034
	# Set the swapper now so that nodes use the proper uid. If the
	# swapin fails, we need to reset the swapper back so that he
	# is charged appropriately.
1035
	$experiment->SetSwapper($this_user);
1036

1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
	my $optarg = "";
	#
	# For elabinelab experiments; ignore reboot/eventsys_restart,
	# and force noreconfig; none of it will work or make sense. 
	#
	if ($elabinelab) {
	    $optarg = "-noreconfig";
	}
	else {
	    $optarg  = ($reboot ? "-reboot" : "");
	    $optarg .= ($eventsys_restart ? " -eventsys_restart" : "");
	}
1049

1050
	if ($experiment->Swap("update", $optarg) == 0) {
1051
	    #
1052 1053
	    # Success. Set the state back to active cause thats where it
	    # started.
1054 1055 1056 1057
	    #
	    $experiment->SetState(EXPTSTATE_ACTIVE) == 0
		or fatal("Failed to set experiment state to " .
			 EXPTSTATE_ACTIVE());
1058 1059 1060 1061 1062
	    $estate = EXPTSTATE_ACTIVE;
	}
	else {
	    $modifyError = $errorstat = $? >> 8;
	    print STDOUT "Modify Error: tbswap update failed.\n";
1063

1064
	    #
1065 1066 1067
	    # tbswap either restored the experiment to the way it was,
	    # or it swapped it out completely. In either case, it has
	    # also restored the virtual state. 
1068
	    # 
1069 1070 1071 1072 1073 1074 1075
	    # Icky. Magic return code that says tbswap swapped it out.
	    # We do not want tbswap to muck with states anymore, so
	    # need to know what it did. At some point we should clean
	    # up the exit reporting! Anyway, fatal() needs to know the
	    # the right state to go back to (no longer ACTIVE).
	    #
	    if ($errorstat & 0x40) {
1076 1077
		$estate = EXPTSTATE_SWAPPED;
		$termswapstate = EXPTSTATE_SWAPPED;
1078
		$modifySwapped = 1;
1079
                # Old accounting info.
1080
		$experiment->SetSwapTime();
1081
		tbreport(SEV_SECONDARY, 'update_aborted', 'swapped_out');
1082 1083 1084
		$modifyError = "Update aborted; experiment swapped out.";
	    }
	    else {
1085
		tbreport(SEV_SECONDARY, 'update_aborted', 'state_restored');
1086
		$modifyError = "Update aborted; old state restored.";
1087 1088

		# Reset the swapper since the experiment is still running.
1089
		$experiment->SetSwapper($last_swapper);
1090
	    }
1091
	}
1092 1093
    }

1094 1095 1096 1097 1098 1099 1100 1101
    #
    # We need to rerender only if the NS file was changed (ran tbprerun),
    # If the swapmod succeeded, then unconditionally run the renderer. If
    # swap failed, then need to run the renderer only if we stopped one in
    # progress.
    #
    if (defined($modnsfile)) {
	system("prerender -t $pid $eid")	
1102
	    if (!defined($modifyError) || $rendering);
1103
    }
1104 1105 1106 1107 1108 1109 1110

    #
    # Swapmod failed ...
    #
    fatal($modifyError)
	if (defined($modifyError));

1111 1112 1113 1114 1115 1116 1117 1118
    #
    # Move the temporary ns file to its real name.
    #
    if (defined($modnsfile)) {
	unlink($nsfile);
	if (system("/bin/mv", "$modnsfile", "$nsfile")) {
	    fatal("Could not mv $modnsfile to $nsfile");
	}
1119
	unlink("nsfile.ns");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1120
	if (system("/bin/cp", "$nsfile", "nsfile.ns")) {
1121 1122
	    fatal("Could not cp $nsfile to nsfile.ns");
	}
1123 1124
    }

1125 1126
    $experiment->ClearBackupState();
    $experiment->Report($repfile, "-b");    
1127
}
1128
else { # $inout eq "restart" assumed.
1129
    print STDOUT "Running 'tbrestart $pid $eid'\n";
1130
    if (system("$tbdir/tbrestart $pid $eid") != 0) {
1131
	fatal("tbrestart failed!");
1132
    }
1133 1134
    $experiment->SetState(EXPTSTATE_ACTIVE) == 0
	or fatal("Failed to set experiment state to " . EXPTSTATE_ACTIVE());
1135
}
1136

1137 1138 1139 1140 1141
# Latest log is always called the same thing.
if (defined($logname)) {
    system("cp -fp $logname $workdir/" . EXPTLOGNAME());
}

1142 1143 1144
#
# Try to copy off the files for testbed information gathering.
#
1145
$experiment->SaveLogFiles();
1146 1147 1148 1149 1150 1151

#
# Make a copy of the work dir in the user visible space so the user
# can see the log files. This overwrites existing files of course,
# but thats okay.
#
1152
$experiment->CopyLogFiles();
1153

1154
# And tell the archive library to add the experiment directory.
1155 1156
libArchive::TBExperimentArchiveAddUserFiles($pid, $eid) == 0
    or fatal("Failed to add user archive files to the archive!");
1157

1158 1159 1160 1161 1162
#
# The archive gets different treatment when doing a swapmod.
#
if ($inout eq "modify") {
    print "Doing a commit on the previous experiment archive ...\n";
1163 1164
    libArchive::TBExperimentArchiveSwapModCommit($pid, $eid,
						 $estate eq EXPTSTATE_SWAPPED)
1165 1166 1167
	== 0 or	fatal({type => 'secondary', severity => SEV_SECONDARY,
		       error => ['archive_op_failed', 'commit', undef, undef]},
		      "Failed to commit experiment archive!");
1168 1169 1170
}

#
1171 1172 1173
# Do a SavePoint on the experiment files. For both of these archive
# operations, skip if in template mode; we have too many tags building
# up from too many operations. Leave it to the wrapper script.
1174
#
1175 1176 1177 1178 1179 1180 1181
if (! $template_mode) {
    print "Doing a savepoint on the experiment archive ...\n";
    if (libArchive::TBExperimentArchiveSavePoint($pid, $eid, $tag) < 0) {
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['archive_op_failed', 'savepoint', undef, undef]},
	      "Failed to do a savepoint on the experiment archive!");
    }
1182 1183 1184
}

# Commit the archive after swapout
1185
if ($inout eq "out" && !$template_mode) {
1186 1187
    print "Doing a commit on the experiment archive ...\n";
    libArchive::TBCommitExperimentArchive($pid, $eid, $tag) == 0 or
1188 1189 1190
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['archive_op_failed', 'commit', undef, undef]},
	      "Failed to commit experiment archive!");
1191 1192
}

1193 1194 1195 1196
#
# Gather stats. 
#
if ($inout eq "in") {
1197
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPIN, 0);
1198 1199
}
elsif ($inout eq "out") {
1200
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPOUT, 0,
1201
		    ($idleswap ? TBDB_STATS_FLAGS_IDLESWAP() : 0));
1202 1203
}
elsif ($inout eq "modify") {
1204
    GatherSwapStats($pid, $eid, $user_uid, TBDB_STATS_SWAPMODIFY, 0);
1205 1206
}

1207
# Accounting info. For swapout, must be after its done with.
1208
$experiment->SetSwapInfo($this_user);
1209 1210

#
1211
# In batch mode, just exit without sending email or unlocking. The
1212
# batch daemon will take care of that.
1213 1214 1215 1216 1217
#
if ($batch) {
    exit(0);
}

1218 1219 1220
#
# Clear the log file so the web page stops spewing. 
#
1221 1222
$experiment->CloseLogFile()
    if (defined($logname));
1223

1224 1225 1226
#
# Must unlock before exit.
#
1227
$experiment->Unlock();
1228 1229 1230 1231 1232

#
# Since the swap completed, clear the cancel flag. This must be done
# after we change the experiment state (above). 
#
1233
$experiment->SetCancelFlag(EXPTCANCEL_CLEAR);
1234

1235 1236 1237
exit(0)
    if ($template_mode);

1238 1239
print "Swap Success!\n";

1240 1241 1242 1243
#
# Send email notification to user.
#
my $message =
1244 1245
    "Experiment $eid in project $pid has been ";

1246
if ($inout eq "out" && ($idleswap || $autoswap || $force) ) {
1247
    $message .= "forcibly swapped out by\nEmulab";
1248 1249 1250 1251 1252
    if ($idleswap) {
	$message .= " because it was idle for too long (Idle-Swap).\n".
	  "(See also the Idle-Swap info in \n".
	  "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n";
    } elsif ($autoswap) {
1253 1254
	$message .= " because it exceeded its Maximum Duration.\n".
	  "(See also the Max. Duration info in \n".
1255 1256 1257 1258 1259
	  "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n";
    } elsif ($force) {
	$message .= ". (See also our Node Usage Policies in \n".
	  "$TBDOCBASE/docwrapper.php3?docname=swapping.html )\n";
    }
1260 1261 1262 1263 1264
}
else {
    $message .= "$action.\n";
}