endexp.in 18.1 KB
Newer Older
1
#!/usr/bin/perl -wT
Leigh Stoller's avatar
Leigh Stoller committed
2
#
3
# Copyright (c) 2000-2013 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh Stoller's avatar
Leigh Stoller committed
23
#
24 25
use English;
use Getopt::Std;
26
use POSIX qw(isatty setsid);
27
use strict;
28 29 30 31

#
# This gets invoked from the Web interface. Terminate an experiment.
# Most of the STDOUT prints are never seen since the web interface
32 33
# reports only errors, but this script is also intended to be run by the
# user someday. Perhaps.
34 35 36 37 38 39 40 41
#
# The -b (batch) argument is so that this script can be part of a batchmode
# that starts/ends experiments offline. In that case, we don't want to put
# it into the background and send email, but just want an exit status 
# returned to the batch system.
#
sub usage()
{
42
    print(STDERR
43
	  "Usage: endexp [-q] [-p] [-b | -w] [-N] <eid>\n".
44 45
	  "switches and arguments:\n".
	  "-w       - wait for non-batchmode experiment terminate\n".
46
	  "-q       - be less chatty\n".
47
	  "-N       - Suppress most email to the user and testbed-ops\n".
48
	  "<eid>    - The experiment id\n");
49 50
    exit(-1);
}
51

52
sub fatal($;$);
53

54
my $optlist  = "bwqxpNfog";
55 56
my $waitmode = 0;
my $batch    = 0;
57
my $quiet    = 0;
58
my $purge    = 0;
59 60 61
my $force    = 0;
my $genimode = 0;
my $lockforce= 0;
62
my $template_mode = 0;
63
my $noemail  = 0;
64

65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
#
# Exit codes are important; they tell the web page what has happened so
# it can say something useful to the user. Fatal errors are mostly done
# with die(), but expected errors use this routine. At some point we will
# use the DB to communicate the actual error.
#
# $status < 0 - Fatal error. Something went wrong we did not expect.
# $status = 0 - Termination is proceeding in the background. Notified later.
# $status > 0 - Expected error. User not allowed for some reason. 
# 
sub ExitWithStatus($$)
{
    my ($status, $message) = @_;
    
    if ($status < 0) {
	die("*** $0:\n".
	    "    $message\n");
    }
    else {
	print STDERR "$message\n";
    }
    exit($status);
}

89 90 91 92 93 94
#
# Configure variables
#
my $TB     = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS  = "@TBOPSEMAIL@";
95
my $TBLOGS = "@TBLOGSEMAIL@";
96

97 98 99 100 101 102
#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
103
use libtblog;
104
use libArchive;
105
use User;
106
use Template;
107
use Experiment;
108
use EmulabFeatures;
109

110 111 112
my $tbdir       = "$TB/bin/";
my $tbdata      = "tbdata";
my $archcontrol = "$TB/bin/archive_control";
113
my $nextstate;
114
my $logname;
115
my $logfile;
116
    
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
#
# Untaint the path
# 
$ENV{'PATH'} = '/bin:/usr/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

#
# Turn off line buffering on output
#
$| = 1;

#
# Parse command arguments. Once we return from getopts, all that should
# left are the required arguments.
#
132
my %options = ();
133 134 135
if (! getopts($optlist, \%options)) {
    usage();
}
136
if (@ARGV != 1) {
137 138 139
    usage();
}
if (defined($options{"b"})) {
140 141
    $batch = 1;
}
142 143 144
if (defined($options{"g"})) {
    $genimode = 1;
}
145 146 147
if (defined($options{"p"})) {
   $purge = 1;
}
148 149 150 151 152 153
if (defined($options{"f"})) {
   $force = 1;
}
if (defined($options{"o"})) {
    $lockforce = 1;
}
154 155
if (defined($options{"w"})) {
    $waitmode = 1;
156
}
157 158 159
if (defined($options{"q"})) {
    $quiet = 1;
}
160 161 162
if (defined($options{"x"})) {
    $template_mode = 1;
}
163 164 165
if (defined($options{"N"})) {
    $noemail = 1;
}
166 167
usage()
    if ($waitmode && $batch);
168

169 170 171 172
# Map invoking user to object.
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    fatal("You ($UID) do not exist!");
173
}
174 175 176 177
my $isadmin    = TBAdmin();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
my $user_uid   = $this_user->uid();
178 179 180 181

#
# Grab the experiment.
#
182
my $experiment = Experiment->Lookup($ARGV[0]);
183 184
if (! $experiment) {
    die("*** $0:\n".
185
	"    No such experiment in the Emulab Database.\n");
186
}
187 188 189
my $pid     = $experiment->pid();
my $eid     = $experiment->eid();
my $swapper = $experiment->GetSwapper();
190

191 192 193 194 195
my $workdir = $experiment->WorkDir();
my $userdir = $experiment->UserDir();
my $infodir = $experiment->InfoDir();

# Sanity check.
196
if (! -e $workdir) {
197
    die("*** $0:\n".
198
	"    $pid/$eid is missing a critical directory!\n");
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
}
if (! -e $infodir) {
    if (! -e "$TB/expinfo/$pid") {
	die("*** $0:\n".
	    "    $TB/expinfo/$pid has not been created yet!\n".
	    "    Did you run 'fixexpinfo' like you were supposed to?\n");
    }
    #
    # This is not going to happen unless a devel tree has been
    # mixed with the main tree (or another devel tree). Try to
    # recover, bail of not possible.
    #
    if (system("/bin/mkdir -m 777 -p $infodir")) {
	die("*** $0:\n".
	    "    $infodir was missing and cannot be created!\n");
    }
}

217 218 219 220
#
# See if this is a template instance; error if the -x option not provided,
# since that means user is trying to self-terminate; not in the program.
#
221
if ($experiment->IsInstance() && !$template_mode) {
222
    die("*** $0:\n".
223
	"    $pid/$eid is a template instance; use another command!\n");
224
}
225

226 227 228 229 230 231 232 233
#
# Not allowed to terminate an experiment that is sharing vlans.
#
if ($experiment->SharingVlans()) {
    die("*** $0:\n".
	"    $pid/$eid is sharing vlans. Clear those first!\n");
}

234
#
Leigh Stoller's avatar
Leigh Stoller committed
235
# Verify that this person is allowed to end the experiment.
236
#
237
if ($UID && !$isadmin &&
238
    ! $experiment->AccessCheck($this_user, TB_EXPT_DESTROY)) {
239 240
    die("*** $0:\n".
	"    You do not have permission to end this experiment!\n");
241 242
}

243 244 245 246 247 248 249 250
#
# In wait mode, block interrupt until we spin off the background process.
#
if ($waitmode) {
    $SIG{TERM} = 'IGNORE';
    $SIG{QUIT} = 'IGNORE';
    $SIG{INT}  = 'IGNORE';
}
251

252
#
Leigh Stoller's avatar
Leigh Stoller committed
253 254 255
# We have to protect against trying to end an experiment that is currently
# in the process of being terminated. We use a "wrapper" state (actually
# a timestamp so we can say when termination was requested) since
256
# terminating consists of a couple of different experiment states down inside
Leigh Stoller's avatar
Leigh Stoller committed
257 258
# the tb scripts. 
#
259
$experiment->LockTables() == 0
260 261
    or die("*** $0:\n".
	   "    Could not lock experiment tables for $pid/$eid!\n");
Leigh Stoller's avatar
Leigh Stoller committed
262

263 264 265 266 267 268 269 270 271
my $expt_head_login = $experiment->creator();
my $estate          = $experiment->state();
my $batchstate      = $experiment->batchstate();
my $expt_path       = $experiment->path();
my $isbatchexpt     = $experiment->batchmode();
my $cancelflag      = $experiment->canceled();
my $expt_locked     = $experiment->locked();
my $elabinelab      = $experiment->elabinelab();
my $lockdown        = $experiment->lockdown();
272 273 274 275 276

#
# Batch experiments get a different protocol to avoid races with the
# batch daemon. We can kill the experiment directly, but only if the
# batch daemon is not currently working on it. In this case, its the
277
# same as killing an experiment that is SWAPPED.
278
#
279
# XXX: This script is run from the batch daemon. 
280
#
281
if ($batch) {
282
    #
283 284
    # Sanity Check. If called from the daemon, must already be locked,
    # must be a batch experiment, and must be ACTIVE or SWAPPED.
285
    #
286 287 288 289 290 291 292 293 294
    die("*** $0:\n".
	"    Experiment $pid/$eid is supposed to be a batch experiment!\n")
	if (!$isbatchexpt);
    
    die("*** $0:\n".
	"    Batch experiment $pid/$eid should be locked!\n")
	if (!defined($expt_locked) ||
	    $batchstate ne BATCHSTATE_LOCKED());
	
295 296 297 298
    die("*** $0:\n".
	"    Batch experiment $pid/$eid is locked down; cannot be swapped!\n")
	if ($lockdown);

299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
    die("*** $0:\n".
	"    Batch experiment $pid/$eid is not in the correct state!\n".
	"    Currently $estate, but should be SWAPPED,QUEUED, or ACTIVE\n")
	if ($estate ne EXPTSTATE_ACTIVE &&
	    $estate ne EXPTSTATE_QUEUED &&
	    $estate ne EXPTSTATE_SWAPPED);
}
else {
    #
    # Called from user (via web interface).
    #
    if ($isbatchexpt) {
	#
	# This cancels a batchexp, telling the daemon if it has control.
	# If the daemon is not currently working on the experiment, we
	# can do it right away in this script. Otherwise must wait.
	#
316 317 318
	ExitWithStatus(1, "Batch experiment $pid/$eid is still canceling!")
	    if ($cancelflag);

319 320 321
	ExitWithStatus(1, "Batch experiment $pid/$eid is locked down!")
	    if ($lockdown);

322
	#
323 324 325
	# Set the canceled flag. This will prevent the batch_daemon
	# from trying to run it (once the table is unlocked). It might
	# already be running, but we deal with that by looking at the
326
	# state below.
327
	#
328
	$experiment->SetCancelFlag(EXPTCANCEL_TERM);
329

330
	#
331
	# If the state is QUEUED or SWAPPED, we can do it right away.
332 333
	# Otherwise, have to let the batch daemon deal with it.
	# 
334 335 336 337
	ExitWithStatus(0, 
		       "Batch experiment $pid/$eid has been canceled.\n".
		       "You will receive email when the experiment is\n".
		       "torn down and you can reuse the experiment name.")
338 339
	    if (($estate ne EXPTSTATE_QUEUED &&
		 $estate ne EXPTSTATE_SWAPPED) ||
340
		$batchstate ne BATCHSTATE_UNLOCKED());
341
    }
342 343
    else {
	#
344 345
	# If the cancel flag is set, then user must wait for that to clear 
	# before we can do anything else.
346 347 348 349 350 351 352
	#
	ExitWithStatus(1,
		       "Experiment $pid/$eid has its cancel flag set!\n".
		       "You must wait for that to clear before you can ".
		       "terminate the experiment.\n")
	    if ($cancelflag);
	
353 354
 	ExitWithStatus(1,
		       "Experiment $pid/$eid is locked down; cannot swap!\n")
355
	    if ($lockdown && !$lockforce);
356

357 358 359 360 361 362 363 364
	#
	# Must be unlocked if called by the user.
	#
	ExitWithStatus(1,
		       "Experiment $pid/$eid went into transition at ".
		       "$expt_locked!\n".
		       "You must wait until it is no longer in transition.\n")
	    if (defined($expt_locked));
365

366 367 368 369 370
	#
	# Okay, check state. We do not allow termination to start when the
	# experiment is in transition. A future task would be to allow this,
	# but for now it is not allowed. 
	#
371 372
	ExitWithStatus(1,
		       "Experiment $pid/$eid is currently in transition.\n".
373 374 375 376 377
		       "You must wait until it is no longer $estate!")
	    if ($estate ne EXPTSTATE_SWAPPED() &&
		$estate ne EXPTSTATE_NEW() &&
		$estate ne EXPTSTATE_TERMINATED() &&
		$estate ne EXPTSTATE_ACTIVE());
378 379 380 381 382 383 384 385 386 387 388 389
	
	#
	# Must be an admin person to swap out an experiment that
	# has had its panic button pressed.
	#
	if ($estate eq EXPTSTATE_PANICED() && !$isadmin) {
	    ExitWithStatus(1,
			   "Experiment $pid/$eid had its panic ".
			   "button pressed!\n".
			   "Only a testbed administrator can swap ".
			   "this experiment out.");
	}
390
    }
Leigh Stoller's avatar
Leigh Stoller committed
391 392 393
}

#
394 395
# Lock the experiment and change state so no one can mess with it. We need
# to determine our next state before we unlock the table. 
396
#
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
if ($estate eq EXPTSTATE_ACTIVE) {
    $nextstate = EXPTSTATE_SWAPPING;
}
elsif ($estate eq EXPTSTATE_SWAPPED || 
       $estate eq EXPTSTATE_QUEUED) {
    $nextstate = EXPTSTATE_TERMINATING;
}
elsif ($estate eq EXPTSTATE_NEW ||
       $estate eq EXPTSTATE_TERMINATED) {
    $nextstate = EXPTSTATE_TERMINATED;
}
else {
    die("*** $0:\n".
	"    Experiment $pid/$eid appears to be in the wrong state: $estate\n");
}
412
# Unlock tables at same time.
413 414 415 416 417
$experiment->Lock($nextstate, 1) == 0
    or die("*** $0:\n".
	   "    Experiment $pid/$eid could not be locked\n");
# Maybe Lock() should do this?
$experiment->Refresh();
418 419

#
420
# XXX - At this point a failure is going to leave things in an
421 422 423 424
# inconsistent state. Be sure to call fatal() only since we are
# going into the background, and we have to send email since no
# one is going to see printed error messages (output goes into the
# log file, which will be sent along in the email). 
425
#
426 427 428 429 430

#
# Get email address of the experiment head, which may be different than
# the person who is actually terminating the experiment, since its polite
# to let the original creator know whats going on. 
431
#
432 433
my $expt_head_name  = $swapper->name();
my $expt_head_email = $swapper->email();
Leigh Stoller's avatar
Leigh Stoller committed
434

435 436 437
#
# If not in batch mode, go into the background. Parent exits.
#
438
if (! ($batch || $template_mode || $genimode)) {
439 440 441
    # Cleanup
    $experiment->CleanLogFiles() == 0
	or fatal("Could not clean up logfiles!");
442 443 444

    $logfile = $experiment->CreateLogFile("endexp");
    if (!defined($logfile)) {
445 446
	fatal("Could not create logfile!");
    }
447 448 449 450 451 452
    $logname = $logfile->filename();
    # We want it to spew to the web.
    $experiment->SetLogFile($logfile);
    # Mark it open since we are going to start using it right away.
    $logfile->Open();
    
453 454 455 456 457
    if (my $childpid = TBBackGround($logname)) {
	#
	# Parent exits normally, except if in waitmode. 
	#
	if (!$waitmode) {
458 459 460 461 462 463 464 465 466
	    print("Experiment $pid/$eid is now terminating.\n".
		  "You will be notified via email when termination is ".
		  "complete.\n")
		if (! $quiet);
	    exit(0);
	}
	print("Waiting for experiment $eid to finish terminating.\n")
		if (! $quiet);
	
467
	if (-t && !$quiet) {
468 469 470
	    print("You may type ^C at anytime; you will be notified via email.".
		  "\n".
		  "You will not actually interrupt the experiment itself.\n");
471 472 473 474 475 476 477 478 479 480 481 482 483
	}
	
	# Give child a chance to run.
	select(undef, undef, undef, 0.25);
	
	#
	# Reset signal handlers. User can now kill this process, without
	# stopping the child.
	#
	$SIG{TERM} = 'DEFAULT';
	$SIG{INT}  = 'DEFAULT';
	$SIG{QUIT} = 'DEFAULT';

484
	#
485
	# Wait until child exits or until user gets bored and types ^C.
486
	#
487 488
	waitpid($childpid, 0);
	
489 490
	print("Done. Exited with status: $?\n")
	    if (! $quiet);
491
	exit($? >> 8);
492
    }
493
    TBdbfork();
494
}
495 496 497
# Give the web page a chance to start looking at the log file before
# the entire experiment is gone.
sleep(1);
498

499 500 501 502 503 504 505 506
#
# When in waitmode, must put ourselves in another process group so that
# an interrupt to the parent will not have any effect on the backend.
#
if ($waitmode) {
    POSIX::setsid();
}

507 508 509
#
# Sanity check states in case someone changes something.
#
510 511 512 513
if ($estate eq EXPTSTATE_ACTIVE) {
    if ($experiment->PreSwap($this_user, TBDB_STATS_SWAPOUT, $estate)) {
	# Reset back to original state.
	$experiment->ResetState($estate);
514
	fatal("Preswap failed!");
515
    }
516

517
    if ($experiment->Swap($Experiment::EXPT_SWAPOUT) != 0) {
518 519 520
	fatal({type => 'secondary', severity => SEV_SECONDARY,
	       error => ['tbswap_out_failed']},
	      "tbswap out failed!");
Leigh Stoller's avatar
Leigh Stoller committed
521
    }
522 523 524
    $experiment->PostSwap($this_user, TBDB_STATS_SWAPOUT) == 0 or
	fatal("PostSwap failed!");

525 526 527 528
    $experiment->SetState(EXPTSTATE_TERMINATING) == 0
	or fatal("Failed to set experiment state to " .
		 EXPTSTATE_TERMINATING());

529
    $estate = EXPTSTATE_SWAPPED;
530

531 532
    # Commit the archive after swapout. Easier to use the script. 
    system("$archcontrol -t endexp commit $pid $eid");
Leigh Stoller's avatar
Leigh Stoller committed
533
}
534

535 536 537 538 539 540 541 542 543 544
#
# At this point, there should not be any nodes allocated. But lets
# make this sanity check cause it happens when administrators bypass
# the normal order of things.
#
my @pnodes = $experiment->NodeList(1, 1);
if (@pnodes) {
    fatal("Experiment is not active, but there are nodes allocated: @pnodes\n");
}

545 546
if ($estate eq EXPTSTATE_SWAPPED ||
    $estate eq EXPTSTATE_QUEUED) {
547 548 549
    $experiment->GatherSwapStats($this_user, TBDB_STATS_TERMINATE, 0,
				 TBDB_STATS_FLAGS_START)
	== 0 or fatal("GatherSwapStats failed");
550 551

    if ($experiment->End() != 0) {  
552 553
	fatal("tbend failed!\n");
    }
554 555 556
    $experiment->SetState(EXPTSTATE_TERMINATED) == 0
	or fatal("Failed to set experiment state to " .EXPTSTATE_TERMINATED());
    
557
    $estate = EXPTSTATE_TERMINATED;
558 559 560 561

    #
    # Gather statistics for the swapout.
    #
562 563
    $experiment->GatherSwapStats($this_user, TBDB_STATS_TERMINATE)
	== 0 or fatal("GatherSwapStats failed");
Leigh Stoller's avatar
Leigh Stoller committed
564 565
}

566
# We better be here ...
567
$estate = $experiment->state();
568
if ($estate ne EXPTSTATE_TERMINATED) {
569 570
    fatal("Experiment is in the wrong state: $estate\n");
}
571

572 573
#
# Try to copy off the files for testbed information gathering.
574 575
# Note that the logfile will not contain anything printed after
# this point. 
576
#
577
$experiment->SaveLogFiles();
578

579 580 581 582
# Copy out the archive and then delete it.
print "Archiving and clearing the experiment archive ...\n";
libArchive::TBArchiveExperimentArchive($pid, $eid) == 0 or
    fatal("Could not archive experiment archive!");
583 584

#
585
# Terminate the log so the web page stops spewing.
586
#
587 588
if (!$template_mode) {
    print "Experiment $pid/$eid has been successfully terminated!\n";
589
    $experiment->ClearLogFile();
590
}
591

592 593 594 595 596 597
#
# Kill any features
#
EmulabFeatures->DeleteAll($experiment) == 0 or
    fatal("Could not delete all features for $experiment");

598 599 600 601 602 603
#
# Kill any vlan tag reservations.
#
$experiment->ClearReservedVlanTags() == 0 or
    fatal("Could not delete reserved vlan tags for $experiment");

604 605 606 607
# Just in case ...
$experiment->ClearPortRange() == 0 or
    fatal("Could not delete ip port range for $experiment");

608
#
609 610
# Send email now, since once we call Delete() the log file is going
# to be unlinked.
611
#
612 613 614 615
if (! ($batch || $template_mode || $genimode)) {
    my $message =
	"Experiment `$eid' in project `$pid' has been terminated.\n" .
	"You may now reuse `$eid' as an experiment name.\n";
616

617 618 619 620 621 622 623 624
    SENDMAIL(($noemail ? $TBLOGS : "$user_name <$user_email>"),
	     "Experiment $pid/$eid Terminated",
	     $message,
	     "$user_name <$user_email>",
	     ($noemail ? "" 
	      : "Cc:  $expt_head_name <$expt_head_email>\n".
	      "Bcc: $TBLOGS"), 
	     ($logname));
625 626 627
}

#
628 629
# Cleanup DB state and remove directory. Purge flag is optional and generally
# used by admins for cleaning up bad DB state caused by script errors.
630
#
631
$experiment->Delete($purge);
632

633 634 635 636
# In batch mode, exit now. 
exit(0)
    if ($batch || $template_mode || $genimode);
exit(0);
637

638
sub fatal($;$)
639
{
640 641
    my $parms = {};
    $parms = shift if ref $_[0] eq 'HASH';
642
    my($mesg) = $_[0];
643
    
644
    tberror($parms, $mesg);
645

646
    $experiment->Unlock();
Leigh Stoller's avatar
Leigh Stoller committed
647

648
    # Copy over the log files so the user can see them.
649
    $experiment->CopyLogFiles();    
650

651 652 653
    #
    # In batch mode, exit without sending the email. 
    #
654
    if ($batch || $genimode) {
655 656
	exit(-1);
    }
657 658 659 660 661
    
    #
    # Clear the log file so the web page stops spewing. 
    #
    if (defined($logname)) {
662
	$experiment->CloseLogFile();
663
    }
664 665

    #
666
    # Send a message to the testbed list. Append the logfile.
667
    #
668
    SENDMAIL("$user_name <$user_email>",
669
	     "Termination Failure: $pid/$eid",
670 671 672 673 674
	     $mesg,
	     "$user_name <$user_email>",
	     "Cc:  $expt_head_name <$expt_head_email>\n".
	     "Bcc: $TBOPS",
	     ($logname));
675 676 677

    exit(-1);
}