tbswap.in 58.2 KB
Newer Older
Chad Barb's avatar
 
Chad Barb committed
1 2 3 4
#!/usr/bin/perl -w

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2009 University of Utah and the Flux Group.
Chad Barb's avatar
 
Chad Barb committed
6 7 8 9
# All rights reserved.
#
use English;

Chad Barb's avatar
 
Chad Barb committed
10 11 12
# Returns 0 on success.
# Returns 1 on non-assign_wrapper failure.
# Returns (1 | assign_wrapper's errorcode) on assign_wrapper failure.
13 14
# Returns |0x40 if update caused a swapout. Icky.
# Returns -1 on uncontrolled error (die called).
Chad Barb's avatar
 
Chad Barb committed
15

Chad Barb's avatar
 
Chad Barb committed
16 17 18 19 20
# XXX: handle error cases for update? (backup the db?)
# XXX: Shouldn't do idempotent stuff twice for update.
# XXX: repush/calc routing for update??? (tbprerun)
# XXX: previz for update???              (tbprerun)
# XXX: make snmpit faster for update.
Chad Barb's avatar
Chad Barb committed
21 22 23
#
# XXX: for update, expt is swapped out on os_setup fail.
#      (we only recover if assign fails)
Chad Barb's avatar
 
Chad Barb committed
24 25 26

sub usage()
{
27 28 29
    print STDERR
	"Usage: $0 {in|out|modify [-reboot] [-eventsys_restart]} ".
	"[-noswapout] [-genimode] pid eid\n";
Chad Barb's avatar
 
Chad Barb committed
30 31 32 33 34 35 36
    exit(-1);
}

#
# Configure variables
#
my $TBROOT         = "@prefix@";
Mike Hibler's avatar
Mike Hibler committed
37
my $TBOPS          = "@TBOPSEMAIL@";
Timothy Stack's avatar
Timothy Stack committed
38
my $TBLOGS         = "@TBLOGSEMAIL@";
39
my $MAINSITE	   = @TBMAINSITE@;
40
my $THISHOMEBASE   = "@THISHOMEBASE@";
Chad Barb's avatar
 
Chad Barb committed
41 42
my $TESTMODE       = @TESTMODE@;
my $DISABLE_EVENTS = "@DISABLE_EVENT_SCHED@";
43
my $piper          = "$TBROOT/sbin/locpiper";
Timothy Stack's avatar
Timothy Stack committed
44
my $NFSTRACESUPPORT= @NFSTRACESUPPORT@;
Chad Barb's avatar
 
Chad Barb committed
45 46 47 48 49 50 51 52 53 54 55

# Untaint the path
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/libexec/ns2ir" . 
    ":$TBROOT/sbin:$TBROOT/bin";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
56
use libadminctrl;
Mike Hibler's avatar
Mike Hibler committed
57
use libadminmfs;
Kevin Atkinson's avatar
 
Kevin Atkinson committed
58
use libtblog;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
59
use libArchive;
60
use Experiment;
61
use User;
62
use Lan;
Kevin Atkinson's avatar
 
Kevin Atkinson committed
63

64
#require exitonwarn; # exitonwarn isn't really a module, so just require it
Chad Barb's avatar
 
Chad Barb committed
65 66 67 68

#
# Actual swap-in and swap-out functions, defined below.
#
Chad Barb's avatar
 
Chad Barb committed
69 70 71
sub doSwapout($);
sub doSwapin($);

72 73 74 75
#
# Firewall stuff
# XXX maybe should be elsewhere
#
76 77 78 79 80
sub FWSETUP()     { return 1; }
sub FWADDNODES()  { return 2; }
sub FWDELNODES()  { return 3; }
sub FWTEARDOWN()  { return 4; }
sub doFW($$$$);
81 82

# XXX fixme: should not be hardwired!
83
my $cnetstack = "-S Control";
84 85 86
my $cnetvlanname = "Control";


87 88 89 90
sub REAL()    { return 5; }
sub CLEANUP() { return 4; }
sub RETRY()   { return 3; }
sub MODIFY()  { return 2; }
Chad Barb's avatar
 
Chad Barb committed
91
sub UPDATE()  { return 1; }
92
sub MODIFY_RECOVER()  { return 0; }
Chad Barb's avatar
 
Chad Barb committed
93

94 95 96
#
# Grab global enable of linkdelays.
#
Kirk Webb's avatar
 
Kirk Webb committed
97
my $enablelinkdelays = TBGetSiteVar("general/linux_endnodeshaping");
98

Chad Barb's avatar
 
Chad Barb committed
99 100 101 102 103 104
#
# Turn off line buffering on output
#

$| = 1;

105 106
my $updateReboot   = 0;
my $updateReconfig = 1;
107 108
my $update_Eventsys_restart = 0;
my $elabinelab     = 0;
109
my $plabinelab     = 0;
110 111 112 113 114
my $force          = 0;
my $noswapout      = 0;
my $genimode       = 0;
my $errors         = 0;
my $updatehosed    = 0;
Chad Barb's avatar
 
Chad Barb committed
115
my $state;
116
my $canceled;
Chad Barb's avatar
 
Chad Barb committed
117
my $os_setup_pid;
118
my $nextState;
Chad Barb's avatar
 
Chad Barb committed
119 120

#
Chad Barb's avatar
 
Chad Barb committed
121
# First argument is either "in", "out", or "update";
Chad Barb's avatar
 
Chad Barb committed
122 123 124 125 126
# this value goes into $swapop.
#

my $swapop = shift;	

Chad Barb's avatar
 
Chad Barb committed
127 128 129
if (!$swapop || 
    (($swapop ne "in") && 
     ($swapop ne "out") &&
130
     ($swapop ne "modify") &&
Chad Barb's avatar
 
Chad Barb committed
131
     ($swapop ne "update"))) {
Chad Barb's avatar
 
Chad Barb committed
132 133 134 135 136 137 138 139 140 141 142
    usage();
}

#
# Get other arguments.
#

while ($#ARGV > 1) {
    $arg = shift;
    if ($arg eq "-force") {
	$force = 1;
Chad Barb's avatar
Chad Barb committed
143 144
    } elsif ($arg eq "-reboot") {
	$updateReboot = 1;
145
	$updateReconfig = 0;
146 147 148
    } elsif ($arg eq "-noreconfig") {
	$updateReboot   = 0;
	$updateReconfig = 0;
149
    } elsif ($arg eq "-eventsys_restart" && $swapop eq "modify") {
150
	$update_Eventsys_restart = 1;
151 152
    } elsif ($arg eq "-noswapout") {
	$noswapout = 0;
Chad Barb's avatar
 
Chad Barb committed
153 154 155 156 157 158 159 160 161
    } else {
	usage();
    }
}
if ($#ARGV < 1) {
    usage();
}
my ($pid,$eid) = @ARGV;

162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
#
# Untaint the arguments.
#
if ($pid =~ /^([-\@\w.]+)$/) {
    $pid = $1;
}
else {
    die("Tainted argument $pid!\n");
}
if ($eid =~ /^([-\@\w.]+)$/) {
    $eid = $1;
}
else {
    die("Tainted argument $eid!\n");
}

Kevin Atkinson's avatar
 
Kevin Atkinson committed
178 179 180 181 182 183
#
# Set Error reporting info
# 
tblog_set_info($pid,$eid,$UID);

#
Mike Hibler's avatar
Mike Hibler committed
184
# Turn on timestamps
Kevin Atkinson's avatar
 
Kevin Atkinson committed
185
#
Chad Barb's avatar
 
Chad Barb committed
186 187
TBDebugTimeStampsOn();

188 189 190 191 192 193
#
# Verify user and get his DB uid and other info for later.
#
my $this_user = User->ThisUser();
if (! defined($this_user)) {
    tbdie("You ($UID) do not exist!");
194
}
195 196 197
my $user_uid   = $this_user->uid();
my $user_name  = $this_user->name();
my $user_email = $this_user->email();
198

199 200 201 202 203 204
# Slowly convert to using Experiment module.
my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    tbdie("Could not lookup experiment object!")
}

Chad Barb's avatar
 
Chad Barb committed
205 206 207
#
# Print starting message.
#
208 209
my $exptidx;
TBExptIDX($pid, $eid, \$exptidx);
Chad Barb's avatar
 
Chad Barb committed
210

211 212
print "Beginning swap-$swapop for $pid/$eid ($exptidx). " .
    TBTimeStampWithDate() . "\n";
Chad Barb's avatar
 
Chad Barb committed
213 214 215 216 217 218
TBDebugTimeStamp("tbswap $swapop started");

#
# Get experiment state; verify that experiment exists.
#
if (! ($state = ExpState($pid, $eid))) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
219
    tbdie "No such experiment $pid/$eid";
Chad Barb's avatar
 
Chad Barb committed
220
}
221 222 223
# Sanity check the current state. 
if (!$force) {
    if ($swapop eq "in") {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
224
	tbdie("Experiment should be ACTIVATING. Currently $state.")
225
	    if ($state ne EXPTSTATE_ACTIVATING);
Chad Barb's avatar
 
Chad Barb committed
226
    }
227
    elsif ($swapop eq "out") {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
228
	tbdie("Experiment should be SWAPPING. Currently $state.")
229
	    if ($state ne EXPTSTATE_SWAPPING);
Chad Barb's avatar
 
Chad Barb committed
230
    }
231
    elsif ($swapop eq "modify" || $swapop eq "update") {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
232
	tbdie("Experiment should be MODIFY_RESWAP. Currently $state.")
233
	    if ($state ne EXPTSTATE_MODIFY_RESWAP);
Chad Barb's avatar
 
Chad Barb committed
234 235
    }
}
236 237
# Get elabinelab status. See below.
if (! TBExptIsElabInElab($pid, $eid, \$elabinelab)) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
238
    tbdie("Could not get elabinelab status for experiment $pid/$eid");
239
}
240 241 242 243 244 245 246 247 248 249 250
# and plabinelab status.
if (! TBExptIsPlabInElab($pid, $eid, \$plabinelab)) {
    tbdie("Could not get plabinelab status for experiment $pid/$eid");
}
if ($elabinelab && $plabinelab) {
    tberror "Cannot get my head around Plab in Elab in Elab!\n";
    print "Failingly finished swap-$swapop for $pid/$eid. " .
	TBTimeStamp() . "\n";
    TBDebugTimeStamp("tbswap $swapop finished (failed)");
    exit(1);
}
Chad Barb's avatar
 
Chad Barb committed
251

252 253 254 255 256
#
# See if the experiment is firewalled
#
my $firewalled = TBExptFirewall($pid, $eid);

Chad Barb's avatar
 
Chad Barb committed
257 258 259
#
# Do actual swapping
#
260 261 262 263
if ($swapop eq "out") {
    #
    # Swap out
    #
Kevin Atkinson's avatar
 
Kevin Atkinson committed
264
    tblog_set_attempt(1);
265
    $errors = doSwapout(REAL);
Chad Barb's avatar
 
Chad Barb committed
266
}
267
elsif ($swapop eq "update" || $swapop eq "modify") {
268 269 270 271 272 273
    #
    # Before swapout, do cursory admission control to see if the
    # modified experiment will be swappable. assign_wrapper does a
    # more stringent check using assign.
    #
    print STDERR "Checking with Admission Control ...\n";
274
    if (! TBAdmissionControlCheck(undef, $experiment, undef)) {
275 276 277
	tberror({type => 'secondary', severity => SEV_SECONDARY,
		 error => ['admission_control_failure']},
		"Admission control failure!\n");
278 279 280 281 282 283
	print "Failingly finished swap-$swapop for $pid/$eid. " .
	    TBTimeStamp() . "\n";
	TBDebugTimeStamp("tbswap $swapop finished (failed)");
	exit(1);
    }
    
284 285
    #
    # Update.
Kevin Atkinson's avatar
 
Kevin Atkinson committed
286 287 288
    #
    tblog_set_attempt(1);

289 290 291 292
    #
    # Phase One -- swap experiment partially out.
    #
    print STDERR "Backing up physical state...\n";
293
    $experiment->BackupPhysicalState();
Chad Barb's avatar
 
Chad Barb committed
294

295 296 297 298 299
    #
    # Actually, in update mode this is not done cause we are anticipating
    # adding nodes only.
    #
    $errors = ($swapop eq "modify" ? doSwapout(MODIFY) : 0);
Chad Barb's avatar
Chad Barb committed
300

301 302 303 304 305 306
    if (0) {
	print STDERR "Doing a swapmodswapout on the experiment archive ...\n";
	if (libArchive::TBExperimentArchiveSwapModSwapOut($pid, $eid) < 0) {
	    tberror("Failed swapmodswapout on the experiment archive!");
	    $errors = 1;
	}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
307 308
    }

309
    if ($errors) {
Chad Barb's avatar
 
Chad Barb committed
310
	#
311 312 313 314 315
	# Clean up the mess, leaving the experiment in the SWAPPED state,
	# 
	print STDERR "Cleaning up after errors.\n";
	doSwapout(CLEANUP);
	$updatehosed = 1;
Chad Barb's avatar
 
Chad Barb committed
316
    }
317
    else {
Chad Barb's avatar
 
Chad Barb committed
318
	#
319
	# Phase Two -- swap experiment back in.
Chad Barb's avatar
 
Chad Barb committed
320
	#
321
	$errors = doSwapin(MODIFY);
Chad Barb's avatar
Chad Barb committed
322

323
	if ($errors) {
Chad Barb's avatar
Chad Barb committed
324 325 326
	    #
	    # There were errors; see if we can recover.
	    #
327
	    my $CanRecover = 1;
Chad Barb's avatar
Chad Barb committed
328 329 330

	    if ($errors != 7) {
		print STDERR "Update failure occurred _after_ assign phase; ";
331
		$CanRecover = 0;
Chad Barb's avatar
Chad Barb committed
332 333
	    }

334 335 336
	    if ($CanRecover) {
		print STDERR "Recovering virtual and physical state.\n";

337 338 339
		# It is safe to remove the phystate since we know it was
		# backed up above, and cause we do not know if assign_wrapper
		# made it to that point before it failed. 
340 341 342 343
		if ($experiment->RemoveVirtualState($pid, $eid) ||
		    $experiment->RestoreVirtualState($pid, $eid) ||
		    $experiment->RemovePhysicalState($pid, $eid) ||
		    $experiment->RestorePhysicalState($pid,$eid)) {
344 345
		    print STDERR "Could not restore backed-up state; ";
		    $CanRecover = 0;
Chad Barb's avatar
Chad Barb committed
346
		}
347 348 349
		else {
		    print STDERR "Doing a recovery swap-in of old state.\n";

350
		    if (doSwapin(MODIFY_RECOVER)) {
351 352 353
			print STDERR "Could not swap in old physical state; ";
			$CanRecover = 0;
		    }
Chad Barb's avatar
Chad Barb committed
354 355
		}
	    }
356 357 358 359 360 361

	    #
	    # Some part of the recovery failed; must swap it out. swapexp
	    # (caller) will then have to do more clean up, hence the special
	    # exit status indicated by $updatehosed.
	    # 
362 363 364 365 366 367 368 369 370 371 372
	    if (!$CanRecover) {
		if ($noswapout) {
		    print STDERR
			"No Recovery, but leaving experiment swapped in.\n";
		}
		else {
		    print STDERR
			"Recovery aborted! Swapping experiment out.\n";
		    doSwapout(CLEANUP);
		    $updatehosed = 1;
		}
373 374 375
	    }
	    else {
		print STDERR "Update recovery successful.\n";
Chad Barb's avatar
Chad Barb committed
376
	    }
Chad Barb's avatar
 
Chad Barb committed
377
	}
Chad Barb's avatar
Chad Barb committed
378 379
    }
}
380 381 382 383 384
elsif ($swapop eq "in") {
    #
    # Swap in
    #
    my $retries = 2;
385 386 387 388 389 390

    #
    # Before real swapin, do cursory admission control. assign_wrapper does
    # a more stringent check using assign.
    #
    print STDERR "Checking with Admission Control ...\n";
391
    if (! TBAdmissionControlCheck(undef, $experiment, undef)) {
392 393 394
	tberror({type => 'secondary', severity => SEV_SECONDARY,
		 error => ['admission_control_failure']},
		"Admission control failure!\n");
395 396 397 398 399
	print "Failingly finished swap-$swapop for $pid/$eid. " .
	    TBTimeStamp() . "\n";
	TBDebugTimeStamp("tbswap $swapop finished (failed)");
	exit(1);
    }
Kevin Atkinson's avatar
 
Kevin Atkinson committed
400 401

    tblog_set_attempt(1);
402
    $errors = doSwapin(REAL);
Chad Barb's avatar
Chad Barb committed
403

404 405 406 407 408 409 410 411 412 413
    #
    # Attempt a retry if: 
    #   a) there were errors, 
    #   b) doswapin() indicated (via return code 3) a retry is appropriate,
    #   c) we haven't tried too many times already.
    #   d) The cancelflag has not been set.
    #   e) $TESTMODE == 0.
    #
    while ($errors == 3 && $retries && !$canceled && !$TESTMODE) {
	$retries--;
Kevin Atkinson's avatar
 
Kevin Atkinson committed
414
	tblog_inc_attempt();
Chad Barb's avatar
 
Chad Barb committed
415

416 417
	print STDERR "Cleaning up after errors; will try again.\n";
	doSwapout(RETRY);
Chad Barb's avatar
 
Chad Barb committed
418

419 420
	print STDERR "Trying again...\n";
	$errors = doSwapin(RETRY);
Chad Barb's avatar
 
Chad Barb committed
421
    }
422 423
    if ($errors || $canceled) {
	print STDERR "Cleaning up after " .
424
	    ($canceled ? "cancelation" : "errors") . ".\n";
425
	doSwapout(CLEANUP);
Chad Barb's avatar
 
Chad Barb committed
426 427 428
    }
}

Kevin Atkinson's avatar
 
Kevin Atkinson committed
429 430 431 432
tblog_set_attempt(0);

TBGetCancelFlag($pid, $eid, \$canceled);
if ($canceled) {
433 434
    tberror ({type=>'summary', cause=>'canceled', 
	      severity=>SEV_IMMEDIATE, error=>['cancel_flag']},
Kevin Atkinson's avatar
 
Kevin Atkinson committed
435 436 437
	     "Experiment swap-$swapop canceled by user.");
}

Chad Barb's avatar
 
Chad Barb committed
438 439 440 441
#
# Write appropriate message and exit.
#
if ($errors) {
442
    print "Failingly finished swap-$swapop for $pid/$eid. ".TBTimeStamp()."\n";
Chad Barb's avatar
 
Chad Barb committed
443
    TBDebugTimeStamp("tbswap $swapop finished (failed)");
Chad Barb's avatar
Chad Barb committed
444

445 446
    # Pass out magic value to indicate that update failed!
    exit(1 | ($updatehosed ? 0x40 : 0));
Chad Barb's avatar
 
Chad Barb committed
447
}
448
print "Successfully finished swap-$swapop for $pid/$eid. " .TBTimeStamp()."\n";
449 450
TBDebugTimeStamp("tbswap $swapop finished (succeeded)");
exit(0);
Chad Barb's avatar
 
Chad Barb committed
451 452 453 454

#################################

##
Chad Barb's avatar
Chad Barb committed
455
#
Chad Barb's avatar
 
Chad Barb committed
456 457
# doSwapout - Swaps experiment out.
#
Chad Barb's avatar
Chad Barb committed
458
#             If in REAL or CLEANUP,
Chad Barb's avatar
 
Chad Barb committed
459 460 461
#             this function will free all nodes for the 
#             experiment.
#
Chad Barb's avatar
Chad Barb committed
462
#             If in RETRY or UDPATE,
Chad Barb's avatar
 
Chad Barb committed
463 464 465 466 467 468
#             only nodes not in RES_READY will be freed.
#
#             Returns 0 on success, >0 on failure.
#
##

Chad Barb's avatar
 
Chad Barb committed
469
sub doSwapout($) {
470
    my $type = shift;
Chad Barb's avatar
 
Chad Barb committed
471 472
    my $swapout_errors = 0;

Kevin Atkinson's avatar
 
Kevin Atkinson committed
473 474
    tblog_set_cleanup(1) if $type == CLEANUP;

Chad Barb's avatar
 
Chad Barb committed
475 476 477 478 479 480 481 482 483
    #
    # wait for os_setup;
    # this only applies if called after a failed doswapin.
    #
    if ($os_setup_pid) {
	print "Waiting for os_setup to finish\n";
	waitpid($os_setup_pid, 0);
	undef $os_setup_pid;
    }
Chad Barb's avatar
Chad Barb committed
484

Timothy Stack's avatar
 
Timothy Stack committed
485
    if (0 && $NFSTRACESUPPORT && $type == REAL) {
Timothy Stack's avatar
Timothy Stack committed
486 487
	print "Getting files accessed via NFS.\n";
	TBDebugTimeStamp("nfstrace started");
488
	system("nfstrace transfer $pid $eid");
Timothy Stack's avatar
Timothy Stack committed
489
	TBDebugTimeStamp("nfstrace finished");
490
    }
491

Chad Barb's avatar
 
Chad Barb committed
492
    if (! $TESTMODE) { 
493 494
	if (! ($DISABLE_EVENTS || $elabinelab)) {
	    if ($type >= RETRY ||
495
		($update_Eventsys_restart && $type == MODIFY) ) {
Chad Barb's avatar
 
Chad Barb committed
496
		print "Stopping the event system\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
497
		if (system("eventsys_control stop $pid,$eid")) {
498 499 500
		    tberror({type => 'secondary', severity => SEV_SECONDARY,
			     error => ['eventsys_stop_failed']},
			    "Failed to stop the event system.");
Chad Barb's avatar
 
Chad Barb committed
501 502
		    $swapout_errors = 1;
		}
503 504 505 506 507 508 509

		#
		# Stop the location piper.
		#
		if (-x $piper) {
		    print "Stopping the location piper\n";
		    if (system("$piper -k $pid $eid")) {
510 511 512
			tberror({type => 'secondary', severity => SEV_SECONDARY,
				 error => ['piper_stop_failed']},
				"Failed to stop location piper.");
513 514 515
			$swapout_errors = 1;
		    }
		}
Chad Barb's avatar
 
Chad Barb committed
516 517
	    }
	}
518 519 520 521 522 523
	
	#
	# Do teardown of inner elab. We must do this before we teardown the
	# vlans since the inner control network is a vlan, and we want that
	# active so inner boss can reboot the inner nodes (avoid power cycle).
	#
524
	if ($elabinelab && $type >= CLEANUP) {
525 526
	    print "Tearing down elabinelab. This could take a while.\n";
	    if (system("elabinelab -k $pid $eid")) {
527 528 529
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['elabinelab_tear_down_failed']},
			"Failed to teardown elabinelab!");
530 531 532 533
		$swapout_errors = 1;
	    }
	}

Chad Barb's avatar
 
Chad Barb committed
534
	#
Chad Barb's avatar
Chad Barb committed
535
	# Clean up any VLANs in experiment.
Chad Barb's avatar
 
Chad Barb committed
536
	#
537
	# When modifying an elabinelab experiment, leave the vlans intact
538
	# so that the inner networks are not suddenly disconnected.
539
	#
540
	if ($type != MODIFY) {
541 542 543
	    TBDebugTimeStamp("snmpit started");
	    print STDERR "Removing VLANs.\n";
	    if (system("snmpit -r $pid $eid")) {
544 545 546
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['vlan_reset_failed']},
			"Failed to reset VLANs");
547 548 549
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("snmpit finished");
Chad Barb's avatar
 
Chad Barb committed
550
	}
551 552 553 554 555 556 557 558 559 560 561 562 563 564
	#
	# Must check for stale vlans that we kept around in the above clause
	# since they will not be in the lans table anymore.
	#
	if ($type == CLEANUP) {
	    my @stale;
	    if (VLan->StaleVlanList($experiment, \@stale) != 0) {
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['vlan_reset_failed']},
			"Failed to get stale VLANs");
		$swapout_errors = 1;
	    }
	    if (@stale) {
		print "Removing stale vlans @stale\n";
565
		system("snmpit -f ". join(" ", map("-o $_", @stale)));
566 567 568 569 570 571 572 573
		if ($?) {
		    tberror({type => 'summary', severity => SEV_SECONDARY,
			     error => ['vlan_reset_failed']},
			    "Failed to remove stale vlans");
		    $swapout_errors = 1;
		}
	    }
	}
574
    }
575
	
576
    if ($type >= CLEANUP) {
577 578 579 580 581 582 583 584
	#
	# Undo plab in elab specialness.
	# No need to worry about VLANs here, as all the special abilities
	# involve the control network.
	#
	if (! $TESTMODE && $plabinelab) {
	    print "Tearing down plabinelab.\n";
	    if (system("plabinelab -k $pid $eid")) {
585 586 587
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['plabinelab_tear_down_failed']},
			"Failed to teardown plabinelab!");
588 589 590 591
		$swapout_errors = 1;
	    }
	}

592 593
	#
	# We're not attempting a retry;
Chad Barb's avatar
 
Chad Barb committed
594
	#
595
	# Stop all of the vnodes.
Chad Barb's avatar
 
Chad Barb committed
596
	#
597
	if (! $TESTMODE) { 	
Chad Barb's avatar
 
Chad Barb committed
598 599 600
	    print "Tearing down virtual nodes.\n";
	    TBDebugTimeStamp("vnode_setup -k started");
	    if (system("vnode_setup -d -k $pid $eid")) {
601 602 603
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['vnode_tear_down_failed']},
			"Failed to tear down vnodes.");
Chad Barb's avatar
 
Chad Barb committed
604 605 606
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("vnode_setup finished");
Chad Barb's avatar
 
Chad Barb committed
607 608
	}

Mike Hibler's avatar
Mike Hibler committed
609 610 611 612 613
	#
	# Nodes behind a firewall are treated special.
	# See undoFWNodes for details.
	#
	if ($firewalled && undoFWNodes($pid, $eid)) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
614
	    tblog_set_cleanup(0);
Mike Hibler's avatar
Mike Hibler committed
615 616
	    return 1;
	}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
617

Mike Hibler's avatar
Mike Hibler committed
618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
	#
	# Perform swapout time admin actions.  Right now there is at most
	# one of these.  It isn't really a general mechanism, just a hook
	# for state saving or data collection during swapout.
	# A couple of important "fer now" notes:
	#
	#	We don't do this for firewalled experiments.  We need a way
	#	to "tag" the saved disk state to ensure it doesn't get
	#	instantiated outside of a firewall.
	#
	#	We only do this on REAL swapouts, and not on CLEANUPs.
	#	There are some types of CLEANUPs where we may want to
	#	do this, in particular an invocation caused by a failed
	#	modify operation, where the admin action is to save the
	#	experiment state.  So we will need to revisit this.
	#
	my %soaction = ();
	if ($type == REAL && !$firewalled) {
	    TBExptGetSwapoutAction($pid, $eid, \%soaction);
	}
638
	if ($soaction{'command'} && doSwapoutAction($pid, $eid, %soaction)) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
639
	    tblog_set_cleanup(0);
640
	    return 1;
641 642
	}

Chad Barb's avatar
 
Chad Barb committed
643 644
	#
	# remove all nodes from the experiment.
Chad Barb's avatar
 
Chad Barb committed
645
	# (nfree will send them to RES_FREE_DIRTY)
Chad Barb's avatar
 
Chad Barb committed
646 647 648
	#
	print STDERR "Freeing nodes.\n";
	TBDebugTimeStamp("nfree started");
649
	if (system("nfree -a $pid $eid")) {
650 651 652
	    tberror({type => 'secondary', severity => SEV_SECONDARY,
		     error => ['nfree_failed']},
		    "Could not free nodes.");
Chad Barb's avatar
 
Chad Barb committed
653 654 655
	    $swapout_errors = 1;
	}
	TBDebugTimeStamp("nfree finished");
Chad Barb's avatar
 
Chad Barb committed
656 657 658 659 660

	#
	# Since this is an actual swapout, 
	# reset our count of swap out nag emails sent.
	#
661 662
	DBQueryWarn("update experiments set swap_requests='', ".
		    "   sim_reswap_count='0' ".
663
		    "where eid='$eid' and pid='$pid'");
Chad Barb's avatar
 
Chad Barb committed
664 665
    } else {
	#
666
	# $type == RETRY or $type == MODIFY.
Chad Barb's avatar
 
Chad Barb committed
667 668
	# Therefore, don't deallocate nodes which have been successfully
	# incorporated into the experiment (i.e., are RES_READY).
Chad Barb's avatar
 
Chad Barb committed
669
	# (nfree will send deallocated nodes to RES_FREE_DIRTY)
Chad Barb's avatar
 
Chad Barb committed
670
	#
671 672 673 674 675 676 677 678
	my @failedpnodes = ();
	my @failedvnodes = ();
	
	my $db_result =
	    DBQueryFatal("select rv.node_id,n.allocstate,nt.isvirtnode ".
                         "  from reserved as rv ".
			 "left join nodes as n on n.node_id = rv.node_id ".
			 "left join node_types as nt on nt.type=n.type ".
679 680
			 "where rv.pid='$pid' and rv.eid='$eid' and ".
			 "      rv.genisliver_idx is null");
Chad Barb's avatar
 
Chad Barb committed
681

682
	while (my ($node,$allocstate,$isvirt) = $db_result->fetchrow_array) {
Chad Barb's avatar
 
Chad Barb committed
683
	    if ($allocstate ne TBDB_ALLOCSTATE_RES_READY()) {
684 685 686 687 688 689
		if ($isvirt) {
		    push(@failedvnodes, $node);
		}
		else {
		    push(@failedpnodes, $node);
		}
Chad Barb's avatar
 
Chad Barb committed
690 691 692
	    }
	}

693 694 695 696 697 698 699
	#
	# Tear down failed vnodes. Perhaps not needed?
	# 
	if (!$TESTMODE && @failedvnodes > 0) {
	    print "Tearing down failed virtual nodes.\n";
	    TBDebugTimeStamp("vnode_setup -k started");
	    if (system("vnode_setup -d -k $pid $eid @failedvnodes")) {
700 701 702
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['vnode_tear_down_failed']},
			"Failed to tear down vnodes.");
703 704 705 706
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("vnode_setup -k finished");
	}
Chad Barb's avatar
 
Chad Barb committed
707

708 709 710 711 712 713 714 715 716 717 718
	#
	# Undo plabinelab setup
	#
	if (!$TESTMODE && $plabinelab && @failedpnodes > 0) {
	    print "Removing failed nodes from inner plab.\n";
	    if (system("plabinelab -r $pid $eid @failedpnodes")) {
		tberror "Failed to remove inner nodes!";
		$swapout_errors = 1;
	    }
	}

719 720 721 722
	#
	# Release all failed nodes.
	# 
	if (@failedpnodes > 0 || @failedvnodes > 0) {
723 724
	    print STDERR "Freeing failed nodes.\n";
	    
Chad Barb's avatar
 
Chad Barb committed
725
	    TBDebugTimeStamp("nfree started");
Chad Barb's avatar
 
Chad Barb committed
726 727 728 729 730
	    #
	    # Specify -x switch so when a physical node gets freed,
	    # any virtual nodes (owned by this experiment)
	    # sitting on top of it are freed as well.
	    #
731 732
	    if (system("nfree -x $pid $eid " .
		       join(" ", (@failedpnodes, @failedvnodes)))) {
733 734 735
		tberror({type => 'secondary', severity => SEV_SECONDARY,
			 error => ['nfree_failed']},
			"Could not free nodes.");
Chad Barb's avatar
 
Chad Barb committed
736 737 738 739 740 741
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("nfree finished");
	}
    }

742 743 744 745 746 747 748 749 750 751 752 753
    if (! $TESTMODE) {
	#
	# If the experiment has no Plab dslice nodes left, but still has
	# a Plab slice, destroy the slice
	#

	# Does the slice exist?
	$db_result =
	    DBQueryFatal("select slicename from plab_slices ".
			 "where pid='$pid' and eid='$eid'");

	if ($db_result->numrows) {
754
	    # Are there any nodes left in the slice?
755
	    $db_result =
756 757 758 759 760 761 762 763 764
		DBQueryFatal("select r.node_id ".
			     " from plab_slices as ps".
			     " left join plab_slice_nodes as psn ".
			     "   on (ps.slicename=psn.slicename ".
			     "       and ps.plc_idx=psn.plc_idx) ".
			     " left join reserved as r ".
			     "   on psn.node_id=r.node_id ".
			     " where ps.pid='$pid' and ps.eid='$eid'".
			     "   and r.node_id is not NULL");
765 766

	    if (!$db_result->numrows) {
767
		print "Tearing down Slices.\n";
768
		TBDebugTimeStamp("plabslice destroy started");
769
		if (system("plabslice destroy $pid $eid")) {
770
		    tberror "Failed to tear down Slices.";
771 772 773 774 775 776 777
		    $swapout_errors = 1;
		}
		TBDebugTimeStamp("plabslice destroy finished");
	    }
	}
    }

Chad Barb's avatar
 
Chad Barb committed
778 779 780 781 782 783 784 785
    if (! $TESTMODE) {
	#
	# All of these errors are non-fatal on swapout. We find out about them
	# via email sent from the individual scripts.
	#

	#
	# Only reset mountpoints if this is an actual swapout, and
Chad Barb's avatar
 
Chad Barb committed
786
	# not a failed swapin(cleanup), update, or retry.
Chad Barb's avatar
 
Chad Barb committed
787
	#
Chad Barb's avatar
 
Chad Barb committed
788
	if ($type == REAL) {
Chad Barb's avatar
 
Chad Barb committed
789 790 791
	    print "Resetting mountpoints.\n";
	    TBDebugTimeStamp("exports started");
	    if (system("exports_setup")) {
792 793 794
		tberror({severity => SEV_WARNING,
			 error => ['mountpoint_reset_failed']},
			"Failed to reset mountpoints.");
Chad Barb's avatar
 
Chad Barb committed
795 796
	    }
	    TBDebugTimeStamp("exports finished");
Chad Barb's avatar
Chad Barb committed
797

798 799 800 801 802 803 804 805 806
	    #
	    # Ditto these two. 
	    #
	    print "Resetting named maps.\n";
	    TBDebugTimeStamp("named started");
	    if (system("named_setup")) {
		tbwarn "Failed to reset named map.";
	    }
	    TBDebugTimeStamp("named finished");
Chad Barb's avatar
Chad Barb committed
807

808 809 810 811 812 813
	    print "Resetting email lists.\n";
	    TBDebugTimeStamp("genelists started");
	    if (system("genelists -t")) {
		tbwarn "Failed to reset email lists.";
	    }
	    TBDebugTimeStamp("genelists finished");
Chad Barb's avatar
 
Chad Barb committed
814 815 816
	}
    }

Chad Barb's avatar
 
Chad Barb committed
817
    #
818
    # Wipe the DB clean except during MODIFY or RETRY. In those
819 820
    #    cases, assign_wrapper will reset the DB after reading
    #    the info.
Chad Barb's avatar
 
Chad Barb committed
821
    #
822 823
    if ( $type >= CLEANUP ) {
	print STDERR "Resetting DB.\n";
824 825 826
	$experiment->RemovePhysicalState();
	# Special. 
	$experiment->ClearPortRegistration();
827
    }
Chad Barb's avatar
 
Chad Barb committed
828

Kevin Atkinson's avatar
 
Kevin Atkinson committed
829
    tblog_set_cleanup(0);
Chad Barb's avatar
 
Chad Barb committed
830 831 832 833
    return $swapout_errors;
}

##
Chad Barb's avatar
Chad Barb committed
834
#
Chad Barb's avatar
 
Chad Barb committed
835 836
# doSwapin - Swaps experiment in.
#
Chad Barb's avatar
Chad Barb committed
837 838 839 840 841
#            Returns:
#              0 - successful swapin
#              1 - failed swapin; cleanup required.
#              3 - failed swapin; cleanup required; can retry.
#              7 - failed swapin; assign failed; no cleanup.
Chad Barb's avatar
 
Chad Barb committed
842 843
##

Chad Barb's avatar
 
Chad Barb committed
844
sub doSwapin($) {
845
    my $type = shift; 
846 847
    # Just the physnodes ...
    my @deleted_pnodes = ();
Chad Barb's avatar
 
Chad Barb committed
848

Chad Barb's avatar
 
Chad Barb committed
849 850 851 852
    #
    # assign_wrapper does all the virtual to physical mapping 
    # and updating the DB state.
    #
853
    
854
    if ($type > MODIFY_RECOVER) {
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870
        #
        # Hacky test to allow disabling of linkdelays if the node is going
        # to run Linux. See sitevar above.
        #
	if (! $enablelinkdelays) {
	    $db_result =
		DBQueryFatal("select distinct e.pid,e.eid,vl.vnode,vn.osname ".
			 "  from experiments as e ".
			 "left join virt_lans as vl on vl.pid=e.pid and ".
			 "     vl.eid=e.eid ".
			 "left join virt_nodes as vn on vn.pid=e.pid and ".
			 "     vn.eid=e.eid and vn.vname=vl.vnode ".
			 "left join os_info as o on o.osname=vn.osname and ".
			 "  (o.pid=vl.pid or o.pid='" . TBOPSPID() . "') ".
			 "where (vl.uselinkdelay!=0 or e.uselinkdelays!=0 or ".
			 "       e.forcelinkdelays!=0) and ".
871 872
			 "     (o.os is NULL or o.os='Linux' or ".
			 "      o.os='Fedora') and ".
873 874 875
			 "     e.pid='$pid' and e.eid='$eid'");

	    if ($db_result->numrows) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
876 877
		tberror "Endnodeshaping is disabled on Linux Images!";
		tberror "You must modify your experiment to swap it in.";
878 879 880 881
		return 1;
	    }
	}
	
Chad Barb's avatar
Chad Barb committed
882 883 884 885
	print "Mapping to physical reality ...\n";
	TBDebugTimeStamp("assign_wrapper started");

	#
886 887 888 889
	# Pass the -u (update) switch into assign_wrapper, which turns on
	# update mode. When doing a retry, must also fix the current nodes
	# to avoid stuff jumping around when simply trying to replace a node
	# that did not boot.
Chad Barb's avatar
Chad Barb committed
890 891
	#
	my $exitcode;
892 893 894
	my $wrapper = "assign_wrapper -u";
	$wrapper .= " -f"
	    if ($type == RETRY);
895
	
896
	if (system("$wrapper $pid $eid")) {
897 898
	    $exitcode = $? >> 8;

Kevin Atkinson's avatar
 
Kevin Atkinson committed
899
	    tberror "Failed ($exitcode) to map to reality.";
900

901 902
	    # Wrapper sets this bit when recovery is possible.
	    if ($exitcode & 64) {
903
		# We can recover. 
904
		tbreport(SEV_SECONDARY, 'assign_wrapper_failed', $exitcode);
Chad Barb's avatar
Chad Barb committed
905
		return 7;
906 907
	    }
	    else {
908
		# No recovery, no retry.
909
		tbreport(SEV_SECONDARY, 'assign_wrapper_failed', $exitcode);
Chad Barb's avatar
Chad Barb committed
910 911 912 913
		return 1;
	    }
	}
	TBDebugTimeStamp("assign_wrapper finished");
Chad Barb's avatar
 
Chad Barb committed
914

Chad Barb's avatar
Chad Barb committed
915 916
	print "Mapped to physical reality!\n";
    }
Chad Barb's avatar
 
Chad Barb committed
917

918
    # Check cancel flag before continuing. No retry, 
919
    TBGetCancelFlag($pid, $eid, \$canceled);
920 921 922 923
    if ($canceled) {
	tbreport(SEV_IMMEDIATE, 'cancel_flag');
	return 1
    }
924

925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955
    #
    # When doing a modify, we have to compare vlans to determine which
    # vlans actually changed and need to be deleted, before processing
    # the new vlans for the experiment. Note that vlans that already
    # exist on the switches will be left as is by snmpit.
    #
    # We must do this before the nfree of deleted nodes in the next section
    # because the new test in nfree the prevents nodes from accidentally
    # getting released when they are in a vlan.
    #
    if ($type == MODIFY) {
	my @diff = ();
	my @same = ();
	if (Lan->CompareVlansWithSwitches($experiment, \@diff, \@same) != 0) {
	    tberror({type => 'summary', severity => SEV_SECONDARY,
		     error => ['vlan_setup_failed']},
		    "Failed to compare old vlans");
	    return 1;
	}
	if (@diff) {
	    print "Removing obsolete vlans @diff\n";
	    system("snmpit -f ". join(" ", map("-o $_", @diff)));
	    if ($?) {
		tberror({type => 'summary', severity => SEV_SECONDARY,
			 error => ['vlan_setup_failed']},
			"Failed to remove old vlans");
		return 1;
	    }
	}
    }

956 957 958 959 960 961 962 963 964
    #
    # Look for any nodes in RES_TEARDOWN. These need to be released,
    # and if a virtnode, they need to be torn down. We cannot wait for
    # the virtnodes to go down with the physnode they are hosted on,
    # so teardown and release the virtnodes first, and then do the
    # physnodes.
    #
    # Errors are fatal; no recovery or retry.
    #
965
    if ($type == MODIFY || $type == UPDATE) {
966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990
	my $allocstate = TBDB_ALLOCSTATE_RES_TEARDOWN();
	
	$db_result =
	    DBQueryFatal("select r.node_id,nt.isvirtnode,nt.isremotenode ".
			 "  from reserved as r ".
			 "left join nodes as n on n.node_id=r.node_id ".
			 "left join node_types as nt on nt.type=n.type ".
			 "where r.pid='$pid' and r.eid='$eid' and ".
			 "      n.allocstate='$allocstate'");

	if ($db_result->numrows) {
	    my @virtnodes = ();
	    my @physnodes = ();
	    
	    print "Tearing down and releasing unused nodes\n";

	    # First teardown/release virtnodes. 
	    while (my ($node,$isvirt,$isrem) = $db_result->fetchrow_array()) {
		if ($isvirt) {
		    push(@virtnodes, $node);
		}
		elsif (!$isrem) {
		    push(@physnodes, $node);
		}
	    }
991 992 993
	    # See below.
	    @deleted_pnodes = @physnodes;
	    
994 995 996
	    if (@virtnodes) {
		TBDebugTimeStamp("vnode_setup started");
		
997
		if (system("vnode_setup -k $pid $eid @virtnodes")) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
998
		    tberror "Failed to tear down unused virtnodes!\n";
999 1000 1001 1002 1003
		    return 1;
		}
		TBDebugTimeStamp("vnode_setup finished");
		
		if (system("nfree $pid $eid @virtnodes")) {
Kevin Atkinson's avatar
 
Kevin Atkinson committed
1004
		    tberror