tbswap.in 49 KB
Newer Older
Chad Barb's avatar
   
Chad Barb committed
1
2
3
4
#!/usr/bin/perl -w

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
Chad Barb's avatar
   
Chad Barb committed
6
7
8
9
# All rights reserved.
#
use English;

Chad Barb's avatar
   
Chad Barb committed
10
11
12
# Returns 0 on success.
# Returns 1 on non-assign_wrapper failure.
# Returns (1 | assign_wrapper's errorcode) on assign_wrapper failure.
13
14
# Returns |0x40 if update caused a swapout. Icky.
# Returns -1 on uncontrolled error (die called).
Chad Barb's avatar
   
Chad Barb committed
15

Chad Barb's avatar
   
Chad Barb committed
16
17
18
19
20
# XXX: handle error cases for update? (backup the db?)
# XXX: Shouldn't do idempotent stuff twice for update.
# XXX: repush/calc routing for update??? (tbprerun)
# XXX: previz for update???              (tbprerun)
# XXX: make snmpit faster for update.
Chad Barb's avatar
Chad Barb committed
21
22
23
#
# XXX: for update, expt is swapped out on os_setup fail.
#      (we only recover if assign fails)
Chad Barb's avatar
   
Chad Barb committed
24
25
26

#
# Print the command-line synopsis to STDERR and exit with status -1.
#
# The operation word (in, out or update) must be the first argument; the
# option switches follow it and precede pid and eid. That is the order in
# which the top-level argument parsing consumes @ARGV.
#
sub usage()
{
    print STDERR "Usage: $0 { in | out | update } [-force] [-reboot] " .
	"[-noreconfig] [-eventsys_restart] pid eid\n" .
	"       -eventsys_restart is only accepted with 'update'.\n";
    exit(-1);
}

#
# Configure variables
#
# The @...@ tokens below are placeholders in this .in template; they are
# expected to be substituted before the script is run.
my $TBROOT         = "@prefix@";
my $TBOPS          = "@TBOPSEMAIL@";
my $TBLOGS         = "@TBLOGSEMAIL@";
my $MAINSITE	   = @TBMAINSITE@;
my $THISHOMEBASE   = "@THISHOMEBASE@";
my $TESTMODE       = @TESTMODE@;
my $DISABLE_EVENTS = "@DISABLE_EVENT_SCHED@";
# Location piper helper; doSwapout runs it with -k if it is executable.
my $piper          = "$TBROOT/sbin/locpiper";
my $NFSTRACESUPPORT= @NFSTRACESUPPORT@;

# Untaint the path
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/libexec/ns2ir" . 
    ":$TBROOT/sbin:$TBROOT/bin";

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
54
use libadminctrl;
Mike Hibler's avatar
Mike Hibler committed
55
use libadminmfs;
Kevin Atkinson's avatar
   
Kevin Atkinson committed
56
use libtblog;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
57
use libArchive;
Kevin Atkinson's avatar
   
Kevin Atkinson committed
58

59
#require exitonwarn; # exitonwarn isn't really a module, so just require it
Chad Barb's avatar
   
Chad Barb committed
60
61
62
63

#
# Actual swap-in and swap-out functions, defined below.
#
Chad Barb's avatar
   
Chad Barb committed
64
65
66
sub doSwapout($);
sub doSwapin($);

67
68
69
70
#
# Firewall stuff
# XXX maybe should be elsewhere
#
71
72
73
74
75
# Firewall operation codes.
# NOTE(review): presumably the operation selector passed to doFW() --
# doFW's body is not visible here; confirm against its definition.
use constant FWSETUP    => 1;
use constant FWADDNODES => 2;
use constant FWDELNODES => 3;
use constant FWTEARDOWN => 4;

# Forward declaration; the definition lives further down the file.
sub doFW($$$$);

# XXX fixme: should not be hardwired!
my $cnetstack    = "-S Control";
my $cnetvlanname = "Control";


Chad Barb's avatar
   
Chad Barb committed
82
83
84
85
#
# Swap "type" codes handed to doSwapout() and doSwapin().
# The numeric ordering is significant: doSwapout() tests the type with
# >= (for example $type >= CLEANUP), so do not renumber these.
#
use constant REAL           => 4;
use constant CLEANUP        => 3;
use constant RETRY          => 2;
use constant UPDATE         => 1;
use constant UPDATE_RECOVER => 0;
Chad Barb's avatar
   
Chad Barb committed
87

88
89
90
#
# Grab global enable of linkdelays.
#
Kirk Webb's avatar
   
Kirk Webb committed
91
my $enablelinkdelays = TBGetSiteVar("general/linux_endnodeshaping");

#
# Turn off line buffering on output
#

$| = 1;

# Command-line switches (see the argument parsing below).
my $updateReboot   = 0;		# Set by -reboot.
my $updateReconfig = 1;		# Cleared by -reboot and by -noreconfig.
my $update_Eventsys_restart = 0;	# Set by -eventsys_restart (update only).
my $elabinelab     = 0;		# Filled in by TBExptIsElabInElab().
my $force  = 0;			# Set by -force; skips the state sanity check.
my $errors = 0;			# Result of the swap operation; nonzero is failure.
my $updatehosed = 0;		# Adds 0x40 to the failure exit status when set.
my $state;			# Experiment state returned by ExpState().
my $canceled;			# Filled in by TBGetCancelFlag() in doSwapin.
my $os_setup_pid;		# If set, doSwapout waits on this pid first.
my $cleanvlans;			# Set to 0 by doSwapout once "snmpit -r" succeeds.
my $nextState;			# Not referenced in the visible part of this file.
Chad Barb's avatar
   
Chad Barb committed
111
112

#
Chad Barb's avatar
   
Chad Barb committed
113
# First argument is either "in", "out", or "update";
Chad Barb's avatar
   
Chad Barb committed
114
115
116
117
118
# this value goes into $swapop.
#

my $swapop = shift(@ARGV);

# Anything other than one of the three known operations is a usage error.
if (!$swapop || 
    (($swapop ne "in") && 
     ($swapop ne "out") &&
     ($swapop ne "update"))) {
    usage();
}

#
# Get other arguments.
#
# Everything except the last two words (pid and eid) must be a recognized
# switch. The switch variable is lexical so it does not leak into the
# package namespace.
#
while (@ARGV > 2) {
    my $arg = shift(@ARGV);

    if ($arg eq "-force") {
	$force = 1;
    } elsif ($arg eq "-reboot") {
	$updateReboot = 1;
	$updateReconfig = 0;
    } elsif ($arg eq "-noreconfig") {
	$updateReboot   = 0;
	$updateReconfig = 0;
    } elsif ($arg eq "-eventsys_restart" && $swapop eq "update") {
	# Only meaningful for update; with any other operation this
	# switch falls through to usage().
	$update_Eventsys_restart = 1;
    } else {
	usage();
    }
}
if (@ARGV < 2) {
    usage();
}
my ($pid,$eid) = @ARGV;

#
# Untaint the arguments.
#
if ($pid =~ /^([-\@\w.]+)$/) {
    $pid = $1;
}
else {
    die("Tainted argument $pid!\n");
}
if ($eid =~ /^([-\@\w.]+)$/) {
    $eid = $1;
}
else {
    die("Tainted argument $eid!\n");
}

Kevin Atkinson's avatar
   
Kevin Atkinson committed
167
168
169
170
171
172
#
# Set Error reporting info
# 
tblog_set_info($pid,$eid,$UID);

#
# Turn on timestamps
#
TBDebugTimeStampsOn();

# Need this below.
# If the UNIX uid cannot be mapped, all three are left undefined.
my ($dbuid, $uname, $umail);
if (UNIX2DBUID($UID, \$dbuid)) {
    UserDBInfo($dbuid, \$uname, \$umail);
}

#
# Print starting message.
#
my $exptidx;
TBExptIDX($pid, $eid, \$exptidx);

print "Beginning swap-$swapop for $pid/$eid ($exptidx). " .
    TBTimeStampWithDate() . "\n";
TBDebugTimeStamp("tbswap $swapop started");

#
# Get experiment state; verify that experiment exists.
#
if (! ($state = ExpState($pid, $eid))) {
    tbdie "No such experiment $pid/$eid";
}

# Sanity check the current state. 
# Each operation requires one specific experiment state; -force skips
# the check entirely.
if (!$force) {
    if ($swapop eq "in") {
	tbdie("Experiment should be ACTIVATING. Currently $state.")
	    if ($state ne EXPTSTATE_ACTIVATING);
    }
    elsif ($swapop eq "out") {
	tbdie("Experiment should be SWAPPING. Currently $state.")
	    if ($state ne EXPTSTATE_SWAPPING);
    }
    elsif ($swapop eq "update") {
	tbdie("Experiment should be MODIFY_RESWAP. Currently $state.")
	    if ($state ne EXPTSTATE_MODIFY_RESWAP);
    }
}

# Get elabinelab status. See below.
if (! TBExptIsElabInElab($pid, $eid, \$elabinelab)) {
    tbdie("Could not get elabinelab status for experiment $pid/$eid");
}

#
# See if the experiment is firewalled
#
my $firewalled = TBExptFirewall($pid, $eid);

Chad Barb's avatar
   
Chad Barb committed
224
225
226
#
# Do actual swapping
#
227
228
229
230
231
if ($swapop eq "out") {
    #
    # Swap out
    #
    $errors = doSwapout(REAL);
}
elsif ($swapop eq "update") {
    #
    # Before swapout, do cursory admission control to see if the
    # modified experiment will be swappable. assign_wrapper does a
    # more stringent check using assign.
    #
    print STDERR "Checking with Admission Control ...\n";
    if (! TBAdmissionControlCheck($UID, $pid, $eid, undef)) {
	tberror "Admission control failure!\n";
	print "Failingly finished swap-$swapop for $pid/$eid. " .
	    TBTimeStamp() . "\n";
	TBDebugTimeStamp("tbswap $swapop finished (failed)");
	exit(1);
    }
    
    #
    # Update.
    #
    # Phase One -- swap experiment partially out.
    #
    print STDERR "Backing up physical state...\n";
    TBExptBackupPhysicalState($pid,$eid);

    $errors = doSwapout(UPDATE);

    # The archive step runs even if the partial swapout reported errors;
    # a failure here is folded into $errors.
    print STDERR "Doing a swapmodswapout on the experiment archive ...\n";    
    if (libArchive::TBExperimentArchiveSwapModSwapOut($pid, $eid) < 0) {
	tberror("Failed to do a swapmodswapout on the experiment archive!");
	$errors = 1;
    }

    if ($errors) {
	#
	# Clean up the mess, leaving the experiment in the SWAPPED state.
	# 
	print STDERR "Cleaning up after errors.\n";
	doSwapout(CLEANUP);
	$updatehosed = 1;
    }
    else {
	#
	# Phase Two -- swap experiment back in.
	#
	$errors = doSwapin(UPDATE);

	if ($errors) {
	    #
	    # There were errors; see if we can recover.
	    #
	    my $CanRecover = 1;

	    # doSwapin returns 7 only when assign failed and no cleanup
	    # is needed; any other failure code means we got past the
	    # assign phase and cannot restore the old state.
	    if ($errors != 7) {
		print STDERR "Update failure occurred _after_ assign phase; ";
		$CanRecover = 0;
	    }

	    if ($CanRecover) {
		print STDERR "Recovering virtual and physical state.\n";

		if (TBExptRemoveVirtualState($pid, $eid) ||
		    TBExptRestoreVirtualState($pid, $eid) ||
		    TBExptRestorePhysicalState($pid,$eid)) {
		    print STDERR "Could not restore backed-up state; ";
		    $CanRecover = 0;
		}
		else {
		    print STDERR "Doing a recovery swap-in of old state.\n";

		    if (doSwapin(UPDATE_RECOVER)) {
			print STDERR "Could not swap in old physical state; ";
			$CanRecover = 0;
		    }
		}
	    }

	    #
	    # Some part of the recovery failed; must swap it out. swapexp
	    # (caller) will then have to do more clean up, hence the special
	    # exit status indicated by $updatehosed.
	    # 
	    if (! $CanRecover) {
		print STDERR "Recovery aborted! Swapping experiment out.\n";
		doSwapout(CLEANUP);
		$updatehosed = 1;
	    }
	    else {
		# Note that $errors stays nonzero, so the script still
		# exits with a failure status (without the 0x40 bit).
		print STDERR "Update recovery successful.\n";
	    }
	}
    }
}
elsif ($swapop eq "in") {
    #
    # Swap in
    #
    my $retries = 2;

    #
    # Before real swapin, do cursory admission control. assign_wrapper does
    # a more stringent check using assign.
    #
    print STDERR "Checking with Admission Control ...\n";
    if (! TBAdmissionControlCheck($UID, $pid, $eid, undef)) {
	tberror "Admission control failure!\n";
	print "Failingly finished swap-$swapop for $pid/$eid. " .
	    TBTimeStamp() . "\n";
	TBDebugTimeStamp("tbswap $swapop finished (failed)");
	exit(1);
    }
    
    $errors = doSwapin(REAL);

    #
    # Attempt a retry if: 
    #   a) there were errors, 
    #   b) doswapin() indicated (via return code 3) a retry is appropriate,
    #   c) we haven't tried too many times already.
    #   d) The cancelflag has not been set.
    #   e) $TESTMODE == 0.
    #
    while ($errors == 3 && $retries && !$canceled && !$TESTMODE) {
	$retries--;

	print STDERR "Cleaning up after errors; will try again.\n";
	doSwapout(RETRY);

	print STDERR "Trying again...\n";
	$errors = doSwapin(RETRY);
    }
    if ($errors || $canceled) {
	print STDERR "Cleaning up after " .
	    ($canceled ? "cancelation" : "errors") . ".\n";
	doSwapout(CLEANUP);
    }
}

#
# Write appropriate message and exit.
#
if ($errors) {
    print "Failingly finished swap-$swapop for $pid/$eid. ".TBTimeStamp()."\n";
    TBDebugTimeStamp("tbswap $swapop finished (failed)");

    # Pass out magic value to indicate that update failed!
    exit(1 | ($updatehosed ? 0x40 : 0));
}
print "Successfully finished swap-$swapop for $pid/$eid. " .TBTimeStamp()."\n";
TBDebugTimeStamp("tbswap $swapop finished (succeeded)");
exit(0);
Chad Barb's avatar
   
Chad Barb committed
382
383
384
385

#################################

##
Chad Barb's avatar
Chad Barb committed
386
#
Chad Barb's avatar
   
Chad Barb committed
387
388
# doSwapout - Swaps experiment out.
#
Chad Barb's avatar
Chad Barb committed
389
#             If in REAL or CLEANUP,
Chad Barb's avatar
   
Chad Barb committed
390
391
392
#             this function will free all nodes for the 
#             experiment.
#
Chad Barb's avatar
Chad Barb committed
393
#             If in RETRY or UPDATE,
Chad Barb's avatar
   
Chad Barb committed
394
395
396
397
398
399
#             only nodes not in RES_READY will be freed.
#
#             Returns 0 on success, >0 on failure.
#
##

Chad Barb's avatar
   
Chad Barb committed
400
401
#
# doSwapout(type) - tear an experiment down, fully or partially.
#
#   type: one of REAL, CLEANUP, RETRY or UPDATE.
#         REAL/CLEANUP free every node and wipe the physical DB state.
#         RETRY/UPDATE free only nodes whose allocstate is not RES_READY.
#
# Returns 0 on success and 1 if any step reported an error. Returns 1
# immediately if firewall node cleanup or the swapout action fails.
# Most steps are skipped when $TESTMODE is set.
#
sub doSwapout($) {
    my $type = shift; # REAL==4, CLEANUP==3, RETRY==2, UPDATE==1.
    my $swapout_errors = 0;

    #
    # wait for os_setup;
    # this only applies if called after a failed doswapin.
    #
    if ($os_setup_pid) {
	print "Waiting for os_setup to finish\n";
	waitpid($os_setup_pid, 0);
	undef $os_setup_pid;
    }

    # The leading "0 &&" leaves this NFS trace transfer switched off.
    if (0 && $NFSTRACESUPPORT && $type == REAL) {
	print "Getting files accessed via NFS.\n";
	TBDebugTimeStamp("nfstrace started");
	system("nfstrace transfer $pid $eid");
	TBDebugTimeStamp("nfstrace finished");
    }

    if (! $TESTMODE) { 
	if (! ($DISABLE_EVENTS || $elabinelab)) {
	    # Always stop the event system for REAL, CLEANUP and RETRY;
	    # for UPDATE only when -eventsys_restart was given.
	    if ($type >= RETRY ||
		($update_Eventsys_restart && $type == UPDATE) ) {
		print "Stopping the event system\n";
		if (system("eventsys_control stop $pid $eid")) {
		    tberror "Failed to stop the event system.";
		    $swapout_errors = 1;
		}

		#
		# Stop the location piper.
		#
		if (-x $piper) {
		    print "Stopping the location piper\n";
		    if (system("$piper -k $pid $eid")) {
			tberror "Failed to stop location piper.";
			$swapout_errors = 1;
		    }
		}
	    }
	}
	
	#
	# Do teardown of inner elab. We must do this before we teardown the
	# vlans since the inner control network is a vlan, and we want that
	# active so inner boss can reboot the inner nodes (avoid power cycle).
	#
	if ($elabinelab && $type >= CLEANUP) {
	    print "Tearing down elabinelab. This could take a while.\n";
	    if (system("elabinelab -k $pid $eid")) {
		tberror "Failed to teardown elabinelab!";
		$swapout_errors = 1;
	    }
	}

	#
	# Clean up any VLANs in experiment.
	#
	# When modifying an elabinelab experiment, leave the vlans intact
	# so that the inner networks are not suddenly disconnected!
	#
	if (! ($elabinelab && $type == UPDATE)) {
	    TBDebugTimeStamp("snmpit started");
	    print STDERR "Removing VLANs.\n";
	    if (system("snmpit -r $pid $eid")) {
		tberror "Failed to reset VLANs";
		$swapout_errors = 1;
	    } else {
		$cleanvlans = 0;
	    }
	    TBDebugTimeStamp("snmpit finished");
	}
    }

    if ($type >= CLEANUP) {
	#
	# We're not attempting a retry;
	#
	# Stop all of the vnodes.
	#
	if (! $TESTMODE) { 	
	    print "Tearing down virtual nodes.\n";
	    TBDebugTimeStamp("vnode_setup -k started");
	    if (system("vnode_setup -d -k $pid $eid")) {
		tberror "Failed to tear down vnodes.";
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("vnode_setup finished");
	}

	#
	# Nodes behind a firewall are treated special.
	# See undoFWNodes for details.
	#
	if ($firewalled && undoFWNodes($pid, $eid)) {
	    return 1;
	}

	#
	# Perform swapout time admin actions.  Right now there is at most
	# one of these.  It isn't really a general mechanism, just a hook
	# for state saving or data collection during swapout.
	# A couple of important "fer now" notes:
	#
	#	We don't do this for firewalled experiments.  We need a way
	#	to "tag" the saved disk state to ensure it doesn't get
	#	instantiated outside of a firewall.
	#
	#	We only do this on REAL swapouts, and not on CLEANUPs.
	#	There are some types of CLEANUPs where we may want to
	#	do this, in particular an invocation caused by a failed
	#	modify operation, where the admin action is to save the
	#	experiment state.  So we will need to revisit this.
	#
	my %soaction = ();
	if ($type == REAL && !$firewalled) {
	    TBExptGetSwapoutAction($pid, $eid, \%soaction);
	}
	if ($soaction{'command'} && doSwapoutAction($pid, $eid, %soaction)) {
	    return 1;
	}

	#
	# remove all nodes from the experiment.
	# (nfree will send them to RES_FREE_DIRTY)
	#
	print STDERR "Freeing nodes.\n";
	TBDebugTimeStamp("nfree started");
	if (system("nfree $pid $eid")) {
	    tberror "Could not free nodes.";
	    $swapout_errors = 1;
	}
	TBDebugTimeStamp("nfree finished");

	#
	# Since this is an actual swapout, 
	# reset our count of swap out nag emails sent.
	#
	DBQueryWarn("update experiments set swap_requests='',sim_reswap_count='0' ".
		    "where eid='$eid' and pid='$pid'");
    } else {
	#
	# $type == RETRY or $type == UPDATE.
	# Therefore, don't deallocate nodes which have been successfully
	# incorporated into the experiment (i.e., are RES_READY).
	# (nfree will send deallocated nodes to RES_FREE_DIRTY)
	#
	my @failedpnodes = ();
	my @failedvnodes = ();
	
	my $db_result =
	    DBQueryFatal("select rv.node_id,n.allocstate,nt.isvirtnode ".
                         "  from reserved as rv ".
			 "left join nodes as n on n.node_id = rv.node_id ".
			 "left join node_types as nt on nt.type=n.type ".
			 "where rv.pid='$pid' and rv.eid='$eid'");

	# Sort every not-yet-ready node into the virtual or physical list.
	while (my ($node,$allocstate,$isvirt) = $db_result->fetchrow_array) {
	    if ($allocstate ne TBDB_ALLOCSTATE_RES_READY()) {
		if ($isvirt) {
		    push(@failedvnodes, $node);
		}
		else {
		    push(@failedpnodes, $node);
		}
	    }
	}

	#
	# Tear down failed vnodes. Perhaps not needed?
	# 
	if (!$TESTMODE && @failedvnodes > 0) {
	    print "Tearing down failed virtual nodes.\n";
	    TBDebugTimeStamp("vnode_setup -k started");
	    if (system("vnode_setup -d -k $pid $eid @failedvnodes")) {
		tberror "Failed to tear down vnodes.";
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("vnode_setup -k finished");
	}

	#
	# Release all failed nodes.
	# 
	if (@failedpnodes > 0 || @failedvnodes > 0) {
	    print STDERR "Freeing failed nodes.\n";
	    
	    TBDebugTimeStamp("nfree started");
	    #
	    # Specify -x switch so when a physical node gets freed,
	    # any virtual nodes (owned by this experiment)
	    # sitting on top of it are freed as well.
	    #
	    if (system("nfree -x $pid $eid " .
		       join(" ", (@failedpnodes, @failedvnodes)))) {
		tberror "Could not free nodes.";
		$swapout_errors = 1;
	    }
	    TBDebugTimeStamp("nfree finished");
	}
    }

    if (! $TESTMODE) {
	#
	# If the experiment has no Plab dslice nodes left, but still has
	# a Plab slice, destroy the slice
	#

	# Does the slice exist?
	# Lexical handle, so this lookup does not write to a package
	# global of the same name.
	my $db_result =
	    DBQueryFatal("select slicename from plab_slices ".
			 "where pid='$pid' and eid='$eid'");

	if ($db_result->numrows) {
	    # Are there any dslice nodes left?
	    $db_result =
		DBQueryFatal("select n.node_id from nodes as n ".
			     "left join node_types as nt on n.type = nt.type ".
			     "left join reserved as r ".
			     " on r.node_id = n.node_id ".
			     "where r.pid='$pid' and r.eid='$eid' ".
			     " and nt.isplabdslice = 1");

	    if (!$db_result->numrows) {
		print "Destroying Planetlab slice.\n";
		TBDebugTimeStamp("plabslice destroy started");
		if (system("plabslice destroy $pid $eid")) {
		    tberror "Failed to destroy Plab dslice.";
		    $swapout_errors = 1;
		}
		TBDebugTimeStamp("plabslice destroy finished");
	    }
	}
    }

    if (! $TESTMODE) {
	#
	# All of these errors are non-fatal on swapout. We find out about them
	# via email sent from the individual scripts.
	#

	#
	# Only reset mountpoints if this is an actual swapout, and
	# not a failed swapin(cleanup), update, or retry.
	#
	if ($type == REAL) {
	    print "Resetting mountpoints.\n";
	    TBDebugTimeStamp("exports started");
	    if (system("exports_setup")) {
		tberror "Failed to reset mountpoints.";
	    }
	    TBDebugTimeStamp("exports finished");
	}

	#
	# Resetting named maps and email lists is fast and idempotent,
	# so whatever.
	#
	print "Resetting named maps.\n";
	TBDebugTimeStamp("named started");
	if (system("named_setup")) {
	    tbwarn "Failed to reset named map.";
	}
	TBDebugTimeStamp("named finished");

	print "Resetting email lists.\n";
	TBDebugTimeStamp("genelists started");
	if (system("genelists -t")) {
	    tbwarn "Failed to reset email lists.";
	}
	TBDebugTimeStamp("genelists finished");
    }

    #
    # Wipe the DB clean except during UPDATE or RETRY. In those
    #    cases, assign_wrapper will reset the DB after reading
    #    the info.
    #
    if ( $type >= CLEANUP ) {
	print STDERR "Resetting DB.\n";
	TBExptRemovePhysicalState( $pid, $eid );
    }

    return $swapout_errors;
}

##
Chad Barb's avatar
Chad Barb committed
689
#
Chad Barb's avatar
   
Chad Barb committed
690
691
# doSwapin - Swaps experiment in.
#
Chad Barb's avatar
Chad Barb committed
692
693
694
695
696
#            Returns:
#              0 - successful swapin
#              1 - failed swapin; cleanup required.
#              3 - failed swapin; cleanup required; can retry.
#              7 - failed swapin; assign failed; no cleanup.
Chad Barb's avatar
   
Chad Barb committed
697
698
##

Chad Barb's avatar
   
Chad Barb committed
699
sub doSwapin($) {
700
701
702
    my $type = shift; # REAL==4, RETRY==2, UPDATE==1, UPDATE_RECOVER=0.
    # Just the physnodes ...
    my @deleted_pnodes = ();
Chad Barb's avatar
   
Chad Barb committed
703

Chad Barb's avatar
   
Chad Barb committed
704
705
706
707
    #
    # assign_wrapper does all the virtual to physical mapping 
    # and updating the DB state.
    #
708
    
Chad Barb's avatar
Chad Barb committed
709
    if ($type > UPDATE_RECOVER) {
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
        #
        # Hacky test to allow disabling of linkdelays if the node is going
        # to run Linux. See sitevar above.
        #
	if (! $enablelinkdelays) {
	    $db_result =
		DBQueryFatal("select distinct e.pid,e.eid,vl.vnode,vn.osname ".
			 "  from experiments as e ".
			 "left join virt_lans as vl on vl.pid=e.pid and ".
			 "     vl.eid=e.eid ".
			 "left join virt_nodes as vn on vn.pid=e.pid and ".
			 "     vn.eid=e.eid and vn.vname=vl.vnode ".
			 "left join os_info as o on o.osname=vn.osname and ".
			 "  (o.pid=vl.pid or o.pid='" . TBOPSPID() . "') ".
			 "where (vl.uselinkdelay!=0 or e.uselinkdelays!=0 or ".
			 "       e.forcelinkdelays!=0) and ".
726
727
			 "     (o.os is NULL or o.os='Linux' or ".
			 "      o.os='Fedora') and ".
728
729
730
			 "     e.pid='$pid' and e.eid='$eid'");

	    if ($db_result->numrows) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
731
732
		tberror "Endnodeshaping is disabled on Linux Images!";
		tberror "You must modify your experiment to swap it in.";
733
734
735
736
		return 1;
	    }
	}
	
Chad Barb's avatar
Chad Barb committed
737
738
739
740
	print "Mapping to physical reality ...\n";
	TBDebugTimeStamp("assign_wrapper started");

	#
741
742
743
744
	# Pass the -u (update) switch into assign_wrapper, which turns on
	# update mode. When doing a retry, must also fix the current nodes
	# to avoid stuff jumping around when simply trying to replace a node
	# that did not boot.
Chad Barb's avatar
Chad Barb committed
745
746
	#
	my $exitcode;
747
748
749
	my $wrapper = "assign_wrapper -u";
	$wrapper .= " -f"
	    if ($type == RETRY);
750
	
751
	if (system("$wrapper $pid $eid")) {
752
753
	    $exitcode = $? >> 8;

Kevin Atkinson's avatar
   
Kevin Atkinson committed
754
	    tberror "Failed ($exitcode) to map to reality.";
755

756
757
	    # Wrapper sets this bit when recovery is possible.
	    if ($exitcode & 64) {
758
		# We can recover. 
Chad Barb's avatar
Chad Barb committed
759
		return 7;
760
761
	    }
	    else {
762
		# No recovery, no retry.
Chad Barb's avatar
Chad Barb committed
763
764
765
766
		return 1;
	    }
	}
	TBDebugTimeStamp("assign_wrapper finished");
Chad Barb's avatar
   
Chad Barb committed
767

Chad Barb's avatar
Chad Barb committed
768
769
	print "Mapped to physical reality!\n";
    }
Chad Barb's avatar
   
Chad Barb committed
770

771
    # Check cancel flag before continuing. No retry, 
772
    TBGetCancelFlag($pid, $eid, \$canceled);
773
774
775
    return 1
	if ($canceled);

776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
    #
    # Look for any nodes in RES_TEARDOWN. These need to be released,
    # and if a virtnode, they need to be torn down. We cannot wait for
    # the virtnodes to go down with the physnode they are hosted on,
    # so teardown and release the virtnodes first, and then do the
    # physnodes.
    #
    # Errors are fatal; no recovery or retry.
    #
    if ($type == UPDATE) {
	my $allocstate = TBDB_ALLOCSTATE_RES_TEARDOWN();
	
	$db_result =
	    DBQueryFatal("select r.node_id,nt.isvirtnode,nt.isremotenode ".
			 "  from reserved as r ".
			 "left join nodes as n on n.node_id=r.node_id ".
			 "left join node_types as nt on nt.type=n.type ".
			 "where r.pid='$pid' and r.eid='$eid' and ".
			 "      n.allocstate='$allocstate'");

	if ($db_result->numrows) {
	    my @virtnodes = ();
	    my @physnodes = ();
	    
	    print "Tearing down and releasing unused nodes\n";

	    # First teardown/release virtnodes. 
	    while (my ($node,$isvirt,$isrem) = $db_result->fetchrow_array()) {
		if ($isvirt) {
		    push(@virtnodes, $node);
		}
		elsif (!$isrem) {
		    push(@physnodes, $node);
		}
	    }
811
812
813
	    # See below.
	    @deleted_pnodes = @physnodes;
	    
814
815
816
	    if (@virtnodes) {
		TBDebugTimeStamp("vnode_setup started");
		
817
		if (system("vnode_setup -k $pid $eid @virtnodes")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
818
		    tberror "Failed to tear down unused virtnodes!\n";
819
820
821
822
823
		    return 1;
		}
		TBDebugTimeStamp("vnode_setup finished");
		
		if (system("nfree $pid $eid @virtnodes")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
824
		    tberror "Failed to nfree unused virtnodes!\n";
825
826
827
828
		    return 1;
		}
	    }
	    if (@physnodes) {
829
830
831
		if ($elabinelab) {
		    print "Removing nodes from inner elab.\n";
		    if (system("elabinelab -r $pid $eid @physnodes")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
832
			tberror "Failed to remove inner nodes!";
833
834
835
836
837
			return 1;
		    }
		}

		#
Mike Hibler's avatar
Mike Hibler committed
838
839
		# If the experiment is firewalled, cleanup the nodes
		# we are releasing.
840
		# 
Mike Hibler's avatar
Mike Hibler committed
841
		if ($firewalled && undoFWNodes($pid, $eid, @deleted_pnodes)) {
842
843
844
		    return 1;
		}
		
845
		if (system("nfree $pid $eid @physnodes")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
846
		    tberror "Failed to nfree unused physnodes!\n";
847
848
849
850
851
852
		    return 1;
		}
	    }
	}
    }

Chad Barb's avatar
   
Chad Barb committed
853
854
855
856
857
858
    # Exit here if we are testing.
    if ($TESTMODE) {
	print "Testing run - Stopping here.\n";
	return 0;
    }

859
860
861
862
863
864
865
866
867
868
869
    #
    # Handle tarballs - we might need to fetch some from URLs if the user
    # asked for that.
    #
    print "Fetching tarballs and RPMs (if any) ...\n";
    TBDebugTimeStamp("tarfiles_setup started");

    if (system("tarfiles_setup $pid $eid")) {
	#
	# No recovery for now - what would we do?
	#
Kevin Atkinson's avatar
   
Kevin Atkinson committed
870
	tberror "Failed to set up tarballs.";
871
872
873
874
	return 1;
    }
    TBDebugTimeStamp("tarfiles_setup finished");

875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
    #
    # If there are any Plab dslice nodes in the experiment, create the
    # dslice now
    #
    if ($type > UPDATE_RECOVER) {
	# Are there any Plab nodes?
	$db_result =
	    DBQueryFatal("select n.node_id from nodes as n ".
			 "left join node_types as nt on n.type = nt.type ".
			 "left join reserved as r on r.node_id = n.node_id ".
			 "where r.pid='$pid' and r.eid='$eid' ".
			 " and nt.isplabdslice = 1");

	if ($db_result->numrows) {
	    # Does slice already exist?
	    $db_result =
		DBQueryFatal("select slicename from plab_slices ".
			     "where pid='$pid' and eid='$eid'");

	    if (! $db_result->numrows) {
		my @plabnodes = ();
		
		while (my ($node) = $db_result->fetchrow_array()) {
		    push(@plabnodes, $node);
		}
		
		print "Creating Planetlab slice.\n";
		TBDebugTimeStamp("plabslice create started");
903
		if (system("plabslice create $pid $eid")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
904
		    tberror "Failed to create Plab dslice";
905
906
907
908
909
910
911
		    return 3;
		}
		TBDebugTimeStamp("plabslice alloc finished");
	    }
	}
    }

912
    # Check cancel flag before continuing. No retry.
913
    TBGetCancelFlag($pid, $eid, \$canceled);
914
915
916
    return 1
	if ($canceled);

Chad Barb's avatar
   
Chad Barb committed
917
918
919
920
921
922
923
924
925
    #
    # These things need to get started before the nodes come up, so we'll
    # do them before the os_setup. Everything else can be done in parallel with
    # os_setup. (Actually, these probably can too, since they should finish
    # long before the nodes reboot, but better safe than sorry)
    #
    print "Setting up mountpoints.\n";
    TBDebugTimeStamp("mountpoints started");
    if (system("exports_setup")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
926
	tberror "Failed to setup mountpoints.";
Chad Barb's avatar
   
Chad Barb committed
927
928
929
	return 1;
    }
    TBDebugTimeStamp("mountpoints finished");
Chad Barb's avatar
Chad Barb committed
930

Chad Barb's avatar
   
Chad Barb committed
931
932
933
    TBDebugTimeStamp("named started");
    print "Setting up named maps.\n";
    if (system("named_setup")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
934
	tbwarn "Failed to add node names to named map.";
Chad Barb's avatar
   
Chad Barb committed
935
936
	#
	# This is a non-fatal error.
Chad Barb's avatar
Chad Barb committed
937
	#
Chad Barb's avatar
   
Chad Barb committed
938
939
    }
    TBDebugTimeStamp("named finished");
Chad Barb's avatar
Chad Barb committed
940

Timothy Stack's avatar
Timothy Stack committed
941
942
943
944
945
946
947
948
    if ($NFSTRACESUPPORT) {
	print "Cleaning NFS traces.\n";
	TBDebugTimeStamp("nfstrace gc started");
	if (system("nfstrace gc $pid $eid")) {
	    tberror "Failed to setup nfstrace.";
	    return 1;
	}
	TBDebugTimeStamp("nfstrace gc finished");
949
950
    }

951
    # Check cancel flag before continuing. No retry.
952
    TBGetCancelFlag($pid, $eid, \$canceled);
953
954
    return 1
	if ($canceled);
Chad Barb's avatar
Chad Barb committed
955

956
957
958
959
    #
    # Setup any control-net firewall.
    # This must be done before reloading and rebooting nodes.
    #
960
961
    if ($firewalled && ($type == REAL || $type == UPDATE) &&
	doFW($pid, $eid, (($type == UPDATE) ? FWADDNODES : FWSETUP), undef)) {
962
963
964
	return 1;
    }

Chad Barb's avatar
Chad Barb committed
965
966
    #
    # If user specified -reboot to update,
Chad Barb's avatar
   
Chad Barb committed
967
    # and we are successfully performing the update,
968
    # then mark all nodes in experiment so os_setup will reboot them.
969
970
    # We must reboot nodes on a RETRY as well, since assign has been rerun
    # and may have remapped interfaces on the nodes.
Chad Barb's avatar
Chad Barb committed
971
    #
972
973
974
975
    if ($type == RETRY ||
	($type == UPDATE && ($updateReboot || $updateReconfig))) {
	my $needreboot = ($type == RETRY || $updateReboot);

976
	print STDERR "Marking nodes for reboot/reconfig.\n";
Chad Barb's avatar
Chad Barb committed
977
	$db_result =
978
979
980
	    DBQueryFatal("select r.node_id,n.allocstate from reserved as r ".
			 "left join nodes as n on n.node_id=r.node_id ".
			 "where r.pid='$pid' and r.eid='$eid'");
Chad Barb's avatar
Chad Barb committed
981

982
983
984
985
986
987
988
989
	while (my ($node,$allocstate) = $db_result->fetchrow_array) {
	    #
	    # If the node is INIT_CLEAN, leave it alone. It will still get
	    # rebooted, but will not falsely be tagged as dirty. This is
	    # important for vnodes too, where INIT_CLEAN indicates the vnode
	    # does not even exist yet (plab nodes).
	    #
	    if ($allocstate ne TBDB_ALLOCSTATE_RES_INIT_CLEAN()) {
990
		TBSetNodeAllocState($node,
991
				    ($needreboot ?
992
993
				     TBDB_ALLOCSTATE_RES_INIT_DIRTY() :
				     TBDB_ALLOCSTATE_RES_RECONFIG()));
994
	    }
Chad Barb's avatar
Chad Barb committed
995
996
997
	}
    }

998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
    #
    # Let's run gentopofile again, so we get ltmap right. This will come out
    # later, most likely.
    #
    TBDebugTimeStamp("gentopofile started");
    print "Generating ltmap (again) ...\n";

    if (system("gentopofile $pid $eid")) {
        tberror("gentopofile failed!");
	return 1;
    }
    TBDebugTimeStamp("gentopofile finished");

1011
    # XXX Temporary hack, currently disabled by the "0 &&" in the condition.
1012
    if (0 && !$firewalled && !$elabinelab &&
1013
	($pid eq "testbed" || $pid eq "tbres")) {
1014
1015
1016
1017
1018
	DBQueryWarn("update experiments set ".
		    "    savedisk=1 where pid='$pid' and eid='$eid'");
    }


Chad Barb's avatar
   
Chad Barb committed
1019
1020
1021
1022
1023
    #
    # Since it'll take a while for the nodes to reboot, we'll start now, and
    # wait for the os_setup to finish, down below
    #
    print "Resetting OS and rebooting.\n";
1024
    TBDebugTimeStamp("launching os_setup");
Chad Barb's avatar
   
Chad Barb committed
1025
1026
1027
    if (!($os_setup_pid = fork())) { 
	exec("os_setup $pid $eid") or return 1;
    } elsif ($os_setup_pid == -1) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1028
	tberror "Fork failed.";
Chad Barb's avatar
   
Chad Barb committed
1029
1030
	return 1;
    }
Chad Barb's avatar
Chad Barb committed
1031

Chad Barb's avatar
   
Chad Barb committed
1032
    #
1033
1034
1035
1036
1037
    # XXX
    # Don't add any steps between here and the waitpid() call below
    # without verifying that 1) It's OK for nodes to come up before
    # the step has completed and 2) It's OK for the command to run in
    # parallel with os_setup (no DB dependencies, etc.)
Chad Barb's avatar
   
Chad Barb committed
1038
1039
1040
1041
1042
    #

    print "Setting up VLANs.\n";
    TBDebugTimeStamp("snmpit started");
    if (system("snmpit -t $pid $eid")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1043
	tberror {type=>'summary'}, "Failed to set up VLANs.";
Chad Barb's avatar
   
Chad Barb committed
1044
1045
1046
	return 1;
    }
    TBDebugTimeStamp("snmpit finished");
Chad Barb's avatar
Chad Barb committed
1047

Chad Barb's avatar
   
Chad Barb committed
1048
1049
1050
1051
    #
    # An error now means that the VLANS need to be cleaned up.
    #
    $cleanvlans = 1;
Chad Barb's avatar
Chad Barb committed
1052

Chad Barb's avatar
   
Chad Barb committed
1053
1054
    print "Setting up email lists.\n";
    TBDebugTimeStamp("genelists started");
1055
    if (system("genelists -t")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1056
	tbwarn "Failed to update email lists.";
Chad Barb's avatar
   
Chad Barb committed
1057
1058
1059
1060
1061
	#
	# This is a non-fatal error.
	# 
    }
    TBDebugTimeStamp("genelists finished");
Chad Barb's avatar
Chad Barb committed
1062

Chad Barb's avatar
   
Chad Barb committed
1063
1064
1065
1066
1067
1068
1069
1070
    #
    # Don't clear port counters on UPDATE.
    # (XXX should clear new nodes' port counters.)

    if ($type >= RETRY) {
	print "Clearing port counters.\n";
	TBDebugTimeStamp("portstats started");
	if (system("portstats -z -a -q $pid $eid")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1071
	    tbwarn "Failed to clear port counters.";
Chad Barb's avatar
   
Chad Barb committed
1072
1073
1074
1075
1076
	    #
	    # This is a non-fatal error.
	    # 
	}
	TBDebugTimeStamp("portstats finished");
Chad Barb's avatar
   
Chad Barb committed
1077
    }
Chad Barb's avatar
Chad Barb committed
1078

Chad Barb's avatar
   
Chad Barb committed
1079
1080
1081
1082
1083
1084
1085
    #
    # OK, let's see how that os_setup did
    #
    $kid = waitpid($os_setup_pid,0);
    if ($kid == $os_setup_pid) {
	undef $os_setup_pid; # Make sure doswapout() doesn't wait for it.
	if ($CHILD_ERROR) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1086
	    tberror "Failed to reset OS and reboot nodes.";
1087
1088
1089
1090
1091
1092
1093
1094
1095
	    #
	    # If there is a firewall involved, it could be that the
	    # firewall rules are preventing essential communication,
	    # so don't retry.
	    #
	    # XXX should only do this if the user has specified additional
	    # rules.  But right now, I may screw up too!
	    #
	    if ($firewalled) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1096
		tberror "Not retrying, ".
1097
		    "possibly an error in firewall setup or configuration.";
1098
1099
1100
		return 1;
	    }

Chad Barb's avatar
   
Chad Barb committed
1101
	    #
Chad Barb's avatar
   
Chad Barb committed
1102
1103
	    # Use returncode from os_setup process to
	    # set global $retry flag, indicating to caller
Chad Barb's avatar
   
Chad Barb committed
1104
1105
1106
	    # that it may be beneficial to attempt
	    # a doSwapin() again.
	    #
Chad Barb's avatar
   
Chad Barb committed
1107
            if (($CHILD_ERROR >> 8) == 1) {
Chad Barb's avatar
Chad Barb committed
1108
		return 3;
Chad Barb's avatar
   
Chad Barb committed
1109
1110
	    } else {
		print STDERR "Not retrying due to error type.\n";
Chad Barb's avatar
Chad Barb committed
1111
		return 1;
Chad Barb's avatar
   
Chad Barb committed
1112
	    }
Chad Barb's avatar
   
Chad Barb committed
1113
1114
1115
	}
    } else {
	undef $os_setup_pid;
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1116
	tberror "Error waiting for os_setup to finish.";
Chad Barb's avatar
   
Chad Barb committed
1117
1118
	return 1;
    }
Chad Barb's avatar
Chad Barb committed
1119

Chad Barb's avatar
   
Chad Barb committed
1120
1121
1122
1123
    #
    # Okay, start the event system now that we know all the nodes have
    # rebooted (os_setup is done). This only takes a moment (puts itself
    # in the background), so it's not enough of a delay to worry about.
Chad Barb's avatar
   
Chad Barb committed
1124
1125
    # Don't do this during an update, since we didn't kill the 
    # event system previously, so starting it again will fail!
Chad Barb's avatar
   
Chad Barb committed
1126
    # 
1127
    if (! ($DISABLE_EVENTS || $elabinelab)) {
1128
1129
1130
1131
1132
1133
1134
	#
	# For the robot testbed, start the location piper *before* the event
	# system.
	#
	if (-x $piper && ($type != UPDATE && $type != UPDATE_RECOVER)) {
	    print "Starting the location piper.\n";
	    if (system("$piper $pid $eid")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1135
		tberror "Failed to start the location piper.";
1136
1137
1138
1139
		return 1;
	    }
	}
	
1140
1141
	if ( $update_Eventsys_restart || 
	    ($type != UPDATE && $type != UPDATE_RECOVER) ) {
Chad Barb's avatar
   
Chad Barb committed
1142
1143
1144
	    print "Starting the event system.\n";
	    TBDebugTimeStamp("eventsys_control started");
	    if (system("eventsys_control start $pid $eid")) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1145
		tberror "Failed to start the event system.";
Chad Barb's avatar
   
Chad Barb committed
1146
1147
1148
		return 1;
	    }
	    TBDebugTimeStamp("eventsys_control finished");
Chad Barb's avatar
   
Chad Barb committed
1149
1150
	}
    }
Chad Barb's avatar
Chad Barb committed
1151

1152
1153
1154
1155
1156
1157
1158
    #
    # Do linktest if user requested it at swapin.
    #
    my $query_result =
	DBQueryFatal("select linktest_level,linktest_pid from experiments ".
		     "where pid='$pid' and eid='$eid'");
    my ($linktest_level,$linktest_pid) = $query_result->fetchrow_array();
1159
1160
1161
1162
1163
1164
1165

    # Temporary. Only studly users get to control linktest level.
    # Mere users always get at least level three.
    $query_result =
	DBQueryFatal("select stud from users where uid='$dbuid'");
    my ($studly) = $query_result->fetchrow_array();

1166
    if (! $studly && $MAINSITE) {
1167
1168
	$linktest_level = ($linktest_level <= 3 ? 3 : $linktest_level);
    }
1169
1170
1171
 
    if ($linktest_level && ($type == REAL || $type == UPDATE)) {
	if ($linktest_pid) {
Kevin Atkinson's avatar
   
Kevin Atkinson committed
1172
	    tbwarn "Linktest is already running! $linktest_pid";
1173
1174
1175
1176
1177
	}
	else {
	    #