assign_wrapper.in 135 KB
Newer Older
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2 3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2004 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5 6
# All rights reserved.
#
7 8
use English;
use Getopt::Std;
9 10
use POSIX qw(setsid);
use POSIX ":sys_wait_h";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11

12
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
13 14 15 16 17 18
# This function as the main assign loop.  It converts the virtual
# topology into a top input including LAN and delay translation.  It
# then snapshots the current testbed physical state and runs assign,
# looping a couple times if assign fails.  When assign successfully
# completes it will interpret the results.  Attempt to match any
# existing portmap entries and then update the delays and vlans table.
Chad Barb's avatar
Chad Barb committed
19
#
20 21 22 23
# XXX: Update does not work with widearea nodes.
#      Internally created nodes (jailhost,delay,sim) are not treated
#        consistently. Needs more thought.
#
24 25 26 27
# Return codes: We catch all errors with the END block below, and if
# anyone calls die() (exit value is 255) we add in the CANRECOVER bit.
# Failures in assign always cause the caller to stop retrying. 
#
28
# The CANRECOVER bit indicates 'recoverability' (no db or physical
29 30
# state was modified by the time the error occurred). This is relavent
# to only modify operations (update).
31 32 33
#
my $WRAPPER_SUCCESS		 = 0x00;
my $WRAPPER_FAILED		 = 0x01;	# Failed (Add other values)
34 35
my  $WRAPPER_FAILED_CANRECOVER   = 0x40;        # Can recover from update
my  $WRAPPER_FAILED_FATALLY      = 0x80;	# Do not use this.
36 37 38
# Set this once we modify DB state; forces no recover in fatal().
my $NoRecover = 0;
    
39 40
sub usage ()
{
41 42 43 44
    print STDERR "Usage: $0 [-v] [-u [-f] | -n] pid eid\n";
    print STDERR " -v   - Enables verbose output\n";
    print STDERR " -u   - Enables update mode\n";
    print STDERR " -f   - Fix current resources during update mode\n";
45 46
    print STDERR " -t   - Create the TOP file and then exit\n";
    print STDERR " -n   - Run assign, but do not reserve/modify resources.\n";
47 48
    print STDERR " -p   - Do a precheck for mapability on an empty testbed - ".
		 "implies -n\n";
49
    exit($WRAPPER_FAILED);
50
}
51
my $optlist  = "vutnfp";
52
my $verbose  = 0;
53
my $fixmode  = 0;
54 55 56
my $updating = 0;
my $toponly  = 0;
my $impotent = 0;
57
my $precheck = 0;
58
my $warnings = 0;
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77

#
# Configure variables
#
my $TBROOT	  = "@prefix@";
my $DELAYCAPACITY = @DELAYCAPACITY@;
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/sbin:$TBROOT/bin";

#
# Turn off line buffering on output
#
$| = 1;

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
78
use Node;
79

Chad Barb's avatar
Chad Barb committed
80
#
81 82 83 84 85
# assign_wrapper Settings
#
# Maximum delay in ms above which a delay node is needed.
# (Note that the DB represents delays as floating point numbers)
my $delaythresh = 2;
Chad Barb's avatar
Chad Barb committed
86

87 88
# Maximum number of times we run assign.
my $maxrun = 20;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
89

90 91 92
# Use the switch to delay when possible. Currentlythis only works for 10mbit
# links (actually, its turned off cause it does not work; auto handshake).
my $delaywithswitch = 0;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
93

94 95 96 97 98 99 100
#
# Some handy constants. Speed in Mbits/sec and Kbits/sec units.
#
# Its probably a good idea to leave portbw (current_speed) in Mbs, since
# those numbers are used at the switch and the interfaces, which really
# only think in Mbps.
#
101 102 103 104 105 106
my $S10Mbs   = 10;
my $S100Mbs  = 100;
my $S1000Mbs = 1000;
my $S10Kbs   = 10000;
my $S100Kbs  = 100000;
my $S1000Kbs = 1000000;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
107

108
#
109 110
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
111
#
112 113 114
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
115
}
116 117
if (@ARGV != 2) {
    usage();
Chad Barb's avatar
Chad Barb committed
118
}
119
if (defined($options{"v"})) {
Chad Barb's avatar
 
Chad Barb committed
120 121
    $verbose = 1;
}
122
if (defined($options{"u"})) {
Chad Barb's avatar
 
Chad Barb committed
123
    $updating = 1;
124
}
125 126 127 128 129
if (defined($options{"t"})) {
    $toponly = 1;
}
if (defined($options{"n"})) {
    $impotent = 1;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
130
}
131 132 133 134
if (defined($options{"p"})) {
    $impotent = 1;
    $precheck = 1;
}
135 136 137
if (defined($options{"f"})) {
    $fixmode = 1;
}
138 139 140
my $pid = $ARGV[0];
my $eid = $ARGV[1];
my $ptopfile = "$pid-$eid-$$.ptop";
141 142 143 144 145
# Since the topfile could change across
# swapins and modifies, it makes sense
# to store all of them. Helps in
# degugging.
my $topfile  = "$pid-$eid-$$.top";
146

Shashi Guruprasad's avatar
Shashi Guruprasad committed
147 148
TBDebugTimeStampsOn();

149 150 151 152
#
# All exits happen via this function!
#
sub fatal ($)
153
{
154
    my($message) = @_;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
155

156 157 158
    print STDERR "*** $0:\n".
	         "    $message\n";

159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
    # We next go to the END block below.
    exit($WRAPPER_FAILED);
}

#
# We want warnings to cause assign_wrapper to exit abnormally.
# We will come through here no matter how we exit though.
# 
$SIG{__WARN__} = sub { print STDERR $_[0];$warnings++; };

END {
    # Watch for getting here cause of a die()/exit() statement someplace.
    my $exitcode = $?;

    if ($exitcode && $exitcode != $WRAPPER_FAILED) {
	$exitcode = $WRAPPER_FAILED|$WRAPPER_FAILED_FATALLY;
    }
176
    
177 178 179 180 181 182 183 184 185 186 187 188 189
    if ($warnings > 0) {
	print STDERR "*** $0:\n".
	             "    $warnings warnings.\n";

	$exitcode |= $WRAPPER_FAILED;
    }

    # Set recover bit if we are going to fail.
    $exitcode = $exitcode|$WRAPPER_FAILED_CANRECOVER
	if ($exitcode && !$NoRecover);

    # And change the exitcode to be what we want it to be!
    $? = $exitcode;
190
}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
191

192 193
sub printdb ($)
{
Leigh B. Stoller's avatar
Leigh B. Stoller committed
194 195 196
    if ($verbose) {
	print $_[0];
    }
197
}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
198

199
print "assign_wrapper improved started\n";
200 201
TBDebugTimeStamp("assign_wrapper started");

Leigh B. Stoller's avatar
Leigh B. Stoller committed
202
#
203
# The main data structures:
Leigh B. Stoller's avatar
Leigh B. Stoller committed
204
#
205 206 207 208 209
# virt_nodes: The virtual nodes, indexed by vname. Each entry is a
# hash reference, initially of just the DB info, but possibly
# augmented as we proceed through assign.  Do not confuse these
# virtual nodes with the other virtual nodes! These are the ones from
# the actual topology, the virt_nodes table in the DB.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
210
#
211 212
my %virt_nodes = ();

Leigh B. Stoller's avatar
Leigh B. Stoller committed
213
#
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
# virt_lans: The equivalent of virt_nodes; the virt_lans table in the DB.
# Since there are multiple rows per lan (one for each node), this is a
# multilevel structure. The first slot is another hash, one for each node.
# The rest of the slots store other random things associated with the lan.
# So, looks something like:
#
#   %virt_lans = (link0 => {members    => member0 => { db row ref },
#                                         member1 => { db row ref }}
#                           mustdelay    => 0,
#                           emulated     => 0,
#                           uselinkdelay => 0,
#                           nobwshaping  => 0,
#                           useveth      => 0,
#                           trivok       => 0
#                          }
#                 link1 => ...
#                )
#
my %virt_lans = ();

#
# virt_vtypes: The virt_vtypes table in the DB, indexed by the vtype
# name (user chosen name).
#
my %virt_vtypes = ();

#
# node_types: The node_types table from the DB, indexed by the type name.
#
my %node_types = ();

Leigh B. Stoller's avatar
Leigh B. Stoller committed
245
# 
246
# interface_capabilities: We need this to find out the bandwidths of the devices
247 248
# we actually have on the testbed. Index by interface type name.
#
249
my %interface_capabilities = ();
Leigh B. Stoller's avatar
Leigh B. Stoller committed
250

251 252 253 254 255 256 257 258
#
# XXX Hack table for determining if a delay node is required. We store
# the native link speeds for each type,class. Indexed by type and class,
# the value is a list of link hardware speeds for which no delay node is
# required. This is an awful way to do this, and quite wrong.
#
my %node_type_linkbw = ();

259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
#
# phys_nodes: The equiv of virt_nodes above, except that these are pulled
# from the DB once the physical resources have been allocated. Indexed
# by physname, but there is a pointer from the virt_nodes table entry
# to the the corresponding physnode entry. 
# 
my %phys_nodes = ();

#
# More physical side data structures.
# v2pmap is indexed by virtual and contains the physical node.
my %v2pmap = ();
# p2vmap is indexed by physical and contains one or more virtual nodes.
my %p2vmap = ();
# plinks is indexed by virtual name and contains
#  (pnodeportA,pnodeportB) .  If one is a delay node it is always
#  the second.
my %plinks = ();
# virtnodes is the list of subnodes on physnodes.
my %virtnodes = ();
my %v2vmap = ();

#
# Support for experiment modify. We create v2p and v2v mappings of the
# current topology so we can figure out how its changed after assign
# runs. These correspond to v2pmap and v2vmap mentioned above.
# 
my %reserved_v2pmap  = ();
my %reserved_v2vmap  = ();
288 289 290 291
my %oldreservednodes = ();
# reserved_p2vmap is indexed by physical and contains one or more virtual
# nodes
my %reserved_p2vmap  = ();
292

Leigh B. Stoller's avatar
Leigh B. Stoller committed
293
#
294 295 296 297 298 299 300 301 302 303 304 305
# Experiment wide options. See below. They come from the experiments table.
# Defining these will override experiment table setting. 
#
# Set this when forcing linkdelays instead of delay nodes. Set in the NS
# file with a tb-compat directive. The force directive says to set up a
# link delay, even when no delay would otherwise be inserted.
# usewatunnels is also set in the NS, and can be used to turn them off. 
# The multiplex_factor is to override node_types table for virtnode.
my $uselinkdelays;
my $forcelinkdelays;
my $usewatunnels;
my $multiplex_factor;
306
my $experiment_idx;
307
my $useprepass;
308

309 310 311 312
# For admission control. Not well defined yet.
my $cpu_usage;
my $mem_usage;

313 314 315 316
# Allow override of jail/delay osids.
my $jail_osid;
my $delay_osid;

317 318 319 320 321 322
# Flag that tells us whether to fix-node current
# resources or not during a swap modify. This is
# useful when vnode weights are tweaked by the experimenter
# before doing a swapmod
my $fix_current_resources;

323
######################################################################
Leigh B. Stoller's avatar
Leigh B. Stoller committed
324

325 326 327 328 329
# ips is indexed by node:port and contains the IP address for the port.
my %ips	      = ();

# memberof is indexed by node:port and holds the lan it is a member of.
my %memberof = ();
330 331

# delaylinks stores the actual link delay info, converted from the
332 333 334 335 336
# virt delay params above. It is indexed by link name and contains a
# [delay,bw,loss] array for each direction. The final member of the
# array is an indicator of whether the info is there strictly for a 
# trivial link (linkdelay inserted only when assign makes link trivial).
my %delaylinks     = ();
337 338 339 340 341 342 343 344 345 346 347

# delaynodes stores the names of delaynodes that we create on the
# fly using delayid. This is useful for doing isdelay? tests.
my %delaynodes = ();
my $delayid    = 0;

# nodedelays and linkdelays are the final (!) representation. Indexed by
# integer id, they store the physical node info and the delay info. 
my %nodedelays = ();
my %linkdelays = ();

348 349 350 351
# Virtual nodes that the user has requested be "fixed" to a specific
# physical node.
my %fixed_nodes     = ();

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
# portbw is indexed by virtual nodeport and contains the bandwidth of
# that port. Note that port bandwidth in the interfaces table is left
# in Mbps units for now. Thats inconsistent, I know. For LANs with
# other bandwidths the port speed will be 100 and a delay node will be
# inserted.
my %portbw = ();

# lannodes is indexed by physical name and is the set of fake lan nodes.
# lan nodes are named lan/<virtual lan>. We need to know these so that
# when they come back from assign, we can ignore them.
my %lannodes = ();

# Node estimates and counts. Avoid running assign if there is no way to
# satisfy the estimates for physical nodes.
my $minimum_nodes;
my $maximum_nodes;
368 369
my $reserved_pcount  = 0;
my $reserved_vcount  = 0;
370
my $reserved_simcount= 0;
371 372 373 374
my $remotecount      = 0;
my $virtcount        = 0;
my $plabcount        = 0;
my $needwanassign    = 0;
375
my $simcount         = 0;
376

377 378 379 380 381 382 383 384 385 386 387 388
#
# This is for stats gathering. It might duplicate other stuff, but
# thats okay.
#
my %expt_stats = (# pnodes include jailnodes and delaynodes.
		  # We let the wrapper determine pnodes once the
		  # experiment is fully swapped in so that the record
		  # is not "committed" until successful swapin.
		  jailnodes   => 0,
		  vnodes      => 0,
                  # vnodes include wanodes.
		  wanodes     => 0,
389 390
		  # wanodes includes plabnodes.
		  plabnodes   => 0,
391 392 393 394 395 396
		  simnodes    => 0,
		  delaynodes  => 0,
		  linkdelays  => 0,
		  links       => 0,
		  walinks     => 0,
		  lans        => 0,
397
		  wirelesslans => 0,
398 399 400 401 402 403 404
		  shapedlinks => 0,
		  shapedlans  => 0,
		  minlinks    => 100000,
		  # includes emulated links. Maybe thats wrong.
		  maxlinks    => 0,
);

405 406 407
my $simhost_id     = 0;
my %pnode2simhostid;
my %simhostid2pnode;
Shashi Guruprasad's avatar
Shashi Guruprasad committed
408

409
# Counters for generating IDs.
410
my $virtnode_id  = 0;
411
my $veth_id      = 0;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
412

413 414 415 416 417 418 419 420 421 422
#
# Every vnode on a pnode gets its own routing
# table, thus an rtabid. In the case of simulated
# nodes, we need rtabids only for border nodes
# i.e. nodes that have links going out of the pnode.
# Either way, new rtabids are requested via getrtabid(<vnode>,<pnode>)
# when the interfaces table is updated or new veth_interfaces
# are inserted. This hash maintains the rtabids per vnode
my %vnode2rtabid = ();

423 424
######################################################################
# Step 1 - Setup virtual topology
425
#
426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
# Here we need to read the virtual topology in from the virt_nodes
# and virt_lans table.  We then need to add delay and lan nodes as
# necessary.
#
# Conversion details:
#   Let L be a LAN with N members.
#   If N == 2 
#      Let N1 be node 1
#      Let N2 be node 2
#      If L is delayed
#         Generate delay node D
#         Link N1 to D
#         Link N2 to D
#      Else
#         Link N1 to N2
#   Else
#      Generate lan node A
#      If L is delayed
#        Foreach node N in L
#           Generate delay node DN
#           Link A to DN
#           Link N to DN
#      Else
#        Foreach node N in L
#           Link N to A
#
# Delay node names:
#  delay nodes are named tbdelayXX N > 2
#   and tbsdelayXX for N == 2.
#
########################################################################
Chad Barb's avatar
 
Chad Barb committed
457

458 459
printdb "Generating TOP file.\n";
TBDebugTimeStamp("TOP started");
460

461 462 463 464 465 466
#
# vtypes are a funny mix beteween physical and virtual state, so we have to
# load them before getting the PhysInfo.
#
LoadVirtTypes();

467 468 469 470 471
#
# Load phys info. Interface types, node types, etc. Its any physical stuff
# we need.
#
LoadPhysInfo();
472

473 474 475 476
#
# Load the Experiment info and virt topology.
#
LoadExperiment();
477

478 479 480 481
#
# If updating, load current experiment resources. We have to be careful
# of how this is merged in with the (new) desired topology. See below.
#
482 483 484 485 486
if ($updating) {
    LoadCurrent();
    print STDERR "Resetting DB before updating.\n";
    TBExptRemovePhysicalState( $pid, $eid );
}
Chad Barb's avatar
 
Chad Barb committed
487

488 489 490 491
#
# Check Max Concurrent for OSID violations.
#
CheckMaxConcurrent();
492

493 494 495 496
#
# Create the TOP file.
#
CreateTopFile();
497

498 499 500 501 502 503
TBDebugTimeStamp("TOP finished");

# Stop here ...
if ($toponly) {
    print "Stopping after creating the TOP file, as directed.\n";
    exit(0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
504
}
505

506

507 508 509 510 511 512 513 514 515
######################################################################
# Step 2 - Assign Loop
# 
# Here we loop up to maxrun times.  In each loop we snapshot the
# current testbed state into a ptop file.  We then run assign.  If
# assign succeeds we attempt to reserve the resources.  If that works
# we're done with step 2 otherwise we loop again.
#
#######################################################################
Leigh B. Stoller's avatar
Leigh B. Stoller committed
516

517 518
my $currentrun = 1;
my $canceled   = 0;
519 520

# XXX plab hack - only run assign once on plab topologies, since they're easy
521
# to map and the physical topology does not change frequently.
522
if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
523
    $maxrun = 2;
524
}
525

526 527
TBDebugTimeStamp("assign_loop started");
while (1) {
528
    # Check cancel flag before continuing. 
529
    TBGetCancelFlag($pid, $eid, \$canceled);
530
    fatal("Cancel flag set; aborting assign run!")
531 532
	if ($canceled);

533
    print "Assign Run $currentrun\n";
534

535
    #
536 537 538
    # RunAssign returns  0 if successful.
    #           returns -1 if failure, but assign says to stop trying.
    #           returns -1 if failure, but assign says to try again.
539
    #
540
    my $retval = RunAssign();
541 542 543 544

    # Success!
    last
	if ($retval == 0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
545

546
    if ($currentrun >= $maxrun) {
547
	fatal("Reached run limit. Giving up.");
548 549
    }

550 551 552 553
    if ($retval < 0) {
	#
	# Failure in assign.
	#
554
	fatal("Unretriable error. Giving up.");
555
    }
556
    
557 558 559 560 561 562 563 564 565
    print "Waiting 5 seconds and trying again...\n";
    sleep(5);
    $currentrun++;
}
TBDebugTimeStamp("assign_loop finished");

#
# Run assign once.
# 
566
sub RunAssign ()
567 568
{
    # Clear globals for each run.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
569 570
    undef %v2pmap;
    undef %p2vmap;
571
    undef %v2vmap;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
572
    undef %plinks;
573
    undef %virtnodes;
574 575 576

    my %toreserve = ();
    my %subnodes  = ();
Shashi Guruprasad's avatar
Shashi Guruprasad committed
577
    
578
    TBDebugTimeStamp("ptopgen started");
579
    # Snapshot physical resources.
Chad Barb's avatar
 
Chad Barb committed
580 581 582 583
    #
    # if updating (-u), include any resources that may already be
    # allocated to experiment in the PTOP results.
    #
584 585
    my $ptopargs = "-p $pid ";
    $ptopargs   .= "-e $eid "
586 587 588
	if ($updating);
    $ptopargs   .= "-m $multiplex_factor "
	if (defined($multiplex_factor));
589 590
    $ptopargs   .= "-v "
	if ($virtcount);
591 592
    $ptopargs   .= "-r "
	if ($remotecount);
593 594
    $ptopargs   .= "-S "
	if ($simcount);
595 596
    $ptopargs	.= "-a "
    	if ($precheck);
597
    system("ptopgen $ptopargs > $ptopfile");
598
    TBDebugTimeStamp("ptopgen finished");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
599

600
    TBDebugTimeStamp("assign started");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
601
    # Run assign
Leigh B. Stoller's avatar
Leigh B. Stoller committed
602
    my $cmdargs = "$ptopfile $topfile";
603
    $cmdargs = "-uPod -c .75 $cmdargs"
604
	if ($virtcount || $simcount);
605 606
    $cmdargs = "-n $cmdargs"
    	if ($precheck);
607 608 609 610 611

    my $cmd;

    # If doing an experiment with virtnodes, use the prepass wrapper for assign
    # Turned off for now, because it needs some work.
612 613 614 615 616 617 618 619
    if ($useprepass) {
    	$cmd = "assign_prepass";
    	$cmdargs = "-m $multiplex_factor $cmdargs"
    	    if ($multiplex_factor);
    } else {
    	$cmd = "assign";
    }

620
    print "$cmd $cmdargs\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
621

622 623 624 625 626 627 628 629 630 631 632 633 634 635
    #
    # Fork a child to run assign. Parent spins watching the cancel flag
    # and waiting for assign to finish.
    #
    if (my $childpid = fork()) {
	while (1) {
	    sleep(2);

	    if (waitpid($childpid, &WNOHANG) == $childpid) {
		$assignexitcode = $? >> 8;
		last;
	    }

	    # Check cancel flag.
636
	    TBGetCancelFlag($pid, $eid, \$canceled);
637 638 639 640
	    if ($canceled) {
		if ((my $pgrp = getpgrp($childpid)) > 0) {
		    kill('TERM', -$pgrp);
		    waitpid($childpid, 0);
641 642 643

		    print "Cancel flag set; aborting assign run!\n";
		    return -1;
644 645 646 647 648 649 650 651 652 653 654
		}
		# Loop again to reap child above before exit.
	    }
	}
    }
    else {
	#
	# Change our session so the child can get a killpg without killing
	# the parent. 
	#
        POSIX::setsid();
655
	exec("nice $cmd $cmdargs > assign.log");
656 657 658
	die("*** $0:\n".
	    "    Could not start assign!\n");
    }
Chad Barb's avatar
Chad Barb committed
659

660
    # Check cancel flag before continuing. 
661
    TBGetCancelFlag($pid, $eid, \$canceled);
662 663 664 665
    if ($canceled) {
	print("Cancel flag set; aborting assign run!\n");
	return -1;
    }
666 667 668 669 670

    # Saving up assign.log coz each swapin/modify is
    # different and it is nice to have every mapping
    # for debugging and archiving purposes
    system("/bin/cp assign.log assign-$pid-$eid-$$.log");
671 672 673 674
    if (!open(ASSIGNFP, "assign.log")) {
	print("Could not open assign logfile!\n");
	return -1;
    }
675

676 677 678 679 680 681 682 683 684 685
    printdb "Reading assign results.\n";

    #
    # We no longer care what assign has to say when it fails! Just
    # tell the caller whether we want to keep trying or not. We still
    # send some of the goo to the output stream so that Rob can quickly
    # deduce what what wrong.
    #
    if ($assignexitcode) {
	my $violations = 0;
686

687 688
	print "ASSIGN FAILED:\n";
	while (<ASSIGNFP> !~ /^[\w\s]*precheck:$/) {}
Chad Barb's avatar
Chad Barb committed
689 690
	while (<ASSIGNFP>) {
	    chop;
691 692
	    /^\w*\s*precheck:$/ && do {
		next;
Chad Barb's avatar
Chad Barb committed
693 694 695 696 697
	    };
	    /^With ([0-9]+) violations$/ && do {
		$violations = $1;
		last;
	    };
698
	    print $_ . "\n";
Chad Barb's avatar
Chad Barb committed
699
	}
700 701 702 703 704 705 706
	if ($violations) {
	    while (<ASSIGNFP> !~ /^Violations:/) {}
	    while (<ASSIGNFP>) {
		if (/^Nodes:/) {
		    last;
		}
		print "$_";
707 708 709
	    }
	}
	close(ASSIGNFP);
710
	return (($assignexitcode == 1) ? 1 : -1);
711
    }
712
    
713 714 715 716 717 718 719 720 721
    #
    # If we were doing the precheck, go ahead and exit now - there won't be
    # any useful information to parse out
    #
    if ($precheck) {
	print "Precheck succeeded.\n";
	return 0;
    }

722 723 724 725
    #
    # Assign success; parse results.
    # 
    # read nodes section
726 727 728 729 730 731 732
    while ((my $line = <ASSIGNFP>) !~ /^Nodes:/) {
	# find the 'BEST SCORE' line and print that out for informational
	# purposes
	if ($line =~ /BEST SCORE/) {
	    print $line;
	}
    }
733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
    printdb "Nodes:\n";
    while (<ASSIGNFP>) {
	chop;
	/^End Nodes$/ && last;
	@info = split;
	my ($virtual,$physical) = @info[0,1];

	# We don't care about LAN nodes anymore.
	if (defined($lannodes{$virtual})) {
	    next;
	}

	if (physnodeallocated($physical)) {
	    #
	    # Mark node as being reused.
	    #
	    # Look at virtual node being mapped to node;
	    # if it wasn't in the previous map, mark node for reboot.
	    #
	    if (physnodereuse($physical) eq "reboot") {
		# No changes once it goes into reboot.
		;
	    }
	    elsif (virtnodeisvirt($virtual)) {
757
		#
758 759 760 761 762 763 764 765 766 767 768 769 770 771
		# A new virt virtual node on an existing physical node
		# does not force the physnode to be rebooted; we can
		# set up a new virtnode on it without a reboot. If its
		# an existing virtual on the same physnode, then mark
		# both as reused; no need to reboot either. If the 
		# virtnode has moved here from someplace else, no
		# reboot of the physnode either, but obviously the
		# vnode will be released and a new one allocated.  What
		# we cannot determine is if its just a renamed node
		# (which would require a reboot of the the virtual
		# node). 
		# 
		if (!exists($reserved_v2pmap{$virtual})) {
		    physnodesetreuse($physical, "reused");
Chad Barb's avatar
 
Chad Barb committed
772
		}
773 774
		elsif ($reserved_v2pmap{$virtual} eq $physical) {
		    my $reserved = $reserved_v2vmap{$virtual};
775

776 777
		    physnodesetreuse($reserved, "reused");
		    physnodesetreuse($physical, "reused");
778 779
		}
		else {
780
		    physnodesetreuse($physical, "reused");
781 782 783
		}
	    }
	    else {
Chad Barb's avatar
 
Chad Barb committed
784
		#
785 786 787 788 789 790 791
		# If a new virtual node mapped to this physnode (maybe
		# even the luser changed the name of the node), or if an
		# existing virtual node moved to this physnode, must
		# reboot the physnode. Else, the physnode is being
		# reused as is, and no need to mess with it. If the
		# user requested reboot, that will be handled outside
		# of this script.
792
		#
793 794 795 796 797 798
		if (!exists($reserved_v2pmap{$virtual}) ||
		    $reserved_v2pmap{$virtual} ne $physical) {
		    physnodesetreuse($physical, "reboot");
		}
		else {
		    physnodesetreuse($physical, "reused");
799
		}
800
	    }
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
	}
	else {
	    #
	    # This is a new node; we'll have to reserve it. Note that
	    # we do not reserve a widearea physnode when a virtual node
	    # is mapped to it; they are special.
	    #
	    $toreserve{$physical} = 1
		if (!virtnodeisremote($virtual));
	}
	
	if (virtnodeisvirt($virtual)) {
	    #
	    # If mapping a virtual node, then record that, since we need
	    # to allocate the virtnodes on that physnode, later.
	    #
	    if (!defined($virtnodes{$physical})) {
		$virtnodes{$physical} = [];
Shashi Guruprasad's avatar
Shashi Guruprasad committed
819
	    }
820 821 822 823 824 825 826 827 828 829 830 831 832
	    push(@{$virtnodes{$physical}}, $virtual);
	}
	elsif (virtnodeissubnode($virtual)) {
	    #
	    # Need to allocate the parent to. Should be optional?
	    # Save away and deal with once we have all the results.
	    #
	    $subnodes{$virtual} = $physical;
	}
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
Leigh B. Stoller's avatar
Leigh B. Stoller committed
833
	}
834 835 836
	push(@{$p2vmap{$physical}}, $virtual);
	printdb "  $virtual $physical\n";
    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
837

838 839 840 841 842 843 844 845 846
    #
    # Process the subnodes. We have to allocate the parent at the same
    # time, lest it get sucked away for some other purpose by another
    # experiment. We might want to push this off into nalloc, but not
    # sure yet.
    #
    for my $virtual (keys(%subnodes)) {
	my $physical = $subnodes{$virtual};
	my $parent;
847

848
	TBPhysNodeID($physical, \$parent);
849

850
	printdb "  Subnode: $virtual $physical $parent\n";
851

852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898
	#
	# See if we already have it. Swapmod, retry, or perhaps
	# the parent could be named separately? Or maybe there are
	# several subnodes on the physnode?
	#
	next
	    if (exists($p2vmap{$parent}));

	# Make up a name and add to the list.
	my $newvname = newvname($parent, "phost");

	$v2pmap{$newvname} = $parent;
	$p2vmap{$parent} = [ $newvname ];
	$toreserve{$parent} = 1;
	printdb "  Adding subnode host: $newvname $parent\n";
    }

    # read Edges
    # By convention, in plinks, the delay node is always the second
    # entry.
    while (<ASSIGNFP> !~ /^Edges:/) { }
    printdb "Edges:\n";
    EDGEWHILE: while (<ASSIGNFP>) {
	/^End Edges$/ && last EDGEWHILE;
	@info = split;
	$line = $_;
	$_ = $info[1]; # type
        SWITCH1: {
	    /^intraswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^interswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,$#info];
		last SWITCH1;
	    };
	    /^direct$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^trivial$/ && do {
		# we don't have plinks for trivial links
		$vlink = $info[0];
		$plinks{$vlink} = [];
		next EDGEWHILE;
	    };
	    print "Found garbage: $line\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
899
	}
900 901 902 903 904 905
	$nodeportA = &getnodeport($rawA);
	$nodeportB = &getnodeport($rawB);
	$nodeportA =~ s/\//:/;
	$nodeportB =~ s/\//:/;
	$plinks{$vlink} = [$nodeportA,$nodeportB];
	printdb "  $vlink " . join(" ",@{$plinks{$vlink}}) . "\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
906 907
    }
    close(ASSIGNFP);
908
    TBDebugTimeStamp("assign finished");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
909 910

    # Reserve resources
911 912 913 914
    if ($impotent) {
	print "Skipping physical reservation, as directed.\n";
	return 0;
    }
Chad Barb's avatar
Chad Barb committed
915

916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
    # From here, we can't recover anymore, coz we move
    # previously reserved pnodes/vnodes to the holding reservation
    # and back. By doing this, we will avoid any UNIQUE key issues
    # when a virt_node in the topology moves from one pnode to another
    # from previous to new mapping.
    # Another reason to do this just before nalloc of a new toreserve
    # nodes is that, we can get into name clashes
    # For example, lets say the user called his node pc2 and it was
    # actually mapped to pc99 in the initial swapin. If this was a
    # swapmod where the user asked for another node node0 which got 
    # mapped to pc2. nalloc of pc2 will result in UNIQUE key problems
    # since there exists a reserved vname pc2 (virtual name). By 
    # having this operation of moving the nodes into 
    # OLDRESERVED_PID/OLDRESERVED_EID and back before a new nalloc,
    # avoid this UNIQUE key problem. Also note that simply updating
    # the vname to be the same as the node_id field also won't
    # work all the time i.e. in the example discussed above
    my $oldreserved_pid = OLDRESERVED_PID;	
    my $oldreserved_eid = OLDRESERVED_EID;
    if (scalar(keys %oldreservednodes)) {
	# We can't recover after this coz we are making changes to
	# the DB
	$NoRecover = 1;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back started");
	system("nfree -o $pid $eid " . join(" ", keys(%oldreservednodes)) );
	system("nalloc $pid $eid " . join(" ", keys(%oldreservednodes)) );
	my $exitval  = $? >> 8;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back finished");
	#
	# If nalloc failed with a fatal error, lets give it up. No retry.
	# 
	if ($exitval != 0) {
952 953
	    print("Failed to move back Old Reserved nodes back to reserved\n");
	    return -1;
954
	}
955

956 957 958 959 960 961 962
	# We need to move this back and forth the holding reservation only
	# once i.e. in the first call to RunAssign(). If it gets repeatedly
	# called coz only some pnode resources got nalloc'ed, we don't have
	# to do the above again.
	undef %oldreservednodes;
    }

963 964 965 966
    TBDebugTimeStamp("reserving started");
    system("nalloc -p $pid $eid " . join(" ", keys(%toreserve)));
    TBDebugTimeStamp("reserving finished");
    my $exitval  = $? >> 8;
Chad Barb's avatar
 
Chad Barb committed
967

968 969 970 971 972 973 974
    #
    # If nalloc failed with a fatal error, lets give it up. No retry.
    # 
    if ($exitval < 0) {
	print "Failed to reserve any nodes.\n";
	return -1; 
    }
Chad Barb's avatar
 
Chad Barb committed
975

976 977 978 979 980 981 982
    #
    # Okay, if nalloc got anything, we have to set the norecover bit,
    # since tbswap operates on the principle that any change in the DB
    # means no recover is possible. This can probably me dealt with by
    # deallocating any nodes we allocated in the wrapper before exiting.
    #
    $NoRecover = 1;
Chad Barb's avatar
 
Chad Barb committed
983

984 985 986 987 988 989 990 991 992 993 994 995 996 997
    #
    # Otherwise, all newly allocated nodes MUST go to the INIT_DIRTY
    # allocstate since the user now has control of them. If we eventually
    # fail, nodes not in RES_READY are deallocated (retry/modify). 
    #
    if ($exitval > 0) {
	my @reserved = ExpNodes($pid, $eid);

	# We got only some. Need to figure out which.
	print "Reserved some nodes ($exitval) we needed, but not all.\n";
	
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
Chad Barb's avatar
 
Chad Barb committed
998
	    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
999 1000
	}

1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
	#
	# We check to see if were were able to reserve all the fixed
	# nodes we needed. If we couldn't get the fixed list, then
	# this experiment is unlikely to map in the "near" future, so
	# give up now (no retry).
	#
	foreach my $node (values(%fixed_nodes)) {
	    if (! grep {$_ eq $node} @reserved) {
		printdb "  Could not allocate fixed node $node!\n";
		return -1;
	    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1012
	}
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
	#
	# Okay, must extend the fixed list with newly allocated nodes
	# so that we can recreate the top file, and try again with a
	# new set.
	#
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		foreach my $vname (@{$p2vmap{$node}}) {
		    $fixed_nodes{$vname} = $node;
		}
	    }
1024
	}
1025 1026 1027
	CreateTopFile();
	return 1;
    }
Chad Barb's avatar
Chad Barb committed
1028

1029 1030 1031 1032 1033 1034 1035 1036 1037
    #
    # We got all the nodes we wanted. All newly allocated nodes MUST
    # go to the INIT_DIRTY allocstate since the user now has control
    # of them.
    #
    print "Successfully reserved all physical nodes we needed.\n";
	
    foreach my $node (keys(%toreserve)) {
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1038
    }
Chad Barb's avatar
Chad Barb committed
1039

1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063
    #
    # Release phys and virt nodes no longer needed. They are marked
    # for teardown. They need to be freed by SOMEONE, currently the
    # wrapper (tbswap), since this only happens when in update mode
    # (swapmod).
    #
    foreach my $pnode (keys(%phys_nodes)) {
	my $reuse = physnodereuse($pnode);
		
	if ($reuse eq "unused") {
	    #
	    # Node was used in previous incarnation, but not any more.
	    #
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_TEARDOWN());
	}
	elsif ($reuse eq "reboot") {
	    #
	    # Node is being reused, but for a different purpose, so
	    # it should be rebooted.
	    #
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
	}
    }
    return 0;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1064
}
1065

1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
###########################################################################
# Step 2A
#
# We run the wanassigner to allocate remote nodes. We do this after cause
# it takes so long. We run it just once.
#
# wanassign does its own nalloc.
#
###########################################################################

1076
#
1077
# VIRTNODES HACK: Allocate the remote virtual nodes.
1078
#
1079
if ($needwanassign) {
1080
    my $success  = 0;
1081
    my %wanmap   = ();
1082

1083 1084
    print "Running 'wanassign -d $pid $eid'\n";
    open(WANFP,"wanassign -d $pid $eid 2>&1 | tee wanassign.log |") or
1085
	fatal("Failed to start wanassign: $!");
1086 1087 1088 1089 1090

    printdb "Reading wanassign results.\n";
    while (<WANFP>) {
	chop;
	if ($_ =~ /(\S+) mapsto (\S+)/) {
1091 1092
	    $wanmap{$1} = $2;
	    printdb "  $1 mapsto $2\n";
1093
	}
1094 1095 1096 1097 1098 1099
	if ($_ =~ /^Success/) {
	    $success = 1;
	}
	# Skip other output. Usually its debugging output.
    }
    close(WANFP) or
1100
	fatal("wanassign: " .
1101 1102
	      ($? ? "exited with status: $?." :
	            "error closing pipe: $!"));
1103

1104
    if (!$success) {
1105
	fatal("wanassign could not find a solution!");
1106
    }
1107 1108
    foreach my $virtual (keys(%wanmap)) {
	my $physical = $wanmap{$virtual};
1109

1110
	fatal("Improper mapping from wanassign: $virtual/$physical")
1111 1112 1113 1114 1115 1116 1117 1118
	    if (!virtnodeisvirt($virtual));
	
	#
	# If mapping a virtual node, then record that, since we need
	# to allocate the virtnodes on that physnode, later.
	#
	if (!defined($virtnodes{$physical})) {
	    $virtnodes{$physical} = [];
1119
	}
1120 1121 1122 1123 1124
	push(@{$virtnodes{$physical}}, $virtual);
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
1125
	}
1126
	push(@{$p2vmap{$physical}}, $virtual);
1127
    }