assign_wrapper.in 174 KB
Newer Older
Leigh Stoller's avatar
Leigh Stoller committed
1
#!/usr/bin/perl -w
Leigh Stoller's avatar
Leigh Stoller committed
2 3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2008 University of Utah and the Flux Group.
Leigh Stoller's avatar
Leigh Stoller committed
5 6
# All rights reserved.
#
7 8
use English;
use Getopt::Std;
9
use POSIX qw(setsid ceil);
10
use POSIX ":sys_wait_h";
Leigh Stoller's avatar
Leigh Stoller committed
11

12 13 14 15 16 17 18 19 20 21 22
#
# Function phototypes
#
sub RunAssign();
sub getbandwidth($$$);
sub InitPnode($$);
sub nextipportnum($);
sub AddVlan($$$;$);
sub UploadVlans();
sub CheckMaxConcurrent();
sub UploadStats();
23 24 25
sub NewVirtIface($$$;$);
sub AddVirtPatch($$);
sub PatchVirts();
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
sub newvname($$);
sub newvname_simhost($);
sub LoadPhysInfo();
sub interfacespeedmbps($$);
sub requires_delay($$$);
sub LoadVirtNodes();
sub LoadVirtNodeDesires();
sub LoadVirtNodeStartLoc();
sub LoadVirtLans();
sub LoadVirtTypes();
sub LoadExperiment();
sub CreateTopFile();
sub LoadPhysNode($);
sub LoadPhysNodeInterfaces($);
sub LoadPhysResources();
sub physnodenextrtabid($);
sub getrtabid($$);
sub array_diff($$);
sub LoadCurrent();
45
sub SetUpTracing($$$$$);
46
sub fatal(@);
47

48
#
Leigh Stoller's avatar
Leigh Stoller committed
49 50 51 52 53 54
# This function as the main assign loop.  It converts the virtual
# topology into a top input including LAN and delay translation.  It
# then snapshots the current testbed physical state and runs assign,
# looping a couple times if assign fails.  When assign successfully
# completes it will interpret the results.  Attempt to match any
# existing portmap entries and then update the delays and vlans table.
55
#
56 57 58 59
# XXX: Update does not work with widearea nodes.
#      Internally created nodes (jailhost,delay,sim) are not treated
#        consistently. Needs more thought.
#
60 61 62 63
# Return codes: We catch all errors with the END block below, and if
# anyone calls die() (exit value is 255) we add in the CANRECOVER bit.
# Failures in assign always cause the caller to stop retrying. 
#
64
# The CANRECOVER bit indicates 'recoverability' (no db or physical
65 66
# state was modified by the time the error occurred). This is relavent
# to only modify operations (update).
67 68 69
#
my $WRAPPER_SUCCESS		 = 0x00;
my $WRAPPER_FAILED		 = 0x01;	# Failed (Add other values)
70 71
my  $WRAPPER_FAILED_CANRECOVER   = 0x40;        # Can recover from update
my  $WRAPPER_FAILED_FATALLY      = 0x80;	# Do not use this.
72 73
# Set this once we modify DB state; forces no recover in fatal().
my $NoRecover = 0;
74

75 76
sub usage ()
{
77 78 79 80
    print STDERR "Usage: $0 [-v] [-u [-f] | -n] pid eid\n";
    print STDERR " -v   - Enables verbose output\n";
    print STDERR " -u   - Enables update mode\n";
    print STDERR " -f   - Fix current resources during update mode\n";
81 82
    print STDERR " -t   - Create the TOP file and then exit\n";
    print STDERR " -n   - Run assign, but do not reserve/modify resources.\n";
83 84
    print STDERR " -p   - Do a precheck for mapability on an empty testbed - ".
		 "implies -n\n";
85
    exit($WRAPPER_FAILED);
86
}
87
my $optlist  = "vutnfp";
88
my $verbose  = 0;
89
my $fixmode  = 0;
90 91 92
my $updating = 0;
my $toponly  = 0;
my $impotent = 0;
93
my $precheck = 0;
94
my $warnings = 0;
95 96 97 98 99

#
# Configure variables
#
my $TBROOT	  = "@prefix@";
100
my $DELAYCAPACITY = @DELAYCAPACITY@;	# Can be overridden by user!
101 102 103 104 105 106 107 108 109 110 111 112 113
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/sbin:$TBROOT/bin";

#
# Turn off line buffering on output
#
$| = 1;

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
114
use Experiment;
115
use Node;
116
use NodeType;
117
use Lan;
118
use libadminctrl;
119
use libtblog;
120
use libtblog qw(*SOUT *SERR);
121

122
#
123 124 125 126
# assign_wrapper Settings
#
# Maximum delay in ms above which a delay node is needed.
# (Note that the DB represents delays as floating point numbers)
127
my $delaythresh = @DELAYTHRESH@;
128

129
# Maximum number of times we run assign.
130
my $maxrun = 3;
Leigh Stoller's avatar
Leigh Stoller committed
131

132 133 134
# Use the switch to delay when possible. Currentlythis only works for 10mbit
# links (actually, its turned off cause it does not work; auto handshake).
my $delaywithswitch = 0;
Leigh Stoller's avatar
Leigh Stoller committed
135

136 137 138 139 140 141 142
#
# Some handy constants. Speed in Mbits/sec and Kbits/sec units.
#
# Its probably a good idea to leave portbw (current_speed) in Mbs, since
# those numbers are used at the switch and the interfaces, which really
# only think in Mbps.
#
143 144 145 146 147 148
my $S10Mbs   = 10;
my $S100Mbs  = 100;
my $S1000Mbs = 1000;
my $S10Kbs   = 10000;
my $S100Kbs  = 100000;
my $S1000Kbs = 1000000;
Leigh Stoller's avatar
Leigh Stoller committed
149

150
#
151 152
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
153
#
154 155 156
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
157
}
158 159
if (@ARGV != 2) {
    usage();
160
}
161
if (defined($options{"v"})) {
162
    $verbose++;
Chad Barb's avatar
Chad Barb committed
163
}
164
if (defined($options{"u"})) {
Chad Barb's avatar
Chad Barb committed
165
    $updating = 1;
166
}
167 168 169 170 171
if (defined($options{"t"})) {
    $toponly = 1;
}
if (defined($options{"n"})) {
    $impotent = 1;
Leigh Stoller's avatar
Leigh Stoller committed
172
}
173 174 175 176
if (defined($options{"p"})) {
    $impotent = 1;
    $precheck = 1;
}
177 178 179
if (defined($options{"f"})) {
    $fixmode = 1;
}
180 181 182
my $pid = $ARGV[0];
my $eid = $ARGV[1];
my $ptopfile = "$pid-$eid-$$.ptop";
183 184 185 186 187
# Since the topfile could change across
# swapins and modifies, it makes sense
# to store all of them. Helps in
# degugging.
my $topfile  = "$pid-$eid-$$.top";
188

189 190
TBDebugTimeStampsOn();

191 192 193 194
#
# We want warnings to cause assign_wrapper to exit abnormally.
# We will come through here no matter how we exit though.
# 
195 196 197 198
# FIXME: Is this still needed.  "warn" is only used once.  Also this
#  will cause perl internal warnings (such as "Use of uninitialized
#  value ..."  to cause assign_wrapper to fail. -- kevina
#
199
$SIG{__WARN__} = sub { tbwarn $_[0];$warnings++; };
200 201 202 203 204 205 206 207

END {
    # Watch for getting here cause of a die()/exit() statement someplace.
    my $exitcode = $?;

    if ($exitcode && $exitcode != $WRAPPER_FAILED) {
	$exitcode = $WRAPPER_FAILED|$WRAPPER_FAILED_FATALLY;
    }
208
    
209
    if ($warnings > 0) {
210
	tberror "$warnings warnings.\n";
211 212 213 214 215 216 217 218 219 220

	$exitcode |= $WRAPPER_FAILED;
    }

    # Set recover bit if we are going to fail.
    $exitcode = $exitcode|$WRAPPER_FAILED_CANRECOVER
	if ($exitcode && !$NoRecover);

    # And change the exitcode to be what we want it to be!
    $? = $exitcode;
221
}
Leigh Stoller's avatar
Leigh Stoller committed
222

223 224
sub printdb ($)
{
Leigh Stoller's avatar
Leigh Stoller committed
225 226 227
    if ($verbose) {
	print $_[0];
    }
228
}
Leigh Stoller's avatar
Leigh Stoller committed
229

230
print "assign_wrapper improved started\n";
231 232
TBDebugTimeStamp("assign_wrapper started");

Leigh Stoller's avatar
Leigh Stoller committed
233
#
234
# The main data structures:
Leigh Stoller's avatar
Leigh Stoller committed
235
#
236 237 238 239 240
# virt_nodes: The virtual nodes, indexed by vname. Each entry is a
# hash reference, initially of just the DB info, but possibly
# augmented as we proceed through assign.  Do not confuse these
# virtual nodes with the other virtual nodes! These are the ones from
# the actual topology, the virt_nodes table in the DB.
Leigh Stoller's avatar
Leigh Stoller committed
241
#
242 243
my %virt_nodes = ();

244 245 246 247 248 249 250 251 252 253
#
# Helper functions to get virt_node info. Because of internally created
# nodes (tbdelay, vhost, sim), we need a virtnodeisvnode() test to prevent
# entries getting stuck into $virt_nodes when calling, say, virtnodeisvirt.
# Should we create entries for these nodes instead?
#
sub virtnodeisvnode($)  { return exists($virt_nodes{$_[0]}); }
sub virtnodetype($)	{ return $virt_nodes{$_[0]}->{"type"}; }
sub virtnodename($)	{ return $virt_nodes{$_[0]}->{"vname"}; }
sub virtnodeelabrole($) { return $virt_nodes{$_[0]}->{"inner_elab_role"}; }
254
sub virtnodeplabrole($) { return $virt_nodes{$_[0]}->{"plab_role"}; }
255
sub virtnodeosname($)   { return $virt_nodes{$_[0]}->{"__osname"}; }
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
sub virtnodeosid($)     { return $virt_nodes{$_[0]}->{"__osid"}; }
sub virtnodesetosid($$) { return $virt_nodes{$_[0]}->{"__osid"} = $_[1]; }
sub virtnodeparent($)   { return $virt_nodes{$_[0]}->{"__parent"}; }
sub virtnodepnode($)    { return $virt_nodes{$_[0]}->{"__physnode"}; }
sub virtnodesetpnode($$){ return $virt_nodes{$_[0]}->{"__physnode"} = $_[1]; }
sub virtnodeusewan($)   { return $virt_nodes{$_[0]}->{"__usewanassign"}; }
sub virtnodesetusewan($){ return $virt_nodes{$_[0]}->{"__usewanassign"} = 1;}
sub virtnodesettings($) { return @{$virt_nodes{$_[0]}->{"__settings"}}; }
sub virtnodestartloc($) { return $virt_nodes{$_[0]}->{"__startloc"}; }
sub virtnodeisremote($) {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isremotenode"};
}
sub virtnodeisvirt($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isvirtnode"};
}
sub virtnodeissubnode($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__issubnode"};
}
sub virtnodeissim($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__issimnode"};
}
sub virtnodeisdynamic($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isdynamic"};
}
sub virtnodeisplabnode($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isplabnode"};
}
sub virtnodeisjailed($) {
    return virtnodeisvirt($_[0]) && physnodeisjailed(virtnodepnode($_[0]));
}
sub virtnodedesires     {
    my @desires;
    while (my ($desire, $weight) = each %{$virt_nodes{$_[0]}->{"__desires"}}) {
	push @desires, $desire . ":" . sprintf("%f",$weight);
    }
    return join(" ",@desires);
}
293 294 295 296 297 298
sub virtnodeneedslinkdelays($) {
    return $virt_nodes{$_[0]}->{"__needslinkdelays"};
}
sub virtnodesetneedslinkdelays($) {
    return $virt_nodes{$_[0]}->{"__needslinkdelays"} = 1;
}
299

Leigh Stoller's avatar
Leigh Stoller committed
300
#
301 302 303 304 305 306 307 308 309 310 311 312
# virt_lans: The equivalent of virt_nodes; the virt_lans table in the DB.
# Since there are multiple rows per lan (one for each node), this is a
# multilevel structure. The first slot is another hash, one for each node.
# The rest of the slots store other random things associated with the lan.
# So, looks something like:
#
#   %virt_lans = (link0 => {members    => member0 => { db row ref },
#                                         member1 => { db row ref }}
#                           mustdelay    => 0,
#                           emulated     => 0,
#                           uselinkdelay => 0,
#                           nobwshaping  => 0,
313
#                           encapstyle   => "default",
314 315 316 317 318 319 320
#                           trivok       => 0
#                          }
#                 link1 => ...
#                )
#
my %virt_lans = ();

321 322
sub virtlanexists($)	{ return exists($virt_lans{$_[0]}); }
sub virtlanname($)	{ return $virt_lans{$_[0]}->{"VNAME"}; }
323
sub virtlanidx($)	{ return $virt_lans{$_[0]}->{"IDX"}; }
324 325 326 327 328 329
sub virtlanmembers($)	{ return @{$virt_lans{$_[0]}->{"MEMBERLIST"}}; }
sub virtlanmustdelay($)	{ return $virt_lans{$_[0]}->{"MUSTDELAY"}; }
sub virtlanemulated($)	{ return $virt_lans{$_[0]}->{"EMULATED"}; }
sub virtlanlinkdelay($) { return $virt_lans{$_[0]}->{"USELINKDELAY"}; }
sub virtlannobwshape($)	{ return $virt_lans{$_[0]}->{"NOBWSHAPING"}; }
sub virtlantrivok($)	{ return $virt_lans{$_[0]}->{"TRIVIAL_OK"}; }
330
sub virtlanencapstyle($){ return $virt_lans{$_[0]}->{"ENCAPSTYLE"}; }
331 332 333
sub virtlantunnel($)	{ return $virt_lans{$_[0]}->{"TUNNEL"}; }
sub virtlandelayinfo($$){ return @{$virt_lans{$_[0]}->{"DELAYINFO"}->{$_[1]}};}
sub virtlanqueueinfo($$){ return @{$virt_lans{$_[0]}->{"QUEUEINFO"}->{$_[1]}};}
334
sub virtlantraceinfo($$){ return @{$virt_lans{$_[0]}->{"TRACEINFO"}->{$_[1]}};}
335 336 337 338
sub virtlannetmask($)	{ return $virt_lans{$_[0]}->{"MASK"}; }
sub virtlanwidearea($)  { return $virt_lans{$_[0]}->{"WIDEAREA"}; }
sub virtlanallsim($)    { return $virt_lans{$_[0]}->{"ALLSIM"}; }
sub virtlanprotocol($)  { return $virt_lans{$_[0]}->{"PROTOCOL"}; }
339 340 341 342 343 344 345 346 347 348
sub virtlandelayed($)   { return $virt_lans{$_[0]}->{"DELAYED"}; }
sub virtlansetdelayed($){ $virt_lans{$_[0]}->{"DELAYED"} = 1; }

sub virtlanmembershaped($$) {
    return $virt_lans{$_[0]}->{"SHAPEDMEMBERS"}->{$_[1]};
}
sub virtlansetmembershaped($$) {
    $virt_lans{$_[0]}->{"SHAPEDMEMBERS"}->{$_[1]} = 1;
    # Mark the lan as having shaped members.
}
349 350 351 352 353
sub virtlanAP($)        {
    return $virt_lans{$_[0]}->{"ACCESSPOINT"}
        if (defined($virt_lans{$_[0]}->{"ACCESSPOINT"}));
    return undef;
}
354 355 356 357
sub virtlanusevirtiface($) {
    my $encap = virtlanencapstyle($_[0]);
    return ($encap eq "veth" || $encap eq "veth-ne" || $encap eq "vlan");
}
358 359 360 361
sub virtlanusetuniface($) {
    my $encap = virtlanencapstyle($_[0]);
    return ($encap eq "gre" || $encap eq "egre" || $encap eq "vtun");
}
362 363 364 365 366
# We hardcode (for now) the set of LAN/link "protocols" that are wireless.
# Eventually, this should probably come from the database.
sub virtlanwireless($) {
    return (virtlanprotocol($_[0]) =~ /^(80211|flex900)/);
}
367 368 369 370
# Quick way to get to any fixed iface info for the lan and member.
sub virtlanfixediface($$) { 
    return $virt_lans{$_[0]}->{"MEMBERS"}->{$_[1]}->{"fixed_iface"};
}
371 372 373 374 375 376 377 378 379
# We don't actually store information about the protocol heirarchy in the
# database, so we use a simple conventions for now - <basetype>[-subtype]
sub protocolbasetype($) {
    if ($_[0] =~ /^([^-]+)-/) {
        return $1;
    } else {
        return $_[0];
    }
}
380

381 382 383 384 385 386 387 388 389 390 391
#
# virt_vtypes: The virt_vtypes table in the DB, indexed by the vtype
# name (user chosen name).
#
my %virt_vtypes = ();

#
# node_types: The node_types table from the DB, indexed by the type name.
#
my %node_types = ();

392
sub nodetypeistype($)   { return exists($node_types{$_[0]}); }
393 394 395 396 397 398 399 400 401 402 403
sub nodetypetype($)     { return $node_types{$_[0]}->type(); }
sub nodetypeclass($)    { return $node_types{$_[0]}->class(); }
sub nodedelayosid($)    { return $node_types{$_[0]}->delay_osid(); }
sub nodedefaultosid($)  { return $node_types{$_[0]}->default_osid(); }
sub nodetypeisremote($) { return $node_types{$_[0]}->isremotenode(); }
sub nodetypeisvirt($)   { return $node_types{$_[0]}->isvirtnode(); }
sub nodetypeisdynamic($){ return $node_types{$_[0]}->isdynamic(); }
sub nodetypeissub($)    { return $node_types{$_[0]}->issubnode(); }
sub nodetypeisplab($)   { return $node_types{$_[0]}->isplabdslice(); }
sub nodetypeissim($)    { return $node_types{$_[0]}->issimnode(); }
sub nodetypesimcap($)   { return $node_types{$_[0]}->simnode_capacity(); }
404 405 406 407 408 409 410
sub nodetypeisdedicatedremote($) {
    my $ded = $node_types{$_[0]}->GetAttribute("dedicated_widearea");
    if (defined($ded) && "$ded" eq "1") {
	return 1;
    }
    return 0;
}
411

412 413 414 415 416 417
#
# osids: Information from the os_info table from the DB, indexed by osid
#
my %osids = ();

sub osidpath($)         { return $osids{$_[0]}->{"path"}; }
418
sub osidos($)           { return $osids{$_[0]}->{"OS"}; }
419 420
sub osidhaspath($)      { my $path = osidpath($_[0]);
                          return (defined $path) && ($path ne "")};
421 422 423 424 425 426 427 428
sub osidnextosid($)     {
    my $retval = $osids{$_[0]}->{"nextosid"};
    
    if (!defined($retval)) {
	$retval = $_[0];
    }
    return $retval;
}
429

Leigh Stoller's avatar
Leigh Stoller committed
430
# 
431
# interface_capabilities: We need this to find out the bandwidths of the devices
432 433
# we actually have on the testbed. Index by interface type name.
#
434
my %interface_capabilities = ();
Leigh Stoller's avatar
Leigh Stoller committed
435

436 437 438 439 440 441 442 443
#
# XXX Hack table for determining if a delay node is required. We store
# the native link speeds for each type,class. Indexed by type and class,
# the value is a list of link hardware speeds for which no delay node is
# required. This is an awful way to do this, and quite wrong.
#
my %node_type_linkbw = ();

444 445 446 447 448 449 450 451
#
# phys_nodes: The equiv of virt_nodes above, except that these are pulled
# from the DB once the physical resources have been allocated. Indexed
# by physname, but there is a pointer from the virt_nodes table entry
# to the the corresponding physnode entry. 
# 
my %phys_nodes = ();

452
sub physnodeallocated($) { return exists($phys_nodes{$_[0]}); }
453 454 455 456 457 458
sub physnodeisvirtnode($){ return $phys_nodes{$_[0]}->{'p'}->isvirtnode();}
sub physnodeisjailed($)  { return $phys_nodes{$_[0]}->{'p'}->jailflag(); }
sub physnodeissubnode($) { return $phys_nodes{$_[0]}->{'p'}->issubnode(); }
sub physnodephysnode($)  { return $phys_nodes{$_[0]}->{'p'}->phys_nodeid(); }
sub physnodecontroliface($){return $phys_nodes{$_[0]}->{'p'}->control_iface();}
sub physnodetype($)      { return $phys_nodes{$_[0]}->{'p'}->type(); }
459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484
sub physnodesetvnode($$) { return $phys_nodes{$_[0]}->{"__vname"} = $_[1]; }
sub physnodevnode($$)    { return $phys_nodes{$_[0]}->{"__vname"}; }
sub physnodereuse($)     { return $phys_nodes{$_[0]}->{"__reuse"}; }
sub physnodesetreuse($$) { return $phys_nodes{$_[0]}->{"__reuse"} = $_[1]; }
sub physnodenextpipe($)  {
    my $pipeno = $phys_nodes{$_[0]}->{"__pipeno"} += 10;

    fatal("Too many ipfw pipes on node $pnode!")
	if ($pipeno >= 65535);
    
    return $pipeno;
}
sub physnodenextrtabid($)  {
    my $rtabid = $phys_nodes{$_[0]}->{"__rtabid"} += 1;
    return $rtabid;
}


#
# phys_node_interfaces: ???
#

sub physnodeinterface($$) { return $phys_node_interfaces{$_[0]}{$_[1]}; }
sub physinterfacetype($$) {
    return $phys_node_interfaces{$_[0]}{$_[1]}{"interface_type"};
}
485 486 487
sub physinterfacerole($$) {
    return $phys_node_interfaces{$_[0]}{$_[1]}{"role"};
}
488

489 490 491 492 493 494 495 496 497 498 499 500 501 502
#
# More physical side data structures.
# v2pmap is indexed by virtual and contains the physical node.
my %v2pmap = ();
# p2vmap is indexed by physical and contains one or more virtual nodes.
my %p2vmap = ();
# plinks is indexed by virtual name and contains
#  (pnodeportA,pnodeportB) .  If one is a delay node it is always
#  the second.
my %plinks = ();
# virtnodes is the list of subnodes on physnodes.
my %virtnodes = ();
my %v2vmap = ();

503 504 505 506
# for consing up vlan names.
my $vlanid = 0;
my %protovlans = ();

507 508 509
# Admission control counts
my %admission_control = ();

510 511 512 513 514 515 516
#
# Support for experiment modify. We create v2p and v2v mappings of the
# current topology so we can figure out how its changed after assign
# runs. These correspond to v2pmap and v2vmap mentioned above.
# 
my %reserved_v2pmap  = ();
my %reserved_v2vmap  = ();
517
my %oldreservednodes = ();
518 519
my %newreservednodes = ();
my $oldreservedclean = 0;
520 521 522
# reserved_p2vmap is indexed by physical and contains one or more virtual
# nodes
my %reserved_p2vmap  = ();
523

Leigh Stoller's avatar
Leigh Stoller committed
524
#
525 526 527 528 529 530 531 532 533 534 535 536
# Experiment wide options. See below. They come from the experiments table.
# Defining these will override experiment table setting. 
#
# Set this when forcing linkdelays instead of delay nodes. Set in the NS
# file with a tb-compat directive. The force directive says to set up a
# link delay, even when no delay would otherwise be inserted.
# usewatunnels is also set in the NS, and can be used to turn them off. 
# The multiplex_factor is to override node_types table for virtnode.
my $uselinkdelays;
my $forcelinkdelays;
my $usewatunnels;
my $multiplex_factor;
537
my $experiment_idx;
538
my $useprepass;
539
my $delaycap_override;
540
my $elabinelab = 0;
541

542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
#
# Notes on virtual interfaces.
#
# If a link is EMULATED (virtlanemulated) then it will have an "encapsulation
# style" (virtlanencapstyle).  That value is one of:
#
# "alias"	emulation is done with IP aliases on a physical interface;
#		there is no packet encapsulation.
#		Works on FreeBSD or Linux.
#
# "veth"	emulation is done with "veth" virtual devices;
#		encapsulation is standard veth style.
#		Works on FreeBSD only.
#
# "veth-ne"	emulation is done with "veth" devices;
#		no encapsulation is used (uses made-up MAC addresses instead)
#		Works on FreeBSD only.
#
# "vlan"	emulation is done with "vlan" devices;
#		uses 802.1q VLAN tagging.
#		Works on FreeBSD or Linux.
#
# "default"	emulation style was not explicitly set by the users;
#		encap style depends on context ("veth" for vnode, "alias" ow)
#
# Note that the encapsulation style for "default" depends on the node type.
# For non-vnodes the default is "alias".  For vnodes the default is "veth"
# since they must always have some pseudo-device on which to hang a route
# table ID; i.e., we cannot just do IP aliasing.
#
# Encapsulation can be specified per link.  The default link encapsulation
# can also be specified by a per-experiment setting.  At the moment, only
# the latter (global) is implemented.
#
my $encapstyle;

578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604
#
# Determine if a given link/lan uses linkdelays
#
sub virtlan_use_linkdelay($$) {
    my ($virtlan,$shaped) = @_;
    #
    # Here are the conditions for using linkdelays on each link/lan
    # Note: $forcelinkdelays and $uselinkdelays are global
    #
    if ( # linkdelays can be globally forced
         $forcelinkdelays ||
         # We use linkdelays on emulated virtlans
         virtlanemulated($virtlan) ||
         # The user requested linkdelays, and this is a virtlan that gets
         # shaped (note - in this case, non-shaped virtlans don't get
         # linkdelays)
         ($uselinkdelays && $shaped) ||
         # The user forced linkdelays for this specific virtlan
         virtlanlinkdelay($virtlan)) {
        # Yep, use linkdelays
        return 1;
    } else {
        # No - either won't be delayed at all, or we'll use a delay node
        return 0;
    }
}

605 606 607 608
# For admission control. Not well defined yet.
my $cpu_usage;
my $mem_usage;

609 610 611 612
# XXX woeful NSE hack
my $sim_osid;
my $sim_osname = "FBSD-NSE";

613 614 615
# Allow override of jail/delay osids.
my $jail_osid;
my $delay_osid;
616
my $delay_osid_desire;
617

618 619 620 621 622 623
# Flag that tells us whether to fix-node current
# resources or not during a swap modify. This is
# useful when vnode weights are tweaked by the experimenter
# before doing a swapmod
my $fix_current_resources;

624
######################################################################
Leigh Stoller's avatar
Leigh Stoller committed
625

626 627 628 629 630
# ips is indexed by node:port and contains the IP address for the port.
my %ips	      = ();

# memberof is indexed by node:port and holds the lan it is a member of.
my %memberof = ();
631 632

# delaylinks stores the actual link delay info, converted from the
633 634 635 636 637
# virt delay params above. It is indexed by link name and contains a
# [delay,bw,loss] array for each direction. The final member of the
# array is an indicator of whether the info is there strictly for a 
# trivial link (linkdelay inserted only when assign makes link trivial).
my %delaylinks     = ();
638 639 640 641 642 643 644 645 646 647 648

# delaynodes stores the names of delaynodes that we create on the
# fly using delayid. This is useful for doing isdelay? tests.
my %delaynodes = ();
my $delayid    = 0;

# nodedelays and linkdelays are the final (!) representation. Indexed by
# integer id, they store the physical node info and the delay info. 
my %nodedelays = ();
my %linkdelays = ();

649 650 651 652
# Virtual nodes that the user has requested be "fixed" to a specific
# physical node.
my %fixed_nodes     = ();

653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
# portbw is indexed by virtual nodeport and contains the bandwidth of
# that port. Note that port bandwidth in the interfaces table is left
# in Mbps units for now. Thats inconsistent, I know. For LANs with
# other bandwidths the port speed will be 100 and a delay node will be
# inserted.
my %portbw = ();

# lannodes is indexed by physical name and is the set of fake lan nodes.
# lan nodes are named lan/<virtual lan>. We need to know these so that
# when they come back from assign, we can ignore them.
my %lannodes = ();

# Node estimates and counts. Avoid running assign if there is no way to
# satisfy the estimates for physical nodes.
my $minimum_nodes;
my $maximum_nodes;
669 670
my $reserved_pcount  = 0;
my $reserved_vcount  = 0;
671
my $reserved_simcount= 0;
672 673 674 675
my $remotecount      = 0;
my $virtcount        = 0;
my $plabcount        = 0;
my $needwanassign    = 0;
676
my $simcount         = 0;
677

678 679 680
# for checks made during topfile creation.
my $toperrors        = 0;

681 682 683 684 685 686 687 688 689 690 691 692
#
# This is for stats gathering. It might duplicate other stuff, but
# thats okay.
#
my %expt_stats = (# pnodes include jailnodes and delaynodes.
		  # We let the wrapper determine pnodes once the
		  # experiment is fully swapped in so that the record
		  # is not "committed" until successful swapin.
		  jailnodes   => 0,
		  vnodes      => 0,
                  # vnodes include wanodes.
		  wanodes     => 0,
693 694
		  # wanodes includes plabnodes.
		  plabnodes   => 0,
695 696 697 698 699 700
		  simnodes    => 0,
		  delaynodes  => 0,
		  linkdelays  => 0,
		  links       => 0,
		  walinks     => 0,
		  lans        => 0,
701
		  wirelesslans => 0,
702 703 704 705 706 707 708
		  shapedlinks => 0,
		  shapedlans  => 0,
		  minlinks    => 100000,
		  # includes emulated links. Maybe thats wrong.
		  maxlinks    => 0,
);

709 710 711
my $simhost_id     = 0;
my %pnode2simhostid;
my %simhostid2pnode;
Shashi Guruprasad's avatar
Shashi Guruprasad committed
712

713
# Counters for generating IDs.
714
my $virtnode_id  = 0;
715
my $veth_id      = 0;
Leigh Stoller's avatar
Leigh Stoller committed
716

717 718 719 720 721 722 723 724 725 726
#
# Every vnode on a pnode gets its own routing
# table, thus an rtabid. In the case of simulated
# nodes, we need rtabids only for border nodes
# i.e. nodes that have links going out of the pnode.
# Either way, new rtabids are requested via getrtabid(<vnode>,<pnode>)
# when the interfaces table is updated or new veth_interfaces
# are inserted. This hash maintains the rtabids per vnode
my %vnode2rtabid = ();

727 728
######################################################################
# Step 1 - Setup virtual topology
729
#
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760
# Here we need to read the virtual topology in from the virt_nodes
# and virt_lans table.  We then need to add delay and lan nodes as
# necessary.
#
# Conversion details:
#   Let L be a LAN with N members.
#   If N == 2 
#      Let N1 be node 1
#      Let N2 be node 2
#      If L is delayed
#         Generate delay node D
#         Link N1 to D
#         Link N2 to D
#      Else
#         Link N1 to N2
#   Else
#      Generate lan node A
#      If L is delayed
#        Foreach node N in L
#           Generate delay node DN
#           Link A to DN
#           Link N to DN
#      Else
#        Foreach node N in L
#           Link N to A
#
# Delay node names:
#  delay nodes are named tbdelayXX N > 2
#   and tbsdelayXX for N == 2.
#
########################################################################
761

762 763 764 765 766 767 768

# Slowly convert to using Experiment module.
my $experiment = Experiment->Lookup($pid, $eid);
if (!defined($experiment)) {
    fatal("Could not lookup experiment object!")
}

769 770
printdb "Generating TOP file.\n";
TBDebugTimeStamp("TOP started");
771

772 773 774 775 776
#
# Do admission control test right away.
#
if (!$toponly) {
    fatal("Failed admission control checks!")
777
	if (!TBAdmissionControlCheck(undef, $experiment, \%admission_control));
778 779
}

780 781 782 783 784 785
#
# vtypes are a funny mix beteween physical and virtual state, so we have to
# load them before getting the PhysInfo.
#
LoadVirtTypes();

786 787 788 789 790
#
# Load phys info. Interface types, node types, etc. Its any physical stuff
# we need.
#
LoadPhysInfo();
791

792 793 794 795
#
# Load the Experiment info and virt topology.
#
LoadExperiment();
796

797 798 799 800
#
# If updating, load current experiment resources. We have to be careful
# of how this is merged in with the (new) desired topology. See below.
#
801 802 803
if ($updating) {
    LoadCurrent();
    print STDERR "Resetting DB before updating.\n";
804
    $experiment->RemovePhysicalState();
805
}
Chad Barb's avatar
Chad Barb committed
806

807 808 809 810
#
# Check Max Concurrent for OSID violations.
#
CheckMaxConcurrent();
811

812 813 814 815
#
# Create the TOP file.
#
CreateTopFile();
816

817 818
TBDebugTimeStamp("TOP finished");

819 820 821 822
# Stop if creating the top file generated errors.
exit($WRAPPER_FAILED)
    if ($toperrors);

823 824 825 826
# Stop here ...
if ($toponly) {
    print "Stopping after creating the TOP file, as directed.\n";
    exit(0);
Leigh Stoller's avatar
Leigh Stoller committed
827
}
828

829

830 831 832 833 834 835 836 837 838
######################################################################
# Step 2 - Assign Loop
# 
# Here we loop up to maxrun times.  In each loop we snapshot the
# current testbed state into a ptop file.  We then run assign.  If
# assign succeeds we attempt to reserve the resources.  If that works
# we're done with step 2 otherwise we loop again.
#
#######################################################################
Leigh Stoller's avatar
Leigh Stoller committed
839

840 841
my $currentrun = 1;
my $canceled   = 0;
842
my $tried_precheck = 0;
843 844

# XXX plab hack - only run assign once on plab topologies, since they're easy
845
# to map and the physical topology does not change frequently.
846
if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
847
    $maxrun = 2;
848
}
849

850 851
TBDebugTimeStamp("assign_loop started");
while (1) {
852
    print "Assign Run $currentrun\n";
853

854
    #
855 856
    # RunAssign returns  0 if successful.
    #           returns -1 if failure, but assign says to stop trying.
857 858
    #           returns  1 if failure, but assign says to try again.
    #           returns  2 if we made some forward progress.
859
    #
860
    my $retval = RunAssign();
861 862 863 864

    # Success!
    last
	if ($retval == 0);
Leigh Stoller's avatar
Leigh Stoller committed
865

866
    if (!$precheck && !$tried_precheck) {
867
	my $ptopfile0 = $ptopfile;
868 869
	my $impotent0 = $impotent;
	print("Trying assign on an empty testbed.\n");
870
	$ptopfile = "$pid-$eid-$$-empty.ptop";
871 872 873 874
	$impotent = 1;
	$precheck = 1;
	my $retval = RunAssign();
	if ($retval != 0) {
875 876
	    fatal({type=>'extra', cause=>'user', severity=>SEV_ERROR,
	           error=>['infeasible_resource_assignment']}, 
877 878 879 880 881 882 883
		  "This experiment cannot be instantiated on this ".
                  "testbed. You have most likely asked for hardware ".
                  "this testbed does not have, such as nodes of a type ".
                  "it does not contain, or nodes with too many network ".
                  "interfaces.  You will need to modify this experiment ".
                  "before it can be swapped in - re-submitting the ".
                  "experiment as-is will always result in failure.");
884 885 886 887
	}
	print("Assign succeeded on an empty testbed.\n");
	$precheck = 0;
	$impotent = $impotent0;
888
	$ptopfile = $ptopfile0;
889 890 891 892
	$tried_precheck = 1;
    }


893
    if ($currentrun >= $maxrun && $retval != 2) {
894 895 896
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['reached_assign_run_limit']},
	      "Reached run limit. Giving up.");
897 898
    }

899 900 901 902
    if ($retval < 0) {
	#
	# Failure in assign.
	#
903 904 905
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['unretriable_assign_error']},
	      "Unretriable error. Giving up.");
906
    }
907
    
908 909 910 911 912 913 914 915 916
    print "Waiting 5 seconds and trying again...\n";
    sleep(5);
    $currentrun++;
}
TBDebugTimeStamp("assign_loop finished");

#
# Run assign once.
# 
917
sub RunAssign ()
918 919
{
    # Clear globals for each run.
Leigh Stoller's avatar
Leigh Stoller committed
920 921
    undef %v2pmap;
    undef %p2vmap;
922
    undef %v2vmap;
Leigh Stoller's avatar
Leigh Stoller committed
923
    undef %plinks;
924
    undef %virtnodes;
925 926 927

    my %toreserve = ();
    my %subnodes  = ();
Shashi Guruprasad's avatar
Shashi Guruprasad committed
928
    
929
    TBDebugTimeStamp("ptopgen started");
930
    # Snapshot physical resources.
Chad Barb's avatar
Chad Barb committed
931 932 933 934
    #
    # if updating (-u), include any resources that may already be
    # allocated to experiment in the PTOP results.
    #
935 936
    my $ptopargs = "-p $pid ";
    $ptopargs   .= "-e $eid "
937
	if ($updating);
938 939
    $ptopargs   .= "-u "
	if ($updating && $elabinelab);
940 941
    $ptopargs   .= "-m $multiplex_factor "
	if (defined($multiplex_factor));
942 943
    $ptopargs   .= "-v "
	if ($virtcount);
944 945
    $ptopargs   .= "-r "
	if ($remotecount);
946 947
    $ptopargs   .= "-S "
	if ($simcount);
948 949
    $ptopargs	.= "-a "
    	if ($precheck);
950 951 952 953
    $ptopargs	.= "-c $delaycap_override "
    	if (defined($delaycap_override));

    print "ptopargs $ptopargs\n";
954
    system("ptopgen $ptopargs > $ptopfile");
955
    TBDebugTimeStamp("ptopgen finished");
Leigh Stoller's avatar
Leigh Stoller committed
956

957 958 959 960 961 962 963 964 965 966 967 968
    if (scalar(keys(%admission_control))) {
	open(PTOP, ">> $ptopfile") or
	    return -1;

	foreach my $typeclass (keys(%admission_control)) {
	    my $count = $admission_control{$typeclass};

	    print PTOP "set-type-limit $typeclass $count\n";
	}
	close(PTOP);
    }

969
    TBDebugTimeStamp("assign started");
Leigh Stoller's avatar
Leigh Stoller committed
970
    # Run assign
971 972
    my $cmdargs = "-P $ptopfile $topfile";
    $cmdargs = "-uod -c .75 $cmdargs"
973
	if ($virtcount || $simcount);
974 975
    $cmdargs = "-n $cmdargs"
    	if ($precheck);
976 977 978 979 980

    my $cmd;

    # If doing an experiment with virtnodes, use the prepass wrapper for assign
    # Turned off for now, because it needs some work.
981 982 983 984 985 986 987 988
    if ($useprepass) {
    	$cmd = "assign_prepass";
    	$cmdargs = "-m $multiplex_factor $cmdargs"
    	    if ($multiplex_factor);
    } else {
    	$cmd = "assign";
    }

989
    print "$cmd $cmdargs\n";
Leigh Stoller's avatar
Leigh Stoller committed
990

991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
    #
    # Fork a child to run assign. Parent spins watching the cancel flag
    # and waiting for assign to finish.
    #
    if (my $childpid = fork()) {
	while (1) {
	    sleep(2);

	    if (waitpid($childpid, &WNOHANG) == $childpid) {
		$assignexitcode = $? >> 8;
		last;
	    }

	    # Check cancel flag.
1005
	    TBGetCancelFlag($pid, $eid, \$canceled);
1006 1007 1008 1009
	    if ($canceled) {
		if ((my $pgrp = getpgrp($childpid)) > 0) {
		    kill('TERM', -$pgrp);
		    waitpid($childpid, 0);
1010

1011 1012 1013
		    fatal({cause => 'canceled', severity => SEV_IMMEDIATE,
			   error => ['cancel_flag']},
			  "Cancel flag set; aborting assign run!");
1014
		    return -1;
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
		}
		# Loop again to reap child above before exit.
	    }
	}
    }
    else {
	#
	# Change our session so the child can get a killpg without killing
	# the parent. 
	#
        POSIX::setsid();
1026
	exec("nice assign_wrapper2 $cmd $cmdargs > assign.log");
1027
	die "Could not start assign!\n";
1028
    }
1029

1030
    # Check cancel flag before continuing. 
1031
    TBGetCancelFlag($pid, $eid, \$canceled);
1032
    if ($canceled) {
1033 1034 1035
	fatal({cause => 'canceled', severity => SEV_IMMEDIATE,
	       error => ['cancel_flag']},
	      "Cancel flag set; aborting assign run!");
1036 1037
	return -1;
    }
1038

1039 1040
    # Check for possible full filesystem ...
    if (-z "assign.log") {
1041
	tbnotice("assign.log is zero length! Stopping ...\n");
1042 1043 1044
	return -1;
    }

1045 1046 1047
    # Saving up assign.log coz each swapin/modify is
    # different and it is nice to have every mapping
    # for debugging and archiving purposes
1048 1049 1050
    # We do not call it .log though, since we do not want it copied
    # out to the user directory every swapin. See Experiment.pm
    system("/bin/cp assign.log $pid-$eid-$$.assign");
1051 1052 1053 1054
    if (!open(ASSIGNFP, "assign.log")) {
	print("Could not open assign logfile!\n");
	return -1;
    }
1055

1056 1057 1058
    printdb "Reading assign results.\n";

    #
1059 1060 1061
    # We no longer care what assign has to say when it fails!
    # Any relevent info was already sent to stderr so just 
    # tell the caller whether we want to keep trying or not.
1062 1063
    #
    if ($assignexitcode) {
1064
	close ASSIGNFP;
1065
	return (($assignexitcode == 1) ? 1 : -1);
1066
    }
1067
    
1068 1069 1070 1071 1072 1073 1074 1075 1076
    #
    # If we were doing the precheck, go ahead and exit now - there won't be
    # any useful information to parse out
    #
    if ($precheck) {
	print "Precheck succeeded.\n";
	return 0;
    }

1077 1078 1079 1080
    #
    # Assign success; parse results.
    # 
    # read nodes section
1081 1082
    my $found_nodes_section = 0;
    while (<ASSIGNFP>) {
1083 1084
	# find the 'BEST SCORE' line and print that out for informational
	# purposes
1085 1086
	if (/BEST SCORE/) {
	    print;
1087
	}
1088 1089 1090 1091 1092 1093 1094 1095 1096
        if (/^Nodes:/) {
            $found_nodes_section = 1;
            last;
        }
    }
    if (!$found_nodes_section) {
        tbwarn("Internal error - unable to find Nodes section in " .
               "assign output");
        return 1;
1097
    }
1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121
    printdb "Nodes:\n";
    while (<ASSIGNFP>) {
	chop;
	/^End Nodes$/ && last;
	@info = split;
	my ($virtual,$physical) = @info[0,1];

	# We don't care about LAN nodes anymore.
	if (defined($lannodes{$virtual})) {
	    next;
	}

	if (physnodeallocated($physical)) {
	    #
	    # Mark node as being reused.
	    #
	    # Look at virtual node being mapped to node;
	    # if it wasn't in the previous map, mark node for reboot.
	    #
	    if (physnodereuse($physical) eq "reboot") {
		# No changes once it goes into reboot.
		;
	    }
	    elsif (virtnodeisvirt($virtual)) {
1122
		#
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136
		# A new virt virtual node on an existing physical node
		# does not force the physnode to be rebooted; we can
		# set up a new virtnode on it without a reboot. If its
		# an existing virtual on the same physnode, then mark
		# both as reused; no need to reboot either. If the 
		# virtnode has moved here from someplace else, no
		# reboot of the physnode either, but obviously the
		# vnode will be released and a new one allocated.  What
		# we cannot determine is if its just a renamed node
		# (which would require a reboot of the the virtual
		# node). 
		# 
		if (!exists($reserved_v2pmap{$virtual})) {
		    physnodesetreuse($physical, "reused");
1137
		}
1138 1139
		elsif ($reserved_v2pmap{$virtual} eq $physical) {
		    my $reserved = $reserved_v2vmap{$virtual};
1140

1141 1142
		    physnodesetreuse($reserved, "reused");
		    physnodesetreuse($physical, "reused");
1143 1144
		}
		else {
1145
		    physnodesetreuse($physical, "reused");
1146 1147 1148
		}
	    }
	    else {
1149
		#
1150 1151 1152 1153 1154 1155 1156
		# If a new virtual node mapped to this physnode (maybe
		# even the luser changed the name of the node), or if an
		# existing virtual node moved to this physnode, must
		# reboot the physnode. Else, the physnode is being
		# reused as is, and no need to mess with it. If the
		# user requested reboot, that will be handled outside
		# of this script.
1157
		#
1158 1159 1160 1161 1162 1163
		if (!exists($reserved_v2pmap{$virtual}) ||
		    $reserved_v2pmap{$virtual} ne $physical) {
		    physnodesetreuse($physical, "reboot");
		}
		else {
		    physnodesetreuse($physical, "reused");
1164
		}
1165
	    }
1166 1167 1168 1169 1170 1171 1172 1173
	}
	else {
	    #
	    # This is a new node; we'll have to reserve it. Note that
	    # we do not reserve a widearea physnode when a virtual node
	    # is mapped to it; they are special.
	    #
	    $toreserve{$physical} = 1
1174 1175 1176
		if (!virtnodeisvnode($virtual) ||
		    !(virtnodeisremote($virtual)
		      && !nodetypeisdedicatedremote(virtnodetype($virtual))));
1177 1178 1179 1180 1181 1182 1183 1184 1185
	}
	
	if (virtnodeisvirt($virtual)) {
	    #
	    # If mapping a virtual node, then record that, since we need
	    # to allocate the virtnodes on that physnode, later.
	    #
	    if (!defined($virtnodes{$physical})) {
		$virtnodes{$physical} = [];
Shashi Guruprasad's avatar
Shashi Guruprasad committed
1186
	    }
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199
	    push(@{$virtnodes{$physical}}, $virtual);
	}
	elsif (virtnodeissubnode($virtual)) {
	    #
	    # Need to allocate the parent to. Should be optional?
	    # Save away and deal with once we have all the results.
	    #
	    $subnodes{$virtual} = $physical;
	}
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
Leigh Stoller's avatar
Leigh Stoller committed
1200
	}
1201 1202 1203
	push(@{$p2vmap{$physical}}, $virtual);
	printdb "  $virtual $physical\n";
    }
Leigh Stoller's avatar
Leigh Stoller committed
1204

1205 1206 1207 1208 1209 1210 1211 1212 1213
    #
    # Process the subnodes. We have to allocate the parent at the same
    # time, lest it get sucked away for some other purpose by another
    # experiment. We might want to push this off into nalloc, but not
    # sure yet.
    #
    for my $virtual (keys(%subnodes)) {
	my $physical = $subnodes{$virtual};
	my $parent;
1214

1215
	TBPhysNodeID($physical, \$parent);
1216

1217
	printdb "  Subnode: $virtual $physical $parent\n";
1218

1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238
	#
	# See if we already have it. Swapmod, retry, or perhaps
	# the parent could be named separately? Or maybe there are
	# several subnodes on the physnode?
	#
	next
	    if (exists($p2vmap{$parent}));

	# Make up a name and add to the list.
	my $newvname = newvname($parent, "phost");

	$v2pmap{$newvname} = $parent;
	$p2vmap{$parent} = [ $newvname ];
	$toreserve{$parent} = 1;
	printdb "  Adding subnode host: $newvname $parent\n";
    }

    # read Edges
    # By convention, in plinks, the delay node is always the second
    # entry.
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250
    my $found_edges_section = 1;
    while (<ASSIGNFP>) {
        if (/^Edges:/) {
            $found_edges_section = 1;
            last;
        }
    }
    if (!$found_edges_section) {
        tbwarn("Internal error - unable to find Edges section in " .
               "assign output");
        return 1;
    }
1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276
    printdb "Edges:\n";
    EDGEWHILE: while (<ASSIGNFP>) {
	/^End Edges$/ && last EDGEWHILE;
	@info = split;
	$line = $_;
	$_ = $info[1]; # type
        SWITCH1: {
	    /^intraswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^interswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,$#info];
		last SWITCH1;
	    };
	    /^direct$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^trivial$/ && do {
		# we don't have plinks for trivial links
		$vlink = $info[0];
		$plinks{$vlink} = [];
		next EDGEWHILE;
	    };
	    print "Found garbage: $line\n";
Leigh Stoller's avatar
Leigh Stoller committed
1277
	}
1278 1279 1280 1281 1282 1283
	$nodeportA = &getnodeport($rawA);
	$nodeportB = &getnodeport($rawB);
	$nodeportA =~ s/\//:/;
	$nodeportB =~ s/\//:/;
	$plinks{$vlink} = [$nodeportA,$nodeportB];
	printdb "  $vlink " . join(" ",@{$plinks{$vlink}}) . "\n";
Leigh Stoller's avatar
Leigh Stoller committed
1284 1285
    }
    close(ASSIGNFP);
1286
    TBDebugTimeStamp("assign finished");
Leigh Stoller's avatar
Leigh Stoller committed
1287 1288

    # Reserve resources
1289 1290 1291 1292
    if ($impotent) {
	print "Skipping physical reservation, as directed.\n";
	return 0;
    }
1293

1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312
    # From here, we can't recover anymore, coz we move
    # previously reserved pnodes/vnodes to the holding reservation
    # and back. By doing this, we will avoid any UNIQUE key issues
    # when a virt_node in the topology moves from one pnode to another
    # from previous to new mapping.
    # Another reason to do this just before nalloc of a new toreserve
    # nodes is that, we can get into name clashes
    # For example, lets say the user called his node pc2 and it was
    # actually mapped to pc99 in the initial swapin. If this was a
    # swapmod where the user asked for another node node0 which got 
    # mapped to pc2. nalloc of pc2 will result in UNIQUE key problems
    # since there exists a reserved vname pc2 (virtual name). By 
    # having this operation of moving the nodes into 
    # OLDRESERVED_PID/OLDRESERVED_EID and back before a new nalloc,
    # avoid this UNIQUE key problem. Also note that simply updating
    # the vname to be the same as the node_id field also won't
    # work all the time i.e. in the example discussed above
    my $oldreserved_pid = OLDRESERVED_PID;	
    my $oldreserved_eid = OLDRESERVED_EID;
1313
    if (scalar(keys(%oldreservednodes)) && !$oldreservedclean) {
1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back started");
	system("nfree -o $pid $eid " . join(" ", keys(%oldreservednodes)) );
	system("nalloc $pid $eid " . join(" ", keys(%oldreservednodes)) );
	my $exitval  = $? >> 8;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back finished");
	#
	# If nalloc failed with a fatal error, lets give it up. No retry.
	# 
	if ($exitval != 0) {
1327 1328
	    print("Failed to move back Old Reserved nodes back to reserved\n");
	    return -1;
1329
	}
1330

1331 1332 1333 1334 1335 1336
	#
	# We need to only once i.e. in the first call to RunAssign().
	# If it gets repeatedly called coz only some pnode resources
	# got nalloced, we do not have to do the above again.
	#
	$oldreservedclean = 1;
1337 1338
    }

1339 1340 1341 1342
    TBDebugTimeStamp("reserving started");
    system("nalloc -p $pid $eid " . join(" ", keys(%toreserve)));
    TBDebugTimeStamp("reserving finished");
    my $exitval  = $? >> 8;
Chad Barb's avatar
Chad Barb committed
1343

1344 1345 1346 1347 1348 1349 1350
    #
    # If nalloc failed with a fatal error, lets give it up. No retry.
    # 
    if ($exitval < 0) {
	print "Failed to reserve any nodes.\n";
	return -1; 
    }
Chad Barb's avatar
Chad Barb committed
1351

1352 1353 1354 1355 1356 1357 1358
    #
    # Okay, if nalloc got anything, we have to set the norecover bit,
    # since tbswap operates on the principle that any change in the DB
    # means no recover is possible. This can probably me dealt with by
    # deallocating any nodes we allocated in the wrapper before exiting.
    #
    $NoRecover = 1;
1359

1360 1361 1362 1363 1364 1365 1366
    #
    # Otherwise, all newly allocated nodes MUST go to the INIT_DIRTY
    # allocstate since the user now has control of them. If we eventually
    # fail, nodes not in RES_READY are deallocated (retry/modify). 
    #
    if ($exitval > 0) {
	my @reserved = ExpNodes($pid, $eid);
1367
	my $rcount   = scalar(@reserved);
1368
	my $tcount   = scalar(keys(%toreserve));
1369 1370

	# We got only some. Need to figure out which.
1371 1372
	print "Reserved some nodes ($rcount) we needed, ".
	    "but not all ($exitval).\n";
1373 1374 1375
	
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
1376
		$newreservednodes{$node} = $node;
1377
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
1378
	    }
Leigh Stoller's avatar
Leigh Stoller committed
1379 1380
	}

1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391
	#
	# We check to see if were were able to reserve all the fixed
	# nodes we needed. If we couldn't get the fixed list, then
	# this experiment is unlikely to map in the "near" future, so
	# give up now (no retry).
	#
	foreach my $node (values(%fixed_nodes)) {
	    if (! grep {$_ eq $node} @reserved) {
		printdb "  Could not allocate fixed node $node!\n";
		return -1;
	    }
Leigh Stoller's avatar
Leigh Stoller committed
1392
	}
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403
	#
	# Okay, must extend the fixed list with newly allocated nodes
	# so that we can recreate the top file, and try again with a
	# new set.
	#
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		foreach my $vname (@{$p2vmap{$node}}) {
		    $fixed_nodes{$vname} = $node;
		}
	    }
1404
	}
1405
	CreateTopFile();
1406 1407 1408 1409 1410 1411 1412 1413

	#
	# Return indicator that we made forward progress (got some nodes).
	# Caller will decide if appropriate to keep trying. We made progress
	# if the return value of nalloc (number of nodes not allocated) does
	# not equal the number of nodes we tried to allocate.
	#
	return (($tcount == $exitval) ? 1 : 2);
1414
    }
1415

1416 1417 1418 1419 1420 1421 1422 1423
    #
    # We got all the nodes we wanted. All newly allocated nodes MUST
    # go to the INIT_DIRTY allocstate since the user now has control
    # of them.
    #
    print "Successfully reserved all physical nodes we needed.\n";
	
    foreach my $node (keys(%toreserve)) {
1424 1425
	# Remeber all newly allocated nodes for later free if failure.
	$newreservednodes{$node} = $node;
1426
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
Leigh Stoller's avatar
Leigh Stoller committed
1427
    }
1428

1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452
    #
    # Release phys and virt nodes no longer needed. They are marked
    # for teardown. They need to be freed by SOMEONE, currently the
    # wrapper (tbswap), since this only happens when in update mode
    # (swapmod).
    #
    foreach my $pnode (keys(%phys_nodes)) {
	my $reuse = physnodereuse($pnode);
		
	if ($reuse eq "unused") {
	    #
	    # Node was used in previous incarnation, but not any more.
	    #
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_TEARDOWN());
	}
	elsif ($reuse eq "reboot") {
	    #
	    # Node is being reused, but for a different purpose, so
	    # it should be rebooted.
	    #
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
	}
    }
    return 0;
Leigh Stoller's avatar
Leigh Stoller committed
1453
}
1454

1455 1456 1457 1458 1459 1460 1461 1462 1463 1464
###########################################################################
# Step 2A
#
# We run the wanassigner to allocate remote nodes. We do this after cause
# it takes so long. We run it just once.
#
# wanassign does its own nalloc.
#
###########################################################################

1465
#
1466
# VIRTNODES HACK: Allocate the remote virtual nodes.
1467
#
1468
if ($needwanassign) {
1469
    my $success  = 0;
1470
    my %wanmap   = ();
1471

1472 1473
    print "Running 'wanassign -d $pid $eid'\n";
    open(WANFP,"wanassign -d $pid $eid 2>&1 | tee wanassign.log |") or
1474
	fatal("Failed to start wanassign: $!");
1475 1476 1477 1478 1479

    printdb "Reading wanassign results.\n";
    while (<WANFP>) {
	chop;
	if ($_ =~ /(\S+) mapsto (\S+)/) {
1480 1481
	    $wanmap{$1} = $2;
	    printdb "  $1 mapsto $2\n";
1482
	}
1483 1484 1485 1486 1487 1488
	if ($_ =~ /^Success/) {
	    $success = 1;
	}
	# Skip other output. Usually its debugging output.
    }
    close(WANFP) or
1489
	fatal("wanassign: " .
1490 1491
	      ($? ? "exited with status: $?." :
	            "error closing pipe: $!"));
1492

1493
    if (!$success) {
1494
	fatal("wanassign could not find a solution!");
1495
    }
1496 1497
    foreach my $virtual (keys(%wanmap)) {
	my $physical = $wanmap{$virtual};
1498

1499
	fatal("Improper mapping from wanassign: $virtual/$physical")
1500 1501 1502 1503 1504 1505 1506 1507
	    if (!virtnodeisvirt($virtual));
	
	#
	# If mapping a virtual node, then record that, since we need
	# to allocate the virtnodes on that physnode, later.
	#
	if (!defined($virtnodes{$physical})) {
	    $virtnodes{$physical} = [];
1508
	}
1509 1510 1511 1512 1513
	push(@{$virtnodes{$physical}}, $virtual);
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
1514
	}
1515
	push(@{$p2vmap{$physical}}, $virtual);
1516
    }
1517
    TBDebugTimeStamp("wanassign finished");
1518 1519
}

1520 1521
#
# Recoverability ends.
1522
# All fatal() calls from this point do not have the recoverable '64' bit set.
1523
#
1524
#$NoRecover = 1;
1525

1526 1527 1528
# VIRTNODES HACK: Local virtnodes have to be mapped now. This is a little
# hokey in that the virtnodes just need to be allocated from the pool that
# is on the real node. We know they are free, but we should go through
1529
# nalloc anyway. If anything fails, no point in retry.
Leigh Stoller's avatar
Leigh Stoller committed
1530
#
1531 1532
foreach my $pnode (keys(%virtnodes)) {
    my @vlist = @{$virtnodes{$pnode}};
1533
    my $numvs = @vlist;
1534
    my @plist = ();
1535 1536
    my @oplist = ();
    my @ovlist = ();
Leigh Stoller's avatar
Leigh Stoller committed
1537