assign_wrapper.in 175 KB
Newer Older
Leigh Stoller's avatar
Leigh Stoller committed
1
#!/usr/bin/perl -w
Leigh Stoller's avatar
Leigh Stoller committed
2
#
3
# Copyright (c) 2000-2011 University of Utah and the Flux Group.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# 
# {{{EMULAB-LICENSE
# 
# This file is part of the Emulab network testbed software.
# 
# This file is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# 
# This file is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public
# License for more details.
# 
# You should have received a copy of the GNU Affero General Public License
# along with this file.  If not, see <http://www.gnu.org/licenses/>.
# 
# }}}
Leigh Stoller's avatar
Leigh Stoller committed
23
#
24 25
use English;
use Getopt::Std;
26
use POSIX qw(setsid ceil);
27
use POSIX ":sys_wait_h";
Leigh Stoller's avatar
Leigh Stoller committed
28

29 30 31 32 33 34 35 36 37 38 39
#
# Function phototypes
#
sub RunAssign();
sub getbandwidth($$$);
sub InitPnode($$);
sub nextipportnum($);
sub AddVlan($$$;$);
sub UploadVlans();
sub CheckMaxConcurrent();
sub UploadStats();
40 41 42
sub NewVirtIface($$$;$);
sub AddVirtPatch($$);
sub PatchVirts();
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
sub newvname($$);
sub newvname_simhost($);
sub LoadPhysInfo();
sub interfacespeedmbps($$);
sub requires_delay($$$);
sub LoadVirtNodes();
sub LoadVirtNodeDesires();
sub LoadVirtNodeStartLoc();
sub LoadVirtLans();
sub LoadVirtTypes();
sub LoadExperiment();
sub CreateTopFile();
sub LoadPhysNode($);
sub LoadPhysNodeInterfaces($);
sub LoadPhysResources();
sub physnodenextrtabid($);
sub getrtabid($$);
sub array_diff($$);
sub LoadCurrent();
62
sub SetUpTracing($$$$$);
63
sub fatal(@);
64
sub FinalizeRegression($);
65

66
#
Leigh Stoller's avatar
Leigh Stoller committed
67 68 69 70 71 72
# This function as the main assign loop.  It converts the virtual
# topology into a top input including LAN and delay translation.  It
# then snapshots the current testbed physical state and runs assign,
# looping a couple times if assign fails.  When assign successfully
# completes it will interpret the results.  Attempt to match any
# existing portmap entries and then update the delays and vlans table.
73
#
74 75 76 77
# XXX: Update does not work with widearea nodes.
#      Internally created nodes (jailhost,delay,sim) are not treated
#        consistently. Needs more thought.
#
78 79 80 81
# Return codes: We catch all errors with the END block below, and if
# anyone calls die() (exit value is 255) we add in the CANRECOVER bit.
# Failures in assign always cause the caller to stop retrying. 
#
82
# The CANRECOVER bit indicates 'recoverability' (no db or physical
83 84
# state was modified by the time the error occurred). This is relavent
# to only modify operations (update).
85 86 87
#
my $WRAPPER_SUCCESS		 = 0x00;
my $WRAPPER_FAILED		 = 0x01;	# Failed (Add other values)
88 89
my  $WRAPPER_FAILED_CANRECOVER   = 0x40;        # Can recover from update
my  $WRAPPER_FAILED_FATALLY      = 0x80;	# Do not use this.
90 91
# Set this once we modify DB state; forces no recover in fatal().
my $NoRecover = 0;
92

93 94
sub usage ()
{
95 96 97 98
    print STDERR "Usage: $0 [-v] [-u [-f] | -n] pid eid\n";
    print STDERR " -v   - Enables verbose output\n";
    print STDERR " -u   - Enables update mode\n";
    print STDERR " -f   - Fix current resources during update mode\n";
99 100
    print STDERR " -t   - Create the TOP file and then exit\n";
    print STDERR " -n   - Run assign, but do not reserve/modify resources.\n";
101 102
    print STDERR " -p   - Do a precheck for mapability on an empty testbed - ".
		 "implies -n\n";
103
    exit($WRAPPER_FAILED);
104
}
105
my $optlist  = "vutnfprzxm:k";
106
my $verbose  = 0;
107
my $fixmode  = 0;
108 109 110
my $updating = 0;
my $toponly  = 0;
my $impotent = 0;
111
my $precheck = 0;
112
my $prepass  = 0;
113
my $warnings = 0;
114 115 116 117 118
my $mfactor;
my $regression = 0;
my $noassign   = 0;  # Only with regression mode, use previous solution.
my $noregfree  = 0;  # Only with regression mode, leave physical state at end.
my $usecurrent = 0;  # Only with regression mode, use current solution.
119 120 121 122 123

#
# Configure variables
#
my $TBROOT	  = "@prefix@";
124 125
my $NFREE         = "$TBROOT/bin/nfree";
my $DBNAME  	  = "@TBDBNAME@";
126
my $DELAYCAPACITY = @DELAYCAPACITY@;	# Can be overridden by user!
127 128 129 130 131 132 133 134 135 136 137 138 139
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/sbin:$TBROOT/bin";

#
# Turn off line buffering on output
#
$| = 1;

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;
140
use Experiment;
141
use Node;
142
use NodeType;
143
use Lan;
144
use OSinfo;
145
use libadminctrl;
146
use libtblog;
147
use libtblog qw(*SOUT *SERR);
148

149
#
150 151 152 153
# assign_wrapper Settings
#
# Maximum delay in ms above which a delay node is needed.
# (Note that the DB represents delays as floating point numbers)
154
my $delaythresh = @DELAYTHRESH@;
155

156
# Maximum number of times we run assign.
157
my $maxrun = 3;
Leigh Stoller's avatar
Leigh Stoller committed
158

159 160 161
# Use the switch to delay when possible. Currentlythis only works for 10mbit
# links (actually, its turned off cause it does not work; auto handshake).
my $delaywithswitch = 0;
Leigh Stoller's avatar
Leigh Stoller committed
162

163 164 165 166 167 168 169
#
# Some handy constants. Speed in Mbits/sec and Kbits/sec units.
#
# Its probably a good idea to leave portbw (current_speed) in Mbs, since
# those numbers are used at the switch and the interfaces, which really
# only think in Mbps.
#
170 171 172 173 174 175
my $S10Mbs   = 10;
my $S100Mbs  = 100;
my $S1000Mbs = 1000;
my $S10Kbs   = 10000;
my $S100Kbs  = 100000;
my $S1000Kbs = 1000000;
Leigh Stoller's avatar
Leigh Stoller committed
176

177
#
178 179
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
180
#
181 182 183
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
184
}
185 186
if (@ARGV != 2) {
    usage();
187
}
188
if (defined($options{"v"})) {
189
    $verbose++;
Chad Barb's avatar
Chad Barb committed
190
}
191
if (defined($options{"u"})) {
Chad Barb's avatar
Chad Barb committed
192
    $updating = 1;
193
}
194 195 196 197 198
if (defined($options{"t"})) {
    $toponly = 1;
}
if (defined($options{"n"})) {
    $impotent = 1;
Leigh Stoller's avatar
Leigh Stoller committed
199
}
200 201 202 203
if (defined($options{"p"})) {
    $impotent = 1;
    $precheck = 1;
}
204 205 206 207 208 209
if (defined($options{"x"})) {
    $prepass = 1;
}
if (defined($options{"m"})) {
    $mfactor = $options{"m"};
}
210 211 212
if (defined($options{"f"})) {
    $fixmode = 1;
}
213 214 215 216 217
if (defined($options{"r"})) {
    if ($DBNAME eq "tbdb") {
	fatal("Cannot use regression mode on main DB");
    }
    $regression = 1;
218
    $usecurrent = 1
219
	if (defined($options{"z"}));
220 221
    $noregfree = 1
	if (defined($options{"k"}));
222
}
223 224
my $pid = $ARGV[0];
my $eid = $ARGV[1];
225
my $ptopfile = ($regression ? "$pid-$eid.ptop" : "$pid-$eid-$$.ptop");
226 227 228 229
# Since the topfile could change across
# swapins and modifies, it makes sense
# to store all of them. Helps in
# degugging.
230
my $topfile  = ($regression ? "$pid-$eid.top" : "$pid-$eid-$$.top");
231

232 233
TBDebugTimeStampsOn();

234 235 236 237
#
# We want warnings to cause assign_wrapper to exit abnormally.
# We will come through here no matter how we exit though.
# 
238 239 240 241
# FIXME: Is this still needed.  "warn" is only used once.  Also this
#  will cause perl internal warnings (such as "Use of uninitialized
#  value ..."  to cause assign_wrapper to fail. -- kevina
#
242
$SIG{__WARN__} = sub { tbwarn $_[0];$warnings++; };
243 244 245 246 247 248 249 250

END {
    # Watch for getting here cause of a die()/exit() statement someplace.
    my $exitcode = $?;

    if ($exitcode && $exitcode != $WRAPPER_FAILED) {
	$exitcode = $WRAPPER_FAILED|$WRAPPER_FAILED_FATALLY;
    }
251
    
252
    if ($warnings > 0) {
253
	tberror "$warnings warnings.\n";
254 255 256 257 258 259 260 261 262 263

	$exitcode |= $WRAPPER_FAILED;
    }

    # Set recover bit if we are going to fail.
    $exitcode = $exitcode|$WRAPPER_FAILED_CANRECOVER
	if ($exitcode && !$NoRecover);

    # And change the exitcode to be what we want it to be!
    $? = $exitcode;
264
}
Leigh Stoller's avatar
Leigh Stoller committed
265

266 267
sub printdb ($)
{
Leigh Stoller's avatar
Leigh Stoller committed
268 269 270
    if ($verbose) {
	print $_[0];
    }
271
}
Leigh Stoller's avatar
Leigh Stoller committed
272

273
print "assign_wrapper improved started\n";
274 275
TBDebugTimeStamp("assign_wrapper started");

Leigh Stoller's avatar
Leigh Stoller committed
276
#
277
# The main data structures:
Leigh Stoller's avatar
Leigh Stoller committed
278
#
279 280 281 282 283
# virt_nodes: The virtual nodes, indexed by vname. Each entry is a
# hash reference, initially of just the DB info, but possibly
# augmented as we proceed through assign.  Do not confuse these
# virtual nodes with the other virtual nodes! These are the ones from
# the actual topology, the virt_nodes table in the DB.
Leigh Stoller's avatar
Leigh Stoller committed
284
#
285 286
my %virt_nodes = ();

287 288 289 290 291 292 293 294 295 296
#
# Helper functions to get virt_node info. Because of internally created
# nodes (tbdelay, vhost, sim), we need a virtnodeisvnode() test to prevent
# entries getting stuck into $virt_nodes when calling, say, virtnodeisvirt.
# Should we create entries for these nodes instead?
#
sub virtnodeisvnode($)  { return exists($virt_nodes{$_[0]}); }
sub virtnodetype($)	{ return $virt_nodes{$_[0]}->{"type"}; }
sub virtnodename($)	{ return $virt_nodes{$_[0]}->{"vname"}; }
sub virtnodeelabrole($) { return $virt_nodes{$_[0]}->{"inner_elab_role"}; }
297
sub virtnodeplabrole($) { return $virt_nodes{$_[0]}->{"plab_role"}; }
298
sub virtnodeosname($)   { return $virt_nodes{$_[0]}->{"__osname"}; }
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335
sub virtnodeosid($)     { return $virt_nodes{$_[0]}->{"__osid"}; }
sub virtnodesetosid($$) { return $virt_nodes{$_[0]}->{"__osid"} = $_[1]; }
sub virtnodeparent($)   { return $virt_nodes{$_[0]}->{"__parent"}; }
sub virtnodepnode($)    { return $virt_nodes{$_[0]}->{"__physnode"}; }
sub virtnodesetpnode($$){ return $virt_nodes{$_[0]}->{"__physnode"} = $_[1]; }
sub virtnodeusewan($)   { return $virt_nodes{$_[0]}->{"__usewanassign"}; }
sub virtnodesetusewan($){ return $virt_nodes{$_[0]}->{"__usewanassign"} = 1;}
sub virtnodesettings($) { return @{$virt_nodes{$_[0]}->{"__settings"}}; }
sub virtnodestartloc($) { return $virt_nodes{$_[0]}->{"__startloc"}; }
sub virtnodeisremote($) {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isremotenode"};
}
sub virtnodeisvirt($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isvirtnode"};
}
sub virtnodeissubnode($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__issubnode"};
}
sub virtnodeissim($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__issimnode"};
}
sub virtnodeisdynamic($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isdynamic"};
}
sub virtnodeisplabnode($)   {
    return virtnodeisvnode($_[0]) && $virt_nodes{$_[0]}->{"__isplabnode"};
}
sub virtnodeisjailed($) {
    return virtnodeisvirt($_[0]) && physnodeisjailed(virtnodepnode($_[0]));
}
sub virtnodedesires     {
    my @desires;
    while (my ($desire, $weight) = each %{$virt_nodes{$_[0]}->{"__desires"}}) {
	push @desires, $desire . ":" . sprintf("%f",$weight);
    }
    return join(" ",@desires);
}
336 337 338 339 340 341
sub virtnodeneedslinkdelays($) {
    return $virt_nodes{$_[0]}->{"__needslinkdelays"};
}
sub virtnodesetneedslinkdelays($) {
    return $virt_nodes{$_[0]}->{"__needslinkdelays"} = 1;
}
342

Leigh Stoller's avatar
Leigh Stoller committed
343
#
344 345 346 347 348 349 350 351 352 353 354 355
# virt_lans: The equivalent of virt_nodes; the virt_lans table in the DB.
# Since there are multiple rows per lan (one for each node), this is a
# multilevel structure. The first slot is another hash, one for each node.
# The rest of the slots store other random things associated with the lan.
# So, looks something like:
#
#   %virt_lans = (link0 => {members    => member0 => { db row ref },
#                                         member1 => { db row ref }}
#                           mustdelay    => 0,
#                           emulated     => 0,
#                           uselinkdelay => 0,
#                           nobwshaping  => 0,
356
#                           encapstyle   => "default",
357 358 359 360 361 362 363
#                           trivok       => 0
#                          }
#                 link1 => ...
#                )
#
my %virt_lans = ();

364 365
sub virtlanexists($)	{ return exists($virt_lans{$_[0]}); }
sub virtlanname($)	{ return $virt_lans{$_[0]}->{"VNAME"}; }
366
sub virtlanidx($)	{ return $virt_lans{$_[0]}->{"IDX"}; }
367 368 369 370 371 372
sub virtlanmembers($)	{ return @{$virt_lans{$_[0]}->{"MEMBERLIST"}}; }
sub virtlanmustdelay($)	{ return $virt_lans{$_[0]}->{"MUSTDELAY"}; }
sub virtlanemulated($)	{ return $virt_lans{$_[0]}->{"EMULATED"}; }
sub virtlanlinkdelay($) { return $virt_lans{$_[0]}->{"USELINKDELAY"}; }
sub virtlannobwshape($)	{ return $virt_lans{$_[0]}->{"NOBWSHAPING"}; }
sub virtlantrivok($)	{ return $virt_lans{$_[0]}->{"TRIVIAL_OK"}; }
373
sub virtlanencapstyle($){ return $virt_lans{$_[0]}->{"ENCAPSTYLE"}; }
374 375 376
sub virtlantunnel($)	{ return $virt_lans{$_[0]}->{"TUNNEL"}; }
sub virtlandelayinfo($$){ return @{$virt_lans{$_[0]}->{"DELAYINFO"}->{$_[1]}};}
sub virtlanqueueinfo($$){ return @{$virt_lans{$_[0]}->{"QUEUEINFO"}->{$_[1]}};}
377
sub virtlantraceinfo($$){ return @{$virt_lans{$_[0]}->{"TRACEINFO"}->{$_[1]}};}
378 379 380 381
sub virtlannetmask($)	{ return $virt_lans{$_[0]}->{"MASK"}; }
sub virtlanwidearea($)  { return $virt_lans{$_[0]}->{"WIDEAREA"}; }
sub virtlanallsim($)    { return $virt_lans{$_[0]}->{"ALLSIM"}; }
sub virtlanprotocol($)  { return $virt_lans{$_[0]}->{"PROTOCOL"}; }
382 383 384 385 386 387 388 389 390 391
sub virtlandelayed($)   { return $virt_lans{$_[0]}->{"DELAYED"}; }
sub virtlansetdelayed($){ $virt_lans{$_[0]}->{"DELAYED"} = 1; }

sub virtlanmembershaped($$) {
    return $virt_lans{$_[0]}->{"SHAPEDMEMBERS"}->{$_[1]};
}
sub virtlansetmembershaped($$) {
    $virt_lans{$_[0]}->{"SHAPEDMEMBERS"}->{$_[1]} = 1;
    # Mark the lan as having shaped members.
}
392 393 394 395 396
sub virtlanAP($)        {
    return $virt_lans{$_[0]}->{"ACCESSPOINT"}
        if (defined($virt_lans{$_[0]}->{"ACCESSPOINT"}));
    return undef;
}
397 398 399 400
sub virtlanusevirtiface($) {
    my $encap = virtlanencapstyle($_[0]);
    return ($encap eq "veth" || $encap eq "veth-ne" || $encap eq "vlan");
}
401 402 403 404
sub virtlanusetuniface($) {
    my $encap = virtlanencapstyle($_[0]);
    return ($encap eq "gre" || $encap eq "egre" || $encap eq "vtun");
}
405 406 407 408 409
# We hardcode (for now) the set of LAN/link "protocols" that are wireless.
# Eventually, this should probably come from the database.
sub virtlanwireless($) {
    return (virtlanprotocol($_[0]) =~ /^(80211|flex900)/);
}
410 411 412 413
# Quick way to get to any fixed iface info for the lan and member.
sub virtlanfixediface($$) { 
    return $virt_lans{$_[0]}->{"MEMBERS"}->{$_[1]}->{"fixed_iface"};
}
414 415 416 417 418 419 420 421 422
# We don't actually store information about the protocol heirarchy in the
# database, so we use a simple conventions for now - <basetype>[-subtype]
sub protocolbasetype($) {
    if ($_[0] =~ /^([^-]+)-/) {
        return $1;
    } else {
        return $_[0];
    }
}
423

424 425 426 427 428 429 430 431 432 433 434
#
# virt_vtypes: The virt_vtypes table in the DB, indexed by the vtype
# name (user chosen name).
#
my %virt_vtypes = ();

#
# node_types: The node_types table from the DB, indexed by the type name.
#
my %node_types = ();

435
sub nodetypeistype($)   { return exists($node_types{$_[0]}); }
436 437 438 439 440 441 442 443 444 445 446
sub nodetypetype($)     { return $node_types{$_[0]}->type(); }
sub nodetypeclass($)    { return $node_types{$_[0]}->class(); }
sub nodedelayosid($)    { return $node_types{$_[0]}->delay_osid(); }
sub nodedefaultosid($)  { return $node_types{$_[0]}->default_osid(); }
sub nodetypeisremote($) { return $node_types{$_[0]}->isremotenode(); }
sub nodetypeisvirt($)   { return $node_types{$_[0]}->isvirtnode(); }
sub nodetypeisdynamic($){ return $node_types{$_[0]}->isdynamic(); }
sub nodetypeissub($)    { return $node_types{$_[0]}->issubnode(); }
sub nodetypeisplab($)   { return $node_types{$_[0]}->isplabdslice(); }
sub nodetypeissim($)    { return $node_types{$_[0]}->issimnode(); }
sub nodetypesimcap($)   { return $node_types{$_[0]}->simnode_capacity(); }
447 448 449 450 451 452 453
sub nodetypeisdedicatedremote($) {
    my $ded = $node_types{$_[0]}->GetAttribute("dedicated_widearea");
    if (defined($ded) && "$ded" eq "1") {
	return 1;
    }
    return 0;
}
454

Leigh Stoller's avatar
Leigh Stoller committed
455
# 
456
# interface_capabilities: We need this to find out the bandwidths of the devices
457 458
# we actually have on the testbed. Index by interface type name.
#
459
my %interface_capabilities = ();
Leigh Stoller's avatar
Leigh Stoller committed
460

461 462 463 464 465 466 467 468
#
# XXX Hack table for determining if a delay node is required. We store
# the native link speeds for each type,class. Indexed by type and class,
# the value is a list of link hardware speeds for which no delay node is
# required. This is an awful way to do this, and quite wrong.
#
my %node_type_linkbw = ();

469 470 471 472 473 474 475 476
#
# phys_nodes: The equiv of virt_nodes above, except that these are pulled
# from the DB once the physical resources have been allocated. Indexed
# by physname, but there is a pointer from the virt_nodes table entry
# to the the corresponding physnode entry. 
# 
my %phys_nodes = ();

477
sub physnodeallocated($) { return exists($phys_nodes{$_[0]}); }
478 479 480 481 482 483
sub physnodeisvirtnode($){ return $phys_nodes{$_[0]}->{'p'}->isvirtnode();}
sub physnodeisjailed($)  { return $phys_nodes{$_[0]}->{'p'}->jailflag(); }
sub physnodeissubnode($) { return $phys_nodes{$_[0]}->{'p'}->issubnode(); }
sub physnodephysnode($)  { return $phys_nodes{$_[0]}->{'p'}->phys_nodeid(); }
sub physnodecontroliface($){return $phys_nodes{$_[0]}->{'p'}->control_iface();}
sub physnodetype($)      { return $phys_nodes{$_[0]}->{'p'}->type(); }
484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509
sub physnodesetvnode($$) { return $phys_nodes{$_[0]}->{"__vname"} = $_[1]; }
sub physnodevnode($$)    { return $phys_nodes{$_[0]}->{"__vname"}; }
sub physnodereuse($)     { return $phys_nodes{$_[0]}->{"__reuse"}; }
sub physnodesetreuse($$) { return $phys_nodes{$_[0]}->{"__reuse"} = $_[1]; }
sub physnodenextpipe($)  {
    my $pipeno = $phys_nodes{$_[0]}->{"__pipeno"} += 10;

    fatal("Too many ipfw pipes on node $pnode!")
	if ($pipeno >= 65535);
    
    return $pipeno;
}
sub physnodenextrtabid($)  {
    my $rtabid = $phys_nodes{$_[0]}->{"__rtabid"} += 1;
    return $rtabid;
}


#
# phys_node_interfaces: ???
#

sub physnodeinterface($$) { return $phys_node_interfaces{$_[0]}{$_[1]}; }
sub physinterfacetype($$) {
    return $phys_node_interfaces{$_[0]}{$_[1]}{"interface_type"};
}
510 511 512
sub physinterfacerole($$) {
    return $phys_node_interfaces{$_[0]}{$_[1]}{"role"};
}
513

514 515 516 517 518 519 520 521 522 523 524 525 526 527
#
# More physical side data structures.
# v2pmap is indexed by virtual and contains the physical node.
my %v2pmap = ();
# p2vmap is indexed by physical and contains one or more virtual nodes.
my %p2vmap = ();
# plinks is indexed by virtual name and contains
#  (pnodeportA,pnodeportB) .  If one is a delay node it is always
#  the second.
my %plinks = ();
# virtnodes is the list of subnodes on physnodes.
my %virtnodes = ();
my %v2vmap = ();

528 529 530 531
# for consing up vlan names.
my $vlanid = 0;
my %protovlans = ();

532 533 534
# Admission control counts
my %admission_control = ();

535 536 537 538 539 540 541
#
# Support for experiment modify. We create v2p and v2v mappings of the
# current topology so we can figure out how its changed after assign
# runs. These correspond to v2pmap and v2vmap mentioned above.
# 
my %reserved_v2pmap  = ();
my %reserved_v2vmap  = ();
542
my %oldreservednodes = ();
543 544
my %newreservednodes = ();
my $oldreservedclean = 0;
545 546 547
# reserved_p2vmap is indexed by physical and contains one or more virtual
# nodes
my %reserved_p2vmap  = ();
548

Leigh Stoller's avatar
Leigh Stoller committed
549
#
550 551 552 553 554 555 556 557 558 559 560 561
# Experiment wide options. See below. They come from the experiments table.
# Defining these will override experiment table setting. 
#
# Set this when forcing linkdelays instead of delay nodes. Set in the NS
# file with a tb-compat directive. The force directive says to set up a
# link delay, even when no delay would otherwise be inserted.
# usewatunnels is also set in the NS, and can be used to turn them off. 
# The multiplex_factor is to override node_types table for virtnode.
my $uselinkdelays;
my $forcelinkdelays;
my $usewatunnels;
my $multiplex_factor;
562
my $experiment_idx;
563
my $useprepass;
564
my $delaycap_override;
565
my $elabinelab = 0;
566
my $experiment;
567

568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603
#
# Notes on virtual interfaces.
#
# If a link is EMULATED (virtlanemulated) then it will have an "encapsulation
# style" (virtlanencapstyle).  That value is one of:
#
# "alias"	emulation is done with IP aliases on a physical interface;
#		there is no packet encapsulation.
#		Works on FreeBSD or Linux.
#
# "veth"	emulation is done with "veth" virtual devices;
#		encapsulation is standard veth style.
#		Works on FreeBSD only.
#
# "veth-ne"	emulation is done with "veth" devices;
#		no encapsulation is used (uses made-up MAC addresses instead)
#		Works on FreeBSD only.
#
# "vlan"	emulation is done with "vlan" devices;
#		uses 802.1q VLAN tagging.
#		Works on FreeBSD or Linux.
#
# "default"	emulation style was not explicitly set by the users;
#		encap style depends on context ("veth" for vnode, "alias" ow)
#
# Note that the encapsulation style for "default" depends on the node type.
# For non-vnodes the default is "alias".  For vnodes the default is "veth"
# since they must always have some pseudo-device on which to hang a route
# table ID; i.e., we cannot just do IP aliasing.
#
# Encapsulation can be specified per link.  The default link encapsulation
# can also be specified by a per-experiment setting.  At the moment, only
# the latter (global) is implemented.
#
my $encapstyle;

604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630
#
# Determine if a given link/lan uses linkdelays
#
sub virtlan_use_linkdelay($$) {
    my ($virtlan,$shaped) = @_;
    #
    # Here are the conditions for using linkdelays on each link/lan
    # Note: $forcelinkdelays and $uselinkdelays are global
    #
    if ( # linkdelays can be globally forced
         $forcelinkdelays ||
         # We use linkdelays on emulated virtlans
         virtlanemulated($virtlan) ||
         # The user requested linkdelays, and this is a virtlan that gets
         # shaped (note - in this case, non-shaped virtlans don't get
         # linkdelays)
         ($uselinkdelays && $shaped) ||
         # The user forced linkdelays for this specific virtlan
         virtlanlinkdelay($virtlan)) {
        # Yep, use linkdelays
        return 1;
    } else {
        # No - either won't be delayed at all, or we'll use a delay node
        return 0;
    }
}

631 632 633 634
# For admission control. Not well defined yet.
my $cpu_usage;
my $mem_usage;

635 636 637 638
# XXX woeful NSE hack
my $sim_osid;
my $sim_osname = "FBSD-NSE";

639 640 641
# Allow override of jail/delay osids.
my $jail_osid;
my $delay_osid;
642
my $delay_osid_desire;
643

644 645 646 647 648 649
# Flag that tells us whether to fix-node current
# resources or not during a swap modify. This is
# useful when vnode weights are tweaked by the experimenter
# before doing a swapmod
my $fix_current_resources;

650
######################################################################
Leigh Stoller's avatar
Leigh Stoller committed
651

652 653 654 655 656
# ips is indexed by node:port and contains the IP address for the port.
my %ips	      = ();

# memberof is indexed by node:port and holds the lan it is a member of.
my %memberof = ();
657 658

# delaylinks stores the actual link delay info, converted from the
659
# virt delay params above. It is indexed by link name and contains a
660
# [delay,bw,backfill,loss] array for each direction. The final member of the
661 662 663
# array is an indicator of whether the info is there strictly for a 
# trivial link (linkdelay inserted only when assign makes link trivial).
my %delaylinks     = ();
664 665 666 667 668 669 670 671 672 673 674

# delaynodes stores the names of delaynodes that we create on the
# fly using delayid. This is useful for doing isdelay? tests.
my %delaynodes = ();
my $delayid    = 0;

# nodedelays and linkdelays are the final (!) representation. Indexed by
# integer id, they store the physical node info and the delay info. 
my %nodedelays = ();
my %linkdelays = ();

675 676 677 678
# Virtual nodes that the user has requested be "fixed" to a specific
# physical node.
my %fixed_nodes     = ();

679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694
# portbw is indexed by virtual nodeport and contains the bandwidth of
# that port. Note that port bandwidth in the interfaces table is left
# in Mbps units for now. Thats inconsistent, I know. For LANs with
# other bandwidths the port speed will be 100 and a delay node will be
# inserted.
my %portbw = ();

# lannodes is indexed by physical name and is the set of fake lan nodes.
# lan nodes are named lan/<virtual lan>. We need to know these so that
# when they come back from assign, we can ignore them.
my %lannodes = ();

# Node estimates and counts. Avoid running assign if there is no way to
# satisfy the estimates for physical nodes.
my $minimum_nodes;
my $maximum_nodes;
695 696
my $reserved_pcount  = 0;
my $reserved_vcount  = 0;
697
my $reserved_simcount= 0;
698 699 700 701
my $remotecount      = 0;
my $virtcount        = 0;
my $plabcount        = 0;
my $needwanassign    = 0;
702
my $simcount         = 0;
703

704 705 706
# for checks made during topfile creation.
my $toperrors        = 0;

707 708 709 710 711 712 713 714 715 716 717 718
#
# This is for stats gathering. It might duplicate other stuff, but
# thats okay.
#
my %expt_stats = (# pnodes include jailnodes and delaynodes.
		  # We let the wrapper determine pnodes once the
		  # experiment is fully swapped in so that the record
		  # is not "committed" until successful swapin.
		  jailnodes   => 0,
		  vnodes      => 0,
                  # vnodes include wanodes.
		  wanodes     => 0,
719 720
		  # wanodes includes plabnodes.
		  plabnodes   => 0,
721 722 723 724 725 726
		  simnodes    => 0,
		  delaynodes  => 0,
		  linkdelays  => 0,
		  links       => 0,
		  walinks     => 0,
		  lans        => 0,
727
		  wirelesslans => 0,
728 729 730 731 732 733 734
		  shapedlinks => 0,
		  shapedlans  => 0,
		  minlinks    => 100000,
		  # includes emulated links. Maybe thats wrong.
		  maxlinks    => 0,
);

735 736 737
my $simhost_id     = 0;
my %pnode2simhostid;
my %simhostid2pnode;
Shashi Guruprasad's avatar
Shashi Guruprasad committed
738

739
# Counters for generating IDs.
740
my $virtnode_id  = 0;
741
my $veth_id      = 0;
Leigh Stoller's avatar
Leigh Stoller committed
742

743 744 745 746 747 748 749 750 751 752
#
# Every vnode on a pnode gets its own routing
# table, thus an rtabid. In the case of simulated
# nodes, we need rtabids only for border nodes
# i.e. nodes that have links going out of the pnode.
# Either way, new rtabids are requested via getrtabid(<vnode>,<pnode>)
# when the interfaces table is updated or new veth_interfaces
# are inserted. This hash maintains the rtabids per vnode
my %vnode2rtabid = ();

753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812
#
# OSinfo lookups.
#
sub osidlookup($)
{
    my $osid   = $_[0];
    my $osinfo = OSinfo->Lookup($osid);
    if (!defined($osinfo)) {
	tbwarn("No such OSID $osid\n");
	return undef;
    }
    return $osinfo;
}
sub osidpath($)
{
    my $osinfo = osidlookup($_[0]);
    return (defined($osinfo) ? $osinfo->path() : undef);
}
sub osidos($)
{
    my $osinfo = osidlookup($_[0]);
    return (defined($osinfo) ? $osinfo->OS() : undef);
}
sub osidfeaturesupported($$)
{
    my $osinfo = osidlookup($_[0]);
    return (defined($osinfo) ? $osinfo->FeatureSupported($_[1]) : 0);
}
sub osidhaspath($)
{
    my $path = osidpath($_[0]);
    return (defined($path) && ($path ne ""));
}
sub osidnextosinfo($)
{
    my $osinfo = osidlookup($_[0]);
    return undef
	if (!defined($osinfo));
    return $osinfo
	if (!defined($osinfo->nextosid()));
    my $nextosinfo = $osinfo->ResolveNextOSID($experiment);
    return undef
	if (!defined($nextosinfo));
    return $nextosinfo;
}
sub osidnextosid($)
{
    my $nextosinfo = osidnextosinfo($_[0]);
    return (defined($nextosinfo) ? $nextosinfo->osid() : undef);
}
sub osidbootcmd($$$)
{
    my ($osid, $role, $default) = @_;
    my $nextosinfo = osidnextosinfo($osid);
    return undef
	if (!defined($nextosinfo) ||
	    $nextosinfo->OSBootCmd($role, \$default) != 0);
    return $default;
}

813 814
######################################################################
# Step 1 - Setup virtual topology
815
#
816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846
# Here we need to read the virtual topology in from the virt_nodes
# and virt_lans table.  We then need to add delay and lan nodes as
# necessary.
#
# Conversion details:
#   Let L be a LAN with N members.
#   If N == 2 
#      Let N1 be node 1
#      Let N2 be node 2
#      If L is delayed
#         Generate delay node D
#         Link N1 to D
#         Link N2 to D
#      Else
#         Link N1 to N2
#   Else
#      Generate lan node A
#      If L is delayed
#        Foreach node N in L
#           Generate delay node DN
#           Link A to DN
#           Link N to DN
#      Else
#        Foreach node N in L
#           Link N to A
#
# Delay node names:
#  delay nodes are named tbdelayXX N > 2
#   and tbsdelayXX for N == 2.
#
########################################################################
847

848 849

# Slowly convert to using Experiment module.
850
$experiment = Experiment->Lookup($pid, $eid);
851 852 853 854
if (!defined($experiment)) {
    fatal("Could not lookup experiment object!")
}

855 856
printdb "Generating TOP file.\n";
TBDebugTimeStamp("TOP started");
857

858 859 860
#
# Do admission control test right away.
#
861
if (!($toponly || $regression)) {
862
    fatal("Failed admission control checks!")
863
	if (!TBAdmissionControlCheck(undef, $experiment, \%admission_control));
864 865
}

866 867 868 869 870 871
#
# vtypes are a funny mix beteween physical and virtual state, so we have to
# load them before getting the PhysInfo.
#
LoadVirtTypes();

872 873 874 875 876
#
# Load phys info. Interface types, node types, etc. Its any physical stuff
# we need.
#
LoadPhysInfo();
877

878 879 880 881
#
# Load the Experiment info and virt topology.
#
LoadExperiment();
882

883 884 885 886
#
# If updating, load current experiment resources. We have to be careful
# of how this is merged in with the (new) desired topology. See below.
#
887
if ($updating || $regression || $usecurrent) {
888
    LoadCurrent()
889 890 891 892 893 894 895 896
	if ($updating || $usecurrent);

    if ($regression) {
	print STDERR "Freeing reserved nodes in regression mode\n";
	system("export NORELOAD=1; $NFREE -x -a $pid $eid") == 0
	    or fatal("Could not release nodes in regression mode");
    }

897
    print STDERR "Resetting DB before updating.\n";
898
    $experiment->RemovePhysicalState();
899
}
Chad Barb's avatar
Chad Barb committed
900

901 902 903 904
#
# Check Max Concurrent for OSID violations.
#
CheckMaxConcurrent();
905

906 907 908 909
#
# Create the TOP file.
#
CreateTopFile();
910

911 912
TBDebugTimeStamp("TOP finished");

913 914 915 916
# Stop if creating the top file generated errors.
exit($WRAPPER_FAILED)
    if ($toperrors);

917 918 919 920
# Stop here ...
if ($toponly) {
    print "Stopping after creating the TOP file, as directed.\n";
    exit(0);
Leigh Stoller's avatar
Leigh Stoller committed
921
}
922

923

924 925 926 927 928 929 930 931 932
######################################################################
# Step 2 - Assign Loop
# 
# Here we loop up to maxrun times.  In each loop we snapshot the
# current testbed state into a ptop file.  We then run assign.  If
# assign succeeds we attempt to reserve the resources.  If that works
# we're done with step 2 otherwise we loop again.
#
#######################################################################
Leigh Stoller's avatar
Leigh Stoller committed
933

934 935
my $currentrun = 1;
my $canceled   = 0;
936
my $tried_precheck = 0;
937 938

# XXX plab hack - only run assign once on plab topologies, since they're easy
939
# to map and the physical topology does not change frequently.
940
if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
941
    $maxrun = 2;
942
}
943

944 945
TBDebugTimeStamp("assign_loop started");
while (1) {
946
    print "Assign Run $currentrun\n";
947

948
    #
949 950
    # RunAssign returns  0 if successful.
    #           returns -1 if failure, but assign says to stop trying.
951 952
    #           returns  1 if failure, but assign says to try again.
    #           returns  2 if we made some forward progress.
953
    #
954
    my $retval = RunAssign();
955 956 957 958

    # Success!
    last
	if ($retval == 0);
Leigh Stoller's avatar
Leigh Stoller committed
959

960 961 962 963 964
    if ($regression) {
	FinalizeRegression(1);
	fatal("Failed to find solution in regression mode");
    }

965
    if (!$precheck && !$tried_precheck) {
966
	my $ptopfile0 = $ptopfile;
967 968
	my $impotent0 = $impotent;
	print("Trying assign on an empty testbed.\n");
969
	$ptopfile = "$pid-$eid-$$-empty.ptop";
970 971 972 973
	$impotent = 1;
	$precheck = 1;
	my $retval = RunAssign();
	if ($retval != 0) {
974 975
	    fatal({type=>'extra', cause=>'user', severity=>SEV_ERROR,
	           error=>['infeasible_resource_assignment']}, 
976 977 978 979 980 981 982
		  "This experiment cannot be instantiated on this ".
                  "testbed. You have most likely asked for hardware ".
                  "this testbed does not have, such as nodes of a type ".
                  "it does not contain, or nodes with too many network ".
                  "interfaces.  You will need to modify this experiment ".
                  "before it can be swapped in - re-submitting the ".
                  "experiment as-is will always result in failure.");
983 984 985 986
	}
	print("Assign succeeded on an empty testbed.\n");
	$precheck = 0;
	$impotent = $impotent0;
987
	$ptopfile = $ptopfile0;
988 989 990 991
	$tried_precheck = 1;
    }


992
    if ($currentrun >= $maxrun && $retval != 2) {
993 994 995
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['reached_assign_run_limit']},
	      "Reached run limit. Giving up.");
996 997
    }

998 999 1000 1001
    if ($retval < 0) {
	#
	# Failure in assign.
	#
1002 1003 1004
	fatal({type => 'primary', severity => SEV_ERROR,
	       error => ['unretriable_assign_error']},
	      "Unretriable error. Giving up.");
1005
    }
1006
    
1007 1008 1009 1010 1011 1012 1013 1014 1015
    print "Waiting 5 seconds and trying again...\n";
    sleep(5);
    $currentrun++;
}
TBDebugTimeStamp("assign_loop finished");

#
# Run assign once.
# 
1016
sub RunAssign ()
1017
{
1018 1019
    my $assignexitcode = 0;
    
1020
    # Clear globals for each run.
Leigh Stoller's avatar
Leigh Stoller committed
1021 1022
    undef %v2pmap;
    undef %p2vmap;
1023
    undef %v2vmap;
Leigh Stoller's avatar
Leigh Stoller committed
1024
    undef %plinks;
1025
    undef %virtnodes;
1026 1027 1028

    my %toreserve = ();
    my %subnodes  = ();
1029 1030

    # Debugging hack for regression mode. Avoid really long assign runs.
1031
    if ($noassign) {
1032 1033 1034 1035 1036 1037 1038
	if (! -e "assign.log") {
	    print "No existing assign results file!\n";
	    return -1;
	}
	print "Using existing assign results file\n";
	goto skiprun;
    }
Shashi Guruprasad's avatar
Shashi Guruprasad committed
1039
    
1040
    TBDebugTimeStamp("ptopgen started");
1041
    # Snapshot physical resources.
Chad Barb's avatar
Chad Barb committed
1042 1043 1044 1045
    #
    # if updating (-u), include any resources that may already be
    # allocated to experiment in the PTOP results.
    #
1046 1047
    my $ptopargs = "-p $pid ";
    $ptopargs   .= "-e $eid "
1048
	if ($updating);
1049 1050
    $ptopargs   .= "-u "
	if ($updating && $elabinelab);
1051 1052
    $ptopargs   .= "-m $multiplex_factor "
	if (defined($multiplex_factor));
1053 1054
    $ptopargs   .= "-v "
	if ($virtcount);
1055 1056
    $ptopargs   .= "-r "
	if ($remotecount);
1057 1058
    $ptopargs   .= "-S "
	if ($simcount);
1059 1060
    $ptopargs	.= "-a "
    	if ($precheck);
1061 1062 1063 1064
    $ptopargs	.= "-c $delaycap_override "
    	if (defined($delaycap_override));

    print "ptopargs $ptopargs\n";
1065
    system("ptopgen $ptopargs > $ptopfile");
1066
    TBDebugTimeStamp("ptopgen finished");
Leigh Stoller's avatar
Leigh Stoller committed
1067

1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079
    if (scalar(keys(%admission_control))) {
	open(PTOP, ">> $ptopfile") or
	    return -1;

	foreach my $typeclass (keys(%admission_control)) {
	    my $count = $admission_control{$typeclass};

	    print PTOP "set-type-limit $typeclass $count\n";
	}
	close(PTOP);
    }

1080
    TBDebugTimeStamp("assign started");
Leigh Stoller's avatar
Leigh Stoller committed
1081
    # Run assign
1082 1083
    my $cmdargs = "-P $ptopfile $topfile";
    $cmdargs = "-uod -c .75 $cmdargs"
1084
	if ($virtcount || $simcount);
1085 1086
    $cmdargs = "-n $cmdargs"
    	if ($precheck);
1087 1088
    $cmdargs = "-s 123456 $cmdargs"
    	if ($regression);
1089 1090 1091 1092 1093

    my $cmd;

    # If doing an experiment with virtnodes, use the prepass wrapper for assign
    # Turned off for now, because it needs some work.
1094
    if ($useprepass || $prepass) {
1095 1096 1097 1098 1099 1100 1101
    	$cmd = "assign_prepass";
    	$cmdargs = "-m $multiplex_factor $cmdargs"
    	    if ($multiplex_factor);
    } else {
    	$cmd = "assign";
    }

1102
    print "$cmd $cmdargs\n";
Leigh Stoller's avatar
Leigh Stoller committed
1103

1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
    #
    # Fork a child to run assign. Parent spins watching the cancel flag
    # and waiting for assign to finish.
    #
    if (my $childpid = fork()) {
	while (1) {
	    sleep(2);

	    if (waitpid($childpid, &WNOHANG) == $childpid) {
		$assignexitcode = $? >> 8;
		last;
	    }

	    # Check cancel flag.
1118
	    TBGetCancelFlag($pid, $eid, \$canceled);
1119 1120 1121 1122
	    if ($canceled) {
		if ((my $pgrp = getpgrp($childpid)) > 0) {
		    kill('TERM', -$pgrp);
		    waitpid($childpid, 0);
1123

1124 1125 1126
		    fatal({cause => 'canceled', severity => SEV_IMMEDIATE,
			   error => ['cancel_flag']},
			  "Cancel flag set; aborting assign run!");
1127
		    return -1;
1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
		}
		# Loop again to reap child above before exit.
	    }
	}
    }
    else {
	#
	# Change our session so the child can get a killpg without killing
	# the parent. 
	#
        POSIX::setsid();
1139
	exec("nice assign_wrapper2 $cmd $cmdargs > assign.log 2>&1");
1140
	die "Could not start assign!\n";
1141
    }
1142

1143
    # Check cancel flag before continuing. 
1144
    TBGetCancelFlag($pid, $eid, \$canceled);
1145
    if ($canceled) {
1146 1147 1148
	fatal({cause => 'canceled', severity => SEV_IMMEDIATE,
	       error => ['cancel_flag']},
	      "Cancel flag set; aborting assign run!");
1149 1150
	return -1;
    }
1151 1152
    # Check for possible full filesystem ...
    if (-z "assign.log") {
1153
	tbnotice("assign.log is zero length! Stopping ...\n");
1154 1155 1156
	return -1;
    }

1157 1158 1159
    # Saving up assign.log coz each swapin/modify is
    # different and it is nice to have every mapping
    # for debugging and archiving purposes
1160 1161
    # We do not call it .log though, since we do not want it copied
    # out to the user directory every swapin. See Experiment.pm
1162 1163
    my $assignlog = ($regression ? "$pid-$eid.assign" : "$pid-$eid-$$.assign");
    system("/bin/cp assign.log $assignlog");
1164
  skiprun:
1165 1166 1167 1168 1169 1170 1171
    if (!open(ASSIGNFP, "assign.log")) {
	print("Could not open assign logfile!\n");
	return -1;
    }
    printdb "Reading assign results.\n";

    #
1172 1173 1174
    # We no longer care what assign has to say when it fails!
    # Any relevent info was already sent to stderr so just 
    # tell the caller whether we want to keep trying or not.
1175 1176
    #
    if ($assignexitcode) {
1177
	close ASSIGNFP;
1178
	return (($assignexitcode == 1) ? 1 : -1);
1179
    }
1180
    
1181 1182 1183 1184 1185 1186 1187 1188 1189
    #
    # If we were doing the precheck, go ahead and exit now - there won't be
    # any useful information to parse out
    #
    if ($precheck) {
	print "Precheck succeeded.\n";
	return 0;
    }

1190 1191 1192 1193
    #
    # Assign success; parse results.
    # 
    # read nodes section
1194 1195
    my $found_nodes_section = 0;
    while (<ASSIGNFP>) {
1196 1197
	# find the 'BEST SCORE' line and print that out for informational
	# purposes
1198 1199
	if (/BEST SCORE/) {
	    print;
1200
	}
1201 1202 1203 1204 1205 1206 1207 1208 1209
        if (/^Nodes:/) {
            $found_nodes_section = 1;
            last;
        }
    }
    if (!$found_nodes_section) {
        tbwarn("Internal error - unable to find Nodes section in " .
               "assign output");
        return 1;
1210
    }
1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234
    printdb "Nodes:\n";
    while (<ASSIGNFP>) {
	chop;
	/^End Nodes$/ && last;
	@info = split;
	my ($virtual,$physical) = @info[0,1];

	# We don't care about LAN nodes anymore.
	if (defined($lannodes{$virtual})) {
	    next;
	}

	if (physnodeallocated($physical)) {
	    #
	    # Mark node as being reused.
	    #
	    # Look at virtual node being mapped to node;
	    # if it wasn't in the previous map, mark node for reboot.
	    #
	    if (physnodereuse($physical) eq "reboot") {
		# No changes once it goes into reboot.
		;
	    }
	    elsif (virtnodeisvirt($virtual)) {
1235
		#
1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249
		# A new virt virtual node on an existing physical node
		# does not force the physnode to be rebooted; we can
		# set up a new virtnode on it without a reboot. If its
		# an existing virtual on the same physnode, then mark
		# both as reused; no need to reboot either. If the 
		# virtnode has moved here from someplace else, no
		# reboot of the physnode either, but obviously the
		# vnode will be released and a new one allocated.  What
		# we cannot determine is if its just a renamed node
		# (which would require a reboot of the the virtual
		# node). 
		# 
		if (!exists($reserved_v2pmap{$virtual})) {
		    physnodesetreuse($physical, "reused");
1250
		}
1251 1252
		elsif ($reserved_v2pmap{$virtual} eq $physical) {
		    my $reserved = $reserved_v2vmap{$virtual};
1253

1254 1255
		    physnodesetreuse($reserved, "reused");
		    physnodesetreuse($physical, "reused");
1256 1257
		}
		else {
1258
		    physnodesetreuse($physical, "reused");
1259 1260 1261
		}
	    }
	    else {
1262
		#
1263 1264 1265 1266 1267 1268 1269
		# If a new virtual node mapped to this physnode (maybe
		# even the luser changed the name of the node), or if an
		# existing virtual node moved to this physnode, must
		# reboot the physnode. Else, the physnode is being
		# reused as is, and no need to mess with it. If the
		# user requested reboot, that will be handled outside
		# of this script.
1270
		#
1271 1272 1273 1274 1275 1276
		if (!exists($reserved_v2pmap{$virtual}) ||
		    $reserved_v2pmap{$virtual} ne $physical) {
		    physnodesetreuse($physical, "reboot");
		}
		else {
		    physnodesetreuse($physical, "reused");
1277
		}
1278
	    }
1279 1280 1281 1282 1283 1284 1285 1286
	}
	else {
	    #
	    # This is a new node; we'll have to reserve it. Note that
	    # we do not reserve a widearea physnode when a virtual node
	    # is mapped to it; they are special.
	    #
	    $toreserve{$physical} = 1
1287 1288 1289
		if (!virtnodeisvnode($virtual) ||
		    !(virtnodeisremote($virtual)
		      && !nodetypeisdedicatedremote(virtnodetype($virtual))));
1290 1291 1292 1293 1294 1295 1296 1297 1298
	}
	
	if (virtnodeisvirt($virtual)) {
	    #
	    # If mapping a virtual node, then record that, since we need
	    # to allocate the virtnodes on that physnode, later.
	    #
	    if (!defined($virtnodes{$physical})) {
		$virtnodes{$physical} = [];
Shashi Guruprasad's avatar
Shashi Guruprasad committed
1299
	    }
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312
	    push(@{$virtnodes{$physical}}, $virtual);
	}
	elsif (virtnodeissubnode($virtual)) {
	    #
	    # Need to allocate the parent to. Should be optional?
	    # Save away and deal with once we have all the results.
	    #
	    $subnodes{$virtual} = $physical;
	}
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
Leigh Stoller's avatar
Leigh Stoller committed
1313
	}
1314 1315 1316
	push(@{$p2vmap{$physical}}, $virtual);
	printdb "  $virtual $physical\n";
    }
Leigh Stoller's avatar
Leigh Stoller committed
1317

1318 1319 1320 1321 1322 1323 1324 1325 1326
    #
    # Process the subnodes. We have to allocate the parent at the same
    # time, lest it get sucked away for some other purpose by another
    # experiment. We might want to push this off into nalloc, but not
    # sure yet.
    #
    for my $virtual (keys(%subnodes)) {
	my $physical = $subnodes{$virtual};
	my $parent;
1327

1328
	TBPhysNodeID($physical, \$parent);
1329

1330
	printdb "  Subnode: $virtual $physical $parent\n";
1331

1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351
	#
	# See if we already have it. Swapmod, retry, or perhaps
	# the parent could be named separately? Or maybe there are
	# several subnodes on the physnode?
	#
	next
	    if (exists($p2vmap{$parent}));

	# Make up a name and add to the list.
	my $newvname = newvname($parent, "phost");

	$v2pmap{$newvname} = $parent;
	$p2vmap{$parent} = [ $newvname ];
	$toreserve{$parent} = 1;
	printdb "  Adding subnode host: $newvname $parent\n";
    }

    # read Edges
    # By convention, in plinks, the delay node is always the second
    # entry.
1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363
    my $found_edges_section = 1;
    while (<ASSIGNFP>) {
        if (/^Edges:/) {
            $found_edges_section = 1;
            last;
        }
    }
    if (!$found_edges_section) {
        tbwarn("Internal error - unable to find Edges section in " .
               "assign output");
        return 1;
    }
1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
    printdb "Edges:\n";
    EDGEWHILE: while (<ASSIGNFP>) {
	/^End Edges$/ && last EDGEWHILE;
	@info = split;
	$line = $_;
	$_ = $info[1]; # type
        SWITCH1: {
	    /^intraswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^interswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,$#info];
		last SWITCH1;
	    };
	    /^direct$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^trivial$/ && do {
		# we don't have plinks for trivial links
		$vlink = $info[0];
		$plinks{$vlink} = [];
		next EDGEWHILE;
	    };
	    print "Found garbage: $line\n";
Leigh Stoller's avatar
Leigh Stoller committed
1390
	}
1391 1392 1393 1394 1395 1396
	$nodeportA = &getnodeport($rawA);
	$nodeportB = &getnodeport($rawB);
	$nodeportA =~ s/\//:/;
	$nodeportB =~ s/\//:/;
	$plinks{$vlink} = [$nodeportA,$nodeportB];
	printdb "  $vlink " . join(" ",@{$plinks{$vlink}}) . "\n";
Leigh Stoller's avatar
Leigh Stoller committed
1397 1398
    }
    close(ASSIGNFP);
1399
    TBDebugTimeStamp("assign finished");
Leigh Stoller's avatar
Leigh Stoller committed
1400 1401

    # Reserve resources
1402 1403 1404 1405
    if ($impotent) {
	print "Skipping physical reservation, as directed.\n";
	return 0;
    }
1406

1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425
    # From here, we can't recover anymore, coz we move
    # previously reserved pnodes/vnodes to the holding reservation
    # and back. By doing this, we will avoid any UNIQUE key issues
    # when a virt_node in the topology moves from one pnode to another
    # from previous to new mapping.
    # Another reason to do this just before nalloc of a new toreserve
    # nodes is that, we can get into name clashes
    # For example, lets say the user called his node pc2 and it was
    # actually mapped to pc99 in the initial swapin. If this was a
    # swapmod where the user asked for another node node0 which got 
    # mapped to pc2. nalloc of pc2 will result in UNIQUE key problems
    # since there exists a reserved vname pc2 (virtual name). By 
    # having this operation of moving the nodes into 
    # OLDRESERVED_PID/OLDRESERVED_EID and back before a new nalloc,
    # avoid this UNIQUE key problem. Also note that simply updating
    # the vname to be the same as the node_id field also won't
    # work all the time i.e. in the example discussed above
    my $oldreserved_pid = OLDRESERVED_PID;	
    my $oldreserved_eid = OLDRESERVED_EID;
1426
    if (scalar(keys(%oldreservednodes)) && !$oldreservedclean) {
1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back started");
	system("nfree -o $pid $eid " . join(" ", keys(%oldreservednodes)) );
	system("nalloc $pid $eid " . join(" ", keys(%oldreservednodes)) );
	my $exitval  = $? >> 8;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back finished");
	#
	# If nalloc failed with a fatal error, lets give it up. No retry.
	# 
	if ($exitval != 0) {
1440 1441
	    print("Failed to move back Old Reserved nodes back to reserved\n");
	    return -1;
1442
	}
1443

1444 1445 1446 1447 1448 1449
	#
	# We need to only once i.e. in the first call to RunAssign().
	# If it gets repeatedly called coz only some pnode resources
	# got nalloced, we do not have to do the above again.
	#
	$oldreservedclean = 1;
1450 1451
    }

1452 1453 1454 1455
    TBDebugTimeStamp("reserving started");
    system("nalloc -p $pid $eid " . join(" ", keys(%toreserve)));
    TBDebugTimeStamp("reserving finished");
    my $exitval  = $? >> 8;
Chad Barb's avatar
Chad Barb committed
1456

1457 1458 1459 1460 1461 1462 1463
    #
    # If nalloc failed with a fatal error, lets give it up. No retry.
    # 
    if ($exitval < 0) {
	print "Failed to reserve any nodes.\n";
	return -1; 
    }
Chad Barb's avatar
Chad Barb committed
1464

1465 1466 1467 1468 1469 1470 1471
    #
    # Okay, if nalloc got anything, we have to set the norecover bit,
    # since tbswap operates on the principle that any change in the DB
    # means no recover is possible. This can probably me dealt with by
    # deallocating any nodes we allocated in the wrapper before exiting.
    #
    $NoRecover = 1;
1472

1473 1474 1475 1476 1477 1478 1479
    #
    # Otherwise, all newly allocated nodes MUST go to the INIT_DIRTY
    # allocstate since the user now has control of them. If we eventually
    # fail, nodes not in RES_READY are deallocated (retry/modify). 
    #
    if ($exitval > 0) {
	my @reserved = ExpNodes($pid, $eid);
1480
	my $rcount   = scalar(@reserved);
1481
	my $tcount   = scalar(keys(%toreserve));
1482 1483

	# We got only some. Need to figure out which.
1484 1485
	print "Reserved some nodes ($rcount) we needed, ".
	    "but not all ($exitval).\n";
1486 1487 1488
	
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
1489
		$newreservednodes{$node} = $node;
1490
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
1491
	    }
Leigh Stoller's avatar
Leigh Stoller committed
1492 1493
	}

1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504
	#
	# We check to see if were were able to reserve all the fixed
	# nodes we needed. If we couldn't get the fixed list, then
	# this experiment is unlikely to map in the "near" future, so
	# give up now (no retry).
	#
	foreach my $node (values(%fixed_nodes)) {
	    if (! grep {$_ eq $node} @reserved) {
		printdb "  Could not allocate fixed node $node!\n";
		return -1;
	    }
Leigh Stoller's avatar
Leigh Stoller committed
1505
	}
1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516
	#
	# Okay, must extend the fixed list with newly allocated nodes
	# so that we can recreate the top file, and try again with a
	# new set.
	#
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		foreach my $vname (@{$p2vmap{$node}}) {
		    $fixed_nodes{$vname} = $node;
		}
	    }
1517
	}
1518
	CreateTopFile();
1519 1520 1521 1522 1523 1524 1525 1526

	#
	# Return indicator that we made forward progress (got some nodes).
	# Caller will decide if appropriate to keep trying. We made progress
	# if the return value of nalloc (number of nodes not allocated) does
	# not equal the number of nodes we tried to allocate.
	#
	return (($tcount == $exitval) ? 1 : 2);
1527
    }
1528

1529 1530 1531 1532 1533 1534 1535 1536
    #
    # We got all the nodes we wanted. All newly allocated nodes MUST
    # go to the INIT_DIRTY allocstate since the user now has control
    # of them.
    #
    print "Successfully reserved all physical nodes we needed.\n";
	
    foreach my $node (keys(%toreserve)) {
1537 1538
	# Remeber all newly allocated nodes for later free if failure.
	$newreservednodes{$node} = $node;
1539
	TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
Leigh Stoller's avatar
Leigh Stoller committed
1540
    }
1541

1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561
    #
    # Release phys and virt nodes no longer needed. They are marked
    # for teardown. They need to be freed by SOMEONE, currently the
    # wrapper (tbswap), since this only happens when in update mode
    # (swapmod).
    #
    foreach my $pnode (keys(%phys_nodes)) {
	my $reuse = physnodereuse($pnode);
		
	if ($reuse eq "unused") {
	    #
	    # Node was used in previous incarnation, but not any more.
	    #
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_TEARDOWN());
	}
	elsif ($reuse eq "reboot") {
	    #
	    # Node is being reused, but for a different purpose, so
	    # it should be rebooted.
	    #
1562
	    TBSetNodeAllocState($pnode, TBDB_ALLOCSTATE_RES_REBOOT());
1563 1564 1565
	}
    }
    return 0;
Leigh Stoller's avatar
Leigh Stoller committed
1566
}
1567

1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
###########################################################################
# Step 2A
#
# We run the wanassigner to allocate remote nodes. We do this after cause
# it takes so long. We run it just once.
#
# wanassign does its own nalloc.
#
###########################################################################

1578
#
1579
# VIRTNODES HACK: Allocate the remote virtual nodes.
1580
#