assign_wrapper.in 131 KB
Newer Older
Leigh B. Stoller's avatar
Leigh B. Stoller committed
1
#!/usr/bin/perl -w
Leigh B. Stoller's avatar
Leigh B. Stoller committed
2
3
#
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2004 University of Utah and the Flux Group.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
5
6
# All rights reserved.
#
7
8
use English;
use Getopt::Std;
9
10
use POSIX qw(setsid);
use POSIX ":sys_wait_h";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
11

12
#
Leigh B. Stoller's avatar
Leigh B. Stoller committed
13
14
15
16
17
18
# This function as the main assign loop.  It converts the virtual
# topology into a top input including LAN and delay translation.  It
# then snapshots the current testbed physical state and runs assign,
# looping a couple times if assign fails.  When assign successfully
# completes it will interpret the results.  Attempt to match any
# existing portmap entries and then update the delays and vlans table.
Chad Barb's avatar
Chad Barb committed
19
#
20
21
22
23
# XXX: Update does not work with widearea nodes.
#      Internally created nodes (jailhost,delay,sim) are not treated
#        consistently. Needs more thought.
#
24
25
26
27
# Return codes: We catch all errors with the END block below, and if
# anyone calls die() (exit value is 255) we add in the CANRECOVER bit.
# Failures in assign always cause the caller to stop retrying. 
#
28
# The CANRECOVER bit indicates 'recoverability' (no db or physical
29
30
# state was modified by the time the error occurred). This is relavent
# to only modify operations (update).
31
32
33
#
my $WRAPPER_SUCCESS		 = 0x00;
my $WRAPPER_FAILED		 = 0x01;	# Failed (Add other values)
34
35
my  $WRAPPER_FAILED_CANRECOVER   = 0x40;        # Can recover from update
my  $WRAPPER_FAILED_FATALLY      = 0x80;	# Do not use this.
36
37
38
# Set this once we modify DB state; forces no recover in fatal().
my $NoRecover = 0;
    
39
40
sub usage ()
{
41
42
43
44
    print STDERR "Usage: $0 [-v] [-u [-f] | -n] pid eid\n";
    print STDERR " -v   - Enables verbose output\n";
    print STDERR " -u   - Enables update mode\n";
    print STDERR " -f   - Fix current resources during update mode\n";
45
46
    print STDERR " -t   - Create the TOP file and then exit\n";
    print STDERR " -n   - Run assign, but do not reserve/modify resources.\n";
47
    exit($WRAPPER_FAILED);
48
}
49
my $optlist  = "vutnf";
50
my $verbose  = 0;
51
my $fixmode  = 0;
52
53
54
my $updating = 0;
my $toponly  = 0;
my $impotent = 0;
55
my $warnings = 0;
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

#
# Configure variables
#
my $TBROOT	  = "@prefix@";
my $DELAYCAPACITY = @DELAYCAPACITY@;
$ENV{'PATH'} = "/usr/bin:$TBROOT/libexec:$TBROOT/sbin:$TBROOT/bin";

#
# Turn off line buffering on output
#
$| = 1;

#
# Testbed Support libraries
#
use lib "@prefix@/lib";
use libdb;
use libtestbed;

Chad Barb's avatar
Chad Barb committed
76
#
77
78
79
80
81
# assign_wrapper Settings
#
# Maximum delay in ms above which a delay node is needed.
# (Note that the DB represents delays as floating point numbers)
my $delaythresh = 2;
Chad Barb's avatar
Chad Barb committed
82

83
84
# Maximum number of times we run assign.
my $maxrun = 20;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
85

86
87
88
# Use the switch to delay when possible. Currentlythis only works for 10mbit
# links (actually, its turned off cause it does not work; auto handshake).
my $delaywithswitch = 0;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
89

90
91
92
93
94
95
96
#
# Some handy constants. Speed in Mbits/sec and Kbits/sec units.
#
# Its probably a good idea to leave portbw (current_speed) in Mbs, since
# those numbers are used at the switch and the interfaces, which really
# only think in Mbps.
#
97
98
99
100
101
102
my $S10Mbs   = 10;
my $S100Mbs  = 100;
my $S1000Mbs = 1000;
my $S10Kbs   = 10000;
my $S100Kbs  = 100000;
my $S1000Kbs = 1000000;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
103

104
#
105
106
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
107
#
108
109
110
%options = ();
if (! getopts($optlist, \%options)) {
    usage();
111
}
112
113
if (@ARGV != 2) {
    usage();
Chad Barb's avatar
Chad Barb committed
114
}
115
if (defined($options{"v"})) {
Chad Barb's avatar
   
Chad Barb committed
116
117
    $verbose = 1;
}
118
if (defined($options{"u"})) {
Chad Barb's avatar
   
Chad Barb committed
119
    $updating = 1;
120
}
121
122
123
124
125
if (defined($options{"t"})) {
    $toponly = 1;
}
if (defined($options{"n"})) {
    $impotent = 1;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
126
}
127
128
129
if (defined($options{"f"})) {
    $fixmode = 1;
}
130
131
132
my $pid = $ARGV[0];
my $eid = $ARGV[1];
my $ptopfile = "$pid-$eid-$$.ptop";
133
134
135
136
137
# Since the topfile could change across
# swapins and modifies, it makes sense
# to store all of them. Helps in
# degugging.
my $topfile  = "$pid-$eid-$$.top";
138

139
140
141
142
#
# All exits happen via this function!
#
sub fatal ($)
143
{
144
    my($message) = @_;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
145

146
147
148
    print STDERR "*** $0:\n".
	         "    $message\n";

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
    # We next go to the END block below.
    exit($WRAPPER_FAILED);
}

#
# We want warnings to cause assign_wrapper to exit abnormally.
# We will come through here no matter how we exit though.
# 
$SIG{__WARN__} = sub { print STDERR $_[0];$warnings++; };

END {
    # Watch for getting here cause of a die()/exit() statement someplace.
    my $exitcode = $?;

    if ($exitcode && $exitcode != $WRAPPER_FAILED) {
	$exitcode = $WRAPPER_FAILED|$WRAPPER_FAILED_FATALLY;
    }
166
    
167
168
169
170
171
172
173
174
175
176
177
178
179
    if ($warnings > 0) {
	print STDERR "*** $0:\n".
	             "    $warnings warnings.\n";

	$exitcode |= $WRAPPER_FAILED;
    }

    # Set recover bit if we are going to fail.
    $exitcode = $exitcode|$WRAPPER_FAILED_CANRECOVER
	if ($exitcode && !$NoRecover);

    # And change the exitcode to be what we want it to be!
    $? = $exitcode;
180
}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
181

182
183
sub printdb ($)
{
Leigh B. Stoller's avatar
Leigh B. Stoller committed
184
185
186
    if ($verbose) {
	print $_[0];
    }
187
}
Leigh B. Stoller's avatar
Leigh B. Stoller committed
188

189
print "assign_wrapper improved started\n";
190
191
TBDebugTimeStamp("assign_wrapper started");

Leigh B. Stoller's avatar
Leigh B. Stoller committed
192
#
193
# The main data structures:
Leigh B. Stoller's avatar
Leigh B. Stoller committed
194
#
195
196
197
198
199
# virt_nodes: The virtual nodes, indexed by vname. Each entry is a
# hash reference, initially of just the DB info, but possibly
# augmented as we proceed through assign.  Do not confuse these
# virtual nodes with the other virtual nodes! These are the ones from
# the actual topology, the virt_nodes table in the DB.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
200
#
201
202
my %virt_nodes = ();

Leigh B. Stoller's avatar
Leigh B. Stoller committed
203
#
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# virt_lans: The equivalent of virt_nodes; the virt_lans table in the DB.
# Since there are multiple rows per lan (one for each node), this is a
# multilevel structure. The first slot is another hash, one for each node.
# The rest of the slots store other random things associated with the lan.
# So, looks something like:
#
#   %virt_lans = (link0 => {members    => member0 => { db row ref },
#                                         member1 => { db row ref }}
#                           mustdelay    => 0,
#                           emulated     => 0,
#                           uselinkdelay => 0,
#                           nobwshaping  => 0,
#                           useveth      => 0,
#                           trivok       => 0
#                          }
#                 link1 => ...
#                )
#
my %virt_lans = ();

#
# virt_vtypes: The virt_vtypes table in the DB, indexed by the vtype
# name (user chosen name).
#
my %virt_vtypes = ();

#
# node_types: The node_types table from the DB, indexed by the type name.
#
my %node_types = ();

Leigh B. Stoller's avatar
Leigh B. Stoller committed
235
# 
236
# interface_capabilities: We need this to find out the bandwidths of the devices
237
238
# we actually have on the testbed. Index by interface type name.
#
239
my %interface_capabilities = ();
Leigh B. Stoller's avatar
Leigh B. Stoller committed
240

241
242
243
244
245
246
247
248
#
# XXX Hack table for determining if a delay node is required. We store
# the native link speeds for each type,class. Indexed by type and class,
# the value is a list of link hardware speeds for which no delay node is
# required. This is an awful way to do this, and quite wrong.
#
my %node_type_linkbw = ();

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#
# phys_nodes: The equiv of virt_nodes above, except that these are pulled
# from the DB once the physical resources have been allocated. Indexed
# by physname, but there is a pointer from the virt_nodes table entry
# to the the corresponding physnode entry. 
# 
my %phys_nodes = ();

#
# More physical side data structures.
# v2pmap is indexed by virtual and contains the physical node.
my %v2pmap = ();
# p2vmap is indexed by physical and contains one or more virtual nodes.
my %p2vmap = ();
# plinks is indexed by virtual name and contains
#  (pnodeportA,pnodeportB) .  If one is a delay node it is always
#  the second.
my %plinks = ();
# virtnodes is the list of subnodes on physnodes.
my %virtnodes = ();
my %v2vmap = ();

#
# Support for experiment modify. We create v2p and v2v mappings of the
# current topology so we can figure out how its changed after assign
# runs. These correspond to v2pmap and v2vmap mentioned above.
# 
my %reserved_v2pmap  = ();
my %reserved_v2vmap  = ();
278
279
280
281
my %oldreservednodes = ();
# reserved_p2vmap is indexed by physical and contains one or more virtual
# nodes
my %reserved_p2vmap  = ();
282

Leigh B. Stoller's avatar
Leigh B. Stoller committed
283
#
284
285
286
287
288
289
290
291
292
293
294
295
# Experiment wide options. See below. They come from the experiments table.
# Defining these will override experiment table setting. 
#
# Set this when forcing linkdelays instead of delay nodes. Set in the NS
# file with a tb-compat directive. The force directive says to set up a
# link delay, even when no delay would otherwise be inserted.
# usewatunnels is also set in the NS, and can be used to turn them off. 
# The multiplex_factor is to override node_types table for virtnode.
my $uselinkdelays;
my $forcelinkdelays;
my $usewatunnels;
my $multiplex_factor;
296
my $experiment_idx;
297

298
299
300
301
# For admission control. Not well defined yet.
my $cpu_usage;
my $mem_usage;

302
303
304
305
# Allow override of jail/delay osids.
my $jail_osid;
my $delay_osid;

306
307
308
309
310
311
# Flag that tells us whether to fix-node current
# resources or not during a swap modify. This is
# useful when vnode weights are tweaked by the experimenter
# before doing a swapmod
my $fix_current_resources;

312
######################################################################
Leigh B. Stoller's avatar
Leigh B. Stoller committed
313

314
315
316
317
318
# ips is indexed by node:port and contains the IP address for the port.
my %ips	      = ();

# memberof is indexed by node:port and holds the lan it is a member of.
my %memberof = ();
319
320

# delaylinks stores the actual link delay info, converted from the
321
322
323
324
325
# virt delay params above. It is indexed by link name and contains a
# [delay,bw,loss] array for each direction. The final member of the
# array is an indicator of whether the info is there strictly for a 
# trivial link (linkdelay inserted only when assign makes link trivial).
my %delaylinks     = ();
326
327
328
329
330
331
332
333
334
335
336

# delaynodes stores the names of delaynodes that we create on the
# fly using delayid. This is useful for doing isdelay? tests.
my %delaynodes = ();
my $delayid    = 0;

# nodedelays and linkdelays are the final (!) representation. Indexed by
# integer id, they store the physical node info and the delay info. 
my %nodedelays = ();
my %linkdelays = ();

337
338
339
340
# Virtual nodes that the user has requested be "fixed" to a specific
# physical node.
my %fixed_nodes     = ();

341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# portbw is indexed by virtual nodeport and contains the bandwidth of
# that port. Note that port bandwidth in the interfaces table is left
# in Mbps units for now. Thats inconsistent, I know. For LANs with
# other bandwidths the port speed will be 100 and a delay node will be
# inserted.
my %portbw = ();

# lannodes is indexed by physical name and is the set of fake lan nodes.
# lan nodes are named lan/<virtual lan>. We need to know these so that
# when they come back from assign, we can ignore them.
my %lannodes = ();

# Node estimates and counts. Avoid running assign if there is no way to
# satisfy the estimates for physical nodes.
my $minimum_nodes;
my $maximum_nodes;
357
358
my $reserved_pcount  = 0;
my $reserved_vcount  = 0;
359
my $reserved_simcount= 0;
360
361
362
363
my $remotecount      = 0;
my $virtcount        = 0;
my $plabcount        = 0;
my $needwanassign    = 0;
364
my $simcount         = 0;
365

366
367
368
369
370
371
372
373
374
375
376
377
#
# This is for stats gathering. It might duplicate other stuff, but
# thats okay.
#
my %expt_stats = (# pnodes include jailnodes and delaynodes.
		  # We let the wrapper determine pnodes once the
		  # experiment is fully swapped in so that the record
		  # is not "committed" until successful swapin.
		  jailnodes   => 0,
		  vnodes      => 0,
                  # vnodes include wanodes.
		  wanodes     => 0,
378
379
		  # wanodes includes plabnodes.
		  plabnodes   => 0,
380
381
382
383
384
385
		  simnodes    => 0,
		  delaynodes  => 0,
		  linkdelays  => 0,
		  links       => 0,
		  walinks     => 0,
		  lans        => 0,
386
		  wirelesslans => 0,
387
388
389
390
391
392
393
		  shapedlinks => 0,
		  shapedlans  => 0,
		  minlinks    => 100000,
		  # includes emulated links. Maybe thats wrong.
		  maxlinks    => 0,
);

394
395
396
my $simhost_id     = 0;
my %pnode2simhostid;
my %simhostid2pnode;
Shashi Guruprasad's avatar
Shashi Guruprasad committed
397

398
# Counters for generating IDs.
399
my $virtnode_id  = 0;
400
my $veth_id      = 0;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
401

402
403
404
405
406
407
408
409
410
411
#
# Every vnode on a pnode gets its own routing
# table, thus an rtabid. In the case of simulated
# nodes, we need rtabids only for border nodes
# i.e. nodes that have links going out of the pnode.
# Either way, new rtabids are requested via getrtabid(<vnode>,<pnode>)
# when the interfaces table is updated or new veth_interfaces
# are inserted. This hash maintains the rtabids per vnode
my %vnode2rtabid = ();

412
413
######################################################################
# Step 1 - Setup virtual topology
414
#
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
# Here we need to read the virtual topology in from the virt_nodes
# and virt_lans table.  We then need to add delay and lan nodes as
# necessary.
#
# Conversion details:
#   Let L be a LAN with N members.
#   If N == 2 
#      Let N1 be node 1
#      Let N2 be node 2
#      If L is delayed
#         Generate delay node D
#         Link N1 to D
#         Link N2 to D
#      Else
#         Link N1 to N2
#   Else
#      Generate lan node A
#      If L is delayed
#        Foreach node N in L
#           Generate delay node DN
#           Link A to DN
#           Link N to DN
#      Else
#        Foreach node N in L
#           Link N to A
#
# Delay node names:
#  delay nodes are named tbdelayXX N > 2
#   and tbsdelayXX for N == 2.
#
########################################################################
Chad Barb's avatar
   
Chad Barb committed
446

447
448
printdb "Generating TOP file.\n";
TBDebugTimeStamp("TOP started");
449

450
451
452
453
454
455
#
# vtypes are a funny mix beteween physical and virtual state, so we have to
# load them before getting the PhysInfo.
#
LoadVirtTypes();

456
457
458
459
460
#
# Load phys info. Interface types, node types, etc. Its any physical stuff
# we need.
#
LoadPhysInfo();
461

462
463
464
465
#
# Load the Experiment info and virt topology.
#
LoadExperiment();
466

467
468
469
470
#
# If updating, load current experiment resources. We have to be careful
# of how this is merged in with the (new) desired topology. See below.
#
471
472
473
474
475
if ($updating) {
    LoadCurrent();
    print STDERR "Resetting DB before updating.\n";
    TBExptRemovePhysicalState( $pid, $eid );
}
Chad Barb's avatar
   
Chad Barb committed
476

477
478
479
480
#
# Check Max Concurrent for OSID violations.
#
CheckMaxConcurrent();
481

482
483
484
485
#
# Create the TOP file.
#
CreateTopFile();
486

487
488
489
490
491
492
TBDebugTimeStamp("TOP finished");

# Stop here ...
if ($toponly) {
    print "Stopping after creating the TOP file, as directed.\n";
    exit(0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
493
}
494

495

496
497
498
499
500
501
502
503
504
######################################################################
# Step 2 - Assign Loop
# 
# Here we loop up to maxrun times.  In each loop we snapshot the
# current testbed state into a ptop file.  We then run assign.  If
# assign succeeds we attempt to reserve the resources.  If that works
# we're done with step 2 otherwise we loop again.
#
#######################################################################
Leigh B. Stoller's avatar
Leigh B. Stoller committed
505

506
507
my $currentrun = 1;
my $canceled   = 0;
508
509

# XXX plab hack - only run assign once on plab topologies, since they're easy
510
# to map and the physical topology does not change frequently.
511
if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
512
    $maxrun = 2;
513
}
514

515
516
TBDebugTimeStamp("assign_loop started");
while (1) {
517
    # Check cancel flag before continuing. 
518
    TBGetCancelFlag($pid, $eid, \$canceled);
519
    fatal("Cancel flag set; aborting assign run!")
520
521
	if ($canceled);

522
    print "Assign Run $currentrun\n";
523

524
    #
525
526
527
    # RunAssign returns  0 if successful.
    #           returns -1 if failure, but assign says to stop trying.
    #           returns -1 if failure, but assign says to try again.
528
    #
529
    my $retval = RunAssign();
530
531
532
533

    # Success!
    last
	if ($retval == 0);
Leigh B. Stoller's avatar
Leigh B. Stoller committed
534

535
    if ($currentrun >= $maxrun) {
536
	fatal("Reached run limit. Giving up.");
537
538
    }

539
540
541
542
    if ($retval < 0) {
	#
	# Failure in assign.
	#
543
	fatal("Unretriable error. Giving up.");
544
    }
545
    
546
547
548
549
550
551
552
553
554
    print "Waiting 5 seconds and trying again...\n";
    sleep(5);
    $currentrun++;
}
TBDebugTimeStamp("assign_loop finished");

#
# Run assign once.
# 
555
sub RunAssign ()
556
557
{
    # Clear globals for each run.
Leigh B. Stoller's avatar
Leigh B. Stoller committed
558
559
    undef %v2pmap;
    undef %p2vmap;
560
    undef %v2vmap;
Leigh B. Stoller's avatar
Leigh B. Stoller committed
561
    undef %plinks;
562
    undef %virtnodes;
563
564
565

    my %toreserve = ();
    my %subnodes  = ();
Shashi Guruprasad's avatar
Shashi Guruprasad committed
566
    
567
    TBDebugTimeStamp("ptopgen started");
568
    # Snapshot physical resources.
Chad Barb's avatar
   
Chad Barb committed
569
570
571
572
    #
    # if updating (-u), include any resources that may already be
    # allocated to experiment in the PTOP results.
    #
573
574
    my $ptopargs = "-p $pid ";
    $ptopargs   .= "-e $eid "
575
576
577
	if ($updating);
    $ptopargs   .= "-m $multiplex_factor "
	if (defined($multiplex_factor));
578
579
    $ptopargs   .= "-v "
	if ($virtcount);
580
581
    $ptopargs   .= "-r "
	if ($remotecount);
582
583
    $ptopargs   .= "-S "
	if ($simcount);
584
    system("ptopgen $ptopargs > $ptopfile");
585
    TBDebugTimeStamp("ptopgen finished");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
586

587
    TBDebugTimeStamp("assign started");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
588
    # Run assign
Leigh B. Stoller's avatar
Leigh B. Stoller committed
589
    my $cmdargs = "$ptopfile $topfile";
590
    $cmdargs = "-uPod -c .75 $cmdargs"
591
	if ($virtcount);
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606

    my $cmd;

    # If doing an experiment with virtnodes, use the prepass wrapper for assign
    # Turned off for now, because it needs some work.
    #if ($virtcount) {
    #	$cmd = "assign_prepass";
    #	$cmdargs = "-m $multiplex_factor $cmdargs"
    #	    if ($multiplex_factor);
    #} else {
    #	$cmd = "assign";
    #}

    $cmd = "assign";
    print "$cmd $cmdargs\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
607

608
609
610
611
612
613
614
615
616
617
618
619
620
621
    #
    # Fork a child to run assign. Parent spins watching the cancel flag
    # and waiting for assign to finish.
    #
    if (my $childpid = fork()) {
	while (1) {
	    sleep(2);

	    if (waitpid($childpid, &WNOHANG) == $childpid) {
		$assignexitcode = $? >> 8;
		last;
	    }

	    # Check cancel flag.
622
	    TBGetCancelFlag($pid, $eid, \$canceled);
623
624
625
626
	    if ($canceled) {
		if ((my $pgrp = getpgrp($childpid)) > 0) {
		    kill('TERM', -$pgrp);
		    waitpid($childpid, 0);
627
628
629

		    print "Cancel flag set; aborting assign run!\n";
		    return -1;
630
631
632
633
634
635
636
637
638
639
640
		}
		# Loop again to reap child above before exit.
	    }
	}
    }
    else {
	#
	# Change our session so the child can get a killpg without killing
	# the parent. 
	#
        POSIX::setsid();
641
	exec("nice $cmd $cmdargs > assign.log");
642
643
644
	die("*** $0:\n".
	    "    Could not start assign!\n");
    }
Chad Barb's avatar
Chad Barb committed
645

646
    # Check cancel flag before continuing. 
647
    TBGetCancelFlag($pid, $eid, \$canceled);
648
649
650
651
    if ($canceled) {
	print("Cancel flag set; aborting assign run!\n");
	return -1;
    }
652
653
654
655
656

    # Saving up assign.log coz each swapin/modify is
    # different and it is nice to have every mapping
    # for debugging and archiving purposes
    system("/bin/cp assign.log assign-$pid-$eid-$$.log");
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
    if (!open(ASSIGNFP, "assign.log")) {
	print("Could not open assign logfile!\n");
	return -1;
    }
    printdb "Reading assign results.\n";

    #
    # We no longer care what assign has to say when it fails! Just
    # tell the caller whether we want to keep trying or not. We still
    # send some of the goo to the output stream so that Rob can quickly
    # deduce what what wrong.
    #
    if ($assignexitcode) {
	my $violations = 0;
	
	print "ASSIGN FAILED:\n";
	while (<ASSIGNFP> !~ /^[\w\s]*precheck:$/) {}
Chad Barb's avatar
Chad Barb committed
674
675
	while (<ASSIGNFP>) {
	    chop;
676
677
	    /^\w*\s*precheck:$/ && do {
		next;
Chad Barb's avatar
Chad Barb committed
678
679
680
681
682
	    };
	    /^With ([0-9]+) violations$/ && do {
		$violations = $1;
		last;
	    };
683
	    print $_ . "\n";
Chad Barb's avatar
Chad Barb committed
684
	}
685
686
687
688
689
690
691
	if ($violations) {
	    while (<ASSIGNFP> !~ /^Violations:/) {}
	    while (<ASSIGNFP>) {
		if (/^Nodes:/) {
		    last;
		}
		print "$_";
692
693
694
	    }
	}
	close(ASSIGNFP);
695
	return (($assignexitcode == 1) ? 1 : -1);
696
    }
697
    
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
    #
    # Assign success; parse results.
    # 
    # read nodes section
    while (<ASSIGNFP> !~ /^Nodes:/) {}
    printdb "Nodes:\n";
    while (<ASSIGNFP>) {
	chop;
	/^End Nodes$/ && last;
	@info = split;
	my ($virtual,$physical) = @info[0,1];

	# We don't care about LAN nodes anymore.
	if (defined($lannodes{$virtual})) {
	    next;
	}

	if (physnodeallocated($physical)) {
	    #
	    # Mark node as being reused.
	    #
	    # Look at virtual node being mapped to node;
	    # if it wasn't in the previous map, mark node for reboot.
	    #
	    if (physnodereuse($physical) eq "reboot") {
		# No changes once it goes into reboot.
		;
	    }
	    elsif (virtnodeisvirt($virtual)) {
727
		#
728
729
730
731
732
733
734
735
736
737
738
739
740
741
		# A new virt virtual node on an existing physical node
		# does not force the physnode to be rebooted; we can
		# set up a new virtnode on it without a reboot. If its
		# an existing virtual on the same physnode, then mark
		# both as reused; no need to reboot either. If the 
		# virtnode has moved here from someplace else, no
		# reboot of the physnode either, but obviously the
		# vnode will be released and a new one allocated.  What
		# we cannot determine is if its just a renamed node
		# (which would require a reboot of the the virtual
		# node). 
		# 
		if (!exists($reserved_v2pmap{$virtual})) {
		    physnodesetreuse($physical, "reused");
Chad Barb's avatar
   
Chad Barb committed
742
		}
743
744
		elsif ($reserved_v2pmap{$virtual} eq $physical) {
		    my $reserved = $reserved_v2vmap{$virtual};
745

746
747
		    physnodesetreuse($reserved, "reused");
		    physnodesetreuse($physical, "reused");
748
749
		}
		else {
750
		    physnodesetreuse($physical, "reused");
751
752
753
		}
	    }
	    else {
Chad Barb's avatar
   
Chad Barb committed
754
		#
755
756
757
758
759
760
761
		# If a new virtual node mapped to this physnode (maybe
		# even the luser changed the name of the node), or if an
		# existing virtual node moved to this physnode, must
		# reboot the physnode. Else, the physnode is being
		# reused as is, and no need to mess with it. If the
		# user requested reboot, that will be handled outside
		# of this script.
762
		#
763
764
765
766
767
768
		if (!exists($reserved_v2pmap{$virtual}) ||
		    $reserved_v2pmap{$virtual} ne $physical) {
		    physnodesetreuse($physical, "reboot");
		}
		else {
		    physnodesetreuse($physical, "reused");
769
		}
770
	    }
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
	}
	else {
	    #
	    # This is a new node; we'll have to reserve it. Note that
	    # we do not reserve a widearea physnode when a virtual node
	    # is mapped to it; they are special.
	    #
	    $toreserve{$physical} = 1
		if (!virtnodeisremote($virtual));
	}
	
	if (virtnodeisvirt($virtual)) {
	    #
	    # If mapping a virtual node, then record that, since we need
	    # to allocate the virtnodes on that physnode, later.
	    #
	    if (!defined($virtnodes{$physical})) {
		$virtnodes{$physical} = [];
Shashi Guruprasad's avatar
Shashi Guruprasad committed
789
	    }
790
791
792
793
794
795
796
797
798
799
800
801
802
	    push(@{$virtnodes{$physical}}, $virtual);
	}
	elsif (virtnodeissubnode($virtual)) {
	    #
	    # Need to allocate the parent to. Should be optional?
	    # Save away and deal with once we have all the results.
	    #
	    $subnodes{$virtual} = $physical;
	}
	
	$v2pmap{$virtual} = $physical;
	if( ! defined($p2vmap{$physical}) ) {
	    $p2vmap{$physical} = [];
Leigh B. Stoller's avatar
Leigh B. Stoller committed
803
	}
804
805
806
	push(@{$p2vmap{$physical}}, $virtual);
	printdb "  $virtual $physical\n";
    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
807

808
809
810
811
812
813
814
815
816
    #
    # Process the subnodes. We have to allocate the parent at the same
    # time, lest it get sucked away for some other purpose by another
    # experiment. We might want to push this off into nalloc, but not
    # sure yet.
    #
    for my $virtual (keys(%subnodes)) {
	my $physical = $subnodes{$virtual};
	my $parent;
817

818
	TBPhysNodeID($physical, \$parent);
819

820
	printdb "  Subnode: $virtual $physical $parent\n";
821

822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
	#
	# See if we already have it. Swapmod, retry, or perhaps
	# the parent could be named separately? Or maybe there are
	# several subnodes on the physnode?
	#
	next
	    if (exists($p2vmap{$parent}));

	# Make up a name and add to the list.
	my $newvname = newvname($parent, "phost");

	$v2pmap{$newvname} = $parent;
	$p2vmap{$parent} = [ $newvname ];
	$toreserve{$parent} = 1;
	printdb "  Adding subnode host: $newvname $parent\n";
    }

    # read Edges
    # By convention, in plinks, the delay node is always the second
    # entry.
    while (<ASSIGNFP> !~ /^Edges:/) { }
    printdb "Edges:\n";
    EDGEWHILE: while (<ASSIGNFP>) {
	/^End Edges$/ && last EDGEWHILE;
	@info = split;
	$line = $_;
	$_ = $info[1]; # type
        SWITCH1: {
	    /^intraswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^interswitch$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,$#info];
		last SWITCH1;
	    };
	    /^direct$/ && do {
		($vlink,$rawA,$rawB) = @info[0,3,5];
		last SWITCH1;
	    };
	    /^trivial$/ && do {
		# we don't have plinks for trivial links
		$vlink = $info[0];
		$plinks{$vlink} = [];
		next EDGEWHILE;
	    };
	    print "Found garbage: $line\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
869
	}
870
871
872
873
874
875
	$nodeportA = &getnodeport($rawA);
	$nodeportB = &getnodeport($rawB);
	$nodeportA =~ s/\//:/;
	$nodeportB =~ s/\//:/;
	$plinks{$vlink} = [$nodeportA,$nodeportB];
	printdb "  $vlink " . join(" ",@{$plinks{$vlink}}) . "\n";
Leigh B. Stoller's avatar
Leigh B. Stoller committed
876
877
    }
    close(ASSIGNFP);
878
    TBDebugTimeStamp("assign finished");
Leigh B. Stoller's avatar
Leigh B. Stoller committed
879
880

    # Reserve resources
881
882
883
884
    if ($impotent) {
	print "Skipping physical reservation, as directed.\n";
	return 0;
    }
Chad Barb's avatar
Chad Barb committed
885

886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
    # From here, we can't recover anymore, coz we move
    # previously reserved pnodes/vnodes to the holding reservation
    # and back. By doing this, we will avoid any UNIQUE key issues
    # when a virt_node in the topology moves from one pnode to another
    # from previous to new mapping.
    # Another reason to do this just before nalloc of a new toreserve
    # nodes is that, we can get into name clashes
    # For example, lets say the user called his node pc2 and it was
    # actually mapped to pc99 in the initial swapin. If this was a
    # swapmod where the user asked for another node node0 which got 
    # mapped to pc2. nalloc of pc2 will result in UNIQUE key problems
    # since there exists a reserved vname pc2 (virtual name). By 
    # having this operation of moving the nodes into 
    # OLDRESERVED_PID/OLDRESERVED_EID and back before a new nalloc,
    # avoid this UNIQUE key problem. Also note that simply updating
    # the vname to be the same as the node_id field also won't
    # work all the time i.e. in the example discussed above
    my $oldreserved_pid = OLDRESERVED_PID;	
    my $oldreserved_eid = OLDRESERVED_EID;
    if (scalar(keys %oldreservednodes)) {
	# We can't recover after this coz we are making changes to
	# the DB
	$NoRecover = 1;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back started");
	system("nfree -o $pid $eid " . join(" ", keys(%oldreservednodes)) );
	system("nalloc $pid $eid " . join(" ", keys(%oldreservednodes)) );
	my $exitval  = $? >> 8;
	TBDebugTimeStamp("Moving Old Reserved nodes to ".
	    		 "$oldreserved_pid/$oldreserved_eid ".
			 "and back finished");
	#
	# If nalloc failed with a fatal error, lets give it up. No retry.
	# 
	if ($exitval != 0) {
922
923
	    print("Failed to move back Old Reserved nodes back to reserved\n");
	    return -1;
924
	}
925

926
927
928
929
930
931
932
	# We need to move this back and forth the holding reservation only
	# once i.e. in the first call to RunAssign(). If it gets repeatedly
	# called coz only some pnode resources got nalloc'ed, we don't have
	# to do the above again.
	undef %oldreservednodes;
    }

933
934
935
936
    TBDebugTimeStamp("reserving started");
    system("nalloc -p $pid $eid " . join(" ", keys(%toreserve)));
    TBDebugTimeStamp("reserving finished");
    my $exitval  = $? >> 8;
Chad Barb's avatar
   
Chad Barb committed
937

938
939
940
941
942
943
944
    #
    # If nalloc failed with a fatal error, lets give it up. No retry.
    # 
    if ($exitval < 0) {
	print "Failed to reserve any nodes.\n";
	return -1; 
    }
Chad Barb's avatar
   
Chad Barb committed
945

946
947
948
949
950
951
952
    #
    # Okay, if nalloc got anything, we have to set the norecover bit,
    # since tbswap operates on the principle that any change in the DB
    # means no recover is possible. This can probably me dealt with by
    # deallocating any nodes we allocated in the wrapper before exiting.
    #
    $NoRecover = 1;
Chad Barb's avatar
   
Chad Barb committed
953

954
955
956
957
958
959
960
961
962
963
964
965
966
967
    #
    # Otherwise, all newly allocated nodes MUST go to the INIT_DIRTY
    # allocstate since the user now has control of them. If we eventually
    # fail, nodes not in RES_READY are deallocated (retry/modify). 
    #
    if ($exitval > 0) {
	my @reserved = ExpNodes($pid, $eid);

	# We got only some. Need to figure out which.
	print "Reserved some nodes ($exitval) we needed, but not all.\n";
	
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		TBSetNodeAllocState($node, TBDB_ALLOCSTATE_RES_INIT_DIRTY());
Chad Barb's avatar
   
Chad Barb committed
968
	    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
969
970
	}

971
972
973
974
975
976
977
978
979
980
981
	#
	# We check to see if were were able to reserve all the fixed
	# nodes we needed. If we couldn't get the fixed list, then
	# this experiment is unlikely to map in the "near" future, so
	# give up now (no retry).
	#
	foreach my $node (values(%fixed_nodes)) {
	    if (! grep {$_ eq $node} @reserved) {
		printdb "  Could not allocate fixed node $node!\n";
		return -1;
	    }
Leigh B. Stoller's avatar
Leigh B. Stoller committed
982
	}
983
984
985
986
987
988
989
990
991
992
993
	#
	# Okay, must extend the fixed list with newly allocated nodes
	# so that we can recreate the top file, and try again with a
	# new set.
	#
	foreach my $node (@reserved) {
	    if (exists($toreserve{$node})) {
		foreach my $vname (@{$p2vmap{$node}}) {
		    $fixed_nodes{$vname} = $node;
		}
	    }
994
	}
995
996
997
	CreateTopFile();
	return 1;
    }
Chad Barb's avatar
Chad Barb committed
998

999
1000
    #
    # We got all the nodes we wanted. All newly allocated nodes MUST
For faster browsing, not all history is shown. View entire blame