linktest.pl.in 29.9 KB
Newer Older
1
#!/usr/bin/perl -w -T
2 3 4 5
#
# Linktest test script.
#
# @author: davidand
6
#
7 8
use strict;
use Class::Struct;
9 10
use POSIX qw(uname);
use IO::Handle;
11

12
# path to applications and files
13
use constant LINKTEST_NSPATH => "@LINKTEST_NSPATH@";
14 15 16 17 18 19 20 21
use constant PATH_NICKNAME => "@CLIENT_VARDIR@/boot/nickname";
use constant PATH_KEYFILE => "@CLIENT_VARDIR@/boot/eventkey";
use constant PATH_RUDE => "/usr/local/bin/rude";
use constant PATH_CRUDE => "/usr/local/bin/crude";
use constant PATH_PATHRATE_SND => "/usr/local/bin/pathrate_snd";
use constant PATH_PATHRATE_RCV => "/usr/local/bin/pathrate_rcv";
use constant PATH_EMULAB_SYNC => "@CLIENT_BINDIR@/emulab-sync";
use constant PATH_LTEVENT => "@CLIENT_BINDIR@/ltevent";
22

23 24
# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
25 26
use constant RUDE_CFG  => "/tmp/rude.cfg";
use constant PATHRATE_DAT => "/tmp/pathrate.dat";
27

28 29 30 31 32 33 34 35 36 37 38 39 40
# pathrate test limits.
use constant LIMIT_BW_HI => 50000000;
use constant LIMIT_BW_LO =>  1000000;
use constant LIMIT_BW_LOSS => 0;

# A hack to make sure that we don't get bogged down in being too
# accurate! Make sure the 
# error is a certain significance before we start reporting it.
use constant INSIGNIFICANT_LAT_ERROR => 0.25; # ms 
use constant INSIGNIFICANT_BW_ERROR  => 0.25;  # mb

# latency must be corrected for xmit delay under this speed.
use constant LAT_LOW_BW => 10000000;
41

42 43 44 45
# slow send rate (for bw 256kbps to 1Mbps)
use constant SLOW_BW => 256000;
use constant SLOW_SEND => 400;
use constant FAST_SEND => 1002;
46

47
# misc contstants
48
use constant BSD => "FreeBSD";
49
use constant LINUX => "Linux";
50 51 52
use constant RTPROTO_STATIC => "Static";
use constant EVENT_STOP => "STOP";
use constant EVENT_SERVER => "boss";
53
use constant PING_SEND_COUNT => 10;
54 55

# test levels
56 57 58 59
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
60

61
# test names
62 63 64 65
use constant NAME_RT_STATIC => "Static Routing";
use constant NAME_LATENCY => "Latency (Round Trip)";
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
66 67 68 69 70

# error suffix for logging linktest and development (fatal) errors
use constant SUFFIX_ERROR => ".error";
use constant SUFFIX_FATAL => ".fatal";
use constant SUFFIX_TOPO =>  ".topology";
71 72
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
use constant LOG_CHANCE => 0.10; # chance of logging.
73

74 75
# more paths
use constant TBCOMPAT_PATH => "/proj/utahstud/users/davidand/public/ltpath";
76
use constant LOG_PATH => "/proj/utahstud/users/davidand/public/logpath/biglog";
77 78 79 80 81 82 83 84 85

# struct for representing a link.
struct ( edge => {
    src => '$',
    dst => '$',
    bw  => '$',
    delay => '$',
    loss => '$'
});
86 87


88 89 90 91 92
##############################################################################
# Globals
##############################################################################

# see init() for initialization of globals
93 94
my $ns_file;    # ns file full path
my $synserv;    # synch server node
95
my $rtproto;    # routing protocol
96
my $hostname;   # this hosts name
97 98 99 100
my $exp_id;     # experiment id
my $proj_id;    # project id
my $gid;        # group id
my $platform;   # name of platform
101 102 103 104
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
my @kill_list;  # PIDs maintained through the life of linktest
                    # which get killed as part of cleanup.
105 106 107
my $debug_level = 0; # enable debug statements
                    # 1 = print debug statements.
                    # 2 = show STDOUT and STDERR
108
my $barr_count;   # used by synserv host, nubmer of hosts -1
109
my $server = "boss";       # event server. default to boss.
110 111

my @hosts; # hosts: list of text strings containing host names.
112
           # sorted alphabetically
113
my @links; # links: list of edge structs.
114
           # sorted alphabetically by src . dst
115
                                  
116 117
my $expt_path;  # experiment path (ie, tbdata) set by init.
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
118

119
# full path to custom NS build. 
120
my $ns_cmd; 
121

122 123 124 125 126 127 128
# signal handler in case the process is killed.
$SIG{INT} = sub {
    &debug("Linktest killed by SIGINT.\n");
    &cleanup;
    exit(1);
};

129 130
# security
$ENV{'PATH'} = '/bin:/usr/bin';
131 132 133 134

##############################################################################
# Main control
##############################################################################
135
&proc_cmd;
136

137
&init;
138

139 140
&debug_top;

141
if(&dotest(TEST_LATENCY)) {
142
    &debug("\nTesting Single Hop Connectivity and Latency...\n\n");
143
    &latency_test;
144 145
}

146 147 148 149
if(&dotest(TEST_RT_STATIC)
    && defined($rtproto)
    && $rtproto eq RTPROTO_STATIC) {

150 151 152
    &debug("\nTesting Static Routing...\n\n");
    &static_rt_test; # nodes not covered by 1hop test
}
153

154
if(&dotest(TEST_LOSS)) {
155
    &debug("\nTesting Loss...\n\n");
156
    &loss_test; 
157
}
158
if(&dotest(TEST_BW)){
159 160 161
    &debug("\nTesting Bandwidth...\n\n");
    &bw_test;
}
162

163 164
&cleanup;

165
&send_done;
166

167 168
&debug("Done\n");

169 170
exit(0);

171 172 173 174
##############################################################################
# Test procedures
##############################################################################

175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215

sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
    open FCFG,">" . RUDE_CFG ||  die &fatal ("Could not open " . RUDE_CFG);
    print FCFG "START NOW\n";
    print FCFG "0000 $stream_id ON 3001 " . $edge->dst . ":10001 CONSTANT " . &get_loss_sample_size($edge) . " 20\n";
    print FCFG "1000 $stream_id OFF\n";
    close FCFG;

}

sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
	return &round( 10 / $edge->loss);
    } else {
	return SLOW_SEND; # just in case a slow link with no loss.
    }
}

sub print_link {
    my $edge = shift @_;
    return $edge->src . " to " . $edge->dst . " (" . ($edge->bw / 1000000)  . 
	" Mbps, " . $edge->delay . "ms, " . $edge->loss . "% loss)";
}
# See /users/davidand/magic/parallelism/README for some comments about
# why it's OK to run crude/rude in both directions simultaneously.
#
# Due to problems found while testing, the stream test doesn't check latency,
# only loss. See bas:/users/davidand/writeup2/finalpaper.pdf for test details.
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
    while(&has_elems(\@edge_copy)) {
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
216
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
217 218 219 220 221 222 223
		    $analyze{$stream_id} = $other_edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($edge) . "\n");
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
224
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
		    $analyze{$stream_id} = $edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($other_edge) . "\n");
		}
	    }
	}
	$stream_id++;
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    # XXX should actually sleep as a function of the max latency for speed.
    sleep(1);

    # count packets received for each stream.
241 242 243 244
    my @results = &my_tick(PATH_CRUDE,"-d",CRUDE_DAT);
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

    # analyze only links for which a stream was received here.
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
	my $received = $recv_cnt{$key};

	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;
	    my $n = &get_loss_sample_size($edge) + 1 ;


	    my $p_hat = $received / $n;
	    my $numerator = $p_hat - $p;
	    my $denominator = sqrt( $p * (1 - $p_hat) / $n);

	    if( $edge->loss == 0) {
		if($received < $n) {

		    &log("$platform," . TEST_LOSS . "," 
			 . $edge->loss . ","
			 . $n . ","
			 . $received . "\n");
		    
		    my $errmsg = "Unexpected loss occurred (n=$n, received=$received)\n";
		    &error(NAME_LOSS, $edge, $errmsg);
279
		} # note, no logging of succesful 0-loss. (too common).
280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
	    } elsif($denominator == 0) {
		my $errmsg = "No packets were lost (n=$n, plr=" . $edge->loss .")";
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
		&log("$platform," . TEST_LOSS . "," 
		     . $edge->loss . ","
		     . $n . ","
		     . $received . "\n");
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
		    my $errmsg = "Probable loss misconfiguration (n:$n, received: $received; expected: $p, measured=$p_hat)";
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }


    # one more barrier, want to make sure io intensive processes
    # finish before the bandwidth test.
    &barrier();
}


307 308 309 310 311 312 313 314 315 316 317 318 319
# does the current node have a link?
# @return: 1 for yes, 0 else
sub linked_node {
    my $node = shift @_;
    foreach my $edge (@links) {
        if($node eq $edge->src
           || $node eq $edge->dst) {
            return 1;
        }
    }
    return 0;
}

320 321 322 323

# Static routing test
# 
# Attempt to reach all nodes with default TTL
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
sub static_rt_test {

    # only need to attempt to reach nodes that are not on direct links
    # WITH THIS HOST!!
    #
    # direct linked nodes are covered by the single-hop test.
    my @host_copy = @hosts;

    my @this_1hops; # 1hop destinations from this host.

    foreach my $edge(@links) {
	if($edge->src eq $hostname) {
	    push @this_1hops, $edge->dst;
	} elsif ($edge->dst eq $hostname) {
	    push @this_1hops, $edge->src;
339 340 341
	}
    }

342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
    for(my $i=0;$i<@host_copy;$i++) {
	if($host_copy[$i] eq $hostname) {
	    $host_copy[$i] = undef;
	} else {
	    # zap any host in the dst list.
	    foreach my $dst( @this_1hops ) {
		if(defined($host_copy[$i])
		   && ($dst eq $host_copy[$i])) {
		    $host_copy[$i] = undef;
		} 
	    }
	}
    }

    my @waitlist;
357
    # fork processes to run the pings in parallel.
358
    for(my $i=0;$i<@host_copy;$i++) {
359 360 361
	if(defined($host_copy[$i])
           && linked_node($host_copy[$i])
           ) {
362
	    my $dst = $host_copy[$i];
363 364
	    my $pid = fork();
	    if(!$pid) {
365
		my ($recv_cnt,$ignored1, $ignored2) = &ping_node($dst,0);
366
		if(!$recv_cnt) {
367 368 369
		    my $newEdge = new edge;
		    $newEdge->src($hostname);
		    $newEdge->dst($dst);
370
		    &error(NAME_RT_STATIC,$newEdge , "$hostname could not ping $dst");
371 372
		} else {
		    &debug("Attempting to reach $dst... OK\n");
373 374
		}
		exit;
375 376
	    } else {
		push @waitlist,$pid;
377 378 379
	    }
	}
    }
380
    &wait_all(\@waitlist);
381 382
}

383

384
# Ping directly connected nodes and get RTT.
385
#
386 387 388 389
# Note: using RTT is suboptimal because it doesnt split out
# links individually. However, due to problems when running crude/rude
# with timestamps on BSD, I am falling back to RTT for now.
sub latency_test {
390 391
    my @waitlist;
    my @edge_copy = @links;
392

393
    while(&has_elems(\@edge_copy)) {
394 395 396
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src ) {
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493
		# todo: consider ignoring latency if no delay node.
		if(&valid_latency($edge) && &valid_latency($other_edge)
		   ) {
		    my $pid = fork();
		    if(!$pid) {

			# call ping_node with ttl=1
			my ($result_cnt, $sample_avg, $sample_dev) = &ping_node($edge->dst,1);


			my $n = PING_SEND_COUNT;

			if($result_cnt == 0) {
			    my $errmsg = "No packets were received (n=$n)\n";
			    &error(NAME_LATENCY, $edge, $errmsg);
			} else {

# facts from analysis in /users/davidand/public/calibrate.
# came from 40 independent swapins (enough for normality assumption)
# (note that data actually is normal at any particular latency point, 
# according to described.lst)

# best fit regression for the error as a function of total latency, according to sas.
# see regression1.lst and regression1.sas
#-0.00523(actual)     0.00003096 fbsd
#-0.00530(actual)     0.00003478 linux
# roughly identical, so use:
#-0.005(actual)

# inherent delay in the system (with a delay node) is
# see described.lst and described.sas
# 0.337737  fbsd
# 0.362282  linux (median was 0.328000)
# for now, round to:
# 0.333 ms
# since I don't have a way to know which is which.

# Also, described.lst provides good support for the notion that
# the distribution of latencies is normal. For Fbsd all of the 
# distributions were normal, and most were for Linux. So, use this
# assumption in order to send fewer test packets.
			   

			    # the null hypothesis value, u.
			    my $u = $edge->delay + $other_edge->delay;
			    
			    # the calibration as a function of $u
			    $u += 0.333 - 0.005 * $u / 2;


# calibration did not include transport delay,
# so only factor it in under the calibrated level.

			    # transport delay: 64 bytes + 18 bytes eth/crc
			    # to units of ms.
			    if($edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $edge->bw;
			    }
			    if($other_edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $other_edge->bw;
			    }
			    
			    my $x_bar = $sample_avg;
			    my $numerator = $x_bar - $u;

			    my $S = $sample_dev;
			    

			    my $denominator = $S / sqrt($n);

			    if($denominator == 0) {
				my $errmsg = "Invalid sample standard deviation (possible parse problem, please report). (n=$n, u=$u, x_bar=$x_bar, S=$S)";
				&error(NAME_LATENCY, $edge, $errmsg);
			    } else {
				my $z = $numerator / $denominator;

				&log("$platform," . TEST_LATENCY . "," 
				     . $edge->delay . ","
				     . $other_edge->delay . ","
				     . $u . ","
				     . "$n,$sample_avg,$sample_dev,$z\n");

				my $t_reject = 3.250; # alpha = 0.01, df=9

				if(abs($z) > $t_reject
				   && (abs($x_bar - $u) > INSIGNIFICANT_LAT_ERROR)
				   ) {
				    my $errmsg = "Probable latency misconfiguration (expected=$u, measured=$x_bar).";
				    &error(NAME_LATENCY, $edge, $errmsg);
				}
			    }

			}
			exit(0);

		    } else {
			push @waitlist, $pid;
494
		    }
495
		} else {
496
		    &debug("Skipping latency test for " . &print_link($edge) . " to " . &print_link($other_edge) . "\n");
497 498
		}
	    }
499
#
500 501
	}
    }
502 503

    &wait_all(\@waitlist);
504 505
}

506

507 508 509 510 511 512
# Bandwidth test
#
# See /users/davidand/magic/parallelism/README for some comments about
# why each Pathrate process gets its own machine while running.
sub bw_test {
    my @analyze_list;
513
    my @edge_copy;
514
    @edge_copy = @links;
515 516


517 518 519

    # all nodes will execute the same reductions on the edge list
    # on their own so that the number of barriers is the same.
520
    while(&has_elems(\@edge_copy)) {
521
	my $edge = &get_assign(\@edge_copy);
522

523
	if(defined($edge) ) {
524 525 526 527
	    if($hostname eq $edge->dst) {
		if (&valid_bw($edge)) {
		    push @analyze_list, $edge;
		    
528
		    &my_system(PATH_PATHRATE_RCV, "-Q","-s",$edge->src,"-q","-N", PATHRATE_DAT);
529 530
		} else {
		    &debug("Skipping bandwidth test for " . &print_link($edge) . "\n");
531

532
		}
533
	    } 
534 535 536 537
	}
	&barrier();
    }

538

539 540
    # read the log file.
    if(@analyze_list) {
541 542
	my @results = &read_file(PATHRATE_DAT);
	foreach (@results) {
543 544 545 546
	    my $edge = shift(@analyze_list);
	    my $sender = $edge->src;
	    if(/SNDR=$sender.*CAPL=(\d+\.\d+)Mbps.*CAPH=(\d+\.\d+)Mbps/) {
		my $expected = $edge->bw / 1000000;
547 548
		my $low = $1;
		my $hi = $2;
549 550 551 552 553 554

		&log("$platform," . TEST_BW . "," 
		     . $expected . ","
		     . $low . ","
		     . $hi . "\n");

555 556 557 558 559 560 561 562
		my $diff;
		if($expected > $2) {
		    $diff = $expected - $2;
		} elsif ($expected < $1) {
		    $diff = $1 - $expected;
		} else {
		    $diff = 0;
		}
563

564
		my $output = sprintf "Receive from " . $edge->src
565
		    . ": $1/$2/%.1f/%.1f\n", $expected, $diff;
566 567
		&debug($output);

568 569
		if($diff > INSIGNIFICANT_BW_ERROR) {
		    &error (NAME_BW, $edge, "Bandwidth estimate $low to $hi Mbps fails to support expected $expected");
570
		}
571

572
	    } else {
573
		 die &fatal("Error while parsing " . PATHRATE_DAT);
574 575 576 577
	    }
	}
    }
    
578 579
}

580
# enforce BW limits for Pathrate
581 582
sub valid_bw {
    my $edge = shift @_;
583 584 585
    if($edge->bw >= LIMIT_BW_LO
       && $edge->bw <= LIMIT_BW_HI
       && $edge->loss <= LIMIT_BW_LOSS
586
       ) {
587 588 589 590
	return 1;
    } else {
	return 0;
    }
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617
}

# Note, slow bandwidths may cause a problem. 256 kbps can safely handle 400
# pps in crude, so some combinations of loss may overwhelm the link.
sub valid_loss {
    my $edge = shift @_;
    if($edge->bw >= SLOW_BW && $edge->bw < LIMIT_BW_LO) {
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return 0;
	} else {
	    return 1;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return 0;
	} else {
	    return 1;
	}
    } else {
	return 0;
    }
}

# allow all tests, but this hasn't really been tested much under 1Mb.
sub valid_latency {
    return 1;
618

619
}
620

621

622
# Handles reading NS output.
623 624 625
sub get_topo {
    my $ns_outfile = shift(@_);

626 627
    my @results = &read_file($ns_outfile);
    foreach (@results) {
628 629 630 631 632

	# load the output from ns.
	# the file format is simple:
	# expr := h <node name>
	#      || l <src node> <dst node> <bw (Mb/s)> <latency (s)> <loss (%)>
633 634 635
	if( /^h (\S+)/ ) {
	    push @hosts, $1
	} elsif ( /^l (\S+)\s+(\S+)\s+(\d+)\s+(\d\.\d+)\s+(\d\.\d+)/) {
636 637 638 639
	    my $newEdge = new edge;
	    $newEdge->src($1);
	    $newEdge->dst($2);
	    $newEdge->bw($3);
640
	    $newEdge->delay($4 * 1000); # units of ms
641
	    $newEdge->loss($5);
642
	    push @links, $newEdge;
643 644

	# currently recognize only Static routing
645
	} elsif (/^r Static/i) {
646
	    $rtproto = RTPROTO_STATIC;
647 648 649
	}
    }

650 651 652
    # sorted order.
    @hosts = sort { $a cmp $b } @hosts;
    @links = sort { $a->src . $a->dst cmp $b->src . $b->dst } @links;
653 654 655 656 657 658 659 660

}


# prints out the topology read in from the NS file
sub debug_top {
    &debug("ns script: $ns_file\n");
    &debug("nodes:\n");
661
    foreach my $vert (@hosts) {
662 663 664
	&debug( " " . $vert . "\n");
    }
    &debug("links:\n");
665
    foreach my $edge (@links) {
666 667 668 669 670 671 672
	&debug( " " . $edge->src . " " . $edge->dst . " " . $edge->bw
		. " " . $edge->delay . " " . $edge->loss . "\n"
		);
    }
    &debug("routing protocol: $rtproto\n") if defined($rtproto);
}

673
# log to expt problem directory.
674
sub error {
675 676 677
    my($test,$edge,$msg) = @_;

    my $output = "$test\n";
678
    $output .= "  Link:  " . &print_link($edge) . "\n";
679 680
    $output .=     "  Error: $msg\n\n";

681 682 683
    &debug($output);    
    &append_file($linktest_path . "/" . $hostname . SUFFIX_ERROR,
		 $output);
684 685 686 687 688
}

# log to fatal file and exit.
sub fatal {
    my ($msg) = @_;
689 690
    my @output;
    push @output,"Fatal Error: $msg";
691 692 693

    if(defined($linktest_path) && defined($hostname)) {
	my $fname = $linktest_path . "/" . $hostname . SUFFIX_FATAL;
694
	&write_file($fname,@output);
695
    }
696 697 698 699
    
    # clean up any child proceses
    &cleanup;

700
    return $output[0];
701 702
}

703

704 705
# synch all nodes
sub barrier {
706
    if($hostname eq $synserv) {
707 708
	# note, the synserver should know what the node count is
	# since it parsed the NS file.
709
	die &fatal("barrcount not defined!") unless defined($barr_count);
710 711 712
	if($barr_count) {
	    &my_system(PATH_EMULAB_SYNC,"-i",$barr_count);
	}
713
    } else {
714
	&my_system(PATH_EMULAB_SYNC);
715 716
    }

717 718 719
}

sub debug {
720
    return unless $debug_level;
721
    print "@_";
722 723
}

724 725

# initialize globals and set up linktest directory if needed.
726
sub init {
727
    ($platform) = POSIX::uname();
728 729

    if($platform eq BSD) {
730
	$ns_cmd = LINKTEST_NSPATH ."/fbsd/ns";
731
    } elsif ($platform eq LINUX) {
732
	$ns_cmd = LINKTEST_NSPATH ."/linux/ns";
733
    } else {
734
	 die &fatal("Platform $platform is not currently supported.");
735 736
    }

737
    # get the experiment info
738 739 740 741
    my $fname = PATH_NICKNAME;
    die &fatal("Could not locate $fname") unless -e $fname;
    my @results = &read_file($fname);
    ($hostname, $exp_id, $proj_id) = split /\./, $results[0];
742
    chomp $hostname;
743 744 745
    chomp $exp_id;
    chomp $proj_id;
    $gid = $proj_id; # temporarily ignoring groups
746 747

    # get the experiment path and log path
748 749
    $expt_path = "/proj/$proj_id/exp/$exp_id/tbdata";
    $linktest_path = "$expt_path/linktest";
750 751 752 753 754 755 756 757 758
    if(!defined($ns_file)) {
	if(-e "$expt_path/$exp_id-modify.ns") {
	    $ns_file = "$expt_path/$exp_id-modify.ns";
	} elsif (-e "$expt_path/$exp_id.ns") {
	    $ns_file = "$expt_path/$exp_id.ns";
	} else {
	    die &fatal("Could not locate an ns file.");
	}
    }
759 760

    my $ssname = "@CLIENT_VARDIR@/boot/syncserver";
761
     die &fatal("Could not locate an emulab-sync server") unless -e $ssname;
762

763 764
    @results = &read_file($ssname);
    ($synserv) = split/\./, $results[0];
765 766 767
    chomp $synserv;

    # synserv machine makes the linktest directory and invokes ns.
768
    my $outname = "$linktest_path/$exp_id" . SUFFIX_TOPO;
769 770
    if($hostname eq $synserv) {
	if( -e $linktest_path ) {
771
	     die &fatal("Path $linktest_path is not a directory") unless -d $linktest_path;
772

773
	    # blitz all files in the shared directory from the previous run.
774
	    opendir (DIR,$linktest_path) ||  die &fatal("Could not open $linktest_path");
775 776
	    my @dirfiles = grep (!/^\.\.?$/,readdir(DIR));
	    foreach (@dirfiles) {
777
		&do_unlink("$linktest_path/$_");
778 779
	    }

780
	} else {
781
	    mkdir ($linktest_path,0777) or  die &fatal("Could not create directory $linktest_path");
782 783
	}

784
	chdir(TBCOMPAT_PATH);
785 786 787
	@results = &my_tick($ns_cmd,
			    &check_filename($ns_file));
	&write_file($outname,@results);
788
    }
789 790 791 792 793
    
    # blitz local files from last run.
    &do_unlink(CRUDE_DAT);
    &do_unlink(PATHRATE_DAT);
    &do_unlink(RUDE_CFG);
794 795

    # start up collector streams if necessary.
796
    if(&dotest(TEST_LOSS)){
797
	&my_system_initonly(PATH_CRUDE,"-l",CRUDE_DAT);
798
    }
799
    if(&dotest(TEST_BW)){
800
	&my_system_initonly(PATH_PATHRATE_SND,"-i","-q");
801 802
    }

803 804 805 806 807 808 809 810 811
    # synserver host reads the ns file early to get the node count.
    # not using ready count, in practice sometimes didn't get a ready
    # in some cases (expt modify).
    if($hostname eq $synserv) {
	&get_topo($outname) ;
	$barr_count = @hosts;
	$barr_count--;
    }

812 813
    # big first synch waiting for collector startup and ns execution.
    &barrier;
814

815
    # now all nodes read in the topology file output by ns.
816
    &get_topo($outname) unless $hostname eq $synserv;
817 818
}    

819 820 821 822 823 824
# Ping test, which does service both for latency and static
# routing/connectivity tests. Note, send-rate is only 10
# so this can run even on a 1kbps link. 10 packets are enough
# for connectivity even with high loss (plr 0.1, still 0.999 chance
# of getting through).
# 
825 826
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
827
# @return: (received_count, avg_latency ms)
828 829
sub ping_node {
    my ($host,$ttl) = @_;
830 831
    my $count = 0;
    my $avg_latency = 0;
832 833 834 835 836
    my $stddev = 0;

    my $send_count = PING_SEND_COUNT;
    my $timeout = 1; # 1 second
    my $send_rate = $timeout / $send_count;
837 838

    # set deadline to prevent long waits
839
    # TODO: note that some kind of problem occurs with the path.hmm.
840 841
    my $cmd;
    if($ttl) {
842
	if($platform eq BSD) {
843
	    $cmd = "/usr/local/bin/sudo /sbin/ping -c $send_count -q -i $send_rate -t $timeout -m $ttl $host";
844
	} elsif($platform eq LINUX) {
845
	    $cmd = "/usr/bin/sudo /bin/ping -c $send_count -q -i $send_rate -w $timeout -t $ttl $host";
846
	}
847
    } else {
848
	if($platform eq BSD) {
849
	    $cmd = "/usr/local/bin/sudo /sbin/ping -c $send_count -q -i $send_rate -t $timeout $host";
850
	} elsif($platform eq LINUX) {
851
	    $cmd = "/usr/bin/sudo /bin/ping -c $send_count -q -i $send_rate -w $timeout $host";
852
	}
853 854
    }

855
    # note backticks passes SIGINT to child procs
856 857 858
    my @args = split(/\s+/,$cmd);
    my @results = &my_tick(@args);

859 860 861 862 863 864
    my $reslt_cnt = @results;
    my $result = $results[$reslt_cnt-2];
    if($platform eq BSD && $result =~ /(\d+) packets received/) {
	$count = $1;
    } elsif($platform eq LINUX && $result =~ /(\d+) received/) {
	$count = $1;
865
    }
866

867 868
    if($count) {
	$result = $results[$reslt_cnt-1];
869
	if($result=~ /\d+\.\d+\/(\d+\.\d+)\/\d+\.\d+\/(\d+\.\d+)/) {
870
	    $avg_latency = $1;
871
	    $stddev = $2;
872
	}
873
    }
874
    return ($count, $avg_latency, $stddev);
875 876 877
}


878

879
# returns one edge at a time, reserving two nodes.
880 881 882
sub get_assign {
    my ($todo_ref) = @_; # must maintain sorted order invariant
    my $task = undef;
883 884
    my @thisrun;

885 886
    # build a fresh hash to see which nodes are in use.
    my %inuse;
887
    foreach (@hosts) {
888 889 890 891 892
	$inuse{$_}=0;
    }

    for(my $i=0;$i<@{$todo_ref};$i++) {
	my $edge = @{$todo_ref}[$i];
893
	if(defined($edge) && !($inuse{$edge->src} || $inuse {$edge->dst})) {
894 895
	    $inuse{$edge->src} = 1;
	    $inuse{$edge->dst} = 1;
896 897
	    push @thisrun,$edge;
	    @{$todo_ref}[$i] = undef;
898 899 900
	}
    }

901 902 903 904 905 906 907
    # figure out the tasks for this particular host.
    foreach my $edge (@thisrun) {
	if($hostname eq $edge->src || $hostname eq $edge->dst ) {
	    $task = $edge;
	}
    }

908
    # each machine should reduce the todo list the same order due to
909 910
    # alphabetic sorting of info from the ns file.
    # only thing left to do is return this machines' assignment for processing.
911
    return $task; # or undef if no jobs left for this host.
912 913
}

914

915
# returns two edges at a time, reserving two nodes.
916
sub get_twoway_assign {
917
    my ($todo_ref) = @_;
918 919
    my $task = undef;
    my $other_task = undef;
920 921
    my @thisrun;

922 923
    # build a fresh hash to see which nodes are in use.
    my %inuse;
924
    foreach (@hosts) {
925 926 927 928 929
	$inuse{$_}=0;
    }

    for(my $i=0;$i<@{$todo_ref};$i++) {
	my $edge = @{$todo_ref}[$i];
930
	if(defined($edge) && !($inuse{$edge->src} || $inuse {$edge->dst})) {
931 932
	    $inuse{$edge->src} = 1;
	    $inuse{$edge->dst} = 1;
933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948
	    push @thisrun, $edge;
	    @{$todo_ref}[$i]=undef;

	    # get the other side
	    for(my $j=$i;$j<@{$todo_ref};$j++) {
		my $otheredge = @{$todo_ref}[$j];
		if(defined($otheredge)
		   && $edge->src eq  $otheredge->dst
		   && $edge->dst eq $otheredge->src) {
		    push @thisrun,$otheredge;
		    @{$todo_ref}[$j] = undef;
		}
	    }
	    
	}
    }
949 950


951 952 953 954 955 956 957 958 959 960
    # figure out the tasks for this particular host.
    foreach my $edge (@thisrun) {
	if($hostname eq $edge->src || $hostname eq $edge->dst) {
	    $task = $edge;
	}
    }
    if(defined($task)) {
	foreach my $edge (@thisrun) {
	    if($task->dst eq $edge->src && $task->src eq $edge->dst) {
		$other_task = $edge;
961 962 963 964 965 966 967 968 969
	    }
	}
    }

    return ($task,$other_task); # or undef if no jobs left for this machine.


}

970 971 972 973 974 975 976 977 978 979
sub has_elems {
    my ($todo_ref) = @_;
    foreach (@{$todo_ref}) {
	if(defined($_)) {
	    return 1;
	}
    }
    return 0;
}

980

981 982 983 984
sub round {
    my($number) = shift;
    return int($number + .5);
}
985