linktest.pl 72 KB
Newer Older
1
#!/usr/bin/perl -w -T
2
#
3
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
5
# All rights reserved.
6
#
7

8 9
use strict;
use Class::Struct;
10
use POSIX qw(uname);
11
use POSIX qw(strftime);
12
use IO::Handle;
13
use English;
14
use Socket;
15

16
my $LINKTEST_VERSION = "1.2";
17

18 19 20
#
# XXX config stuff that does not belong on the client-side
#
21
#my $PROJROOT      = "@PROJROOT_DIR@";
22

23 24 25
#
# Linktest test script. This script is set up to run as root on
# experiment nodes. It is invoked by the Linktest daemon after the
26 27 28 29 30 31
# daemon receives a Linktest "START" event. The script runs tests
# using ping, rude/crude (a real-time packet emitter/collector) and a
# locally hacked iperf to test all links in the experiment. If the
# results show a 99% chance that the experiment is configured
# incorrectly, an error is logged to the experiment directory in
# tbdata/linktest.  Valid ranges have been calibrated ahead of time.
32 33 34
#
sub usage() 
{
Timothy Stack's avatar
 
Timothy Stack committed
35 36 37
    print("Usage: linktest.pl\n".
	  " [STARTAT=<test step, 1-4>]\n".
	  " [STOPAT=<test step, 1-4>]\n".
Mike Hibler's avatar
Mike Hibler committed
38
	  " [COMPAT=<version or 0>]\n".
39 40 41
	  " [DOARP=<1=yes, 0=no>]\n".
	  " [REPORTONLY=<1=yes, 0=no>]\n".
	  " [NODES=<n1,n2...>\n".
42 43 44 45 46 47
	  " [DEBUG=<debugging level. 1=on, 0=off>]\n".
	  " [LOGRUN=<command/used/for/logging>]\n".
	  " [LOGDIR=<path/to/log/root>]\n".
	  " [BINDIR=<path/to/binary/files>]\n".
	  " [VARDIR=<path/to/config/files>]\n".
	  " [EVENTSERVER=<eventserver hostname>]\n");
48
    print("    <test step>: 1=conn/latency, 2=routing, 3=loss, 4=BW\n".
Mike Hibler's avatar
Mike Hibler committed
49
	  "    COMPAT=<version>: remain compatible with version <version> or earlier\n".
50 51
	  "    DOARP=1: run a single-ping pass to create ARP entries\n".
	  "    REPORTONLY=1: report stats only, do not pass judgement\n".
52 53 54 55 56 57
	  "    NODES: comma-separated list of virtnames to run on\n".
	  "    LOGRUN: Command to run for high-level logging. The log message as its only argument.".
	  "    LOGDIR: Path used to store stderr logs of individual test utilities.\n".
	  "    BINDIR: Path must contain emulab-rude emulab-crude emulab-iperf\n".
	  "    VARDIR: Path must contain...\n".
	  "    EVENTSERVER: Hostname of event server\n");
58
    exit(0);
59 60
}

61

62 63 64
##############################################################################
# Constants
##############################################################################
65 66 67

# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
68
use constant RUDE_CFG  => "/tmp/rude.cfg";
69
use constant IPERF_DAT => "/tmp/iperf.dat";
70

71 72 73
# Packet size for iperf (1470 default).  Reduce to avoid problems with veths
use constant IPERF_PKTSIZE => 1450;

74 75 76
# max time to run iperf
use constant BW_TEST_MAXDURATION => 10;

77
# iperf test limits.
78 79 80 81 82
use constant LIMIT_BW_HI  => 100000000;
use constant LIMIT_BW_MED =>  10000000;
use constant LIMIT_BW_LO  =>   1000000;
use constant LIMIT_BW_MIN =>     64000;
use constant LIMIT_BW_LOSS => 0.20;
83

84 85
# Make sure that we dont get bogged down in being too accurate! 
# Make sure the error is a certain significance before we start reporting it.
86
use constant INSIGNIFICANT_LAT_ERROR_LO => 0.50;  # ms
87
use constant INSIGNIFICANT_LAT_ERROR_HI => 3.50;  # ms
88 89
use constant INSIGNIFICANT_BW_ERROR_HI  => 0.015; # percent.
use constant INSIGNIFICANT_BW_ERROR_LO  => 0.03;  # percent.
Russ Fish's avatar
Russ Fish committed
90
use constant INSIGNIFICANT_BW_ERROR_LO_Windows  => 0.10;  # Lower expectations.
91

92
# slow send rate (for bw from LIMIT_BW_MIN to LIMIT_BW_LO)
93 94 95
use constant SLOW_SEND => 100;
use constant FAST_SEND => 250;
use constant LOSS_TEST_DURATION => 4;	# In seconds.
96

97
# misc contstants
98
use constant BSD => "FreeBSD";
99
use constant LINUX => "Linux";
100
use constant RTPROTO_STATIC => "Static";
101
use constant RTPROTO_SESSION => "Session";
Timothy Stack's avatar
 
Timothy Stack committed
102
use constant EVENT_COMPLETE => "COMPLETE";
103
use constant EVENT_REPORT => "REPORT";
Timothy Stack's avatar
 
Timothy Stack committed
104
use constant EVENT_LOG => "LOG";
105
use constant PING_SEND_COUNT => 10;
David Anderson's avatar
David Anderson committed
106
use constant SYNC_NAMESPACE => "linktest";
107 108 109 110

# crude should have higher priority than rude on the same machine
use constant RUDE_PRI => 3;
use constant CRUDE_PRI => 5;
111 112

# test levels
113 114 115 116
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
117

118
# test names
119
use constant NAME_RT_STATIC => "Routing";
120
use constant NAME_LATENCY => "Latency";
121 122
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
123

124
# error suffix for logs
125
use constant SUFFIX_ERROR => ".error";
126
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
127

128 129
# exit codes
use constant EXIT_ABORTED => -1;
Timothy Stack's avatar
 
Timothy Stack committed
130
use constant EXIT_NOT_OK => 1;
131 132
use constant EXIT_OK => 0;

133 134 135
# Protos
sub TimeStamp();
sub PATH_NICE();
136 137 138

# struct for representing a link.
struct ( edge => {
139
    name => '$',
140
    src => '$',
141
    srcip => '$',
142
    dst => '$',
143
    dstip => '$',
144
    bw  => '$',
145 146
    testbw  => '$',
    bwtime  => '$',
147
    delay => '$',
148
    loss => '$',
149 150 151 152 153
    queuetype => '$',
    mac => '$',
    mpxstyle => '$',
    dstyle => '$',
    symlanignore => '$'});
154

Timothy Stack's avatar
 
Timothy Stack committed
155 156 157
struct ( host => {
    name => '$',
    visited => '$',
158 159 160 161 162 163 164 165 166
    links => '@',
    isvnode => '$',
    pname => '$',
    phost => '$',
    ptype => '$',
    osid => '$',
    os => '$',
    osvers => '$',
    osfeatures => '$'});
Timothy Stack's avatar
 
Timothy Stack committed
167

168
# fixes emacs colorization woes introduced by above struct definition.
Timothy Stack's avatar
 
Timothy Stack committed
169
# struct ( unused => { foo => '$'});
170

171 172 173
use constant TRUE => 1;
use constant FALSE => 0;

174 175 176 177
##############################################################################
# Globals
##############################################################################

178
my $topology_file;    # location of the topology input file.
179
my $ptopology_file;   # location of the physical topology input file.
180
my $synserv;    # synch server node
181
my $rtproto;    # routing protocol
182
my $hostname;   # this hosts name
183 184 185 186
my $exp_id;     # experiment id
my $proj_id;    # project id
my $gid;        # group id
my $platform;   # name of platform
187 188
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
189
my %kill_list;  # PIDS that are running as background procs, needing cleanup.
190
my $debug_level = 0; # enable debug statements
191 192
                     # 1 = print debug statements.
                     # 2 = show STDOUT and STDERR
193 194 195
my $arpit = 1;	# do a single ping to force ARP exchange
my $reportonly = 0; # just report the values seen, no pass/fail judgement
my $printsched = 0; # just print the test schedule
Mike Hibler's avatar
Mike Hibler committed
196
my $compat = 1.1;   # be compatible (wrt. synch) with old versions
197
my $barriers_hit = 1;
198
my $barr_count;   # used by synserv host, nubmer of hosts -1
199
my $log_file;    # common logfile for information saved over time.
200 201

my @hosts; # hosts: list of text strings containing host names.
202
           # sorted alphabetically
Timothy Stack's avatar
 
Timothy Stack committed
203
my %hostmap;
204 205 206
my $numvnodes = 0;
my %vhostmap;
my %linkmembers;
207
my @links; # links: list of edge structs.
208
           # sorted alphabetically by src . dst
209

210 211
my $expt_path;  # experiment path (ie, tbdata) set by init.
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
Timothy Stack's avatar
 
Timothy Stack committed
212 213 214 215 216 217 218 219
my $simname = "ns";
my $swapper = "";
my $swapperid = 0;
my $swappergid = 0;
my $token = -1;
my $error_count = 0;
my $stage_error_count = 0;
my $total_error_count = 0;
220

221 222 223
my $warn_partial_test = 0;
my $warn_unshaped_links = 0;

224 225 226
my $listener_iperf;
my $listener_crude;

227 228 229
##############################################################################
# Main control
##############################################################################
230

231 232 233 234 235 236
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

237 238 239
# Make sure that files written into the experiment subdir are group writable.
umask(0002);

240 241 242 243 244 245
our $LOGRUN = "";
our $PROJROOT = "";
our $VARDIR = "";
our $BINDIR = "";
our $EVENTSERVER = "";

246 247 248 249 250 251 252 253 254 255 256
#
# Parse command arguments. Since Linktest is run via the event system,
# parse out pairs of <symbol>=<value>.
#
foreach my $arg (@ARGV) {
    if($arg =~ /STOPAT=(\d)/) {
	$stopat=$1;
    }
    if($arg =~ /STARTAT=(\d)/) {
	$startat=$1;
    }
Mike Hibler's avatar
Mike Hibler committed
257 258 259
    if($arg =~ /COMPAT=(\d(?:\.\d+))/) {
	$compat=$1/1.0;
    }
260 261 262 263 264 265
    if($arg =~ /DOARP=(\d)/) {
	$arpit=$1;
    }
    if($arg =~ /REPORTONLY=(\d)/) {
	$reportonly=$1;
    }
266 267 268
    if($arg =~ /DEBUG=(\d)/) {
	$debug_level=$1;
    }
269 270 271
    if($arg =~ /PRINTSCHED=(\d)/) {
	$printsched=$1;
    }
Timothy Stack's avatar
 
Timothy Stack committed
272 273 274 275 276 277 278
    if($arg =~ /TOKEN=(\d+)/) {
	$token=$1;
    }
    if($arg =~ /SWAPPER=(\w+)/) {
	$swapper=$1;
	(undef,undef,$swapperid,$swappergid) = getpwnam($swapper);
    }
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
    if($arg =~ /LOGRUN=(.+)/) {
	$LOGRUN = $1;
    }
    if($arg =~ /LOGDIR=(.+)/) {
	$PROJROOT = $1;
    }
    if($arg =~ /VARDIR=(.+)/) {
	$VARDIR = $1;
    }
    if($arg =~ /BINDIR=(.+)/) {
	$BINDIR = $1;
    }
    if($arg =~ /EVENTSERVER=(.+)/) {
	$EVENTSERVER = $1;
    }
294 295
}

Mike Hibler's avatar
Mike Hibler committed
296 297 298 299 300 301 302 303
$compat = 99
    if ($compat == 0);

# XXX no arp test in 1.1 and before
if ($compat < 1.2) {
    $arpit = 0;
}

304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
# path to applications and files
our $PATH_NICKNAME = "$VARDIR/boot/nickname";
our $PATH_KEYFILE = "$VARDIR/boot/eventkey";
our $PATH_RUDE = "$BINDIR/emulab-rude";
our $PATH_CRUDE = "$BINDIR/emulab-crude";
our $PATH_IPERF = "$BINDIR/emulab-iperf";
our $PATH_RCTOPO = "$BINDIR/rc/rc.topomap";
our $PATH_EMULAB_SYNC = "$BINDIR/emulab-sync";
our $PATH_LTEVENT = "$BINDIR/ltevent";
our $PATH_TEVC = "$BINDIR/tevc";
our $RUN_PATH = "$BINDIR"; # where the linktest-ns runs.
our $PATH_SCHEDFILE = "$VARDIR/logs/linktest.sched";
our $PATH_SYNCSERVER = "$VARDIR/boot/syncserver";
our $PATH_TOPOFILE = "$VARDIR/boot/ltmap";
our $PATH_PTOPOFILE = "$VARDIR/boot/ltpmap";

my $schedfile = $PATH_SCHEDFILE;
if ($printsched) {
    open(SCHED, ">$schedfile") or
    die("Could not open schedule log file $schedfile");
} else {
    unlink($schedfile);
}

328 329 330 331
#
# Parse the nickname file to obtain the host name, 
# experiment ID and the project ID.
#
332
my $fname = $PATH_NICKNAME;
333 334 335 336 337 338
die("Could not locate $fname\n") unless -e $fname;
my @results = &read_file($fname);
($hostname, $exp_id, $proj_id) = split /\./, $results[0];
chomp $hostname;
chomp $exp_id;
chomp $proj_id;
Mike Hibler's avatar
Mike Hibler committed
339 340 341 342 343 344 345 346

# taint check pid/eid
if ($proj_id =~ /([-\w]*)/) {
    $proj_id = $1;
}
if ($exp_id =~ /([-\w]*)/) {
    $exp_id = $1;
}
347 348 349 350 351 352
$gid = $proj_id;

#
# Set path variables storing the experiment logging path,
# the current ns file and the output file for topology info.
#
353
$expt_path = "$PROJROOT/$proj_id/exp/$exp_id/tbdata";
354
$linktest_path = "$expt_path/linktest";
355 356
$topology_file = $PATH_TOPOFILE;
$ptopology_file = $PATH_PTOPOFILE;
357 358

#
359 360
# Determine what OS we are.  Used for handling the occasional difference
# in the location of or options to system utilities (e.g., ping).
361 362 363 364
#
($platform) = POSIX::uname();

#
365 366 367 368 369 370
# First, everyone parses the topo files to see who is participating in
# this run.  All instances will be started simultaneuously from the event
# system, so we throw a random sleep in here to keep nodes from pounding
# the NFS server.
#
sleep(int(rand(5)));
371
&my_system($PATH_RCTOPO, "reconfig");
372
&get_topo($topology_file, $ptopology_file);
373 374 375 376 377 378
&debug_top();

#
# Finally, see who the master is for synchronization.
# We prefer the synch server node for the experiment, but if that node
# is not participating, we choose the first node on the host list.
379
#
380
$synserv = "";
381
my $ssname = $PATH_SYNCSERVER;
382 383 384 385 386
if ($ssname) {
    @results = &read_file($ssname);
    ($synserv) = split/\./, $results[0];
    chomp $synserv;
}
387
if (@hosts > 0 && (!$synserv || !exists($hostmap{$synserv}))) {
388 389 390
    $synserv = $hosts[0];
}
&debug("Synch master is $synserv\n");
391

392
#
393 394
# If the current node is the special node, do some housekeeping
# and initialize the barrier count.
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
#
if(&is_special_node()) {
    #
    # If the shared path used by Linktest for logging and temporary
    # files already exists, clear its contents for this run.
    #
    if( -e $linktest_path ) {
	die("Path $linktest_path is not a directory\n") 
	    unless -d $linktest_path;

	opendir (DIR,$linktest_path)
	    ||  die("Could not open $linktest_path: $!");
	my @dirfiles = grep (/error$/,readdir(DIR));
	foreach (@dirfiles) {
	    &do_unlink("$linktest_path/$_");
	}
	closedir(DIR);

    } else {
	# 
	# The shared path does not exist, create it.
	#
	mkdir (&check_filename($linktest_path),0777) 
	    || die("Could not create directory $linktest_path: $!");
Timothy Stack's avatar
 
Timothy Stack committed
419
	chown($swapperid, $swappergid, $linktest_path);
420 421 422 423
    }

    $barr_count = @hosts;
    $barr_count--;
424
}
425

426 427 428 429 430 431 432 433 434 435 436 437 438 439
#
# If there are no links to test, there is nothing to do!
# Do this after we have identified the synch server.
#
if (@links == 0) {
    &debug("No links to test!\n");

    my $msg = "Linktest skipped, no links";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

440 441 442 443 444 445 446 447 448 449 450 451 452
#
# If it has been determined that we are not a part of the run,
# exit now so we don't screw up the barrier synchs.  Note that post_event
# will only happen if we are the synch server and we can only be the
# synch server if no other node is participating in the run either.
#
if (!exists($hostmap{$hostname})) {
    &debug("$hostname not participating in this linktest run.\n");
    &sim_event(EVENT_LOG,"Linktest skipped, no nodes participating");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

453 454 455 456 457
#
# All nodes remove local temporary files remaining from the last
# run, if any.
# 
&do_unlink(CRUDE_DAT);
458
&do_unlink(IPERF_DAT);
459 460 461
&do_unlink(RUDE_CFG);

#
462
# Start up listeners; they run over the lifetime of Linktest to
463
# reduce the number of barrier synchronizations and startup
464
# delays. Always give the collectors a second to start up.
465
#
466 467 468 469 470 471
my $listeners = 0;
if (&dotest(TEST_BW)) {
    if ($printsched) {
	&schedlog("start iperf listener");
    } else {
	$listener_iperf = &start_listener(PATH_NICE, "-n", "-10",
472
					  $PATH_IPERF,"-s","-f","b","-u",
473 474 475 476 477 478 479 480
					  "-w","200000","-l",IPERF_PKTSIZE);
	$listeners++;
    }
}
if (&dotest(TEST_LOSS)) {
    if ($printsched) {
	&schedlog("start crude listener");
    } else {
481
	$listener_crude = &start_listener($PATH_CRUDE,"-l",CRUDE_DAT,
482 483 484 485 486
					  "-P",CRUDE_PRI);
	$listeners++;
    }
}
if ($listeners) {
487 488 489 490 491 492 493 494 495
    sleep(1);
}

#
# Finally, synchronize so that all nodes have completed startup
# tasks. When all nodes reach this barrier, the topology input file
# has been written, local variables are initialized, background
# listeners have been started, and temporary files have been cleared.
#
496 497 498
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-startup");
}
499 500 501
&barrier;

#
502
# Beginning of the tests.
503
#
504 505 506
my $msg = "Linktest Version $LINKTEST_VERSION";
&sim_event(EVENT_LOG, $msg);
&post_event(EVENT_REPORT, $msg);
Timothy Stack's avatar
 
Timothy Stack committed
507 508
&debug("\n$msg\n\n");

509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
#
# Print out any warnings that alter the behavior of the run
#
if ($warn_partial_test) {
    my $msg = "*** WARNING: some hosts do not support linktest,".
	      " skipping links between those hosts";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}
if ($warn_unshaped_links && &dotest(TEST_BW)) {
    my $msg = "*** WARNING: tb-set-noshaping used on one or more links,".
	      " skipping BW tests for those links.";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}

527 528 529 530 531 532 533
if (defined($rtproto) && $rtproto eq RTPROTO_SESSION) {
    my $msg = "Session routing active; waiting a bit to let routes stabilize";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    sleep(30);
}

534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
if ($arpit) {
    my $stamp = TimeStamp();
    my $msg   = "Pre-test ARP on all nodes ... $stamp";
    &post_event(EVENT_REPORT,$msg);
    &sim_event(EVENT_LOG,$msg);
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-arp test");
    }
    &barrier();
    &debug("\n$msg\n\n");
    &force_arp;
}

549
if(&dotest(TEST_LATENCY)) {
550 551
    my $stamp = TimeStamp();
    my $msg   = "Testing Single Hop Connectivity and Latency ... $stamp";
552
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
553
    &sim_event(EVENT_LOG,$msg);
554 555
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
556 557 558
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-latency test");
    }
559
    &barrier();
560
    &debug("\n$msg\n\n");
561
    &latency_test;
562
    &report_status(NAME_LATENCY);
563 564
}

565
if(&dotest(TEST_RT_STATIC)
566
    && defined($rtproto)
567
    && ($rtproto eq RTPROTO_STATIC || $rtproto eq RTPROTO_SESSION)) {
568
    my $msg;
569
    my $stamp = TimeStamp();
570 571 572 573 574

    if ($total_error_count) {
	$msg = "Skipping Routing tests because of previous errors!";
    }
    else {
575
	$msg = "Testing Routing ... $stamp";
576
    }
577
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
578
    &sim_event(EVENT_LOG,$msg);
579 580
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
581 582 583
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-routing test");
    }
584
    &barrier();
585
    &debug("\n$msg\n\n");
586 587
    if (! $total_error_count) {
	&static_rt_test; # nodes not covered by 1hop test
588
	&report_status(NAME_RT_STATIC);
589 590
    }
    else {
591 592 593
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: post-routing test");
	}
594 595
	&barrier();
    }
596
}
597

598
if(&dotest(TEST_LOSS)) {
599 600
    my $stamp = TimeStamp();
    my $msg = "Testing Loss ... $stamp";
601
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
602
    &sim_event(EVENT_LOG,$msg);
603 604
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
605 606 607
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-loss test");
    }
608
    &barrier();
609
    &debug("\n$msg\n\n");
610
    &loss_test; 
611
    &report_status(NAME_LOSS);
612
}
613

614
if(&dotest(TEST_BW)){
615 616
    my $stamp = TimeStamp();
    my $msg = "Testing Bandwidth ... $stamp";
617
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
618
    &sim_event(EVENT_LOG,$msg);
619
    # Ick, this barrier makes sure the above message gets into the log
620
    # first, so as not to confuse Mike
621 622 623
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-bandwidth test");
    }
624
    &barrier();
625
    &debug("\n$msg\n\n");
626
    &bw_test;
627
    &report_status(NAME_BW);
628
}
629

630 631
&cleanup;

632 633 634
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-test");
}
635 636
&barrier();

Timothy Stack's avatar
 
Timothy Stack committed
637 638 639 640
$msg = "Linktest Done";
&sim_event(EVENT_LOG,$msg);
&debug("\n$msg\n\n");

641 642 643
#
# Send an event indicating that Linktest has completed normally.
#
Timothy Stack's avatar
 
Timothy Stack committed
644
&post_event(EVENT_COMPLETE,"ERROR=$total_error_count CTOKEN=$token");
645

646
exit(EXIT_OK);
647

648

649
##############################################################################
650
# Loss Test Functions
651 652
##############################################################################

653

654
# Writes the configuration file used by RUDE.
655 656
sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
657 658
    my $sample_size = &get_loss_sample_size($edge);
    my $millis      = LOSS_TEST_DURATION * 1000;
659
    my @contents;
660
    
661 662 663 664 665 666 667 668 669
    #
    # Run for the desired time and then tell rude to not transmit, but
    # wait, for the edge one-way latency time.  This way rude will not
    # exit before the last packet has a chance to arrive at the destination.
    # Delaying here simplifies the barrier synchronization (we don't have
    # to do a sub-second delay in perl).  Note that a final, zero rate
    # MODIFY like we use here only works in our modified version of rude!
    # Stock rude will just exit after the last actual transmission; i.e.,
    # prior to the final one-way latency wait.
670 671 672 673
    #
    # START <when>
    # <start-offset-ms> <flowID> ON <src-port> \
    #    <dst-addr>:<dst-port> CONSTANT <packets per second> <packet size>
674 675
    # <stop-offset-ms> <flowID> MODIFY CONSTANT 0 <XX>
    # <stop-offset-ms+one-way-link-latency> <flowID> OFF
676 677 678 679 680
    #
    # For our purposes, the variables are:
    #	$sample_size	packet rate
    #	$millis		time to run
    #
681
    push @contents, "START NOW\n";
682 683
    # Let bind() choose the src port; any constant port may be already in use.
    push @contents, "0000 $stream_id ON 0 " 
684
	. $edge->dst . "-" . $edge->name
685
	    . ":10001 CONSTANT $sample_size 20\n";
686 687 688 689
    if ($edge->delay) {
	push @contents, "$millis $stream_id MODIFY CONSTANT 0 20\n";
	$millis += int($edge->delay);
    }
690
    push @contents, "$millis $stream_id OFF\n";
691 692

    &write_file(RUDE_CFG, @contents);
693 694 695

}

696 697
# Returns the sample size used by the Loss test.
# TODO: why this number? (from my ProbStats book.)
698 699 700
sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
701
	return &round( 2.5 / $edge->loss);
702
    } else {
703 704
	# just in case a slow link with no loss.
	return SLOW_SEND; 
705 706 707
    }
}

708 709 710 711


# returns TRUE if the link loss is valid for the linktest loss test.
sub valid_loss {
712
    my $edge = shift @_;
713
    if($edge->bw >= LIMIT_BW_MIN && $edge->bw < LIMIT_BW_LO) {
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } else {
	return FALSE;
    }
729
}
730 731 732 733


# This test uses RUDE and CRUDE to send a stream of packets
# in both directions over a duplex link.
734 735 736 737 738
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
739 740 741
    my $trun = 1;
    my $rude_arg = "";

Mike Hibler's avatar
Mike Hibler committed
742 743
    # XXX version 1.1 compatibility; used to start crude here and wait
    if ($compat < 1.2) {
744
	&debug("performing barrier synch for backward compatibility\n");
Mike Hibler's avatar
Mike Hibler committed
745 746 747
	&barrier();
    }

748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
    #
    # XXX "concession" to vnodes: the stock rude is, well..."rude",
    # when it comes to CPU usage.  It spins between time intervals
    # which is rather painful for vnodes.  So we have a local version
    # which allows sleeping between intervals if the clock resolution
    # is sufficient.  We send 100 pps (10000us) and vnodes have a 1000HZ
    # (1000us) clock, which qualifies as sufficient.
    #
    # So, we add the extra rude option if conditions are met.
    #
    if ($numvnodes && $hostmap{$hostname}->isvnode) {
	my $hz = `/sbin/sysctl kern.clockrate 2>/dev/null`;
	if ($hz =~ /\shz = (\d+),/) {
	    $rude_arg = "-C $1";
	}
    }
764

765
    while(&has_elems(\@edge_copy)) {
766
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy, 1);
767 768 769 770
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
771 772 773 774 775 776 777
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
778
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
779 780 781
				   $rude_arg);
			$analyze{$stream_id} = $other_edge;
		    }
782
		} else {
783 784 785 786
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($edge));
		    }
787 788 789 790
		    &debug("Skipping loss test for " .
			   &print_link($edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($edge) . "\n");
791 792 793 794
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
795 796 797 798 799 800 801
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($other_edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
802
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
803 804 805
				   $rude_arg);
			$analyze{$stream_id} = $edge;
		    }
806
		} else {
807 808 809 810
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($other_edge));
		    }
811 812 813 814
		    &debug("Skipping loss test for " .
			   &print_link($other_edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($other_edge) . "\n");
815 816 817 818
		}
	    }
	}
	$stream_id++;
819 820 821 822
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: loss: after run $trun");
	    $trun++;
	}
823 824 825 826 827 828 829 830
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    sleep(1);

    # count packets received for each stream.
831 832 833 834
    my @results;
    if ($printsched) {
	@results = ();
    } else {
835
	@results = &my_tick($PATH_CRUDE,"-d",CRUDE_DAT);
836
    }
837 838 839
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
840 841 842 843 844
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

845
    # analyze only links for which a stream was received.
846 847
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
848
	my $sent = (&get_loss_sample_size($edge) * LOSS_TEST_DURATION) + 1 ;
849 850
	my $received = $recv_cnt{$key};

851 852 853 854
	if ($reportonly) {
	    &info("    Loss result on $hostname for " .
		  &print_edge($edge) .
		  ": sent/recv = $sent/$received\n");
855
	    next;
856 857
	}

858 859 860 861 862 863 864 865
	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;

866
	    my $p_hat = $received / $sent;
867
	    my $numerator = $p_hat - $p;
868
	    my $denominator = sqrt( abs( $p * (1 - $p_hat) / $sent) );
869

870 871 872 873
	    if ($edge->loss == 0) {
		#
		# Lets not worry about a single lost packet.
		#
874 875
		if ($received < ($sent - 1)) {
		    my $errmsg = "Unexpected loss (sent $sent, received=$received)";
876
		    &error(NAME_LOSS, $edge, $errmsg);
877
		}
878
	    } elsif($denominator == 0) {
879
		my $errmsg = "No packets lost (sent $sent, plr " . $edge->loss .")";
880 881
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
882

883 884 885 886
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
887
		    my $errmsg = "sent $sent, received $received; expected proportion $p, measured proportion $p_hat";
888 889 890 891 892 893
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }
894 895 896
    if (!$printsched) {
	kill_listener($listener_crude);
    }
897

898
    # wait for completion before next test.
899 900 901
    if ($printsched) {
	&schedlog("barrier $barriers_hit: loss: after test");
    }
902 903 904
    &barrier();
}

905 906 907
##############################################################################
# Latency Test Functions
##############################################################################
908

909 910 911
# returns whether the link latency is in a valid test range.
sub valid_latency {
    return TRUE;
912

913
}
914

915 916 917 918 919
# Pings a node and returns information.
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
# @return: (received_count, avg_latency ms)
sub ping_node {
920
    my ($host,$ttl,$scount,$timo) = @_;
921 922 923
    my $count = 0;
    my $avg_latency = 0;
    my $stddev = 0;
924

925 926 927
    my $send_count = defined($scount) ? $scount : PING_SEND_COUNT;
    my $timeout = defined($timo) ? $timo : 3;
    my $send_rate = ($timeout > 1) ? (($timeout - 1) / $send_count) : 0;
928

929 930
    # set deadline to prevent long waits
    my $cmd;
931
    my $ttlarg = "";
932 933
    if($ttl) {
	if($platform eq BSD) {
934
	    $ttlarg = "-m $ttl";
935
	} elsif($platform eq LINUX) {
936 937 938 939
	    $ttlarg = "-t $ttl";
	} elsif($platform =~ /CYGWIN/) {
	    # The Windows system ping has a TTL arg; Cygwin ping doesn't. 
	    $ttlarg = "-i $ttl";
940 941
	}
    }
942 943 944 945 946 947
    my $srarg;
    if ($send_rate) {
	$srarg = "-i $send_rate";
    } else {
	$srarg = "";
    }
948
    if($platform eq BSD) {
949
	$cmd = "/sbin/ping -c $send_count -q $srarg -t $timeout $ttlarg $host";
950
    } elsif($platform eq LINUX) {
951
	$cmd = "/bin/ping -c $send_count -q $srarg -w $timeout $ttlarg $host";
952 953 954 955 956
    } elsif($platform =~ /CYGWIN/) {
	# Neither Windows nor Cygwin ping has either send rate or timeout.
	# Windows ping doesn't have -q, but it does have TTL, so use it.
	$cmd = "/cygdrive/c/WINDOWS/system32/ping.exe -n $send_count $ttlarg $host";
    }
957

958 959 960 961 962 963 964 965 966
    # note backticks passes SIGINT to child procs
    my @args = split(/\s+/,$cmd);
    my @results = &my_tick(@args);
    my $reslt_cnt = @results;
    my $result = $results[$reslt_cnt-2];
    if($platform eq BSD && $result =~ /(\d+) packets received/) {
	$count = $1;
    } elsif($platform eq LINUX && $result =~ /(\d+) received/) {
	$count = $1;
967 968 969
    } elsif($platform =~ /CYGWIN/ && 
	    $results[$reslt_cnt-3] =~ /Received = (\d+)/) {
	$count = $1;
970 971
    }

972 973 974 975 976
    if($count) {
	$result = $results[$reslt_cnt-1];
	if($result=~ /\d+\.\d+\/(\d+\.\d+)\/\d+\.\d+\/(\d+\.\d+)/) {
	    $avg_latency = $1;
	    $stddev = $2;
977 978 979
	} elsif($result=~ /Average = (\d+)ms/) {
	    $avg_latency = $1;
	    $stddev = 0.03;	# Stddev is not reported on Windows.
980 981
	}
    }
982
    return ($count, $avg_latency, $stddev);
983 984
}

985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
#
# Ping each directly connected node once to force an ARP.
#
sub force_arp {
    my %waitlist;
    my @edge_copy = @links;

    while(&has_elems(\@edge_copy)) {
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy, 0);
	if (defined($edge) && defined($other_edge<