linktest.pl 72 KB
Newer Older
1
#!/usr/bin/perl -w -T
2
#
3
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2010 University of Utah and the Flux Group.
5
# All rights reserved.
6
#
7

8 9
use strict;
use Class::Struct;
10
use POSIX qw(uname);
11
use POSIX qw(strftime);
12
use IO::Handle;
13
use English;
14
use Socket;
15

16
my $LINKTEST_VERSION = "1.2";
17

18 19 20
#
# Linktest test script. This script is set up to run as root on
# experiment nodes. It is invoked by the Linktest daemon after the
21 22 23 24 25 26
# daemon receives a Linktest "START" event. The script runs tests
# using ping, rude/crude (a real-time packet emitter/collector) and a
# locally hacked iperf to test all links in the experiment. If the
# results show a 99% chance that the experiment is configured
# incorrectly, an error is logged to the experiment directory in
# tbdata/linktest.  Valid ranges have been calibrated ahead of time.
27 28 29
#
sub usage() 
{
Timothy Stack's avatar
 
Timothy Stack committed
30 31 32
    print("Usage: linktest.pl\n".
	  " [STARTAT=<test step, 1-4>]\n".
	  " [STOPAT=<test step, 1-4>]\n".
Mike Hibler's avatar
Mike Hibler committed
33
	  " [COMPAT=<version or 0>]\n".
34 35 36
	  " [DOARP=<1=yes, 0=no>]\n".
	  " [REPORTONLY=<1=yes, 0=no>]\n".
	  " [NODES=<n1,n2...>\n".
37 38 39 40 41 42
	  " [DEBUG=<debugging level. 1=on, 0=off>]\n".
	  " [LOGRUN=<command/used/for/logging>]\n".
	  " [LOGDIR=<path/to/log/root>]\n".
	  " [BINDIR=<path/to/binary/files>]\n".
	  " [VARDIR=<path/to/config/files>]\n".
	  " [EVENTSERVER=<eventserver hostname>]\n");
43
    print("    <test step>: 1=conn/latency, 2=routing, 3=loss, 4=BW\n".
Mike Hibler's avatar
Mike Hibler committed
44
	  "    COMPAT=<version>: remain compatible with version <version> or earlier\n".
45 46
	  "    DOARP=1: run a single-ping pass to create ARP entries\n".
	  "    REPORTONLY=1: report stats only, do not pass judgement\n".
47 48 49 50 51 52
	  "    NODES: comma-separated list of virtnames to run on\n".
	  "    LOGRUN: Command to run for high-level logging. The log message as its only argument.".
	  "    LOGDIR: Path used to store stderr logs of individual test utilities.\n".
	  "    BINDIR: Path must contain emulab-rude emulab-crude emulab-iperf\n".
	  "    VARDIR: Path must contain...\n".
	  "    EVENTSERVER: Hostname of event server\n");
53
    exit(0);
54 55
}

56

57 58 59
##############################################################################
# Constants
##############################################################################
60 61 62

# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
63
use constant RUDE_CFG  => "/tmp/rude.cfg";
64
use constant IPERF_DAT => "/tmp/iperf.dat";
65

66 67 68
# Packet size for iperf (1470 default).  Reduce to avoid problems with veths
use constant IPERF_PKTSIZE => 1450;

69 70 71
# max time to run iperf
use constant BW_TEST_MAXDURATION => 10;

72
# iperf test limits.
73 74 75 76 77
use constant LIMIT_BW_HI  => 100000000;
use constant LIMIT_BW_MED =>  10000000;
use constant LIMIT_BW_LO  =>   1000000;
use constant LIMIT_BW_MIN =>     64000;
use constant LIMIT_BW_LOSS => 0.20;
78

79 80
# Make sure that we dont get bogged down in being too accurate! 
# Make sure the error is a certain significance before we start reporting it.
81
use constant INSIGNIFICANT_LAT_ERROR_LO => 0.50;  # ms
82
use constant INSIGNIFICANT_LAT_ERROR_HI => 3.50;  # ms
83 84
use constant INSIGNIFICANT_BW_ERROR_HI  => 0.015; # percent.
use constant INSIGNIFICANT_BW_ERROR_LO  => 0.03;  # percent.
Russ Fish's avatar
Russ Fish committed
85
use constant INSIGNIFICANT_BW_ERROR_LO_Windows  => 0.10;  # Lower expectations.
86

87
# slow send rate (for bw from LIMIT_BW_MIN to LIMIT_BW_LO)
88 89 90
use constant SLOW_SEND => 100;
use constant FAST_SEND => 250;
use constant LOSS_TEST_DURATION => 4;	# In seconds.
91

92
# misc contstants
93
use constant BSD => "FreeBSD";
94
use constant LINUX => "Linux";
95
use constant RTPROTO_STATIC => "Static";
96
use constant RTPROTO_SESSION => "Session";
Timothy Stack's avatar
 
Timothy Stack committed
97
use constant EVENT_COMPLETE => "COMPLETE";
98
use constant EVENT_REPORT => "REPORT";
Timothy Stack's avatar
 
Timothy Stack committed
99
use constant EVENT_LOG => "LOG";
100
use constant PING_SEND_COUNT => 10;
David Anderson's avatar
David Anderson committed
101
use constant SYNC_NAMESPACE => "linktest";
102 103 104 105

# crude should have higher priority than rude on the same machine
use constant RUDE_PRI => 3;
use constant CRUDE_PRI => 5;
106 107

# test levels
108 109 110 111
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
112

113
# test names
114
use constant NAME_RT_STATIC => "Routing";
115
use constant NAME_LATENCY => "Latency";
116 117
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
118

119
# error suffix for logs
120
use constant SUFFIX_ERROR => ".error";
121
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
122

123 124
# exit codes
use constant EXIT_ABORTED => -1;
Timothy Stack's avatar
 
Timothy Stack committed
125
use constant EXIT_NOT_OK => 1;
126 127
use constant EXIT_OK => 0;

128 129 130
# Protos
sub TimeStamp();
sub PATH_NICE();
131 132 133

# struct for representing a link.
struct ( edge => {
134
    name => '$',
135
    src => '$',
136
    srcip => '$',
137
    dst => '$',
138
    dstip => '$',
139
    bw  => '$',
140 141
    testbw  => '$',
    bwtime  => '$',
142
    delay => '$',
143
    loss => '$',
144 145 146 147 148
    queuetype => '$',
    mac => '$',
    mpxstyle => '$',
    dstyle => '$',
    symlanignore => '$'});
149

Timothy Stack's avatar
 
Timothy Stack committed
150 151 152
struct ( host => {
    name => '$',
    visited => '$',
153 154 155 156 157 158 159 160 161
    links => '@',
    isvnode => '$',
    pname => '$',
    phost => '$',
    ptype => '$',
    osid => '$',
    os => '$',
    osvers => '$',
    osfeatures => '$'});
Timothy Stack's avatar
 
Timothy Stack committed
162

163
# fixes emacs colorization woes introduced by above struct definition.
Timothy Stack's avatar
 
Timothy Stack committed
164
# struct ( unused => { foo => '$'});
165

166 167 168
use constant TRUE => 1;
use constant FALSE => 0;

169 170 171 172
##############################################################################
# Globals
##############################################################################

173
my $topology_file;    # location of the topology input file.
174
my $ptopology_file;   # location of the physical topology input file.
175
my $synserv;    # synch server node
176
my $rtproto;    # routing protocol
177
my $hostname;   # this hosts name
178 179 180
my $exp_id;     # experiment id
my $proj_id;    # project id
my $platform;   # name of platform
181 182
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
183
my %kill_list;  # PIDS that are running as background procs, needing cleanup.
184
my $debug_level = 0; # enable debug statements
185 186
                     # 1 = print debug statements.
                     # 2 = show STDOUT and STDERR
187 188 189
my $arpit = 1;	# do a single ping to force ARP exchange
my $reportonly = 0; # just report the values seen, no pass/fail judgement
my $printsched = 0; # just print the test schedule
Mike Hibler's avatar
Mike Hibler committed
190
my $compat = 1.1;   # be compatible (wrt. synch) with old versions
191
my $barriers_hit = 1;
192
my $barr_count;   # used by synserv host, nubmer of hosts -1
193
my $log_file;    # common logfile for information saved over time.
194 195

my @hosts; # hosts: list of text strings containing host names.
196
           # sorted alphabetically
Timothy Stack's avatar
 
Timothy Stack committed
197
my %hostmap;
198 199 200
my $numvnodes = 0;
my %vhostmap;
my %linkmembers;
201
my @links; # links: list of edge structs.
202
           # sorted alphabetically by src . dst
203

204
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
Timothy Stack's avatar
 
Timothy Stack committed
205 206 207 208 209 210 211 212
my $simname = "ns";
my $swapper = "";
my $swapperid = 0;
my $swappergid = 0;
my $token = -1;
my $error_count = 0;
my $stage_error_count = 0;
my $total_error_count = 0;
213

214 215 216
my $warn_partial_test = 0;
my $warn_unshaped_links = 0;

217 218 219
my $listener_iperf;
my $listener_crude;

220 221 222
##############################################################################
# Main control
##############################################################################
223

224 225 226 227 228 229
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

230 231 232
# Make sure that files written into the experiment subdir are group writable.
umask(0002);

233 234 235 236 237
# Traditional Emulab defaults that can be overridden but must be set
our $VARDIR = "/var/emulab";
our $BINDIR = "/usr/local/etc/emulab";
our $PROJDIR = "/proj";

238
our $LOGRUN = "";
239
our $LOGDIR = "";
240
our $EVENTSERVER = "";
241
our $EVENTID = "";
242

243 244 245 246 247 248 249 250 251 252 253
#
# Parse command arguments. Since Linktest is run via the event system,
# parse out pairs of <symbol>=<value>.
#
foreach my $arg (@ARGV) {
    if($arg =~ /STOPAT=(\d)/) {
	$stopat=$1;
    }
    if($arg =~ /STARTAT=(\d)/) {
	$startat=$1;
    }
Mike Hibler's avatar
Mike Hibler committed
254 255 256
    if($arg =~ /COMPAT=(\d(?:\.\d+))/) {
	$compat=$1/1.0;
    }
257 258 259 260 261 262
    if($arg =~ /DOARP=(\d)/) {
	$arpit=$1;
    }
    if($arg =~ /REPORTONLY=(\d)/) {
	$reportonly=$1;
    }
263 264 265
    if($arg =~ /DEBUG=(\d)/) {
	$debug_level=$1;
    }
266 267 268
    if($arg =~ /PRINTSCHED=(\d)/) {
	$printsched=$1;
    }
Timothy Stack's avatar
 
Timothy Stack committed
269 270 271 272 273 274 275
    if($arg =~ /TOKEN=(\d+)/) {
	$token=$1;
    }
    if($arg =~ /SWAPPER=(\w+)/) {
	$swapper=$1;
	(undef,undef,$swapperid,$swappergid) = getpwnam($swapper);
    }
276 277 278 279
    if($arg =~ /LOGRUN=(.+)/) {
	$LOGRUN = $1;
    }
    if($arg =~ /LOGDIR=(.+)/) {
280
	$LOGDIR = $1;
281 282 283 284
    }
    if($arg =~ /VARDIR=(.+)/) {
	$VARDIR = $1;
    }
285 286 287
    if($arg =~ /PROJDIR=(.+)/) {
	$PROJDIR = $1;
    }
288 289 290 291 292 293
    if($arg =~ /BINDIR=(.+)/) {
	$BINDIR = $1;
    }
    if($arg =~ /EVENTSERVER=(.+)/) {
	$EVENTSERVER = $1;
    }
294 295
}

Mike Hibler's avatar
Mike Hibler committed
296 297 298 299 300 301 302 303
$compat = 99
    if ($compat == 0);

# XXX no arp test in 1.1 and before
if ($compat < 1.2) {
    $arpit = 0;
}

304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
# path to applications and files
our $PATH_NICKNAME = "$VARDIR/boot/nickname";
our $PATH_KEYFILE = "$VARDIR/boot/eventkey";
our $PATH_RUDE = "$BINDIR/emulab-rude";
our $PATH_CRUDE = "$BINDIR/emulab-crude";
our $PATH_IPERF = "$BINDIR/emulab-iperf";
our $PATH_RCTOPO = "$BINDIR/rc/rc.topomap";
our $PATH_EMULAB_SYNC = "$BINDIR/emulab-sync";
our $PATH_LTEVENT = "$BINDIR/ltevent";
our $PATH_TEVC = "$BINDIR/tevc";
our $RUN_PATH = "$BINDIR"; # where the linktest-ns runs.
our $PATH_SCHEDFILE = "$VARDIR/logs/linktest.sched";
our $PATH_SYNCSERVER = "$VARDIR/boot/syncserver";
our $PATH_TOPOFILE = "$VARDIR/boot/ltmap";
our $PATH_PTOPOFILE = "$VARDIR/boot/ltpmap";

my $schedfile = $PATH_SCHEDFILE;
if ($printsched) {
    open(SCHED, ">$schedfile") or
    die("Could not open schedule log file $schedfile");
} else {
    unlink($schedfile);
}

328 329 330 331
#
# Parse the nickname file to obtain the host name, 
# experiment ID and the project ID.
#
332
my $fname = $PATH_NICKNAME;
333 334 335 336 337 338 339 340 341 342 343
if (-r $fname) {
    my $name = `cat $fname`;
    if ($name =~ /([-\@\w]*)\.([-\@\w]*)\.([-\@\w]*)/) {
	$hostname = $1;
	$exp_id = $2;
	$proj_id = $3;
    } else {
	die("Could not parse $fname info\n");
    }
} else {
    die("Could not locate $fname\n");
Mike Hibler's avatar
Mike Hibler committed
344
}
345 346 347 348 349 350 351 352

#
# Now that we know the pid/eid, defaults some values (unless otherwise set).
#
$LOGDIR = "$PROJDIR/$proj_id/exp/$exp_id/tbdata"
    if ($LOGDIR eq "");
$EVENTID = "$proj_id/$exp_id"
    if ($EVENTID eq "");
353 354 355 356 357

#
# Set path variables storing the experiment logging path,
# the current ns file and the output file for topology info.
#
358
$linktest_path = "$LOGDIR/linktest";
359 360
$topology_file = $PATH_TOPOFILE;
$ptopology_file = $PATH_PTOPOFILE;
361 362

#
363 364
# Determine what OS we are.  Used for handling the occasional difference
# in the location of or options to system utilities (e.g., ping).
365 366 367 368
#
($platform) = POSIX::uname();

#
369 370 371 372 373 374
# First, everyone parses the topo files to see who is participating in
# this run.  All instances will be started simultaneuously from the event
# system, so we throw a random sleep in here to keep nodes from pounding
# the NFS server.
#
sleep(int(rand(5)));
375
&my_system($PATH_RCTOPO, "reconfig");
376
&get_topo($topology_file, $ptopology_file);
377 378 379 380 381 382
&debug_top();

#
# Finally, see who the master is for synchronization.
# We prefer the synch server node for the experiment, but if that node
# is not participating, we choose the first node on the host list.
383
#
384
$synserv = "";
385
my $ssname = $PATH_SYNCSERVER;
386
if ($ssname) {
387
    my @results = &read_file($ssname);
388 389 390
    ($synserv) = split/\./, $results[0];
    chomp $synserv;
}
391
if (@hosts > 0 && (!$synserv || !exists($hostmap{$synserv}))) {
392 393 394
    $synserv = $hosts[0];
}
&debug("Synch master is $synserv\n");
395

396
#
397 398
# If the current node is the special node, do some housekeeping
# and initialize the barrier count.
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
#
if(&is_special_node()) {
    #
    # If the shared path used by Linktest for logging and temporary
    # files already exists, clear its contents for this run.
    #
    if( -e $linktest_path ) {
	die("Path $linktest_path is not a directory\n") 
	    unless -d $linktest_path;

	opendir (DIR,$linktest_path)
	    ||  die("Could not open $linktest_path: $!");
	my @dirfiles = grep (/error$/,readdir(DIR));
	foreach (@dirfiles) {
	    &do_unlink("$linktest_path/$_");
	}
	closedir(DIR);

    } else {
	# 
	# The shared path does not exist, create it.
	#
	mkdir (&check_filename($linktest_path),0777) 
	    || die("Could not create directory $linktest_path: $!");
Timothy Stack's avatar
 
Timothy Stack committed
423
	chown($swapperid, $swappergid, $linktest_path);
424 425 426 427
    }

    $barr_count = @hosts;
    $barr_count--;
428
}
429

430 431 432 433 434 435 436 437 438 439 440 441 442 443
#
# If there are no links to test, there is nothing to do!
# Do this after we have identified the synch server.
#
if (@links == 0) {
    &debug("No links to test!\n");

    my $msg = "Linktest skipped, no links";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

444 445 446 447 448 449 450 451 452 453 454 455 456
#
# If it has been determined that we are not a part of the run,
# exit now so we don't screw up the barrier synchs.  Note that post_event
# will only happen if we are the synch server and we can only be the
# synch server if no other node is participating in the run either.
#
if (!exists($hostmap{$hostname})) {
    &debug("$hostname not participating in this linktest run.\n");
    &sim_event(EVENT_LOG,"Linktest skipped, no nodes participating");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

457 458 459 460 461
#
# All nodes remove local temporary files remaining from the last
# run, if any.
# 
&do_unlink(CRUDE_DAT);
462
&do_unlink(IPERF_DAT);
463 464 465
&do_unlink(RUDE_CFG);

#
466
# Start up listeners; they run over the lifetime of Linktest to
467
# reduce the number of barrier synchronizations and startup
468
# delays. Always give the collectors a second to start up.
469
#
470 471 472 473 474 475
my $listeners = 0;
if (&dotest(TEST_BW)) {
    if ($printsched) {
	&schedlog("start iperf listener");
    } else {
	$listener_iperf = &start_listener(PATH_NICE, "-n", "-10",
476
					  $PATH_IPERF,"-s","-f","b","-u",
477 478 479 480 481 482 483 484
					  "-w","200000","-l",IPERF_PKTSIZE);
	$listeners++;
    }
}
if (&dotest(TEST_LOSS)) {
    if ($printsched) {
	&schedlog("start crude listener");
    } else {
485
	$listener_crude = &start_listener($PATH_CRUDE,"-l",CRUDE_DAT,
486 487 488 489 490
					  "-P",CRUDE_PRI);
	$listeners++;
    }
}
if ($listeners) {
491 492 493 494 495 496 497 498 499
    sleep(1);
}

#
# Finally, synchronize so that all nodes have completed startup
# tasks. When all nodes reach this barrier, the topology input file
# has been written, local variables are initialized, background
# listeners have been started, and temporary files have been cleared.
#
500 501 502
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-startup");
}
503 504 505
&barrier;

#
506
# Beginning of the tests.
507
#
508 509 510
my $msg = "Linktest Version $LINKTEST_VERSION";
&sim_event(EVENT_LOG, $msg);
&post_event(EVENT_REPORT, $msg);
Timothy Stack's avatar
 
Timothy Stack committed
511 512
&debug("\n$msg\n\n");

513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
#
# Print out any warnings that alter the behavior of the run
#
if ($warn_partial_test) {
    my $msg = "*** WARNING: some hosts do not support linktest,".
	      " skipping links between those hosts";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}
if ($warn_unshaped_links && &dotest(TEST_BW)) {
    my $msg = "*** WARNING: tb-set-noshaping used on one or more links,".
	      " skipping BW tests for those links.";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}

531 532 533 534 535 536 537
if (defined($rtproto) && $rtproto eq RTPROTO_SESSION) {
    my $msg = "Session routing active; waiting a bit to let routes stabilize";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    sleep(30);
}

538 539 540 541 542 543 544 545 546 547 548 549 550 551 552
if ($arpit) {
    my $stamp = TimeStamp();
    my $msg   = "Pre-test ARP on all nodes ... $stamp";
    &post_event(EVENT_REPORT,$msg);
    &sim_event(EVENT_LOG,$msg);
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-arp test");
    }
    &barrier();
    &debug("\n$msg\n\n");
    &force_arp;
}

553
if(&dotest(TEST_LATENCY)) {
554 555
    my $stamp = TimeStamp();
    my $msg   = "Testing Single Hop Connectivity and Latency ... $stamp";
556
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
557
    &sim_event(EVENT_LOG,$msg);
558 559
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
560 561 562
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-latency test");
    }
563
    &barrier();
564
    &debug("\n$msg\n\n");
565
    &latency_test;
566
    &report_status(NAME_LATENCY);
567 568
}

569
if(&dotest(TEST_RT_STATIC)
570
    && defined($rtproto)
571
    && ($rtproto eq RTPROTO_STATIC || $rtproto eq RTPROTO_SESSION)) {
572
    my $msg;
573
    my $stamp = TimeStamp();
574 575 576 577 578

    if ($total_error_count) {
	$msg = "Skipping Routing tests because of previous errors!";
    }
    else {
579
	$msg = "Testing Routing ... $stamp";
580
    }
581
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
582
    &sim_event(EVENT_LOG,$msg);
583 584
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
585 586 587
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-routing test");
    }
588
    &barrier();
589
    &debug("\n$msg\n\n");
590 591
    if (! $total_error_count) {
	&static_rt_test; # nodes not covered by 1hop test
592
	&report_status(NAME_RT_STATIC);
593 594
    }
    else {
595 596 597
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: post-routing test");
	}
598 599
	&barrier();
    }
600
}
601

602
if(&dotest(TEST_LOSS)) {
603 604
    my $stamp = TimeStamp();
    my $msg = "Testing Loss ... $stamp";
605
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
606
    &sim_event(EVENT_LOG,$msg);
607 608
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
609 610 611
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-loss test");
    }
612
    &barrier();
613
    &debug("\n$msg\n\n");
614
    &loss_test; 
615
    &report_status(NAME_LOSS);
616
}
617

618
if(&dotest(TEST_BW)){
619 620
    my $stamp = TimeStamp();
    my $msg = "Testing Bandwidth ... $stamp";
621
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
622
    &sim_event(EVENT_LOG,$msg);
623
    # Ick, this barrier makes sure the above message gets into the log
624
    # first, so as not to confuse Mike
625 626 627
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-bandwidth test");
    }
628
    &barrier();
629
    &debug("\n$msg\n\n");
630
    &bw_test;
631
    &report_status(NAME_BW);
632
}
633

634 635
&cleanup;

636 637 638
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-test");
}
639 640
&barrier();

Timothy Stack's avatar
 
Timothy Stack committed
641 642 643 644
$msg = "Linktest Done";
&sim_event(EVENT_LOG,$msg);
&debug("\n$msg\n\n");

645 646 647
#
# Send an event indicating that Linktest has completed normally.
#
Timothy Stack's avatar
 
Timothy Stack committed
648
&post_event(EVENT_COMPLETE,"ERROR=$total_error_count CTOKEN=$token");
649

650
exit(EXIT_OK);
651

652

653
##############################################################################
654
# Loss Test Functions
655 656
##############################################################################

657

658
# Writes the configuration file used by RUDE.
659 660
sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
661 662
    my $sample_size = &get_loss_sample_size($edge);
    my $millis      = LOSS_TEST_DURATION * 1000;
663
    my @contents;
664
    
665 666 667 668 669 670 671 672 673
    #
    # Run for the desired time and then tell rude to not transmit, but
    # wait, for the edge one-way latency time.  This way rude will not
    # exit before the last packet has a chance to arrive at the destination.
    # Delaying here simplifies the barrier synchronization (we don't have
    # to do a sub-second delay in perl).  Note that a final, zero rate
    # MODIFY like we use here only works in our modified version of rude!
    # Stock rude will just exit after the last actual transmission; i.e.,
    # prior to the final one-way latency wait.
674 675 676 677
    #
    # START <when>
    # <start-offset-ms> <flowID> ON <src-port> \
    #    <dst-addr>:<dst-port> CONSTANT <packets per second> <packet size>
678 679
    # <stop-offset-ms> <flowID> MODIFY CONSTANT 0 <XX>
    # <stop-offset-ms+one-way-link-latency> <flowID> OFF
680 681 682 683 684
    #
    # For our purposes, the variables are:
    #	$sample_size	packet rate
    #	$millis		time to run
    #
685
    push @contents, "START NOW\n";
686 687
    # Let bind() choose the src port; any constant port may be already in use.
    push @contents, "0000 $stream_id ON 0 " 
688
	. $edge->dst . "-" . $edge->name
689
	    . ":10001 CONSTANT $sample_size 20\n";
690 691 692 693
    if ($edge->delay) {
	push @contents, "$millis $stream_id MODIFY CONSTANT 0 20\n";
	$millis += int($edge->delay);
    }
694
    push @contents, "$millis $stream_id OFF\n";
695 696

    &write_file(RUDE_CFG, @contents);
697 698 699

}

700 701
# Returns the sample size used by the Loss test.
# TODO: why this number? (from my ProbStats book.)
702 703 704
sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
705
	return &round( 2.5 / $edge->loss);
706
    } else {
707 708
	# just in case a slow link with no loss.
	return SLOW_SEND; 
709 710 711
    }
}

712 713 714 715


# returns TRUE if the link loss is valid for the linktest loss test.
sub valid_loss {
716
    my $edge = shift @_;
717
    if($edge->bw >= LIMIT_BW_MIN && $edge->bw < LIMIT_BW_LO) {
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } else {
	return FALSE;
    }
733
}
734 735 736 737


# This test uses RUDE and CRUDE to send a stream of packets
# in both directions over a duplex link.
738 739 740 741 742
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
743 744 745
    my $trun = 1;
    my $rude_arg = "";

Mike Hibler's avatar
Mike Hibler committed
746 747
    # XXX version 1.1 compatibility; used to start crude here and wait
    if ($compat < 1.2) {
748
	&debug("performing barrier synch for backward compatibility\n");
Mike Hibler's avatar
Mike Hibler committed
749 750 751
	&barrier();
    }

752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767
    #
    # XXX "concession" to vnodes: the stock rude is, well..."rude",
    # when it comes to CPU usage.  It spins between time intervals
    # which is rather painful for vnodes.  So we have a local version
    # which allows sleeping between intervals if the clock resolution
    # is sufficient.  We send 100 pps (10000us) and vnodes have a 1000HZ
    # (1000us) clock, which qualifies as sufficient.
    #
    # So, we add the extra rude option if conditions are met.
    #
    if ($numvnodes && $hostmap{$hostname}->isvnode) {
	my $hz = `/sbin/sysctl kern.clockrate 2>/dev/null`;
	if ($hz =~ /\shz = (\d+),/) {
	    $rude_arg = "-C $1";
	}
    }
768

769
    while(&has_elems(\@edge_copy)) {
770
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy, 1);
771 772 773 774
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
775 776 777 778 779 780 781
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
782
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
783 784 785
				   $rude_arg);
			$analyze{$stream_id} = $other_edge;
		    }
786
		} else {
787 788 789 790
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($edge));
		    }
791 792 793 794
		    &debug("Skipping loss test for " .
			   &print_link($edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($edge) . "\n");
795 796 797 798
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
799 800 801 802 803 804 805
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($other_edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
806
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
807 808 809
				   $rude_arg);
			$analyze{$stream_id} = $edge;
		    }
810
		} else {
811 812 813 814
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($other_edge));
		    }
815 816 817 818
		    &debug("Skipping loss test for " .
			   &print_link($other_edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($other_edge) . "\n");
819 820 821 822
		}
	    }
	}
	$stream_id++;
823 824 825 826
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: loss: after run $trun");
	    $trun++;
	}
827 828 829 830 831 832 833 834
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    sleep(1);

    # count packets received for each stream.
835 836 837 838
    my @results;
    if ($printsched) {
	@results = ();
    } else {
839
	@results = &my_tick($PATH_CRUDE,"-d",CRUDE_DAT);
840
    }
841 842 843
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
844 845 846 847 848
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

849
    # analyze only links for which a stream was received.
850 851
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
852
	my $sent = (&get_loss_sample_size($edge) * LOSS_TEST_DURATION) + 1 ;
853 854
	my $received = $recv_cnt{$key};

855 856 857 858
	if ($reportonly) {
	    &info("    Loss result on $hostname for " .
		  &print_edge($edge) .
		  ": sent/recv = $sent/$received\n");
859
	    next;
860 861
	}

862 863 864 865 866 867 868 869
	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;

870
	    my $p_hat = $received / $sent;
871
	    my $numerator = $p_hat - $p;
872
	    my $denominator = sqrt( abs( $p * (1 - $p_hat) / $sent) );
873

874 875 876 877
	    if ($edge->loss == 0) {
		#
		# Lets not worry about a single lost packet.
		#
878 879
		if ($received < ($sent - 1)) {
		    my $errmsg = "Unexpected loss (sent $sent, received=$received)";
880
		    &error(NAME_LOSS, $edge, $errmsg);
881
		}
882
	    } elsif($denominator == 0) {
883
		my $errmsg = "No packets lost (sent $sent, plr " . $edge->loss .")";
884 885
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
886

887 888 889 890
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
891
		    my $errmsg = "sent $sent, received $received; expected proportion $p, measured proportion $p_hat";
892 893 894 895 896 897
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }
898 899 900
    if (!$printsched) {
	kill_listener($listener_crude);
    }
901

902
    # wait for completion before next test.
903 904 905
    if ($printsched) {
	&schedlog("barrier $barriers_hit: loss: after test");
    }
906 907 908
    &barrier();
}

909 910 911
##############################################################################
# Latency Test Functions
##############################################################################
912

913 914 915
# returns whether the link latency is in a valid test range.
sub valid_latency {
    return TRUE;
916

917
}
918

919 920 921 922 923
# Pings a node and returns information.
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
# @return: (received_count, avg_latency ms)
sub ping_node