linktest.pl 71.9 KB
Newer Older
1
#!/usr/bin/perl -w -T
2
#
3
# EMULAB-COPYRIGHT
4
# Copyright (c) 2000-2007 University of Utah and the Flux Group.
5
# All rights reserved.
6
#
7

8 9
use strict;
use Class::Struct;
10
use POSIX qw(uname);
11
use POSIX qw(strftime);
12
use IO::Handle;
13
use English;
14
use Socket;
15

16
my $LINKTEST_VERSION = "1.2";
17

18 19 20
#
# XXX config stuff that does not belong on the client-side
#
21
#my $PROJROOT      = "@PROJROOT_DIR@";
22

23 24 25
#
# Linktest test script. This script is set up to run as root on
# experiment nodes. It is invoked by the Linktest daemon after the
26 27 28 29 30 31
# daemon receives a Linktest "START" event. The script runs tests
# using ping, rude/crude (a real-time packet emitter/collector) and a
# locally hacked iperf to test all links in the experiment. If the
# results show a 99% chance that the experiment is configured
# incorrectly, an error is logged to the experiment directory in
# tbdata/linktest.  Valid ranges have been calibrated ahead of time.
32 33 34
#
sub usage() 
{
Timothy Stack's avatar
 
Timothy Stack committed
35 36 37
    print("Usage: linktest.pl\n".
	  " [STARTAT=<test step, 1-4>]\n".
	  " [STOPAT=<test step, 1-4>]\n".
Mike Hibler's avatar
Mike Hibler committed
38
	  " [COMPAT=<version or 0>]\n".
39 40 41
	  " [DOARP=<1=yes, 0=no>]\n".
	  " [REPORTONLY=<1=yes, 0=no>]\n".
	  " [NODES=<n1,n2...>\n".
42 43 44 45 46 47
	  " [DEBUG=<debugging level. 1=on, 0=off>]\n".
	  " [LOGRUN=<command/used/for/logging>]\n".
	  " [LOGDIR=<path/to/log/root>]\n".
	  " [BINDIR=<path/to/binary/files>]\n".
	  " [VARDIR=<path/to/config/files>]\n".
	  " [EVENTSERVER=<eventserver hostname>]\n");
48
    print("    <test step>: 1=conn/latency, 2=routing, 3=loss, 4=BW\n".
Mike Hibler's avatar
Mike Hibler committed
49
	  "    COMPAT=<version>: remain compatible with version <version> or earlier\n".
50 51
	  "    DOARP=1: run a single-ping pass to create ARP entries\n".
	  "    REPORTONLY=1: report stats only, do not pass judgement\n".
52 53 54 55 56 57
	  "    NODES: comma-separated list of virtnames to run on\n".
	  "    LOGRUN: Command to run for high-level logging. The log message as its only argument.".
	  "    LOGDIR: Path used to store stderr logs of individual test utilities.\n".
	  "    BINDIR: Path must contain emulab-rude emulab-crude emulab-iperf\n".
	  "    VARDIR: Path must contain...\n".
	  "    EVENTSERVER: Hostname of event server\n");
58
    exit(0);
59 60
}

61

62 63 64
##############################################################################
# Constants
##############################################################################
65 66 67

# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
68
use constant RUDE_CFG  => "/tmp/rude.cfg";
69
use constant IPERF_DAT => "/tmp/iperf.dat";
70

71 72 73
# Packet size for iperf (1470 default).  Reduce to avoid problems with veths
use constant IPERF_PKTSIZE => 1450;

74 75 76
# max time to run iperf
use constant BW_TEST_MAXDURATION => 10;

77
# iperf test limits.
78 79 80 81 82
use constant LIMIT_BW_HI  => 100000000;
use constant LIMIT_BW_MED =>  10000000;
use constant LIMIT_BW_LO  =>   1000000;
use constant LIMIT_BW_MIN =>     64000;
use constant LIMIT_BW_LOSS => 0.20;
83

84 85
# Make sure that we dont get bogged down in being too accurate! 
# Make sure the error is a certain significance before we start reporting it.
86
use constant INSIGNIFICANT_LAT_ERROR_LO => 0.50;  # ms
87
use constant INSIGNIFICANT_LAT_ERROR_HI => 3.50;  # ms
88 89
use constant INSIGNIFICANT_BW_ERROR_HI  => 0.015; # percent.
use constant INSIGNIFICANT_BW_ERROR_LO  => 0.03;  # percent.
Russ Fish's avatar
Russ Fish committed
90
use constant INSIGNIFICANT_BW_ERROR_LO_Windows  => 0.10;  # Lower expectations.
91

92
# slow send rate (for bw from LIMIT_BW_MIN to LIMIT_BW_LO)
93 94 95
use constant SLOW_SEND => 100;
use constant FAST_SEND => 250;
use constant LOSS_TEST_DURATION => 4;	# In seconds.
96

97
# misc contstants
98
use constant BSD => "FreeBSD";
99
use constant LINUX => "Linux";
100
use constant RTPROTO_STATIC => "Static";
101
use constant RTPROTO_SESSION => "Session";
Timothy Stack's avatar
 
Timothy Stack committed
102
use constant EVENT_COMPLETE => "COMPLETE";
103
use constant EVENT_REPORT => "REPORT";
Timothy Stack's avatar
 
Timothy Stack committed
104
use constant EVENT_LOG => "LOG";
105
use constant PING_SEND_COUNT => 10;
David Anderson's avatar
David Anderson committed
106
use constant SYNC_NAMESPACE => "linktest";
107 108 109 110

# crude should have higher priority than rude on the same machine
use constant RUDE_PRI => 3;
use constant CRUDE_PRI => 5;
111 112

# test levels
113 114 115 116
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
117

118
# test names
119
use constant NAME_RT_STATIC => "Routing";
120
use constant NAME_LATENCY => "Latency";
121 122
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
123

124
# error suffix for logs
125
use constant SUFFIX_ERROR => ".error";
126
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
127

128 129
# exit codes
use constant EXIT_ABORTED => -1;
Timothy Stack's avatar
 
Timothy Stack committed
130
use constant EXIT_NOT_OK => 1;
131 132
use constant EXIT_OK => 0;

133 134 135
# Protos
sub TimeStamp();
sub PATH_NICE();
136 137 138

# struct for representing a link.
struct ( edge => {
139
    name => '$',
140
    src => '$',
141
    srcip => '$',
142
    dst => '$',
143
    dstip => '$',
144
    bw  => '$',
145 146
    testbw  => '$',
    bwtime  => '$',
147
    delay => '$',
148
    loss => '$',
149 150 151 152 153
    queuetype => '$',
    mac => '$',
    mpxstyle => '$',
    dstyle => '$',
    symlanignore => '$'});
154

Timothy Stack's avatar
 
Timothy Stack committed
155 156 157
struct ( host => {
    name => '$',
    visited => '$',
158 159 160 161 162 163 164 165 166
    links => '@',
    isvnode => '$',
    pname => '$',
    phost => '$',
    ptype => '$',
    osid => '$',
    os => '$',
    osvers => '$',
    osfeatures => '$'});
Timothy Stack's avatar
 
Timothy Stack committed
167

168
# fixes emacs colorization woes introduced by above struct definition.
Timothy Stack's avatar
 
Timothy Stack committed
169
# struct ( unused => { foo => '$'});
170

171 172 173
use constant TRUE => 1;
use constant FALSE => 0;

174 175 176 177
##############################################################################
# Globals
##############################################################################

178
my $topology_file;    # location of the topology input file.
179
my $ptopology_file;   # location of the physical topology input file.
180
my $synserv;    # synch server node
181
my $rtproto;    # routing protocol
182
my $hostname;   # this hosts name
183 184 185 186
my $exp_id;     # experiment id
my $proj_id;    # project id
my $gid;        # group id
my $platform;   # name of platform
187 188
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
189
my %kill_list;  # PIDS that are running as background procs, needing cleanup.
190
my $debug_level = 0; # enable debug statements
191 192
                     # 1 = print debug statements.
                     # 2 = show STDOUT and STDERR
193 194 195
my $arpit = 1;	# do a single ping to force ARP exchange
my $reportonly = 0; # just report the values seen, no pass/fail judgement
my $printsched = 0; # just print the test schedule
Mike Hibler's avatar
Mike Hibler committed
196
my $compat = 1.1;   # be compatible (wrt. synch) with old versions
197
my $barriers_hit = 1;
198
my $barr_count;   # used by synserv host, nubmer of hosts -1
199
my $log_file;    # common logfile for information saved over time.
200 201

my @hosts; # hosts: list of text strings containing host names.
202
           # sorted alphabetically
Timothy Stack's avatar
 
Timothy Stack committed
203
my %hostmap;
204 205 206
my $numvnodes = 0;
my %vhostmap;
my %linkmembers;
207
my @links; # links: list of edge structs.
208
           # sorted alphabetically by src . dst
209

210 211
my $expt_path;  # experiment path (ie, tbdata) set by init.
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
Timothy Stack's avatar
 
Timothy Stack committed
212 213 214 215 216 217 218 219
my $simname = "ns";
my $swapper = "";
my $swapperid = 0;
my $swappergid = 0;
my $token = -1;
my $error_count = 0;
my $stage_error_count = 0;
my $total_error_count = 0;
220

221 222 223
my $warn_partial_test = 0;
my $warn_unshaped_links = 0;

224 225 226
my $listener_iperf;
my $listener_crude;

227 228 229
##############################################################################
# Main control
##############################################################################
230

231 232 233 234 235 236
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

237 238 239
# Make sure that files written into the experiment subdir are group writable.
umask(0002);

240 241 242 243 244 245
our $LOGRUN = "";
our $PROJROOT = "";
our $VARDIR = "";
our $BINDIR = "";
our $EVENTSERVER = "";

246 247 248 249 250 251 252 253 254 255 256
#
# Parse command arguments. Since Linktest is run via the event system,
# parse out pairs of <symbol>=<value>.
#
foreach my $arg (@ARGV) {
    if($arg =~ /STOPAT=(\d)/) {
	$stopat=$1;
    }
    if($arg =~ /STARTAT=(\d)/) {
	$startat=$1;
    }
Mike Hibler's avatar
Mike Hibler committed
257 258 259
    if($arg =~ /COMPAT=(\d(?:\.\d+))/) {
	$compat=$1/1.0;
    }
260 261 262 263 264 265
    if($arg =~ /DOARP=(\d)/) {
	$arpit=$1;
    }
    if($arg =~ /REPORTONLY=(\d)/) {
	$reportonly=$1;
    }
266 267 268
    if($arg =~ /DEBUG=(\d)/) {
	$debug_level=$1;
    }
269 270 271
    if($arg =~ /PRINTSCHED=(\d)/) {
	$printsched=$1;
    }
Timothy Stack's avatar
 
Timothy Stack committed
272 273 274 275 276 277 278
    if($arg =~ /TOKEN=(\d+)/) {
	$token=$1;
    }
    if($arg =~ /SWAPPER=(\w+)/) {
	$swapper=$1;
	(undef,undef,$swapperid,$swappergid) = getpwnam($swapper);
    }
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
    if($arg =~ /LOGRUN=(.+)/) {
	$LOGRUN = $1;
    }
    if($arg =~ /LOGDIR=(.+)/) {
	$PROJROOT = $1;
    }
    if($arg =~ /VARDIR=(.+)/) {
	$VARDIR = $1;
    }
    if($arg =~ /BINDIR=(.+)/) {
	$BINDIR = $1;
    }
    if($arg =~ /EVENTSERVER=(.+)/) {
	$EVENTSERVER = $1;
    }
294 295
}

Mike Hibler's avatar
Mike Hibler committed
296 297 298 299 300 301 302 303
$compat = 99
    if ($compat == 0);

# XXX no arp test in 1.1 and before
if ($compat < 1.2) {
    $arpit = 0;
}

304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327
# path to applications and files
our $PATH_NICKNAME = "$VARDIR/boot/nickname";
our $PATH_KEYFILE = "$VARDIR/boot/eventkey";
our $PATH_RUDE = "$BINDIR/emulab-rude";
our $PATH_CRUDE = "$BINDIR/emulab-crude";
our $PATH_IPERF = "$BINDIR/emulab-iperf";
our $PATH_RCTOPO = "$BINDIR/rc/rc.topomap";
our $PATH_EMULAB_SYNC = "$BINDIR/emulab-sync";
our $PATH_LTEVENT = "$BINDIR/ltevent";
our $PATH_TEVC = "$BINDIR/tevc";
our $RUN_PATH = "$BINDIR"; # where the linktest-ns runs.
our $PATH_SCHEDFILE = "$VARDIR/logs/linktest.sched";
our $PATH_SYNCSERVER = "$VARDIR/boot/syncserver";
our $PATH_TOPOFILE = "$VARDIR/boot/ltmap";
our $PATH_PTOPOFILE = "$VARDIR/boot/ltpmap";

my $schedfile = $PATH_SCHEDFILE;
if ($printsched) {
    open(SCHED, ">$schedfile") or
    die("Could not open schedule log file $schedfile");
} else {
    unlink($schedfile);
}

328 329 330 331
#
# Parse the nickname file to obtain the host name, 
# experiment ID and the project ID.
#
332
my $fname = $PATH_NICKNAME;
333 334 335 336 337 338
die("Could not locate $fname\n") unless -e $fname;
my @results = &read_file($fname);
($hostname, $exp_id, $proj_id) = split /\./, $results[0];
chomp $hostname;
chomp $exp_id;
chomp $proj_id;
Mike Hibler's avatar
Mike Hibler committed
339 340 341 342 343 344 345 346

# taint check pid/eid
if ($proj_id =~ /([-\w]*)/) {
    $proj_id = $1;
}
if ($exp_id =~ /([-\w]*)/) {
    $exp_id = $1;
}
347 348 349 350 351 352
$gid = $proj_id;

#
# Set path variables storing the experiment logging path,
# the current ns file and the output file for topology info.
#
353
$expt_path = "$PROJROOT/$proj_id/exp/$exp_id/tbdata";
354
$linktest_path = "$expt_path/linktest";
355 356
$topology_file = $PATH_TOPOFILE;
$ptopology_file = $PATH_PTOPOFILE;
357 358

#
359 360
# Determine what OS we are.  Used for handling the occasional difference
# in the location of or options to system utilities (e.g., ping).
361 362 363 364
#
($platform) = POSIX::uname();

#
365 366 367 368 369 370
# First, everyone parses the topo files to see who is participating in
# this run.  All instances will be started simultaneuously from the event
# system, so we throw a random sleep in here to keep nodes from pounding
# the NFS server.
#
sleep(int(rand(5)));
371
&my_system($PATH_RCTOPO, "reconfig");
372
&get_topo($topology_file, $ptopology_file);
373 374 375 376 377 378
&debug_top();

#
# Finally, see who the master is for synchronization.
# We prefer the synch server node for the experiment, but if that node
# is not participating, we choose the first node on the host list.
379
#
380
$synserv = "";
381
my $ssname = $PATH_SYNCSERVER;
382 383 384 385 386
if ($ssname) {
    @results = &read_file($ssname);
    ($synserv) = split/\./, $results[0];
    chomp $synserv;
}
387
if (@hosts > 0 && (!$synserv || !exists($hostmap{$synserv}))) {
388 389 390
    $synserv = $hosts[0];
}
&debug("Synch master is $synserv\n");
391

392
#
393 394
# If the current node is the special node, do some housekeeping
# and initialize the barrier count.
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
#
if(&is_special_node()) {
    #
    # If the shared path used by Linktest for logging and temporary
    # files already exists, clear its contents for this run.
    #
    if( -e $linktest_path ) {
	die("Path $linktest_path is not a directory\n") 
	    unless -d $linktest_path;

	opendir (DIR,$linktest_path)
	    ||  die("Could not open $linktest_path: $!");
	my @dirfiles = grep (/error$/,readdir(DIR));
	foreach (@dirfiles) {
	    &do_unlink("$linktest_path/$_");
	}
	closedir(DIR);

    } else {
	# 
	# The shared path does not exist, create it.
	#
	mkdir (&check_filename($linktest_path),0777) 
	    || die("Could not create directory $linktest_path: $!");
Timothy Stack's avatar
 
Timothy Stack committed
419
	chown($swapperid, $swappergid, $linktest_path);
420 421 422 423
    }

    $barr_count = @hosts;
    $barr_count--;
424
}
425

426 427 428 429 430 431 432 433 434 435 436 437 438 439
#
# If there are no links to test, there is nothing to do!
# Do this after we have identified the synch server.
#
if (@links == 0) {
    &debug("No links to test!\n");

    my $msg = "Linktest skipped, no links";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

440 441 442 443 444 445 446 447 448 449 450 451 452
#
# If it has been determined that we are not a part of the run,
# exit now so we don't screw up the barrier synchs.  Note that post_event
# will only happen if we are the synch server and we can only be the
# synch server if no other node is participating in the run either.
#
if (!exists($hostmap{$hostname})) {
    &debug("$hostname not participating in this linktest run.\n");
    &sim_event(EVENT_LOG,"Linktest skipped, no nodes participating");
    &post_event(EVENT_COMPLETE,"ERROR=0 CTOKEN=$token");
    exit(EXIT_OK);
}

453 454 455 456 457
#
# All nodes remove local temporary files remaining from the last
# run, if any.
# 
&do_unlink(CRUDE_DAT);
458
&do_unlink(IPERF_DAT);
459 460 461
&do_unlink(RUDE_CFG);

#
462
# Start up listeners; they run over the lifetime of Linktest to
463
# reduce the number of barrier synchronizations and startup
464
# delays. Always give the collectors a second to start up.
465
#
466 467 468 469 470 471
my $listeners = 0;
if (&dotest(TEST_BW)) {
    if ($printsched) {
	&schedlog("start iperf listener");
    } else {
	$listener_iperf = &start_listener(PATH_NICE, "-n", "-10",
472
					  $PATH_IPERF,"-s","-f","b","-u",
473 474 475 476 477 478 479 480
					  "-w","200000","-l",IPERF_PKTSIZE);
	$listeners++;
    }
}
if (&dotest(TEST_LOSS)) {
    if ($printsched) {
	&schedlog("start crude listener");
    } else {
481
	$listener_crude = &start_listener($PATH_CRUDE,"-l",CRUDE_DAT,
482 483 484 485 486
					  "-P",CRUDE_PRI);
	$listeners++;
    }
}
if ($listeners) {
487 488 489 490 491 492 493 494 495
    sleep(1);
}

#
# Finally, synchronize so that all nodes have completed startup
# tasks. When all nodes reach this barrier, the topology input file
# has been written, local variables are initialized, background
# listeners have been started, and temporary files have been cleared.
#
496 497 498
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-startup");
}
499 500 501
&barrier;

#
502
# Beginning of the tests.
503
#
504 505 506
my $msg = "Linktest Version $LINKTEST_VERSION";
&sim_event(EVENT_LOG, $msg);
&post_event(EVENT_REPORT, $msg);
Timothy Stack's avatar
 
Timothy Stack committed
507 508
&debug("\n$msg\n\n");

509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
#
# Print out any warnings that alter the behavior of the run
#
if ($warn_partial_test) {
    my $msg = "*** WARNING: some hosts do not support linktest,".
	      " skipping links between those hosts";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}
if ($warn_unshaped_links && &dotest(TEST_BW)) {
    my $msg = "*** WARNING: tb-set-noshaping used on one or more links,".
	      " skipping BW tests for those links.";
    &sim_event(EVENT_LOG, $msg);
    &post_event(EVENT_REPORT, $msg);
    &debug("\n$msg\n\n");
}

527 528 529 530 531 532 533
if (defined($rtproto) && $rtproto eq RTPROTO_SESSION) {
    my $msg = "Session routing active; waiting a bit to let routes stabilize";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    sleep(30);
}

534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
if ($arpit) {
    my $stamp = TimeStamp();
    my $msg   = "Pre-test ARP on all nodes ... $stamp";
    &post_event(EVENT_REPORT,$msg);
    &sim_event(EVENT_LOG,$msg);
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-arp test");
    }
    &barrier();
    &debug("\n$msg\n\n");
    &force_arp;
}

549
if(&dotest(TEST_LATENCY)) {
550 551
    my $stamp = TimeStamp();
    my $msg   = "Testing Single Hop Connectivity and Latency ... $stamp";
552
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
553
    &sim_event(EVENT_LOG,$msg);
554 555
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
556 557 558
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-latency test");
    }
559
    &barrier();
560
    &debug("\n$msg\n\n");
561
    &latency_test;
562
    &report_status(NAME_LATENCY);
563 564
}

565
if(&dotest(TEST_RT_STATIC)
566
    && defined($rtproto)
567
    && ($rtproto eq RTPROTO_STATIC || $rtproto eq RTPROTO_SESSION)) {
568
    my $msg;
569
    my $stamp = TimeStamp();
570 571 572 573 574

    if ($total_error_count) {
	$msg = "Skipping Routing tests because of previous errors!";
    }
    else {
575
	$msg = "Testing Routing ... $stamp";
576
    }
577
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
578
    &sim_event(EVENT_LOG,$msg);
579 580
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
581 582 583
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-routing test");
    }
584
    &barrier();
585
    &debug("\n$msg\n\n");
586 587
    if (! $total_error_count) {
	&static_rt_test; # nodes not covered by 1hop test
588
	&report_status(NAME_RT_STATIC);
589 590
    }
    else {
591 592 593
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: post-routing test");
	}
594 595
	&barrier();
    }
596
}
597

598
if(&dotest(TEST_LOSS)) {
599 600
    my $stamp = TimeStamp();
    my $msg = "Testing Loss ... $stamp";
601
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
602
    &sim_event(EVENT_LOG,$msg);
603 604
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
605 606 607
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-loss test");
    }
608
    &barrier();
609
    &debug("\n$msg\n\n");
610
    &loss_test; 
611
    &report_status(NAME_LOSS);
612
}
613

614
if(&dotest(TEST_BW)){
615 616
    my $stamp = TimeStamp();
    my $msg = "Testing Bandwidth ... $stamp";
617
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
 
Timothy Stack committed
618
    &sim_event(EVENT_LOG,$msg);
619
    # Ick, this barrier makes sure the above message gets into the log
620
    # first, so as not to confuse Mike
621 622 623
    if ($printsched) {
	&schedlog("barrier $barriers_hit: pre-bandwidth test");
    }
624
    &barrier();
625
    &debug("\n$msg\n\n");
626
    &bw_test;
627
    &report_status(NAME_BW);
628
}
629

630 631
&cleanup;

632 633 634
if ($printsched) {
    &schedlog("barrier $barriers_hit: post-test");
}
635 636
&barrier();

Timothy Stack's avatar
 
Timothy Stack committed
637 638 639 640
$msg = "Linktest Done";
&sim_event(EVENT_LOG,$msg);
&debug("\n$msg\n\n");

641 642 643
#
# Send an event indicating that Linktest has completed normally.
#
Timothy Stack's avatar
 
Timothy Stack committed
644
&post_event(EVENT_COMPLETE,"ERROR=$total_error_count CTOKEN=$token");
645

646
exit(EXIT_OK);
647

648

649
##############################################################################
650
# Loss Test Functions
651 652
##############################################################################

653

654
# Writes the configuration file used by RUDE.
655 656
sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
657 658
    my $sample_size = &get_loss_sample_size($edge);
    my $millis      = LOSS_TEST_DURATION * 1000;
659
    my @contents;
660
    
661 662 663 664 665 666 667 668 669
    #
    # Run for the desired time and then tell rude to not transmit, but
    # wait, for the edge one-way latency time.  This way rude will not
    # exit before the last packet has a chance to arrive at the destination.
    # Delaying here simplifies the barrier synchronization (we don't have
    # to do a sub-second delay in perl).  Note that a final, zero rate
    # MODIFY like we use here only works in our modified version of rude!
    # Stock rude will just exit after the last actual transmission; i.e.,
    # prior to the final one-way latency wait.
670 671 672 673
    #
    # START <when>
    # <start-offset-ms> <flowID> ON <src-port> \
    #    <dst-addr>:<dst-port> CONSTANT <packets per second> <packet size>
674 675
    # <stop-offset-ms> <flowID> MODIFY CONSTANT 0 <XX>
    # <stop-offset-ms+one-way-link-latency> <flowID> OFF
676 677 678 679 680
    #
    # For our purposes, the variables are:
    #	$sample_size	packet rate
    #	$millis		time to run
    #
681
    push @contents, "START NOW\n";
682 683
    # Let bind() choose the src port; any constant port may be already in use.
    push @contents, "0000 $stream_id ON 0 " 
684
	. $edge->dst . "-" . $edge->name
685
	    . ":10001 CONSTANT $sample_size 20\n";
686 687 688 689
    if ($edge->delay) {
	push @contents, "$millis $stream_id MODIFY CONSTANT 0 20\n";
	$millis += int($edge->delay);
    }
690
    push @contents, "$millis $stream_id OFF\n";
691 692

    &write_file(RUDE_CFG, @contents);
693 694 695

}

696 697
# Returns the sample size used by the Loss test.
# TODO: why this number? (from my ProbStats book.)
698 699 700
sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
701
	return &round( 2.5 / $edge->loss);
702
    } else {
703 704
	# just in case a slow link with no loss.
	return SLOW_SEND; 
705 706 707
    }
}

708 709 710 711


# returns TRUE if the link loss is valid for the linktest loss test.
sub valid_loss {
712
    my $edge = shift @_;
713
    if($edge->bw >= LIMIT_BW_MIN && $edge->bw < LIMIT_BW_LO) {
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } else {
	return FALSE;
    }
729
}
730 731 732 733


# This test uses RUDE and CRUDE to send a stream of packets
# in both directions over a duplex link.
734 735 736 737 738
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
739 740 741
    my $trun = 1;
    my $rude_arg = "";

Mike Hibler's avatar
Mike Hibler committed
742 743
    # XXX version 1.1 compatibility; used to start crude here and wait
    if ($compat < 1.2) {
744
	&debug("performing barrier synch for backward compatibility\n");
Mike Hibler's avatar
Mike Hibler committed
745 746 747
	&barrier();
    }

748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763
    #
    # XXX "concession" to vnodes: the stock rude is, well..."rude",
    # when it comes to CPU usage.  It spins between time intervals
    # which is rather painful for vnodes.  So we have a local version
    # which allows sleeping between intervals if the clock resolution
    # is sufficient.  We send 100 pps (10000us) and vnodes have a 1000HZ
    # (1000us) clock, which qualifies as sufficient.
    #
    # So, we add the extra rude option if conditions are met.
    #
    if ($numvnodes && $hostmap{$hostname}->isvnode) {
	my $hz = `/sbin/sysctl kern.clockrate 2>/dev/null`;
	if ($hz =~ /\shz = (\d+),/) {
	    $rude_arg = "-C $1";
	}
    }
764

765
    while(&has_elems(\@edge_copy)) {
766
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy, 1);
767 768 769 770
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
771 772 773 774 775 776 777
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
778
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
779 780 781
				   $rude_arg);
			$analyze{$stream_id} = $other_edge;
		    }
782
		} else {
783 784 785 786
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($edge));
		    }
787 788 789 790
		    &debug("Skipping loss test for " .
			   &print_link($edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($edge) . "\n");
791 792 793 794
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
795 796 797 798 799 800 801
		    if ($printsched) {
			&schedlog("rude " . schedprint_link($other_edge) .
				  " (pps=" .
				  &get_loss_sample_size($edge) .
				  ", time=" .
				  LOSS_TEST_DURATION . "s, psize=20)");
		    } else {
802
			&my_system($PATH_RUDE,"-s", RUDE_CFG, "-P", RUDE_PRI,
803 804 805
				   $rude_arg);
			$analyze{$stream_id} = $edge;
		    }
806
		} else {
807 808 809 810
		    if ($printsched) {
			&schedlog("skipping loss test " .
				  schedprint_link($other_edge));
		    }
811 812 813 814
		    &debug("Skipping loss test for " .
			   &print_link($other_edge) . "\n");
		    &info("*** Skipping loss test on $hostname for " .
			  &print_link($other_edge) . "\n");
815 816 817 818
		}
	    }
	}
	$stream_id++;
819 820 821 822
	if ($printsched) {
	    &schedlog("barrier $barriers_hit: loss: after run $trun");
	    $trun++;
	}
823 824 825 826 827 828 829 830
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    sleep(1);

    # count packets received for each stream.
831 832 833 834
    my @results;
    if ($printsched) {
	@results = ();
    } else {
835
	@results = &my_tick($PATH_CRUDE,"-d",CRUDE_DAT);
836
    }
837 838 839
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
840 841 842 843 844
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

845
    # analyze only links for which a stream was received.
846 847
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
848
	my $sent = (&get_loss_sample_size($edge) * LOSS_TEST_DURATION) + 1 ;
849 850
	my $received = $recv_cnt{$key};

851 852 853 854
	if ($reportonly) {
	    &info("    Loss result on $hostname for " .
		  &print_edge($edge) .
		  ": sent/recv = $sent/$received\n");
855
	    next;
856 857
	}

858 859 860 861 862 863 864 865
	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;

866
	    my $p_hat = $received / $sent;
867
	    my $numerator = $p_hat - $p;
868
	    my $denominator = sqrt( abs( $p * (1 - $p_hat) / $sent) );
869

870 871 872 873
	    if ($edge->loss == 0) {
		#
		# Lets not worry about a single lost packet.
		#
874 875
		if ($received < ($sent - 1)) {
		    my $errmsg = "Unexpected loss (sent $sent, received=$received)";
876
		    &error(NAME_LOSS, $edge, $errmsg);
877
		}
878
	    } elsif($denominator == 0) {
879
		my $errmsg = "No packets lost (sent $sent, plr " . $edge->loss .")";
880 881
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
882

883 884 885 886
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
887
		    my $errmsg = "sent $sent, received $received; expected proportion $p, measured proportion $p_hat";
888 889 890 891 892 893
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }
894 895 896
    if (!$printsched) {
	kill_listener($listener_crude);
    }
897

898
    # wait for completion before next test.
899 900 901
    if ($printsched) {
	&schedlog("barrier $barriers_hit: loss: after test");
    }
902 903 904
    &barrier();
}

905 906 907
##############################################################################
# Latency Test Functions
##############################################################################
908

909 910 911
# returns whether the link latency is in a valid test range.
sub valid_latency {
    return TRUE;
912

913
}
914

915 916 917 918 919
# Pings a node and returns information.
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
# @return: (received_count, avg_latency ms)
sub ping_node {
920
    my ($host,$ttl,$scount,$timo) = @_;
921 922 923
    my $count = 0;
    my $avg_latency = 0;
    my $stddev = 0;
924

925 926 927
    my $send_count = defined($scount) ? $scount : PING_SEND_COUNT;
    my $timeout = defined($timo) ? $timo : 3;
    my $send_rate = ($timeout > 1) ? (($timeout - 1) / $send_count) : 0;
928