linktest.pl.in 32.1 KB
Newer Older
1
#!/usr/bin/perl -w -T
2
#
3
4
5
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2004 University of Utah and the Flux Group.
# All rights reserved.
6
#
7

8
9
use strict;
use Class::Struct;
10
11
use POSIX qw(uname);
use IO::Handle;
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
use English;

#
# Linktest test script. This script is set up to run as root on
# experiment nodes. It is invoked by the Linktest daemon after the
# daemon receives a Linktest "START" event. The script runs tests using
# ping, rude/crude (a real-time packet emitter/collector) and pathrate
# to test all links in the experiment. If the results show a 99% chance
# that the experiment is configured incorrectly, an error is logged to
# the experiment directory in tbdata/linktest.  Valid ranges have been
# calibrated ahead of time.
#
sub usage() 
{
    print "Usage: linktest.pl\n".
27
28
29
30
	" [STARTAT=<test step, 1-4>]\n".
	    " [STOPAT=<test step, 1-4>]\n".
		" [DEBUG=<debugging level. 1=on, 0=off>]\n";
    exit(0);
31
32
}

33
34
35
36
37





38
39
40
##############################################################################
# Constants
##############################################################################
41

42
# path to applications and files
43
use constant LINKTEST_NSPATH => "@LINKTEST_NSPATH@";
44
45
46
47
48
49
50
51
use constant PATH_NICKNAME => "@CLIENT_VARDIR@/boot/nickname";
use constant PATH_KEYFILE => "@CLIENT_VARDIR@/boot/eventkey";
use constant PATH_RUDE => "/usr/local/bin/rude";
use constant PATH_CRUDE => "/usr/local/bin/crude";
use constant PATH_PATHRATE_SND => "/usr/local/bin/pathrate_snd";
use constant PATH_PATHRATE_RCV => "/usr/local/bin/pathrate_rcv";
use constant PATH_EMULAB_SYNC => "@CLIENT_BINDIR@/emulab-sync";
use constant PATH_LTEVENT => "@CLIENT_BINDIR@/ltevent";
52
use constant RUN_PATH => "@CLIENT_BINDIR@"; # where the linktest-ns runs.
53

54
55
# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
56
57
use constant RUDE_CFG  => "/tmp/rude.cfg";
use constant PATHRATE_DAT => "/tmp/pathrate.dat";
58

59
# pathrate test limits.
60
use constant LIMIT_BW_HI => 45000000;
61
62
63
use constant LIMIT_BW_LO =>  1000000;
use constant LIMIT_BW_LOSS => 0;

64
65
66
67
# Make sure that we dont get bogged down in being too accurate! 
# Make sure the error is a certain significance before we start reporting it.
use constant INSIGNIFICANT_LAT_ERROR => 0.50;  # ms
use constant INSIGNIFICANT_BW_ERROR  => 1.00;  # mb
68
69
70

# latency must be corrected for xmit delay under this speed.
use constant LAT_LOW_BW => 10000000;
71

72
73
74
75
# slow send rate (for bw 256kbps to 1Mbps)
use constant SLOW_BW => 256000;
use constant SLOW_SEND => 400;
use constant FAST_SEND => 1002;
76

77
# misc contstants
78
use constant BSD => "FreeBSD";
79
use constant LINUX => "Linux";
80
81
use constant RTPROTO_STATIC => "Static";
use constant EVENT_STOP => "STOP";
82
use constant EVENT_REPORT => "REPORT";
83
use constant PING_SEND_COUNT => 10;
David Anderson's avatar
David Anderson committed
84
use constant SYNC_NAMESPACE => "linktest";
85
86

# test levels
87
88
89
90
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
91

92
# test names
93
94
95
96
use constant NAME_RT_STATIC => "Static Routing";
use constant NAME_LATENCY => "Latency (Round Trip)";
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
97

98
# error suffix for logs
99
100
use constant SUFFIX_ERROR => ".error";
use constant SUFFIX_TOPO =>  ".topology";
101
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
102

103
104
105
106
# exit codes
use constant EXIT_ABORTED => -1;
use constant EXIT_OK => 0;

107
108
109
110
111
112
113

# struct for representing a link.
struct ( edge => {
    src => '$',
    dst => '$',
    bw  => '$',
    delay => '$',
114
115
116
117
    loss => '$'});

# fixes emacs colorization woes introduced by above struct definition.
struct ( unused => { foo => '$'});
118

119
120
121
use constant TRUE => 1;
use constant FALSE => 0;

122
123
124
125
##############################################################################
# Globals
##############################################################################

126
127
my $ns_file;          # location of the customized ns for Linktest
my $topology_file;    # location of the topology input file.
128
my $synserv;    # synch server node
129
my $rtproto;    # routing protocol
130
my $hostname;   # this hosts name
131
132
133
134
my $exp_id;     # experiment id
my $proj_id;    # project id
my $gid;        # group id
my $platform;   # name of platform
135
136
137
138
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
my @kill_list;  # PIDs maintained through the life of linktest
                    # which get killed as part of cleanup.
139
140
141
my $debug_level = 0; # enable debug statements
                    # 1 = print debug statements.
                    # 2 = show STDOUT and STDERR
142
my $barr_count;   # used by synserv host, nubmer of hosts -1
143
144
my $event_server;       # event server
my $log_file;    # common logfile for information saved over time.
145
146

my @hosts; # hosts: list of text strings containing host names.
147
           # sorted alphabetically
148
my @links; # links: list of edge structs.
149
           # sorted alphabetically by src . dst
150
                                  
151
152
my $expt_path;  # experiment path (ie, tbdata) set by init.
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
153

154
# full path to custom NS build. 
155
my $ns_cmd; 
156
157
158
159

##############################################################################
# Main control
##############################################################################
160

161
162
163
164
165
166
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

167
168
169
# Make sure that files written into the experiment subdir are group writable.
umask(0002);

170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#
# Parse command arguments. Since Linktest is run via the event system,
# parse out pairs of <symbol>=<value>.
#
foreach my $arg (@ARGV) {
    if($arg =~ /STOPAT=(\d)/) {
	$stopat=$1;
    }
    if($arg =~ /STARTAT=(\d)/) {
	$startat=$1;
    }
    if($arg =~ /DEBUG=(\d)/) {
	$debug_level=$1;
    }
}

#
# Parse the nickname file to obtain the host name, 
# experiment ID and the project ID.
#
my $fname = PATH_NICKNAME;
die("Could not locate $fname\n") unless -e $fname;
my @results = &read_file($fname);
($hostname, $exp_id, $proj_id) = split /\./, $results[0];
chomp $hostname;
chomp $exp_id;
chomp $proj_id;
$gid = $proj_id;

#
# Set path variables storing the experiment logging path,
# the current ns file and the output file for topology info.
#
$expt_path = "/proj/$proj_id/exp/$exp_id/tbdata";
$linktest_path = "$expt_path/linktest";
$topology_file = "$linktest_path/$exp_id" . SUFFIX_TOPO;
if(-e "$expt_path/$exp_id-modify.ns") {
    $ns_file = "$expt_path/$exp_id-modify.ns";
} elsif (-e "$expt_path/$exp_id.ns") {
    $ns_file = "$expt_path/$exp_id.ns";
} else {
    die("Could not locate an ns file.\n");
}

#
# Determine location of the customized ns binary for Linktest.
#
($platform) = POSIX::uname();
if($platform eq BSD) {
    $ns_cmd = LINKTEST_NSPATH ."/fbsd/ns";
} elsif ($platform eq LINUX) {
    $ns_cmd = LINKTEST_NSPATH ."/linux/ns";
} else {
    die ("Platform $platform is not currently supported.\n");
}

#
# Parse the syncserver file to find out which node is the sync server.
#
my $ssname = "@CLIENT_VARDIR@/boot/syncserver";
die("Could not locate an emulab-sync server\n") unless -e $ssname;
@results = &read_file($ssname);
($synserv) = split/\./, $results[0];
chomp $synserv;
234

235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#
# Execute tmcc to find out the name of the event server.
#
@results = &my_tick("@CLIENT_BINDIR@/tmcc","bossinfo");
if(@results && $results[0] =~ /^([\w\.]*)\s/) {
    $event_server = $1; 
} else {
    die("Could not determine event server name\n");
}

#
# If the current node is the special node (arbitrarily the sync
# server node), do some housekeeping and run ns to generate
# the topology input file, which is read by all nodes to obtain
# the experiment topology.
#
if(&is_special_node()) {

    #
    # If the shared path used by Linktest for logging and temporary
    # files already exists, clear its contents for this run.
    #
    if( -e $linktest_path ) {
	die("Path $linktest_path is not a directory\n") 
	    unless -d $linktest_path;

	opendir (DIR,$linktest_path)
	    ||  die("Could not open $linktest_path: $!");
	my @dirfiles = grep (/error$/,readdir(DIR));
	foreach (@dirfiles) {
	    &do_unlink("$linktest_path/$_");
	}
	closedir(DIR);

    } else {
	# 
	# The shared path does not exist, create it.
	#
	mkdir (&check_filename($linktest_path),0777) 
	    || die("Could not create directory $linktest_path: $!");
    }

    # 
    # Now, run ns in the same path where the nstb_compat.tcl and
    # tb_compat.tcl files are located. Store the output from 
    # the linktest-modified ns for use in generating the topology file.
    #
    chdir(RUN_PATH);
    @results = &my_tick($ns_cmd,
			&check_filename($ns_file));
    
    die("Linktest-NS could not parse $ns_file\n") unless @results;

    #
    # Use results of ns to write the topology input file
    #
    &write_file($topology_file,@results);


    #
    # Call get_topo before the other nodes because
    # get_topo is used to initialize @hosts.
    #
    &get_topo($topology_file);

    #
    # Caculate the barrier count using the results of the NS script.
    # Note: This could be gotten from scalar(@hosts), but that
    # requires a successful parse of the NS file, which may fail.
    #
    $barr_count = @hosts;
    $barr_count--;
    
} # end of chores for the special node.

#
# All nodes remove local temporary files remaining from the last
# run, if any.
# 
&do_unlink(CRUDE_DAT);
&do_unlink(PATHRATE_DAT);
&do_unlink(RUDE_CFG);

#
# Start up child processes for receiving RUDE and Pathrate streams.
# These run over the lifetime of Linktest to reduce the number of
# barrier synchronizations and startup delays. Always give the 
# collectors a moment to start up.
#
if(&dotest(TEST_BW)){
    &my_system_initonly(PATH_PATHRATE_SND,"-i","-q");
    sleep(1);
}
if(&dotest(TEST_LOSS)){
    &my_system_initonly(PATH_CRUDE,"-l",CRUDE_DAT);
    sleep(1);
}

#
# Finally, synchronize so that all nodes have completed startup
# tasks. When all nodes reach this barrier, the topology input file
# has been written, local variables are initialized, background
# listeners have been started, and temporary files have been cleared.
#
&barrier;

#
# Beginning of the tests. First, all nodes read the topology.
# Since this can occur at the same time after the barrier,
# have a short randomized delay to avoid hitting NFS too hard.
#
sleep(int(rand(5)));
&get_topo($topology_file) unless &is_special_node();
&debug_top();
349

350
if(&dotest(TEST_LATENCY)) {
351
352
353
    my $msg = "Testing Single Hop Connectivity and Latency...";
    &post_event(EVENT_REPORT,$msg);
    &debug("\n$msg\n\n");
354
    &latency_test;
355
356
}

357
358
359
if(&dotest(TEST_RT_STATIC)
    && defined($rtproto)
    && $rtproto eq RTPROTO_STATIC) {
360
361
362
    my $msg = "Testing Static Routing...";
    &post_event(EVENT_REPORT,$msg);
    &debug("\n$msg\n\n");
363
364
    &static_rt_test; # nodes not covered by 1hop test
}
365

366
if(&dotest(TEST_LOSS)) {
367
368
369
    my $msg = "Testing Loss...";
    &post_event(EVENT_REPORT,$msg);
    &debug("\n$msg\n\n");
370
    &loss_test; 
371
}
372

373
if(&dotest(TEST_BW)){
374
375
376
    my $msg = "Testing Bandwidth...";
    &post_event(EVENT_REPORT,$msg);
    &debug("\n$msg\n\n");
377
378
    &bw_test;
}
379

380
381
&cleanup;

382
383
&barrier();

384
385
386
387
#
# Send an event indicating that Linktest has completed normally.
#
&post_event(EVENT_STOP,"");
388

389
390
&debug("Done\n");

391
exit(EXIT_OK);
392

393

394
##############################################################################
395
# Loss Test Functions
396
397
##############################################################################

398
# Writes the configuration file used by RUDE.
399
400
sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
401
402
403
404
405
406
407
408
409
410
    my @contents;
    push @contents, "START NOW\n";
    push @contents, "0000 $stream_id ON 3001 " 
	. $edge->dst 
	    . ":10001 CONSTANT " 
		. &get_loss_sample_size($edge) 
		    . " 20\n";
    push @contents, "1000 $stream_id OFF\n";

    &write_file(RUDE_CFG, @contents);
411
412
413

}

414
415
# Returns the sample size used by the Loss test.
# TODO: why this number? (from my ProbStats book.)
416
417
418
419
420
421
422
423
424
sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
	return &round( 10 / $edge->loss);
    } else {
	return SLOW_SEND; # just in case a slow link with no loss.
    }
}

425
426
427
428


# returns TRUE if the link loss is valid for the linktest loss test.
sub valid_loss {
429
    my $edge = shift @_;
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
    if($edge->bw >= SLOW_BW && $edge->bw < LIMIT_BW_LO) {
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } else {
	return FALSE;
    }
446
}
447
448
449
450


# This test uses RUDE and CRUDE to send a stream of packets
# in both directions over a duplex link.
451
452
453
454
455
456
457
458
459
460
461
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
    while(&has_elems(\@edge_copy)) {
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
462
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
463
464
465
466
467
468
469
		    $analyze{$stream_id} = $other_edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($edge) . "\n");
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
470
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
		    $analyze{$stream_id} = $edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($other_edge) . "\n");
		}
	    }
	}
	$stream_id++;
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    sleep(1);

    # count packets received for each stream.
486
487
488
489
    my @results = &my_tick(PATH_CRUDE,"-d",CRUDE_DAT);
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
490
491
492
493
494
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

495
    # analyze only links for which a stream was received.
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
	my $received = $recv_cnt{$key};

	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;
	    my $n = &get_loss_sample_size($edge) + 1 ;


	    my $p_hat = $received / $n;
	    my $numerator = $p_hat - $p;
512
	    my $denominator = sqrt( abs( $p * (1 - $p_hat) / $n) );
513
514
515
516

	    if( $edge->loss == 0) {
		if($received < $n) {

517
518
519
		    ####
		    ## TODO:

520
521
522
		    
		    my $errmsg = "Unexpected loss occurred (n=$n, received=$received)\n";
		    &error(NAME_LOSS, $edge, $errmsg);
523
		} # note, no logging of succesful 0-loss. (too common).
524
525
526
527
	    } elsif($denominator == 0) {
		my $errmsg = "No packets were lost (n=$n, plr=" . $edge->loss .")";
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
528

529
530
531
532
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
533
		    my $errmsg = "Probable loss misconfiguration (n:$n, received: $received; expected proportion: $p, measured proportion=$p_hat)";
534
535
536
537
538
539
540
541
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }


542
    # wait for completion before next test.
543
544
545
    &barrier();
}

546
547
548
##############################################################################
# Latency Test Functions
##############################################################################
549

550
551
552
# returns whether the link latency is in a valid test range.
sub valid_latency {
    return TRUE;
553

554
}
555

556
557
558
559
560
561
562
563
564
# Pings a node and returns information.
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
# @return: (received_count, avg_latency ms)
sub ping_node {
    my ($host,$ttl) = @_;
    my $count = 0;
    my $avg_latency = 0;
    my $stddev = 0;
565

566
567
568
    my $send_count = PING_SEND_COUNT;
    my $timeout = 1; # 1 second
    my $send_rate = $timeout / $send_count;
569

570
571
572
573
574
575
576
577
578
579
580
581
582
    # set deadline to prevent long waits
    my $cmd;
    if($ttl) {
	if($platform eq BSD) {
	    $cmd = "/sbin/ping -c $send_count -q -i $send_rate -t $timeout -m $ttl $host";
	} elsif($platform eq LINUX) {
	    $cmd = "/bin/ping -c $send_count -q -i $send_rate -w $timeout -t $ttl $host";
	}
    } else {
	if($platform eq BSD) {
	    $cmd = "/sbin/ping -c $send_count -q -i $send_rate -t $timeout $host";
	} elsif($platform eq LINUX) {
	    $cmd = "/bin/ping -c $send_count -q -i $send_rate -w $timeout $host";
583
584
585
	}
    }

586
587
588
589
590
591
592
593
594
595
    # note backticks passes SIGINT to child procs
    my @args = split(/\s+/,$cmd);
    my @results = &my_tick(@args);

    my $reslt_cnt = @results;
    my $result = $results[$reslt_cnt-2];
    if($platform eq BSD && $result =~ /(\d+) packets received/) {
	$count = $1;
    } elsif($platform eq LINUX && $result =~ /(\d+) received/) {
	$count = $1;
596
597
    }

598
599
600
601
602
    if($count) {
	$result = $results[$reslt_cnt-1];
	if($result=~ /\d+\.\d+\/(\d+\.\d+)\/\d+\.\d+\/(\d+\.\d+)/) {
	    $avg_latency = $1;
	    $stddev = $2;
603
604
	}
    }
605
    return ($count, $avg_latency, $stddev);
606
607
}

608
# For directly connected hosts, checks latency using Ping.
609
sub latency_test {
610
611
    my @waitlist;
    my @edge_copy = @links;
612

613
    while(&has_elems(\@edge_copy)) {
614
615
616
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src ) {
617
618
619
620
621
622
623
		# todo: consider ignoring latency if no delay node.
		if(&valid_latency($edge) && &valid_latency($other_edge)
		   ) {
		    my $pid = fork();
		    if(!$pid) {

			# call ping_node with ttl=1
624
625
			my ($result_cnt, $sample_avg, $sample_dev) =
			    &ping_node($edge->dst,1);
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649

			my $n = PING_SEND_COUNT;

			if($result_cnt == 0) {
			    my $errmsg = "No packets were received (n=$n)\n";
			    &error(NAME_LATENCY, $edge, $errmsg);
			} else {

# facts from analysis in /users/davidand/public/calibrate.
# came from 40 independent swapins (enough for normality assumption)
# (note that data actually is normal at any particular latency point, 
# according to described.lst)

# best fit regression for the error as a function of total latency, according to sas.
# see regression1.lst and regression1.sas
#-0.00523(actual)     0.00003096 fbsd
#-0.00530(actual)     0.00003478 linux
# roughly identical, so use:
#-0.005(actual)

# inherent delay in the system (with a delay node) is
# see described.lst and described.sas
# 0.337737  fbsd
# 0.362282  linux (median was 0.328000)
650
# round to:
651
# 0.333 ms
652
653

# note, this has been measured and is in one of the emulab papers (Shashi)
654
655
656
657
658
659
660
661
662
663
664
665
666
667

# Also, described.lst provides good support for the notion that
# the distribution of latencies is normal. For Fbsd all of the 
# distributions were normal, and most were for Linux. So, use this
# assumption in order to send fewer test packets.
			   

			    # the null hypothesis value, u.
			    my $u = $edge->delay + $other_edge->delay;
			    
			    # the calibration as a function of $u
			    $u += 0.333 - 0.005 * $u / 2;


668
# factor in transport delay at slow network speeds.
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684

			    # transport delay: 64 bytes + 18 bytes eth/crc
			    # to units of ms.
			    if($edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $edge->bw;
			    }
			    if($other_edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $other_edge->bw;
			    }
			    
			    my $x_bar = $sample_avg;
			    my $numerator = $x_bar - $u;

			    my $S = $sample_dev;
			    

685
			    my $denominator = $S / sqrt( abs( $n ) );
686
687
688
689
690
691
692

			    if($denominator == 0) {
				my $errmsg = "Invalid sample standard deviation (possible parse problem, please report). (n=$n, u=$u, x_bar=$x_bar, S=$S)";
				&error(NAME_LATENCY, $edge, $errmsg);
			    } else {
				my $z = $numerator / $denominator;

693

694
695
696
697
698
699

				my $t_reject = 3.250; # alpha = 0.01, df=9

				if(abs($z) > $t_reject
				   && (abs($x_bar - $u) > INSIGNIFICANT_LAT_ERROR)
				   ) {
700
				    my $errmsg = "Probable latency misconfiguration (expected=$u, measured mean=$x_bar).";
701
702
703
704
705
				    &error(NAME_LATENCY, $edge, $errmsg);
				}
			    }

			}
706
			exit(EXIT_OK);
707
708
709

		    } else {
			push @waitlist, $pid;
710
		    }
711
		} else {
712
		    &debug("Skipping latency test for " . &print_link($edge) . " to " . &print_link($other_edge) . "\n");
713
714
		}
	    }
715
#
716
717
	}
    }
718
719

    &wait_all(\@waitlist);
720
721
    # wait for completion before next test.
    &barrier();
722
723
}

724

725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
##############################################################################
# Bandwidth Test Functions
##############################################################################

# Returns whether the link bandwidth is in a valid test range.
sub valid_bw {
    my $edge = shift @_;
    if($edge->bw >= LIMIT_BW_LO
       && $edge->bw <= LIMIT_BW_HI
       && $edge->loss <= LIMIT_BW_LOSS
       ) {
	return TRUE;
    } else {
	return FALSE;
    }
}


# Checks bandwidth for directly connected links.
744
745
sub bw_test {
    my @analyze_list;
746
    my @edge_copy;
747
    @edge_copy = @links;
748
749


750
751
752

    # all nodes will execute the same reductions on the edge list
    # on their own so that the number of barriers is the same.
753
    while(&has_elems(\@edge_copy)) {
754
	my $edge = &get_assign(\@edge_copy);
755

756
	if(defined($edge) ) {
757
758
759
760
	    if($hostname eq $edge->dst) {
		if (&valid_bw($edge)) {
		    push @analyze_list, $edge;
		    
761
		    &my_system(PATH_PATHRATE_RCV, "-Q","-s",$edge->src,"-q","-N", PATHRATE_DAT);
762
763
		} else {
		    &debug("Skipping bandwidth test for " . &print_link($edge) . "\n");
764

765
		}
766
	    } 
767
768
769
770
	}
	&barrier();
    }

771

772
773
    # read the log file.
    if(@analyze_list) {
774
775
	my @results = &read_file(PATHRATE_DAT);
	foreach (@results) {
776
777
778
779
	    my $edge = shift(@analyze_list);
	    my $sender = $edge->src;
	    if(/SNDR=$sender.*CAPL=(\d+\.\d+)Mbps.*CAPH=(\d+\.\d+)Mbps/) {
		my $expected = $edge->bw / 1000000;
780
781
		my $low = $1;
		my $hi = $2;
782
783


784
785
786
787
788
789
790
791
		my $diff;
		if($expected > $2) {
		    $diff = $expected - $2;
		} elsif ($expected < $1) {
		    $diff = $1 - $expected;
		} else {
		    $diff = 0;
		}
792

793
		my $output = sprintf "Receive from " . $edge->src
794
		    . ": $1/$2/%.1f/%.1f\n", $expected, $diff;
795
796
		&debug($output);

797
		if($diff > INSIGNIFICANT_BW_ERROR) {
798
		    &error (NAME_BW, $edge, "Bandwidth estimate $low to $hi Mbps does not include expected bandwidth $expected Mbps");
799
		}
800

801
	    } else {
802
		 die ("Error while parsing " . PATHRATE_DAT . "\n");
803
804
805
806
	    }
	}
    }
    
807
808
}

809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825

##############################################################################
# Static Routing Connectivity Test Functions
##############################################################################

# Attempts to reach nodes that are not on a direct link
# with this host. IE, use TTL > 1. Pings are in parallel.
sub static_rt_test {
    my @host_copy = @hosts; # copy of all hosts.
    my @this_1hops; # 1hop destinations from this host.

    foreach my $edge(@links) {
	if($edge->src eq $hostname) {
	    push @this_1hops, $edge->dst;
	} elsif ($edge->dst eq $hostname) {
	    push @this_1hops, $edge->src;
	}
826
    }
827

828
829
830
    for(my $i=0;$i<@host_copy;$i++) {
	if($host_copy[$i] eq $hostname) {
	    $host_copy[$i] = undef;
831
	} else {
832
833
834
835
836
837
838
	    # zap any host in the dst list.
	    foreach my $dst( @this_1hops ) {
		if(defined($host_copy[$i])
		   && ($dst eq $host_copy[$i])) {
		    $host_copy[$i] = undef;
		} 
	    }
839
	}
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
    }

    my @waitlist;
    # fork processes to run the pings in parallel.
    for(my $i=0;$i<@host_copy;$i++) {
	if(defined($host_copy[$i])
           && linked_node($host_copy[$i])
           ) {
	    my $dst = $host_copy[$i];
	    my $pid = fork();
	    if(!$pid) {
		my ($recv_cnt,$ignored1, $ignored2) = &ping_node($dst,0);
		if(!$recv_cnt) {
		    my $newEdge = new edge;
		    $newEdge->src($hostname);
		    $newEdge->dst($dst);
		    &error(NAME_RT_STATIC,$newEdge , "$hostname could not ping $dst");
		} else {
		    &debug("Attempting to reach $dst... OK\n");
		}
860
		exit(EXIT_OK);
861
862
863
	    } else {
		push @waitlist,$pid;
	    }
864
865
	}
    }
866
    &wait_all(\@waitlist);
867
868
    # wait for completion before next test.
    &barrier();
869
870
}

871

872
873
874
875
876
877
878
879
880
881

##############################################################################
# Utility Functions
##############################################################################

# Convenience to print information about a link.
sub print_link {
    my $edge = shift @_;
    return $edge->src . " to " . $edge->dst . " (" . ($edge->bw / 1000000)  . 
	" Mbps, " . $edge->delay . "ms, " . (100 * $edge->loss) . "% loss)";
882
}
883

884
885
886
887
888
889
890
891
892
893
894
# returns whether the current node has a link.
sub linked_node {
    my $node = shift @_;
    foreach my $edge (@links) {
        if($node eq $edge->src
           || $node eq $edge->dst) {
            return TRUE;
        }
    }
    return FALSE;
}
895

896
# Handles reading NS output.
897
898
899
sub get_topo {
    my $ns_outfile = shift(@_);

900
901
    my @results = &read_file($ns_outfile);
    foreach (@results) {
902
903
904
905
906

	# load the output from ns.
	# the file format is simple:
	# expr := h <node name>
	#      || l <src node> <dst node> <bw (Mb/s)> <latency (s)> <loss (%)>
907
908
909
	if( /^h (\S+)/ ) {
	    push @hosts, $1
	} elsif ( /^l (\S+)\s+(\S+)\s+(\d+)\s+(\d\.\d+)\s+(\d\.\d+)/) {
910
911
912
913
	    my $newEdge = new edge;
	    $newEdge->src($1);
	    $newEdge->dst($2);
	    $newEdge->bw($3);
914
	    $newEdge->delay($4 * 1000); # units of ms
915
	    $newEdge->loss($5);
916
	    push @links, $newEdge;
917
918

	# currently recognize only Static routing
919
	} elsif (/^r Static/i) {
920
	    $rtproto = RTPROTO_STATIC;
921
922
923
	}
    }

924
925
926
    # sorted order.
    @hosts = sort { $a cmp $b } @hosts;
    @links = sort { $a->src . $a->dst cmp $b->src . $b->dst } @links;
927
928
929
930
931
932
933

}


# prints out the topology read in from the NS file
sub debug_top {
    &debug("ns script: $ns_file\n");
934
    &debug("event server: $event_server\n");
935
    &debug("nodes:\n");
936
    foreach my $vert (@hosts) {
937
938
939
	&debug( " " . $vert . "\n");
    }
    &debug("links:\n");
940
    foreach my $edge (@links) {
941
942
943
944
945
946
947
	&debug( " " . $edge->src . " " . $edge->dst . " " . $edge->bw
		. " " . $edge->delay . " " . $edge->loss . "\n"
		);
    }
    &debug("routing protocol: $rtproto\n") if defined($rtproto);
}

948
# log to expt problem directory.
949
sub error {
950
951
952
    my($test,$edge,$msg) = @_;

    my $output = "$test\n";
953
    $output .= "  Link:  " . &print_link($edge) . "\n";
954
955
    $output .=     "  Error: $msg\n\n";

956
957
958
    &debug($output);    
    &append_file($linktest_path . "/" . $hostname . SUFFIX_ERROR,
		 $output);
959
960
}

961
sub barrier {
962
    if($hostname eq $synserv) {
963
	if($barr_count) {
David Anderson's avatar
David Anderson committed
964
	    &my_system(PATH_EMULAB_SYNC,"-i",$barr_count,"-n",SYNC_NAMESPACE);
965
	}
966
    } else {
David Anderson's avatar
David Anderson committed
967
	&my_system(PATH_EMULAB_SYNC,"-n",SYNC_NAMESPACE);
968
969
    }

970
971
972
}

sub debug {
973
    return unless $debug_level;
974
    print "@_";
975
976
}

977

978

979

980
# returns one edge at a time, reserving two nodes.
981
982
983
sub get_assign {
    my ($todo_ref) = @_; # must maintain sorted order invariant
    my $task = undef;
984
985
    my @thisrun;

986
987
    # build a fresh hash to see which nodes are in use.
    my %inuse;
988
    foreach (@hosts) {
989
990
991
992
993
	$inuse{$_}=0;
    }

    for(my $i=0;$i<@{$todo_ref};$i++) {
	my $edge = @{$todo_ref}[$i];
994
	if(defined($edge) && !($inuse{$edge->src} || $inuse {$edge->dst})) {
995
996
	    $inuse{$edge->src} = 1;
	    $inuse{$edge->dst} = 1;
997
998
	    push @thisrun,$edge;
	    @{$todo_ref}[$i] = undef;
999
1000
	}
    }