linktest.pl.in 39.4 KB
Newer Older
1
#!/usr/bin/perl -w -T
2
#
3
# EMULAB-COPYRIGHT
Mike Hibler's avatar
Mike Hibler committed
4
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
5
# All rights reserved.
6
#
7

8
9
use strict;
use Class::Struct;
10
11
use POSIX qw(uname);
use IO::Handle;
12
use English;
13
use Socket;
14
15
16
17

#
# Linktest test script. This script is set up to run as root on
# experiment nodes. It is invoked by the Linktest daemon after the
18
19
20
21
22
23
# daemon receives a Linktest "START" event. The script runs tests
# using ping, rude/crude (a real-time packet emitter/collector) and a
# locally hacked iperf to test all links in the experiment. If the
# results show a 99% chance that the experiment is configured
# incorrectly, an error is logged to the experiment directory in
# tbdata/linktest.  Valid ranges have been calibrated ahead of time.
24
25
26
#
sub usage() 
{
Timothy Stack's avatar
   
Timothy Stack committed
27
28
29
30
    print("Usage: linktest.pl\n".
	  " [STARTAT=<test step, 1-4>]\n".
	  " [STOPAT=<test step, 1-4>]\n".
	  " [DEBUG=<debugging level. 1=on, 0=off>]\n");
31
    exit(0);
32
33
}

34

35
36
37
##############################################################################
# Constants
##############################################################################
38

39
# path to applications and files
40
use constant LINKTEST_NSPATH => "@LINKTEST_NSPATH@";
41
42
43
44
use constant PATH_NICKNAME => "@CLIENT_VARDIR@/boot/nickname";
use constant PATH_KEYFILE => "@CLIENT_VARDIR@/boot/eventkey";
use constant PATH_RUDE => "/usr/local/bin/rude";
use constant PATH_CRUDE => "/usr/local/bin/crude";
45
use constant PATH_IPERF => "/usr/local/bin/emulab-iperf";
46
use constant PATH_RCTOPO => "@CLIENT_BINDIR@/rc/rc.topomap";
47
48
use constant PATH_EMULAB_SYNC => "@CLIENT_BINDIR@/emulab-sync";
use constant PATH_LTEVENT => "@CLIENT_BINDIR@/ltevent";
Timothy Stack's avatar
   
Timothy Stack committed
49
use constant PATH_TEVC => "@CLIENT_BINDIR@/tevc";
50
use constant RUN_PATH => "@CLIENT_BINDIR@"; # where the linktest-ns runs.
51
use constant EVENTSERVER => "@EVENTSERVER@";
52

53
54
# log files used by tests.
use constant CRUDE_DAT => "/tmp/crude.dat"; # binary data
55
use constant RUDE_CFG  => "/tmp/rude.cfg";
56
use constant IPERF_DAT => "/tmp/iperf.dat";
57

58
59
# iperf test limits.
use constant LIMIT_BW_HI => 100000000;
60
61
62
use constant LIMIT_BW_LO =>  1000000;
use constant LIMIT_BW_LOSS => 0;

63
64
# Make sure that we dont get bogged down in being too accurate! 
# Make sure the error is a certain significance before we start reporting it.
65
use constant INSIGNIFICANT_LAT_ERROR_LO => 0.50;  # ms
66
use constant INSIGNIFICANT_LAT_ERROR_HI => 3.50;  # ms
67
use constant INSIGNIFICANT_BW_ERROR_HI  => 0.01;  # percent.
68
use constant INSIGNIFICANT_BW_ERROR_LO  => 0.06;  # percent.
69
70
71

# latency must be corrected for xmit delay under this speed.
use constant LAT_LOW_BW => 10000000;
72

73
74
75
76
# slow send rate (for bw 256kbps to 1Mbps)
use constant SLOW_BW => 256000;
use constant SLOW_SEND => 400;
use constant FAST_SEND => 1002;
77

78
# misc contstants
79
use constant BSD => "FreeBSD";
80
use constant LINUX => "Linux";
81
use constant RTPROTO_STATIC => "Static";
82
use constant RTPROTO_SESSION => "Session";
Timothy Stack's avatar
   
Timothy Stack committed
83
use constant EVENT_COMPLETE => "COMPLETE";
84
use constant EVENT_REPORT => "REPORT";
Timothy Stack's avatar
   
Timothy Stack committed
85
use constant EVENT_LOG => "LOG";
86
use constant PING_SEND_COUNT => 10;
David Anderson's avatar
David Anderson committed
87
use constant SYNC_NAMESPACE => "linktest";
88
89

# test levels
90
91
92
93
use constant TEST_LATENCY => 1; # direct link connectivity & latency
use constant TEST_RT_STATIC => 2;   # prior plus static routing
use constant TEST_LOSS => 3;   # prior plus loss
use constant TEST_BW => 4; # prior plus bandwidth
94

95
# test names
96
use constant NAME_RT_STATIC => "Routing";
97
98
99
use constant NAME_LATENCY => "Latency (Round Trip)";
use constant NAME_LOSS => "Loss";
use constant NAME_BW => "Bandwidth";
100

101
# error suffix for logs
102
use constant SUFFIX_ERROR => ".error";
103
use constant DEBUG_ALL => 2; # debug level for all debug info, not just msgs.
104

105
106
# exit codes
use constant EXIT_ABORTED => -1;
Timothy Stack's avatar
   
Timothy Stack committed
107
use constant EXIT_NOT_OK => 1;
108
109
use constant EXIT_OK => 0;

110
111
112

# struct for representing a link.
struct ( edge => {
113
    name => '$',
114
    src => '$',
115
    srcip => '$',
116
    dst => '$',
117
    dstip => '$',
118
119
    bw  => '$',
    delay => '$',
120
121
    loss => '$'});

Timothy Stack's avatar
   
Timothy Stack committed
122
123
124
125
126
struct ( host => {
    name => '$',
    visited => '$',
    links => '@'});

127
# fixes emacs colorization woes introduced by above struct definition.
Timothy Stack's avatar
   
Timothy Stack committed
128
# struct ( unused => { foo => '$'});
129

130
131
132
use constant TRUE => 1;
use constant FALSE => 0;

133
134
135
136
##############################################################################
# Globals
##############################################################################

137
my $topology_file;    # location of the topology input file.
138
my $synserv;    # synch server node
139
my $rtproto;    # routing protocol
140
my $hostname;   # this hosts name
141
142
143
144
my $exp_id;     # experiment id
my $proj_id;    # project id
my $gid;        # group id
my $platform;   # name of platform
145
146
147
148
my $startat=1;  # which test to start at
my $stopat=99;  # which test to stop at
my @kill_list;  # PIDs maintained through the life of linktest
                    # which get killed as part of cleanup.
149
150
151
my $debug_level = 0; # enable debug statements
                    # 1 = print debug statements.
                    # 2 = show STDOUT and STDERR
152
my $barr_count;   # used by synserv host, nubmer of hosts -1
153
my $log_file;    # common logfile for information saved over time.
154
155

my @hosts; # hosts: list of text strings containing host names.
156
           # sorted alphabetically
Timothy Stack's avatar
   
Timothy Stack committed
157
my %hostmap;
158
my @links; # links: list of edge structs.
159
           # sorted alphabetically by src . dst
160
                                  
161
162
my $expt_path;  # experiment path (ie, tbdata) set by init.
my $linktest_path;   # log path (ie tbdata/linktest) set by init.
Timothy Stack's avatar
   
Timothy Stack committed
163
164
165
166
167
168
169
170
my $simname = "ns";
my $swapper = "";
my $swapperid = 0;
my $swappergid = 0;
my $token = -1;
my $error_count = 0;
my $stage_error_count = 0;
my $total_error_count = 0;
171
172
173
174

##############################################################################
# Main control
##############################################################################
175

176
177
178
179
180
181
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

$| = 1; #Turn off line buffering on output

182
183
184
# Make sure that files written into the experiment subdir are group writable.
umask(0002);

185
186
187
188
189
190
191
192
193
194
195
196
197
198
#
# Parse command arguments. Since Linktest is run via the event system,
# parse out pairs of <symbol>=<value>.
#
foreach my $arg (@ARGV) {
    if($arg =~ /STOPAT=(\d)/) {
	$stopat=$1;
    }
    if($arg =~ /STARTAT=(\d)/) {
	$startat=$1;
    }
    if($arg =~ /DEBUG=(\d)/) {
	$debug_level=$1;
    }
Timothy Stack's avatar
   
Timothy Stack committed
199
200
201
202
203
204
205
    if($arg =~ /TOKEN=(\d+)/) {
	$token=$1;
    }
    if($arg =~ /SWAPPER=(\w+)/) {
	$swapper=$1;
	(undef,undef,$swapperid,$swappergid) = getpwnam($swapper);
    }
206
207
208
209
210
211
212
213
214
215
216
217
218
}

#
# Parse the nickname file to obtain the host name, 
# experiment ID and the project ID.
#
my $fname = PATH_NICKNAME;
die("Could not locate $fname\n") unless -e $fname;
my @results = &read_file($fname);
($hostname, $exp_id, $proj_id) = split /\./, $results[0];
chomp $hostname;
chomp $exp_id;
chomp $proj_id;
Mike Hibler's avatar
Mike Hibler committed
219
220
221
222
223
224
225
226

# taint check pid/eid
if ($proj_id =~ /([-\w]*)/) {
    $proj_id = $1;
}
if ($exp_id =~ /([-\w]*)/) {
    $exp_id = $1;
}
227
228
229
230
231
232
233
234
$gid = $proj_id;

#
# Set path variables storing the experiment logging path,
# the current ns file and the output file for topology info.
#
$expt_path = "/proj/$proj_id/exp/$exp_id/tbdata";
$linktest_path = "$expt_path/linktest";
Timothy Stack's avatar
   
Timothy Stack committed
235
$topology_file = "@CLIENT_VARDIR@/boot/ltmap";
236
237
238
239
240
241
242
243
244
245
246
247
248
249

#
# Determine location of the customized ns binary for Linktest.
#
($platform) = POSIX::uname();

#
# Parse the syncserver file to find out which node is the sync server.
#
my $ssname = "@CLIENT_VARDIR@/boot/syncserver";
die("Could not locate an emulab-sync server\n") unless -e $ssname;
@results = &read_file($ssname);
($synserv) = split/\./, $results[0];
chomp $synserv;
250

251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#
# If the current node is the special node (arbitrarily the sync
# server node), do some housekeeping and run ns to generate
# the topology input file, which is read by all nodes to obtain
# the experiment topology.
#
if(&is_special_node()) {

    #
    # If the shared path used by Linktest for logging and temporary
    # files already exists, clear its contents for this run.
    #
    if( -e $linktest_path ) {
	die("Path $linktest_path is not a directory\n") 
	    unless -d $linktest_path;

	opendir (DIR,$linktest_path)
	    ||  die("Could not open $linktest_path: $!");
	my @dirfiles = grep (/error$/,readdir(DIR));
	foreach (@dirfiles) {
	    &do_unlink("$linktest_path/$_");
	}
	closedir(DIR);

    } else {
	# 
	# The shared path does not exist, create it.
	#
	mkdir (&check_filename($linktest_path),0777) 
	    || die("Could not create directory $linktest_path: $!");
Timothy Stack's avatar
   
Timothy Stack committed
281
	chown($swapperid, $swappergid, $linktest_path);
282
283
    }

284
285
286
    # Update our ltmap file so that linktest runs off current data.
    &my_system(PATH_RCTOPO);

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
    #
    # Call get_topo before the other nodes because
    # get_topo is used to initialize @hosts.
    #
    &get_topo($topology_file);

    #
    # Caculate the barrier count using the results of the NS script.
    # Note: This could be gotten from scalar(@hosts), but that
    # requires a successful parse of the NS file, which may fail.
    #
    $barr_count = @hosts;
    $barr_count--;
    
} # end of chores for the special node.

#
# All nodes remove local temporary files remaining from the last
# run, if any.
# 
&do_unlink(CRUDE_DAT);
308
&do_unlink(IPERF_DAT);
309
310
311
&do_unlink(RUDE_CFG);

#
312
# Start up child processes for receiving RUDE and iperf streams.
313
314
315
316
317
# These run over the lifetime of Linktest to reduce the number of
# barrier synchronizations and startup delays. Always give the 
# collectors a moment to start up.
#
if(&dotest(TEST_BW)){
318
    &my_system_initonly(PATH_IPERF,"-s","-f","b","-u","-w","200000");
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
    sleep(1);
}
if(&dotest(TEST_LOSS)){
    &my_system_initonly(PATH_CRUDE,"-l",CRUDE_DAT);
    sleep(1);
}

#
# Finally, synchronize so that all nodes have completed startup
# tasks. When all nodes reach this barrier, the topology input file
# has been written, local variables are initialized, background
# listeners have been started, and temporary files have been cleared.
#
&barrier;

#
# Beginning of the tests. First, all nodes read the topology.
# Since this can occur at the same time after the barrier,
# have a short randomized delay to avoid hitting NFS too hard.
#
sleep(int(rand(5)));
340
341
# Update our ltmap file so that linktest runs off current data.
&my_system(PATH_RCTOPO);
342
343
&get_topo($topology_file) unless &is_special_node();
&debug_top();
344

Timothy Stack's avatar
   
Timothy Stack committed
345
346
347
348
my $msg = "Linktest Starting";
&sim_event(EVENT_LOG,$msg);
&debug("\n$msg\n\n");

349
350
351
352
353
354
355
if (defined($rtproto) && $rtproto eq RTPROTO_SESSION) {
    my $msg = "Session routing active; waiting a bit to let routes stabilize";
    &sim_event(EVENT_LOG,$msg);
    &debug("\n$msg\n\n");
    sleep(30);
}

356
if(&dotest(TEST_LATENCY)) {
357
358
    my $msg = "Testing Single Hop Connectivity and Latency...";
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
   
Timothy Stack committed
359
    &sim_event(EVENT_LOG,$msg);
360
361
362
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    &barrier();
363
    &debug("\n$msg\n\n");
364
    &latency_test;
Timothy Stack's avatar
   
Timothy Stack committed
365
    &report_status;
366
367
}

368
369
if(&dotest(TEST_RT_STATIC)
    && defined($rtproto)
370
    && ($rtproto eq RTPROTO_STATIC || $rtproto eq RTPROTO_SESSION)) {
371
    my $msg = "Testing Routing...";
372
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
   
Timothy Stack committed
373
    &sim_event(EVENT_LOG,$msg);
374
375
376
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    &barrier();
377
    &debug("\n$msg\n\n");
378
    &static_rt_test; # nodes not covered by 1hop test
Timothy Stack's avatar
   
Timothy Stack committed
379
    &report_status;
380
}
381

382
if(&dotest(TEST_LOSS)) {
383
384
    my $msg = "Testing Loss...";
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
   
Timothy Stack committed
385
    &sim_event(EVENT_LOG,$msg);
386
387
388
    # Ick, this barrier makes sure the above message gets into the log
    # first, so as not to confuse Mike
    &barrier();
389
    &debug("\n$msg\n\n");
390
    &loss_test; 
Timothy Stack's avatar
   
Timothy Stack committed
391
    &report_status;
392
}
393

394
if(&dotest(TEST_BW)){
395
396
    my $msg = "Testing Bandwidth...";
    &post_event(EVENT_REPORT,$msg);
Timothy Stack's avatar
   
Timothy Stack committed
397
    &sim_event(EVENT_LOG,$msg);
398
    # Ick, this barrier makes sure the above message gets into the log
399
    # first, so as not to confuse Mike
400
    &barrier();
401
    &debug("\n$msg\n\n");
402
    &bw_test;
Timothy Stack's avatar
   
Timothy Stack committed
403
    &report_status;
404
}
405

406
407
&cleanup;

408
409
&barrier();

Timothy Stack's avatar
   
Timothy Stack committed
410
411
412
413
$msg = "Linktest Done";
&sim_event(EVENT_LOG,$msg);
&debug("\n$msg\n\n");

414
415
416
#
# Send an event indicating that Linktest has completed normally.
#
Timothy Stack's avatar
   
Timothy Stack committed
417
&post_event(EVENT_COMPLETE,"ERROR=$total_error_count CTOKEN=$token");
418

419
exit(EXIT_OK);
420

421

422
##############################################################################
423
# Loss Test Functions
424
425
##############################################################################

426
# Writes the configuration file used by RUDE.
427
428
sub write_rude_cfg {
    my ($stream_id, $edge) = @_;
429
430
431
    my @contents;
    push @contents, "START NOW\n";
    push @contents, "0000 $stream_id ON 3001 " 
432
	. $edge->dst . "-" . $edge->name
433
434
435
436
437
438
	    . ":10001 CONSTANT " 
		. &get_loss_sample_size($edge) 
		    . " 20\n";
    push @contents, "1000 $stream_id OFF\n";

    &write_file(RUDE_CFG, @contents);
439
440
441

}

442
443
# Returns the sample size used by the Loss test.
# TODO: why this number? (from my ProbStats book.)
444
445
446
447
448
449
450
451
452
sub get_loss_sample_size {
    my $edge = shift @_;
    if($edge->loss > 0) {
	return &round( 10 / $edge->loss);
    } else {
	return SLOW_SEND; # just in case a slow link with no loss.
    }
}

453
454
455
456


# returns TRUE if the link loss is valid for the linktest loss test.
sub valid_loss {
457
    my $edge = shift @_;
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
    if($edge->bw >= SLOW_BW && $edge->bw < LIMIT_BW_LO) {
	if(&get_loss_sample_size($edge) > SLOW_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } elsif( $edge->bw >= LIMIT_BW_LO) {
	# also want an upper limit.
	if(&get_loss_sample_size($edge) > FAST_SEND) {
	    return FALSE;
	} else {
	    return TRUE;
	}
    } else {
	return FALSE;
    }
474
}
475
476
477
478


# This test uses RUDE and CRUDE to send a stream of packets
# in both directions over a duplex link.
479
480
481
482
483
484
485
486
487
488
489
sub loss_test {
    my %analyze;
    my %recv_cnt;
    my $stream_id = 1;
    my @edge_copy = @links;
    while(&has_elems(\@edge_copy)) {
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src) {
		if(valid_loss($edge)) {
		    &write_rude_cfg($stream_id,$edge);
490
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
491
492
493
494
495
496
497
		    $analyze{$stream_id} = $other_edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($edge) . "\n");
		}
	    } elsif ($hostname eq $other_edge->src) {
		if(valid_loss($other_edge)) {
		    &write_rude_cfg($stream_id,$other_edge);
498
		    &my_system(PATH_RUDE,"-s", RUDE_CFG);
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
		    $analyze{$stream_id} = $edge;
		} else {
		    &debug("Skipping loss test for " . &print_link($other_edge) . "\n");
		}
	    }
	}
	$stream_id++;
	&barrier();
    }

    # wait for any stragglers due to delay-- there is a  race
    # between the barrier sync on the control net and the expt net latency.
    sleep(1);

    # count packets received for each stream.
514
515
516
517
    my @results = &my_tick(PATH_CRUDE,"-d",CRUDE_DAT);
    my $result_count = @results;
    &debug("result_count from crude: $result_count\n");
    foreach (@results) {
518
519
520
521
522
	if(/ID=(\d+) /) {
	    $recv_cnt{$1}++;
	}
    }

523
    # analyze only links for which a stream was received.
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
    foreach my $key (keys %analyze) {
	my $edge = $analyze{$key};
	my $received = $recv_cnt{$key};

	if(!defined($received)) {
	    $received=0;
	    &error (NAME_LOSS,$edge,"No packets received from " . $edge->src);
	} else {
	    # this is a large sample test about proportion p.
	    # this is considered a valid statistical estimate for np >= 10.
	    my $p = 1 - $edge->loss;
	    my $n = &get_loss_sample_size($edge) + 1 ;


	    my $p_hat = $received / $n;
	    my $numerator = $p_hat - $p;
540
	    my $denominator = sqrt( abs( $p * (1 - $p_hat) / $n) );
541
542
543
544

	    if( $edge->loss == 0) {
		if($received < $n) {

545
546
547
		    ####
		    ## TODO:

548
549
550
		    
		    my $errmsg = "Unexpected loss occurred (n=$n, received=$received)\n";
		    &error(NAME_LOSS, $edge, $errmsg);
551
		} # note, no logging of succesful 0-loss. (too common).
552
553
554
555
	    } elsif($denominator == 0) {
		my $errmsg = "No packets were lost (n=$n, plr=" . $edge->loss .")";
		&error(NAME_LOSS, $edge, $errmsg);
	    } else {
556

557
558
559
560
		
		my $z = $numerator / $denominator;
		my $reject_region = 2.58; # alpha = 0.1, normal distro by CLT
		if(abs($z) > $reject_region) {
561
		    my $errmsg = "Probable loss misconfiguration (n:$n, received: $received; expected proportion: $p, measured proportion=$p_hat)";
562
563
564
565
566
567
568
569
		    &error(NAME_LOSS, $edge, $errmsg);
		}
	    }
	}

    }


570
    # wait for completion before next test.
571
572
573
    &barrier();
}

574
575
576
##############################################################################
# Latency Test Functions
##############################################################################
577

578
579
580
# returns whether the link latency is in a valid test range.
sub valid_latency {
    return TRUE;
581

582
}
583

584
585
586
587
588
589
590
591
592
# Pings a node and returns information.
# @param[0] := host to ping
# @param[1] := ttl, 0 for default
# @return: (received_count, avg_latency ms)
sub ping_node {
    my ($host,$ttl) = @_;
    my $count = 0;
    my $avg_latency = 0;
    my $stddev = 0;
593

594
595
596
    my $send_count = PING_SEND_COUNT;
    my $timeout = 1; # 1 second
    my $send_rate = $timeout / $send_count;
597

598
599
    # set deadline to prevent long waits
    my $cmd;
600
    my $ttlarg = "";
601
602
    if($ttl) {
	if($platform eq BSD) {
603
	    $ttlarg = "-m $ttl";
604
	} elsif($platform eq LINUX) {
605
606
607
608
	    $ttlarg = "-t $ttl";
	} elsif($platform =~ /CYGWIN/) {
	    # The Windows system ping has a TTL arg; Cygwin ping doesn't. 
	    $ttlarg = "-i $ttl";
609
610
	}
    }
611
612
613
614
615
616
617
618
619
    if($platform eq BSD) {
	$cmd = "/sbin/ping -c $send_count -q -i $send_rate -t $timeout $ttlarg $host";
    } elsif($platform eq LINUX) {
	$cmd = "/bin/ping -c $send_count -q -i $send_rate -w $timeout $ttlarg $host";
    } elsif($platform =~ /CYGWIN/) {
	# Neither Windows nor Cygwin ping has either send rate or timeout.
	# Windows ping doesn't have -q, but it does have TTL, so use it.
	$cmd = "/cygdrive/c/WINDOWS/system32/ping.exe -n $send_count $ttlarg $host";
    }
620

621
622
623
624
625
626
627
628
629
630
    # note backticks passes SIGINT to child procs
    my @args = split(/\s+/,$cmd);
    my @results = &my_tick(@args);

    my $reslt_cnt = @results;
    my $result = $results[$reslt_cnt-2];
    if($platform eq BSD && $result =~ /(\d+) packets received/) {
	$count = $1;
    } elsif($platform eq LINUX && $result =~ /(\d+) received/) {
	$count = $1;
631
632
633
    } elsif($platform =~ /CYGWIN/ && 
	    $results[$reslt_cnt-3] =~ /Received = (\d+)/) {
	$count = $1;
634
635
    }

636
637
638
639
640
    if($count) {
	$result = $results[$reslt_cnt-1];
	if($result=~ /\d+\.\d+\/(\d+\.\d+)\/\d+\.\d+\/(\d+\.\d+)/) {
	    $avg_latency = $1;
	    $stddev = $2;
641
642
643
	} elsif($result=~ /Average = (\d+)ms/) {
	    $avg_latency = $1;
	    $stddev = 0.03;	# Stddev is not reported on Windows.
644
645
	}
    }
646
    return ($count, $avg_latency, $stddev);
647
648
}

649
# For directly connected hosts, checks latency using Ping.
650
sub latency_test {
Timothy Stack's avatar
   
Timothy Stack committed
651
    my %waitlist;
652
    my @edge_copy = @links;
653

654
    while(&has_elems(\@edge_copy)) {
655
656
657
	my ($edge,$other_edge) = &get_twoway_assign(\@edge_copy);
	if(defined($edge) && defined($other_edge)) {
	    if($hostname eq $edge->src ) {
658
659
660
661
662
663
664
		# todo: consider ignoring latency if no delay node.
		if(&valid_latency($edge) && &valid_latency($other_edge)
		   ) {
		    my $pid = fork();
		    if(!$pid) {

			# call ping_node with ttl=1
665
			my ($result_cnt, $sample_avg, $sample_dev) =
666
			    &ping_node($edge->dst . "-" . $edge->name,1);
667
668
669
670
671
			
			&info("    Latency result on $hostname for " .
			      &print_edge($edge) . 
			      ": count/avg/stddev = ".
			      "$result_cnt/$sample_avg/$sample_dev\n");
672
673
674
675
676
677

			my $n = PING_SEND_COUNT;

			if($result_cnt == 0) {
			    my $errmsg = "No packets were received (n=$n)\n";
			    &error(NAME_LATENCY, $edge, $errmsg);
Timothy Stack's avatar
   
Timothy Stack committed
678
			    exit(EXIT_NOT_OK);
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
			} else {

# facts from analysis in /users/davidand/public/calibrate.
# came from 40 independent swapins (enough for normality assumption)
# (note that data actually is normal at any particular latency point, 
# according to described.lst)

# best fit regression for the error as a function of total latency, according to sas.
# see regression1.lst and regression1.sas
#-0.00523(actual)     0.00003096 fbsd
#-0.00530(actual)     0.00003478 linux
# roughly identical, so use:
#-0.005(actual)

# inherent delay in the system (with a delay node) is
# see described.lst and described.sas
# 0.337737  fbsd
# 0.362282  linux (median was 0.328000)
697
# round to:
698
# 0.333 ms
699
700

# note, this has been measured and is in one of the emulab papers (Shashi)
701
702
703
704
705
706
707
708
709
710
711
712
713
714

# Also, described.lst provides good support for the notion that
# the distribution of latencies is normal. For Fbsd all of the 
# distributions were normal, and most were for Linux. So, use this
# assumption in order to send fewer test packets.
			   

			    # the null hypothesis value, u.
			    my $u = $edge->delay + $other_edge->delay;
			    
			    # the calibration as a function of $u
			    $u += 0.333 - 0.005 * $u / 2;


715
# factor in transport delay at slow network speeds.
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731

			    # transport delay: 64 bytes + 18 bytes eth/crc
			    # to units of ms.
			    if($edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $edge->bw;
			    }
			    if($other_edge->bw < LAT_LOW_BW) {
				$u += 1000 * (82 * 8) / $other_edge->bw;
			    }
			    
			    my $x_bar = $sample_avg;
			    my $numerator = $x_bar - $u;

			    my $S = $sample_dev;
			    

732
			    my $denominator = $S / sqrt( abs( $n ) );
733
734
735
736

			    if($denominator == 0) {
				my $errmsg = "Invalid sample standard deviation (possible parse problem, please report). (n=$n, u=$u, x_bar=$x_bar, S=$S)";
				&error(NAME_LATENCY, $edge, $errmsg);
Timothy Stack's avatar
   
Timothy Stack committed
737
				exit(EXIT_NOT_OK);
738
739
740
741
			    } else {
				my $z = $numerator / $denominator;
				my $t_reject = 3.250; # alpha = 0.01, df=9

742
743
744
745
746
747
748
749
750
751
752
				if ((abs($z) > $t_reject) &&
				    (($x_bar < $u &&
				      (($u - $x_bar) >=
				       INSIGNIFICANT_LAT_ERROR_LO)) ||
				     ($x_bar > $u &&
				      (($x_bar - $u) >=
				       INSIGNIFICANT_LAT_ERROR_HI)))) {
				    &error(NAME_LATENCY, $edge,
					   "Probable latency ".
					   "misconfiguration (expected=$u, ".
					   "measured mean=$x_bar).");
Timothy Stack's avatar
   
Timothy Stack committed
753
				    exit(EXIT_NOT_OK);
754
755
756
				}
			    }
			}
757
			exit(EXIT_OK);
758
		    } else {
Timothy Stack's avatar
   
Timothy Stack committed
759
			$waitlist{$pid} = 1;
760
		    }
761
		} else {
762
		    &debug("Skipping latency test for " . &print_link($edge) . " to " . &print_link($other_edge) . "\n");
763
764
		}
	    }
765
#
766
767
	}
    }
768

Timothy Stack's avatar
   
Timothy Stack committed
769
    &wait_all(%waitlist);
770
771
    # wait for completion before next test.
    &barrier();
772
773
}

774

775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
##############################################################################
# Bandwidth Test Functions
##############################################################################

# Returns whether the link bandwidth is in a valid test range.
sub valid_bw {
    my $edge = shift @_;
    if($edge->bw >= LIMIT_BW_LO
       && $edge->bw <= LIMIT_BW_HI
       && $edge->loss <= LIMIT_BW_LOSS
       ) {
	return TRUE;
    } else {
	return FALSE;
    }
}


# Checks bandwidth for directly connected links.
794
sub bw_test {
795
796
    my @analyze_list	= ();
    my @edge_copy	= @links;
797

798
    #
799
800
    # all nodes will execute the same reductions on the edge list
    # on their own so that the number of barriers is the same.
801
802
803
    #
    while (&has_elems(\@edge_copy)) {
	my ($edge,$redge) = &get_twoway_assign(\@edge_copy);
804

805
806
807
808
809
	# Figure out what bw to use so as not to overflow the
	# system too badly. Take the max of the two edges and
	# add 10 percent.
	my $bw = 0;

810
	if (defined($edge) && defined($redge)) {
811
	    if($hostname eq $edge->dst) {
812
813
814
		#
		# iperf does a twoway test.
		#
815
		if (&valid_bw($edge)) {
816
		    push(@analyze_list, $edge);
817
818
819

		    $bw = $edge->bw
			if ($edge->bw > $bw);
820
		    
821
822
823
		    &debug("    Starting bandwidth test on $hostname for " .
			  &print_link($edge) . "\n");
		    &info("    Starting bandwidth test on $hostname for " .
824
			  &print_link($edge) . "\n");
825
826
		}
		else {
827
		    &debug("Skipping bandwidth test on $hostname for " .
828
			   &print_link($edge) . "\n");
829
		    
830
		    &info("*** Skipping bandwidth test on $hostname for " .
831
832
833
834
			  &print_link($edge) . "\n");
		    &info("*** Bandwidth is out of range ".
			  "(" . LIMIT_BW_LO . " <= BW <= " . LIMIT_BW_HI .") ".
			  "or loss is too high (> " . LIMIT_BW_LOSS . ").\n");
835
		}
836
837
838
		if (&valid_bw($redge)) {
		    push(@analyze_list, $redge);
		    
839
840
841
		    $bw = $redge->bw
			if ($redge->bw > $bw);
		    
842
843
844
		    &debug("    Starting bandwidth test on $hostname for " .
			  &print_link($redge) . "\n");
		    &info("    Starting bandwidth test on $hostname for " .
845
846
847
			  &print_link($redge) . "\n");
		}
		else {
848
		    &debug("Skipping bandwidth test on $hostname for " .
849
850
			   &print_link($redge) . "\n");
		    
851
		    &info("*** Skipping bandwidth test on $hostname for " .
852
853
854
855
856
			  &print_link($redge) . "\n");
		    &info("*** Bandwidth is out of range ".
			  "(" . LIMIT_BW_LO . " <= BW <= " . LIMIT_BW_HI .") ".
			  "or loss is too high (> " . LIMIT_BW_LOSS . ").\n");
		}
857
858
		my $bw = $bw + int($bw * 0.10);
		
859
860
861
		# Okay, start the test.
		if (&valid_bw($edge) || &valid_bw($redge)) {
		    &my_system(PATH_IPERF,
862
863
			       "-c", $edge->src . "-" . $edge->name,
			       "-t", "10", "-f", "b",
864
			       "-r", "-u", "-w", "200000", "-b", "$bw",
865
866
867
			       "-x", "s", "-y", "c",
			       "-L", "4444", "-o", IPERF_DAT);
		}
868
	    } 
869
870
871
872
	}
	&barrier();
    }

873
    # read the log file.
874
875
876
877
878
879
880
881
882
    if (@analyze_list) {
	my @results = &read_file(IPERF_DAT);
	foreach my $edge (@analyze_list) {
	    my $found_results = 0;
	    
	    foreach my $result (@results) {
		my @stuff = split(",", $result);
		if (scalar(@stuff) < 9) {
		    die("Error parsing " . IPERF_DAT . "\n");
883
		}
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
		my $myip  = $stuff[1];
		my $port  = $stuff[2];
		my $hisip = $stuff[3];
		my $bw    = $stuff[8];
		
		#
		# iperf is a twoway test. Both edges represented in the file.
		#
		if (($hostname eq $edge->dst &&
		     $edge->dstip eq $myip && "$port" eq "4444" &&
		     $edge->srcip eq $hisip) ||
		    ($hostname eq $edge->src &&
		     $edge->dstip eq $myip && "$port" eq "5001" &&
		     $edge->srcip eq $hisip)) {
		    my $expected = $edge->bw;
		    my $diff     = abs($bw - $expected);
		    my $error    = undef;

902
903
		    &debug("BW results on $hostname for " .
			   $edge->src . "-" . $edge->name . ": ".
904
905
			   "$bw/$expected/$diff\n");

906
907
908
909
		    &info("    Bandwidth result on $hostname for " .
			  &print_edge($edge) . 
			  ": expected/actual = $expected/$bw\n");

910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
		    #
		    # The measurement tool does not give perfect results.
		    # However, it reports low all the time, so if it reports
		    # high, then the link is almost certainly bad.
                    #
		    if ($bw > $edge->bw) {
			if ($diff > ($expected * INSIGNIFICANT_BW_ERROR_HI)) {
			    $error = "higher";
			}
		    }
		    elsif ($bw < $edge->bw) {
			if ($diff > ($expected * INSIGNIFICANT_BW_ERROR_LO)) {
			    $error = "lower";
			}
		    }
		    if (defined($error)) {
			&error(NAME_BW, $edge,
			       "Bandwidth estimate ($bw bps) is $error ".
			       "expected ($expected)");
		    }
		    $found_results = 1;
		    last;
932
		}
933
934
935
	    }
	    if (!$found_results) {
		&error(NAME_BW, $edge, "Could not find results!");
936
937
938
	    }
	}
    }
939
940
    # wait for completion before termination so that all errors reported in.
    &barrier();
941
942
}

943
944
945
946
947

##############################################################################
# Static Routing Connectivity Test Functions
##############################################################################

Timothy Stack's avatar
   
Timothy Stack committed
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
# Traverse the links between nodes to figure out which nodes are actually
# reachable.  First parameter is a reference to an array that should be filled
# out with node names.  The second parameter contains the name of the node to
# visit.
sub reachable_nodes {
    my ($nodes_ref, $currnode) = @_;

    $hostmap{$currnode}->visited(1);
    foreach my $edge (@{ $hostmap{$currnode}->links }) {
	my $nextnode;

	if ($edge->src eq $currnode) {
	    $nextnode = $edge->dst;
	} 
	else {
	    $nextnode = $edge->src;
964
	}
Timothy Stack's avatar
   
Timothy Stack committed
965
966
967
	if ($hostmap{$nextnode}->visited == 0) {
	    if (!($currnode eq $hostname)) { # Don't add 1st hop nodes.
		push @{$nodes_ref}, $nextnode;
968
	    }
Timothy Stack's avatar
   
Timothy Stack committed
969
	    &reachable_nodes($nodes_ref, $nextnode);
970
	}
971
    }
Timothy Stack's avatar
   
Timothy Stack committed
972
973
    $hostmap{$currnode}->visited(2);
}
974

Timothy Stack's avatar
   
Timothy Stack committed
975
976
977
978
979
980
# Attempts to reach nodes that are not on a direct link
# with this host. IE, use TTL > 1. Pings are in parallel.
sub static_rt_test {
    my @nodes = ();

    &reachable_nodes(\@nodes, $hostname);
981
    &debug("Route test nodes: @nodes\n");
Timothy Stack's avatar
   
Timothy Stack committed
982
983

    my %waitlist;
984
    # fork processes to run the pings in parallel.
Timothy Stack's avatar
   
Timothy Stack committed
985
986
987
988
989
990
991
992
993
994
    foreach my $dst (@nodes) {
	my $pid = fork();
	if(!$pid) {
	    my ($recv_cnt,$ignored1, $ignored2) = &ping_node($dst,0);
	    if(!$recv_cnt) {
		my $newEdge = new edge;
		$newEdge->src($hostname);
		$newEdge->dst($dst);
		&error(NAME_RT_STATIC,$newEdge , "$hostname could not ping $dst");
		exit(EXIT_NOT_OK);
995
	    } else {
Timothy Stack's avatar
   
Timothy Stack committed
996
		&debug("Attempting to reach $dst... OK\n");
997
	    }
Timothy Stack's avatar
   
Timothy Stack committed
998
999
1000
	    exit(EXIT_OK);
	} else {
	    $waitlist{$pid} = 1;
For faster browsing, not all history is shown. View entire blame