Commit 43d219c0 authored by David Johnson

Added a function to rank plab nodes in terms of unreliability and jitter

(i.e., when plab nodes are up and down a lot for short periods of time).
See the comments to figure out how it works.  It's not quite done yet, but
is good enough for now.
parent 7fdffcb8
......@@ -15,7 +15,7 @@ use Exporter;
use vars qw(@ISA @EXPORT);
@ISA = "Exporter";
@EXPORT = qw (getNodeHistSequences sequenceToStr);
@EXPORT = qw (getNodeHistSequences rankNodesByUnavail sequenceToStr);
# Must come after package declaration!
use lib '@prefix@/lib';
......@@ -58,6 +58,9 @@ sub getNodeHistSequences(;$$) {
my $now = time();
# save off "now" for other functions...
$r{'__NOW__'} = $now;
# Note: the "parsing" requires that the statuses be ordered by
# physnodeid, then time. You know, push as much work to the db as
# possible.
......@@ -84,9 +87,11 @@ sub getNodeHistSequences(;$$) {
my $res = DBQueryFatal($q);
print "getSequences: beginning construction at " .
(strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";
if ($debug) {
print "getSequences: beginning construction at " .
(strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";
}
#my $i = 0;
my @row;
while (@row = $res->fetchrow_array()) {
......@@ -151,14 +156,270 @@ sub getNodeHistSequences(;$$) {
$r{$cpnode}{'lastseq'} = [ $cstatus, $nseq ];
}
print "getSequences: finished construction at " .
(strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";
print "getSequences: added $totalseqs sequences for $totalnodes nodes.\n";
if ($debug) {
print "getSequences: finished construction at " .
(strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";
print "getSequences: added $totalseqs sequences for $totalnodes nodes.\n";
}
return \%r;
}
# Smoothing factor intended for the EWMA ranking mode of rankNodesByUnavail.
my $EWMA_ALPHA = 0.10;

# Time units, expressed in seconds.
my ($MINUTE, $HOUR, $DAY) = (60, 60 * 60, 24 * 60 * 60);

# Width of one history bin (one day by default).
my $BIN_SECONDS = $DAY;

# A single failure or success sequence counts as "jitter" when it lasted
# less than this many seconds.
my $JITTER_DETECTION_TIME = $HOUR;

# Upper bound on the total jitter penalty that can be added to a node's rank.
my $JITTER_DEDUCTION_CAP = 0.25;

# Penalty added to a node's rank for every jittery sequence.  For example,
# 10 short failure sequences plus 8 short success sequences come to
# 18 * 0.015 = 0.27, which is then limited to $JITTER_DEDUCTION_CAP.
my $JITTER_DEDUCTION_FACTOR = 0.015;
#
# Rank nodes by unavailability.
#
# Returns a reference to a list of hash refs, each of which currently has
# the 'nodeid' and 'rank' keys; the list is sorted in increasing order of
# rank (so that the most available nodes come first).  Nodes with equal
# rank are ordered by node id so that the output is deterministic.
#
# The rank is the fraction of recorded time the node spent in 'failure'
# sequences (0 = always up, 1 = always down), plus an optional jitter
# penalty.  A node with no recorded time at all is given an unavailability
# of 1, since there is no evidence that it was ever up.
#
# Arguments:
#   $1 = a ref to a hash returned by getNodeHistSequences; if undef, this
#        function will call getNodeHistSequences.  Keys starting with "__"
#        are treated as metadata, not node ids.  Each sequence is read as
#        [ start time, length in seconds, ... ].
#   $2 = boolean; if !0, will increase the unavailability rank for a node
#        if it is jittery (i.e., often up or down for brief periods of
#        time).  Every sequence shorter than $JITTER_DETECTION_TIME adds
#        $JITTER_DEDUCTION_FACTOR, up to $JITTER_DEDUCTION_CAP in total.
#   $3 = boolean; if !0, the function will compute the rank from an
#        exponentially weighted average over $BIN_SECONDS-wide bins: the
#        bin that is k bins old gets weight (1 - $EWMA_ALPHA)**k, and each
#        bin contributes its failure time as a fraction of the time
#        recorded in that bin (so a partially covered oldest bin is not
#        under-counted).  Bins with no recorded time are ignored.
#
sub rankNodesByUnavail(;$$$) {
    my ($sref,$dojitter,$doewma) = @_;

    if (!defined($sref)) {
        $sref = getNodeHistSequences();
    }
    if (!defined($dojitter)) {
        $dojitter = 0;
    }
    if (!defined($doewma)) {
        $doewma = 0;
    }

    my %rank = ();
    # Fall back to the current time if the caller's hash carries no
    # timestamp; bin ages are measured relative to this instant.
    my $now = defined($sref->{'__NOW__'}) ? $sref->{'__NOW__'} : time();
    my @states = ('failure','success');

    foreach my $nodeid (keys(%$sref)) {
        if ($nodeid =~ /^__/) {
            next;
        }

        #
        # All accumulators are per node; declaring them here guarantees
        # that nothing leaks from one node into the next.
        #
        # We track the total length and number of failure/success
        # sequences (jitter is not a bin-able value), plus the number of
        # sequences short enough to count as jitter.
        #
        my %nstats = ('totseqnum_failure' => 0,
                      'totseqlen_failure' => 0,
                      'totseqnum_success' => 0,
                      'totseqlen_success' => 0,
                      'jitseqnum_failure' => 0,
                      'jitseqnum_success' => 0);
        # $binlen{state}[k] = seconds spent in that state in the bin that
        # is k bins old (bin 0 is the most recent).  Only used for ewma.
        my %binlen = ('failure' => [], 'success' => []);

        foreach my $s (@states) {
            my $seqlist = $sref->{$nodeid}{$s};
            if (!defined($seqlist)) {
                next;
            }

            foreach my $seqr (@$seqlist) {
                if ($debug) {
                    print STDERR "" . sequenceToStr($seqr) . "\n";
                }
                my ($seqstart,$seqlen) = @$seqr;

                if ($doewma && $seqlen > 0) {
                    #
                    # Spread the sequence over every bin it overlaps.
                    # Ages are seconds before $now; anything that would
                    # lie in the future is clipped.
                    #
                    my $newest_age = $now - ($seqstart + $seqlen);
                    if ($newest_age < 0) {
                        $newest_age = 0;
                    }
                    my $oldest_age = $now - $seqstart;
                    if ($oldest_age < $newest_age) {
                        $oldest_age = $newest_age;
                    }
                    my $first_bin = int($newest_age / $BIN_SECONDS);
                    my $last_bin  = int($oldest_age / $BIN_SECONDS);

                    foreach my $bin ($first_bin .. $last_bin) {
                        # portion of [newest_age, oldest_age] inside bin
                        my $lo = $bin * $BIN_SECONDS;
                        if ($lo < $newest_age) {
                            $lo = $newest_age;
                        }
                        my $hi = ($bin + 1) * $BIN_SECONDS;
                        if ($hi > $oldest_age) {
                            $hi = $oldest_age;
                        }
                        if (!defined($binlen{$s}[$bin])) {
                            $binlen{$s}[$bin] = 0;
                        }
                        $binlen{$s}[$bin] += $hi - $lo;
                    }
                }

                # accumulate totals and stuff for jitter...
                ++($nstats{"totseqnum_$s"});
                $nstats{"totseqlen_$s"} += $seqlen;
                if ($seqlen < $JITTER_DETECTION_TIME) {
                    ++($nstats{"jitseqnum_$s"});
                }
            }
        }

        # Average sequence lengths; only reported in the debug output.
        foreach my $s (@states) {
            if ($nstats{"totseqnum_$s"} > 0) {
                $nstats{"avgseqlen_$s"} =
                    $nstats{"totseqlen_$s"} / $nstats{"totseqnum_$s"};
            }
            else {
                $nstats{"avgseqlen_$s"} = 0;
            }
        }

        if ($debug) {
            print STDERR
                "$nodeid: #failseq=" . $nstats{'totseqnum_failure'} .
                ",failseqtotlen=" . $nstats{'totseqlen_failure'} .
                ",#succseq=" . $nstats{'totseqnum_success'} .
                ",succseqtotlen=" . $nstats{'totseqlen_success'} .
                "\n";
            print STDERR
                "$nodeid: failseqavglen=" . $nstats{'avgseqlen_failure'} .
                "succseqavglen=" . $nstats{'avgseqlen_success'} . "\n";
            print STDERR
                "$nodeid: failjitseqnum=" . $nstats{'jitseqnum_failure'} .
                ",succjitseqnum=" . $nstats{'jitseqnum_success'} . "\n";
        }

        #
        # Each jittery sequence costs JITTER_DEDUCTION_FACTOR.  We cap the
        # penalty at JITTER_DEDUCTION_CAP total, because we don't want to
        # penalize nodes THAT much.
        #
        my $jitter_deduction = 0;
        if ($dojitter) {
            $jitter_deduction =
                ($nstats{'jitseqnum_failure'} + $nstats{'jitseqnum_success'})
                * $JITTER_DEDUCTION_FACTOR;

            if ($jitter_deduction > $JITTER_DEDUCTION_CAP) {
                if ($debug) {
                    print STDERR "$nodeid: jitter deduction = $jitter_deduction;" .
                        " reducing to $JITTER_DEDUCTION_CAP\n";
                }
                $jitter_deduction = $JITTER_DEDUCTION_CAP;
            }
            elsif ($debug) {
                print STDERR "$nodeid: jitter deduction = $jitter_deduction\n";
            }
        }

        #
        # Unavailability.  With no recorded time there is nothing to
        # divide by, so default to "fully unavailable" and only overwrite
        # that when there is data.
        #
        my $unavail = 1;
        if ($doewma) {
            my $nfail = scalar(@{$binlen{'failure'}});
            my $nsucc = scalar(@{$binlen{'success'}});
            my $nbins = ($nfail > $nsucc) ? $nfail : $nsucc;

            my $weight = 1;
            my $weightsum = 0;
            my $weighted_unavail = 0;
            foreach my $bin (0 .. $nbins - 1) {
                my $failtime = $binlen{'failure'}[$bin] || 0;
                my $succtime = $binlen{'success'}[$bin] || 0;
                my $bintime  = $failtime + $succtime;
                if ($bintime > 0) {
                    $weighted_unavail += $weight * ($failtime / $bintime);
                    $weightsum += $weight;
                }
                # each successively older bin counts a bit less
                $weight *= (1 - $EWMA_ALPHA);
            }
            if ($weightsum > 0) {
                $unavail = $weighted_unavail / $weightsum;
            }
        }
        else {
            my $tottime = $nstats{'totseqlen_failure'} +
                $nstats{'totseqlen_success'};
            if ($tottime > 0) {
                $unavail = $nstats{'totseqlen_failure'} / $tottime;
            }
        }

        if ($debug) {
            print STDERR "$nodeid: unavail = " . $unavail . "\n";
        }

        $rank{$nodeid} = $unavail + $jitter_deduction;
    }

    # finally, sort by rank (node id breaks ties) and return a list of tuples.
    my @retval = ();
    foreach my $n (sort { $rank{$a} <=> $rank{$b} or $a cmp $b } keys(%rank)) {
        push(@retval, { 'nodeid' => $n, 'rank' => $rank{$n} });
    }

    return \@retval;
}
sub sequenceToStr($) {
my $seq = shift;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment