Commit 866cd027 authored by David Johnson's avatar David Johnson

Add support for plabmonitord to estimate its state from the plab_nodehist

table... thus it no longer starts over every time it gets restarted.
parent 45fb92c7
......@@ -21,7 +21,8 @@ SBIN_STUFF = plabslice plabnode plabrenewd plabmetrics plabstats \
LIB_STUFF = libplab.py mod_dslice.py mod_PLC.py mod_PLCNM.py \
mod_PLC4.py sshhttp.py \
plabmon_badpool.pm plabmon_goodpool.pm libplabmon.pm \
aspects.py timer_advisories.py
aspects.py timer_advisories.py \
libplabnodehist.pm
LIBEXEC_STUFF = webplabstats
......
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2007 University of Utah and the Flux Group.
# All rights reserved.
#
package libplabnodehist;
#
# This library provides functions for interpreting the plab_nodehist table.
#
use strict;
use Exporter;
use vars qw(@ISA @EXPORT);
@ISA = "Exporter";
@EXPORT = qw (getNodeHistSequences sequenceToStr);
# Must come after package declaration!
use lib '@prefix@/lib';
use libdb;
use libtestbed;
use English;
use POSIX qw(strftime);
# Configure variables
# NOTE(review): the @...@ tokens look like build-time substitution
# placeholders filled in when the library is installed -- confirm.
my $TB = "@prefix@";
my $BOSSNODE = "@BOSSNODE@";
# When true, getNodeHistSequences prints the SQL query it is about to run.
my $debug = 0;
#
# Returns up/down sequences derived from the plab_nodehist table.
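# Args (both optional):
# $1 = plab physnode id; if omitted or empty, all nodes are considered
# $2 = N, where you want only the sequences derived from the last N seconds
# of history
# Returns:
# A reference to a hash of phys_node_id->{'success'|'failure'}->array of
# sequences, plus phys_node_id->{'lastseq'} = [ status, sequence ] describing
# the most recent sequence seen for that node.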
#
# A 'sequence' is an array with the following elements:
# [ starttime, duration, members ]
# where
# starttime is seconds past Epoch; duration is seconds;
# members is the number of consecutive 'success's or 'failure's in
# the sequence.
#
sub getNodeHistSequences(;$$) {
    # Optional filters: a single phys_node_id, and a history window in seconds.
    my $fpnode = shift;
    my $fstime = shift;

    # State of the sequence currently being accumulated.
    my ($cpnode,$cstatus,$cstarttime,$cmembers) = ('','',0,0);
    my ($totalseqs,$totalnodes) = (0,0);
    my %r = ();
    my $now = time();

    # Note: the "parsing" requires that the statuses be ordered by
    # physnodeid, then time. You know, push as much work to the db as
    # possible.
    my $filter = "";
    if (defined($fpnode) && $fpnode ne '') {
        # NOTE(review): $fpnode is interpolated into the SQL unescaped;
        # callers must pass a trusted node id.
        $filter .= " and phys_node_id='$fpnode'";
    }
    if (defined($fstime)) {
        # No backslash before strftime(): in Perl that is the reference
        # operator and would put "SCALAR(0x...)" into the query instead of
        # the cutoff timestamp.
        $filter .= " and timestamp > '" .
            strftime("%Y-%m-%d %H:%M:%S", localtime($now - $fstime)) . "'";
    }

    my $q = "select phys_node_id,unix_timestamp(timestamp)," .
        " status" .
        " from plab_nodehist" .
        " where component='node' and operation='create'" .
        " $filter" .
        " order by phys_node_id,timestamp";
    if ($debug) {
        print "query = \"$q\"\n";
    }
    my $res = DBQueryFatal($q);

    print "getSequences: beginning construction at " .
        (strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";

    # Records the sequence being accumulated, ending at the given time, both
    # in the per-status list and as the node's 'lastseq'.
    my $closeseq = sub {
        my ($endtime) = @_;
        my $nseq = [ $cstarttime, ($endtime - $cstarttime), $cmembers ];
        ++$totalseqs;
        push @{$r{$cpnode}{$cstatus}}, $nseq;
        $r{$cpnode}{'lastseq'} = [ $cstatus, $nseq ];
    };

    while (my @row = $res->fetchrow_array()) {
        my ($pnode, $ts, $status) = @row;

        # save off any current sequence in the previous pnode/status
        if (($pnode ne $cpnode && $cpnode ne '')
            || ($status ne $cstatus && $cstatus ne '')) {
            # A status flip within the same pnode ends the old sequence at
            # the start of the new one; a change of pnode means the old
            # node's data ran out, so its last sequence extends to "now".
            $closeseq->(($pnode eq $cpnode) ? $ts : $now);
            ($cstatus,$cstarttime,$cmembers) = ('',-1,0);
        }

        if ($pnode ne $cpnode) {
            # new pnode
            ++$totalnodes;
            $cpnode = $pnode;
            $r{$cpnode}{'success'} = [];
            $r{$cpnode}{'failure'} = [];
            $r{$cpnode}{'lastseq'} = [];
        }

        if ($status ne $cstatus) {
            # start new sequence within this pnode.
            $cstatus = $status;
            $cstarttime = $ts;
        }
        # Every row is one more member of the current sequence.
        ++$cmembers;
    }

    # add the final sequence not caught in the loop; there is no following
    # row, so it always extends to the current time.
    if ($cpnode ne '' && $cstatus ne '') {
        $closeseq->($now);
    }

    print "getSequences: finished construction at " .
        (strftime("%Y-%m-%d %H:%M:%S",localtime())) . ".\n";
    print "getSequences: added $totalseqs sequences for $totalnodes nodes.\n";
    return \%r;
}
sub sequenceToStr($) {
    # Renders one sequence ([ starttime, duration, members ]) as
    # "START -> END (DURATIONs): MEMBERS" using local-time timestamps.
    # Returns undef when no sequence is given.
    my ($seqref) = @_;
    return undef
        unless defined($seqref);

    my ($start, $duration, $members) = @{$seqref};
    my $tsfmt = "%Y-%m-%d %H:%M:%S";
    my $begin = strftime($tsfmt, localtime($start));
    my $finish = strftime($tsfmt, localtime($start + $duration));

    return sprintf("%s -> %s (%s): %d",
                   $begin, $finish, $duration . "s", $members);
}
# Make perl happy
1;
......@@ -428,7 +428,7 @@ sub teardownnode($$;$) {
# XXX: may be bogus, but it'll do for now.
my $MININTERVAL = 300;
my $MININTERVAL = 600;
my $MAXINTERVAL = 3600;
sub calcnextcheck($$;$) {
my ($self, $pnode, $reason) = @_;
......
......@@ -58,6 +58,9 @@ use libdb;
use libtestbed;
use libplabmon;
# Grab stuff to interpret the plab_nodehist table
use libplabnodehist;
# Load pool libraries
use plabmon_badpool;
use plabmon_goodpool;
......@@ -94,7 +97,6 @@ if (defined($options{'S'})) {
#
sub updatenodepool($);
sub fatal($);
#sub loadstate($$);
#
# Global vars
......@@ -177,11 +179,21 @@ $SIG{TERM} = \&termsig;
$SIG{HUP} = \&termsig;
$SIG{INT} = \&termsig;
#
# Load state from plab_nodehist. Down in updatenodepool, we load each node's
# consecsuccess or consecfailure vars from this hash, and then we delete the
# node's history entry from this hashref.
# If we ever need the history to hang around longer, have to use a better way
# to only set consec* vars once (easy).
#
my $seqref = getNodeHistSequences();
#
# Main loop: grind around looking for nodes to check in the various
# pools. Sleep until next node is ready to be processed.
#
my $windowsize = 0;
my $i = 0;
while (1) {
my $now = time();
my $sleeptime = $MAXSLEEPTIME;
......@@ -256,94 +268,6 @@ while (1) {
sleep($CHILLTIME);
}
#
# Load monitor state from the plab_nodehist table for the given pnode.
#
sub loadstate($$) {
    # Builds the up/down sequence history for one pnode from plab_nodehist
    # and stores it in the pool's per-node record under 'history'.
    # Args:
    #   $1 = pool object (a hashref; its 'PNODES' entry is read here)
    #   $2 = phys_node_id of the node to load
    # Returns nothing.
    my $pool = shift;
    my $pnode = shift;

    # Check the arguments before dereferencing the pool.
    if (!defined($pool) || !defined($pnode)) {
        return;
    }
    my $poolpnodes = $pool->{'PNODES'};

    my $res =
        DBQueryFatal("select unix_timestamp(timestamp),component,operation," .
                     "status " .
                     " from plab_nodehist where phys_node_id='$pnode' " .
                     " order by timestamp asc");

    # Each entry is [ 'up'|'down', seqtime, conseccount ], oldest first.
    my @sequences = ();
    my $lastseqstat = '';
    my $lastts = 0;

    # NOTE(review): earlier comments here described discarding entries older
    # than two weeks, and rejecting all state when the newest entry is older
    # than three days; neither filter is implemented in this function.
    if ($res && $res->num_rows()) {
        while (my @row = $res->fetchrow_array()) {
            my ($ts, undef, undef, $status) = @row;

            if (@sequences) {
                # The time since the previous row belongs to the sequence
                # that was in progress, even if this row ends it.
                $sequences[-1][1] += ($ts - $lastts);
            }

            if (@sequences && $lastseqstat eq $status) {
                ++$sequences[-1][2];
            }
            else {
                # First row, or a status flip: start a new sequence.
                push @sequences,
                    [ (($status eq 'success') ? 'up' : 'down'), 0, 1 ];
            }
            $lastseqstat = $status;
            $lastts = $ts;
        }
    }

    # Keep only the most recent $MAX_STATE_SEQUENCES sequences; the oldest
    # ones sit at the front of the array.
    if (scalar(@sequences) > $MAX_STATE_SEQUENCES) {
        splice(@sequences, 0, scalar(@sequences) - $MAX_STATE_SEQUENCES);
    }

    $poolpnodes->{$pnode}->{'history'} = \@sequences;
    Log(STATUSLOG, "plabmonitord, $pnode, loadstate, ".
        (scalar(@sequences)).
        " sequences added.");
    return;
}
#
# Go through the PID/EID associated with the pool and grab any new nodes
# that have appeared.
......@@ -385,6 +309,26 @@ sub updatenodepool($) {
'consecsuccess' => 0,
'setupfails' => 0};
if (exists($seqref->{$pnodename})) {
my $lseqtype = $seqref->{$pnodename}{'lastseq'}->[0];
my $lseqcount = $seqref->{$pnodename}{'lastseq'}->[1]->[2];
if ($lseqtype eq 'success') {
$poolpnodes->{$pnodename}{'consecsuccess'} = $lseqcount;
}
elsif ($lseqtype eq 'failure') {
$poolpnodes->{$pnodename}{'consecfails'} = $lseqcount;
}
$pool->calcnextcheck($poolpnodes->{$pnodename});
delete $seqref->{$pnodename};
print "Loaded nodehist for $pnodename ($lseqtype/$lseqcount).\n";
my $nct = $poolpnodes->{$pnodename}->{'nextchecktime'};
print "calcnextcheck($pnodename) = ".($nct-time())."\n";
}
Log(STATUSLOG, "plabmonitord, $pnodename, addtopool, ".
"nostat, node added to pool $pool->{'NAME'}");
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment