Commit 95a3a6a7 authored by Kevin Atkinson's avatar Kevin Atkinson

Various tblog changes:

  Make an attempt to discover what the error was before an swap-* was
  canceled, if any.  Both the main error (canceled) and the other error
  are stored in the error table.  To support this a new column in the
  error table is added "rank".  The primary error has a rank 0 while the
  other error has a rank 1.

  Make an attempt to determine when an error is a "me too" error or the
  real cause of the problem.  "Me too" errors are errors which are
  generally reported when the callie script determined that the caller
  script fails.  The caller script should have reported the error, but
  in some cases the error didn't make it into the database.  Thus if a
  "me too" is reported as the cause of a "swap-*" more info is needed to
  determine the true cause.  When a "me too" error is reported it is
  followed by a "..." on it's own line.  It is also recorded in the
  errors table under the new column "need_more_info".

  Add inferred column to the errors table.  This is the same value as
  the inferred variable in tblog_find_error.

  Add revision column to errors table to make it easy to tell which
  algorithm was used to determine the error.
parent c7a15cfd
......@@ -366,15 +366,20 @@ CREATE TABLE emulab_indicies (
CREATE TABLE errors (
session int(10) unsigned NOT NULL default '0',
rank tinyint(1) NOT NULL default '0',
stamp int(10) unsigned NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
script smallint(3) NOT NULL default '0',
cause varchar(16) NOT NULL default '',
confidence float NOT NULL default '0',
inferred tinyint(1) default NULL,
need_more_info tinyint(1) default NULL,
mesg text NOT NULL,
PRIMARY KEY (session)
tblog_revision varchar(8) NOT NULL,
PRIMARY KEY (session,rank)
) TYPE=MyISAM;
--
-- Table structure for table `event_eventtypes`
--
......
......@@ -3760,3 +3760,14 @@ last_net_act,last_cpu_act,last_ext_act);
4.98: Drop the columns from the node_types table, that were supposed
to be done in revision 4.74 above. Skip to next revision ...
4.99: Add columns to errors table for better information on errors
alter table errors
add column rank tinyint(1) not null default 0 after session,
drop primary key,
add primary key (session, rank),
add column inferred tinyint(1) null after confidence,
add column need_more_info tinyint(1) null after inferred,
add column tblog_revision varchar(8) not null after mesg;
......@@ -85,10 +85,10 @@ if ($exitcode) {
}
my $sublevel = length($space) > 0 ? 1 : 0;
if ($mesg =~ s/^warning:\s+//i) {
dblog(TBLOG_WARNING, {sublevel=>$sublevel}, $mesg);
dblog(TBLOG_WARNING, {sublevel=>$sublevel, type=>'primary'}, $mesg);
} else {
parse_error($mesg);
dblog(TBLOG_ERR, {sublevel=>$sublevel}, $mesg);
dblog(TBLOG_ERR, {sublevel=>$sublevel, type=>'primary'}, $mesg);
}
}
}
......@@ -97,7 +97,7 @@ if ($exitcode) {
# already found
if (!$obvious_error) {
parse_error($err);
dblog(TBLOG_ERR, {}, $err);
dblog(TBLOG_ERR, {type=>'primary'}, $err);
}
# create a log entry that assign failed
......
......@@ -246,6 +246,8 @@ undef $SCRIPTNAME; # signal to use $ENV{TBLOG_SCRIPTNAME}
my $DB;
my $TBLOG_PID;
my $REVISION_STR = '$Revision: 2.22 $';
#
# Internal Utility Functions
#
......@@ -983,7 +985,17 @@ The relevant errors are also flagged using "relevant" flag:
=cut
sub tblog_find_error () {
sub tblog_find_error ( ;$$ );
sub tblog_find_error ( ;$$ ) {
my ($rank, $filter) = @_;
$rank = 0 unless defined $rank;
if (defined $filter && $filter) {
$filter = "AND ($filter)";
} else {
$filter = '';
}
my $saved_exitcode = $?;
......@@ -993,6 +1005,7 @@ sub tblog_find_error () {
my $cause = '';
my $cause_desc;
my $confidence = 0.0;
my @related_errors;
local $DBQUERY_MAXTRIES = 3;
......@@ -1001,18 +1014,22 @@ sub tblog_find_error () {
check_env();
$session = $ENV{TBLOG_SESSION};
my ($tblog_revision) = $REVISION_STR =~ /Revision: ([^ \$]+)/
or die "bad REVISON string";
#
# Build the Tree
#
# Tree data structure:
# invocation = {invocation => INT, parent => INT,
# [{seq => int ...} || {seq => int, child => invox}]}
#
my $root = {invocation => 0, log => []};
my %lookup = (0 => $root);
my @log;
my $query_result = DBQueryFatal "select seq,parent,invocation,sublevel,priority,mesg,cause,script_name,type,inferred from log natural join scripts where session = $session and priority <= $NOTICE and attempt <= 1 and not cleanup order by seq";
my $query_result = DBQueryFatal "select seq,parent,invocation,sublevel,priority,mesg,cause,script_name,type,inferred from log natural join scripts where session = $session and priority <= $NOTICE and attempt <= 1 and not cleanup $filter order by seq";
for (my $i = 0; $i < $query_result->num_rows; $i++) {
......@@ -1114,6 +1131,7 @@ sub tblog_find_error () {
} elsif ($_->{priority} <= $ERR) {
push @posib_errs, $_;
push @related_errors, $_;
if ($_->{type} eq 'summary') {
last;
......@@ -1158,6 +1176,18 @@ sub tblog_find_error () {
}
$cause = 'unknown' unless $cause;
#
# Determine need_more_info from error type
#
my $need_more_info = -1;
foreach (@relevant) {
if ($_->{type} eq 'summary' || $_->{type} eq 'primary') {
$need_more_info = 0;
} elsif ($_->{type} eq 'secondary') {
$need_more_info = 1 if $need_more_info == -1;
}
}
#
# From script determine confidence
#
......@@ -1176,6 +1206,7 @@ sub tblog_find_error () {
$confidence = 0.6;
}
#
# Finally print/store the relevant errors
#
......@@ -1195,35 +1226,39 @@ sub tblog_find_error () {
# if multiline don't reformat
$error .= "$_->{mesg}\n";
}
$error .= "...\n" if $need_more_info == 1;
$prev = $_;
}
$error = "No clue as to what went wrong!\n" unless length $error > 0;
if (length $error == 0) {
if ($rank == 0) {
$error = "No clue as to what went wrong!\n";
} else {
$error = "";
}
}
chop $error;
$cause_desc = DBQuerySingleFatal
"select cause_desc from causes where cause = '$cause'";
print SERR "**** Experimental information, please ignore ****\n";
print SERR "Session ID = $session\n";
print SERR "Likely Cause of the Problem:\n";
print SERR indent($error, ' ');
print SERR "Cause: $cause\n";
print SERR "Confidence: $confidence\n";
print SERR "Script: $script\n";
print SERR "**** End experimental information ****\n";
my $script_num = DBQuerySingleFatal
sprintf("select script from scripts where script_name=%s",
DBQuoteSpecial $script);
DBQueryFatal
sprintf("insert into errors ".
"(stamp, exptidx, session, cause, confidence, script, mesg) ".
"(stamp, exptidx, session, rank, cause, confidence, inferred, need_more_info, script, mesg, tblog_revision) ".
"values ".
"(UNIX_TIMESTAMP(now()), %d, %d, %s, %f, %d, %s)",
"(UNIX_TIMESTAMP(now()), %d, %d, %d, %s, %f, %d, %s, %d, %s, %s)",
$ENV{TBLOG_EXPTIDX},
$session, DBQuoteSpecial($cause), $confidence, $script_num,
DBQuoteSpecial($error));
$session, $rank, DBQuoteSpecial($cause),
$confidence, $inferred,
($need_more_info == -1 ? 'NULL' : $need_more_info),
$script_num,
DBQuoteSpecial($error),
DBQuoteSpecial($tblog_revision));
DBQueryFatal
sprintf("update log set relevant=1 where seq in (%s)",
......@@ -1245,6 +1280,36 @@ sub tblog_find_error () {
eval {tblog $WARNING, add_prefix("tblog_find_error failed", $err)};
}
if ($rank == 0) {
my $other_error;
if ($cause eq 'canceled') {
my @to_skip = map {$_->{seq}} @related_errors;
my $to_skip = 'seq NOT IN ('.join(',',@to_skip).')';
$other_error = tblog_find_error(1, "cause <> 'canceled' AND $to_skip");
}
print SERR "**** Experimental information, please ignore ****\n";
print SERR "Session ID = $session\n";
print SERR "Likely Cause of the Problem:\n";
print SERR indent($error, ' ');
print SERR "Cause: $cause\n";
print SERR "Confidence: $confidence\n";
print SERR "Script: $script\n";
if ($cause eq 'canceled') {
print SERR "\n";
if (length($other_error->{mesg}) == 0) {
print SERR "No other error found.\n";
} else {
print SERR "Possible Error Before Cancellation:\n";
print SERR indent($other_error->{mesg}, ' ');
print SERR "Cause: $other_error->{cause}\n";
print SERR "Confidence: $other_error->{confidence}\n";
print SERR "Script: $other_error->{script}\n";
}
}
print SERR "**** End experimental information ****\n";
}
$? = $saved_exitcode;
return {session=>$session, exptidx=>$ENV{TBLOG_EXPTIDX},
mesg=>$error, cause=>$cause, cause_desc=>$cause_desc,
......@@ -1271,7 +1336,7 @@ sub tblog_lookup_error ( ;$ ) {
("select session, exptidx, mesg, e.cause, cause_desc, confidence, script_name as script".
" from errors as e, scripts as s, causes as c".
" where e.script = s.script and e.cause = c.cause ".
" and session = $session");
" and session = $session and rank = 0");
$? = $saved_exitcode;
......
......@@ -62,7 +62,7 @@ if (!$isadmin) {
$query_result =
DBQueryFatal("SELECT e.cause,e.confidence,e.mesg,cause_desc FROM experiment_stats as s,errors as e, causes as c ".
"WHERE s.exptidx = $exptidx and e.cause = c.cause and s.last_error = e.session");
"WHERE s.exptidx = $exptidx and e.cause = c.cause and s.last_error = e.session and rank = 0");
if (mysql_num_rows($query_result) != 0) {
$row = mysql_fetch_array($query_result);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment