Commit 183040de authored by Kevin Atkinson's avatar Kevin Atkinson

Many changes to tblog code. Database update needed:

1) Added summary of failed nodes is os_setup.  The cause of the error is now
classified as "user" if it is only user images that failed and the user
image failed on every pc of a particular type.  Otherwise I leave the cause
as "unknown" since it is really hard to tell what the real cause is.

2) Raised the confidence threshold for most errors so that they will appear
on the top.

3) Added a special error when an experiment is canceled.  The cause is
"canceled" and testbed-ops won't see these errors.

4) Fixed a bug in assign_wrapper where it will incorrectly report "This
experiment cannot be instantiated on this testbed..." when really the user
canceled the swapin.

5) Fixed a bug where os_setup errors where being incorrectly reported as
assign errors.  This happens when os_setup fails for some reason and
tbswap tries again, but the second time around there are not enough nodes.
So the last error is coming from assign even though the true cause of the
error is due to failed nodes.  The fix for this involved added a new column
to the log table, "attempt", which will be 1 for the first attempt and then
incremented for each new attempt.  tblog_find_error will then simply ignore
any errors with "attempt > 1".

6) Also fixed a potential problem when there is an error during the cleanup
phase by adding another column "cleanup".  tblog_find_error will
also ignore any errors with the cleanup bit set.
parent e5101688
......@@ -1237,6 +1237,8 @@ CREATE TABLE log (
seq int(10) unsigned NOT NULL auto_increment,
stamp int(10) unsigned NOT NULL default '0',
session int(10) unsigned NOT NULL default '0',
attempt tinyint(1) NOT NULL default '0',
cleanup tinyint(1) NOT NULL default '0',
invocation int(10) unsigned NOT NULL default '0',
parent int(10) unsigned NOT NULL default '0',
script smallint(3) NOT NULL default '0',
......
......@@ -2,7 +2,7 @@
--
-- Host: localhost Database: tbdb
---------------------------------------------------------
-- Server version 3.23.58-log
-- Server version 3.23.59-nightly-20050301-log
--
-- Dumping data for table `causes`
......@@ -15,6 +15,7 @@ REPLACE INTO causes VALUES ('internal','Internal Error');
REPLACE INTO causes VALUES ('software','Software Problem');
REPLACE INTO causes VALUES ('hardware','Hardware Problem');
REPLACE INTO causes VALUES ('unknown','Cause Unknown');
REPLACE INTO causes VALUES ('canceled','Canceled');
--
-- Dumping data for table `comments`
......
......@@ -3434,7 +3434,7 @@ last_net_act,last_cpu_act,last_ext_act);
alter table widearea_nodeinfo add latitude float(3,5);
alter table widearea_nodeinfo add longitude float(3,5);
4.71: Support for 802.1q tagged VLAN based virtual interfaces.
First, generalize the encapsulation field. For compatibility,
we create a new field, we'll delete the old veth_encapsulate at
......@@ -3484,3 +3484,9 @@ last_net_act,last_cpu_act,last_ext_act);
4.72: Re-add changes I made in revision 4.54 which got lost when Rob
did revision 4.70. Skip to next entry please ...
4.73: Add two columns for to "log" table for better error reporting:
alter table log
add attempt tinyint(1) not null default 0 after session,
add cleanup tinyint(1) not null default 0 after attempt;
......@@ -798,11 +798,6 @@ if ($plabcount && (keys(%virt_nodes) == $plabcount)) {
TBDebugTimeStamp("assign_loop started");
while (1) {
# Check cancel flag before continuing.
TBGetCancelFlag($pid, $eid, \$canceled);
fatal("Cancel flag set; aborting assign run!")
if ($canceled);
print "Assign Run $currentrun\n";
#
......@@ -957,7 +952,7 @@ sub RunAssign ()
kill('TERM', -$pgrp);
waitpid($childpid, 0);
print "Cancel flag set; aborting assign run!\n";
fatal("Cancel flag set; aborting assign run!");
return -1;
}
# Loop again to reap child above before exit.
......@@ -977,7 +972,7 @@ sub RunAssign ()
# Check cancel flag before continuing.
TBGetCancelFlag($pid, $eid, \$canceled);
if ($canceled) {
tbnotice("Cancel flag set; aborting assign run!\n");
fatal("Cancel flag set; aborting assign run!");
return -1;
}
......
......@@ -182,7 +182,8 @@ if (! UserDBInfo($dbuid, \$user_name, \$user_email)) {
# our time. Make sure user sees the error by exiting with 1.
#
if (system("$checkquota $dbuid") != 0) {
tberror("You are over your disk quota on $CONTROL; please cleanup!");
tberror({cause => 'user'},
"You are over your disk quota on $CONTROL; please cleanup!");
exit(1);
}
......
......@@ -64,6 +64,7 @@ of an error. The current causes are:
internal - software bug, should not happen
software - software problem, like a bad image
hardware - hardware problem
canceled - canceled
unknown - default
The cause for an error can be set directly when calling tberror by passing
......@@ -209,6 +210,8 @@ use Exporter;
tblog_start_capture tblog_stop_capture
tblog_new_process tblog_init_process tblog_exit
tblog_session tblog_lookup_error tblog_format_error
tblog_set_attempt tblog_inc_attempt tblog_get_attempt
tblog_set_cleanup tblog_get_cleanup
copy_hash
TBLOG_EMERG TBLOG_ALERT TBLOG_CRIT TBLOG_ERR
TBLOG_WARNING TBLOG_NOTICE TBLOG_INFO TBLOG_DEBUG);
......@@ -263,6 +266,8 @@ sub check_env ()
check_env_num 'TBLOG_INVOCATION';
check_env_num 'TBLOG_PARENT_INVOCATION';
check_env_num 'TBLOG_UID';
check_env_num 'TBLOG_ATTEMPT';
check_env_num 'TBLOG_CLEANUP';
check_env_num 'TBLOG_SCRIPTNUM';
check_env_def 'TBLOG_SCRIPTNAME';
check_env_def 'TBLOG_BASE_SCRIPTNAME';
......@@ -271,6 +276,7 @@ sub check_env ()
my %CAUSE_MAP = (# Don't notify testbed-ops
temp => 'temp', # resources temp unavailable
user => 'user', # user error
canceled => 'canceled', # canceled
# Notify testbed-ops
internal => 'internal', # software bug, should not happen
software => 'software', # software problem, like a bad image
......@@ -423,6 +429,8 @@ sub tblog_init_process(@) {
$ENV{TBLOG_LEVEL} = 0;
$ENV{TBLOG_EXPTIDX} = 0;
$ENV{TBLOG_UID} = 0;
$ENV{TBLOG_ATTEMPT} = 0;
$ENV{TBLOG_CLEANUP} = 0;
dblog($NOTICE, {type => 'entering'},
'Entering "', join(' ', informative_scriptname(), @ARGV), '"')
or die;
......@@ -506,6 +514,66 @@ sub tblog_set_default_cause ( $ )
$ENV{TBLOG_CAUSE} = $_[0];
}
=item tblog_set_attempt NUM
Set the attempt number to NUM
=cut
sub tblog_set_attempt ( $ )
{
check_env();
$ENV{TBLOG_ATTEMPT} = $_[0];
}
=item tblog_inc_attempt
Increment the attempt number
=cut
sub tblog_inc_attempt ()
{
check_env();
$ENV{TBLOG_ATTEMPT}++;
}
=item tblog_get_attempt
Get the attempt number
=cut
sub tblog_get_attempt ()
{
check_env();
return $ENV{TBLOG_ATTEMPT};
}
=item tblog_set_cleanup BOOL
Set the cleanup bit to BOOL.
=cut
sub tblog_set_cleanup ( $ )
{
check_env();
$ENV{TBLOG_CLEANUP} = $_[0];
}
=item tblog_get_cleanup
Get the value of the cleanup bit.
=cut
sub tblog_get_cleanup ()
{
check_env();
return $ENV{TBLOG_CLEANUP};
}
=item tblog_exit
Exits a script or sub-process. Generally called automatically when a
......@@ -600,9 +668,11 @@ sub dblog_real ( $$@ ) {
unless defined $cause;
my $query =
sprintf('insert into log (stamp,session,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '.
'VALUES (UNIX_TIMESTAMP(now()),%d,%d,%d,%d,%d,%d,%d,%d,%s,%s,%s)',
sprintf('insert into log (stamp,session,attempt,cleanup,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '.
'VALUES (UNIX_TIMESTAMP(now()),%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s,%s)',
$ENV{TBLOG_SESSION},
$ENV{TBLOG_ATTEMPT},
$ENV{TBLOG_CLEANUP},
$ENV{TBLOG_PARENT_INVOCATION},
$ENV{TBLOG_INVOCATION},
$ENV{TBLOG_SCRIPTNUM},
......@@ -698,11 +768,11 @@ sub tblog_find_error () {
my %lookup = (0 => $root);
my @log;
my $query_result = DBQueryFatal "select seq,parent,invocation,sublevel,priority,mesg,cause,script_name,type from log natural join scripts where session = $session and priority <= $NOTICE order by seq";
my $query_result = DBQueryFatal "select seq,parent,invocation,sublevel,priority,mesg,cause,script_name,type,inferred from log natural join scripts where session = $session and priority <= $NOTICE and attempt <= 1 and not cleanup order by seq";
for (my $i = 0; $i < $query_result->num_rows; $i++) {
my ($seq, $parent, $invocation, $sublevel, $priority, $mesg, $cause, $script, $type)
my ($seq, $parent, $invocation, $sublevel, $priority, $mesg, $cause, $script, $type, $inferred)
= $query_result->fetchrow;
if (not exists $lookup{$invocation}) {
......@@ -724,6 +794,7 @@ sub tblog_find_error () {
priority => $priority,
type => $type,
cause => $cause,
inferred => $inferred,
mesg => $mesg};
}
......@@ -823,20 +894,11 @@ sub tblog_find_error () {
$script = $lookup{$relevant[0]->{invocation}}->{script};
}
#
# From script determine confidence
#
$confidence = 0.5;
if ($script =~ /^(assign|parse)/) {
$confidence = 0.9;
} elsif ($script =~ /^(os_setup)/) {
$confidence = 0.2;
}
#
# Figure out the cause;
#
my $type = '';
my $inferred = -1;
# Assumes that 'extra' info will come first
......@@ -844,13 +906,32 @@ sub tblog_find_error () {
if (!$cause && $_->{cause}) {
$cause = $_->{cause};
$type = $_->{type};
} elsif ($_->{cause} &&
} elsif ($_->{cause} && $_->{type} ne 'extra' &&
$_->{type} eq $type && $_->{cause} ne $cause) {
$cause = 'unknown';
}
$inferred = $_->{inferred} if $inferred < $_->{inferred};
}
$cause = 'unknown' unless $cause;
#
# From script determine confidence
#
$confidence = 0.5;
if ($script =~ /^assign/ && ($cause eq 'temp' || $cause eq 'user')) {
$confidence = 0.9;
} elsif ($script =~ /^parse/ && $cause eq 'user') {
$confidence = 0.9;
} elsif ($script =~ /^os_setup/ && $type eq 'summary') {
$confidence = 0.9;
} elsif ($inferred == 0) {
$confidence = 0.7;
} elsif ($inferred == 1) {
$confidence = 0.6;
} elsif ($cause ne 'unknown') {
$confidence = 0.6;
}
#
# Finally print/store the relevant errors
#
......@@ -972,10 +1053,11 @@ sub tblog_email_error ( $$$$$$$@ ) {
my ($d, $to, $what, $pideid, $from, $headers, $testbed_ops, @files) = @_;
my $threshold = 0.65;
my $threshold = 0.55;
unless ($d->{confidence} > $threshold
&& ($d->{cause} eq 'temp' || $d->{cause} eq 'user'))
&& ($d->{cause} eq 'temp' || $d->{cause} eq 'user'
|| $d->{cause} eq 'canceled'))
{
$headers .= "\n" if defined $headers && length($headers) > 0;
$headers .= "$testbed_ops";
......
......@@ -76,6 +76,12 @@ my %bios_waittime = (); # Indexed by node_type.
my %reboot_waittime = (); # Indexed by osid.
my %node_types = (); # Indexed by node_id.
#
# This variable keeps track of the failed nodes of all types.
# values = 'UNKNOWN' 'RELOAD', 'BOOT', 'OTHER'
my %failed_nodes = ();
my %failed_nonfatal_nodes = ();
#
# Ah, Frisbee works so lets do auto reloading for nodes that do not have
# the proper OS loaded on it. This will be a hash of lists; for each
......@@ -553,6 +559,7 @@ if ($firewalled) {
"is resolved.\n");
$failed++;
$failed_nodes{$node} = 'UNKNOWN';
goto tballdone;
}
......@@ -591,6 +598,7 @@ if ($plabinelab) {
"in pid/eid $pid/$eid failed to boot after loading OS.\n\n".
"The nodes have been freed.\n");
$failed++;
$failed_nodes{$node} = 'UNKNOWN';
goto tballdone;
}
......@@ -739,6 +747,7 @@ if (!$TESTMODE) {
foreach my $node (@nodelist) {
tbnotice "Not waiting for $node since its reload/reboot failed!";
$failed++;
$failed_nodes{$node} = 'UNKNOWN';
delete($nodes{$node});
TBSetNodeAllocState($node, TBDB_ALLOCSTATE_DOWN());
......@@ -860,6 +869,7 @@ while ( @nodelist ) {
if ($canfail{$node} && !($canceled || $noretry)) {
push(@informuser, $node);
$failed_nonfatal_nodes{$node} = 'UNKNOWN';
tbnotice "Continuing with experiment setup anyway ...";
next;
}
......@@ -887,6 +897,7 @@ while ( @nodelist ) {
$nodeAllocStates{$node} = TBDB_ALLOCSTATE_DOWN();
$failed++;
$failed_nodes{$node} = 'UNKNOWN';
}
#
......@@ -1099,6 +1110,7 @@ elsif (@vnodelist) {
if ($canfail{$node}) {
# Send mail to testbed-ops and to the user about it.
push(@informuser, $node);
$failed_nonfatal_nodes{$node} = 'UNKNOWN';
tbnotice "Continuing with experiment setup anyway ...";
next;
}
......@@ -1108,6 +1120,7 @@ elsif (@vnodelist) {
else {
$failedvnodes++;
}
$failed_nodes{$node} = 'UNKNOWN';
}
TBDebugTimeStamp("Virtual node waiting finished");
......@@ -1159,12 +1172,96 @@ if ($count > 0) {
tballdone:
tbinfo "OS Setup Done.";
tberror "There were $failed failed nodes"
if ($failed);
tberror "There were $failedvnodes failed virtual nodes"
if ($failedvnodes);
tberror "There were $failedplab failed plab nodes"
if ($failedplab);
my %tally;
my $users_fought = 1;
my %total_osid;
my %total_type;
my $summary = '';
foreach (keys %failed_nodes) {
my $osid = $osids{$_};
my $type = $node_types{$_};
$total_osid{$osid}{failed}++;
$total_osid{$osid}{failed_fatal}++;
$total_osid{$osid}{total}++;
$total_type{$osid}{$type}{failed}++;
$total_type{$osid}{$type}{failed_fatal}++;
$total_type{$osid}{$type}{total}++;
}
foreach (keys %failed_nonfatal_nodes) {
my $osid = $osids{$_};
my $type = $node_types{$_};
$total_osid{$osid}{failed}++;
$total_osid{$osid}{failed_nonfatal}++;
$total_osid{$osid}{total}++;
$total_type{$osid}{$type}{failed}++;
$total_type{$osid}{$type}{failed_nonfatal}++;
$total_type{$osid}{$type}{total}++;
}
foreach ((keys %nodes), (keys %vnodes), (keys %plabvnodes)) {
next if $failed_nodes{$_} || $failed_nonfatal_nodes{$_};
my $osid = $osids{$_};
my $type = $node_types{$_};
$total_osid{$osid}{total}++;
$total_type{$osid}{$type}{total}++;
}
foreach my $osid (sort keys %total_osid) {
my ($osname, $pidofosid);
my $user_image = 1;
if ($total_osid{$osid}{failed} > 0) {
my $query_result =
DBQueryFatal("select osname,pid from os_info where osid='$osid'");
if ($query_result->num_rows > 0) {
($osname, $pidofosid) = $query_result->fetchrow_array();
$user_image = 0 if $pidofosid eq TBOPSPID();
} else {
$osname = $osid;
$user_image = 0;
}
$users_fought = 0 if !$user_image;
my %d = %{$total_osid{$osid}};
$summary .= "$d{failed}/$d{total} nodes with a ";
$summary .= $user_image ? "user" : "system";
$summary .= " osid of \"$osname\" failed to boot";
if ($d{failed_nonfatal} > 0) {
my $count = ($d{failed_nonfatal} == $d{failed}
? "all"
: "$d{failed_nonfatal}/$d{failed}");
$summary .= " ($count non-fatal)";
}
$summary .= ":\n";
foreach my $type (sort keys %{$total_type{$osid}}) {
my %d = %{$total_type{$osid}{$type}};
$summary .= " $d{failed}/$d{total} ${type}'s with this os failed to boot";
if ($d{failed_nonfatal} > 0) {
my $count = ($d{failed_nonfatal} == $d{failed}
? "all"
: "$d{failed_nonfatal}/$d{failed}");
$summary .= " ($count non-fatal)";
}
$summary .= ".\n";
$users_fought = 0 if $d{failed} < $d{total};
}
}
}
if ($failed || $failedvnodes || $failedplab) {
push @msg, "$failed failed nodes" if $failed;
push @msg, "$failedvnodes failed virtual nodes" if $failedvnodes;
push @msg, "$failedplab failed plab nodes" if $failedplab;
tberror ({type=>'summary', cause=>($users_fought ? 'user' : 'unknown')},
"There were ", join(', ', @msg), ".\n\n", $summary);
} elsif ($summary) {
tbwarn $summary;
}
#
# If not failing for any reason, save off swap state.
......@@ -1403,3 +1500,9 @@ sub os_setup_one($$$)
return 1;
}
......@@ -331,7 +331,8 @@ if ($waitmode) {
my $overquota = system("$checkquota $dbuid");
if ($overquota) {
tberror("You are over your disk quota on $CONTROL; please cleanup!");
tberror({cause => 'user'},
"You are over your disk quota on $CONTROL; please cleanup!");
}
#
......
......@@ -240,6 +240,7 @@ if ($swapop eq "out") {
#
# Swap out
#
tblog_set_attempt(1);
$errors = doSwapout(REAL);
}
elsif ($swapop eq "update") {
......@@ -259,6 +260,9 @@ elsif ($swapop eq "update") {
#
# Update.
#
tblog_set_attempt(1);
#
# Phase One -- swap experiment partially out.
#
......@@ -351,7 +355,8 @@ elsif ($swapop eq "in") {
TBDebugTimeStamp("tbswap $swapop finished (failed)");
exit(1);
}
tblog_set_attempt(1);
$errors = doSwapin(REAL);
#
......@@ -364,6 +369,7 @@ elsif ($swapop eq "in") {
#
while ($errors == 3 && $retries && !$canceled && !$TESTMODE) {
$retries--;
tblog_inc_attempt();
print STDERR "Cleaning up after errors; will try again.\n";
doSwapout(RETRY);
......@@ -378,6 +384,14 @@ elsif ($swapop eq "in") {
}
}
tblog_set_attempt(0);
TBGetCancelFlag($pid, $eid, \$canceled);
if ($canceled) {
tberror ({type=>'summary', cause=>'canceled'},
"Experiment swap-$swapop canceled by user.");
}
#
# Write appropriate message and exit.
#
......@@ -413,6 +427,8 @@ sub doSwapout($) {
my $type = shift; # REAL==4, CLEANUP==3, RETRY==2, UPDATE==1.
my $swapout_errors = 0;
tblog_set_cleanup(1) if $type == CLEANUP;
#
# wait for os_setup;
# this only applies if called after a failed doswapin.
......@@ -519,6 +535,7 @@ sub doSwapout($) {
# See undoFWNodes for details.
#
if ($firewalled && undoFWNodes($pid, $eid)) {
tblog_set_cleanup(0);
return 1;
}
......@@ -543,6 +560,7 @@ sub doSwapout($) {
TBExptGetSwapoutAction($pid, $eid, \%soaction);
}
if ($soaction{'command'} && doSwapoutAction($pid, $eid, %soaction)) {
tblog_set_cleanup(0);
return 1;
}
......@@ -718,6 +736,7 @@ sub doSwapout($) {
TBExptRemovePhysicalState( $pid, $eid );
}
tblog_set_cleanup(0);
return $swapout_errors;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment