Commit d258dde6 authored by Kevin Atkinson's avatar Kevin Atkinson

Changed format of email sent to user on errors.  The error will now
appear instead of the generic message when I am confident it is
accurate.  The subject line will also change to reflect the cause of
an error.

Avoid sending mail to testbed-ops during failed swap related evenets
in some cases.  It will instead be sent to a new mailing list
testbed-errors.

Added a new row in the experiment info table "Last Error:" which
states the cause of the error, and links to a new page displaying the
error.

Made some assign/assign_wrapper errors more informative.

The error (as determined by tblog) is now stored in the database in a
more structured fashion.  This inlcudes adding a column for the session
(in the log table) to testbed_stats to link eash swap event with the
logs and possible the error.

Other changes to the database, see sql/database-migrate.txt
parent 9d70c179
......@@ -4740,6 +4740,9 @@ sub GatherSwapStats($$$$$;$)
$TBDB_STATS_STARTCLOCK = time();
return;
}
my $session = tblog_session();
$session = 'NULL' unless defined $session;
local $DBQUERY_MAXTRIES = 0;
......@@ -4770,7 +4773,8 @@ sub GatherSwapStats($$$$$;$)
if ($ecode) {
DBQueryWarn("update experiment_stats set ".
" swap_errors=swap_errors+1, ".
" swap_exitcode=$ecode ".
" swap_exitcode=$ecode, ".
" last_error=$session ".
"where pid='$pid' and eid='$eid' and exptidx=$exptidx");
#
......@@ -4989,11 +4993,14 @@ sub GatherSwapStats($$$$$;$)
$tmp = TBDB_STATS_SWAPIN;
}
DBQueryWarn("update experiment_stats ".
"set ${tmp}_count=${tmp}_count+1, ".
" ${tmp}_last=now(), ".
" swapin_duration=swapin_duration+${duration}, ".
" swap_exitcode=0 ".
" swap_exitcode=0, ".
" last_error=NULL ".
"where pid='$pid' and eid='$eid' and ".
" exptidx=$exptidx");
}
......@@ -5047,9 +5054,9 @@ sub GatherSwapStats($$$$$;$)
DBQueryWarn("insert into testbed_stats ".
"(idx, uid, start_time, end_time, exptidx, rsrcidx, ".
" action, exitcode) ".
" action, exitcode, log_session) ".
"values (0, '$uid', $starttime, now(), $exptidx, $rsrcidx, ".
" '$mode', $ecode)");
" '$mode', $ecode, $session)");
# We do these here since even failed operations implies activity
DBQueryWarn("update user_stats set last_activity=now() ".
......
......@@ -104,6 +104,17 @@ CREATE TABLE cameras (
PRIMARY KEY (name,building,floor)
) TYPE=MyISAM;
--
-- Table structure for table `causes`
--
CREATE TABLE causes (
cause varchar(16) NOT NULL default '',
cause_desc varchar(32) NOT NULL default '',
PRIMARY KEY (cause),
UNIQUE KEY cause_desc (cause_desc)
) TYPE=MyISAM;
--
-- Table structure for table `cdroms`
--
......@@ -336,6 +347,21 @@ CREATE TABLE emulab_indicies (
PRIMARY KEY (name)
) TYPE=MyISAM;
--
-- Table structure for table `errors`
--
CREATE TABLE errors (
session int(10) unsigned NOT NULL default '0',
stamp int(10) unsigned NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
script smallint(3) NOT NULL default '0',
cause varchar(16) NOT NULL default '',
confidence float NOT NULL default '0',
mesg text NOT NULL,
PRIMARY KEY (session)
) TYPE=MyISAM;
--
-- Table structure for table `event_eventtypes`
--
......@@ -455,6 +481,7 @@ CREATE TABLE experiment_stats (
elabinelab_exptidx int(10) unsigned default NULL,
security_level tinyint(1) NOT NULL default '0',
archive_idx int(10) unsigned default NULL,
last_error int(10) unsigned default NULL,
PRIMARY KEY (eid,pid,exptidx),
KEY exptidx (exptidx),
KEY rsrcidx (rsrcidx)
......@@ -974,7 +1001,7 @@ CREATE TABLE location_info (
CREATE TABLE log (
seq int(10) unsigned NOT NULL auto_increment,
stamp int(10) unsigned NOT NULL default '0',
pidx int(11) NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
uid int(11) default NULL,
session int(10) unsigned NOT NULL default '0',
invocation int(10) unsigned NOT NULL default '0',
......@@ -984,7 +1011,7 @@ CREATE TABLE log (
sublevel tinyint(2) NOT NULL default '0',
priority smallint(3) NOT NULL default '0',
inferred tinyint(1) NOT NULL default '0',
cause varchar(32) NOT NULL default '',
cause varchar(16) NOT NULL default '',
type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal',
relevant tinyint(1) NOT NULL default '0',
mesg text NOT NULL,
......@@ -1962,6 +1989,7 @@ CREATE TABLE testbed_stats (
action varchar(16) NOT NULL default '',
exitcode tinyint(3) default '0',
uid varchar(8) NOT NULL default '',
log_session int(10) unsigned default NULL,
PRIMARY KEY (idx),
KEY rsrcidx (rsrcidx),
KEY exptidx (exptidx)
......
......@@ -4,6 +4,18 @@
---------------------------------------------------------
-- Server version 3.23.58-log
--
-- Dumping data for table `causes`
--
REPLACE INTO causes VALUES ('temp','Temp Resource Shortage');
REPLACE INTO causes VALUES ('user','User Error');
REPLACE INTO causes VALUES ('internal','Internal Error');
REPLACE INTO causes VALUES ('software','Software Problem');
REPLACE INTO causes VALUES ('hardware','Hardware Problem');
REPLACE INTO causes VALUES ('unknown','Cause Unknown');
--
-- Dumping data for table `comments`
--
......@@ -119,6 +131,7 @@ REPLACE INTO event_objecttypes VALUES (17,'BGMON');
--
REPLACE INTO exported_tables VALUES ('causes');
REPLACE INTO exported_tables VALUES ('comments');
REPLACE INTO exported_tables VALUES ('event_eventtypes');
REPLACE INTO exported_tables VALUES ('event_objecttypes');
......
......@@ -3173,3 +3173,55 @@ last_net_act,last_cpu_act,last_ext_act);
altsrc tinytext,
PRIMARY KEY (name)
) TYPE=MyISAM;
???: Add stuff to store the error, as determined by tblog, in a more
structured fashion:
CREATE TABLE errors (
session int unsigned NOT NULL,
stamp int unsigned NOT NULL,
exptidx int NOT NULL,
script smallint(3) NOT NULL,
cause varchar(16) NOT NULL,
confidence float NOT NULL,
mesg text NOT NULL,
primary key (session)
);
alter table experiment_stats
add last_error int unsigned;
alter table testbed_stats
add log_session int unsigned;
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause),
unique key (cause_desc)
);
ALTER TABLE log
change cause cause varchar(16) NOT NULL,
change pidx exptidx int NOT NULL default '0';
INSERT INTO exported_tables values ('causes');
INSERT INTO causes values ('temp', 'Temp Resource Shortage');
INSERT INTO causes values ('user', 'User Error');
INSERT INTO causes values ('internal', 'Internal Error');
INSERT INTO causes values ('software', 'Software Problem');
INSERT INTO causes values ('hardware', 'Hardware Problem');
INSERT INTO causes values ('unknown', 'Cause Unknown');
The new code will than store all errors in the new table. Before
they where stored as a special entry in the "log" table. To copy
them over into the "errors" table you may also want to run:
perl errors-migrate.pl.
If the script completed with no errors you can than safely purge
those entries from the "log" table using:
delete from log where type = 'thecause';
#!/usr/bin/perl -w
use strict;
use DBI;
my $database = 'tbdb';
my $username = 'root';
my $passwd = '';
my $dbh = DBI->connect("DBI:mysql:database=$database", $username, $passwd);
my $sth = $dbh->prepare("select * from scripts");
$sth->execute;
my %script_map;
my @script_map;
while ((my $ref = $sth->fetchrow_arrayref)) {
$script_map{$ref->[1]} = $ref->[0];
$script_map[$ref->[0]] = $ref->[1];
}
$script_map{unknown} = 0;
$script_map[0] = 'unknown';
$sth = $dbh->prepare("select stamp,session,exptidx,mesg from log where type = 'thecause' order by session");
$sth->execute;
my $sth2 = $dbh->prepare("select script_name,cause,type from log natural join scripts where session = ? and relevant order by seq desc");
my $ins = $dbh->prepare("insert into errors (session,stamp,exptidx,script,cause,confidence,mesg) values (?,?,?,?,?,?,?)");
my $ready = 0;
while ((my $ref = $sth->fetchrow_arrayref)) {
my ($stamp,$session,$exptidx,$mesg) = @$ref;
my $script = "unknown";
my $cause = "unknown";
my $confidence = 0.0;
local $_ = $mesg;
if (/^\d+: ([^:]+)/) {
$script = $1;
} else {
if (s/\nCause: (\S+)\nConfidence: (\S+)$//) {
$cause = $1;
$confidence = $2;
}
s/^ //gm;
$mesg = $_;
($_) = split /\n\s*\n/;
if (/\(([^\)]+)\)$/ && defined $script_map{$1}) {
$script = $1;
} else {
warn "Unable to find script in: $_\n" unless /No clue as to what went wrong/;
}
$sth2->execute($session);
my @rel;
while ((my $r = $sth2->fetchrow_hashref)) {
push @rel, {%$r};
}
if (@rel) {
warn "Script name mismatch: $rel[0]{script_name} ne $script in: $_\n"
if $rel[0]{script_name} ne $script;
$script = $rel[0]{script_name};
my $conf = 0.5;
if ($script =~ /^(assign|parse)/) {
$conf = 0.9;
} elsif ($script =~ /^(os_setup)/) {
$conf = 0.2;
}
my $t = '';
my $c = '';
foreach (@rel) {
$ready = 1 if $_->{cause} && $_->{cause} ne 'unknown';
if (!$c && $_->{cause}) {
$c = $_->{cause};
$t = $_->{type};
} elsif ($_->{cause} &&
$_->{type} eq $t && $_->{cause} ne $c) {
$c = 'unknown';
}
}
$c = 'unknown' unless $c;
if ($cause eq 'unknown' && $ready) {
$cause = $c;
$confidence = $conf;
} elsif ($cause ne 'unknown') {
warn "Cause mismatch" unless $cause eq $c;
warn "Confidence mismatched" unless $conf == $confidence;
}
}
}
$ins->execute($session,$stamp,$exptidx,$script_map{$script},$cause,$confidence,$mesg);
}
......@@ -2,8 +2,8 @@
CREATE TABLE log (
seq int unsigned NOT NULL auto_increment,
stamp int unsigned NOT NULL,
pidx int NOT NULL, -- unique project id
stamp int unsigned NOT NULL, -- timestamp
exptidx int NOT NULL, -- unique experment id
uid int default NULL, -- numeric user id
session int unsigned NOT NULL, -- session id (1)
invocation int unsigned NOT NULL, -- invocation id - unique id for a
......@@ -19,13 +19,14 @@ CREATE TABLE log (
-- given the non-default priority
-- 3 - log created with a normal print and
-- given the default priority
cause varchar(32) NOT NULL, -- string identifying the "cause" of the error
cause varchar(16) NOT NULL, -- string identifying the cause of the error
type enum('normal','entering','exiting','thecause','extra','summary')
NOT NULL default 'normal',-- type of error:
-- normal: normal error generated by tblog
-- entering: auto gen when a script starts
-- exiting: auto gen when a script exists (2)
-- thecause: created by tblog_find_error
-- thecause: no longer used, see the
-- "errors" table
-- extra: extra information to normal info
-- at a lower level
-- summary: useful summary of errors,
......@@ -52,20 +53,48 @@ CREATE TABLE log (
-- seq stamp [session] invocation priority inferred cause type relevant mesg
-- ...
CREATE TABLE errors (
session int unsigned NOT NULL, -- session id
stamp int unsigned NOT NULL, -- timestamp
exptidx int NOT NULL, -- unique experment id
script smallint(3) NOT NULL, -- numeric id for the script name
cause varchar(16) NOT NULL, -- string identifying the cause of the error
confidence float NOT NULL, -- number between 0 and 1 indicating how
-- confident tblog is that this it
-- correctly identified the error.
mesg text NOT NULL, -- the text of the message
primary key (session)
);
CREATE TABLE scripts (
script smallint(3) NOT NULL auto_increment,
script_name varchar(24) NOT NULL default '',
PRIMARY KEY (script),
UNIQUE KEY id (name)
UNIQUE KEY id (script_name)
);
CREATE TABLE priorities (
priority smallint(3) NOT NULL,
priority_name varchar(8) NOT NULL,
primary key (priority),
unique key (name)
unique key (priority_name)
);
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause)
unique key (cause_desc)
);
alter table experiment_stats
add last_error int unsigned; -- session id of last error, otherwise 0
alter table testbed_stats
add log_session int unsigned;
replace into priorities values (000, 'EMERG');
replace into priorities values (100, 'ALERT');
replace into priorities values (200, 'CRIT');
......@@ -75,4 +104,17 @@ replace into priorities values (500, 'NOTICE');
replace into priorities values (600, 'INFO');
replace into priorities values (700, 'DEBUG');
replace into causes values ('temp', 'Temp Resource Shortage');
replace into causes values ('user', 'User Error');
replace into causes values ('internal', 'Internal Error');
replace into causes values ('software', 'Software Problem');
replace into causes values ('hardware', 'Hardware Problem');
replace into causes values ('unknown', 'Cause Unknown');
replace into exported_tables values ('priorities');
replace into exported_tables values ('causes');
replace into exported_tables values ('state_timeouts');
......@@ -785,7 +785,13 @@ while (1) {
my $retval = RunAssign();
if ($retval != 0) {
fatal({type=>'extra', cause=>'user'},
"Experiment can not be run on an empty testbed. Please fix the experiment.");
"This experiment cannot be instantiated on this ".
"testbed. You have most likely asked for hardware ".
"this testbed does not have, such as nodes of a type ".
"it does not contain, or nodes with too many network ".
"interfaces. You will need to modify this experiment ".
"before it can be swapped in - re-submitting the ".
"experiment as-is will always result in failure.");
}
print("Assign succeeded on an empty testbed.\n");
$precheck = 0;
......
......@@ -92,10 +92,16 @@ if ($exitcode) {
dblog(TBLOG_ERR, {}, $err) unless ($obvious_error);
# create a log entry that assign failed
my %parms;
$parms{sublevel} = -1;
$parms{type} = 'extra' unless $obvious_error;
dblog(TBLOG_ERR, \%parms, "Assign Failed.");
dblog(TBLOG_ERR, {sublevel => -1}, "Assign Failed.");
dblog(TBLOG_ERR, {sublevel => -1, type=>'extra'},
"Failed to find a set of physical testbed nodes to run your ".
"experiment on. This might mean that there are not enough ".
"nodes or switch resources free, or your experiment might ".
"require certain hardware which is not available. If you ".
"believe this message is in error, contact @TBOPSEMAIL@.")
unless $obvious_error;
}
exit $exitcode;
......
......@@ -874,24 +874,21 @@ sub cleanup()
# Clear the logfile so the webpage stops.
TBExptClearLogFile($pid, $eid);
#
# Figure out the error if possible
#
my $error_data = tblog_find_error();
#
# Send a message to the testbed list.
#
SENDMAIL("$user_name <$user_email>",
"Experiment Configure Failure: $pid/$eid",
"Please look at the log below to see what happened. If the error\n".
"resulted from a lack of free nodes, you can use this web page to\n".
"get a summary of free nodes:\n\n".
" ${TBBASE}/nodecontrol_list.php3\n".
"\n".
"Please do not try again until you see enough nodes free. Or, you\n".
"can use the batch system to swap your experiment in when enough\n".
"nodes are free:\n\n".
" ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html".
"#BatchMode\n",
"$user_name <$user_email>",
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
tblog_email_error($error_data,
"$user_name <$user_email>",
"Config Failure", "$pid/$eid",
"$user_name <$user_email>",
"",
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
#
# Back up the work dir for post-mortem debugging.
......@@ -1224,8 +1221,6 @@ END {
}
my $saved_exitcode = $?;
tblog_find_error() if $?;
if ($cleaning) {
#
# We are screwed; a recursive error. Someone will have to clean
......@@ -1241,4 +1236,3 @@ END {
cleanup();
$? = $saved_exitcode;
}
......@@ -66,9 +66,6 @@ of an error. The current causes are:
hardware - hardware problem
unknown - default
(Note, that at the moment, internal is the same as software, but in
the future that might change)
The cause for an error can be set directly when calling tberror by passing
in the "cause" parameter. For example:
......@@ -208,9 +205,10 @@ use Exporter;
@ISA = "Exporter";
@EXPORT = qw (tblog tberror tberr tbwarn tbwarning tbnotice tbinfo tbdebug
tbdie tblog_set_info tblog_set_default_cause
tblog_sub_process tblog_find_error
tblog_sub_process tblog_find_error tblog_email_error
tblog_start_capture tblog_stop_capture
tblog_new_process tblog_init_process tblog_exit
tblog_session
copy_hash
TBLOG_EMERG TBLOG_ALERT TBLOG_CRIT TBLOG_ERR
TBLOG_WARNING TBLOG_NOTICE TBLOG_INFO TBLOG_DEBUG);
......@@ -222,6 +220,7 @@ use POSIX qw(isatty setsid);
use File::Basename;
use IO::Handle;
use Text::Wrap;
use Text::Tabs;
use Carp;
use strict;
......@@ -260,7 +259,7 @@ sub check_env ()
{
check_env_num 'TBLOG_LEVEL';
check_env_num 'TBLOG_SESSION';
check_env_num 'TBLOG_PIDX';
check_env_num 'TBLOG_EXPTIDX';
check_env_num 'TBLOG_INVOCATION';
check_env_num 'TBLOG_PARENT_INVOCATION';
check_env_num 'TBLOG_UID';
......@@ -273,7 +272,7 @@ my %CAUSE_MAP = (# Don't notify testbed-ops
temp => 'temp', # resources temp unavailable
user => 'user', # user error
# Notify testbed-ops
internal => 'software', # software bug, should not happen
internal => 'internal', # software bug, should not happen
software => 'software', # software problem, like a bad image
hardware => 'hardware', # hardware problem
unknown => '');
......@@ -284,6 +283,18 @@ sub normalize_cause ( $ ) {
return $cause;
}
sub indent ( $$ ) {
my ($text, $prefix) = @_;
$text =~ s/\n$//;
my $res = '';
foreach (split /\n/, $text) {
$res .= $prefix;
$res .= $_;
$res .= "\n";
}
return $res;
}
#
#
#
......@@ -410,7 +421,7 @@ sub tblog_init_process(@) {
$ENV{TBLOG_INVOCATION} = 0;
$ENV{TBLOG_PARENT_INVOCATION} = 0;
$ENV{TBLOG_LEVEL} = 0;
$ENV{TBLOG_PIDX} = 0;
$ENV{TBLOG_EXPTIDX} = 0;
$ENV{TBLOG_UID} = 0;
dblog($NOTICE, {type => 'entering'},
'Entering "', join(' ', informative_scriptname(), @ARGV), '"')
......@@ -475,12 +486,12 @@ sub tblog_set_info ( $$$ )
check_env();
my ($pid, $eid, $uid) = @_;
local $DBQUERY_MAXTRIES = 3;
$ENV{'TBLOG_PIDX'} =
$ENV{'TBLOG_EXPTIDX'} =
DBQuerySingleFatal("select idx from experiments where pid='$pid' and eid='$eid'");
$ENV{'TBLOG_UID'} = $uid;
DBQueryFatal
sprintf('update log set pidx=%d,uid=%s where session=%s',
$ENV{TBLOG_PIDX}, $ENV{TBLOG_UID}, $ENV{TBLOG_SESSION});
sprintf('update log set exptidx=%d,uid=%s where session=%s',
$ENV{TBLOG_EXPTIDX}, $ENV{TBLOG_UID}, $ENV{TBLOG_SESSION});
}
=item tblog_set_default_cause CAUSE
......@@ -589,9 +600,9 @@ sub dblog_real ( $$@ ) {
unless defined $cause;
my $query =
sprintf('insert into log (stamp,pidx,uid,session,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '.
sprintf('insert into log (stamp,exptidx,uid,session,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '.
'VALUES (UNIX_TIMESTAMP(now()),%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s,%s)',
$ENV{TBLOG_PIDX},
$ENV{TBLOG_EXPTIDX},
$ENV{TBLOG_UID},
$ENV{TBLOG_SESSION},
$ENV{TBLOG_PARENT_INVOCATION},
......@@ -627,39 +638,31 @@ sub dblog_real ( $$@ ) {
}
=item tblog_find_error [[SESSION], STORE]
=item tblog_find_error
Attempts to find the relevant error.
Parameters:
=over
=over
=item session
Will act in a way that is safe to use in an END block, that is (1)
never die, (2) don't modify the exit code.
Session id to use. Defaults to the current as given by
$ENV{TVLOG_SESSION}.
Will, also, print the results to and some additional info to STDERR
for diagnosis purposed.
=item store
The results will be stored the "errors" table and also returned in a
hash with the following..
If set store the results into the database and act in
a way that is safe to use in an END block, that is
(1) never die, (2) don't modify the exit code, Will also
print the results to and some additional info to STDERR
for diagnosis purposed. If not set simply print the results to
STDERR. Defaults to true unless the "session" parameter is also
given, in which case default to false.
=back
=back
mesg: text of the error
cause:
confidence:
script:
session:
exptidx:
err: the text of any internal errors in tblog_find_error,
otherwise undefined
To retrieve the results from the database:
SELECT ... WHERE session = <Session ID>
AND type = 'thecause' form log
SELECT ... WHERE session = <Session ID> FROM errors
The relevant errors are also flagged using "relevant" flag:
......@@ -668,23 +671,23 @@ The relevant errors are also flagged using "relevant" flag:
=cut