Commit d258dde6 authored by Kevin Atkinson's avatar Kevin Atkinson

Changed format of email sent to user on errors.  The error will now
appear instead of the generic message when I am confident it is
accurate.  The subject line will also change to reflect the cause of
an error.

Avoid sending mail to testbed-ops during failed swap related evenets
in some cases.  It will instead be sent to a new mailing list
testbed-errors.

Added a new row in the experiment info table "Last Error:" which
states the cause of the error, and links to a new page displaying the
error.

Made some assign/assign_wrapper errors more informative.

The error (as determined by tblog) is now stored in the database in a
more structured fashion.  This inlcudes adding a column for the session
(in the log table) to testbed_stats to link eash swap event with the
logs and possible the error.

Other changes to the database, see sql/database-migrate.txt
parent 9d70c179
...@@ -4740,6 +4740,9 @@ sub GatherSwapStats($$$$$;$) ...@@ -4740,6 +4740,9 @@ sub GatherSwapStats($$$$$;$)
$TBDB_STATS_STARTCLOCK = time(); $TBDB_STATS_STARTCLOCK = time();
return; return;
} }
my $session = tblog_session();
$session = 'NULL' unless defined $session;
local $DBQUERY_MAXTRIES = 0; local $DBQUERY_MAXTRIES = 0;
...@@ -4770,7 +4773,8 @@ sub GatherSwapStats($$$$$;$) ...@@ -4770,7 +4773,8 @@ sub GatherSwapStats($$$$$;$)
if ($ecode) { if ($ecode) {
DBQueryWarn("update experiment_stats set ". DBQueryWarn("update experiment_stats set ".
" swap_errors=swap_errors+1, ". " swap_errors=swap_errors+1, ".
" swap_exitcode=$ecode ". " swap_exitcode=$ecode, ".
" last_error=$session ".
"where pid='$pid' and eid='$eid' and exptidx=$exptidx"); "where pid='$pid' and eid='$eid' and exptidx=$exptidx");
# #
...@@ -4989,11 +4993,14 @@ sub GatherSwapStats($$$$$;$) ...@@ -4989,11 +4993,14 @@ sub GatherSwapStats($$$$$;$)
$tmp = TBDB_STATS_SWAPIN; $tmp = TBDB_STATS_SWAPIN;
} }
DBQueryWarn("update experiment_stats ". DBQueryWarn("update experiment_stats ".
"set ${tmp}_count=${tmp}_count+1, ". "set ${tmp}_count=${tmp}_count+1, ".
" ${tmp}_last=now(), ". " ${tmp}_last=now(), ".
" swapin_duration=swapin_duration+${duration}, ". " swapin_duration=swapin_duration+${duration}, ".
" swap_exitcode=0 ". " swap_exitcode=0, ".
" last_error=NULL ".
"where pid='$pid' and eid='$eid' and ". "where pid='$pid' and eid='$eid' and ".
" exptidx=$exptidx"); " exptidx=$exptidx");
} }
...@@ -5047,9 +5054,9 @@ sub GatherSwapStats($$$$$;$) ...@@ -5047,9 +5054,9 @@ sub GatherSwapStats($$$$$;$)
DBQueryWarn("insert into testbed_stats ". DBQueryWarn("insert into testbed_stats ".
"(idx, uid, start_time, end_time, exptidx, rsrcidx, ". "(idx, uid, start_time, end_time, exptidx, rsrcidx, ".
" action, exitcode) ". " action, exitcode, log_session) ".
"values (0, '$uid', $starttime, now(), $exptidx, $rsrcidx, ". "values (0, '$uid', $starttime, now(), $exptidx, $rsrcidx, ".
" '$mode', $ecode)"); " '$mode', $ecode, $session)");
# We do these here since even failed operations implies activity # We do these here since even failed operations implies activity
DBQueryWarn("update user_stats set last_activity=now() ". DBQueryWarn("update user_stats set last_activity=now() ".
......
...@@ -104,6 +104,17 @@ CREATE TABLE cameras ( ...@@ -104,6 +104,17 @@ CREATE TABLE cameras (
PRIMARY KEY (name,building,floor) PRIMARY KEY (name,building,floor)
) TYPE=MyISAM; ) TYPE=MyISAM;
--
-- Table structure for table `causes`
--
CREATE TABLE causes (
cause varchar(16) NOT NULL default '',
cause_desc varchar(32) NOT NULL default '',
PRIMARY KEY (cause),
UNIQUE KEY cause_desc (cause_desc)
) TYPE=MyISAM;
-- --
-- Table structure for table `cdroms` -- Table structure for table `cdroms`
-- --
...@@ -336,6 +347,21 @@ CREATE TABLE emulab_indicies ( ...@@ -336,6 +347,21 @@ CREATE TABLE emulab_indicies (
PRIMARY KEY (name) PRIMARY KEY (name)
) TYPE=MyISAM; ) TYPE=MyISAM;
--
-- Table structure for table `errors`
--
CREATE TABLE errors (
session int(10) unsigned NOT NULL default '0',
stamp int(10) unsigned NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
script smallint(3) NOT NULL default '0',
cause varchar(16) NOT NULL default '',
confidence float NOT NULL default '0',
mesg text NOT NULL,
PRIMARY KEY (session)
) TYPE=MyISAM;
-- --
-- Table structure for table `event_eventtypes` -- Table structure for table `event_eventtypes`
-- --
...@@ -455,6 +481,7 @@ CREATE TABLE experiment_stats ( ...@@ -455,6 +481,7 @@ CREATE TABLE experiment_stats (
elabinelab_exptidx int(10) unsigned default NULL, elabinelab_exptidx int(10) unsigned default NULL,
security_level tinyint(1) NOT NULL default '0', security_level tinyint(1) NOT NULL default '0',
archive_idx int(10) unsigned default NULL, archive_idx int(10) unsigned default NULL,
last_error int(10) unsigned default NULL,
PRIMARY KEY (eid,pid,exptidx), PRIMARY KEY (eid,pid,exptidx),
KEY exptidx (exptidx), KEY exptidx (exptidx),
KEY rsrcidx (rsrcidx) KEY rsrcidx (rsrcidx)
...@@ -974,7 +1001,7 @@ CREATE TABLE location_info ( ...@@ -974,7 +1001,7 @@ CREATE TABLE location_info (
CREATE TABLE log ( CREATE TABLE log (
seq int(10) unsigned NOT NULL auto_increment, seq int(10) unsigned NOT NULL auto_increment,
stamp int(10) unsigned NOT NULL default '0', stamp int(10) unsigned NOT NULL default '0',
pidx int(11) NOT NULL default '0', exptidx int(11) NOT NULL default '0',
uid int(11) default NULL, uid int(11) default NULL,
session int(10) unsigned NOT NULL default '0', session int(10) unsigned NOT NULL default '0',
invocation int(10) unsigned NOT NULL default '0', invocation int(10) unsigned NOT NULL default '0',
...@@ -984,7 +1011,7 @@ CREATE TABLE log ( ...@@ -984,7 +1011,7 @@ CREATE TABLE log (
sublevel tinyint(2) NOT NULL default '0', sublevel tinyint(2) NOT NULL default '0',
priority smallint(3) NOT NULL default '0', priority smallint(3) NOT NULL default '0',
inferred tinyint(1) NOT NULL default '0', inferred tinyint(1) NOT NULL default '0',
cause varchar(32) NOT NULL default '', cause varchar(16) NOT NULL default '',
type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal', type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal',
relevant tinyint(1) NOT NULL default '0', relevant tinyint(1) NOT NULL default '0',
mesg text NOT NULL, mesg text NOT NULL,
...@@ -1962,6 +1989,7 @@ CREATE TABLE testbed_stats ( ...@@ -1962,6 +1989,7 @@ CREATE TABLE testbed_stats (
action varchar(16) NOT NULL default '', action varchar(16) NOT NULL default '',
exitcode tinyint(3) default '0', exitcode tinyint(3) default '0',
uid varchar(8) NOT NULL default '', uid varchar(8) NOT NULL default '',
log_session int(10) unsigned default NULL,
PRIMARY KEY (idx), PRIMARY KEY (idx),
KEY rsrcidx (rsrcidx), KEY rsrcidx (rsrcidx),
KEY exptidx (exptidx) KEY exptidx (exptidx)
......
...@@ -4,6 +4,18 @@ ...@@ -4,6 +4,18 @@
--------------------------------------------------------- ---------------------------------------------------------
-- Server version 3.23.58-log -- Server version 3.23.58-log
--
-- Dumping data for table `causes`
--
REPLACE INTO causes VALUES ('temp','Temp Resource Shortage');
REPLACE INTO causes VALUES ('user','User Error');
REPLACE INTO causes VALUES ('internal','Internal Error');
REPLACE INTO causes VALUES ('software','Software Problem');
REPLACE INTO causes VALUES ('hardware','Hardware Problem');
REPLACE INTO causes VALUES ('unknown','Cause Unknown');
-- --
-- Dumping data for table `comments` -- Dumping data for table `comments`
-- --
...@@ -119,6 +131,7 @@ REPLACE INTO event_objecttypes VALUES (17,'BGMON'); ...@@ -119,6 +131,7 @@ REPLACE INTO event_objecttypes VALUES (17,'BGMON');
-- --
REPLACE INTO exported_tables VALUES ('causes');
REPLACE INTO exported_tables VALUES ('comments'); REPLACE INTO exported_tables VALUES ('comments');
REPLACE INTO exported_tables VALUES ('event_eventtypes'); REPLACE INTO exported_tables VALUES ('event_eventtypes');
REPLACE INTO exported_tables VALUES ('event_objecttypes'); REPLACE INTO exported_tables VALUES ('event_objecttypes');
......
...@@ -3173,3 +3173,55 @@ last_net_act,last_cpu_act,last_ext_act); ...@@ -3173,3 +3173,55 @@ last_net_act,last_cpu_act,last_ext_act);
altsrc tinytext, altsrc tinytext,
PRIMARY KEY (name) PRIMARY KEY (name)
) TYPE=MyISAM; ) TYPE=MyISAM;
???: Add stuff to store the error, as determined by tblog, in a more
structured fashion:
CREATE TABLE errors (
session int unsigned NOT NULL,
stamp int unsigned NOT NULL,
exptidx int NOT NULL,
script smallint(3) NOT NULL,
cause varchar(16) NOT NULL,
confidence float NOT NULL,
mesg text NOT NULL,
primary key (session)
);
alter table experiment_stats
add last_error int unsigned;
alter table testbed_stats
add log_session int unsigned;
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause),
unique key (cause_desc)
);
ALTER TABLE log
change cause cause varchar(16) NOT NULL,
change pidx exptidx int NOT NULL default '0';
INSERT INTO exported_tables values ('causes');
INSERT INTO causes values ('temp', 'Temp Resource Shortage');
INSERT INTO causes values ('user', 'User Error');
INSERT INTO causes values ('internal', 'Internal Error');
INSERT INTO causes values ('software', 'Software Problem');
INSERT INTO causes values ('hardware', 'Hardware Problem');
INSERT INTO causes values ('unknown', 'Cause Unknown');
The new code will than store all errors in the new table. Before
they where stored as a special entry in the "log" table. To copy
them over into the "errors" table you may also want to run:
perl errors-migrate.pl.
If the script completed with no errors you can than safely purge
those entries from the "log" table using:
delete from log where type = 'thecause';
#!/usr/bin/perl -w
use strict;
use DBI;
my $database = 'tbdb';
my $username = 'root';
my $passwd = '';
my $dbh = DBI->connect("DBI:mysql:database=$database", $username, $passwd);
my $sth = $dbh->prepare("select * from scripts");
$sth->execute;
my %script_map;
my @script_map;
while ((my $ref = $sth->fetchrow_arrayref)) {
$script_map{$ref->[1]} = $ref->[0];
$script_map[$ref->[0]] = $ref->[1];
}
$script_map{unknown} = 0;
$script_map[0] = 'unknown';
$sth = $dbh->prepare("select stamp,session,exptidx,mesg from log where type = 'thecause' order by session");
$sth->execute;
my $sth2 = $dbh->prepare("select script_name,cause,type from log natural join scripts where session = ? and relevant order by seq desc");
my $ins = $dbh->prepare("insert into errors (session,stamp,exptidx,script,cause,confidence,mesg) values (?,?,?,?,?,?,?)");
my $ready = 0;
while ((my $ref = $sth->fetchrow_arrayref)) {
my ($stamp,$session,$exptidx,$mesg) = @$ref;
my $script = "unknown";
my $cause = "unknown";
my $confidence = 0.0;
local $_ = $mesg;
if (/^\d+: ([^:]+)/) {
$script = $1;
} else {
if (s/\nCause: (\S+)\nConfidence: (\S+)$//) {
$cause = $1;
$confidence = $2;
}
s/^ //gm;
$mesg = $_;
($_) = split /\n\s*\n/;
if (/\(([^\)]+)\)$/ && defined $script_map{$1}) {
$script = $1;
} else {
warn "Unable to find script in: $_\n" unless /No clue as to what went wrong/;
}
$sth2->execute($session);
my @rel;
while ((my $r = $sth2->fetchrow_hashref)) {
push @rel, {%$r};
}
if (@rel) {
warn "Script name mismatch: $rel[0]{script_name} ne $script in: $_\n"
if $rel[0]{script_name} ne $script;
$script = $rel[0]{script_name};
my $conf = 0.5;
if ($script =~ /^(assign|parse)/) {
$conf = 0.9;
} elsif ($script =~ /^(os_setup)/) {
$conf = 0.2;
}
my $t = '';
my $c = '';
foreach (@rel) {
$ready = 1 if $_->{cause} && $_->{cause} ne 'unknown';
if (!$c && $_->{cause}) {
$c = $_->{cause};
$t = $_->{type};
} elsif ($_->{cause} &&
$_->{type} eq $t && $_->{cause} ne $c) {
$c = 'unknown';
}
}
$c = 'unknown' unless $c;
if ($cause eq 'unknown' && $ready) {
$cause = $c;
$confidence = $conf;
} elsif ($cause ne 'unknown') {
warn "Cause mismatch" unless $cause eq $c;
warn "Confidence mismatched" unless $conf == $confidence;
}
}
}
$ins->execute($session,$stamp,$exptidx,$script_map{$script},$cause,$confidence,$mesg);
}
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
CREATE TABLE log ( CREATE TABLE log (
seq int unsigned NOT NULL auto_increment, seq int unsigned NOT NULL auto_increment,
stamp int unsigned NOT NULL, stamp int unsigned NOT NULL, -- timestamp
pidx int NOT NULL, -- unique project id exptidx int NOT NULL, -- unique experment id
uid int default NULL, -- numeric user id uid int default NULL, -- numeric user id
session int unsigned NOT NULL, -- session id (1) session int unsigned NOT NULL, -- session id (1)
invocation int unsigned NOT NULL, -- invocation id - unique id for a invocation int unsigned NOT NULL, -- invocation id - unique id for a
...@@ -19,13 +19,14 @@ CREATE TABLE log ( ...@@ -19,13 +19,14 @@ CREATE TABLE log (
-- given the non-default priority -- given the non-default priority
-- 3 - log created with a normal print and -- 3 - log created with a normal print and
-- given the default priority -- given the default priority
cause varchar(32) NOT NULL, -- string identifying the "cause" of the error cause varchar(16) NOT NULL, -- string identifying the cause of the error
type enum('normal','entering','exiting','thecause','extra','summary') type enum('normal','entering','exiting','thecause','extra','summary')
NOT NULL default 'normal',-- type of error: NOT NULL default 'normal',-- type of error:
-- normal: normal error generated by tblog -- normal: normal error generated by tblog
-- entering: auto gen when a script starts -- entering: auto gen when a script starts
-- exiting: auto gen when a script exists (2) -- exiting: auto gen when a script exists (2)
-- thecause: created by tblog_find_error -- thecause: no longer used, see the
-- "errors" table
-- extra: extra information to normal info -- extra: extra information to normal info
-- at a lower level -- at a lower level
-- summary: useful summary of errors, -- summary: useful summary of errors,
...@@ -52,20 +53,48 @@ CREATE TABLE log ( ...@@ -52,20 +53,48 @@ CREATE TABLE log (
-- seq stamp [session] invocation priority inferred cause type relevant mesg -- seq stamp [session] invocation priority inferred cause type relevant mesg
-- ... -- ...
CREATE TABLE errors (
session int unsigned NOT NULL, -- session id
stamp int unsigned NOT NULL, -- timestamp
exptidx int NOT NULL, -- unique experment id
script smallint(3) NOT NULL, -- numeric id for the script name
cause varchar(16) NOT NULL, -- string identifying the cause of the error
confidence float NOT NULL, -- number between 0 and 1 indicating how
-- confident tblog is that this it
-- correctly identified the error.
mesg text NOT NULL, -- the text of the message
primary key (session)
);
CREATE TABLE scripts ( CREATE TABLE scripts (
script smallint(3) NOT NULL auto_increment, script smallint(3) NOT NULL auto_increment,
script_name varchar(24) NOT NULL default '', script_name varchar(24) NOT NULL default '',
PRIMARY KEY (script), PRIMARY KEY (script),
UNIQUE KEY id (name) UNIQUE KEY id (script_name)
); );
CREATE TABLE priorities ( CREATE TABLE priorities (
priority smallint(3) NOT NULL, priority smallint(3) NOT NULL,
priority_name varchar(8) NOT NULL, priority_name varchar(8) NOT NULL,
primary key (priority), primary key (priority),
unique key (name) unique key (priority_name)
);
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause)
unique key (cause_desc)
); );
alter table experiment_stats
add last_error int unsigned; -- session id of last error, otherwise 0
alter table testbed_stats
add log_session int unsigned;
replace into priorities values (000, 'EMERG'); replace into priorities values (000, 'EMERG');
replace into priorities values (100, 'ALERT'); replace into priorities values (100, 'ALERT');
replace into priorities values (200, 'CRIT'); replace into priorities values (200, 'CRIT');
...@@ -75,4 +104,17 @@ replace into priorities values (500, 'NOTICE'); ...@@ -75,4 +104,17 @@ replace into priorities values (500, 'NOTICE');
replace into priorities values (600, 'INFO'); replace into priorities values (600, 'INFO');
replace into priorities values (700, 'DEBUG'); replace into priorities values (700, 'DEBUG');
replace into causes values ('temp', 'Temp Resource Shortage');
replace into causes values ('user', 'User Error');
replace into causes values ('internal', 'Internal Error');
replace into causes values ('software', 'Software Problem');
replace into causes values ('hardware', 'Hardware Problem');
replace into causes values ('unknown', 'Cause Unknown');
replace into exported_tables values ('priorities');
replace into exported_tables values ('causes');
replace into exported_tables values ('state_timeouts'); replace into exported_tables values ('state_timeouts');
...@@ -785,7 +785,13 @@ while (1) { ...@@ -785,7 +785,13 @@ while (1) {
my $retval = RunAssign(); my $retval = RunAssign();
if ($retval != 0) { if ($retval != 0) {
fatal({type=>'extra', cause=>'user'}, fatal({type=>'extra', cause=>'user'},
"Experiment can not be run on an empty testbed. Please fix the experiment."); "This experiment cannot be instantiated on this ".
"testbed. You have most likely asked for hardware ".
"this testbed does not have, such as nodes of a type ".
"it does not contain, or nodes with too many network ".
"interfaces. You will need to modify this experiment ".
"before it can be swapped in - re-submitting the ".
"experiment as-is will always result in failure.");
} }
print("Assign succeeded on an empty testbed.\n"); print("Assign succeeded on an empty testbed.\n");
$precheck = 0; $precheck = 0;
......
...@@ -92,10 +92,16 @@ if ($exitcode) { ...@@ -92,10 +92,16 @@ if ($exitcode) {
dblog(TBLOG_ERR, {}, $err) unless ($obvious_error); dblog(TBLOG_ERR, {}, $err) unless ($obvious_error);
# create a log entry that assign failed # create a log entry that assign failed
my %parms;
$parms{sublevel} = -1; dblog(TBLOG_ERR, {sublevel => -1}, "Assign Failed.");
$parms{type} = 'extra' unless $obvious_error;
dblog(TBLOG_ERR, \%parms, "Assign Failed."); dblog(TBLOG_ERR, {sublevel => -1, type=>'extra'},
"Failed to find a set of physical testbed nodes to run your ".
"experiment on. This might mean that there are not enough ".
"nodes or switch resources free, or your experiment might ".
"require certain hardware which is not available. If you ".
"believe this message is in error, contact @TBOPSEMAIL@.")
unless $obvious_error;
} }
exit $exitcode; exit $exitcode;
......
...@@ -874,24 +874,21 @@ sub cleanup() ...@@ -874,24 +874,21 @@ sub cleanup()
# Clear the logfile so the webpage stops. # Clear the logfile so the webpage stops.
TBExptClearLogFile($pid, $eid); TBExptClearLogFile($pid, $eid);
#
# Figure out the error if possible
#
my $error_data = tblog_find_error();
# #
# Send a message to the testbed list. # Send a message to the testbed list.
# #
SENDMAIL("$user_name <$user_email>", tblog_email_error($error_data,
"Experiment Configure Failure: $pid/$eid", "$user_name <$user_email>",
"Please look at the log below to see what happened. If the error\n". "Config Failure", "$pid/$eid",
"resulted from a lack of free nodes, you can use this web page to\n". "$user_name <$user_email>",
"get a summary of free nodes:\n\n". "",
" ${TBBASE}/nodecontrol_list.php3\n". "Cc: $TBOPS",
"\n". ($logname, "assign.log", "wanassign.log", $nsfile));
"Please do not try again until you see enough nodes free. Or, you\n".
"can use the batch system to swap your experiment in when enough\n".
"nodes are free:\n\n".
" ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html".
"#BatchMode\n",
"$user_name <$user_email>",
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
# #
# Back up the work dir for post-mortem debugging. # Back up the work dir for post-mortem debugging.
...@@ -1224,8 +1221,6 @@ END { ...@@ -1224,8 +1221,6 @@ END {
} }
my $saved_exitcode = $?; my $saved_exitcode = $?;
tblog_find_error() if $?;
if ($cleaning) { if ($cleaning) {
# #
# We are screwed; a recursive error. Someone will have to clean # We are screwed; a recursive error. Someone will have to clean
...@@ -1241,4 +1236,3 @@ END { ...@@ -1241,4 +1236,3 @@ END {
cleanup(); cleanup();
$? = $saved_exitcode; $? = $saved_exitcode;
} }
...@@ -66,9 +66,6 @@ of an error. The current causes are: ...@@ -66,9 +66,6 @@ of an error. The current causes are:
hardware - hardware problem hardware - hardware problem
unknown - default unknown - default
(Note, that at the moment, internal is the same as software, but in
the future that might change)
The cause for an error can be set directly when calling tberror by passing The cause for an error can be set directly when calling tberror by passing
in the "cause" parameter. For example: in the "cause" parameter. For example:
...@@ -208,9 +205,10 @@ use Exporter; ...@@ -208,9 +205,10 @@ use Exporter;
@ISA = "Exporter"; @ISA = "Exporter";