Commit d258dde6 authored by Kevin Atkinson's avatar Kevin Atkinson

Changed format of email sent to user on errors. The error will now

appear instead of the generic message when I am confident it is
accurate.  The subject line will also change to reflect the cause of
an error.

Avoid sending mail to testbed-ops during failed swap related evenets
in some cases.  It will instead be sent to a new mailing list
testbed-errors.

Added a new row in the experiment info table "Last Error:" which
states the cause of the error, and links to a new page displaying the
error.

Made some assign/assign_wrapper errors more informative.

The error (as determined by tblog) is now stored in the database in a
more structured fashion.  This inlcudes adding a column for the session
(in the log table) to testbed_stats to link eash swap event with the
logs and possible the error.

Other changes to the database, see sql/database-migrate.txt
parent 9d70c179
......@@ -4740,6 +4740,9 @@ sub GatherSwapStats($$$$$;$)
$TBDB_STATS_STARTCLOCK = time();
return;
}
my $session = tblog_session();
$session = 'NULL' unless defined $session;
local $DBQUERY_MAXTRIES = 0;
......@@ -4770,7 +4773,8 @@ sub GatherSwapStats($$$$$;$)
if ($ecode) {
DBQueryWarn("update experiment_stats set ".
" swap_errors=swap_errors+1, ".
" swap_exitcode=$ecode ".
" swap_exitcode=$ecode, ".
" last_error=$session ".
"where pid='$pid' and eid='$eid' and exptidx=$exptidx");
#
......@@ -4989,11 +4993,14 @@ sub GatherSwapStats($$$$$;$)
$tmp = TBDB_STATS_SWAPIN;
}
DBQueryWarn("update experiment_stats ".
"set ${tmp}_count=${tmp}_count+1, ".
" ${tmp}_last=now(), ".
" swapin_duration=swapin_duration+${duration}, ".
" swap_exitcode=0 ".
" swap_exitcode=0, ".
" last_error=NULL ".
"where pid='$pid' and eid='$eid' and ".
" exptidx=$exptidx");
}
......@@ -5047,9 +5054,9 @@ sub GatherSwapStats($$$$$;$)
DBQueryWarn("insert into testbed_stats ".
"(idx, uid, start_time, end_time, exptidx, rsrcidx, ".
" action, exitcode) ".
" action, exitcode, log_session) ".
"values (0, '$uid', $starttime, now(), $exptidx, $rsrcidx, ".
" '$mode', $ecode)");
" '$mode', $ecode, $session)");
# We do these here since even failed operations implies activity
DBQueryWarn("update user_stats set last_activity=now() ".
......
......@@ -104,6 +104,17 @@ CREATE TABLE cameras (
PRIMARY KEY (name,building,floor)
) TYPE=MyISAM;
--
-- Table structure for table `causes`
--
CREATE TABLE causes (
cause varchar(16) NOT NULL default '',
cause_desc varchar(32) NOT NULL default '',
PRIMARY KEY (cause),
UNIQUE KEY cause_desc (cause_desc)
) TYPE=MyISAM;
--
-- Table structure for table `cdroms`
--
......@@ -336,6 +347,21 @@ CREATE TABLE emulab_indicies (
PRIMARY KEY (name)
) TYPE=MyISAM;
--
-- Table structure for table `errors`
--
CREATE TABLE errors (
session int(10) unsigned NOT NULL default '0',
stamp int(10) unsigned NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
script smallint(3) NOT NULL default '0',
cause varchar(16) NOT NULL default '',
confidence float NOT NULL default '0',
mesg text NOT NULL,
PRIMARY KEY (session)
) TYPE=MyISAM;
--
-- Table structure for table `event_eventtypes`
--
......@@ -455,6 +481,7 @@ CREATE TABLE experiment_stats (
elabinelab_exptidx int(10) unsigned default NULL,
security_level tinyint(1) NOT NULL default '0',
archive_idx int(10) unsigned default NULL,
last_error int(10) unsigned default NULL,
PRIMARY KEY (eid,pid,exptidx),
KEY exptidx (exptidx),
KEY rsrcidx (rsrcidx)
......@@ -974,7 +1001,7 @@ CREATE TABLE location_info (
CREATE TABLE log (
seq int(10) unsigned NOT NULL auto_increment,
stamp int(10) unsigned NOT NULL default '0',
pidx int(11) NOT NULL default '0',
exptidx int(11) NOT NULL default '0',
uid int(11) default NULL,
session int(10) unsigned NOT NULL default '0',
invocation int(10) unsigned NOT NULL default '0',
......@@ -984,7 +1011,7 @@ CREATE TABLE log (
sublevel tinyint(2) NOT NULL default '0',
priority smallint(3) NOT NULL default '0',
inferred tinyint(1) NOT NULL default '0',
cause varchar(32) NOT NULL default '',
cause varchar(16) NOT NULL default '',
type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal',
relevant tinyint(1) NOT NULL default '0',
mesg text NOT NULL,
......@@ -1962,6 +1989,7 @@ CREATE TABLE testbed_stats (
action varchar(16) NOT NULL default '',
exitcode tinyint(3) default '0',
uid varchar(8) NOT NULL default '',
log_session int(10) unsigned default NULL,
PRIMARY KEY (idx),
KEY rsrcidx (rsrcidx),
KEY exptidx (exptidx)
......
......@@ -4,6 +4,18 @@
---------------------------------------------------------
-- Server version 3.23.58-log
--
-- Dumping data for table `causes`
--
REPLACE INTO causes VALUES ('temp','Temp Resource Shortage');
REPLACE INTO causes VALUES ('user','User Error');
REPLACE INTO causes VALUES ('internal','Internal Error');
REPLACE INTO causes VALUES ('software','Software Problem');
REPLACE INTO causes VALUES ('hardware','Hardware Problem');
REPLACE INTO causes VALUES ('unknown','Cause Unknown');
--
-- Dumping data for table `comments`
--
......@@ -119,6 +131,7 @@ REPLACE INTO event_objecttypes VALUES (17,'BGMON');
--
REPLACE INTO exported_tables VALUES ('causes');
REPLACE INTO exported_tables VALUES ('comments');
REPLACE INTO exported_tables VALUES ('event_eventtypes');
REPLACE INTO exported_tables VALUES ('event_objecttypes');
......
......@@ -3173,3 +3173,55 @@ last_net_act,last_cpu_act,last_ext_act);
altsrc tinytext,
PRIMARY KEY (name)
) TYPE=MyISAM;
???: Add stuff to store the error, as determined by tblog, in a more
structured fashion:
CREATE TABLE errors (
session int unsigned NOT NULL,
stamp int unsigned NOT NULL,
exptidx int NOT NULL,
script smallint(3) NOT NULL,
cause varchar(16) NOT NULL,
confidence float NOT NULL,
mesg text NOT NULL,
primary key (session)
);
alter table experiment_stats
add last_error int unsigned;
alter table testbed_stats
add log_session int unsigned;
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause),
unique key (cause_desc)
);
ALTER TABLE log
change cause cause varchar(16) NOT NULL,
change pidx exptidx int NOT NULL default '0';
INSERT INTO exported_tables values ('causes');
INSERT INTO causes values ('temp', 'Temp Resource Shortage');
INSERT INTO causes values ('user', 'User Error');
INSERT INTO causes values ('internal', 'Internal Error');
INSERT INTO causes values ('software', 'Software Problem');
INSERT INTO causes values ('hardware', 'Hardware Problem');
INSERT INTO causes values ('unknown', 'Cause Unknown');
The new code will than store all errors in the new table. Before
they where stored as a special entry in the "log" table. To copy
them over into the "errors" table you may also want to run:
perl errors-migrate.pl.
If the script completed with no errors you can than safely purge
those entries from the "log" table using:
delete from log where type = 'thecause';
#!/usr/bin/perl -w
use strict;
use DBI;
my $database = 'tbdb';
my $username = 'root';
my $passwd = '';
my $dbh = DBI->connect("DBI:mysql:database=$database", $username, $passwd);
my $sth = $dbh->prepare("select * from scripts");
$sth->execute;
my %script_map;
my @script_map;
while ((my $ref = $sth->fetchrow_arrayref)) {
$script_map{$ref->[1]} = $ref->[0];
$script_map[$ref->[0]] = $ref->[1];
}
$script_map{unknown} = 0;
$script_map[0] = 'unknown';
$sth = $dbh->prepare("select stamp,session,exptidx,mesg from log where type = 'thecause' order by session");
$sth->execute;
my $sth2 = $dbh->prepare("select script_name,cause,type from log natural join scripts where session = ? and relevant order by seq desc");
my $ins = $dbh->prepare("insert into errors (session,stamp,exptidx,script,cause,confidence,mesg) values (?,?,?,?,?,?,?)");
my $ready = 0;
while ((my $ref = $sth->fetchrow_arrayref)) {
my ($stamp,$session,$exptidx,$mesg) = @$ref;
my $script = "unknown";
my $cause = "unknown";
my $confidence = 0.0;
local $_ = $mesg;
if (/^\d+: ([^:]+)/) {
$script = $1;
} else {
if (s/\nCause: (\S+)\nConfidence: (\S+)$//) {
$cause = $1;
$confidence = $2;
}
s/^ //gm;
$mesg = $_;
($_) = split /\n\s*\n/;
if (/\(([^\)]+)\)$/ && defined $script_map{$1}) {
$script = $1;
} else {
warn "Unable to find script in: $_\n" unless /No clue as to what went wrong/;
}
$sth2->execute($session);
my @rel;
while ((my $r = $sth2->fetchrow_hashref)) {
push @rel, {%$r};
}
if (@rel) {
warn "Script name mismatch: $rel[0]{script_name} ne $script in: $_\n"
if $rel[0]{script_name} ne $script;
$script = $rel[0]{script_name};
my $conf = 0.5;
if ($script =~ /^(assign|parse)/) {
$conf = 0.9;
} elsif ($script =~ /^(os_setup)/) {
$conf = 0.2;
}
my $t = '';
my $c = '';
foreach (@rel) {
$ready = 1 if $_->{cause} && $_->{cause} ne 'unknown';
if (!$c && $_->{cause}) {
$c = $_->{cause};
$t = $_->{type};
} elsif ($_->{cause} &&
$_->{type} eq $t && $_->{cause} ne $c) {
$c = 'unknown';
}
}
$c = 'unknown' unless $c;
if ($cause eq 'unknown' && $ready) {
$cause = $c;
$confidence = $conf;
} elsif ($cause ne 'unknown') {
warn "Cause mismatch" unless $cause eq $c;
warn "Confidence mismatched" unless $conf == $confidence;
}
}
}
$ins->execute($session,$stamp,$exptidx,$script_map{$script},$cause,$confidence,$mesg);
}
......@@ -2,8 +2,8 @@
CREATE TABLE log (
seq int unsigned NOT NULL auto_increment,
stamp int unsigned NOT NULL,
pidx int NOT NULL, -- unique project id
stamp int unsigned NOT NULL, -- timestamp
exptidx int NOT NULL, -- unique experment id
uid int default NULL, -- numeric user id
session int unsigned NOT NULL, -- session id (1)
invocation int unsigned NOT NULL, -- invocation id - unique id for a
......@@ -19,13 +19,14 @@ CREATE TABLE log (
-- given the non-default priority
-- 3 - log created with a normal print and
-- given the default priority
cause varchar(32) NOT NULL, -- string identifying the "cause" of the error
cause varchar(16) NOT NULL, -- string identifying the cause of the error
type enum('normal','entering','exiting','thecause','extra','summary')
NOT NULL default 'normal',-- type of error:
-- normal: normal error generated by tblog
-- entering: auto gen when a script starts
-- exiting: auto gen when a script exists (2)
-- thecause: created by tblog_find_error
-- thecause: no longer used, see the
-- "errors" table
-- extra: extra information to normal info
-- at a lower level
-- summary: useful summary of errors,
......@@ -52,20 +53,48 @@ CREATE TABLE log (
-- seq stamp [session] invocation priority inferred cause type relevant mesg
-- ...
CREATE TABLE errors (
session int unsigned NOT NULL, -- session id
stamp int unsigned NOT NULL, -- timestamp
exptidx int NOT NULL, -- unique experment id
script smallint(3) NOT NULL, -- numeric id for the script name
cause varchar(16) NOT NULL, -- string identifying the cause of the error
confidence float NOT NULL, -- number between 0 and 1 indicating how
-- confident tblog is that this it
-- correctly identified the error.
mesg text NOT NULL, -- the text of the message
primary key (session)
);
CREATE TABLE scripts (
script smallint(3) NOT NULL auto_increment,
script_name varchar(24) NOT NULL default '',
PRIMARY KEY (script),
UNIQUE KEY id (name)
UNIQUE KEY id (script_name)
);
CREATE TABLE priorities (
priority smallint(3) NOT NULL,
priority_name varchar(8) NOT NULL,
primary key (priority),
unique key (name)
unique key (priority_name)
);
CREATE TABLE causes (
cause varchar(16) NOT NULL,
cause_desc varchar(32) NOT NULL,
primary key (cause)
unique key (cause_desc)
);
alter table experiment_stats
add last_error int unsigned; -- session id of last error, otherwise 0
alter table testbed_stats
add log_session int unsigned;
replace into priorities values (000, 'EMERG');
replace into priorities values (100, 'ALERT');
replace into priorities values (200, 'CRIT');
......@@ -75,4 +104,17 @@ replace into priorities values (500, 'NOTICE');
replace into priorities values (600, 'INFO');
replace into priorities values (700, 'DEBUG');
replace into causes values ('temp', 'Temp Resource Shortage');
replace into causes values ('user', 'User Error');
replace into causes values ('internal', 'Internal Error');
replace into causes values ('software', 'Software Problem');
replace into causes values ('hardware', 'Hardware Problem');
replace into causes values ('unknown', 'Cause Unknown');
replace into exported_tables values ('priorities');
replace into exported_tables values ('causes');
replace into exported_tables values ('state_timeouts');
......@@ -785,7 +785,13 @@ while (1) {
my $retval = RunAssign();
if ($retval != 0) {
fatal({type=>'extra', cause=>'user'},
"Experiment can not be run on an empty testbed. Please fix the experiment.");
"This experiment cannot be instantiated on this ".
"testbed. You have most likely asked for hardware ".
"this testbed does not have, such as nodes of a type ".
"it does not contain, or nodes with too many network ".
"interfaces. You will need to modify this experiment ".
"before it can be swapped in - re-submitting the ".
"experiment as-is will always result in failure.");
}
print("Assign succeeded on an empty testbed.\n");
$precheck = 0;
......
......@@ -92,10 +92,16 @@ if ($exitcode) {
dblog(TBLOG_ERR, {}, $err) unless ($obvious_error);
# create a log entry that assign failed
my %parms;
$parms{sublevel} = -1;
$parms{type} = 'extra' unless $obvious_error;
dblog(TBLOG_ERR, \%parms, "Assign Failed.");
dblog(TBLOG_ERR, {sublevel => -1}, "Assign Failed.");
dblog(TBLOG_ERR, {sublevel => -1, type=>'extra'},
"Failed to find a set of physical testbed nodes to run your ".
"experiment on. This might mean that there are not enough ".
"nodes or switch resources free, or your experiment might ".
"require certain hardware which is not available. If you ".
"believe this message is in error, contact @TBOPSEMAIL@.")
unless $obvious_error;
}
exit $exitcode;
......
......@@ -874,24 +874,21 @@ sub cleanup()
# Clear the logfile so the webpage stops.
TBExptClearLogFile($pid, $eid);
#
# Figure out the error if possible
#
my $error_data = tblog_find_error();
#
# Send a message to the testbed list.
#
SENDMAIL("$user_name <$user_email>",
"Experiment Configure Failure: $pid/$eid",
"Please look at the log below to see what happened. If the error\n".
"resulted from a lack of free nodes, you can use this web page to\n".
"get a summary of free nodes:\n\n".
" ${TBBASE}/nodecontrol_list.php3\n".
"\n".
"Please do not try again until you see enough nodes free. Or, you\n".
"can use the batch system to swap your experiment in when enough\n".
"nodes are free:\n\n".
" ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html".
"#BatchMode\n",
"$user_name <$user_email>",
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
tblog_email_error($error_data,
"$user_name <$user_email>",
"Config Failure", "$pid/$eid",
"$user_name <$user_email>",
"",
"Cc: $TBOPS",
($logname, "assign.log", "wanassign.log", $nsfile));
#
# Back up the work dir for post-mortem debugging.
......@@ -1224,8 +1221,6 @@ END {
}
my $saved_exitcode = $?;
tblog_find_error() if $?;
if ($cleaning) {
#
# We are screwed; a recursive error. Someone will have to clean
......@@ -1241,4 +1236,3 @@ END {
cleanup();
$? = $saved_exitcode;
}
This diff is collapsed.
......@@ -24,6 +24,7 @@ use Exporter;
@ISA = "Exporter";
@EXPORT = qw (tblog tberror tberr tbwarn tbwarning tbnotice tbinfo tbdebug
tbdie
tblog_session
TBLOG_EMERG TBLOG_ALERT TBLOG_CRIT TBLOG_ERR
TBLOG_WARNING TBLOG_NOTICE TBLOG_INFO TBLOG_DEBUG);
......@@ -105,6 +106,14 @@ sub scriptname() {
if_defined($SCRIPTNAME, $ENV{TBLOG_SCRIPTNAME});
}
#
#
#
sub tblog_session() {
return $ENV{TBLOG_SESSION};
}
#
# Dummy dblog, does nothing in this module
# Once the real "libtblog.pm" is used than this will be replaced
......
......@@ -85,7 +85,7 @@ sub SENDMAIL($$$;$$@)
if (defined($From) && $From) {
print MAIL "From: $From\n";
}
if (defined($Headers)) {
if (defined($Headers) && length($Headers) > 0) {
print MAIL "$Headers\n";
}
print MAIL "X-NetBed: $SCRIPTNAME\n";
......
......@@ -1302,6 +1302,11 @@ sub cleanup()
#
TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR);
#
# Figure out the error if possible
#
my $error_data = tblog_find_error();
#
# Send a message to the testbed list. Append the logfile.
#
......@@ -1314,22 +1319,13 @@ sub cleanup()
$rcops = "\nCc: $TBROBOCOPS";
}
SENDMAIL("$user_name <$user_email>",
"Swap ${inout} Failure: $pid/$eid",
"Please look at the log below to see what happened. If the error\n".
"resulted from a lack of free nodes, you can use this web page to\n".
"get a summary of free nodes:\n\n".
" ${TBBASE}/nodecontrol_list.php3\n".
"\n".
"Please do not try again until you see enough nodes free. Or, you\n".
"can use the batch system to swap your experiment in when enough\n".
"nodes are free:\n\n".
" ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html".
"#BatchMode\n",
($idleswap ? $TBOPS : "$user_name <$user_email>"),
"Cc: $expt_head_name <$expt_head_email>\n".
"Cc: $TBOPS $rcops",
(($logname), (defined($modnsfile) ? ($modnsfile) : ())));
tblog_email_error($error_data,
"$user_name <$user_email>",
"Swap ${inout} Failure", "$pid/$eid",
($idleswap ? $TBOPS : "$user_name <$user_email>"),
"Cc: $expt_head_name <$expt_head_email> $rcops",
"Cc: $TBOPS $rcops",
(($logname), (defined($modnsfile) ? ($modnsfile) : ())));
if ($modifyHosed) {
#
......@@ -1511,8 +1507,6 @@ END {
}
my $saved_exitcode = $?;
tblog_find_error() if $?;
if ($cleaning) {
#
# We are screwed; a recursive error. Someone will have to clean
......
......@@ -113,7 +113,7 @@ if ($expstate) {
WRITESUBMENUBUTTON("View Activity Logfile",
"showlogfile.php3?pid=$exp_pid&eid=$exp_eid");
}
if ($state == $TB_EXPTSTATE_ACTIVE) {
WRITESUBMENUBUTTON("Visualization, NS File, Mapping",
"shownsfile.php3?pid=$exp_pid&eid=$exp_eid");
......
<?php
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# All rights reserved.
#
include("defs.php3");
include("showstuff.php3");
#
# Standard Testbed Header
#
PAGEHEADER("Last Error");
#
# Only known and logged in users can end experiments.
#
$uid = GETLOGIN();
LOGGEDINORDIE($uid);
$isadmin = ISADMIN($uid);
#
# Verify page arguments.
#
if (!TBvalid_pid($pid)) {
PAGEARGERROR("Invalid project ID.");
}
if (!TBvalid_eid($eid)) {
PAGEARGERROR("Invalid experiment ID.");
}
#
# Check to make sure this is a valid PID/EID tuple.
#
$query_result =
DBQueryFatal("SELECT * FROM experiments WHERE ".
"eid='$eid' and pid='$pid'");
if (mysql_num_rows($query_result) == 0) {
USERERROR("The experiment $eid is not a valid experiment ".
"in project $pid.", 1);
} else {
$row = mysql_fetch_array($query_result);
$exptidx = $row[idx];
}
$expstate = TBExptState($pid, $eid);
#
# Verify that this uid is a member of the project for the experiment
# being displayed.
#
if (!$isadmin) {
$query_result =
DBQueryFatal("SELECT pid FROM group_membership ".
"WHERE uid='$uid' and pid='$pid'");
if (mysql_num_rows($query_result) == 0) {
USERERROR("You are not a member of Project $pid!", 1);
}
}
$query_result =
DBQueryFatal("SELECT e.cause,e.confidence,e.mesg,cause_desc FROM experiment_stats as s,errors as e, causes as c ".
"WHERE s.exptidx = $exptidx and e.cause = c.cause and s.last_error = e.session");
if (mysql_num_rows($query_result) != 0) {
$row = mysql_fetch_array($query_result);
echo "<h2>$row[cause_desc]</h2>\n";
echo "<pre>\n";
echo htmlspecialchars($row[mesg])."\n";
echo "\n";
echo "Cause: $row[cause]\n";
echo "Confidence: $row[confidence]\n";
echo "</pre>\n";
} else {
echo "Nothing\n";
}
#
# Standard Testbed Footer
#
PAGEFOOTER();
?>
......@@ -806,6 +806,19 @@ function SHOWEXP($pid, $eid, $short = 0, $sortby = "") {
else
$expt_locked = "";
$query_result =
DBQueryFatal("select cause_desc ".
"from experiment_stats as s,errors,causes ".
"where s.exptidx = $exptidx ".
"and errors.cause = causes.cause ".
"and s.last_error = errors.session");
if ($row = mysql_fetch_array($query_result)) {
$err_cause = $row[0];
} else {
$err_cause = '';
}
#
# Generate the table.
#
......@@ -905,6 +918,13 @@ function SHOWEXP($pid, $eid, $short = 0, $sortby = "") {
<td class=\"left\">$exp_state $expt_locked</td>
</tr>\n";
if ($err_cause) {
echo "<tr>
<td>Last Error: </td>
<td class=\"left\"><a href=\"showlasterror.php3?pid=$pid&eid=$eid\">$err_cause</a></td>
</tr>\n";
}
if ($linktest_pid) {
$linktest_running = "<b>(Linktest Running)</b>";
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment