diff --git a/db/libdb.pm.in b/db/libdb.pm.in index 29829bc8c1b58ced5b6f5caf142cdbab9be4058f..a7ac3f91837e34355f35a01917d7898e57568fa9 100644 --- a/db/libdb.pm.in +++ b/db/libdb.pm.in @@ -4740,6 +4740,9 @@ sub GatherSwapStats($$$$$;$) $TBDB_STATS_STARTCLOCK = time(); return; } + + my $session = tblog_session(); + $session = 'NULL' unless defined $session; local $DBQUERY_MAXTRIES = 0; @@ -4770,7 +4773,8 @@ sub GatherSwapStats($$$$$;$) if ($ecode) { DBQueryWarn("update experiment_stats set ". " swap_errors=swap_errors+1, ". - " swap_exitcode=$ecode ". + " swap_exitcode=$ecode, ". + " last_error=$session ". "where pid='$pid' and eid='$eid' and exptidx=$exptidx"); # @@ -4989,11 +4993,14 @@ sub GatherSwapStats($$$$$;$) $tmp = TBDB_STATS_SWAPIN; } + + DBQueryWarn("update experiment_stats ". "set ${tmp}_count=${tmp}_count+1, ". " ${tmp}_last=now(), ". " swapin_duration=swapin_duration+${duration}, ". - " swap_exitcode=0 ". + " swap_exitcode=0, ". + " last_error=NULL ". "where pid='$pid' and eid='$eid' and ". " exptidx=$exptidx"); } @@ -5047,9 +5054,9 @@ sub GatherSwapStats($$$$$;$) DBQueryWarn("insert into testbed_stats ". "(idx, uid, start_time, end_time, exptidx, rsrcidx, ". - " action, exitcode) ". + " action, exitcode, log_session) ". "values (0, '$uid', $starttime, now(), $exptidx, $rsrcidx, ". - " '$mode', $ecode)"); + " '$mode', $ecode, $session)"); # We do these here since even failed operations implies activity DBQueryWarn("update user_stats set last_activity=now() ". diff --git a/sql/database-create.sql b/sql/database-create.sql index 6db863b6b2e83946d06417f0f86eb88448c6f962..cbf29a433a1f243a009202b2819fe2f538009fe3 100644 --- a/sql/database-create.sql +++ b/sql/database-create.sql @@ -104,6 +104,17 @@ CREATE TABLE cameras ( PRIMARY KEY (name,building,floor) ) TYPE=MyISAM; +-- +-- Table structure for table `causes` +-- + +CREATE TABLE causes ( + cause varchar(16) NOT NULL default '', + cause_desc varchar(32) NOT NULL default '', + PRIMARY KEY (cause), + UNIQUE KEY cause_desc (cause_desc) +) TYPE=MyISAM; + -- -- Table structure for table `cdroms` -- @@ -336,6 +347,21 @@ CREATE TABLE emulab_indicies ( PRIMARY KEY (name) ) TYPE=MyISAM; +-- +-- Table structure for table `errors` +-- + +CREATE TABLE errors ( + session int(10) unsigned NOT NULL default '0', + stamp int(10) unsigned NOT NULL default '0', + exptidx int(11) NOT NULL default '0', + script smallint(3) NOT NULL default '0', + cause varchar(16) NOT NULL default '', + confidence float NOT NULL default '0', + mesg text NOT NULL, + PRIMARY KEY (session) +) TYPE=MyISAM; + -- -- Table structure for table `event_eventtypes` -- @@ -455,6 +481,7 @@ CREATE TABLE experiment_stats ( elabinelab_exptidx int(10) unsigned default NULL, security_level tinyint(1) NOT NULL default '0', archive_idx int(10) unsigned default NULL, + last_error int(10) unsigned default NULL, PRIMARY KEY (eid,pid,exptidx), KEY exptidx (exptidx), KEY rsrcidx (rsrcidx) @@ -974,7 +1001,7 @@ CREATE TABLE location_info ( CREATE TABLE log ( seq int(10) unsigned NOT NULL auto_increment, stamp int(10) unsigned NOT NULL default '0', - pidx int(11) NOT NULL default '0', + exptidx int(11) NOT NULL default '0', uid int(11) default NULL, session int(10) unsigned NOT NULL default '0', invocation int(10) unsigned NOT NULL default '0', @@ -984,7 +1011,7 @@ CREATE TABLE log ( sublevel tinyint(2) NOT NULL default '0', priority smallint(3) NOT NULL default '0', inferred tinyint(1) NOT NULL default '0', - cause varchar(32) NOT NULL default '', + cause varchar(16) NOT NULL default '', type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal', relevant tinyint(1) NOT NULL default '0', mesg text NOT NULL, @@ -1962,6 +1989,7 @@ CREATE TABLE testbed_stats ( action varchar(16) NOT NULL default '', exitcode tinyint(3) default '0', uid varchar(8) NOT NULL default '', + log_session int(10) unsigned default NULL, PRIMARY KEY (idx), KEY rsrcidx (rsrcidx), KEY exptidx (exptidx) diff --git a/sql/database-fill.sql b/sql/database-fill.sql index c7c5768cc4cebfe7d4339d959d8a7fb68b7a7cc2..c9be6cef660d93c410e5e6cb9e5d59011ccaea8b 100644 --- a/sql/database-fill.sql +++ b/sql/database-fill.sql @@ -4,6 +4,18 @@ --------------------------------------------------------- -- Server version 3.23.58-log +-- +-- Dumping data for table `causes` +-- + + +REPLACE INTO causes VALUES ('temp','Temp Resource Shortage'); +REPLACE INTO causes VALUES ('user','User Error'); +REPLACE INTO causes VALUES ('internal','Internal Error'); +REPLACE INTO causes VALUES ('software','Software Problem'); +REPLACE INTO causes VALUES ('hardware','Hardware Problem'); +REPLACE INTO causes VALUES ('unknown','Cause Unknown'); + -- -- Dumping data for table `comments` -- @@ -119,6 +131,7 @@ REPLACE INTO event_objecttypes VALUES (17,'BGMON'); -- +REPLACE INTO exported_tables VALUES ('causes'); REPLACE INTO exported_tables VALUES ('comments'); REPLACE INTO exported_tables VALUES ('event_eventtypes'); REPLACE INTO exported_tables VALUES ('event_objecttypes'); diff --git a/sql/database-migrate.txt b/sql/database-migrate.txt index 16f64c81b6a7e35b771d302a0adba0e26c1bd79a..b3961d1b750e2f3cf8c7068f1d63d62911420e26 100644 --- a/sql/database-migrate.txt +++ b/sql/database-migrate.txt @@ -3173,3 +3173,55 @@ last_net_act,last_cpu_act,last_ext_act); altsrc tinytext, PRIMARY KEY (name) ) TYPE=MyISAM; + +???: Add stuff to store the error, as determined by tblog, in a more + structured fashion: + + CREATE TABLE errors ( + session int unsigned NOT NULL, + stamp int unsigned NOT NULL, + exptidx int NOT NULL, + script smallint(3) NOT NULL, + cause varchar(16) NOT NULL, + confidence float NOT NULL, + mesg text NOT NULL, + primary key (session) + ); + + alter table experiment_stats + add last_error int unsigned; + + alter table testbed_stats + add log_session int unsigned; + + CREATE TABLE causes ( + cause varchar(16) NOT NULL, + cause_desc varchar(32) NOT NULL, + primary key (cause), + unique key (cause_desc) + ); + + ALTER TABLE log + change cause cause varchar(16) NOT NULL, + change pidx exptidx int NOT NULL default '0'; + + INSERT INTO exported_tables values ('causes'); + + INSERT INTO causes values ('temp', 'Temp Resource Shortage'); + INSERT INTO causes values ('user', 'User Error'); + INSERT INTO causes values ('internal', 'Internal Error'); + INSERT INTO causes values ('software', 'Software Problem'); + INSERT INTO causes values ('hardware', 'Hardware Problem'); + INSERT INTO causes values ('unknown', 'Cause Unknown'); + + + The new code will than store all errors in the new table. Before + they where stored as a special entry in the "log" table. To copy + them over into the "errors" table you may also want to run: + + perl errors-migrate.pl. + + If the script completed with no errors you can than safely purge + those entries from the "log" table using: + + delete from log where type = 'thecause'; diff --git a/sql/errors-migrate.pl b/sql/errors-migrate.pl new file mode 100644 index 0000000000000000000000000000000000000000..2b7b110ead4730ca244e311e3266bacb8d3112fb --- /dev/null +++ b/sql/errors-migrate.pl @@ -0,0 +1,101 @@ +#!/usr/bin/perl -w + +use strict; + +use DBI; + +my $database = 'tbdb'; +my $username = 'root'; +my $passwd = ''; + +my $dbh = DBI->connect("DBI:mysql:database=$database", $username, $passwd); + +my $sth = $dbh->prepare("select * from scripts"); +$sth->execute; +my %script_map; +my @script_map; +while ((my $ref = $sth->fetchrow_arrayref)) { + $script_map{$ref->[1]} = $ref->[0]; + $script_map[$ref->[0]] = $ref->[1]; +} +$script_map{unknown} = 0; +$script_map[0] = 'unknown'; + +$sth = $dbh->prepare("select stamp,session,exptidx,mesg from log where type = 'thecause' order by session"); +$sth->execute; + +my $sth2 = $dbh->prepare("select script_name,cause,type from log natural join scripts where session = ? and relevant order by seq desc"); + +my $ins = $dbh->prepare("insert into errors (session,stamp,exptidx,script,cause,confidence,mesg) values (?,?,?,?,?,?,?)"); + +my $ready = 0; + +while ((my $ref = $sth->fetchrow_arrayref)) { + my ($stamp,$session,$exptidx,$mesg) = @$ref; + my $script = "unknown"; + my $cause = "unknown"; + my $confidence = 0.0; + local $_ = $mesg; + if (/^\d+: ([^:]+)/) { + $script = $1; + } else { + if (s/\nCause: (\S+)\nConfidence: (\S+)$//) { + $cause = $1; + $confidence = $2; + } + s/^ //gm; + $mesg = $_; + ($_) = split /\n\s*\n/; + if (/\(([^\)]+)\)$/ && defined $script_map{$1}) { + $script = $1; + } else { + warn "Unable to find script in: $_\n" unless /No clue as to what went wrong/; + } + $sth2->execute($session); + my @rel; + while ((my $r = $sth2->fetchrow_hashref)) { + push @rel, {%$r}; + } + + if (@rel) { + + warn "Script name mismatch: $rel[0]{script_name} ne $script in: $_\n" + if $rel[0]{script_name} ne $script; + + $script = $rel[0]{script_name}; + + my $conf = 0.5; + if ($script =~ /^(assign|parse)/) { + $conf = 0.9; + } elsif ($script =~ /^(os_setup)/) { + $conf = 0.2; + } + + my $t = ''; + my $c = ''; + foreach (@rel) { + $ready = 1 if $_->{cause} && $_->{cause} ne 'unknown'; + if (!$c && $_->{cause}) { + $c = $_->{cause}; + $t = $_->{type}; + } elsif ($_->{cause} && + $_->{type} eq $t && $_->{cause} ne $c) { + $c = 'unknown'; + } + } + $c = 'unknown' unless $c; + + if ($cause eq 'unknown' && $ready) { + $cause = $c; + $confidence = $conf; + } elsif ($cause ne 'unknown') { + warn "Cause mismatch" unless $cause eq $c; + warn "Confidence mismatched" unless $conf == $confidence; + } + } + } + + $ins->execute($session,$stamp,$exptidx,$script_map{$script},$cause,$confidence,$mesg); +} + + diff --git a/sql/libtblog.sql b/sql/libtblog.sql index 9e5194b49332c1870f0343f826b8c001f4240ef7..80f955f10b96f2a2c381aeb616cecb141a853bf3 100644 --- a/sql/libtblog.sql +++ b/sql/libtblog.sql @@ -2,8 +2,8 @@ CREATE TABLE log ( seq int unsigned NOT NULL auto_increment, - stamp int unsigned NOT NULL, - pidx int NOT NULL, -- unique project id + stamp int unsigned NOT NULL, -- timestamp + exptidx int NOT NULL, -- unique experment id uid int default NULL, -- numeric user id session int unsigned NOT NULL, -- session id (1) invocation int unsigned NOT NULL, -- invocation id - unique id for a @@ -19,13 +19,14 @@ CREATE TABLE log ( -- given the non-default priority -- 3 - log created with a normal print and -- given the default priority - cause varchar(32) NOT NULL, -- string identifying the "cause" of the error + cause varchar(16) NOT NULL, -- string identifying the cause of the error type enum('normal','entering','exiting','thecause','extra','summary') NOT NULL default 'normal',-- type of error: -- normal: normal error generated by tblog -- entering: auto gen when a script starts -- exiting: auto gen when a script exists (2) - -- thecause: created by tblog_find_error + -- thecause: no longer used, see the + -- "errors" table -- extra: extra information to normal info -- at a lower level -- summary: useful summary of errors, @@ -52,20 +53,48 @@ CREATE TABLE log ( -- seq stamp [session] invocation priority inferred cause type relevant mesg -- ... +CREATE TABLE errors ( + session int unsigned NOT NULL, -- session id + stamp int unsigned NOT NULL, -- timestamp + exptidx int NOT NULL, -- unique experment id + script smallint(3) NOT NULL, -- numeric id for the script name + cause varchar(16) NOT NULL, -- string identifying the cause of the error + confidence float NOT NULL, -- number between 0 and 1 indicating how + -- confident tblog is that this it + -- correctly identified the error. + mesg text NOT NULL, -- the text of the message + primary key (session) +); + + CREATE TABLE scripts ( script smallint(3) NOT NULL auto_increment, script_name varchar(24) NOT NULL default '', PRIMARY KEY (script), - UNIQUE KEY id (name) + UNIQUE KEY id (script_name) ); CREATE TABLE priorities ( priority smallint(3) NOT NULL, priority_name varchar(8) NOT NULL, primary key (priority), - unique key (name) + unique key (priority_name) +); + +CREATE TABLE causes ( + cause varchar(16) NOT NULL, + cause_desc varchar(32) NOT NULL, + primary key (cause) + unique key (cause_desc) ); + +alter table experiment_stats + add last_error int unsigned; -- session id of last error, otherwise 0 + +alter table testbed_stats + add log_session int unsigned; + replace into priorities values (000, 'EMERG'); replace into priorities values (100, 'ALERT'); replace into priorities values (200, 'CRIT'); @@ -75,4 +104,17 @@ replace into priorities values (500, 'NOTICE'); replace into priorities values (600, 'INFO'); replace into priorities values (700, 'DEBUG'); +replace into causes values ('temp', 'Temp Resource Shortage'); +replace into causes values ('user', 'User Error'); +replace into causes values ('internal', 'Internal Error'); +replace into causes values ('software', 'Software Problem'); +replace into causes values ('hardware', 'Hardware Problem'); +replace into causes values ('unknown', 'Cause Unknown'); + +replace into exported_tables values ('priorities'); +replace into exported_tables values ('causes'); + replace into exported_tables values ('state_timeouts'); + + + diff --git a/tbsetup/assign_wrapper.in b/tbsetup/assign_wrapper.in index 53f4f14398650e1840fe7f9ae4167dcde2d04642..643794b5db9355acce2e48fbeef76a702a0885cb 100644 --- a/tbsetup/assign_wrapper.in +++ b/tbsetup/assign_wrapper.in @@ -785,7 +785,13 @@ while (1) { my $retval = RunAssign(); if ($retval != 0) { fatal({type=>'extra', cause=>'user'}, - "Experiment can not be run on an empty testbed. Please fix the experiment."); + "This experiment cannot be instantiated on this ". + "testbed. You have most likely asked for hardware ". + "this testbed does not have, such as nodes of a type ". + "it does not contain, or nodes with too many network ". + "interfaces. You will need to modify this experiment ". + "before it can be swapped in - re-submitting the ". + "experiment as-is will always result in failure."); } print("Assign succeeded on an empty testbed.\n"); $precheck = 0; diff --git a/tbsetup/assign_wrapper2.in b/tbsetup/assign_wrapper2.in index 74fe564e11f6cff97fc4dc80b302cb7ac46f95fa..02f3f9a237de6ed4f3bec7aea38d44fe2a03bbf2 100644 --- a/tbsetup/assign_wrapper2.in +++ b/tbsetup/assign_wrapper2.in @@ -92,10 +92,16 @@ if ($exitcode) { dblog(TBLOG_ERR, {}, $err) unless ($obvious_error); # create a log entry that assign failed - my %parms; - $parms{sublevel} = -1; - $parms{type} = 'extra' unless $obvious_error; - dblog(TBLOG_ERR, \%parms, "Assign Failed."); + + dblog(TBLOG_ERR, {sublevel => -1}, "Assign Failed."); + + dblog(TBLOG_ERR, {sublevel => -1, type=>'extra'}, + "Failed to find a set of physical testbed nodes to run your ". + "experiment on. This might mean that there are not enough ". + "nodes or switch resources free, or your experiment might ". + "require certain hardware which is not available. If you ". + "believe this message is in error, contact @TBOPSEMAIL@.") + unless $obvious_error; } exit $exitcode; diff --git a/tbsetup/batchexp.in b/tbsetup/batchexp.in index 4bf50e99dd7ac684db2172a1d84fd33022910237..5d7b518f5eed04de6f26b64970cc3c70d2372984 100755 --- a/tbsetup/batchexp.in +++ b/tbsetup/batchexp.in @@ -874,24 +874,21 @@ sub cleanup() # Clear the logfile so the webpage stops. TBExptClearLogFile($pid, $eid); + # + # Figure out the error if possible + # + my $error_data = tblog_find_error(); + # # Send a message to the testbed list. # - SENDMAIL("$user_name <$user_email>", - "Experiment Configure Failure: $pid/$eid", - "Please look at the log below to see what happened. If the error\n". - "resulted from a lack of free nodes, you can use this web page to\n". - "get a summary of free nodes:\n\n". - " ${TBBASE}/nodecontrol_list.php3\n". - "\n". - "Please do not try again until you see enough nodes free. Or, you\n". - "can use the batch system to swap your experiment in when enough\n". - "nodes are free:\n\n". - " ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html". - "#BatchMode\n", - "$user_name <$user_email>", - "Cc: $TBOPS", - ($logname, "assign.log", "wanassign.log", $nsfile)); + tblog_email_error($error_data, + "$user_name <$user_email>", + "Config Failure", "$pid/$eid", + "$user_name <$user_email>", + "", + "Cc: $TBOPS", + ($logname, "assign.log", "wanassign.log", $nsfile)); # # Back up the work dir for post-mortem debugging. @@ -1224,8 +1221,6 @@ END { } my $saved_exitcode = $?; - tblog_find_error() if $?; - if ($cleaning) { # # We are screwed; a recursive error. Someone will have to clean @@ -1241,4 +1236,3 @@ END { cleanup(); $? = $saved_exitcode; } - diff --git a/tbsetup/libtblog.pm.in b/tbsetup/libtblog.pm.in index d20c1a55af4245bb0eacbf4e689979f99ad49b34..ec36e580e3cac7f028134dd6d98efdb92feff0ce 100644 --- a/tbsetup/libtblog.pm.in +++ b/tbsetup/libtblog.pm.in @@ -66,9 +66,6 @@ of an error. The current causes are: hardware - hardware problem unknown - default -(Note, that at the moment, internal is the same as software, but in -the future that might change) - The cause for an error can be set directly when calling tberror by passing in the "cause" parameter. For example: @@ -208,9 +205,10 @@ use Exporter; @ISA = "Exporter"; @EXPORT = qw (tblog tberror tberr tbwarn tbwarning tbnotice tbinfo tbdebug tbdie tblog_set_info tblog_set_default_cause - tblog_sub_process tblog_find_error + tblog_sub_process tblog_find_error tblog_email_error tblog_start_capture tblog_stop_capture tblog_new_process tblog_init_process tblog_exit + tblog_session copy_hash TBLOG_EMERG TBLOG_ALERT TBLOG_CRIT TBLOG_ERR TBLOG_WARNING TBLOG_NOTICE TBLOG_INFO TBLOG_DEBUG); @@ -222,6 +220,7 @@ use POSIX qw(isatty setsid); use File::Basename; use IO::Handle; use Text::Wrap; +use Text::Tabs; use Carp; use strict; @@ -260,7 +259,7 @@ sub check_env () { check_env_num 'TBLOG_LEVEL'; check_env_num 'TBLOG_SESSION'; - check_env_num 'TBLOG_PIDX'; + check_env_num 'TBLOG_EXPTIDX'; check_env_num 'TBLOG_INVOCATION'; check_env_num 'TBLOG_PARENT_INVOCATION'; check_env_num 'TBLOG_UID'; @@ -273,7 +272,7 @@ my %CAUSE_MAP = (# Don't notify testbed-ops temp => 'temp', # resources temp unavailable user => 'user', # user error # Notify testbed-ops - internal => 'software', # software bug, should not happen + internal => 'internal', # software bug, should not happen software => 'software', # software problem, like a bad image hardware => 'hardware', # hardware problem unknown => ''); @@ -284,6 +283,18 @@ sub normalize_cause ( $ ) { return $cause; } +sub indent ( $$ ) { + my ($text, $prefix) = @_; + $text =~ s/\n$//; + my $res = ''; + foreach (split /\n/, $text) { + $res .= $prefix; + $res .= $_; + $res .= "\n"; + } + return $res; +} + # # # @@ -410,7 +421,7 @@ sub tblog_init_process(@) { $ENV{TBLOG_INVOCATION} = 0; $ENV{TBLOG_PARENT_INVOCATION} = 0; $ENV{TBLOG_LEVEL} = 0; - $ENV{TBLOG_PIDX} = 0; + $ENV{TBLOG_EXPTIDX} = 0; $ENV{TBLOG_UID} = 0; dblog($NOTICE, {type => 'entering'}, 'Entering "', join(' ', informative_scriptname(), @ARGV), '"') @@ -475,12 +486,12 @@ sub tblog_set_info ( $$$ ) check_env(); my ($pid, $eid, $uid) = @_; local $DBQUERY_MAXTRIES = 3; - $ENV{'TBLOG_PIDX'} = + $ENV{'TBLOG_EXPTIDX'} = DBQuerySingleFatal("select idx from experiments where pid='$pid' and eid='$eid'"); $ENV{'TBLOG_UID'} = $uid; DBQueryFatal - sprintf('update log set pidx=%d,uid=%s where session=%s', - $ENV{TBLOG_PIDX}, $ENV{TBLOG_UID}, $ENV{TBLOG_SESSION}); + sprintf('update log set exptidx=%d,uid=%s where session=%s', + $ENV{TBLOG_EXPTIDX}, $ENV{TBLOG_UID}, $ENV{TBLOG_SESSION}); } =item tblog_set_default_cause CAUSE @@ -589,9 +600,9 @@ sub dblog_real ( $$@ ) { unless defined $cause; my $query = - sprintf('insert into log (stamp,pidx,uid,session,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '. + sprintf('insert into log (stamp,exptidx,uid,session,parent,invocation,script,level,sublevel,priority,inferred,cause,type,mesg) '. 'VALUES (UNIX_TIMESTAMP(now()),%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s,%s,%s)', - $ENV{TBLOG_PIDX}, + $ENV{TBLOG_EXPTIDX}, $ENV{TBLOG_UID}, $ENV{TBLOG_SESSION}, $ENV{TBLOG_PARENT_INVOCATION}, @@ -627,39 +638,31 @@ sub dblog_real ( $$@ ) { } -=item tblog_find_error [[SESSION], STORE] +=item tblog_find_error Attempts to find the relevant error. -Parameters: - -=over - -=over - -=item session +Will act in a way that is safe to use in an END block, that is (1) +never die, (2) don't modify the exit code. -Session id to use. Defaults to the current as given by -$ENV{TVLOG_SESSION}. +Will, also, print the results to and some additional info to STDERR +for diagnosis purposed. -=item store +The results will be stored the "errors" table and also returned in a +hash with the following.. -If set store the results into the database and act in -a way that is safe to use in an END block, that is -(1) never die, (2) don't modify the exit code, Will also -print the results to and some additional info to STDERR -for diagnosis purposed. If not set simply print the results to -STDERR. Defaults to true unless the "session" parameter is also -given, in which case default to false. - -=back - -=back + mesg: text of the error + cause: + confidence: + script: + session: + exptidx: + err: the text of any internal errors in tblog_find_error, + otherwise undefined To retrieve the results from the database: - SELECT ... WHERE session = - AND type = 'thecause' form log + SELECT ... WHERE session = FROM errors The relevant errors are also flagged using "relevant" flag: @@ -668,23 +671,23 @@ The relevant errors are also flagged using "relevant" flag: =cut -sub tblog_find_error ( ;$$ ) { +sub tblog_find_error () { - my ($session,$store) = @_; my $saved_exitcode = $?; - local $DBQUERY_MAXTRIES = 3; + my $session = 0; + my $error = ''; + my $script = ''; + my $cause = ''; + my $confidence = 0.0; - if (not defined $session) { - check_env(); - $store = 1; - $session = $ENV{TBLOG_SESSION}; - } - - $store = 0 unless defined $store; + local $DBQUERY_MAXTRIES = 3; eval { + check_env(); + $session = $ENV{TBLOG_SESSION}; + # # Build the Tree # @@ -816,7 +819,7 @@ sub tblog_find_error ( ;$$ ) { # # Get the most relevent script # - my $script = ''; + $script = ''; if (@relevant) { $script = $lookup{$relevant[0]->{invocation}}->{script}; } @@ -824,7 +827,7 @@ sub tblog_find_error ( ;$$ ) { # # From script determine confidence # - my $confidence = 0.5; + $confidence = 0.5; if ($script =~ /^(assign|parse)/) { $confidence = 0.9; } elsif ($script =~ /^(os_setup)/) { @@ -834,7 +837,6 @@ sub tblog_find_error ( ;$$ ) { # # Figure out the cause; # - my $cause = ''; my $type = ''; # Assumes that 'extra' info will come first @@ -854,7 +856,7 @@ sub tblog_find_error ( ;$$ ) { # Finally print/store the relevant errors # - my $error = ''; + local $Text::Wrap::columns = 72; my $prev; foreach (@relevant) { @@ -864,54 +866,136 @@ sub tblog_find_error ( ;$$ ) { $error .= "\n" if defined $prev; if ($_->{mesg} !~ /\n/) { - $error .= wrap(' ',' ', - "$_->{mesg} ($lookup{$_->{invocation}}->{script})\n"); + $error .= expand(wrap('','', "$_->{mesg}\n")); } else { # if multiline don't reformat - foreach (split /\s*\n/, $_->{mesg}) { - $error .= " $_\n"; - } - $error .= " ($lookup{$_->{invocation}}->{script})\n"; + $error .= "$_->{mesg}\n"; } $prev = $_; } $error = "No clue as to what went wrong!\n" unless length $error > 0; + chop $error; + + print SERR "**** Experimental information, please ignore ****\n"; + print SERR "Session ID = $session\n"; + print SERR "Likely Cause of the Problem:\n"; + print SERR indent($error, ' '); + print SERR "Cause: $cause\n"; + print SERR "Confidence: $confidence\n"; + print SERR "Script: $script\n"; + print SERR "**** End experimental information ****\n"; + + my $script_num = DBQuerySingleFatal + sprintf("select script from scripts where script_name=%s", + DBQuoteSpecial $script); + + DBQueryFatal + sprintf("insert into errors ". + "(stamp, exptidx, session, cause, confidence, script, mesg) ". + "values ". + "(UNIX_TIMESTAMP(now()), %d, %d, %s, %f, %d, %s)", + $ENV{TBLOG_EXPTIDX}, + $session, DBQuoteSpecial($cause), $confidence, $script_num, + DBQuoteSpecial($error)); + + DBQueryFatal + sprintf("update log set relevant=1 where seq in (%s)", + join(',', map {$_->{seq}} @relevant)) if @relevant; - if ($store) { - $error .= "Cause: $cause\n"; - $error .= "Confidence: $confidence\n"; - print SERR "**** Experimental information, please ignore ****\n"; - print SERR "Session ID = $ENV{TBLOG_SESSION}\n"; - print SERR "Likely Cause of the Problem:\n"; - print SERR $error; - print SERR "**** End experimental information ****\n"; - chop $error; - dblog $NOTICE, {type=>'thecause'}, $error; - DBQueryFatal(sprintf("update log set relevant=1 where seq in (%s)", - join(',', map {$_->{seq}} @relevant))) if @relevant; - } else { - print SERR $error; - } }; + my $err; + if ($@) { - my $err = $@; + $err = $@; eval {SENDMAIL(TB_OPSEMAIL, "tblog_find_error failed", - "Experiment: $ENV{TBLOG_PIDX}\n". + "Experiment: $ENV{TBLOG_EXPTIDX}\n". "User: $ENV{TBLOG_UID}\n". "Session: $ENV{TBLOG_SESSION}\n". "Script: $ENV{TBLOG_SCRIPTNAME}\n". "\n". "$err\n")}; - if ($store) { - eval {tblog $WARNING, {type=>'thecause'}, - "tblog_find_error failed: $err" }; - } else { - $? = $saved_exitcode; - die; - } + + eval {tblog $WARNING, "tblog_find_error failed: $err" }; } + $? = $saved_exitcode; + return {session=>$session, exptidx=>$ENV{TBLOG_EXPTIDX}, + mesg=>$error, cause=>$cause, + confidence=>$confidence, script=>$script, err=>$err}; +} + +=item tblog_email_error DATA, TO, WHAT, EIDPID, FROM, HEADERS, TBOBS_HEADERS, @FILES + +Email the user and possible testbed-ops the error. + +DATA is the object returned form tblog_find_error. It is OK if it is undefined. + +WHAT is something like "Swap In Failure", "Swap Out Failure", etc. + +=cut + +sub tblog_email_error ( $$$$$$$@ ) { + + my ($d, $to, $what, $pideid, $from, $headers, $testbed_ops, @files) = @_; + + my $threshold = 0.65; + + unless ($d->{confidence} > $threshold + && ($d->{cause} eq 'temp' || $d->{cause} eq 'user')) + { + $headers .= "\n" if defined $headers && length($headers) > 0; + $headers .= "$testbed_ops"; + } else { + $headers .= "\n" if defined $headers && length($headers) > 0; + $headers .= "Cc: @TBERRORSEMAIL@"; + } + + my $subject = "$what: $pideid"; + + if ($d->{confidence} > $threshold && $d->{cause} ne 'unknown') { + + eval { + my $cause_desc = DBQuerySingleFatal + "select cause_desc from causes where cause = '$d->{cause}'"; + + $subject = "$what: $cause_desc: $pideid"; + } + } + + my $body; + + if ($d->{confidence} > $threshold) { + + $body .= $d->{mesg}; + $body .= "\n"; + + if ($d->{cause} eq 'temp') { + $body .= "\n"; + $body .= "Please take a look at this Knowledge Base entry for hints on what to do:\n\n"; + $body .= " @TBBASE@/kb-show.php3?xref_tag=no_resources\n"; + } + + $headers .= "\nX-NetBed-Cause: $d->{cause}"; + + } else { + + $body = + ("Please look at the log below to see what happened. If the error\n". + "resulted from a lack of free nodes, you can use this web page to\n". + "get a summary of free nodes:\n\n". + " @TBBASE@/nodecontrol_list.php3\n". + "\n". + "Please do not try again until you see enough nodes free. Or, you\n". + "can use the batch system to swap your experiment in when enough\n". + "nodes are free:\n\n". + " @TBDOCBASE@/tutorial/docwrapper.php3?docname=tutorial.html". + "#BatchMode\n"); + + } + + + SENDMAIL($to, $subject, $body, $from, $headers, @files); } # @@ -1044,6 +1128,12 @@ sub tblog_stop_capture( ;@ ) { } } +=item tblog_session + +Returns the current session or undefined if it hasn't been defined yet. + +=cut + # # BEGIN # diff --git a/tbsetup/libtblog_simple.pm b/tbsetup/libtblog_simple.pm index 890e74f16ddb0309e7d42cc3313a72b25fab9de9..3190bffd62a0be22eed93fd0fa6bbd3ae4d38514 100644 --- a/tbsetup/libtblog_simple.pm +++ b/tbsetup/libtblog_simple.pm @@ -24,6 +24,7 @@ use Exporter; @ISA = "Exporter"; @EXPORT = qw (tblog tberror tberr tbwarn tbwarning tbnotice tbinfo tbdebug tbdie + tblog_session TBLOG_EMERG TBLOG_ALERT TBLOG_CRIT TBLOG_ERR TBLOG_WARNING TBLOG_NOTICE TBLOG_INFO TBLOG_DEBUG); @@ -105,6 +106,14 @@ sub scriptname() { if_defined($SCRIPTNAME, $ENV{TBLOG_SCRIPTNAME}); } +# +# +# + +sub tblog_session() { + return $ENV{TBLOG_SESSION}; +} + # # Dummy dblog, does nothing in this module # Once the real "libtblog.pm" is used than this will be replaced diff --git a/tbsetup/libtestbed.pm.in b/tbsetup/libtestbed.pm.in index 95b7c20c1b65e0173a96b0e31d36eadb3d28238b..b81b2f6802288d9a767ca50d46175a9cf4a26fb1 100644 --- a/tbsetup/libtestbed.pm.in +++ b/tbsetup/libtestbed.pm.in @@ -85,7 +85,7 @@ sub SENDMAIL($$$;$$@) if (defined($From) && $From) { print MAIL "From: $From\n"; } - if (defined($Headers)) { + if (defined($Headers) && length($Headers) > 0) { print MAIL "$Headers\n"; } print MAIL "X-NetBed: $SCRIPTNAME\n"; diff --git a/tbsetup/swapexp.in b/tbsetup/swapexp.in index a4f42d60a7349b0732ad0199c08d66808161c58a..ef61a0a3b94219bac9eab52415558b51b3326ca5 100644 --- a/tbsetup/swapexp.in +++ b/tbsetup/swapexp.in @@ -1302,6 +1302,11 @@ sub cleanup() # TBSetCancelFlag($pid, $eid, EXPTCANCEL_CLEAR); + # + # Figure out the error if possible + # + my $error_data = tblog_find_error(); + # # Send a message to the testbed list. Append the logfile. # @@ -1314,22 +1319,13 @@ sub cleanup() $rcops = "\nCc: $TBROBOCOPS"; } - SENDMAIL("$user_name <$user_email>", - "Swap ${inout} Failure: $pid/$eid", - "Please look at the log below to see what happened. If the error\n". - "resulted from a lack of free nodes, you can use this web page to\n". - "get a summary of free nodes:\n\n". - " ${TBBASE}/nodecontrol_list.php3\n". - "\n". - "Please do not try again until you see enough nodes free. Or, you\n". - "can use the batch system to swap your experiment in when enough\n". - "nodes are free:\n\n". - " ${TBDOCBASE}/tutorial/docwrapper.php3?docname=tutorial.html". - "#BatchMode\n", - ($idleswap ? $TBOPS : "$user_name <$user_email>"), - "Cc: $expt_head_name <$expt_head_email>\n". - "Cc: $TBOPS $rcops", - (($logname), (defined($modnsfile) ? ($modnsfile) : ()))); + tblog_email_error($error_data, + "$user_name <$user_email>", + "Swap ${inout} Failure", "$pid/$eid", + ($idleswap ? $TBOPS : "$user_name <$user_email>"), + "Cc: $expt_head_name <$expt_head_email> $rcops", + "Cc: $TBOPS $rcops", + (($logname), (defined($modnsfile) ? ($modnsfile) : ()))); if ($modifyHosed) { # @@ -1511,8 +1507,6 @@ END { } my $saved_exitcode = $?; - tblog_find_error() if $?; - if ($cleaning) { # # We are screwed; a recursive error. Someone will have to clean diff --git a/www/showexp.php3 b/www/showexp.php3 index 714582c45501a94be2109374a5ecfec3794e1738..f60e66ccff270ba50b505829dc2607fe18d4468c 100644 --- a/www/showexp.php3 +++ b/www/showexp.php3 @@ -113,7 +113,7 @@ if ($expstate) { WRITESUBMENUBUTTON("View Activity Logfile", "showlogfile.php3?pid=$exp_pid&eid=$exp_eid"); } - + if ($state == $TB_EXPTSTATE_ACTIVE) { WRITESUBMENUBUTTON("Visualization, NS File, Mapping", "shownsfile.php3?pid=$exp_pid&eid=$exp_eid"); diff --git a/www/showlasterror.php3 b/www/showlasterror.php3 new file mode 100644 index 0000000000000000000000000000000000000000..6d24e6830eca28dccce5d1f6df348e6469ea143b --- /dev/null +++ b/www/showlasterror.php3 @@ -0,0 +1,84 @@ +$row[cause_desc]\n"; + echo "
\n";
+  echo htmlspecialchars($row[mesg])."\n";
+  echo "\n";
+  echo "Cause: $row[cause]\n";
+  echo "Confidence: $row[confidence]\n";
+  echo "
\n"; +} else { + echo "Nothing\n"; +} + +# +# Standard Testbed Footer +# +PAGEFOOTER(); +?> diff --git a/www/showstuff.php3 b/www/showstuff.php3 index b44f2d5494f3f4570e53163048618821d88fd2f5..5003ede000e19ddaf01bd02c9c54cda00d1b0194 100644 --- a/www/showstuff.php3 +++ b/www/showstuff.php3 @@ -806,6 +806,19 @@ function SHOWEXP($pid, $eid, $short = 0, $sortby = "") { else $expt_locked = ""; + $query_result = + DBQueryFatal("select cause_desc ". + "from experiment_stats as s,errors,causes ". + "where s.exptidx = $exptidx ". + "and errors.cause = causes.cause ". + "and s.last_error = errors.session"); + + if ($row = mysql_fetch_array($query_result)) { + $err_cause = $row[0]; + } else { + $err_cause = ''; + } + # # Generate the table. # @@ -905,6 +918,13 @@ function SHOWEXP($pid, $eid, $short = 0, $sortby = "") { $exp_state $expt_locked \n"; + if ($err_cause) { + echo " + Last Error: + $err_cause + \n"; + } + if ($linktest_pid) { $linktest_running = "(Linktest Running)"; }