Commit 5710c340 authored by Kevin Atkinson's avatar Kevin Atkinson
Browse files

Various tblog changes:

Added a message about the recovery action taken to the top of the email
when a swap-modify fails.

Fine-tuned the os_setup error summary.  Added a (possibly partial) list of
the nodes that failed; if a large number fail, only show as many as will
fit on a single line.  Other tweaks.

Flagged assign_wrapper "Invalid OS" errors as user errors.
parent b1715c29
......@@ -3184,7 +3184,8 @@ sub CheckMaxConcurrent()
my $count = $counts{$osid};
if (!TBOSLoadMaxOkay($osid, $count)) {
fatal("Too many nodes are already running OSID $osid!");
fatal({cause => 'temp'},
"Too many nodes are already running OSID $osid!");
}
}
}
......@@ -3867,7 +3868,8 @@ sub LoadVirtNodes()
if (! ($osid = TBOSID($pid, $osname)) &&
! ($osid = TBOSID(TB_OPSPID, $osname))) {
fatal("Invalid OS $osname in project $pid!");
fatal({cause => 'user'},
"Invalid OS $osname in project $pid!");
}
# Stash this in the virt_nodes data structure.
$rowref->{"__osid"} = $osid;
......@@ -4234,13 +4236,13 @@ sub LoadExperiment()
if (!defined($jail_osid) && defined($jail_osname)) {
if (! ($jail_osid = TBOSID($pid, $jail_osname)) &&
! ($jail_osid = TBOSID(TB_OPSPID, $jail_osname))) {
fatal("Invalid OS $jail_osname in project $pid!");
fatal({cause=>'user'}, "Invalid OS $jail_osname in project $pid!");
}
}
if (!defined($delay_osid) && defined($delay_osname)) {
if (! ($delay_osid = TBOSID($pid, $delay_osname)) &&
! ($delay_osid = TBOSID(TB_OPSPID, $delay_osname))) {
fatal("Invalid OS $delay_osname in project $pid!");
fatal({cause=>'user'}, "Invalid OS $delay_osname in project $pid!");
}
}
# Keep a desire string we can use to make sure that the node type picked
......@@ -5336,3 +5338,4 @@ sub LoadCurrent()
if ($reserved_simcount);
printdb "Old Reserved Nodes: " . join(" ", keys %oldreservednodes) . "\n";
}
......@@ -849,6 +849,7 @@ sub cleanup()
"$user_name <$user_email>",
"",
"Cc: $TBOPS",
"",
($logname, "assign.log", "wanassign.log", $nsfile));
#
......
......@@ -1039,7 +1039,7 @@ sub tblog_lookup_error ( ;$ ) {
}
}
=item tblog_email_error DATA, TO, WHAT, EIDPID, FROM, HEADERS, TBOBS_HEADERS, @FILES
=item tblog_email_error DATA, TO, WHAT, EIDPID, FROM, HEADERS, TBOBS_HEADERS, PREFIX, @FILES
Email the error to the user and possibly to testbed-ops.
......@@ -1049,9 +1049,9 @@ WHAT is something like "Swap In Failure", "Swap Out Failure", etc.
=cut
sub tblog_email_error ( $$$$$$$@ ) {
sub tblog_email_error ( $$$$$$$$@ ) {
my ($d, $to, $what, $pideid, $from, $headers, $testbed_ops, @files) = @_;
my ($d, $to, $what, $pideid, $from, $headers, $testbed_ops, $prefix, @files) = @_;
my $threshold = 0.55;
......@@ -1077,6 +1077,8 @@ sub tblog_email_error ( $$$$$$$@ ) {
my $body;
$body .= "$prefix\n\n" if $prefix;
if ($d->{confidence} > $threshold) {
$body .= $d->{mesg};
......
......@@ -76,6 +76,7 @@ my %canfail = ();
my %bios_waittime = (); # Indexed by node_type.
my %reboot_waittime = (); # Indexed by osid.
my %node_types = (); # Indexed by node_id.
my %vname = (); # Indexed by node_id.
#
# This variable keeps track of the failed nodes of all types.
......@@ -211,7 +212,7 @@ if (TBExptPlabInElabPLC($pid, $eid, \$plcnode)) {
# Get the set of nodes, as well as the nodes table information for them.
#
my $db_result =
DBQueryFatal("select n.*,l.pid from reserved as r ".
DBQueryFatal("select n.*,l.pid,r.vname from reserved as r ".
"left join nodes as n on n.node_id=r.node_id ".
"left join last_reservation as l on n.node_id=l.node_id ".
"where r.pid='$pid' and r.eid='$eid'");
......@@ -227,6 +228,7 @@ while (my %row = $db_result->fetchhash()) {
my $type = $row{'type'};
my $jailnode = $row{'jailflag'};
my $failmode = $row{'failureaction'};
my $vname = $row{'vname'};
my $typeinfo = NodeType->Lookup($type);
my $class = $typeinfo->class();
my $subnode = $typeinfo->issubnode();
......@@ -268,6 +270,7 @@ while (my %row = $db_result->fetchhash()) {
$osids{$node} = $osid;
$bios_waittime{$type} = $bios_wait;
$node_types{$node} = $type;
$vname{$node} = $vname;
#
# Make sure the files specified in the paths exist. We mount the
......@@ -1188,6 +1191,7 @@ foreach (keys %failed_nodes) {
$total_osid{$osid}{total}++;
$total_type{$osid}{$type}{failed}++;
$total_type{$osid}{$type}{failed_fatal}++;
push @{$total_type{$osid}{$type}{failed_fatal_list}}, $_;
$total_type{$osid}{$type}{total}++;
}
foreach (keys %failed_nonfatal_nodes) {
......@@ -1198,6 +1202,7 @@ foreach (keys %failed_nonfatal_nodes) {
$total_osid{$osid}{total}++;
$total_type{$osid}{$type}{failed}++;
$total_type{$osid}{$type}{failed_nonfatal}++;
push @{$total_type{$osid}{$type}{failed_nonfatal_list}}, $_;
$total_type{$osid}{$type}{total}++;
}
foreach ((keys %nodes), (keys %vnodes), (keys %plabvnodes)) {
......@@ -1208,6 +1213,37 @@ foreach ((keys %nodes), (keys %vnodes), (keys %plabvnodes)) {
$total_type{$osid}{$type}{total}++;
}
#
# Build a single-line, space-separated list of failed nodes, each rendered
# as "vname(node_id)" using the file-level %vname map.
#
# Arguments:
#   $max_length - maximum number of characters the returned line may use.
#   %d          - totals hash; its optional failed_fatal_list and
#                 failed_nonfatal_list entries are refs to arrays of node ids.
#
# Returns the complete list when it fits within $max_length.  Otherwise
# returns as many leading entries as fit, followed by "...".
#
sub list_failed_nodes ($%) {
    # A node id with no %vname entry compares and prints as an empty string.
    no warnings 'uninitialized';
    my ($max_length, %d) = @_;
    my $byvname = sub { $vname{$a} cmp $vname{$b} };

    # Either list may be absent (e.g. only non-fatal failures were recorded).
    my @fatal    = @{ $d{failed_fatal_list}    || [] };
    my @nonfatal = @{ $d{failed_nonfatal_list} || [] };

    # Each sort is parenthesised on purpose: sort is a list operator, so
    # without the parens the first sort would swallow the second one and
    # the fatal-before-nonfatal grouping would be lost.
    my @nodes = map { "$vname{$_}($_)" }
                ((sort $byvname @fatal), (sort $byvname @nonfatal));

    my $line = join ' ', @nodes;
    if (length($line) > $max_length) {
        # Too long: rebuild from scratch, keeping room for the "..." marker.
        my $budget = $max_length - 4;
        my $length = 0;
        my @shown;
        foreach my $node (@nodes) {
            # Count the entry together with the space that follows it.
            $length += length($node) + 1;
            last if $length > $budget;
            push @shown, $node;
        }
        $line = join ' ', @shown, "...";
    }
    return $line;
}
#
# Append the failed-node list for a totals hash to a summary heading.
#
# Arguments:
#   $line   - heading text, without the trailing colon.
#   $indent - number of spaces to indent the node list by when it has to
#             go on a line of its own.
#   %d      - totals hash, handed through to list_failed_nodes().
#
# Returns "heading: nodes\n" when that fits in 78 columns, otherwise the
# heading and the indented node list on two separate lines.
#
sub add_failed_nodes ($$%) {
    my ($heading, $indent, %totals) = @_;
    my $width     = 78;
    my $node_list = list_failed_nodes($width - $indent, %totals);

    # The extra 2 accounts for the ": " that joins heading and list.
    my $fits = length($heading) + 2 + length($node_list) <= $width;
    return "$heading: $node_list\n" if $fits;

    return "$heading:\n" . (' ' x $indent) . "$node_list\n";
}
foreach my $osid (sort keys %total_osid) {
# I want undefined to mean zero!
no warnings 'uninitialized';
......@@ -1228,43 +1264,61 @@ foreach my $osid (sort keys %total_osid) {
$user_image = 0;
}
my $byfailure = sub {
my $cmp = $total_type{$osid}{$b}{failed} <=> $total_type{$osid}{$a}{failed};
return $cmp if $cmp != 0;
return $a cmp $b;
};
my @node_types = sort $byfailure keys %{$total_type{$osid}};
$users_fault = 0 if !$user_image;
foreach my $type (@node_types) {
my %d = %{$total_type{$osid}{$type}};
$users_fault = 0 if $d{failed} < $d{total};
}
my %d = %{$total_osid{$osid}};
$summary .= "$d{failed}/$d{total} nodes with a ";
$summary .= $user_image ? "user" : "system";
$summary .= " osid of \"$osname\" failed to boot";
my $what = @node_types > 1 ? 'nodes' : "$node_types[0]'s";
my $line;
$line .= "$d{failed}/$d{total} $what with a ";
$line .= $user_image ? "user" : "system";
$line .= " osid of \"$osname\" failed to boot";
if ($d{failed_nonfatal} > 0) {
my $count = ($d{failed_nonfatal} == $d{failed}
? "all"
: "$d{failed_nonfatal}/$d{failed}");
$summary .= " ($count non-fatal)";
$line .= " ($count non-fatal)";
}
$summary .= ":\n";
my $byfailure = sub {
# I want undefined to mean zero!
my $cmp = $total_type{$osid}{$b}{failed} <=> $total_type{$osid}{$a}{failed};
return $cmp if $cmp != 0;
return $a cmp $b;
};
foreach my $type (sort $byfailure keys %{$total_type{$osid}}) {
if (@node_types == 1) {
my $type = $node_types[0];
my %d = %{$total_type{$osid}{$type}};
if ($d{failed} > 0) {
$summary .= " $d{failed}/$d{total} ${type}'s with this os failed to boot";
if ($d{failed_nonfatal} > 0) {
my $count = ($d{failed_nonfatal} == $d{failed}
? "all"
: "$d{failed_nonfatal}/$d{failed}");
$summary .= " ($count non-fatal)";
$summary .= add_failed_nodes($line, 2, %d);
} else {
$summary .= "$line:\n";
foreach my $type (@node_types) {
my %d = %{$total_type{$osid}{$type}};
if ($d{failed} > 0) {
$line = " $d{failed}/$d{total} ${type}'s with this os failed to boot";
if ($d{failed_nonfatal} > 0) {
my $count = ($d{failed_nonfatal} == $d{failed}
? "all"
: "$d{failed_nonfatal}/$d{failed}");
$line .= " ($count non-fatal)";
}
$summary .= add_failed_nodes($line, 4, %d);
} else {
$summary .= " $d{total} ";
$summary .= $d{total} == 1 ? "$type" : "${type}'s";
$summary .= " with this os successfully booted.\n";
}
$summary .= ".\n";
$users_fault = 0 if $d{failed} < $d{total};
} else {
$summary .= " $d{total} ";
$summary .= $d{total} == 1 ? "$type" : "${type}'s";
$summary .= " with this os successfully booted.\n";
}
}
}
......
......@@ -120,6 +120,8 @@ my $nextswapstate;
my $termswapstate;
my $isadmin = 0;
my $modifyError; # needed when emailing error
# Protos
sub fatal($);
sub CheckFWinfo($);
......@@ -900,8 +902,6 @@ elsif ($inout eq "in") {
$experiment->Report($repfile, "-b");
}
elsif ($inout eq "modify") {
my $modifyError;
#
# Prepare the Archive for the swapmod, in case we have to "roll back".
#
......@@ -1374,6 +1374,7 @@ sub cleanup()
($idleswap ? $TBOPS : "$user_name <$user_email>"),
"Cc: $expt_head_name <$expt_head_email> $rcops",
"Cc: $TBOPS $rcops",
$modifyError,
(($logname), (defined($modnsfile) ? ($modnsfile) : ())));
if ($modifyHosed) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment