Commit a8631011 authored by Leigh B. Stoller's avatar Leigh B. Stoller

* A bit more support for swapmod from Start Run. Mostly bookkeeping

  info so we have a record of it.

* First attempt at dealing with nodes that do not respond to the
  synchronous events that are sent from start and stop run. Rather
  than failing, attempt to figure out which nodes are actually dead,
  and save some state in the DB associated with the run. The current
  method for figuring out which nodes are dead is the node_status
  table, since the event scheduler is the only thing that knows what
  nodes did not respond. Will probably revisit this very soon.

* Bug fixes of course.

* Start implementing a Run object to replace some of the code in the
  Instance object.
parent a8cda915
......@@ -40,6 +40,10 @@ my $RSYNC = "/usr/local/bin/rsync";
my %templates = ();
my $debug = 1;
#
# Flag bits for the functions below. Each flag is a distinct bit so that
# callers can test for it with a bitwise AND.
#
sub STARTRUN_FLAGS_FIRSTRUN() { return 0x1; }
sub STARTRUN_FLAGS_SWAPMOD()  { return 0x2; }
#
# Grab a new GUID for a template. We do not have to use it of course.
#
......@@ -1632,6 +1636,10 @@ use English;
use libArchive;
use overload ('""' => 'Stringify');
#
# Flag bits for the functions below. These simply forward to the
# definitions in the Template package so the values live in one place.
#
sub STARTRUN_FLAGS_FIRSTRUN()
{
    return Template::STARTRUN_FLAGS_FIRSTRUN();
}

sub STARTRUN_FLAGS_SWAPMOD()
{
    return Template::STARTRUN_FLAGS_SWAPMOD();
}
#
# Lookup a template experiment and create a class instance to return.
#
......@@ -1702,6 +1710,11 @@ sub Create($$)
my $query = "insert into experiment_template_instances set ".
join(",", map("$_='" . $argref->{$_} . "'", keys(%{$argref})));
# Give it an initial start time; updated later.
$query .= ", "
if (defined($argref) && scalar(keys%{$argref}));
$query .= "start_time=now() ";
my $query_result = DBQueryWarn($query);
return undef
if (! $query_result);
......@@ -2032,35 +2045,42 @@ sub NewRun($$;$)
my ($self, $runid, $description) = @_;
# Must be a real reference.
return -1
return undef
if (! ref($self));
my $exptidx = $self->exptidx();
my $dclause = "";
my $run = Template::Instance::Run->Create($self, $runid, $description);
return undef
if (! defined($run));
if (defined($description) && $description ne "") {
$description = DBQuoteSpecial($description);
$dclause = "description=$description , ";
# Grab the run idx and store that back into the instance since it
# is now the current run.
my $runidx = $run->idx();
my $idx = $self->idx();
if (! DBQueryWarn("update experiment_template_instances set ".
" runidx='$runidx' ".
"where idx='$idx'")) {
$run->Delete();
return undef;
}
Refresh($self);
return $run;
}
my $query_result =
DBQueryWarn("insert into experiment_runs set ".
" $dclause exptidx='$exptidx', runid='$runid' ");
#
# Lookup a run by its runid.
#
sub LookupRun($$)
{
my ($self, $runid) = @_;
# Must be a real reference.
return -1
if (! $query_result);
# Grab the insert record and store that back into the instance since it
# is now the current run.
my $runidx = $query_result->insertid;
my $idx = $self->idx();
if (! ref($self));
DBQueryWarn("update experiment_template_instances set ".
" runidx='$runidx' ".
"where idx='$idx'")
or return -1;
my $exptidx = $self->exptidx();
return Refresh($self);
return Template::Instance::Run->LookupByRunID($exptidx, $runid);
}
#
......@@ -2089,6 +2109,10 @@ sub DeleteCurrentRun($)
"where exptidx='$exptidx' and idx='$runidx'")
or return -1;
DBQueryWarn("delete from experiment_template_instance_deadnodes ".
"where instance_idx='$idx' and idx='$runidx'")
or return -1;
DBQueryWarn("update experiment_template_instances set runidx=NULL ".
"where idx='$idx'")
or return -1;
......@@ -2099,26 +2123,38 @@ sub DeleteCurrentRun($)
#
# Start the (first) experiment run.
#
sub StartFirstRun($)
sub StartRun($;$)
{
my ($self) = @_;
my ($self, $flags) = @_;
# Must be a real reference.
return -1
if (! ref($self));
$flags = 0
if (!defined($flags));
my $idx = $self->idx();
my $runidx = $self->runidx();
my $exptidx = $self->exptidx();
my $clause = (($flags & STARTRUN_FLAGS_SWAPMOD()) ? ", swapmod=1 " : "");
my $archive_tag;
#
# Grab the current archive tag.
#
return -1
if (!defined($runidx));
if (ArchiveTag($self, \$archive_tag) < 0);
DBQueryWarn("update experiment_runs set start_time=now() ".
DBQueryWarn("update experiment_runs set start_time=now(), ".
" starting_archive_tag='$archive_tag' $clause ".
"where exptidx='$exptidx' and idx='$runidx'")
or return -1;
return $self->Start();
return $self->Start()
if ($flags & STARTRUN_FLAGS_FIRSTRUN());
return 0;
}
#
......@@ -2171,7 +2207,8 @@ sub FinalizeCurrentRun($)
return -1
if (ArchiveTag($self, \$archive_tag) < 0);
DBQueryWarn("update experiment_runs set archive_tag='$archive_tag' ".
DBQueryWarn("update experiment_runs set ".
" ending_archive_tag='$archive_tag' ".
"where exptidx='$exptidx' and idx='$runidx'")
or return -1;
......@@ -2220,7 +2257,7 @@ sub LastRun($)
my ($self) = @_;
# Must be a real reference.
return -1
return undef
if (! ref($self));
my $exptidx = $self->exptidx();
......@@ -2234,6 +2271,25 @@ sub LastRun($)
return $query_result->fetchrow_hashref();
}
#
# Return the first run recorded for this instance: the experiment_runs row
# with the lowest idx for the instance's exptidx, as a hash reference.
# Returns undef when not invoked on a reference or when the query fails.
#
sub FirstRun($)
{
    my ($instance) = @_;

    # Only meaningful as an instance method.
    unless (ref($instance)) {
        return undef;
    }

    my $exptidx = $instance->exptidx();
    my $rows    = DBQueryWarn("select * from experiment_runs ".
                              "where exptidx='$exptidx' ".
                              "order by idx asc limit 1");
    unless ($rows) {
        return undef;
    }
    return $rows->fetchrow_hashref();
}
#
# Return current run.
......@@ -2314,6 +2370,14 @@ sub RunBindingList($$)
my $exptidx = $self->exptidx();
my $runidx = $self->runidx();
if (! defined($runidx)) {
#
# This happens when called during initial swapin.
#
%$prval = %results;
return 0;
}
my $query_result =
DBQueryWarn("select name,value ".
" from experiment_run_bindings ".
......@@ -2886,7 +2950,251 @@ sub InitializeEnvVariables($;$)
return 0;
}
############################################################################
package Template::Instance::Run;
use libdb;
use libtestbed;
use libtblog;
use English;
use libArchive;
use overload ('""' => 'Stringify');
#
# Flag bits for the functions below, forwarded from the Template package
# so that all three packages agree on the values.
#
sub STARTRUN_FLAGS_FIRSTRUN()
{
    return Template::STARTRUN_FLAGS_FIRSTRUN();
}

sub STARTRUN_FLAGS_SWAPMOD()
{
    return Template::STARTRUN_FLAGS_SWAPMOD();
}
#
# Stringify for output. Used by the '""' overload, so a run object can be
# interpolated directly into messages. The result has the form
# "[Run:<runid> exptidx:<exptidx> Template:<guid>/<vers>]".
#
sub Stringify($)
{
    my ($run) = @_;

    # The template identity comes from the instance this run belongs to.
    my $guid    = $run->instance()->template()->guid();
    my $vers    = $run->instance()->template()->vers();
    my $exptidx = $run->exptidx();
    my $runid   = $run->runid();

    return "[Run:" . $runid .
        " exptidx:" . $exptidx .
        " Template:" . $guid . "/" . $vers . "]";
}
#
# Create a new run object. Class method: inserts a row into experiment_runs
# for the given instance's exptidx and runid (plus the description, when one
# is supplied and non-empty), then returns the object looked up by the new
# row's insert id. Returns undef if invoked on an object or if the insert
# fails.
#
sub Create($$$;$)
{
    my ($class, $instance, $runid, $description) = @_;

    # Class method only.
    if (ref($class)) {
        return undef;
    }

    my $exptidx = $instance->exptidx();

    # Optional description clause; DBQuoteSpecial supplies the quoting.
    my $dclause = ((defined($description) && $description ne "")
                   ? "description=" . DBQuoteSpecial($description) . " , "
                   : "");

    my $insert = DBQueryWarn("insert into experiment_runs set ".
                             " $dclause exptidx='$exptidx', runid='$runid' ");
    if (! $insert) {
        return undef;
    }

    # The insert id is the idx of the new run.
    my $runidx = $insert->insertid;

    return Template::Instance::Run->LookupByID($exptidx, $runidx);
}
#
# Delete this run from the DB: its bindings, its experiment_runs row, and
# its dead node records. Returns 0 on success, -1 if not invoked on an
# object or if any of the delete queries fails (later deletes are then
# skipped).
#
sub Delete($)
{
    my ($self) = @_;

    # Must be a real reference.
    return -1
        if (! ref($self));

    my $runidx       = $self->idx();
    my $exptidx      = $self->exptidx();
    # This variable must match the name interpolated into the deadnodes
    # query below; a mismatch silently yields instance_idx='' and the
    # dead node rows are never removed.
    my $instance_idx = $self->instance()->idx();

    DBQueryWarn("delete from experiment_run_bindings ".
                "where exptidx='$exptidx' and runidx='$runidx'")
        or return -1;

    DBQueryWarn("delete from experiment_runs ".
                "where exptidx='$exptidx' and idx='$runidx'")
        or return -1;

    # NOTE(review): MarkNodeDead() inserts into this table using a 'runidx'
    # column, while this delete matches on 'idx' -- confirm against the
    # table schema which column is intended here.
    DBQueryWarn("delete from experiment_template_instance_deadnodes ".
                "where instance_idx='$instance_idx' and idx='$runidx'")
        or return -1;

    return 0;
}
#
# Lookup a run by its experiment idx and run idx within the experiment.
# Class method. Returns a blessed run object holding the experiment_runs
# row ('DB') and a backlink to the owning instance ('INSTANCE'), or undef
# if the row or the instance cannot be found.
#
sub LookupByID($$$)
{
    my ($class, $exptidx, $idx) = @_;

    my $rows = DBQueryWarn("select * ".
                           " from experiment_runs ".
                           "where exptidx='$exptidx' and idx='$idx'");
    unless ($rows && $rows->numrows) {
        return undef;
    }
    my $dbrow = $rows->fetchrow_hashref();

    # Backlink to the instance
    my $instance = Template::Instance->LookupByExptidx($exptidx);
    unless (defined($instance)) {
        return undef;
    }

    return bless({ 'DB'       => $dbrow,
                   'INSTANCE' => $instance }, $class);
}
#
# Lookup a run by its experiment idx and run ID. Class method. Returns a
# blessed run object holding the experiment_runs row ('DB') and a backlink
# to the owning instance ('INSTANCE'), or undef if the row or the instance
# cannot be found.
#
sub LookupByRunID($$$)
{
    my ($class, $exptidx, $runid) = @_;

    my $rows = DBQueryWarn("select * ".
                           " from experiment_runs ".
                           "where exptidx='$exptidx' and runid='$runid'");
    unless ($rows && $rows->numrows) {
        return undef;
    }
    my $dbrow = $rows->fetchrow_hashref();

    # Backlink to the instance
    my $instance = Template::Instance->LookupByExptidx($exptidx);
    unless (defined($instance)) {
        return undef;
    }

    return bless({ 'DB'       => $dbrow,
                   'INSTANCE' => $instance }, $class);
}
#
# Accessors. Each returns -1 when not invoked on a reference; otherwise the
# named column of the cached experiment_runs row, or the instance backlink.
#
sub field($$)
{
    my ($self, $column) = @_;

    return -1
        if (! ref($self));
    return $self->{'DB'}->{$column};
}

sub idx($)
{
    my ($self) = @_;
    return field($self, 'idx');
}

sub exptidx($)
{
    my ($self) = @_;
    return field($self, 'exptidx');
}

sub runid($)
{
    my ($self) = @_;
    return field($self, 'runid');
}

sub start_time
{
    return field($_[0], 'start_time');
}

sub stop_time
{
    return field($_[0], 'stop_time');
}

# The tag accessors map onto the starting/ending archive tag columns.
sub start_tag
{
    return field($_[0], 'starting_archive_tag');
}

sub stop_tag
{
    return field($_[0], 'ending_archive_tag');
}

sub instance($)
{
    my ($self) = @_;

    return -1
        if (! ref($self));
    return $self->{'INSTANCE'};
}

sub template($)
{
    my ($self) = @_;

    return -1
        if (! ref($self));
    return instance($self)->template();
}
#
# Refresh by reloading from the DB. Replaces the cached experiment_runs row
# with a fresh copy selected by this run's exptidx and idx. Returns 0 on
# success, -1 if not invoked on an object, the query fails, or the row no
# longer exists.
#
sub Refresh($)
{
    my ($run) = @_;

    unless (ref($run)) {
        return -1;
    }

    my $exptidx = $run->exptidx();
    my $idx     = $run->idx();

    my $rows = DBQueryWarn("select * ".
                           " from experiment_runs ".
                           "where exptidx='$exptidx' and idx='$idx'");
    unless ($rows && $rows->numrows) {
        return -1;
    }

    $run->{'DB'} = $rows->fetchrow_hashref();
    return 0;
}
#
# Get list of bindings for the run. Fills the hash referenced by $prval
# with name => value pairs from experiment_run_bindings for this run.
# Returns 0 on success; returns -1, leaving the caller's hash untouched,
# if not invoked on an object or the query fails.
#
sub BindingList($$)
{
    my ($self, $prval) = @_;

    # Must be a real reference.
    unless (ref($self)) {
        return -1;
    }

    my $runidx  = $self->idx();
    my $exptidx = $self->exptidx();

    my $rows = DBQueryWarn("select name,value ".
                           "  from experiment_run_bindings ".
                           "where runidx='$runidx' and exptidx='$exptidx'");
    unless ($rows) {
        return -1;
    }

    # Collect locally first so the caller's hash is replaced in one step.
    my %bindings = ();
    while (my ($key, $val) = $rows->fetchrow_array()) {
        $bindings{$key} = $val;
    }
    %$prval = %bindings;
    return 0;
}
#
# Mark a node as dead for this run; this is recorded in the DB. The node's
# vname is looked up in the reserved table and stored with the record.
# Returns 0 on success, -1 if not invoked on an object, the node has no
# reserved row or vname, or a query fails.
#
sub MarkNodeDead($$)
{
    my ($self, $node_id) = @_;

    # Must be a real reference.
    unless (ref($self)) {
        return -1;
    }

    my $runidx       = $self->idx();
    my $instance_idx = $self->instance()->idx();
    my $exptidx      = $self->exptidx();

    # The vname is stored alongside the node_id, so fetch it first.
    my $rows = DBQueryWarn("select vname from reserved ".
                           "where node_id='$node_id'");
    unless ($rows && $rows->numrows) {
        return -1;
    }

    my ($vname) = $rows->fetchrow_array();
    unless (defined($vname)) {
        return -1;
    }

    my $inserted =
        DBQueryWarn("insert into experiment_template_instance_deadnodes set ".
                    " instance_idx='$instance_idx', exptidx='$exptidx', ".
                    " runidx='$runidx', node_id='$node_id', vname='$vname'");
    return ($inserted ? 0 : -1);
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -9,7 +9,7 @@ use strict;
use Getopt::Std;
use POSIX qw(isatty setsid);
use POSIX qw(strftime);
use Errno qw(EDQUOT);
use Errno qw(EDQUOT ETIMEDOUT);
use XML::Simple;
use Data::Dumper;
......@@ -54,6 +54,7 @@ my $clean = 0;
my $doswapmod = 0;
my $paramfile;
my %parameters = ();
my %deadnodes = ();
my $action;
my $description;
my $runid;
......@@ -84,6 +85,7 @@ my $logname;
my $dbuid;
my $exptidx;
my $template;
my $run;
my $instance;
# For the END block below.
my $cleaning = 0;
......@@ -102,6 +104,7 @@ sub fatal($$);
sub sighandler($);
sub SignalProgAgents($);
sub SendCompletionEvent();
sub CheckForDeadNodes();
#
# Testbed Support libraries
......@@ -353,10 +356,16 @@ if ($waitmode) {
# Might not be a current run, which is okay.
#
if (defined($instance->runidx())) {
# Ug. I need to figure out how to hook into the event sequence
# mechanism so I can use a completion event.
$run = Template::Instance::Run->LookupByID($instance->exptidx(),
$instance->runidx());
if (!defined($run)) {
tbdie("Cannot get current run object for $instance!");
}
print "Asking program agents to stop ... this will take a moment.\n";
SignalProgAgents("HALT");
SignalProgAgents("HALT") == 0
or $ignoreerrors
or CheckForDeadNodes();
# This sets the stop time.
$instance->StopCurrentRun() == 0
......@@ -366,7 +375,7 @@ if (defined($instance->runidx())) {
print "Asking loghole to sync the logfiles ... this will take a minute.\n";
$instance->LogHole() == 0
or $ignoreerrors
or fatal(-1, "Loghole failed");
or CheckForDeadNodes();
print "Dumping the instance database ... this will take a minute.\n";
$instance->DumpDB() == 0
......@@ -394,7 +403,6 @@ if ($action eq "stop") {
goto done;
}
#
# Clean/Clear if requested before generating the new run, in case there
# is a problem.
......@@ -413,8 +421,10 @@ if ($clean) {
#
# Generate a new run.
#
$instance->NewRun($runid, $description) == 0
or fatal(-1, "Could not create new experiment run for $instance!");
$run = $instance->NewRun($runid, $description);
if (!defined($run)) {
fatal(-1, "Could not create new experiment run for $instance!");
}
#
# At this point, we need to force a cleanup no matter how we exit.
......@@ -422,6 +432,10 @@ $instance->NewRun($runid, $description) == 0
#
$justexit = 0;
# Mark the start time of the run.
$instance->StartRun(($doswapmod ? Template::STARTRUN_FLAGS_SWAPMOD() : 0)) == 0
or fatal(-1, "Could not mark start of new run for $instance!");
#
# And the bindings for the run ...
#
......@@ -442,12 +456,14 @@ $instance->WriteEnvVariables() == 0
or fatal(-1, "Could not write environment strings for program agents");
print "Asking program agents to reload ... this will take a moment.\n";
SignalProgAgents("RELOAD");
SignalProgAgents("RELOAD") == 0
or $ignoreerrors
or CheckForDeadNodes();
if ($doswapmod) {
#
# Now do the swapmod, using the original NS file for now.
# Now do the swapmod, using the original NS file for now. The environ
# variables will be passed to the NS reparse by parse-ns wrapper script.
#
my $archivedir = libArchive::TBUserFileArchiveDirectory($pid, $eid);
my $nsfile = "$archivedir/nsdata/nsfile.ns";
......@@ -471,6 +487,23 @@ else {
print "Experiment run '$runid' has been started.\n";
done:
#
# If any nodes were recorded as dead, mail a summary to the user, with
# testbed operations copied. The subject reflects which action was running.
#
if (keys(%deadnodes)) {
    # One line per unresponsive node.
    my $message = join("", map("$_ appears to be unresponsive\n",
                               keys(%deadnodes)));
    my $subject = ($action eq "stop"
                   ? "Node failures during Stop Run"
                   : "Node failures during Start New Run");

    SENDMAIL($dbuid, $subject, $message, $TBOPS, "CC: $TBOPS");
}
# Stop the web interface from spewing.
TBExptCloseLogFile($pid, $eid)
if (defined($logname));
......@@ -661,8 +694,9 @@ sub ParseArgs()
#
sub cleanup()
{
# only for start new run; stop run failures do not do this!
$instance->DeleteCurrentRun()
if (defined($instance));
if (defined($instance) && defined($run));
}
sub fatal($$)
......@@ -707,9 +741,17 @@ sub SignalProgAgents($)
$agent = "__all_program-agents";
}
system("$tevc -w -t 60 -e $pid/$eid now $agent $action") == 0
or $ignoreerrors
or fatal(-1, "Could not send event notification!");
system("$tevc -w -t 30 -e $pid/$eid now $agent $action");
if ($?) {
#
# Timeout is important; other errors are real errors.
#
return ETIMEDOUT
if ($? >> 8 == ETIMEDOUT);
fatal(-1, "Could not send event notification!");
}
return 0;
}
sub SendCompletionEvent()
......@@ -727,6 +769,36 @@ sub SendCompletionEvent()
or fatal(-1, "Could not send completion event notification!");
}
#
# Look to see if any nodes have died. This is currently our best way to
# determine likely non-responders to the events and loghole operations,
# since right now there is no information from the event scheduler about
# it. Will probably need to add that, but lets try this for now. The main
# problem is plab nodes.
#
sub CheckForDeadNodes()
{
my %nodestatuslist;
$experiment->NodeStatusList(\%nodestatuslist) == 0
or fatal(-1, "Could not get node status list");
foreach my $node_id (keys(%nodestatuslist)) {
next
if ($nodestatuslist{$node_id});
#
# Node is dead. Need to record this as part of the template record.
# This hash is for later, to send a summary report to the user.
#