Commit 4f59bce7 authored by Leigh B Stoller

Rework how we update the webtasks for status and imaging, since we now

update from two places (here, and from the apt event daemon).
parent bbeabb89
......@@ -163,6 +163,16 @@ sub DESTROY {
$self->{'HASH'} = undef;
}
#
# Drop this instance from the package-level %instances cache (keyed by
# uuid), as needed by the expire daemon.
#
sub Flush($)
{
    my $instance = shift;
    my $uuid     = $instance->uuid();

    # The removed cache entry (if any) is the value of the last statement.
    delete($instances{$uuid});
}
#
# Refresh a class instance by reloading from the DB.
#
......@@ -879,6 +889,54 @@ sub GetSSHKeys($$)
return 0;
}
#
# Merge a new image status report into the webtask for this instance.
#
# $details is a hash ref; the 'size', 'status' and 'utc' slots are read.
#
# Returns 0 once the tables have been unlocked again. If the table lock
# query fails, an empty hash ref is returned and nothing is touched.
#
# Not a clean approach: updates arrive both from polling and from the
# event stream, which are processed out of band from each other, so a
# table lock is held while the status is merged to avoid scrambling the
# json data. The blob carries a timestamp, which gives a rough (not
# perfect, but good enough) idea of which report is the most recent.
#
sub UpdateImageStatus($$)
{
    my ($self, $details) = @_;

    DBQueryWarn("lock tables web_tasks write, apt_instances write, ".
                " apt_instance_aggregates write")
        or return {};

    # Reload under the lock; only an instance that is imaging is updated.
    $self->Refresh();
    if ($self->status() eq "imaging") {
        my $task = WebTask->LookupByObject($self->uuid());

        if (defined($task)) {
            #
            # A report is stale only when both the stored stamp and the
            # incoming stamp are defined and the incoming one is older.
            #
            my $stale = (defined($task->image_stamp()) &&
                         defined($details->{'utc'}) &&
                         $details->{'utc'} < $task->image_stamp());

            if (!$stale) {
                $task->image_size($details->{'size'});
                $task->image_status($details->{'status'});
                $task->image_stamp($details->{'utc'});
                $task->Store();
            }
        }
    }
    # Every path that obtained the lock releases it here.
    DBQueryWarn("unlock tables");
    return 0;
}
###################################################################
package APT_Instance::Aggregate;
use emdb;
......@@ -1201,6 +1259,52 @@ sub GetGeniAuthority($)
return APT_Geni::GetAuthority($self->aggregate_urn());
}
#
# Merge newly reported sliver status into the sliverstatus stored in the
# webtask for this aggregate.
#
# $hash is a hash ref keyed by urn; each value is a per-node details hash
# whose 'client_id' slot names the node and whose optional 'utc' slot is
# the timestamp of the report.
#
# Returns the merged status, a hash ref keyed by client_id. If the table
# lock query fails, an empty hash ref is returned instead.
#
# Updates arrive both from polling and from the event stream, which are
# processed out of band from each other, and an event carries status for
# just a single node since events are bounded in size. So the new node
# state has to be merged into the existing status rather than replacing
# it, under a table lock so the json data does not get scrambled. The
# timestamp in each blob gives a rough (not perfect, but good enough)
# idea of which report is the most recent.
#
sub UpdateWebStatus($$)
{
    my ($self, $reported) = @_;

    DBQueryWarn("lock tables web_tasks write")
        or return {};

    # Start from whatever is stored right now.
    $self->webtask()->Refresh();
    my $merged = $self->webtask()->sliverstatus();
    $merged = {}
        if (!defined($merged));

    foreach my $urn (keys(%{ $reported })) {
        my $details = $reported->{$urn};
        my $node_id = $details->{'client_id'};

        #
        # Keep the stored entry only when both sides carry a timestamp
        # and the incoming one is strictly older; otherwise the incoming
        # details win.
        #
        my $stale = (exists($merged->{$node_id}) &&
                     exists($merged->{$node_id}->{"utc"}) &&
                     exists($details->{"utc"}) &&
                     !($details->{"utc"} >= $merged->{$node_id}->{"utc"}));
        next
            if ($stale);

        $merged->{$node_id} = $details;
    }
    $self->webtask()->sliverstatus($merged);

    DBQueryWarn("unlock tables");
    return $merged;
}
#
# Ask aggregate to terminate a sliver.
#
......
......@@ -908,12 +908,13 @@ sub WaitForSliver($)
# cares about. We get this on each loop, update so the web
# interface can show changes.
#
my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});
my $changed = 0;
my $statusblob = {};
foreach my $urn (keys(%{$repblob->{'details'}})) {
my $details = $repblob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
#
# Look at the last blob. If we changed, view that as progress.
#
......@@ -944,7 +945,6 @@ sub WaitForSliver($)
}
}
}
$webtask->sliverstatus($statusblob);
$laststatus = $statusblob;
if (exists($repblob->{'public_url'})) {
......@@ -1006,9 +1006,6 @@ if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregate_list)},
}
print "$slice_urn\n";
# Count up nodes running a startup service.
my $startuprunning = 0;
#
# If we were canceled, then none of the stuff below matters, we
# are going to do a terminate.
......@@ -1021,6 +1018,9 @@ if ($instance->IsCanceled()) {
exit(0);
}
# Count up nodes running a startup service.
my $startuprunning = 0;
#
# Check the exit codes; any failure is a total failure (for now).
#
......
......@@ -459,8 +459,14 @@ sub DoSnapshot()
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
# This is convenience for the web server.
if (defined($webtask)) {
$webtask->aggregate_urn($aggregate->aggregate_urn());
$webtask->client_id($node_id);
}
}
$instance->SetStatus("imaging");
$aggregate->SetStatus("imaging");
#
# This returns pretty fast, and then the imaging takes place in
......@@ -472,6 +478,7 @@ sub DoSnapshot()
if (!defined($response)) {
$errmsg = "Internal error creating image";
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
......@@ -481,6 +488,7 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_FORBIDDEN);
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
my ($image_urn, $image_url,
......@@ -541,7 +549,7 @@ sub DoSnapshot()
# Poll for a reasonable amount of time.
#
my $seconds = 1500;
my $interval = 10;
my $interval = 15;
my $ready = 0;
my $sliver_ready = 0;
my $failed = 0;
......@@ -565,25 +573,8 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_RPCERROR);
my $blob = $response->value();
if (defined($webtask)) {
# Special for imaging status display
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
if ($urn eq $sliver_urn) {
$webtask->state($details->{'state'});
$webtask->rawstate($details->{'rawstate'});
}
}
}
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = {};
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
}
$aggregate->webtask()->sliverstatus($statusblob);
$aggregate->UpdateWebStatus($blob->{'details'});
if ($blob->{'status'} eq "failed") {
$failed = 1;
......@@ -610,34 +601,34 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_RPCERROR);
$blob = $response->value();
my $imageblob = $response->value();
if (defined($webtask)) {
$webtask->image_size($blob->{'size'})
if (exists($blob->{'size'}));
if (exists($blob->{'status'})) {
#
# If the image is ready, but needs to be copied back to
# its origin, hold off ready till later. We will wait for
# the copyback to finish, see below.
#
if (defined($copyback_uuid)) {
$webtask->image_status("copying");
}
else {
$webtask->image_status($blob->{'status'});
}
my %blobcopy = %{ $imageblob };
#
# If the image is ready, but needs to be copied back to
# its origin, hold off ready till later. We will wait for
# the copyback to finish, see below.
#
if ($imageblob->{'status'} eq "ready" && defined($copyback_uuid)) {
$blobcopy{'status'} = "copying";
}
# This is also being updated by the event system.
$instance->UpdateImageStatus(\%blobcopy);
}
if ($blob->{'status'} eq "ready") {
if ($imageblob->{'status'} eq "ready") {
$ready = 1;
last;
}
elsif ($blob->{'status'} eq "failed") {
elsif ($imageblob->{'status'} eq "failed") {
$failed = 1;
last;
}
}
# Cause of image status events.
$webtask->Refresh()
if (defined($webtask));
if ($failed) {
$errmsg = "Imaging failed"
if (!defined($errmsg));
......@@ -675,6 +666,7 @@ sub DoSnapshot()
($update_profile eq "all" ? 1 : 0));
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
#
# If there is a copyback_uuid, we want to wait for that to finish.
......@@ -743,6 +735,7 @@ sub DoSnapshot()
StartMonitorInternal();
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
if (defined($logfile)) {
SENDMAIL($TBOPS,
"Snapshot failed",
......@@ -1345,17 +1338,8 @@ sub DoRefresh()
elsif ($blob->{'status'} eq "failed") {
$sliver->SetStatus("failed");
}
#
# Convert to something smaller, with info the web interface
# cares about.
#
my $statusblob = {};
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
}
$webtask->sliverstatus($statusblob);
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
}
......@@ -1678,7 +1662,7 @@ sub StartMonitorInternal(;$)
# another node right away. For reboot/reload, nothing interesting
# is going to be reported for a while.
#
sleep(15);
sleep(30);
my $seconds = ($waitforstartup ? 7200 : 900);
my $interval = 15;
......@@ -1718,27 +1702,22 @@ sub StartMonitorInternal(;$)
}
my $blob = $response->value();
#
# Convert to something smaller, with info the web interface
# cares about.
#
my $statusblob = {};
my $executing = 0;
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
# Startup command is still running.
$executing++
if (exists($details->{'execute_state'}) &&
$details->{'execute_state'} ne "exited");
}
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
}
$webtask->sliverstatus($statusblob);
# Look for nodes still executing a startup command. This only matters
# when we were asked to wait for the startup services to finish.
my $executing = 0;
if ($waitforstartup) {
    #
    # $statusblob is a hash ref keyed by node id, so dereference it as a
    # hash and look up each node by the loop variable (not by the literal
    # string 'node_id', which would never match a real node).
    #
    foreach my $node_id (keys(%{$statusblob})) {
        my $details = $statusblob->{$node_id};

        # Startup command is still running.
        $executing++
            if (exists($details->{'execute_state'}) &&
                $details->{'execute_state'} ne "exited");
    }
}
#
# We poll until the status goes ready. Might not be a good idea.
#
......
<?php
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -59,14 +59,6 @@ function Do_CloneStatus()
$taskdata = $webtask->TaskData();
$blob = array();
if ($webtask->exited()) {
# Success, but not sure what to report. Come back to this later.
$blob["exited"] = $webtask->exited();
$blob["exitcode"] = $webtask->exitcode();
if (isset($taskdata["image_name"])) {
$blob["image_name"] = $taskdata["image_name"];
}
}
#
# Size is in KB to avoid bigint problems. But kill the KB.
#
......@@ -79,9 +71,35 @@ function Do_CloneStatus()
else {
$blob["image_size"] = 0;
}
$blob["node_status"] = $taskdata["rawstate"];
$blob["image_status"] = $taskdata["image_status"];
#
# Lets put the node status in too. The backend has helpfully told us
# the aggregate and node to track down the status.
#
if (isset($taskdata["aggregate_urn"]) && isset($taskdata["client_id"])) {
$sliver = InstanceSliver::Lookup($instance, $taskdata["aggregate_urn"]);
if ($sliver) {
$slwebtask = WebTask::Lookup($sliver->webtask_id());
$sliverstatus = $slwebtask->TaskValue("sliverstatus");
if ($sliverstatus) {
foreach ($sliverstatus as $node_id => $node_status) {
if ($node_id == $taskdata["client_id"]) {
$blob["node_status"] = $node_status["rawstate"];
break;
}
}
}
}
}
if ($webtask->exited()) {
# Success, but not sure what to report. Come back to this later.
$blob["exited"] = $webtask->exited();
$blob["exitcode"] = $webtask->exitcode();
if (isset($taskdata["image_name"])) {
$blob["image_name"] = $taskdata["image_name"];
}
}
SPITAJAX_RESPONSE($blob);
}
......
......@@ -886,11 +886,30 @@ function Do_SnapshotStatus()
else {
$blob["image_size"] = 0;
}
$blob["node_status"] = $taskdata["rawstate"];
$blob["image_status"] = $taskdata["image_status"];
if (isset($taskdata["copyback_uuid"])) {
$blob["copyback_uuid"] = $taskdata["copyback_uuid"];
}
$blob["image_status"] = $taskdata["image_status"];
#
# Lets put the node status in too. The backend has helpfully told us
# the aggregate and node to track down the status.
#
if (isset($taskdata["aggregate_urn"]) && isset($taskdata["client_id"])) {
$sliver = InstanceSliver::Lookup($instance, $taskdata["aggregate_urn"]);
if ($sliver) {
$slwebtask = WebTask::Lookup($sliver->webtask_id());
$sliverstatus = $slwebtask->TaskValue("sliverstatus");
if ($sliverstatus) {
foreach ($sliverstatus as $node_id => $node_status) {
if ($node_id == $taskdata["client_id"]) {
$blob["node_status"] = $node_status["rawstate"];
break;
}
}
}
}
}
if ($webtask->exited()) {
# Success, but not sure what to report. Come back to this later.
$blob["exited"] = $webtask->exited();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment