Commit e4b4795b authored by Gary Wong's avatar Gary Wong
Browse files

Merge branch 'current' into 'master'.

parents ebe8baab 103e0385
......@@ -163,6 +163,16 @@ sub DESTROY {
$self->{'HASH'} = undef;
}
#
# Flush from our little cache, as for the expire daemon.
#
sub Flush($)
{
my ($self) = @_;
delete($instances{$self->uuid()});
}
#
# Refresh a class instance by reloading from the DB.
#
......@@ -879,6 +889,54 @@ sub GetSSHKeys($$)
return 0;
}
#
# Update the image status in the webtask for the instance. This is not
# clean at all, need a better way to do this.
#
sub UpdateImageStatus($$)
{
my ($self, $details) = @_;
DBQueryWarn("lock tables web_tasks write, apt_instances write, ".
" apt_instance_aggregates write")
or return {};
$self->Refresh();
if ($self->status() ne "imaging") {
goto done;
}
my $webtask = WebTask->LookupByObject($self->uuid());
if (!defined($webtask)) {
goto done;
}
#
# This will need to change; we can get updates from polling or
# from the event stream. The events are processed out of band from
# from the polling, so we have a consistency problem. In addition,
# the event stream is sending status for just a single node since
# events are bounded in size.
#
# In other words, the new image status has to be merged into the
# existing status. We have use some kind of lock to avoid scrambling
# the json data, and for now it is a table lock.
#
# Also, the blob has a timestamp in it, so we can sorta tell which
# is most recent (not perfect, but does not really need to be).
#
if (defined($webtask->image_stamp()) &&
defined($details->{'utc'}) &&
$details->{'utc'} < $webtask->image_stamp()) {
goto done;
}
$webtask->image_size($details->{'size'});
$webtask->image_status($details->{'status'});
$webtask->image_stamp($details->{'utc'});
$webtask->Store();
done:
DBQueryWarn("unlock tables");
return 0;
}
###################################################################
package APT_Instance::Aggregate;
use emdb;
......@@ -1201,6 +1259,52 @@ sub GetGeniAuthority($)
return APT_Geni::GetAuthority($self->aggregate_urn());
}
#
# Update the sliverstatus in the webtask.
#
sub UpdateWebStatus($$)
{
my ($self, $hash) = @_;
DBQueryWarn("lock tables web_tasks write")
or return {};
$self->webtask()->Refresh();
my $current = $self->webtask()->sliverstatus();
if (!defined($current)) {
$current = {};
}
#
# This will need to change; we can get updates from polling or
# from the event stream. The events are processed out of band from
# from the polling, so we have a consistency problem. In addition,
# the event stream is sending status for just a single node since
# events are bounded in size.
#
# In other words, the new node state has to be merged into the
# existing status. We have use some kind of lock to avoid scrambling
# the json data, and for now it is a table lock.
#
# Also, the blob has a timestamp in it, so we can sorta tell which
# is most recent (not perfect, but does not really need to be).
#
foreach my $urn (keys(%{ $hash })) {
my $details = $hash->{$urn};
my $node_id = $details->{'client_id'};
if (!exists($current->{$node_id}) ||
!exists($current->{$node_id}->{"utc"}) ||
!exists($details->{"utc"}) ||
$details->{"utc"} >= $current->{$node_id}->{"utc"}) {
$current->{$node_id} = $details;
}
}
$self->webtask()->sliverstatus($current);
DBQueryWarn("unlock tables");
return $current;
}
#
# Ask aggregate to terminate a sliver.
#
......
#
# Copyright (c) 2000-2015 University of Utah and the Flux Group.
# Copyright (c) 2000-2016 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -32,7 +32,7 @@ SUBDIRS =
BIN_SCRIPTS = manage_profile manage_instance manage_dataset \
create_instance rungenilib
SBIN_SCRIPTS = apt_daemon
SBIN_SCRIPTS = apt_daemon aptevent_daemon
LIB_SCRIPTS = APT_Profile.pm APT_Instance.pm APT_Dataset.pm APT_Geni.pm \
APT_Aggregate.pm
WEB_BIN_SCRIPTS = webmanage_profile webmanage_instance webmanage_dataset \
......
#!/usr/bin/perl -w
#
# Copyright (c) 2008-2016 University of Utah and the Flux Group.
#
# {{{GENIPUBLIC-LICENSE
#
# GENI Public License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and/or hardware specification (the "Work") to
# deal in the Work without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Work, and to permit persons to whom the Work
# is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Work.
#
# THE WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE WORK OR THE USE OR OTHER DEALINGS
# IN THE WORK.
#
# }}}
#
use strict;
use English;
use Getopt::Std;
use Data::Dumper;
use JSON;
#
# Look for APT things that need to be dealt with.
#
sub usage()
{
print "Usage: aptevent_daemon [-d] [-s] [-n]\n";
exit(1);
}
my $optlist = "dns";
my $debug = 0;
my $impotent = 0;
#
# Configure variables
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $TBLOGS = "@TBLOGSEMAIL@";
my $MAINSITE = @TBMAINSITE@;
my $LOGFILE = "$TB/log/aptevent_daemon.log";
# Portal pubsubd running on this port.
my $PSDPORT = 16507;
# un-taint path
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub HandleSliverStatus($$$);
sub HandleImageStatus($$$);
sub fatal($);
#
# Turn off line buffering on output
#
$| = 1;
if ($UID != 0) {
fatal("Must be root to run this script\n");
}
if (! $MAINSITE) {
exit(0);
}
#
# Check args early so we get the right DB.
#
my %options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (defined($options{"d"})) {
$debug = 1;
}
if (defined($options{"n"})) {
$impotent = 1;
}
# Do this early so that we talk to the right DB.
use vars qw($GENI_DBNAME);
$GENI_DBNAME = "geni-cm";
# Load the Testbed support stuff.
use lib "@prefix@/lib";
use emdb;
require GeniDB;
require GeniSlice;
require GeniSliver;
use libtestbed;
use emutil;
use libEmulab;
use APT_Instance;
use event;
if (!$impotent) {
if (CheckDaemonRunning("aptevent_daemon")) {
fatal("Not starting another aptevent daemon!");
}
# Go to ground.
if (! $debug) {
if (TBBackGround($LOGFILE)) {
exit(0);
}
}
if (MarkDaemonRunning("aptevent_daemon")) {
fatal("Could not mark daemon as running!");
}
}
#
# Capture all events from the local pubsubd.
#
my $localhandle = event_register("elvin://localhost:$PSDPORT", 0);
if (!$localhandle) {
fatal("Unable to register with event system");
}
#
# Subscribe to all events.
#
my $tuple = address_tuple_alloc();
if (!$tuple) {
fatal("Could not allocate an address tuple");
}
if (!event_subscribe($localhandle, \&callback, $tuple)) {
fatal("Could not subscribe to all events");
}
#
# Flag to know when there are no more events to process.
#
my $gotone;
sub callback($$$)
{
my ($handle, $note, $data) = @_;
$gotone++;
my $time = time();
my $site = event_notification_get_site($handle, $note);
my $urn = event_notification_get_string($handle, $note, "urn");
my $slice = event_notification_get_string($handle, $note, "slice");
my $type = event_notification_get_string($handle, $note, "type");
my $details = event_notification_get_string($handle, $note, "details");
if ($debug) {
print "Event: $time $site $type $urn $slice $details\n";
}
my $instance = APT_Instance->LookupBySlice($slice);
return
if (!defined($instance));
if ($type eq "SLIVERSTATUS") {
HandleSliverStatus($site, $instance, $details);
goto done;
}
elsif ($type eq "IMAGESTATUS") {
HandleImageStatus($site, $instance, $details);
goto done;
}
done:
$instance->Flush();
}
#
# Handle an Sliverstatus event.
#
sub HandleSliverStatus($$$)
{
my ($site, $instance, $details) = @_;
if (exists($instance->AggregateHash()->{$site})) {
my $sliver = $instance->AggregateHash()->{$site};
if ($impotent) {
print "Would update sliver status for $sliver from details\n";
}
else {
if ($debug) {
print "Updating sliver status for sliver from $details\n";
}
$details = eval { decode_json($details) };
if ($@) {
print STDERR "Could not decode json data: $details\n";
return;
}
$sliver->UpdateWebStatus({$site => $details});
}
}
}
#
# Handle an IMAGESTATUS event.
#
sub HandleImageStatus($$$)
{
my ($site, $instance, $details) = @_;
if (exists($instance->AggregateHash()->{$site})) {
if ($impotent) {
print "Would update image status for $instance from details\n";
}
else {
if ($debug) {
print "Updating image status for instance from $details\n";
}
$details = eval { decode_json($details) };
if ($@) {
print STDERR "Could not decode json data: $details\n";
return;
}
$instance->UpdateImageStatus($details);
}
}
}
#
# Loop processing events.
#
while (1)
{
$gotone = 1;
while ($gotone) {
$gotone = 0;
event_poll($localhandle);
}
event_poll_blocking($localhandle, 1000);
}
#
# Setup a signal handler for newsyslog.
#
sub handler()
{
my $SAVEEUID = $EUID;
$EUID = 0;
ReOpenLog($LOGFILE);
$EUID = $SAVEEUID;
}
$SIG{HUP} = \&handler
if (! ($debug || $impotent));
exit(0);
sub fatal($)
{
my ($msg) = @_;
if (! ($debug || $impotent)) {
#
# Send a message to the testbed list.
#
SENDMAIL($TBOPS,
"APT Event daemon died",
$msg,
$TBOPS);
}
MarkDaemonStopped("aptevent_daemon")
if (!$impotent);
die("*** $0:\n".
" $msg\n");
}
......@@ -908,12 +908,13 @@ sub WaitForSliver($)
# cares about. We get this on each loop, update so the web
# interface can show changes.
#
my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});
my $changed = 0;
my $statusblob = {};
foreach my $urn (keys(%{$repblob->{'details'}})) {
my $details = $repblob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
#
# Look at the last blob. If we changed, view that as progress.
#
......@@ -944,7 +945,6 @@ sub WaitForSliver($)
}
}
}
$webtask->sliverstatus($statusblob);
$laststatus = $statusblob;
if (exists($repblob->{'public_url'})) {
......@@ -1006,9 +1006,6 @@ if (ParRun({"maxwaittime" => 99999, "maxchildren" => scalar(@aggregate_list)},
}
print "$slice_urn\n";
# Count up nodes running a startup service.
my $startuprunning = 0;
#
# If we were canceled, then none of the stuff below matters, we
# are going to do a terminate.
......@@ -1021,6 +1018,9 @@ if ($instance->IsCanceled()) {
exit(0);
}
# Count up nodes running a startup service.
my $startuprunning = 0;
#
# Check the exit codes; any failure is a total failure (for now).
#
......
......@@ -459,8 +459,14 @@ sub DoSnapshot()
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
# This is convenience for the web server.
if (defined($webtask)) {
$webtask->aggregate_urn($aggregate->aggregate_urn());
$webtask->client_id($node_id);
}
}
$instance->SetStatus("imaging");
$aggregate->SetStatus("imaging");
#
# This returns pretty fast, and then the imaging takes place in
......@@ -472,6 +478,7 @@ sub DoSnapshot()
if (!defined($response)) {
$errmsg = "Internal error creating image";
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
......@@ -481,6 +488,7 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_FORBIDDEN);
$instance->SetStatus($old_status);
$aggregate->SetStatus($old_status);
goto uerror;
}
my ($image_urn, $image_url,
......@@ -541,7 +549,7 @@ sub DoSnapshot()
# Poll for a reasonable amount of time.
#
my $seconds = 1500;
my $interval = 10;
my $interval = 15;
my $ready = 0;
my $sliver_ready = 0;
my $failed = 0;
......@@ -565,25 +573,8 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_RPCERROR);
my $blob = $response->value();
if (defined($webtask)) {
# Special for imaging status display
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
if ($urn eq $sliver_urn) {
$webtask->state($details->{'state'});
$webtask->rawstate($details->{'rawstate'});
}
}
}
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = {};
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
}
$aggregate->webtask()->sliverstatus($statusblob);
$aggregate->UpdateWebStatus($blob->{'details'});
if ($blob->{'status'} eq "failed") {
$failed = 1;
......@@ -610,34 +601,34 @@ sub DoSnapshot()
$response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
$response->code() == GENIRESPONSE_RPCERROR);
$blob = $response->value();
my $imageblob = $response->value();
if (defined($webtask)) {
$webtask->image_size($blob->{'size'})
if (exists($blob->{'size'}));
if (exists($blob->{'status'})) {
#
# If the image is ready, but needs to be copied back to
# its origin, hold of ready till later. We will wait for
# the copyback to finish, see below.
#
if (defined($copyback_uuid)) {
$webtask->image_status("copying");
}
else {
$webtask->image_status($blob->{'status'});
}
my %blobcopy = %{ $imageblob };
#
# If the image is ready, but needs to be copied back to
# its origin, hold of ready till later. We will wait for
# the copyback to finish, see below.
#
if ($imageblob->{'status'} eq "ready" && defined($copyback_uuid)) {
$blobcopy{'status'} = "copying";
}
# This is also being updated by the event system.
$instance->UpdateImageStatus(\%blobcopy);
}
if ($blob->{'status'} eq "ready") {
if ($imageblob->{'status'} eq "ready") {
$ready = 1;
last;
}
elsif ($blob->{'status'} eq "failed") {
elsif ($imageblob->{'status'} eq "failed") {
$failed = 1;
last;
}
}
# Cause of image status events.
$webtask->Refresh()
if (defined($webtask));
if ($failed) {
$errmsg = "Imaging failed"
if (!defined($errmsg));
......@@ -675,6 +666,7 @@ sub DoSnapshot()
($update_profile eq "all" ? 1 : 0));
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
#
# If there is a copyback_uuid, we want to wait for that to finish.
......@@ -743,6 +735,7 @@ sub DoSnapshot()
StartMonitorInternal();
}
$instance->SetStatus("ready");
$aggregate->SetStatus("ready");
if (defined($logfile)) {
SENDMAIL($TBOPS,
"Snapshot failed",
......@@ -1345,17 +1338,8 @@ sub DoRefresh()
elsif ($blob->{'status'} eq "failed") {
$sliver->SetStatus("failed");
}
#
# Convert to something smaller, with info the web interface
# cares about.
#
my $statusblob = {};
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
my $node_id = $details->{'client_id'};
$statusblob->{$node_id} = $details;
}
$webtask->sliverstatus($statusblob);
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
}
......@@ -1678,7 +1662,7 @@ sub StartMonitorInternal(;$)
# another node right away. For reboot/reload, nothing interesting
# is going to be reported for a while.
#
sleep(15);
sleep(30);
my $seconds = ($waitforstartup ? 7200 : 900);
my $interval = 15;
......@@ -1718,27 +1702,22 @@ sub StartMonitorInternal(;$)