Commit e5d36e0d authored by Leigh Stoller's avatar Leigh Stoller

Rework how we store the sliver/slice status from the clusters:

In the beginning, the number and size of experiments was small, and so
storing the entire slice/sliver status blob as json in the web task was
fine, even though we had to lock tables to prevent races between the
event updates and the local polling.

But lately the size of those json blobs is getting huge and the lock is
bogging things down, to the point that we cannot keep up with the number
of events coming from all the clusters and we get really far behind.

So I have moved the status blobs out of the per-instance web task and
into new tables, one per slice and one per node (sliver). This keeps
the blobs very small and thus the lock time very small. So now we can
keep up with the event stream.

If we grow big enough that this problem becomes significant again, we can
switch to innodb for the per-sliver table and do row locking instead of
table locking, but I do not think that will happen.
parent 60e65004
This diff is collapsed.
......@@ -45,6 +45,7 @@ my $optlist = "dnsv";
my $debug = 0;
my $impotent = 0;
my $verbose = 0;
my $count = 0;
#
# Configure variables
......@@ -62,9 +63,10 @@ $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin:/usr/site/bin';
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub HandleSliverStatus($$$);
sub HandleSliverStatus($$$$);
sub HandleSliceStatus($$$);
sub HandleImageStatus($$$);
sub HandleFrisbeeStatus($$$);
sub HandleFrisbeeStatus($$$$);
sub fatal($);
#
......@@ -166,16 +168,17 @@ sub callback($$$)
my $time = time();
my $site = event_notification_get_site($handle, $note);
my $urn = event_notification_get_string($handle, $note, "urn");
my $slice = event_notification_get_string($handle, $note, "slice");
my $type = event_notification_get_string($handle, $note, "type");
my $details = event_notification_get_string($handle, $note, "details");
# Ignore extraneous events. They happen when listening to the local
# cluster pubsubd.
return
if (! (defined($site) && defined($slice)));
my $urn = event_notification_get_string($handle, $note, "urn");
my $type = event_notification_get_string($handle, $note, "type");
my $details = event_notification_get_string($handle, $note, "details");
#
# Not sure why this is happening, but sometime the slice urn has
# extra double quotes around it. Kill them so the instance lookup
......@@ -187,16 +190,25 @@ sub callback($$$)
# Debugging.
return
if (0 && $slice !~ /stoller/);
$count++;
if ($debug || $verbose) {
print "Event: $time $site $type $urn $slice $details\n";
print "Event: $count $time $site $type $urn $slice $details\n";
}
my $instance = APT_Instance->LookupBySlice($slice);
return
if (!defined($instance));
if (0) {
goto done;
}
if ($type eq "SLIVERSTATUS") {
HandleSliverStatus($site, $instance, $details);
HandleSliverStatus($site, $urn, $instance, $details);
goto done;
}
elsif ($type eq "SLICESTATUS") {
HandleSliceStatus($site, $instance, $details);
goto done;
}
elsif ($type eq "IMAGESTATUS") {
......@@ -204,7 +216,7 @@ sub callback($$$)
goto done;
}
elsif ($type eq "FRISBEESTATUS") {
HandleFrisbeeStatus($site, $instance, $details);
HandleFrisbeeStatus($site, $urn, $instance, $details);
goto done;
}
done:
......@@ -218,9 +230,11 @@ sub callback($$$)
#
# Handle a Sliverstatus event.
#
sub HandleSliverStatus($$$)
sub HandleSliverStatus($$$$)
{
my ($site, $instance, $details) = @_;
my ($site, $sliver_urn, $instance, $json) = @_;
print "HandleSliverStatus: $site, $sliver_urn, $instance\n";
if (exists($instance->AggregateHash()->{$site})) {
my $sliver = $instance->AggregateHash()->{$site};
......@@ -229,14 +243,42 @@ sub HandleSliverStatus($$$)
}
else {
if ($debug || $verbose) {
print "Updating sliver status for sliver from $details\n";
print "Updating sliver status for sliver from $json\n";
}
$details = eval { decode_json($details) };
my $hash = eval { decode_json($json) };
if ($@) {
print STDERR "Could not decode json data: $details\n";
print STDERR "Could not decode json data: $json\n";
return;
}
$sliver->UpdateWebStatus({$site => $details});
$sliver->UpdateSliverStatus($sliver_urn, $hash, $json);
}
}
}
#
# Handle a Slice status event.
#
sub HandleSliceStatus($$$)
{
    #
    # Arguments: the site the event came from, the instance the slice
    # belongs to, and the event details as a json string. The json is
    # decoded and handed, together with the original text, to the
    # UpdateSliceStatus() method of the instance's aggregate for that site.
    #
    my ($site, $instance, $json) = @_;

    # Trace every slice status event we are handed.
    print "HandleSliceStatus: $site, $instance\n";

    # Ignore events from a site this instance has no aggregate on.
    return
        if (!exists($instance->AggregateHash()->{$site}));

    my $sliver = $instance->AggregateHash()->{$site};

    # In impotent mode only report what would have been done.
    if ($impotent) {
        print "Would update slice status for $sliver from details\n";
        return;
    }

    print "Updating slice status for sliver from $json\n"
        if ($debug || $verbose);

    # The eval traps any exception from the decoder; on failure the
    # event is logged and dropped.
    my $hash = eval { decode_json($json) };
    if ($@) {
        print STDERR "Could not decode json data: $json\n";
        return;
    }

    # Pass along both the decoded hash and the raw json text.
    $sliver->UpdateSliceStatus($hash, $json);
}
......@@ -246,7 +288,7 @@ sub HandleSliverStatus($$$)
#
sub HandleImageStatus($$$)
{
my ($site, $instance, $details) = @_;
my ($site, $instance, $json) = @_;
if (exists($instance->AggregateHash()->{$site})) {
if ($impotent) {
......@@ -254,14 +296,14 @@ sub HandleImageStatus($$$)
}
else {
if ($debug || $verbose) {
print "Updating image status for instance from $details\n";
print "Updating image status for instance from $json\n";
}
$details = eval { decode_json($details) };
my $hash = eval { decode_json($json) };
if ($@) {
print STDERR "Could not decode json data: $details\n";
print STDERR "Could not decode json data: $json\n";
return;
}
$instance->UpdateImageStatus($details);
$instance->UpdateImageStatusNew($hash, $json);
}
}
}
......@@ -269,25 +311,25 @@ sub HandleImageStatus($$$)
#
# Handle an FRISBEESTATUS event.
#
sub HandleFrisbeeStatus($$$)
sub HandleFrisbeeStatus($$$$)
{
my ($site, $instance, $details) = @_;
my ($site, $sliver_urn, $instance, $json) = @_;
if (exists($instance->AggregateHash()->{$site})) {
my $sliver = $instance->AggregateHash()->{$site};
if (0 && $impotent) {
if ($impotent) {
print "Would update frisbee status for $sliver from details\n";
}
else {
if ($debug || $verbose) {
print "Updating frisbee status for sliver from $details\n";
print "Updating frisbee status for sliver from $json\n";
}
$details = eval { decode_json($details) };
my $hash = eval { decode_json($json) };
if ($@) {
print STDERR "Could not decode json data: $details\n";
print STDERR "Could not decode json data: $json\n";
return;
}
$sliver->UpdateFrisbeeStatus({$site => $details});
$sliver->UpdateFrisbeeStatusNew($sliver_urn, $hash, $json);
}
}
}
......@@ -316,7 +358,7 @@ while (1)
$gotone = 0;
event_poll($localhandle);
}
event_poll_blocking($localhandle, 1000);
event_poll_blocking($localhandle, 100);
}
exit(0);
......
......@@ -1063,15 +1063,15 @@ foreach my $aggobj (@aggregate_list) {
if (defined($aggobj->public_url()));
}
else {
my $statusblob = $aggobj->webtask()->sliverstatus();
my $sliverstatus = $aggobj->SliverStatus();
print Dumper($statusblob);
print Dumper($sliverstatus);
foreach my $details (values(%{ $statusblob })) {
foreach my $status (values(%{ $sliverstatus })) {
# Startup command is still running.
$startuprunning++
if (exists($details->{'execute_state'}) &&
$details->{'execute_state'} ne "exited");
if (exists($status->{"sliver_data"}->{'execute_state'}) &&
$status->{"sliver_data"}->{'execute_state'} ne "exited");
}
}
if (defined($aggobj->public_url())) {
......
......@@ -655,7 +655,7 @@ sub DoSnapshot()
my $blob = $response->value();
# This is the per-aggregate status, we always set this for web UI.
$aggregate->UpdateWebStatus($blob->{'details'});
$aggregate->UpdateSliverStatusAll($blob->{'details'});
if ($blob->{'status'} eq "failed") {
$failed = 1;
......@@ -2256,6 +2256,8 @@ sub DoRefresh()
print STDERR "No slice for instance\n";
goto killit;
}
# Shorten default timeout
Genixmlrpc->SetTimeout(30);
#
# Lock the slice in case it is doing something else, like taking
......@@ -2298,9 +2300,9 @@ sub DoRefresh()
$sliver->SetStatus("failed");
}
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
$sliver->UpdateSliverStatusAll($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
print STDERR Dumper($blob->{'details'});
}
return 0;
bad:
......@@ -2445,16 +2447,6 @@ sub DoRebootOrReload($)
goto bad;
}
gone:
# Tell the web interface something is different. Real status will
# come later when the monitor starts up.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $urn (@urns) {
my $node_id = $node_ids{$urn};
$blob->{$node_id}->{'status'} = "changing";
}
$webtask->sliverstatus($blob);
}
return 0;
bad:
print STDERR "$errmsg\n";
......@@ -2559,7 +2551,7 @@ sub DoManifests()
goto bad;
}
$sliver->SetManifest($manifest);
$sliver->UpdateWebStatus($response->value()->{'details'});
$sliver->UpdateSliverStatusAll($response->value()->{'details'});
return 0;
bad:
return 1;
......@@ -2645,7 +2637,7 @@ sub DoDeleteNodes()
}
}
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
......@@ -2654,6 +2646,12 @@ sub DoDeleteNodes()
$errmsg = "Experiment is busy, cannot lock it. Please try again later";
goto bad;
}
if (!@aggregates) {
$errmsg = "Could not find any nodes to delete!";
$errcode = 1;
goto bad;
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
......@@ -2685,15 +2683,10 @@ sub DoDeleteNodes()
my $manifest = $response->value();
$sliver->SetManifest($manifest);
# Delete the nodes from the status blob.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $node_id (@nodes) {
delete($blob->{$node_id});
}
$webtask->sliverstatus($blob);
# Delete sliver status rows.
foreach my $node_id (@nodes) {
$sliver->DeleteSliverStatus($node_id);
}
$sliver->SetStatus("provisioned");
return 0;
bad:
$sliver->SetStatus("ready");
......@@ -2890,15 +2883,15 @@ sub StartMonitorInternal(;$@)
my $blob = $response->value();
# This is the per-aggregate status, we always set this for web UI.
my $statusblob = $sliver->UpdateWebStatus($blob->{'details'});
$sliver->UpdateSliverStatusAll($blob->{'details'});
if ($debug) {
print STDERR Dumper($statusblob);
print STDERR Dumper($blob->{'details'});
}
# Look for nodes still executing
my $executing = 0;
if ($waitforstartup) {
foreach my $node_id (keys(%{$statusblob})) {
my $details = $statusblob->{'node_id'};
foreach my $urn (keys(%{$blob->{'details'}})) {
my $details = $blob->{'details'}->{$urn};
$executing++
if (exists($details->{'execute_state'}) &&
......@@ -3128,15 +3121,6 @@ sub DoPanic()
$response->output() . "\n";
return -1;
}
# Tell the web interface something is different. Real status will
# come later when the monitor starts up.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $node_id (keys(%{ $blob })) {
$blob->{$node_id}->{'status'} = "changing";
}
$webtask->sliverstatus($blob);
}
return 0;
};
my @return_codes = ();
......
#!/usr/bin/perl
#
# Copyright (c) 2002-2004, 2016 University of Utah and the Flux Group.
# Copyright (c) 2002-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -97,6 +97,8 @@ if (event_unregister($handle) == 0) {
exit(0);
my $count = 0;
sub callbackFunc($$$) {
my ($handle,$note,$data) = @_;
......@@ -106,6 +108,7 @@ sub callbackFunc($$$) {
my $slice = event_notification_get_string($handle, $note, "slice");
my $type = event_notification_get_string($handle, $note, "type");
my $details = event_notification_get_string($handle, $note, "details");
$count++;
print "Event: $time $site $type $urn $slice $details\n";
print "Event: $count $time $site $type $urn $slice $details\n";
}
......@@ -296,6 +296,41 @@ CREATE TABLE `apt_instance_history` (
KEY `profile_id` (`profile_id`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Table structure for table `apt_instance_slice_status`
--
-- One row per (uuid, aggregate_urn) pair; the slice status itself is
-- stored as text in `slice_data`.
DROP TABLE IF EXISTS `apt_instance_slice_status`;
CREATE TABLE `apt_instance_slice_status` (
`uuid` varchar(40) NOT NULL default '',
`name` varchar(16) default NULL,
`aggregate_urn` varchar(128) NOT NULL default '',
`timestamp` int(10) unsigned NOT NULL default '0',
`modified` datetime NOT NULL default '0000-00-00 00:00:00',
`slice_data` mediumtext,
PRIMARY KEY (`uuid`,`aggregate_urn`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Table structure for table `apt_instance_sliver_status`
--
-- One row per (uuid, aggregate_urn, sliver_urn) triple. `sliver_data` and
-- `frisbee_data` hold json text for the sliver and are kept in separate
-- columns.
DROP TABLE IF EXISTS `apt_instance_sliver_status`;
CREATE TABLE `apt_instance_sliver_status` (
`uuid` varchar(40) NOT NULL default '',
`name` varchar(16) default NULL,
`aggregate_urn` varchar(128) NOT NULL default '',
`sliver_urn` varchar(128) NOT NULL default '',
`resource_id` varchar(32) NOT NULL default '',
`client_id` varchar(32) NOT NULL default '',
`timestamp` int(10) unsigned NOT NULL default '0',
`modified` datetime NOT NULL default '0000-00-00 00:00:00',
`sliver_data` mediumtext,
`frisbee_data` mediumtext,
PRIMARY KEY (`uuid`,`aggregate_urn`,`sliver_urn`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
--
-- Table structure for table `apt_instances`
--
......
use strict;
use libdb;
sub DoUpdate($$$)
{
    #
    # Create the apt_instance_slice_status and apt_instance_sliver_status
    # tables when they do not already exist. The three arguments are
    # accepted but not used here. Always returns 0; DBQueryFatal is
    # relied on to deal with a failed query.
    #
    my ($dbhandle, $dbname, $version) = @_;

    # Ordered list of [table name, CREATE statement] pairs.
    my @newtables = (
        ["apt_instance_slice_status",
         "CREATE TABLE `apt_instance_slice_status` ( ".
         " `uuid` varchar(40) NOT NULL default '', ".
         " `name` varchar(16) default NULL, ".
         " `aggregate_urn` varchar(128) NOT NULL default '', ".
         " `timestamp` int(10) unsigned NOT NULL default '0', ".
         " `modified` datetime NOT NULL ".
         " default '0000-00-00 00:00:00', ".
         " `slice_data` mediumtext, ".
         " PRIMARY KEY (`uuid`,`aggregate_urn`) ".
         ") ENGINE=MyISAM DEFAULT CHARSET=latin1"],
        ["apt_instance_sliver_status",
         "CREATE TABLE `apt_instance_sliver_status` ( ".
         " `uuid` varchar(40) NOT NULL default '', ".
         " `name` varchar(16) default NULL, ".
         " `aggregate_urn` varchar(128) NOT NULL default '', ".
         " `sliver_urn` varchar(128) NOT NULL default '', ".
         " `resource_id` varchar(32) NOT NULL default '', ".
         " `client_id` varchar(32) NOT NULL default '', ".
         " `timestamp` int(10) unsigned NOT NULL default '0', ".
         " `modified` datetime NOT NULL ".
         " default '0000-00-00 00:00:00', ".
         " `sliver_data` mediumtext, ".
         " `frisbee_data` mediumtext, ".
         " PRIMARY KEY (`uuid`,`aggregate_urn`,`sliver_urn`) ".
         ") ENGINE=MyISAM DEFAULT CHARSET=latin1"],
    );

    foreach my $entry (@newtables) {
        my ($table, $statement) = @{$entry};

        # Leave a table alone if it is already there.
        next
            if (DBTableExists($table));

        DBQueryFatal($statement);
    }
    return 0;
}
# Local Variables:
# mode:perl
# End:
......@@ -750,14 +750,11 @@ function Do_GetInfo()
SPITAJAX_ERROR(-1, "Sliver is gone");
return;
}
$slwebtask = WebTask::Lookup($sliver->webtask_id());
$sliverstatus = $slwebtask->TaskValue("sliverstatus");
if ($sliverstatus) {
foreach ($sliverstatus as $node_id => $node_status) {
if ($node_id == $taskdata["client_id"]) {
$blob["node_status"] = $node_status["rawstate"];
break;
}
foreach ($sliver->StatusArray() as $status) {
if (isset($status["sliver_data"]) &&
$status["client_id"] == $taskdata["client_id"]) {
$blob["node_status"] = $status["sliver_details"]["rawstate"];
break;
}
}
}
......
......@@ -856,6 +856,34 @@ class InstanceSliver
}
return $result;
}
#
# Grab the list of sliver status rows. Turn this into a class at some point.
#
function StatusArray() {
    #
    # Return an array of the apt_instance_sliver_status rows matching this
    # object's uuid and aggregate_urn. A row with sliver_data gets an extra
    # "sliver_details" key holding the decoded json; if the row also has
    # frisbee_data, that is decoded and stored under
    # sliver_details["frisbeestatus"]. Rows without sliver_data are
    # returned as is.
    #
    $result = array();
    $uuid   = $this->uuid();
    $urn    = $this->aggregate_urn();

    $query_result =
        DBQueryFatal("select * from apt_instance_sliver_status ".
                     "where uuid='$uuid' and aggregate_urn='$urn'");

    # The per-row debugging dump to the error log was removed: it wrote
    # the complete status blobs to the log on every call.
    while ($row = mysql_fetch_array($query_result)) {
        if ($row["sliver_data"]) {
            $row["sliver_details"] = json_decode($row["sliver_data"], true);

            # Frisbee status is only attached when there is sliver
            # status to attach it to.
            if ($row["frisbee_data"]) {
                $frisbeestatus = json_decode($row["frisbee_data"], true);
                $row["sliver_details"]["frisbeestatus"] = $frisbeestatus;
            }
        }
        $result[] = $row;
    }
    return $result;
}
}
class ExtensionInfo
......
......@@ -459,14 +459,11 @@ function Do_CloneStatus()
if (isset($taskdata["aggregate_urn"]) && isset($taskdata["client_id"])) {
$sliver = InstanceSliver::Lookup($instance, $taskdata["aggregate_urn"]);
if ($sliver) {
$slwebtask = WebTask::Lookup($sliver->webtask_id());
$sliverstatus = $slwebtask->TaskValue("sliverstatus");
if ($sliverstatus) {
foreach ($sliverstatus as $node_id => $node_status) {
if ($node_id == $taskdata["client_id"]) {
$blob["node_status"] = $node_status["rawstate"];
break;
}
foreach ($sliver->StatusArray() as $status) {
if (isset($status["sliver_data"]) &&
$status["client_id"] == $taskdata["client_id"]) {
$blob["node_status"] = $status["sliver_details"]["rawstate"];
break;
}
}
}
......
......@@ -121,14 +121,15 @@ function Do_GetInstanceStatus()
$havemanifests = 0;
}
else {
if ($sliver->webtask_id() &&
$webtask = WebTask::Lookup($sliver->webtask_id())) {
$sliverstatus = $webtask->TaskValue("sliverstatus");
if ($sliverstatus) {
$blob["sliverstatus"][$sliver->aggregate_urn()] =
$sliverstatus;
$statusblob = array();
foreach ($sliver->StatusArray() as $status) {
if (isset($status["sliver_data"])) {
$statusblob[$status["client_id"]]
= $status["sliver_details"];
}
}
$blob["sliverstatus"][$sliver->aggregate_urn()] = $statusblob;
}
if ($sliver->public_url()) {
$blob["sliverurls"][] = array("name" => $sliver->aggregate_name(),
......@@ -1053,14 +1054,11 @@ function Do_SnapshotStatus()
if (isset($taskdata["aggregate_urn"]) && isset($taskdata["client_id"])) {
$sliver = InstanceSliver::Lookup($instance, $taskdata["aggregate_urn"]);
if ($sliver) {
$slwebtask = WebTask::Lookup($sliver->webtask_id());
$sliverstatus = $slwebtask->TaskValue("sliverstatus");
if ($sliverstatus) {
foreach ($sliverstatus as $node_id => $node_status) {
if ($node_id == $taskdata["client_id"]) {
$blob["node_status"] = $node_status["rawstate"];
break;
}
foreach ($sliver->StatusArray() as $status) {
if (isset($status["sliver_data"]) &&
$status["client_id"] == $taskdata["client_id"]) {
$blob["node_status"] = $status["sliver_details"]["rawstate"];
break;
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment