Commit c9faef6e authored by Leigh B Stoller's avatar Leigh B Stoller

Minor changes to update keys:

1. Do not mark nodes that are down as needing to be updated; they just
   get stuck there.

2. In the CM daemon, watch for slivers/aggregates that have been stuck in
   updating users for too long, and cancel the update. Typically this is
   because a node is wedged or not participating (CORD), so again just
   cancel and reset the state.
parent 393ecc3b
...@@ -3702,6 +3702,8 @@ sub UpdateAccounts($$) ...@@ -3702,6 +3702,8 @@ sub UpdateAccounts($$)
print STDERR "Could not get sliver list for $self\n"; print STDERR "Could not get sliver list for $self\n";
return -1; return -1;
} }
my $marked = 0;
foreach my $sliver (@slivers) { foreach my $sliver (@slivers) {
next next
if (ref($sliver) ne "GeniSliver::Node"); if (ref($sliver) ne "GeniSliver::Node");
...@@ -3712,13 +3714,22 @@ sub UpdateAccounts($$) ...@@ -3712,13 +3714,22 @@ sub UpdateAccounts($$)
$node->erole() ne EmulabConstants::TBDB_RSRVROLE_NODE() || $node->erole() ne EmulabConstants::TBDB_RSRVROLE_NODE() ||
$node->op_mode() eq EmulabConstants::TBDB_NODESTATE_ALWAYSUP()); $node->op_mode() eq EmulabConstants::TBDB_NODESTATE_ALWAYSUP());
#
# Skip nodes that are not ready; we will just get stuck on them.
#
if (! ($sliver->status() eq "ready" && $sliver->state() eq "started")) {
print STDERR "UpdateAccounts: Skipping sliver ".
$sliver->status() . "/" . $sliver->state() . "\n";
next;
}
$node->MarkForUpdate(); $node->MarkForUpdate();
$marked++;
if ($amapi) { if ($amapi) {
$sliver->SetState("updating_users"); $sliver->SetState("updating_users");
} }
} }
if ($amapi) { if ($amapi && $marked) {
$self->SetState("updating_users"); $self->SetState("updating_users");
} }
return 0; return 0;
......
#!/usr/bin/perl -w #!/usr/bin/perl -w
# #
# Copyright (c) 2008-2017 University of Utah and the Flux Group. # Copyright (c) 2008-2018 University of Utah and the Flux Group.
# #
# {{{GENIPUBLIC-LICENSE # {{{GENIPUBLIC-LICENSE
# #
...@@ -1270,6 +1270,55 @@ sub CheckIdle() ...@@ -1270,6 +1270,55 @@ sub CheckIdle()
} }
} }
#
# Check Aggregate states/status.
#
# Walks every geni_aggregates row of type 'Aggregate' whose status is not
# 'ready'. An aggregate that has been in the "updating_users" state for
# more than 30 minutes has its account update canceled via
# CancelUpdateAccounts(). When $impotent is set the cancel is only
# reported on STDERR and nothing is changed.
#
# Takes no arguments; the return value is not meaningful.
#
sub CheckAggregates()
{
    # How long an aggregate may sit in updating_users before we give up.
    my $max_update_seconds = 30 * 60;

    my $query_result =
        GeniDB::DBQueryWarn("select idx from geni_aggregates ".
                            "where type='Aggregate' and status!='ready'");
    #
    # NOTE(review): DBQueryWarn presumably returns a false value when the
    # query fails; bail out rather than die on the method call below.
    #
    return
        if (!$query_result);

    while (my ($idx) = $query_result->fetchrow_array()) {
        my $aggregate = GeniAggregate->Lookup($idx);
        if (!defined($aggregate)) {
            # aggregate is gone, lets not worry.
            next;
        }
        my $slice = $aggregate->GetSlice();
        if (!defined($slice)) {
            # slice is gone, lets not worry.
            $aggregate->Flush();
            next;
        }
        #
        # Only look at the aggregate if we can lock its slice; otherwise
        # leave it alone for this pass.
        #
        if ($slice->Lock() == 0) {
            my $state = $aggregate->state();
            my $stamp = $aggregate->status_state_timestamp();

            #
            # If an aggregate has been trying to update users for more
            # than 30 minutes, kill that and return the aggregate to
            # normal status.
            #
            if ($state eq "updating_users" &&
                time() - $stamp > $max_update_seconds) {
                if ($impotent) {
                    print STDERR
                        "Would cancel update accounts for $aggregate\n";
                }
                else {
                    print STDERR "Canceling update accounts for $aggregate\n";
                    $aggregate->CancelUpdateAccounts();
                }
            }
            $slice->UnLock();
        }
        # Flush both objects whether or not we got the lock.
        $slice->Flush();
        $aggregate->Flush();
    }
    return;
}
# #
# Notify the "portal" of the status of all slivers. Might be the local # Notify the "portal" of the status of all slivers. Might be the local
# portal or the Cloudlab portal. Or both since a cluster might be part # portal or the Cloudlab portal. Or both since a cluster might be part
...@@ -1543,6 +1592,7 @@ while (1) { ...@@ -1543,6 +1592,7 @@ while (1) {
} }
NotifyPortal(); NotifyPortal();
HandleImageTracking(); HandleImageTracking();
CheckAggregates();
# Be certain stale info is gone. # Be certain stale info is gone.
Experiment->FlushAll(); Experiment->FlushAll();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment