Commit 7e13f79b authored by Mike Hibler

Introduce a "failed" state for resource allocation.

If a background resource allocation fails, we put the lease in the "failed"
state instead of destroying it. There were some ripple effects, specifically,
the lease_daemon now checks for "failed" leases and sends messages to us at
the same frequency as for "unapproved" leases. The correct response here is
almost certainly to destroy the lease, though you can put it back in the
"unapproved" state (via modlease) and try to approve it to see what happened.

Also add background mode to approvelease since it can do time consuming
resource allocation.

Nit: clean up logfiles used in background operation.
parent 578a2a19
......@@ -88,10 +88,10 @@ use vars qw(@ISA @EXPORT);
LEASE_ACCESS_DESTROY LEASE_ACCESS_MIN LEASE_ACCESS_MAX
LEASE_STATE_VALID LEASE_STATE_UNAPPROVED LEASE_STATE_GRACE
LEASE_STATE_LOCKED LEASE_STATE_EXPIRED
LEASE_STATE_LOCKED LEASE_STATE_EXPIRED LEASE_STATE_FAILED
LEASE_ERROR_NONE LEASE_ERROR_FAILED LEASE_ERROR_BUSY
LEASE_ERROR_GONE
LEASE_ERROR_GONE LEASE_ERROR_ALLOCFAILED
GLOBAL_PERM_ANON_RO GLOBAL_PERM_USER_RO
GLOBAL_PERM_ANON_RO_IDX GLOBAL_PERM_USER_RO_IDX
......@@ -428,6 +428,7 @@ sub LEASE_ACCESS_MAX() { LEASE_ACCESS_DESTROY(); }
# Lease States
#
# String tags naming the states a lease can be in.  Each one is a
# constant function (empty prototype) returning the state name.
sub LEASE_STATE_VALID()      { 'valid' }
sub LEASE_STATE_UNAPPROVED() { 'unapproved' }
sub LEASE_STATE_FAILED()     { 'failed' }
sub LEASE_STATE_GRACE()      { 'grace' }
sub LEASE_STATE_LOCKED()     { 'locked' }
sub LEASE_STATE_EXPIRED()    { 'expired' }
......@@ -438,6 +439,7 @@ sub LEASE_ERROR_NONE() { 0; }
# Negative lease error codes, one constant function per code.
sub LEASE_ERROR_FAILED()      { -1 }
sub LEASE_ERROR_BUSY()        { -2 }
sub LEASE_ERROR_GONE()        { -3 }
sub LEASE_ERROR_ALLOCFAILED() { -4 }
# Global permissions identifiers and indexes for *_permissions tables.
sub GLOBAL_PERM_ANON_RO { "GLOBAL_ANON_RO"; }
......
......@@ -122,8 +122,9 @@ my %LEASE_VAR_DEFAULTS = (
# All leases are in this state when they are created. No resources are
# allocated in this state, but the desired resources may count against
# a quota. A lease in this state can not be accessed other than to
# destroy or approve it. The latter moves it to the valid state, a
# step that triggers resource allocation.
# destroy or approve it. The latter moves it to the valid state, a step
# that triggers resource allocation. If the allocation fails, the lease
# instead moves to the failed state.
#
# "valid"
# A valid lease can be mapped into an experiment. A lease remains valid
......@@ -137,6 +138,12 @@ my %LEASE_VAR_DEFAULTS = (
# access to a resource. Again, it is not clear what happens if the
# lease is currently mapped.
#
# "failed"
# If resource allocation fails while approving a lease, the lease winds
# up here. In the failed state, a lease has no resources assigned and
# it can only transition back to the unapproved state (to try again) or
# it can be destroyed.
#
# "grace"
# A lease in the grace state may still make resources available to
# experiments, but in a "read-only" mode. For example, for storage
......@@ -163,14 +170,16 @@ my %LEASE_VAR_DEFAULTS = (
my @LEASE_STATES = (
"unapproved",
"valid",
"failed",
"grace",
"locked",
"expired");
# Valid transitions for each state.
my %LEASE_TRANSITIONS = (
"unapproved" => { "valid" => 1, "DEAD" => 1 },
"unapproved" => { "valid" => 1, "failed" => 1, "DEAD" => 1 },
"valid" => { "grace" => 1, "locked" => 1 },
"failed" => { "unapproved" => 1, "DEAD" => 1 },
"grace" => { "valid" => 1, "expired" => 1, "locked" => 1 },
"locked" => { "valid" => 1, "expired" => 1 },
"expired" => { "unapproved" => 1, "DEAD" => 1 },
......@@ -566,7 +575,8 @@ sub GetReservations($) {
# Before doing a big honkin query, see if the lease has resources
# allocated to it.
#
if ($self->type() =~ /dataset$/ && $self->state() ne "unapproved") {
if ($self->type() =~ /dataset$/ &&
!($self->state() eq "unapproved" || $self->state() eq "failed")) {
my $lidx = $self->lease_idx();
my $query_result =
......@@ -670,8 +680,13 @@ sub AllocResources($;$$) {
}
if ($rv) {
print STDERR "$self: AllocResources: could not allocate storage.\n";
# XXX why is this here? Should already be unapproved.
# XXX perhaps because of the potential long duration of bscontrol
# and non-atomicity allowing for a change of state? Seems
# like that should be the caller's concern.
$self->UpdateState("unapproved");
return LEASE_ERROR_FAILED();
return LEASE_ERROR_ALLOCFAILED();
}
#
......@@ -703,9 +718,9 @@ sub DeallocResources($) {
if (!ref($self));
#
# If lease is in the unapproved state, assume there is nothing to do.
# If lease is unapproved or failed, assume there is nothing to do.
#
if ($self->state() eq "unapproved") {
if ($self->state() eq "unapproved" || $self->state eq "failed") {
return 0;
}
......@@ -749,9 +764,9 @@ sub CreateResourceSnapshot($$) {
if (!ref($self));
#
# If lease is in the unapproved state, assume there is nothing to do.
# If lease is unapproved or failed, assume there is nothing to do.
#
if ($self->state() eq "unapproved") {
if ($self->state() eq "unapproved" || $self->state eq "failed") {
return 0;
}
......@@ -805,9 +820,9 @@ sub DestroyResourceSnapshot($$) {
if (!ref($self));
#
# If lease is in the unapproved state, assume there is nothing to do.
# If lease is unapproved or failed, assume there is nothing to do.
#
if ($self->state() eq "unapproved") {
if ($self->state() eq "unapproved" || $self->state eq "failed") {
return 0;
}
......@@ -883,7 +898,7 @@ sub Extend($$)
# can transition to valid.
#
my $cstate = $self->state();
if ($cstate eq "unapproved" || $cstate eq "valid" ||
if ($cstate eq "unapproved" || $cstate eq "failed" || $cstate eq "valid" ||
!$self->ValidTransition("valid")) {
print STDERR
"$self: Extend: cannot transition from '$cstate' -> 'valid'\n";
......
......@@ -1394,11 +1394,17 @@ sub InitKeyDist($;$$)
}
# XXX only PC class nodes for now, since we have to ssh to it
if ($self->class ne "pc" && $self->class ne "pcvm") {
if ($self->class() ne "pc" && $self->class() ne "pcvm") {
$priv = $pub = 0;
goto done;
}
# XXX blockstore vnodes are not real nodes
if ($self->type() eq "blockstore") {
$priv = $pub = 0;
goto done;
}
my $node_id = $self->node_id();
# Get user-supplied values from virt_nodes
......
#!/usr/bin/perl -w
#
# Copyright (c) 2013-2016 University of Utah and the Flux Group.
# Copyright (c) 2013-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -35,6 +35,8 @@ use POSIX qw(strftime);
# * For 'unapproved' leases, periodically (low frequency) ping testbed-ops
# and let them know there are leases that need approving.
#
# * Ditto for 'failed' leases.
#
# * For 'locked' leases, ignore them. Maybe periodically tell testbed-ops
# about them.
#
......@@ -72,12 +74,14 @@ use POSIX qw(strftime);
my $CHECK = (15 * 60);
my $REPORT_UNAPPROVED = ( 1 * 24 * 60 * 60);
my $REPORT_FAILED = ( 1 * 24 * 60 * 60);
my $REPORT_LOCKED = (30 * 24 * 60 * 60);
my $REPORT_EXPIRED = ( 1 * 24 * 60 * 60);
## debugging
#my $CHECK = (1 * 60);
#my $REPORT_UNAPPROVED = (2 * 60);
#my $REPORT_FAILED = (2 * 60);
#my $REPORT_LOCKED = (3 * 60);
#my $REPORT_EXPIRED = (2 * 60);
......@@ -274,6 +278,35 @@ while (1) {
@report = ();
}
#
# 1b. Failed leases.
#     Walk every lease in the "failed" state, collect those that fall in
#     the first check interval of a $REPORT_FAILED period, and send one
#     notification covering all of them.
#
logit("Checking failed leases...");
foreach my $lease (@{$leases{LEASE_STATE_FAILED()}}) {
    if ($debug) {
        logit(" $lease");
    }

    # Time since the lease's state stamp, folded into the report period;
    # we only report while inside the first check interval of a period.
    my $age = $now - $lease->statestamp();
    push(@report, $lease)
        if (($age % $REPORT_FAILED) < $checkint);

    $lease->BumpLastChecked();
}
if (@report) {
    if (!$impotent) {
        logit(" Reporting failed leases: " . join(' ', @report))
            if ($debug);
        notify("Failed leases awaiting handling:\n" . lease_list(@report));
    }
    else {
        # Impotent mode: say what we would have done, send nothing.
        logit(" Would report failed leases: " . join(' ', @report));
    }
    # Leave the shared list empty for the next section.
    @report = ();
}
#
# 2. Locked leases.
# Locked leases are ones that have been made administratively
......
#!/usr/bin/perl -w
#
# Copyright (c) 2013-2014 University of Utah and the Flux Group.
# Copyright (c) 2013-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -38,11 +38,13 @@ sub usage()
print STDERR " -s state New state for the lease (defaults to 'valid')\n";
print STDERR " -w time Try for up to time seconds to lock lease (0 means forever)\n";
print STDERR " -D reason Deny the lease and destroy it\n";
print STDERR " -b Allocate resources for approved leases in the background\n";
print STDERR " name Name of lease (of form <pid>/<id>)\n";
exit(-1);
}
my $optlist = "dhs:w:D:";
my $optlist = "dhs:w:D:b";
my $debug = 0;
my $background = 0;
my $pid;
my $gid;
my $state = "valid";
......@@ -96,6 +98,9 @@ if (defined($options{h})) {
if (defined($options{d})) {
$debug++;
}
if (defined($options{b})) {
$background++;
}
if (defined($options{s})) {
$state = $options{s};
}
......@@ -152,7 +157,7 @@ if (!$lease->AccessCheck($this_user, LEASE_ACCESS_MODIFY())) {
fatal("$pid/$lname: you are not allowed to modify lease.");
}
# Aquire the lease lock before we start making state changes.
# Acquire the lease lock before we start making state changes.
if (!defined($waittime)) {
fatal("$pid/$lname: could not acquire lock, try again with -w")
if ($lease->Lock());
......@@ -174,7 +179,7 @@ if ($lease->state() ne LEASE_STATE_UNAPPROVED()) {
print "$pid/$lname: has already been approved.\n";
exit(0);
}
fatal("$pid/$lname: lease is in invalid state '$state'.");
fatal("$pid/$lname: lease is in invalid state '". $lease->state(). "'.");
}
if (!$lease->ValidTransition($state)) {
fatal("$pid/$lname: cannot approve lease to state '$state'.");
......@@ -190,10 +195,40 @@ if (defined($deny)) {
exit(0);
}
# Allocate the resources.
if ($lease->AllocResources($state)) {
#
# Finally, allocate the resources.
#
# With -b ($background) the allocation is handed to a background child:
# the process for which TBBackGround() returns true exits immediately,
# and the other one takes over the lease lock and continues below.
# $logname is the logfile handed to TBBackGround(); it is attached to the
# failure mail and removed on both the success and the failure path.
#
my $logname;
if ($background) {
    print "Resource allocation proceeding in the background ...\n";
    $logname = TBMakeLogname("approvelease");
    if (my $childpid = TBBackGround($logname)) {
        exit(0);
    }
    # We want the lock in the child.
    $lease->TakeLock();
    # Let parent exit;
    sleep(2);
}

my $rv = $lease->AllocResources($state);
if ($rv != LEASE_ERROR_NONE()) {
    # Distinguish a genuine allocation failure from anything else.
    my $msg = ($rv == LEASE_ERROR_ALLOCFAILED() ?
               "Resource allocation failed" : "Unexpected failure");
    print STDERR "$msg, contact testbed-ops.\n";

    # Nobody is watching a background run, so report by mail.
    if ($background) {
        SENDMAIL($TBOPS, "Lease resources allocation failed during approval!",
                 "Background $msg for Lease '$pid/$gid/$lname'; ".
                 "lease left in failed state!\n\n",
                 $TBOPS, undef, $logname);
        unlink($logname);
    }

    # Leave the lease in the failed state rather than destroying it.
    # NOTE(review): the lease lock is not explicitly released on this
    # path -- confirm that fatal() or process exit takes care of it.
    $lease->UpdateState(LEASE_STATE_FAILED());
    fatal("$pid/$lname: could not approve lease into state '$state'");
}

# Success: the background logfile is no longer needed.
if ($background) {
    unlink($logname);
}

$lease->Unlock();
print "$pid/$lname: approved, state is now '$state'\n";
......
......@@ -537,23 +537,33 @@ if ($approveme) {
# Let parent exit;
sleep(2);
}
if ($lease->AllocResources("valid")) {
print STDERR "Could not allocate resources, contact testbed-ops.\n";
my $rv = $lease->AllocResources("valid");
if ($rv != LEASE_ERROR_NONE()) {
my $msg = ($rv == LEASE_ERROR_ALLOCFAILED() ?
"Resource allocation failed" : "Unexpected failure");
print STDERR "$msg, contact testbed-ops.\n";
#
# Need to notify on error, if ran in the background.
#
if ($background) {
SENDMAIL($TBOPS, "Lease allocation failed!",
"Background resource allocation for Lease '$pid/$gid/$lname' ".
"failed; lease destroyed!\n\n",
$TBOPS, undef, $logname);
"Background $msg for Lease '$pid/$gid/$lname'; ".
"lease left in failed state!\n\n",
$TBOPS, undef, $logname);
unlink($logname);
} else {
if ($lease->Delete() == 0) {
fatal("Lease not created.");
}
}
if ($lease->Delete()) {
print STDERR "WARNING: could not destroy lease DB state.\n";
}
fatal("Lease not created.");
$lease->UpdateState("failed");
$lease->Unlock();
fatal("Lease left in failed state.");
}
if ($background) {
unlink($logname);
}
$lease->Unlock();
......
#!/usr/bin/perl -w
#
# Copyright (c) 2013-2014 University of Utah and the Flux Group.
# Copyright (c) 2013-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -232,9 +232,13 @@ if ($lease->DeallocResources()) {
"Background resource deallocation for Lease '$pid/$gid/$lname' ".
"failed!\n\n",
$TBOPS, undef, $logname);
unlink($logname);
}
fatal("$pid/$lname: could not deallocate resources, left in 'locked' state.");
}
if ($background) {
unlink($logname);
}
if ($lease->Delete()) {
fatal("$pid/$lname: could not destroy lease.");
......
#!/usr/bin/perl -w
#
# Copyright (c) 2013-2014 University of Utah and the Flux Group.
# Copyright (c) 2013-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -319,7 +319,8 @@ if ($state) {
"requires allocation of resources, use -R");
}
}
if ($state eq LEASE_STATE_UNAPPROVED()) {
if ($state eq LEASE_STATE_UNAPPROVED() &&
$curstate ne LEASE_STATE_FAILED) {
if ($doresources) {
if ($lease->DeallocResources()) {
fatal("$pid/$lname: could not deallocate resources ".
......@@ -388,7 +389,7 @@ if ($addattr) {
$lease->SetAttribute($1, $2)) {
fatal("$pid/$lname: could not set attribute '$addattr'.");
} else {
print "$pid/$lname: added attribute '$addattr'.\n";
print "$pid/$lname: added/changed attribute '$addattr'.\n";
}
}
......
#!/usr/bin/perl -w
#
# Copyright (c) 2013-2016 University of Utah and the Flux Group.
# Copyright (c) 2013-2017 University of Utah and the Flux Group.
#
# {{{EMULAB-LICENSE
#
......@@ -214,6 +214,7 @@ print STDERR "Found ", scalar(@lids), " leases\n"
if (@lids > 0) {
my %states = (
"valid" => 'valid',
"failed" => 'failed',
"unapproved" => 'unappr',
"grace" => 'grace',
"locked" => 'locked',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment