Commit 3cdbe5f7 authored by Leigh Stoller's avatar Leigh Stoller

First attempt at fixing deadlock when stitching. This happens if both

sides try to stitch at the same time. One side has to back off and let
the other proceed. The problem is with the slice locking, which had to
be changed to allow one side to drop the lock so the other side could
proceed. I ended up doing this with an additional stitching lock, used
only when stitching.
parent 6600d18c
......@@ -54,6 +54,7 @@ CREATE TABLE `geni_slices` (
`expires` datetime default NULL,
`shutdown` datetime default NULL,
`locked` datetime default NULL,
`stitch_locked` datetime default NULL,
`creator_uuid` varchar(40) NOT NULL default '',
`creator_urn` tinytext,
`name` tinytext,
......
......@@ -1605,45 +1605,82 @@ sub GetTicketAuxAux($$$$$$$$$)
foreach my $linkname (keys(%external_linkmap)) {
my ($linkref, $ifaceref) = @{ $external_linkmap{$linkname} };
my $slice_urn = $slice->urn();
my $retries = 10;
#
# Already have a reserved tag? This could happen if the other CM
# acted first and talked to this CM before we saw the ticket
# request. Or this is an update and we already have tags reserved.
#
my $tag = VLan::GetReservedVlanTag($slice_experiment, $linkname);
goto tagged
if (defined($tag));
while ($retries) {
#
# Already have a reserved tag? This could happen if the other CM
# acted first and talked to this CM before we saw the ticket
# request. Or this is an update and we already have tags reserved.
#
my $tag = VLan::GetReservedVlanTag($slice_experiment, $linkname);
last
if (defined($tag));
my ($fh, $filename) = tempfile(UNLINK => 0);
if (!defined($fh)) {
print STDERR "Could not create temp file for rspec\n";
$response = GeniResponse->Create(GENIRESPONSE_ERROR);
goto bad;
}
print $fh GeniXML::Serialize($rspec);
close($fh);
system("$RESERVEVLANS '$slice_urn' '$linkname' $filename");
if ($?) {
unlink($filename);
$response = GeniResponse->Create(GENIRESPONSE_ERROR, undef,
"Could not reserve vlan tags for $linkname");
goto bad;
}
unlink($filename);
my ($fh, $filename) = tempfile(UNLINK => 0);
if (!defined($fh)) {
print STDERR "Could not create temp file for rspec\n";
$response = GeniResponse->Create(GENIRESPONSE_ERROR);
goto bad;
}
print $fh GeniXML::Serialize($rspec);
close($fh);
system("$RESERVEVLANS '$slice_urn' '$linkname' $filename");
if ($CHILD_ERROR) {
unlink($filename);
#
# Need to find out what vlan was assigned.
#
$tag = VLan::GetReservedVlanTag($slice_experiment, $linkname);
if (!defined($tag)) {
print STDERR "Did not find the reserved tag for $linkname\n";
$response = GeniResponse->Create(GENIRESPONSE_ERROR, undef,
"Error reserving vlan tag");
goto bad;
#
# Positive exit value indicates the other side returned busy.
# This could be for several reasons, not the least of which is
# that it is trying to stitch the slice at the same time, and
# this side is deadlocked with the other side.
#
# So, lets try to backoff, dropping the stitching lock for a
# while so that other side can proceed. Once we get the lock
# back, look to see if we now have a tag (other side was able
# to proceed). If so we are done, otherwise try again and repeat
# for a while.
#
if (($CHILD_ERROR >> 8) == 1) {
print STDERR "reservevlans for $linkname returned busy, ".
"will retry in a bit.\n";
$slice->StitchUnLock();
# random backoff.
sleep(int(rand(20)) + 5);
# We should be able to get the lock at some point.
for (my $r = 20; $r > 0; $r--) {
goto again
if ($slice->StitchLock() == 0);
sleep(5);
}
print STDERR
"Could not get the stitching lock back. Giving up\n";
}
$response = GeniResponse->Create(GENIRESPONSE_ERROR, undef,
"Could not reserve vlan tags for $linkname");
goto bad;
}
else {
unlink($filename);
#
# Need to find out what vlan was assigned.
#
$tag = VLan::GetReservedVlanTag($slice_experiment, $linkname);
if (defined($tag)) {
GeniXML::SetText("vlantag", $linkref, $tag);
last;
}
# This should not happen.
print STDERR "Did not find the reserved tag for $linkname\n";
$response = GeniResponse->Create(GENIRESPONSE_ERROR, undef,
"Error reserving vlan tag");
goto bad;
}
# Try again.
again:
$retries--;
}
tagged:
GeniXML::SetText("vlantag", $linkref, $tag);
}
print GeniXML::Serialize($rspec);
......
......@@ -1606,23 +1606,15 @@ sub ReserveVlanTags($)
#
# Already exists locally.
#
if ($slice->Lock() != 0) {
if ($slice->StitchLock() != 0) {
return GeniResponse->BusyResponse();
}
#
# Confirm that the certificate is the same.
#
if ($slice->cert() ne $slice_cert) {
$response = GeniResponse->Create(GENIRESPONSE_FORBIDDEN, undef,
"Slice certificate mismatch");
goto done;
}
}
else {
#
# Create a placeholder slice. Have to watch for a concurrent
# slice creation through the normal path, in which case the user
# needs to set if the slice was first created on this path.
# slice creation through the normal path, in which case the code
# needs to see if the slice was first created on this path.
#
my ($auth,$type,$id) = GeniHRN::Parse($slice_urn);
my $sa_urn = GeniHRN::Generate($auth, "authority", "sa");
......@@ -1632,16 +1624,28 @@ sub ReserveVlanTags($)
$response = GeniResponse->Create(GENIRESPONSE_ERROR);
goto done;
}
$slice = GeniSlice->Create($slice_certificate,
undef, $authority, undef, 1);
$slice = GeniSlice->Create($slice_certificate, undef, $authority);
if (!defined($slice)) {
$response = GeniResponse->Create(GENIRESPONSE_ERROR);
goto done;
}
# Slice is returned locked. We will not bother to remove this slice
# on failure since it will get aged out automatically, and it avoids
# dealing with concurrency issues on this path.
#
# Concurrency requires that we try for the lock after we create
# it, since in fact it might not be us that created it.
#
if ($slice->StitchLock() != 0) {
return GeniResponse->BusyResponse();
}
}
#
# Confirm that the certificate is the same.
#
if ($slice->cert() ne $slice_cert) {
$response = GeniResponse->Create(GENIRESPONSE_FORBIDDEN, undef,
"Slice certificate mismatch");
goto done;
}
my $slice_experiment = GeniCM::GeniExperiment($slice);
if (!defined($slice_experiment)) {
print STDERR "Could not create new Geni slice experiment!\n";
......@@ -1859,7 +1863,7 @@ sub ReserveVlanTags($)
$vlan->Destroy();
}
$slice->UnLock()
$slice->StitchUnLock()
if (defined($slice));
return $response;
}
......
......@@ -135,6 +135,7 @@ sub Lookup($$)
$self->{'CERT'} = $certificate;
$self->{'BINDINGS'} = undef;
$self->{'LOCKED'} = 0;
$self->{'STITCHLOCKED'} = 0;
# Add to cache.
$slices{$self->idx()} = $self;
......@@ -168,8 +169,13 @@ sub Create($$$$;$$)
push(@insert_data, "sa_uuid='$sa_uuid'");
push(@insert_data, "exptidx=$exptidx")
if (defined($exptidx));
push(@insert_data, "locked=now()")
if (defined($lock) && $lock);
# This is no good; should not create slices locked without
# making sure this is really the creator. see below
if (defined($lock) && $lock) {
push(@insert_data, "locked=now()");
push(@insert_data, "stitch_locked=now()");
}
my $safe_hrn = DBQuoteSpecial($certificate->hrn());
my $safe_uuid = DBQuoteSpecial($certificate->uuid());
......@@ -230,8 +236,9 @@ sub Create($$$$;$$)
return undef;
}
$idx = $curidx;
goto didit;
}
# Slice exists, no inserts needed. Just lookup.
goto lookup;
}
# Do this after above code since we might need to overwrite entry.
push(@insert_data, "idx='$idx'");
......@@ -242,13 +249,13 @@ sub Create($$$$;$$)
DBQueryWarn("unlock tables");
return undef;
}
didit:
lookup:
DBQueryWarn("unlock tables");
my $slice = GeniSlice->Lookup($idx);
return undef
if (!defined($slice));
$slice->{'LOCKED'} = $$
$slice->{'LOCKED'} = $slice->{'STITCHLOCKED'} = $$
if (defined($lock) && $lock);
return $slice;
}
......@@ -270,6 +277,7 @@ sub isplaceholder($) { return field($_[0], "isplaceholder"); }
sub cert($) { return $_[0]->{'CERT'}->cert(); }
sub GetCertificate($) { return $_[0]->{'CERT'}; }
sub LOCKED($) { return $_[0]->{'LOCKED'}; }
sub STITCHLOCKED($) { return $_[0]->{'STITCHLOCKED'}; }
#
# Stringify for output.
......@@ -392,20 +400,24 @@ sub Lock($)
DBQueryWarn("lock tables geni_slices write")
or return -1;
# Slice lock always takes both locks. Holder can drop the stitch lock.
my $query_result =
DBQueryWarn("select locked from geni_slices ".
"where idx='$idx' and locked is null");
DBQueryWarn("select locked,stitch_locked from geni_slices ".
"where idx='$idx' and ".
" locked is null and stitch_locked is null");
if (!$query_result || !$query_result->numrows) {
DBQueryWarn("unlock tables");
return 1;
}
$query_result =
DBQueryWarn("update geni_slices set locked=now() where idx='$idx'");
DBQueryWarn("update geni_slices set ".
"locked=now(),stitch_locked=now() where idx='$idx'");
DBQueryWarn("unlock tables");
return 1
if (!$query_result);
$self->{'LOCKED'} = $$;
$self->{'STITCHLOCKED'} = $$;
return 0;
}
sub UnLock($)
......@@ -416,10 +428,62 @@ sub UnLock($)
return 1
if (!$self->LOCKED());
DBQueryWarn("update geni_slices set locked=NULL where idx='$idx'")
DBQueryWarn("update geni_slices set ".
"locked=NULL,stitch_locked=NULL where idx='$idx'")
or return -1;
$self->{'LOCKED'} = 0;
$self->{'STITCHLOCKED'} = 0;
return 0;
}
#
# The stitching lock is used solely for controlling concurrency related
# to stitching.
#
sub StitchLock($)
{
my ($self) = @_;
my $idx = $self->idx();
# We already have it locked.
return 0
if ($self->STITCHLOCKED());
DBQueryWarn("lock tables geni_slices write")
or return -1;
my $query_result =
DBQueryWarn("select stitch_locked from geni_slices ".
"where idx='$idx' and ".
" stitch_locked is null");
if (!$query_result || !$query_result->numrows) {
DBQueryWarn("unlock tables");
return 1;
}
$query_result =
DBQueryWarn("update geni_slices set ".
" stitch_locked=now() where idx='$idx'");
DBQueryWarn("unlock tables");
return 1
if (!$query_result);
$self->{'STITCHLOCKED'} = $$;
return 0;
}
sub StitchUnLock($)
{
my ($self) = @_;
my $idx = $self->idx();
return 1
if (!$self->STITCHLOCKED());
DBQueryWarn("update geni_slices set ".
" stitch_locked=NULL where idx='$idx'")
or return -1;
$self->{'STITCHLOCKED'} = 0;
return 0;
}
......
......@@ -37,6 +37,7 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
# Protos
sub fatal($);
sub busy();
sub ReserveLocalTags(@);
sub ReserveRemoteTags($$@);
......@@ -221,8 +222,9 @@ exit(0);
sub ReserveRemoteTags($$@)
{
my ($authority, $othertags, @tags) = @_;
# Lets avoid terminal retry.
my $retries = 10;
# Lets avoid terminal retry. Try a couple of times, and
# return to caller so it can decide what to do.
my $retries = 2;
my $method_args = {};
$method_args->{'credentials'} = [$credential->asString()];
......@@ -247,7 +249,8 @@ again:
$response->code() != GENIRESPONSE_SEARCHFAILED) {
print STDERR "Could not reserve tags at $authority - Error: ";
print STDERR " " . $response->output() . "\n";
fatal("");
# This does not return.
busy();
}
if ($response->code() == GENIRESPONSE_BUSY) {
if ($retries) {
......@@ -366,6 +369,16 @@ sub fatal($)
print STDERR "*** $0:\n".
" $msg\n";
# exit value important.
# Exit value important. negative value indicates fatal error to caller.
exit(-1);
}
sub busy()
{
if (defined($vlan)) {
# Clears all the tags.
$vlan->ClearReservedVlanTag();
$vlan->Destroy();
}
# Exit value important; positive integer indicate BUSY to caller.
exit(1);
}
#
# Add secondary lock to geni_slices for stitching.
#
use strict;
use GeniDB;
sub DoUpdate($$$)
{
my ($dbhandle, $dbname, $version) = @_;
DBSetDefault($dbhandle);
if (!DBSlotExists("geni_slices", "stitch_locked")) {
DBQueryFatal("alter table geni_slices ".
" add `stitch_locked` datetime default NULL");
}
return 0;
}
1;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment