Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
emulab
emulab-devel
Commits
bd129835
Commit
bd129835
authored
Apr 06, 2016
by
Leigh B Stoller
Browse files
Beta version of delete node.
parent
c7e7ae60
Changes
7
Hide whitespace changes
Inline
Side-by-side
apt/APT_Instance.pm.in
View file @
bd129835
...
...
@@ -75,7 +75,7 @@ my %instances = ();
my
$
debug
=
0
;
#
Debugging
my
$
usemydevtree
=
0
;
my
$
usemydevtree
=
1
;
sub
devurl
($)
{
my
($
cmurl
)
=
@
_
;
...
...
@@ -958,6 +958,7 @@ sub UpdateImageStatus($$)
package
APT_Instance
::
Aggregate
;
use
emdb
;
use
WebTask
;
use
libtestbed
;
use
Carp
;
use
POSIX
qw
(
tmpnam
);
use
English
;
...
...
@@ -2145,5 +2146,222 @@ sub UpdateKeys($$)
return
$
response
;
}
#
# Delete some nodes.
#
# Sends a "DeleteNodes" RPC for this aggregate's slice, retrying while the
# server reports that it is unavailable or that the slice is busy.
#
# Arguments:
#   $self    - the aggregate object.
#   $perrmsg - reference to a scalar; set to the response output whenever a
#              non-success response is handed back to the caller.
#   @nodes   - the nodes to delete; passed through as the "nodes" argument.
#
# Returns undef if the user, authority, slice, context or slice credential
# cannot be obtained. Otherwise returns whatever the last RPC attempt
# returned (which may itself be undef); callers must check the code.
#
sub DeleteNodes($$@)
{
    my ($self, $perrmsg, @nodes) = @_;

    my $authority = $self->GetGeniAuthority();
    my $geniuser  = $self->instance()->GetGeniUser();
    my $urn       = $self->aggregate_urn();
    my $slice     = $self->instance()->GetGeniSlice();
    my $context   = APT_Geni::GeniContext();
    return undef
        if (! (defined($geniuser) && defined($authority) &&
               defined($slice) && defined($context)));

    my ($slice_credential, $speaksfor_credential) =
        APT_Geni::GenCredentials($slice, $geniuser, undef, 1);
    return undef
        if (!defined($slice_credential));

    # The speaksfor credential is optional; append it only when we got one.
    my $credentials = [$slice_credential->asString()];
    if (defined($speaksfor_credential)) {
        push(@$credentials, $speaksfor_credential->asString());
    }

    my $args = {
        "slice_urn"   => $slice->urn(),
        "credentials" => $credentials,
        "nodes"       => \@nodes,
    };

    my $cmurl = $authority->url();
    # Fully qualified so the call resolves regardless of the current
    # package; WaitForSliver calls it the same way.
    $cmurl = APT_Instance::devurl($cmurl)
        if ($usemydevtree);

    my $response;
    my $tries = 5;
    while ($tries > 0) {
        $tries--;
        $response =
            Genixmlrpc::CallMethod($cmurl, $context, "DeleteNodes", $args);
        last
            if (defined($response) &&
                $response->code() == GENIRESPONSE_SUCCESS);

        #
        # Busy or unavailable is worth another attempt, but only while
        # attempts remain; there is no point sleeping before giving up.
        #
        if (defined($response) && $tries > 0 &&
            ($response->code() == GENIRESPONSE_SERVER_UNAVAILABLE ||
             $response->code() == GENIRESPONSE_BUSY)) {
            print STDERR "Server for $urn reports too busy or slice busy, " .
                "waiting a while ...\n";
            sleep(int(rand(20)) + 10);
            next;
        }

        # Hard failure, or retries exhausted: report and hand back as is.
        $$perrmsg = $response->output()
            if (defined($response));
        return $response;
    }
    return $response;
}
#
# Poll the aggregate with SliceStatus() until the sliver reports "ready" or
# "failed", the instance is canceled, or the wait time runs out.
#
# The wait starts at 900 seconds and is polled every 15 seconds. Each time a
# node appears or changes state the remaining time is topped up, so the wait
# only expires after a stretch with no progress.
#
# Side effects visible here: refreshes the aggregate and its webtask, resets
# the webtask output/exitcode, updates the web status and public URL, sets
# the aggregate status to "ready" or "failed", and marks the webtask exited.
#
# Returns 0 on success or cancel, otherwise the webtask exit code
# (1 for a failure, GENIRESPONSE_TIMEDOUT for a timeout).
#
sub WaitForSliver($)
{
    my ($self) = @_;
    my $aggobj = $self;
    $aggobj->Refresh();

    my $webtask   = $aggobj->webtask();
    my $authority = $aggobj->GetGeniAuthority();
    my $cmurl     = $authority->url();
    my $urn       = $authority->urn();
    $webtask->Refresh();
    $webtask->output("");
    $webtask->exitcode(0);

    # Debugging
    # NOTE(review): $cmurl is not used after this point in this routine;
    # kept because devurl() is not visible here -- confirm it can be dropped.
    $cmurl = APT_Instance::devurl($cmurl);

    my $seconds  = 900;
    my $interval = 15;
    my $ready    = 0;
    my $failed   = 0;
    my $rpcfail  = 0;
    my $public_url;
    my $repblob;
    my $laststatus;

    while ($seconds > 0) {
        sleep($interval);
        $seconds -= $interval;

        my $response = $aggobj->SliceStatus();

        # Anything other than success or one of the retryable codes is fatal.
        if (!defined($response) || !defined($response->value()) ||
            ($response->code() != GENIRESPONSE_SUCCESS &&
             $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
             $response->code() != GENIRESPONSE_BUSY &&
             $response->code() != GENIRESPONSE_RPCERROR)) {
            print STDERR "SliverStatus failed";
            if (defined($response)) {
                print STDERR ": " . $response->output();
                print STDERR Dumper($response);
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, " .
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
            }
            else {
                # Give the web interface something to show for this failure.
                $webtask->output("No response from the aggregate, " .
                                 "please try again later.");
            }
            print STDERR "\n";
            $failed = 1;
            last;
        }

        # Tolerate a run of RPC errors before giving up.
        if ($response->code() == GENIRESPONSE_RPCERROR) {
            if ($rpcfail > 10) {
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, " .
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
                $failed = 1;
                last;
            }
            $rpcfail++;
            next;
        }
        $rpcfail = 0;

        next
            if ($response->code() == GENIRESPONSE_BUSY ||
                $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE);

        $repblob = $response->value();

        #
        # Convert to something smaller, with info the web interface
        # cares about. We get this on each loop, update so the web
        # interface can show changes.
        #
        my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});

        foreach my $sliver_urn (keys(%{ $repblob->{'details'} })) {
            my $details = $repblob->{'details'}->{$sliver_urn};
            my $node_id = $details->{'client_id'};

            #
            # Look at the last blob. If we changed, view that as progress.
            #
            # The idea is that as long as progress is being made we keep
            # waiting, by resetting the waittime if it gets below a
            # threshold. That way, if we go too long with nothing happening,
            # we will stop. The user can always use the Refresh button on
            # the status page.
            #
            next
                if (!defined($laststatus));

            if (!exists($laststatus->{$node_id})) {
                $seconds = 600
                    if ($seconds < 600);
            }
            elsif (exists($details->{"rawstate"}) &&
                   $laststatus->{$node_id}->{"rawstate"} ne
                   $details->{"rawstate"}) {
                # This is IG specific.
                $seconds = 300
                    if ($seconds < 300);
            }
            elsif ($laststatus->{$node_id}->{"status"} ne
                   $details->{"status"}) {
                $seconds = 450
                    if ($seconds < 450);
            }
        }
        $laststatus = $statusblob;

        if (exists($repblob->{'public_url'})) {
            $public_url = $repblob->{'public_url'};
            $aggobj->SetPublicURL($public_url);
        }

        if ($repblob->{'status'} eq "ready") {
            $ready = 1;
            last;
        }
        elsif ($repblob->{'status'} eq "failed") {
            $failed = 1;
            print STDERR "*** $urn failed\n";
            $webtask->output("Experiment setup on $urn failed");
            last;
        }
        elsif ($aggobj->instance()->IsCanceled()) {
            last;
        }
    }

    if ($aggobj->instance()->IsCanceled()) {
        $webtask->Exited(0);
        return 0;
    }
    if ($failed || !$ready) {
        $aggobj->SetStatus("failed");
        if ($failed) {
            # The failure path already recorded its own message; do not
            # overwrite it with the timeout text.
            $webtask->Exited(1);
        }
        else {
            # XXX Need better handling for timeout.
            print STDERR "*** $urn timed out.\n";
            $webtask->output("Experiment setup on $urn timed out");
            $webtask->Exited(GENIRESPONSE_TIMEDOUT);
        }
        return $webtask->exitcode();
    }
    $aggobj->SetStatus("ready");
    $webtask->Exited(0);
    return 0;
}
#
_Always_
make
sure
that
this
1
is
at
the
end
of
the
file
...
1
;
apt/create_instance.in
View file @
bd129835
...
...
@@ -849,169 +849,13 @@ $instance->ComputeNodeCounts();
#
Genixmlrpc
->
SetTimeout
(
60
);
#
# Poll one aggregate with SliceStatus() until the sliver reports "ready" or
# "failed", the instance is canceled, or the wait time runs out.
#
# The wait starts at 900 seconds and is polled every 15 seconds. Each time a
# node appears or changes state the remaining time is topped up, so the wait
# only expires after a stretch with no progress.
#
# Uses the file-level $instance for the cancel check.
#
# Returns 0 on success or cancel, otherwise the webtask exit code
# (1 for a failure, GENIRESPONSE_TIMEDOUT for a timeout).
#
sub WaitForSliver($)
{
    my ($ref) = @_;
    my $aggobj = $ref;
    $aggobj->Refresh();

    my $webtask   = $aggobj->webtask();
    my $authority = $aggobj->_authority();
    my $cmurl     = $authority->url();
    my $urn       = $authority->urn();
    $webtask->Refresh();

    # Debugging
    # NOTE(review): $cmurl is not used after this point in this routine;
    # kept because devurl() is not visible here -- confirm it can be dropped.
    $cmurl = APT_Instance::devurl($cmurl);

    my $seconds  = 900;
    my $interval = 15;
    my $ready    = 0;
    my $failed   = 0;
    my $rpcfail  = 0;
    my $public_url;
    my $repblob;
    my $laststatus;

    while ($seconds > 0) {
        sleep($interval);
        $seconds -= $interval;

        my $response = $aggobj->SliceStatus();

        # Anything other than success or one of the retryable codes is fatal.
        if (!defined($response) || !defined($response->value()) ||
            ($response->code() != GENIRESPONSE_SUCCESS &&
             $response->code() != GENIRESPONSE_SERVER_UNAVAILABLE &&
             $response->code() != GENIRESPONSE_BUSY &&
             $response->code() != GENIRESPONSE_RPCERROR)) {
            print STDERR "SliverStatus failed";
            if (defined($response)) {
                print STDERR ": " . $response->output();
                print STDERR Dumper($response);
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, " .
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
            }
            else {
                # Give the web interface something to show for this failure.
                $webtask->output("No response from the aggregate, " .
                                 "please try again later.");
            }
            print STDERR "\n";
            $failed = 1;
            last;
        }

        # Tolerate a run of RPC errors before giving up.
        if ($response->code() == GENIRESPONSE_RPCERROR) {
            if ($rpcfail > 10) {
                if ($response->output() =~ /read timeout/) {
                    $webtask->output("Lost contact with the aggregate. " .
                                     "Possibly a network failure, " .
                                     "please try again later.");
                }
                else {
                    $webtask->output($response->output());
                }
                $failed = 1;
                last;
            }
            $rpcfail++;
            next;
        }
        $rpcfail = 0;

        next
            if ($response->code() == GENIRESPONSE_BUSY ||
                $response->code() == GENIRESPONSE_SERVER_UNAVAILABLE);

        $repblob = $response->value();

        #
        # Convert to something smaller, with info the web interface
        # cares about. We get this on each loop, update so the web
        # interface can show changes.
        #
        my $statusblob = $aggobj->UpdateWebStatus($repblob->{'details'});

        foreach my $sliver_urn (keys(%{ $repblob->{'details'} })) {
            my $details = $repblob->{'details'}->{$sliver_urn};
            my $node_id = $details->{'client_id'};

            #
            # Look at the last blob. If we changed, view that as progress.
            #
            # The idea is that as long as progress is being made we keep
            # waiting, by resetting the waittime if it gets below a
            # threshold. That way, if we go too long with nothing happening,
            # we will stop. The user can always use the Refresh button on
            # the status page.
            #
            next
                if (!defined($laststatus));

            if (!exists($laststatus->{$node_id})) {
                $seconds = 600
                    if ($seconds < 600);
            }
            elsif (exists($details->{"rawstate"}) &&
                   $laststatus->{$node_id}->{"rawstate"} ne
                   $details->{"rawstate"}) {
                # This is IG specific.
                $seconds = 300
                    if ($seconds < 300);
            }
            elsif ($laststatus->{$node_id}->{"status"} ne
                   $details->{"status"}) {
                $seconds = 450
                    if ($seconds < 450);
            }
        }
        $laststatus = $statusblob;

        if (exists($repblob->{'public_url'})) {
            $public_url = $repblob->{'public_url'};
            $aggobj->SetPublicURL($public_url);
        }

        if ($repblob->{'status'} eq "ready") {
            $ready = 1;
            last;
        }
        elsif ($repblob->{'status'} eq "failed") {
            $failed = 1;
            print STDERR "*** $urn failed\n";
            $webtask->output("Experiment setup on $urn failed");
            last;
        }
        elsif ($instance->IsCanceled()) {
            last;
        }
    }

    if ($instance->IsCanceled()) {
        $webtask->Exited(0);
        return 0;
    }
    if ($failed || !$ready) {
        $aggobj->SetStatus("failed");
        if ($failed) {
            # The failure path already recorded its own message; do not
            # overwrite it with the timeout text.
            $webtask->Exited(1);
        }
        else {
            # XXX Need better handling for timeout.
            print STDERR "*** $urn timed out.\n";
            $webtask->output("Experiment setup on $urn timed out");
            $webtask->Exited(GENIRESPONSE_TIMEDOUT);
        }
        return $webtask->exitcode();
    }
    $aggobj->SetStatus("ready");
    $webtask->Exited(0);
    return 0;
}
#
# Okay, fire off the waits for each aggregate
#
my
@return_codes
=
();
if
(
ParRun
({"
maxwaittime
"
=>
99999
,
"
maxchildren
"
=>
scalar
(
@aggregate_list
)},
\
@return_codes
,
\
&WaitForSliver
,
@aggregate_list
))
{
\
@return_codes
,
\
&
APT_Instance::Aggregate::
WaitForSliver
,
@aggregate_list
))
{
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
...
...
apt/manage_instance.in
View file @
bd129835
...
...
@@ -41,8 +41,9 @@ sub usage()
print
("
Usage: manage_instance extend instance [-f] seconds
\n
");
print
("
Usage: manage_instance terminate instance
\n
");
print
("
Usage: manage_instance refresh instance
\n
");
print
("
Usage: manage_instance reboot instance node_id [node_id ...]
\n
");
print
("
Usage: manage_instance reload instance node_id [node_id ...]
\n
");
print
("
Usage: manage_instance reboot instance node_id ...
\n
");
print
("
Usage: manage_instance reload instance node_id ...
\n
");
print
("
Usage: manage_instance deletenodes instance node_id ...
\n
");
print
("
Usage: manage_instance monitor instance
\n
");
print
("
Usage: manage_instance lockdown instance set|clear user|admin
\n
");
print
("
Usage: manage_instance panic instance set|clear
\n
");
...
...
@@ -116,9 +117,10 @@ sub DoPanic();
sub
DoManifests
();
sub
DoLinktest
();
sub
DoUpdateKeys
();
sub
DoDeleteNodes
();
sub
WriteCredentials
();
sub
StartMonitor
();
sub
StartMonitorInternal
(
;
$
);
sub
StartMonitorInternal
(
;
$
@
);
sub
DoImageTrackerStuff
($$$$$$);
#
...
...
@@ -203,6 +205,9 @@ elsif ($action eq "writecreds") {
elsif
(
$action
eq
"
getmanifests
")
{
DoManifests
()
}
elsif
(
$action
eq
"
deletenodes
")
{
DoDeleteNodes
()
}
else
{
usage
();
}
...
...
@@ -1625,6 +1630,216 @@ sub DoManifests()
exit(1);
}
#
# Delete nodes.
#
sub DoDeleteNodes()
{
my $logname;
my $errmsg;
my $errcode = 1;
usage()
if (!@ARGV);
my $slice = $instance->GetGeniSlice();
if (!defined($slice)) {
fatal("No slice for instance");
}
my @aggregates = ();
my %node_ids = ();
my %aggmap = ();
foreach my $obj ($instance->AggregateList()) {
my $manifest = GeniXML::Parse($obj->manifest());
if (! defined($manifest)) {
fatal("Could not parse manifest");
}
my @nodes = GeniXML::FindNodes("n:node", $manifest)->get_nodelist();
foreach my $node (@nodes) {
my $client_id = GeniXML::GetVirtualId($node);
if (grep {$_ eq $client_id} @ARGV) {
my $sliver_urn = GeniXML::GetSliverId($node);
my $manager_urn = GetManagerId($node);
# No sliver urn or a different aggregate.
next
if (! (defined($sliver_urn) &&
defined($manager_urn) &&
$manager_urn eq $obj->aggregate_urn()));
if (!exists($aggmap{$obj->aggregate_urn()})) {
$aggmap{$obj->aggregate_urn()} = [];
push(@aggregates, $obj);
}
push(@{ $aggmap{$obj->aggregate_urn()} }, $client_id);
$node_ids{$sliver_urn} = $client_id;
}
}
}
#
# Lock the slice in case it is doing something else, like taking
# a disk image.
#
if ($slice->Lock()) {
$errmsg = "Experiment is busy, cannot lock it. Please try again later";
goto bad;
}
#
# Create the webtask object, but AFTER locking the slice so we do
# not destroy one in use.
#
if (defined($webtask_id)) {
$webtask = WebTask->LookupOrCreate($instance->uuid(), $webtask_id);
# Convenient.
$webtask->AutoStore(1);
}
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my @nodes = @{ $aggmap{$sliver->aggregate_urn()} };
my $errcode = -1;
my $errmsg;
$sliver->SetStatus("provisioning");
my $response = $sliver->DeleteNodes(\$errmsg, @nodes);
if (!defined($response)) {
$errmsg = "RPC Error calling DeleteNode";
goto bad;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
if ($response->code() == GENIRESPONSE_SEARCHFAILED) {
print STDERR "Slice is gone on $sliver";
goto bad;
}
if ($response->code() == GENIRESPONSE_BUSY) {
$errmsg = "Experiment is busy; try again later";
goto bad;
}
$errmsg = $response->output();
$errcode = $response->code();
goto bad;
}
# We get back a new manifest.
my $manifest = $response->value();
$sliver->SetManifest($manifest);
# Delete the nodes from the status blob.
if ($webtask->sliverstatus()) {
my $blob = $webtask->sliverstatus();
foreach my $node_id (@nodes) {
delete($blob->{$node_id});
}
$webtask->sliverstatus($blob);
}
$sliver->SetStatus("provisioned");
return 0;
bad:
$sliver->SetStatus("ready");
$webtask->output($errmsg);
$webtask->Exited($errcode);
print STDERR "Returning $errcode from coderef\n";
return $errcode;
};
#
# Set the status back to provisioning for the web interface.
#
$instance->SetStatus("provisioning");
my @return_codes = ();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregates)},
\@return_codes, $coderef, @aggregates)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error calling DeleteNodes";
goto bad;
}
#
# Check the exit codes.
#
foreach my $aggobj (@aggregates) {
my $code = shift(@return_codes);
# Updated in a forked child, must refresh.
$aggobj->Refresh();
if ($code) {
if ($aggobj->webtask()->output()) {
$errmsg = $aggobj->webtask()->output();
}
else {
$errmsg = "Some nodes could not be deleted";
}
$errcode = $aggobj->webtask()->exitcode();
goto bad;
}
}
#
# Let the web interface continue, we poll now.
#
if (!$debug) {
$logname = TBMakeLogname("deletenode");
if (TBBackGround($logname)) {
exit(0);
}
}
$instance->SetStatus("provisioned");
$instance->ComputeNodeCounts();
@return_codes = ();
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@aggregates)}, \@return_codes,
\&APT_Instance::Aggregate::WaitForSliver, @aggregates)) {
#
# The parent caught a signal. Leave things intact so that we can
# kill things cleanly later.
#
$errmsg = "Internal error waiting for slivers";
goto bad;