Commit ec076c05 authored by Leigh Stoller's avatar Leigh Stoller

Changes to idle handling in ProtoGeni slivers

When a new sliver is created, it is given a relatively short
lifetime. This is the value of protogeni/initial_sliver_lifetime and
defaults to six hours.

A user may renew a sliver for up to the number of days in
protogeni/max_sliver_lifetime (defaults to 90 days), except in Utah
where it is 5 days (Emulab, Utah Rack, Utah DDC Rack).

The CM daemon idle code looks for idle slivers. An idle sliver is one
in which 50% of the physical nodes have been idle for three hours
(protogeni/idlecheck_threshold). At this point an email message is
sent to the sliver creator.

If the sitevar protogeni/idlecheck_norenew is set, then the email
threatens to mark the sliver as unrenewable if it stays idle. Then, at
2 * protogeni/idlecheck_threshold, if the sliver is still idle, the
sliver is marked as unrenewable. No matter what the user does at this
point, he will not be able to renew the sliver and it will expire out
normally.

If protogeni/idlecheck_norenew is not set, behaviour remains as it is
now; a followup message is sent every 24 hours.

There is a new backend script called "setexpiration" that allows an
aggregate administrator to override the settings on a per-slice basis
so that users who have a need for a long running sliver do not have to
continually renew and/or bypass the max_sliver_lifetime setting. For
example:

boss> wap setexpiration -e YYYY-MM-DD mysliceurn

will extend the termination date to the given date. To restore the
default behaviour:

boss> wap setexpiration -E mysliceurn

Note that idle checks are still made. To turn off idle checks for
a slice:

boss> wap setexpiration -i mysliceurn

To turn them back on:

boss> wap setexpiration -I mysliceurn
parent 7c4ca686
......@@ -613,57 +613,11 @@ sub GetTicketAuxAux($$$$$$$$$$)
if (GeniResponse::IsResponse($expires)) {
return $expires;
}
# Convert to a localtime.
my $when = str2time($expires);
if (!defined($when)) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"Could not parse valid_until");
}
#
# Do we need a policy limit?
# A sitevar controls the sliver lifetime.
#
my $max_sliver_lifetime = 0;
if (!GetSiteVar('protogeni/max_sliver_lifetime',
\$max_sliver_lifetime)){
# Cannot get the value, default it to 90 days.
$max_sliver_lifetime = 90;
}
# Check if the user has a credential that lets him obtain slivers
# with extended sliver lifetime. If so allow him to get sliver.
foreach my $credential (@$credentials) {
my $nodes = GeniXML::FindNodesNS("//n:max_sliver_lifetime",
$credential->extensions(),
$GeniUtil::EXTENSIONS_NS);
if ($nodes->size > 0) {
$max_sliver_lifetime = int($nodes->pop()->string_value);
last;
}
}
my $diff = $when - time();
if ($diff < (60 * 5)) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"such a short life for a sliver? ".
"More time please.");
}
elsif ($diff > (3600 * 24 * $max_sliver_lifetime)) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"expiration is greater then the maximum number ".
"of minutes " . (60 * 24 * $max_sliver_lifetime));
}
#
# Must be before the slice expires.
#
my $slice_expires = $slice->expires();
if (defined($slice_expires)) {
$slice_expires = str2time($slice_expires);
if ($when > $slice_expires) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"valid_until is past slice expiration");
}
# Note "checkonly" flag; we do not actually change the slice
# until the ticket is redeemed.
my $tmp = SetSliceExpiration($slice, $expires, 1, @{ $credentials });
if (GeniResponse::IsResponse($tmp)) {
return $tmp;
}
}
......@@ -2905,42 +2859,9 @@ sub SliverWorkAux($$$$$$$$)
$message = "Illegal valid_until in rspec";
goto bad;
}
# Convert to a localtime.
my $when = timegm(strptime($expires));
if (!defined($when)) {
$message = "Could not parse valid_until";
goto bad;
}
#
# Do we need a policy limit? No, since we actually made this
# check when we granted the ticket (see above).
#
my $diff = $when - time();
if ($diff < (60 * 5) || $diff > (3600 * 24 * 100)) {
$message = "expiration is out of range";
goto bad;
}
#
# Must be before the slice expires.
#
my $slice_expires = $slice->expires();
if (defined($slice_expires)) {
$slice_expires = str2time($slice_expires);
if ($when > $slice_expires) {
$message = "valid_until is past slice expiration";
goto bad;
}
}
#
# Seems odd, eh? This changes the slice expiration in the DB,
# which was originally the time in the slice credential. The slice
# cannot be extended beyond this point, except by going through
# the RenewSliver() call below.
#
if ($slice->SetExpiration($when) != 0) {
$message = "Could not set expiration time";
my $tmp = SetSliceExpiration($slice, $expires, 0, $credential);
if (GeniResponse::IsResponse($tmp)) {
$message = GeniResponse::output($tmp);
goto bad;
}
}
......@@ -4036,89 +3957,10 @@ sub RenewSliverAux($$)
$message = "Slice has been shutdown";
goto bad;
}
#
# Figure out new expiration time.
#
my $slice_expires = $credential->expires();
if (!defined($slice_expires)) {
$message = "No expiration time in credential";
goto bad;
}
# Convert slice expiration to a time.
my $slice_when = str2time($slice_expires);
if (!defined($slice_when)) {
$message = "Could not parse expiration in credential";
goto bad;
}
#
# A sitevar controls the sliver lifetime.
#
my $max_sliver_lifetime = 0;
if (!GetSiteVar('protogeni/max_sliver_lifetime',
\$max_sliver_lifetime)){
# Cannot get the value, default it to 90 days.
$max_sliver_lifetime = 90;
}
#
# If no time is specified, then the user says they want to use
# the time in the slice credential, but must still check that
# against the local policy.
#
if (defined($expires)) {
# Convert to a localtime.
$when = timegm(strptime($expires));
if (!defined($when)) {
$message = "Could not parse expiration";
goto bad;
}
# Check if the user has a credential that lets him obtain slivers
# with extended sliver lifetime. If so allow him to get sliver.
foreach my $credential (@$credentials) {
my $nodes = GeniXML::FindNodesNS("//n:max_sliver_lifetime",
$credential->extensions(),
$GeniUtil::EXTENSIONS_NS);
if ($nodes->size > 0) {
$max_sliver_lifetime = int($nodes->pop()->string_value);
last;
}
}
my $diff = $when - time();
if ($diff < (60 * 5)) {
$message = "such a short life for a sliver? More time please.";
goto bad;
}
if ($diff > (3600 * 24 * $max_sliver_lifetime)) {
$message = "expiration is greater then the maximum number ".
"(" . (60 * 24 * $max_sliver_lifetime) . ") of minutes";
goto bad;
}
if ($when > $slice_when) {
$message = "Expiration is greater then slice expiration";
goto bad;
}
}
else {
if (($slice_when - time()) > (3600 * 24 * $max_sliver_lifetime)) {
$message = "slice expiration is greater then the maximum number ".
"(" . (60 * 24 * $max_sliver_lifetime) . ") of minutes";
goto bad;
}
$when = $slice_when;
}
if ($when < time()) {
$message = "Expiration is in the past";
goto bad;
}
if ($slice->SetExpiration($when) != 0) {
$message = "Could not set expiration time";
goto bad;
my $response = SetSliceExpiration($slice, $expires, 0, @{ $credentials });
if (GeniResponse::IsError($response)) {
$slice->UnLock();
return $response;
}
#
# Need to delete any cached credentials.
......@@ -5181,6 +5023,11 @@ sub CreateSliceFromCertificate($$)
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"Duplicate slice URN already exists here");
}
# Sanity check.
if (! defined($credential->expires())) {
return GeniResponse->Create(GENIRESPONSE_BADARGS, undef,
"Slice credential does not have an expiration");
}
$slice = GeniSlice->Create($certificate, $user, $authority);
return GeniResponse->Create(GENIRESPONSE_ERROR, undef,
"Cannot create slice object")
......@@ -5195,17 +5042,23 @@ sub CreateSliceFromCertificate($$)
# Cannot get the value, default it to 90 days.
$max_sliver_lifetime = 90;
}
my $initial_sliver_lifetime = 0;
if (!GetSiteVar('protogeni/initial_sliver_lifetime',
\$initial_sliver_lifetime)) {
# Cannot get the value, default it to 6 hours.
$initial_sliver_lifetime = 6;
}
elsif ($initial_sliver_lifetime == 0) {
$initial_sliver_lifetime = $max_sliver_lifetime * 24;
}
my $expires = $credential->expires();
# This is already a localtime.
my $when = timelocal(strptime($expires));
# Reverts to default in Create if this fails
if (defined($when)) {
my $diff = $when - time();
my $diff = $when - time();
if ($diff > (3600 * 24 * $max_sliver_lifetime)) {
# Shorten to policy maximum. Okay to use a unix time.
$expires = time() + (3600 * 24 * $max_sliver_lifetime);
}
if ($diff > (3600 * 24 * $max_sliver_lifetime)) {
# Shorten to policy maximum. Okay to use a unix time.
$expires = time() + (3600 * $max_sliver_lifetime);
}
$slice->SetExpiration($expires);
$slice->SetPublicID();
......@@ -6475,5 +6328,120 @@ sub KillMonitor($)
return 0;
}
#
# Helper function to validate and, unless $checkonly is set, apply a
# new expiration time to a slice.
#
# Arguments:
#   $slice       - slice object. This routine calls renew_limit(),
#                  renew_limit_stamp(), expiration_max(),
#                  expiration_max_stamp() and SetExpiration() on it.
#   $expiration  - requested expiration as a date string, or undef to
#                  request the expiration found in the slice credential.
#   $checkonly   - when true, run all of the checks but do not change
#                  the slice.
#   @credentials - the first one is assumed to be the slice credential;
#                  the rest are searched for a max_sliver_lifetime
#                  extension that overrides the sitevar.
#
# Returns 0 on success, otherwise a GeniResponse (GENIRESPONSE_ERROR)
# whose output is a message describing why the request was refused.
#
sub SetSliceExpiration($$$@)
{
    my ($slice, $expiration, $checkonly, @credentials) = @_;
    my $message = "Error renewing sliver";
    my $when;

    #
    # Ick, assume first credential is the slice credential. Bad.
    #
    my $slice_credential = shift(@credentials);
    if (!defined($slice_credential)) {
        # Without this we would die on the method call below.
        $message = "No slice credential provided";
        goto bad;
    }

    #
    # Maximum expiration is what the credential says, but we might
    # not allow that long.
    #
    my $slice_expires = $slice_credential->expires();
    if (!defined($slice_expires)) {
        $message = "No expiration time in credential";
        goto bad;
    }
    # Convert slice expiration to a time.
    my $slice_when = str2time($slice_expires);
    if (!defined($slice_when)) {
        $message = "Could not parse expiration in credential";
        goto bad;
    }

    #
    # A sitevar controls the sliver lifetime (in days).
    #
    my $max_sliver_lifetime = 0;
    if (!GetSiteVar('protogeni/max_sliver_lifetime',
                    \$max_sliver_lifetime)) {
        # Cannot get the value, default it to 90 days.
        $max_sliver_lifetime = 90;
    }

    #
    # Check if the user has a credential that lets him obtain slivers
    # with extended sliver lifetime. If so allow request, but note
    # that this will not allow an override of the DB settings.
    #
    foreach my $credential (@credentials) {
        my $nodes = GeniXML::FindNodesNS("//n:max_sliver_lifetime",
                                         $credential->extensions(),
                                         $GeniUtil::EXTENSIONS_NS);
        if ($nodes->size > 0) {
            $max_sliver_lifetime = int($nodes->pop()->string_value);
            last;
        }
    }

    if (defined($expiration)) {
        #
        # Convert to a unix timestamp; the parsed fields are handed to
        # timegm(), i.e. treated as GMT. timegm() croaks on missing or
        # out of range fields, so trap that and report a parse error
        # instead of dying.
        #
        my @fields = strptime($expiration);
        if (@fields) {
            local $@;
            $when = eval { timegm(@fields) };
        }
        if (!defined($when)) {
            $message = "Could not parse expiration";
            goto bad;
        }
    }
    else {
        # No time requested; use the credential expiration. This must be
        # the parsed timestamp, not the raw date string, since $when is
        # compared numerically below.
        $when = $slice_when;
    }

    my $now  = time();
    my $diff = $when - $now;

    # Check for the past first; otherwise the "short life" test below
    # swallows this case and the user gets a misleading message.
    if ($when < $now) {
        $message = "expiration is in the past; no time travel allowed";
        goto bad;
    }
    # Require at least five minutes of lifetime.
    if ($diff < (60 * 5)) {
        $message = "such a short life for a sliver?";
        goto bad;
    }
    if ($when > $slice_when) {
        $message = "expiration is greater then slice expiration";
        goto bad;
    }

    #
    # Per-slice renew limit. A limit of zero means the slice may not be
    # renewed at all (the idle checker sets it to zero for slices that
    # stayed idle).
    # NOTE(review): renew_limit_stamp() is presumably the limit in
    # seconds, since it is compared against $diff -- confirm.
    #
    if (defined($slice->renew_limit()) &&
        $diff > $slice->renew_limit_stamp()) {
        if ($slice->renew_limit_stamp() == 0) {
            $message = "you are not allowed to renew cause it ".
                "was idle for too long";
        }
        else {
            $message = "expiration increment is greater then local slice ".
                "setting: " . $slice->renew_limit();
        }
        goto bad;
    }

    #
    # A per-slice maximum expiration, when set, replaces the site wide
    # max_sliver_lifetime policy.
    #
    if (defined($slice->expiration_max())) {
        if ($when > $slice->expiration_max_stamp()) {
            $message = "expiration is greater then local slice setting: ".
                POSIX::strftime("%Y-%m-%dT%H:%M:%SZ",
                                gmtime(str2time($slice->expiration_max())));
            goto bad;
        }
    }
    elsif ($diff > (3600 * 24 * $max_sliver_lifetime)) {
        $message = "expiration increment is greater then the maximum number ".
            "(" . (60 * 24 * $max_sliver_lifetime) . ") of minutes";
        goto bad;
    }

    if (!$checkonly && $slice->SetExpiration($when) != 0) {
        $message = "could not set expiration time";
        goto bad;
    }
    return 0;

  bad:
    return GeniResponse->Create(GENIRESPONSE_ERROR, undef, $message);
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -873,19 +873,19 @@ sub PushStats()
#
sub CheckIdle()
{
my $mailinterval;
my $threshold;
my $idlecheck_terminate;
my $mailinterval = 1;
my $idlecheck_threshold = 1;
my $idlecheck_norenew = 0;
my $debug = 1;
if (! (GetSiteVar("idle/mailinterval", \$mailinterval) &&
GetSiteVar("idle/threshold", \$threshold) &&
GetSiteVar("protogeni/idlecheck_terminate",\$idlecheck_terminate))) {
GetSiteVar("protogeni/idlecheck_threshold", \$idlecheck_threshold) &&
GetSiteVar("protogeni/idlecheck_norenew",\$idlecheck_norenew))) {
print STDERR "Could not get idle parameters.\n";
return;
}
# Threshold in hours, convert to minutes.
$threshold = $threshold * 60;
$idlecheck_threshold = $idlecheck_threshold * 60;
my $query_result =
GeniDB::DBQueryWarn("select idx,UNIX_TIMESTAMP(created) ".
......@@ -908,7 +908,8 @@ sub CheckIdle()
# not actually exist yet.
my $aggregate = GeniAggregate->SliceAggregate($slice);
if (!defined($aggregate)) {
print STDERR "No aggregate created yet for $slice\n";
print STDERR "No aggregate created yet for $slice\n"
if ($debug);
$slice->UnLock();
goto skip;
}
......@@ -940,16 +941,7 @@ sub CheckIdle()
" round((unix_timestamp(now()) - ".
" unix_timestamp(last_swap_req))/3600,2) ".
" as lastreq, ".
" count(r.node_id) as nodes, ".
" round((unix_timestamp(now()) - ".
" unix_timestamp(max(greatest(last_tty_act, ".
" last_net_act, last_cpu_act, ".
" last_ext_act))))/60,1) as idle_time, ".
" max(greatest(last_tty_act, last_net_act, ".
" last_cpu_act, ".
" last_ext_act)) as lastact, ".
" (unix_timestamp(now()) - ".
" unix_timestamp(min(last_report))) as staleness ".
" count(r.node_id) as nodes ".
"from node_activity as na ".
"left join reserved as r on na.node_id=r.node_id ".
"left join nodes as n on na.node_id=n.node_id ".
......@@ -972,14 +964,48 @@ sub CheckIdle()
# lastreq in hours
my $lastreq = $row->{'lastreq'};
my $nodes = $row->{'nodes'};
# idletime in in minutes
my $idletime = $row->{'idle_time'};
my $lastact = $row->{'lastact'};
my $staleness = $row->{'staleness'};
my $stale = 0;
my $urn = $slice->urn();
#
# staleness. If undef, none of the nodes have reported any idle data
# Now pull in per-node data. We want to compute a percentage
# of idle nodes so that we can be a little smarter.
#
my $node_result =
emdb::DBQueryWarn("select r.node_id, ".
" greatest(last_tty_act,last_net_act,last_cpu_act,".
" last_ext_act) as lastact,".
" round((unix_timestamp(now()) - ".
" unix_timestamp(greatest(last_tty_act, ".
" last_net_act, last_cpu_act, ".
" last_ext_act)))/60,1) as idle_time, ".
" (unix_timestamp(now()) - ".
" unix_timestamp(last_report)) as staleness ".
"from reserved as r ".
"left join node_activity as na on ".
" na.node_id=r.node_id ".
"left join nodes as n on n.node_id=r.node_id ".
"left join node_types as nt on nt.type=n.type ".
"left join experiments as e on r.exptidx=e.idx ".
"where r.exptidx=$exptidx and nt.isvirtnode=0");
if (!defined($node_result)) {
print STDERR "** Could not get node idle data for $experiment\n";
$slice->UnLock();
goto skip;
}
# Skip if no physical nodes; eventually come back to this.
if (!$node_result->numrows) {
print STDERR "-> No physical nodes; skipping\n";
$slice->UnLock();
goto skip;
}
my %node_idledata = ();
my $stalenodes = 0;
my $idlenodes = 0;
my $reporting = 0;
#
# staleness. If undef, the node has not reported any idle data
# since inception, lets skip since we do not know anything. Note
# that idletime will also be null. If the staleness is really big,
# all of the nodes have stopped reporting, and that means they are
......@@ -988,65 +1014,132 @@ sub CheckIdle()
# staleness is small, we trust the data idle data to be accurate.
# If the staleness is medium, then wait longer.
#
if (defined($staleness)) {
# Stale if more then 10 minutes but less then six hours.
$stale = 1
if ($staleness >= 600 && $staleness < 6 * 3600);
}
while (my $noderow = $node_result->fetchrow_hashref()) {
my $node_id = $noderow->{'node_id'};
my $lastact = $noderow->{'lastact'};
# idletime in in minutes
my $idletime = $noderow->{'idle_time'};
# staleness in in seconds
my $staleness = $noderow->{'staleness'};
$node_idledata{$node_id} = $noderow;
#
# staleness; if null, the node is not reporting any idle
# data since inception; lets skip it since we do not know
# anything. We view these nodes at not idle for now. Needs
# more thought.
#
next
if (!defined($staleness));
$reporting++;
#
# If the staleness is big the node has stopped reporting,
# and that means it is offline or otherwise munged to stop
# reporting. I am going to treat those as idle. The user can
# argue with us about it.
#
if ($staleness > ($idlecheck_threshold * 60)) {
$idlenodes++;
$noderow->{'idle'} = 1;
next;
}
#
# Otherwise, if the staleness is small, we trust the
# idle data to be accurate. If the staleness is medium, then
# wait longer before using this node.
#
if ($staleness > (15 * 60)) {
$stalenodes++;
$noderow->{'stale'} = 1;
next;
}
#
# Finally, look at the idle time.
#
if ($idletime > $idlecheck_threshold) {
$idlenodes++;
$noderow->{'idle'} = 1;
next;
}
}
#
# if no idle data, nothing do do.
# If no idle data, nothing do do.
#
if (!defined($idletime)) {
print STDERR "No idle data for $slice\n"
if (! $reporting) {
print STDERR "-> No idle data for $slice. Skipping ...\n"
if ($debug);
$slice->UnLock();
goto skip;
}
my $idletime_hours = sprintf("%.2f", $idletime / 60.0);
print STDERR "-> $urn is using $nodes physical nodes:\n";
print STDERR " reporting:$reporting, idle:$idlenodes, ".
"stale:$stalenodes\n";
print STDERR " But idle ignore is set.\n"
if ($ignore);
# We do not know (from the query) that idletime>threshold. So
# check that we're either forcing, or that it is idle, and
# then check the swap requests and time of last request, to
# make sure we can send a message.
if ($idletime > $timeout || $idletime > $threshold) {
print STDERR "$slice is using $nodes physical nodes ".
"and has been idle for $idletime_hours hours\n";
if ($ignore) {
print STDERR "*** but idle ignore is set.\n";
}
}
#
# We want to report as much idle data as possible, but
# if the slice has not passed its idletime setting, or
# if idleswap is off, we skip.
#
if ($idletime > $timeout && $idleswap) {
if ($idleswap && ($idlenodes / ($reporting * 1.0) >= 0.50)) {
#
# Send email if none sent or if the minimum time since
# since the last message has passed. In general, we will not
# send more then one message, but for testing lets not
# annoy people by actually terminating slices. Just annoy
# them with email, but only once a day.
# send more then one message in a 24 hour period to avoid
# annoying people to much.
#
if ($swapreqs == 0 || $lastreq > 24) {
if ($idlecheck_norenew &&
$swapreqs && ($lastreq * 60) > (2 * $idlecheck_threshold)) {
#
# Current policy is no second warnings; the
# slice may not be renewed, even if it goes non idle
# again. Setting the renew limit to zero tells the CM
# to not allow any renew.
#
print STDERR "** Slice can no longer be renewed!\n";
$slice->SetRenewLimit(0)
if (!$impotent);
}
elsif ($swapreqs == 0 || $lastreq > 24 || $debug) {
my $geniuser = GeniUser->Lookup($slice->creator_uuid(), 1);
my $emailaddr = $geniuser->email();
my $sliver_urn= $aggregate->urn();
my $urn = $slice->urn();
my $stats = "";
foreach my $node (keys(%node_idledata)) {
if (exists($node_idledata{$node}->{'idle'})) {
$stats .=
sprintf("%40s - %.2f hours idle\n",
"${node}.${OURDOMAIN}",
$node_idledata{$node}->{'idle_time'}/60.0);
}
}
if ($impotent) {
print STDERR " Would send email to $geniuser\n";
print STDERR $stats if ($stats ne "");
}
else {
print STDERR " Sending mail to $geniuser about ".
"idle sliver $aggregate\n";
print STDERR
"Sending mail to $geniuser about idle sliver $aggregate\n";
if (!$impotent) {
SENDMAIL($emailaddr, "Sliver $sliver_urn is idle",
SENDMAIL($emailaddr, "Sliver $sliver_urn is mostly idle",
"Sliver $sliver_urn\n".
"from slice $urn,\n".
"has been idle for approximately ".
"$idletime_hours hour(s), ".
"and is using $nodes physical nodes.\n".
"This sliver will be terminated if it stays idle.\n",
$TBAUTOMAIL, "CC: $TBAUTOMAIL");
"has been mostly idle for an unusual length of time,\n".
"and is using $nodes physical nodes.\n\n".
"$stats\n".
($idlecheck_norenew ?
"You will not be allowed to renew this sliver " :
"This sliver might be terminated ") .
"if it stays idle.\n",
$TBAUTOMAIL, "CC: $TBAUTOMAIL");
emdb::DBQueryWarn("update experiments set ".
" swap_requests=swap_requests+1, ".
......@@ -1054,25 +1147,6 @@ sub CheckIdle()
"where idx='$exptidx'");
}
}
elsif ($idletime > (2 * $timeout) && $idlecheck_terminate) {
print STDERR "Expiring idle $slice $experiment.\n";
if (!$impotent) {
my $slice_uuid = $slice->uuid();
system("$CLEANUPSLICE -f -m $slice_uuid");
if ($?) {
print STDERR "Could not release idle $slice\n";
SENDMAIL($TBOPS, "Could not release idle slice $slice",
"Could not release idle slice $slice");
# Leave it locked.
goto skip;
}
}
}
else {
print STDERR "Waiting to see if idle slice $slice ".
"does something useful.\n";
}
}
else {
# Reset warnings.
......@@ -1131,7 +1205,7 @@ while (1) {
#
# Run the idle check periodically.
#
if ($idlecounter >= (15 * 60)) {
if ($idlecounter >= (10 * 60)) {
CheckIdle()
if ($idlecheck || $doidlechecks);
$idlecounter = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment