Commit cd6a4cb4 authored by Leigh B Stoller's avatar Leigh B Stoller

First cut at auto approving experiments that would otherwise require

admin approval. Informational for now, the results are ignored and
stored in the DB and shown on the admin extend page.
parent d9b96847
...@@ -72,6 +72,7 @@ my $TB = "@prefix@"; ...@@ -72,6 +72,7 @@ my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@"; my $TBOPS = "@TBOPSEMAIL@";
my $GENEXTENDCRED = "$TB/sbin/protogeni/genextendcred"; my $GENEXTENDCRED = "$TB/sbin/protogeni/genextendcred";
my $GENIUSER = "geniuser"; my $GENIUSER = "geniuser";
my $MAINSITE = @TBMAINSITE@;
# Cache of instances to avoid regenerating them. # Cache of instances to avoid regenerating them.
my %instances = (); my %instances = ();
...@@ -784,7 +785,8 @@ sub ComputeNodeCounts($) ...@@ -784,7 +785,8 @@ sub ComputeNodeCounts($)
$manager_urn ne $sliver->aggregate_urn()); $manager_urn ne $sliver->aggregate_urn());
if (defined($virtualization_type) && if (defined($virtualization_type) &&
$virtualization_type eq "emulab-xen") { ($virtualization_type eq "emulab-xen" ||
$virtualization_type eq "emulab-blockstore")) {
$vcount++; $vcount++;
next; next;
} }
...@@ -803,6 +805,66 @@ sub ComputeNodeCounts($) ...@@ -803,6 +805,66 @@ sub ComputeNodeCounts($)
return 0; return 0;
} }
#
# Build a per-aggregate map of the nodes named in each sliver manifest.
#
# Returns a hash reference keyed by aggregate URN. Each value is a hash
# reference keyed by node id, holding that node's "client_id" and an
# "isvirtnode" flag (1 for emulab-xen and emulab-blockstore nodes, 0
# otherwise). Returns undef, after printing to STDERR, when the instance
# has no slivers, a manifest cannot be parsed, or a valid HRN does not
# name a node.
#
sub GetNodeDetails($)
{
    my ($self) = @_;
    my $details = {};

    my @aggregates = $self->AggregateList();
    if (!@aggregates) {
        print STDERR "No slivers for $self\n";
        return undef;
    }

    foreach my $aggregate (@aggregates) {
        my $manifest = GeniXML::Parse($aggregate->manifest());
        if (!defined($manifest)) {
            print STDERR "Could not parse manifest for $aggregate\n";
            return undef;
        }
        my $urn = $aggregate->aggregate_urn();

        # Plain nodes first, then the vhost elements in the Emulab namespace.
        my @elements =
            (GeniXML::FindNodes("n:node", $manifest)->get_nodelist(),
             GeniXML::FindNodesNS("n:vhost", $manifest,
                                  $GeniXML::EMULAB_NS)->get_nodelist());

        my %nodemap = ();
        foreach my $element (@elements) {
            my $subtype = GeniXML::GetVirtualizationSubtype($element);
            my $manager = GetManagerId($element);

            # Combined rspec; only keep nodes managed by this aggregate.
            next
                unless (defined($manager) && $manager eq $urn);

            my $isvirt = 0;
            if (defined($subtype)) {
                $isvirt = 1
                    if ($subtype eq "emulab-xen" ||
                        $subtype eq "emulab-blockstore");
            }
            my $client_id = GetVirtualId($element);
            my $node_id   = GetVnodeId($element);

            # When the id is a valid HRN, replace it with the HRN's id part.
            if (GeniHRN::IsValid($node_id)) {
                my $hrn = GeniHRN->new($node_id);
                if (!$hrn->IsNode()) {
                    print STDERR "$node_id is not a node\n";
                    return undef;
                }
                $node_id = $hrn->id();
            }
            $nodemap{$node_id} = {
                "client_id"  => $client_id,
                "isvirtnode" => $isvirt,
            };
        }
        $details->{$urn} = \%nodemap;
    }
    return $details;
}
# #
# Add an aggregate to an instance. # Add an aggregate to an instance.
# #
......
...@@ -30,6 +30,7 @@ use CGI; ...@@ -30,6 +30,7 @@ use CGI;
use POSIX ":sys_wait_h"; use POSIX ":sys_wait_h";
use POSIX qw(setsid strftime ceil floor); use POSIX qw(setsid strftime ceil floor);
use Date::Parse; use Date::Parse;
use JSON;
# #
# Back-end script to manage APT profiles. # Back-end script to manage APT profiles.
...@@ -137,6 +138,8 @@ sub DoUtilization(); ...@@ -137,6 +138,8 @@ sub DoUtilization();
sub DoIdleData(); sub DoIdleData();
sub DoOpenstack(); sub DoOpenstack();
sub DoCheckReservation(); sub DoCheckReservation();
sub DoCheckAutoApprove();
sub CheckAutoApprove($$);
sub CheckReservationInternal($$$); sub CheckReservationInternal($$$);
sub DoMaxExtension(); sub DoMaxExtension();
sub WriteCredentials(); sub WriteCredentials();
...@@ -145,6 +148,7 @@ sub StartMonitorInternal(;$@); ...@@ -145,6 +148,7 @@ sub StartMonitorInternal(;$@);
sub DoImageTrackerStuff($$$$$$$); sub DoImageTrackerStuff($$$$$$$);
sub DenyExtensionInternal($); sub DenyExtensionInternal($);
sub ExtendInternal($$$$$); sub ExtendInternal($$$$$);
sub CallAggregateMethod($$@);
# #
# Parse command arguments. Once we return from getopts, all that should be # Parse command arguments. Once we return from getopts, all that should be
...@@ -269,6 +273,9 @@ elsif ($action eq "checkreservation") { ...@@ -269,6 +273,9 @@ elsif ($action eq "checkreservation") {
elsif ($action eq "maxextension") { elsif ($action eq "maxextension") {
DoMaxExtension() DoMaxExtension()
} }
elsif ($action eq "checkautoapprove") {
DoCheckAutoApprove()
}
else { else {
usage(); usage();
} }
...@@ -442,7 +449,7 @@ sub DoSnapshot() ...@@ -442,7 +449,7 @@ sub DoSnapshot()
if (defined($node_id)); if (defined($node_id));
} }
# #
# Instruct the remote cluster to copy the image back to its origin, # Instruct the remote cluster to copy the image back to its origin,
# but we need to ask the IMS for uuid of the image that is running, # but we need to ask the IMS for uuid of the image that is running,
# so we can tell the cluster, which then tells the origin cluster. # so we can tell the cluster, which then tells the origin cluster.
...@@ -1307,6 +1314,7 @@ sub DoExtend() ...@@ -1307,6 +1314,7 @@ sub DoExtend()
my $granted = 0; my $granted = 0;
my $needapproval = 0; my $needapproval = 0;
my $inhours = 0; my $inhours = 0;
my $autoapprove_info;
my $message; my $message;
my $reason; my $reason;
my $errmsg; my $errmsg;
...@@ -1480,6 +1488,15 @@ sub DoExtend() ...@@ -1480,6 +1488,15 @@ sub DoExtend()
$granted = 0; $granted = 0;
} }
if ($wanted > $granted) { if ($wanted > $granted) {
# Ask whether this extension could be approved without an admin.
# CheckAutoApprove() returns 0 on success and stores its verdict hash
# through the reference. Perl sub names are case sensitive, so the
# call must use the declared spelling.
if (CheckAutoApprove($wanted, \$autoapprove_info) == 0) {
    if ($autoapprove_info->{'approve'}) {
        # Informational for now; the verdict is recorded but the
        # grant path below is deliberately disabled.
        if (0) {
            $granted = $wanted;
            goto grant;
        }
    }
}
$needapproval = 1; $needapproval = 1;
$message = "because it was started more then ". $message = "because it was started more then ".
"$autoextend_maxage days ago"; "$autoextend_maxage days ago";
...@@ -1502,6 +1519,15 @@ sub DoExtend() ...@@ -1502,6 +1519,15 @@ sub DoExtend()
# management committee." # management committee."
# #
elsif ($wanted > $autoextend_maximum) { elsif ($wanted > $autoextend_maximum) {
if (CheckAutoApprove($wanted, \$autoapprove_info) == 0) {
if ($autoapprove_info->{'approve'}) {
# Informational for now.
if (0) {
$granted = $wanted;
goto grant;
}
}
}
$needapproval = 1; $needapproval = 1;
$message = "because it was for longer then $autoextend_maximum days"; $message = "because it was for longer then $autoextend_maximum days";
# #
...@@ -1539,6 +1565,7 @@ sub DoExtend() ...@@ -1539,6 +1565,7 @@ sub DoExtend()
# #
# Do the extension. # Do the extension.
# #
grant:
if ($granted) { if ($granted) {
my $seconds = $granted * 3600; my $seconds = $granted * 3600;
$seconds *= 24 if (!$inhours); $seconds *= 24 if (!$inhours);
...@@ -1591,6 +1618,16 @@ sub DoExtend() ...@@ -1591,6 +1618,16 @@ sub DoExtend()
$extensionargs->{"uid"} = $instance->creator(); $extensionargs->{"uid"} = $instance->creator();
$extensionargs->{"uid_idx"} = $instance->creator_idx(); $extensionargs->{"uid_idx"} = $instance->creator_idx();
} }
if (defined($autoapprove_info)) {
$extensionargs->{"autoapproved"} = $autoapprove_info->{'approve'};
$extensionargs->{"autoapproved_reason"} = $autoapprove_info->{'reason'};
if (defined($autoapprove_info->{'metrics'})) {
my $metrics = eval { encode_json($autoapprove_info->{'metrics'}); };
if (!$@) {
$extensionargs->{"autoapproved_metrics"} = $metrics;
}
}
}
my $extensioninfo = my $extensioninfo =
APT_Instance::ExtensionInfo->Create($instance, $extensionargs); APT_Instance::ExtensionInfo->Create($instance, $extensionargs);
if (!defined($extensioninfo)) { if (!defined($extensioninfo)) {
...@@ -3482,8 +3519,9 @@ sub DoUpdateKeys() ...@@ -3482,8 +3519,9 @@ sub DoUpdateKeys()
# #
sub DoUtilization() sub DoUtilization()
{ {
my $utilization;
my $errmsg; my $errmsg;
my $errcode = 1; my @aggregates = ();
# #
# Get the nodeid to client id mapping # Get the nodeid to client id mapping
...@@ -3504,72 +3542,31 @@ sub DoUtilization() ...@@ -3504,72 +3542,31 @@ sub DoUtilization()
$client_ids{$obj->aggregate_urn()}->{$node_id} = $client_id; $client_ids{$obj->aggregate_urn()}->{$node_id} = $client_id;
} }
} }
#
# And tell the backend clusters to do the update.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->Utilization();
if (!defined($response)) {
print STDERR "RPC Error calling utilization on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not get utilization for sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
$webtask->results($response->value());
return 0;
};
my @return_codes = ();
my @agglist = ();
# #
# Cull out any aggregates with no nodes. # Cull out any aggregates with no nodes.
# #
foreach my $agg ($instance->AggregateList()) { foreach my $aggregate ($instance->AggregateList()) {
push(@agglist, $agg) push(@aggregates, $aggregate)
if ($agg->physnode_count() || $agg->virtnode_count()); if ($aggregate->physnode_count() || $aggregate->virtnode_count());
} }
my $errcode = CallAggregateMethod("Utilization",\$utilization,@aggregates);
if (ParRun({"maxwaittime" => 99999, if ($errcode) {
"maxchildren" => scalar(@agglist)}, $errmsg = $utilization;
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling Utilization()";
goto bad; goto bad;
} }
# foreach my $aggregate (@aggregates) {
# Check the exit codes. my $blob = shift(@{$utilization});
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not get utilization from some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->exitcode();
}
goto bad;
}
# #
# Annotate the result with some extra info for the web UI. # Annotate the result with some extra info for the web UI.
# #
my $blob = $agg->webtask()->results();
foreach my $node_id (keys(%{ $blob->{'details'}->{'nodes'} })) { foreach my $node_id (keys(%{ $blob->{'details'}->{'nodes'} })) {
$blob->{'details'}->{'nodes'}->{$node_id}->{"client_id"} = $blob->{'details'}->{'nodes'}->{$node_id}->{"client_id"} =
$client_ids{$agg->aggregate_urn()}->{$node_id}; $client_ids{$aggregate->aggregate_urn()}->{$node_id};
} }
if ($debug) { if ($debug) {
print Dumper($agg->webtask()->results()); print Dumper($blob);
} }
$agg->webtask()->results($blob); $aggregate->webtask()->results($blob);
$agg->webtask()->Store();
} }
exit(0); exit(0);
bad: bad:
...@@ -3669,76 +3666,316 @@ sub DoOpenstack() ...@@ -3669,76 +3666,316 @@ sub DoOpenstack()
# #
sub DoIdleData() sub DoIdleData()
{ {
my $idledata;
my $errmsg; my $errmsg;
my $errcode = 1; my @aggregates = ();
# #
# And ask the backend clusters for the data. # Cull out any aggregates with no nodes.
# #
my $coderef = sub { foreach my $aggregate ($instance->AggregateList()) {
my ($sliver) = @_; push(@aggregates, $aggregate)
my $webtask = $sliver->webtask(); if ($aggregate->physnode_count() || $aggregate->virtnode_count());
}
my $response = $sliver->IdleData(); my $errcode = CallAggregateMethod("IdleData", \$idledata, @aggregates);
if (!defined($response)) { if ($errcode) {
print STDERR "RPC Error calling idledata on $sliver\n"; $errmsg = $idledata;
return -1; goto bad;
} }
if ($response->code() != GENIRESPONSE_SUCCESS) { # We get back json strings, we pass that through to the web interface.
print STDERR "Could not get idledata for sliver: ". foreach my $aggregate (@aggregates) {
$response->output() . "\n"; my $json = shift(@{$idledata});
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
if ($debug) { if ($debug) {
print Dumper($response->value()); print $json . "\n";
} }
$webtask->idledata($response->value()); $aggregate->webtask()->idledata($json);
}
exit(0);
bad:
print STDERR $errmsg . "\n";
if (defined($webtask)) {
$webtask->output($errmsg);
$webtask->Exited($errcode);
}
exit($errcode);
}
#
# Handler for the "checkautoapprove" action. Takes the number of days
# from the command line, runs CheckAutoApprove() on it, dumps the result
# and exits with the value of its 'approve' field. If CheckAutoApprove()
# returns an error code, that code is recorded in the webtask (when one
# is defined) and used as the exit status.
#
sub DoCheckAutoApprove()
{
    # Never assigned in this function; kept so the failure path below
    # has the same shape as the other Do* handlers.
    my $errmsg;
    my $results;

    usage()
        if (!@ARGV);
    my $days = shift(@ARGV);

    my $errcode = CheckAutoApprove($days, \$results);
    if (!$errcode) {
        print Dumper($results);
        exit($results->{'approve'});
    }

    # Failure: report whatever we have and pass the error code along.
    print STDERR $errmsg . "\n"
        if (defined($errmsg));
    if (defined($webtask)) {
        $webtask->output($errmsg)
            if (defined($errmsg));
        $webtask->Exited($errcode);
    }
    exit($errcode);
}
sub CheckAutoApprove($$)
{
my ($days, $pref) = @_;
my $errmsg;
my $errcode = 1;
my $utilization;
my $idledata;
my $loaddata = {};
my $slice = $instance->GetGeniSlice();
my $expires_time = str2time($slice->expires());
my $autoapprove_minloadavg = 0.25;
my $autoapprove_minsamples = 12 * 18;
my $autoapprove_ratio = 0.66;
my $autoapprove_minfree = 0.1;
#
# We do not do auto extensions if more then 1000 physical node hours
# in the request.
#
if ($days * 24 * $instance->physnode_count() > 1000) {
my $rval = {
"reason" => "greater then 1000 node hours requested",
"metrics" => undef,
"approve" => 0,
};
$$pref = $rval;
return 0; return 0;
}; }
my @return_codes = (); # Only shared VMs, people can have them forever.
my @agglist = (); if (!$instance->physnode_count()) {
my $rval = {
"reason" => "extension auto approved because no physical nodes",
"metrics" => undef,
"approve" => 1,
};
$$pref = $rval;
return 0;
}
#
# We do not do auto extensions if more then 1500 physical node hours
# between current time and time at end of extension. This number needs
# calibration.
#
my $diff = ($expires_time + ($days * 3600 * 24)) - time();
if ($diff < 0) {
$errmsg = "Time is in the past. Hmm.\n";
$errcode= -1;
goto bad;
}
if ((($diff / 3600.0) / $instance->physnode_count()) > 1500) {
my $rval = {
"reason" => "granting extension would exceed 1500 node hours",
"metrics" => undef,
"approve" => 0,
};
$$pref = $rval;
return 0;
}
# #
# We ask the clusters for utilization data which includes freecounts.
# Cull out any aggregates with no nodes. # Cull out any aggregates with no nodes.
# #
my @aggregates = ();
foreach my $agg ($instance->AggregateList()) { foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg) push(@aggregates, $agg)
if ($agg->physnode_count() || $agg->virtnode_count()); if ($agg->physnode_count() || $agg->virtnode_count());
} }
$errcode = CallAggregateMethod("Utilization", \$utilization, @aggregates);
if (ParRun({"maxwaittime" => 99999, if ($errcode) {
"maxchildren" => scalar(@agglist)}, $errmsg = $utilization;
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling IdleData()";
goto bad; goto bad;
} }
foreach my $blob (@{$utilization}) {
foreach my $type (keys(%{ $blob->{'typeinfo'} })) {
my ($free,$total);
my $typeinfo = $blob->{'typeinfo'}->{$type};
if (exists($typeinfo->{'reserve_free'})) {
$free = $typeinfo->{'reserve_free'};
}
else {
$free = $typeinfo->{'free'};
}
$total = $typeinfo->{'total'};
# Skip if no data.
next
if (!$total);
if ($free / $total < $autoapprove_minfree) {
my $rval = {
"reason" => "Less then " . $autoapprove_minfree * 100 .
"% of node type $type are free",
"metrics" => $typeinfo,
"approve" => 0,
};
$$pref = $rval;
return 0;
}
}
}
# #
# Check the exit codes. # We need to know which nodes are physical, we do not worry about
# the VMs; we want shared nodes to be used, and busy VMs should be
# reflected in the load on the physical machines.
# #
foreach my $agg (@agglist) { my $nodes = $instance->GetNodeDetails();
my $code = shift(@return_codes); if (!defined($nodes)) {
$agg->webtask()->Refresh(); $errmsg = "Could not get node details for instance\n";
if ($code) { $errcode= -1;
$errmsg = "Could not get idledata from some slivers"; goto bad;
if ($agg->webtask()->output()) { }
$errmsg .= ": " . $agg->webtask()->output(); #
$errcode = $code; # Ask the clusters for the idle stats.
#
$errcode = CallAggregateMethod("IdleData", \$idledata, @aggregates);
if ($errcode) {
$errmsg = $idledata;
goto bad;
}
# We consider load average data only.
foreach my $aggregate (@aggregates) {
my $urn = $aggregate->aggregate_urn();
my $json = shift(@{$idledata});
my $list = eval { decode_json($json); };
if ($@) {
print STDERR "Could not decode json data\n";
next;
}
$loaddata->{$urn} = {};
foreach my $ref (@{ $list }) {
my $node_id = $ref->{"node_id"};
# Skip virtual nodes.
next
if (!exists($nodes->{$urn}->{$node_id}) ||
$nodes->{$urn}->{$node_id}->{'isvirtnode'});
if (!exists($ref->{"main"}) ||
ref($ref->{"main"}) ne "HASH" ||
!exists($ref->{"main"}->{"AVG"})) {
print STDERR "No data for $node_id\n";
$loaddata->{$urn}->{$node_id} = undef;
next;
} }
goto bad;
#
# Look at the last 24 hour 5 minute load average samples on
# each node and count the number of samples greater the
# threshold.
#
my @samples = @{ $ref->{"main"}->{"AVG"} };
# Skip first one, its the labels.
shift(@samples);
my $now = time();
my $timelimit = $now - (24 * 3600); # Last 24 hours.
my $total = 0;
my $count = 0;
# Oldest at the beginning of the list