Commit cd6a4cb4 authored by Leigh B Stoller's avatar Leigh B Stoller

First cut at auto approving experiments that would otherwise require

admin approval. Informational for now; the results are ignored, but they
are stored in the DB and shown on the admin extend page.
parent d9b96847
......@@ -72,6 +72,7 @@ my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $GENEXTENDCRED = "$TB/sbin/protogeni/genextendcred";
my $GENIUSER = "geniuser";
my $MAINSITE = @TBMAINSITE@;
# Cache of instances to avoid regenerating them.
my %instances = ();
......@@ -784,7 +785,8 @@ sub ComputeNodeCounts($)
$manager_urn ne $sliver->aggregate_urn());
if (defined($virtualization_type) &&
$virtualization_type eq "emulab-xen") {
($virtualization_type eq "emulab-xen" ||
$virtualization_type eq "emulab-blockstore")) {
$vcount++;
next;
}
......@@ -803,6 +805,66 @@ sub ComputeNodeCounts($)
return 0;
}
#
# Walk the manifest of every sliver in this instance and collect the nodes
# that each aggregate contributed.
#
# Returns a hash reference keyed by aggregate URN. Each value is a hash
# reference keyed by node id, whose entries carry the node's "client_id"
# and an "isvirtnode" flag (1 for the emulab-xen and emulab-blockstore
# virtualization subtypes, 0 otherwise). Returns undef, after printing a
# message to STDERR, when there are no slivers, a manifest does not parse,
# or a node id that is a valid HRN turns out not to name a node.
#
sub GetNodeDetails($)
{
    my ($self) = @_;
    my %by_aggregate = ();

    my @aggregates = $self->AggregateList();
    unless (@aggregates) {
        print STDERR "No slivers for $self\n";
        return undef;
    }

    foreach my $aggregate (@aggregates) {
        my $parsed = GeniXML::Parse($aggregate->manifest());
        unless (defined($parsed)) {
            print STDERR "Could not parse manifest for $aggregate\n";
            return undef;
        }

        # The n:node elements first, then the Emulab namespace n:vhost ones.
        my @elements = (
            GeniXML::FindNodes("n:node", $parsed)->get_nodelist(),
            GeniXML::FindNodesNS("n:vhost", $parsed,
                                 $GeniXML::EMULAB_NS)->get_nodelist(),
        );

        my %found = ();
        foreach my $element (@elements) {
            my $subtype = GeniXML::GetVirtualizationSubtype($element);
            my $manager = GetManagerId($element);

            # Combined rspec; only keep nodes that belong to this aggregate.
            next
                unless (defined($manager) &&
                        $manager eq $aggregate->aggregate_urn());

            my $is_virtual = 0;
            if (defined($subtype)) {
                $is_virtual = 1
                    if ($subtype eq "emulab-xen" ||
                        $subtype eq "emulab-blockstore");
            }

            my $client_id = GetVirtualId($element);
            my $node_id   = GetVnodeId($element);

            # When the node id is a valid HRN, key on the id it carries.
            if (GeniHRN::IsValid($node_id)) {
                my $hrn = GeniHRN->new($node_id);
                unless ($hrn->IsNode()) {
                    print STDERR "$node_id is not a node\n";
                    return undef;
                }
                $node_id = $hrn->id();
            }
            $found{$node_id} = {
                "client_id"  => $client_id,
                "isvirtnode" => $is_virtual,
            };
        }
        $by_aggregate{$aggregate->aggregate_urn()} = \%found;
    }
    return \%by_aggregate;
}
#
# Add an aggregate to an instance.
#
......
......@@ -30,6 +30,7 @@ use CGI;
use POSIX ":sys_wait_h";
use POSIX qw(setsid strftime ceil floor);
use Date::Parse;
use JSON;
#
# Back-end script to manage APT profiles.
......@@ -137,6 +138,8 @@ sub DoUtilization();
sub DoIdleData();
sub DoOpenstack();
sub DoCheckReservation();
sub DoCheckAutoApprove();
sub CheckAutoApprove($$);
sub CheckReservationInternal($$$);
sub DoMaxExtension();
sub WriteCredentials();
......@@ -145,6 +148,7 @@ sub StartMonitorInternal(;$@);
sub DoImageTrackerStuff($$$$$$$);
sub DenyExtensionInternal($);
sub ExtendInternal($$$$$);
sub CallAggregateMethod($$@);
#
# Parse command arguments. Once we return from getopts, all that should be
......@@ -269,6 +273,9 @@ elsif ($action eq "checkreservation") {
elsif ($action eq "maxextension") {
DoMaxExtension()
}
elsif ($action eq "checkautoapprove") {
DoCheckAutoApprove()
}
else {
usage();
}
......@@ -1307,6 +1314,7 @@ sub DoExtend()
my $granted = 0;
my $needapproval = 0;
my $inhours = 0;
my $autoapprove_info;
my $message;
my $reason;
my $errmsg;
......@@ -1480,6 +1488,15 @@ sub DoExtend()
$granted = 0;
}
if ($wanted > $granted) {
# Sub names are case sensitive in Perl: the routine is CheckAutoApprove(),
# the same one the "longer than the maximum" branch calls. Calling the
# lowercase spelling would die with "Undefined subroutine" at runtime.
if (CheckAutoApprove($wanted, \$autoapprove_info) == 0) {
    if ($autoapprove_info->{'approve'}) {
        # Informational for now; the verdict is recorded but not acted on.
        if (0) {
            $granted = $wanted;
            goto grant;
        }
    }
}
$needapproval = 1;
$message = "because it was started more then ".
"$autoextend_maxage days ago";
......@@ -1502,6 +1519,15 @@ sub DoExtend()
# management committee."
#
elsif ($wanted > $autoextend_maximum) {
if (CheckAutoApprove($wanted, \$autoapprove_info) == 0) {
if ($autoapprove_info->{'approve'}) {
# Informational for now.
if (0) {
$granted = $wanted;
goto grant;
}
}
}
$needapproval = 1;
$message = "because it was for longer then $autoextend_maximum days";
#
......@@ -1539,6 +1565,7 @@ sub DoExtend()
#
# Do the extension.
#
grant:
if ($granted) {
my $seconds = $granted * 3600;
$seconds *= 24 if (!$inhours);
......@@ -1591,6 +1618,16 @@ sub DoExtend()
$extensionargs->{"uid"} = $instance->creator();
$extensionargs->{"uid_idx"} = $instance->creator_idx();
}
if (defined($autoapprove_info)) {
$extensionargs->{"autoapproved"} = $autoapprove_info->{'approve'};
$extensionargs->{"autoapproved_reason"} = $autoapprove_info->{'reason'};
if (defined($autoapprove_info->{'metrics'})) {
my $metrics = eval { encode_json($autoapprove_info->{'metrics'}); };
if (!$@) {
$extensionargs->{"autoapproved_metrics"} = $metrics;
}
}
}
my $extensioninfo =
APT_Instance::ExtensionInfo->Create($instance, $extensionargs);
if (!defined($extensioninfo)) {
......@@ -3482,8 +3519,9 @@ sub DoUpdateKeys()
#
sub DoUtilization()
{
my $utilization;
my $errmsg;
my $errcode = 1;
my @aggregates = ();
#
# Get the nodeid to client id mapping
......@@ -3504,72 +3542,31 @@ sub DoUtilization()
$client_ids{$obj->aggregate_urn()}->{$node_id} = $client_id;
}
}
#
# And tell the backend clusters to do the update.
#
my $coderef = sub {
my ($sliver) = @_;
my $webtask = $sliver->webtask();
my $response = $sliver->Utilization();
if (!defined($response)) {
print STDERR "RPC Error calling utilization on $sliver\n";
return -1;
}
if ($response->code() != GENIRESPONSE_SUCCESS) {
print STDERR "Could not get utilization for sliver: ".
$response->output() . "\n";
$webtask->output($response->output());
$webtask->Exited($response->code());
return $response->code();
}
$webtask->results($response->value());
return 0;
};
my @return_codes = ();
my @agglist = ();
#
# Cull out any aggregates with no nodes.
#
foreach my $agg ($instance->AggregateList()) {
push(@agglist, $agg)
if ($agg->physnode_count() || $agg->virtnode_count());
}
if (ParRun({"maxwaittime" => 99999,
"maxchildren" => scalar(@agglist)},
\@return_codes, $coderef, @agglist)) {
$errmsg = "Internal error calling Utilization()";
goto bad;
}
#
# Check the exit codes.
#
foreach my $agg (@agglist) {
my $code = shift(@return_codes);
$agg->webtask()->Refresh();
if ($code) {
$errmsg = "Could not get utilization from some slivers";
if ($agg->webtask()->output()) {
$errmsg .= ": " . $agg->webtask()->output();
$errcode = $agg->webtask()->exitcode();
foreach my $aggregate ($instance->AggregateList()) {
push(@aggregates, $aggregate)
if ($aggregate->physnode_count() || $aggregate->virtnode_count());
}
my $errcode = CallAggregateMethod("Utilization",\$utilization,@aggregates);
if ($errcode) {
$errmsg = $utilization;
goto bad;
}
foreach my $aggregate (@aggregates) {
my $blob = shift(@{$utilization});
#
# Annotate the result with some extra info for the web UI.
#
my $blob = $agg->webtask()->results();
foreach my $node_id (keys(%{ $blob->{'details'}->{'nodes'} })) {
$blob->{'details'}->{'nodes'}->{$node_id}->{"client_id"} =
$client_ids{$agg->aggregate_urn()}->{$node_id};
$client_ids{$aggregate->aggregate_urn()}->{$node_id};
}
if ($debug) {
print Dumper($agg->webtask()->results());
print Dumper($blob);
}
$agg->webtask()->results($blob);
$agg->webtask()->Store();
$aggregate->webtask()->results($blob);
}
exit(0);
bad:
......@@ -3669,76 +3666,316 @@ sub DoOpenstack()
#
sub DoIdleData()
{
    #
    # Ask every aggregate that has nodes for its idle data and store the
    # per-aggregate result in that aggregate's webtask. Exits 0 on success.
    # On failure the message is printed to STDERR, recorded in the global
    # $webtask when there is one, and the script exits with the error code.
    #
    my $idledata;
    my $errmsg;
    my $errcode = 1;
    my @aggregates = ();

    #
    # Cull out any aggregates with no nodes.
    #
    foreach my $aggregate ($instance->AggregateList()) {
        push(@aggregates, $aggregate)
            if ($aggregate->physnode_count() || $aggregate->virtnode_count());
    }

    #
    # Assign to the $errcode declared above rather than declaring a second
    # lexical of the same name in this scope (which draws a '"my" variable
    # masks earlier declaration' warning).
    #
    # NOTE(review): judging by the uses below, CallAggregateMethod() leaves
    # an error message in $idledata on failure and a list reference with
    # one entry per aggregate (in argument order) on success; confirm
    # against its definition.
    #
    $errcode = CallAggregateMethod("IdleData", \$idledata, @aggregates);
    if ($errcode) {
        $errmsg = $idledata;
        goto bad;
    }

    # We get back json strings, we pass that through to the web interface.
    foreach my $aggregate (@aggregates) {
        my $json = shift(@{$idledata});

        if ($debug) {
            print $json . "\n";
        }
        $aggregate->webtask()->idledata($json);
    }
    exit(0);
  bad:
    print STDERR $errmsg . "\n";
    if (defined($webtask)) {
        $webtask->output($errmsg);
        $webtask->Exited($errcode);
    }
    exit($errcode);
}
#
# Use the idledata to determine if an experiment can be extended without
# admin intervention.
#
# Command line entry point: takes the number of days from @ARGV (usage()
# is called when it is missing) and hands it to CheckAutoApprove(). On
# success the verdict is dumped to STDOUT and the script exits with the
# "approve" value, so 1 means approved and 0 means not approved. On
# failure the script exits with the error code from CheckAutoApprove().
#
sub DoCheckAutoApprove()
{
    my $errmsg;
    my $errcode = 1;
    my $results;

    usage()
        if (!@ARGV);
    my $days = shift(@ARGV);

    $errcode = CheckAutoApprove($days, \$results);
    if ($errcode) {
        #
        # CheckAutoApprove() has already printed the error to STDERR. It
        # also hands the text back in the "reason" slot, so pick it up
        # here; otherwise the webtask would record a failure with no
        # explanation.
        #
        $errmsg = $results->{'reason'}
            if (ref($results) eq "HASH");
        goto bad;
    }
    print Dumper($results);
    exit($results->{'approve'});
  bad:
    if (defined($webtask)) {
        $webtask->output($errmsg) if (defined($errmsg));
        $webtask->Exited($errcode);
    }
    exit($errcode);
}
#
# Decide whether extending the current instance by $days days could be
# granted without admin intervention.
#
# Arguments:
#   $days - length of the requested extension, in days.
#   $pref - reference to a scalar; it is always set to a hash reference
#           with the keys "approve" (0 or 1), "reason" (text) and
#           "metrics" (the data the decision was based on, or undef).
#
# Returns 0 when a verdict was reached (approved or not), non-zero when
# the check itself failed. In the failure case $$pref still gets a record
# with "approve" set to 0 and the error text as the reason.
#
sub CheckAutoApprove($$)
{
    my ($days, $pref) = @_;
    my $errmsg;
    my $errcode = 1;
    my $utilization;
    my $idledata;
    my $loaddata = {};
    my $slice = $instance->GetGeniSlice();
    my $expires_time = str2time($slice->expires());
    # A 5 minute load average sample at or above this counts as busy.
    my $autoapprove_minloadavg = 0.25;
    # 12 five minute samples per hour, 18 hours worth.
    my $autoapprove_minsamples = 12 * 18;
    # Fraction of a node's samples that must be busy.
    my $autoapprove_ratio = 0.66;
    # Fraction of each node type that must still be free.
    my $autoapprove_minfree = 0.1;

    #
    # We do not do auto extensions if more then 1000 physical node hours
    # in the request.
    #
    if ($days * 24 * $instance->physnode_count() > 1000) {
        $$pref = {
            "reason"  => "greater then 1000 node hours requested",
            "metrics" => undef,
            "approve" => 0,
        };
        return 0;
    }
    # Only shared VMs, people can have them forever.
    if (!$instance->physnode_count()) {
        $$pref = {
            "reason"  => "extension auto approved because no physical nodes",
            "metrics" => undef,
            "approve" => 1,
        };
        return 0;
    }

    #
    # We do not do auto extensions if more then 1500 physical node hours
    # between current time and time at end of extension. This number needs
    # calibration.
    #
    my $diff = ($expires_time + ($days * 3600 * 24)) - time();
    if ($diff < 0) {
        $errmsg = "Time is in the past. Hmm.\n";
        $errcode= -1;
        goto bad;
    }
    #
    # Node hours are hours multiplied by the number of physical nodes,
    # as in the 1000 node hour test above. Dividing by the node count
    # would make the limit easier to pass the more nodes are held.
    #
    if ((($diff / 3600.0) * $instance->physnode_count()) > 1500) {
        $$pref = {
            "reason"  => "granting extension would exceed 1500 node hours",
            "metrics" => undef,
            "approve" => 0,
        };
        return 0;
    }

    #
    # We ask the clusters for utilization data which includes freecounts.
    # Cull out any aggregates with no nodes.
    #
    my @aggregates = ();
    foreach my $agg ($instance->AggregateList()) {
        push(@aggregates, $agg)
            if ($agg->physnode_count() || $agg->virtnode_count());
    }
    # NOTE(review): judging by the uses below, CallAggregateMethod()
    # leaves an error message in the referenced scalar on failure and a
    # list reference with one entry per aggregate on success; confirm.
    $errcode = CallAggregateMethod("Utilization", \$utilization, @aggregates);
    if ($errcode) {
        $errmsg = $utilization;
        goto bad;
    }
    foreach my $blob (@{$utilization}) {
        foreach my $type (keys(%{ $blob->{'typeinfo'} })) {
            my ($free,$total);
            my $typeinfo = $blob->{'typeinfo'}->{$type};

            # Use the reserve_free count when the cluster reports one.
            if (exists($typeinfo->{'reserve_free'})) {
                $free = $typeinfo->{'reserve_free'};
            }
            else {
                $free = $typeinfo->{'free'};
            }
            $total = $typeinfo->{'total'};
            # Skip if no data.
            next
                if (!$total);

            if ($free / $total < $autoapprove_minfree) {
                $$pref = {
                    "reason"  => "Less then " . $autoapprove_minfree * 100 .
                        "% of node type $type are free",
                    "metrics" => $typeinfo,
                    "approve" => 0,
                };
                return 0;
            }
        }
    }

    #
    # We need to know which nodes are physical, we do not worry about
    # the VMs; we want shared nodes to be used, and busy VMs should be
    # reflected in the load on the physical machines.
    #
    my $nodes = $instance->GetNodeDetails();
    if (!defined($nodes)) {
        $errmsg = "Could not get node details for instance\n";
        $errcode= -1;
        goto bad;
    }

    #
    # Ask the clusters for the idle stats.
    #
    $errcode = CallAggregateMethod("IdleData", \$idledata, @aggregates);
    if ($errcode) {
        $errmsg = $idledata;
        goto bad;
    }
    # We consider load average data only.
    foreach my $aggregate (@aggregates) {
        my $urn  = $aggregate->aggregate_urn();
        my $json = shift(@{$idledata});
        my $list = eval { decode_json($json); };
        if ($@) {
            # No load data gets recorded for this aggregate, so the
            # per node check below will report missing data.
            print STDERR "Could not decode json data\n";
            next;
        }
        $loaddata->{$urn} = {};

        foreach my $ref (@{ $list }) {
            my $node_id = $ref->{"node_id"};

            # Skip virtual nodes.
            next
                if (!exists($nodes->{$urn}->{$node_id}) ||
                    $nodes->{$urn}->{$node_id}->{'isvirtnode'});

            if (!exists($ref->{"main"}) ||
                ref($ref->{"main"}) ne "HASH" ||
                !exists($ref->{"main"}->{"AVG"})) {
                print STDERR "No data for $node_id\n";
                $loaddata->{$urn}->{$node_id} = undef;
                next;
            }
            #
            # Look at the last 24 hour 5 minute load average samples on
            # each node and count the number of samples greater the
            # threshold.
            #
            my @samples = @{ $ref->{"main"}->{"AVG"} };
            # Skip first one, its the labels.
            shift(@samples);

            my $now       = time();
            my $timelimit = $now - (24 * 3600);	# Last 24 hours.
            my $total     = 0;
            my $count     = 0;

            # Oldest at the beginning of the list, so walk from the end
            # and stop at the first sample older than the time limit.
            while (@samples) {
                my $sample = pop(@samples);
                my ($stamp, undef, $load5) = @{$sample};
                last
                    if ($stamp < $timelimit);

                $total++;
                # We can get null samples (no data).
                next
                    if (!defined($load5));
                $count++ if ($load5 >= $autoapprove_minloadavg);
            }
            $loaddata->{$urn}->{$node_id} = {
                'samples' => $total,
                'busy'    => $count,
            };
        }
    }

    #
    # Need at least 18 hours of samples on every node and 66% of them
    # have to be above the threshold.
    #
    my $message;
    foreach my $urn (keys(%{$nodes})) {
        foreach my $node_id (keys(%{ $nodes->{$urn} })) {
            my $details = $nodes->{$urn}->{$node_id};
            next
                if ($details->{'isvirtnode'});

            # No data for a node is an error.
            if (!defined($loaddata->{$urn}->{$node_id})) {
                $message = "No node data for some nodes";
                last;
            }
            my $samples = $loaddata->{$urn}->{$node_id}->{'samples'};
            my $count   = $loaddata->{$urn}->{$node_id}->{'busy'};

            # Must have 18 hours of samples.
            if ($samples < $autoapprove_minsamples) {
                $message = "Not enough 5 minute samples for some nodes. ".
                    "(less then $autoapprove_minsamples)";
                last;
            }
            # 2/3 of samples greater then the threshold.
            if ($count / $samples < $autoapprove_ratio) {
                $message = "Some nodes not busy enough in last day. ".
                    "(less then $autoapprove_ratio of ".
                    "$autoapprove_minsamples samples > ".
                    "$autoapprove_minloadavg load average)";
                last;
            }
        }
        # The inner "last" only leaves the per node loop.
        last
            if (defined($message));
    }
    if (defined($message)) {
        $$pref = {
            "reason"  => $message,
            "metrics" => $loaddata,
            "approve" => 0,
        };
        return 0;
    }
    $$pref = {
        "reason"  => "extension auto approved",
        "metrics" => $loaddata,
        "approve" => 1,
    };
    return 0;

  bad:
    print STDERR $errmsg . "\n" if (defined($errmsg));
    #
    # We want to return this so that it goes into the record, otherwise
    # we will be confused as to why the auto approval has no data.
    #
    $$pref = {
        "reason"  => $errmsg,
        "metrics" => undef,
        "approve" => 0,
    };
    return $errcode;
}
#
......@@ -3967,3 +4204,87 @@ sub escapeshellarg($)
return $str;
}
sub CallAggregateMethod($$@)
{