All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit 7d62d520 authored by Dan Gebhardt's avatar Dan Gebhardt

First "release" of automanage.

- when a site has no available nodes or the bestnode. changes,
  measurements to this node are stopped from all others.

- tests are started to a new node chosen for a site.

- before a site's bestnode is selected, automanage verifies bgmon is
  running correctly on the node

- pseudo-balanced latency initialization is done using randomly
  selected path directions between a node pair.

- other misc. stuff.
parent 26e5f364
......@@ -69,8 +69,9 @@ my $lastupdated_numnodes = 0;
my $socket;
my $sel = IO::Select->new();
#FORWARD DECL'S
sub stopnode($);
sub outputErrors();
print "exp = $expid\n";
#############################################################################
......@@ -103,15 +104,31 @@ foreach my $node (@constrnodes){
#
# Main Loop
#
#my $f_firsttime = 1;
while(1)
{
%deadnodes = ();
#update node list
# print "getnodeinfo\n";
getnodeinfo();
# sleep(10);
# print "choosenodes\n";
choosenodes();
# sleep(10);
# print "modifytests\n";
modifytests();
# sleep(10);
# printchosennodes();
outputErrors();
select(undef, undef, undef, 5.0);
# sleep( 10 );
sleep( 60 );
# $f_firsttime = 0;
}
......@@ -122,17 +139,26 @@ sub getnodeinfo
#retrieve list of nodes
my $rval = libxmlrpc::CallMethod($MODULE, $METHOD,
{"class" => "pcplabphys"});
if( defined $rval ){
%allnodes = %$rval;
}else{ return; }
#populate sitenodes
foreach my $node (keys %allnodes){
my $siteid = $allnodes{$node}{site};
@{$sitenodes{$siteid}} = ();
push @{$sitenodes{$siteid}}, $node;
# print @{$sitenodes{$siteid}}."\n";
}
}
sub printNodeInfo($)
{
my ($node) = @_;
foreach my $key (keys %{$allnodes{$node}} ){
print "\t$key = $allnodes{$node}{$key}\n";
}
}
########################################################
#
......@@ -142,31 +168,40 @@ sub choosenodes
{
foreach my $site (keys %sitenodes){
# print "site $site\n";
# my $bestnode = "NONE";
my $bestnode = choosebestnode($site);
if( "NONE" eq $bestnode ){
if( !defined $bestnode ){ print "BESTNODE NOT DEF!!!\n"; }
if( "NONE" ne $bestnode &&
!defined $intersitenodes{$site} )
{
print "SECTION 1: adding $bestnode at $site\n";
# ** This section handles when a site is seen for the 1st time
#set new node to represent this site
$intersitenodes{$site} = $bestnode;
}
elsif( ("NONE" eq $bestnode) && defined $intersitenodes{$site} )
{
print "SECTION 2: removing tests to $site / ".
"$intersitenodes{$site} \n";
# ** This section handles when a site has no nodes available
#no available node at this site, so remove site from hash
#(done?)TODO: send "stop" signals to all other nodes having this
# site as the destination
foreach my $srcsite (keys %intersitenodes){
if( defined $intersitenodes{$site} ){
stoppairtest( $intersitenodes{$srcsite},
$intersitenodes{$site} );
}
}
delete $intersitenodes{$site};
}
else{
if( (!defined $intersitenodes{$site} ||
$intersitenodes{$site} ne $bestnode)
elsif( defined $intersitenodes{$site} &&
$intersitenodes{$site} ne $bestnode
#&& isnodeinconstrset($bestnode)
)
{
print "SECTION 3: node change at $site from ".
"$intersitenodes{$site} to $bestnode\n";
# ** This section handles when a "bestnode" at a site changes
#(done?)TODO
# Stop sigs to other nodes using old "bestnode" value
if( defined $intersitenodes{$site} ){
foreach my $srcsite (keys %intersitenodes){
......@@ -178,27 +213,31 @@ sub choosenodes
#set new node to represent this site
$intersitenodes{$site} = $bestnode;
#(done?)TODO: start other nodes using this new "bestnode"
# (This uses the EDIT bgmon command - see bgmon.pl)
foreach my $srcsite (keys %intersitenodes){
edittest( $intersitenodes{$srcsite},
$intersitenodes{$site},
$test_per{bw},
"bw" );
}
#TODO: need to do this smartly...
=pod
my $r = rand;
if( $r <= .5 ){
edittest( $intersitenodes{$srcsite},
$intersitenodes{$site},
$test_per{latency},
"latency" );
=cut
}else{
edittest( $intersitenodes{$site},
$intersitenodes{$srcsite},
$test_per{latency},
"latency" );
}
}
}
}
}
#
# Re-adjust the test periods of connections based on number of nodes
#
......@@ -239,6 +278,8 @@ sub choosebestnode($)
my ($site) = @_;
my $bestnode = "NONE"; #default to an error value
=pod
print "$site ";
foreach my $node ( @{$sitenodes{$site}} ){
......@@ -256,16 +297,30 @@ sub choosebestnode($)
print "\n";
}
=cut
if( $allnodes{$node}{free} == 1 && isnodeinconstrset($node) ) {
#this command acts like a bgmon "ping" - used to
#determine if bgmon running correctly
my %cmd = ( expid => $expid,
cmdtype => "EDIT",
dstnode => "NOADDR",
testtype => "bw",
testper => 0 );
if( $allnodes{$node}{free} == 1 &&
isnodeinconstrset($node) )
{
# print "choosing best node for site $site\n";
#first time thru loop...
if( $bestnode eq "NONE" ){
#set this to be best node
$bestnode = $node;
}else{
if( $allnodes{$node}{cpu} < $allnodes{$bestnode}{cpu}
+ $CPUUSAGETHRESHOLD)
if( ($allnodes{$node}{cpu} < $allnodes{$bestnode}{cpu}
- $CPUUSAGETHRESHOLD) &&
(edittest($node,"NOADDR",0,"bw") == 1) )
{
print "setting new bestnode\n";
print '$allnodes{$node}{cpu}'." $allnodes{$node}{cpu}\n";
print '$allnodes{$bestnode}{cpu}'.
" $allnodes{$bestnode}{cpu}\n";
$bestnode = $node;
}
}
......@@ -319,11 +374,29 @@ sub updateTests
}
}
initnode($srcnode, $bw_destnodes, $test_per{bw}, "bw");
}
#TODO! Distribute initialization times evenly
#init latency: fully connected, but only one direction each path
my %initstrs; #build init strings for each site node
my @sitekeys = keys %intersitenodes;
for( my $i = 0; $i < @sitekeys-1; $i++ ){
for( my $j = $i+1; $j < @sitekeys; $j++ ){
my $r = rand;
if( $r <= .5 ){
$initstrs{$intersitenodes{$sitekeys[$i]}} .=
"$intersitenodes{$sitekeys[$j]} ";
}else{
$initstrs{$intersitenodes{$sitekeys[$j]}} .=
"$intersitenodes{$sitekeys[$i]} ";
}
}
}
# now send the inits to all nodes
foreach my $srcsite (keys %intersitenodes){
$srcnode = $intersitenodes{$srcsite};
initnode($srcnode, $initstrs{$srcnode}, $test_per{latency}, "latency");
}
#TODO: LATENCY
}
#
......@@ -336,6 +409,7 @@ sub stopnode($)
if( isnodeinconstrset($node) ){
my %cmd = ( expid => $expid,
cmdtype => "STOPALL" );
print "stopnode $node called\n";
sendcmd($node,\%cmd);
}
}
......@@ -360,7 +434,7 @@ sub edittest($$$$)
testtype => $testtype,
testper => $testper );
sendcmd($srcnode,\%cmd);
return sendcmd($srcnode,\%cmd);
}
#
......@@ -411,7 +485,7 @@ sub sendcmd($$)
my $sercmd = serialize_hash( \%cmd );
my $f_success = 0;
my $max_tries = 5;
my $max_tries = 3;
do{
$socket = IO::Socket::INET->new( PeerPort => $port,
Proto => 'tcp',
......@@ -424,7 +498,7 @@ sub sendcmd($$)
# timeout period?
$sel->add($socket);
my ($ready) = $sel->can_read(1);
if( $ready eq $socket ){
if( defined($ready) && $ready eq $socket ){
my $ack = <$ready>;
chomp $ack;
if( $ack eq "ACK" ){
......@@ -447,20 +521,25 @@ sub sendcmd($$)
if( $f_success == 0 && $max_tries == 0 ){
$deadnodes{$node} = 1;
print "DID NOT GET ACK from $node for command $sercmd\n";
return -1;
}elsif( $f_success == 1 ){
#success!
return 1;
}
}
sub outputErrors()
{
print "Nodes not responding to Command:\n";
if( keys %deadnodes > 0 ){
print "Nodes not responding:\n";
foreach my $node (keys %deadnodes){
print "$node ";
}
print "\n";
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment