All new accounts created on Gitlab now require administrator approval. If you invite any collaborators, please let Flux staff know so they can approve the accounts.

Commit 7d62d520 authored by Dan Gebhardt's avatar Dan Gebhardt

First "release" of automanage.

- when a site has no available nodes or the bestnode. changes,
  measurements to this node are stopped from all others.

- tests are started to a new node chosen for a site.

- before a site's bestnode is selected, automanage verifies bgmon is
  running correctly on the node

- pseudo-balanced latency initialization is done using randomly
  selected path directions between a node pair.

- other misc. stuff.
parent 26e5f364
...@@ -69,8 +69,9 @@ my $lastupdated_numnodes = 0; ...@@ -69,8 +69,9 @@ my $lastupdated_numnodes = 0;
my $socket; my $socket;
my $sel = IO::Select->new(); my $sel = IO::Select->new();
#FORWARD DECL'S
sub stopnode($); sub stopnode($);
sub outputErrors();
print "exp = $expid\n"; print "exp = $expid\n";
############################################################################# #############################################################################
...@@ -103,15 +104,31 @@ foreach my $node (@constrnodes){ ...@@ -103,15 +104,31 @@ foreach my $node (@constrnodes){
# #
# Main Loop # Main Loop
# #
#my $f_firsttime = 1;
while(1) while(1)
{ {
%deadnodes = ();
#update node list #update node list
# print "getnodeinfo\n";
getnodeinfo(); getnodeinfo();
# sleep(10);
# print "choosenodes\n";
choosenodes(); choosenodes();
# sleep(10);
# print "modifytests\n";
modifytests(); modifytests();
# sleep(10);
# printchosennodes(); # printchosennodes();
outputErrors(); outputErrors();
select(undef, undef, undef, 5.0); # sleep( 10 );
sleep( 60 );
# $f_firsttime = 0;
} }
...@@ -122,17 +139,26 @@ sub getnodeinfo ...@@ -122,17 +139,26 @@ sub getnodeinfo
#retrieve list of nodes #retrieve list of nodes
my $rval = libxmlrpc::CallMethod($MODULE, $METHOD, my $rval = libxmlrpc::CallMethod($MODULE, $METHOD,
{"class" => "pcplabphys"}); {"class" => "pcplabphys"});
if( defined $rval ){
%allnodes = %$rval; %allnodes = %$rval;
}else{ return; }
#populate sitenodes #populate sitenodes
foreach my $node (keys %allnodes){ foreach my $node (keys %allnodes){
my $siteid = $allnodes{$node}{site}; my $siteid = $allnodes{$node}{site};
@{$sitenodes{$siteid}} = ();
push @{$sitenodes{$siteid}}, $node; push @{$sitenodes{$siteid}}, $node;
# print @{$sitenodes{$siteid}}."\n"; # print @{$sitenodes{$siteid}}."\n";
} }
} }
sub printNodeInfo($)
{
my ($node) = @_;
foreach my $key (keys %{$allnodes{$node}} ){
print "\t$key = $allnodes{$node}{$key}\n";
}
}
######################################################## ########################################################
# #
...@@ -142,31 +168,40 @@ sub choosenodes ...@@ -142,31 +168,40 @@ sub choosenodes
{ {
foreach my $site (keys %sitenodes){ foreach my $site (keys %sitenodes){
# print "site $site\n"; # print "site $site\n";
# my $bestnode = "NONE";
my $bestnode = choosebestnode($site); my $bestnode = choosebestnode($site);
if( "NONE" eq $bestnode ){ if( !defined $bestnode ){ print "BESTNODE NOT DEF!!!\n"; }
if( "NONE" ne $bestnode &&
!defined $intersitenodes{$site} )
{
print "SECTION 1: adding $bestnode at $site\n";
# ** This section handles when a site is seen for the 1st time
#set new node to represent this site
$intersitenodes{$site} = $bestnode;
}
elsif( ("NONE" eq $bestnode) && defined $intersitenodes{$site} )
{
print "SECTION 2: removing tests to $site / ".
"$intersitenodes{$site} \n";
# ** This section handles when a site has no nodes available # ** This section handles when a site has no nodes available
#no available node at this site, so remove site from hash #no available node at this site, so remove site from hash
#(done?)TODO: send "stop" signals to all other nodes having this
# site as the destination
foreach my $srcsite (keys %intersitenodes){ foreach my $srcsite (keys %intersitenodes){
if( defined $intersitenodes{$site} ){
stoppairtest( $intersitenodes{$srcsite}, stoppairtest( $intersitenodes{$srcsite},
$intersitenodes{$site} ); $intersitenodes{$site} );
} }
}
delete $intersitenodes{$site}; delete $intersitenodes{$site};
} }
else{ elsif( defined $intersitenodes{$site} &&
if( (!defined $intersitenodes{$site} || $intersitenodes{$site} ne $bestnode
$intersitenodes{$site} ne $bestnode)
#&& isnodeinconstrset($bestnode) #&& isnodeinconstrset($bestnode)
) )
{ {
print "SECTION 3: node change at $site from ".
"$intersitenodes{$site} to $bestnode\n";
# ** This section handles when a "bestnode" at a site changes # ** This section handles when a "bestnode" at a site changes
#(done?)TODO
# Stop sigs to other nodes using old "bestnode" value # Stop sigs to other nodes using old "bestnode" value
if( defined $intersitenodes{$site} ){ if( defined $intersitenodes{$site} ){
foreach my $srcsite (keys %intersitenodes){ foreach my $srcsite (keys %intersitenodes){
...@@ -178,27 +213,31 @@ sub choosenodes ...@@ -178,27 +213,31 @@ sub choosenodes
#set new node to represent this site #set new node to represent this site
$intersitenodes{$site} = $bestnode; $intersitenodes{$site} = $bestnode;
#(done?)TODO: start other nodes using this new "bestnode"
# (This uses the EDIT bgmon command - see bgmon.pl)
foreach my $srcsite (keys %intersitenodes){ foreach my $srcsite (keys %intersitenodes){
edittest( $intersitenodes{$srcsite}, edittest( $intersitenodes{$srcsite},
$intersitenodes{$site}, $intersitenodes{$site},
$test_per{bw}, $test_per{bw},
"bw" ); "bw" );
} my $r = rand;
#TODO: need to do this smartly... if( $r <= .5 ){
=pod
edittest( $intersitenodes{$srcsite}, edittest( $intersitenodes{$srcsite},
$intersitenodes{$site}, $intersitenodes{$site},
$test_per{latency}, $test_per{latency},
"latency" ); "latency" );
=cut }else{
edittest( $intersitenodes{$site},
$intersitenodes{$srcsite},
$test_per{latency},
"latency" );
} }
} }
}
} }
} }
# #
# Re-adjust the test periods of connections based on number of nodes # Re-adjust the test periods of connections based on number of nodes
# #
...@@ -239,6 +278,8 @@ sub choosebestnode($) ...@@ -239,6 +278,8 @@ sub choosebestnode($)
my ($site) = @_; my ($site) = @_;
my $bestnode = "NONE"; #default to an error value my $bestnode = "NONE"; #default to an error value
=pod =pod
print "$site "; print "$site ";
foreach my $node ( @{$sitenodes{$site}} ){ foreach my $node ( @{$sitenodes{$site}} ){
...@@ -256,16 +297,30 @@ sub choosebestnode($) ...@@ -256,16 +297,30 @@ sub choosebestnode($)
print "\n"; print "\n";
} }
=cut =cut
if( $allnodes{$node}{free} == 1 && isnodeinconstrset($node) ) { #this command acts like a bgmon "ping" - used to
#determine if bgmon running correctly
my %cmd = ( expid => $expid,
cmdtype => "EDIT",
dstnode => "NOADDR",
testtype => "bw",
testper => 0 );
if( $allnodes{$node}{free} == 1 &&
isnodeinconstrset($node) )
{
# print "choosing best node for site $site\n"; # print "choosing best node for site $site\n";
#first time thru loop... #first time thru loop...
if( $bestnode eq "NONE" ){ if( $bestnode eq "NONE" ){
#set this to be best node #set this to be best node
$bestnode = $node; $bestnode = $node;
}else{ }else{
if( $allnodes{$node}{cpu} < $allnodes{$bestnode}{cpu} if( ($allnodes{$node}{cpu} < $allnodes{$bestnode}{cpu}
+ $CPUUSAGETHRESHOLD) - $CPUUSAGETHRESHOLD) &&
(edittest($node,"NOADDR",0,"bw") == 1) )
{ {
print "setting new bestnode\n";
print '$allnodes{$node}{cpu}'." $allnodes{$node}{cpu}\n";
print '$allnodes{$bestnode}{cpu}'.
" $allnodes{$bestnode}{cpu}\n";
$bestnode = $node; $bestnode = $node;
} }
} }
...@@ -319,11 +374,29 @@ sub updateTests ...@@ -319,11 +374,29 @@ sub updateTests
} }
} }
initnode($srcnode, $bw_destnodes, $test_per{bw}, "bw"); initnode($srcnode, $bw_destnodes, $test_per{bw}, "bw");
}
#TODO! Distribute initialization times evenly #init latency: fully connected, but only one direction each path
my %initstrs; #build init strings for each site node
my @sitekeys = keys %intersitenodes;
for( my $i = 0; $i < @sitekeys-1; $i++ ){
for( my $j = $i+1; $j < @sitekeys; $j++ ){
my $r = rand;
if( $r <= .5 ){
$initstrs{$intersitenodes{$sitekeys[$i]}} .=
"$intersitenodes{$sitekeys[$j]} ";
}else{
$initstrs{$intersitenodes{$sitekeys[$j]}} .=
"$intersitenodes{$sitekeys[$i]} ";
}
}
}
# now send the inits to all nodes
foreach my $srcsite (keys %intersitenodes){
$srcnode = $intersitenodes{$srcsite};
initnode($srcnode, $initstrs{$srcnode}, $test_per{latency}, "latency");
} }
#TODO: LATENCY
} }
# #
...@@ -336,6 +409,7 @@ sub stopnode($) ...@@ -336,6 +409,7 @@ sub stopnode($)
if( isnodeinconstrset($node) ){ if( isnodeinconstrset($node) ){
my %cmd = ( expid => $expid, my %cmd = ( expid => $expid,
cmdtype => "STOPALL" ); cmdtype => "STOPALL" );
print "stopnode $node called\n";
sendcmd($node,\%cmd); sendcmd($node,\%cmd);
} }
} }
...@@ -360,7 +434,7 @@ sub edittest($$$$) ...@@ -360,7 +434,7 @@ sub edittest($$$$)
testtype => $testtype, testtype => $testtype,
testper => $testper ); testper => $testper );
sendcmd($srcnode,\%cmd); return sendcmd($srcnode,\%cmd);
} }
# #
...@@ -411,7 +485,7 @@ sub sendcmd($$) ...@@ -411,7 +485,7 @@ sub sendcmd($$)
my $sercmd = serialize_hash( \%cmd ); my $sercmd = serialize_hash( \%cmd );
my $f_success = 0; my $f_success = 0;
my $max_tries = 5; my $max_tries = 3;
do{ do{
$socket = IO::Socket::INET->new( PeerPort => $port, $socket = IO::Socket::INET->new( PeerPort => $port,
Proto => 'tcp', Proto => 'tcp',
...@@ -424,7 +498,7 @@ sub sendcmd($$) ...@@ -424,7 +498,7 @@ sub sendcmd($$)
# timeout period? # timeout period?
$sel->add($socket); $sel->add($socket);
my ($ready) = $sel->can_read(1); my ($ready) = $sel->can_read(1);
if( $ready eq $socket ){ if( defined($ready) && $ready eq $socket ){
my $ack = <$ready>; my $ack = <$ready>;
chomp $ack; chomp $ack;
if( $ack eq "ACK" ){ if( $ack eq "ACK" ){
...@@ -447,20 +521,25 @@ sub sendcmd($$) ...@@ -447,20 +521,25 @@ sub sendcmd($$)
if( $f_success == 0 && $max_tries == 0 ){ if( $f_success == 0 && $max_tries == 0 ){
$deadnodes{$node} = 1; $deadnodes{$node} = 1;
print "DID NOT GET ACK from $node for command $sercmd\n";
return -1;
}elsif( $f_success == 1 ){
#success!
return 1;
} }
} }
sub outputErrors() sub outputErrors()
{ {
print "Nodes not responding to Command:\n"; if( keys %deadnodes > 0 ){
print "Nodes not responding:\n";
foreach my $node (keys %deadnodes){ foreach my $node (keys %deadnodes){
print "$node "; print "$node ";
} }
print "\n"; print "\n";
}
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment