#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;
use POSIX qw(strftime);

#
# Monitor the condition of plab nodes by continually trying to setup/teardown
# vnodes on pnodes that are in hwdown. The goal is to move the pnodes out
# of hwdown so that the vnodes on that pnode will be considered okay for
# experiments (see ptopgen). 
# 
sub usage()
{
    # Print a short usage summary on STDERR and exit with a failure
    # status; this sub never returns. The program name matches the
    # installed daemon (plabmonitord).
    print STDERR "Usage: plabmonitord [-d]\n" .
                 "  -d  Debug mode; do not daemonize.\n";
    exit(-1);
}
# Option letters handed to getopts(), and the debug flag that -d turns on.
my ($optlist, $debug) = ("d", 0);

#
# Refuse to run for anyone but real root.
#
unless ($UID == 0) {
    print STDERR "You must be root to run this script!\n";
    exit(-1);
}

#
# Configure variables
#
# Filled in by configure: installation prefix and the ops mail address.
my $TB    = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";

# Taint mode: drop the environment variables that can alter how a shell
# behaves, and pin PATH to a fixed list of directories.
delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
$ENV{PATH} = "/bin:/usr/bin:/usr/local/bin:$TB/sbin:$TB/bin";

# NOTE(review): presumably consulted by the testbed libraries/tools this
# script invokes -- confirm where WITH_TB_ADMIN_PRIVS is read.
$ENV{WITH_TB_ADMIN_PRIVS} = '1';

# Autoflush the currently selected output handle so that output is
# written immediately instead of being buffered.
$OUTPUT_AUTOFLUSH = 1;

# Load the Testbed support stuff.
use lib "@prefix@/lib";
use libdb;
use libtestbed;

# Be careful not to exit on transient error
# NOTE(review): presumably the number of attempts libdb makes for a query
# before giving up -- confirm against libdb.
$libdb::DBQUERY_MAXTRIES = 30;

# Variables from libdb.
# Project/experiment ID pairs used in the queries below:
#   PLABMOND_*    - the monitor's own experiment, holding the vnodes we test.
#   PLABHOLDING_* - the holding experiment that revived pnodes are moved to.
#   NODEDEAD_*    - the hwdown experiment that failed pnodes sit in.
my $PLABMOND_PID    = PLABMOND_PID();
my $PLABMOND_EID    = PLABMOND_EID();
my $PLABHOLDING_PID = PLABHOLDING_PID();
my $PLABHOLDING_EID = PLABHOLDING_EID();
my $NODEDEAD_PID    = NODEDEAD_PID();
my $NODEDEAD_EID    = NODEDEAD_EID();

sub TimeStamp()
{
    # Format the current local time as "MM/DD/YY HH:MM:SS" for log lines.
    my @now   = localtime(time());
    my $stamp = strftime("%m/%d/%y %H:%M:%S", @now);

    return $stamp;
}

#
# Parse command arguments. The only switch is -d (debug). The daemon takes
# no positional arguments, so anything left in @ARGV after getopts is a
# usage error.
#
my %options = ();
if (!getopts($optlist, \%options) || @ARGV) {
    usage();
}
if (defined($options{"d"})) {
    $debug = 1;
}

#
# Local vars
#
my $logfile  = "$TB/log/plabmonitord";
my @nodes    = ();   # [vnode, pnode] pairs seen on the previous pass.
my $SLEEPINT = 300;  # five minutes between alloc retries.

#
# daemonize (skipped in debug mode).
# NOTE(review): a true return from TBBackGround is presumably the parent
# side of the fork, which exits here -- confirm against libtestbed.
#
if (!$debug && TBBackGround($logfile)) {
    exit(0);
}

# Use the same timestamp format as every other log line, rather than
# shelling out to date(1).
print "Plab Monitor Daemon starting... pid $$, at " . TimeStamp() . "\n";

#
# We want list of all vnodes in our special experiment, whose pnodes are
# in hwdown. These are the nodes we test, hoping to move them out of
# hwdown.
#
# Each pass fetches the candidate list, tests every candidate in turn, and
# then sleeps $SLEEPINT seconds before starting over. The loop never ends.
#
while (1) {

    print "--------------------------------------------------------------".
          "----------------\n";

    my $query_result =
        DBQueryWarn("select r1.node_id,n1.phys_nodeid from reserved as r1 ".
                    "left join nodes as n1 on n1.node_id=r1.node_id ".
                    "left join reserved as r2 on r2.node_id=n1.phys_nodeid ".
                    "where r1.pid='$PLABMOND_PID' and ".
                    "      r1.eid='$PLABMOND_EID' and ".
                    "      r2.pid='$NODEDEAD_PID' and ".
                    "      r2.eid='$NODEDEAD_EID' ".
                    "order by rand()");

    if (!$query_result) {
        print "Failed to get node list from DB! Waiting a bit ...\n";
        sleep($SLEEPINT);
        next;
    }

    #
    # Build up current node list. Nodes that were already in the list on
    # the previous pass go to the back; nodes that are new this pass go
    # to the front, so they are tested first.
    #
    my @newnodes = ();
    while (my ($vnode, $pnode) = $query_result->fetchrow_array()) {
        if (search($vnode, @nodes)) {
            push(@newnodes, [$vnode, $pnode]);
        }
        else {
            unshift(@newnodes, [$vnode, $pnode]);
        }
    }
    @nodes = @newnodes;

    foreach my $nmap (@nodes) {
        my ($vnode, $pnode) = @$nmap;
        my $revive = 0;

        # Brief pause between consecutive node tests.
        sleep(5);

        # * Try full vnode_setup on node - mgmt sliver.
        #   - If (multiple?) fail, try to delete node/sliver from mgmt slice.
        #     (can we push out this action?)
        #

        #
        # Make sure the node is still in $NODEDEAD_*
        #
        my $check_result =
            DBQueryWarn("select node_id from reserved where ".
                        "node_id = '$pnode' and ".
                        "pid = '$NODEDEAD_PID' and ".
                        "eid = '$NODEDEAD_EID'");
        if (!$check_result) {
            # DB trouble: abandon this pass and fall through to the sleep.
            print "Node entry DB check failed!  Waiting a bit ...\n";
            last;
        }
        if (!$check_result->num_rows()) {
            print "Node was removed out from under us! Continuing on ...\n";
            next;
        }

        print "##############################################################################\n";
        print "### Checking $vnode on $pnode at " . TimeStamp() . "\n";

        #
        # Try to tear it down, set it up, and wait for ISUP.
        #
        # Bah, can't do unassign->assign in plab right now since unassign
        # is async, and may clobber the immediate assign afterward.
        #
        #system("vnode_setup -f -k -d $PLABMOND_PID $PLABMOND_EID $vnode");
        #
        # List form of system(): the arguments go straight to the program
        # without being re-parsed by a shell.
        #
        system("vnode_setup", "-f", "-d",
               $PLABMOND_PID, $PLABMOND_EID, $vnode);
        if ($?) {
            print "Failed to allocate $vnode on $pnode\n";
        }
        else {
            # NOTE(review): a false return is treated as "reached ISUP
            # within 120 seconds" -- confirm TBNodeStateWait's return
            # convention against libdb.
            if (! TBNodeStateWait($vnode, TBDB_NODESTATE_ISUP, time(), 120)) {
                $revive = 1;
            }
        }

        #
        # That all worked. Move the pnode out of hwdown and back into
        # normal holding experiment.
        #
        # NOTE(review): the update matches on node_id only; if the pnode
        # left hwdown while we were testing, it is still moved to the
        # holding experiment. Consider whether that is intended.
        #
        if ($revive &&
            DBQueryWarn("update reserved set ".
                        "  pid='$PLABHOLDING_PID',eid='$PLABHOLDING_EID' ".
                        "where node_id='$pnode'")) {
            print "$pnode brought back from the afterworld at ".
                TimeStamp() . "\n";

            TBSetNodeLogEntry($pnode, "root", TB_DEFAULT_NODELOGTYPE(),
                              "'Moved to $PLABHOLDING_EID; ".
                              "plab node $vnode setup okay by monitor.'");

            SENDMAIL($TBOPS, "$pnode is alive",
                     "$pnode has been brought back from the afterworld!",
                     $TBOPS);
        }
        else {
            print "Leaving $pnode in hwdown!\n";
        }
    }

    sleep($SLEEPINT);
}

exit(0);

sub fatal($)
{
    # Report a fatal error: pass the message to SENDMAIL with $TBOPS
    # (presumably as both recipient and sender -- confirm against
    # libtestbed), then die with the same message. Never returns.
    #
    # $msg is a lexical so that no package global is touched.
    my ($msg) = @_;

    SENDMAIL($TBOPS, "Plab Monitor Died", $msg, $TBOPS);
    die($msg);
}


sub search($@)
{
    # Return 1 if $target equals (string comparison) the first element of
    # any array reference in the remaining arguments, otherwise 0.
    #
    # All working variables are lexical, so calling this sub does not
    # modify any package globals.
    #
    # NOTE: the prototype is kept for interface compatibility; calls that
    # are compiled before this definition are not checked against it.
    my ($target, @entries) = @_;

    foreach my $entry (@entries) {
        return 1
            if ($target eq $entry->[0]);
    }
    return 0;
}