reload_daemon.in 5.57 KB
Newer Older
1 2 3 4 5 6 7 8 9
#!/usr/bin/perl -wT
use English;
use Getopt::Std;
use POSIX qw(strftime);

#
# Look for nodes to reload.
#
#	usage: reload_daemon [-d]
#
# XXX - Hardwired to type "pc600".
#       Path to image and the partition are hardwired in.
#
# TODO: Use "logger" instead of writing a log file.
#
#
# Print the command line synopsis and exit with a failure status.
# Never returns.
#
sub usage()
{
    print STDOUT
        "Usage: reload_daemon [-d]\n",
        "Use the -d option to prevent daemonization\n";

    exit(-1);
}
# Option letters accepted by getopts(); -d keeps the daemon in the foreground.
my  $optlist = "d";

#
# Configure variables
#
my $TB       = "@prefix@";
my $DBNAME   = "@TBDBNAME@";
my $TBOPS    = "@TBOPSEMAIL@";

my $TYPE     = "pc600"; # XXX: Temporary hack! needs to change for ISPs
my $reloader = "$TB/sbin/sched_reload";
my $reboot   = "$TB/bin/node_reboot";
my $logfile  = "$TB/log/reloadlog";
my $debug    = 0;

#
# Flush output after every write, so the progress dots printed while
# waiting for a reload show up as they are produced.
#
$| = 1;

#
# Untaint the path. Every element must be an absolute directory: a
# trailing (or doubled) colon is an empty element, which the shell
# treats as the current directory.
#
$ENV{'PATH'} = "/bin:/usr/bin";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};

# Testbed Support library
use lib "@prefix@/lib";
use libtestbed;
#
# Parse command arguments. The daemon takes no positional arguments, so
# anything left in @ARGV once getopts has consumed the switches is an
# error.
#
%options = ();
usage()
    unless getopts($optlist, \%options);
usage()
    if @ARGV;
$debug = $options{"d"}
    if defined($options{"d"});

# Go to ground, unless -d asked us to stay in the foreground.
daemonize()
    unless $debug;

#
# Set up for querying the database. Without a handle every later query
# would fail with a method call on an undefined value, so give up right
# away with a clear report instead.
#
# NOTE(review): relies on Mysql->connect returning a false value when the
# connection cannot be made -- confirm against the installed Mysql.pm.
#
use Mysql;
my $DB = Mysql->connect("localhost", $DBNAME, "script", "none");
if (! $DB) {
    fatal("Could not connect to database $DBNAME. Stopping reload daemon.");
}

#
# Loop, looking for nodes to reload.
#
# Each pass picks one eligible node at random, asks sched_reload to start
# reloading it, and then polls the reserved table until the node leaves
# the reloading experiment. A reload that does not finish within the poll
# limit gets one power cycle and one more round of polling; if that also
# times out the daemon stops via fatal().
#
# Polling parameters. All waits are in seconds.
#
my $ERROR_WAIT   = 10;   # after a DB error, an idle pass, or a failed start
my $POLL_WAIT    = 5;    # between checks of a reload in progress
my $POLL_LIMIT   = 200;  # checks allowed before a reload is declared wedged
my $CYCLE_WAIT   = 30;   # after a reload has completed
my $STAMP_FORMAT = "%Y-%m-%d %H:%M:%S";

NODE: while (1) {
    #
    # Find the free nodes (no row in reserved) of our type whose
    # last_reservation row has a non-empty pid. The original note here
    # says that entry is reset any time the system reloads a node, so
    # presumably these are the nodes used since their last reload --
    # TODO confirm against the schema.
    #
    my $free_nodes =
        DBquery("select a.node_id from nodes as a ".
                "left join reserved as b on a.node_id=b.node_id ".
                "left join last_reservation as l on l.node_id=a.node_id ".
                "where b.node_id is null and a.type='$TYPE' and l.pid!='' ".
                "order by a.node_id");
    if (! $free_nodes) {
        print "DB Error getting free nodes. Waiting a bit.\n";
        sleep($ERROR_WAIT);
        next NODE;
    }

    my $candidates = $free_nodes->numrows;
    if (! $candidates) {
        # Nothing to do right now.
        sleep($ERROR_WAIT);
        next NODE;
    }

    #
    # RAND() does not work in our version of mysql, so generate a random
    # number with perl and pick out that node.
    #
    $free_nodes->dataseek(int(rand($candidates)));
    my ($node) = $free_nodes->fetchrow_array();

    #
    # Query for the default imageid. I do this each time through the loop
    # in case it gets changed in the DB.
    #
    my $type_info =
        DBquery("select imageid from node_types where type='$TYPE'");
    if (! $type_info) {
        print "DB Error getting node type. Waiting a bit.\n";
        sleep($ERROR_WAIT);
        next NODE;
    }

    my ($imageid) = $type_info->fetchrow_array();
    if (! defined($imageid) || $imageid eq "") {
        # No row (or an empty value): there is nothing to hand to
        # sched_reload, so do not run it with a missing argument.
        print "No default imageid for type $TYPE. Waiting a bit.\n";
        sleep($ERROR_WAIT);
        next NODE;
    }

    print "Trying to reload $node ... \n";

    #
    # Call sched_reload with the "force" option, which says that if
    # sched_reload cannot reserve the node (cause someone just got it)
    # then don't schedule a reload for later. Just fail outright.
    # We will try again in a bit.
    #
    # The list form of system() runs the program directly, so the values
    # read from the database are never interpreted by a shell.
    #
    if (system($reloader, "-f", $imageid, $node)) {
        #
        # Could not get it. Wait and go around again.
        #
        print "Could not start a reload on $node. Waiting a bit.\n";
        sleep($ERROR_WAIT);
        next NODE;
    }

    print "Reload of $node has started at " .
        strftime($STAMP_FORMAT, localtime()) . ".\n";

    #
    # Reload was started. We want to wait until its finished.
    #
    my $power_cycled = 0;

  WAIT:
    while (1) {
        my $polls = 0;

      POLL:
        while ($polls < $POLL_LIMIT) {
            my $reservation =
                DBquery("select pid,eid from reserved where node_id='$node'");
            if (! $reservation) {
                # A DB error does not count against the poll limit.
                print "DB Error getting reservation for $node. Waiting a bit\n";
                sleep($ERROR_WAIT);
                next POLL;
            }

            if (! $reservation->numrows) {
                print "\nReload of $node appears to have finished at " .
                    strftime($STAMP_FORMAT, localtime()) . ".\n";
                last WAIT;
            }

            #
            # Make sure its still in the "reloading" experiment. Its
            # possible (although unlikely) that the node will get freed up
            # by the TMCD when it reboots, and then reallocated to another
            # experiment, before we get back here to check.
            #
            # XXX "testbed/reloading" wired in.
            #
            my ($pid, $eid) = $reservation->fetchrow_array();
            if ($pid ne "testbed" || $eid ne "reloading") {
                print "\nReload of $node has finished at " .
                    strftime($STAMP_FORMAT, localtime()) . ".\n";
                last WAIT;
            }

            print ".";
            $polls++;
            sleep($POLL_WAIT);
        }

        #
        # Ran out of polls. Power cycle the node once and wait again; a
        # second timeout (or a failed power cycle) stops the daemon.
        #
        if ($power_cycled) {
            fatal("$node appears to have wedged. Stopping reload daemon.");
        }
        if (system($reboot, "-f", $node)) {
            fatal("$node was wedged, but could not be power cycled.");
        }
        print "\nReload appears wedged. Power cycling and trying once more!\n";
        $power_cycled = 1;
    }

    sleep($CYCLE_WAIT);
}

#
# Run a single SQL statement on the daemon's database handle.
#
#	usage: DBquery(sql)
#
# Returns whatever $DB->query() returned. When that value is false the
# failing statement is printed first; deciding what to do about the
# failure is left to the caller.
#
sub DBquery($)
{
    my ($sql) = @_;
    my $answer = $DB->query($sql);

    print "DB Query failed: $sql\n"
        unless $answer;

    return $answer;
}

#
# Report an unrecoverable error and stop the daemon.
#
#	usage: fatal(message)
#
# The message is mailed to the testbed operators ($TBOPS) through
# SENDMAIL and then handed to die(), so this never returns.
#
sub fatal {
    # A lexical, not local(): the message is private to this call and
    # must not touch a package global named $msg.
    my ($msg) = @_;

    SENDMAIL($TBOPS, "TESTBED: Reload Daemon Died", $msg);
    die($msg);
}

#
# Become a daemon.
#
# Forks; the parent exits with status 0 and the child carries on with
# STDIN read from /dev/null and STDOUT/STDERR appended to $logfile.
# Returns 0 in the child. Dies if the fork or any of the redirections
# fails.
#
sub daemonize()
{
    my $mypid = fork();

    #
    # fork() returns undef on failure. Without this check a failed fork
    # would be mistaken for the child, and the calling process would carry
    # on as the "daemon" itself.
    #
    if (! defined($mypid)) {
	die("forking to become a daemon: $!");
    }
    if ($mypid) {
	exit(0);
    }

    #
    # We have to disconnect from the caller by redirecting both STDIN and
    # STDOUT away from the pipe. Otherwise the caller will continue to wait
    # even though the parent has exited.
    #
    open(STDIN, "<", "/dev/null") or
	die("opening /dev/null for STDIN: $!");

    #
    # Open the batch log and start writing to it. The three-argument form
    # keeps the mode separate from the file name.
    #
    open(STDERR, ">>", $logfile) or die("opening $logfile for STDERR: $!");
    open(STDOUT, ">>", $logfile) or die("opening $logfile for STDOUT: $!");

    return 0;
}