Commit 5e43a771 authored by Timothy Stack's avatar Timothy Stack

Initial checkin of a "repositioning" daemon that moves robots back to
their pens on swapout.

	* configure, configure.in: Add tbsetup/repos_daemon.

	* db/libdb.pm.in: Add constants for the
	repositionpending/repositioning experiments.

	* db/nfree.in: When freeing garcias, send them to
	repositionpending instead of reloadpending.

	* event/sched/event-sched.c: Deal with the rare case of no
	SIMULATOR object being in the agent list for an experiment.

	* robots/emc/emcd.c, robots/emc/locpiper.in: Fix some typos.

	* robots/rmcd/masterController.h, robots/rmcd/masterController.c,
	robots/rmcd/obstacles.h, robots/rmcd/obstacles.c: Ignore dynamic
	obstacles that are far away and remove dynamic obstacles where the
	robot is inside the natural obstacle area.

	* sql/database-create.sql, sql/database-migrate.txt: Add a
	reposition_status table that tracks the status of robots that are
	being moved back to their pens.

	* tbsetup/GNUmakefile.in: Install the repos_daemon script.

	* tbsetup/reload_daemon.in: Move robots to the repositionpending
	experiment, if they haven't already reached their pen.

	* tbsetup/repos_daemon.in: Daemon that takes care of seeing robots
	back to their pens after they are freed from an experiment.
parent ab06311e
......@@ -2230,7 +2230,7 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/node_reboot tbsetup/webnscheck tbsetup/nscheck \
tbsetup/resetvlans tbsetup/rmuser tbsetup/rmproj \
tbsetup/sched_reload tbsetup/sched_reserve tbsetup/reload_daemon \
tbsetup/batchexp tbsetup/batch_daemon \
tbsetup/batchexp tbsetup/batch_daemon tbsetup/repos_daemon \
tbsetup/webdelay_config tbsetup/webbatchexp tbsetup/webreport \
tbsetup/wanlinkinfo tbsetup/wanassign \
tbsetup/webswapexp tbsetup/swapexp \
......
......@@ -668,7 +668,7 @@ outfiles="$outfiles Makeconf GNUmakefile \
tbsetup/node_reboot tbsetup/webnscheck tbsetup/nscheck \
tbsetup/resetvlans tbsetup/rmuser tbsetup/rmproj \
tbsetup/sched_reload tbsetup/sched_reserve tbsetup/reload_daemon \
tbsetup/batchexp tbsetup/batch_daemon \
tbsetup/batchexp tbsetup/batch_daemon tbsetup/repos_daemon \
tbsetup/webdelay_config tbsetup/webbatchexp tbsetup/webreport \
tbsetup/wanlinkinfo tbsetup/wanassign \
tbsetup/webswapexp tbsetup/swapexp \
......
......@@ -66,6 +66,8 @@ use vars qw(@ISA @EXPORT);
DBLIMIT_NSFILESIZE NODERELOADPENDING_EID
NODEREPOSITIONING_PID NODEREPOSITIONING_EID NODEREPOSPENDING_EID
EXPTSTATE_NEW EXPTSTATE_PRERUN EXPTSTATE_SWAPPED EXPTSTATE_SWAPPING
EXPTSTATE_ACTIVATING EXPTSTATE_ACTIVE EXPTSTATE_PANICED
EXPTSTATE_TERMINATING EXPTSTATE_TERMINATED EXPTSTATE_QUEUED
......@@ -325,6 +327,9 @@ sub TBDB_EXPT_WORKDIR() { "/usr/testbed/expwork"; }
sub NODERELOADING_PID() { $TBOPSPID; }
sub NODERELOADING_EID() { "reloading"; }
sub NODERELOADPENDING_EID() { "reloadpending"; }
sub NODEREPOSITIONING_PID() { $TBOPSPID; }
sub NODEREPOSITIONING_EID() { "repositioning"; }
sub NODEREPOSPENDING_EID() { "repositionpending"; }
sub NODEDEAD_PID() { $TBOPSPID; }
sub NODEDEAD_EID() { "hwdown"; }
sub PLABMOND_PID() { $TBOPSPID; }
......
......@@ -41,6 +41,7 @@ my $nodereboot = "$TB/bin/node_reboot";
my $reloadpid = "emulab-ops";
my $pendingeid = "reloadpending";
my $reloadeid = "reloading";
my $rppendingeid= "repositionpending";
my $oldreserved_pid = OLDRESERVED_PID;
my $oldreserved_eid = OLDRESERVED_EID;
my $lockedpid = NFREELOCKED_PID();
......@@ -386,10 +387,29 @@ foreach my $n (@freed_nodes) {
$inreloads = 0;
}
if (!$TESTMODE &&
((!$isvirt && $imageable) || # XXX force reload hack!
$inreloads || $mustzero{$n} ||
TBNodeType($n) eq "garcia")) { # XXX Garcia hack
if (TBNodeType($n) eq "garcia") {
print "Moving $n to $reloadpid/$rppendingeid.\n";
DBQueryWarn("update reserved set pid='$reloadpid',".
"eid='$rppendingeid',vname='$n' where node_id='$n'") ||
$error++;
DBQueryWarn("REPLACE INTO scheduled_reloads set node_id='$n'") ||
$error++;
TBSetNodeHistory($n, TB_NODEHISTORY_OP_MOVE, $UID,
$reloadpid, $rppendingeid);
# This little silliness is for disk reloading.
# Kill the last reservation since this path is special.
DBQueryWarn("delete from last_reservation where node_id='$n'") ||
$error++;
next;
}
elsif (!$TESTMODE &&
((!$isvirt && $imageable) || # XXX force reload hack!
$inreloads || $mustzero{$n})) { # XXX Garcia hack
print "Moving $n to $reloadpid/$pendingeid.\n";
DBQueryWarn("update reserved set pid='$reloadpid',eid='$pendingeid',".
......
......@@ -1215,22 +1215,24 @@ get_static_events(event_handle_t handle)
tuple->expt = pideid;
event.agent.s = primary_simulator_agent->sa_local_agent.la_agent;
event.notification = event_notification_create(
handle,
EA_Experiment, pideid,
EA_Type, TBDB_OBJECTTYPE_SIMULATOR,
EA_Event, TBDB_EVENTTYPE_LOG,
EA_Name, event.agent.s->name,
EA_Arguments, "Time started",
EA_TAG_DONE);
event.time.tv_sec = 0;
event.time.tv_usec = 1;
event.length = 1;
event.flags = SEF_SINGLE_HANDLER;
if (event.agent.s != NULL) {
// XXX emulab-ops experiments
event.notification = event_notification_create(
handle,
EA_Experiment, pideid,
EA_Type, TBDB_OBJECTTYPE_SIMULATOR,
EA_Event, TBDB_EVENTTYPE_LOG,
EA_Name, event.agent.s->name,
EA_Arguments, "Time started",
EA_TAG_DONE);
event.time.tv_sec = 0;
event.time.tv_usec = 1;
event.length = 1;
event.flags = SEF_SINGLE_HANDLER;
sched_event_prepare(handle, &event);
timeline_agent_append(ns_sequence, &event);
sched_event_prepare(handle, &event);
timeline_agent_append(ns_sequence, &event);
}
/*
* Generate a TIME starts message.
......
......@@ -396,7 +396,7 @@ int main(int argc, char *argv[])
if (pidfile)
strcpy(buf, pidfile);
else
sprintf(buf, "%s/progagent.pid", _PATH_VARRUN);
sprintf(buf, "%s/emcd.pid", _PATH_VARRUN);
fp = fopen(buf, "w");
if (fp != NULL) {
fprintf(fp, "%d\n", getpid());
......
......@@ -637,7 +637,7 @@ sub KillThePiper()
if ($err == ESRCH) {
print "*** WARNING:".
"Prerender process $procid for $pid/$eid already dead\n";
"Locpiper process $procid for $pid/$eid already dead\n";
}
else {
SENDMAIL($TBOPS,
......
......@@ -415,6 +415,8 @@ static int mc_process_report(struct master_controller *mc, mtp_packet_t *mp)
assert(mc_invariant(mc));
assert(mp != NULL);
ob_cancel_obstacle(&mc->mc_plan.pp_actual_pos);
mc->mc_pilot->pc_flags &= ~PCF_EXPECTING_RESPONSE;
mcr = &mp->data.mtp_payload_u.contact_report;
......@@ -425,6 +427,10 @@ static int mc_process_report(struct master_controller *mc, mtp_packet_t *mp)
float local_bearing;
local_bearing = atan2f(mcr->points[lpc].y, mcr->points[lpc].x);
if (hypotf(mcr->points[lpc].x, mcr->points[lpc].y) >
MAX_CONTACT_DISTANCE) {
continue;
}
compass |= (mtp_compass(local_bearing) & (MCF_EAST|MCF_WEST));
ob_obstacle_location(&cp,
......@@ -446,6 +452,7 @@ static int mc_process_report(struct master_controller *mc, mtp_packet_t *mp)
case MCF_EAST|MCF_WEST:
info("%s cannot move!\n", mc->mc_pilot->pc_robot->hostname);
mc->mc_pause_time = DEFAULT_PAUSE_TIME;
mc->mc_flags &= ~MCF_CONTACT;
mc->mc_self_obstacle = ob_add_robot(&mc->mc_plan.pp_actual_pos,
mc->mc_pilot->pc_robot->id);
break;
......
......@@ -48,6 +48,7 @@ struct master_controller {
};
#define DEFAULT_PAUSE_TIME 10
#define MAX_CONTACT_DISTANCE 0.36
/**
* Dispatch a packet received from the pilot. We expect to only receive
......
......@@ -291,6 +291,33 @@ void ob_obstacle_location(struct contact_point *dst,
REL2ABS(dst, actual->theta, cp_local, actual);
}
/*
 * Retire every active dynamic obstacle whose natural (unexpanded) area
 * yields a zero code from rc_compute_code() for the robot's position,
 * i.e. the robot is reported to be inside the obstacle itself.
 *
 * Each matching node is unlinked from the active list and pushed onto the
 * free list; when an EMC handle is configured, an MTP_REMOVE_OBSTACLE
 * packet carrying the node's expanded bounds is sent as well.
 *
 * rp - The robot position to test against; must not be NULL.
 */
void ob_cancel_obstacle(struct robot_position *rp)
{
    struct obstacle_node *curr, *next;

    assert(rp != NULL);

    /*
     * The list ends in a tail node whose ln_Succ is NULL.  The successor is
     * captured before the body runs because the current node may be moved
     * to another list.
     */
    for (curr = (struct obstacle_node *)ob_data.od_active.lh_Head;
         curr->on_link.ln_Succ != NULL;
         curr = next) {
        next = (struct obstacle_node *)curr->on_link.ln_Succ;

        /* Only dynamic obstacles are candidates for cancellation. */
        if (curr->on_type != OBT_DYNAMIC)
            continue;

        /* A non-zero code means the robot is outside the natural area. */
        if (rc_compute_code(rp->x, rp->y, &curr->on_natural) != 0)
            continue;

        lnRemove(&curr->on_link);
        lnAddHead(&ob_data.od_free_list, &curr->on_link);

        if (ob_data.od_emc_handle == NULL)
            continue;

        mtp_send_packet2(ob_data.od_emc_handle,
                         MA_Opcode, MTP_REMOVE_OBSTACLE,
                         MA_Role, MTP_ROLE_RMC,
                         MA_ObstacleVal, &curr->on_expanded,
                         MA_TAG_DONE);
    }
}
struct obstacle_node *ob_found_obstacle(struct robot_position *actual,
struct contact_point *cp_world)
{
......@@ -315,6 +342,9 @@ struct obstacle_node *ob_found_obstacle(struct robot_position *actual,
opcode = MTP_UPDATE_OBSTACLE;
retval->on_decay_seconds = OB_DECAY_START;
ob_merge_obstacles(&retval->on_expanded, &oc);
ob_expand_obstacle(&retval->on_natural,
&retval->on_expanded,
-OBSTACLE_BUFFER);
info("expanding existing obstacle: %d %.2f %.2f %.2f %.2f\n"
" --> %.2f %.2f %.2f %.2f\n",
......@@ -338,6 +368,7 @@ struct obstacle_node *ob_found_obstacle(struct robot_position *actual,
retval->on_expanded.ymin = oc.ymin;
retval->on_expanded.xmax = oc.xmax;
retval->on_expanded.ymax = oc.ymax;
ob_expand_obstacle(&retval->on_natural, &oc, -OBSTACLE_BUFFER);
/* Put dynamics at the front of the list. */
lnAddHead(&ob_data.od_active, &retval->on_link);
......@@ -378,7 +409,6 @@ void ob_expand_obstacle(struct obstacle_config *dst,
assert(dst != NULL);
assert(src != NULL);
assert(mtp_obstacle_config_invariant(src));
assert(amount > 0.0f);
dst->xmin = src->xmin - amount;
dst->ymin = src->ymin - amount;
......
......@@ -182,6 +182,8 @@ void ob_obstacle_location(struct contact_point *dst_out,
struct robot_position *actual,
struct contact_point *cp_local);
void ob_cancel_obstacle(struct robot_position *actual);
/**
* Construct a dynamic obstacle that was detected by a robot. If the robot
* detects an obstacle that overlaps with an existing dynamic obstacle, the
......
......@@ -33,8 +33,8 @@
#include "pilotConnection.h"
#define DEFAULT_MAX_REFINE_RETRIES 4
#define DEFAULT_METER_TOLERANCE 0.02f
#define DEFAULT_RADIAN_TOLERANCE 0.09f
#define DEFAULT_METER_TOLERANCE 0.0125f
#define DEFAULT_RADIAN_TOLERANCE 0.04f
#define DEFAULT_MAX_DISTANCE 1.5f
/**
......
......@@ -1499,6 +1499,17 @@ CREATE TABLE projects (
KEY pcremote_ok (pcremote_ok)
) TYPE=MyISAM;
--
-- Table structure for table `reposition_status`
--
CREATE TABLE reposition_status (
node_id varchar(32) NOT NULL default '',
attempts tinyint(4) NOT NULL default '0',
distance_remaining float default NULL,
PRIMARY KEY (node_id)
) TYPE=MyISAM;
--
-- Table structure for table `reserved`
--
......
......@@ -2658,3 +2658,12 @@ last_net_act,last_cpu_act,last_ext_act);
alter table experiments add elabinelab_nosetup tinyint(1) \
NOT NULL default '0' after elabinelab_cvstag;
1.138: Add a table to track the progress of moving the robots back to
their pens.
CREATE TABLE reposition_status (
node_id varchar(32) NOT NULL default '',
attempts tinyint(4) NOT NULL default '0',
distance_remaining float default NULL,
PRIMARY KEY (node_id)
) TYPE=MyISAM;
......@@ -28,7 +28,7 @@ SBIN_STUFF = resetvlans console_setup.proxy sched_reload named_setup \
exports_setup.proxy vnode_setup eventsys_start \
sfskey_update sfskey_update.proxy rmuser idleswap \
newnode_reboot savelogs.proxy eventsys.proxy \
elabinelab snmpit.proxy panic
elabinelab snmpit.proxy panic repos_daemon
CTRLBIN_STUFF = console_setup.proxy exports_setup.proxy sfskey_update.proxy \
savelogs.proxy eventsys.proxy
......
......@@ -41,6 +41,10 @@ my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS = "@TBOPSEMAIL@";
# XXX
my $BUILDING = "MEB-ROBOTS";
my $FLOOR = 4;
# Testbed Support library
use lib "@prefix@/lib";
use libdb;
......@@ -55,6 +59,8 @@ $libdb::DBQUERY_MAXTRIES = 30;
my $RELOADPID = NODERELOADING_PID;
my $RELOADEID = NODERELOADING_EID;
my $PENDINGEID = NODERELOADPENDING_EID;
my $REPOSPID = NODEREPOSITIONING_PID;
my $RPPENDINGEID= NODEREPOSPENDING_EID;
my $NODEDEAD_PID= NODEDEAD_PID;
my $NODEDEAD_EID= NODEDEAD_EID;
......@@ -506,15 +512,36 @@ while (1) {
sub freefromreloading($) {
my $node = shift;
DBQueryFatal("delete from current_reloads where node_id='$node'");
DBQueryFatal("delete from current_reloads where node_id='$node'");
my ($pid,$eid);
NodeidToExp($node,\$pid,\$eid);
if ($pid eq $RELOADPID && ($eid eq $RELOADEID || $eid eq $PENDINGEID)) {
DBQueryFatal("delete from scheduled_reloads ".
"where node_id='$node'");
DBQueryFatal("delete from reserved where node_id='$node'");
TBSetNodeHistory($node, TB_NODEHISTORY_OP_FREE,
$UID, $pid, $eid);
DBQueryFatal("delete from scheduled_reloads where node_id='$node'");
# Check if the robot is back in its pen, otherwise we have to throw it
# back to repositionpending.
my $loc_result =
DBQueryWarn("SELECT * FROM reposition_status ".
"WHERE node_id='$node'");
if ($loc_result->numrows) {
if (!DBQueryWarn("update reserved set ".
"rsrv_time=now(),eid='$RPPENDINGEID' ".
"where node_id='$node'")) {
print "Could not update EID for $node. Waiting a bit.\n";
} else {
print "Reposition pending nodes moved to $RPPENDINGEID at ".
`date`;
TBSetNodeHistory($node, TB_NODEHISTORY_OP_MOVE, $UID,
$REPOSPID, $RPPENDINGEID);
}
}
else {
DBQueryFatal("delete from reserved where node_id='$node'");
TBSetNodeHistory($node, TB_NODEHISTORY_OP_FREE,
$UID, $pid, $eid);
}
}
}
......
#!/usr/bin/perl -wT
#
# EMULAB-COPYRIGHT
# Copyright (c) 2005 University of Utah and the Flux Group.
# All rights reserved.
#
use English;
use Getopt::Std;
#
# This should run as root to make sure that it has permission to reboot nodes
# (since only root is allowed to power cycle nodes at any time - it's time-
# limited for anyone else)
#
if ($UID != 0) {
die("*** $0:\n".
" Only root can run this script!\n");
}
#
# Look for nodes to reposition.
#
# usage: repos_daemon [-dc]
#
# TODO: Use "logger" instead of writing a log file.
#
#
# Print the command-line synopsis on STDOUT and terminate the script
# with exit status -1.  This routine never returns.
#
sub usage()
{
    my $synopsis = "Usage: repos_daemon [-dc]\n";
    my $details  = "Use the -d option to prevent daemonization\n";

    print STDOUT $synopsis . $details;
    exit(-1);
}
my $optlist = "dc";
#
# Configure variables
#
my $TB = "@prefix@";
my $DBNAME = "@TBDBNAME@";
my $TBOPS = "@TBOPSEMAIL@";
# XXX
my $BUILDING = "MEB-ROBOTS";
my $FLOOR = 4;
# Testbed Support library
use lib "@prefix@/lib";
use libdb;
use libtestbed;
# Be careful not to exit on transient error
$libdb::DBQUERY_MAXTRIES = 30;
#
# These come from the library.
#
my $REPOSPID = NODEREPOSITIONING_PID;
my $REPOSEID = NODEREPOSITIONING_EID;
my $PENDINGEID = NODEREPOSPENDING_EID;
my $RLPENDINGEID= NODERELOADPENDING_EID;
my $NODEDEAD_PID= NODEDEAD_PID;
my $NODEDEAD_EID= NODEDEAD_EID;
sub fatal($);
sub notify($);
sub daemonize();
my $reboot = "$TB/bin/node_reboot";
my $evsys = "$TB/bin/eventsys_control";
my $locpiper = "$TB/sbin/locpiper";
my $logfile = "$TB/log/repositionlog";
my $debug = 0;
my $cleanup = 1;
my $retry_time = 2; # in minutes
my $max_attempts= 4;
my %retried = ();
my %warned = ();
my %failed = ();
#
# Turn off line buffering on output (dots ...).
#
$| = 1;
#
# Untaint the path
#
$ENV{'PATH'} = "/bin:/usr/bin:";
delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
#
%options = ();
if (! getopts($optlist, \%options)) {
usage();
}
if (@ARGV != 0) {
usage();
}
if (defined($options{"d"})) {
$debug = $options{"d"};
}
if (defined($options{"c"})) {
$cleanup = $options{"c"};
}
if ($cleanup) {
# For testing purposes mostly, cleanup any leftovers...
DBQueryFatal("delete from reserved ".
"where pid='$REPOSPID' and eid='$REPOSEID'");
DBQueryFatal("delete from virt_node_startloc ".
"where pid='$REPOSPID' and eid='$REPOSEID'");
DBQueryFatal("delete from virt_agents ".
"where pid='$REPOSPID' and eid='$REPOSEID'");
}
# Go to ground.
if (! $debug) {
daemonize();
}
print "Repositioning Daemon starting... pid $$, at ".`date`;
# XXX Need to get an emulab-ops user to run the event system as.
$query_result =
DBQueryFatal("select u.unix_uid from group_membership as gm ".
"left join users as u on u.uid=gm.uid ".
"where gm.pid='$REPOSPID' and gm.gid='$REPOSPID' and ".
"gm.uid!='elabman'");
if (! $query_result || $query_result->numrows == 0) {
fatal("Cannot get experiment head for $REPOSEID\n");
}
my ($ev_uid) = $query_result->fetchrow;
my ($unix_gid, $unix_gidname);
if (! TBGroupUnixInfo($REPOSPID, $REPOSPID, \$unix_gid, \$unix_gidname)) {
die("*** $0:\n".
" Could not get unix group info for $REPOSPID!\n");
}
print "Experiment head: $ev_uid\n"
if ($debug);
#
# Loop, looking for nodes to reposition.
#
my $idle=0;
MAINLOOP: while (1) {
my($count, $which, @row, %hrow, $node, $retry, $attempts, $stamp);
my($pid, $eid);
# Partial delay between loops in case of an error.
if ($idle) { sleep(10); } # Wait longer if we're not doing anything
else { sleep(1); }
$idle=1; # Assume we're going to be idle this iteration
#
# We use this to figure out when to delete nodes from the retried and
# warned hashes
#
my $time = time();
#
# First, look for nodes that have been in the repositioning experiment for
# longer than $retry_time, and try reloading them
#
$query_result =
DBQueryWarn("select r.node_id,rs.attempts from reserved as r " .
"left join reposition_status as rs on rs.node_id=r.node_id ".
"where pid='$REPOSPID' and eid='$REPOSEID' and " .
"(CURRENT_TIMESTAMP - INTERVAL $retry_time MINUTE)".
" > rsrv_time");
if (! $query_result) {
print "DB Error. Waiting a bit.\n";
next;
}
while (($node,$attempts) = $query_result->fetchrow) {
$idle=0;
if (!$retried{$node}) {
print "\nReposition appears wedged at ".`date`."\n";
if ($attempts >= $max_attempts) {
# The node has passed through reloading/repositioning too many
# times, move to hwdown.
DBQueryFatal("delete from reposition_status ".
"where node_id='$node'");
notify("$node is wedged... literally.\n".
"Moved to $NODEDEAD_PID/$NODEDEAD_EID\n");
MarkPhysNodeDown($node);
TBSetNodeLogEntry($node, "daemon",
TB_DEFAULT_NODELOGTYPE(),
"'Moved to hwdown; reposition failed'");
}
else {
# Try reloading the node and restarting pilot.
movetoreloading($node);
}
}
$retried{$node} = $time;
}
#
# We can pull out all nodes that were not 'touched' (matched by the
# select above) during this pass
#
foreach $node (keys %retried) {
if ($retried{$node} != $time) {
delete $retried{$node};
}
}
# Now we check the status of the rest of the nodes in repositioning.
$query_result =
DBQueryWarn("select n.node_id,li.loc_x,li.loc_y,".
"n.destination_x,n.destination_y from nodes as n ".
"left join reserved as r on r.node_id=n.node_id ".
"left join location_info as li on li.node_id=n.node_id ".
"and li.building='$BUILDING' ".
"where r.pid='$REPOSPID' and r.eid='$REPOSEID' and ".
"(n.destination_x is not NULL or ".
"n.destination_y is not NULL) order by node_id");
if (! $query_result) {
print "DB Error. Waiting a bit.\n";
next;
}
if ($query_result->numrows) {
print "Robots are still in motion\n";
$idle = 1;
#
# For every robot that still has a destination set, record the straight
# line distance between its current location and that destination in
# reposition_status.distance_remaining.  A failed update abandons this
# pass and restarts the main loop.
#
# NOTE(review): loc_x/loc_y come from a LEFT JOIN on location_info and
# either destination column may be NULL (the query only requires one of
# them to be set), so these values can be undef here -- confirm whether
# such rows should be skipped instead of being treated as zero.
#
while (%hrow = $query_result->fetchhash()) {
    $node = $hrow{'node_id'};

    my $xdist = $hrow{'loc_x'} - $hrow{'destination_x'};
    my $ydist = $hrow{'loc_y'} - $hrow{'destination_y'};

    # Euclidean distance: both deltas must be squared.  Adding $ydist to
    # itself gave a wrong result and, for a negative $ydist, could hand
    # sqrt() a negative argument, which is a fatal error in Perl.
    my $dist = sqrt(($xdist * $xdist) + ($ydist * $ydist));

    if (!DBQueryWarn("UPDATE reposition_status set ".
		     "distance_remaining=$dist ".
		     "where node_id='$node'")) {
	print "DB Error. Waiting a bit.\n";
	next MAINLOOP;
    }
}
next;
}
else {
$query_result =
DBQueryWarn("select node_id from reserved where ".
<