Commit e8bb6bca authored by Leigh B. Stoller's avatar Leigh B. Stoller

The bulk of this commit adds the ability to run the program agent on ops

so that users can schedule program events to run there. For example:

	set myprog [new Program $ns]
	$myprog set node "ops"
	$myprog set command "/usr/bin/env >& /tmp/foo"

	$ns at 10 "$myprog start"
or
	tevc -e pid/eid now myprog start

Since the program agent cannot talk to tmcd from ops, there are new
routines to create the config files that the program agent uses, in
the experiment tbdata directory.

I also rewrote the eventsys.proxy script that starts the event
scheduler on ops; I rolled the startup of the program agent into this
script, via a new -a option which is passed over from boss when an ops
program agent is detected in the virt topology. This keeps the number
of new processes on ops to a small number.

Also part of the above rewrite is that we now catch when the event
scheduler (or the program agent) exits abnormally, sending email to
tbops and the swapper of the experiment. We have been seeing abnormal
exits of the scheduler and it would be good to detect them and see if we can
figure out what is going wrong.

Other small bug fixes in experiment run.
parent 9c6f20f0
......@@ -138,7 +138,7 @@ sub LookupByIndex($$)
"where idx='$exptidx'");
return undef
if (! $query_result || $query_result->numrows);
if (! $query_result || !$query_result->numrows);
my ($pid, $eid) = $query_result->fetchrow_array();
......@@ -593,6 +593,44 @@ sub AddEnvVariable($$$)
return 0;
}
#
# Write the environment strings into a little script in the user directory.
#
sub WriteEnvVariables($)
{
    my ($self) = @_;

    # Must be a real reference (i.e. called as an instance method).
    return -1
        if (! ref($self));

    my $pid = $self->pid();
    my $eid = $self->eid();

    # One row per user-supplied environment variable, in idx order.
    my $query_result =
        DBQueryWarn("select name,value from virt_user_environment ".
                    "where pid='$pid' and eid='$eid' order by idx");
    return -1
        if (!defined($query_result));

    my $userdir = $self->UserDir();
    my $envfile = "$userdir/tbdata/environment";

    #
    # Use the three-argument open with a lexical handle. The path can then
    # never be parsed as part of the open mode, and we do not stomp on the
    # package-global FP handle that other routines in this file also use.
    #
    my $fh;
    if (!open($fh, ">", $envfile)) {
        print "Could not open $envfile for writing: $!\n";
        return -1;
    }
    # Each variable is written as NAME="value", one per line.
    while (my ($name,$value) = $query_result->fetchrow_array()) {
        print $fh "${name}=\"$value\"\n";
    }
    # Buffered write errors only surface at close, so check it.
    if (! close($fh)) {
        print "Could not close $envfile: $!\n";
        return -1;
    }
    return 0;
}
#
# Experiment locking and state changes.
#
......@@ -942,7 +980,7 @@ sub CleanLogFiles($)
foreach my $ext ("log", "ptop", "top", "assign") {
$files = "$files $workdir/*.${ext}";
}
foreach my $prefix ("swap", "start", "cancel") {
foreach my $prefix ("swap", "start", "cancel", "newrun") {
$files = "$files $workdir/${prefix}*.*";
}
system("/bin/rm -f $files") == 0
......@@ -967,7 +1005,7 @@ sub CopyLogFiles($)
my $workdir = $self->WorkDir();
my $userdir = $self->UserDir();
system("/bin/cp -Rfp $workdir/*.log $userdir/tbdata");
system("/bin/cp -Rfp $workdir/*.{log,ns,report,png} $userdir/tbdata");
return 0;
}
......@@ -1204,5 +1242,57 @@ sub ClearPortRegistration($)
return 0;
}
#
# Write the virt program data for the program agent that will run on ops.
# Ops does not speak to tmcd for experiments, so need to get this info
# over another way.
#
sub WriteProgramAgents($)
{
    my ($self) = @_;

    # Must be a real reference (i.e. called as an instance method).
    return -1
        if (! ref($self));

    my $pid = $self->pid();
    my $eid = $self->eid();

    # Only the programs the user placed on the "ops" virtual node.
    my $query_result =
        DBQueryWarn("select vname,command,dir,timeout,expected_exit_code ".
                    " from virt_programs ".
                    "where vnode='ops' and pid='$pid' and eid='$eid'");
    return -1
        if (!defined($query_result));

    # Nothing to run on ops; succeed without creating the file.
    return 0
        if (! $query_result->numrows);

    my $userdir  = $self->UserDir();
    my $progfile = "$userdir/tbdata/program_agents";

    #
    # Use the three-argument open with a lexical handle. The path can then
    # never be parsed as part of the open mode, and we do not stomp on the
    # package-global FP handle that other routines in this file also use.
    #
    my $fh;
    if (!open($fh, ">", $progfile)) {
        print "Could not open $progfile for writing: $!\n";
        return -1;
    }
    #
    # One line per agent. The optional KEY=value fields are emitted only
    # when the column is set and non-empty; COMMAND always comes last.
    #
    while (my ($name,$command,$dir,$timeout,$expected_exit_code) =
           $query_result->fetchrow_array()) {
        print $fh "AGENT=$name";
        print $fh " DIR=$dir"
            if (defined($dir) && $dir ne "");
        print $fh " TIMEOUT=$timeout"
            if (defined($timeout) && $timeout ne "");
        print $fh " EXPECTED_EXIT_CODE=$expected_exit_code"
            if (defined($expected_exit_code) && $expected_exit_code ne "");
        print $fh " COMMAND='$command'\n";
    }
    # Buffered write errors only surface at close, so check it.
    if (! close($fh)) {
        print "Could not close $progfile: $!\n";
        return -1;
    }
    return 0;
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -3182,8 +3182,7 @@ sub TBExptCreateLogFile($$$)
# Link it to $prefix.log so that the most recent is well known.
if (-e $linkname) {
die("*** $0:\n".
" CreateLogFile: $linkname already exists!\n");
unlink($linkname);
}
if (! link($logname, $linkname)) {
die("*** $0:\n".
......
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -49,6 +49,7 @@ control-install:
@$(MAKE) -C tbgen control-install
@$(MAKE) -C sched control-install
@$(MAKE) -C linktest control-install
@$(MAKE) -C program-agent control-install
post-install:
@$(MAKE) -C linktest post-install
......
#
# EMULAB-COPYRIGHT
# Copyright (c) 2000-2005 University of Utah and the Flux Group.
# Copyright (c) 2000-2006 University of Utah and the Flux Group.
# All rights reserved.
#
......@@ -49,11 +49,13 @@ program-agent-debug: program-agent.o version.o
$(PROGRAMS): ../lib/libevent.a ../lib/event.h
install:
install: $(INSTALL_DIR)/opsdir/sbin/program-agent
-mkdir -p $(INSTALL_DIR)/opsdir/man/man8
$(INSTALL) -m 0644 $(SRCDIR)/program-agent.8 \
$(INSTALL_DIR)/opsdir/man/man8/program-agent.8
control-install: $(INSTALL_SBINDIR)/program-agent
client: $(PROGRAMS)
client-install: client
$(INSTALL_PROGRAM) program-agent$(EXE) $(DESTDIR)$(CLIENT_BINDIR)/program-agent$(EXE)
......@@ -63,3 +65,8 @@ client-install: client
clean:
/bin/rm -f *.o $(PROGRAMS)
$(INSTALL_DIR)/opsdir/sbin/%: %
@echo "Installing $<"
-mkdir -p $(INSTALL_DIR)/opsdir/sbin
$(INSTALL) $< $@
......@@ -54,7 +54,7 @@
* since a "chatty" program can fill it up, which might cause odd behavior to
* happen.
*/
#define LOGDIR "/local/logs"
static char *LOGDIR = "/local/logs";
/**
* Maximum number of agents to be managed by this daemon.
......@@ -81,6 +81,11 @@ static event_handle_t handle;
*/
static char debug;
/**
* Flag indicating the program agent is running on ops.
*/
static int isops;
/**
* The actual number of agents being managed by this daemon.
*/
......@@ -103,6 +108,11 @@ static int childpipe[2];
*/
static char *configfile;
/**
* The environment file, which can be in a different place than configfile.
*/
static char *envfile;
/**
* The project and experiment ID that this daemon is running in.
*/
......@@ -325,6 +335,29 @@ dump_proginfos(void)
}
#endif
/**
* Handler for SIGTERM that kills everything off and exits nicely.
*
* @param sig The actual signal number received.
*/
static void
sigterm(int sig)
{
	struct proginfo *cur = proginfos;

	/*
	 * Walk the whole program list and stop anything that still has
	 * a pid recorded, so that the per-program log files are complete
	 * before the daemon goes away.
	 *
	 * NOTE(review): this runs in signal context; exit() is not on the
	 * POSIX async-signal-safe list, and stop_program() is defined
	 * elsewhere -- confirm that calling both from here is acceptable.
	 */
	while (cur != NULL) {
		if (cur->pid != 0)
			stop_program(cur, NULL);
		cur = cur->next;
	}

	exit(0);
}
/**
* Print the usage statement to standard error.
*
......@@ -387,7 +420,7 @@ main(int argc, char **argv)
progname = argv[0];
bzero(agentlist, sizeof(agentlist));
while ((c = getopt(argc, argv, "hVdrs:p:l:u:i:e:c:k:")) != -1) {
while ((c = getopt(argc, argv, "hVdrs:p:l:u:i:e:c:k:f:o:")) != -1) {
switch (c) {
case 'h':
usage(progname);
......@@ -414,9 +447,16 @@ main(int argc, char **argv)
case 'c':
configfile = optarg;
break;
case 'f':
envfile = optarg;
break;
case 'u':
user = optarg;
break;
case 'o':
LOGDIR = optarg;
isops = 1;
break;
case 'i':
pidfile = optarg;
break;
......@@ -462,6 +502,8 @@ main(int argc, char **argv)
"error: pid/eid and config file flags are required\n");
usage(progname);
}
if (!envfile)
envfile = configfile;
if (parse_configfile(configfile) != 0)
exit(1);
......@@ -510,7 +552,8 @@ main(int argc, char **argv)
else
loginit(1, "program-agent");
}
signal(SIGTERM, sigterm);
/*
* Must be a valid user of course.
*/
......@@ -541,10 +584,13 @@ main(int argc, char **argv)
}
info("agentlist: %s\n", agentlist);
info("user: %s\n", user);
if ((stat(LOGDIR, &st) < 0) &&
(system("mkdir -p -m 0775 " LOGDIR) != 0)) {
fatal("Could not make log directory: %s", LOGDIR);
if (stat(LOGDIR, &st) < 0) {
sprintf(buf, "mkdir -p -m 0775 %s", LOGDIR);
if (system(buf) != 0) {
fatal("Could not make directory: %s", LOGDIR);
}
}
if (st.st_uid != pw->pw_uid) {
......@@ -626,7 +672,7 @@ main(int argc, char **argv)
/* XXX Need to eval the ENV parts of the config file after we've
* setup the environment.
*/
if (parse_configfile_env(configfile) != 0)
if (parse_configfile_env(envfile) != 0)
exit(1);
/*
......@@ -1067,6 +1113,16 @@ startrun_callback(event_handle_t handle,
if (strcmp(event, TBDB_EVENTTYPE_RELOAD) == 0) {
info("startrun_callback: Got a reload event.\n");
/*
* Ops is special since the file is local and there is no
* tmcd or wrapper.
*/
if (isops) {
parse_configfile_env(envfile);
return;
}
/*
* Wrapper will restart us.
*/
......@@ -1651,30 +1707,32 @@ parse_configfile_env(char *filename)
while (fgets(buf, sizeof(buf), fp)) {
int cc = strlen(buf);
char *bp;
FILE *file;
if (buf[cc-1] == '\n')
buf[cc-1] = (char) NULL;
if (!strncmp(buf, "ENV ", 4)) {
FILE *file;
/* XXX Kind of a stupid way to eval any variables. */
if ((file = popenf("echo %s", "r", &buf[4])) != NULL) {
if (fgets(buf, sizeof(buf), file) != NULL) {
char *idx;
if ((idx = strchr(buf, '\n')) != NULL)
*idx = '\0';
if ((idx = strchr(buf, '=')) != NULL) {
*idx = '\0';
setenv(strdup(buf),
idx + 1,
1);
}
if (isops)
bp = buf;
else if (!strncmp(buf, "ENV ", 4))
bp = &buf[4];
else
continue;
/* XXX Kind of a stupid way to eval any variables. */
if ((file = popenf("echo %s", "r", bp)) != NULL) {
if (fgets(buf, sizeof(buf), file) != NULL) {
char *idx;
if ((idx = strchr(buf, '\n')) != NULL)
*idx = '\0';
if ((idx = strchr(buf, '=')) != NULL) {
*idx = '\0';
setenv(strdup(buf), idx + 1, 1);
}
pclose(file);
file = NULL;
}
continue;
pclose(file);
}
}
......
......@@ -2674,6 +2674,22 @@ sub CreateLogFile($$$)
return 0;
}
sub WriteEnvVariables($)
{
    my ($self) = @_;

    # Only valid as an instance method; a plain scalar or class name is
    # rejected.
    if (!ref($self)) {
        return -1;
    }

    # Delegate to the associated experiment object, if there is one.
    my $experiment = $self->Experiment();
    if (!defined($experiment)) {
        return -1;
    }

    # The result is whatever the experiment's own writer returns.
    return $experiment->WriteEnvVariables();
}
# _Always_ make sure that this 1 is at the end of the file...
1;
......@@ -24,8 +24,9 @@ sub usage()
"-l logfile -t record_file start|stop|replay\n";
exit(-1);
}
my $optlist = "u:e:k:dl:g:t:";
my $optlist = "u:e:k:dl:g:t:a";
my $debug = 0;
my $runagent= 0;
my $user;
my $pid;
my $eid;
......@@ -40,9 +41,12 @@ my $action;
#
my $TB = "@prefix@";
my $TBOPS = "@TBOPSEMAIL@";
my $CONTROL = "@USERNODE@";
my $sched = "$TB/sbin/event-sched";
my $agent = "$TB/sbin/program-agent";
my $PIDDIR = "/var/run/emulab/evsched";
my $PIDFILE;
my $EXPDIR;
#
# Turn off line buffering on output
......@@ -70,6 +74,9 @@ if ($UID != 0) {
use lib "@prefix@/lib";
use libtestbed;
# Protos
sub StartProgram($@);
#
# Parse command arguments. Once we return from getopts, all that should be
# left are the required arguments.
......@@ -92,6 +99,9 @@ if (! defined($options{"u"}) ||
if (defined($options{"d"})) {
$debug = 1;
}
if (defined($options{"a"})) {
$runagent = 1;
}
if (defined($options{"t"})) {
$recordfile = $options{"t"};
}
......@@ -110,6 +120,7 @@ else {
usage();
}
$PIDFILE = "$PIDDIR/${pid}_${eid}.pid";
$EXPDIR = "/proj/$pid/exp/$eid";
#
# Deal with stop and replay.
......@@ -131,9 +142,11 @@ if ($action eq "stop" || $action eq "replay") {
}
unlink($PIDFILE);
if (! kill('TERM', $epid)) {
die("*** $0:\n".
"Failed to stop event system for $pid/$eid! - $! $epid\n");
if (kill(0, $epid) || ! $!{ESRCH}) {
if (! kill('TERM', $epid)) {
die("*** $0:\n".
"Failed to stop event system for $pid/$eid! - $! $epid\n");
}
}
}
......@@ -192,7 +205,7 @@ if (my $childpid = TBBackGround($logfile)) {
# Delay a moment, and then look for an exit status. This is intended
# to catch startup problems.
#
sleep(1);
sleep(2);
my $foo = waitpid($childpid, &WNOHANG);
if ($foo) {
my $status = $?;
......@@ -208,7 +221,7 @@ if (my $childpid = TBBackGround($logfile)) {
}
#
# Write out a pid file just prior to doing the exec. The user is not granted
# Write out a pid file prior to flipping; the user is not granted
# access to this pid file.
#
if (system("echo '$PID' > $PIDFILE")) {
......@@ -216,17 +229,111 @@ if (system("echo '$PID' > $PIDFILE")) {
" Could not create $PIDFILE!");
}
# Flip to user and never go back!
$GID = $unix_ggid;
$EGID = "$unix_ggid $unix_ggid $unix_pgid";
$EUID = $UID = $unix_uid;
$ENV{'USER'} = $user;
$ENV{'LOGNAME'} = $user;
#
# We will have two subprocesses.
#
my $schedpid;
my $agentpid;
# And run it.
exec("$sched " . ($debug ? "-d" : "") .
($recordfile ? " -t $recordfile" : "") .
" -s localhost -k $keyfile $pid $eid");
die("*** $0:\n".
" Could not exec $sched!");
#
# Catch TERM to kill off the scheduler and the agent. The death is picked
# up in the loop below.
#
sub handler ($) {
    my ($signame) = @_;

    # Ignore any further TERMs while we are shutting the children down.
    $SIG{TERM} = 'IGNORE';
    print "Caught a TERM; killing the scheduler and agent\n";

    # Pass the TERM on to whichever children are still recorded, the
    # scheduler first and then the agent.
    foreach my $childpid ($schedpid, $agentpid) {
        next
            if (!defined($childpid));
        kill('TERM', $childpid);
    }
    sleep(1);
}
$SIG{TERM} = \&handler;
#
# Set the command lines for the programs
#
my @sched_command_options = ();
push(@sched_command_options, "-d")
if ($debug);
push(@sched_command_options, ("-t", $recordfile))
if ($recordfile);
push(@sched_command_options, ("-s", "localhost", "-k", $keyfile, $pid, $eid));
my @agent_command_options = ("-u", $user, "-d", "-e", "$pid/$eid",
"-k", $keyfile,
"-c", "$EXPDIR/tbdata/program_agents",
"-f", "$EXPDIR/tbdata/environment",
"-o", "$EXPDIR/logs/ops");
#
# We want to catch these programs if they exit abnormally.
#
$schedpid = StartProgram($sched, @sched_command_options);
$agentpid = StartProgram($agent, @agent_command_options)
if ($runagent);
#
# Reap the children as they exit. Each pass blocks in wait() for the next
# child, records which one it was by clearing its pid variable, and sends
# mail if the exit status was nonzero. The loop ends when wait() reports
# no children, or once both pid variables have been cleared.
#
while (1) {
my $waitpid = wait();
# Status of the child just reaped.
# NOTE(review): $exitstatus is saved here but the code below reads $?
# directly; confirm whether the saved copy was meant to be used.
my $exitstatus = $?;
my $which;
# No more children (wait returns a negative value).
last
if ($waitpid < 0);
# Anything that is not the scheduler is taken to be the program agent.
if ($waitpid == $schedpid) {
$schedpid = undef;
$which = "Event Scheduler";
}
else {
$agentpid = undef;
$which = "Program Agent";
}
#
# Send mail about abnormal exit.
# NOTE(review): SENDMAIL comes from libtestbed; presumably the arguments
# are recipient, subject, body, sender and extra headers -- confirm.
#
if ($?) {
SENDMAIL($user,
"$which for $pid/$eid died on $CONTROL",
"$which exited with status: $?",
"$user",
"CC: $TBOPS");
}
# Done once neither child is still outstanding.
last
if (! (defined($schedpid) || defined($agentpid)));
}
exit(0);
#
# Fork a child that drops privileges to the experiment user and execs the
# given command with the given arguments.
#
# Arguments: $command   - path of the program to run.
#            @arguments - its argument list, passed to exec without a shell.
# Returns:   the child's pid, in the parent. The child never returns; it
#            either becomes the program or dies.
# Dies:      in the parent if fork() fails, in the child if exec() fails.
#
sub StartProgram($@)
{
    my ($command, @arguments) = @_;

    my $mypid = fork();

    #
    # fork() returns undef on failure. That must be caught here, since
    # otherwise the parent would fall through into the child code below,
    # give up its privileges and exec the program in place of itself.
    #
    if (!defined($mypid)) {
        die("*** $0:\n".
            "    Could not fork to run $command: $!\n");
    }

    # Parent: hand the child's pid back to the caller.
    if ($mypid) {
        return $mypid;
    }

    # Child: short (0.2 second) pause before continuing.
    # NOTE(review): presumably gives the parent time to record the pid
    # before the program can exit -- confirm.
    select(undef, undef, undef, 0.2);

    # Flip to user and never go back
    $GID            = $unix_ggid;
    $EGID           = "$unix_ggid $unix_ggid $unix_pgid";
    $EUID = $UID    = $unix_uid;
    $ENV{'USER'}    = $user;
    $ENV{'LOGNAME'} = $user;

    # Log the command line, then become the program.
    print "$command @arguments\n";

    exec $command, @arguments;

    # Only reached if the exec itself failed.
    die("*** $0:\n".
        "    Could not exec $command!\n");
}
......@@ -23,7 +23,7 @@ sub usage()
"Usage: eventsys_control [-f] <start|stop|replay> <pid> <eid>\n";
exit(-1);
}
my $optlist = "df";
my $optlist = "dfa";
#
# Configure variables
......@@ -76,6 +76,7 @@ delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'};
my $proxy = "$TB/sbin/eventsys.proxy";
my $debug = 1;
my $force = 0;
my $agent = 0;