From 636aaa2b8bdaea28284d128949119ee09612f5ee Mon Sep 17 00:00:00 2001 From: Timothy Stack <stack@flux.utah.edu> Date: Mon, 25 Oct 2004 19:25:06 +0000 Subject: [PATCH] Changes to the "auto nice daemon" so it can work better in Emulab. * sensors/and/GNUmakefile.in: Emulab-specific make file. Updated to work with a build tree separate from the source and gave it a new version number. Files are installed under "/usr/testbed/" on ops. * sensors/and/Makefile: Add a warning that this is not the real makefile for us. * sensors/and/and-OpenBSD.c: Update to work with FreeBSD and add support for reporting process start time. * sensors/and/and-emulab.conf.in: Emulab-specific configuration, similar to the standard one, except it sends mail to tbops when it does something. * sensors/and/and-emulab.priorities: Emulab-specific priorities database. It excludes daemon pseudo users and the event-scheduler, otherwise, niceness levels apply to everyone. * sensors/and/and.8.man: Add the pid file to the 'FILES' section. * sensors/and/and.c: Added support for running a command when a niceness level change occurs. Also writes out the pid file when not running in test mode. * sensors/and/and.conf.5.man: Add docs for the level commands. * sensors/and/and.h: Add start time and child CPU time to the and_procent struct. * sensors/and/and.startup: Changed to use "test" instead of bash syntax and the pid file is now used instead of killall. Also added a "reconfig" option that HUPs the daemon. 
--- sensors/and/GNUmakefile.in | 217 ++++++++++++++++++++++++++++++ sensors/and/Makefile | 11 +- sensors/and/and-OpenBSD.c | 28 ++-- sensors/and/and-emulab.conf.in | 59 ++++++++ sensors/and/and-emulab.priorities | 66 +++++++++ sensors/and/and.8.man | 6 + sensors/and/and.c | 214 +++++++++++++++++++++++++++-- sensors/and/and.conf.5.man | 54 ++++++++ sensors/and/and.h | 6 +- sensors/and/and.startup | 34 +++-- 10 files changed, 665 insertions(+), 30 deletions(-) create mode 100644 sensors/and/GNUmakefile.in create mode 100644 sensors/and/and-emulab.conf.in create mode 100644 sensors/and/and-emulab.priorities diff --git a/sensors/and/GNUmakefile.in b/sensors/and/GNUmakefile.in new file mode 100644 index 0000000000..292a40b1a8 --- /dev/null +++ b/sensors/and/GNUmakefile.in @@ -0,0 +1,217 @@ +# +# EMULAB-COPYRIGHT +# Copyright (c) 2004 University of Utah and the Flux Group. +# All rights reserved. +# +# This is the Emulab specific makefile. +# + +# +# Makefile for auto nice daemon +# +# 1999-2004 Patrick Schemitz <schemitz@users.sourceforge.net> +# http://and.sourceforge.net/ +# + +SRCDIR = @srcdir@ +TESTBED_SRCDIR = @top_srcdir@ +OBJDIR = ../.. +SUBDIR = sensors/and + +include $(OBJDIR)/Makeconf + +all: and doc and-emulab.conf + +include $(TESTBED_SRCDIR)/GNUmakerules + +# +# Init script. +# +INITSCRIPT=and.init + +# +# Install to the ops directory. 
+# +PREFIX=$(INSTALL_DIR)/opsdir +INSTALL_ETC=$(PREFIX)/etc +INSTALL_INITD=$(PREFIX)/etc/rc.d +INSTALL_SBIN=$(PREFIX)/sbin +INSTALL_MAN=$(PREFIX)/man + +# +# Version and date +# +VERSION=1.2.1-emulab +DATE="25 Oct 2004" + +# +# Man pages +# +MANPAGES=and.8 and.conf.5 and.priorities.5 + +# +# Determine architecture from uname(1) +# +ARCH=$(shell uname) + +# +# Architecture-dependent settings: ANSI C compiler and linker +# +ifeq (${ARCH},Linux) + LIBS = +else +ifeq (${ARCH},OSF1) + LIBS = +else +ifeq (${ARCH},OpenBSD) + LIBS = -lkvm +else +ifeq (${ARCH},FreeBSD) + LIBS = -lkvm +endif +endif +endif +endif + + +# +# Build the auto-nice daemon. +# +and: and.o $(INITSCRIPT) and-$(ARCH).o + $(LD) and.o and-$(ARCH).o -o and $(LIBS) + + +# +# Independent part: configuration management, priority database. +# +# XXX Emulab note: We cannot use INSTALL_ETC for the paths since those may +# not exist on ops. +# +and.o: and.c and.h + $(CC) -DDEFAULT_INTERVAL=60 -DDEFAULT_NICE=0 \ + -DDEFAULT_CONFIG_FILE=\"$(INSTALL_DIR)/etc/and.conf\" \ + -DDEFAULT_DATABASE_FILE=\"$(INSTALL_DIR)/etc/and.priorities\" \ + -DAND_VERSION=\"$(VERSION)\" -DAND_DATE=\"$(DATE)\" -c $< + + +# +# Unix variant specific stuff +# +and-Linux.o: and-Linux.c and.h + $(CC) -c $< + +and-OpenBSD.o: and-OpenBSD.c and.h + $(CC) -c $< + +and-FreeBSD.o: and-OpenBSD.c and.h + $(CC) -c $< -o $@ + +and-OSF1.o: and-OSF1.c and.h + $(CC) -c $< -o $@ + +and-IRIX.o: and-OSF1.c and.h + $(CC) -c $< -o $@ + +and-IRIX64.o: and-OSF1.c and.h + $(CC) -c $< -o $@ + +and-SunOS.o: and-OSF1.c and.h + $(CC) -c $< -o $@ + + + +# +# Create script for SysV init +# +and.init: and.startup + sed s:INSTALL_SBIN:$(INSTALL_SBINDIR):g < $< > $@ + chmod +x and.init + + +# +# Man pages +# +doc: $(MANPAGES) + +and.8: and.8.man + cat $< | \ + sed s/__VERSION__/$(VERSION)/g | \ + sed s/__DATE__/$(DATE)/g > $@ + +and.conf.5: and.conf.5.man + cat $< | \ + sed s/__VERSION__/$(VERSION)/g | \ + sed s/__DATE__/$(DATE)/g > $@ + +and.priorities.5: 
and.priorities.5.man + cat $< | \ + sed s/__VERSION__/$(VERSION)/g | \ + sed s/__DATE__/$(DATE)/g > $@ + +# +# Nothing to do for client install. +# +client client-install: + +# +# Install and under $(PREFIX)/bin etc. +# +install: all + strip and +#-mkdir $(PREFIX) + -mkdir -p $(DESTDIR)$(INSTALL_SBIN) + -mkdir -p $(DESTDIR)$(INSTALL_ETC) + -mkdir -p $(DESTDIR)$(INSTALL_INITD) + -mkdir -p $(DESTDIR)$(INSTALL_MAN)/man5 + -mkdir -p $(DESTDIR)$(INSTALL_MAN)/man8 + $(INSTALL) -m 0755 and $(DESTDIR)$(INSTALL_SBIN) + $(INSTALL) -m 0644 and-emulab.conf $(DESTDIR)$(INSTALL_ETC)/and.conf + $(INSTALL) -m 0644 $(SRCDIR)/and-emulab.priorities \ + $(DESTDIR)$(INSTALL_ETC)/and.priorities +ifneq (${INITSCRIPT},) +ifneq (${INSTALL_INITD},) + @echo "Installing SysV script in $(DESTDIR)$(INSTALL_INITD)" + $(INSTALL) -m 0755 $(INITSCRIPT) $(DESTDIR)$(INSTALL_INITD)/and +else + @echo "Installing SysV script in $(DESTDIR)$(INSTALL_SBIN)" + $(INSTALL) -m 0755 $(INITSCRIPT) $(DESTDIR)$(INSTALL_SBIN) + @echo "Installing SysV init.d finder in $(DESTDIR)$(INSTALL_SBIN)" + $(INSTALL) -m 0755 and-find-init.d $(DESTDIR)$(INSTALL_SBIN) +endif +endif + $(INSTALL) -m 0644 and.8 $(DESTDIR)$(INSTALL_MAN)/man8 + $(INSTALL) -m 0644 and.conf.5 $(DESTDIR)$(INSTALL_MAN)/man5 + $(INSTALL) -m 0644 and.priorities.5 $(DESTDIR)$(INSTALL_MAN)/man5 + +simpleinstall: and and.init + strip and + mkdir -p $(DESTDIR)$(INSTALL_SBIN) $(DESTDIR)$(INSTALL_ETC) + mkdir -p $(DESTDIR)$(INSTALL_INITD) + mkdir -p $(DESTDIR)$(INSTALL_MAN)/man5 $(DESTDIR)$(INSTALL_MAN)/man8 + cp and $(DESTDIR)$(INSTALL_SBIN) + test -e $(DESTDIR)$(INSTALL_ETC)/and.conf || \ + cp and.conf $(DESTDIR)$(INSTALL_ETC) + test -e $(DESTDIR)$(INSTALL_ETC)/and.priorities || \ + cp and.priorities $(DESTDIR)$(INSTALL_ETC) +ifneq (${INITSCRIPT},) # on SysV only + cp $(INITSCRIPT) $(DESTDIR)$(INSTALL_INITD)/and +endif + cp and.8 $(DESTDIR)$(INSTALL_MAN)/man8 + cp and.conf.5 $(DESTDIR)$(INSTALL_MAN)/man5 + cp and.priorities.5 $(DESTDIR)$(INSTALL_MAN)/man5 + 
+uninstall: + rm -f $(DESTDIR)$(INSTALL_SBIN)/and + rm -f $(DESTDIR)$(INSTALL_INITD)/and + rm -f $(DESTDIR)$(INSTALL_ETC)/and.conf + rm -f $(DESTDIR)$(INSTALL_ETC)/and.priorities + rm -f $(DESTDIR)$(INSTALL_MAN)/man8/and.8 + rm -f $(DESTDIR)$(INSTALL_MAN)/man5/and.conf.5 + rm -f $(DESTDIR)$(INSTALL_MAN)/man5/and.priorities.5 + + +# +# Clean up generated files. +# +clean: + rm -f *.o and and.init $(MANPAGES) diff --git a/sensors/and/Makefile b/sensors/and/Makefile index 5bf4f78052..fcdc16a2d5 100644 --- a/sensors/and/Makefile +++ b/sensors/and/Makefile @@ -6,6 +6,15 @@ # +# +# EMULAB PEOPLE LOOK HERE! +# +# This file is DEAD, edit the GNUmakefile.in if you want to change something. +# +# EMULAB PEOPLE LOOK HERE! +# + + ####################################################################### # Edit here to adapt to your system! # ####################################################################### @@ -214,7 +223,7 @@ and.priorities.5: and.priorities.5.man # # Install and under $(PREFIX)/bin etc. # -install: and $(INITSCRIPT) +install: and $(INITSCRIPT) doc strip and #-mkdir $(PREFIX) -mkdir -p $(DESTDIR)$(INSTALL_SBIN) diff --git a/sensors/and/and-OpenBSD.c b/sensors/and/and-OpenBSD.c index fb4565041a..c26d65a6e2 100644 --- a/sensors/and/and-OpenBSD.c +++ b/sensors/and/and-OpenBSD.c @@ -1,7 +1,7 @@ /* AND auto nice daemon - renice programs according to their CPU usage. - Copyright (C) 1999-2001 Patrick Schemitz <schemitz@users.sourceforge.net> + Copyright (C) 1999-2004 Patrick Schemitz <schemitz@users.sourceforge.net> http://and.sourceforge.net/ This program is free software; you can redistribute it and/or modify @@ -94,21 +94,31 @@ struct and_procent *openbsd_getnext () strncpy(openbsd_proc.command,openbsd_pt[openbsd_next].kp_proc.p_comm,1023); openbsd_proc.command[1023] = 0; openbsd_proc.pid = openbsd_pt[openbsd_next].kp_proc.p_pid; - openbsd_proc.ppid = openbsd_pt[openbsd_next].kp_proc.p_ppid; /* FIXME that correct? 
*/ - openbsd_proc.nice = openbsd_pt[openbsd_next].kp_proc.p_nice-20; + // openbsd_proc.uid = openbsd_pt[openbsd_next].kp_eproc.e_pcred.p_ruid; openbsd_proc.gid = openbsd_pt[openbsd_next].kp_eproc.e_pcred.p_rgid; +#if defined(__FreeBSD__) + openbsd_proc.ppid = openbsd_pt[openbsd_next].kp_eproc.e_ppid; + openbsd_proc.nice = openbsd_pt[openbsd_next].kp_proc.p_nice; + openbsd_proc.stime = + openbsd_pt[openbsd_next].kp_eproc.e_stats.p_start.tv_sec; + openbsd_proc.utime = + openbsd_pt[openbsd_next].kp_proc.p_runtime / (1000 * 1000); + openbsd_proc.ctime = + openbsd_pt[openbsd_next].kp_eproc.e_stats.p_cru.ru_utime.tv_sec + + openbsd_pt[openbsd_next].kp_eproc.e_stats.p_cru.ru_stime.tv_sec; +#else /* Adapted from top(1) port, as found in the misc@openbsd.org archive */ + openbsd_proc.ppid = openbsd_pt[openbsd_next].kp_proc.p_ppid; /* FIXME that correct? */ + openbsd_proc.nice = openbsd_pt[openbsd_next].kp_proc.p_nice-20; openbsd_proc.utime = (openbsd_pt[openbsd_next].kp_proc.p_uticks + openbsd_pt[openbsd_next].kp_proc.p_sticks + openbsd_pt[openbsd_next].kp_proc.p_iticks) / openbsd_hz; - /* - printf("%-20s %5i %3i %i\n",openbsd_proc.command,openbsd_proc.pid, - openbsd_proc.nice,openbsd_proc.utime); - */ - and_printf(3, "OpenBSD: process %s pid: %d ppid: %d\n", - openbsd_proc.command, openbsd_proc.pid, openbsd_proc.ppid); +#endif + and_printf(3, "OpenBSD: process %s pid: %d ppid: %d cpu_secs: %d\n", + openbsd_proc.command, openbsd_proc.pid, openbsd_proc.ppid, + openbsd_proc.utime); openbsd_next++; return &openbsd_proc; } diff --git a/sensors/and/and-emulab.conf.in b/sensors/and/and-emulab.conf.in new file mode 100644 index 0000000000..6d32fea1cb --- /dev/null +++ b/sensors/and/and-emulab.conf.in @@ -0,0 +1,59 @@ +# +# Emulab-ops configuration file for the auto nice daemon, /etc/and.conf +# +# Comments must have the # in the *first* column! +# +# Read and.conf(5) for details. 
+# +# 1999, 2000, 2004 Patrick Schemitz, schemitz@users.sourceforge.net +# + +# +# Nice level for jobs that are not in and.priorities. +# 0 = do not renice. +# +defaultnice 0 + +# +# Time interval between renice cycles, in seconds. Default is +# 60 seconds. +# +interval 120 + +# +# Ranges for the nice levels. Jobs with less than lv1time seconds +# CPU time are not reniced; jobs between lv1time and lv2time seconds +# are reniced to the first level in and.priorities; jobs between +# lv2time and lv3time seconds to the second level; jobs with more +# than lv3time seconds are reniced to the third level. +# +lv1time 90 +lv2time 1800 +lv2cmd mail -s "@THISHOMEBASE@: CPU Hog - $AND_COMMAND (30 min.)" @TBOPSEMAIL@ +lv3time 7200 +lv3cmd mail -s "@THISHOMEBASE@: CPU Hog - $AND_COMMAND (2 hrs.)" @TBOPSEMAIL@ + +# +# Strategy for picking the right priority entry for a user/group/job +# triple. The strategy is a permutation of "cgup": "c"ommand, "g"roup, +# "u"ser, "p"arent. The order specifies the affinity of the priority lookup +# method. "cug" means an exact match of the command has priority +# over an exact match of the user or group. See the documentation +# for more details. +# +affinity cpug + +# +# Minimum user/group id to be even considered for renicing. Processes +# with lower user/group id are ignored. This does not affect root +# (user id 0), which is never, ever reniced. +# +minuid 1 +mingid 1 + +# NOTE: Emulab user IDs start at 10000, so if you want to exempt the +# core developers, comment out the above lines and uncomment the ones +# below. +# +# minuid 10000 +# mingid 6000 diff --git a/sensors/and/and-emulab.priorities b/sensors/and/and-emulab.priorities new file mode 100644 index 0000000000..217f627481 --- /dev/null +++ b/sensors/and/and-emulab.priorities @@ -0,0 +1,66 @@ +# +# Sample priority database for the auto-nice daemon, /etc/and.priorities +# +# Comments must have the # in the _first_ column! 
+# +# File format: +# user group job parent nice1 nice2 nice3 +# - user: user name or user id or * for all +# - group: group name or group id or * for all +# - job: executable (without path; may be a regexp) or * for all +# - parent: keyword "parent=" or "ancestor=" followed by the +# executable (without path; may be a regexp), or * for all +# - nice1, nice2, nice3: nice levels for CPU usage ranges. +# +# At least one of user or group must be an asterisk *. +# +# After /etc/and.conf:lv1time seconds matching jobs are niced to nice1, +# after /etc/and.conf:lv2time to nice2 and after /etc/and.conf:lv3time +# to nice3. +# +# Read and.priorities(5) for details. +# +# 1999, 2000, 2004 Patrick Schemitz, schemitz@users.sourceforge.net +# + +# +# Philosophy: +# +# Knock everyone except for the well-known daemons; ops is not a +# general purpose machine; that is what the experimental nodes are for. +# If you make any changes, note that a perceptive user might rename +# his jobs' executables to obtain higher privilege. So the values +# must be robust against that. +# + +# +# Default entry -- moderate renicing at first, drop them further down +# after that. +# +* * * * 5 15 19 + +# +# daemon entry -- for the portmapper and such. +# +daemon * * * 0 0 0 + +# +# The main elvind server. +# +nobody * elvind * 0 0 0 + +# +# Sendmail stuff +# +smmsp * * * 0 0 0 + +# +# The event scheduler, not sure about this, but for long running +# experiments the scheduler ends up accumulating a lot of time. +# +# Note that we only do this for the mainline one, development +# versions are not exempt. +# +* * event-sched * 0 0 0 + +* * sshd * 0 5 15 diff --git a/sensors/and/and.8.man b/sensors/and/and.8.man index 418177615b..508a7effff 100644 --- a/sensors/and/and.8.man +++ b/sensors/and/and.8.man @@ -129,6 +129,12 @@ Both files have their own manual pages. Contains logging and status information for debugging purposes. Used in test mode only. +.TP 0.5i +.B /var/run/and.pid +Process identifier file. 
Stores the PID for the currently running +.I and +process. This file is not written in test mode. + .SH "SEE ALSO" .BR and.conf (5), .BR and.priorities (5), diff --git a/sensors/and/and.c b/sensors/and/and.c index 54933b785a..049cbe2272 100644 --- a/sensors/and/and.c +++ b/sensors/and/and.c @@ -45,9 +45,15 @@ #include <sys/time.h> #include <sys/resource.h> #include <sys/types.h> +#include <sys/wait.h> #include <signal.h> #include <regex.h> #include <values.h> +#include <errno.h> +#include <pwd.h> +#include <grp.h> +#include <fcntl.h> +#include <sys/stat.h> #define DEBUG 0 @@ -60,6 +66,11 @@ #ifdef __GNUC__ int vsnprintf (char *str, size_t n, const char *format, va_list ap); #define HAVE_VSNPRINTF +int snprintf (char *str, size_t n, const char *format, ...); +#define HAVE_SNPRINTF +#define and_snprintf snprintf +#else +#define and_snprintf sprintf #endif #include "and.h" @@ -86,6 +97,8 @@ int vsnprintf (char *str, size_t n, const char *format, va_list ap); #define DEFAULT_DATABASE_FILE "/etc/and.priorities" #endif +#define AND_PIDFILE "/var/run/and.pid" + #ifndef AND_VERSION #define AND_VERSION "1.0.7 or above (not compiled in)" #endif @@ -165,10 +178,12 @@ struct { bool lock_interval; unsigned interval; unsigned time_mark [3]; + char *cmd[3]; char affinity [5]; int weight [PRI_N]; int min_uid; int min_gid; + char *cmd_user; } and_config; @@ -191,6 +206,10 @@ void set_defaults (int argc, char **argv) and_config.min_gid = 0; gethostname(and_config.hostname,511); and_config.hostname[511] = 0; + and_config.cmd[0] = ""; + and_config.cmd[1] = ""; + and_config.cmd[2] = ""; + and_config.cmd_user = "nobody"; } @@ -286,8 +305,11 @@ void print_config () "default nicelevel: %2i\n" "interval [sec]: %3u\n" "level 0 from [sec]: %3u\n" + "level 0 cmd: %s\n" "level 1 from [sec]: %3u\n" + "level 1 cmd: %s\n" "level 2 from [sec]: %3u\n" + "level 2 cmd: %s\n" "minimum uid: %i\n" "minimum gid: %i\n" "affinity: %s\n" @@ -299,8 +321,9 @@ void print_config () (and_config.test?"just 
checkin'":"I'm serious."), and_config.verbose, and_config.nice_default, and_config.interval, - and_config.time_mark[0], and_config.time_mark[1], - and_config.time_mark[2], + and_config.time_mark[0], and_config.cmd[0], + and_config.time_mark[1], and_config.cmd[1], + and_config.time_mark[2], and_config.cmd[2], and_config.min_uid, and_config.min_gid, and_config.affinity, and_config.weight[PRI_U], and_config.weight[PRI_G], @@ -603,6 +626,10 @@ void read_config () and_printf(0,"Configuration file line %i has invalid value for lv1time: %s.\n", line, value); } + } else if (strcmp(param,"lv1cmd")==0) { + if (buffer[strlen(buffer) - 1] == '\n') + buffer[strlen(buffer) - 1] = '\0'; + and_config.cmd[0] = strdup(&buffer[strlen(param) + 1]); } else if (strcmp(param,"lv2time")==0) { if (uval < UINT_MAX) and_config.time_mark[1] = uval; @@ -611,6 +638,10 @@ void read_config () and_printf(0,"Configuration file line %i has invalid value for lv2time: %s.\n", line, value); } + } else if (strcmp(param,"lv2cmd")==0) { + if (buffer[strlen(buffer) - 1] == '\n') + buffer[strlen(buffer) - 1] = '\0'; + and_config.cmd[1] = strdup(&buffer[strlen(param) + 1]); } else if (strcmp(param,"lv3time")==0) { if (uval < UINT_MAX) and_config.time_mark[2] = uval; @@ -619,6 +650,18 @@ void read_config () and_printf(0,"Configuration file line %i has invalid value for lv3time: %s.\n", line, value); } + } else if (strcmp(param,"lv3cmd")==0) { + if (buffer[strlen(buffer) - 1] == '\n') + buffer[strlen(buffer) - 1] = '\0'; + and_config.cmd[2] = strdup(&buffer[strlen(param) + 1]); + } else if (strcmp(param,"cmduser")==0) { + if (strlen(value) > 0) + and_config.cmd_user = strdup(value); /* copy: value is reused for every config line */ + else { + ++bad; + and_printf(0,"Configuration file line %i has empty value for cmduser.\n", + line); + } } else if (strcmp(param,"affinity")==0) { bad_f = -1; u = g = c = p = 0; @@ -679,11 +722,12 @@ void read_config () /* Compute new nice level for given command/uid/gid/utime */ -int and_getnice (int uid, int gid, char *command, struct 
and_procent *parent, unsigned cpu_seconds) +int and_getnice (int uid, int gid, char *command, struct and_procent *parent, unsigned cpu_seconds, char **cmd_out) { int i, level, entry, exact = -1, last; struct and_procent *par; int exactness [PRI_MAXENTRIES]; + *cmd_out = NULL; if (!command) { and_printf(0,"Process without command string encountered. Aborting.\n"); abort(); @@ -769,13 +813,130 @@ int and_getnice (int uid, int gid, char *command, struct and_procent *parent, un while (level >= 0 && and_config.time_mark[level] > cpu_seconds) { --level; } - and_printf(2,"command=%s (%i,%i,%s) hit on entry=%i, exactness=%i, level=%i.\n", + and_printf(2,"command=%s (%i,%i,%s) hit on entry=%i, exactness=%i, level=%i, cs=%i.\n", command, uid, gid, (parent!=NULL?parent->command:"(orphan)"), - entry, exact, level); + entry, exact, level, cpu_seconds); + *cmd_out = (level >= 0) ? and_config.cmd[level] : NULL; return (level >= 0 ? and_db.entry[entry].nl[level] : 0); } +int and_exec (char *cmd, struct and_procent *current, int newnice) +{ + int rc, retval = -1; + + assert(cmd != NULL); + assert(strlen(cmd) > 0); + assert(newnice != 0); + + switch (rc = fork()) { + case 0: + /* child */ + if (getuid() == 0) { + char *user_name = "(unknown)", *group_name = "(unknown)"; + int exit_value = EXIT_SUCCESS; + struct passwd *pw; + struct group *grp; + char buffer[2048]; + FILE *file; + + setenv("HOME", "", 1); + setenv("USER", and_config.cmd_user, 1); + + if ((pw = getpwnam(and_config.cmd_user)) == NULL) { + syslog(LOG_ERR,"unknown command user: %s",and_config.cmd_user); + exit(1); + } + + if (setgid(pw->pw_gid) || + initgroups(and_config.cmd_user, pw->pw_gid) || + setuid(pw->pw_uid)) { + syslog(LOG_ERR,"unable to drop privileges: %s",strerror(errno)); + exit(1); + } + + setenv("AND_HOST", and_config.hostname, 1); + + if ((pw = getpwuid(current->uid)) != NULL) + user_name = pw->pw_name; + if ((grp = getgrgid(current->gid)) != NULL) + group_name = grp->gr_name; + + sprintf(buffer, "%d", 
current->pid); + setenv("AND_PID", buffer, 1); + sprintf(buffer, "%d", current->ppid); + setenv("AND_PPID", buffer, 1); + setenv("AND_USER", user_name, 1); + setenv("AND_GROUP", group_name, 1); + setenv("AND_COMMAND", current->command, 1); + + if ((file = popen(cmd, "w")) != NULL) { + char time_string[256]; + time_t current_time; + + current_time = time(NULL); + strftime(time_string, sizeof(time_string), + "%H:%M", + localtime(&current_time)); + + and_snprintf(buffer, sizeof(buffer), + "\n" + "[This is an automated message from the auto nice daemon (AND)]\n" + "\n"); + buffer[sizeof(buffer) - 1] = '\0'; + fwrite(buffer, 1, strlen(buffer), file); + and_snprintf(buffer, sizeof(buffer), + "A CPU hog has been detected at %s hours on %s:\n" + "\n" + " pid\t\t%d\n" + " ppid\t\t%d\n" + " uid\t\t%d %s\n" + " gid\t\t%d %s\n" + " old nice\t%d\n" + " CPU seconds\t%d\n" + " command\t%s\n" + " start time\t%s" + "\n" + "Action taken:\n" + " %s %d\n", + time_string, + and_config.hostname, + current->pid, + current->ppid, + current->uid, user_name, + current->gid, group_name, + current->nice, + current->utime, + current->command, + ctime(&current->stime), + (newnice > 0) ? "Changed nice to" : "Signalled process with", + (newnice > 0) ? 
newnice : -newnice); + buffer[sizeof(buffer) - 1] = '\0'; + fwrite(buffer, 1, strlen(buffer), file); + if (pclose(file) == -1) { + syslog(LOG_ERR,"error while executing: %s",cmd); + + exit_value = EXIT_FAILURE; + } + } + else { + exit_value = EXIT_FAILURE; + } + + exit(exit_value); + } + exit(EXIT_FAILURE); /* non-root child: never fall back into the daemon loop */ + case -1: + break; + default: + /* parent */ + retval = 0; + break; + } + + return (retval); +} + /********************************************************************** @@ -812,10 +973,14 @@ struct and_procent* and_find_proc (struct and_procent *head, int ppid) void and_loop () { struct and_procent *head, *current, *new, *proc; + int childstatus; int newnice; int njobs = 0; assert(and_getfirst != NULL); assert(and_getnext != NULL); + while (wait3(&childstatus,WNOHANG,NULL) > 0) { + // reaped child... + } head = NULL; current = NULL; proc = and_getfirst(); @@ -843,9 +1008,10 @@ void and_loop () } current = head; while (current != NULL) { + char *cmd; njobs++; newnice = and_getnice(current->uid,current->gid,current->command, - current->parent,current->utime); + current->parent,current->utime,&cmd); if (current->uid != 0) { if (newnice) { if (newnice > 0) { @@ -858,6 +1024,9 @@ void and_loop () current->command); setpriority(PRIO_PROCESS,current->pid,newnice); } + if ((cmd != NULL) && (strlen(cmd) > 0)) { + and_exec(cmd, current, newnice); + } } } else { if (and_config.test) @@ -868,6 +1037,9 @@ void and_loop () current->command); kill(current->pid,-newnice); } + if ((cmd != NULL) && (strlen(cmd) > 0)) { + and_exec(cmd, current, newnice); + } } } } @@ -945,6 +1117,7 @@ void and_getopt (int argc, char** argv) static int g_reload_conf; +static bool g_looping = true; void and_trigger_readconf (int sig) @@ -952,6 +1125,10 @@ void and_trigger_readconf (int sig) g_reload_conf = (sig == SIGHUP); } +void and_trigger_stoplooping (int sig) +{ + g_looping = false; +} void and_readconf () { @@ -961,15 +1138,17 @@ void and_readconf () g_reload_conf = 0; } - void and_worker () { 
read_config(); read_priorities(); + signal(SIGTERM,and_trigger_stoplooping); + signal(SIGINT,and_trigger_stoplooping); + signal(SIGQUIT,and_trigger_stoplooping); signal(SIGHUP,and_trigger_readconf); and_printf(0,"AND ready.\n"); g_reload_conf = 0; - while (1) { + while (g_looping) { if (g_reload_conf) { and_readconf(); } @@ -986,7 +1165,24 @@ int and_main (int argc, char** argv) if (and_config.test) { and_worker(); } else { - if (fork() == 0) and_worker(); + if (daemon(0,0) < 0) { + perror("Unable to daemonize"); + } + else { + char pidbuf[32]; + int pfd; + + if ((pfd = open(AND_PIDFILE, O_EXCL | O_CREAT | O_WRONLY, S_IRUSR | S_IRGRP | S_IROTH)) < 0) { + and_printf(0,"Could not create pid file: %s\n",AND_PIDFILE); + exit(1); + } + fchmod(pfd, S_IRUSR | S_IRGRP | S_IROTH); + sprintf(pidbuf,"%d",getpid()); + write(pfd,pidbuf,strlen(pidbuf)); + close(pfd); + and_worker(); + unlink(AND_PIDFILE); + } } return 0; } diff --git a/sensors/and/and.conf.5.man b/sensors/and/and.conf.5.man index ac57573488..bbd6bcc007 100644 --- a/sensors/and/and.conf.5.man +++ b/sensors/and/and.conf.5.man @@ -90,6 +90,56 @@ Defaults are .I 3600 seconds. +.TP 0.5i +.B lv1cmd, lv2cmd, lv3cmd +Commands for the nice levels. Jobs that reach a new level, and have their +priorities changed, will cause the corresponding command to be executed. The +commands are executed using +.B popen(3) +and an informational message describing the offending job is piped in. In +addition, the following environment variables will be set, so you can reference +them in the given command-line: + +.RS +.TP 0.5i +.B AND_HOST +The host where +.I and +is running. + +.TP 0.5i +.B AND_PID +The process ID of the offending job. + +.TP 0.5i +.B AND_PPID +The parent process ID of the offending job. + +.TP 0.5i +.B AND_USER +The user name the offending job is running with. + +.TP 0.5i +.B AND_GROUP +The group name the offending job is running with. + +.TP 0.5i +.B AND_COMMAND +The command name of the offending job. 
+.RE + +.RS +Normally, the command is executed as "nobody", or the user specified by the +.I cmduser +setting. While this environment is limiting, it is enough to send mail or +perform some other simple communication. +.RE + +.TP 0.5i +.B cmduser +The user name to use when running level commands. The default value for +this is "nobody", which should be sufficient for most uses. + .TP 0.5i .B minuid, mingid Minimum user id and group id to be considered for renicing. @@ -161,6 +211,10 @@ lv2time 1800 .br lv3time 3600 .br +# Get someone involved if they reach level 3 +.br +lv3cmd mail -s "CPU Hog - $AND_COMMAND" bofh@foobar.com +.br # Hosts foo, bar, baz are terminals and must .br # be more responsive, so earlier renice. diff --git a/sensors/and/and.h b/sensors/and/and.h index 7b1108f462..c6b0298b35 100644 --- a/sensors/and/and.h +++ b/sensors/and/and.h @@ -46,10 +46,12 @@ struct and_procent { int uid; int gid; int nice; - unsigned utime; + time_t stime; // process start time in seconds. + unsigned utime; // CPU time in seconds. + unsigned ctime; // reaped child CPU time in seconds. char command [1024]; /* to be filled by and.c: */ - struct and_procent *parent; + struct and_procent *parent; struct and_procent *next; }; diff --git a/sensors/and/and.startup b/sensors/and/and.startup index ea4d968b51..ba77aaddaa 100644 --- a/sensors/and/and.startup +++ b/sensors/and/and.startup @@ -15,14 +15,15 @@ # AND_FLAGS="" -test -r /etc/rc.config && . /etc/rc.config + +# test -r /etc/rc.config && . /etc/rc.config case "$1" in start) echo -n "Starting auto nice daemon:" - INSTALL_SBIN/and $AND_FLAGS >&/dev/null - ps axuw | grep -v grep | grep INSTALL_SBIN/and >/dev/null - if [ $? = 0 ]; then + INSTALL_SBIN/and $AND_FLAGS > /dev/null 2>&1 + ps axuw | grep -v grep | grep INSTALL_SBIN/and > /dev/null + if test $? 
-eq 0; then echo " done" exit 0 else @@ -32,8 +33,23 @@ case "$1" in ;; stop) echo -n "Shutting down auto nice daemon:" - uname | grep OSF1 >/dev/null || killall INSTALL_SBIN/and - echo " done" + ps axuw | grep -v grep | grep INSTALL_SBIN/and > /dev/null + if test $? -eq 0; then + kill `cat /var/run/and.pid` + echo " done" + else + echo " already stopped" + fi + exit 0 + ;; + reconfig) + echo -n "Reconfiguring auto nice daemon:" + if test -r "/var/run/and.pid"; then + kill -HUP `cat /var/run/and.pid` + echo " done" + else + echo " no process" + fi exit 0 ;; restart) @@ -42,8 +58,8 @@ case "$1" in ;; status) echo -n "Checking for auto nice daemon: " - ps axuw | grep -v grep | grep INSTALL_SBIN/and >/dev/null - if [ $? = 0 ]; then + ps axuw | grep -v grep | grep INSTALL_SBIN/and > /dev/null + if test $? -eq 0; then echo "running" exit 0 else @@ -52,6 +68,6 @@ case "$1" in fi ;; *) - echo "Usage: $0 {start|stop|status|restart}" + echo "Usage: $0 {start|stop|status|reconfig|restart}" exit 1 esac -- GitLab