/* linktest.c */
/*
 * EMULAB-COPYRIGHT
 * Copyright (c) 2000-2007 University of Utah and the Flux Group.
 * All rights reserved.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <ctype.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <paths.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <signal.h>
#include <pwd.h>
#include <time.h>
#include "tbdefs.h"
#include "log.h"
#include "be_user.h"
#include "event.h"

/* Values stored in the `locked` flag. */
#define TRUE    1
#define FALSE   0
/*
 * Path of the script exec'd for each START event.  CLIENT_BINDIR is
 * not defined in this file.
 */
#define LINKTEST_SCRIPT CLIENT_BINDIR "/linktest.pl"
/* Upper bound on the number of argv entries exec_linktest() builds. */
#define MAX_ARGS 10
30

31 32 33 34
static int	      debug;
static volatile int   locked;
static pid_t          linktest_pid;
static char           *pideid;
35
static char           *swapper;
36
static event_handle_t handle;
37
static unsigned long  token = ~0;
38

39 40
/*
 * Forward declarations: the two event-system callbacks, followed by the
 * helpers that start, reap and kill the linktest child.
 */
static void	      callback(event_handle_t handle,
			       event_notification_t notification, void *data);
static void	      start_callback(event_handle_t handle,
				     event_notification_t notification,
				     void *data);
static void           exec_linktest(char *args, int buflen);
static void           sigchld_handler(int sig);
static void           send_group_kill(void);
static void           send_kill_event(void);
     
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status -1.  Does not return.
 *
 * progname: name to show in the synopsis (main() passes argv[0]).
 *
 * The synopsis lists every option accepted by the getopt() string in
 * main(), including -i (pidfile) and -V (print build info).
 */
void
usage(char *progname)
{
	fprintf(stderr,
		"Usage: %s [-d] [-V] "
		"[-s server] [-p port] [-k keyfile] [-l logfile] "
		"[-i pidfile] [-u user] -e pid/eid\n",
		progname);
	exit(-1);
}

59 60
int
main(int argc, char **argv) {
61

62 63 64
	address_tuple_t	tuple;
	char *server = NULL;
	char *port = NULL;
65
	char *keyfile = NULL;
66
	char *pidfile = NULL;
67 68 69 70
	char *logfile = NULL;
	char *progname;
	char c;
	char buf[BUFSIZ];
71
	extern char build_info[];
72
	pideid = NULL;
73 74
	
	progname = argv[0];
75

76
	while ((c = getopt(argc, argv, "s:p:e:l:dk:i:Vu:")) != -1) {
77
	  switch (c) {
78 79 80
	  case 'd':
	    debug++;
	    break;
81 82 83 84 85 86 87 88 89
	  case 's':
	    server = optarg;
	    break;
	  case 'p':
	    port = optarg;
	    break;
	  case 'e':
	    pideid = optarg;
	    break;
90 91 92
	  case 'i':
	    pidfile = optarg;
	    break;
93 94 95
	  case 'l':
	    logfile = optarg;
	    break;
96 97 98
	  case 'k':
	    keyfile = optarg;
	    break;
99 100 101 102 103 104 105
	  case 'u':
	    swapper = optarg;
	    break;
	  case 'V':
	    fprintf(stderr, "%s\n", build_info);
	    exit(0);
	    break;
106 107 108 109 110 111 112 113
	  default:
	    usage(progname);
	  }
	}

	if (!pideid)
	  usage(progname);

114 115 116 117 118 119 120 121 122
	if (debug)
		loginit(0, 0);
	else {
		if (logfile)
			loginit(0, logfile);
		else
			loginit(1, "linktest");
		/* See below for daemonization */
	}
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145

	/*
	 * Convert server/port to elvin thing.
	 *
	 * XXX This elvin string stuff should be moved down a layer. 
	 */
	if (server) {
		snprintf(buf, sizeof(buf), "elvin://%s%s%s",
			 server,
			 (port ? ":"  : ""),
			 (port ? port : ""));
		server = buf;
	}

	/*
	 * Construct an address tuple for subscribing to events for
	 * this node.
	 */
	tuple = address_tuple_alloc();
	if (tuple == NULL) {
		fatal("could not allocate an address tuple");
	}
	/*
146
	 * Ask for just the events we care about. 
147 148 149
	 */
	tuple->expt      = pideid;
	tuple->objtype   = TBDB_OBJECTTYPE_LINKTEST;
150 151 152
	tuple->eventtype =
		TBDB_EVENTTYPE_START ","
		TBDB_EVENTTYPE_KILL;
153 154 155 156

	/*
	 * Register with the event system. 
	 */
157
	handle = event_register_withkeyfile(server, 0, keyfile);
158
	if (handle == NULL) {
159
	        fatal("could not register with event system");
160 161 162 163 164 165 166 167
	}
	
	/*
	 * Subscribe to the event we specified above.
	 */
	if (! event_subscribe(handle, callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}
168

169 170 171 172 173 174 175 176 177 178 179
	tuple->objtype   = TBDB_OBJECTTYPE_TIME;
	tuple->objname   = ADDRESSTUPLE_ANY;
	tuple->eventtype = TBDB_EVENTTYPE_START;

	/*
	 * Subscribe to the TIME start event we specified above.
	 */
	if (! event_subscribe(handle, start_callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}

180 181 182 183 184
	/*
	 * Do this now, once we have had a chance to fail on the above
	 * event system calls.
	 */
	if (!debug)
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
		daemon(0, 1);

	/*
	 * Write out a pidfile if root (after we daemonize).
	 */
	if (!getuid()) {
		FILE *fp;
		
		if (pidfile)
			strcpy(buf, pidfile);
		else
			sprintf(buf, "%s/linktest.pid", _PATH_VARRUN);
		fp = fopen(buf, "w");
		if (fp != NULL) {
			fprintf(fp, "%d\n", getpid());
			(void) fclose(fp);
		}
	}

204 205 206 207 208 209 210 211
	/*
	 * Initialize variables used to control child execution
	 */
	locked = FALSE;
	if(signal(SIGCHLD,sigchld_handler) == SIG_ERR) {
	        fatal("could not install child handler");
	}
	
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
	/*
	 * Begin the event loop, waiting to receive event notifications:
	 */
	event_main(handle);

	/*
	 * Unregister with the event system:
	 */
	if (event_unregister(handle) == 0) {
		fatal("could not unregister with event system");
	}

	return 0;
}
/*
 * Handle the LINKTEST events.
 *
 * START: if no run is outstanding, mark the daemon locked, fork, and in
 *        the child move to a new process group and exec the linktest
 *        script with the event's arguments.  If a run is outstanding the
 *        event is only logged.
 * KILL:  if a run is outstanding, signal the child's process group.
 *
 * The TOKEN field of the notification is stored in the file-scope
 * `token'; it is left unchanged if the call does not store a value.
 */
static void
callback(event_handle_t handle, event_notification_t notification, void *data)
{
	char		objname[TBDB_FLEN_EVOBJTYPE];
	char		event[TBDB_FLEN_EVEVENTTYPE];
	char		args[BUFSIZ];
	int32_t		tokenval;

	if (! event_notification_get_objname(handle, notification,
					     objname, sizeof(objname))) {
		error("Could not get objname from notification!\n");
		return;
	}

	if (! event_notification_get_eventtype(handle, notification,
					       event, sizeof(event))) {
		error("Could not get event from notification!\n");
		return;
	}

	/*
	 * Fetch TOKEN through a genuine int32_t.  Handing the callee
	 * (int32_t *)&token would update only half of the unsigned long
	 * where long is 64 bits wide.  The local is seeded from the
	 * current token so a call that stores nothing leaves it as is;
	 * -1 stands for the all-ones "no token" sentinel.
	 */
	tokenval = (token == ~0UL) ? -1 : (int32_t)(uint32_t)token;
	event_notification_get_int32(handle, notification,
				     "TOKEN", &tokenval);
	token = (tokenval == -1) ? ~0UL : (unsigned long)(uint32_t)tokenval;

	/* Make sure the buffer is a valid string even if the call fails. */
	args[0] = '\0';
	event_notification_get_arguments(handle,
					 notification, args, sizeof(args));

	info("event: %s - %s - %s\n", objname, event, args);

	/*
	 * Dispatch the event.
	 */
	if (strcmp(event, TBDB_EVENTTYPE_START) == 0) {
		if (locked) {
			info("linktest already in progress\n");
			return;
		}

		/*
		 * Set locked bit. The bit is not set to false
		 * until a SIGCHLD signal is received
		 */
		locked = TRUE;

		/*
		 * The Linktest script is not running, so
		 * fork a process and execute it.
		 */
		linktest_pid = fork();
		if (linktest_pid < 0) {
			error("Could not fork a process to run linktest script!\n");
			/*
			 * No child exists, so no SIGCHLD will arrive to
			 * clear the lock; release it here.
			 */
			locked = FALSE;
			return;
		}

		if (linktest_pid == 0) {
			/*
			 * Changes the process group of the child to itself
			 * so a sigkill to the child process group will not
			 * kill the Linktest daemon.
			 */
			setpgid(0, getpid());

			/* Finally, execute the linktest script. */
			exec_linktest(args, sizeof(args));

			/*
			 * The child must never fall back into the daemon's
			 * event loop, whatever exec_linktest() did.
			 */
			_exit(127);
		}
	} else if (strcmp(event, TBDB_EVENTTYPE_KILL) == 0) {
		/*
		 * Ignore unless we are running.
		 *
		 * If KILL is received, there is a problem on this
		 * or some other node. So, kill off linktest
		 * and its children.
		 */
		if (locked) {
			send_group_kill();
		}
	}
}

/*
 * Handler for the TIME subscription set up in main().  On a START event
 * it signals the linktest process group if a run is outstanding and then
 * clears the saved token.  Any other event type is ignored.
 */
static void
start_callback(event_handle_t handle,
	       event_notification_t notification,
	       void *data)
{
	char	evtype[TBDB_FLEN_EVEVENTTYPE];
	int	have_type;

	have_type = event_notification_get_eventtype(handle, notification,
						     evtype, sizeof(evtype));
	if (!have_type) {
		error("Could not get event from notification!\n");
		return;
	}

	/* Only START is of interest here. */
	if (strcmp(evtype, TBDB_EVENTTYPE_START) != 0)
		return;

	/* A run is in flight: reset to a clean state. */
	if (locked)
		send_group_kill();

	token = ~0;
}

342 343 344 345
/*
 * Executes Linktest with arguments received from the Linktest
 * start event. Does not return.
 */ 
346
static void
347
exec_linktest(char *args, int buflen) {
348
	char	   *word, *argv[MAX_ARGS], swapperarg[128], tokenarg[32];
349 350 351 352 353 354 355 356
	int	   i,res;

	/*
	 * Set up arguments for execv call by parsing contents
	 * of the event string.
	 */
	word = strtok(args," \t");
	i=1;
357 358 359 360
	sprintf(swapperarg, "SWAPPER=%s", swapper);
	argv[i++] = swapperarg;
	sprintf(tokenarg, "TOKEN=%lu", token);
	argv[i++] = tokenarg;
361 362 363 364 365 366 367
	do {
	  argv[i++] = word;
	} while ((word = strtok(NULL," \t"))
		 && (i<MAX_ARGS));
	argv[i] = NULL;
	argv[0] = LINKTEST_SCRIPT;

368 369 370 371 372 373 374
#ifdef __CYGWIN__
	/*
	 * Run as the swapper on Cygwin for access to the shared /proj dir.
	 */
	be_user(swapper);
#endif /* __CYGWIN__ */

375 376 377 378 379 380 381 382
	/*
	 * Execute the script with the arguments from the event
	 */
	res = execv( LINKTEST_SCRIPT,argv);
	if(res < 0) {
	    error("Could not execute the Linktest script.");
	    return;
	}
383
}
384

385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
static
void sigchld_handler(int sig) {
        pid_t pid;
	int status;
	int exit_code;

	/*
	 * If the exit_code is nonzero after a normal exit,
	 * the daemon sends a KILL to let other nodes know
	 * that something has failed.
	 *
	 * However, ignore the case of a non-normal exit,
	 * since that is likely the result of a KILL signal.
	 */
	exit_code = 0; 
	while((pid = waitpid(-1, &status, 0)) > 0) {

	  /*
	   * If Linktest died due to an error, Perl will exit
	   * the script normally with an error code. 
	   */
	  if(WIFEXITED(status)) {
	    exit_code = WEXITSTATUS(status);
	    info("Linktest exit code: %d\n",exit_code);
	    
	    /*
	     * If this was an abnormal exit (exit status != 0)
	     * then we must send KILL to the process group of
	     * Linktest to kill its subchildren. However,
	     * it doesn't seem to hurt (cause kill errors) to
	     * send it anyway.
	     */
	    send_group_kill();
	    
	  } else if (WIFSIGNALED(status)) {
	    /*
	     * Linktest exited due to a signal, likely from
	     * this daemon. If that's the case, group_kill
	     * has already been sent.
	     */
425 426 427 428 429
	    sig = WTERMSIG(status);
	    info("Linktest killed by signal %d.\n", sig);
	    if (sig != SIGTERM)
		    exit_code = sig;

430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466
	  } else {
	    /*
	     * Linktest is stopped unexpectedly.
	     */
	    error("unexpected SIGCHLD received\n");
	  }
	  
	}
	if(errno != ECHILD) {
	  error("waitpid error\n");
	}

	/*
	 * Go ahead and unlock since Linktest and its children
	 * should all be killed and reaped now.
	 */
	locked = FALSE;

	/*
	 * Now let other nodes know about the problem, if any.
	 */
	if(exit_code) {
	  info("Posting KILL event\n");
	  send_kill_event();
	}
	return;
}

static
void send_group_kill() {
        int res;
	
        /*
	 * Kill off all processes in the process group of the
	 * Linktest run. This may include the linktest script
	 * itself, and any children it forked.
	 */
467
	res = killpg(linktest_pid, SIGTERM);
468 469 470 471 472 473 474 475 476 477 478
	if(res < 0) {
	  /*
	   * Not a serious error, likely the process group
	   * has already exited.
	   */
	  return;
	}
}

static
void send_kill_event() {
479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
	event_do(handle,
		 EA_Experiment, pideid,
		 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
		 EA_Name, "linktest",
		 EA_Event, TBDB_EVENTTYPE_KILL,
		 EA_TAG_DONE);
	if (token != ~0) {
		event_do(handle,
			 EA_Experiment, pideid,
			 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
			 EA_Name, "linktest",
			 EA_Event, TBDB_EVENTTYPE_COMPLETE,
			 EA_ArgInteger, "ERROR", 1,
			 EA_ArgInteger, "CTOKEN", token,
			 EA_TAG_DONE);
		token = ~0;
495 496
	}
}