linktest.c 10.7 KB
Newer Older
1
2
/*
 * EMULAB-COPYRIGHT
3
 * Copyright (c) 2000-2010 University of Utah and the Flux Group.
4
5
6
7
8
9
10
 * All rights reserved.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <ctype.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <paths.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <signal.h>
#include <pwd.h>
#include <time.h>
#include "tbdefs.h"
#include "log.h"
#include "be_user.h"
#include "event.h"
26

Timothy Stack's avatar
 
Timothy Stack committed
27
28
#define TRUE    1
#define FALSE   0
29
#define LINKTEST_SCRIPT CLIENT_BINDIR "/elab_linktest.pl"
Timothy Stack's avatar
 
Timothy Stack committed
30
#define MAX_ARGS 10
31

32
33
34
35
static int	      debug;
static volatile int   locked;
static pid_t          linktest_pid;
static char           *pideid;
Timothy Stack's avatar
 
Timothy Stack committed
36
static char           *swapper;
37
static event_handle_t handle;
Timothy Stack's avatar
 
Timothy Stack committed
38
static unsigned long  token = ~0;
39
static char	      *nodelocal_dir;
40

41
42
static void	      callback(event_handle_t handle,
			       event_notification_t notification, void *data);
Timothy Stack's avatar
 
Timothy Stack committed
43
44
45
static void	      start_callback(event_handle_t handle,
				     event_notification_t notification,
				     void *data);
46
47
48
49
50
static void           exec_linktest(char *args, int);
static void           sigchld_handler(int sig);
static void           send_group_kill();
static void           send_kill_event();
     
51
52
53
54
/*
 * Print the command-line synopsis on stderr and terminate the process
 * with exit status -1.  Does not return.
 *
 * progname: name to show in the synopsis (main() passes argv[0]).
 *
 * The synopsis lists every option accepted by the getopt() string in
 * main(), including -i (pidfile) and -V (print build info).
 */
void
usage(char *progname)
{
	fprintf(stderr,
		"Usage: %s [-d] [-V] "
		"[-s server] [-p port] [-k keyfile] [-l logfile] "
		"[-i pidfile] [-u user] [-N dir] -e pid/eid\n",
		progname);
	exit(-1);
}

61
62
int
main(int argc, char **argv) {
63

64
65
66
	address_tuple_t	tuple;
	char *server = NULL;
	char *port = NULL;
67
	char *keyfile = NULL;
68
	char *pidfile = NULL;
69
70
71
72
	char *logfile = NULL;
	char *progname;
	char c;
	char buf[BUFSIZ];
Timothy Stack's avatar
 
Timothy Stack committed
73
	extern char build_info[];
74
	pideid = NULL;
75
76
	
	progname = argv[0];
77

78
	while ((c = getopt(argc, argv, "s:p:e:l:dk:i:Vu:N:")) != -1) {
79
	  switch (c) {
80
81
82
	  case 'd':
	    debug++;
	    break;
83
84
85
86
87
88
89
90
91
	  case 's':
	    server = optarg;
	    break;
	  case 'p':
	    port = optarg;
	    break;
	  case 'e':
	    pideid = optarg;
	    break;
92
93
94
	  case 'i':
	    pidfile = optarg;
	    break;
95
96
97
	  case 'l':
	    logfile = optarg;
	    break;
98
99
100
	  case 'k':
	    keyfile = optarg;
	    break;
Timothy Stack's avatar
 
Timothy Stack committed
101
102
103
	  case 'u':
	    swapper = optarg;
	    break;
104
105
106
	  case 'N':
	    nodelocal_dir = optarg;
	    break;
Timothy Stack's avatar
 
Timothy Stack committed
107
108
109
110
	  case 'V':
	    fprintf(stderr, "%s\n", build_info);
	    exit(0);
	    break;
111
112
113
114
115
116
117
118
	  default:
	    usage(progname);
	  }
	}

	if (!pideid)
	  usage(progname);

119
120
121
122
123
124
125
126
127
	if (debug)
		loginit(0, 0);
	else {
		if (logfile)
			loginit(0, logfile);
		else
			loginit(1, "linktest");
		/* See below for daemonization */
	}
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

	/*
	 * Convert server/port to elvin thing.
	 *
	 * XXX This elvin string stuff should be moved down a layer. 
	 */
	if (server) {
		snprintf(buf, sizeof(buf), "elvin://%s%s%s",
			 server,
			 (port ? ":"  : ""),
			 (port ? port : ""));
		server = buf;
	}

	/*
	 * Construct an address tuple for subscribing to events for
	 * this node.
	 */
	tuple = address_tuple_alloc();
	if (tuple == NULL) {
		fatal("could not allocate an address tuple");
	}
	/*
151
	 * Ask for just the events we care about. 
152
153
154
	 */
	tuple->expt      = pideid;
	tuple->objtype   = TBDB_OBJECTTYPE_LINKTEST;
Timothy Stack's avatar
 
Timothy Stack committed
155
156
157
	tuple->eventtype =
		TBDB_EVENTTYPE_START ","
		TBDB_EVENTTYPE_KILL;
158
159
160
161

	/*
	 * Register with the event system. 
	 */
162
	handle = event_register_withkeyfile(server, 0, keyfile);
163
	if (handle == NULL) {
164
	        fatal("could not register with event system");
165
166
167
168
169
170
171
172
	}
	
	/*
	 * Subscribe to the event we specified above.
	 */
	if (! event_subscribe(handle, callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}
173

Timothy Stack's avatar
 
Timothy Stack committed
174
175
176
177
178
179
180
181
182
183
184
	tuple->objtype   = TBDB_OBJECTTYPE_TIME;
	tuple->objname   = ADDRESSTUPLE_ANY;
	tuple->eventtype = TBDB_EVENTTYPE_START;

	/*
	 * Subscribe to the TIME start event we specified above.
	 */
	if (! event_subscribe(handle, start_callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}

185
186
187
188
189
	/*
	 * Do this now, once we have had a chance to fail on the above
	 * event system calls.
	 */
	if (!debug)
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
		daemon(0, 1);

	/*
	 * Write out a pidfile if root (after we daemonize).
	 */
	if (!getuid()) {
		FILE *fp;
		
		if (pidfile)
			strcpy(buf, pidfile);
		else
			sprintf(buf, "%s/linktest.pid", _PATH_VARRUN);
		fp = fopen(buf, "w");
		if (fp != NULL) {
			fprintf(fp, "%d\n", getpid());
			(void) fclose(fp);
		}
	}

209
210
211
212
213
214
215
216
	/*
	 * Initialize variables used to control child execution
	 */
	locked = FALSE;
	if(signal(SIGCHLD,sigchld_handler) == SIG_ERR) {
	        fatal("could not install child handler");
	}
	
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
	/*
	 * Begin the event loop, waiting to receive event notifications:
	 */
	event_main(handle);

	/*
	 * Unregister with the event system:
	 */
	if (event_unregister(handle) == 0) {
		fatal("could not unregister with event system");
	}

	return 0;
}
/*
 * Handle the events.
 */
/*
 * Event handler for the LINKTEST START/KILL subscription.
 *
 * handle:       event system handle the notification arrived on.
 * notification: the event being delivered.
 * data:         unused.
 *
 * START: if no run is in progress, mark the daemon locked, fork, and
 *        have the child exec the linktest script in its own process
 *        group.  If a run is in progress the event is only logged.
 * KILL:  if a run is in progress, SIGTERM the linktest process group.
 *
 * The TOKEN field of the notification, when present, is remembered in
 * the file-scope token variable.
 */
static void
callback(event_handle_t handle, event_notification_t notification, void *data)
{
	char		objname[TBDB_FLEN_EVOBJTYPE];
	char		event[TBDB_FLEN_EVEVENTTYPE];
	char		args[BUFSIZ];
	int32_t		event_token;

	if (! event_notification_get_objname(handle, notification,
					     objname, sizeof(objname))) {
		error("Could not get objname from notification!\n");
		return;
	}

	if (! event_notification_get_eventtype(handle, notification,
					       event, sizeof(event))) {
		error("Could not get event from notification!\n");
		return;
	}

	/*
	 * Fetch the token into a real int32_t.  Writing it through a
	 * cast of &token would only overwrite part of the unsigned long
	 * where long is wider than 32 bits.
	 *
	 * NOTE(review): assumes event_notification_get_int32() returns
	 * nonzero on success, like the getters above -- confirm in event.h.
	 */
	if (event_notification_get_int32(handle, notification,
					 "TOKEN", &event_token)) {
		token = (unsigned long) event_token;
	}

	/* Make sure args is a valid (empty) string if the getter fails. */
	args[0] = '\0';
	event_notification_get_arguments(handle,
					 notification, args, sizeof(args));

	info("event: %s - %s - %s\n", objname, event, args);

	/*
	 * Dispatch the event.
	 */
	if (!strcmp(event, TBDB_EVENTTYPE_START)) {
		if (locked) {
			info("linktest already in progress\n");
			return;
		}

		/*
		 * Set locked bit. The bit is not set to false
		 * until a SIGCHLD signal is received
		 */
		locked = TRUE;

		/*
		 * The Linktest script is not running, so
		 * fork a process and execute it.
		 */
		linktest_pid = fork();
		if (linktest_pid < 0) {
			error("Could not fork a process to run linktest script!\n");
			/*
			 * No child means no SIGCHLD will ever arrive to
			 * clear the lock, so release it here.
			 */
			locked = FALSE;
			return;
		}

		if (linktest_pid == 0) {
			/*
			 * Child: move into a process group of our own so
			 * that a signal sent to that group will not kill
			 * the Linktest daemon.
			 */
			setpgid(0, getpid());

			/* Finally, execute the linktest script. */
			exec_linktest(args, sizeof(args));

			/*
			 * Only reached if the exec failed; the child must
			 * not carry on running the daemon's event loop.
			 */
			_exit(1);
		}

		/*
		 * Parent: set the child's group from this side as well, so
		 * the group exists before any killpg() no matter which
		 * process runs first.  Failure (for instance because the
		 * child has already exec'ed) is harmless.
		 */
		(void) setpgid(linktest_pid, linktest_pid);
	} else if (!strcmp(event, TBDB_EVENTTYPE_KILL)) {
		/*
		 * Ignore unless we are running.
		 */
		if (locked) {
			/*
			 * If KILL is received, there is a problem on this
			 * or some other node. So, kill off linktest
			 * and its children.
			 */
			send_group_kill();
		}
	}
}
319

Timothy Stack's avatar
 
Timothy Stack committed
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
/*
 * Event handler for the TIME START subscription.
 *
 * On a START event any linktest run still in progress is signalled
 * through send_group_kill() and the remembered token is reset to ~0.
 * Events of any other type are ignored.
 */
static void
start_callback(event_handle_t handle,
	       event_notification_t notification,
	       void *data)
{
	char		evtype[TBDB_FLEN_EVEVENTTYPE];

	if (! event_notification_get_eventtype(handle, notification,
					       evtype, sizeof(evtype))) {
		error("Could not get event from notification!\n");
		return;
	}

	/* Nothing to do for anything but START. */
	if (strcmp(evtype, TBDB_EVENTTYPE_START) != 0)
		return;

	/* Reset to a clean state; only signal when a run is active. */
	if (locked)
		send_group_kill();
	token = ~0;
}

347
348
349
350
/*
 * Executes Linktest with arguments received from the Linktest
 * start event. Does not return.
 */ 
351
static void
352
exec_linktest(char *args, int buflen) {
Timothy Stack's avatar
 
Timothy Stack committed
353
	char	   *word, *argv[MAX_ARGS], swapperarg[128], tokenarg[32];
354
	char	   logdirarg[128];
355
356
357
358
359
360
361
362
	int	   i,res;

	/*
	 * Set up arguments for execv call by parsing contents
	 * of the event string.
	 */
	word = strtok(args," \t");
	i=1;
363
	snprintf(swapperarg, sizeof(swapperarg), "SWAPPER=%s", swapper);
Timothy Stack's avatar
 
Timothy Stack committed
364
	argv[i++] = swapperarg;
365
	snprintf(tokenarg, sizeof(tokenarg), "TOKEN=%lu", token);
Timothy Stack's avatar
 
Timothy Stack committed
366
	argv[i++] = tokenarg;
367
368
369
370
371
	if (nodelocal_dir) {
		snprintf(logdirarg, sizeof(logdirarg),
			 "SHAREDDIR=0 LOGDIR=%s", nodelocal_dir);
		argv[i++] = logdirarg;
	}
372
	do {
373
374
		argv[i++] = word;
	} while ((word = strtok(NULL," \t")) && (i<MAX_ARGS));
375
376
377
	argv[i] = NULL;
	argv[0] = LINKTEST_SCRIPT;

378
379
380
381
382
383
384
#ifdef __CYGWIN__
	/*
	 * Run as the swapper on Cygwin for access to the shared /proj dir.
	 */
	be_user(swapper);
#endif /* __CYGWIN__ */

385
386
387
	/*
	 * Execute the script with the arguments from the event
	 */
388
	res = execv(LINKTEST_SCRIPT, argv);
389
390
391
392
	if(res < 0) {
	    error("Could not execute the Linktest script.");
	    return;
	}
393
}
394

395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
/*
 * SIGCHLD handler: reaps every exited child, releases the run lock
 * and, when a child failed, posts a KILL event for the other nodes.
 *
 * sig: signal number (unused).
 *
 * errno is saved on entry and restored on exit so that the code
 * interrupted by the signal does not see a value left behind by
 * waitpid() or by the calls made here.
 */
static
void sigchld_handler(int sig) {
	pid_t pid;
	int status;
	int exit_code;
	int termsig;
	int saved_errno = errno;

	(void) sig;

	/*
	 * If the exit_code is nonzero after a normal exit,
	 * the daemon sends a KILL to let other nodes know
	 * that something has failed.
	 *
	 * However, ignore the case of a non-normal exit,
	 * since that is likely the result of a KILL signal.
	 */
	exit_code = 0;
	while ((pid = waitpid(-1, &status, 0)) > 0) {

		/*
		 * If Linktest died due to an error, Perl will exit
		 * the script normally with an error code.
		 */
		if (WIFEXITED(status)) {
			exit_code = WEXITSTATUS(status);
			info("Linktest exit code: %d\n", exit_code);

			/*
			 * If this was an abnormal exit (exit status != 0)
			 * then we must send KILL to the process group of
			 * Linktest to kill its subchildren. However,
			 * it doesn't seem to hurt (cause kill errors) to
			 * send it anyway.
			 */
			send_group_kill();

		} else if (WIFSIGNALED(status)) {
			/*
			 * Linktest exited due to a signal, likely from
			 * this daemon. If that's the case, group_kill
			 * has already been sent.  Any signal other than
			 * SIGTERM is reported as a failure.
			 */
			termsig = WTERMSIG(status);
			info("Linktest killed by signal %d.\n", termsig);
			if (termsig != SIGTERM)
				exit_code = termsig;

		} else {
			/*
			 * Linktest is stopped unexpectedly.
			 */
			error("unexpected SIGCHLD received\n");
		}
	}

	/* Running out of children (ECHILD) is how the loop normally ends. */
	if (pid < 0 && errno != ECHILD) {
		error("waitpid error\n");
	}

	/*
	 * Go ahead and unlock since Linktest and its children
	 * should all be killed and reaped now.
	 */
	locked = FALSE;

	/*
	 * Now let other nodes know about the problem, if any.
	 */
	if (exit_code) {
		info("Posting KILL event\n");
		send_kill_event();
	}

	errno = saved_errno;
}

static
void send_group_kill() {
        int res;
	
        /*
	 * Kill off all processes in the process group of the
	 * Linktest run. This may include the linktest script
	 * itself, and any children it forked.
	 */
477
	res = killpg(linktest_pid, SIGTERM);
478
479
480
481
482
483
484
485
486
487
488
	if(res < 0) {
	  /*
	   * Not a serious error, likely the process group
	   * has already exited.
	   */
	  return;
	}
}

static
void send_kill_event() {
Timothy Stack's avatar
 
Timothy Stack committed
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
	event_do(handle,
		 EA_Experiment, pideid,
		 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
		 EA_Name, "linktest",
		 EA_Event, TBDB_EVENTTYPE_KILL,
		 EA_TAG_DONE);
	if (token != ~0) {
		event_do(handle,
			 EA_Experiment, pideid,
			 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
			 EA_Name, "linktest",
			 EA_Event, TBDB_EVENTTYPE_COMPLETE,
			 EA_ArgInteger, "ERROR", 1,
			 EA_ArgInteger, "CTOKEN", token,
			 EA_TAG_DONE);
		token = ~0;
505
506
	}
}