/*
 * EMULAB-COPYRIGHT
 * Copyright (c) 2000-2007 University of Utah and the Flux Group.
 * All rights reserved.
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <paths.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <signal.h>
#include <pwd.h>
#include <time.h>
#include "tbdefs.h"
#include "log.h"
#include "be_user.h"
#include "event.h"

Timothy Stack's avatar
   
Timothy Stack committed
27
28
29
30
#define TRUE    1
#define FALSE   0
#define LINKTEST_SCRIPT CLIENT_BINDIR "/linktest.pl"
#define MAX_ARGS 10
31

32
33
34
35
static int	      debug;
static volatile int   locked;
static pid_t          linktest_pid;
static char           *pideid;
Timothy Stack's avatar
   
Timothy Stack committed
36
static char           *swapper;
37
static event_handle_t handle;
Timothy Stack's avatar
   
Timothy Stack committed
38
static unsigned long  token = ~0;
39

40
41
static void	      callback(event_handle_t handle,
			       event_notification_t notification, void *data);
Timothy Stack's avatar
   
Timothy Stack committed
42
43
44
static void	      start_callback(event_handle_t handle,
				     event_notification_t notification,
				     void *data);
45
46
47
48
49
static void           exec_linktest(char *args, int);
static void           sigchld_handler(int sig);
static void           send_group_kill();
static void           send_kill_event();
     
50
51
52
53
/*
 * Print the command line synopsis to stderr and terminate the
 * process with exit status -1. Does not return.
 *
 * The synopsis lists every option main() passes to getopt(),
 * including -i (pidfile) and -V (print build info).
 */
void
usage(char *progname)
{
	fprintf(stderr,
		"Usage: %s [-d] [-V] "
		"[-s server] [-p port] [-k keyfile] [-l logfile] "
		"[-i pidfile] [-u user] -e pid/eid\n",
		progname);
	exit(-1);
}

int
main(int argc, char **argv) {
62

63
64
65
	address_tuple_t	tuple;
	char *server = NULL;
	char *port = NULL;
66
	char *keyfile = NULL;
67
	char *pidfile = NULL;
68
69
70
71
	char *logfile = NULL;
	char *progname;
	char c;
	char buf[BUFSIZ];
Timothy Stack's avatar
   
Timothy Stack committed
72
	extern char build_info[];
73
	pideid = NULL;
74
75
	
	progname = argv[0];
76

Timothy Stack's avatar
   
Timothy Stack committed
77
	while ((c = getopt(argc, argv, "s:p:e:l:dk:i:Vu:")) != -1) {
78
	  switch (c) {
79
80
81
	  case 'd':
	    debug++;
	    break;
82
83
84
85
86
87
88
89
90
	  case 's':
	    server = optarg;
	    break;
	  case 'p':
	    port = optarg;
	    break;
	  case 'e':
	    pideid = optarg;
	    break;
91
92
93
	  case 'i':
	    pidfile = optarg;
	    break;
94
95
96
	  case 'l':
	    logfile = optarg;
	    break;
97
98
99
	  case 'k':
	    keyfile = optarg;
	    break;
Timothy Stack's avatar
   
Timothy Stack committed
100
101
102
103
104
105
106
	  case 'u':
	    swapper = optarg;
	    break;
	  case 'V':
	    fprintf(stderr, "%s\n", build_info);
	    exit(0);
	    break;
107
108
109
110
111
112
113
114
	  default:
	    usage(progname);
	  }
	}

	if (!pideid)
	  usage(progname);

115
116
117
118
119
120
121
122
123
	if (debug)
		loginit(0, 0);
	else {
		if (logfile)
			loginit(0, logfile);
		else
			loginit(1, "linktest");
		/* See below for daemonization */
	}
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

	/*
	 * Convert server/port to elvin thing.
	 *
	 * XXX This elvin string stuff should be moved down a layer. 
	 */
	if (server) {
		snprintf(buf, sizeof(buf), "elvin://%s%s%s",
			 server,
			 (port ? ":"  : ""),
			 (port ? port : ""));
		server = buf;
	}

	/*
	 * Construct an address tuple for subscribing to events for
	 * this node.
	 */
	tuple = address_tuple_alloc();
	if (tuple == NULL) {
		fatal("could not allocate an address tuple");
	}
	/*
147
	 * Ask for just the events we care about. 
148
149
150
	 */
	tuple->expt      = pideid;
	tuple->objtype   = TBDB_OBJECTTYPE_LINKTEST;
Timothy Stack's avatar
   
Timothy Stack committed
151
152
153
	tuple->eventtype =
		TBDB_EVENTTYPE_START ","
		TBDB_EVENTTYPE_KILL;
154
155
156
157

	/*
	 * Register with the event system. 
	 */
158
	handle = event_register_withkeyfile(server, 0, keyfile);
159
	if (handle == NULL) {
160
	        fatal("could not register with event system");
161
162
163
164
165
166
167
168
	}
	
	/*
	 * Subscribe to the event we specified above.
	 */
	if (! event_subscribe(handle, callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}
169

Timothy Stack's avatar
   
Timothy Stack committed
170
171
172
173
174
175
176
177
178
179
180
	tuple->objtype   = TBDB_OBJECTTYPE_TIME;
	tuple->objname   = ADDRESSTUPLE_ANY;
	tuple->eventtype = TBDB_EVENTTYPE_START;

	/*
	 * Subscribe to the TIME start event we specified above.
	 */
	if (! event_subscribe(handle, start_callback, tuple, NULL)) {
		fatal("could not subscribe to event");
	}

181
182
183
184
185
	/*
	 * Do this now, once we have had a chance to fail on the above
	 * event system calls.
	 */
	if (!debug)
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
		daemon(0, 1);

	/*
	 * Write out a pidfile if root (after we daemonize).
	 */
	if (!getuid()) {
		FILE *fp;
		
		if (pidfile)
			strcpy(buf, pidfile);
		else
			sprintf(buf, "%s/linktest.pid", _PATH_VARRUN);
		fp = fopen(buf, "w");
		if (fp != NULL) {
			fprintf(fp, "%d\n", getpid());
			(void) fclose(fp);
		}
	}

205
206
207
208
209
210
211
212
	/*
	 * Initialize variables used to control child execution
	 */
	locked = FALSE;
	if(signal(SIGCHLD,sigchld_handler) == SIG_ERR) {
	        fatal("could not install child handler");
	}
	
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
	/*
	 * Begin the event loop, waiting to receive event notifications:
	 */
	event_main(handle);

	/*
	 * Unregister with the event system:
	 */
	if (event_unregister(handle) == 0) {
		fatal("could not unregister with event system");
	}

	return 0;
}
/*
 * Handle the events.
 *
 * Reads the object name, event type, optional TOKEN and argument
 * string from the notification, logs them, then dispatches:
 *
 *   START - if no linktest run is in progress, mark the daemon locked
 *           and fork a child that runs the linktest script in its own
 *           process group; otherwise just log that a run is active.
 *   KILL  - if a run is in progress, SIGTERM its process group.
 *
 * `data` is unused.
 */
static void
callback(event_handle_t handle, event_notification_t notification, void *data)
{
	char		objname[TBDB_FLEN_EVOBJTYPE];
	char		event[TBDB_FLEN_EVEVENTTYPE];
	char		args[BUFSIZ];
	int32_t		token32;

	if (! event_notification_get_objname(handle, notification,
					     objname, sizeof(objname))) {
		error("Could not get objname from notification!\n");
		return;
	}

	if (! event_notification_get_eventtype(handle, notification,
					       event, sizeof(event))) {
		error("Could not get event from notification!\n");
		return;
	}

	/*
	 * Fetch the token into a real int32_t. Writing through
	 * (int32_t *)&token would update only half of a 64-bit
	 * unsigned long and leave the other half of the ~0 sentinel
	 * in place.
	 *
	 * NOTE(review): assumes a nonzero return means the value was
	 * stored, as with the other getters above - confirm.
	 */
	if (event_notification_get_int32(handle, notification,
					 "TOKEN", &token32)) {
		token = (unsigned long) token32 & 0xffffffffUL;
	}

	/* Keep args a valid string even if no arguments are delivered. */
	args[0] = '\0';
	event_notification_get_arguments(handle,
					 notification, args, sizeof(args));

	info("event: %s - %s - %s\n", objname, event, args);

	/*
	 * Dispatch the event.
	 */
	if (strcmp(event, TBDB_EVENTTYPE_START) == 0) {
		if (locked) {
			info("linktest already in progress\n");
			return;
		}

		/*
		 * Set locked bit. The bit is not set to false
		 * until a SIGCHLD signal is received
		 */
		locked = TRUE;

		/*
		 * The Linktest script is not running, so
		 * fork a process and execute it.
		 */
		linktest_pid = fork();
		if (linktest_pid < 0) {
			error("Could not fork a process to run linktest script!\n");
			/*
			 * No child means no SIGCHLD will ever clear the
			 * lock, so release it here.
			 */
			locked = FALSE;
			return;
		}

		if (linktest_pid == 0) {
			/*
			 * Changes the process group of the child to itself
			 * so a sigkill to the child process group will not
			 * kill the Linktest daemon.
			 */
			setpgid(0, getpid());

			/* Finally, execute the linktest script. */
			exec_linktest(args, sizeof(args));

			/*
			 * Only reached if the exec failed. Never let the
			 * child fall back into the parent's event loop.
			 */
			_exit(127);
		}
	} else if (strcmp(event, TBDB_EVENTTYPE_KILL) == 0) {
		/*
		 * Ignore unless we are running.
		 *
		 * If KILL is received, there is a problem on this
		 * or some other node. So, kill off linktest
		 * and its children.
		 */
		if (locked) {
			send_group_kill();
		}
	}
}

/*
 * Handler for the TIME START subscription made in main().
 *
 * On a START event, SIGTERM any linktest run still in progress and
 * reset the saved token to the "no token" value (~0). `data` is unused.
 */
static void
start_callback(event_handle_t handle,
	       event_notification_t notification,
	       void *data)
{
	char		event[TBDB_FLEN_EVEVENTTYPE];

	if (! event_notification_get_eventtype(handle, notification,
					       event, sizeof(event))) {
		error("Could not get event from notification!\n");
		return;
	}

	if (strcmp(event, TBDB_EVENTTYPE_START) != 0) {
		return;
	}

	/*
	 * Ignore unless we are running; otherwise reset to a clean state.
	 */
	if (locked) {
		send_group_kill();
	}
	token = ~0;
}

/*
 * Executes Linktest with arguments received from the Linktest
 * start event. Does not return.
 */ 
347
static void
348
exec_linktest(char *args, int buflen) {
Timothy Stack's avatar
   
Timothy Stack committed
349
	char	   *word, *argv[MAX_ARGS], swapperarg[128], tokenarg[32];
350
351
352
353
354
355
356
357
	int	   i,res;

	/*
	 * Set up arguments for execv call by parsing contents
	 * of the event string.
	 */
	word = strtok(args," \t");
	i=1;
Timothy Stack's avatar
   
Timothy Stack committed
358
359
360
361
	sprintf(swapperarg, "SWAPPER=%s", swapper);
	argv[i++] = swapperarg;
	sprintf(tokenarg, "TOKEN=%lu", token);
	argv[i++] = tokenarg;
362
363
364
365
366
367
368
	do {
	  argv[i++] = word;
	} while ((word = strtok(NULL," \t"))
		 && (i<MAX_ARGS));
	argv[i] = NULL;
	argv[0] = LINKTEST_SCRIPT;

369
370
371
372
373
374
375
#ifdef __CYGWIN__
	/*
	 * Run as the swapper on Cygwin for access to the shared /proj dir.
	 */
	be_user(swapper);
#endif /* __CYGWIN__ */

376
377
378
379
380
381
382
383
	/*
	 * Execute the script with the arguments from the event
	 */
	res = execv( LINKTEST_SCRIPT,argv);
	if(res < 0) {
	    error("Could not execute the Linktest script.");
	    return;
	}
384
}
385

386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
/*
 * SIGCHLD handler: reap exited children, release the run lock and,
 * if the run failed, post a KILL event for the other nodes.
 *
 * If the exit_code is nonzero after a normal exit, the daemon sends
 * a KILL to let other nodes know that something has failed. A child
 * terminated by SIGTERM is not treated as a failure, since that is
 * likely the result of a KILL handled by this daemon.
 *
 * NOTE(review): info(), error() and the event calls made from here
 * are probably not async-signal-safe; moving this work out of the
 * handler would need a wider redesign.
 */
static
void sigchld_handler(int sig) {
	pid_t pid;
	int status;
	int exit_code = 0;
	/* Preserve errno for whatever code this signal interrupted. */
	int saved_errno = errno;

	(void) sig;

	/*
	 * WNOHANG: never block inside the handler. A return of 0 means
	 * children exist but none has exited yet.
	 */
	while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {

		if (WIFEXITED(status)) {
			/*
			 * If Linktest died due to an error, Perl will exit
			 * the script normally with an error code.
			 */
			exit_code = WEXITSTATUS(status);
			info("Linktest exit code: %d\n",exit_code);

			/*
			 * If this was an abnormal exit (exit status != 0)
			 * then we must send KILL to the process group of
			 * Linktest to kill its subchildren. However,
			 * it doesn't seem to hurt (cause kill errors) to
			 * send it anyway.
			 */
			send_group_kill();

		} else if (WIFSIGNALED(status)) {
			/*
			 * Linktest exited due to a signal, likely from
			 * this daemon. If that's the case, group_kill
			 * has already been sent.
			 */
			int termsig = WTERMSIG(status);

			info("Linktest killed by signal %d.\n", termsig);
			if (termsig != SIGTERM)
				exit_code = termsig;

		} else {
			/*
			 * Linktest is stopped unexpectedly.
			 */
			error("unexpected SIGCHLD received\n");
		}
	}
	if (pid < 0 && errno != ECHILD) {
		error("waitpid error\n");
	}

	/*
	 * Go ahead and unlock since Linktest and its children
	 * should all be killed and reaped now. Stay locked only when
	 * waitpid() reported a child that has not exited yet.
	 */
	if (pid != 0) {
		locked = FALSE;
	}

	/*
	 * Now let other nodes know about the problem, if any.
	 */
	if (exit_code) {
		info("Posting KILL event\n");
		send_kill_event();
	}

	errno = saved_errno;
}

static
void send_group_kill() {
        int res;
	
        /*
	 * Kill off all processes in the process group of the
	 * Linktest run. This may include the linktest script
	 * itself, and any children it forked.
	 */
468
	res = killpg(linktest_pid, SIGTERM);
469
470
471
472
473
474
475
476
477
478
479
	if(res < 0) {
	  /*
	   * Not a serious error, likely the process group
	   * has already exited.
	   */
	  return;
	}
}

static
void send_kill_event() {
Timothy Stack's avatar
   
Timothy Stack committed
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
	event_do(handle,
		 EA_Experiment, pideid,
		 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
		 EA_Name, "linktest",
		 EA_Event, TBDB_EVENTTYPE_KILL,
		 EA_TAG_DONE);
	if (token != ~0) {
		event_do(handle,
			 EA_Experiment, pideid,
			 EA_Type, TBDB_OBJECTTYPE_LINKTEST,
			 EA_Name, "linktest",
			 EA_Event, TBDB_EVENTTYPE_COMPLETE,
			 EA_ArgInteger, "ERROR", 1,
			 EA_ArgInteger, "CTOKEN", token,
			 EA_TAG_DONE);
		token = ~0;
496
497
	}
}