idlemail.in 16.2 KB
Newer Older
1
2
3
4
#!/usr/bin/perl -w

#
# EMULAB-COPYRIGHT
5
# Copyright (c) 2000-2003 University of Utah and the Flux Group.
6
7
8
# All rights reserved.
#

9
#
10
# idlemail - send mail to idle expts
11
12
13
14
15
16
17
18
#
# see usage message below for details
#
# This should get run out of cron, about every 5 minutes or so.
# Add an entry like this to /etc/crontab:
#
# */5     *       *       *       *       root    /usr/testbed/sbin/idlemail
#
19
20
21
# If you use a frequency other than 5 minutes, make sure to change the
# autowarn sections below, especially the query for who should be warned.
#
22
23
24
25

# Configure variables
use lib '@prefix@/lib';
use libdb;
26
use libtestbed;
27
28
29
use English;
use Getopt::Std;

30
31
32
33
34
# Grab our site variables...
my $mailinterval = TBGetSiteVar("idle/mailinterval");
my $threshold    = TBGetSiteVar("idle/threshold");
my $cc_grp_ldrs  = TBGetSiteVar("idle/cc_grp_ldrs");

35
36
sub help {
    die("Usage:
37
idlemail [-h] [-d] [-n] [[-f] <pid> <eid>]
38
39
 -h     Show this help message
 -d     Enable debugging/verbose output
40
 -n	No email sending. (for debugging only)
41
42
43
 -f	Force sending a message for <pid> <eid>

If <pid> and <eid> are supplied, send a swap request for that experiment.
44
45
Without -f, the message won't be sent if pid/eid has not been idle long
enough or if it hasn't been long enough since its last message.
46
47

idlemail runs periodically (via cron(8)) to send email messages
48
49
50
51
52
53
54
regarding experiments that are idle, starting after $threshold hours 
of inactivity, and sending another message every $mailinterval hours.

Current settings:
idle threshold = $threshold hours
mail interval  = $mailinterval hours
Start CC'ing group leaders on message $cc_grp_ldrs\n");
55
56
}

Mac Newbold's avatar
Mac Newbold committed
57
my $TB               = "@prefix@";
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
my $THISHOMEBASE     = "@THISHOMEBASE@";
my $TBBASE           = "@TBBASE@";
my $TBDOCBASE        = "@TBDOCBASE@";
my $TBMAILADDR_OPS   = "@TBOPSEMAIL@";
my $TBMAILADDR_WWW   = "@TBWWWEMAIL@";
my $TBMAILADDR_AUDIT = "@TBAUDITEMAIL@";
my $TBMAIL_OPS       = "Testbed Ops <$TBMAILADDR_OPS>";
my $TBMAIL_WWW       = "Testbed WWW <$TBMAILADDR_WWW>";
my $TBMAIL_AUDIT     = "Testbed Audit <$TBMAILADDR_AUDIT>";
my $TBMAIL_AUTOMAIL  = "@TBAUTOMAILEMAIL@";

# Turn off line buffering on output
$| = 1;

# Defaults

# Don't put 'my' on these, or they won't be settable with ${$var}
$h = 0; # help mode
$d = 0; # debug mode
$n = 0; # no-mail mode
$f = 0; # force mode

print "Got ARGV = ",join(" ",@ARGV),"\n" if $d;

82
my $optlist = "hdnf";
83
84
85
86
my %opt = ();
if (! getopts($optlist,\%opt)) { help(); }
# Copy the options into global vars
foreach $var (keys %opt) {
87
    ${$var} += $opt{$var};
88
89
90
91
92
93
    print "\$$var = $opt{$var} (".${$var}.")\n" if $d;
}

my $pid = shift || "";
my $eid = shift || "";

Mac Newbold's avatar
Mac Newbold committed
94
95
print "Settings: h=$h  d=$d  n=$n  f=$f  pid=$pid  eid=$eid\n".
  "Sitevars: thresh=$threshold interval=$mailinterval cc=$cc_grp_ldrs\n" if $d;
96
97
98
99
100
101
102
103
104
105
106

if ($h) { help(); }

# Only root or admin types!
if (($UID != 0) && (!TBAdmin($UID))) {
    die("Only root or TB administrators can run idlemail.\n");
}

if ($pid eq "" || $eid eq "") {
    # Normal mode

107
108
109
110
111
112
113
114
115
116
117
    my @idle=();
    # Grab a list of inactive expts, so we know who to reset when we're done
    my $sql = "select pid,eid from experiments where swap_requests > 0";
    my $q = DBQueryFatal($sql);
    while (%r = $q->fetchhash()) {
	$pid = $r{'pid'};
	$eid = $r{'eid'};
	push(@idle,"$pid:$eid");
	if ($d) { print "Was idle: $pid/$eid\n"; }
    }

118
119
120
    # Important note: this query only counts nodes in the
    # node_activity table, which are all local nodes. So no virt or
    # remote nodes get counted towards the node total.
121
    $sql  = <<EOT;
122
123
124
125
126
127
128
129
130
131
132
133
134
135
select r.pid, e.gid, r.eid, swappable, swap_requests,
round((unix_timestamp(now()) - unix_timestamp(last_swap_req))/3600,2)
as lastreq, count(r.node_id) as nodes,
round((unix_timestamp(now()) - unix_timestamp(max(
greatest(last_tty_act, last_net_act, last_cpu_act, last_ext_act)
)))/3600,2) as idle_time, max(greatest(last_tty_act, last_net_act,
last_cpu_act, last_ext_act)) as lastact,
(unix_timestamp(now()) - unix_timestamp(min(last_report))) as staleness
from node_activity as na left join reserved as r on na.node_id=r.node_id
left join experiments as e on r.pid=e.pid and r.eid=e.eid
where r.pid is not null and r.eid is not null and idle_ignore=0
group by pid,eid having idle_time >= $threshold and nodes > 0
order by pid,eid
EOT
136

137
    $q = DBQueryFatal($sql);
138
139
140

    if ($d) { print $q->as_string; $q->dataseek(0); }

141
    my @stillidle=();
142
143
    while (%r = $q->fetchhash()) {
	$pid = $r{'pid'};
144
	$gid = $r{'gid'};
145
	$eid = $r{'eid'};
146
147
148
149
	$swappable = $r{'swappable'};
	$swapreqs = $r{'swap_requests'};
	$lastreq = $r{'lastreq'};
	$nodes = $r{'nodes'};
150
	$time= $r{'idle_time'};
151
152
153
154
155
	$lastact= $r{'lastact'};
        $staleness = $r{'staleness'};
        if ($staleness >= 600) { # 10 minute stale limit
            $stale=1;
        }
156
	
157
158
159
160
	# If it is in the query, it is still idle, even if we don't
	# send a message this time through.
	push(@stillidle,"$pid:$eid");

161
162
163
	# We already know (from the query) that idletime>threshold.
	# So check the swap requests and time of last request, to make
	# sure we can send a message.
164

165
166
167
168
169
170
	if ($swapreqs == 0 || ($swapreqs > 0 && $lastreq > $mailinterval)) {
	    SendMessage($pid,$gid,$eid,$swappable,$swapreqs,$nodes,
			$time,$lastact,$stale);
	} elsif ($d) {
	    print "$pid/$eid got msg #$swapreqs only $lastreq hrs ago\n";
	}
171
    }
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

    # Now reset counters for expts that aren't idle anymore.
    foreach $expt (@idle) {
	my ($pid,$eid)=split(":",$expt);
	my $found=0;
	foreach $e (@stillidle) {
	    if ($e eq $expt) { $found=1; last; }
	}
	if (!$found) {
	    if ($d) { print "Not idle: $pid/$eid\n"; }
	    DBQueryFatal("update experiments set swap_requests='' ".
			 "where pid='$pid' and eid='$eid'");
	} else {
	    if ($d) { print "Still idle: $pid/$eid\n"; }
	}
    }

Mac Newbold's avatar
Mac Newbold committed
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
    # Next we need to check for stuff that needs to get swapped.
    # We need to find stuff to Idle-Swap, and stuff to Auto-Swap,
    # using two different queries.
    $sql  = <<EOT;
select e.pid,e.eid, idleswap_timeout,
(unix_timestamp(now()) - unix_timestamp(max(greatest(
last_tty_act,last_net_act,last_cpu_act,last_ext_act))))/60 as idlemin
from reserved as r left join experiments as e on e.pid=r.pid and e.eid=r.eid
left join node_activity as na on r.node_id=na.node_id
where idleswap !=0 and swappable>0 and idle_ignore=0
group by pid,eid having idlemin >= idleswap_timeout order by pid,eid
EOT
    $q = DBQueryFatal($sql);
    if ($d) { print $q->as_string; $q->dataseek(0); }
    while (%r = $q->fetchhash()) {
	$pid = $r{'pid'};
	$eid = $r{'eid'};
206
	system("$TB/sbin/idleswap -r -i $pid $eid > /dev/null") &&
Mac Newbold's avatar
Mac Newbold committed
207
208
209
210
211
212
213
214
215
216
217
218
219
220
	  warn("idlemail: Problem idleswapping $pid/$eid: $!\n");
    }

    $sql  = <<EOT;
select pid,eid,autoswap_timeout,
(unix_timestamp(now()) - unix_timestamp(expt_swapped))/60 as activemin
from experiments where swappable>0 and state="active" and autoswap>0
having activemin>=autoswap_timeout order by pid,eid
EOT
    $q = DBQueryFatal($sql);
    if ($d) { print $q->as_string; $q->dataseek(0); }
    while (%r = $q->fetchhash()) {
	$pid = $r{'pid'};
	$eid = $r{'eid'};
221
	system("$TB/sbin/idleswap -r -a $pid $eid > /dev/null") &&
Mac Newbold's avatar
Mac Newbold committed
222
223
224
	  warn("idlemail: Problem autoswapping $pid/$eid: $!\n");
    }

225
226
    # Now send warning messages to those who will get autoswapped soon
    my $warnmin = 60; # minutes before autoswap that we warn them
Mac Newbold's avatar
Mac Newbold committed
227
    my $window = 5; # same as idlemail frequency in cron
228
229
230
231
232
233
234
235
236
    $sql  = <<EOT;
select pid,eid,autoswap_timeout,
(unix_timestamp(now()) - unix_timestamp(expt_swapped))/60 as activemin
from experiments where swappable>0 and state="active" and autoswap>0
having activemin+$warnmin>=autoswap_timeout and
activemin+$warnmin<=autoswap_timeout+$window order by pid,eid
EOT
    $q = DBQueryFatal($sql);
    if ($d) { print $q->as_string; $q->dataseek(0); }
237
238
239
240
241
242
243
    # enable for extra debugging
    if (1 && $q->numrows()>0) {
	# SENDMAIL(To, Subject, Message, [From], [More Headers],...)
	SENDMAIL("Mac <newbold\@flux.utah.edu>","idlemail warnings",
		 $q->as_string);
	$q->dataseek(0);
    }
244
245
246
247
248
    while (%r = $q->fetchhash()) {
	# These get an autowarn message
	$pid = $r{'pid'};
	$eid = $r{'eid'};
	# all options ignored but autoswap and warnmin and ids
249
	SendMessage($pid,$pid,$eid,0,0,0,0,0,0,1,$warnmin);
250
251
    }

252
} else {
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
    # pid/eid mode - only check pid/eid, and let $f force sending,
    # even if msg was sent recently or expt isn't idle long enough.

    if ($d) {
	print "Checking $pid/$eid only... force is $f\n";
    }

    # Important note: this query only counts nodes in the
    # node_activity table, which are all local nodes. So no virt or
    # remote nodes get counted towards the node total.

    # diffs from the normal query: don't restrict based on idleness
    # or idle_ignore, and only grab our expt

    my $sql  = <<EOT;
select r.pid, e.gid, r.eid, swappable, swap_requests, idle_ignore,
round((unix_timestamp(now()) - unix_timestamp(last_swap_req))/3600,2)
as lastreq, count(r.node_id) as nodes,
round((unix_timestamp(now()) - unix_timestamp(max(
greatest(last_tty_act, last_net_act, last_cpu_act, last_ext_act)
)))/3600,2) as idle_time, max(greatest(last_tty_act, last_net_act,
last_cpu_act, last_ext_act)) as lastact,
(unix_timestamp(now()) - unix_timestamp(min(last_report))) as staleness
from node_activity as na left join reserved as r on na.node_id=r.node_id
left join experiments as e on r.pid=e.pid and r.eid=e.eid
where r.pid='$pid' and r.eid='$eid'
group by pid,eid having nodes > 0
EOT
281

282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
    my $q = DBQueryFatal($sql);

    if ($d) { print $q->as_string; $q->dataseek(0); }

    if (%r = $q->fetchhash()) {
	$pid = $r{'pid'};
	$gid = $r{'gid'};
	$eid = $r{'eid'};
	$swappable = $r{'swappable'};
	$ignore = $r{'idle_ignore'};
	$swapreqs = $r{'swap_requests'};
	$lastreq = $r{'lastreq'};
	$nodes = $r{'nodes'};
	$time= $r{'idle_time'};
	$lastact= $r{'lastact'};
        $staleness = $r{'staleness'};
        if ($staleness >= 600) { # 10 minute stale limit
            $stale=1;
        }
	
	# We don't know (from the query) that idletime>threshold.  So
	# check that we're either forcing, or that it is idle, and
	# then check the swap requests and time of last request, to
	# make sure we can send a message.
306

307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
	if ($f || ($time > $threshold && !$ignore &&
		   ($swapreqs == 0 ||
		    ($swapreqs > 0 && $lastreq > $mailinterval)))) {
	    SendMessage($pid,$gid,$eid,$swappable,$swapreqs,$nodes,
			$time,$lastact,$stale);
	} else {
	    if ($d) {
		print "$pid/$eid: no msg (idle $time hrs, ".
		  "ignore=$ignore, msg #$swapreqs $lastreq hrs ago)\n";
	    }
	    # no message sent for whatever reason
	    exit(2);
	}
    } else {
	# expt didn't exist, or didn't have any nodes in node_activity
	exit(1);
    }
324
325
326
327
328
}

exit(0);

sub SendMessage {
329
    my ($pid,$gid,$eid,$swappable,$swapreqs,$c,$time,$lastact,$stale,
330
	$autowarn,$warnmin) = @_;
331
332
333
334
335
336
337
338
339
    # enable extra debugging...
    if (0) {
	SENDMAIL("Mac <newbold\@flux.utah.edu>","idlemail warnings",
		 "idlemail: send message(".join(",",@_)."\n".
		 "pid=$pid,gid=$gid,eid=$eid\nswappable=$swappable,".
		 "swapreqs=$swapreqs,c=$c,time=$time,lastact=$lastact,".
		 "stale=$stale\nautowarn=$autowarn,warnmin=$warnmin\n".
		 "Date: ".`date`);
    }
340
    if (!defined($autowarn)) { $autowarn=0; }
341
342
343
344
    $idlehrs = int($time);
    $idlemin = int(($time-$idlehrs)*60);

    if ($d) {
345
346
347
348
349
350
	if ($autowarn) {
	    print "Sending warning message to $pid/$eid before autoswap\n";
	} else {
	    print "Sending message to $pid/$eid, ".
	      "idle $idlehrs hrs $idlemin min, total $time hrs\n";
	}
351
352
353
354
    }

    my $expleader_name;
    my $expleader_email;
Mac Newbold's avatar
Mac Newbold committed
355
356
    my $uid = ExpSwapper($pid,$eid);
    my $uid2 = ExpLeader($pid,$eid);
357
    if ($uid eq "") { $uid=$uid2; }
358
    UserDBInfo($uid,\$expleader_name,\$expleader_email);
Mac Newbold's avatar
Mac Newbold committed
359
360
361
    if ($uid ne $uid2) {
	UserDBInfo($uid2,\$expcreator_name,\$expcreator_email);
    }
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
    my $leaders = TBLeaderMailList($pid,$gid);
    if ($d > 1) {
	print "expt=$pid/$eid (gid=$gid)\n".
	  "uid=$uid  ($expleader_name <$expleader_email>)\n".
	  "leaders=$leaders\n";
    }

    my $wrapwidth=75;

    # Important note about our wordwrapper:
    # It does funkyness with strings that already have some newlines in 
    # them, most especially with \n\n embedded. It also adds a final \n
    # to the end of the string it wraps.

    $msg="Hi, this is an important automated message from $THISHOMEBASE.\n\n";
    $msg .=
      wordwrap("It appears that the $c node".($c!=1?"s":"").
	       " in your experiment ".
	       "'$eid' ".($c!=1?"have":"has")." been inactive for ".
	       "$idlehrs hours, $idlemin minutes, since $lastact. ".
	       ( $stale
		 ? ("(This message may be based on incomplete or ".
		    "stale information. ".
		    "Contact Testbed Ops if this message is a mistake.) ")
		 : "").
	       ( $swapreqs > 0
		 ? ("You have been sent ".$swapreqs." other message".
		    ($swapreqs!=1?"s":"")." since this experiment ".
		    "became idle. ")
		 : "").
	       ($swappable ?
		("This experiment is marked as swappable, so it may be ".
		 "automatically swapped out by $THISHOMEBASE or its ".
		 "operational staff. ") :
		("This experiment has not been marked swappable, so it ".
		 "will not be automatically swapped out. ")), $wrapwidth);
    $msg .= "\n".
      wordwrap("We would appreciate it if you could either terminate or ".
	       "swap this experiment out so that the nodes will be ".
	       "available for use by other experimenters. You can do this ".
	       "by logging into the $THISHOMEBASE Web Interface, and using ".
	       "the swap or terminate links on this page:",$wrapwidth);
404
405
406
407
408
409
410
411
412
    if ($autowarn) {
	# We're sending a different kind of message...
	# Fix the first half, and the rest is the same as the other message.
	$msg="Hi, this is an important automated message from $THISHOMEBASE.".
	  "\n\nYou scheduled your experiment $pid/$eid to be Auto-Swapped\n".
	  "in about $warnmin minutes, whether it is in active use or not.\n".
	  "If you would like to change the timing of the Auto-Swap, please \n".
	  "use the Edit option on this page:\n";
    }
413
414
415
416
417
418
419
420
421
422
423
424
425
426
    $msg .= "\n$TBBASE/showexp.php3?pid=$pid&eid=$eid\n\n";
    $msg .=
      wordwrap("More information on experiment swapping is available ".
	       "in the $THISHOMEBASE FAQ at",$wrapwidth);
    $msg .= "$TBDOCBASE/faq.php3#UTT-Swapping\n\n";
    $msg .=
      wordwrap("More information on our node usage policy is available at ",
	       $wrapwidth);
    $msg .= "$TBDOCBASE/docwrapper.php3?docname=swapping.html\n\n";
    $msg .=
      wordwrap("If you feel this message is in error then please contact ".
	       "Testbed Operations <$TBMAILADDR_OPS>.",$wrapwidth);
    $msg .= "\nThanks!\nTestbed Operations\n";

Mac Newbold's avatar
Mac Newbold committed
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
    # expleader is really the swapper here
    # the real leader is expcreator
    $cclist="";
    if ($uid ne $uid2) {
	# creator and swapper are different
	$cclist="Cc: $expcreator_name <$expcreator_email>";
    }
    if ($swapreqs+1 >= $cc_grp_ldrs) {
	if ($cclist ne "") {
	    $cclist .= ", $leaders\n";
	} else {
	    $cclist = "Cc: $leaders\n"
	}
    } elsif ($cclist ne "") {
	$cclist .="\n";
    }
443

444
445
446
447
448
    if ($autowarn) {
	$subj="Auto-Swap Warning: $pid/$eid";
    } else {
	$subj="$c PC".($c!=1?"s":"")." idle $idlehrs hours: $pid/$eid";
    }
449
450
451
452
453
    if ($n) {
	# no mail mode: don't send mail or update db counters
	print "----NO-MAIL-MODE----\n";
	print "To: $expleader_name <$expleader_email>\n";
	print "From: $TBMAIL_OPS\n".
Mac Newbold's avatar
Mac Newbold committed
454
455
456
	  $cclist.
	  "Bcc: $TBMAIL_AUTOMAIL\n".
	  "Errors-To: $TBMAIL_WWW"."\n";
457
	print "Subject: $subj\n";
458
459
460
461
462
463
	print "\n$msg\n";
	print "----NO-MAIL-MODE----\n";
    } else {
	# libtestbed SENDMAIL syntax:
	# SENDMAIL(To, Subject, Message, [From], [More Headers],...)

Mac Newbold's avatar
Mac Newbold committed
464

465
466
467
	# For debugging:
	#SENDMAIL("Expt Leader <$TBMAILADDR_OPS>",
	SENDMAIL("$expleader_name <$expleader_email>",
468
		 $subj,
469
470
		 $msg,
		 "$TBMAIL_OPS",
Mac Newbold's avatar
Mac Newbold committed
471
		 $cclist.
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
		 "Bcc: $TBMAIL_AUTOMAIL\n".
		 "Errors-To: $TBMAIL_WWW");

	# Update the count and the time in the database
	DBQueryWarn("update experiments set swap_requests= swap_requests+1,
                 last_swap_req=now() where pid='$pid' and eid='$eid';");
    }

}

sub wordwrap($$) {
    # Perl version of the PHP wordwrap function.
    # Got the one-liner at http://www.consistent.org/terran/misc.shtml

    my ($str,$width) = @_;

    # The one liner sometimes produces spurious undefined values warnings, 
    # so we'll temporarily disable it in this function only
    local $WARNING = 0;

    if ($d > 1) { print "WRAPPING: $str => $width\n"; }
    $str=~s/(?:^|\G\n?)(?:(.{1,$width})(?:\s|\n|$)|(\S{$width})|\n)/$1$2\n/sg;
    if ($d > 1) { print "WRAPPING: => \n$str\n"; }

    return $str;
}