select.c 25.4 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

17
#include <linux/kernel.h>
18
#include <linux/sched.h>
Linus Torvalds's avatar
Linus Torvalds committed
19
#include <linux/syscalls.h>
20
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
21
22
23
24
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
25
#include <linux/fdtable.h>
Linus Torvalds's avatar
Linus Torvalds committed
26
#include <linux/fs.h>
27
#include <linux/rcupdate.h>
28
#include <linux/hrtimer.h>
29
#include <linux/sched/rt.h>
30
#include <linux/freezer.h>
31
#include <net/busy_poll.h>
Linus Torvalds's avatar
Linus Torvalds committed
32
33
34

#include <asm/uaccess.h>

35
36
37
38
39
40
41
42
43
44
45
46
47

/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

48
49
#define MAX_SLACK	(100 * NSEC_PER_MSEC)

50
static long __estimate_accuracy(struct timespec *tv)
51
{
52
	long slack;
53
54
	int divfactor = 1000;

55
56
57
	if (tv->tv_sec < 0)
		return 0;

58
	if (task_nice(current) > 0)
59
60
		divfactor = divfactor / 5;

61
62
63
	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

64
65
66
	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

67
68
	if (slack > MAX_SLACK)
		return MAX_SLACK;
69

70
71
72
	return slack;
}

73
long select_estimate_accuracy(struct timespec *tv)
74
75
76
77
78
79
80
81
{
	unsigned long ret;
	struct timespec now;

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

82
	if (rt_task(current))
83
84
85
86
87
88
89
90
91
92
93
94
		return 0;

	ktime_get_ts(&now);
	now = timespec_sub(*tv, now);
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



Linus Torvalds's avatar
Linus Torvalds committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/*
 * One page worth of poll_table_entry slots; pages are chained via @next
 * and filled front to back, @entry pointing at the next free slot.
 *
 * Use a C99 flexible array member instead of the GNU zero-length array
 * extension for the trailing entries storage; layout is identical.
 */
struct poll_table_page {
	struct poll_table_page * next;
	struct poll_table_entry * entry;
	struct poll_table_entry entries[];
};

/* True when the page has no free poll_table_entry slot left. */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait() make all the
 * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
Adrian Bunk's avatar
Adrian Bunk committed
116
117
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);
Linus Torvalds's avatar
Linus Torvalds committed
118
119
120
121

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
Tejun Heo's avatar
Tejun Heo committed
122
	pwq->polling_task = current;
123
	pwq->triggered = 0;
Linus Torvalds's avatar
Linus Torvalds committed
124
125
	pwq->error = 0;
	pwq->table = NULL;
126
	pwq->inline_index = 0;
Linus Torvalds's avatar
Linus Torvalds committed
127
128
129
}
EXPORT_SYMBOL(poll_initwait);

130
131
static void free_poll_entry(struct poll_table_entry *entry)
{
WANG Cong's avatar
WANG Cong committed
132
	remove_wait_queue(entry->wait_address, &entry->wait);
133
134
135
	fput(entry->filp);
}

Linus Torvalds's avatar
Linus Torvalds committed
136
137
138
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
139
140
141
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
Linus Torvalds's avatar
Linus Torvalds committed
142
143
144
145
146
147
148
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
149
			free_poll_entry(entry);
Linus Torvalds's avatar
Linus Torvalds committed
150
151
152
153
154
155
156
157
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

Tejun Heo's avatar
Tejun Heo committed
158
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
Linus Torvalds's avatar
Linus Torvalds committed
159
160
161
{
	struct poll_table_page *table = p->table;

162
163
164
	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

Linus Torvalds's avatar
Linus Torvalds committed
165
166
167
168
169
170
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
171
			return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
172
173
174
175
176
177
178
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

179
180
181
	return table->entry++;
}

182
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with set_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

208
209
210
211
212
213
214
215
216
217
/*
 * Waitqueue callback: suppress wakeups whose event key has no overlap
 * with the events this entry registered interest in.
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);

	if (key && !((unsigned long)key & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

218
219
220
221
/*
 * Add a new entry: grab a poll_table_entry, take a reference on the
 * file, and hook the entry (with pollwake as the wakeup function) onto
 * the waitqueue the driver handed us.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);

	/* Allocation failure was already recorded in pwq->error. */
	if (!entry)
		return;

	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

Tejun Heo's avatar
Tejun Heo committed
234
235
236
237
238
239
240
/*
 * Sleep until a registered waitqueue fires (pwq->triggered), the timer
 * expires, or a signal arrives.  Returns 0 on timer expiry, -EINTR
 * otherwise.
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = freezable_schedule_hrtimeout_range(expires, slack,
							HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following set_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	set_mb(pwq->triggered, 0);

	return rc;
}
EXPORT_SYMBOL(poll_schedule_timeout);

262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
{
	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec_valid(&ts))
		return -EINVAL;

	if (!sec && !nsec) {
		/* Optimize for the zero timeout value here */
		to->tv_sec = to->tv_nsec = 0;
	} else {
		/* Convert the relative timeout to an absolute expiry time. */
		ktime_get_ts(to);
		*to = timespec_add_safe(*to, ts);
	}
	return 0;
}

/*
 * Write the unslept part of the timeout back to user space (as a
 * timeval when @timeval is set, else as a timespec), honouring the
 * STICKY_TIMEOUTS personality flag which suppresses the update.
 */
static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec rts;
	struct timeval rtv;

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	/* Remaining time = absolute expiry minus "now", clamped at zero. */
	ktime_get_ts(&rts);
	rts = timespec_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;

	if (timeval) {
		/* Don't leak stack padding when timeval has extra bytes. */
		if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
			memset(&rtv, 0, sizeof(rtv));
		rtv.tv_sec = rts.tv_sec;
		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;

		if (!copy_to_user(p, &rtv, sizeof(rtv)))
			return ret;

	} else if (!copy_to_user(p, &rts, sizeof(rts)))
		return ret;

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

Linus Torvalds's avatar
Linus Torvalds committed
337
338
339
340
341
342
343
344
345
346
347
#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

/* Union of the three interest bitmaps at long-word index n. */
#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Scan the fd_set_bits backwards from @n to find the highest fd the
 * caller is actually interested in.  Returns that bound (suitable as a
 * new, smaller n), or -EBADF if any requested fd is not open.
 * Caller must hold rcu_read_lock() for the files_fdtable() access.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last in-complete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			/* Every interesting fd in this word must be open. */
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	/* Then walk the full long-words from high to low. */
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		if (max)
			continue;
get_max:
		/* Highest set bit in this word gives the bound within it. */
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

389
static inline void wait_key_set(poll_table *wait, unsigned long in,
390
391
				unsigned long out, unsigned long bit,
				unsigned int ll_flag)
392
{
393
	wait->_key = POLLEX_SET | ll_flag;
394
395
396
397
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;
398
399
}

400
/*
 * Core of select(): poll every fd named in @fds up to @n, recording
 * ready descriptors in the res_* bitmaps.  Returns the count of ready
 * events, 0 on timeout, or a negative error.  @end_time is an absolute
 * expiry time, NULL for "wait forever", and {0,0} for "don't wait".
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	unsigned long slack = 0;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_end = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		/* Zero timeout: poll once, never register waiters. */
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* Nothing requested in this word, skip it. */
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				f = fdget(i);
				if (f.file) {
					const struct file_operations *f_op;
					f_op = f.file->f_op;
					mask = DEFAULT_POLLMASK;
					if (f_op && f_op->poll) {
						wait_key_set(wait, in, out,
							     bit, busy_flag);
						mask = (*f_op->poll)(f.file, wait);
					}
					fdput(f);
					/*
					 * Once something is ready, stop
					 * registering further waiters.
					 */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
						wait->_qproc = NULL;
					}
					/* got something, stop busy polling */
					if (retval) {
						can_busy_loop = false;
						busy_flag = 0;

					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					} else if (busy_flag & mask)
						can_busy_loop = true;

				}
			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		/* All waiters are registered; don't re-add them next pass. */
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;
			}
			if (!busy_loop_timeout(busy_end))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
548
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
549
			   fd_set __user *exp, struct timespec *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
550
551
{
	fd_set_bits fds;
Andrew Morton's avatar
Andrew Morton committed
552
	void *bits;
553
	int ret, max_fds;
554
	unsigned int size;
555
	struct fdtable *fdt;
556
	/* Allocate small arguments on the stack to save memory and be faster */
557
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
Linus Torvalds's avatar
Linus Torvalds committed
558
559
560
561
562

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

563
	/* max_fds can increase, so grab it once to avoid race */
564
	rcu_read_lock();
565
	fdt = files_fdtable(current->files);
566
	max_fds = fdt->max_fds;
567
	rcu_read_unlock();
568
569
	if (n > max_fds)
		n = max_fds;
Linus Torvalds's avatar
Linus Torvalds committed
570
571
572
573
574
575
576

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words. 
	 */
	size = FDS_BYTES(n);
577
578
579
580
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
581
		bits = kmalloc(6 * size, GFP_KERNEL);
582
583
584
		if (!bits)
			goto out_nofds;
	}
Andrew Morton's avatar
Andrew Morton committed
585
586
587
588
589
590
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;
Linus Torvalds's avatar
Linus Torvalds committed
591
592
593
594
595
596
597
598
599

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

600
	ret = do_select(n, &fds, end_time);
Linus Torvalds's avatar
Linus Torvalds committed
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
617
618
	if (bits != stack_fds)
		kfree(bits);
Linus Torvalds's avatar
Linus Torvalds committed
619
620
621
622
out_nofds:
	return ret;
}

623
624
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		/* Normalize a possibly denormalized user timeval. */
		to = &end_time;
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	/* Report the time not slept back to user space. */
	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

	return ret;
}

647
648
649
/*
 * pselect() backend: like select() but with a nanosecond timeout and an
 * optional temporary signal mask, restored atomically around signal
 * delivery via the saved_sigmask mechanism.
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP can never be blocked. */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (ret == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
701
702
703
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
704
705
706
707
708
709
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
710
		    || __get_user(up, (sigset_t __user * __user *)sig)
711
		    || __get_user(sigsetsize,
712
				(size_t __user *)(sig+sizeof(void *))))
713
714
715
			return -EFAULT;
	}

716
	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
717
718
}

719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Legacy single-argument select(): all parameters packed in a struct. */
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	/* Unpack and forward to the modern 5-argument syscall. */
	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
736
737
738
739
740
741
742
743
/*
 * Chunk of the user's pollfd array, chained when nfds exceeds one
 * chunk.  Use a C99 flexible array member instead of the GNU
 * zero-length array extension; layout is identical.
 */
struct poll_list {
	struct poll_list *next;
	int len;			/* number of valid entries[] */
	struct pollfd entries[];
};

/* How many pollfds fit in one page-sized poll_list chunk. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

744
745
746
747
748
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
749
 * if pwait->_qproc is non-NULL.
750
 */
751
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
752
753
				     bool *can_busy_poll,
				     unsigned int busy_flag)
Linus Torvalds's avatar
Linus Torvalds committed
754
{
755
756
757
758
759
760
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
761
		struct fd f = fdget(fd);
762
		mask = POLLNVAL;
763
		if (f.file) {
764
			mask = DEFAULT_POLLMASK;
765
			if (f.file->f_op && f.file->f_op->poll) {
766
				pwait->_key = pollfd->events|POLLERR|POLLHUP;
767
				pwait->_key |= busy_flag;
768
				mask = f.file->f_op->poll(f.file, pwait);
769
770
				if (mask & busy_flag)
					*can_busy_poll = true;
771
			}
772
773
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
774
			fdput(f);
Linus Torvalds's avatar
Linus Torvalds committed
775
776
		}
	}
777
778
779
	pollfd->revents = mask;

	return mask;
Linus Torvalds's avatar
Linus Torvalds committed
780
781
782
}

static int do_poll(unsigned int nfds,  struct poll_list *list,
783
		   struct poll_wqueues *wait, struct timespec *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
784
785
{
	poll_table* pt = &wait->pt;
786
787
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
788
	unsigned long slack = 0;
789
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
790
	unsigned long busy_end = 0;
Linus Torvalds's avatar
Linus Torvalds committed
791

792
	/* Optimise the no-wait case */
793
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
794
		pt->_qproc = NULL;
795
796
		timed_out = 1;
	}
797

798
	if (end_time && !timed_out)
799
		slack = select_estimate_accuracy(end_time);
800

Linus Torvalds's avatar
Linus Torvalds committed
801
802
	for (;;) {
		struct poll_list *walk;
803
		bool can_busy_loop = false;
804

805
806
807
808
809
810
811
812
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
813
				 * and kill poll_table->_qproc, so we don't
814
815
816
817
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
818
819
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
820
					count++;
821
					pt->_qproc = NULL;
822
823
824
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
825
826
				}
			}
Linus Torvalds's avatar
Linus Torvalds committed
827
		}
828
829
		/*
		 * All waiters have already been registered, so don't provide
830
		 * a poll_table->_qproc to them on the next loop iteration.
831
		 */
832
		pt->_qproc = NULL;
833
834
835
836
837
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
838
		if (count || timed_out)
Linus Torvalds's avatar
Linus Torvalds committed
839
			break;
840

841
		/* only if found POLL_BUSY_LOOP sockets && not out of time */
842
843
844
845
846
847
848
849
850
		if (can_busy_loop && !need_resched()) {
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;
			}
			if (!busy_loop_timeout(busy_end))
				continue;
		}
		busy_flag = 0;
851

852
853
854
855
856
857
858
859
		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec_to_ktime(*end_time);
			to = &expire;
860
861
		}

Tejun Heo's avatar
Tejun Heo committed
862
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
863
			timed_out = 1;
Linus Torvalds's avatar
Linus Torvalds committed
864
865
866
867
	}
	return count;
}

868
869
870
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

871
872
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
873
874
{
	struct poll_wqueues table;
875
 	int err = -EFAULT, fdcount, len, size;
876
877
878
879
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
880
881
882
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
Linus Torvalds's avatar
Linus Torvalds committed
883

Jiri Slaby's avatar
Jiri Slaby committed
884
	if (nfds > rlimit(RLIMIT_NOFILE))
Linus Torvalds's avatar
Linus Torvalds committed
885
886
		return -EINVAL;

887
888
889
890
891
892
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
Linus Torvalds's avatar
Linus Torvalds committed
893

894
895
896
897
898
899
900
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;
Linus Torvalds's avatar
Linus Torvalds committed
901

902
903
904
905
906
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
907
908
909
			goto out_fds;
		}
	}
910

911
	poll_initwait(&table);
912
	fdcount = do_poll(nfds, head, &table, end_time);
913
	poll_freewait(&table);
Linus Torvalds's avatar
Linus Torvalds committed
914

915
	for (walk = head; walk; walk = walk->next) {
Linus Torvalds's avatar
Linus Torvalds committed
916
917
918
		struct pollfd *fds = walk->entries;
		int j;

919
920
		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
Linus Torvalds's avatar
Linus Torvalds committed
921
922
				goto out_fds;
  	}
923

Linus Torvalds's avatar
Linus Torvalds committed
924
925
	err = fdcount;
out_fds:
926
927
928
929
930
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
Linus Torvalds's avatar
Linus Torvalds committed
931
	}
932

Linus Torvalds's avatar
Linus Torvalds committed
933
934
	return err;
}
935

936
937
static long do_restart_poll(struct restart_block *restart_block)
{
938
939
940
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec *to = NULL, end_time;
941
942
	int ret;

943
944
945
946
947
948
949
950
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

951
952
953
954
955
956
957
	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

958
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	/* Negative timeout means "wait forever" (to stays NULL). */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/* Save state so the syscall can be transparently restarted. */
		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

992
993
994
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	sigset_t ksigmask, sigsaved;
	struct timespec ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP can never be blocked. */
		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = do_sys_poll(ufds, nfds, to);

	/* We can restart this syscall, usually */
	if (ret == -EINTR) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &sigsaved,
					sizeof(sigsaved));
			set_restore_sigmask();
		}
		ret = -ERESTARTNOHAND;
	} else if (sigmask)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	return ret;
}