sock.c 55.4 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
12
 * Authors:	Ross Biro
Linus Torvalds's avatar
Linus Torvalds committed
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
37
 *					code. The ACK stuff can wait and needs major
Linus Torvalds's avatar
Linus Torvalds committed
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

94
#include <linux/capability.h>
Linus Torvalds's avatar
Linus Torvalds committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
114
#include <linux/highmem.h>
Linus Torvalds's avatar
Linus Torvalds committed
115
116
117
118
119
120
121

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
122
#include <net/net_namespace.h>
123
#include <net/request_sock.h>
Linus Torvalds's avatar
Linus Torvalds committed
124
125
126
127
128
129
130
131
132
133
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

134
135
136
137
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
157
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
158
159
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
160
161
162
163
164
165
166
167
168
169
170
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172
173
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_MAX"
174
};
175
176
177
178
179
180
181
182
183
184
185
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-29"          ,
186
187
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_MAX"
188
};
189
#endif
190
191
192
193
194
195
196

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

Linus Torvalds's avatar
Linus Torvalds committed
197
198
199
200
201
202
203
204
205
206
207
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
208
209
210
211
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds's avatar
Linus Torvalds committed
212
213

/* Maximal space eaten by iovec or ancilliary data plus some space */
214
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Linus Torvalds's avatar
Linus Torvalds committed
215
216
217
218
219
220
221
222
223

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
224
225
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;
Linus Torvalds's avatar
Linus Torvalds committed
226

227
	if (tv.tv_sec < 0) {
228
229
		static int warned __read_mostly;

230
231
232
233
234
		*timeo_p = 0;
		if (warned < 10 && net_ratelimit())
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
235
				current->comm, task_pid_nr(current));
236
237
		return 0;
	}
Linus Torvalds's avatar
Linus Torvalds committed
238
239
240
241
242
243
244
245
246
247
248
249
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
250
251
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm,  current->comm);
Linus Torvalds's avatar
Linus Torvalds committed
252
253
254
255
256
257
258
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
259
260
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
Linus Torvalds's avatar
Linus Torvalds committed
261
262
263
264
265
266
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


267
268
269
270
271
272
273
274
275
276
277
278
279
280
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = 0;
	int skb_len;

	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
	   number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		err = -ENOMEM;
		goto out;
	}

281
	err = sk_filter(sk, skb);
282
283
284
	if (err)
		goto out;

285
286
287
288
289
	if (!sk_rmem_schedule(sk, skb->truesize)) {
		err = -ENOBUFS;
		goto out;
	}

290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
out:
	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

309
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
310
311
312
{
	int rc = NET_RX_SUCCESS;

313
	if (sk_filter(sk, skb))
314
315
316
317
		goto discard_and_relse;

	skb->dev = NULL;

318
319
320
321
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
322
323
324
325
326
327
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

328
		rc = sk->sk_backlog_rcv(sk, skb);
329
330
331

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

371
372
373
374
static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
375
	struct net *net = sock_net(sk);
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	if (devname[0] == '\0') {
		index = 0;
	} else {
404
		struct net_device *dev = dev_get_by_name(net, devname);
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426

		ret = -ENODEV;
		if (!dev)
			goto out;

		index = dev->ifindex;
		dev_put(dev);
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

427
428
429
430
431
432
433
434
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

Linus Torvalds's avatar
Linus Torvalds committed
435
436
437
438
439
440
441
442
443
444
445
446
447
/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk=sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;
448

Linus Torvalds's avatar
Linus Torvalds committed
449
450
451
452
453
	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
454
455
456
457
458
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
459
	}
460
#endif
461

462
463
464
	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

465
466
	if (optlen < sizeof(int))
		return -EINVAL;
467

Linus Torvalds's avatar
Linus Torvalds committed
468
469
	if (get_user(val, (int __user *)optval))
		return -EFAULT;
470
471

	valbool = val?1:0;
Linus Torvalds's avatar
Linus Torvalds committed
472
473
474

	lock_sock(sk);

475
476
477
478
	switch(optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN)) {
			ret = -EACCES;
479
480
		} else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
481
482
483
484
485
486
487
488
489
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
490
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
491
492
493
494
495
496
497
498
499
500
501
502
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
503
set_sndbuf:
504
505
506
507
508
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;
Linus Torvalds's avatar
Linus Torvalds committed
509

510
511
512
513
514
515
		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;
Linus Torvalds's avatar
Linus Torvalds committed
516

517
518
519
520
521
522
	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;
523

524
525
526
527
528
	case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */
529

530
531
		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
532
set_rcvbuf:
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
Linus Torvalds's avatar
Linus Torvalds committed
558
			break;
559
560
		}
		goto set_rcvbuf;
Linus Torvalds's avatar
Linus Torvalds committed
561

562
	case SO_KEEPALIVE:
Linus Torvalds's avatar
Linus Torvalds committed
563
#ifdef CONFIG_INET
564
565
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
Linus Torvalds's avatar
Linus Torvalds committed
566
#endif
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
Linus Torvalds's avatar
Linus Torvalds committed
588
			break;
589
590
591
		}
		if (copy_from_user(&ling,optval,sizeof(ling))) {
			ret = -EFAULT;
Linus Torvalds's avatar
Linus Torvalds committed
592
			break;
593
594
595
596
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
Linus Torvalds's avatar
Linus Torvalds committed
597
#if (BITS_PER_LONG == 32)
598
599
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
Linus Torvalds's avatar
Linus Torvalds committed
600
			else
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
619
	case SO_TIMESTAMPNS:
620
		if (valbool)  {
621
622
623
624
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
625
626
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk);
627
		} else {
628
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
629
630
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;
Linus Torvalds's avatar
Linus Torvalds committed
646

647
648
649
650
	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;
Linus Torvalds's avatar
Linus Torvalds committed
651

652
653
			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds's avatar
Linus Torvalds committed
654
				break;
655
656
657
658
659
660

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
661
		ret = sk_detach_filter(sk);
662
		break;
Linus Torvalds's avatar
Linus Torvalds committed
663

664
665
666
667
668
669
	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
670
671
672
673
674
675
676
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			sk->sk_mark = val;
		}
		break;
677

Linus Torvalds's avatar
Linus Torvalds committed
678
679
		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
680
681
682
	default:
		ret = -ENOPROTOOPT;
		break;
683
	}
Linus Torvalds's avatar
Linus Torvalds committed
684
685
686
687
688
689
690
691
692
	release_sock(sk);
	return ret;
}


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
693

694
	union {
695
696
		int val;
		struct linger ling;
Linus Torvalds's avatar
Linus Torvalds committed
697
698
		struct timeval tm;
	} v;
699

Linus Torvalds's avatar
Linus Torvalds committed
700
701
	unsigned int lv = sizeof(int);
	int len;
702

703
	if (get_user(len, optlen))
704
		return -EFAULT;
705
	if (len < 0)
Linus Torvalds's avatar
Linus Torvalds committed
706
		return -EINVAL;
707

708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
	switch(optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val==0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
770
771
772
773
774
775
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
		break;

	case SO_RCVTIMEO:
		lv=sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv=sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;
Linus Torvalds's avatar
Linus Torvalds committed
799

800
801
802
	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;
Linus Torvalds's avatar
Linus Torvalds committed
803

804
805
806
	case SO_SNDLOWAT:
		v.val=1;
		break;
Linus Torvalds's avatar
Linus Torvalds committed
807

808
809
810
	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;
Linus Torvalds's avatar
Linus Torvalds committed
811

812
813
814
815
816
817
	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;
Linus Torvalds's avatar
Linus Torvalds committed
818

819
820
821
822
823
824
825
826
827
828
829
830
	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}
Linus Torvalds's avatar
Linus Torvalds committed
831

832
833
834
835
836
837
	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;
Linus Torvalds's avatar
Linus Torvalds committed
838

839
840
841
	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;
842

843
844
	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);
Linus Torvalds's avatar
Linus Torvalds committed
845

846
847
848
849
	case SO_MARK:
		v.val = sk->sk_mark;
		break;

850
851
	default:
		return -ENOPROTOOPT;
Linus Torvalds's avatar
Linus Torvalds committed
852
	}
853

Linus Torvalds's avatar
Linus Torvalds committed
854
855
856
857
858
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
859
860
861
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
862
863
}

864
865
866
867
868
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
Dave Jones's avatar
Dave Jones committed
869
static inline void sock_lock_init(struct sock *sk)
870
{
871
872
873
874
875
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
876
877
}

878
879
880
881
882
883
884
885
886
887
888
889
890
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif

	memcpy(nsk, osk, osk->sk_prot->obj_size);
#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

891
892
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
893
894
895
896
897
898
899
900
901
902
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

903
904
905
906
907
908
909
910
	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

911
	return sk;
912
913
914
915
916
917
918
919
920

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
921
922
923
924
925
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
926
	struct module *owner;
927

928
	owner = prot->owner;
929
	slab = prot->slab;
930
931

	security_sk_free(sk);
932
933
934
935
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
936
	module_put(owner);
937
938
}

Linus Torvalds's avatar
Linus Torvalds committed
939
940
/**
 *	sk_alloc - All socket objects are allocated here
941
 *	@net: the applicable net namespace
942
943
944
945
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
Linus Torvalds's avatar
Linus Torvalds committed
946
 */
947
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
948
		      struct proto *prot)
Linus Torvalds's avatar
Linus Torvalds committed
949
{
950
	struct sock *sk;
Linus Torvalds's avatar
Linus Torvalds committed
951

952
	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds's avatar
Linus Torvalds committed
953
	if (sk) {
954
955
956
957
958
959
960
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
961
		sock_net_set(sk, get_net(net));
Linus Torvalds's avatar
Linus Torvalds committed
962
	}
963

964
	return sk;
Linus Torvalds's avatar
Linus Torvalds committed
965
966
967
968
969
970
971
972
973
}

void sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

974
	filter = rcu_dereference(sk->sk_filter);
Linus Torvalds's avatar
Linus Torvalds committed
975
	if (filter) {
976
		sk_filter_uncharge(sk, filter);
977
		rcu_assign_pointer(sk->sk_filter, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
978
979
980
981
982
983
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
984
		       __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds's avatar
Linus Torvalds committed
985

986
	put_net(sock_net(sk));
987
	sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds's avatar
Linus Torvalds committed
988
989
}

990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
/*
 * Last sock_put should drop referrence to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking referrence to stopping namespace
 * is not an option.
 * Take referrence to a socket to remove it from hash _alive_ and after that
 * destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
1004
	sock_net_set(sk, get_net(&init_net));
1005
1006
	sock_put(sk);
}
1007
EXPORT_SYMBOL(sk_release_kernel);
1008

1009
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1010
{
1011
	struct sock *newsk;
1012

1013
	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1014
1015
1016
	if (newsk != NULL) {
		struct sk_filter *filter;

1017
		sock_copy(newsk, sk);
1018
1019

		/* SANITY */
1020
		get_net(sock_net(newsk));
1021
1022
1023
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
1024
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1025
1026
1027
1028
1029
1030

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
1031
1032
1033
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif
1034
1035
1036

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
1037
1038
1039
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still raw copy of parent, so invalidate
			 * destructor and make plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

1091
1092
1093
1094
1095
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
1096
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1097
	if (sk_can_gso(sk)) {
1098
		if (dst->header_len) {
1099
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1100
		} else {
1101
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1102
1103
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
1104
1105
1106
1107
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

Linus Torvalds's avatar
Linus Torvalds committed
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


1126
1127
/*
 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds's avatar
Linus Torvalds committed
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

1140
1141
/*
 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds's avatar
Linus Torvalds committed
1142
1143
1144
1145
1146
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

1147
	skb_truesize_check(skb);
Linus Torvalds's avatar
Linus Torvalds committed
1148
	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1149
	sk_mem_uncharge(skb->sk, skb->truesize);
Linus Torvalds's avatar
Linus Torvalds committed
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
Victor Fusco's avatar
Victor Fusco committed
1176
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1177
			     gfp_t priority)
Linus Torvalds's avatar
Linus Torvalds committed
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff * skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
1191
 */
Victor Fusco's avatar
Victor Fusco committed
1192
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1193
			     gfp_t priority)
Linus Torvalds's avatar
Linus Torvalds committed
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

1205
/*
Linus Torvalds's avatar
Linus Torvalds committed
1206
 * Allocate a memory block from the socket's option memory buffer.
1207
 */
1208
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds's avatar
Linus Torvalds committed
1209
1210
1211
1212
1213
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
1214
		 * might sleep.
Linus Torvalds's avatar
Linus Torvalds committed
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
Al Viro's avatar
Al Viro committed
1272
	gfp_t gfp_mask;
Linus Torvalds's avatar
Linus Torvalds committed
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1291
			skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds's avatar
Linus Torvalds committed
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

1350
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds's avatar
Linus Torvalds committed
1351
1352
1353
1354
1355
1356
1357
1358
1359
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

1360
	for (;;) {
Linus Torvalds's avatar
Linus Torvalds committed
1361
1362
1363
1364
1365
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
1366
		if (!sock_owned_by_user(sk))
Linus Torvalds's avatar
Linus Torvalds committed
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
1398
	} while ((skb = sk->sk_backlog.head) != NULL);
Linus Torvalds's avatar
Linus Torvalds committed
1399
1400
1401
1402
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
1403
1404
 * @sk:    sock to wait on
 * @timeo: for how long
Linus Torvalds's avatar
Linus Torvalds committed
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);

1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	int allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	allocated = atomic_add_return(amt, prot->memory_allocated);

	/* Under limit. */
	if (allocated <= prot->sysctl_mem[0]) {
		if (prot->memory_pressure && *prot->memory_pressure)
			*prot->memory_pressure = 0;
		return 1;
	}

	/* Under pressure. */
	if (allocated > prot->sysctl_mem[1])
		if (prot->enter_memory_pressure)
			prot->enter_memory_pressure();

	/* Over hard limit. */
	if (allocated > prot->sysctl_mem[2])
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;
	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (prot->memory_pressure) {
		if (!*prot->memory_pressure ||
		    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
	atomic_sub(amt, prot->memory_allocated);
	return 0;
}

EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *	__sk_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	struct proto *prot = sk->sk_prot;

1511
	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
		   prot->memory_allocated);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (prot->memory_pressure && *prot->memory_pressure &&
	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
		*prot->memory_pressure = 0;
}

EXPORT_SYMBOL(__sk_mem_reclaim);


Linus Torvalds's avatar
Linus Torvalds committed
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

1535
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds's avatar
Linus Torvalds committed
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

1551
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds's avatar
Linus Torvalds committed
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
1637
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Linus Torvalds's avatar
Linus Torvalds committed
1638
1639
1640
1641
1642
1643
1644
1645
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
1646
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Linus Torvalds's avatar
Linus Torvalds committed
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
1657
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Linus Torvalds's avatar
Linus Torvalds committed
1658
1659
1660
1661
1662
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
1663
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds's avatar
Linus Torvalds committed
1664
1665
1666
1667
1668
1669
1670
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
Jesper Juhl's avatar
Jesper Juhl committed
1671
	kfree(sk->sk_protinfo);
Linus Torvalds's avatar
Linus Torvalds committed
1672
1673
1674
1675
1676
1677
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
1678
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds's avatar
Linus Torvalds committed
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);
1703
1704
1705
#ifdef CONFIG_NET_DMA
	skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
1706
1707
1708
1709

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);
1710

Linus Torvalds's avatar
Linus Torvalds committed
1711
1712
1713
1714
1715
1716
1717
1718
	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

1719
	if (sock) {
Linus Torvalds's avatar
Linus Torvalds committed
1720
1721
1722
1723
1724
1725
1726
1727
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);
1728
1729
1730
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);