socket.c 79.6 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6
/*
 * NET		An implementation of the SOCKET network access protocol.
 *
 * Version:	@(#)socket.c	1.1.93	18/02/95
 *
 * Authors:	Orest Zborowski, <obz@Kodak.COM>
 *		Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 * Fixes:
 *		Anonymous	:	NOTSOCK/BADF cleanup. Error fix in
 *					shutdown()
 *		Alan Cox	:	verify_area() fixes
 *		Alan Cox	:	Removed DDI
 *		Jonathan Kamens	:	SOCK_DGRAM reconnect bug
 *		Alan Cox	:	Moved a load of checks to the very
 *					top level.
 *		Alan Cox	:	Move address structures to/from user
 *					mode above the protocol layers.
 *		Rob Janssen	:	Allow 0 length sends.
 *		Alan Cox	:	Asynchronous I/O support (cribbed from the
 *					tty drivers).
 *		Niibe Yutaka	:	Asynchronous I/O for writes (4.4BSD style)
 *		Jeff Uphoff	:	Made max number of sockets command-line
 *					configurable.
 *		Matti Aarnio	:	Made the number of sockets dynamic,
 *					to be allocated when needed, and mr.
 *					Uphoff's max is used as max to be
 *					allowed to allocate.
 *		Linus		:	Argh. removed all the socket allocation
 *					altogether: it's in the inode now.
 *		Alan Cox	:	Made sock_alloc()/sock_release() public
 *					for NetROM and future kernel nfsd type
 *					stuff.
 *		Alan Cox	:	sendmsg/recvmsg basics.
 *		Tom Dyas	:	Export net symbols.
 *		Marcin Dalecki	:	Fixed problems with CONFIG_NET="n".
 *		Alan Cox	:	Added thread locking to sys_* calls
 *					for sockets. May have errors at the
 *					moment.
 *		Kevin Buhr	:	Fixed the dumb errors in the above.
 *		Andi Kleen	:	Some small cleanups, optimizations,
 *					and fixed a copy_from_user() bug.
 *		Tigran Aivazian	:	sys_send(args) calls sys_sendto(args, NULL, 0)
 *		Tigran Aivazian	:	Made listen(2) backlog sanity checks
 *					protocol-independent
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *
 *	This module is effectively the top level interface to the BSD socket
 *	paradigm.
 *
 *	Based upon Swansea University Computer Society NET3.039
 */

#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/net.h>
#include <linux/interrupt.h>
66
#include <linux/thread_info.h>
67
#include <linux/rcupdate.h>
Linus Torvalds's avatar
Linus Torvalds committed
68 69 70
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
Arjan van de Ven's avatar
Arjan van de Ven committed
71
#include <linux/mutex.h>
Linus Torvalds's avatar
Linus Torvalds committed
72
#include <linux/if_bridge.h>
73 74
#include <linux/if_frad.h>
#include <linux/if_vlan.h>
75
#include <linux/ptp_classify.h>
Linus Torvalds's avatar
Linus Torvalds committed
76 77 78 79 80 81 82 83 84 85
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/kmod.h>
86
#include <linux/audit.h>
87
#include <linux/wireless.h>
88
#include <linux/nsproxy.h>
89
#include <linux/magic.h>
90
#include <linux/slab.h>
91
#include <linux/xattr.h>
Linus Torvalds's avatar
Linus Torvalds committed
92 93 94 95 96

#include <asm/uaccess.h>
#include <asm/unistd.h>

#include <net/compat.h>
97
#include <net/wext.h>
98
#include <net/cls_cgroup.h>
Linus Torvalds's avatar
Linus Torvalds committed
99 100 101 102

#include <net/sock.h>
#include <linux/netfilter.h>

103 104 105 106 107
#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
#include <linux/sockios.h>
#include <linux/atalk.h>
108
#include <net/busy_poll.h>
109
#include <linux/errqueue.h>
110

111
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll knobs, only built when CONFIG_NET_RX_BUSY_POLL is set.
 * NOTE(review): presumably wired up to sysctl entries elsewhere - confirm.
 */
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
#endif
115

116 117
/*
 * Forward declarations of the callbacks referenced by socket_file_ops
 * before their definitions appear.
 */

/* data path */
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to);
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from);
static int sock_mmap(struct file *file, struct vm_area_struct *vma);

/* lifetime, polling and control */
static int sock_close(struct inode *inode, struct file *file);
static unsigned int sock_poll(struct file *file,
			      struct poll_table_struct *wait);
static long sock_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
static long compat_sock_ioctl(struct file *file,
			      unsigned int cmd, unsigned long arg);
#endif
static int sock_fasync(int fd, struct file *filp, int on);

/* page / splice based transfer */
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more);
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags);
Linus Torvalds's avatar
Linus Torvalds committed
134 135 136 137 138 139

/*
 *	Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *	in the operation structures but are done directly via the socketcall() multiplexor.
 */

140
static const struct file_operations socket_file_ops = {
Linus Torvalds's avatar
Linus Torvalds committed
141 142
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
143 144
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
Linus Torvalds's avatar
Linus Torvalds committed
145 146
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
147 148 149
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
Linus Torvalds's avatar
Linus Torvalds committed
150 151 152
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
153 154
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
155
	.splice_read =	sock_splice_read,
Linus Torvalds's avatar
Linus Torvalds committed
156 157 158 159 160 161 162
};

/*
 *	The protocol list. Each protocol is registered in here.
 */

static DEFINE_SPINLOCK(net_family_lock);
163
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
164 165 166 167 168

/*
 *	Statistics counters of the socket lists
 */

169
static DEFINE_PER_CPU(int, sockets_in_use);
Linus Torvalds's avatar
Linus Torvalds committed
170 171

/*
 * Support routines.
 * Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */

/**
 *	move_addr_to_kernel	-	copy a socket address into kernel space
 *	@uaddr: Address in user space
 *	@ulen: Length in user space
 *	@kaddr: Address in kernel space
 *
 *	The address is copied into kernel space. If the provided length is
 *	negative or larger than a struct sockaddr_storage, an error code of
 *	-EINVAL is returned. If the copy gives invalid addresses -EFAULT is
 *	returned. On a success 0 is returned.
 */

188
int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr)
{
	/* The length must be non-negative and fit into *kaddr. */
	if (ulen < 0 || ulen > sizeof(struct sockaddr_storage))
		return -EINVAL;

	/* Empty address: nothing to copy, nothing to pass to audit. */
	if (!ulen)
		return 0;

	if (copy_from_user(kaddr, uaddr, ulen) != 0)
		return -EFAULT;

	/* The final status is whatever audit_sockaddr() reports. */
	return audit_sockaddr(ulen, kaddr);
}

/**
 *	move_addr_to_user	-	copy an address to user space
 *	@kaddr: kernel space address
 *	@klen: length of address in kernel
 *	@uaddr: user space address
 *	@ulen: pointer to user length field
 *
 *	The value pointed to by ulen on entry is the buffer length available.
 *	This is overwritten with the buffer space used. -EINVAL is returned
 *	if an overlong buffer is specified or a negative buffer size. -EFAULT
 *	is returned if either the buffer or the length field are not
 *	accessible.
 *	After copying the data up to the limit the user specifies, the true
 *	length of the data is written over the length limit the user
 *	specified. Zero is returned for a success.
 */
215

216
static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
			     void __user *uaddr, int __user *ulen)
{
	int copy_len;
	int ret;

	BUG_ON(klen > sizeof(struct sockaddr_storage));

	/* Fetch the size of the buffer the caller offered. */
	ret = get_user(copy_len, ulen);
	if (ret)
		return ret;

	/* Never copy out more than the kernel address actually holds. */
	if (copy_len > klen)
		copy_len = klen;
	if (copy_len < 0)
		return -EINVAL;

	if (copy_len != 0) {
		if (audit_sockaddr(klen, kaddr))
			return -ENOMEM;
		if (copy_to_user(uaddr, kaddr, copy_len))
			return -EFAULT;
	}

	/*
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
	 */
	return __put_user(klen, ulen);
}

243
static struct kmem_cache *sock_inode_cachep __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
244 245 246 247

static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;
248
	struct socket_wq *wq;
249

250
	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
251 252
	if (!ei)
		return NULL;
253 254
	wq = kmalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq) {
255 256 257
		kmem_cache_free(sock_inode_cachep, ei);
		return NULL;
	}
258 259
	init_waitqueue_head(&wq->wait);
	wq->fasync_list = NULL;
260
	wq->flags = 0;
261
	RCU_INIT_POINTER(ei->socket.wq, wq);
262

Linus Torvalds's avatar
Linus Torvalds committed
263 264 265 266 267 268 269 270 271 272 273
	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

static void sock_destroy_inode(struct inode *inode)
{
274
	struct socket_alloc *ei;
275
	struct socket_wq *wq;
276 277

	ei = container_of(inode, struct socket_alloc, vfs_inode);
278
	wq = rcu_dereference_protected(ei->socket.wq, 1);
279
	kfree_rcu(wq, rcu);
280
	kmem_cache_free(sock_inode_cachep, ei);
Linus Torvalds's avatar
Linus Torvalds committed
281 282
}

283
static void init_once(void *foo)
Linus Torvalds's avatar
Linus Torvalds committed
284
{
285
	struct socket_alloc *ei = (struct socket_alloc *)foo;
Linus Torvalds's avatar
Linus Torvalds committed
286

287
	inode_init_once(&ei->vfs_inode);
Linus Torvalds's avatar
Linus Torvalds committed
288
}
289

Linus Torvalds's avatar
Linus Torvalds committed
290 291 292
static int init_inodecache(void)
{
	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
293 294 295 296
					      sizeof(struct socket_alloc),
					      0,
					      (SLAB_HWCACHE_ALIGN |
					       SLAB_RECLAIM_ACCOUNT |
297
					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
298
					      init_once);
Linus Torvalds's avatar
Linus Torvalds committed
299 300 301 302 303
	if (sock_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}

304
static const struct super_operations sockfs_ops = {
305 306 307
	.alloc_inode	= sock_alloc_inode,
	.destroy_inode	= sock_destroy_inode,
	.statfs		= simple_statfs,
Linus Torvalds's avatar
Linus Torvalds committed
308 309
};

310 311 312 313 314 315
/*
 * sockfs_dname() is called from d_path().
 */
static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
316
				d_inode(dentry)->i_ino);
317 318
}

319
static const struct dentry_operations sockfs_dentry_operations = {
320
	.d_dname  = sockfs_dname,
Linus Torvalds's avatar
Linus Torvalds committed
321 322
};

323 324 325 326 327 328 329 330 331 332 333 334 335 336 337
/*
 * ->mount hook of sock_fs_type: sockfs is set up through mount_pseudo()
 * with the sockfs super-block and dentry operations. The flags, dev_name
 * and data arguments are not used.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "socket:",
			    &sockfs_ops, &sockfs_dentry_operations,
			    SOCKFS_MAGIC);
}

/* Mount whose super block supplies socket inodes and dentries. */
static struct vfsmount *sock_mnt __read_mostly;

static struct file_system_type sock_fs_type = {
	.name		= "sockfs",
	.mount		= sockfs_mount,
	.kill_sb	= kill_anon_super,
};

Linus Torvalds's avatar
Linus Torvalds committed
338 339 340
/*
 *	Obtains the first available file descriptor and sets it up for use.
 *
 *	These functions create a file structure and map it into the fd space
 *	of the current process. On success the file descriptor is returned
 *	and the file struct is implicitly stored in sock->file.
 *	Note that another thread may close the file descriptor before we
 *	return from this function. We rely on the fact that we do not refer
 *	to the socket after mapping. If one day we need to, this
 *	function will have to increment the ref. count on the file by 1.
 *
 *	In any case the returned fd MAY BE not valid!
 *	This race condition is unavoidable
 *	with shared fd spaces, we cannot solve it inside the kernel,
 *	but we still take care of internal coherence.
 */

355
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
Linus Torvalds's avatar
Linus Torvalds committed
356
{
357
	struct qstr name = { .name = "" };
358
	struct path path;
359
	struct file *file;
Linus Torvalds's avatar
Linus Torvalds committed
360

361 362 363 364 365 366 367
	if (dname) {
		name.name = dname;
		name.len = strlen(name.name);
	} else if (sock->sk) {
		name.name = sock->sk->sk_prot_creator->name;
		name.len = strlen(name.name);
	}
368
	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
369 370
	if (unlikely(!path.dentry))
		return ERR_PTR(-ENOMEM);
371
	path.mnt = mntget(sock_mnt);
372

373
	d_instantiate(path.dentry, SOCK_INODE(sock));
374

375
	file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
376
		  &socket_file_ops);
377
	if (IS_ERR(file)) {
378
		/* drop dentry, keep inode */
379
		ihold(d_inode(path.dentry));
380
		path_put(&path);
381
		return file;
382 383 384
	}

	sock->file = file;
385
	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
386
	file->private_data = sock;
387
	return file;
388
}
389
EXPORT_SYMBOL(sock_alloc_file);
390

391
static int sock_map_fd(struct socket *sock, int flags)
392 393
{
	struct file *newfile;
394 395 396
	int fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;
397

398
	newfile = sock_alloc_file(sock, flags, NULL);
399
	if (likely(!IS_ERR(newfile))) {
400
		fd_install(fd, newfile);
401 402
		return fd;
	}
403

404 405
	put_unused_fd(fd);
	return PTR_ERR(newfile);
Linus Torvalds's avatar
Linus Torvalds committed
406 407
}

408
struct socket *sock_from_file(struct file *file, int *err)
409 410 411 412
{
	if (file->f_op == &socket_file_ops)
		return file->private_data;	/* set in sock_map_fd */

413 414
	*err = -ENOTSOCK;
	return NULL;
415
}
416
EXPORT_SYMBOL(sock_from_file);
417

Linus Torvalds's avatar
Linus Torvalds committed
418
/**
 *	sockfd_lookup - Go from a file number to its socket slot
 *	@fd: file handle
 *	@err: pointer to an error code return
 *
 *	The file handle passed in is locked and the socket it is bound
 *	to is returned. If an error occurs the err pointer is overwritten
 *	with a negative errno code and NULL is returned. The function checks
 *	for both invalid handles and passing a handle which is not a socket.
 *
 *	On a success the socket object pointer is returned.
 */

struct socket *sockfd_lookup(int fd, int *err)
{
	struct file *file;
	struct socket *sock;

436 437
	file = fget(fd);
	if (!file) {
Linus Torvalds's avatar
Linus Torvalds committed
438 439 440
		*err = -EBADF;
		return NULL;
	}
441

442 443
	sock = sock_from_file(file, err);
	if (!sock)
Linus Torvalds's avatar
Linus Torvalds committed
444
		fput(file);
445 446
	return sock;
}
447
EXPORT_SYMBOL(sockfd_lookup);
Linus Torvalds's avatar
Linus Torvalds committed
448

449 450
static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
{
451
	struct fd f = fdget(fd);
452 453
	struct socket *sock;

454
	*err = -EBADF;
455 456 457 458
	if (f.file) {
		sock = sock_from_file(f.file, err);
		if (likely(sock)) {
			*fput_needed = f.flags;
459
			return sock;
460 461
		}
		fdput(f);
Linus Torvalds's avatar
Linus Torvalds committed
462
	}
463
	return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
464 465
}

466 467 468
#define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname"
#define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX)
#define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1)
469
static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode,
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500
			       const char *name, void *value, size_t size)
{
	const char *proto_name;
	size_t proto_size;
	int error;

	error = -ENODATA;
	if (!strncmp(name, XATTR_NAME_SOCKPROTONAME, XATTR_NAME_SOCKPROTONAME_LEN)) {
		proto_name = dentry->d_name.name;
		proto_size = strlen(proto_name);

		if (value) {
			error = -ERANGE;
			if (proto_size + 1 > size)
				goto out;

			strncpy(value, proto_name, proto_size + 1);
		}
		error = proto_size + 1;
	}

out:
	return error;
}

/*
 * List the attribute names of a socket inode: whatever
 * security_inode_listsecurity() reports, followed by the NUL-terminated
 * XATTR_NAME_SOCKPROTONAME. With a NULL @buffer only the required size is
 * computed. Returns the total size, a negative error from the security
 * hook, or -ERANGE if @buffer is too small.
 */
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
				size_t size)
{
	ssize_t len;
	ssize_t total;

	len = security_inode_listsecurity(d_inode(dentry), buffer, size);
	if (len < 0)
		return len;

	total = len;
	if (buffer) {
		if (size < total)
			return -ERANGE;
		/* Skip past the names written by the security hook. */
		buffer += len;
	}

	len = (XATTR_NAME_SOCKPROTONAME_LEN + 1);
	total += len;
	if (buffer) {
		if (size < total)
			return -ERANGE;
		memcpy(buffer, XATTR_NAME_SOCKPROTONAME, len);
	}

	return total;
}

/* Inode operations installed on every socket inode by sock_alloc(). */
static const struct inode_operations sockfs_inode_ops = {
	.getxattr	= sockfs_getxattr,
	.listxattr	= sockfs_listxattr,
};

Linus Torvalds's avatar
Linus Torvalds committed
528 529
/**
 *	sock_alloc	-	allocate a socket
 *
 *	Allocate a new inode and socket object. The two are bound together
 *	and initialised. The socket is then returned. If we are out of inodes
 *	NULL is returned.
 */

536
struct socket *sock_alloc(void)
Linus Torvalds's avatar
Linus Torvalds committed
537
{
538 539
	struct inode *inode;
	struct socket *sock;
Linus Torvalds's avatar
Linus Torvalds committed
540

541
	inode = new_inode_pseudo(sock_mnt->mnt_sb);
Linus Torvalds's avatar
Linus Torvalds committed
542 543 544 545 546
	if (!inode)
		return NULL;

	sock = SOCKET_I(inode);

547
	kmemcheck_annotate_bitfield(sock, type);
548
	inode->i_ino = get_next_ino();
549
	inode->i_mode = S_IFSOCK | S_IRWXUGO;
550 551
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
552
	inode->i_op = &sockfs_inode_ops;
Linus Torvalds's avatar
Linus Torvalds committed
553

554
	this_cpu_add(sockets_in_use, 1);
Linus Torvalds's avatar
Linus Torvalds committed
555 556
	return sock;
}
557
EXPORT_SYMBOL(sock_alloc);
Linus Torvalds's avatar
Linus Torvalds committed
558 559 560 561 562 563 564

/**
 *	sock_release	-	close a socket
 *	@sock: socket to close
 *
 *	The socket is released from the protocol stack if it has a release
 *	callback, and the inode is then released if the socket is bound to
 *	an inode not a file.
 */
567

Linus Torvalds's avatar
Linus Torvalds committed
568 569 570 571 572 573 574 575 576 577
void sock_release(struct socket *sock)
{
	if (sock->ops) {
		struct module *owner = sock->ops->owner;

		sock->ops->release(sock);
		sock->ops = NULL;
		module_put(owner);
	}

578
	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
579
		pr_err("%s: fasync list not empty!\n", __func__);
Linus Torvalds's avatar
Linus Torvalds committed
580

581
	this_cpu_sub(sockets_in_use, 1);
Linus Torvalds's avatar
Linus Torvalds committed
582 583 584 585
	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
586
	sock->file = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
587
}
588
EXPORT_SYMBOL(sock_release);
Linus Torvalds's avatar
Linus Torvalds committed
589

590
/*
 * Translate the SOF_TIMESTAMPING_TX_* bits in @tsflags into the matching
 * SKBTX_* bits and OR them into *@tx_flags. Bits already set in
 * *@tx_flags are preserved.
 */
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags)
{
	u8 extra = 0;

	if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE)
		extra |= SKBTX_HW_TSTAMP;
	if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE)
		extra |= SKBTX_SW_TSTAMP;
	if (tsflags & SOF_TIMESTAMPING_TX_SCHED)
		extra |= SKBTX_SCHED_TSTAMP;

	*tx_flags |= extra;
}
605
EXPORT_SYMBOL(__sock_tx_timestamp);
606

607
static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg)
Linus Torvalds's avatar
Linus Torvalds committed
608
{
Al Viro's avatar
Al Viro committed
609
	int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg));
610 611
	BUG_ON(ret == -EIOCBQUEUED);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
612 613
}

614
/*
 * Send @msg on @sock once security_socket_sendmsg() has allowed it.
 * Returns the hook's error if it is non-zero, otherwise the protocol's
 * result.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg)
{
	int err;

	err = security_socket_sendmsg(sock, msg, msg_data_left(msg));
	if (err)
		return err;

	return sock_sendmsg_nosec(sock, msg);
}
621
EXPORT_SYMBOL(sock_sendmsg);
Linus Torvalds's avatar
Linus Torvalds committed
622 623 624 625

/*
 * Send from a kernel-space kvec array: point msg->msg_iter at the @num
 * entries of @vec (@size bytes in total) and pass @msg to sock_sendmsg().
 */
int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size)
{
	struct iov_iter *iter = &msg->msg_iter;

	iov_iter_kvec(iter, WRITE | ITER_KVEC, vec, num, size);

	return sock_sendmsg(sock, msg);
}
629
EXPORT_SYMBOL(kernel_sendmsg);
Linus Torvalds's avatar
Linus Torvalds committed
630

631 632 633 634 635 636
/*
 * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
 */
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
637
	int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
638
	struct scm_timestamping tss;
639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
	int empty = 1;
	struct skb_shared_hwtstamps *shhwtstamps =
		skb_hwtstamps(skb);

	/* Race occurred between timestamp enabling and packet
	   receiving.  Fill in the current time for now. */
	if (need_software_tstamp && skb->tstamp.tv64 == 0)
		__net_timestamp(skb);

	if (need_software_tstamp) {
		if (!sock_flag(sk, SOCK_RCVTSTAMPNS)) {
			struct timeval tv;
			skb_get_timestamp(skb, &tv);
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
				 sizeof(tv), &tv);
		} else {
655 656
			struct timespec ts;
			skb_get_timestampns(skb, &ts);
657
			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
658
				 sizeof(ts), &ts);
659 660 661
		}
	}

662
	memset(&tss, 0, sizeof(tss));
663
	if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
664
	    ktime_to_timespec_cond(skb->tstamp, tss.ts + 0))
665
		empty = 0;
666
	if (shhwtstamps &&
667
	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
668
	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
669
		empty = 0;
670 671
	if (!empty)
		put_cmsg(msg, SOL_SOCKET,
672
			 SCM_TIMESTAMPING, sizeof(tss), &tss);
673
}
674 675
EXPORT_SYMBOL_GPL(__sock_recv_timestamp);

676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691
/*
 * Report the skb's wifi_acked bit as an SCM_WIFI_STATUS control message.
 * Nothing is sent unless the socket has SOCK_WIFI_STATUS set and the skb
 * marks the bit as valid.
 */
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	int ack;

	if (!sock_flag(sk, SOCK_WIFI_STATUS) || !skb->wifi_acked_valid)
		return;

	ack = skb->wifi_acked;
	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
}
EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);

692 693
static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
				   struct sk_buff *skb)
694
{
695
	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && SOCK_SKB_CB(skb)->dropcount)
696
		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
697
			sizeof(__u32), &SOCK_SKB_CB(skb)->dropcount);
698 699
}

700
/* Emit the timestamp control messages first, then the drop counter. */
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
	struct sk_buff *skb)
{
	sock_recv_timestamp(msg, sk, skb);
	sock_recv_drops(msg, sk, skb);
}
706
EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
707

708
static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
709
				     int flags)
Linus Torvalds's avatar
Linus Torvalds committed
710
{
711
	return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags);
Linus Torvalds's avatar
Linus Torvalds committed
712 713
}

714
/*
 * Receive into @msg once security_socket_recvmsg() has allowed it.
 * Returns the hook's error if it is non-zero, otherwise the protocol's
 * result.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags)
{
	int err;

	err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags);
	if (err)
		return err;

	return sock_recvmsg_nosec(sock, msg, flags);
}
720
EXPORT_SYMBOL(sock_recvmsg);
Linus Torvalds's avatar
Linus Torvalds committed
721

722 723 724 725 726 727 728 729 730 731 732 733 734 735 736
/**
 * kernel_recvmsg - Receive a message from a socket (kernel space)
 * @sock:       The socket to receive the message from
 * @msg:        Received message
 * @vec:        Input s/g array for message data
 * @num:        Size of input s/g array
 * @size:       Number of bytes to read
 * @flags:      Message flags (MSG_DONTWAIT, etc...)
 *
 * On return the msg structure contains the scatter/gather array passed in the
 * vec argument. The array is modified so that it consists of the unfilled
 * portion of the original array.
 *
 * The returned value is the total number of bytes received, or an error.
 */
737 738
int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
		   struct kvec *vec, size_t num, size_t size, int flags)
{
	mm_segment_t saved_fs = get_fs();
	int received;

	iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size);

	/* Run the receive under KERNEL_DS, then restore the caller's limit. */
	set_fs(KERNEL_DS);
	received = sock_recvmsg(sock, msg, flags);
	set_fs(saved_fs);

	return received;
}
749
EXPORT_SYMBOL(kernel_recvmsg);
Linus Torvalds's avatar
Linus Torvalds committed
750

751 752
/*
 * ->sendpage file operation: forward the page to kernel_sendpage(),
 * adding MSG_DONTWAIT when the file is non-blocking. @ppos is unused.
 */
static ssize_t sock_sendpage(struct file *file, struct page *page,
			     int offset, size_t size, loff_t *ppos, int more)
{
	struct socket *sock = file->private_data;
	/* more is a combination of MSG_MORE and MSG_SENDPAGE_NOTLAST */
	int flags = more;

	if (file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	return kernel_sendpage(sock, page, offset, size, flags);
}
Linus Torvalds's avatar
Linus Torvalds committed
765

766
/*
 * ->splice_read file operation: delegate to the protocol's splice_read
 * handler, or fail with -EINVAL when the protocol does not provide one.
 */
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
				struct pipe_inode_info *pipe, size_t len,
				unsigned int flags)
{
	struct socket *sock = file->private_data;

	if (likely(sock->ops->splice_read))
		return sock->ops->splice_read(sock, ppos, pipe, len, flags);

	return -EINVAL;
}

778
static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
779
{
780 781
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
782 783
	struct msghdr msg = {.msg_iter = *to,
			     .msg_iocb = iocb};
784
	ssize_t res;
785

786 787 788 789
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

	if (iocb->ki_pos != 0)
Linus Torvalds's avatar
Linus Torvalds committed
790
		return -ESPIPE;
791

Christoph Hellwig's avatar
Christoph Hellwig committed
792
	if (!iov_iter_count(to))	/* Match SYS5 behaviour */
Linus Torvalds's avatar
Linus Torvalds committed
793 794
		return 0;

795
	res = sock_recvmsg(sock, &msg, msg.msg_flags);
796 797
	*to = msg.msg_iter;
	return res;
Linus Torvalds's avatar
Linus Torvalds committed
798 799
}

800
static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
801
{
802 803
	struct file *file = iocb->ki_filp;
	struct socket *sock = file->private_data;
804 805
	struct msghdr msg = {.msg_iter = *from,
			     .msg_iocb = iocb};
806
	ssize_t res;
Linus Torvalds's avatar
Linus Torvalds committed
807

808
	if (iocb->ki_pos != 0)
809
		return -ESPIPE;
810

811 812 813
	if (file->f_flags & O_NONBLOCK)
		msg.msg_flags = MSG_DONTWAIT;

814 815 816
	if (sock->type == SOCK_SEQPACKET)
		msg.msg_flags |= MSG_EOR;

817
	res = sock_sendmsg(sock, &msg);
818 819
	*from = msg.msg_iter;
	return res;
Linus Torvalds's avatar
Linus Torvalds committed
820 821 822 823 824 825 826
}

/*
 * Atomic setting of ioctl hooks to avoid race
 * with module unload.
 */

Arjan van de Ven's avatar
Arjan van de Ven committed
827
static DEFINE_MUTEX(br_ioctl_mutex);
828
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
Linus Torvalds's avatar
Linus Torvalds committed
829

830
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
Linus Torvalds's avatar
Linus Torvalds committed
831
{
Arjan van de Ven's avatar
Arjan van de Ven committed
832
	mutex_lock(&br_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
833
	br_ioctl_hook = hook;
Arjan van de Ven's avatar
Arjan van de Ven committed
834
	mutex_unlock(&br_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
835 836 837
}
EXPORT_SYMBOL(brioctl_set);

Arjan van de Ven's avatar
Arjan van de Ven committed
838
static DEFINE_MUTEX(vlan_ioctl_mutex);
839
static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
Linus Torvalds's avatar
Linus Torvalds committed
840

841
void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
Linus Torvalds's avatar
Linus Torvalds committed
842
{
Arjan van de Ven's avatar
Arjan van de Ven committed
843
	mutex_lock(&vlan_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
844
	vlan_ioctl_hook = hook;
Arjan van de Ven's avatar
Arjan van de Ven committed
845
	mutex_unlock(&vlan_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
846 847 848
}
EXPORT_SYMBOL(vlan_ioctl_set);

Arjan van de Ven's avatar
Arjan van de Ven committed
849
static DEFINE_MUTEX(dlci_ioctl_mutex);
850
static int (*dlci_ioctl_hook) (unsigned int, void __user *);
Linus Torvalds's avatar
Linus Torvalds committed
851

852
void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
Linus Torvalds's avatar
Linus Torvalds committed
853
{
Arjan van de Ven's avatar
Arjan van de Ven committed
854
	mutex_lock(&dlci_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
855
	dlci_ioctl_hook = hook;
Arjan van de Ven's avatar
Arjan van de Ven committed
856
	mutex_unlock(&dlci_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
857 858 859
}
EXPORT_SYMBOL(dlci_ioctl_set);

860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877
static long sock_do_ioctl(struct net *net, struct socket *sock,
				 unsigned int cmd, unsigned long arg)
{
	int err;
	void __user *argp = (void __user *)arg;

	err = sock->ops->ioctl(sock, cmd, arg);

	/*
	 * If this ioctl is unknown try to hand it down
	 * to the NIC driver.
	 */
	if (err == -ENOIOCTLCMD)
		err = dev_ioctl(net, cmd, argp);

	return err;
}

Linus Torvalds's avatar
Linus Torvalds committed
878 879 880 881 882 883 884 885
/*
 *	With an ioctl, arg may well be a user mode pointer, but we don't know
 *	what to do with it - that's up to the protocol still.
 */

static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
	struct socket *sock;
886
	struct sock *sk;
Linus Torvalds's avatar
Linus Torvalds committed
887 888
	void __user *argp = (void __user *)arg;
	int pid, err;
889
	struct net *net;
Linus Torvalds's avatar
Linus Torvalds committed
890

891
	sock = file->private_data;
892
	sk = sock->sk;
893
	net = sock_net(sk);
Linus Torvalds's avatar
Linus Torvalds committed
894
	if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
895
		err = dev_ioctl(net, cmd, argp);
Linus Torvalds's avatar
Linus Torvalds committed
896
	} else
Johannes Berg's avatar
Johannes Berg committed
897
#ifdef CONFIG_WEXT_CORE
Linus Torvalds's avatar
Linus Torvalds committed
898
	if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
899
		err = dev_ioctl(net, cmd, argp);
Linus Torvalds's avatar
Linus Torvalds committed
900
	} else
Johannes Berg's avatar
Johannes Berg committed
901
#endif
902
		switch (cmd) {
Linus Torvalds's avatar
Linus Torvalds committed
903 904 905 906 907
		case FIOSETOWN:
		case SIOCSPGRP:
			err = -EFAULT;
			if (get_user(pid, (int __user *)argp))
				break;
908 909
			f_setown(sock->file, pid, 1);
			err = 0;
Linus Torvalds's avatar
Linus Torvalds committed
910 911 912
			break;
		case FIOGETOWN:
		case SIOCGPGRP:
913
			err = put_user(f_getown(sock->file),
914
				       (int __user *)argp);
Linus Torvalds's avatar
Linus Torvalds committed
915 916 917 918 919 920 921 922 923
			break;
		case SIOCGIFBR:
		case SIOCSIFBR:
		case SIOCBRADDBR:
		case SIOCBRDELBR:
			err = -ENOPKG;
			if (!br_ioctl_hook)
				request_module("bridge");

Arjan van de Ven's avatar
Arjan van de Ven committed
924
			mutex_lock(&br_ioctl_mutex);
925
			if (br_ioctl_hook)
926
				err = br_ioctl_hook(net, cmd, argp);
Arjan van de Ven's avatar
Arjan van de Ven committed
927
			mutex_unlock(&br_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
928 929 930 931 932 933 934
			break;
		case SIOCGIFVLAN:
		case SIOCSIFVLAN:
			err = -ENOPKG;
			if (!vlan_ioctl_hook)
				request_module("8021q");

Arjan van de Ven's avatar
Arjan van de Ven committed
935
			mutex_lock(&vlan_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
936
			if (vlan_ioctl_hook)
937
				err = vlan_ioctl_hook(net, argp);
Arjan van de Ven's avatar
Arjan van de Ven committed
938
			mutex_unlock(&vlan_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
939 940 941 942 943 944 945
			break;
		case SIOCADDDLCI:
		case SIOCDELDLCI:
			err = -ENOPKG;
			if (!dlci_ioctl_hook)
				request_module("dlci");

946 947
			mutex_lock(&dlci_ioctl_mutex);
			if (dlci_ioctl_hook)
Linus Torvalds's avatar
Linus Torvalds committed
948
				err = dlci_ioctl_hook(cmd, argp);
949
			mutex_unlock(&dlci_ioctl_mutex);
Linus Torvalds's avatar
Linus Torvalds committed
950 951
			break;
		default:
952
			err = sock_do_ioctl(net, sock, cmd, arg);
Linus Torvalds's avatar
Linus Torvalds committed
953
			break;
954
		}
Linus Torvalds's avatar
Linus Torvalds committed
955 956 957 958 959 960 961
	return err;
}

/*
 *	Allocate a bare socket of the given @type without calling into any
 *	protocol family: only the security hooks and sock_alloc() are run.
 *
 *	*res is always written: the new socket on success, NULL on failure.
 *	Returns 0 on success, -ENOMEM if sock_alloc() fails, or the error
 *	reported by a security hook.
 */
int sock_create_lite(int family, int type, int protocol, struct socket **res)
{
	struct socket *new_sock = NULL;
	int rc;

	rc = security_socket_create(family, type, protocol, 1);
	if (rc)
		goto done;

	new_sock = sock_alloc();
	if (!new_sock) {
		rc = -ENOMEM;
		goto done;
	}

	new_sock->type = type;

	rc = security_socket_post_create(new_sock, family, type, protocol, 1);
	if (rc) {
		/* Hook vetoed the socket: drop it and report NULL. */
		sock_release(new_sock);
		new_sock = NULL;
	}

done:
	*res = new_sock;
	return rc;
}
EXPORT_SYMBOL(sock_create_lite);
Linus Torvalds's avatar
Linus Torvalds committed
987 988

/* No kernel lock held - perfect */
989
static unsigned int sock_poll(struct file *file, poll_table *wait)
Linus Torvalds's avatar
Linus Torvalds committed
990
{
991
	unsigned int busy_flag = 0;
Linus Torvalds's avatar
Linus Torvalds committed
992 993 994
	struct socket *sock;

	/*
995
	 *      We can't return errors to poll, so it's either yes or no.
Linus Torvalds's avatar
Linus Torvalds committed
996
	 */
997
	sock = file->private_data;
998

999
	if (sk_can_busy_loop(sock->sk)) {
1000
		/* this socket can poll_ll so tell the system call */
1001
		busy_flag = POLL_BUSY_LOOP;
1002 1003

		/* once, only if requested by syscall */
1004 1005
		if (wait && (wait->_key & POLL_BUSY_LOOP))
			sk_busy_loop(sock->sk, 1);
1006 1007
	}

1008
	return busy_flag | sock->ops->poll(file, sock, wait);
Linus Torvalds's avatar
Linus Torvalds committed
1009 1010
}

1011
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
Linus Torvalds's avatar
Linus Torvalds committed
1012
{
1013
	struct socket *sock = file->private_data;
Linus Torvalds's avatar
Linus Torvalds committed
1014 1015 1016 1017

	return sock->ops->mmap(file, sock, vma);
}

1018
/*
 *	release() entry point for socket files: releases the socket embedded
 *	in @inode.  @filp is unused.  Always returns 0.
 */
static int sock_close(struct inode *inode, struct file *filp)
{
	struct socket *sock = SOCKET_I(inode);

	sock_release(sock);
	return 0;
}

/*
 *	Update the socket async list
 *
 *	Fasync_list locking strategy.
 *
 *	1. fasync_list is modified only under process context socket lock
 *	   i.e. under semaphore.
 *	2. fasync_list is used under read_lock(&sk->sk_callback_lock)
1032
 *	   or under socket lock
Linus Torvalds's avatar
Linus Torvalds committed
1033 1034 1035 1036
 */

static int sock_fasync(int fd, struct file *filp, int on)
{
1037 1038
	struct socket *sock = filp->private_data;
	struct sock *sk = sock->sk;
1039
	struct socket_wq *wq;
Linus Torvalds's avatar
Linus Torvalds committed
1040

1041
	if (sk == NULL)
Linus Torvalds's avatar
Linus Torvalds committed
1042 1043 1044
		return -EINVAL;

	lock_sock(sk);
1045
	wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
1046
	fasync_helper(fd, filp, on, &wq->fasync_list);
Linus Torvalds's avatar
Linus Torvalds committed
1047

1048
	if (!wq->fasync_list)
1049 1050
		sock_reset_flag(sk, SOCK_FASYNC);
	else
1051
		sock_set_flag(sk, SOCK_FASYNC);
Linus Torvalds's avatar
Linus Torvalds committed
1052

1053
	release_sock(sk);
Linus Torvalds's avatar
Linus Torvalds committed
1054 1055 1056
	return 0;
}

1057
/* This function may be called only under rcu_lock */
Linus Torvalds's avatar
Linus Torvalds committed
1058

1059
int sock_wake_async(struct socket_wq *wq, int how, int band)
Linus Torvalds's avatar
Linus Torvalds committed
1060
{
1061
	if (!wq || !wq->fasync_list)
Linus Torvalds's avatar
Linus Torvalds committed
1062
		return -1;
1063

1064
	switch (how) {
1065
	case SOCK_WAKE_WAITD:
1066
		if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
Linus Torvalds's avatar
Linus Torvalds committed
1067 1068
			break;
		goto call_kill;
1069
	case SOCK_WAKE_SPACE:
1070
		if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
Linus Torvalds's avatar
Linus Torvalds committed
1071 1072
			break;
		/* fall through */
1073
	case SOCK_WAKE_IO:
1074
call_kill:
1075
		kill_fasync(&wq->fasync_list, SIGIO, band);
Linus Torvalds's avatar
Linus Torvalds committed
1076
		break;
1077
	case SOCK_WAKE_URG:
1078
		kill_fasync(&wq->fasync_list, SIGURG, band);
Linus Torvalds's avatar
Linus Torvalds committed
1079
	}
1080

Linus Torvalds's avatar
Linus Torvalds committed
1081 1082
	return 0;
}
1083
EXPORT_SYMBOL(sock_wake_async);
Linus Torvalds's avatar
Linus Torvalds committed
1084

1085
int __sock_create(struct net *net, int family, int type, int protocol,
1086
			 struct socket **res, int kern)
Linus Torvalds's avatar
Linus Torvalds committed
1087 1088 1089
{
	int err;
	struct socket *sock;
1090
	const struct net_proto_family *pf;
Linus Torvalds's avatar
Linus Torvalds committed
1091 1092

	/*
1093
	 *      Check protocol is in range
Linus Torvalds's avatar
Linus Torvalds committed
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
1106 1107
		pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			     current->comm);
Linus Torvalds's avatar
Linus Torvalds committed
1108 1109 1110 1111 1112 1113
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;
1114

1115 1116 1117 1118 1119 1120 1121
	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();
	if (!sock) {
1122
		net_warn_ratelimited("socket: no more sockets\n");
1123 1124 1125 1126 1127 1128
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

1129
#ifdef CONFIG_MODULES
1130 1131 1132
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
Linus Torvalds's avatar
Linus Torvalds committed
1133 1134 1135
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
1136
	if (rcu_access_pointer(net_families[family]) == NULL)
1137
		request_module("net-pf-%d", family);
Linus Torvalds's avatar
Linus Torvalds committed
1138 1139
#endif

1140 1141 1142 1143 1144
	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;
Linus Torvalds's avatar
Linus Torvalds committed
1145 1146 1147 1148 1149

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
1150
	if (!try_module_get(pf->owner))
Linus Torvalds's avatar
Linus Torvalds committed
1151 1152
		goto out_release;

1153 1154 1155
	/* Now protected by module ref count */
	rcu_read_unlock();

1156
	err = pf->create(net, sock, protocol, kern);
1157
	if (err < 0)
Linus Torvalds's avatar
Linus Torvalds committed
1158
		goto out_module_put;
1159

Linus Torvalds's avatar
Linus Torvalds committed
1160 1161 1162 1163
	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
1164 1165 1166
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

Linus Torvalds's avatar
Linus Torvalds committed
1167 1168 1169 1170
	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
1171
	module_put(pf->owner);
1172 1173
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
1174
		goto out_sock_release;
1175
	*res = sock;
Linus Torvalds's avatar
Linus Torvalds committed
1176

1177 1178 1179 1180
	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
Linus Torvalds's avatar
Linus Torvalds committed
1181
out_module_put:
1182 1183 1184
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
Linus Torvalds's avatar
Linus Torvalds committed
1185
	sock_release(sock);
1186 1187 1188 1189 1190
	return err;

out_release:
	rcu_read_unlock();
	goto out_sock_release;
Linus Torvalds's avatar
Linus Torvalds committed
1191
}
1192
EXPORT_SYMBOL(__sock_create);
Linus Torvalds's avatar
Linus Torvalds committed
1193 1194 1195

int sock_create(int family, int type, int protocol, struct socket **res)
{
1196
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1197
}
1198
EXPORT_SYMBOL(sock_create);
Linus Torvalds's avatar
Linus Torvalds committed
1199

1200
/*
 *	Create a socket in the namespace @net, with the kern argument of
 *	__sock_create() set to 1.  Returns what __sock_create() returns.
 */
int sock_create_kern(struct net *net, int family, int type, int protocol,
		     struct socket **res)
{
	const int kern = 1;

	return __sock_create(net, family, type, protocol, res, kern);
}
EXPORT_SYMBOL(sock_create_kern);
Linus Torvalds's avatar
Linus Torvalds committed
1205

1206
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)