/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The User Datagram Protocol (UDP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() calls
 *		Alan Cox	: 	stopped close while in use off icmp
 *					messages. Not a fix but a botch that
 *					for udp at least is 'valid'.
 *		Alan Cox	:	Fixed icmp handling properly
 *		Alan Cox	: 	Correct error for oversized datagrams
 *		Alan Cox	:	Tidied select() semantics.
 *		Alan Cox	:	udp_err() fixed properly, also now
 *					select and read wake correctly on errors
 *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
 *		Alan Cox	:	UDP can count its memory
 *		Alan Cox	:	send to an unknown connection causes
 *					an ECONNREFUSED off the icmp, but
 *					does NOT close.
 *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
 *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
 *					bug no longer crashes it.
 *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
 *		Alan Cox	:	Uses skb_free_datagram
 *		Alan Cox	:	Added get/set sockopt support.
 *		Alan Cox	:	Broadcasting without option set returns EACCES.
 *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
 *		Alan Cox	:	Use ip_tos and ip_ttl
 *		Alan Cox	:	SNMP Mibs
 *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
 *		Matt Dillon	:	UDP length checks.
 *		Alan Cox	:	Smarter af_inet used properly.
 *		Alan Cox	:	Use new kernel side addressing.
 *		Alan Cox	:	Incorrect return on truncated datagram receive.
 *	Arnt Gulbrandsen 	:	New udp_send and stuff
 *		Alan Cox	:	Cache last socket
 *		Alan Cox	:	Route cache
 *		Jon Peatfield	:	Minor efficiency fix to sendto().
 *		Mike Shaver	:	RFC1122 checks.
 *		Alan Cox	:	Nonblocking error fix.
 *	Willy Konynenberg	:	Transparent proxying support.
 *		Mike McLagan	:	Routing by source
 *		David S. Miller	:	New socket lookup architecture.
 *					Last socket cache retained as it
 *					does have a high hit rate.
 *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
 *		Andi Kleen	:	Some cleanups, cache destination entry
 *					for connect.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
 *					return ENOTCONN for unconnected sockets (POSIX)
 *		Janos Farkas	:	don't deliver multi/broadcasts to a different
 *					bound-to-device socket
 *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
 *					datagrams.
 *	Hirokazu Takahashi	:	sendfile() on UDP works now.
 *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
 *	James Chapman		:	Add L2TP encapsulation type.
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

/* Format wrapper: expands a message format to "UDP: " followed by it. */
#define pr_fmt(fmt) "UDP: " fmt

#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <linux/bootmem.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"

struct udp_table udp_table __read_mostly;
119
EXPORT_SYMBOL(udp_table);
Linus Torvalds's avatar
Linus Torvalds committed
120

Eric Dumazet's avatar
Eric Dumazet committed
121
long sysctl_udp_mem[3] __read_mostly;
Hideo Aoki's avatar
Hideo Aoki committed
122
EXPORT_SYMBOL(sysctl_udp_mem);
Eric Dumazet's avatar
Eric Dumazet committed
123 124

int sysctl_udp_rmem_min __read_mostly;
Hideo Aoki's avatar
Hideo Aoki committed
125
EXPORT_SYMBOL(sysctl_udp_rmem_min);
Eric Dumazet's avatar
Eric Dumazet committed
126 127

int sysctl_udp_wmem_min __read_mostly;
Hideo Aoki's avatar
Hideo Aoki committed
128 129
EXPORT_SYMBOL(sysctl_udp_wmem_min);

Eric Dumazet's avatar
Eric Dumazet committed
130
atomic_long_t udp_memory_allocated;
Hideo Aoki's avatar
Hideo Aoki committed
131 132
EXPORT_SYMBOL(udp_memory_allocated);

133 134
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
135

136
/*
 * Walk one primary hash chain looking for sockets that conflict with @sk.
 *
 * A chained socket sk2 conflicts when all of the following hold:
 *  - it lives in @net and is not @sk itself;
 *  - at least one of the two sockets does not have sk_reuse set;
 *  - the bound devices overlap (either is unbound, or both are equal);
 *  - they are not both sk_reuseport sockets owned by the same uid;
 *  - @saddr_comp reports that their local addresses collide.
 *
 * With @bitmap == NULL only sockets hashed on port @num are considered and
 * 1 is returned on the first conflict.  With a bitmap, the port test is
 * skipped, every conflicting socket sets bit (udp_port_hash >> @log) and
 * the function always returns 0.
 */
static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct udp_hslot *hslot,
			       unsigned long *bitmap,
			       struct sock *sk,
			       int (*saddr_comp)(const struct sock *sk1,
						 const struct sock *sk2),
			       unsigned int log)
{
	struct sock *sk2;
	struct hlist_nulls_node *node;
	kuid_t uid = sock_i_uid(sk);

	sk_nulls_for_each(sk2, node, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
		     !uid_eq(uid, sock_i_uid(sk2))) &&
		    saddr_comp(sk, sk2)) {
			if (!bitmap)
				return 1;
			__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
		}
	}
	return 0;
}

/*
 * Secondary-chain variant of udp_lib_lport_inuse(): applies the same
 * conflict test to the sockets on @hslot2 that are hashed on port @num,
 * holding @hslot2->lock for the duration of the walk.
 *
 * Returns 1 if a conflicting socket was found, 0 otherwise.
 *
 * Note: we still hold spinlock of primary hash chain, so no other writer
 * can insert/delete a socket with local_port == num
 */
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
				struct udp_hslot *hslot2,
				struct sock *sk,
				int (*saddr_comp)(const struct sock *sk1,
						  const struct sock *sk2))
{
	struct sock *sk2;
	struct hlist_nulls_node *node;
	kuid_t uid = sock_i_uid(sk);
	int res = 0;

	spin_lock(&hslot2->lock);
	udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
		     !uid_eq(uid, sock_i_uid(sk2))) &&
		    saddr_comp(sk, sk2)) {
			res = 1;
			break;
		}
	}
	spin_unlock(&hslot2->lock);
	return res;
}

/**
201
 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
202 203 204
 *
 *  @sk:          socket struct in question
 *  @snum:        port number to look up
205
 *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
Lucas De Marchi's avatar
Lucas De Marchi committed
206
 *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
Eric Dumazet's avatar
Eric Dumazet committed
207
 *                   with NULL address
208
 */
209
int udp_lib_get_port(struct sock *sk, unsigned short snum,
210 211
		     int (*saddr_comp)(const struct sock *sk1,
				       const struct sock *sk2),
Eric Dumazet's avatar
Eric Dumazet committed
212
		     unsigned int hash2_nulladdr)
213
{
214
	struct udp_hslot *hslot, *hslot2;
215
	struct udp_table *udptable = sk->sk_prot->h.udp_table;
216
	int    error = 1;
217
	struct net *net = sock_net(sk);
Linus Torvalds's avatar
Linus Torvalds committed
218

219
	if (!snum) {
Eric Dumazet's avatar
Eric Dumazet committed
220
		int low, high, remaining;
221
		unsigned int rand;
222 223
		unsigned short first, last;
		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
224

225
		inet_get_local_port_range(net, &low, &high);
226
		remaining = (high - low) + 1;
227

228
		rand = prandom_u32();
229
		first = reciprocal_scale(rand, remaining) + low;
230 231 232
		/*
		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
		 */
233
		rand = (rand | 1) * (udptable->mask + 1);
Eric Dumazet's avatar
Eric Dumazet committed
234 235
		last = first + udptable->mask + 1;
		do {
236
			hslot = udp_hashslot(udptable, net, first);
237
			bitmap_zero(bitmap, PORTS_PER_CHAIN);
238
			spin_lock_bh(&hslot->lock);
239
			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
240
					    saddr_comp, udptable->log);
241 242 243 244 245 246 247

			snum = first;
			/*
			 * Iterate on all possible values of snum for this hash.
			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
			 * give us randomization and full range coverage.
			 */
Eric Dumazet's avatar
Eric Dumazet committed
248
			do {
249
				if (low <= snum && snum <= high &&
250
				    !test_bit(snum >> udptable->log, bitmap) &&
251
				    !inet_is_local_reserved_port(net, snum))
252 253 254 255
					goto found;
				snum += rand;
			} while (snum != first);
			spin_unlock_bh(&hslot->lock);
Eric Dumazet's avatar
Eric Dumazet committed
256
		} while (++first != last);
257
		goto fail;
258
	} else {
259
		hslot = udp_hashslot(udptable, net, snum);
260
		spin_lock_bh(&hslot->lock);
Eric Dumazet's avatar
Eric Dumazet committed
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
		if (hslot->count > 10) {
			int exist;
			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;

			slot2          &= udptable->mask;
			hash2_nulladdr &= udptable->mask;

			hslot2 = udp_hashslot2(udptable, slot2);
			if (hslot->count < hslot2->count)
				goto scan_primary_hash;

			exist = udp_lib_lport_inuse2(net, snum, hslot2,
						     sk, saddr_comp);
			if (!exist && (hash2_nulladdr != slot2)) {
				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
				exist = udp_lib_lport_inuse2(net, snum, hslot2,
							     sk, saddr_comp);
			}
			if (exist)
				goto fail_unlock;
			else
				goto found;
		}
scan_primary_hash:
285 286
		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
					saddr_comp, 0))
287 288
			goto fail_unlock;
	}
289
found:
290
	inet_sk(sk)->inet_num = snum;
291 292
	udp_sk(sk)->udp_port_hash = snum;
	udp_sk(sk)->udp_portaddr_hash ^= snum;
Linus Torvalds's avatar
Linus Torvalds committed
293
	if (sk_unhashed(sk)) {
294
		sk_nulls_add_node_rcu(sk, &hslot->head);
295
		hslot->count++;
296
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
297 298 299 300 301 302 303

		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
		spin_lock(&hslot2->lock);
		hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
					 &hslot2->head);
		hslot2->count++;
		spin_unlock(&hslot2->lock);
Linus Torvalds's avatar
Linus Torvalds committed
304
	}
305
	error = 0;
306 307
fail_unlock:
	spin_unlock_bh(&hslot->lock);
Linus Torvalds's avatar
Linus Torvalds committed
308
fail:
309 310
	return error;
}
Eric Dumazet's avatar
Eric Dumazet committed
311
EXPORT_SYMBOL(udp_lib_get_port);
312

313
static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
314 315 316
{
	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);

Eric Dumazet's avatar
Eric Dumazet committed
317
	return 	(!ipv6_only_sock(sk2)  &&
318 319
		 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
		   inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
320 321
}

322 323
/*
 * Secondary-table hash: jhash of @saddr seeded with the netns hash mix,
 * XORed with @port.  Because the port is XORed in last, a caller may hash
 * with port 0 first and XOR the real port in afterwards.
 */
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
			      unsigned int port)
{
	return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
}

int udp_v4_get_port(struct sock *sk, unsigned short snum)
329
{
Eric Dumazet's avatar
Eric Dumazet committed
330
	unsigned int hash2_nulladdr =
331
		udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
Eric Dumazet's avatar
Eric Dumazet committed
332 333 334
	unsigned int hash2_partial =
		udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);

335
	/* precompute partial secondary hash */
Eric Dumazet's avatar
Eric Dumazet committed
336 337
	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
	return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
338 339
}

340 341 342
/*
 * Rate how well @sk matches an incoming datagram on the primary chain.
 *
 * Returns -1 when @sk cannot receive the datagram (other netns, other
 * port hash, IPv6-only socket, or any bound field that disagrees).
 * Otherwise the score starts at 2 for PF_INET sockets (1 for others),
 * gains 4 for each of: bound local address, connected remote address,
 * connected remote port, bound device; and gains 1 when the socket's
 * sk_incoming_cpu is the current CPU.
 *
 * @dport is not referenced in the body; the local port is matched
 * through @hnum.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				__be32 saddr, unsigned short hnum, __be16 sport,
				__be32 daddr, __be16 dport, int dif)
{
	int score;
	struct inet_sock *inet;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    ipv6_only_sock(sk))
		return -1;

	score = (sk->sk_family == PF_INET) ? 2 : 1;
	inet = inet_sk(sk);

	if (inet->inet_rcv_saddr) {
		if (inet->inet_rcv_saddr != daddr)
			return -1;
		score += 4;
	}

	if (inet->inet_daddr) {
		if (inet->inet_daddr != saddr)
			return -1;
		score += 4;
	}

	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		score += 4;
	}

	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		score += 4;
	}
	if (sk->sk_incoming_cpu == raw_smp_processor_id())
		score++;
	return score;
}

/*
 * In this second variant, we check (daddr, dport) matches
 * (inet_rcv_saddr, inet_num) exactly, so a socket with no bound receive
 * address only matches when @daddr is itself zero.
 *
 * Returns -1 on mismatch.  Otherwise the score starts at 2 for PF_INET
 * sockets (1 for others), gains 4 for each of: connected remote address,
 * connected remote port, bound device; and gains 1 when the socket's
 * sk_incoming_cpu is the current CPU.
 */
static inline int compute_score2(struct sock *sk, struct net *net,
				 __be32 saddr, __be16 sport,
				 __be32 daddr, unsigned int hnum, int dif)
{
	int score;
	struct inet_sock *inet;

	if (!net_eq(sock_net(sk), net) ||
	    ipv6_only_sock(sk))
		return -1;

	inet = inet_sk(sk);

	if (inet->inet_rcv_saddr != daddr ||
	    inet->inet_num != hnum)
		return -1;

	score = (sk->sk_family == PF_INET) ? 2 : 1;

	if (inet->inet_daddr) {
		if (inet->inet_daddr != saddr)
			return -1;
		score += 4;
	}

	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		score += 4;
	}

	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		score += 4;
	}

	if (sk->sk_incoming_cpu == raw_smp_processor_id())
		score++;

	return score;
}

static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
		       const __u16 lport, const __be32 faddr,
		       const __be16 fport)
432
{
433 434 435 436
	static u32 udp_ehash_secret __read_mostly;

	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));

437
	return __inet_ehashfn(laddr, lport, faddr, fport,
438
			      udp_ehash_secret + net_hash_mix(net));
439 440
}

441 442 443 444 445 446 447 448
/*
 * Find the best-scoring socket (per compute_score2()) on one secondary
 * hash chain.  Among equally scored sockets with sk_reuseport set, one is
 * chosen pseudo-randomly from a hash of the datagram's address/port tuple.
 *
 * Returns the socket with a reference taken, or NULL if none matched.
 *
 * called with rcu_read_lock()
 */
static struct sock *udp4_lib_lookup2(struct net *net,
		__be32 saddr, __be16 sport,
		__be32 daddr, unsigned int hnum, int dif,
		struct udp_hslot *hslot2, unsigned int slot2)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	int score, badness, matches = 0, reuseport = 0;
	u32 hash = 0;

begin:
	result = NULL;
	badness = 0;
	udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
		score = compute_score2(sk, net, saddr, sport,
				      daddr, hnum, dif);
		if (score > badness) {
			result = sk;
			badness = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				hash = udp_ehashfn(net, daddr, hnum,
						   saddr, sport);
				matches = 1;
			}
		} else if (score == badness && reuseport) {
			/* k-th tie replaces the pick when the scaled hash is 0 */
			matches++;
			if (reciprocal_scale(hash, matches) == 0)
				result = sk;
			hash = next_pseudo_random32(hash);
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot2)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
			result = NULL;
		/* re-score now that a reference is held; retry if it got worse */
		else if (unlikely(compute_score2(result, net, saddr, sport,
				  daddr, hnum, dif) < badness)) {
			sock_put(result);
			goto begin;
		}
	}
	return result;
}

/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 */
496
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
497
		__be16 sport, __be32 daddr, __be16 dport,
498
		int dif, struct udp_table *udptable)
499
{
500
	struct sock *sk, *result;
501
	struct hlist_nulls_node *node;
502
	unsigned short hnum = ntohs(dport);
503 504
	unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
	struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
505 506
	int score, badness, matches = 0, reuseport = 0;
	u32 hash = 0;
507

508
	rcu_read_lock();
509 510 511 512 513 514 515 516 517 518 519
	if (hslot->count > 10) {
		hash2 = udp4_portaddr_hash(net, daddr, hnum);
		slot2 = hash2 & udptable->mask;
		hslot2 = &udptable->hash2[slot2];
		if (hslot->count < hslot2->count)
			goto begin;

		result = udp4_lib_lookup2(net, saddr, sport,
					  daddr, hnum, dif,
					  hslot2, slot2);
		if (!result) {
520
			hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
521 522 523 524 525
			slot2 = hash2 & udptable->mask;
			hslot2 = &udptable->hash2[slot2];
			if (hslot->count < hslot2->count)
				goto begin;

526
			result = udp4_lib_lookup2(net, saddr, sport,
527
						  htonl(INADDR_ANY), hnum, dif,
528 529 530 531 532
						  hslot2, slot2);
		}
		rcu_read_unlock();
		return result;
	}
533 534
begin:
	result = NULL;
535
	badness = 0;
536
	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
537 538 539 540 541
		score = compute_score(sk, net, saddr, hnum, sport,
				      daddr, dport, dif);
		if (score > badness) {
			result = sk;
			badness = score;
542 543
			reuseport = sk->sk_reuseport;
			if (reuseport) {
544 545
				hash = udp_ehashfn(net, daddr, hnum,
						   saddr, sport);
546 547 548 549
				matches = 1;
			}
		} else if (score == badness && reuseport) {
			matches++;
550
			if (reciprocal_scale(hash, matches) == 0)
551 552
				result = sk;
			hash = next_pseudo_random32(hash);
553 554
		}
	}
555 556 557 558 559
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
560
	if (get_nulls_value(node) != slot)
561 562
		goto begin;

563
	if (result) {
564
		if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
565 566 567 568 569 570 571 572
			result = NULL;
		else if (unlikely(compute_score(result, net, saddr, hnum, sport,
				  daddr, dport, dif) < badness)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
573 574
	return result;
}
575
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
576

577 578
/*
 * Convenience wrapper around __udp4_lib_lookup() for a received @skb:
 * addresses come from the skb's IP header, the netns from the device of
 * the skb's dst entry, and the interface index from inet_iif().
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);

	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 udptable);
}

/*
 * Exported lookup entry point: __udp4_lib_lookup() on the global
 * udp_table.  As there, a non-NULL result carries a reference.
 */
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);

/*
 * Decide whether @sk should get a copy of a multicast/broadcast datagram
 * addressed to (@loc_addr, port hash @hnum) from (@rmt_addr, @rmt_port)
 * that arrived on interface @dif.
 *
 * Every bound or connected field of the socket must agree with the
 * datagram, the socket must not be IPv6-only, and the multicast source
 * filter (ip_mc_sf_allow()) must accept it.  @loc_port is not consulted;
 * the local port is matched through @hnum.
 */
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
				       __be16 loc_port, __be32 loc_addr,
				       __be16 rmt_port, __be32 rmt_addr,
				       int dif, unsigned short hnum)
{
	struct inet_sock *isk = inet_sk(sk);

	if (!net_eq(sock_net(sk), net))
		return false;
	if (udp_sk(sk)->udp_port_hash != hnum)
		return false;

	/* connected sockets only accept their peer */
	if (isk->inet_daddr && isk->inet_daddr != rmt_addr)
		return false;
	if (isk->inet_dport != rmt_port && isk->inet_dport)
		return false;

	/* bound sockets only accept their own local address */
	if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != loc_addr)
		return false;
	if (ipv6_only_sock(sk))
		return false;
	if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
		return false;

	return ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif) != 0;
}

615 616 617 618 619 620 621 622 623 624 625
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.
 * Header points to the ip header of the error packet. We move
 * on past this. Then (as it used to claim before adjustment)
 * header points to the first 8 bytes of the udp header.  We need
 * to find the appropriate port.
 */

626
void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
627 628
{
	struct inet_sock *inet;
629
	const struct iphdr *iph = (const struct iphdr *)skb->data;
Eric Dumazet's avatar
Eric Dumazet committed
630
	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
631 632 633 634 635
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct sock *sk;
	int harderr;
	int err;
636
	struct net *net = dev_net(skb->dev);
637

638
	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
639
			iph->saddr, uh->source, skb->dev->ifindex, udptable);
640
	if (!sk) {
641
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
		return;	/* No socket for error */
	}

	err = 0;
	harderr = 0;
	inet = inet_sk(sk);

	switch (type) {
	default:
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	case ICMP_SOURCE_QUENCH:
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		harderr = 1;
		break;
	case ICMP_DEST_UNREACH:
		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
662
			ipv4_sk_update_pmtu(skb, sk, info);
663 664 665 666 667 668 669 670 671 672 673 674 675
			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
				err = EMSGSIZE;
				harderr = 1;
				break;
			}
			goto out;
		}
		err = EHOSTUNREACH;
		if (code <= NR_ICMP_UNREACH) {
			harderr = icmp_err_convert[code].fatal;
			err = icmp_err_convert[code].errno;
		}
		break;
676 677
	case ICMP_REDIRECT:
		ipv4_sk_redirect(skb, sk);
678
		goto out;
679 680 681 682 683 684 685 686 687
	}

	/*
	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
	 *	4.1.3.3.
	 */
	if (!inet->recverr) {
		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
			goto out;
688
	} else
Eric Dumazet's avatar
Eric Dumazet committed
689
		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
690

691 692 693 694 695 696 697 698
	sk->sk_err = err;
	sk->sk_error_report(sk);
out:
	sock_put(sk);
}

/* ICMP error entry point for plain UDP: __udp4_lib_err() on udp_table. */
void udp_err(struct sk_buff *skb, u32 info)
{
	__udp4_lib_err(skb, info, &udp_table);
}

/*
 * Throw away all pending data and cancel the corking. Socket is locked.
 */
705
void udp_flush_pending_frames(struct sock *sk)
706 707 708 709 710 711 712 713 714
{
	struct udp_sock *up = udp_sk(sk);

	if (up->pending) {
		up->len = 0;
		up->pending = 0;
		ip_flush_pending_frames(sk);
	}
}
715
EXPORT_SYMBOL(udp_flush_pending_frames);
716 717

/**
Herbert Xu's avatar
Herbert Xu committed
718
 * 	udp4_hwcsum  -  handle outgoing HW checksumming
719 720
 * 	@skb: 	sk_buff containing the filled-in UDP header
 * 	        (checksum field must be zeroed out)
Herbert Xu's avatar
Herbert Xu committed
721 722
 *	@src:	source IP address
 *	@dst:	destination IP address
723
 */
724
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
725 726
{
	struct udphdr *uh = udp_hdr(skb);
Herbert Xu's avatar
Herbert Xu committed
727 728 729
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
	int hlen = len;
730 731
	__wsum csum = 0;

732
	if (!skb_has_frag_list(skb)) {
733 734 735 736 737
		/*
		 * Only one fragment on the socket.
		 */
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
Herbert Xu's avatar
Herbert Xu committed
738 739
		uh->check = ~csum_tcpudp_magic(src, dst, len,
					       IPPROTO_UDP, 0);
740
	} else {
741 742
		struct sk_buff *frags;

743 744 745 746 747
		/*
		 * HW-checksum won't work as there are two or more
		 * fragments on the socket so that all csums of sk_buffs
		 * should be together
		 */
748
		skb_walk_frags(skb, frags) {
Herbert Xu's avatar
Herbert Xu committed
749 750
			csum = csum_add(csum, frags->csum);
			hlen -= frags->len;
751
		}
752

Herbert Xu's avatar
Herbert Xu committed
753
		csum = skb_checksum(skb, offset, hlen, csum);
754 755 756 757 758 759 760
		skb->ip_summed = CHECKSUM_NONE;

		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	}
}
761
EXPORT_SYMBOL_GPL(udp4_hwcsum);
762

763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
/* Fill in the checksum field of an IPv4 UDP header; meant for simple
 * callers such as UDP tunnels.
 *
 * Four cases, tried in order:
 *  - @nocheck: the field is zeroed;
 *  - GSO skb: only the inverted pseudo-header sum is stored;
 *  - the dst device advertises NETIF_F_V4_CSUM: the skb is set up for
 *    CHECKSUM_PARTIAL with the inverted pseudo-header sum;
 *  - otherwise the checksum is computed in software and the skb is marked
 *    CHECKSUM_UNNECESSARY.
 *
 * The last two cases BUG if the skb is already CHECKSUM_PARTIAL.
 */
void udp_set_csum(bool nocheck, struct sk_buff *skb,
		  __be32 saddr, __be32 daddr, int len)
{
	struct udphdr *hdr = udp_hdr(skb);
	__wsum sum;

	if (nocheck) {
		hdr->check = 0;
		return;
	}

	if (skb_is_gso(skb)) {
		hdr->check = ~udp_v4_check(len, saddr, daddr, 0);
		return;
	}

	if (skb_dst(skb) && skb_dst(skb)->dev &&
	    (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
		BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
		hdr->check = ~udp_v4_check(len, saddr, daddr, 0);
		return;
	}

	BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);

	/* the field must read as zero while the sum is taken */
	hdr->check = 0;
	sum = skb_checksum(skb, 0, len, 0);
	hdr->check = udp_v4_check(len, saddr, daddr, sum);
	if (!hdr->check)
		hdr->check = CSUM_MANGLED_0;

	skb->ip_summed = CHECKSUM_UNNECESSARY;
}
EXPORT_SYMBOL(udp_set_csum);

800
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
801
{
Herbert Xu's avatar
Herbert Xu committed
802
	struct sock *sk = skb->sk;
803 804 805 806
	struct inet_sock *inet = inet_sk(sk);
	struct udphdr *uh;
	int err = 0;
	int is_udplite = IS_UDPLITE(sk);
Herbert Xu's avatar
Herbert Xu committed
807 808
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
809 810 811 812 813 814
	__wsum csum = 0;

	/*
	 * Create a UDP header
	 */
	uh = udp_hdr(skb);
Herbert Xu's avatar
Herbert Xu committed
815
	uh->source = inet->inet_sport;
816
	uh->dest = fl4->fl4_dport;
Herbert Xu's avatar
Herbert Xu committed
817
	uh->len = htons(len);
818 819 820
	uh->check = 0;

	if (is_udplite)  				 /*     UDP-Lite      */
Herbert Xu's avatar
Herbert Xu committed
821
		csum = udplite_csum(skb);
822

823
	else if (sk->sk_no_check_tx) {   /* UDP csum disabled */
824 825 826 827 828 829

		skb->ip_summed = CHECKSUM_NONE;
		goto send;

	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */

830
		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
831 832
		goto send;

Herbert Xu's avatar
Herbert Xu committed
833 834
	} else
		csum = udp_csum(skb);
835 836

	/* add protocol-dependent pseudo-header */
837
	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
Eric Dumazet's avatar
Eric Dumazet committed
838
				      sk->sk_protocol, csum);
839 840 841 842
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;

send:
Eric Dumazet's avatar
Eric Dumazet committed
843
	err = ip_send_skb(sock_net(sk), skb);
Eric Dumazet's avatar
Eric Dumazet committed
844 845 846 847 848 849 850 851 852
	if (err) {
		if (err == -ENOBUFS && !inet->recverr) {
			UDP_INC_STATS_USER(sock_net(sk),
					   UDP_MIB_SNDBUFERRORS, is_udplite);
			err = 0;
		}
	} else
		UDP_INC_STATS_USER(sock_net(sk),
				   UDP_MIB_OUTDATAGRAMS, is_udplite);
Herbert Xu's avatar
Herbert Xu committed
853 854 855 856 857 858
	return err;
}

/*
 * Push out all pending data as one UDP datagram. Socket is locked.
 */
859
int udp_push_pending_frames(struct sock *sk)
Herbert Xu's avatar
Herbert Xu committed
860 861 862
{
	struct udp_sock  *up = udp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
David S. Miller's avatar
David S. Miller committed
863
	struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
Herbert Xu's avatar
Herbert Xu committed
864 865 866
	struct sk_buff *skb;
	int err = 0;

867
	skb = ip_finish_skb(sk, fl4);
Herbert Xu's avatar
Herbert Xu committed
868 869 870
	if (!skb)
		goto out;

871
	err = udp_send_skb(skb, fl4);
Herbert Xu's avatar
Herbert Xu committed
872

873 874 875 876 877
out:
	up->len = 0;
	up->pending = 0;
	return err;
}
878
EXPORT_SYMBOL(udp_push_pending_frames);
879

880
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
881 882 883
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
884
	struct flowi4 fl4_stack;
David S. Miller's avatar
David S. Miller committed
885
	struct flowi4 *fl4;
886 887 888 889 890 891 892 893 894 895 896
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8  tos;
	int err, is_udplite = IS_UDPLITE(sk);
	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
Herbert Xu's avatar
Herbert Xu committed
897
	struct sk_buff *skb;
898
	struct ip_options_data opt_copy;
899 900 901 902 903 904 905 906

	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 */

Eric Dumazet's avatar
Eric Dumazet committed
907
	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
908 909 910
		return -EOPNOTSUPP;

	ipc.opt = NULL;
911
	ipc.tx_flags = 0;
912 913
	ipc.ttl = 0;
	ipc.tos = -1;
914

Herbert Xu's avatar
Herbert Xu committed
915 916
	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

917
	fl4 = &inet->cork.fl.u.ip4;
918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			goto do_append_data;
		}
		release_sock(sk);
	}
	ulen += sizeof(struct udphdr);

	/*
	 *	Get and verify the address.
	 */
	if (msg->msg_name) {
939
		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
940 941 942 943 944 945 946 947 948 949 950 951 952 953
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET) {
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT;
		}

		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
		if (dport == 0)
			return -EINVAL;
	} else {
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
954 955
		daddr = inet->inet_daddr;
		dport = inet->inet_dport;
956 957 958 959 960
		/* Open fast path for connected socket.
		   Route will not be used, if at least one option is set.
		 */
		connected = 1;
	}
961
	ipc.addr = inet->inet_saddr;
962 963

	ipc.oif = sk->sk_bound_dev_if;
964 965