tcp_ipv4.c 67.4 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
37
 *					request_sock handling and moved
Linus Torvalds's avatar
Linus Torvalds committed
38
39
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
Stephen Hemminger's avatar
Stephen Hemminger committed
40
 *					Added new listen semantics.
Linus Torvalds's avatar
Linus Torvalds committed
41
42
43
44
45
46
47
48
49
50
51
52
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

53
#define pr_fmt(fmt) "TCP: " fmt
Linus Torvalds's avatar
Linus Torvalds committed
54

Herbert Xu's avatar
Herbert Xu committed
55
#include <linux/bottom_half.h>
Linus Torvalds's avatar
Linus Torvalds committed
56
57
58
59
60
61
62
63
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
64
#include <linux/slab.h>
Linus Torvalds's avatar
Linus Torvalds committed
65

66
#include <net/net_namespace.h>
Linus Torvalds's avatar
Linus Torvalds committed
67
#include <net/icmp.h>
68
#include <net/inet_hashtables.h>
Linus Torvalds's avatar
Linus Torvalds committed
69
#include <net/tcp.h>
70
#include <net/transp_v6.h>
Linus Torvalds's avatar
Linus Torvalds committed
71
72
#include <net/ipv6.h>
#include <net/inet_common.h>
73
#include <net/timewait_sock.h>
Linus Torvalds's avatar
Linus Torvalds committed
74
#include <net/xfrm.h>
75
#include <net/netdma.h>
76
#include <net/secure_seq.h>
Glauber Costa's avatar
Glauber Costa committed
77
#include <net/tcp_memcontrol.h>
Linus Torvalds's avatar
Linus Torvalds committed
78
79
80
81
82
83
84

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

85
86
87
#include <linux/crypto.h>
#include <linux/scatterlist.h>

88
89
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
90
EXPORT_SYMBOL(sysctl_tcp_low_latency);
Linus Torvalds's avatar
Linus Torvalds committed
91
92


93
#ifdef CONFIG_TCP_MD5SIG
Eric Dumazet's avatar
Eric Dumazet committed
94
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96
97
#endif

98
struct inet_hashinfo tcp_hashinfo;
99
EXPORT_SYMBOL(tcp_hashinfo);
Linus Torvalds's avatar
Linus Torvalds committed
100

101
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
102
{
103
104
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
105
106
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
Linus Torvalds's avatar
Linus Torvalds committed
107
108
}

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* Decide whether a new connection attempt may reuse the 4-tuple currently
 * held by TIME_WAIT socket @sktw.  On success the new socket @sk inherits
 * the timewait socket's timestamp state and a reference on @sktw is taken
 * (caller is expected to kill the TW bucket); returns 1.  Returns 0 when
 * reuse is not safe.  @twp == NULL means the caller has already decided
 * to recycle (e.g. tw_recycle path) and only timestamp state is checked.
 */
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	/* Reuse requires a recorded timestamp, and (unless recycling) the
	 * tcp_tw_reuse sysctl plus the last timestamp being >1s old.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		/* Start far enough past the old connection's send space that
		 * the sequence ranges cannot overlap; 0 is reserved, so bump
		 * to 1 if the arithmetic happens to wrap to exactly 0.
		 */
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

Pavel Emelyanov's avatar
Pavel Emelyanov committed
141
142
143
144
145
146
147
148
/* TCP_REPAIR path of connect(): bring the socket straight to an
 * established-like state without sending a SYN.  tcp_connect_init()
 * must run first to set up sequence/window state that
 * tcp_finish_connect() then commits.  Always returns 0.
 */
static int tcp_repair_connect(struct sock *sk)
{
	tcp_connect_init(sk);
	tcp_finish_connect(sk, NULL);

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
149
150
151
/* This will initiate an outgoing connection.
 *
 * Validates @uaddr, resolves a route (honouring a source-route first hop
 * if IP options request it), binds a local port via inet_hash_connect(),
 * picks an initial sequence number and finally transmits the SYN through
 * tcp_connect() (or skips transmission in TCP_REPAIR mode).
 * Returns 0 on success or a negative errno; on failure the socket is
 * moved back to TCP_CLOSE with its destination port cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	/* Socket lock is held by the caller, which is what licenses this
	 * rcu_dereference_protected() access to inet_opt.
	 */
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		/* Strict/loose source routing: route to the first hop from
		 * the option list rather than to the final destination.
		 */
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* TCP never connects to multicast or broadcast addresses. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* Adopt the route-chosen source address unless the user bound one. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	/* tw_recycle: try to recover timestamp state from a previous
	 * incarnation of this 4-tuple so PAWS keeps working.
	 */
	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Re-validate the route now that the (possibly autoselected)
	 * source port is known.
	 */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	/* Seed the IP ID counter from the ISN and current jiffies. */
	inet->inet_id = tp->write_seq ^ jiffies;

	if (likely(!tp->repair))
		err = tcp_connect(sk);
	else
		err = tcp_repair_connect(sk);

	/* Route ownership passed to the socket above; don't release it. */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
Linus Torvalds's avatar
Linus Torvalds committed
276
277

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 *
 * Reads the new MTU from tp->mtu_info (stashed by tcp_v4_err()), updates
 * the cached route PMTU, and if the connection's MSS must shrink,
 * retransmits immediately ("fast path" MTU discovery).
 */
static void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	u32 mtu = tcp_sk(sk)->mtu_info;

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * send out by Linux are always <576bytes so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* No valid route anymore: nothing to update. */
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

320
321
322
323
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

324
	if (dst)
325
		dst->ops->redirect(dst, sk, skb);
326
327
}

Linus Torvalds's avatar
Linus Torvalds committed
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	/* @iph/@th point into the quoted original datagram carried in the
	 * ICMP payload, not at the ICMP packet's own headers.
	 */
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	/* Need at least the embedded IP header plus 8 bytes of TCP header. */
	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	/* NOTE(review): as written this only bumps LOCKDROPPEDICMPS when
	 * both type != DEST_UNREACH and code != FRAG_NEEDED hold; the stat
	 * is informational only — processing continues regardless.
	 */
	if (sock_owned_by_user(sk) &&
	    type != ICMP_DEST_UNREACH &&
	    code != ICMP_FRAG_NEEDED)
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	/* Generalized TTL security mechanism (RFC 5082) check. */
	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	/* Quoted sequence number must fall inside the unacked window,
	 * otherwise the ICMP refers to a segment we no longer care about.
	 */
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			tp->mtu_info = info;
			/* Handle now if we own the lock, else defer to
			 * tcp_release_cb() via the tsq_flags bit.
			 */
			if (!sock_owned_by_user(sk))
				tcp_v4_mtu_reduced(sk);
			else
				set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		/* Undo one step of exponential backoff and rearm (or fire)
		 * the retransmit timer with the recomputed RTO.
		 */
		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

542
543
/* Fill in the TCP checksum field of @skb for the given address pair.
 * With CHECKSUM_PARTIAL the pseudo-header sum is stored and the device
 * (or checksum helpers) finish the job using csum_start/csum_offset;
 * otherwise the full checksum is computed in software here.
 */
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

559
/* This routine computes an IPv4 TCP checksum. */
560
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
561
{
562
	const struct inet_sock *inet = inet_sk(sk);
563
564
565

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
566
EXPORT_SYMBOL(tcp_v4_send_check);
567

568
569
/* GSO path: prepare @skb for hardware/software segmentation by zeroing
 * the TCP checksum and storing the pseudo-header checksum offload state.
 * Returns 0 on success, -EINVAL if the TCP header cannot be pulled into
 * the linear area.
 */
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	/* Header pointers must be taken AFTER pskb_may_pull(), which may
	 * reallocate the skb head.
	 */
	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
585
586
587
588
589
590
591
592
593
594
595
596
597
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So that we build reply only basing on parameters
 *		arrived with segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

/* @sk may be NULL (no matching socket); the reply is built purely from
 * the offending segment in @skb.  With TCP MD5 enabled the RST is signed
 * when a key is known for the peer, and suppressed when the incoming
 * segment carried an MD5 option we cannot validate.
 */
static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* Only reply to segments that were actually addressed to us. */
	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	/* RFC 793: RST sequence rules — echo the ACK, or ACK the
	 * offending segment's span when it carried no ACK.
	 */
	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	hash_location = tcp_parse_md5sig_option(th);
	if (!sk && hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we are not loose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
					     &tcp_hashinfo, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			return;
		rcu_read_lock();
		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto release_sk1;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto release_sk1;
	} else {
		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					     &ip_hdr(skb)->saddr,
					     AF_INET) : NULL;
	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. using iif for oif to
	 * make sure we can deliver it
	 */
	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);

#ifdef CONFIG_TCP_MD5SIG
release_sk1:
	if (sk1) {
		rcu_read_unlock();
		sock_put(sk1);
	}
#endif
}

/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/* Build and send a bare ACK in reply to @skb without a full socket,
 * optionally carrying a timestamp option (@ts != 0) and/or an MD5
 * signature (@key != NULL).  Used for TIME_WAIT and SYN-RECV replies.
 */
static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Header plus worst-case option space: timestamp and (if
	 * configured) MD5, both pre-aligned to 4-byte words.
	 */
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		/* MD5 option goes after the timestamp option if present. */
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
			      ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
792
	struct inet_timewait_sock *tw = inet_twsk(sk);
793
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
Linus Torvalds's avatar
Linus Torvalds committed
794

795
	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
796
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
797
798
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
799
			tcp_twsk_md5_key(tcptw),
800
801
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
802
			);
Linus Torvalds's avatar
Linus Torvalds committed
803

804
	inet_twsk_put(tw);
Linus Torvalds's avatar
Linus Torvalds committed
805
806
}

807
/* ACK a segment on behalf of a connection still in SYN-RECV (request
 * sock only, no full socket yet): sequence numbers come from the
 * request, the MD5 key (if any) from the listener @sk.
 */
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 *
 *	@dst may be NULL, in which case a route is resolved here.
 *	Returns 0 (or a net_xmit code) on success, -1 on failure.
 *	NOTE(review): @nocache is not used in this function body —
 *	presumably consumed by callers/other variants; confirm.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp,
			      u16 queue_mapping,
			      bool nocache)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff * skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		/* Checksum against the addresses recorded in the request. */
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		skb_set_queue_mapping(skb, queue_mapping);
		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		/* Congestion-notification xmit codes count as success. */
		err = net_xmit_eval(err);
	}

	return err;
}

855
/* Retransmit a SYN-ACK for a still-pending connection request and
 * account it in the RETRANSSEGS SNMP counter.
 */
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			      struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	/* No cached route (dst = NULL), default queue mapping. */
	return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
}

Linus Torvalds's avatar
Linus Torvalds committed
862
/*
 *	IPv4 request_sock destructor.
 *	Frees the IP options copied from the incoming SYN (allocated in
 *	tcp_v4_save_options(); kfree() tolerates NULL).
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

870
/*
 * Return true if a syncookie should be sent
 *
 * Called when the listen queue is full on an incoming SYN.  Bumps the
 * matching SNMP counter and emits a one-time (per listen socket)
 * warning naming the flooded port.  @proto is a label for that message
 * (e.g. "TCP").
 */
bool tcp_syn_flood_action(struct sock *sk,
			 const struct sk_buff *skb,
			 const char *proto)
{
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct listen_sock *lopt;


#ifdef CONFIG_SYN_COOKIES
	/* The #ifdef deliberately pairs this "if" with the "else" below:
	 * without syncookie support only the DROP counter path remains.
	 */
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	/* Warn only once per listener; synflood_warned latches. */
	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);
Linus Torvalds's avatar
Linus Torvalds committed
901
902

/*
903
 * Save and compile IPv4 options into the request_sock if needed.
Linus Torvalds's avatar
Linus Torvalds committed
904
 */
905
906
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
907
{
908
909
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
910
911

	if (opt && opt->optlen) {
912
913
		int opt_size = sizeof(*dopt) + opt->optlen;

Linus Torvalds's avatar
Linus Torvalds committed
914
915
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
916
			if (ip_options_echo(&dopt->opt, skb)) {
Linus Torvalds's avatar
Linus Torvalds committed
917
918
919
920
921
922
923
924
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

925
926
927
928
929
930
931
932
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.
 * Walks the socket's RCU-protected MD5 key list for an entry matching
 * @addr/@family; returns it, or NULL if none (or no MD5 info at all).
 */
struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *pos;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       sock_owned_by_user(sk) ||
				       lockdep_is_held(&sk->sk_lock.slock));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	/* Compare only as many address bytes as the family uses. */
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
962
963
964
965

/* Look up sk's MD5 key for the peer of @addr_sk, keyed by that
 * socket's IPv4 destination address.
 */
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_md5_do_lookup(sk,
				 (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr,
				 AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

Adrian Bunk's avatar
Adrian Bunk committed
973
974
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
975
{
Eric Dumazet's avatar
Eric Dumazet committed
976
977
978
979
	union tcp_md5_addr *addr;

	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
980
981
982
}

/* This can be called on a newly created socket, from other files */
Eric Dumazet's avatar
Eric Dumazet committed
983
984
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
985
986
{
	/* Add Key to the list */
987
	struct tcp_md5sig_key *key;
988
	struct tcp_sock *tp = tcp_sk(sk);
Eric Dumazet's avatar
Eric Dumazet committed
989
	struct tcp_md5sig_info *md5sig;
990

Eric Dumazet's avatar
Eric Dumazet committed
991
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
992
993
	if (key) {
		/* Pre-existing entry - just update that one. */
Eric Dumazet's avatar
Eric Dumazet committed
994
		memcpy(key->key, newkey, newkeylen);
995
		key->keylen = newkeylen;
Eric Dumazet's avatar
Eric Dumazet committed
996
997
		return 0;
	}
998

999
1000
	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   sock_owned_by_user(sk));