/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
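
/* A note on the two masks above: TCP_HP_BITS clears the reserved bits and
 * PSH from the flag word, so header prediction can compare an incoming
 * segment's flags+window word against the precomputed tp->pred_flags with
 * a single equality test (PSH legitimately varies between otherwise
 * identical bulk-data segments).  Any TCP_REMNANT bit (FIN, URG, SYN or
 * PSH) marks a segment that may not be a plain full-sized data segment;
 * tcp_measure_rcv_mss() below uses that as a hint.
 */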

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = len;
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that SACK blocks are variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows
		     * us to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
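
/* For example, with rcv_wnd = 65535 and rcv_mss = 1460 (illustrative
 * numbers), the quota above is 65535 / (2 * 1460) = 22 segments, which
 * min() then caps at TCP_MAX_QUICKACKS: roughly half a window of data
 * may be acked segment-by-segment before we fall back to delayed ACKs.
 */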

void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static inline int tcp_in_quickack_mode(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}

static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr)
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (tp->ecn_flags & TCP_ECN_OK) {
		if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		/* Funny extension: if ECT is not set on a segment,
		 * it is surely a retransmit. It is not in the ECN RFC,
		 * but Linux follows this rule. */
		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
			tcp_enter_quickack_mode((struct sock *)tp);
	}
}

static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return 1;
	return 0;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_fixup_sndbuf(struct sock *sk)
{
	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
		     sizeof(struct sk_buff);

	if (sk->sk_sndbuf < 3 * sndmem)
		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
}

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All of tcp_full_space() is split into two parts: the "network" buffer,
 * allocated forward and advertised in the receiver window (tp->rcv_wnd),
 * and the "application buffer", required to isolate scheduling/application
 * latencies from the network.
 * window_clamp is the maximal advertised window. It can be less than
 * tcp_full_space(), in which case tcp_full_space() - window_clamp
 * is reserved for the "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from the viewpoint of the network, but the
 * lower the throughput and the higher the sensitivity of the connection
 * to losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used at the "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at the sender, even when the application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of the receive queue because of misprediction
 *   of the receiver window. Check #2.
 *
 * The scheme does not work when the sender sends good segments opening
 * the window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(skb->truesize) >> 1;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_memory_pressure) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
					       tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}
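
/* A sketch of how the two checks interact (illustrative numbers): a
 * 1460-byte payload usually arrives in an skb whose truesize is about
 * 2 KB.  If tcp_win_from_space() of that truesize does not exceed
 * skb->len, the overhead ratio is acceptable and rcv_ssthresh may grow
 * by a full 2 * advmss at once; otherwise __tcp_grow_window() retries
 * the same test at successively halved scales, granting 2 * rcv_mss if
 * the skb passes at the scale where rcv_ssthresh currently sits, and
 * nothing at all if it never does.
 */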

/* 3. Tuning rcvbuf, when connection enters established state. */

static void tcp_fixup_rcvbuf(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);

	/* Try to select rcvbuf so that 4 mss-sized segments
	 * will fit to window and corresponding skbs will fit to our rcvbuf.
	 * (was 3; 4 is minimum to allow fast retransmit to work.)
	 */
	while (tcp_win_from_space(rcvmem) < tp->advmss)
		rcvmem += 128;
	if (sk->sk_rcvbuf < 4 * rcvmem)
		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_fixup_sndbuf(sk);

	tp->rcvq_space.space = tp->rcv_wnd;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}
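
/* Worked example with the defaults (illustrative numbers): advmss = 1460,
 * sysctl_tcp_app_win = 31 and maxwin = tcp_full_space(sk) = 87380.  Since
 * maxwin >> 31 is 0, the first clamp leaves window_clamp at 87380; the
 * "force reservation" test then fires (87380 + 1460 > 87380) and sets
 * window_clamp = max(2 * 1460, 87380 - 1460) = 85920, so one MSS is
 * always held back as application buffer.
 */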

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_memory_pressure &&
	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACKing less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MIN_RCVMSS);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
 *
 * More detail on this code can be found at
 * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else if (m < new_sample)
			new_sample = m << 3;
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	if (tp->rcv_rtt_est.rtt != new_sample)
		tp->rcv_rtt_est.rtt = new_sample;
}
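/* Example of the fixed-point update above (illustrative numbers):
 * rcv_rtt_est.rtt stores the estimate left-shifted by 3.  With win_dep
 * clear and a current estimate of 100 jiffies (stored as 800), a sample
 * m = 120 becomes m -= 800 >> 3, i.e. 20, giving new_sample = 820: an
 * EWMA with gain 1/8 once the shifts cancel.  With win_dep set, the
 * shifted sample simply replaces the estimate whenever m < new_sample;
 * no smoothing is applied, as the comment above explains.
 */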

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int space;

	if (tp->rcvq_space.time == 0)
		goto new_measure;

	time = tcp_time_stamp - tp->rcvq_space.time;
	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
		return;

	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);

	space = max(tp->rcvq_space.space, space);

	if (tp->rcvq_space.space != space) {
		int rcvmem;

		tp->rcvq_space.space = space;

		if (sysctl_tcp_moderate_rcvbuf &&
		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
			int new_clamp = space;

			/* Receive space grows, normalize in order to
			 * take into account packet headers and sk_buff
			 * structure overhead.
			 */
			space /= tp->advmss;
			if (!space)
				space = 1;
			rcvmem = (tp->advmss + MAX_TCP_HEADER +
				  16 + sizeof(struct sk_buff));
			while (tcp_win_from_space(rcvmem) < tp->advmss)
				rcvmem += 128;
			space *= rcvmem;
			space = min(space, sysctl_tcp_rmem[2]);
			if (space > sk->sk_rcvbuf) {
				sk->sk_rcvbuf = space;

				/* Make the window clamp follow along.  */
				tp->window_clamp = new_clamp;
			}
		}
	}

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tcp_time_stamp;
}
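
/* A sketch of the sizing math above (illustrative numbers): if the
 * application copied 256 KB from the socket during the last RTT, then
 * space = 2 * 262144 = 524288 bytes.  That is converted to a segment
 * count (space /= advmss), each segment is charged rcvmem bytes (advmss
 * plus header and sk_buff overhead, rounded up until
 * tcp_win_from_space(rcvmem) covers advmss), and the product, capped by
 * sysctl_tcp_rmem[2], becomes the new sk_rcvbuf and window clamp.
 */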

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs, the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_time_stamp;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	TCP_ECN_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}
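
/* ATO update example, assuming HZ = 1000 so that TCP_ATO_MIN is 40
 * jiffies (illustrative numbers): with ato = 40 and an inter-arrival gap
 * m = 30, the middle branch above yields ato = 40/2 + 30 = 50, i.e. ato
 * relaxes toward roughly twice the observed packet spacing, bounded
 * above by icsk_rto, while near back-to-back arrivals decay it back
 * toward TCP_ATO_MIN via the first branch.
 */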

static u32 tcp_rto_min(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	u32 rto_min = TCP_RTO_MIN;

	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
		rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
	return rto_min;
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt; /* RTT */

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	In a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
	 * does not matter how to _calculate_ it. Seems, it was a trap
	 * that VJ failed to avoid. 8)
	 */
	if (m == 0)
		m = 1;
	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev >> 2);   /* similar update on mdev */
		}
		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev > tp->mdev_max) {
			tp->mdev_max = tp->mdev;
			if (tp->mdev_max > tp->rttvar)
				tp->rttvar = tp->mdev_max;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max < tp->rttvar)
				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max = tcp_rto_min(sk);
		}
	} else {
		/* no previous measure. */
		tp->srtt = m << 3;	/* take the measured time to be rtt */
		tp->mdev = m << 1;	/* make sure rto = 3*rtt */
		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
		tp->rtt_seq = tp->snd_nxt;
	}
}
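
/* Worked example of the fixed point above (illustrative numbers): srtt
 * stores 8*rtt and mdev stores 4*mdev.  Say srtt = 800 (100 ticks),
 * mdev = 40 (10 ticks), and a sample mrtt = 120 arrives:
 *
 *	m = 120 - (800 >> 3) = 20;	srtt = 800 + 20 = 820
 *	m = 20 - (40 >> 2) = 10;	mdev = 40 + 10 = 50
 *
 * which is exactly srtt = 7/8 srtt + 1/8 sample and
 * mdev = 3/4 mdev + 1/4 |error|, computed without any division.
 */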

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static inline void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less than 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" have _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with correct one. That is exactly what we pretend to do.
	 */
}
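
/* Continuing the example above: with srtt = 820 and rttvar = 50, this
 * gives icsk_rto = (820 >> 3) + 50 = 152 ticks.  Unwinding the scaling
 * (8x on srtt, 4x on mdev/rttvar), that is the classic
 * RTO = SRTT + 4 * MDEV.
 */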

/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
 * guarantees that rto is higher.
 */
static inline void tcp_bound_rto(struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

/* Save metrics learned by this TCP session.
   This function is called only when TCP finishes successfully,
   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (sysctl_tcp_nometrics_save)
		return;

	dst_confirm(dst);

	if (dst && (dst->flags & DST_HOST)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		int m;
		unsigned long rtt;

		if (icsk->icsk_backoff || !tp->srtt) {
			/* This session failed to estimate rtt. Why?
			 * Probably, no packets returned in time.
			 * Reset our results.
			 */
			if (!(dst_metric_locked(dst, RTAX_RTT)))
				dst->metrics[RTAX_RTT - 1] = 0;
			return;
		}

		rtt = dst_metric_rtt(dst, RTAX_RTT);
		m = rtt - tp->srtt;

		/* If the newly calculated rtt is larger than the stored one,
		 * store the new one. Otherwise, use EWMA. Remember,
		 * rtt overestimation is always better than underestimation.
		 */
		if (!(dst_metric_locked(dst, RTAX_RTT))) {
			if (m <= 0)
				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
			else
				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
		}

		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
			unsigned long var;
			if (m < 0)
				m = -m;

			/* Scale deviation to rttvar fixed point */
			m >>= 1;
			if (m < tp->mdev)
				m = tp->mdev;

			var = dst_metric_rtt(dst, RTAX_RTTVAR);
			if (m >= var)
				var = m;
			else
				var -= (var - m) >> 2;

			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
		}

		if (tp->snd_ssthresh >= 0xFFFF) {
			/* Slow start still did not finish. */
			if (dst_metric(dst, RTAX_SSTHRESH) &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
			if (!dst_metric_locked(dst, RTAX_CWND) &&
			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
			   icsk->icsk_ca_state == TCP_CA_Open) {
			/* Cong. avoidance phase, cwnd is reliable. */
			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] =
					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
		} else {
			/* Else slow start did not finish, cwnd is non-sense,
			   ssthresh may also be invalid.
			 */
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
			if (dst_metric(dst, RTAX_SSTHRESH) &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
		}

		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
		}
	}
}
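
/* Example of the cached-RTT update above (illustrative numbers): with a
 * cached rtt = 120 and a session tp->srtt = 80 (same 8x fixed point),
 * m = 40 > 0 and the cache moves only one eighth of the way down:
 * 120 - (40 >> 3) = 115.  A session rtt above the cache (m <= 0) is
 * stored outright, deliberately biasing the cache toward overestimation.
 */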

/* Numbers are taken from RFC3390.
 *
 * John Heffner states:
 *
 *	The RFC specifies a window of no more than 4380 bytes
 *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
 *	is a bit misleading because they use a clamp at 4380 bytes
 *	rather than use a multiplier in the relevant range.
 */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd) {
		if (tp->mss_cache > 1460)
			cwnd = 2;
		else
			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
	}
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
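
/* RFC3390 in numbers, when no metric is cached: mss_cache = 536 or 1095
 * gives cwnd = 4, 1460 gives 3, and anything above 1460 gives 2, so for
 * common MSS values the initial window is about 4380 bytes, while
 * jumbo-frame MSSes fall back to 2 segments.
 */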

/* Set slow start threshold and cwnd not falling to slow start */
void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);

	tp->prior_ssthresh = 0;
	tp->bytes_acked = 0;
	if (icsk->icsk_ca_state < TCP_CA_CWR) {
		tp->undo_marker = 0;
		if (set_ssthresh)
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tp->snd_cwnd = min(tp->snd_cwnd,
				   tcp_packets_in_flight(tp) + 1U);
		tp->snd_cwnd_cnt = 0;
		tp->high_seq = tp->snd_nxt;
		tp->snd_cwnd_stamp = tcp_time_stamp;
		TCP_ECN_queue_cwr(tp);

		tcp_set_ca_state(sk, TCP_CA_CWR);
	}
}

/*
 * Packet counting of FACK is based on in-order assumptions, therefore TCP
 * disables it when reordering is detected
 */
static void tcp_disable_fack(struct tcp_sock *tp)
{
	/* RFC3517 uses different metric in lost marker => reset on change */
	if (tcp_is_fack(tp))
		tp->lost_skb_hint = NULL;
	tp->rx_opt.sack_ok &= ~2;
}

/* Take note that the peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= 4;
}
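
/* rx_opt.sack_ok doubles as a small bit field here: bit 0 records that
 * SACK was negotiated, bit 1 (cleared in tcp_disable_fack() above)
 * enables FACK, and bit 2 (set here) remembers that the peer has sent a
 * D-SACK at least once.
 */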

/* Initialize metrics on socket. */

static void tcp_init_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst == NULL)
		goto reset;

	dst_confirm(dst);

	if (dst_metric_locked(dst, RTAX_CWND))
		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
	if (dst_metric(dst, RTAX_SSTHRESH)) {
		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	}
	if (dst_metric(dst, RTAX_REORDERING) &&
	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
		tcp_disable_fack(tp);
		tp->reordering = dst_metric(dst, RTAX_REORDERING);
	}

	if (dst_metric(dst, RTAX_RTT) == 0)
		goto reset;

	if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
		goto reset;

	/* Initial rtt is determined from SYN,SYN-ACK.
	 * The segment is small and rtt may appear much
	 * less than real one. Use per-dst memory
	 * to make it more realistic.
	 *
	 * A bit of theory. RTT is time passed after "normal" sized packet
	 * is sent until it is ACKed. In normal circumstances sending small
	 * packets force peer to delay ACKs and calculation is correct too.
	 * The algorithm is adaptive and, provided we follow specs, it
	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
	 * tricks sort of "quick acks" for time long enough to decrease RTT
	 * to low value, and then abruptly stops to do it and starts to delay
	 * ACKs, wait for troubles.
	 */
	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
		tp->rtt_seq = tp->snd_nxt;
	}
	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
	}
	tcp_set_rto(sk);
	tcp_bound_rto(sk);
	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
		goto reset;
	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	return;

reset:
	/* Play conservative. If timestamps are not
	 * supported, TCP will fail to recalculate correct
	 * rtt, if initial rto is too small. FORGET ALL AND RESET!
	 */
	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
		tp->srtt = 0;
		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	}
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (metric > tp->reordering) {
		int mib_idx;

		tp->reordering = min(TCP_MAX_REORDERING, metric);

		/* This exciting event is worth remembering. 8) */
		if (ts)
			mib_idx = LINUX_MIB_TCPTSREORDER;
		else if (tcp_is_reno(tp))
			mib_idx = LINUX_MIB_TCPRENOREORDER;
		else if (tcp_is_fack(tp))
			mib_idx = LINUX_MIB_TCPFACKREORDER;
		else
			mib_idx = LINUX_MIB_TCPSACKREORDER;

		NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
		       tp->reordering,
		       tp->fackets_out,
		       tp->sacked_out,
		       tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tcp_disable_fack(tp);
	}
}

/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if ((tp->retransmit_skb_hint == NULL) ||
	    before(TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = skb;

	if (!tp->lost_out ||
	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);