/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained DSACK info */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)

#define IsSackFrto() (sysctl_tcp_frto == 0x2)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk,
				const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ?: skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = len;
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that the SACK block is variable.
		 *
		 * "len" is the invariant segment length, including the TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Also subtract the invariant part (if the peer is RFC
			 * compliant): the TCP header plus the fixed timestamp
			 * option length. The resulting "len" is the MSS free of
			 * SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}

void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static inline int tcp_in_quickack_mode(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}

static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags&TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr)
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (tp->ecn_flags&TCP_ECN_OK) {
		if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		/* Funny extension: if ECT is not set on a segment,
		 * it is surely a retransmit. This is not in the ECN RFC,
		 * but Linux follows this rule. */
		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
			tcp_enter_quickack_mode((struct sock *)tp);
	}
}

static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
{
	if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
{
	if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK))
		return 1;
	return 0;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_fixup_sndbuf(struct sock *sk)
{
	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
		     sizeof(struct sk_buff);

	if (sk->sk_sndbuf < 3 * sndmem)
		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
}

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is the maximal advertised window. It can be less than
 * tcp_full_space(); in this case tcp_full_space() - window_clamp
 * is reserved for the "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from the network's viewpoint, but the lower
 * the throughput and the higher the connection's sensitivity to losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used during the "slow start"
 * phase to predict the further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at the sender, even when the application
 *   requires some significant "application buffer". This is check #1.
 * - to prevent pruning of the receive queue because of misprediction
 *   of the receiver window. This is check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(skb->truesize)/2;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}
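
/* Worked example (editorial annotation, not part of the original file):
 * truesize and window start at half the usable space of this skb and of
 * the maximal receive buffer, and are then halved in lock-step. The
 * window may grow (by 2 * rcv_mss) only if the scaled truesize is
 * covered by the payload before window drops below rcv_ssthresh --
 * roughly, when skb->len / truesize >= rcv_ssthresh / max_window,
 * i.e. only for reasonably well-packed skbs.
 */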

static void tcp_grow_window(struct sock *sk,
			    struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_memory_pressure) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2*tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Tuning rcvbuf, when connection enters established state. */

static void tcp_fixup_rcvbuf(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);

	/* Try to select rcvbuf so that 4 mss-sized segments
	 * will fit to window and corresponding skbs will fit to our rcvbuf.
	 * (was 3; 4 is minimum to allow fast retransmit to work.)
	 */
	while (tcp_win_from_space(rcvmem) < tp->advmss)
		rcvmem += 128;
	if (sk->sk_rcvbuf < 4 * rcvmem)
		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fixup all. This is done immediately after the connection
 *    enters the established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_fixup_sndbuf(sk);

	tp->rcvq_space.space = tp->rcv_wnd;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}
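
/* Worked example (editorial annotation, not part of the original file):
 * with the default sysctl_tcp_app_win = 31, maxwin >> 31 is effectively
 * zero, so the first clause leaves window_clamp at maxwin; the clause
 * that actually bites is the "one segment" reservation below it. E.g.
 * maxwin = 65536, advmss = 1460: window_clamp becomes
 * max(2 * 1460, 65536 - 1460) = 64076, keeping one MSS of slack.
 */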

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_memory_pressure &&
	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}


/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate it:
 * overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd/2);
	hint = min(hint, TCP_MIN_RCVMSS);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
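
/* Worked example (editorial annotation, not part of the original file):
 * with advmss = 1460, mss_cache = 1448 and rcv_wnd = 5840, the hint is
 * min(1448, 5840/2, TCP_MIN_RCVMSS) = TCP_MIN_RCVMSS (536 bytes), floored
 * at TCP_MIN_MSS. The deliberately low start errs toward over-ACKing, and
 * tcp_measure_rcv_mss() then raises rcv_mss from observed segment sizes.
 */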

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
 *
 * More detail on this code can be found at
 * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else if (m < new_sample)
			new_sample = m << 3;
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	if (tp->rcv_rtt_est.rtt != new_sample)
		tp->rcv_rtt_est.rtt = new_sample;
}
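
/* Worked example (editorial annotation, not part of the original file):
 * rcv_rtt_est.rtt stores the estimate left-shifted by 3 (i.e. 8 * RTT).
 * Suppose the stored value is 800 (an RTT of 100 ticks) and a sample
 * m = 60 ticks arrives:
 *
 *   win_dep == 0 (EWMA): m -= 800 >> 3  =>  m = -40;
 *                        new_sample = 800 - 40 = 760  (~95 ticks)
 *   win_dep == 1:        60 < 800, so new_sample = 60 << 3 = 480 (60 ticks)
 *
 * Note that the win_dep path compares the raw sample against the shifted
 * estimate, so it is a loose minimum-tracking filter that jumps straight
 * down to smaller samples, while the EWMA path converges gradually.
 */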

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	tcp_rcv_rtt_update(tp,
			   jiffies - tp->rcv_rtt_est.time,
			   1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int space;

	if (tp->rcvq_space.time == 0)
		goto new_measure;

	time = tcp_time_stamp - tp->rcvq_space.time;
	if (time < (tp->rcv_rtt_est.rtt >> 3) ||
	    tp->rcv_rtt_est.rtt == 0)
		return;

	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);

	space = max(tp->rcvq_space.space, space);

	if (tp->rcvq_space.space != space) {
		int rcvmem;

		tp->rcvq_space.space = space;

		if (sysctl_tcp_moderate_rcvbuf &&
		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
			int new_clamp = space;

			/* Receive space grows, normalize in order to
			 * take into account packet headers and sk_buff
			 * structure overhead.
			 */
			space /= tp->advmss;
			if (!space)
				space = 1;
			rcvmem = (tp->advmss + MAX_TCP_HEADER +
				  16 + sizeof(struct sk_buff));
			while (tcp_win_from_space(rcvmem) < tp->advmss)
				rcvmem += 128;
			space *= rcvmem;
			space = min(space, sysctl_tcp_rmem[2]);
			if (space > sk->sk_rcvbuf) {
				sk->sk_rcvbuf = space;

				/* Make the window clamp follow along.  */
				tp->window_clamp = new_clamp;
			}
		}
	}

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tcp_time_stamp;
}
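
/* Worked example (editorial annotation, not part of the original file):
 * if the application copied 64 KB since the last measurement, then
 * space = 2 * 65536 = 131072 bytes, i.e. ~90 segments of advmss = 1448.
 * Each segment is costed at an rcvmem large enough that
 * tcp_win_from_space(rcvmem) >= advmss, so sk_rcvbuf grows toward
 * ~90 * rcvmem (capped by sysctl_tcp_rmem[2]) and window_clamp follows.
 * The factor of two leaves headroom for the sender's congestion window
 * to keep growing during the next RTT.
 */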

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCPs do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_time_stamp;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN/2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk);
			sk_stream_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	TCP_ECN_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}
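
/* Worked example (editorial annotation, not part of the original file;
 * TCP_ATO_MIN is 40 ms with HZ = 1000 assumed here): with ato = 40 ms,
 * an inter-packet gap m = 10 ms <= TCP_ATO_MIN/2 gives
 * ato = 40/2 + 20 = 40 ms (pinned near the minimum); a gap m = 30 ms < ato
 * gives ato = 40/2 + 30 = 50 ms, capped at icsk_rto; and a gap larger
 * than the RTO re-enters quickack mode.
 */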

static u32 tcp_rto_min(struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_get(sk);
	u32 rto_min = TCP_RTO_MIN;

	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
		rto_min = dst->metrics[RTAX_RTO_MIN-1];
	return rto_min;
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt; /* RTT */

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	In a 1990 paper the RTO value was changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
	 * does not matter how to _calculate_ it. Seems, it was a trap
	 * that VJ failed to avoid. 8)
	 */
	if (m == 0)
		m = 1;
	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev >> 2);   /* similar update on mdev */
		}
		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev > tp->mdev_max) {
			tp->mdev_max = tp->mdev;
			if (tp->mdev_max > tp->rttvar)
				tp->rttvar = tp->mdev_max;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max < tp->rttvar)
				tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max = tcp_rto_min(sk);
		}
	} else {
		/* no previous measure. */
		tp->srtt = m<<3;	/* take the measured time to be rtt */
		tp->mdev = m<<1;	/* make sure rto = 3*rtt */
		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
		tp->rtt_seq = tp->snd_nxt;
	}
}
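
/* Worked example (editorial annotation, not part of the original file):
 * srtt is stored left-shifted by 3 (8 * RTT) and mdev by 2 (4 * mean
 * deviation). Suppose srtt = 800 (100 ticks), mdev = 40 (10 ticks), and
 * a measurement mrtt = 120 ticks arrives:
 *
 *   m = 120 - (800 >> 3) = 20      srtt = 800 + 20 = 820  (102.5 ticks)
 *   m >= 0, so m -= 40 >> 2 => m = 10;  mdev = 40 + 10 = 50 (12.5 ticks)
 *
 * i.e. srtt = 7/8 srtt + 1/8 mrtt and mdev = 3/4 mdev + 1/4 |error|,
 * exactly the Jacobson '88 gains implemented with shifts.
 */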

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static inline void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one. That is exactly what we pretend to do.
	 */
}
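
/* Worked example (editorial annotation, not part of the original file):
 * with srtt = 820 (102.5 ticks) and rttvar = 50 (12.5 ticks), icsk_rto
 * becomes (820 >> 3) + 50 = 152 ticks, i.e. SRTT + 4 * RTTVAR as in
 * RFC 2988, since rttvar already carries the factor of four in its
 * fixed-point representation.
 */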

/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
 * guarantees that rto is higher.
 */
static inline void tcp_bound_rto(struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

/* Save metrics learned by this TCP session.
   This function is called only when TCP finishes successfully,
   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (sysctl_tcp_nometrics_save)
		return;

	dst_confirm(dst);

	if (dst && (dst->flags&DST_HOST)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		int m;

		if (icsk->icsk_backoff || !tp->srtt) {
			/* This session failed to estimate rtt. Why?
			 * Probably, no packets returned in time.
			 * Reset our results.
			 */
			if (!(dst_metric_locked(dst, RTAX_RTT)))
				dst->metrics[RTAX_RTT-1] = 0;
			return;
		}

		m = dst_metric(dst, RTAX_RTT) - tp->srtt;

		/* If the newly calculated rtt is larger than the stored one,
		 * store the new one. Otherwise, use EWMA. Remember,
		 * rtt overestimation is always better than underestimation.
		 */
		if (!(dst_metric_locked(dst, RTAX_RTT))) {
			if (m <= 0)
				dst->metrics[RTAX_RTT-1] = tp->srtt;
			else
				dst->metrics[RTAX_RTT-1] -= (m>>3);
		}

		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
			if (m < 0)
				m = -m;

			/* Scale deviation to rttvar fixed point */
			m >>= 1;
			if (m < tp->mdev)
				m = tp->mdev;

			if (m >= dst_metric(dst, RTAX_RTTVAR))
				dst->metrics[RTAX_RTTVAR-1] = m;
			else
				dst->metrics[RTAX_RTTVAR-1] -=
					(dst->metrics[RTAX_RTTVAR-1] - m)>>2;
		}

		if (tp->snd_ssthresh >= 0xFFFF) {
			/* Slow start still did not finish. */
			if (dst_metric(dst, RTAX_SSTHRESH) &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
			if (!dst_metric_locked(dst, RTAX_CWND) &&
			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
			   icsk->icsk_ca_state == TCP_CA_Open) {
			/* Cong. avoidance phase, cwnd is reliable. */
			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] =
					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
		} else {
			/* Else slow start did not finish, cwnd is nonsense,
			   and ssthresh may also be invalid.
			 */
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
			if (dst->metrics[RTAX_SSTHRESH-1] &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
		}

		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
			if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
		}
	}
}

/* Numbers are taken from RFC3390.
 *
 * John Heffner states:
 *
 *	The RFC specifies a window of no more than 4380 bytes
 *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
 *	is a bit misleading because they use a clamp at 4380 bytes
 *	rather than use a multiplier in the relevant range.
 */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd) {
		if (tp->mss_cache > 1460)
			cwnd = 2;
		else
			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
	}
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
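
/* Worked example (editorial annotation, not part of the original file):
 * with no cached metric, mss 1460 gives cwnd = 3 (3 * 1460 = 4380 bytes,
 * the RFC 3390 clamp); mss 536 gives cwnd = 4 (2144 bytes); and
 * mss 2190 gives cwnd = 2.
 */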

/* Set slow start threshold and cwnd not falling to slow start */
void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);

	tp->prior_ssthresh = 0;
	tp->bytes_acked = 0;
	if (icsk->icsk_ca_state < TCP_CA_CWR) {
		tp->undo_marker = 0;
		if (set_ssthresh)
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tp->snd_cwnd = min(tp->snd_cwnd,
				   tcp_packets_in_flight(tp) + 1U);
		tp->snd_cwnd_cnt = 0;
		tp->high_seq = tp->snd_nxt;
		tp->snd_cwnd_stamp = tcp_time_stamp;
		TCP_ECN_queue_cwr(tp);

		tcp_set_ca_state(sk, TCP_CA_CWR);
	}
}
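
/* Worked example (editorial annotation, not part of the original file;
 * assumes the Reno ssthresh callback): on an ECE-marked ACK with
 * snd_cwnd = 10 and 7 packets in flight, ssthresh becomes cwnd/2 = 5
 * (if set_ssthresh), snd_cwnd is pulled down to in-flight + 1 = 8,
 * the state moves to TCP_CA_CWR, and TCP_ECN_queue_cwr() arranges for
 * CWR to be echoed to the peer.
 */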

/*
 * Packet counting of FACK is based on in-order assumptions, therefore TCP
 * disables it when reordering is detected
 */
static void tcp_disable_fack(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok &= ~2;
}

/* Take a notice that peer is sending DSACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= 4;
}

/* Initialize metrics on socket. */

static void tcp_init_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst == NULL)
		goto reset;

	dst_confirm(dst);

	if (dst_metric_locked(dst, RTAX_CWND))
		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
	if (dst_metric(dst, RTAX_SSTHRESH)) {
		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	}
	if (dst_metric(dst, RTAX_REORDERING) &&
	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
		tcp_disable_fack(tp);
		tp->reordering = dst_metric(dst, RTAX_REORDERING);
	}

	if (dst_metric(dst, RTAX_RTT) == 0)
		goto reset;

	if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
		goto reset;

	/* Initial rtt is determined from SYN,SYN-ACK.
	 * The segment is small and rtt may appear much
	 * less than real one. Use per-dst memory
	 * to make it more realistic.
	 *
	 * A bit of theory. RTT is time passed after "normal" sized packet
	 * is sent until it is ACKed. In normal circumstances sending small
	 * packets forces the peer to delay ACKs, and the calculation is correct too.
	 * The algorithm is adaptive and, provided we follow specs, it
	 * NEVER underestimates RTT. BUT! If the peer tries clever tricks,
	 * sort of "quick acks", for long enough to decrease RTT to a low
	 * value, and then abruptly stops doing it and starts to delay
	 * ACKs, wait for troubles.
	 */
	if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
		tp->srtt = dst_metric(dst, RTAX_RTT);
		tp->rtt_seq = tp->snd_nxt;
	}
	if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
		tp->mdev = dst_metric(dst, RTAX_RTTVAR);
		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
	}
	tcp_set_rto(sk);
	tcp_bound_rto(sk);
	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
		goto reset;
	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	return;

reset:
	/* Play conservative. If timestamps are not
	 * supported, TCP will fail to recalculate correct
	 * rtt, if initial rto is too small. FORGET ALL AND RESET!
	 */
	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
		tp->srtt = 0;
		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	}
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (metric > tp->reordering) {
		tp->reordering = min(TCP_MAX_REORDERING, metric);

		/* This exciting event is worth remembering. 8) */
		if (ts)
			NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
		else if (tcp_is_reno(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
		else if (tcp_is_fack(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
		       tp->reordering,
		       tp->fackets_out,
		       tp->sacked_out,
		       tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tcp_disable_fack(tp);
	}
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form a finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of one of three flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	   A''. Its FACK modification, head until snd.fack is lost.
 *	B. SACK arrives sacking data transmitted after never retransmitted
 *	   hole was sent out.
 *	C. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note that the state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by the order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * The reordering metric is the maximal distance a packet can be displaced
 * in the packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK: a D-SACK arrives
 *    for a retransmitted and already SACKed segment -> reordering.
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 */
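
/* Worked example (editorial annotation, not part of the original file):
 * a segment starts as tag 0 (1 in flight); an RTO marks it L (0 in
 * flight); retransmission makes it L|R (1 in flight); if a SACK then
 * covers it, the code clears L and R and leaves plain S (0 in flight,
 * counted in sacked_out only) -- the short-circuit described above.
 */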
static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
			   struct tcp_sack_block_wire *sp, int num_sacks,
			   u32 prior_snd_una)
{
	u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
	u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
	int dup_sack = 0;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = 1;
		tcp_dsack_seen(tp);
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
		u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));

		if (!after(end_seq_0, end_seq_1) &&
		    !before(start_seq_0, start_seq_1)) {
			dup_sack = 1;
			tcp_dsack_seen(tp);
			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
		}
	}

	/* D-SACK for already forgotten data... Do dumb counting. */
	if (dup_sack &&
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
		tp->undo_retrans--;

	return dup_sack;
}
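
/* Worked example (editorial annotation, not part of the original file):
 * per RFC 2883, the first SACK block reports a duplicate either when it
 * lies below the cumulative ACK (ack_seq = 1000 with block 500-600), or
 * when the second block fully covers the first (block 0 = 500-600
 * inside block 1 = 400-800). Both branches above flag the DSACK and
 * bump the corresponding MIB counter.
 */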

static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned char *ptr = (skb_transport_header(ack_skb) +
			      TCP_SKB_CB(ack_skb)->sacked);
	struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
	struct sk_buff *cached_skb;
	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
	int reord = tp->packets_out;
	int prior_fackets;
	u32 lost_retrans = 0;
	int flag = 0;
	int found_dup_sack = 0;
	int cached_fack_count;
	int i;
	int first_sack_index;

	if (!tp->sacked_out) {
		tp->fackets_out = 0;
		tp->highest_sack = tp->snd_una;
	}
	prior_fackets = tp->fackets_out;

	found_dup_sack = tcp_check_dsack(tp, ack_skb, sp,
					 num_sacks, prior_snd_una);
	if (found_dup_sack)
		flag |= FLAG_DSACKING_ACK;

	/* Eliminate too old ACKs, but take into
	 * account more or less fresh ones, they can
	 * contain valid SACK info.
	 */
	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
		return 0;

	/* SACK fastpath:
	 * if the only SACK change is the increase of the end_seq of
	 * the first block then only apply that SACK block
	 * and use retrans queue hinting otherwise slowpath */
	flag = 1;
	for (i = 0; i < num_sacks; i++) {
		__be32 start_seq = sp[i].start_seq;
		__be32 end_seq = sp[i].end_seq;

		if (i == 0) {
			if (tp->recv_sack_cache[i].start_seq != start_seq)
				flag = 0;
		} else {
			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
			    (tp->recv_sack_cache[i].end_seq != end_seq))
				flag = 0;
		}
		tp->recv_sack_cache[i].start_seq = start_seq;
		tp->recv_sack_cache[i].end_seq = end_seq;
	}
	/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
	for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
		tp->recv_sack_cache[i].start_seq = 0;
		tp->recv_sack_cache[i].end_seq = 0;
	}

	first_sack_index = 0;
	if (flag)
		num_sacks = 1;
	else {
		int j;
		tp->fastpath_skb_hint = NULL;

		/* order SACK blocks to allow in order walk of the retrans queue */
		for (i = num_sacks-1; i > 0; i--) {
			for (j = 0; j < i; j++){
				if (after(ntohl(sp[j].start_seq),
					  ntohl(sp[j+1].start_seq))){
					struct tcp_sack_block_wire tmp;

					tmp = sp[j];
					sp[j] = sp[j+1];
					sp[j+1] = tmp;

					/* Track where the first SACK block goes to */
					if (j == first_sack_index)
						first_sack_index = j+1;
				}

			}
		}
	}

	/* clear flag as used for different purpose in following code */
	flag = 0;

	/* Use SACK fastpath hint if valid */
	cached_skb = tp->fastpath_skb_hint;
	cached_fack_count = tp->fastpath_cnt_hint;
	if (!cached_skb) {
		cached_skb = tcp_write_queue_head(sk);
		cached_fack_count = 0;
	}

	for (i=0; i<num_sacks; i++, sp++) {
		struct sk_buff *skb;
		__u32 start_seq = ntohl(sp->start_seq);
		__u32 end_seq = ntohl(sp->end_seq);
		int fack_count;
		int dup_sack = (found_dup_sack && (i == first_sack_index));

		skb = cached_skb;
		fack_count = cached_fack_count;

		/* Event "B" in the comment above. */
		if (after(end_seq, tp->high_seq))
			flag |= FLAG_DATA_LOST;

		tcp_for_write_queue_from(skb, sk) {
			int in_sack, pcount;
			u8 sacked;

			if (skb == tcp_send_head(sk))
				break;

			cached_skb = skb;
			cached_fack_count = fack_count;
			if (i == first_sack_index) {
				tp->fastpath_skb_hint = skb;
				tp->fastpath_cnt_hint = fack_count;
			}

			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
			if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				break;

			in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				!before(end_seq, TCP_SKB_CB(skb)->end_seq);

			pcount = tcp_skb_pcount(skb);

			if (pcount > 1 && !in_sack &&
			    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				unsigned int pkt_len;

				in_sack = !after(start_seq,
						 TCP_SKB_CB(skb)->seq);

				if (!in_sack)
					pkt_len = (start_seq -
						   TCP_SKB_CB(skb)->seq);
				else
					pkt_len = (end_seq -
						   TCP_SKB_CB(skb)->seq);
				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
					break;
				pcount = tcp_skb_pcount(skb);
			}

			fack_count += pcount;

			sacked = TCP_SKB_CB(skb)->sacked;

			/* Account D-SACK for retransmitted packet. */
			if ((dup_sack && in_sack) &&
			    (sacked & TCPCB_RETRANS) &&
			    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
				tp->undo_retrans--;

			/* The frame is ACKed. */
			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
				if (sacked&TCPCB_RETRANS) {
					if ((dup_sack && in_sack) &&
					    (sacked&TCPCB_SACKED_ACKED))
						reord = min(fack_count, reord);
				} else {
					/* If it was in a hole, we detected reordering. */
					if (fack_count < prior_fackets &&
					    !(sacked&TCPCB_SACKED_ACKED))
						reord = min(fack_count, reord);
				}

				/* Nothing to do; acked frame is about to be dropped. */
				continue;
			}

			if ((sacked&TCPCB_SACKED_RETRANS) &&
			    after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
			    (!lost_retrans || after(end_seq, lost_retrans)))
				lost_retrans = end_seq;

			if (!in_sack)
				continue;

			if (!(sacked&TCPCB_SACKED_ACKED)) {
				if (sacked & TCPCB_SACKED_RETRANS) {
					/* If the segment is not tagged as lost,
					 * we do not clear RETRANS, believing
					 * that retransmission is still in flight.
					 */
					if (sacked & TCPCB_LOST) {
						TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
						tp->lost_out -= tcp_skb_pcount(skb);
						tp->retrans_out -= tcp_skb_pcount(skb);

						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
				} else {
					/* New sack for not retransmitted frame,
					 * which was in hole. It is reordering.
					 */
					if (!(sacked & TCPCB_RETRANS) &&
					    fack_count < prior_fackets)
						reord = min(fack_count, reord);

					if (sacked & TCPCB_LOST) {
						TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
						tp->lost_out -= tcp_skb_pcount(skb);

						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
					/* SACK enhanced F-RTO detection.
					 * Set flag if and only if non-rexmitted
					 * segments below frto_highmark are
					 * SACKed (RFC4138; Appendix B).
					 * Clearing correct due to in-order walk
					 */
					if (after(end_seq, tp->frto_highmark)) {
						flag &= ~FLAG_ONLY_ORIG_SACKED;
					} else {
						if (!(sacked & TCPCB_RETRANS))
							flag |= FLAG_ONLY_ORIG_SACKED;
					}
1297
				}

				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
				flag |= FLAG_DATA_SACKED;
				tp->sacked_out += tcp_skb_pcount(skb);

				if (fack_count > tp->fackets_out)
					tp->fackets_out = fack_count;

				if (after(TCP_SKB_CB(skb)->seq,
				    tp->highest_sack))
					tp->highest_sack = TCP_SKB_CB(skb)->seq;
			} else {
				if (dup_sack && (sacked&TCPCB_RETRANS))
					reord = min(fack_count, reord);
			}

			/* D-SACK. We can detect redundant retransmission
			 * in S|R and plain R frames and clear it.
			 * undo_retrans is decreased above, L|R frames
			 * are accounted above as well.
			 */
			if (dup_sack &&
			    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);
				tp->retransmit_skb_hint = NULL;
			}
		}
	}

	/* Check for lost retransmit. This superb idea is
	 * borrowed from "ratehalving". Event "C".
	 * Later note: FACK people cheated me again 8),
	 * we have to account for reordering! Ugly,
	 * but should help.
	 */
	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
				break;
			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				continue;
			if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
			    after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
			    (tcp_is_fack(tp) ||
			     !before(lost_retrans,
				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
				     tp->mss_cache))) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);

				/* clear lost hint */
				tp->retransmit_skb_hint = NULL;

				if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
					tp->lost_out += tcp_skb_pcount(skb);
					TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
					flag |= FLAG_DATA_SACKED;
					NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
				}
			}
		}
	}

	tcp_verify_left_out(tp);

	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);

#if FASTRETRANS_DEBUG > 0
	BUG_TRAP((int)tp->sacked_out >= 0);
	BUG_TRAP((int)tp->lost_out >= 0);
	BUG_TRAP((int)tp->retrans_out >= 0);
	BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
#endif
	return flag;
}

/* F-RTO can only be used if TCP has never retransmitted anything other than
 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
 */
static void tcp_check_reno_reordering(struct sock *sk, const int addend)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 holes;

	holes = max(tp->lost_out, 1U);
	holes = min(holes, tp->packets_out);

	if ((tp->sacked_out + holes) > tp->packets_out) {
		tp->sacked_out = tp->packets_out - holes;
		tcp_update_reordering(sk, tp->packets_out + addend, 0);
	}
}

/* Emulate SACKs for SACKless connection: account for a new dupack. */

static void tcp_add_reno_sack(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	tp->sacked_out++;
	tcp_check_reno_reordering(sk, 0);
	tcp_verify_left_out(tp);
}

/* Account for ACK, ACKing some data in Reno Recovery phase. */

static void tcp_remove_reno_sacks(struct sock *sk, int acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (acked > 0) {
		/* One ACK acked hole. The rest eat duplicate ACKs. */
		if (acked-1 >= tp->sacked_out)
			tp->sacked_out = 0;
		else
			tp->sacked_out -= acked-1;
	}
	tcp_check_reno_reordering(sk, acked);
	tcp_verify_left_out(tp);
}

static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
}
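
/* Worked example (editorial annotation, not part of the original file):
 * without SACK, every duplicate ACK stands in for one sacked segment,
 * so three dupacks yield sacked_out = 3. When a partial ACK then newly
 * acks, say, acked = 2 segments, one of them is taken to be the hole
 * itself and the rest were already counted as dupacks, so sacked_out
 * drops by acked - 1 = 1, preserving the left_out invariant.
 */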

int tcp_use_frto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!sysctl_tcp_frto)
		return 0;

	if (IsSackFrto())
		return 1;

	/* Avoid expensive walking of rexmit queue if possible */
	if (tp->retrans_out > 1)
		return 0;

	skb = tcp_write_queue_head(sk);
	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
			return 0;
		/* Short-circuit when first non-SACKed skb has been checked */
		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
			break;
	}
	return 1;
}

/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
 * recovery a bit and use heuristics in tcp_process_frto() to detect if
 * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
 * keep retrans_out counting accurate (with SACK F-RTO, other than head
 * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
 * bits are handled if the Loss state is really to be entered (in
 * tcp_enter_frto_loss).
 *
 * Do like tcp_enter_loss() would; when RTO expires the second time it
 * does:
 *  "Reduce ssthresh if it has not yet been made inside this window."
 */
void tcp_enter_frto(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
	    tp->snd_una == tp->high_seq ||
	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
	     !icsk->icsk_retransmits)) {