/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained DSACK info */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)

#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)

#define IsSackFrto() (sysctl_tcp_frto == 0x2)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk,
				const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ?: skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = len;
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that the SACK block length is variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
		    /* If PSH is not set, the packet should be
		     * full sized, provided the peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows us
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}

void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static inline int tcp_in_quickack_mode(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_fixup_sndbuf(struct sock *sk)
{
	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
		     sizeof(struct sk_buff);

	if (sk->sk_sndbuf < 3 * sndmem)
		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
}
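
/* A worked sketch of the sizing above (illustrative numbers, assuming
 * rx_opt.mss_clamp = 1460; MAX_TCP_HEADER is left symbolic): each queued
 * segment is charged about
 *
 *	sndmem = 1460 + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff)
 *
 * bytes, and sk_sndbuf is raised to 3 * sndmem (bounded by
 * sysctl_tcp_wmem[2]), i.e. room for three full-sized segments
 * including their metadata overhead.
 */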

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split into two parts: the "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(skb->truesize)/2;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}
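
/* Illustrative sketch (assumed numbers, and assuming
 * sysctl_tcp_adv_win_scale == 2, so tcp_win_from_space(s) == 3*s/4):
 * a well-packed skb with len = 1448 and truesize = 2048 gives a
 * truesize term of (3*2048/4)/2 = 768 <= 1448, so the window may grow
 * immediately by 2 * rcv_mss.  A badly packed skb (say len = 100,
 * truesize = 2048) keeps halving both terms; if rcv_ssthresh climbs
 * above the halved window first, the window is not grown at all.
 */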

static void tcp_grow_window(struct sock *sk,
			    struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !tcp_memory_pressure) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2*tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Tuning rcvbuf, when connection enters established state. */

static void tcp_fixup_rcvbuf(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);

	/* Try to select rcvbuf so that 4 mss-sized segments
	 * will fit to window and corresponding skbs will fit to our rcvbuf.
	 * (was 3; 4 is minimum to allow fast retransmit to work.)
	 */
	while (tcp_win_from_space(rcvmem) < tp->advmss)
		rcvmem += 128;
	if (sk->sk_rcvbuf < 4 * rcvmem)
		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_fixup_sndbuf(sk);

	tp->rcvq_space.space = tp->rcv_wnd;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_memory_pressure &&
	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
}


/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd/2);
	hint = min(hint, TCP_MIN_RCVMSS);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
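
/* Worked example (illustrative values): with advmss = mss_cache = 1460
 * and rcv_wnd = 5840, the hint starts at min(1460, 1460) = 1460, stays
 * at min(1460, 5840/2) = 1460, and is then clamped into the
 * [TCP_MIN_MSS, TCP_MIN_RCVMSS] range (88 and 536 bytes here), so the
 * initial rcv_mss is 536: a deliberate underestimate, per the comment
 * above, until tcp_measure_rcv_mss() sees real segments.
 */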

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
 *
 * More detail on this code can be found at
 * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else if (m < new_sample)
			new_sample = m << 3;
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	if (tp->rcv_rtt_est.rtt != new_sample)
		tp->rcv_rtt_est.rtt = new_sample;
}
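
/* A sketch of the two modes above (illustrative, not from the original
 * source): with win_dep == 0 (timestamp samples) the estimate follows a
 * classic EWMA with gain 1/8, rtt' = 7/8 * rtt + 1/8 * m, while with
 * win_dep == 1 (window-based samples from tcp_rcv_rtt_measure()) only a
 * smaller sample may replace the estimate, a pure minimum filter, since
 * one sample per window can only overestimate the RTT.
 */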

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	tcp_rcv_rtt_update(tp,
			   jiffies - tp->rcv_rtt_est.time,
			   1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int space;
	if (tp->rcvq_space.time == 0)
		goto new_measure;
	time = tcp_time_stamp - tp->rcvq_space.time;
	if (time < (tp->rcv_rtt_est.rtt >> 3) ||
	    tp->rcv_rtt_est.rtt == 0)
		return;
	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);

	space = max(tp->rcvq_space.space, space);

	if (tp->rcvq_space.space != space) {
		int rcvmem;

		tp->rcvq_space.space = space;

		if (sysctl_tcp_moderate_rcvbuf &&
		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
			int new_clamp = space;

			/* Receive space grows, normalize in order to
			 * take into account packet headers and sk_buff
			 * structure overhead.
			 */
			space /= tp->advmss;
			if (!space)
				space = 1;
			rcvmem = (tp->advmss + MAX_TCP_HEADER +
				  16 + sizeof(struct sk_buff));
			while (tcp_win_from_space(rcvmem) < tp->advmss)
				rcvmem += 128;
			space *= rcvmem;
			space = min(space, sysctl_tcp_rmem[2]);
			if (space > sk->sk_rcvbuf) {
				sk->sk_rcvbuf = space;

				/* Make the window clamp follow along.  */
				tp->window_clamp = new_clamp;
			}
		}
	}
new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tcp_time_stamp;
}
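
/* Illustrative numbers (assumed, not from the original source): if the
 * application copied 64 KB out of the socket during the last RTT,
 * space = 2 * 65536 = 128 KB.  That is converted into a per-segment
 * cost (advmss plus header and sk_buff overhead) and, when rcvbuf
 * moderation is enabled, sk_rcvbuf and window_clamp are raised so a
 * sender ramping up in slow start does not stall on our window.
 */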

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs, the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);
	now = tcp_time_stamp;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN/2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk);
			sk_stream_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	TCP_ECN_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}
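
/* Sketch of the ato adaptation above (illustrative; assumes HZ=1000, so
 * TCP_ATO_MIN = HZ/25 = 40 ticks): for a steady stream arriving every
 * 10 ticks, m <= TCP_ATO_MIN/2, and ato settles at ato/2 + 20 = 40.
 * For gaps of 30 ticks, ato converges on ato/2 + m, i.e. about 2*m = 60
 * ticks.  A gap longer than the RTO instead re-enters quickack mode.
 */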

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt; /* RTT */

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	On a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE value, so that it absolutely
	 * does not matter how to _calculate_ it. Seems, it was a trap
	 * that VJ failed to avoid. 8)
	 */
	if (m == 0)
		m = 1;
	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev >> 2);   /* similar update on mdev */
		}
		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev > tp->mdev_max) {
			tp->mdev_max = tp->mdev;
			if (tp->mdev_max > tp->rttvar)
				tp->rttvar = tp->mdev_max;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max < tp->rttvar)
				tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max = TCP_RTO_MIN;
		}
	} else {
		/* no previous measure. */
		tp->srtt = m<<3;	/* take the measured time to be rtt */
		tp->mdev = m<<1;	/* make sure rto = 3*rtt */
		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
		tp->rtt_seq = tp->snd_nxt;
	}
}
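
/* Worked example of the fixed-point arithmetic above (illustrative;
 * assumes HZ=1000 so TCP_RTO_MIN = HZ/5 = 200 ticks): the first sample
 * mrtt = 100 gives srtt = 100<<3 = 800 and mdev = 100<<1 = 200, so
 * mdev_max = rttvar = max(200, 200) = 200.  A second sample mrtt = 120
 * leaves an error m = 120 - (800>>3) = 20, so srtt becomes 820 (a
 * smoothed 102.5), and mdev += 20 - (200>>2) drops mdev to 170.
 * tcp_set_rto() below then yields rto = (820>>3) + 200 = 302 ticks.
 */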

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static inline void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less than 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one. That is exactly what we pretend to do.
	 */
}

/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
 * guarantees that rto is higher.
 */
static inline void tcp_bound_rto(struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

/* Save metrics learned by this TCP session.
   This function is called only when TCP finishes successfully,
   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (sysctl_tcp_nometrics_save)
		return;

	dst_confirm(dst);

	if (dst && (dst->flags&DST_HOST)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		int m;

		if (icsk->icsk_backoff || !tp->srtt) {
			/* This session failed to estimate rtt. Why?
			 * Probably, no packets returned in time.
			 * Reset our results.
			 */
			if (!(dst_metric_locked(dst, RTAX_RTT)))
				dst->metrics[RTAX_RTT-1] = 0;
			return;
		}

		m = dst_metric(dst, RTAX_RTT) - tp->srtt;

		/* If newly calculated rtt larger than stored one,
		 * store new one. Otherwise, use EWMA. Remember,
		 * rtt overestimation is always better than underestimation.
		 */
		if (!(dst_metric_locked(dst, RTAX_RTT))) {
			if (m <= 0)
				dst->metrics[RTAX_RTT-1] = tp->srtt;
			else
				dst->metrics[RTAX_RTT-1] -= (m>>3);
		}

		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
			if (m < 0)
				m = -m;

			/* Scale deviation to rttvar fixed point */
			m >>= 1;
			if (m < tp->mdev)
				m = tp->mdev;

			if (m >= dst_metric(dst, RTAX_RTTVAR))
				dst->metrics[RTAX_RTTVAR-1] = m;
			else
				dst->metrics[RTAX_RTTVAR-1] -=
					(dst->metrics[RTAX_RTTVAR-1] - m)>>2;
		}

		if (tp->snd_ssthresh >= 0xFFFF) {
			/* Slow start still did not finish. */
			if (dst_metric(dst, RTAX_SSTHRESH) &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
			if (!dst_metric_locked(dst, RTAX_CWND) &&
			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
			   icsk->icsk_ca_state == TCP_CA_Open) {
			/* Cong. avoidance phase, cwnd is reliable. */
			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
				dst->metrics[RTAX_SSTHRESH-1] =
					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
		} else {
			/* Else slow start did not finish, cwnd is nonsense,
			   ssthresh may also be invalid.
			 */
			if (!dst_metric_locked(dst, RTAX_CWND))
				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
			if (dst->metrics[RTAX_SSTHRESH-1] &&
			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
			    tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
		}

		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
			if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
			    tp->reordering != sysctl_tcp_reordering)
				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
		}
	}
}

/* Numbers are taken from RFC3390.
 *
 * John Heffner states:
 *
 *	The RFC specifies a window of no more than 4380 bytes
 *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
 *	is a bit misleading because they use a clamp at 4380 bytes
 *	rather than use a multiplier in the relevant range.
 */
__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd) {
		if (tp->mss_cache > 1460)
			cwnd = 2;
		else
			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
	}
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
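
/* The mapping above follows RFC3390's min(4*MSS, max(2*MSS, 4380))
 * expressed in whole segments.  Worked example (illustrative): for
 * MSS = 1460 the initial window is 3 segments = 4380 bytes; for
 * MSS = 536 it is 4 segments = 2144 bytes; for MSS > 1460 (e.g. jumbo
 * frames) it drops to 2 segments, keeping the burst near 4380 bytes.
 */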

/* Set slow start threshold and cwnd not falling to slow start */
void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);

	tp->prior_ssthresh = 0;
	tp->bytes_acked = 0;
	if (icsk->icsk_ca_state < TCP_CA_CWR) {
		tp->undo_marker = 0;
		if (set_ssthresh)
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tp->snd_cwnd = min(tp->snd_cwnd,
				   tcp_packets_in_flight(tp) + 1U);
		tp->snd_cwnd_cnt = 0;
		tp->high_seq = tp->snd_nxt;
		tp->snd_cwnd_stamp = tcp_time_stamp;
		TCP_ECN_queue_cwr(tp);

		tcp_set_ca_state(sk, TCP_CA_CWR);
	}
}
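
/* Illustrative scenario (assumed numbers): an ECN echo arrives with
 * cwnd = 20 but only 10 packets in flight.  Entering CWR clamps cwnd
 * to min(20, 10 + 1) = 11 right away, records high_seq = snd_nxt, and
 * the ca_ops->ssthresh() hook (e.g. cwnd/2 for Reno-style modules)
 * decides the new slow start threshold when set_ssthresh is nonzero.
 */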

/* Initialize metrics on socket. */

static void tcp_init_metrics(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst == NULL)
		goto reset;

	dst_confirm(dst);

	if (dst_metric_locked(dst, RTAX_CWND))
		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
	if (dst_metric(dst, RTAX_SSTHRESH)) {
		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
			tp->snd_ssthresh = tp->snd_cwnd_clamp;
	}
	if (dst_metric(dst, RTAX_REORDERING) &&
	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
		tp->rx_opt.sack_ok &= ~2;
		tp->reordering = dst_metric(dst, RTAX_REORDERING);
	}

	if (dst_metric(dst, RTAX_RTT) == 0)
		goto reset;

	if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
		goto reset;

	/* Initial rtt is determined from SYN,SYN-ACK.
	 * The segment is small and rtt may appear much
	 * less than real one. Use per-dst memory
	 * to make it more realistic.
	 *
	 * A bit of theory. RTT is time passed after "normal" sized packet
	 * is sent until it is ACKed. In normal circumstances sending small
	 * packets forces the peer to delay ACKs and the calculation is correct too.
	 * The algorithm is adaptive and, provided we follow specs, it
	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
	 * tricks sort of "quick acks" for time long enough to decrease RTT
	 * to a low value, and then abruptly stops doing it and starts to delay
	 * ACKs, expect trouble.
	 */
	if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
		tp->srtt = dst_metric(dst, RTAX_RTT);
		tp->rtt_seq = tp->snd_nxt;
	}
	if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
		tp->mdev = dst_metric(dst, RTAX_RTTVAR);
		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
	}
	tcp_set_rto(sk);
	tcp_bound_rto(sk);
	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
		goto reset;
	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	return;

reset:
	/* Play conservative. If timestamps are not
	 * supported, TCP will fail to recalculate correct
	 * rtt, if initial rto is too small. FORGET ALL AND RESET!
	 */
	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
		tp->srtt = 0;
		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	}
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (metric > tp->reordering) {
		tp->reordering = min(TCP_MAX_REORDERING, metric);

		/* This exciting event is worth remembering. 8) */
		if (ts)
			NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
		else if (IsReno(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
		else if (IsFack(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
#if FASTRETRANS_DEBUG > 1
		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
		       tp->reordering,
		       tp->fackets_out,
		       tp->sacked_out,
		       tp->undo_marker ? tp->undo_retrans : 0);
#endif
		/* Disable FACK yet. */
		tp->rx_opt.sack_ok &= ~2;
	}
}
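
/* Illustrative case (assumed numbers): a SACK fills a hole and shows a
 * never-retransmitted segment was received 7 packets below the highest
 * SACKed one.  metric = 7 exceeds the default tp->reordering of
 * TCP_FASTRETRANS_THRESH (3), so the window of tolerated reordering
 * widens to 7 and FACK, which assumes in-order delivery, is disabled.
 */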

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of one of three flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	   A''. Its FACK modification, head until snd.fack is lost.
 *	B. SACK arrives sacking data transmitted after never retransmitted
 *	   hole was sent out.
 *	C. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note that the state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering..
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 */
static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned char *ptr = (skb_transport_header(ack_skb) +
			      TCP_SKB_CB(ack_skb)->sacked);
	struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
	struct sk_buff *cached_skb;
	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
	int reord = tp->packets_out;
	int prior_fackets;
	u32 lost_retrans = 0;
	int flag = 0;
	int found_dup_sack = 0;
	int cached_fack_count;
	int i;
	int first_sack_index;

	if (!tp->sacked_out)
		tp->fackets_out = 0;
	prior_fackets = tp->fackets_out;

	/* Check for D-SACK. */
	if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) {
		flag |= FLAG_DSACKING_ACK;
		found_dup_sack = 1;
		tp->rx_opt.sack_ok |= 4;
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1 &&
			!after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) &&
			!before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) {
		flag |= FLAG_DSACKING_ACK;
		found_dup_sack = 1;
		tp->rx_opt.sack_ok |= 4;
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
	}

	/* D-SACK for already forgotten data...
	 * Do dumb counting. */
	if (found_dup_sack &&
			!after(ntohl(sp[0].end_seq), prior_snd_una) &&
			after(ntohl(sp[0].end_seq), tp->undo_marker))
		tp->undo_retrans--;

	/* Eliminate too old ACKs, but take into
	 * account more or less fresh ones, they can
	 * contain valid SACK info.
	 */
	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
		return 0;

	/* SACK fastpath:
	 * if the only SACK change is the increase of the end_seq of
	 * the first block, then only apply that SACK block and use
	 * retrans queue hinting; otherwise take the slow path. */
	flag = 1;
	for (i = 0; i < num_sacks; i++) {
		__be32 start_seq = sp[i].start_seq;
		__be32 end_seq = sp[i].end_seq;

		if (i == 0) {
			if (tp->recv_sack_cache[i].start_seq != start_seq)
				flag = 0;
		} else {
			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
			    (tp->recv_sack_cache[i].end_seq != end_seq))
				flag = 0;
		}
		tp->recv_sack_cache[i].start_seq = start_seq;
		tp->recv_sack_cache[i].end_seq = end_seq;
	}
	/* Clear the rest of the cache sack blocks so they won't match mistakenly. */
	for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
		tp->recv_sack_cache[i].start_seq = 0;
		tp->recv_sack_cache[i].end_seq = 0;
	}

	first_sack_index = 0;
	if (flag)
		num_sacks = 1;
	else {
		int j;
		tp->fastpath_skb_hint = NULL;

		/* order SACK blocks to allow in order walk of the retrans queue */
		for (i = num_sacks-1; i > 0; i--) {
			for (j = 0; j < i; j++){
				if (after(ntohl(sp[j].start_seq),
					  ntohl(sp[j+1].start_seq))){
					struct tcp_sack_block_wire tmp;

					tmp = sp[j];
					sp[j] = sp[j+1];
					sp[j+1] = tmp;

					/* Track where the first SACK block goes to */
					if (j == first_sack_index)
						first_sack_index = j+1;
				}

			}
		}
	}

	/* clear flag as used for different purpose in following code */
	flag = 0;

	/* Use SACK fastpath hint if valid */
	cached_skb = tp->fastpath_skb_hint;
	cached_fack_count = tp->fastpath_cnt_hint;
	if (!cached_skb) {
		cached_skb = tcp_write_queue_head(sk);
		cached_fack_count = 0;
	}

	for (i=0; i<num_sacks; i++, sp++) {
		struct sk_buff *skb;
		__u32 start_seq = ntohl(sp->start_seq);
		__u32 end_seq = ntohl(sp->end_seq);
		int fack_count;
		int dup_sack = (found_dup_sack && (i == first_sack_index));

		skb = cached_skb;
		fack_count = cached_fack_count;

		/* Event "B" in the comment above. */
		if (after(end_seq, tp->high_seq))
			flag |= FLAG_DATA_LOST;

		tcp_for_write_queue_from(skb, sk) {
			int in_sack, pcount;
			u8 sacked;

			if (skb == tcp_send_head(sk))
				break;

			cached_skb = skb;
			cached_fack_count = fack_count;
			if (i == first_sack_index) {
				tp->fastpath_skb_hint = skb;
				tp->fastpath_cnt_hint = fack_count;
			}

			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
			if (!before(TCP_SKB_CB(skb)->seq, end_seq))
				break;

			in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
				!before(end_seq, TCP_SKB_CB(skb)->end_seq);

			pcount = tcp_skb_pcount(skb);

			if (pcount > 1 && !in_sack &&
			    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
				unsigned int pkt_len;

				in_sack = !after(start_seq,
						 TCP_SKB_CB(skb)->seq);

				if (!in_sack)
					pkt_len = (start_seq -
						   TCP_SKB_CB(skb)->seq);
				else
					pkt_len = (end_seq -
						   TCP_SKB_CB(skb)->seq);
				if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
					break;
				pcount = tcp_skb_pcount(skb);
			}

			fack_count += pcount;

			sacked = TCP_SKB_CB(skb)->sacked;

			/* Account D-SACK for retransmitted packet. */
			if ((dup_sack && in_sack) &&
			    (sacked & TCPCB_RETRANS) &&
			    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
				tp->undo_retrans--;

			/* The frame is ACKed. */
			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
				if (sacked&TCPCB_RETRANS) {
					if ((dup_sack && in_sack) &&
					    (sacked&TCPCB_SACKED_ACKED))
						reord = min(fack_count, reord);
				} else {
					/* If it was in a hole, we detected reordering. */
					if (fack_count < prior_fackets &&
					    !(sacked&TCPCB_SACKED_ACKED))
						reord = min(fack_count, reord);
				}

				/* Nothing to do; acked frame is about to be dropped. */
				continue;
			}

			if ((sacked&TCPCB_SACKED_RETRANS) &&
			    after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
			    (!lost_retrans || after(end_seq, lost_retrans)))
				lost_retrans = end_seq;

			if (!in_sack)
				continue;

			if (!(sacked&TCPCB_SACKED_ACKED)) {
				if (sacked & TCPCB_SACKED_RETRANS) {
					/* If the segment is not tagged as lost,
					 * we do not clear RETRANS, believing
					 * that retransmission is still in flight.
					 */
					if (sacked & TCPCB_LOST) {
						TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
						tp->lost_out -= tcp_skb_pcount(skb);
						tp->retrans_out -= tcp_skb_pcount(skb);

						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
				} else {
					/* New sack for not retransmitted frame,
					 * which was in hole. It is reordering.
					 */
					if (!(sacked & TCPCB_RETRANS) &&
					    fack_count < prior_fackets)
						reord = min(fack_count, reord);

					if (sacked & TCPCB_LOST) {
						TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
						tp->lost_out -= tcp_skb_pcount(skb);

						/* clear lost hint */
						tp->retransmit_skb_hint = NULL;
					}
					/* SACK enhanced F-RTO detection.
					 * Set flag if and only if non-rexmitted
					 * segments below frto_highmark are
					 * SACKed (RFC4138; Appendix B).
					 * Clearing correct due to in-order walk
					 */
					if (after(end_seq, tp->frto_highmark)) {
						flag &= ~FLAG_ONLY_ORIG_SACKED;
					} else {
						if (!(sacked & TCPCB_RETRANS))
							flag |= FLAG_ONLY_ORIG_SACKED;
					}
				}

				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
				flag |= FLAG_DATA_SACKED;
				tp->sacked_out += tcp_skb_pcount(skb);

				if (fack_count > tp->fackets_out)
					tp->fackets_out = fack_count;
			} else {
				if (dup_sack && (sacked&TCPCB_RETRANS))
					reord = min(fack_count, reord);
			}

			/* D-SACK. We can detect redundant retransmission
			 * in S|R and plain R frames and clear it.
			 * undo_retrans is decreased above, L|R frames
			 * are accounted above as well.
			 */
			if (dup_sack &&
			    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);
				tp->retransmit_skb_hint = NULL;
			}
		}
	}

	/* Check for lost retransmit. This superb idea is
	 * borrowed from "ratehalving". Event "C".
	 * Later note: FACK people cheated me again 8),
	 * we have to account for reordering! Ugly,
	 * but should help.
	 */
	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
		struct sk_buff *skb;

		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
				break;
			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
				continue;
			if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
			    after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
			    (IsFack(tp) ||
			     !before(lost_retrans,
				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
				     tp->mss_cache))) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);

				/* clear lost hint */
				tp->retransmit_skb_hint = NULL;

				if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
					tp->lost_out += tcp_skb_pcount(skb);
					TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
					flag |= FLAG_DATA_SACKED;
					NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
				}
			}
		}
	}

	tp->left_out = tp->sacked_out + tp->lost_out;

	if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
		tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);

#if FASTRETRANS_DEBUG > 0
	BUG_TRAP((int)tp->sacked_out >= 0);
	BUG_TRAP((int)tp->lost_out >= 0);
	BUG_TRAP((int)tp->retrans_out >= 0);
	BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
#endif
	return flag;
}
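
/* Walk-through of the tag machine (illustrative): a segment times out
 * and is retransmitted (R, retrans_out++); a SACK covering it arrives,
 * so it becomes S|R (original reached the receiver, the retransmit may
 * still be in flight); a later D-SACK for the same range proves the
 * retransmit was redundant, so R is cleared and retrans_out dropped,
 * leaving plain S, exactly the "D-SACK changes any tag to S" rule.
 */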

/* F-RTO can only be used if TCP has never retransmitted anything other than
 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
 */
int tcp_use_frto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!sysctl_tcp_frto)
		return 0;

	if (IsSackFrto())
		return 1;

	/* Avoid expensive walking of rexmit queue if possible */
	if (tp->retrans_out > 1)
		return 0;

	skb = tcp_write_queue_head(sk);
	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
			return 0;
		/* Short-circuit when first non-SACKed skb has been checked */
		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED))
			break;
	}
	return 1;
}

/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
 * recovery a bit and use heuristics in tcp_process_frto() to detect if
 * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
 * keep retrans_out counting accurate (with SACK F-RTO, other than head
 * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
 * bits are handled if the Loss state is really to be entered (in
 * tcp_enter_frto_loss).
 *
 * Do like tcp_enter_loss() would; when RTO expires the second time it
 * does:
 *  "Reduce ssthresh if it has not yet been made inside this window."
 */
void tcp_enter_frto(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
	    tp->snd_una == tp->high_seq ||
	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
	     !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		/* Our state is too optimistic in ssthresh() call because cwnd
		 * is not reduced until tcp_enter_frto_loss() when previous FRTO
		 * recovery has not yet completed. Pattern would be this: RTO,
		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
		 * up here twice).
		 * RFC4138 should be more specific on what to do, even though
		 * RTO is quite unlikely to occur after the first Cumulative ACK
		 * due to back-off and complexity of triggering events ...
		 */
		if (tp->frto_counter) {
			u32 stored_cwnd;
			stored_cwnd = tp->snd_cwnd;
			tp->snd_cwnd = 2;
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
			tp->snd_cwnd = stored_cwnd;
		} else {
			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		}
		/* ... in theory, cong.control module could do "any tricks" in
		 * ssthresh(), which means that ca_state, lost bits and lost_out
		 * counter would have to be faked before the call occurs. We
		 * consider that too expensive, unlikely and hacky, so modules
		 * using these in ssthresh() must deal with these incompatibility
		 * issues if they receive CA_EVENT_FRTO and frto_counter != 0
		 */
		tcp_ca_event(sk, CA_EVENT_FRTO);
	}

	tp->undo_marker = tp->snd_una;
	tp->undo_retrans = 0;

	skb = tcp_write_queue_head(sk);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
	}
	tcp_sync_left_out(tp);

	/* Earlier loss recovery underway (see RFC4138; Appendix B).
	 * The last condition is necessary at least in tp->frto_counter case.
	 */
	if (IsSackFrto() && (tp->frto_counter ||
	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
	    after(tp->high_seq, tp->snd_una)) {
		tp->frto_highmark = tp->high_seq;
	} else {
		tp->frto_highmark = tp->snd_nxt;
	}
	tcp_set_ca_state(sk, TCP_CA_Disorder);
	tp->high_seq = tp->snd_nxt;
	tp->frto_counter = 1;
}
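
/* Rough timeline (illustrative): the RTO fires, tcp_enter_frto() runs,
 * frto_counter = 1, and only the head segment is retransmitted.  If the
 * following ACKs advance snd_una onto data that was never retransmitted,
 * the heuristics in tcp_process_frto() judge the RTO spurious and the
 * congestion state can be restored; a duplicate ACK instead falls back
 * to the conventional go-back-N recovery in tcp_enter_frto_loss() below.
 */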

/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
 * which indicates that we should follow the traditional RTO recovery,
 * i.e. mark everything lost and do go-back-N retransmission.
 */
static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt = 0;

	tp->sacked_out = 0;
	tp->lost_out = 0;
	tp->fackets_out = 0;
	tp->retrans_out = 0;

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		cnt += tcp_skb_pcount(skb);
		/*
		 * Count the retransmission made on RTO correctly (only when
		 * waiting for the first ACK and did not get it)...
		 */
		if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
			/* For some reason this R-bit might get cleared? */
			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
				tp->retrans_out += tcp_skb_pcount(skb);
			/* ...enter this if branch just for the first segment */
			flag |= FLAG_DATA_ACKED;
		} else {
			TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
		}
		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {

			/* Do not mark those segments lost that were
			 * forward transmitted after RTO
			 */
			if (!after(TCP_SKB_CB(skb)->end_seq,
				   tp->frto_highmark)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out += tcp_skb_pcount(skb);
			}
		} else {
			tp->sacked_out += tcp_skb_pcount(skb);
			tp->fackets_out = cnt;
		}
	}
	tcp_sync_left_out(tp);

	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
	tp->snd_cwnd_cnt = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->undo_marker = 0;
	tp->frto_counter = 0;

	tp->reordering = min_t(unsigned int, tp->reordering,
					     sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
	tp->high_seq = tp->frto_highmark;
	TCP_ECN_queue_cwr(tp);

	clear_all_retrans_hints(tp);
}

void tcp_clear_retrans(struct tcp_sock *tp)
{
	tp->left_out = 0;
	tp->retrans_out = 0;

	tp->fackets_out = 0;
	tp->sacked_out = 0;
	tp->lost_out = 0;

	tp->undo_marker = 0;
	tp->undo_retrans = 0;
}

/* Enter Loss state. If "how" is not zero, forget all SACK information
 * and reset tags completely, otherwise preserve SACKs. If receiver
 * dropped its ofo queue, we will know this due to reneging detection.
 */
void tcp_enter_loss(struct sock *sk, int how)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt = 0;

	/* Reduce ssthresh if it has not yet been made inside this window. */
	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
		tp->prior_ssthresh = tcp_current_ssthresh(sk);
		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
		tcp_ca_event(sk, CA_EVENT_LOSS);
	}
	tp->snd_cwnd	   = 1;
	tp->snd_cwnd_cnt   = 0;
	tp->snd_cwnd_stamp = tcp_time_stamp;

	tp->bytes_acked = 0;
	tcp_clear_retrans(tp);

	/* Push undo marker, if it was plain RTO and nothing
	 * was retransmitted. */
	if (!how)
		tp->undo_marker = tp->snd_una;

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		cnt += tcp_skb_pcount(skb);
		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
			tp->undo_marker = 0;
		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
			tp->lost_out += tcp_skb_pcount(skb);
		} else {
			tp->sacked_out += tcp_skb_pcount(skb);
			tp->fackets_out = cnt;
		}
	}
	tcp_sync_left_out(tp);

	tp->reordering = min_t(unsigned int, tp->reordering,
					     sysctl_tcp_reordering);
	tcp_set_ca_state(sk, TCP_CA_Loss);
	tp->high_seq = tp->snd_nxt;
	TCP_ECN_queue_cwr(tp);