/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>

/* People can turn this off for buggy TCPs found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;

/* People can turn this on to work with those rare, broken TCPs that
 * interpret the window field as a signed quantity.
 */
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;

/* Default TSQ limit of two TSO segments */
int sysctl_tcp_limit_output_bytes __read_mostly = 131072;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor __read_mostly = 3;

int sysctl_tcp_mtu_probing __read_mostly = 0;
int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;

/* By default, RFC2861 behavior.  */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

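/* Forward declaration of the main transmit loop, defined later in this file. */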
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp);

/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int prior_packets = tp->packets_out;

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		tcp_rearm_rto(sk);
	}
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tcp_wnd_end(tp);
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	if (dst) {
		unsigned int metric = dst_metric_advmss(dst);

		if (metric < mss) {
			mss = metric;
			tp->advmss = mss;
		}
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window".
 * This is the first part of the cwnd validation mechanism. */
static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
{
	struct tcp_sock *tp = tcp_sk(sk);
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}

/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_time_stamp;
	const struct dst_entry *dst = __sk_dst_get(sk);

	if (sysctl_tcp_slow_start_after_idle &&
	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
		tcp_cwnd_restart(sk, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If this is a reply sent within ato of the last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato &&
	    (!dst || !dst_metric(dst, RTAX_QUICKACK)))
			icsk->icsk_ack.pingpong = 1;
}

/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

u32 tcp_default_init_rwnd(u32 mss)
{
	/* Initial receive window should be twice TCP_INIT_CWND to
	 * enable proper sending of new unsent data during fast recovery
	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
	 * limit when mss is larger than 1460.
	 */
	u32 init_rwnd = TCP_INIT_CWND * 2;

	if (mss > 1460)
		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
	return init_rwnd;
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
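/* Worked example (illustrative, not from the original source): with a
 * maximum receive buffer of 4 MB, the scaling loop below ends with
 * rcv_wscale = 7, since 4194304 >> 7 = 32768 is the first value that
 * fits below the 65535 limit.
 */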
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	if (mss > (1 << *rcv_wscale)) {
		if (!init_rcv_wnd) /* Use default unless specified otherwise */
			init_rcv_wnd = tcp_default_init_rwnd(mss);
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);

/* Choose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}

/* Packet ECN state for a SYN-ACK */
static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (!(tp->ecn_flags & TCP_ECN_OK))
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
}

/* Packet ECN state for a SYN.  */
static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->ecn_flags = 0;
	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
		tp->ecn_flags = TCP_ECN_OK;
	}
}

static __inline__ void
TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

/* Set up ECN state for a packet on an ESTABLISHED socket that is about to
 * be sent.
 */
static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
				int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				tcp_hdr(skb)->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			tcp_hdr(skb)->ece = 1;
	}
}

/* Constructs common control bits of non-data skb. If SYN/FIN is present,
 * auto increment end seqno.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum = 0;

	TCP_SKB_CB(skb)->tcp_flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	skb_shinfo(skb)->gso_segs = 1;
	skb_shinfo(skb)->gso_size = 0;
	skb_shinfo(skb)->gso_type = 0;

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

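/* Urgent mode is in effect while urgent data is still outstanding,
 * i.e. snd_up differs from snd_una.
 */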
static inline bool tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;
}

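/* Bits for tcp_out_options.options, selecting which TCP options to emit. */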
#define OPTION_SACK_ADVERTISE	(1 << 0)
#define OPTION_TS		(1 << 1)
#define OPTION_MD5		(1 << 2)
#define OPTION_WSCALE		(1 << 3)
#define OPTION_FAST_OPEN_COOKIE	(1 << 8)

struct tcp_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* 0 to disable */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	__u8 *hash_location;	/* temporary pointer, overloaded */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
};

/* Write previously computed TCP options to the packet.
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options, we learned this the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance but from
 * an interoperability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * a particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
 */
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP  << 24) |
			       (TCPOPT_NOP  << 16) |
			       (TCPOPT_SACK <<  8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		tp->rx_opt.dsack = 0;
	}

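	/* Fast Open cookies are carried in the experimental option format:
	 * kind TCPOPT_EXP, length, a two-byte magic (TCPOPT_FASTOPEN_MAGIC),
	 * then the cookie bytes, NOP-padded to a 32-bit boundary.
	 */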
	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;

		*ptr++ = htonl((TCPOPT_EXP << 24) |
			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
			       TCPOPT_FASTOPEN_MAGIC);

		memcpy(ptr, foc->val, foc->len);
		if ((foc->len & 3) == 2) {
			u8 *align = ((u8 *)ptr) + foc->len;
			align[0] = align[1] = TCPOPT_NOP;
		}
		ptr += (foc->len + 3) >> 2;
	}
}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 */
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				struct tcp_out_options *opts,
				struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	struct tcp_fastopen_request *fastopen = tp->fastopen_req;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (*md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	/* We always get an MSS option.  The option bytes which will be seen in
	 * normal data packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so that
	 * calculations in tcp_sendmsg are simpler etc.  So account for this
	 * fact here if necessary.  If we don't do this correctly, as a
	 * receiver we won't recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK rules correctly.
	 * SACKs don't matter, we never delay an ACK when we have any of those
	 * going out.  */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
		opts->options |= OPTION_TS;
		opts->tsval = TCP_SKB_CB(skb)->when + tp->tsoffset;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(sysctl_tcp_window_scaling)) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(sysctl_tcp_sack)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	if (fastopen && fastopen->cookie.len >= 0) {
		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = &fastopen->cookie;
			remaining -= need;
			tp->syn_fastopen = 1;
		}
	}

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Set up TCP options for SYN-ACKs. */
static unsigned int tcp_synack_options(struct sock *sk,
				   struct request_sock *req,
				   unsigned int mss, struct sk_buff *skb,
				   struct tcp_out_options *opts,
				   struct tcp_md5sig_key **md5,
				   struct tcp_fastopen_cookie *foc)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	if (*md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options. There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		ireq->tstamp_ok &= !ireq->sack_ok;
	}
#else
	*md5 = NULL;
#endif

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = TCP_SKB_CB(skb)->when;
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!ireq->tstamp_ok))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
	if (foc != NULL) {
		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = foc;
			remaining -= need;
		}
	}

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
 */
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_out_options *opts,
					struct tcp_md5sig_key **md5)
{
	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int size = 0;
	unsigned int eff_sacks;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (unlikely(*md5)) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcb ? tcb->when + tp->tsoffset : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}

	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
		opts->num_sack_blocks =
			min_t(unsigned int, eff_sacks,
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);
		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	return size;
}

/* TCP SMALL QUEUES (TSQ)
 *
 * The TSQ goal is to keep a small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
 * the skb needs to be reallocated in a driver.
 * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct	tasklet;
	struct list_head	head; /* queue of tcp sockets */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);

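/* Push pending frames for a socket whose state still allows transmission;
 * called from the TSQ tasklet and from tcp_release_cb().
 */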
static void tcp_tsq_handler(struct sock *sk)
{
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
		tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
}
/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 */
static void tcp_tasklet_func(unsigned long data)
{
	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;

	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);
	local_irq_restore(flags);

	list_for_each_safe(q, n, &list) {
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);

		sk = (struct sock *)tp;
		bh_lock_sock(sk);

		if (!sock_owned_by_user(sk)) {
			tcp_tsq_handler(sk);
		} else {
			/* defer the work to tcp_release_cb() */
			set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
		}
		bh_unlock_sock(sk);

		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
		sk_free(sk);
	}
}

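/* tsq_flags bits whose deferred work is handled by tcp_release_cb(). */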
#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
737
738
			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
			  (1UL << TCP_MTU_REDUCED_DEFERRED))
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 */
void tcp_release_cb(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = tp->tsq_flags;
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);

	if (flags & (1UL << TCP_TSQ_DEFERRED))
		tcp_tsq_handler(sk);

	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
		sk->sk_prot->mtu_reduced(sk);
		__sock_put(sk);
	}
}
EXPORT_SYMBOL(tcp_release_cb);

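/* Set up the per-cpu TSQ tasklets; runs once at boot. */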
void __init tcp_tasklet_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);

		INIT_LIST_HEAD(&tsq->head);
		tasklet_init(&tsq->tasklet,
			     tcp_tasklet_func,
			     (unsigned long)tsq);
	}
}

/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
		unsigned long flags;
		struct tsq_tasklet *tsq;

		/* Keep a ref on socket.
		 * This last ref will be released in tcp_tasklet_func()
		 */
		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);

		/* queue this socket to tasklet queue */
		local_irq_save(flags);
		tsq = &__get_cpu_var(tsq_tasklet);
		list_add(&tp->tsq_node, &tsq->head);
		tasklet_schedule(&tsq->tasklet);
		local_irq_restore(flags);
	} else {
		sock_wfree(skb);
	}
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));

	/* If congestion control is doing timestamping, we must
	 * take such a timestamp before we potentially clone/copy.
	 */
	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
		__net_timestamp(skb);

	if (likely(clone_it)) {
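		/* Fast-cloned skbs are allocated in adjacent pairs, so skb + 1 is
		 * the companion clone; if it is still a clone, the previous transmit
		 * of this skb is presumably still queued below us (qdisc/driver).
		 */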
		const struct sk_buff *fclone = skb + 1;

		if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
			     fclone->fclone == SKB_FCLONE_CLONE))
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);

		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}

	inet = inet_sk(sk);
	tp = tcp_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
			  tcp_wfree : sock_wfree;
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);

	/* Build TCP header and checksum it. */
	th = tcp_hdr(skb);
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(tp->rcv_nxt);
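	/* Data offset (header length in 32-bit words), reserved bits and flags
	 * are written with a single 16-bit store.
	 */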
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	} else {
		th->window	= htons(tcp_select_window(sk));
	}
	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
		TCP_ECN_send(sk, skb, tcp_header_size);

#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, NULL, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));

	if (skb->len != tcp_header_size)
		tcp_event_data_sent(tp, sk);

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));

	err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
	if (likely(err <= 0))
		return err;

	tcp_enter_cwr(sk, 1);

	return net_xmit_eval(err);
}

/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
}

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	if (skb->len <= mss_now || !sk_can_gso(sk) ||
	    skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		skb_shinfo(skb)->gso_segs = 1;
		skb_shinfo(skb)->gso_size = 0;
		skb_shinfo(skb)->gso_type = 0;
	} else {
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		skb_shinfo(skb)->gso_size = mss_now;
		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	}
}

/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not.
 */
static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
				   int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->sacked_out || tcp_is_reno(tp))
		return;

	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
		tp->fackets_out -= decr;
}

/* Pcount of an skb in the middle of the write queue got changed; we need to
 * do various tweaks to fix the counters
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh... */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	tcp_adjust_fackets_out(sk, skb, decr);

	if (tp->lost_skb_hint &&
	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
		tp->lost_cnt_hint -= decr;

	tcp_verify_left_out(tp);
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
		 unsigned int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	int nlen;
	u8 flags;

	if (WARN_ON(len > skb->len))
		return -EINVAL;

	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */

	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	nlen = skb->len - len - nsize;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len,
						       skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses the 'when' of
	 * skbs which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
	buff->tstamp = skb->tstamp;

	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(sk, skb, mss_now);
	tcp_set_skb_tso_segs(sk, buff, mss_now);

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
	}

	/* Link BUFF into the send queue. */