/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/xfrm.h>

#include <asm/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>

struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;

static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	put_page(buf->page);
}

static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf)
{
	get_page(buf->page);
}

static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}


/* Pipe buffer operations for a socket. */
static const struct pipe_buf_operations sock_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = sock_pipe_buf_release,
	.steal = sock_pipe_buf_steal,
	.get = sock_pipe_buf_get,
};

/**
 *	skb_panic - private function for out-of-line support
 *	@skb:	buffer
 *	@sz:	size
 *	@addr:	address
 *	@msg:	skb_over_panic or skb_under_panic
 *
 *	Out-of-line support for skb_put() and skb_push().
 *	Called via the wrapper skb_over_panic() or skb_under_panic().
 *	Keep out of line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 */
#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)

static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
			       unsigned long ip, bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
{
	struct sk_buff *skb;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(skbuff_head_cache,
				    gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->head = NULL;
	skb->truesize = sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);

	skb->mac_header = (typeof(skb->mac_header))~0U;
out:
	return skb;
}

/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *		instead of head cache and allocate a cloned (child) skb.
 *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of at least size bytes. The object has a reference count
 *	of one. The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;
	bool pfmemalloc;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (!data)
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	size = SKB_WITH_OVERHEAD(ksize(data));
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* Account for allocated memory : skb + skb->head */
	skb->truesize = SKB_TRUESIZE(size);
	skb->pfmemalloc = pfmemalloc;
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);

		kmemcheck_annotate_bitfield(child, flags1);
		kmemcheck_annotate_bitfield(child, flags2);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
		child->pfmemalloc = pfmemalloc;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
EXPORT_SYMBOL(__alloc_skb);
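
/*
 * Illustrative usage sketch (not part of the original file): the common
 * caller pattern for __alloc_skb() via the alloc_skb() wrapper.  The
 * function and parameter names here are hypothetical.
 */
static inline struct sk_buff *example_alloc_with_headroom(unsigned int headroom,
							  unsigned int len)
{
	struct sk_buff *skb = alloc_skb(headroom + len, GFP_ATOMIC);

	if (!skb)
		return NULL;
	skb_reserve(skb, headroom);	/* turn part of the tailroom into headroom */
	skb_put(skb, len);		/* open len bytes of data at the tail */
	return skb;
}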

/**
 * build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of fragment, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	unsigned int size = frag_size ? : ksize(data);

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (!skb)
		return NULL;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->truesize = SKB_TRUESIZE(size);
	skb->head_frag = frag_size != 0;
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb->end = skb->tail + size;
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);
	kmemcheck_annotate_variable(shinfo->destructor_arg);

	return skb;
}
EXPORT_SYMBOL(build_skb);
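
/*
 * Illustrative sketch (not part of the original file) of the RX recipe in
 * the comment above: the driver posts a raw buffer with NET_SKB_PAD
 * headroom and skb_shared_info tailroom already accounted for, then wraps
 * it with build_skb() after DMA completes.  Names are hypothetical.
 */
static inline struct sk_buff *example_rx_build(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = build_skb(data, frag_size);

	if (unlikely(!skb)) {
		/* build_skb() does not free @data on failure */
		put_page(virt_to_head_page(data));
		return NULL;
	}
	skb_reserve(skb, NET_SKB_PAD);	/* the headroom the driver set aside */
	return skb;
}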

struct netdev_alloc_cache {
	struct page_frag	frag;
	/* we maintain a pagecount bias, so that we don't dirty cache line
	 * containing page->_count every time we allocate a fragment.
	 */
	unsigned int		pagecnt_bias;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);

static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
	struct netdev_alloc_cache *nc;
	void *data = NULL;
	int order;
	unsigned long flags;

	local_irq_save(flags);
	nc = &__get_cpu_var(netdev_alloc_cache);
	if (unlikely(!nc->frag.page)) {
refill:
		for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
			gfp_t gfp = gfp_mask;

			if (order)
				gfp |= __GFP_COMP | __GFP_NOWARN;
			nc->frag.page = alloc_pages(gfp, order);
			if (likely(nc->frag.page))
				break;
			if (--order < 0)
				goto end;
		}
		nc->frag.size = PAGE_SIZE << order;
recycle:
		atomic_set(&nc->frag.page->_count, NETDEV_PAGECNT_MAX_BIAS);
		nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
		nc->frag.offset = 0;
	}

	if (nc->frag.offset + fragsz > nc->frag.size) {
		/* avoid unnecessary locked operations if possible */
		if ((atomic_read(&nc->frag.page->_count) == nc->pagecnt_bias) ||
		    atomic_sub_and_test(nc->pagecnt_bias, &nc->frag.page->_count))
			goto recycle;
		goto refill;
	}

	data = page_address(nc->frag.page) + nc->frag.offset;
	nc->frag.offset += fragsz;
	nc->pagecnt_bias--;
end:
	local_irq_restore(flags);
	return data;
}

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 */
void *netdev_alloc_frag(unsigned int fragsz)
{
	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
}
EXPORT_SYMBOL(netdev_alloc_frag);

/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
				   unsigned int length, gfp_t gfp_mask)
{
	struct sk_buff *skb = NULL;
	unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
		void *data;

		if (sk_memalloc_socks())
			gfp_mask |= __GFP_MEMALLOC;

		data = __netdev_alloc_frag(fragsz, gfp_mask);

		if (likely(data)) {
			skb = build_skb(data, fragsz);
			if (unlikely(!skb))
				put_page(virt_to_head_page(data));
		}
	} else {
		skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
				  SKB_ALLOC_RX, NUMA_NO_NODE);
	}
	if (likely(skb)) {
		skb_reserve(skb, NET_SKB_PAD);
		skb->dev = dev;
	}
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
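
/*
 * Illustrative sketch (not part of the original file): a typical RX-path
 * caller of the netdev_alloc_skb() wrapper.  It assumes
 * <linux/etherdevice.h> for eth_type_trans(); names are hypothetical.
 */
static inline int example_netdev_rx(struct net_device *dev,
				    const void *frame, unsigned int frame_len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, frame_len);

	if (unlikely(!skb))
		return -ENOMEM;
	memcpy(skb_put(skb, frame_len), frame, frame_len);
	skb->protocol = eth_type_trans(skb, dev);	/* sets pkt_type too */
	return netif_rx(skb);
}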

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);
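
/*
 * Illustrative sketch (not part of the original file): pairing
 * netdev_alloc_frag() with skb_add_rx_frag() to attach payload as a page
 * fragment instead of copying it.  Assumes the skb has no frags yet.
 */
static inline int example_attach_rx_frag(struct sk_buff *skb, unsigned int fragsz)
{
	void *data = netdev_alloc_frag(fragsz);
	struct page *page;

	if (unlikely(!data))
		return -ENOMEM;
	page = virt_to_head_page(data);
	/* frag index 0; truesize is the whole fragment we consumed */
	skb_add_rx_frag(skb, 0, page, data - page_address(page),
			fragsz, fragsz);
	return 0;
}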

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static void skb_free_head(struct sk_buff *skb)
{
	if (skb->head_frag)
		put_page(virt_to_head_page(skb->head));
	else
		kfree(skb->head);
}

static void skb_release_data(struct sk_buff *skb)
{
	if (!skb->cloned ||
	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			       &skb_shinfo(skb)->dataref)) {
		if (skb_shinfo(skb)->nr_frags) {
			int i;
			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
				skb_frag_unref(skb, i);
		}

		/*
		 * If skb buf is from userspace, we need to notify the caller
		 * the lower device DMA has done;
		 */
		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
			struct ubuf_info *uarg;

			uarg = skb_shinfo(skb)->destructor_arg;
			if (uarg->callback)
				uarg->callback(uarg, true);
		}

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);

		skb_free_head(skb);
	}
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff *other;
	atomic_t *fclone_ref;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		break;

	case SKB_FCLONE_ORIG:
		fclone_ref = (atomic_t *) (skb + 2);
		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, skb);
		break;

	case SKB_FCLONE_CLONE:
		fclone_ref = (atomic_t *) (skb + 1);
		other = skb - 1;

		/* The clone portion is available for
		 * fast-cloning again.
		 */
		skb->fclone = SKB_FCLONE_UNAVAILABLE;

		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, other);
		break;
	}
}

static void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
	if (skb->destructor) {
		WARN_ON(in_irq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb->nfct);
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
	nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
#endif
#endif
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb);

void kfree_skb_list(struct sk_buff *segs)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb(segs);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list);

/**
 *	skb_tx_error - report an sk_buff xmit error
 *	@skb: buffer that triggered an error
 *
 *	Report xmit error if a device callback is tracking this skb.
 *	skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		struct ubuf_info *uarg;

		uarg = skb_shinfo(skb)->destructor_arg;
		if (uarg->callback)
			uarg->callback(uarg, false);
		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	}
}
EXPORT_SYMBOL(skb_tx_error);
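
/*
 * Illustrative sketch (not part of the original file): a driver xmit
 * error path honouring the rule above that the skb must still be freed
 * after skb_tx_error().  The mapping-failure context is hypothetical.
 */
static inline int example_xmit_map_failed(struct sk_buff *skb)
{
	skb_tx_error(skb);	/* fire the zerocopy callback, if any */
	dev_kfree_skb_any(skb);	/* then free as usual */
	return NETDEV_TX_OK;
}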

/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
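
/*
 * Illustrative sketch (not part of the original file) of the split the
 * comment above asks for: consume_skb() on the success path, kfree_skb()
 * on drops, so the kfree_skb tracepoint keeps meaning "packet dropped".
 */
static inline void example_free_skb(struct sk_buff *skb, bool delivered)
{
	if (delivered)
		consume_skb(skb);	/* normal end of life, not a drop */
	else
		kfree_skb(skb);		/* traced as a drop */
}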

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp		= old->tstamp;
	new->dev		= old->dev;
	new->transport_header	= old->transport_header;
	new->network_header	= old->network_header;
	new->mac_header		= old->mac_header;
	new->inner_protocol	= old->inner_protocol;
	new->inner_transport_header = old->inner_transport_header;
	new->inner_network_header = old->inner_network_header;
	new->inner_mac_header = old->inner_mac_header;
	skb_dst_copy(new, old);
	new->rxhash		= old->rxhash;
	new->ooo_okay		= old->ooo_okay;
	new->l4_rxhash		= old->l4_rxhash;
	new->no_fcs		= old->no_fcs;
	new->encapsulation	= old->encapsulation;
#ifdef CONFIG_XFRM
	new->sp			= secpath_get(old->sp);
#endif
	memcpy(new->cb, old->cb, sizeof(old->cb));
	new->csum		= old->csum;
	new->local_df		= old->local_df;
	new->pkt_type		= old->pkt_type;
	new->ip_summed		= old->ip_summed;
	skb_copy_queue_mapping(new, old);
	new->priority		= old->priority;
#if IS_ENABLED(CONFIG_IP_VS)
	new->ipvs_property	= old->ipvs_property;
#endif
	new->pfmemalloc		= old->pfmemalloc;
	new->protocol		= old->protocol;
	new->mark		= old->mark;
	new->skb_iif		= old->skb_iif;
	__nf_copy(new, old);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	new->nf_trace		= old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
	new->tc_index		= old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
	new->tc_verd		= old->tc_verd;
#endif
#endif
	new->vlan_proto		= old->vlan_proto;
	new->vlan_tci		= old->vlan_tci;

	skb_copy_secmark(new, old);

#ifdef CONFIG_NET_LL_RX_POLL
	new->napi_id	= old->napi_id;
#endif
}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/**
 *	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
 *	@skb: the skb to modify
 *	@gfp_mask: allocation priority
 *
 *	This must be called on SKBTX_DEV_ZEROCOPY skb.
 *	It will copy all frags into kernel and drop the reference
 *	to userspace pages.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 *
 *	Returns 0 on success or a negative error code on failure
 *	to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int i;
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;

	for (i = 0; i < num_frags; i++) {
		u8 *vaddr;
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];

		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		vaddr = kmap_atomic(skb_frag_page(f));
		memcpy(page_address(page),
		       vaddr + f->page_offset, skb_frag_size(f));
		kunmap_atomic(vaddr);
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	uarg->callback(uarg, false);

	/* skb frags point to kernel buffers */
	for (i = num_frags - 1; i >= 0; i--) {
		__skb_fill_page_desc(skb, i, head, 0,
				     skb_shinfo(skb)->frags[i].size);
		head = (struct page *)page_private(head);
	}

	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	n = skb + 1;
	if (skb->fclone == SKB_FCLONE_ORIG &&
	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
		atomic_t *fclone_ref = (atomic_t *) (n + 1);
		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(fclone_ref);
	} else {
		if (skb_pfmemalloc(skb))
			gfp_mask |= __GFP_MEMALLOC;

		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		kmemcheck_annotate_bitfield(n, flags1);
		kmemcheck_annotate_bitfield(n, flags2);
		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
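
/*
 * Illustrative sketch (not part of the original file): clone-then-write.
 * A clone shares the packet data, so a writer must still unshare the head
 * (here via pskb_expand_head()) before editing headers.
 */
static inline struct sk_buff *example_clone_for_write(struct sk_buff *skb)
{
	struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);

	if (!n)
		return NULL;
	if (skb_cloned(n) && pskb_expand_head(n, 0, 0, GFP_ATOMIC)) {
		kfree_skb(n);	/* could not get a private head */
		return NULL;
	}
	return n;
}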

static void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->transport_header += off;
	skb->network_header   += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->inner_transport_header += off;
	skb->inner_network_header += off;
	skb->inner_mac_header += off;
}

static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	__copy_skb_header(new, old);

	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}

static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb))
		return SKB_ALLOC_RX;
	return 0;
}

/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = skb_end_offset(skb) + skb->data_len;
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	copy_skb_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);

/**
 *	__pskb_copy	-	create copy of an sk_buff with private head.
 *	@skb: buffer to copy
 *	@headroom: headroom of new skb
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and part of its data, located
 *	in header. Fragmented data remain shared. This is used when
 *	the caller wishes to modify only header of &sk_buff and needs
 *	private copy of the header to alter. Returns %NULL on failure
 *	or the pointer to the buffer on success.
 *	The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
	unsigned int size = skb_headlen(skb) + headroom;
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len  = skb->data_len;
	n->len	     = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree_skb(n);
			n = NULL;
			goto out;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	copy_skb_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy);
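
/*
 * Illustrative sketch (not part of the original file) of the guidance in
 * the skb_copy()/__pskb_copy() comments above: pskb_copy() when only the
 * headers will be edited (frags stay shared), skb_copy() when the payload
 * must be writable too.
 */
static inline struct sk_buff *example_private_copy(struct sk_buff *skb,
						   bool edit_payload)
{
	if (edit_payload)
		return skb_copy(skb, GFP_ATOMIC);	/* fully private, linearized */
	return pskb_copy(skb, GFP_ATOMIC);		/* private head, shared frags */
}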

/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if &nhead and &ntail are zero)
 *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success or error,
 *	if expansion failed. In the last case, &sk_buff is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i;
	u8 *data;
	int size = nhead + skb_end_offset(skb) + ntail;
	long off;

	BUG_ON(nhead < 0);

	if (skb_shared(skb))
		BUG();

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		/* copy this zero copy skb frags */
		if (skb_orphan_frags(skb, gfp_mask))
			goto nofrags;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head     = data;
	skb->head_frag = 0;
	skb->data    += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end      = size;
	off           = nhead;
#else
	skb->end      = skb->head + size;
#endif
	skb->tail	      += off;
	skb_headers_offset_update(skb, nhead);
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += nhead;
	skb->cloned   = 0;
	skb->hdr_len  = 0;
	skb->nohdr    = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);
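
/*
 * Illustrative sketch (not part of the original file): growing headroom
 * with pskb_expand_head() before pushing a new header.  Assumes the skb
 * is not shared (users == 1); all header pointers must be re-read after
 * the call, as the comment above warns.
 */
static inline int example_grow_and_push(struct sk_buff *skb, unsigned int hlen)
{
	if (skb_headroom(skb) < hlen &&
	    pskb_expand_head(skb, SKB_DATA_ALIGN(hlen - skb_headroom(skb)),
			     0, GFP_ATOMIC))
		return -ENOMEM;
	skb_push(skb, hlen);	/* pointers into the old head are now stale */
	return 0;
}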

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);
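
/*
 * Illustrative sketch (not part of the original file): the switch-over
 * pattern skb_realloc_headroom() serves, e.g. before encapsulation when
 * the original skb may be cloned or short on headroom.
 */
static inline struct sk_buff *example_cow_headroom(struct sk_buff *skb,
						   unsigned int needed)
{
	struct sk_buff *nskb = skb_realloc_headroom(skb, needed);

	if (!nskb)
		return NULL;	/* @skb is untouched and still owned by the caller */
	consume_skb(skb);	/* drop our reference to the original */
	return nskb;
}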

/**
 *	skb_copy_expand	-	copy and expand sk_buff
 *	@skb: buffer to copy
 *	@newheadroom: new free bytes at head
 *	@newtailroom: new free bytes at tail
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data and while doing so
 *	allocate additional space.
 *
 *	This is used when the caller wishes to modify the data and needs a
 *	private copy of the data to alter as well as more space for new fields.
 *	Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
				int newheadroom, int newtailroom,
				gfp_t gfp_mask)
{
	/*
	 *	Allocate the copy buffer
	 */
	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
					gfp_mask, skb_alloc_rx_flag(skb),
					NUMA_NO_NODE);
	int oldheadroom = skb_headroom(skb);
	int head_copy_len, head_copy_off;
	int off;

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	head_copy_len = oldheadroom;
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			  skb->len + head_copy_len))
		BUG();

	copy_skb_header(n, skb);

	off                  = newheadroom - oldheadroom;
	if (n->ip_summed == CHECKSUM_PARTIAL)
		n->csum_start += off;

	skb_headers_offset_update(n, off);

	return n;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *	skb_pad			-	zero pad the tail of an skb
 *	@skb: buffer to pad
 *	@pad: space to pad
 *
 *	Ensure that a buffer is followed by a padding area that is zero
 *	filled. Used by network drivers which may DMA or transfer data
 *	beyond the buffer end onto the wire.
 *
 *	May return error in out of memory cases. The skb is freed on error.
 */

int skb_pad(struct sk_buff *skb, int pad)
{
	int err;
	int ntail;

	/* If the skbuff is non linear tailroom is always zero.. */
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
		memset(skb->data+skb->len, 0, pad);
		return 0;
	}

	ntail = skb->data_len + pad - (skb->end - skb->tail);