skbuff.c 74.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
42
#include <linux/kmemcheck.h>
Linus Torvalds's avatar
Linus Torvalds committed
43
44
45
46
47
48
49
50
51
52
53
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
Jens Axboe's avatar
Jens Axboe committed
54
#include <linux/splice.h>
Linus Torvalds's avatar
Linus Torvalds committed
55
56
57
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
58
#include <linux/scatterlist.h>
59
#include <linux/errqueue.h>
Linus Torvalds's avatar
Linus Torvalds committed
60
61
62
63
64
65
66
67
68

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/xfrm.h>

#include <asm/uaccess.h>
#include <asm/system.h>
69
#include <trace/events/skb.h>
Linus Torvalds's avatar
Linus Torvalds committed
70

71
72
#include "kmap_skb.h"

73
74
static struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
Linus Torvalds's avatar
Linus Torvalds committed
75

Jens Axboe's avatar
Jens Axboe committed
76
77
78
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
79
	put_page(buf->page);
Jens Axboe's avatar
Jens Axboe committed
80
81
82
83
84
}

static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf)
{
85
	get_page(buf->page);
Jens Axboe's avatar
Jens Axboe committed
86
87
88
89
90
91
92
93
94
95
}

/*
 * Pipe-buffer ->steal hook. Unconditionally returns 1 without touching
 * @pipe or @buf.
 * NOTE(review): a non-zero return presumably tells the pipe layer that the
 * page cannot be stolen - confirm against struct pipe_buf_operations.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}


/* Pipe buffer operations for a socket. */
96
static const struct pipe_buf_operations sock_pipe_buf_ops = {
Jens Axboe's avatar
Jens Axboe committed
97
98
99
100
101
102
103
104
105
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = sock_pipe_buf_release,
	.steal = sock_pipe_buf_steal,
	.get = sock_pipe_buf_get,
};

Linus Torvalds's avatar
Linus Torvalds committed
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*
 *	Keep out-of-line to prevent kernel bloat.
 *	__builtin_return_address is not used because it is not always
 *	reliable.
 */

/**
 *	skb_over_panic	- 	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_put(). Not user callable.
 */
120
static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
Linus Torvalds's avatar
Linus Torvalds committed
121
{
122
	printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
123
			  "data:%p tail:%#lx end:%#lx dev:%s\n",
124
	       here, skb->len, sz, skb->head, skb->data,
125
	       (unsigned long)skb->tail, (unsigned long)skb->end,
126
	       skb->dev ? skb->dev->name : "<NULL>");
Linus Torvalds's avatar
Linus Torvalds committed
127
128
129
130
131
132
133
134
135
136
137
138
	BUG();
}

/**
 *	skb_under_panic	- 	private function
 *	@skb: buffer
 *	@sz: size
 *	@here: address
 *
 *	Out of line support code for skb_push(). Not user callable.
 */

139
static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
Linus Torvalds's avatar
Linus Torvalds committed
140
{
141
	printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
142
			  "data:%p tail:%#lx end:%#lx dev:%s\n",
143
	       here, skb->len, sz, skb->head, skb->data,
144
	       (unsigned long)skb->tail, (unsigned long)skb->end,
145
	       skb->dev ? skb->dev->name : "<NULL>");
Linus Torvalds's avatar
Linus Torvalds committed
146
147
148
149
150
151
152
153
154
155
	BUG();
}

/* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
 *
 */

/**
156
 *	__alloc_skb	-	allocate a network buffer
Linus Torvalds's avatar
Linus Torvalds committed
157
158
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
159
160
 *	@fclone: allocate from fclone cache instead of head cache
 *		and allocate a cloned (child) skb
161
 *	@node: numa node to allocate memory on
Linus Torvalds's avatar
Linus Torvalds committed
162
163
164
165
166
167
168
169
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of size bytes. The object has a reference count of one.
 *	The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
170
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
171
			    int fclone, int node)
Linus Torvalds's avatar
Linus Torvalds committed
172
{
173
	struct kmem_cache *cache;
174
	struct skb_shared_info *shinfo;
Linus Torvalds's avatar
Linus Torvalds committed
175
176
177
	struct sk_buff *skb;
	u8 *data;

178
179
	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

Linus Torvalds's avatar
Linus Torvalds committed
180
	/* Get the HEAD */
181
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
Linus Torvalds's avatar
Linus Torvalds committed
182
183
	if (!skb)
		goto out;
Eric Dumazet's avatar
Eric Dumazet committed
184
	prefetchw(skb);
Linus Torvalds's avatar
Linus Torvalds committed
185
186

	size = SKB_DATA_ALIGN(size);
187
188
	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
			gfp_mask, node);
Linus Torvalds's avatar
Linus Torvalds committed
189
190
	if (!data)
		goto nodata;
Eric Dumazet's avatar
Eric Dumazet committed
191
	prefetchw(data + size);
Linus Torvalds's avatar
Linus Torvalds committed
192

193
	/*
194
195
196
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
197
198
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
Linus Torvalds's avatar
Linus Torvalds committed
199
200
201
202
	skb->truesize = size + sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
203
	skb_reset_tail_pointer(skb);
204
	skb->end = skb->tail + size;
205
206
	kmemcheck_annotate_bitfield(skb, flags1);
	kmemcheck_annotate_bitfield(skb, flags2);
207
208
209
210
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->mac_header = ~0U;
#endif

211
212
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
Eric Dumazet's avatar
Eric Dumazet committed
213
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
214
215
	atomic_set(&shinfo->dataref, 1);

216
217
218
	if (fclone) {
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);
Linus Torvalds's avatar
Linus Torvalds committed
219

220
221
		kmemcheck_annotate_bitfield(child, flags1);
		kmemcheck_annotate_bitfield(child, flags2);
222
223
224
225
226
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
	}
Linus Torvalds's avatar
Linus Torvalds committed
227
228
229
out:
	return skb;
nodata:
230
	kmem_cache_free(cache, skb);
Linus Torvalds's avatar
Linus Torvalds committed
231
232
233
	skb = NULL;
	goto out;
}
234
EXPORT_SYMBOL(__alloc_skb);
Linus Torvalds's avatar
Linus Torvalds committed
235

236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/**
 *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *	@dev: network device to receive on
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
		unsigned int length, gfp_t gfp_mask)
{
	struct sk_buff *skb;
	int node;

	/* Use the parent device's NUMA node if there is one, else -1 */
	if (dev->dev.parent)
		node = dev_to_node(dev->dev.parent);
	else
		node = -1;

	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
	if (unlikely(!skb))
		return skb;

	/* Reserve the built-in headroom and bind the buffer to @dev */
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
Linus Torvalds's avatar
Linus Torvalds committed
263

Peter Zijlstra's avatar
Peter Zijlstra committed
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 * Allocate a single (order-0) page on the NUMA node of @dev's parent
 * device, or with node -1 when @dev has no parent. Returns whatever
 * alloc_pages_node() returns.
 */
struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
{
	int node = -1;

	if (dev->dev.parent)
		node = dev_to_node(dev->dev.parent);

	return alloc_pages_node(node, gfp_mask, 0);
}
EXPORT_SYMBOL(__netdev_alloc_page);

/*
 * Fill page fragment slot @i of @skb with @size bytes of @page starting at
 * offset @off, and grow len, data_len and truesize by @size.
 */
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		int size)
{
	skb_fill_page_desc(skb, i, page, off, size);

	/* Account for the new fragment in all three size counters */
	skb->truesize += size;
	skb->data_len += size;
	skb->len += size;
}
EXPORT_SYMBOL(skb_add_rx_frag);

284
285
286
287
288
289
290
291
292
293
294
295
296
297
/**
 *	dev_alloc_skb - allocate an skbuff for receiving
 *	@length: length to allocate
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory. Although this function
 *	allocates memory it can be called from an interrupt.
 */
struct sk_buff *dev_alloc_skb(unsigned int length)
{
298
299
	/*
	 * There is more code here than it seems:
300
	 * __dev_alloc_skb is an inline
301
	 */
302
303
304
305
	return __dev_alloc_skb(length, GFP_ATOMIC);
}
EXPORT_SYMBOL(dev_alloc_skb);

306
static void skb_drop_list(struct sk_buff **listp)
Linus Torvalds's avatar
Linus Torvalds committed
307
{
308
	struct sk_buff *list = *listp;
Linus Torvalds's avatar
Linus Torvalds committed
309

310
	*listp = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
311
312
313
314
315
316
317
318

	do {
		struct sk_buff *this = list;
		list = list->next;
		kfree_skb(this);
	} while (list);
}

319
320
321
322
323
static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

Linus Torvalds's avatar
Linus Torvalds committed
324
325
326
327
static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

328
	skb_walk_frags(skb, list)
Linus Torvalds's avatar
Linus Torvalds committed
329
330
331
		skb_get(list);
}

332
static void skb_release_data(struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
333
334
335
336
337
338
339
340
341
342
{
	if (!skb->cloned ||
	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			       &skb_shinfo(skb)->dataref)) {
		if (skb_shinfo(skb)->nr_frags) {
			int i;
			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
				put_page(skb_shinfo(skb)->frags[i].page);
		}

343
		if (skb_has_frags(skb))
Linus Torvalds's avatar
Linus Torvalds committed
344
345
346
347
348
349
350
351
352
			skb_drop_fraglist(skb);

		kfree(skb->head);
	}
}

/*
 *	Free an skbuff by memory without cleaning the state.
 */
353
static void kfree_skbmem(struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
354
{
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
	struct sk_buff *other;
	atomic_t *fclone_ref;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		break;

	case SKB_FCLONE_ORIG:
		fclone_ref = (atomic_t *) (skb + 2);
		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, skb);
		break;

	case SKB_FCLONE_CLONE:
		fclone_ref = (atomic_t *) (skb + 1);
		other = skb - 1;

		/* The clone portion is available for
		 * fast-cloning again.
		 */
		skb->fclone = SKB_FCLONE_UNAVAILABLE;

		if (atomic_dec_and_test(fclone_ref))
			kmem_cache_free(skbuff_fclone_cache, other);
		break;
381
	}
Linus Torvalds's avatar
Linus Torvalds committed
382
383
}

384
static void skb_release_head_state(struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
385
{
Eric Dumazet's avatar
Eric Dumazet committed
386
	skb_dst_drop(skb);
Linus Torvalds's avatar
Linus Torvalds committed
387
388
389
#ifdef CONFIG_XFRM
	secpath_put(skb->sp);
#endif
390
391
	if (skb->destructor) {
		WARN_ON(in_irq());
Linus Torvalds's avatar
Linus Torvalds committed
392
393
		skb->destructor(skb);
	}
394
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
395
	nf_conntrack_put(skb->nfct);
396
397
	nf_conntrack_put_reasm(skb->nfct_reasm);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
398
399
400
401
402
403
404
405
406
407
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(skb->nf_bridge);
#endif
/* XXX: IS this still necessary? - JHS */
#ifdef CONFIG_NET_SCHED
	skb->tc_index = 0;
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
#endif
#endif
408
409
410
411
412
413
}

/*
 * Free everything but the sk_buff shell: first the head state, then the
 * data area.
 */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);

	skb_release_data(skb);
}

/**
 *	__kfree_skb - private function
 *	@skb: buffer
 *
 *	Free an sk_buff. Release anything attached to the buffer.
 *	Clean the state. This is an internal helper function. Users should
 *	always call kfree_skb
 */
void __kfree_skb(struct sk_buff *skb)
{
	/* Release attached state and data, then the sk_buff memory itself */
	skb_release_all(skb);

	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
Linus Torvalds's avatar
Linus Torvalds committed
432

433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
/**
 *	kfree_skb - free an sk_buff
 *	@skb: buffer to free
 *
 *	Drop a reference to the buffer and free it if the usage count has
 *	hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
448
	trace_kfree_skb(skb, __builtin_return_address(0));
449
450
	__kfree_skb(skb);
}
451
EXPORT_SYMBOL(kfree_skb);
452

453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/**
 *	consume_skb - free an skbuff
 *	@skb: buffer to free
 *
 *	Drop a ref to the buffer and free it if the usage count has hit zero
 *	Functions identically to kfree_skb, but kfree_skb assumes that the frame
 *	is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (unlikely(!skb))
		return;
	if (likely(atomic_read(&skb->users) == 1))
		smp_rmb();
	else if (likely(!atomic_dec_and_test(&skb->users)))
		return;
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);

473
474
475
476
477
478
479
480
481
482
483
484
/**
 *	skb_recycle_check - check if skb can be reused for receive
 *	@skb: buffer
 *	@skb_size: minimum receive buffer size
 *
 *	Checks that the skb passed in is not shared or cloned, and
 *	that it is linear and its head portion at least as large as
 *	skb_size so that it can be recycled as a receive buffer.
 *	If these conditions are met, this function does any necessary
 *	reference count dropping and cleans up the skbuff as if it
 *	just came from __alloc_skb().
 */
485
bool skb_recycle_check(struct sk_buff *skb, int skb_size)
486
487
488
{
	struct skb_shared_info *shinfo;

489
	if (irqs_disabled())
490
		return false;
491

492
	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
493
		return false;
494
495
496

	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
	if (skb_end_pointer(skb) - skb->head < skb_size)
497
		return false;
498
499

	if (skb_shared(skb) || skb_cloned(skb))
500
		return false;
501
502

	skb_release_head_state(skb);
Eric Dumazet's avatar
Eric Dumazet committed
503

504
	shinfo = skb_shinfo(skb);
Eric Dumazet's avatar
Eric Dumazet committed
505
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
506
507
508
509
	atomic_set(&shinfo->dataref, 1);

	memset(skb, 0, offsetof(struct sk_buff, tail));
	skb->data = skb->head + NET_SKB_PAD;
510
	skb_reset_tail_pointer(skb);
511

512
	return true;
513
514
515
}
EXPORT_SYMBOL(skb_recycle_check);

516
517
518
519
520
521
522
/*
 * Copy the per-packet metadata fields from @old to @new. Takes extra
 * references where the helpers used do so (skb_dst_copy(), secpath_get(),
 * __nf_copy()). Buffer pointers and lengths are not touched.
 */
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	new->dev = old->dev;
	new->transport_header = old->transport_header;
	new->network_header = old->network_header;
	new->mac_header = old->mac_header;
	skb_dst_copy(new, old);
	new->rxhash = old->rxhash;
#ifdef CONFIG_XFRM
	new->sp = secpath_get(old->sp);
#endif
	memcpy(new->cb, old->cb, sizeof(old->cb));
	new->csum = old->csum;
	new->local_df = old->local_df;
	new->pkt_type = old->pkt_type;
	new->ip_summed = old->ip_summed;
	skb_copy_queue_mapping(new, old);
	new->priority = old->priority;
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	new->ipvs_property = old->ipvs_property;
#endif
	new->protocol = old->protocol;
	new->mark = old->mark;
	new->skb_iif = old->skb_iif;
	__nf_copy(new, old);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	new->nf_trace = old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
	new->tc_index = old->tc_index;
#ifdef CONFIG_NET_CLS_ACT
	new->tc_verd = old->tc_verd;
#endif
#endif
	new->vlan_tci = old->vlan_tci;

	skb_copy_secmark(new, old);
}

557
558
559
560
/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
Herbert Xu's avatar
Herbert Xu committed
561
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
562
563
564
565
566
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
567
568
	__copy_skb_header(n, skb);

Linus Torvalds's avatar
Linus Torvalds committed
569
570
	C(len);
	C(data_len);
571
	C(mac_len);
Tom Herbert's avatar
Tom Herbert committed
572
	C(rxhash);
573
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
574
	n->cloned = 1;
Linus Torvalds's avatar
Linus Torvalds committed
575
576
577
578
	n->nohdr = 0;
	n->destructor = NULL;
	C(tail);
	C(end);
579
580
581
582
	C(head);
	C(data);
	C(truesize);
	atomic_set(&n->users, 1);
Linus Torvalds's avatar
Linus Torvalds committed
583
584
585
586
587

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
Herbert Xu's avatar
Herbert Xu committed
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
#undef C
}

/**
 *	skb_morph	-	morph one skb into another
 *	@dst: the skb to receive the contents
 *	@src: the skb to supply the contents
 *
 *	This is identical to skb_clone except that the target skb is
 *	supplied by the user. Whatever @dst held before is released first.
 *
 *	The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	struct sk_buff *morphed;

	skb_release_all(dst);
	morphed = __skb_clone(dst, src);
	return morphed;
}
EXPORT_SYMBOL_GPL(skb_morph);

/**
 *	skb_clone	-	duplicate an sk_buff
 *	@skb: buffer to clone
 *	@gfp_mask: allocation priority
 *
 *	Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *	copies share the same packet data but not structure. The new
 *	buffer has a reference count of 1. If the allocation fails the
 *	function returns %NULL otherwise the new buffer is returned.
 *
 *	If this function is called from an interrupt gfp_mask() must be
 *	%GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	/* Candidate fast-clone slot; only examined if @skb is FCLONE_ORIG */
	struct sk_buff *n = skb + 1;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
		/* Claim the companion skb; the pair's counter follows it */
		atomic_t *fclone_ref = (atomic_t *) (n + 1);

		n->fclone = SKB_FCLONE_CLONE;
		atomic_inc(fclone_ref);
		return __skb_clone(n, skb);
	}

	/* No fast-clone slot available: take a fresh head from the cache */
	n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
	if (!n)
		return NULL;

	kmemcheck_annotate_bitfield(n, flags1);
	kmemcheck_annotate_bitfield(n, flags2);
	n->fclone = SKB_FCLONE_UNAVAILABLE;

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
Linus Torvalds's avatar
Linus Torvalds committed
645
646
647

static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
648
#ifndef NET_SKBUFF_DATA_USES_OFFSET
Linus Torvalds's avatar
Linus Torvalds committed
649
650
651
652
	/*
	 *	Shift between the two data areas in bytes
	 */
	unsigned long offset = new->data - old->data;
653
#endif
654
655
656

	__copy_skb_header(new, old);

657
658
659
660
#ifndef NET_SKBUFF_DATA_USES_OFFSET
	/* {transport,network,mac}_header are relative to skb->head */
	new->transport_header += offset;
	new->network_header   += offset;
661
662
	if (skb_mac_header_was_set(new))
		new->mac_header	      += offset;
663
#endif
664
665
666
	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
Linus Torvalds's avatar
Linus Torvalds committed
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
}

/**
 *	skb_copy	-	create private copy of an sk_buff
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data. This is used when the
 *	caller wishes to modify the data and needs a private copy of the
 *	data to alter. Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	As by-product this function converts non-linear &sk_buff to linear
 *	one, so that &sk_buff becomes completely private and caller is allowed
 *	to modify all the data of returned buffer. This means that this
 *	function is not recommended for use in circumstances when only
 *	header is going to be modified. Use pskb_copy() instead.
 */

686
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
687
688
689
690
691
{
	int headerlen = skb->data - skb->head;
	/*
	 *	Allocate the copy buffer
	 */
692
693
694
695
696
697
	struct sk_buff *n;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
#else
	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
698
699
700
701
702
703
704
705
706
707
708
709
710
711
	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
		BUG();

	copy_skb_header(n, skb);
	return n;
}
712
EXPORT_SYMBOL(skb_copy);
Linus Torvalds's avatar
Linus Torvalds committed
713
714
715
716
717
718
719
720
721
722
723
724
725
726

/**
 *	pskb_copy	-	create copy of an sk_buff with private head.
 *	@skb: buffer to copy
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and part of its data, located
 *	in header. Fragmented data remain shared. This is used when
 *	the caller wishes to modify only header of &sk_buff and needs
 *	private copy of the header to alter. Returns %NULL on failure
 *	or the pointer to the buffer on success.
 *	The returned buffer has a reference count of 1.
 */

727
struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
728
729
730
731
{
	/*
	 *	Allocate the copy buffer
	 */
732
733
734
735
736
737
	struct sk_buff *n;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	n = alloc_skb(skb->end, gfp_mask);
#else
	n = alloc_skb(skb->end - skb->head, gfp_mask);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
738
739
740
741
742
743
744
745
	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, skb->data - skb->head);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
746
	skb_copy_from_linear_data(skb, n->data, n->len);
Linus Torvalds's avatar
Linus Torvalds committed
747

Herbert Xu's avatar
Herbert Xu committed
748
	n->truesize += skb->data_len;
Linus Torvalds's avatar
Linus Torvalds committed
749
750
751
752
753
754
755
756
757
758
759
760
761
	n->data_len  = skb->data_len;
	n->len	     = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			get_page(skb_shinfo(n)->frags[i].page);
		}
		skb_shinfo(n)->nr_frags = i;
	}

762
	if (skb_has_frags(skb)) {
Linus Torvalds's avatar
Linus Torvalds committed
763
764
765
766
767
768
769
770
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	copy_skb_header(n, skb);
out:
	return n;
}
771
EXPORT_SYMBOL(pskb_copy);
Linus Torvalds's avatar
Linus Torvalds committed
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788

/**
 *	pskb_expand_head - reallocate header of &sk_buff
 *	@skb: buffer to reallocate
 *	@nhead: room to add at head
 *	@ntail: room to add at tail
 *	@gfp_mask: allocation priority
 *
 *	Expands (or creates identical copy, if &nhead and &ntail are zero)
 *	header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 *	reference count of 1. Returns zero in the case of success or error,
 *	if expansion failed. In the last case, &sk_buff is not changed.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

Victor Fusco's avatar
Victor Fusco committed
789
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
790
		     gfp_t gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
791
792
793
{
	int i;
	u8 *data;
794
795
796
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	int size = nhead + skb->end + ntail;
#else
Linus Torvalds's avatar
Linus Torvalds committed
797
	int size = nhead + (skb->end - skb->head) + ntail;
798
#endif
Linus Torvalds's avatar
Linus Torvalds committed
799
800
	long off;

801
802
	BUG_ON(nhead < 0);

Linus Torvalds's avatar
Linus Torvalds committed
803
804
805
806
807
808
809
810
811
812
813
	if (skb_shared(skb))
		BUG();

	size = SKB_DATA_ALIGN(size);

	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
	if (!data)
		goto nodata;

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void. */
814
#ifdef NET_SKBUFF_DATA_USES_OFFSET
815
	memcpy(data + nhead, skb->head, skb->tail);
816
#else
817
	memcpy(data + nhead, skb->head, skb->tail - skb->head);
818
#endif
819
820
	memcpy(data + size, skb_end_pointer(skb),
	       sizeof(struct skb_shared_info));
Linus Torvalds's avatar
Linus Torvalds committed
821
822
823
824

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		get_page(skb_shinfo(skb)->frags[i].page);

825
	if (skb_has_frags(skb))
Linus Torvalds's avatar
Linus Torvalds committed
826
827
828
829
830
831
832
833
		skb_clone_fraglist(skb);

	skb_release_data(skb);

	off = (data + nhead) - skb->head;

	skb->head     = data;
	skb->data    += off;
834
835
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->end      = size;
836
	off           = nhead;
837
838
#else
	skb->end      = skb->head + size;
839
#endif
840
841
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->tail	      += off;
842
843
	skb->transport_header += off;
	skb->network_header   += off;
844
845
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
846
	skb->csum_start       += nhead;
Linus Torvalds's avatar
Linus Torvalds committed
847
	skb->cloned   = 0;
848
	skb->hdr_len  = 0;
Linus Torvalds's avatar
Linus Torvalds committed
849
850
851
852
853
854
855
	skb->nohdr    = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;

nodata:
	return -ENOMEM;
}
856
EXPORT_SYMBOL(pskb_expand_head);
Linus Torvalds's avatar
Linus Torvalds committed
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876

/*
 * Make private copy of skb with writable head and some headroom.
 *
 * If @skb already has at least @headroom bytes of headroom the result is a
 * pskb_copy(); otherwise @skb is cloned and the clone's head is expanded
 * by the (data-aligned) shortfall. All allocations use GFP_ATOMIC.
 * Returns the new skb, or NULL on allocation failure.
 */
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	int delta = headroom - skb_headroom(skb);
	struct sk_buff *skb2;

	if (delta <= 0)
		return pskb_copy(skb, GFP_ATOMIC);

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (!skb2)
		return NULL;

	if (pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
		/* Expansion failed: drop the clone again */
		kfree_skb(skb2);
		skb2 = NULL;
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);
Linus Torvalds's avatar
Linus Torvalds committed
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897

/**
 *	skb_copy_expand	-	copy and expand sk_buff
 *	@skb: buffer to copy
 *	@newheadroom: new free bytes at head
 *	@newtailroom: new free bytes at tail
 *	@gfp_mask: allocation priority
 *
 *	Make a copy of both an &sk_buff and its data and while doing so
 *	allocate additional space.
 *
 *	This is used when the caller wishes to modify the data and needs a
 *	private copy of the data to alter as well as more space for new fields.
 *	Returns %NULL on failure or the pointer to the buffer
 *	on success. The returned buffer has a reference count of 1.
 *
 *	You must pass %GFP_ATOMIC as the allocation priority if this function
 *	is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
Victor Fusco's avatar
Victor Fusco committed
898
				int newheadroom, int newtailroom,
899
				gfp_t gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
900
901
902
903
904
905
{
	/*
	 *	Allocate the copy buffer
	 */
	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
				      gfp_mask);
906
	int oldheadroom = skb_headroom(skb);
Linus Torvalds's avatar
Linus Torvalds committed
907
	int head_copy_len, head_copy_off;
908
	int off;
Linus Torvalds's avatar
Linus Torvalds committed
909
910
911
912
913
914
915
916
917

	if (!n)
		return NULL;

	skb_reserve(n, newheadroom);

	/* Set the tail pointer and length */
	skb_put(n, skb->len);

918
	head_copy_len = oldheadroom;
Linus Torvalds's avatar
Linus Torvalds committed
919
920
921
922
923
924
925
926
927
928
929
930
931
	head_copy_off = 0;
	if (newheadroom <= head_copy_len)
		head_copy_len = newheadroom;
	else
		head_copy_off = newheadroom - head_copy_len;

	/* Copy the linear header and data. */
	if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
			  skb->len + head_copy_len))
		BUG();

	copy_skb_header(n, skb);

932
	off                  = newheadroom - oldheadroom;
933
934
	n->csum_start       += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
935
936
	n->transport_header += off;
	n->network_header   += off;
937
938
	if (skb_mac_header_was_set(skb))
		n->mac_header += off;
939
#endif
940

Linus Torvalds's avatar
Linus Torvalds committed
941
942
	return n;
}
943
EXPORT_SYMBOL(skb_copy_expand);
Linus Torvalds's avatar
Linus Torvalds committed
944
945
946
947
948
949
950
951
952
953

/**
 *	skb_pad			-	zero pad the tail of an skb
 *	@skb: buffer to pad
 *	@pad: space to pad
 *
 *	Ensure that a buffer is followed by a padding area that is zero
 *	filled. Used by network drivers which may DMA or transfer data
 *	beyond the buffer end onto the wire.
 *
954
 *	May return error in out of memory cases. The skb is freed on error.
Linus Torvalds's avatar
Linus Torvalds committed
955
 */
956

957
int skb_pad(struct sk_buff *skb, int pad)
Linus Torvalds's avatar
Linus Torvalds committed
958
{
959
960
	int err;
	int ntail;
961

Linus Torvalds's avatar
Linus Torvalds committed
962
	/* If the skbuff is non linear tailroom is always zero.. */
963
	if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
Linus Torvalds's avatar
Linus Torvalds committed
964
		memset(skb->data+skb->len, 0, pad);
965
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
966
	}
967

968
	ntail = skb->data_len + pad - (skb->end - skb->tail);
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
	if (likely(skb_cloned(skb) || ntail > 0)) {
		err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
		if (unlikely(err))
			goto free_skb;
	}

	/* FIXME: The use of this function with non-linear skb's really needs
	 * to be audited.
	 */
	err = skb_linearize(skb);
	if (unlikely(err))
		goto free_skb;

	memset(skb->data + skb->len, 0, pad);
	return 0;

free_skb:
Linus Torvalds's avatar
Linus Torvalds committed
986
	kfree_skb(skb);
987
	return err;
988
}
989
EXPORT_SYMBOL(skb_pad);
990

991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
/**
 *	skb_put - add data to a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	Grow the used data area of @skb by @len bytes at the tail and
 *	return a pointer to the first of the newly added bytes (the old
 *	tail). The kernel panics if the new tail would lie beyond the end
 *	of the buffer.
 */
unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
{
	unsigned char *old_tail = skb_tail_pointer(skb);

	SKB_LINEAR_ASSERT(skb);

	skb->len  += len;
	skb->tail += len;
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));

	return old_tail;
}
EXPORT_SYMBOL(skb_put);

1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
/**
 *	skb_push - add data to the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to add
 *
 *	Grow the used data area of @skb by @len bytes at the front, taking
 *	the space from the headroom, and return a pointer to the first
 *	byte of the extended data. The kernel panics if the new data
 *	pointer would lie before the head of the buffer.
 */
unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
{
	skb->len  += len;
	skb->data -= len;

	if (unlikely(skb->head > skb->data))
		skb_under_panic(skb, len, __builtin_return_address(0));

	return skb->data;
}
EXPORT_SYMBOL(skb_push);

1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
/**
 *	skb_pull - remove data from the start of a buffer
 *	@skb: buffer to use
 *	@len: amount of data to remove
 *
 *	Strip @len bytes from the front of the buffer and hand that memory
 *	back to the headroom. Returns a pointer to the next data in the
 *	buffer. After the pull, later pushes will overwrite the old data.
 *
 *	This is the out-of-line, exported wrapper around skb_pull_inline().
 */
unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
{
	unsigned char *data = skb_pull_inline(skb, len);

	return data;
}
EXPORT_SYMBOL(skb_pull);

1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
/**
 *	skb_trim - remove end from a buffer
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Shorten a buffer to @len bytes by dropping data from its tail.
 *	A buffer that is already no longer than @len is left untouched.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	/* Nothing to cut when the buffer already fits. */
	if (len >= skb->len)
		return;

	__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

1063
/* Trims skb to length len. It can change skb pointers.
Linus Torvalds's avatar
Linus Torvalds committed
1064
1065
 */

1066
int ___pskb_trim(struct sk_buff *skb, unsigned int len)
Linus Torvalds's avatar
Linus Torvalds committed
1067
{
1068
1069
	struct sk_buff **fragp;
	struct sk_buff *frag;
Linus Torvalds's avatar
Linus Torvalds committed
1070
1071
1072
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
1073
1074
1075
1076
1077
	int err;

	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;
Linus Torvalds's avatar
Linus Torvalds committed
1078

1079
1080
1081
1082
1083
	i = 0;
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
Linus Torvalds's avatar
Linus Torvalds committed
1084
		int end = offset + skb_shinfo(skb)->frags[i].size;
1085
1086
1087
1088
1089
1090

		if (end < len) {
			offset = end;
			continue;
		}

1091
		skb_shinfo(skb)->frags[i++].size = len - offset;
1092

1093
drop_pages:
1094
1095
1096
1097
1098
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			put_page(skb_shinfo(skb)->frags[i].page);

1099
		if (skb_has_frags(skb))
1100
			skb_drop_fraglist(skb);
1101
		goto done;
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
	}

	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
1116
			kfree_skb(frag);
1117
1118
			frag = nfrag;
			*fragp = frag;
Linus Torvalds's avatar
Linus Torvalds committed
1119
		}
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132

		if (end < len) {
			offset = end;
			continue;
		}

		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		if (frag->next)
			skb_drop_list(&frag->next);
		break;
Linus Torvalds's avatar
Linus Torvalds committed
1133
1134
	}

1135
done:
1136
	if (len > skb_headlen(skb)) {
Linus Torvalds's avatar
Linus Torvalds committed
1137
1138
1139
		skb->data_len -= skb->len - len;
		skb->len       = len;
	} else {
1140
1141
		skb->len       = len;
		skb->data_len  = 0;
1142
		skb_set_tail_pointer(skb, len);
Linus Torvalds's avatar
Linus Torvalds committed
1143
1144
1145
1146
	}

	return 0;
}
1147
EXPORT_SYMBOL(___pskb_trim);
Linus Torvalds's avatar
Linus Torvalds committed
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179

/**
 *	__pskb_pull_tail - advance tail of skb header
 *	@skb: buffer to reallocate
 *	@delta: number of bytes to advance tail
 *
 *	The function makes a sense only on a fragmented &sk_buff,
 *	it expands header moving its tail forward and copying necessary
 *	data from fragmented part.
 *
 *	&sk_buff MUST have reference count of 1.
 *
 *	Returns %NULL (and &sk_buff does not change) if pull failed
 *	or value of new tail of skb in the case of success.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
1180
	int i, k, eat = (skb->tail + delta) - skb->end;
Linus Torvalds's avatar
Linus Torvalds committed
1181
1182
1183
1184
1185
1186
1187

	if (eat > 0 || skb_cloned(skb)) {
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))
			return NULL;
	}

1188
	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
Linus Torvalds's avatar
Linus Torvalds committed
1189
1190
1191
1192
1193
		BUG();

	/* Optimization: no fragments, no reasons to preestimate
	 * size of pulled pages. Superb.
	 */
1194
	if (!skb_has_frags(skb))
Linus Torvalds's avatar
Linus Torvalds committed
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size >= eat)
			goto pull_pages;
		eat -= skb_shinfo(skb)->frags[i].size;
	}

	/* If we need update frag list, we are in troubles.
	 * Certainly, it possible to add an offset to skb data,
	 * but taking into account that pulling is expected to
	 * be very rare operation, it is worth to fight against
	 * further bloating skb head and crucify ourselves here instead.
	 * Pure masohism, indeed. 8)8)
	 */
	if (eat) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;
		struct sk_buff *clone = NULL;
		struct sk_buff *insp = NULL;

		do {
1218
			BUG_ON(!list);
Linus Torvalds's avatar
Linus Torvalds committed
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240

			if (list->len <= eat) {
				/* Eaten as whole. */
				eat -= list->len;
				list = list->next;
				insp = list;
			} else {
				/* Eaten partially. */

				if (skb_shared(list)) {
					/* Sucks! We need to fork list. :-( */
					clone = skb_clone(list, GFP_ATOMIC);
					if (!clone)
						return NULL;
					insp = list->next;
					list = clone;
				} else {
					/* This may be pulled without
					 * problems. */
					insp = list;
				}
				if (!pskb_pull(list, eat)) {
1241
					kfree_skb(clone);
Linus Torvalds's avatar
Linus Torvalds committed
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
					return NULL;
				}
				break;
			}
		} while (eat);

		/* Free pulled out fragments. */
		while ((list = skb_shinfo(skb)->frag_list) != insp) {
			skb_shinfo(skb)->frag_list = list->next;
			kfree_skb(list);
		}
		/* And insert new clone at head. */
		if (clone) {
			clone->next = list;
			skb_shinfo(skb)->frag_list = clone;
		}
	}
	/* Success! Now we may commit changes to skb data. */

pull_pages:
	eat = delta;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		if (skb_shinfo(skb)->frags[i].size <= eat) {
			put_page(skb_shinfo(skb)->frags[i].page);
			eat -= skb_shinfo(skb)->frags[i].size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_shinfo(skb)->frags[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	skb->tail     += delta;
	skb->data_len -= delta;

1283
	return skb_tail_pointer(skb);
Linus Torvalds's avatar
Linus Torvalds committed
1284
}
1285
EXPORT_SYMBOL(__pskb_pull_tail);
Linus Torvalds's avatar
Linus Torvalds committed
1286
1287
1288
1289
1290

/* Copy some data bits from skb to kernel buffer. */

int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
1291
	int start = skb_headlen(skb);
1292
1293
	struct sk_buff *frag_iter;
	int i, copy;
Linus Torvalds's avatar
Linus Torvalds committed
1294
1295
1296
1297
1298

	if (offset > (int)skb->len - len)
		goto fault;

	/* Copy header. */
1299
	if ((copy = start - offset) > 0) {
Linus Torvalds's avatar
Linus Torvalds committed
1300
1301
		if (copy > len)
			copy = len;
1302
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
Linus Torvalds's avatar
Linus Torvalds committed
1303
1304
1305
1306
1307
1308
1309
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to     += copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1310
		int end;
Linus Torvalds's avatar
Linus Torvalds committed
1311

1312
		WARN_ON(start > offset + len);
1313
1314

		end = start + skb_shinfo(skb)->frags[i].size;
Linus Torvalds's avatar
Linus Torvalds committed
1315
1316
1317
1318
1319
1320
1321
1322
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
			memcpy(to,
1323
1324
			       vaddr + skb_shinfo(skb)->frags[i].page_offset+
			       offset - start, copy);
Linus Torvalds's avatar
Linus Torvalds committed
1325
1326
1327
1328
1329
1330
1331
			kunmap_skb_frag(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
		}
1332
		start = end;
Linus Torvalds's avatar
Linus Torvalds committed
1333
1334
	}

1335
1336
	skb_walk_frags(skb, frag_iter) {
		int end;
Linus Torvalds's avatar
Linus Torvalds committed
1337

1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
Linus Torvalds's avatar
Linus Torvalds committed
1350
		}
1351
		start = end;
Linus Torvalds's avatar
Linus Torvalds committed
1352
1353
1354
1355
1356
1357
1358
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
1359
EXPORT_SYMBOL(skb_copy_bits);
Linus Torvalds's avatar
Linus Torvalds committed
1360

Jens Axboe's avatar
Jens Axboe committed
1361
1362
1363
1364
1365
1366
/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
1367
1368
	put_page(spd->pages[i]);
}
Jens Axboe's avatar
Jens Axboe committed
1369

1370
1371
static inline struct page *linear_to_page(struct page *page, unsigned int *len,
					  unsigned int *offset,
1372
					  struct sk_buff *skb, struct sock *sk)
1373
{
1374
1375
1376
1377
1378
1379
1380
1381
	struct page *p = sk->sk_sndmsg_page;
	unsigned int off;

	if (!p) {
new_page:
		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
		if (!p)
			return NULL;
1382

1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
		off = sk->sk_sndmsg_off = 0;
		/* hold one ref to this page until it's full */
	} else {
		unsigned int mlen;

		off = sk->sk_sndmsg_off;
		mlen = PAGE_SIZE - off;
		if (mlen < 64 && mlen < *len) {
			put_page(p);
			goto new_page;
		}

		*len = min_t(unsigned int, *len, mlen);
	}

	memcpy(page_address(p) + off, page_address(page) + *offset, *len);
	sk->sk_sndmsg_off += *len;
	*offset = off;
	get_page(p);
1402
1403

	return p;
Jens Axboe's avatar
Jens Axboe committed
1404
1405
1406
1407
1408
1409
}

/*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
1410
				unsigned int *len, unsigned int offset,
1411
1412
				struct sk_buff *skb, int linear,
				struct sock *sk)
Jens Axboe's avatar
Jens Axboe committed
1413
1414
1415
1416
{
	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
		return 1;

1417
	if (linear) {
1418
		page = linear_to_page(page, len, &offset, skb, sk);
1419
1420
1421
1422
1423
		if (!page)
			return 1;
	} else
		get_page(page);

Jens Axboe's avatar
Jens Axboe committed
1424
	spd->pages[spd->nr_pages] = page;
1425
	spd->partial[spd->nr_pages].len = *len;
Jens Axboe's avatar
Jens Axboe committed
1426
1427
	spd->partial[spd->nr_pages].offset = offset;
	spd->nr_pages++;
1428

Jens Axboe's avatar
Jens Axboe committed