ip_fragment.c 17.4 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1
2
3
4
5
6
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima          :       ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

Herbert Xu's avatar
Herbert Xu committed
23
#include <linux/compiler.h>
Linus Torvalds's avatar
Linus Torvalds committed
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
Herbert Xu's avatar
Herbert Xu committed
39
#include <net/inetpeer.h>
40
#include <net/inet_frag.h>
Linus Torvalds's avatar
Linus Torvalds committed
41
42
43
44
45
46
47
48
49
50
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */

51
/* Largest advance of the peer's rid counter tolerated between two fragments
 * of one queue (see ip_frag_too_far()); 0 disables the check and also stops
 * ip4_frag_init() from looking up a peer at all.
 */
static int sysctl_ipfrag_max_dist __read_mostly = 64;
Herbert Xu's avatar
Herbert Xu committed
52

Linus Torvalds's avatar
Linus Torvalds committed
53
54
55
56
57
58
/* Layout of skb->cb while an skb sits in a reassembly queue (accessed via
 * FRAG_CB()): struct inet_skb_parm comes first, followed by the byte offset
 * of this fragment's data inside the datagram being rebuilt.
 */
struct ipfrag_skb_cb {
	struct inet_skb_parm	h;
	int			offset;
};

59
/* View the control buffer of @skb as a struct ipfrag_skb_cb. */
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
Linus Torvalds's avatar
Linus Torvalds committed
60
61
62

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
63
64
	struct inet_frag_queue q;

Linus Torvalds's avatar
Linus Torvalds committed
65
	u32		user;
66
67
68
	__be32		saddr;
	__be32		daddr;
	__be16		id;
Linus Torvalds's avatar
Linus Torvalds committed
69
	u8		protocol;
Herbert Xu's avatar
Herbert Xu committed
70
71
72
	int             iif;
	unsigned int    rid;
	struct inet_peer *peer;
Linus Torvalds's avatar
Linus Torvalds committed
73
74
};

75
/* IPv4 reassembly descriptor shared by all namespaces; its callbacks and
 * parameters are filled in by ipfrag_init().
 */
static struct inet_frags ip4_frags;
Linus Torvalds's avatar
Linus Torvalds committed
76

77
int ip_frag_nqueues(struct net *net)
78
{
79
	return net->ipv4.frags.nqueues;
80
}
Linus Torvalds's avatar
Linus Torvalds committed
81

82
int ip_frag_mem(struct net *net)
83
{
84
	return atomic_read(&net->ipv4.frags.mem);
85
}
Linus Torvalds's avatar
Linus Torvalds committed
86

87
88
89
/* Forward declaration: ip_frag_queue() calls ip_frag_reasm() before its
 * definition further down in this file.
 */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev);

90
91
92
93
94
/* Lookup/creation key built by ip_find() and handed (as void *) to
 * ip4_frag_match() and ip4_frag_init().
 */
struct ip4_create_arg {
	struct iphdr *iph;	/* header of the fragment being processed */
	u32 user;		/* defrag user passed to ip_defrag() */
};

95
/* Hash the (id, protocol, saddr, daddr) tuple of a fragment with
 * jhash_3words(), seeded with ip4_frags.rnd, and reduce the result to a
 * bucket index by masking with INETFRAGS_HASHSZ - 1.
 */
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
}

102
static unsigned int ip4_hashfn(struct inet_frag_queue *q)
Linus Torvalds's avatar
Linus Torvalds committed
103
{
104
	struct ipq *ipq;
Linus Torvalds's avatar
Linus Torvalds committed
105

106
107
	ipq = container_of(q, struct ipq, q);
	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
Linus Torvalds's avatar
Linus Torvalds committed
108
109
}

110
111
112
113
114
115
116
117
118
119
120
121
122
static int ip4_frag_match(struct inet_frag_queue *q, void *a)
{
	struct ipq *qp;
	struct ip4_create_arg *arg = a;

	qp = container_of(q, struct ipq, q);
	return (qp->id == arg->iph->id &&
			qp->saddr == arg->iph->saddr &&
			qp->daddr == arg->iph->daddr &&
			qp->protocol == arg->iph->protocol &&
			qp->user == arg->user);
}

Linus Torvalds's avatar
Linus Torvalds committed
123
/* Memory Tracking Functions. */
124
125
static __inline__ void frag_kfree_skb(struct netns_frags *nf,
		struct sk_buff *skb, int *work)
Linus Torvalds's avatar
Linus Torvalds committed
126
127
128
{
	if (work)
		*work -= skb->truesize;
129
	atomic_sub(skb->truesize, &nf->mem);
Linus Torvalds's avatar
Linus Torvalds committed
130
131
132
	kfree_skb(skb);
}

133
134
135
136
137
138
139
140
141
142
143
144
145
146
static void ip4_frag_init(struct inet_frag_queue *q, void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
	struct ip4_create_arg *arg = a;

	qp->protocol = arg->iph->protocol;
	qp->id = arg->iph->id;
	qp->saddr = arg->iph->saddr;
	qp->daddr = arg->iph->daddr;
	qp->user = arg->user;
	qp->peer = sysctl_ipfrag_max_dist ?
		inet_getpeer(arg->iph->saddr, 1) : NULL;
}

147
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
Linus Torvalds's avatar
Linus Torvalds committed
148
{
149
150
151
152
153
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
Linus Torvalds's avatar
Linus Torvalds committed
154
155
156
157
158
}


/* Destruction primitives. */

159
static __inline__ void ipq_put(struct ipq *ipq)
Linus Torvalds's avatar
Linus Torvalds committed
160
{
161
	inet_frag_put(&ipq->q, &ip4_frags);
Linus Torvalds's avatar
Linus Torvalds committed
162
163
164
165
166
167
168
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
169
	inet_frag_kill(&ipq->q, &ip4_frags);
Linus Torvalds's avatar
Linus Torvalds committed
170
171
}

172
/* Memory limiting on fragments.  Evictor trashes the oldest
Linus Torvalds's avatar
Linus Torvalds committed
173
174
 * fragment queue until we are back under the threshold.
 */
175
static void ip_evictor(struct net *net)
Linus Torvalds's avatar
Linus Torvalds committed
176
{
177
178
	int evicted;

179
	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
180
	if (evicted)
181
		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
Linus Torvalds's avatar
Linus Torvalds committed
182
183
184
185
186
187
188
}

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(unsigned long arg)
{
189
	struct ipq *qp;
190
	struct net *net;
191
192

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
193
	net = container_of(qp->q.net, struct net, ipv4.frags);
Linus Torvalds's avatar
Linus Torvalds committed
194

195
	spin_lock(&qp->q.lock);
Linus Torvalds's avatar
Linus Torvalds committed
196

197
	if (qp->q.last_in & INET_FRAG_COMPLETE)
Linus Torvalds's avatar
Linus Torvalds committed
198
199
200
201
		goto out;

	ipq_kill(qp);

202
203
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
Linus Torvalds's avatar
Linus Torvalds committed
204

205
	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206
		struct sk_buff *head = qp->q.fragments;
207

Linus Torvalds's avatar
Linus Torvalds committed
208
		/* Send an ICMP "Fragment Reassembly Timeout" message. */
209
		if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) {
Linus Torvalds's avatar
Linus Torvalds committed
210
211
212
213
214
			icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
			dev_put(head->dev);
		}
	}
out:
215
	spin_unlock(&qp->q.lock);
216
	ipq_put(qp);
Linus Torvalds's avatar
Linus Torvalds committed
217
218
}

219
220
221
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
222
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
Linus Torvalds's avatar
Linus Torvalds committed
223
{
224
225
	struct inet_frag_queue *q;
	struct ip4_create_arg arg;
226
	unsigned int hash;
Linus Torvalds's avatar
Linus Torvalds committed
227

228
229
	arg.iph = iph;
	arg.user = user;
230
231

	read_lock(&ip4_frags.lock);
232
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
Linus Torvalds's avatar
Linus Torvalds committed
233

234
	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
235
236
	if (q == NULL)
		goto out_nomem;
Linus Torvalds's avatar
Linus Torvalds committed
237

238
	return container_of(q, struct ipq, q);
Linus Torvalds's avatar
Linus Torvalds committed
239
240

out_nomem:
241
	LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
Linus Torvalds's avatar
Linus Torvalds committed
242
243
244
	return NULL;
}

Herbert Xu's avatar
Herbert Xu committed
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/* Is the fragment too far ahead to be part of ipq? */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

261
	rc = qp->q.fragments && (end - start) > max;
Herbert Xu's avatar
Herbert Xu committed
262
263

	if (rc) {
264
265
266
267
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
Herbert Xu's avatar
Herbert Xu committed
268
269
270
271
272
273
274
275
276
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;

277
	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
278
		atomic_inc(&qp->q.refcnt);
Herbert Xu's avatar
Herbert Xu committed
279
280
281
		return -ETIMEDOUT;
	}

282
	fp = qp->q.fragments;
Herbert Xu's avatar
Herbert Xu committed
283
284
	do {
		struct sk_buff *xp = fp->next;
285
		frag_kfree_skb(qp->q.net, fp, NULL);
Herbert Xu's avatar
Herbert Xu committed
286
287
288
		fp = xp;
	} while (fp);

289
290
291
292
	qp->q.last_in = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
Herbert Xu's avatar
Herbert Xu committed
293
294
295
296
297
	qp->iif = 0;

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
298
/* Add new segment to existing queue. */
299
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
Linus Torvalds's avatar
Linus Torvalds committed
300
301
{
	struct sk_buff *prev, *next;
302
	struct net_device *dev;
Linus Torvalds's avatar
Linus Torvalds committed
303
304
	int flags, offset;
	int ihl, end;
305
	int err = -ENOENT;
Linus Torvalds's avatar
Linus Torvalds committed
306

307
	if (qp->q.last_in & INET_FRAG_COMPLETE)
Linus Torvalds's avatar
Linus Torvalds committed
308
309
		goto err;

Herbert Xu's avatar
Herbert Xu committed
310
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
311
312
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
Herbert Xu's avatar
Herbert Xu committed
313
314
315
316
		ipq_kill(qp);
		goto err;
	}

317
	offset = ntohs(ip_hdr(skb)->frag_off);
Linus Torvalds's avatar
Linus Torvalds committed
318
319
320
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
321
	ihl = ip_hdrlen(skb);
Linus Torvalds's avatar
Linus Torvalds committed
322
323

	/* Determine the position of this fragment. */
324
	end = offset + skb->len - ihl;
325
	err = -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
326
327
328
329
330
331

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrrupted.
		 */
332
		if (end < qp->q.len ||
333
		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
Linus Torvalds's avatar
Linus Torvalds committed
334
			goto err;
335
		qp->q.last_in |= INET_FRAG_LAST_IN;
336
		qp->q.len = end;
Linus Torvalds's avatar
Linus Torvalds committed
337
338
339
340
341
342
	} else {
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
343
		if (end > qp->q.len) {
Linus Torvalds's avatar
Linus Torvalds committed
344
			/* Some bits beyond end -> corruption. */
345
			if (qp->q.last_in & INET_FRAG_LAST_IN)
Linus Torvalds's avatar
Linus Torvalds committed
346
				goto err;
347
			qp->q.len = end;
Linus Torvalds's avatar
Linus Torvalds committed
348
349
350
351
352
		}
	}
	if (end == offset)
		goto err;

353
	err = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
354
355
	if (pskb_pull(skb, ihl) == NULL)
		goto err;
356
357
358

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
Linus Torvalds's avatar
Linus Torvalds committed
359
360
361
362
363
364
365
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = NULL;
366
	for (next = qp->q.fragments; next != NULL; next = next->next) {
Linus Torvalds's avatar
Linus Torvalds committed
367
368
369
370
371
372
373
374
375
376
377
378
379
380
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			offset += i;
381
			err = -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
382
383
			if (end <= offset)
				goto err;
384
			err = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
385
386
387
388
389
390
391
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

392
393
	err = -ENOMEM;

Linus Torvalds's avatar
Linus Torvalds committed
394
395
396
397
398
399
400
401
402
403
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
404
			qp->q.meat -= i;
Linus Torvalds's avatar
Linus Torvalds committed
405
406
407
408
409
410
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

411
			/* Old fragment is completely overridden with
Linus Torvalds's avatar
Linus Torvalds committed
412
413
414
415
416
417
418
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
419
				qp->q.fragments = next;
Linus Torvalds's avatar
Linus Torvalds committed
420

421
			qp->q.meat -= free_it->len;
422
			frag_kfree_skb(qp->q.net, free_it, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
423
424
425
426
427
428
429
430
431
432
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (prev)
		prev->next = skb;
	else
433
		qp->q.fragments = skb;
Linus Torvalds's avatar
Linus Torvalds committed
434

435
436
437
438
439
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
440
441
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
442
	atomic_add(skb->truesize, &qp->q.net->mem);
Linus Torvalds's avatar
Linus Torvalds committed
443
	if (offset == 0)
444
		qp->q.last_in |= INET_FRAG_FIRST_IN;
Linus Torvalds's avatar
Linus Torvalds committed
445

446
447
	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len)
448
449
		return ip_frag_reasm(qp, prev, dev);

450
	write_lock(&ip4_frags.lock);
451
	list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list);
452
	write_unlock(&ip4_frags.lock);
453
	return -EINPROGRESS;
Linus Torvalds's avatar
Linus Torvalds committed
454
455
456

err:
	kfree_skb(skb);
457
	return err;
Linus Torvalds's avatar
Linus Torvalds committed
458
459
460
461
462
}


/* Build a new IP datagram from all its fragments. */

463
464
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
Linus Torvalds's avatar
Linus Torvalds committed
465
466
{
	struct iphdr *iph;
467
	struct sk_buff *fp, *head = qp->q.fragments;
Linus Torvalds's avatar
Linus Torvalds committed
468
469
	int len;
	int ihlen;
470
	int err;
Linus Torvalds's avatar
Linus Torvalds committed
471
472
473

	ipq_kill(qp);

474
475
476
477
478
479
480
481
482
483
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
		prev->next = fp;

484
485
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;
486

487
488
		kfree_skb(qp->q.fragments);
		qp->q.fragments = head;
489
490
	}

491
492
	WARN_ON(head == NULL);
	WARN_ON(FRAG_CB(head)->offset != 0);
Linus Torvalds's avatar
Linus Torvalds committed
493
494

	/* Allocate a new buffer for the datagram. */
495
	ihlen = ip_hdrlen(head);
496
	len = ihlen + qp->q.len;
Linus Torvalds's avatar
Linus Torvalds committed
497

498
	err = -E2BIG;
Stephen Hemminger's avatar
Stephen Hemminger committed
499
	if (len > 65535)
Linus Torvalds's avatar
Linus Torvalds committed
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
		goto out_oversize;

	/* Head of list must not be cloned. */
	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
	if (skb_shinfo(head)->frag_list) {
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_shinfo(head)->frag_list = NULL;
		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
			plen += skb_shinfo(head)->frags[i].size;
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
526
		atomic_add(clone->truesize, &qp->q.net->mem);
Linus Torvalds's avatar
Linus Torvalds committed
527
528
529
	}

	skb_shinfo(head)->frag_list = head->next;
530
	skb_push(head, head->data - skb_network_header(head));
531
	atomic_sub(head->truesize, &qp->q.net->mem);
Linus Torvalds's avatar
Linus Torvalds committed
532
533
534
535
536
537

	for (fp=head->next; fp; fp = fp->next) {
		head->data_len += fp->len;
		head->len += fp->len;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
538
		else if (head->ip_summed == CHECKSUM_COMPLETE)
Linus Torvalds's avatar
Linus Torvalds committed
539
540
			head->csum = csum_add(head->csum, fp->csum);
		head->truesize += fp->truesize;
541
		atomic_sub(fp->truesize, &qp->q.net->mem);
Linus Torvalds's avatar
Linus Torvalds committed
542
543
544
545
	}

	head->next = NULL;
	head->dev = dev;
546
	head->tstamp = qp->q.stamp;
Linus Torvalds's avatar
Linus Torvalds committed
547

548
	iph = ip_hdr(head);
Linus Torvalds's avatar
Linus Torvalds committed
549
550
	iph->frag_off = 0;
	iph->tot_len = htons(len);
551
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS);
552
	qp->q.fragments = NULL;
553
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
554
555

out_nomem:
556
	LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
557
			      "queue %p\n", qp);
558
	err = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
559
560
561
	goto out_fail;
out_oversize:
	if (net_ratelimit())
562
563
		printk(KERN_INFO "Oversized IP packet from %pI4.\n",
			&qp->saddr);
Linus Torvalds's avatar
Linus Torvalds committed
564
out_fail:
565
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS);
566
	return err;
Linus Torvalds's avatar
Linus Torvalds committed
567
568
569
}

/* Process an incoming IP datagram fragment. */
570
int ip_defrag(struct sk_buff *skb, u32 user)
Linus Torvalds's avatar
Linus Torvalds committed
571
572
{
	struct ipq *qp;
573
	struct net *net;
574

575
	net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev);
576
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
Linus Torvalds's avatar
Linus Torvalds committed
577
578

	/* Start by cleaning up the memory. */
579
	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
580
		ip_evictor(net);
Linus Torvalds's avatar
Linus Torvalds committed
581
582

	/* Lookup (or create) queue header */
583
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
584
		int ret;
Linus Torvalds's avatar
Linus Torvalds committed
585

586
		spin_lock(&qp->q.lock);
Linus Torvalds's avatar
Linus Torvalds committed
587

588
		ret = ip_frag_queue(qp, skb);
Linus Torvalds's avatar
Linus Torvalds committed
589

590
		spin_unlock(&qp->q.lock);
591
		ipq_put(qp);
592
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
593
594
	}

595
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
Linus Torvalds's avatar
Linus Torvalds committed
596
	kfree_skb(skb);
597
	return -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
598
599
}

600
601
602
#ifdef CONFIG_SYSCTL
/* Lower bound (.extra1) for the ipfrag_max_dist sysctl entry. */
static int zero;

603
static struct ctl_table ip4_frags_ns_ctl_table[] = {
604
605
606
	{
		.ctl_name	= NET_IPV4_IPFRAG_HIGH_THRESH,
		.procname	= "ipfrag_high_thresh",
607
		.data		= &init_net.ipv4.frags.high_thresh,
608
609
610
611
612
613
614
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec
	},
	{
		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
		.procname	= "ipfrag_low_thresh",
615
		.data		= &init_net.ipv4.frags.low_thresh,
616
617
618
619
620
621
622
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec
	},
	{
		.ctl_name	= NET_IPV4_IPFRAG_TIME,
		.procname	= "ipfrag_time",
623
		.data		= &init_net.ipv4.frags.timeout,
624
625
626
627
628
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies
	},
629
630
631
632
	{ }
};

static struct ctl_table ip4_frags_ctl_table[] = {
633
634
635
	{
		.ctl_name	= NET_IPV4_IPFRAG_SECRET_INTERVAL,
		.procname	= "ipfrag_secret_interval",
636
		.data		= &ip4_frags.secret_interval,
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies
	},
	{
		.procname	= "ipfrag_max_dist",
		.data		= &sysctl_ipfrag_max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_minmax,
		.extra1		= &zero
	},
	{ }
};

653
static int ip4_frags_ns_ctl_register(struct net *net)
654
{
655
	struct ctl_table *table;
656
657
	struct ctl_table_header *hdr;

658
	table = ip4_frags_ns_ctl_table;
659
	if (net != &init_net) {
660
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
661
662
663
		if (table == NULL)
			goto err_alloc;

664
665
		table[0].data = &net->ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
666
		table[2].data = &net->ipv4.frags.timeout;
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
	}

	hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
	if (hdr == NULL)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	if (net != &init_net)
		kfree(table);
err_alloc:
	return -ENOMEM;
}

683
static void ip4_frags_ns_ctl_unregister(struct net *net)
684
685
686
687
688
689
{
	struct ctl_table *table;

	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
690
}
691
692
693
694
695

/* Register the sysctls of ip4_frags_ctl_table under net_ipv4_ctl_path via
 * register_net_sysctl_rotable().  The return value is not checked.
 */
static void ip4_frags_ctl_register(void)
{
	register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
}
696
#else
697
/* !CONFIG_SYSCTL stub: nothing to register, always succeeds. */
static inline int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}
701

702
/* !CONFIG_SYSCTL stub: nothing was registered, nothing to undo. */
static inline void ip4_frags_ns_ctl_unregister(struct net *net)
{
}
705
706
707
708

/* !CONFIG_SYSCTL stub: no sysctls to register. */
static inline void ip4_frags_ctl_register(void)
{
}
709
710
711
712
#endif

static int ipv4_frags_init_net(struct net *net)
{
713
714
715
716
717
718
719
720
	/*
	 * Fragment cache limits. We will commit 256K at one time. Should we
	 * cross that limit we will prune down to 192K. This should cope with
	 * even the most extreme cases without allowing an attacker to
	 * measurably harm machine performance.
	 */
	net->ipv4.frags.high_thresh = 256 * 1024;
	net->ipv4.frags.low_thresh = 192 * 1024;
721
722
723
724
725
726
727
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

728
729
	inet_frags_init_net(&net->ipv4.frags);

730
	return ip4_frags_ns_ctl_register(net);
731
732
}

733
734
static void ipv4_frags_exit_net(struct net *net)
{
735
	ip4_frags_ns_ctl_unregister(net);
736
737
738
739
740
741
742
743
	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}

/* Per-namespace setup/teardown hooks, registered from ipfrag_init(). */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

744
/* Boot-time initialisation of IPv4 reassembly: register the global
 * sysctls and the per-namespace operations, fill in the callbacks and
 * parameters of ip4_frags, then hand it to inet_frags_init().
 *
 * The return value of register_pernet_subsys() is not checked.
 */
void __init ipfrag_init(void)
{
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
	ip4_frags.hashfn = ip4_hashfn;
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.skb_free = NULL;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.match = ip4_frag_match;
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.secret_interval = 10 * 60 * HZ;
	inet_frags_init(&ip4_frags);
}

EXPORT_SYMBOL(ip_defrag);