fib_trie.c 58.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of 
 *     Agricultural Sciences.
 * 
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally descibed in:
 * 
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

46
#define VERSION "0.404"
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
65
#include <linux/rcupdate.h>
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_CHILDS 16384

#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
Olof Johansson's avatar
Olof Johansson committed
90
#define NODE_PARENT(node) \
91
92
93
94
95
96
97
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define NODE_SET_PARENT(node, ptr)		\
	rcu_assign_pointer((node)->parent,	\
			   ((unsigned long)(ptr)) | NODE_TYPE(node))
Olof Johansson's avatar
Olof Johansson committed
98
99
100

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
101
102

struct node {
Olof Johansson's avatar
Olof Johansson committed
103
104
	t_key key;
	unsigned long parent;
105
106
107
};

struct leaf {
Olof Johansson's avatar
Olof Johansson committed
108
109
	t_key key;
	unsigned long parent;
110
	struct hlist_head list;
111
	struct rcu_head rcu;
112
113
114
115
};

struct leaf_info {
	struct hlist_node hlist;
116
	struct rcu_head rcu;
117
118
119
120
121
	int plen;
	struct list_head falh;
};

struct tnode {
Olof Johansson's avatar
Olof Johansson committed
122
123
124
125
126
127
	t_key key;
	unsigned long parent;
	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short full_children;	/* KEYLENGTH bits needed */
	unsigned short empty_children;	/* KEYLENGTH bits needed */
128
	struct rcu_head rcu;
Olof Johansson's avatar
Olof Johansson committed
129
	struct node *child[0];
130
131
132
133
134
135
136
137
138
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
139
	unsigned int resize_node_skipped;
140
141
142
143
144
145
146
147
148
149
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int nodesizes[MAX_CHILDS];
150
};
151
152

struct trie {
Olof Johansson's avatar
Olof Johansson committed
153
	struct node *trie;
154
155
156
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
Olof Johansson's avatar
Olof Johansson committed
157
	int size;
158
159
160
161
162
163
	unsigned int revision;
};

static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
164
165
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
166
167
static void tnode_free(struct tnode *tn);

168
static kmem_cache_t *fn_alias_kmem __read_mostly;
169
170
static struct trie *trie_local = NULL, *trie_main = NULL;

171
172
173

/* rcu_read_lock needs to be hold by caller from readside */

174
static inline struct node *tnode_get_child(struct tnode *tn, int i)
175
{
Olof Johansson's avatar
Olof Johansson committed
176
	BUG_ON(i >= 1 << tn->bits);
177

178
	return rcu_dereference(tn->child[i]);
179
180
}

181
static inline int tnode_child_length(const struct tnode *tn)
182
{
Olof Johansson's avatar
Olof Johansson committed
183
	return 1 << tn->bits;
184
185
186
187
}

static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
Olof Johansson's avatar
Olof Johansson committed
188
	if (offset < KEYLENGTH)
189
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
Olof Johansson's avatar
Olof Johansson committed
190
	else
191
192
193
194
195
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
196
	return a == b;
197
198
199
200
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
201
202
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
Olof Johansson's avatar
Olof Johansson committed
203
204
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
205
}
206
207
208
209
210
211

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

212
213
214
	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
215
216
217
218
219
220
221
222
223
224
225
226
		i++;
	return i;
}

/*
  To understand this stuff, an understanding of keys and all their bits is 
  necessary. Every node in the trie has a key associated with it, but not 
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

  If n is a leaf, every bit in its key is significant. Its presence is 
227
  necessitated by path compression, since during a tree traversal (when 
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
  searching for a leaf - unless we are doing an insertion) we will completely 
  ignore all skipped bits we encounter. Thus we need to verify, at the end of 
  a potentially successful search, that we have indeed been walking the 
  correct key path.

  Note that we can never "miss" the correct key in the tree if present by 
  following the wrong path. Path compression ensures that segments of the key 
  that are the same for all keys with a given prefix are skipped, but the 
  skipped part *is* identical for each node in the subtrie below the skipped 
  bit! trie_insert() in this implementation takes care of that - note the 
  call to tkey_sub_equals() in trie_insert().

  if n is an internal node - a 'tnode' here, the various parts of its key 
  have many different meanings.

  Example:  
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
Olof Johansson's avatar
Olof Johansson committed
257
  n->bits = 4
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

  First, let's just ignore the bits that come before the parent tp, that is 
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do 
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  index into the parent's child array. That is, they will be used to find 
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.

  All the bits we have seen so far are significant to the node n. The rest 
  of the bits are really not needed or indeed known in n->key.

  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 
  n's child array, and will of course be different for each child.
  
276

277
278
279
280
281
  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

Stephen Hemminger's avatar
Stephen Hemminger committed
282
static inline void check_tnode(const struct tnode *tn)
283
{
Stephen Hemminger's avatar
Stephen Hemminger committed
284
	WARN_ON(tn && tn->pos+tn->bits > 32);
285
286
287
288
}

static int halve_threshold = 25;
static int inflate_threshold = 50;
289
290
static int halve_threshold_root = 15;
static int inflate_threshold_root = 25; 
291

292
293

static void __alias_free_mem(struct rcu_head *head)
294
{
295
296
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
297
298
}

299
static inline void alias_free_mem_rcu(struct fib_alias *fa)
300
{
301
302
	call_rcu(&fa->rcu, __alias_free_mem);
}
Olof Johansson's avatar
Olof Johansson committed
303

304
305
306
307
static void __leaf_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf, rcu));
}
Olof Johansson's avatar
Olof Johansson committed
308

309
310
311
static inline void free_leaf(struct leaf *leaf)
{
	call_rcu(&leaf->rcu, __leaf_free_rcu);
312
313
}

314
static void __leaf_info_free_rcu(struct rcu_head *head)
315
{
316
	kfree(container_of(head, struct leaf_info, rcu));
317
318
}

319
static inline void free_leaf_info(struct leaf_info *leaf)
320
{
321
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
322
323
}

324
325
static struct tnode *tnode_alloc(unsigned int size)
{
326
327
328
329
330
331
332
333
334
335
	struct page *pages;

	if (size <= PAGE_SIZE)
		return kcalloc(size, 1, GFP_KERNEL);

	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
	if (!pages)
		return NULL;

	return page_address(pages);
336
337
}

338
static void __tnode_free_rcu(struct rcu_head *head)
339
{
340
	struct tnode *tn = container_of(head, struct tnode, rcu);
341
	unsigned int size = sizeof(struct tnode) +
342
		(1 << tn->bits) * sizeof(struct node *);
343
344
345
346
347
348
349

	if (size <= PAGE_SIZE)
		kfree(tn);
	else
		free_pages((unsigned long)tn, get_order(size));
}

350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
static inline void tnode_free(struct tnode *tn)
{
	call_rcu(&tn->rcu, __tnode_free_rcu);
}

static struct leaf *leaf_new(void)
{
	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

375
376
377
378
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
	int nchildren = 1<<bits;
	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
379
	struct tnode *tn = tnode_alloc(sz);
380

Olof Johansson's avatar
Olof Johansson committed
381
	if (tn) {
382
		memset(tn, 0, sz);
383
		tn->parent = T_TNODE;
384
385
386
387
388
389
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}
390

Stephen Hemminger's avatar
Stephen Hemminger committed
391
392
	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
		 (unsigned int) (sizeof(struct node) * 1<<bits));
393
394
395
396
397
398
399
400
	return tn;
}

/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

401
static inline int tnode_full(const struct tnode *tn, const struct node *n)
402
{
403
	if (n == NULL || IS_LEAF(n))
404
405
406
407
408
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

409
static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
410
411
412
413
{
	tnode_put_child_reorg(tn, i, n, -1);
}

414
 /*
415
416
417
418
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

419
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
420
{
421
	struct node *chi = tn->child[i];
422
423
	int isfull;

Stephen Hemminger's avatar
Stephen Hemminger committed
424
425
	BUG_ON(i >= 1<<tn->bits);

426
427
428
429
430
431

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;
432

433
	/* update fullChildren */
Olof Johansson's avatar
Olof Johansson committed
434
	if (wasfull == -1)
435
436
437
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
438
	if (wasfull && !isfull)
439
		tn->full_children--;
440
	else if (!wasfull && isfull)
441
		tn->full_children++;
Olof Johansson's avatar
Olof Johansson committed
442

443
444
	if (n)
		NODE_SET_PARENT(n, tn);
445

446
	rcu_assign_pointer(tn->child[i], n);
447
448
}

449
static struct node *resize(struct trie *t, struct tnode *tn)
450
451
{
	int i;
452
	int err = 0;
453
	struct tnode *old_tn;
454
455
	int inflate_threshold_use;
	int halve_threshold_use;
456
457
458
459

 	if (!tn)
		return NULL;

Stephen Hemminger's avatar
Stephen Hemminger committed
460
461
	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);
462
463
464
465
466
467
468
469
470

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
Olof Johansson's avatar
Olof Johansson committed
471
			struct node *n;
472

Olof Johansson's avatar
Olof Johansson committed
473
			n = tn->child[i];
474
			if (!n)
Olof Johansson's avatar
Olof Johansson committed
475
476
477
				continue;

			/* compress one level */
478
			NODE_SET_PARENT(n, NULL);
Olof Johansson's avatar
Olof Johansson committed
479
480
			tnode_free(tn);
			return n;
481
		}
482
	/*
483
484
485
486
487
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
488
489
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
490
	 * Telecommunications, page 6:
491
	 * "A node is doubled if the ratio of non-empty children to all
492
493
	 * children in the *doubled* node is at least 'high'."
	 *
494
495
496
497
498
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
499
	 * multiply the left-hand side by 50.
500
501
502
503
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
504
	 * children, that is non-null tnodes with a skip value of 0.
505
	 * All of those will be doubled in the resulting inflated tnode, so
506
	 * we just count them one extra time here.
507
	 *
508
	 * A clearer way to write this would be:
509
	 *
510
	 * to_be_doubled = tn->full_children;
511
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
512
513
514
515
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
516
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
517
518
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
519
520
521
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
522
523
524
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
525
	 *
526
527
528
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
529
	 *
530
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
531
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
Olof Johansson's avatar
Olof Johansson committed
532
	 *    tn->full_children) >= inflate_threshold * new_child_length
533
	 *
534
	 * expand new_child_length:
535
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
Olof Johansson's avatar
Olof Johansson committed
536
	 *    tn->full_children) >=
537
	 *      inflate_threshold * tnode_child_length(tn) * 2
538
	 *
539
	 * shorten again:
540
	 * 50 * (tn->full_children + tnode_child_length(tn) -
Olof Johansson's avatar
Olof Johansson committed
541
	 *    tn->empty_children) >= inflate_threshold *
542
	 *    tnode_child_length(tn)
543
	 *
544
545
546
	 */

	check_tnode(tn);
547

548
549
550
551
552
553
554
	/* Keep root node larger  */

	if(!tn->parent)
		inflate_threshold_use = inflate_threshold_root;
	else 
		inflate_threshold_use = inflate_threshold;

555
	err = 0;
556
557
	while ((tn->full_children > 0 &&
	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
558
				inflate_threshold_use * tnode_child_length(tn))) {
559

560
561
562
563
		old_tn = tn;
		tn = inflate(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
564
565
566
567
568
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
569
570
571
572
573
574
575
576
	}

	check_tnode(tn);

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
577

578
579
580
581
582
583
584
585

	/* Keep root node larger  */

	if(!tn->parent)
		halve_threshold_use = halve_threshold_root;
	else 
		halve_threshold_use = halve_threshold;

586
	err = 0;
587
588
	while (tn->bits > 1 &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
589
	       halve_threshold_use * tnode_child_length(tn)) {
590

591
592
593
594
		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
595
596
597
598
599
600
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}
601

602

603
604
605
	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
Olof Johansson's avatar
Olof Johansson committed
606
			struct node *n;
607

Olof Johansson's avatar
Olof Johansson committed
608
			n = tn->child[i];
609
			if (!n)
Olof Johansson's avatar
Olof Johansson committed
610
611
612
613
				continue;

			/* compress one level */

614
			NODE_SET_PARENT(n, NULL);
Olof Johansson's avatar
Olof Johansson committed
615
616
			tnode_free(tn);
			return n;
617
618
619
620
621
		}

	return (struct node *) tn;
}

622
static struct tnode *inflate(struct trie *t, struct tnode *tn)
623
624
625
626
627
628
{
	struct tnode *inode;
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

Stephen Hemminger's avatar
Stephen Hemminger committed
629
	pr_debug("In inflate\n");
630
631
632

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

Stephen Hemminger's avatar
Stephen Hemminger committed
633
	if (!tn)
634
		return ERR_PTR(-ENOMEM);
635
636

	/*
637
638
639
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and  inflate
640
641
	 * of tnode is ignored.
	 */
Olof Johansson's avatar
Olof Johansson committed
642
643

	for (i = 0; i < olen; i++) {
644
645
646
647
648
649
650
651
		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);

		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			t_key m = TKEY_GET_MASK(inode->pos, 1);
652

653
654
			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
655
656
			if (!left)
				goto nomem;
Olof Johansson's avatar
Olof Johansson committed
657

658
659
660
			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

661
662
663
664
                        if (!right) {
				tnode_free(left);
				goto nomem;
                        }
665
666
667
668
669
670

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

Olof Johansson's avatar
Olof Johansson committed
671
	for (i = 0; i < olen; i++) {
672
		struct node *node = tnode_get_child(oldtnode, i);
Olof Johansson's avatar
Olof Johansson committed
673
674
		struct tnode *left, *right;
		int size, j;
675

676
677
678
679
680
681
		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

682
		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
683
		   tn->pos + tn->bits - 1) {
684
			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
					     1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free(inode);
Olof Johansson's avatar
Olof Johansson committed
700
			continue;
701
702
		}

Olof Johansson's avatar
Olof Johansson committed
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two  new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */
721

Olof Johansson's avatar
Olof Johansson committed
722
723
724
		/* Use the old key, but set the new significant
		 *   bit to zero.
		 */
725

Olof Johansson's avatar
Olof Johansson committed
726
727
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);
728

Olof Johansson's avatar
Olof Johansson committed
729
		BUG_ON(!left);
730

Olof Johansson's avatar
Olof Johansson committed
731
732
		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);
733

Olof Johansson's avatar
Olof Johansson committed
734
		BUG_ON(!right);
735

Olof Johansson's avatar
Olof Johansson committed
736
737
738
739
		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, inode->child[j]);
			put_child(t, right, j, inode->child[j + size]);
740
		}
Olof Johansson's avatar
Olof Johansson committed
741
742
743
744
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free(inode);
745
746
747
	}
	tnode_free(oldtnode);
	return tn;
748
749
750
751
752
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

Stephen Hemminger's avatar
Stephen Hemminger committed
753
		for (j = 0; j < size; j++)
754
755
756
757
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
Stephen Hemminger's avatar
Stephen Hemminger committed
758

759
760
		return ERR_PTR(-ENOMEM);
	}
761
762
}

763
static struct tnode *halve(struct trie *t, struct tnode *tn)
764
765
766
767
768
769
{
	struct tnode *oldtnode = tn;
	struct node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

Stephen Hemminger's avatar
Stephen Hemminger committed
770
	pr_debug("In halve\n");
771
772

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
773

774
775
	if (!tn)
		return ERR_PTR(-ENOMEM);
776
777

	/*
778
779
780
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
781
782
783
	 * of tnode is ignored.
	 */

Olof Johansson's avatar
Olof Johansson committed
784
	for (i = 0; i < olen; i += 2) {
785
786
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
787

788
		/* Two nonempty children */
Stephen Hemminger's avatar
Stephen Hemminger committed
789
		if (left && right) {
790
			struct tnode *newn;
Stephen Hemminger's avatar
Stephen Hemminger committed
791

792
			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
Stephen Hemminger's avatar
Stephen Hemminger committed
793
794

			if (!newn)
795
				goto nomem;
Stephen Hemminger's avatar
Stephen Hemminger committed
796

797
			put_child(t, tn, i/2, (struct node *)newn);
798
799
800
		}

	}
801

Olof Johansson's avatar
Olof Johansson committed
802
803
804
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

805
806
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);
807

808
809
810
811
812
		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
Olof Johansson's avatar
Olof Johansson committed
813
			continue;
Stephen Hemminger's avatar
Stephen Hemminger committed
814
		}
Olof Johansson's avatar
Olof Johansson committed
815
816

		if (right == NULL) {
817
			put_child(t, tn, i/2, left);
Olof Johansson's avatar
Olof Johansson committed
818
819
			continue;
		}
820

821
		/* Two nonempty children */
Olof Johansson's avatar
Olof Johansson committed
822
823
824
825
826
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
827
828
829
	}
	tnode_free(oldtnode);
	return tn;
830
831
832
833
834
nomem:
	{
		int size = tnode_child_length(tn);
		int j;

Stephen Hemminger's avatar
Stephen Hemminger committed
835
		for (j = 0; j < size; j++)
836
837
838
839
			if (tn->child[j])
				tnode_free((struct tnode *)tn->child[j]);

		tnode_free(tn);
Stephen Hemminger's avatar
Stephen Hemminger committed
840

841
842
		return ERR_PTR(-ENOMEM);
	}
843
844
}

Olof Johansson's avatar
Olof Johansson committed
845
static void trie_init(struct trie *t)
846
{
Olof Johansson's avatar
Olof Johansson committed
847
848
849
850
	if (!t)
		return;

	t->size = 0;
851
	rcu_assign_pointer(t->trie, NULL);
Olof Johansson's avatar
Olof Johansson committed
852
	t->revision = 0;
853
#ifdef CONFIG_IP_FIB_TRIE_STATS
Olof Johansson's avatar
Olof Johansson committed
854
	memset(&t->stats, 0, sizeof(struct trie_use_stats));
855
856
857
#endif
}

858
/* readside must use rcu_read_lock currently dump routines
859
860
 via get_fa_head and dump */

861
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
862
{
863
	struct hlist_head *head = &l->list;
864
865
866
	struct hlist_node *node;
	struct leaf_info *li;

867
	hlist_for_each_entry_rcu(li, node, head, hlist)
868
		if (li->plen == plen)
869
			return li;
Olof Johansson's avatar
Olof Johansson committed
870

871
872
873
874
875
	return NULL;
}

static inline struct list_head * get_fa_head(struct leaf *l, int plen)
{
876
	struct leaf_info *li = find_leaf_info(l, plen);
877

Olof Johansson's avatar
Olof Johansson committed
878
879
	if (!li)
		return NULL;
880

Olof Johansson's avatar
Olof Johansson committed
881
	return &li->falh;
882
883
884
885
}

static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
        struct leaf_info *li = NULL, *last = NULL;
        struct hlist_node *node;

        if (hlist_empty(head)) {
                hlist_add_head_rcu(&new->hlist, head);
        } else {
                hlist_for_each_entry(li, node, head, hlist) {
                        if (new->plen > li->plen)
                                break;

                        last = li;
                }
                if (last)
                        hlist_add_after_rcu(&last->hlist, &new->hlist);
                else
                        hlist_add_before_rcu(&new->hlist, &li->hlist);
        }
903
904
}

905
906
/* rcu_read_lock needs to be hold by caller from readside */

907
908
909
910
911
912
913
914
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct node *n;

	pos = 0;
915
	n = rcu_dereference(t->trie);
916
917
918

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
Olof Johansson's avatar
Olof Johansson committed
919

920
		check_tnode(tn);
Olof Johansson's avatar
Olof Johansson committed
921

922
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
Olof Johansson's avatar
Olof Johansson committed
923
			pos = tn->pos + tn->bits;
924
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
Olof Johansson's avatar
Olof Johansson committed
925
		} else
926
927
928
929
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

Olof Johansson's avatar
Olof Johansson committed
930
931
932
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
	return NULL;
}

static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp = NULL;

	key = tn->key;

	while (tn != NULL && NODE_PARENT(tn) != NULL) {

		tp = NODE_PARENT(tn);
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize (t, (struct tnode *)tn);
		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
Olof Johansson's avatar
Olof Johansson committed
951

952
		if (!NODE_PARENT(tn))
953
954
955
956
957
			break;

		tn = NODE_PARENT(tn);
	}
	/* Handle last (top) tnode */
958
	if (IS_TNODE(tn))
959
960
961
962
963
		tn = (struct tnode*) resize(t, (struct tnode *)tn);

	return (struct node*) tn;
}

964
965
/* only used from updater-side */

966
967
static  struct list_head *
fib_insert_node(struct trie *t, int *err, u32 key, int plen)
968
969
970
971
972
973
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct node *n;
	struct leaf *l;
	int missbit;
974
	struct list_head *fa_head = NULL;
975
976
977
978
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
979
	n = t->trie;
980

981
982
	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in if, or we have reached an empty child slot,
983
	 * and we should just put our new leaf in that.
984
985
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
986
987
	 * not be the parent's 'pos'+'bits'!
	 *
988
	 * If it does match the current key, get pos/bits from it, extract
989
990
991
992
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
993
994
995
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
996
997
998
999
1000
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
Olof Johansson's avatar
Olof Johansson committed
1001

1002
		check_tnode(tn);
Olof Johansson's avatar
Olof Johansson committed
1003

1004
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1005
			tp = tn;
Olof Johansson's avatar
Olof Johansson committed
1006
			pos = tn->pos + tn->bits;
1007
1008
			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));

Stephen Hemminger's avatar
Stephen Hemminger committed
1009
			BUG_ON(n && NODE_PARENT(n) != tn);
Olof Johansson's avatar
Olof Johansson committed
1010
		} else
1011
1012
1013
1014
1015
1016
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
1017
	 * tp is n's (parent) ----> NULL or TNODE
1018
1019
	 */

Olof Johansson's avatar
Olof Johansson committed
1020
	BUG_ON(tp && IS_LEAF(tp));
1021
1022
1023

	/* Case 1: n is a leaf. Compare prefixes */

1024
	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
Olof Johansson's avatar
Olof Johansson committed
1025
1026
		struct leaf *l = (struct leaf *) n;

1027
		li = leaf_info_new(plen);
Olof Johansson's avatar
Olof Johansson committed
1028

1029
		if (!li) {
1030
1031
1032
			*err = -ENOMEM;
			goto err;
		}
1033
1034
1035
1036
1037
1038
1039
1040

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	t->size++;
	l = leaf_new();

1041
	if (!l) {
1042
1043
1044
		*err = -ENOMEM;
		goto err;
	}
1045
1046
1047
1048

	l->key = key;
	li = leaf_info_new(plen);

1049
	if (!li) {
1050
1051
1052
1053
		tnode_free((struct tnode *) l);
		*err = -ENOMEM;
		goto err;
	}
1054
1055
1056
1057
1058

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
Olof Johansson's avatar
Olof Johansson committed
1059
		/* Case 2: n is NULL, and will just insert a new leaf */
1060
1061
1062

		NODE_SET_PARENT(l, tp);

Olof Johansson's avatar
Olof Johansson committed
1063
1064
1065
1066
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1067
1068
		/*
		 *  Add a new tnode here
1069
1070
1071
1072
		 *  first tnode need some special handling
		 */

		if (tp)
Olof Johansson's avatar
Olof Johansson committed
1073
			pos = tp->pos+tp->bits;
1074
		else
Olof Johansson's avatar
Olof Johansson committed
1075
1076
			pos = 0;

1077
		if (n) {
1078
1079
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
Olof Johansson's avatar
Olof Johansson committed
1080
		} else {
1081
			newpos = 0;
1082
			tn = tnode_new(key, newpos, 1); /* First tnode */
1083
1084
		}

1085
		if (!tn) {
1086
1087
1088
1089
			free_leaf_info(li);
			tnode_free((struct tnode *) l);
			*err = -ENOMEM;
			goto err;
Olof Johansson's avatar
Olof Johansson committed
1090
1091
		}

1092
1093
		NODE_SET_PARENT(tn, tp);

Olof Johansson's avatar
Olof Johansson committed
1094
		missbit = tkey_extract_bits(key, newpos, 1);
1095
1096
1097
		put_child(t, tn, missbit, (struct node *)l);
		put_child(t, tn, 1-missbit, n);

1098
		if (tp) {
1099
1100
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
Olof Johansson's avatar
Olof Johansson committed
1101
		} else {
1102
			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1103
1104
1105
			tp = tn;
		}
	}
Olof Johansson's avatar
Olof Johansson committed
1106
1107

	if (tp && tp->pos + tp->bits > 32)
1108
		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1109
		       tp, tp->pos, tp->bits, key, plen);
Olof Johansson's avatar
Olof Johansson committed
1110

1111
	/* Rebalance the trie */
1112
1113

	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1114
1115
done:
	t->revision++;
Olof Johansson's avatar
Olof Johansson committed
1116
err:
1117
1118
1119
1120
1121
1122
1123
1124
1125
	return fa_head;
}

static int
fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
1126
	struct list_head *fa_head = NULL;
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
<