1/*
2 *   This program is free software; you can redistribute it and/or
3 *   modify it under the terms of the GNU General Public License
4 *   as published by the Free Software Foundation; either version
5 *   2 of the License, or (at your option) any later version.
6 *
7 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 *     & Swedish University of Agricultural Sciences.
9 *
10 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11 *     Agricultural Sciences.
12 *
13 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET		An implementation of the TCP/IP protocol suite for the LINUX
30 *		operating system.  INET is implemented using the  BSD Socket
31 *		interface as the means of communication with the user level.
32 *
33 *		IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 *		This program is free software; you can redistribute it and/or
39 *		modify it under the terms of the GNU General Public License
40 *		as published by the Free Software Foundation; either version
41 *		2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 *		David S. Miller, <davem@davemloft.net>
46 *		Stephen Hemminger <shemminger@osdl.org>
47 *		Paul E. McKenney <paulmck@us.ibm.com>
48 *		Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.409"
52
53#include <asm/uaccess.h>
54#include <linux/bitops.h>
55#include <linux/types.h>
56#include <linux/kernel.h>
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
64#include <linux/inetdevice.h>
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
68#include <linux/rcupdate.h>
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
73#include <linux/slab.h>
74#include <linux/export.h>
75#include <net/net_namespace.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include <net/switchdev.h>
83#include "fib_lookup.h"
84
85#define MAX_STAT_DEPTH 32
86
87#define KEYLENGTH	(8*sizeof(t_key))
88#define KEY_MAX		((t_key)~0)
89
90typedef unsigned int t_key;
91
92#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
93#define IS_TNODE(n)	((n)->bits)
94#define IS_LEAF(n)	(!(n)->bits)
95
96struct key_vector {
97	t_key key;
98	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
99	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
100	unsigned char slen;
101	union {
102		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
103		struct hlist_head leaf;
104		/* This array is valid if (pos | bits) > 0 (TNODE) */
105		struct key_vector __rcu *tnode[0];
106	};
107};
108
109struct tnode {
110	struct rcu_head rcu;
111	t_key empty_children;		/* KEYLENGTH bits needed */
112	t_key full_children;		/* KEYLENGTH bits needed */
113	struct key_vector __rcu *parent;
114	struct key_vector kv[1];
115#define tn_bits kv[0].bits
116};
117
118#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
119#define LEAF_SIZE	TNODE_SIZE(1)
120
121#ifdef CONFIG_IP_FIB_TRIE_STATS
122struct trie_use_stats {
123	unsigned int gets;
124	unsigned int backtrack;
125	unsigned int semantic_match_passed;
126	unsigned int semantic_match_miss;
127	unsigned int null_node_hit;
128	unsigned int resize_node_skipped;
129};
130#endif
131
132struct trie_stat {
133	unsigned int totdepth;
134	unsigned int maxdepth;
135	unsigned int tnodes;
136	unsigned int leaves;
137	unsigned int nullpointers;
138	unsigned int prefixes;
139	unsigned int nodesizes[MAX_STAT_DEPTH];
140};
141
142struct trie {
143	struct key_vector kv[1];
144#ifdef CONFIG_IP_FIB_TRIE_STATS
145	struct trie_use_stats __percpu *stats;
146#endif
147};
148
149static struct key_vector *resize(struct trie *t, struct key_vector *tn);
150static size_t tnode_free_size;
151
152/*
153 * synchronize_rcu after call_rcu for that many pages; it should be especially
154 * useful before resizing the root node with PREEMPT_NONE configs; the value was
155 * obtained experimentally, aiming to avoid visible slowdown.
156 */
157static const int sync_pages = 128;
158
159static struct kmem_cache *fn_alias_kmem __read_mostly;
160static struct kmem_cache *trie_leaf_kmem __read_mostly;
161
162static inline struct tnode *tn_info(struct key_vector *kv)
163{
164	return container_of(kv, struct tnode, kv[0]);
165}
166
167/* caller must hold RTNL */
168#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
169#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
170
171/* caller must hold RCU read lock or RTNL */
172#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
173#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
174
175/* wrapper for rcu_assign_pointer */
176static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
177{
178	if (n)
179		rcu_assign_pointer(tn_info(n)->parent, tp);
180}
181
182#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
183
184/* This provides us with the number of children in this node, in the case of a
185 * leaf this will return 0 meaning none of the children are accessible.
186 */
187static inline unsigned long child_length(const struct key_vector *tn)
188{
189	return (1ul << tn->bits) & ~(1ul);
190}
191
192#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
193
194static inline unsigned long get_index(t_key key, struct key_vector *kv)
195{
196	unsigned long index = key ^ kv->key;
197
198	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
199		return 0;
200
201	return index >> kv->pos;
202}
203
204/* To understand this stuff, an understanding of keys and all their bits is
205 * necessary. Every node in the trie has a key associated with it, but not
206 * all of the bits in that key are significant.
207 *
208 * Consider a node 'n' and its parent 'tp'.
209 *
210 * If n is a leaf, every bit in its key is significant. Its presence is
211 * necessitated by path compression, since during a tree traversal (when
212 * searching for a leaf - unless we are doing an insertion) we will completely
213 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
214 * a potentially successful search, that we have indeed been walking the
215 * correct key path.
216 *
217 * Note that we can never "miss" the correct key in the tree if present by
218 * following the wrong path. Path compression ensures that segments of the key
219 * that are the same for all keys with a given prefix are skipped, but the
220 * skipped part *is* identical for each node in the subtrie below the skipped
221 * bit! trie_insert() in this implementation takes care of that.
222 *
223 * if n is an internal node - a 'tnode' here, the various parts of its key
224 * have many different meanings.
225 *
226 * Example:
227 * _________________________________________________________________
228 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
229 * -----------------------------------------------------------------
230 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
231 *
232 * _________________________________________________________________
233 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
234 * -----------------------------------------------------------------
235 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
236 *
237 * tp->pos = 22
238 * tp->bits = 3
239 * n->pos = 13
240 * n->bits = 4
241 *
242 * First, let's just ignore the bits that come before the parent tp, that is
243 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
244 * point we do not use them for anything.
245 *
246 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
247 * index into the parent's child array. That is, they will be used to find
248 * 'n' among tp's children.
249 *
250 * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
251 * for the node n.
252 *
253 * All the bits we have seen so far are significant to the node n. The rest
254 * of the bits are really not needed or indeed known in n->key.
255 *
256 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
257 * n's child array, and will of course be different for each child.
258 *
259 * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
260 * at this point.
261 */
262
263static const int halve_threshold = 25;
264static const int inflate_threshold = 50;
265static const int halve_threshold_root = 15;
266static const int inflate_threshold_root = 30;
267
268static void __alias_free_mem(struct rcu_head *head)
269{
270	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
271	kmem_cache_free(fn_alias_kmem, fa);
272}
273
274static inline void alias_free_mem_rcu(struct fib_alias *fa)
275{
276	call_rcu(&fa->rcu, __alias_free_mem);
277}
278
279#define TNODE_KMALLOC_MAX \
280	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
281#define TNODE_VMALLOC_MAX \
282	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
283
284static void __node_free_rcu(struct rcu_head *head)
285{
286	struct tnode *n = container_of(head, struct tnode, rcu);
287
288	if (!n->tn_bits)
289		kmem_cache_free(trie_leaf_kmem, n);
290	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
291		kfree(n);
292	else
293		vfree(n);
294}
295
296#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
297
298static struct tnode *tnode_alloc(int bits)
299{
300	size_t size;
301
302	/* verify bits is within bounds */
303	if (bits > TNODE_VMALLOC_MAX)
304		return NULL;
305
306	/* determine size and verify it is non-zero and didn't overflow */
307	size = TNODE_SIZE(1ul << bits);
308
309	if (size <= PAGE_SIZE)
310		return kzalloc(size, GFP_KERNEL);
311	else
312		return vzalloc(size);
313}
314
315static inline void empty_child_inc(struct key_vector *n)
316{
317	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
318}
319
320static inline void empty_child_dec(struct key_vector *n)
321{
322	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
323}
324
325static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
326{
327	struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
328	struct key_vector *l = kv->kv;
329
330	if (!kv)
331		return NULL;
332
333	/* initialize key vector */
334	l->key = key;
335	l->pos = 0;
336	l->bits = 0;
337	l->slen = fa->fa_slen;
338
339	/* link leaf to fib alias */
340	INIT_HLIST_HEAD(&l->leaf);
341	hlist_add_head(&fa->fa_list, &l->leaf);
342
343	return l;
344}
345
346static struct key_vector *tnode_new(t_key key, int pos, int bits)
347{
348	struct tnode *tnode = tnode_alloc(bits);
349	unsigned int shift = pos + bits;
350	struct key_vector *tn = tnode->kv;
351
352	/* verify bits and pos their msb bits clear and values are valid */
353	BUG_ON(!bits || (shift > KEYLENGTH));
354
355	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
356		 sizeof(struct key_vector *) << bits);
357
358	if (!tnode)
359		return NULL;
360
361	if (bits == KEYLENGTH)
362		tnode->full_children = 1;
363	else
364		tnode->empty_children = 1ul << bits;
365
366	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
367	tn->pos = pos;
368	tn->bits = bits;
369	tn->slen = pos;
370
371	return tn;
372}
373
374/* Check whether a tnode 'n' is "full", i.e. it is an internal node
375 * and no bits are skipped. See discussion in dyntree paper p. 6
376 */
377static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
378{
379	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
380}
381
382/* Add a child at position i overwriting the old value.
383 * Update the value of full_children and empty_children.
384 */
385static void put_child(struct key_vector *tn, unsigned long i,
386		      struct key_vector *n)
387{
388	struct key_vector *chi = get_child(tn, i);
389	int isfull, wasfull;
390
391	BUG_ON(i >= child_length(tn));
392
393	/* update emptyChildren, overflow into fullChildren */
394	if (!n && chi)
395		empty_child_inc(tn);
396	if (n && !chi)
397		empty_child_dec(tn);
398
399	/* update fullChildren */
400	wasfull = tnode_full(tn, chi);
401	isfull = tnode_full(tn, n);
402
403	if (wasfull && !isfull)
404		tn_info(tn)->full_children--;
405	else if (!wasfull && isfull)
406		tn_info(tn)->full_children++;
407
408	if (n && (tn->slen < n->slen))
409		tn->slen = n->slen;
410
411	rcu_assign_pointer(tn->tnode[i], n);
412}
413
414static void update_children(struct key_vector *tn)
415{
416	unsigned long i;
417
418	/* update all of the child parent pointers */
419	for (i = child_length(tn); i;) {
420		struct key_vector *inode = get_child(tn, --i);
421
422		if (!inode)
423			continue;
424
425		/* Either update the children of a tnode that
426		 * already belongs to us or update the child
427		 * to point to ourselves.
428		 */
429		if (node_parent(inode) == tn)
430			update_children(inode);
431		else
432			node_set_parent(inode, tn);
433	}
434}
435
436static inline void put_child_root(struct key_vector *tp, t_key key,
437				  struct key_vector *n)
438{
439	if (IS_TRIE(tp))
440		rcu_assign_pointer(tp->tnode[0], n);
441	else
442		put_child(tp, get_index(key, tp), n);
443}
444
445static inline void tnode_free_init(struct key_vector *tn)
446{
447	tn_info(tn)->rcu.next = NULL;
448}
449
450static inline void tnode_free_append(struct key_vector *tn,
451				     struct key_vector *n)
452{
453	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
454	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
455}
456
457static void tnode_free(struct key_vector *tn)
458{
459	struct callback_head *head = &tn_info(tn)->rcu;
460
461	while (head) {
462		head = head->next;
463		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
464		node_free(tn);
465
466		tn = container_of(head, struct tnode, rcu)->kv;
467	}
468
469	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
470		tnode_free_size = 0;
471		synchronize_rcu();
472	}
473}
474
475static struct key_vector *replace(struct trie *t,
476				  struct key_vector *oldtnode,
477				  struct key_vector *tn)
478{
479	struct key_vector *tp = node_parent(oldtnode);
480	unsigned long i;
481
482	/* setup the parent pointer out of and back into this node */
483	NODE_INIT_PARENT(tn, tp);
484	put_child_root(tp, tn->key, tn);
485
486	/* update all of the child parent pointers */
487	update_children(tn);
488
489	/* all pointers should be clean so we are done */
490	tnode_free(oldtnode);
491
492	/* resize children now that oldtnode is freed */
493	for (i = child_length(tn); i;) {
494		struct key_vector *inode = get_child(tn, --i);
495
496		/* resize child node */
497		if (tnode_full(tn, inode))
498			tn = resize(t, inode);
499	}
500
501	return tp;
502}
503
504static struct key_vector *inflate(struct trie *t,
505				  struct key_vector *oldtnode)
506{
507	struct key_vector *tn;
508	unsigned long i;
509	t_key m;
510
511	pr_debug("In inflate\n");
512
513	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
514	if (!tn)
515		goto notnode;
516
517	/* prepare oldtnode to be freed */
518	tnode_free_init(oldtnode);
519
520	/* Assemble all of the pointers in our cluster, in this case that
521	 * represents all of the pointers out of our allocated nodes that
522	 * point to existing tnodes and the links between our allocated
523	 * nodes.
524	 */
525	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
526		struct key_vector *inode = get_child(oldtnode, --i);
527		struct key_vector *node0, *node1;
528		unsigned long j, k;
529
530		/* An empty child */
531		if (!inode)
532			continue;
533
534		/* A leaf or an internal node with skipped bits */
535		if (!tnode_full(oldtnode, inode)) {
536			put_child(tn, get_index(inode->key, tn), inode);
537			continue;
538		}
539
540		/* drop the node in the old tnode free list */
541		tnode_free_append(oldtnode, inode);
542
543		/* An internal node with two children */
544		if (inode->bits == 1) {
545			put_child(tn, 2 * i + 1, get_child(inode, 1));
546			put_child(tn, 2 * i, get_child(inode, 0));
547			continue;
548		}
549
550		/* We will replace this node 'inode' with two new
551		 * ones, 'node0' and 'node1', each with half of the
552		 * original children. The two new nodes will have
553		 * a position one bit further down the key and this
554		 * means that the "significant" part of their keys
555		 * (see the discussion near the top of this file)
556		 * will differ by one bit, which will be "0" in
557		 * node0's key and "1" in node1's key. Since we are
558		 * moving the key position by one step, the bit that
559		 * we are moving away from - the bit at position
560		 * (tn->pos) - is the one that will differ between
561		 * node0 and node1. So... we synthesize that bit in the
562		 * two new keys.
563		 */
564		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
565		if (!node1)
566			goto nomem;
567		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
568
569		tnode_free_append(tn, node1);
570		if (!node0)
571			goto nomem;
572		tnode_free_append(tn, node0);
573
574		/* populate child pointers in new nodes */
575		for (k = child_length(inode), j = k / 2; j;) {
576			put_child(node1, --j, get_child(inode, --k));
577			put_child(node0, j, get_child(inode, j));
578			put_child(node1, --j, get_child(inode, --k));
579			put_child(node0, j, get_child(inode, j));
580		}
581
582		/* link new nodes to parent */
583		NODE_INIT_PARENT(node1, tn);
584		NODE_INIT_PARENT(node0, tn);
585
586		/* link parent to nodes */
587		put_child(tn, 2 * i + 1, node1);
588		put_child(tn, 2 * i, node0);
589	}
590
591	/* setup the parent pointers into and out of this node */
592	return replace(t, oldtnode, tn);
593nomem:
594	/* all pointers should be clean so we are done */
595	tnode_free(tn);
596notnode:
597	return NULL;
598}
599
600static struct key_vector *halve(struct trie *t,
601				struct key_vector *oldtnode)
602{
603	struct key_vector *tn;
604	unsigned long i;
605
606	pr_debug("In halve\n");
607
608	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
609	if (!tn)
610		goto notnode;
611
612	/* prepare oldtnode to be freed */
613	tnode_free_init(oldtnode);
614
615	/* Assemble all of the pointers in our cluster, in this case that
616	 * represents all of the pointers out of our allocated nodes that
617	 * point to existing tnodes and the links between our allocated
618	 * nodes.
619	 */
620	for (i = child_length(oldtnode); i;) {
621		struct key_vector *node1 = get_child(oldtnode, --i);
622		struct key_vector *node0 = get_child(oldtnode, --i);
623		struct key_vector *inode;
624
625		/* At least one of the children is empty */
626		if (!node1 || !node0) {
627			put_child(tn, i / 2, node1 ? : node0);
628			continue;
629		}
630
631		/* Two nonempty children */
632		inode = tnode_new(node0->key, oldtnode->pos, 1);
633		if (!inode)
634			goto nomem;
635		tnode_free_append(tn, inode);
636
637		/* initialize pointers out of node */
638		put_child(inode, 1, node1);
639		put_child(inode, 0, node0);
640		NODE_INIT_PARENT(inode, tn);
641
642		/* link parent to node */
643		put_child(tn, i / 2, inode);
644	}
645
646	/* setup the parent pointers into and out of this node */
647	return replace(t, oldtnode, tn);
648nomem:
649	/* all pointers should be clean so we are done */
650	tnode_free(tn);
651notnode:
652	return NULL;
653}
654
655static struct key_vector *collapse(struct trie *t,
656				   struct key_vector *oldtnode)
657{
658	struct key_vector *n, *tp;
659	unsigned long i;
660
661	/* scan the tnode looking for that one child that might still exist */
662	for (n = NULL, i = child_length(oldtnode); !n && i;)
663		n = get_child(oldtnode, --i);
664
665	/* compress one level */
666	tp = node_parent(oldtnode);
667	put_child_root(tp, oldtnode->key, n);
668	node_set_parent(n, tp);
669
670	/* drop dead node */
671	node_free(oldtnode);
672
673	return tp;
674}
675
676static unsigned char update_suffix(struct key_vector *tn)
677{
678	unsigned char slen = tn->pos;
679	unsigned long stride, i;
680
681	/* search though the list of children looking for nodes that might
682	 * have a suffix greater than the one we currently have.  This is
683	 * why we start with a stride of 2 since a stride of 1 would
684	 * represent the nodes with suffix length equal to tn->pos
685	 */
686	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
687		struct key_vector *n = get_child(tn, i);
688
689		if (!n || (n->slen <= slen))
690			continue;
691
692		/* update stride and slen based on new value */
693		stride <<= (n->slen - slen);
694		slen = n->slen;
695		i &= ~(stride - 1);
696
697		/* if slen covers all but the last bit we can stop here
698		 * there will be nothing longer than that since only node
699		 * 0 and 1 << (bits - 1) could have that as their suffix
700		 * length.
701		 */
702		if ((slen + 1) >= (tn->pos + tn->bits))
703			break;
704	}
705
706	tn->slen = slen;
707
708	return slen;
709}
710
711/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
712 * the Helsinki University of Technology and Matti Tikkanen of Nokia
713 * Telecommunications, page 6:
714 * "A node is doubled if the ratio of non-empty children to all
715 * children in the *doubled* node is at least 'high'."
716 *
717 * 'high' in this instance is the variable 'inflate_threshold'. It
718 * is expressed as a percentage, so we multiply it with
719 * child_length() and instead of multiplying by 2 (since the
720 * child array will be doubled by inflate()) and multiplying
721 * the left-hand side by 100 (to handle the percentage thing) we
722 * multiply the left-hand side by 50.
723 *
724 * The left-hand side may look a bit weird: child_length(tn)
725 * - tn->empty_children is of course the number of non-null children
726 * in the current node. tn->full_children is the number of "full"
727 * children, that is non-null tnodes with a skip value of 0.
728 * All of those will be doubled in the resulting inflated tnode, so
729 * we just count them one extra time here.
730 *
731 * A clearer way to write this would be:
732 *
733 * to_be_doubled = tn->full_children;
734 * not_to_be_doubled = child_length(tn) - tn->empty_children -
735 *     tn->full_children;
736 *
737 * new_child_length = child_length(tn) * 2;
738 *
739 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
740 *      new_child_length;
741 * if (new_fill_factor >= inflate_threshold)
742 *
743 * ...and so on, tho it would mess up the while () loop.
744 *
745 * anyway,
746 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
747 *      inflate_threshold
748 *
749 * avoid a division:
750 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
751 *      inflate_threshold * new_child_length
752 *
753 * expand not_to_be_doubled and to_be_doubled, and shorten:
754 * 100 * (child_length(tn) - tn->empty_children +
755 *    tn->full_children) >= inflate_threshold * new_child_length
756 *
757 * expand new_child_length:
758 * 100 * (child_length(tn) - tn->empty_children +
759 *    tn->full_children) >=
760 *      inflate_threshold * child_length(tn) * 2
761 *
762 * shorten again:
763 * 50 * (tn->full_children + child_length(tn) -
764 *    tn->empty_children) >= inflate_threshold *
765 *    child_length(tn)
766 *
767 */
768static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
769{
770	unsigned long used = child_length(tn);
771	unsigned long threshold = used;
772
773	/* Keep root node larger */
774	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
775	used -= tn_info(tn)->empty_children;
776	used += tn_info(tn)->full_children;
777
778	/* if bits == KEYLENGTH then pos = 0, and will fail below */
779
780	return (used > 1) && tn->pos && ((50 * used) >= threshold);
781}
782
783static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
784{
785	unsigned long used = child_length(tn);
786	unsigned long threshold = used;
787
788	/* Keep root node larger */
789	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
790	used -= tn_info(tn)->empty_children;
791
792	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
793
794	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
795}
796
797static inline bool should_collapse(struct key_vector *tn)
798{
799	unsigned long used = child_length(tn);
800
801	used -= tn_info(tn)->empty_children;
802
803	/* account for bits == KEYLENGTH case */
804	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
805		used -= KEY_MAX;
806
807	/* One child or none, time to drop us from the trie */
808	return used < 2;
809}
810
811#define MAX_WORK 10
812static struct key_vector *resize(struct trie *t, struct key_vector *tn)
813{
814#ifdef CONFIG_IP_FIB_TRIE_STATS
815	struct trie_use_stats __percpu *stats = t->stats;
816#endif
817	struct key_vector *tp = node_parent(tn);
818	unsigned long cindex = get_index(tn->key, tp);
819	int max_work = MAX_WORK;
820
821	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
822		 tn, inflate_threshold, halve_threshold);
823
824	/* track the tnode via the pointer from the parent instead of
825	 * doing it ourselves.  This way we can let RCU fully do its
826	 * thing without us interfering
827	 */
828	BUG_ON(tn != get_child(tp, cindex));
829
830	/* Double as long as the resulting node has a number of
831	 * nonempty nodes that are above the threshold.
832	 */
833	while (should_inflate(tp, tn) && max_work) {
834		tp = inflate(t, tn);
835		if (!tp) {
836#ifdef CONFIG_IP_FIB_TRIE_STATS
837			this_cpu_inc(stats->resize_node_skipped);
838#endif
839			break;
840		}
841
842		max_work--;
843		tn = get_child(tp, cindex);
844	}
845
846	/* update parent in case inflate failed */
847	tp = node_parent(tn);
848
849	/* Return if at least one inflate is run */
850	if (max_work != MAX_WORK)
851		return tp;
852
853	/* Halve as long as the number of empty children in this
854	 * node is above threshold.
855	 */
856	while (should_halve(tp, tn) && max_work) {
857		tp = halve(t, tn);
858		if (!tp) {
859#ifdef CONFIG_IP_FIB_TRIE_STATS
860			this_cpu_inc(stats->resize_node_skipped);
861#endif
862			break;
863		}
864
865		max_work--;
866		tn = get_child(tp, cindex);
867	}
868
869	/* Only one child remains */
870	if (should_collapse(tn))
871		return collapse(t, tn);
872
873	/* update parent in case halve failed */
874	tp = node_parent(tn);
875
876	/* Return if at least one deflate was run */
877	if (max_work != MAX_WORK)
878		return tp;
879
880	/* push the suffix length to the parent node */
881	if (tn->slen > tn->pos) {
882		unsigned char slen = update_suffix(tn);
883
884		if (slen > tp->slen)
885			tp->slen = slen;
886	}
887
888	return tp;
889}
890
891static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
892{
893	while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
894		if (update_suffix(tp) > l->slen)
895			break;
896		tp = node_parent(tp);
897	}
898}
899
900static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
901{
902	/* if this is a new leaf then tn will be NULL and we can sort
903	 * out parent suffix lengths as a part of trie_rebalance
904	 */
905	while (tn->slen < l->slen) {
906		tn->slen = l->slen;
907		tn = node_parent(tn);
908	}
909}
910
911/* rcu_read_lock needs to be hold by caller from readside */
912static struct key_vector *fib_find_node(struct trie *t,
913					struct key_vector **tp, u32 key)
914{
915	struct key_vector *pn, *n = t->kv;
916	unsigned long index = 0;
917
918	do {
919		pn = n;
920		n = get_child_rcu(n, index);
921
922		if (!n)
923			break;
924
925		index = get_cindex(key, n);
926
927		/* This bit of code is a bit tricky but it combines multiple
928		 * checks into a single check.  The prefix consists of the
929		 * prefix plus zeros for the bits in the cindex. The index
930		 * is the difference between the key and this value.  From
931		 * this we can actually derive several pieces of data.
932		 *   if (index >= (1ul << bits))
933		 *     we have a mismatch in skip bits and failed
934		 *   else
935		 *     we know the value is cindex
936		 *
937		 * This check is safe even if bits == KEYLENGTH due to the
938		 * fact that we can only allocate a node with 32 bits if a
939		 * long is greater than 32 bits.
940		 */
941		if (index >= (1ul << n->bits)) {
942			n = NULL;
943			break;
944		}
945
946		/* keep searching until we find a perfect match leaf or NULL */
947	} while (IS_TNODE(n));
948
949	*tp = pn;
950
951	return n;
952}
953
954/* Return the first fib alias matching TOS with
955 * priority less than or equal to PRIO.
956 */
957static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
958					u8 tos, u32 prio, u32 tb_id)
959{
960	struct fib_alias *fa;
961
962	if (!fah)
963		return NULL;
964
965	hlist_for_each_entry(fa, fah, fa_list) {
966		if (fa->fa_slen < slen)
967			continue;
968		if (fa->fa_slen != slen)
969			break;
970		if (fa->tb_id > tb_id)
971			continue;
972		if (fa->tb_id != tb_id)
973			break;
974		if (fa->fa_tos > tos)
975			continue;
976		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
977			return fa;
978	}
979
980	return NULL;
981}
982
983static void trie_rebalance(struct trie *t, struct key_vector *tn)
984{
985	while (!IS_TRIE(tn))
986		tn = resize(t, tn);
987}
988
989static int fib_insert_node(struct trie *t, struct key_vector *tp,
990			   struct fib_alias *new, t_key key)
991{
992	struct key_vector *n, *l;
993
994	l = leaf_new(key, new);
995	if (!l)
996		goto noleaf;
997
998	/* retrieve child from parent node */
999	n = get_child(tp, get_index(key, tp));
1000
1001	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1002	 *
1003	 *  Add a new tnode here
1004	 *  first tnode need some special handling
1005	 *  leaves us in position for handling as case 3
1006	 */
1007	if (n) {
1008		struct key_vector *tn;
1009
1010		tn = tnode_new(key, __fls(key ^ n->key), 1);
1011		if (!tn)
1012			goto notnode;
1013
1014		/* initialize routes out of node */
1015		NODE_INIT_PARENT(tn, tp);
1016		put_child(tn, get_index(key, tn) ^ 1, n);
1017
1018		/* start adding routes into the node */
1019		put_child_root(tp, key, tn);
1020		node_set_parent(n, tn);
1021
1022		/* parent now has a NULL spot where the leaf can go */
1023		tp = tn;
1024	}
1025
1026	/* Case 3: n is NULL, and will just insert a new leaf */
1027	NODE_INIT_PARENT(l, tp);
1028	put_child_root(tp, key, l);
1029	trie_rebalance(t, tp);
1030
1031	return 0;
1032notnode:
1033	node_free(l);
1034noleaf:
1035	return -ENOMEM;
1036}
1037
1038static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1039			    struct key_vector *l, struct fib_alias *new,
1040			    struct fib_alias *fa, t_key key)
1041{
1042	if (!l)
1043		return fib_insert_node(t, tp, new, key);
1044
1045	if (fa) {
1046		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
1047	} else {
1048		struct fib_alias *last;
1049
1050		hlist_for_each_entry(last, &l->leaf, fa_list) {
1051			if (new->fa_slen < last->fa_slen)
1052				break;
1053			if ((new->fa_slen == last->fa_slen) &&
1054			    (new->tb_id > last->tb_id))
1055				break;
1056			fa = last;
1057		}
1058
1059		if (fa)
1060			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
1061		else
1062			hlist_add_head_rcu(&new->fa_list, &l->leaf);
1063	}
1064
1065	/* if we added to the tail node then we need to update slen */
1066	if (l->slen < new->fa_slen) {
1067		l->slen = new->fa_slen;
1068		leaf_push_suffix(tp, l);
1069	}
1070
1071	return 0;
1072}
1073
1074/* Caller must hold RTNL. */
1075int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1076{
1077	struct trie *t = (struct trie *)tb->tb_data;
1078	struct fib_alias *fa, *new_fa;
1079	struct key_vector *l, *tp;
1080	struct fib_info *fi;
1081	u8 plen = cfg->fc_dst_len;
1082	u8 slen = KEYLENGTH - plen;
1083	u8 tos = cfg->fc_tos;
1084	u32 key;
1085	int err;
1086
1087	if (plen > KEYLENGTH)
1088		return -EINVAL;
1089
1090	key = ntohl(cfg->fc_dst);
1091
1092	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1093
1094	if ((plen < KEYLENGTH) && (key << plen))
1095		return -EINVAL;
1096
1097	fi = fib_create_info(cfg);
1098	if (IS_ERR(fi)) {
1099		err = PTR_ERR(fi);
1100		goto err;
1101	}
1102
1103	l = fib_find_node(t, &tp, key);
1104	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
1105				tb->tb_id) : NULL;
1106
1107	/* Now fa, if non-NULL, points to the first fib alias
1108	 * with the same keys [prefix,tos,priority], if such key already
1109	 * exists or to the node before which we will insert new one.
1110	 *
1111	 * If fa is NULL, we will need to allocate a new one and
1112	 * insert to the tail of the section matching the suffix length
1113	 * of the new alias.
1114	 */
1115
1116	if (fa && fa->fa_tos == tos &&
1117	    fa->fa_info->fib_priority == fi->fib_priority) {
1118		struct fib_alias *fa_first, *fa_match;
1119
1120		err = -EEXIST;
1121		if (cfg->fc_nlflags & NLM_F_EXCL)
1122			goto out;
1123
1124		/* We have 2 goals:
1125		 * 1. Find exact match for type, scope, fib_info to avoid
1126		 * duplicate routes
1127		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1128		 */
1129		fa_match = NULL;
1130		fa_first = fa;
1131		hlist_for_each_entry_from(fa, fa_list) {
1132			if ((fa->fa_slen != slen) ||
1133			    (fa->tb_id != tb->tb_id) ||
1134			    (fa->fa_tos != tos))
1135				break;
1136			if (fa->fa_info->fib_priority != fi->fib_priority)
1137				break;
1138			if (fa->fa_type == cfg->fc_type &&
1139			    fa->fa_info == fi) {
1140				fa_match = fa;
1141				break;
1142			}
1143		}
1144
1145		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1146			struct fib_info *fi_drop;
1147			u8 state;
1148
1149			fa = fa_first;
1150			if (fa_match) {
1151				if (fa == fa_match)
1152					err = 0;
1153				goto out;
1154			}
1155			err = -ENOBUFS;
1156			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1157			if (!new_fa)
1158				goto out;
1159
1160			fi_drop = fa->fa_info;
1161			new_fa->fa_tos = fa->fa_tos;
1162			new_fa->fa_info = fi;
1163			new_fa->fa_type = cfg->fc_type;
1164			state = fa->fa_state;
1165			new_fa->fa_state = state & ~FA_S_ACCESSED;
1166			new_fa->fa_slen = fa->fa_slen;
1167			new_fa->tb_id = tb->tb_id;
1168
1169			err = netdev_switch_fib_ipv4_add(key, plen, fi,
1170							 new_fa->fa_tos,
1171							 cfg->fc_type,
1172							 cfg->fc_nlflags,
1173							 tb->tb_id);
1174			if (err) {
1175				netdev_switch_fib_ipv4_abort(fi);
1176				kmem_cache_free(fn_alias_kmem, new_fa);
1177				goto out;
1178			}
1179
1180			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1181
1182			alias_free_mem_rcu(fa);
1183
1184			fib_release_info(fi_drop);
1185			if (state & FA_S_ACCESSED)
1186				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1187			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1188				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1189
1190			goto succeeded;
1191		}
1192		/* Error if we find a perfect match which
1193		 * uses the same scope, type, and nexthop
1194		 * information.
1195		 */
1196		if (fa_match)
1197			goto out;
1198
1199		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1200			fa = fa_first;
1201	}
1202	err = -ENOENT;
1203	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1204		goto out;
1205
1206	err = -ENOBUFS;
1207	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1208	if (!new_fa)
1209		goto out;
1210
1211	new_fa->fa_info = fi;
1212	new_fa->fa_tos = tos;
1213	new_fa->fa_type = cfg->fc_type;
1214	new_fa->fa_state = 0;
1215	new_fa->fa_slen = slen;
1216	new_fa->tb_id = tb->tb_id;
1217
1218	/* (Optionally) offload fib entry to switch hardware. */
1219	err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
1220					 cfg->fc_type,
1221					 cfg->fc_nlflags,
1222					 tb->tb_id);
1223	if (err) {
1224		netdev_switch_fib_ipv4_abort(fi);
1225		goto out_free_new_fa;
1226	}
1227
1228	/* Insert new entry to the list. */
1229	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
1230	if (err)
1231		goto out_sw_fib_del;
1232
1233	if (!plen)
1234		tb->tb_num_default++;
1235
1236	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1237	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1238		  &cfg->fc_nlinfo, 0);
1239succeeded:
1240	return 0;
1241
1242out_sw_fib_del:
1243	netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
1244out_free_new_fa:
1245	kmem_cache_free(fn_alias_kmem, new_fa);
1246out:
1247	fib_release_info(fi);
1248err:
1249	return err;
1250}
1251
1252static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
1253{
1254	t_key prefix = n->key;
1255
1256	return (key ^ prefix) & (prefix | -prefix);
1257}
1258
1259/* should be called with rcu_read_lock */
1260int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1261		     struct fib_result *res, int fib_flags)
1262{
1263	struct trie *t = (struct trie *) tb->tb_data;
1264#ifdef CONFIG_IP_FIB_TRIE_STATS
1265	struct trie_use_stats __percpu *stats = t->stats;
1266#endif
1267	const t_key key = ntohl(flp->daddr);
1268	struct key_vector *n, *pn;
1269	struct fib_alias *fa;
1270	unsigned long index;
1271	t_key cindex;
1272
1273	pn = t->kv;
1274	cindex = 0;
1275
1276	n = get_child_rcu(pn, cindex);
1277	if (!n)
1278		return -EAGAIN;
1279
1280#ifdef CONFIG_IP_FIB_TRIE_STATS
1281	this_cpu_inc(stats->gets);
1282#endif
1283
1284	/* Step 1: Travel to the longest prefix match in the trie */
1285	for (;;) {
1286		index = get_cindex(key, n);
1287
1288		/* This bit of code is a bit tricky but it combines multiple
1289		 * checks into a single check.  The prefix consists of the
1290		 * prefix plus zeros for the "bits" in the prefix. The index
1291		 * is the difference between the key and this value.  From
1292		 * this we can actually derive several pieces of data.
1293		 *   if (index >= (1ul << bits))
1294		 *     we have a mismatch in skip bits and failed
1295		 *   else
1296		 *     we know the value is cindex
1297		 *
1298		 * This check is safe even if bits == KEYLENGTH due to the
1299		 * fact that we can only allocate a node with 32 bits if a
1300		 * long is greater than 32 bits.
1301		 */
1302		if (index >= (1ul << n->bits))
1303			break;
1304
1305		/* we have found a leaf. Prefixes have already been compared */
1306		if (IS_LEAF(n))
1307			goto found;
1308
1309		/* only record pn and cindex if we are going to be chopping
1310		 * bits later.  Otherwise we are just wasting cycles.
1311		 */
1312		if (n->slen > n->pos) {
1313			pn = n;
1314			cindex = index;
1315		}
1316
1317		n = get_child_rcu(n, index);
1318		if (unlikely(!n))
1319			goto backtrace;
1320	}
1321
1322	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
1323	for (;;) {
1324		/* record the pointer where our next node pointer is stored */
1325		struct key_vector __rcu **cptr = n->tnode;
1326
1327		/* This test verifies that none of the bits that differ
1328		 * between the key and the prefix exist in the region of
1329		 * the lsb and higher in the prefix.
1330		 */
1331		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1332			goto backtrace;
1333
1334		/* exit out and process leaf */
1335		if (unlikely(IS_LEAF(n)))
1336			break;
1337
1338		/* Don't bother recording parent info.  Since we are in
1339		 * prefix match mode we will have to come back to wherever
1340		 * we started this traversal anyway
1341		 */
1342
1343		while ((n = rcu_dereference(*cptr)) == NULL) {
1344backtrace:
1345#ifdef CONFIG_IP_FIB_TRIE_STATS
1346			if (!n)
1347				this_cpu_inc(stats->null_node_hit);
1348#endif
1349			/* If we are at cindex 0 there are no more bits for
1350			 * us to strip at this level so we must ascend back
1351			 * up one level to see if there are any more bits to
1352			 * be stripped there.
1353			 */
1354			while (!cindex) {
1355				t_key pkey = pn->key;
1356
1357				/* If we don't have a parent then there is
1358				 * nothing for us to do as we do not have any
1359				 * further nodes to parse.
1360				 */
1361				if (IS_TRIE(pn))
1362					return -EAGAIN;
1363#ifdef CONFIG_IP_FIB_TRIE_STATS
1364				this_cpu_inc(stats->backtrack);
1365#endif
1366				/* Get Child's index */
1367				pn = node_parent_rcu(pn);
1368				cindex = get_index(pkey, pn);
1369			}
1370
1371			/* strip the least significant bit from the cindex */
1372			cindex &= cindex - 1;
1373
1374			/* grab pointer for next child node */
1375			cptr = &pn->tnode[cindex];
1376		}
1377	}
1378
1379found:
1380	/* this line carries forward the xor from earlier in the function */
1381	index = key ^ n->key;
1382
1383	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1384	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
1385		struct fib_info *fi = fa->fa_info;
1386		int nhsel, err;
1387
1388		if ((index >= (1ul << fa->fa_slen)) &&
1389		    ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
1390			continue;
1391		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1392			continue;
1393		if (fi->fib_dead)
1394			continue;
1395		if (fa->fa_info->fib_scope < flp->flowi4_scope)
1396			continue;
1397		fib_alias_accessed(fa);
1398		err = fib_props[fa->fa_type].error;
1399		if (unlikely(err < 0)) {
1400#ifdef CONFIG_IP_FIB_TRIE_STATS
1401			this_cpu_inc(stats->semantic_match_passed);
1402#endif
1403			return err;
1404		}
1405		if (fi->fib_flags & RTNH_F_DEAD)
1406			continue;
1407		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1408			const struct fib_nh *nh = &fi->fib_nh[nhsel];
1409
1410			if (nh->nh_flags & RTNH_F_DEAD)
1411				continue;
1412			if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1413				continue;
1414
1415			if (!(fib_flags & FIB_LOOKUP_NOREF))
1416				atomic_inc(&fi->fib_clntref);
1417
1418			res->prefixlen = KEYLENGTH - fa->fa_slen;
1419			res->nh_sel = nhsel;
1420			res->type = fa->fa_type;
1421			res->scope = fi->fib_scope;
1422			res->fi = fi;
1423			res->table = tb;
1424			res->fa_head = &n->leaf;
1425#ifdef CONFIG_IP_FIB_TRIE_STATS
1426			this_cpu_inc(stats->semantic_match_passed);
1427#endif
1428			return err;
1429		}
1430	}
1431#ifdef CONFIG_IP_FIB_TRIE_STATS
1432	this_cpu_inc(stats->semantic_match_miss);
1433#endif
1434	goto backtrace;
1435}
1436EXPORT_SYMBOL_GPL(fib_table_lookup);
1437
1438static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1439			     struct key_vector *l, struct fib_alias *old)
1440{
1441	/* record the location of the previous list_info entry */
1442	struct hlist_node **pprev = old->fa_list.pprev;
1443	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
1444
1445	/* remove the fib_alias from the list */
1446	hlist_del_rcu(&old->fa_list);
1447
1448	/* if we emptied the list this leaf will be freed and we can sort
1449	 * out parent suffix lengths as a part of trie_rebalance
1450	 */
1451	if (hlist_empty(&l->leaf)) {
1452		put_child_root(tp, l->key, NULL);
1453		node_free(l);
1454		trie_rebalance(t, tp);
1455		return;
1456	}
1457
1458	/* only access fa if it is pointing at the last valid hlist_node */
1459	if (*pprev)
1460		return;
1461
1462	/* update the trie with the latest suffix length */
1463	l->slen = fa->fa_slen;
1464	leaf_pull_suffix(tp, l);
1465}
1466
1467/* Caller must hold RTNL. */
1468int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1469{
1470	struct trie *t = (struct trie *) tb->tb_data;
1471	struct fib_alias *fa, *fa_to_delete;
1472	struct key_vector *l, *tp;
1473	u8 plen = cfg->fc_dst_len;
1474	u8 slen = KEYLENGTH - plen;
1475	u8 tos = cfg->fc_tos;
1476	u32 key;
1477
1478	if (plen > KEYLENGTH)
1479		return -EINVAL;
1480
1481	key = ntohl(cfg->fc_dst);
1482
1483	if ((plen < KEYLENGTH) && (key << plen))
1484		return -EINVAL;
1485
1486	l = fib_find_node(t, &tp, key);
1487	if (!l)
1488		return -ESRCH;
1489
1490	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
1491	if (!fa)
1492		return -ESRCH;
1493
1494	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1495
1496	fa_to_delete = NULL;
1497	hlist_for_each_entry_from(fa, fa_list) {
1498		struct fib_info *fi = fa->fa_info;
1499
1500		if ((fa->fa_slen != slen) ||
1501		    (fa->tb_id != tb->tb_id) ||
1502		    (fa->fa_tos != tos))
1503			break;
1504
1505		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1506		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1507		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1508		    (!cfg->fc_prefsrc ||
1509		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1510		    (!cfg->fc_protocol ||
1511		     fi->fib_protocol == cfg->fc_protocol) &&
1512		    fib_nh_match(cfg, fi) == 0) {
1513			fa_to_delete = fa;
1514			break;
1515		}
1516	}
1517
1518	if (!fa_to_delete)
1519		return -ESRCH;
1520
1521	netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
1522				   cfg->fc_type, tb->tb_id);
1523
1524	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1525		  &cfg->fc_nlinfo, 0);
1526
1527	if (!plen)
1528		tb->tb_num_default--;
1529
1530	fib_remove_alias(t, tp, l, fa_to_delete);
1531
1532	if (fa_to_delete->fa_state & FA_S_ACCESSED)
1533		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1534
1535	fib_release_info(fa_to_delete->fa_info);
1536	alias_free_mem_rcu(fa_to_delete);
1537	return 0;
1538}
1539
1540/* Scan for the next leaf starting at the provided key value */
1541static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
1542{
1543	struct key_vector *pn, *n = *tn;
1544	unsigned long cindex;
1545
1546	/* this loop is meant to try and find the key in the trie */
1547	do {
1548		/* record parent and next child index */
1549		pn = n;
1550		cindex = (key > pn->key) ? get_index(key, pn) : 0;
1551
1552		if (cindex >> pn->bits)
1553			break;
1554
1555		/* descend into the next child */
1556		n = get_child_rcu(pn, cindex++);
1557		if (!n)
1558			break;
1559
1560		/* guarantee forward progress on the keys */
1561		if (IS_LEAF(n) && (n->key >= key))
1562			goto found;
1563	} while (IS_TNODE(n));
1564
1565	/* this loop will search for the next leaf with a greater key */
1566	while (!IS_TRIE(pn)) {
1567		/* if we exhausted the parent node we will need to climb */
1568		if (cindex >= (1ul << pn->bits)) {
1569			t_key pkey = pn->key;
1570
1571			pn = node_parent_rcu(pn);
1572			cindex = get_index(pkey, pn) + 1;
1573			continue;
1574		}
1575
1576		/* grab the next available node */
1577		n = get_child_rcu(pn, cindex++);
1578		if (!n)
1579			continue;
1580
1581		/* no need to compare keys since we bumped the index */
1582		if (IS_LEAF(n))
1583			goto found;
1584
1585		/* Rescan start scanning in new node */
1586		pn = n;
1587		cindex = 0;
1588	}
1589
1590	*tn = pn;
1591	return NULL; /* Root of trie */
1592found:
1593	/* if we are at the limit for keys just return NULL for the tnode */
1594	*tn = pn;
1595	return n;
1596}
1597
1598static void fib_trie_free(struct fib_table *tb)
1599{
1600	struct trie *t = (struct trie *)tb->tb_data;
1601	struct key_vector *pn = t->kv;
1602	unsigned long cindex = 1;
1603	struct hlist_node *tmp;
1604	struct fib_alias *fa;
1605
1606	/* walk trie in reverse order and free everything */
1607	for (;;) {
1608		struct key_vector *n;
1609
1610		if (!(cindex--)) {
1611			t_key pkey = pn->key;
1612
1613			if (IS_TRIE(pn))
1614				break;
1615
1616			n = pn;
1617			pn = node_parent(pn);
1618
1619			/* drop emptied tnode */
1620			put_child_root(pn, n->key, NULL);
1621			node_free(n);
1622
1623			cindex = get_index(pkey, pn);
1624
1625			continue;
1626		}
1627
1628		/* grab the next available node */
1629		n = get_child(pn, cindex);
1630		if (!n)
1631			continue;
1632
1633		if (IS_TNODE(n)) {
1634			/* record pn and cindex for leaf walking */
1635			pn = n;
1636			cindex = 1ul << n->bits;
1637
1638			continue;
1639		}
1640
1641		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1642			hlist_del_rcu(&fa->fa_list);
1643			alias_free_mem_rcu(fa);
1644		}
1645
1646		put_child_root(pn, n->key, NULL);
1647		node_free(n);
1648	}
1649
1650#ifdef CONFIG_IP_FIB_TRIE_STATS
1651	free_percpu(t->stats);
1652#endif
1653	kfree(tb);
1654}
1655
1656struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
1657{
1658	struct trie *ot = (struct trie *)oldtb->tb_data;
1659	struct key_vector *l, *tp = ot->kv;
1660	struct fib_table *local_tb;
1661	struct fib_alias *fa;
1662	struct trie *lt;
1663	t_key key = 0;
1664
1665	if (oldtb->tb_data == oldtb->__data)
1666		return oldtb;
1667
1668	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
1669	if (!local_tb)
1670		return NULL;
1671
1672	lt = (struct trie *)local_tb->tb_data;
1673
1674	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1675		struct key_vector *local_l = NULL, *local_tp;
1676
1677		hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1678			struct fib_alias *new_fa;
1679
1680			if (local_tb->tb_id != fa->tb_id)
1681				continue;
1682
1683			/* clone fa for new local table */
1684			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1685			if (!new_fa)
1686				goto out;
1687
1688			memcpy(new_fa, fa, sizeof(*fa));
1689
1690			/* insert clone into table */
1691			if (!local_l)
1692				local_l = fib_find_node(lt, &local_tp, l->key);
1693
1694			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
1695					     NULL, l->key))
1696				goto out;
1697		}
1698
1699		/* stop loop if key wrapped back to 0 */
1700		key = l->key + 1;
1701		if (key < l->key)
1702			break;
1703	}
1704
1705	return local_tb;
1706out:
1707	fib_trie_free(local_tb);
1708
1709	return NULL;
1710}
1711
1712/* Caller must hold RTNL */
1713void fib_table_flush_external(struct fib_table *tb)
1714{
1715	struct trie *t = (struct trie *)tb->tb_data;
1716	struct key_vector *pn = t->kv;
1717	unsigned long cindex = 1;
1718	struct hlist_node *tmp;
1719	struct fib_alias *fa;
1720
1721	/* walk trie in reverse order */
1722	for (;;) {
1723		unsigned char slen = 0;
1724		struct key_vector *n;
1725
1726		if (!(cindex--)) {
1727			t_key pkey = pn->key;
1728
1729			/* cannot resize the trie vector */
1730			if (IS_TRIE(pn))
1731				break;
1732
1733			/* resize completed node */
1734			pn = resize(t, pn);
1735			cindex = get_index(pkey, pn);
1736
1737			continue;
1738		}
1739
1740		/* grab the next available node */
1741		n = get_child(pn, cindex);
1742		if (!n)
1743			continue;
1744
1745		if (IS_TNODE(n)) {
1746			/* record pn and cindex for leaf walking */
1747			pn = n;
1748			cindex = 1ul << n->bits;
1749
1750			continue;
1751		}
1752
1753		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1754			struct fib_info *fi = fa->fa_info;
1755
1756			/* if alias was cloned to local then we just
1757			 * need to remove the local copy from main
1758			 */
1759			if (tb->tb_id != fa->tb_id) {
1760				hlist_del_rcu(&fa->fa_list);
1761				alias_free_mem_rcu(fa);
1762				continue;
1763			}
1764
1765			/* record local slen */
1766			slen = fa->fa_slen;
1767
1768			if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
1769				continue;
1770
1771			netdev_switch_fib_ipv4_del(n->key,
1772						   KEYLENGTH - fa->fa_slen,
1773						   fi, fa->fa_tos,
1774						   fa->fa_type, tb->tb_id);
1775		}
1776
1777		/* update leaf slen */
1778		n->slen = slen;
1779
1780		if (hlist_empty(&n->leaf)) {
1781			put_child_root(pn, n->key, NULL);
1782			node_free(n);
1783		}
1784	}
1785}
1786
1787/* Caller must hold RTNL. */
1788int fib_table_flush(struct fib_table *tb)
1789{
1790	struct trie *t = (struct trie *)tb->tb_data;
1791	struct key_vector *pn = t->kv;
1792	unsigned long cindex = 1;
1793	struct hlist_node *tmp;
1794	struct fib_alias *fa;
1795	int found = 0;
1796
1797	/* walk trie in reverse order */
1798	for (;;) {
1799		unsigned char slen = 0;
1800		struct key_vector *n;
1801
1802		if (!(cindex--)) {
1803			t_key pkey = pn->key;
1804
1805			/* cannot resize the trie vector */
1806			if (IS_TRIE(pn))
1807				break;
1808
1809			/* resize completed node */
1810			pn = resize(t, pn);
1811			cindex = get_index(pkey, pn);
1812
1813			continue;
1814		}
1815
1816		/* grab the next available node */
1817		n = get_child(pn, cindex);
1818		if (!n)
1819			continue;
1820
1821		if (IS_TNODE(n)) {
1822			/* record pn and cindex for leaf walking */
1823			pn = n;
1824			cindex = 1ul << n->bits;
1825
1826			continue;
1827		}
1828
1829		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1830			struct fib_info *fi = fa->fa_info;
1831
1832			if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
1833				slen = fa->fa_slen;
1834				continue;
1835			}
1836
1837			netdev_switch_fib_ipv4_del(n->key,
1838						   KEYLENGTH - fa->fa_slen,
1839						   fi, fa->fa_tos,
1840						   fa->fa_type, tb->tb_id);
1841			hlist_del_rcu(&fa->fa_list);
1842			fib_release_info(fa->fa_info);
1843			alias_free_mem_rcu(fa);
1844			found++;
1845		}
1846
1847		/* update leaf slen */
1848		n->slen = slen;
1849
1850		if (hlist_empty(&n->leaf)) {
1851			put_child_root(pn, n->key, NULL);
1852			node_free(n);
1853		}
1854	}
1855
1856	pr_debug("trie_flush found=%d\n", found);
1857	return found;
1858}
1859
1860static void __trie_free_rcu(struct rcu_head *head)
1861{
1862	struct fib_table *tb = container_of(head, struct fib_table, rcu);
1863#ifdef CONFIG_IP_FIB_TRIE_STATS
1864	struct trie *t = (struct trie *)tb->tb_data;
1865
1866	if (tb->tb_data == tb->__data)
1867		free_percpu(t->stats);
1868#endif /* CONFIG_IP_FIB_TRIE_STATS */
1869	kfree(tb);
1870}
1871
1872void fib_free_table(struct fib_table *tb)
1873{
1874	call_rcu(&tb->rcu, __trie_free_rcu);
1875}
1876
1877static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
1878			     struct sk_buff *skb, struct netlink_callback *cb)
1879{
1880	__be32 xkey = htonl(l->key);
1881	struct fib_alias *fa;
1882	int i, s_i;
1883
1884	s_i = cb->args[4];
1885	i = 0;
1886
1887	/* rcu_read_lock is hold by caller */
1888	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1889		if (i < s_i) {
1890			i++;
1891			continue;
1892		}
1893
1894		if (tb->tb_id != fa->tb_id) {
1895			i++;
1896			continue;
1897		}
1898
1899		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1900				  cb->nlh->nlmsg_seq,
1901				  RTM_NEWROUTE,
1902				  tb->tb_id,
1903				  fa->fa_type,
1904				  xkey,
1905				  KEYLENGTH - fa->fa_slen,
1906				  fa->fa_tos,
1907				  fa->fa_info, NLM_F_MULTI) < 0) {
1908			cb->args[4] = i;
1909			return -1;
1910		}
1911		i++;
1912	}
1913
1914	cb->args[4] = i;
1915	return skb->len;
1916}
1917
1918/* rcu_read_lock needs to be hold by caller from readside */
1919int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1920		   struct netlink_callback *cb)
1921{
1922	struct trie *t = (struct trie *)tb->tb_data;
1923	struct key_vector *l, *tp = t->kv;
1924	/* Dump starting at last key.
1925	 * Note: 0.0.0.0/0 (ie default) is first key.
1926	 */
1927	int count = cb->args[2];
1928	t_key key = cb->args[3];
1929
1930	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1931		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1932			cb->args[3] = key;
1933			cb->args[2] = count;
1934			return -1;
1935		}
1936
1937		++count;
1938		key = l->key + 1;
1939
1940		memset(&cb->args[4], 0,
1941		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1942
1943		/* stop loop if key wrapped back to 0 */
1944		if (key < l->key)
1945			break;
1946	}
1947
1948	cb->args[3] = key;
1949	cb->args[2] = count;
1950
1951	return skb->len;
1952}
1953
1954void __init fib_trie_init(void)
1955{
1956	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957					  sizeof(struct fib_alias),
1958					  0, SLAB_PANIC, NULL);
1959
1960	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1961					   LEAF_SIZE,
1962					   0, SLAB_PANIC, NULL);
1963}
1964
1965struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
1966{
1967	struct fib_table *tb;
1968	struct trie *t;
1969	size_t sz = sizeof(*tb);
1970
1971	if (!alias)
1972		sz += sizeof(struct trie);
1973
1974	tb = kzalloc(sz, GFP_KERNEL);
1975	if (!tb)
1976		return NULL;
1977
1978	tb->tb_id = id;
1979	tb->tb_default = -1;
1980	tb->tb_num_default = 0;
1981	tb->tb_data = (alias ? alias->__data : tb->__data);
1982
1983	if (alias)
1984		return tb;
1985
1986	t = (struct trie *) tb->tb_data;
1987	t->kv[0].pos = KEYLENGTH;
1988	t->kv[0].slen = KEYLENGTH;
1989#ifdef CONFIG_IP_FIB_TRIE_STATS
1990	t->stats = alloc_percpu(struct trie_use_stats);
1991	if (!t->stats) {
1992		kfree(tb);
1993		tb = NULL;
1994	}
1995#endif
1996
1997	return tb;
1998}
1999
2000#ifdef CONFIG_PROC_FS
2001/* Depth first Trie walk iterator */
2002struct fib_trie_iter {
2003	struct seq_net_private p;
2004	struct fib_table *tb;
2005	struct key_vector *tnode;
2006	unsigned int index;
2007	unsigned int depth;
2008};
2009
2010static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
2011{
2012	unsigned long cindex = iter->index;
2013	struct key_vector *pn = iter->tnode;
2014	t_key pkey;
2015
2016	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2017		 iter->tnode, iter->index, iter->depth);
2018
2019	while (!IS_TRIE(pn)) {
2020		while (cindex < child_length(pn)) {
2021			struct key_vector *n = get_child_rcu(pn, cindex++);
2022
2023			if (!n)
2024				continue;
2025
2026			if (IS_LEAF(n)) {
2027				iter->tnode = pn;
2028				iter->index = cindex;
2029			} else {
2030				/* push down one level */
2031				iter->tnode = n;
2032				iter->index = 0;
2033				++iter->depth;
2034			}
2035
2036			return n;
2037		}
2038
2039		/* Current node exhausted, pop back up */
2040		pkey = pn->key;
2041		pn = node_parent_rcu(pn);
2042		cindex = get_index(pkey, pn) + 1;
2043		--iter->depth;
2044	}
2045
2046	/* record root node so further searches know we are done */
2047	iter->tnode = pn;
2048	iter->index = 0;
2049
2050	return NULL;
2051}
2052
2053static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
2054					     struct trie *t)
2055{
2056	struct key_vector *n, *pn = t->kv;
2057
2058	if (!t)
2059		return NULL;
2060
2061	n = rcu_dereference(pn->tnode[0]);
2062	if (!n)
2063		return NULL;
2064
2065	if (IS_TNODE(n)) {
2066		iter->tnode = n;
2067		iter->index = 0;
2068		iter->depth = 1;
2069	} else {
2070		iter->tnode = pn;
2071		iter->index = 0;
2072		iter->depth = 0;
2073	}
2074
2075	return n;
2076}
2077
2078static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2079{
2080	struct key_vector *n;
2081	struct fib_trie_iter iter;
2082
2083	memset(s, 0, sizeof(*s));
2084
2085	rcu_read_lock();
2086	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2087		if (IS_LEAF(n)) {
2088			struct fib_alias *fa;
2089
2090			s->leaves++;
2091			s->totdepth += iter.depth;
2092			if (iter.depth > s->maxdepth)
2093				s->maxdepth = iter.depth;
2094
2095			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
2096				++s->prefixes;
2097		} else {
2098			s->tnodes++;
2099			if (n->bits < MAX_STAT_DEPTH)
2100				s->nodesizes[n->bits]++;
2101			s->nullpointers += tn_info(n)->empty_children;
2102		}
2103	}
2104	rcu_read_unlock();
2105}
2106
2107/*
2108 *	This outputs /proc/net/fib_triestats
2109 */
2110static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2111{
2112	unsigned int i, max, pointers, bytes, avdepth;
2113
2114	if (stat->leaves)
2115		avdepth = stat->totdepth*100 / stat->leaves;
2116	else
2117		avdepth = 0;
2118
2119	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2120		   avdepth / 100, avdepth % 100);
2121	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2122
2123	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2124	bytes = LEAF_SIZE * stat->leaves;
2125
2126	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2127	bytes += sizeof(struct fib_alias) * stat->prefixes;
2128
2129	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2130	bytes += TNODE_SIZE(0) * stat->tnodes;
2131
2132	max = MAX_STAT_DEPTH;
2133	while (max > 0 && stat->nodesizes[max-1] == 0)
2134		max--;
2135
2136	pointers = 0;
2137	for (i = 1; i < max; i++)
2138		if (stat->nodesizes[i] != 0) {
2139			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2140			pointers += (1<<i) * stat->nodesizes[i];
2141		}
2142	seq_putc(seq, '\n');
2143	seq_printf(seq, "\tPointers: %u\n", pointers);
2144
2145	bytes += sizeof(struct key_vector *) * pointers;
2146	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2147	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2148}
2149
2150#ifdef CONFIG_IP_FIB_TRIE_STATS
2151static void trie_show_usage(struct seq_file *seq,
2152			    const struct trie_use_stats __percpu *stats)
2153{
2154	struct trie_use_stats s = { 0 };
2155	int cpu;
2156
2157	/* loop through all of the CPUs and gather up the stats */
2158	for_each_possible_cpu(cpu) {
2159		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2160
2161		s.gets += pcpu->gets;
2162		s.backtrack += pcpu->backtrack;
2163		s.semantic_match_passed += pcpu->semantic_match_passed;
2164		s.semantic_match_miss += pcpu->semantic_match_miss;
2165		s.null_node_hit += pcpu->null_node_hit;
2166		s.resize_node_skipped += pcpu->resize_node_skipped;
2167	}
2168
2169	seq_printf(seq, "\nCounters:\n---------\n");
2170	seq_printf(seq, "gets = %u\n", s.gets);
2171	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2172	seq_printf(seq, "semantic match passed = %u\n",
2173		   s.semantic_match_passed);
2174	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2175	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2176	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2177}
2178#endif /*  CONFIG_IP_FIB_TRIE_STATS */
2179
2180static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2181{
2182	if (tb->tb_id == RT_TABLE_LOCAL)
2183		seq_puts(seq, "Local:\n");
2184	else if (tb->tb_id == RT_TABLE_MAIN)
2185		seq_puts(seq, "Main:\n");
2186	else
2187		seq_printf(seq, "Id %d:\n", tb->tb_id);
2188}
2189
2190
2191static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2192{
2193	struct net *net = (struct net *)seq->private;
2194	unsigned int h;
2195
2196	seq_printf(seq,
2197		   "Basic info: size of leaf:"
2198		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2199		   LEAF_SIZE, TNODE_SIZE(0));
2200
2201	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2202		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2203		struct fib_table *tb;
2204
2205		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2206			struct trie *t = (struct trie *) tb->tb_data;
2207			struct trie_stat stat;
2208
2209			if (!t)
2210				continue;
2211
2212			fib_table_print(seq, tb);
2213
2214			trie_collect_stats(t, &stat);
2215			trie_show_stats(seq, &stat);
2216#ifdef CONFIG_IP_FIB_TRIE_STATS
2217			trie_show_usage(seq, t->stats);
2218#endif
2219		}
2220	}
2221
2222	return 0;
2223}
2224
2225static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2226{
2227	return single_open_net(inode, file, fib_triestat_seq_show);
2228}
2229
2230static const struct file_operations fib_triestat_fops = {
2231	.owner	= THIS_MODULE,
2232	.open	= fib_triestat_seq_open,
2233	.read	= seq_read,
2234	.llseek	= seq_lseek,
2235	.release = single_release_net,
2236};
2237
2238static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2239{
2240	struct fib_trie_iter *iter = seq->private;
2241	struct net *net = seq_file_net(seq);
2242	loff_t idx = 0;
2243	unsigned int h;
2244
2245	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2246		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2247		struct fib_table *tb;
2248
2249		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2250			struct key_vector *n;
2251
2252			for (n = fib_trie_get_first(iter,
2253						    (struct trie *) tb->tb_data);
2254			     n; n = fib_trie_get_next(iter))
2255				if (pos == idx++) {
2256					iter->tb = tb;
2257					return n;
2258				}
2259		}
2260	}
2261
2262	return NULL;
2263}
2264
2265static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2266	__acquires(RCU)
2267{
2268	rcu_read_lock();
2269	return fib_trie_get_idx(seq, *pos);
2270}
2271
2272static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2273{
2274	struct fib_trie_iter *iter = seq->private;
2275	struct net *net = seq_file_net(seq);
2276	struct fib_table *tb = iter->tb;
2277	struct hlist_node *tb_node;
2278	unsigned int h;
2279	struct key_vector *n;
2280
2281	++*pos;
2282	/* next node in same table */
2283	n = fib_trie_get_next(iter);
2284	if (n)
2285		return n;
2286
2287	/* walk rest of this hash chain */
2288	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2289	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2290		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2291		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2292		if (n)
2293			goto found;
2294	}
2295
2296	/* new hash chain */
2297	while (++h < FIB_TABLE_HASHSZ) {
2298		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2299		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2300			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2301			if (n)
2302				goto found;
2303		}
2304	}
2305	return NULL;
2306
2307found:
2308	iter->tb = tb;
2309	return n;
2310}
2311
2312static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2313	__releases(RCU)
2314{
2315	rcu_read_unlock();
2316}
2317
2318static void seq_indent(struct seq_file *seq, int n)
2319{
2320	while (n-- > 0)
2321		seq_puts(seq, "   ");
2322}
2323
2324static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2325{
2326	switch (s) {
2327	case RT_SCOPE_UNIVERSE: return "universe";
2328	case RT_SCOPE_SITE:	return "site";
2329	case RT_SCOPE_LINK:	return "link";
2330	case RT_SCOPE_HOST:	return "host";
2331	case RT_SCOPE_NOWHERE:	return "nowhere";
2332	default:
2333		snprintf(buf, len, "scope=%d", s);
2334		return buf;
2335	}
2336}
2337
2338static const char *const rtn_type_names[__RTN_MAX] = {
2339	[RTN_UNSPEC] = "UNSPEC",
2340	[RTN_UNICAST] = "UNICAST",
2341	[RTN_LOCAL] = "LOCAL",
2342	[RTN_BROADCAST] = "BROADCAST",
2343	[RTN_ANYCAST] = "ANYCAST",
2344	[RTN_MULTICAST] = "MULTICAST",
2345	[RTN_BLACKHOLE] = "BLACKHOLE",
2346	[RTN_UNREACHABLE] = "UNREACHABLE",
2347	[RTN_PROHIBIT] = "PROHIBIT",
2348	[RTN_THROW] = "THROW",
2349	[RTN_NAT] = "NAT",
2350	[RTN_XRESOLVE] = "XRESOLVE",
2351};
2352
2353static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2354{
2355	if (t < __RTN_MAX && rtn_type_names[t])
2356		return rtn_type_names[t];
2357	snprintf(buf, len, "type %u", t);
2358	return buf;
2359}
2360
2361/* Pretty print the trie */
2362static int fib_trie_seq_show(struct seq_file *seq, void *v)
2363{
2364	const struct fib_trie_iter *iter = seq->private;
2365	struct key_vector *n = v;
2366
2367	if (IS_TRIE(node_parent_rcu(n)))
2368		fib_table_print(seq, iter->tb);
2369
2370	if (IS_TNODE(n)) {
2371		__be32 prf = htonl(n->key);
2372
2373		seq_indent(seq, iter->depth-1);
2374		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
2375			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2376			   tn_info(n)->full_children,
2377			   tn_info(n)->empty_children);
2378	} else {
2379		__be32 val = htonl(n->key);
2380		struct fib_alias *fa;
2381
2382		seq_indent(seq, iter->depth);
2383		seq_printf(seq, "  |-- %pI4\n", &val);
2384
2385		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2386			char buf1[32], buf2[32];
2387
2388			seq_indent(seq, iter->depth + 1);
2389			seq_printf(seq, "  /%zu %s %s",
2390				   KEYLENGTH - fa->fa_slen,
2391				   rtn_scope(buf1, sizeof(buf1),
2392					     fa->fa_info->fib_scope),
2393				   rtn_type(buf2, sizeof(buf2),
2394					    fa->fa_type));
2395			if (fa->fa_tos)
2396				seq_printf(seq, " tos=%d", fa->fa_tos);
2397			seq_putc(seq, '\n');
2398		}
2399	}
2400
2401	return 0;
2402}
2403
2404static const struct seq_operations fib_trie_seq_ops = {
2405	.start  = fib_trie_seq_start,
2406	.next   = fib_trie_seq_next,
2407	.stop   = fib_trie_seq_stop,
2408	.show   = fib_trie_seq_show,
2409};
2410
2411static int fib_trie_seq_open(struct inode *inode, struct file *file)
2412{
2413	return seq_open_net(inode, file, &fib_trie_seq_ops,
2414			    sizeof(struct fib_trie_iter));
2415}
2416
2417static const struct file_operations fib_trie_fops = {
2418	.owner  = THIS_MODULE,
2419	.open   = fib_trie_seq_open,
2420	.read   = seq_read,
2421	.llseek = seq_lseek,
2422	.release = seq_release_net,
2423};
2424
2425struct fib_route_iter {
2426	struct seq_net_private p;
2427	struct fib_table *main_tb;
2428	struct key_vector *tnode;
2429	loff_t	pos;
2430	t_key	key;
2431};
2432
2433static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2434					    loff_t pos)
2435{
2436	struct fib_table *tb = iter->main_tb;
2437	struct key_vector *l, **tp = &iter->tnode;
2438	struct trie *t;
2439	t_key key;
2440
2441	/* use cache location of next-to-find key */
2442	if (iter->pos > 0 && pos >= iter->pos) {
2443		pos -= iter->pos;
2444		key = iter->key;
2445	} else {
2446		t = (struct trie *)tb->tb_data;
2447		iter->tnode = t->kv;
2448		iter->pos = 0;
2449		key = 0;
2450	}
2451
2452	while ((l = leaf_walk_rcu(tp, key)) != NULL) {
2453		key = l->key + 1;
2454		iter->pos++;
2455
2456		if (--pos <= 0)
2457			break;
2458
2459		l = NULL;
2460
2461		/* handle unlikely case of a key wrap */
2462		if (!key)
2463			break;
2464	}
2465
2466	if (l)
2467		iter->key = key;	/* remember it */
2468	else
2469		iter->pos = 0;		/* forget it */
2470
2471	return l;
2472}
2473
2474static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2475	__acquires(RCU)
2476{
2477	struct fib_route_iter *iter = seq->private;
2478	struct fib_table *tb;
2479	struct trie *t;
2480
2481	rcu_read_lock();
2482
2483	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2484	if (!tb)
2485		return NULL;
2486
2487	iter->main_tb = tb;
2488
2489	if (*pos != 0)
2490		return fib_route_get_idx(iter, *pos);
2491
2492	t = (struct trie *)tb->tb_data;
2493	iter->tnode = t->kv;
2494	iter->pos = 0;
2495	iter->key = 0;
2496
2497	return SEQ_START_TOKEN;
2498}
2499
2500static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2501{
2502	struct fib_route_iter *iter = seq->private;
2503	struct key_vector *l = NULL;
2504	t_key key = iter->key;
2505
2506	++*pos;
2507
2508	/* only allow key of 0 for start of sequence */
2509	if ((v == SEQ_START_TOKEN) || key)
2510		l = leaf_walk_rcu(&iter->tnode, key);
2511
2512	if (l) {
2513		iter->key = l->key + 1;
2514		iter->pos++;
2515	} else {
2516		iter->pos = 0;
2517	}
2518
2519	return l;
2520}
2521
2522static void fib_route_seq_stop(struct seq_file *seq, void *v)
2523	__releases(RCU)
2524{
2525	rcu_read_unlock();
2526}
2527
2528static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2529{
2530	unsigned int flags = 0;
2531
2532	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2533		flags = RTF_REJECT;
2534	if (fi && fi->fib_nh->nh_gw)
2535		flags |= RTF_GATEWAY;
2536	if (mask == htonl(0xFFFFFFFF))
2537		flags |= RTF_HOST;
2538	flags |= RTF_UP;
2539	return flags;
2540}
2541
2542/*
2543 *	This outputs /proc/net/route.
2544 *	The format of the file is not supposed to be changed
2545 *	and needs to be same as fib_hash output to avoid breaking
2546 *	legacy utilities
2547 */
2548static int fib_route_seq_show(struct seq_file *seq, void *v)
2549{
2550	struct fib_route_iter *iter = seq->private;
2551	struct fib_table *tb = iter->main_tb;
2552	struct fib_alias *fa;
2553	struct key_vector *l = v;
2554	__be32 prefix;
2555
2556	if (v == SEQ_START_TOKEN) {
2557		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2558			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2559			   "\tWindow\tIRTT");
2560		return 0;
2561	}
2562
2563	prefix = htonl(l->key);
2564
2565	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2566		const struct fib_info *fi = fa->fa_info;
2567		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
2568		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2569
2570		if ((fa->fa_type == RTN_BROADCAST) ||
2571		    (fa->fa_type == RTN_MULTICAST))
2572			continue;
2573
2574		if (fa->tb_id != tb->tb_id)
2575			continue;
2576
2577		seq_setwidth(seq, 127);
2578
2579		if (fi)
2580			seq_printf(seq,
2581				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2582				   "%d\t%08X\t%d\t%u\t%u",
2583				   fi->fib_dev ? fi->fib_dev->name : "*",
2584				   prefix,
2585				   fi->fib_nh->nh_gw, flags, 0, 0,
2586				   fi->fib_priority,
2587				   mask,
2588				   (fi->fib_advmss ?
2589				    fi->fib_advmss + 40 : 0),
2590				   fi->fib_window,
2591				   fi->fib_rtt >> 3);
2592		else
2593			seq_printf(seq,
2594				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2595				   "%d\t%08X\t%d\t%u\t%u",
2596				   prefix, 0, flags, 0, 0, 0,
2597				   mask, 0, 0, 0);
2598
2599		seq_pad(seq, '\n');
2600	}
2601
2602	return 0;
2603}
2604
2605static const struct seq_operations fib_route_seq_ops = {
2606	.start  = fib_route_seq_start,
2607	.next   = fib_route_seq_next,
2608	.stop   = fib_route_seq_stop,
2609	.show   = fib_route_seq_show,
2610};
2611
2612static int fib_route_seq_open(struct inode *inode, struct file *file)
2613{
2614	return seq_open_net(inode, file, &fib_route_seq_ops,
2615			    sizeof(struct fib_route_iter));
2616}
2617
2618static const struct file_operations fib_route_fops = {
2619	.owner  = THIS_MODULE,
2620	.open   = fib_route_seq_open,
2621	.read   = seq_read,
2622	.llseek = seq_lseek,
2623	.release = seq_release_net,
2624};
2625
2626int __net_init fib_proc_init(struct net *net)
2627{
2628	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2629		goto out1;
2630
2631	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2632			 &fib_triestat_fops))
2633		goto out2;
2634
2635	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2636		goto out3;
2637
2638	return 0;
2639
2640out3:
2641	remove_proc_entry("fib_triestat", net->proc_net);
2642out2:
2643	remove_proc_entry("fib_trie", net->proc_net);
2644out1:
2645	return -ENOMEM;
2646}
2647
2648void __net_exit fib_proc_exit(struct net *net)
2649{
2650	remove_proc_entry("fib_trie", net->proc_net);
2651	remove_proc_entry("fib_triestat", net->proc_net);
2652	remove_proc_entry("route", net->proc_net);
2653}
2654
2655#endif /* CONFIG_PROC_FS */
2656