blob: 40ae766527423aba61ca1de86c23113e38de7386 [file] [log] [blame]
yuezonghe824eb0c2024-06-27 02:32:26 -07001/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally described in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
49 */
50
51#define VERSION "0.409"
52
53#include <asm/uaccess.h>
54#include <linux/bitops.h>
55#include <linux/types.h>
56#include <linux/kernel.h>
57#include <linux/mm.h>
58#include <linux/string.h>
59#include <linux/socket.h>
60#include <linux/sockios.h>
61#include <linux/errno.h>
62#include <linux/in.h>
63#include <linux/inet.h>
64#include <linux/inetdevice.h>
65#include <linux/netdevice.h>
66#include <linux/if_arp.h>
67#include <linux/proc_fs.h>
68#include <linux/rcupdate.h>
69#include <linux/skbuff.h>
70#include <linux/netlink.h>
71#include <linux/init.h>
72#include <linux/list.h>
73#include <linux/slab.h>
74#include <linux/export.h>
75#include <net/net_namespace.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
84#define MAX_STAT_DEPTH 32
85
86#define KEYLENGTH (8*sizeof(t_key))
87
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF 1
92#define NODE_TYPE_MASK 0x1UL
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
97
98
99
/* Common header shared by leaves and internal nodes. Bit 0 of @parent
 * encodes the node type (T_LEAF/T_TNODE, see NODE_TYPE_MASK); the
 * remaining bits hold the parent tnode pointer. @key holds the lookup
 * key (only a prefix of it is significant for internal nodes).
 */
struct rt_trie_node {
	unsigned long parent;
	t_key key;
};
104
/* External (terminal) node: one leaf per distinct key. The first two
 * fields must mirror struct rt_trie_node so the types can be punned.
 * @list chains the per-prefix-length leaf_info entries; @rcu defers
 * freeing past RCU readers.
 */
struct leaf {
	unsigned long parent;
	t_key key;
	struct hlist_head list;
	struct rcu_head rcu;
};
111
/* Per-(leaf, prefix length) bucket: hangs off leaf->list and carries
 * the fib_alias list (@falh) for routes with that prefix length.
 */
struct leaf_info {
	struct hlist_node hlist;
	int plen;
	u32 mask_plen; /* ntohl(inet_make_mask(plen)), cached for lookups */
	struct list_head falh;
	struct rcu_head rcu;
};
119
/* Internal node with a 2^bits child array starting at key bit @pos.
 * full_children/empty_children drive the inflate/halve heuristics in
 * resize(). The union members have mutually exclusive lifetimes: @rcu
 * while queued for deferred free, @work only inside __tnode_free_rcu()
 * for vmalloc'ed nodes, @tnode_free while on the tnode_free_head list.
 * @child is an old-style flexible array (allocated past the struct).
 */
struct tnode {
	unsigned long parent;
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned int full_children;	/* KEYLENGTH bits needed */
	unsigned int empty_children;	/* KEYLENGTH bits needed */
	union {
		struct rcu_head rcu;
		struct work_struct work;
		struct tnode *tnode_free;
	};
	struct rt_trie_node __rcu *child[0];
};
134
#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Runtime lookup/resize counters, compiled in only with
 * CONFIG_IP_FIB_TRIE_STATS.
 */
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif
145
/* Snapshot statistics gathered by walking the trie (depth histogram is
 * capped at MAX_STAT_DEPTH entries).
 */
struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};
155
/* The trie itself: just an RCU-protected root pointer (plus optional
 * usage counters).
 */
struct trie {
	struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
};
162
163static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
165 int wasfull);
166static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn);
168static struct tnode *halve(struct trie *t, struct tnode *tn);
169/* tnodes to free after resize(); protected by RTNL */
170static struct tnode *tnode_free_head;
171static size_t tnode_free_size;
172
173/*
174 * synchronize_rcu after call_rcu for that many pages; it should be especially
175 * useful before resizing the root node with PREEMPT_NONE configs; the value was
176 * obtained experimentally, aiming to avoid visible slowdown.
177 */
178static const int sync_pages = 128;
179
180static struct kmem_cache *fn_alias_kmem __read_mostly;
181static struct kmem_cache *trie_leaf_kmem __read_mostly;
182
183/*
184 * caller must hold RTNL
185 */
186static inline struct tnode *node_parent(const struct rt_trie_node *node)
187{
188 unsigned long parent;
189
190 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
191
192 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
193}
194
195/*
196 * caller must hold RCU read lock or RTNL
197 */
198static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
199{
200 unsigned long parent;
201
202 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
203 lockdep_rtnl_is_held());
204
205 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
206}
207
/* Same as rcu_assign_pointer
 * but that macro() assumes that value is a pointer.
 */
static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
	/* The barrier orders prior initialization of *ptr before readers
	 * can observe the updated parent word; the node's type bit
	 * (T_LEAF/T_TNODE) is preserved in the low bit.
	 */
	smp_wmb();
	node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
216
217/*
218 * caller must hold RTNL
219 */
220static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
221{
222 BUG_ON(i >= 1U << tn->bits);
223
224 return rtnl_dereference(tn->child[i]);
225}
226
227/*
228 * caller must hold RCU read lock or RTNL
229 */
230static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
231{
232 BUG_ON(i >= 1U << tn->bits);
233
234 return rcu_dereference_rtnl(tn->child[i]);
235}
236
237static inline int tnode_child_length(const struct tnode *tn)
238{
239 return 1 << tn->bits;
240}
241
242static inline t_key mask_pfx(t_key k, unsigned int l)
243{
244 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
245}
246
247static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
248{
249 if (offset < KEYLENGTH)
250 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
251 else
252 return 0;
253}
254
255static inline int tkey_equals(t_key a, t_key b)
256{
257 return a == b;
258}
259
260static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
261{
262 if (bits == 0 || offset >= KEYLENGTH)
263 return 1;
264 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
265 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
266}
267
268static inline int tkey_mismatch(t_key a, int offset, t_key b)
269{
270 t_key diff = a ^ b;
271 int i = offset;
272
273 if (!diff)
274 return 0;
275 while ((diff << i) >> (KEYLENGTH-1) == 0)
276 i++;
277 return i;
278}
279
280/*
281 To understand this stuff, an understanding of keys and all their bits is
282 necessary. Every node in the trie has a key associated with it, but not
283 all of the bits in that key are significant.
284
285 Consider a node 'n' and its parent 'tp'.
286
287 If n is a leaf, every bit in its key is significant. Its presence is
288 necessitated by path compression, since during a tree traversal (when
289 searching for a leaf - unless we are doing an insertion) we will completely
290 ignore all skipped bits we encounter. Thus we need to verify, at the end of
291 a potentially successful search, that we have indeed been walking the
292 correct key path.
293
294 Note that we can never "miss" the correct key in the tree if present by
295 following the wrong path. Path compression ensures that segments of the key
296 that are the same for all keys with a given prefix are skipped, but the
297 skipped part *is* identical for each node in the subtrie below the skipped
298 bit! trie_insert() in this implementation takes care of that - note the
299 call to tkey_sub_equals() in trie_insert().
300
301 if n is an internal node - a 'tnode' here, the various parts of its key
302 have many different meanings.
303
304 Example:
305 _________________________________________________________________
306 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
307 -----------------------------------------------------------------
308 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
309
310 _________________________________________________________________
311 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
312 -----------------------------------------------------------------
313 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
314
315 tp->pos = 7
316 tp->bits = 3
317 n->pos = 15
318 n->bits = 4
319
320 First, let's just ignore the bits that come before the parent tp, that is
321 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
322 not use them for anything.
323
324 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
325 index into the parent's child array. That is, they will be used to find
326 'n' among tp's children.
327
328 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
329 for the node n.
330
331 All the bits we have seen so far are significant to the node n. The rest
332 of the bits are really not needed or indeed known in n->key.
333
334 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
335 n's child array, and will of course be different for each child.
336
337
338 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
339 at this point.
340
341*/
342
343static inline void check_tnode(const struct tnode *tn)
344{
345 WARN_ON(tn && tn->pos+tn->bits > 32);
346}
347
/* Inflate/halve fill-factor thresholds, in percent (see the long
 * derivation in resize()). The root gets laxer thresholds so it stays
 * larger than interior nodes.
 */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
352
/* RCU callback: actually release a fib_alias once no readers remain. */
static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	/* Vendor slab accounting — presumably paired with a netslab_inc()
	 * at allocation time (not visible in this file chunk).
	 */
	netslab_dec(FIB_TRIE_SLAB);
	kmem_cache_free(fn_alias_kmem, fa);
}
359
/* Defer freeing of @fa past an RCU grace period. */
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}
364
/* RCU callback: release a leaf back to its slab cache. */
static void __leaf_free_rcu(struct rcu_head *head)
{
	struct leaf *l = container_of(head, struct leaf, rcu);
	/* Balances the netslab_inc() done in leaf_new(). */
	netslab_dec(FIB_TRIE_SLAB);
	kmem_cache_free(trie_leaf_kmem, l);
}
371
/* Defer freeing of @l past an RCU-bh grace period. NOTE(review): this
 * uses the _bh RCU flavor unlike tnode_free()'s call_rcu() — presumably
 * because route lookups can run under rcu_read_lock_bh(); confirm
 * against the read-side callers before changing.
 */
static inline void free_leaf(struct leaf *l)
{
	call_rcu_bh(&l->rcu, __leaf_free_rcu);
}
376
/* Defer freeing of a leaf_info (plain kmalloc'ed) past an RCU grace
 * period.
 */
static inline void free_leaf_info(struct leaf_info *leaf)
{
	kfree_rcu(leaf, rcu);
}
381
382static struct tnode *tnode_alloc(size_t size)
383{
384 if (size <= PAGE_SIZE)
385 return kzalloc(size, GFP_KERNEL);
386 else
387 return vzalloc(size);
388}
389
/* Workqueue callback: vfree() a large tnode. Run from process context
 * because vfree() may sleep and so cannot be called from the RCU
 * callback directly.
 */
static void __tnode_vfree(struct work_struct *arg)
{
	struct tnode *tn = container_of(arg, struct tnode, work);
	vfree(tn);
}
395
396static void __tnode_free_rcu(struct rcu_head *head)
397{
398 struct tnode *tn = container_of(head, struct tnode, rcu);
399 size_t size = sizeof(struct tnode) +
400 (sizeof(struct rt_trie_node *) << tn->bits);
401
402 if (size <= PAGE_SIZE)
403 kfree(tn);
404 else {
405 INIT_WORK(&tn->work, __tnode_vfree);
406 schedule_work(&tn->work);
407 }
408}
409
410static inline void tnode_free(struct tnode *tn)
411{
412 if (IS_LEAF(tn))
413 free_leaf((struct leaf *) tn);
414 else
415 call_rcu(&tn->rcu, __tnode_free_rcu);
416}
417
/* Queue @tn on the global deferred-free list (tnode_free_head) instead
 * of freeing it immediately; the batch is drained by tnode_free_flush().
 * Caller must hold RTNL — the list and size counter are unlocked
 * globals. Leaves must not come through here (they lack the union's
 * tnode_free lifetime).
 */
static void tnode_free_safe(struct tnode *tn)
{
	BUG_ON(IS_LEAF(tn));
	tn->tnode_free = tnode_free_head;
	tnode_free_head = tn;
	/* Track queued bytes so the flush can decide when to force a
	 * synchronize_rcu().
	 */
	tnode_free_size += sizeof(struct tnode) +
			   (sizeof(struct rt_trie_node *) << tn->bits);
}
426
/* Drain the deferred-free list built by tnode_free_safe(), handing each
 * node to the RCU-deferred tnode_free(). Caller must hold RTNL.
 */
static void tnode_free_flush(void)
{
	struct tnode *tn;

	while ((tn = tnode_free_head)) {
		tnode_free_head = tn->tnode_free;
		tn->tnode_free = NULL;
		tnode_free(tn);
	}

	/* If a large volume of memory is pending (see sync_pages), wait
	 * for a grace period now so callbacks can run and memory is
	 * actually reclaimed before we queue even more.
	 */
	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}
442
/* Allocate a leaf from the slab cache, typed as T_LEAF with an empty
 * leaf_info list. The key is filled in by the caller. Returns NULL on
 * allocation failure.
 */
static struct leaf *leaf_new(void)
{
	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (l) {
		/* Vendor slab accounting, balanced in __leaf_free_rcu(). */
		netslab_inc(FIB_TRIE_SLAB);
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}
453
/* Allocate a leaf_info for prefix length @plen, with the netmask cached
 * in host byte order and an empty alias list. Returns NULL on
 * allocation failure.
 */
static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
	if (li) {
		li->plen = plen;
		li->mask_plen = ntohl(inet_make_mask(plen));
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}
464
465static struct tnode *tnode_new(t_key key, int pos, int bits)
466{
467 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
468 struct tnode *tn = tnode_alloc(sz);
469
470 if (tn) {
471 tn->parent = T_TNODE;
472 tn->pos = pos;
473 tn->bits = bits;
474 tn->key = key;
475 tn->full_children = 0;
476 tn->empty_children = 1<<bits;
477 }
478
479 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
480 sizeof(struct rt_trie_node) << bits);
481 return tn;
482}
483
484/*
485 * Check whether a tnode 'n' is "full", i.e. it is an internal node
486 * and no bits are skipped. See discussion in dyntree paper p. 6
487 */
488
489static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
490{
491 if (n == NULL || IS_LEAF(n))
492 return 0;
493
494 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
495}
496
/* Store child @n into slot @i of @tn, updating the full/empty counters.
 * The wasfull argument of -1 means "unknown, recompute from the old
 * child". @t is unused here; it is kept for signature symmetry with the
 * callers.
 */
static inline void put_child(struct trie *t, struct tnode *tn, int i,
			     struct rt_trie_node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}
502
 /*
  * Add a child at position i overwriting the old value.
  * Update the value of full_children and empty_children.
  */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
				  int wasfull)
{
	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
	int isfull;

	BUG_ON(i >= 1<<tn->bits);

	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	/* wasfull == -1 means the caller does not know the old child's
	 * fullness; recompute it from the slot being replaced.
	 */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	/* Set the parent before publishing the child: rcu_assign_pointer
	 * orders the initialization so readers never see an unparented
	 * child through tn->child[i].
	 */
	if (n)
		node_set_parent(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}
537
/* Cap on inflate/halve iterations per resize() call, bounding the work
 * done while holding RTNL.
 */
#define MAX_WORK 10

/* Rebalance a single tnode after a change: drop it if empty, compress
 * it away if only one child remains, otherwise repeatedly inflate
 * (double) or halve it while the fill-factor thresholds say so. May
 * return @tn itself, a replacement node, a promoted child, or NULL;
 * freed nodes go through tnode_free_safe() (caller must eventually run
 * tnode_free_flush()). Caller must hold RTNL.
 */
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;
	int max_work;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free_safe(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		goto one_child;
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, tho it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *    tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger */

	if (!node_parent((struct rt_trie_node *)tn)) {
		inflate_threshold_use = inflate_threshold_root;
		halve_threshold_use = halve_threshold_root;
	} else {
		inflate_threshold_use = inflate_threshold;
		halve_threshold_use = halve_threshold;
	}

	max_work = MAX_WORK;
	while ((tn->full_children > 0 &&  max_work-- &&
		50 * (tn->full_children + tnode_child_length(tn)
		      - tn->empty_children)
		>= inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);

		/* On allocation failure keep the old node; the trie is
		 * still consistent, just not optimally compressed.
		 */
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return (struct rt_trie_node *) tn;

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */

	max_work = MAX_WORK;
	while (tn->bits > 1 &&  max_work-- &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
		/* Find the single remaining child and promote it in place
		 * of @tn (path compression by one level).
		 */
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct rt_trie_node *n;

			n = rtnl_dereference(tn->child[i]);
			if (!n)
				continue;

			/* compress one level */

			node_set_parent(n, NULL);
			tnode_free_safe(tn);
			return n;
		}
	}
	return (struct rt_trie_node *) tn;
}
702
703
704static void tnode_clean_free(struct tnode *tn)
705{
706 int i;
707 struct tnode *tofree;
708
709 for (i = 0; i < tnode_child_length(tn); i++) {
710 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
711 if (tofree)
712 tnode_free(tofree);
713 }
714 tnode_free(tn);
715}
716
/* Double @tn's child array (bits+1), redistributing children so one
 * fewer bit is skipped. "Full" children (tnodes skipping no bits) are
 * split into a left/right pair. Frees the old node and any absorbed
 * children via tnode_free_safe(). Returns the replacement node, or
 * ERR_PTR(-ENOMEM) leaving the original trie untouched. Caller must
 * hold RTNL.
 */
static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and inflate
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode;

		inode = (struct tnode *) tnode_get_child(oldtnode, i);
		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			/* m: a single one-bit at position inode->pos — the
			 * bit that will distinguish left from right.
			 */
			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct rt_trie_node *) left);
			put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
		}
	}

	/* Second pass: actually move children into the doubled node. */
	for (i = 0; i < olen; i++) {
		struct tnode *inode;
		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			/* Route by the newly significant bit. */
			if (tkey_extract_bits(node->key,
					      oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			/* One-bit child: its two children slot straight in. */
			put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
			put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));

			tnode_free_safe(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
		 */

		/* Detach the pair preallocated in the first pass. */
		left = (struct tnode *) tnode_get_child(tn, 2*i);
		put_child(t, tn, 2*i, NULL);

		BUG_ON(!left);

		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
		put_child(t, tn, 2*i+1, NULL);

		BUG_ON(!right);

		size = tnode_child_length(left);
		for (j = 0; j < size; j++) {
			put_child(t, left, j, rtnl_dereference(inode->child[j]));
			put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
		}
		put_child(t, tn, 2*i, resize(t, left));
		put_child(t, tn, 2*i+1, resize(t, right));

		tnode_free_safe(inode);
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
849
/* Halve @tn's child array (bits-1): pairs of old slots collapse into
 * one; where both slots of a pair are occupied, a new one-bit tnode is
 * created to hold them. Frees the old node via tnode_free_safe().
 * Returns the replacement node, or ERR_PTR(-ENOMEM) leaving the
 * original trie untouched. Caller must hold RTNL.
 */
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
	struct tnode *oldtnode = tn;
	struct rt_trie_node *left, *right;
	int i;
	int olen = tnode_child_length(tn);

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the oldnode and halve
	 * of tnode is ignored.
	 */

	for (i = 0; i < olen; i += 2) {
		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* Two nonempty children */
		if (left && right) {
			struct tnode *newn;

			newn = tnode_new(left->key, tn->pos + tn->bits, 1);

			if (!newn)
				goto nomem;

			put_child(t, tn, i/2, (struct rt_trie_node *)newn);
		}

	}

	/* Second pass: move children into the halved node. */
	for (i = 0; i < olen; i += 2) {
		struct tnode *newBinNode;

		left = tnode_get_child(oldtnode, i);
		right = tnode_get_child(oldtnode, i+1);

		/* At least one of the children is empty */
		if (left == NULL) {
			if (right == NULL)    /* Both are empty */
				continue;
			put_child(t, tn, i/2, right);
			continue;
		}

		if (right == NULL) {
			put_child(t, tn, i/2, left);
			continue;
		}

		/* Two nonempty children */
		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
		put_child(t, tn, i/2, NULL);
		put_child(t, newBinNode, 0, left);
		put_child(t, newBinNode, 1, right);
		put_child(t, tn, i/2, resize(t, newBinNode));
	}
	tnode_free_safe(oldtnode);
	return tn;
nomem:
	tnode_clean_free(tn);
	return ERR_PTR(-ENOMEM);
}
921
/* readside must use rcu_read_lock currently dump routines
 via get_fa_head and dump */

/* Find the leaf_info for prefix length @plen on leaf @l, or NULL if
 * none exists. Uses the old two-argument hlist RCU iterator of this
 * kernel generation.
 */
static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
{
	struct hlist_head *head = &l->list;
	struct hlist_node *node;
	struct leaf_info *li;

	hlist_for_each_entry_rcu(li, node, head, hlist)
		if (li->plen == plen)
			return li;

	return NULL;
}
937
938static inline struct list_head *get_fa_head(struct leaf *l, int plen)
939{
940 struct leaf_info *li = find_leaf_info(l, plen);
941
942 if (!li)
943 return NULL;
944
945 return &li->falh;
946}
947
/* Insert @new into the leaf's leaf_info list, keeping it sorted by
 * descending prefix length (longest prefix first). RCU list primitives
 * are used so concurrent readers always see a consistent list.
 */
static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
{
	struct leaf_info *li = NULL, *last = NULL;
	struct hlist_node *node;

	if (hlist_empty(head)) {
		hlist_add_head_rcu(&new->hlist, head);
	} else {
		/* Walk until the first entry with a smaller plen; 'last'
		 * trails one behind as the insertion point.
		 */
		hlist_for_each_entry(li, node, head, hlist) {
			if (new->plen > li->plen)
				break;

			last = li;
		}
		if (last)
			hlist_add_after_rcu(&last->hlist, &new->hlist);
		else
			/* New entry has the largest plen: insert at front. */
			hlist_add_before_rcu(&new->hlist, &li->hlist);
	}
}
968
969/* rcu_read_lock needs to be hold by caller from readside */
970
/* Walk the trie for an exact-match leaf with key @key, or NULL if no
 * such leaf exists. Because path compression skips bits, the final
 * full-key comparison is what validates the walk. Caller must hold the
 * RCU read lock or RTNL.
 */
static struct leaf *
fib_find_node(struct trie *t, u32 key)
{
	int pos;
	struct tnode *tn;
	struct rt_trie_node *n;

	pos = 0;
	n = rcu_dereference_rtnl(t->trie);

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		/* Verify the bits skipped since the parent, then descend
		 * by the index bits of this tnode.
		 */
		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			pos = tn->pos + tn->bits;
			n = tnode_get_child_rcu(tn,
						tkey_extract_bits(key,
								  tn->pos,
								  tn->bits));
		} else
			break;
	}
	/* Case we have found a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
		return (struct leaf *)n;

	return NULL;
}
1002
/* Resize @tn and propagate the rebalancing up to the root, re-linking
 * each resized node into its parent and republishing the root pointer
 * when reached. Deferred frees queued by resize() are flushed at each
 * step. Caller must hold RTNL.
 */
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
	int wasfull;
	t_key cindex, key;
	struct tnode *tp;

	/* The key is captured once: resize() may replace tn, but the
	 * child index within each ancestor depends only on the key.
	 */
	key = tn->key;

	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
		tn = (struct tnode *) resize(t, (struct tnode *)tn);

		tnode_put_child_reorg((struct tnode *)tp, cindex,
				      (struct rt_trie_node *)tn, wasfull);

		/* resize() may have promoted tn's parent away; if tn is
		 * now topmost, publish it as the new root.
		 */
		tp = node_parent((struct rt_trie_node *) tn);
		if (!tp)
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);

		tnode_free_flush();
		if (!tp)
			break;
		tn = tp;
	}

	/* Handle last (top) tnode */
	if (IS_TNODE(tn))
		tn = (struct tnode *)resize(t, (struct tnode *)tn);

	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
	tnode_free_flush();
}
1036
1037/* only used from updater-side */
1038
/* Insert (or find) the leaf for @key and attach a leaf_info for prefix
 * length @plen, rebalancing the trie afterwards. Returns the fib_alias
 * list head of the new leaf_info, or NULL on allocation failure (the
 * trie is left unchanged in that case). Caller must hold RTNL.
 */
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
	int pos, newpos;
	struct tnode *tp = NULL, *tn = NULL;
	struct rt_trie_node *n;
	struct leaf *l;
	int missbit;
	struct list_head *fa_head = NULL;
	struct leaf_info *li;
	t_key cindex;

	pos = 0;
	n = rtnl_dereference(t->trie);

	/* If we point to NULL, stop. Either the tree is empty and we should
	 * just put a new leaf in it, or we have reached an empty child slot,
	 * and we should just put our new leaf in that.
	 * If we point to a T_TNODE, check if it matches our key. Note that
	 * a T_TNODE might be skipping any number of bits - its 'pos' need
	 * not be the parent's 'pos'+'bits'!
	 *
	 * If it does match the current key, get pos/bits from it, extract
	 * the index from our key, push the T_TNODE and walk the tree.
	 *
	 * If it doesn't, we have to replace it with a new T_TNODE.
	 *
	 * If we point to a T_LEAF, it might or might not have the same key
	 * as we do. If it does, just change the value, update the T_LEAF's
	 * value, and return it.
	 * If it doesn't, we need to replace it with a T_TNODE.
	 */

	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;

		check_tnode(tn);

		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
			tp = tn;
			pos = tn->pos + tn->bits;
			n = tnode_get_child(tn,
					    tkey_extract_bits(key,
							      tn->pos,
							      tn->bits));

			BUG_ON(n && node_parent(n) != tn);
		} else
			break;
	}

	/*
	 * n  ----> NULL, LEAF or TNODE
	 *
	 * tp is n's (parent) ----> NULL or TNODE
	 */

	BUG_ON(tp && IS_LEAF(tp));

	/* Case 1: n is a leaf. Compare prefixes */

	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
		/* Exact leaf already present: just add the new prefix
		 * length bucket; no structural change, no rebalance.
		 */
		l = (struct leaf *) n;
		li = leaf_info_new(plen);

		if (!li)
			return NULL;

		fa_head = &li->falh;
		insert_leaf_info(&l->list, li);
		goto done;
	}
	l = leaf_new();

	if (!l)
		return NULL;

	l->key = key;
	li = leaf_info_new(plen);

	if (!li) {
		free_leaf(l);
		return NULL;
	}

	fa_head = &li->falh;
	insert_leaf_info(&l->list, li);

	if (t->trie && n == NULL) {
		/* Case 2: n is NULL, and will just insert a new leaf */

		node_set_parent((struct rt_trie_node *)l, tp);

		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
		put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
	} else {
		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
		/*
		 *  Add a new tnode here
		 *  first tnode need some special handling
		 */

		if (tp)
			pos = tp->pos+tp->bits;
		else
			pos = 0;

		if (n) {
			/* Branch at the first bit where key and n->key
			 * disagree; the new 1-bit tnode holds both.
			 */
			newpos = tkey_mismatch(key, pos, n->key);
			tn = tnode_new(n->key, newpos, 1);
		} else {
			newpos = 0;
			tn = tnode_new(key, newpos, 1); /* First tnode */
		}

		if (!tn) {
			free_leaf_info(li);
			free_leaf(l);
			return NULL;
		}

		node_set_parent((struct rt_trie_node *)tn, tp);

		missbit = tkey_extract_bits(key, newpos, 1);
		put_child(t, tn, missbit, (struct rt_trie_node *)l);
		put_child(t, tn, 1-missbit, n);

		if (tp) {
			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
			put_child(t, (struct tnode *)tp, cindex,
				  (struct rt_trie_node *)tn);
		} else {
			/* Empty trie: the new tnode becomes the root. */
			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
			tp = tn;
		}
	}

	if (tp && tp->pos + tp->bits > 32)
		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
			tp, tp->pos, tp->bits, key, plen);

	/* Rebalance the trie */

	trie_rebalance(t, tp);
done:
	return fa_head;
}
1185
1186/*
1187 * Caller must hold RTNL.
1188 */
/*
 * fib_table_insert - add or replace one route in a trie-backed table
 * @tb:  FIB table (the trie lives in tb->tb_data)
 * @cfg: parsed netlink request (destination, tos, priority, nlflags, ...)
 *
 * Caller must hold RTNL.  Returns 0 on success or a negative errno:
 * -EINVAL (bad prefix), -EEXIST (NLM_F_EXCL and alias exists),
 * -ENOENT (no NLM_F_CREATE), -ENOBUFS/-ENOMEM (allocation failure).
 */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct list_head *fa_head = NULL;
	struct fib_info *fi;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	u32 key, mask;
	int err;
	struct leaf *l;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	mask = ntohl(inet_make_mask(plen));

	/* reject destinations with host bits set below the mask */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;

	/* build (or refcount an existing) fib_info for this route */
	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, key);
	fa = NULL;

	if (l) {
		fa_head = get_fa_head(l, plen);
		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
	}

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists or to the node before which we will insert new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the head of f.
	 *
	 * If f is NULL, no fib node matched the destination key
	 * and we need to allocate a new one of those as well.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
		list_for_each_entry_continue(fa, fa_head, fa_list) {
			if (fa->fa_tos != tos)
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			fa = fa_first;
			if (fa_match) {
				/* replacing with an identical route is a no-op */
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (new_fa == NULL)
				goto out;
			netslab_inc(FIB_TRIE_SLAB);
			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;

			/* swap the alias in place; readers see old or new */
			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (!(cfg->fc_nlflags & NLM_F_APPEND))
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (new_fa == NULL)
		goto out;
	netslab_inc(FIB_TRIE_SLAB);
	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	/*
	 * Insert new entry to the list.
	 */

	/* no leaf for this key/plen yet: create it in the trie */
	if (!fa_head) {
		fa_head = fib_insert_node(t, key, plen);
		if (unlikely(!fa_head)) {
			err = -ENOMEM;
			goto out_free_new_fa;
		}
	}

	if (!plen)
		tb->tb_num_default++;

	list_add_tail_rcu(&new_fa->fa_list,
			  (fa ? &fa->fa_list : fa_head));

	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);
succeeded:
	net_run_track(PRT_ROUTE,"insert route,cfg.fc_scope =%d,example:RT_SCOPE_HOST",cfg->fc_scope);
	return 0;

out_free_new_fa:
	netslab_dec(FIB_TRIE_SLAB);
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}
1357
1358/* should be called with rcu_read_lock */
/*
 * Semantic match of one leaf against flow @flp.
 *
 * Walks each leaf_info (one per prefix length) hung off @l and, within
 * it, each fib_alias, looking for an entry whose tos/scope/nexthop
 * constraints accept the flow.  On a hit fills @res and returns 0
 * (taking a fib_clntref unless FIB_LOOKUP_NOREF); returns > 0 when
 * nothing matched (caller backtracks), or a negative fib_props error
 * for reject-type routes.
 */
static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
		      t_key key, const struct flowi4 *flp,
		      struct fib_result *res, int fib_flags)
{
	struct leaf_info *li;
	struct hlist_head *hhead = &l->list;
	struct hlist_node *node;

	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
		struct fib_alias *fa;

		/* leaf key must equal the lookup key under this prefix mask */
		if (l->key != (key & li->mask_plen))
			continue;

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			struct fib_info *fi = fa->fa_info;
			int nhsel, err;

			/* fa_tos == 0 is a wildcard, otherwise exact match */
			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
				continue;
			if (fi->fib_dead)
				continue;
			if (fa->fa_info->fib_scope < flp->flowi4_scope)
				continue;
			fib_alias_accessed(fa);
			err = fib_props[fa->fa_type].error;
			if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				return err;
			}
			if (fi->fib_flags & RTNH_F_DEAD)
				continue;
			/* pick the first live nexthop matching the oif */
			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
				const struct fib_nh *nh = &fi->fib_nh[nhsel];

				if (nh->nh_flags & RTNH_F_DEAD)
					continue;
				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
					continue;

#ifdef CONFIG_IP_FIB_TRIE_STATS
				t->stats.semantic_match_passed++;
#endif
				res->prefixlen = li->plen;
				res->nh_sel = nhsel;
				res->type = fa->fa_type;
				res->scope = fa->fa_info->fib_scope;
				res->fi = fi;
				res->table = tb;
				res->fa_head = &li->falh;
				if (!(fib_flags & FIB_LOOKUP_NOREF))
					atomic_inc(&fi->fib_clntref);
				return 0;
			}
		}

#ifdef CONFIG_IP_FIB_TRIE_STATS
		t->stats.semantic_match_miss++;
#endif
	}

	return 1;
}
1424
/*
 * fib_table_lookup - longest-prefix-match lookup of flp->daddr
 * @tb:        FIB table to search
 * @flp:       flow descriptor (daddr, tos, scope, oif)
 * @res:       filled in on success
 * @fib_flags: FIB_LOOKUP_* flags
 *
 * Runs entirely under rcu_read_lock.  Descends the LC-trie by the
 * destination key; on a dead end it backtracks (clearing one set key
 * bit at a time via chopped_off, walking up to the parent when a
 * tnode is exhausted) so that shorter prefixes are tried.  Returns
 * check_leaf()'s result on a match, or 1 when no route matched.
 */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
	int ret;
	struct rt_trie_node *n;
	struct tnode *pn;
	unsigned int pos, bits;
	t_key key = ntohl(flp->daddr);
	unsigned int chopped_off;
	t_key cindex = 0;
	unsigned int current_prefix_length = KEYLENGTH;
	struct tnode *cn;
	t_key pref_mismatch;

	rcu_read_lock();

	n = rcu_dereference(t->trie);
	if (!n)
		goto failed;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats.gets++;
#endif

	/* Just a leaf? */
	if (IS_LEAF(n)) {
		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
		goto found;
	}

	pn = (struct tnode *) n;
	chopped_off = 0;

	while (pn) {
		pos = pn->pos;
		bits = pn->bits;

		/* index the child array with the (possibly truncated) key */
		if (!chopped_off)
			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
						   pos, bits);

		n = tnode_get_child_rcu(pn, cindex);

		if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.null_node_hit++;
#endif
			goto backtrace;
		}

		if (IS_LEAF(n)) {
			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
			if (ret > 0)
				goto backtrace;
			goto found;
		}

		cn = (struct tnode *)n;

		/*
		 * It's a tnode, and we can do some extra checks here if we
		 * like, to avoid descending into a dead-end branch.
		 * This tnode is in the parent's child array at index
		 * key[p_pos..p_pos+p_bits] but potentially with some bits
		 * chopped off, so in reality the index may be just a
		 * subprefix, padded with zero at the end.
		 * We can also take a look at any skipped bits in this
		 * tnode - everything up to p_pos is supposed to be ok,
		 * and the non-chopped bits of the index (se previous
		 * paragraph) are also guaranteed ok, but the rest is
		 * considered unknown.
		 *
		 * The skipped bits are key[pos+bits..cn->pos].
		 */

		/* If current_prefix_length < pos+bits, we are already doing
		 * actual prefix matching, which means everything from
		 * pos+(bits-chopped_off) onward must be zero along some
		 * branch of this subtree - otherwise there is *no* valid
		 * prefix present. Here we can only check the skipped
		 * bits. Remember, since we have already indexed into the
		 * parent's child array, we know that the bits we chopped of
		 * *are* zero.
		 */

		/* NOTA BENE: Checking only skipped bits
		   for the new node here */

		if (current_prefix_length < pos+bits) {
			if (tkey_extract_bits(cn->key, current_prefix_length,
						cn->pos - current_prefix_length)
			    || !(cn->child[0]))
				goto backtrace;
		}

		/*
		 * If chopped_off=0, the index is fully validated and we
		 * only need to look at the skipped bits for this, the new,
		 * tnode. What we actually want to do is to find out if
		 * these skipped bits match our key perfectly, or if we will
		 * have to count on finding a matching prefix further down,
		 * because if we do, we would like to have some way of
		 * verifying the existence of such a prefix at this point.
		 */

		/* The only thing we can do at this point is to verify that
		 * any such matching prefix can indeed be a prefix to our
		 * key, and if the bits in the node we are inspecting that
		 * do not match our key are not ZERO, this cannot be true.
		 * Thus, find out where there is a mismatch (before cn->pos)
		 * and verify that all the mismatching bits are zero in the
		 * new tnode's key.
		 */

		/*
		 * Note: We aren't very concerned about the piece of
		 * the key that precede pn->pos+pn->bits, since these
		 * have already been checked. The bits after cn->pos
		 * aren't checked since these are by definition
		 * "unknown" at this point. Thus, what we want to see
		 * is if we are about to enter the "prefix matching"
		 * state, and in that case verify that the skipped
		 * bits that will prevail throughout this subtree are
		 * zero, as they have to be if we are to find a
		 * matching prefix.
		 */

		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);

		/*
		 * In short: If skipped bits in this node do not match
		 * the search key, enter the "prefix matching"
		 * state.directly.
		 */
		if (pref_mismatch) {
			int mp = KEYLENGTH - fls(pref_mismatch);

			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
				goto backtrace;

			if (current_prefix_length >= cn->pos)
				current_prefix_length = mp;
		}

		pn = (struct tnode *)n; /* Descend */
		chopped_off = 0;
		continue;

backtrace:
		chopped_off++;

		/* As zero don't change the child key (cindex) */
		while ((chopped_off <= pn->bits)
		       && !(cindex & (1<<(chopped_off-1))))
			chopped_off++;

		/* Decrease current_... with bits chopped off */
		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
			current_prefix_length = pn->pos + pn->bits
				- chopped_off;

		/*
		 * Either we do the actual chop off according or if we have
		 * chopped off all bits in this tnode walk up to our parent.
		 */

		if (chopped_off <= pn->bits) {
			cindex &= ~(1 << (chopped_off-1));
		} else {
			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
			if (!parent)
				goto failed;

			/* Get Child's index */
			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
			pn = parent;
			chopped_off = 0;

#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.backtrack++;
#endif
			goto backtrace;
		}
	}
failed:
	ret = 1;
found:
	rcu_read_unlock();
	net_run_track(PRT_ROUTE,"route");
	return ret;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);
1618
1619/*
1620 * Remove the leaf and return parent.
1621 */
1622static void trie_leaf_remove(struct trie *t, struct leaf *l)
1623{
1624 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1625
1626 pr_debug("entering trie_leaf_remove(%p)\n", l);
1627
1628 if (tp) {
1629 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1630 put_child(t, (struct tnode *)tp, cindex, NULL);
1631 trie_rebalance(t, tp);
1632 } else
1633 RCU_INIT_POINTER(t->trie, NULL);
1634
1635 free_leaf(l);
1636}
1637
1638/*
1639 * Caller must hold RTNL.
1640 */
/*
 * fib_table_delete - remove one route (alias) from the table
 * @tb:  FIB table
 * @cfg: parsed netlink request identifying the route to delete
 *
 * Caller must hold RTNL.  Returns 0 on success, -EINVAL for a bad
 * prefix, -ESRCH when no matching route exists.
 */
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	u32 key, mask;
	int plen = cfg->fc_dst_len;
	u8 tos = cfg->fc_tos;
	struct fib_alias *fa, *fa_to_delete;
	struct list_head *fa_head;
	struct leaf *l;
	struct leaf_info *li;

	if (plen > 32)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);
	mask = ntohl(inet_make_mask(plen));

	/* reject destinations with host bits set below the mask */
	if (key & ~mask)
		return -EINVAL;

	key = key & mask;
	l = fib_find_node(t, key);

	if (!l)
		return -ESRCH;

	fa_head = get_fa_head(l, plen);
	fa = fib_find_alias(fa_head, tos, 0);

	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	/* scan aliases with this tos for one matching every selector
	 * given in the request (type/scope/prefsrc/protocol/nexthop)
	 */
	fa_to_delete = NULL;
	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
	list_for_each_entry_continue(fa, fa_head, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fa->fa_tos != tos)
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	fa = fa_to_delete;
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	l = fib_find_node(t, key);
	li = find_leaf_info(l, plen);

	list_del_rcu(&fa->fa_list);

	if (!plen)
		tb->tb_num_default--;

	/* drop the per-prefix bucket if its alias list is now empty */
	if (list_empty(fa_head)) {
		hlist_del_rcu(&li->hlist);
		free_leaf_info(li);
	}

	/* and the leaf itself if no prefix lengths remain */
	if (hlist_empty(&l->list))
		trie_leaf_remove(t, l);

	if (fa->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);

	fib_release_info(fa->fa_info);
	alias_free_mem_rcu(fa);
	return 0;
}
1726
1727static int trie_flush_list(struct list_head *head)
1728{
1729 struct fib_alias *fa, *fa_node;
1730 int found = 0;
1731
1732 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1733 struct fib_info *fi = fa->fa_info;
1734
1735 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1736 list_del_rcu(&fa->fa_list);
1737 fib_release_info(fa->fa_info);
1738 alias_free_mem_rcu(fa);
1739 found++;
1740 }
1741 }
1742 return found;
1743}
1744
1745static int trie_flush_leaf(struct leaf *l)
1746{
1747 int found = 0;
1748 struct hlist_head *lih = &l->list;
1749 struct hlist_node *node, *tmp;
1750 struct leaf_info *li = NULL;
1751
1752 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1753 found += trie_flush_list(&li->falh);
1754
1755 if (list_empty(&li->falh)) {
1756 hlist_del_rcu(&li->hlist);
1757 free_leaf_info(li);
1758 }
1759 }
1760 return found;
1761}
1762
1763/*
1764 * Scan for the next right leaf starting at node p->child[idx]
1765 * Since we have back pointer, no recursion necessary.
1766 */
static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
	do {
		t_key idx;

		/* resume just past child c, or at slot 0 when starting */
		if (c)
			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
		else
			idx = 0;

		while (idx < 1u << p->bits) {
			c = tnode_get_child_rcu(p, idx++);
			if (!c)
				continue;

			if (IS_LEAF(c))
				return (struct leaf *) c;

			/* Rescan start scanning in new node */
			p = (struct tnode *) c;
			idx = 0;
		}

		/* Node empty, walk back up to parent */
		c = (struct rt_trie_node *) p;
	} while ((p = node_parent_rcu(c)) != NULL);

	return NULL; /* Root of trie */
}
1796
1797static struct leaf *trie_firstleaf(struct trie *t)
1798{
1799 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1800
1801 if (!n)
1802 return NULL;
1803
1804 if (IS_LEAF(n)) /* trie is just a leaf */
1805 return (struct leaf *) n;
1806
1807 return leaf_walk_rcu(n, NULL);
1808}
1809
1810static struct leaf *trie_nextleaf(struct leaf *l)
1811{
1812 struct rt_trie_node *c = (struct rt_trie_node *) l;
1813 struct tnode *p = node_parent_rcu(c);
1814
1815 if (!p)
1816 return NULL; /* trie with just one leaf */
1817
1818 return leaf_walk_rcu(p, c);
1819}
1820
/* Return the index'th leaf (0-based, trie order), or NULL if short. */
static struct leaf *trie_leafindex(struct trie *t, int index)
{
	struct leaf *leaf;

	/* linear scan: step 'index' leaves forward from the first one */
	for (leaf = trie_firstleaf(t); leaf && index > 0; index--)
		leaf = trie_nextleaf(leaf);

	return leaf;
}
1830
1831
1832/*
1833 * Caller must hold RTNL.
1834 */
/*
 * fib_table_flush - drop all routes whose fib_info is marked RTNH_F_DEAD
 *
 * Leaf removal is deferred by one iteration (via @ll) so that
 * trie_nextleaf() is never called on a leaf that has just been
 * unlinked.  Returns the number of aliases removed.
 */
int fib_table_flush(struct fib_table *tb)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct leaf *l, *ll = NULL;
	int found = 0;

	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
		found += trie_flush_leaf(l);

		/* remove the previous leaf if flushing emptied it */
		if (ll && hlist_empty(&ll->list))
			trie_leaf_remove(t, ll);
		ll = l;
	}

	/* handle the last visited leaf */
	if (ll && hlist_empty(&ll->list))
		trie_leaf_remove(t, ll);

	pr_debug("trie_flush found=%d\n", found);
	return found;
}
1855
/* Release a table allocated by fib_trie_table() (trie data included). */
void fib_free_table(struct fib_table *tb)
{
	kfree(tb);
}
1860
1861static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1862 struct fib_table *tb,
1863 struct sk_buff *skb, struct netlink_callback *cb)
1864{
1865 int i, s_i;
1866 struct fib_alias *fa;
1867 __be32 xkey = htonl(key);
1868
1869 s_i = cb->args[5];
1870 i = 0;
1871
1872 /* rcu_read_lock is hold by caller */
1873
1874 list_for_each_entry_rcu(fa, fah, fa_list) {
1875 if (i < s_i) {
1876 i++;
1877 continue;
1878 }
1879
1880 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1881 cb->nlh->nlmsg_seq,
1882 RTM_NEWROUTE,
1883 tb->tb_id,
1884 fa->fa_type,
1885 xkey,
1886 plen,
1887 fa->fa_tos,
1888 fa->fa_info, NLM_F_MULTI) < 0) {
1889 cb->args[5] = i;
1890 return -1;
1891 }
1892 i++;
1893 }
1894 cb->args[5] = i;
1895 return skb->len;
1896}
1897
1898static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1899 struct sk_buff *skb, struct netlink_callback *cb)
1900{
1901 struct leaf_info *li;
1902 struct hlist_node *node;
1903 int i, s_i;
1904
1905 s_i = cb->args[4];
1906 i = 0;
1907
1908 /* rcu_read_lock is hold by caller */
1909 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1910 if (i < s_i) {
1911 i++;
1912 continue;
1913 }
1914
1915 if (i > s_i)
1916 cb->args[5] = 0;
1917
1918 if (list_empty(&li->falh))
1919 continue;
1920
1921 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1922 cb->args[4] = i;
1923 return -1;
1924 }
1925 i++;
1926 }
1927
1928 cb->args[4] = i;
1929 return skb->len;
1930}
1931
/*
 * fib_table_dump - netlink dump of every route in the table
 *
 * Resumable: cb->args[2] holds the last dumped key, cb->args[3] the
 * number of leaves already emitted, args[4]/args[5] the position
 * inside the current leaf.  Returns skb->len, or -1 when the skb
 * filled up (the dump continues in a later call).
 */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct leaf *l;
	struct trie *t = (struct trie *) tb->tb_data;
	t_key key = cb->args[2];
	int count = cb->args[3];

	rcu_read_lock();
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (ie default) is first key.
	 */
	if (count == 0)
		l = trie_firstleaf(t);
	else {
		/* Normally, continue from last key, but if that is missing
		 * fallback to using slow rescan
		 */
		l = fib_find_node(t, key);
		if (!l)
			l = trie_leafindex(t, count);
	}

	while (l) {
		cb->args[2] = l->key;
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = count;
			rcu_read_unlock();
			return -1;
		}

		++count;
		l = trie_nextleaf(l);
		/* moving to the next leaf: reset the intra-leaf cursors */
		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
	}
	cb->args[3] = count;
	rcu_read_unlock();

	return skb->len;
}
1973
1974void __init fib_trie_init(void)
1975{
1976 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1977 sizeof(struct fib_alias),
1978 0, SLAB_PANIC, NULL);
1979
1980 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1981 max(sizeof(struct leaf),
1982 sizeof(struct leaf_info)),
1983 0, SLAB_PANIC, NULL);
1984}
1985
1986
1987struct fib_table *fib_trie_table(u32 id)
1988{
1989 struct fib_table *tb;
1990 struct trie *t;
1991
1992 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1993 GFP_KERNEL);
1994 if (tb == NULL)
1995 return NULL;
1996
1997 tb->tb_id = id;
1998 tb->tb_default = -1;
1999 tb->tb_num_default = 0;
2000
2001 t = (struct trie *) tb->tb_data;
2002 memset(t, 0, sizeof(*t));
2003 net_run_track(PRT_ROUTE,"create route table:%d",id);
2004 return tb;
2005}
2006
2007#ifdef CONFIG_PROC_FS
2008/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;	/* NOTE(review): seq_file net helpers
					 * appear to assume this comes first —
					 * confirm against seq_file_net() */
	struct fib_table *tb;		/* table currently being walked */
	struct tnode *tnode;		/* tnode whose children we are scanning */
	unsigned int index;		/* next child slot to visit in tnode */
	unsigned int depth;		/* current depth below the trie root */
};
2016
/*
 * Advance the depth-first walk one node.
 *
 * Scans the remaining child slots of iter->tnode; returns the first
 * leaf found, descends into the first tnode child found, and pops
 * back to the parent when the current tnode is exhausted.  Returns
 * NULL once the whole trie has been visited.
 */
static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
	struct tnode *tn = iter->tnode;
	unsigned int cindex = iter->index;
	struct tnode *p;

	/* A single entry routing table */
	if (!tn)
		return NULL;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);
rescan:
	while (cindex < (1<<tn->bits)) {
		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);

		if (n) {
			if (IS_LEAF(n)) {
				/* stay on this tnode, resume at next slot */
				iter->tnode = tn;
				iter->index = cindex + 1;
			} else {
				/* push down one level */
				iter->tnode = (struct tnode *) n;
				iter->index = 0;
				++iter->depth;
			}
			return n;
		}

		++cindex;
	}

	/* Current node exhausted, pop back up */
	p = node_parent_rcu((struct rt_trie_node *)tn);
	if (p) {
		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
		tn = p;
		--iter->depth;
		goto rescan;
	}

	/* got root? */
	return NULL;
}
2061
2062static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2063 struct trie *t)
2064{
2065 struct rt_trie_node *n;
2066
2067 if (!t)
2068 return NULL;
2069
2070 n = rcu_dereference(t->trie);
2071 if (!n)
2072 return NULL;
2073
2074 if (IS_TNODE(n)) {
2075 iter->tnode = (struct tnode *) n;
2076 iter->index = 0;
2077 iter->depth = 1;
2078 } else {
2079 iter->tnode = NULL;
2080 iter->index = 0;
2081 iter->depth = 0;
2082 }
2083
2084 return n;
2085}
2086
2087static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2088{
2089 struct rt_trie_node *n;
2090 struct fib_trie_iter iter;
2091
2092 memset(s, 0, sizeof(*s));
2093
2094 rcu_read_lock();
2095 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2096 if (IS_LEAF(n)) {
2097 struct leaf *l = (struct leaf *)n;
2098 struct leaf_info *li;
2099 struct hlist_node *tmp;
2100
2101 s->leaves++;
2102 s->totdepth += iter.depth;
2103 if (iter.depth > s->maxdepth)
2104 s->maxdepth = iter.depth;
2105
2106 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2107 ++s->prefixes;
2108 } else {
2109 const struct tnode *tn = (const struct tnode *) n;
2110 int i;
2111
2112 s->tnodes++;
2113 if (tn->bits < MAX_STAT_DEPTH)
2114 s->nodesizes[tn->bits]++;
2115
2116 for (i = 0; i < (1<<tn->bits); i++)
2117 if (!tn->child[i])
2118 s->nullpointers++;
2119 }
2120 }
2121 rcu_read_unlock();
2122}
2123
2124/*
2125 * This outputs /proc/net/fib_triestats
2126 */
2127static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2128{
2129 unsigned int i, max, pointers, bytes, avdepth;
2130
2131 if (stat->leaves)
2132 avdepth = stat->totdepth*100 / stat->leaves;
2133 else
2134 avdepth = 0;
2135
2136 seq_printf(seq, "\tAver depth: %u.%02d\n",
2137 avdepth / 100, avdepth % 100);
2138 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
2139
2140 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
2141 bytes = sizeof(struct leaf) * stat->leaves;
2142
2143 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2144 bytes += sizeof(struct leaf_info) * stat->prefixes;
2145
2146 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2147 bytes += sizeof(struct tnode) * stat->tnodes;
2148
2149 max = MAX_STAT_DEPTH;
2150 while (max > 0 && stat->nodesizes[max-1] == 0)
2151 max--;
2152
2153 pointers = 0;
2154 for (i = 1; i <= max; i++)
2155 if (stat->nodesizes[i] != 0) {
2156 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
2157 pointers += (1<<i) * stat->nodesizes[i];
2158 }
2159 seq_putc(seq, '\n');
2160 seq_printf(seq, "\tPointers: %u\n", pointers);
2161
2162 bytes += sizeof(struct rt_trie_node *) * pointers;
2163 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2164 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2165}
2166
2167#ifdef CONFIG_IP_FIB_TRIE_STATS
/* Dump the CONFIG_IP_FIB_TRIE_STATS lookup counters into @seq. */
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats *stats)
{
	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", stats->gets);
	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   stats->semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n",
		   stats->semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n",
		   stats->resize_node_skipped);
}
2182#endif /* CONFIG_IP_FIB_TRIE_STATS */
2183
2184static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2185{
2186 if (tb->tb_id == RT_TABLE_LOCAL)
2187 seq_puts(seq, "Local:\n");
2188 else if (tb->tb_id == RT_TABLE_MAIN)
2189 seq_puts(seq, "Main:\n");
2190 else
2191 seq_printf(seq, "Id %d:\n", tb->tb_id);
2192}
2193
2194
/* seq_file ->show for /proc/net/fib_triestats: per-table statistics. */
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf:"
		   " %Zd bytes, size of tnode: %Zd bytes.\n",
		   sizeof(struct leaf), sizeof(struct tnode));

	/* walk every FIB table in every hash bucket of this netns */
	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, &t->stats);
#endif
		}
	}

	return 0;
}
2229
/* ->open for /proc/net/fib_triestats: single-shot, netns-aware. */
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}
2234
/* file_operations backing /proc/net/fib_triestats */
static const struct file_operations fib_triestat_fops = {
	.owner = THIS_MODULE,
	.open = fib_triestat_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release_net,
};
2242
/*
 * Return the pos'th node counting depth-first across all tables of
 * the netns, remembering its table in iter->tb; NULL when pos is past
 * the end.  Runs under the caller's rcu_read_lock.
 */
static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct hlist_node *node;
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
			struct rt_trie_node *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}
2270
/* seq_file ->start: take the RCU read lock and seek to *pos. */
static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}
2277
/*
 * seq_file ->next: continue the depth-first walk, crossing into the
 * next table on the same hash chain and then into later hash buckets
 * as each table is exhausted.
 */
static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct rt_trie_node *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}
2317
/* seq_file ->stop: drop the RCU read lock taken in ->start. */
static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2323
/* Emit three spaces per indentation level into @seq. */
static void seq_indent(struct seq_file *seq, int n)
{
	for (; n > 0; n--)
		seq_puts(seq, "   ");
}
2329
2330static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2331{
2332 switch (s) {
2333 case RT_SCOPE_UNIVERSE: return "universe";
2334 case RT_SCOPE_SITE: return "site";
2335 case RT_SCOPE_LINK: return "link";
2336 case RT_SCOPE_HOST: return "host";
2337 case RT_SCOPE_NOWHERE: return "nowhere";
2338 default:
2339 snprintf(buf, len, "scope=%d", s);
2340 return buf;
2341 }
2342}
2343
/* Printable names for route types, indexed by RTN_* constant */
static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};
2358
2359static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2360{
2361 if (t < __RTN_MAX && rtn_type_names[t])
2362 return rtn_type_names[t];
2363 snprintf(buf, len, "type %u", t);
2364 return buf;
2365}
2366
2367/* Pretty print the trie */
/* seq_file ->show: render one trie node (tnode or leaf) as ASCII art. */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct rt_trie_node *n = v;

	/* a parentless node is a table root: print the table heading */
	if (!node_parent_rcu(n))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
			   &prf, tn->pos, tn->bits, tn->full_children,
			   tn->empty_children);

	} else {
		struct leaf *l = (struct leaf *) n;
		struct leaf_info *li;
		struct hlist_node *node;
		__be32 val = htonl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, "  |-- %pI4\n", &val);

		/* one line per alias: prefix length, scope and type */
		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
			struct fib_alias *fa;

			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
				char buf1[32], buf2[32];

				seq_indent(seq, iter->depth+1);
				seq_printf(seq, "  /%d %s %s", li->plen,
					   rtn_scope(buf1, sizeof(buf1),
						     fa->fa_info->fib_scope),
					   rtn_type(buf2, sizeof(buf2),
						    fa->fa_type));
				if (fa->fa_tos)
					seq_printf(seq, " tos=%d", fa->fa_tos);
				seq_putc(seq, '\n');
			}
		}
	}

	return 0;
}
2415
/* seq_file operations backing /proc/net/fib_trie */
static const struct seq_operations fib_trie_seq_ops = {
	.start  = fib_trie_seq_start,
	.next   = fib_trie_seq_next,
	.stop   = fib_trie_seq_stop,
	.show   = fib_trie_seq_show,
};
2422
/* open() for /proc/net/fib_trie: per-netns seq_file carrying a
 * struct fib_trie_iter as private iterator state */
static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}
2428
/* file operations for /proc/net/fib_trie */
static const struct file_operations fib_trie_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_trie_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2436
/* Iterator state for the /proc/net/route seq_file */
struct fib_route_iter {
	struct seq_net_private p;	/* per-netns seq_file bookkeeping */
	struct trie *main_trie;		/* trie of RT_TABLE_MAIN being walked */
	loff_t	pos;			/* position of the last leaf returned */
	t_key	key;			/* its key, cached to resume the walk */
};
2443
2444static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2445{
2446 struct leaf *l = NULL;
2447 struct trie *t = iter->main_trie;
2448
2449 /* use cache location of last found key */
2450 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2451 pos -= iter->pos;
2452 else {
2453 iter->pos = 0;
2454 l = trie_firstleaf(t);
2455 }
2456
2457 while (l && pos-- > 0) {
2458 iter->pos++;
2459 l = trie_nextleaf(l);
2460 }
2461
2462 if (l)
2463 iter->key = pos; /* remember it */
2464 else
2465 iter->pos = 0; /* forget it */
2466
2467 return l;
2468}
2469
2470static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2471 __acquires(RCU)
2472{
2473 struct fib_route_iter *iter = seq->private;
2474 struct fib_table *tb;
2475
2476 rcu_read_lock();
2477 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2478 if (!tb)
2479 return NULL;
2480
2481 iter->main_trie = (struct trie *) tb->tb_data;
2482 if (*pos == 0)
2483 return SEQ_START_TOKEN;
2484 else
2485 return fib_route_get_idx(iter, *pos - 1);
2486}
2487
2488static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2489{
2490 struct fib_route_iter *iter = seq->private;
2491 struct leaf *l = v;
2492
2493 ++*pos;
2494 if (v == SEQ_START_TOKEN) {
2495 iter->pos = 0;
2496 l = trie_firstleaf(iter->main_trie);
2497 } else {
2498 iter->pos++;
2499 l = trie_nextleaf(l);
2500 }
2501
2502 if (l)
2503 iter->key = l->key;
2504 else
2505 iter->pos = 0;
2506 return l;
2507}
2508
/* seq_file ->stop: release the RCU read lock taken by ->start */
static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
2514
2515static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2516{
2517 unsigned int flags = 0;
2518
2519 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2520 flags = RTF_REJECT;
2521 if (fi && fi->fib_nh->nh_gw)
2522 flags |= RTF_GATEWAY;
2523 if (mask == htonl(0xFFFFFFFF))
2524 flags |= RTF_HOST;
2525 flags |= RTF_UP;
2526 return flags;
2527}
2528
2529/*
2530 * This outputs /proc/net/route.
2531 * The format of the file is not supposed to be changed
2532 * and needs to be same as fib_hash output to avoid breaking
2533 * legacy utilities
2534 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct leaf *l = v;
	struct leaf_info *li;
	struct hlist_node *node;

	/* first call: emit the fixed column header, padded to 127 chars
	 * like the old fib_hash output */
	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	/* one leaf may carry several prefix lengths (leaf_info), each of
	 * which may hold several aliases (routes) */
	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
		struct fib_alias *fa;
		__be32 mask, prefix;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
			int len;

			/* broadcast/multicast entries are not reported */
			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			/* %n records the line length so the record can be
			 * padded to 127 characters below */
			if (fi)
				seq_printf(seq,
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ?
					  fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3, &len);
			else
				seq_printf(seq,
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
					 "%d\t%08X\t%d\t%u\t%u%n",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0, &len);

			seq_printf(seq, "%*s\n", 127 - len, "");
		}
	}

	return 0;
}
2590
/* seq_file operations backing /proc/net/route */
static const struct seq_operations fib_route_seq_ops = {
	.start  = fib_route_seq_start,
	.next   = fib_route_seq_next,
	.stop   = fib_route_seq_stop,
	.show   = fib_route_seq_show,
};
2597
/* open() for /proc/net/route: per-netns seq_file carrying a
 * struct fib_route_iter as private iterator state */
static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}
2603
/* file operations for /proc/net/route */
static const struct file_operations fib_route_fops = {
	.owner  = THIS_MODULE,
	.open   = fib_route_seq_open,
	.read   = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};
2611
/*
 * Create the per-netns /proc/net entries: "fib_trie" and "fib_triestat"
 * (both skipped when CONFIG_PROC_STRIPPED is enabled) plus "route".
 * On failure, earlier registrations are unwound in reverse order and
 * -ENOMEM is returned.
 */
int __net_init fib_proc_init(struct net *net)
{
	if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
		!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
		goto out1;

	if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
		!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
			    &fib_triestat_fops))
		goto out2;

	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
		goto out3;

	return 0;

out3:
	if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
		proc_net_remove(net, "fib_triestat");
out2:
	if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
		proc_net_remove(net, "fib_trie");
out1:
	return -ENOMEM;
}
2637
2638void __net_exit fib_proc_exit(struct net *net)
2639{
2640 if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
2641 proc_net_remove(net, "fib_trie");
2642 proc_net_remove(net, "fib_triestat");
2643 }
2644 proc_net_remove(net, "route");
2645}
2646
2647#endif /* CONFIG_PROC_FS */