blob: ef1df9d662d623190f56ad848262d7bc2e69ff09 [file] [log] [blame]
b.liue9582032025-04-17 19:18:16 +08001// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Linux INET6 implementation
4 * Forwarding Information Database
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Changes:
10 * Yuji SEKIYA @USAGI: Support default route on router node;
11 * remove ip6_null_entry from the top of
12 * routing table.
13 * Ville Nuorvala: Fixed routing subtrees.
14 */
15
16#define pr_fmt(fmt) "IPv6: " fmt
17
18#include <linux/errno.h>
19#include <linux/types.h>
20#include <linux/net.h>
21#include <linux/route.h>
22#include <linux/netdevice.h>
23#include <linux/in6.h>
24#include <linux/init.h>
25#include <linux/list.h>
26#include <linux/slab.h>
27
28#include <net/ip.h>
29#include <net/ipv6.h>
30#include <net/ndisc.h>
31#include <net/addrconf.h>
32#include <net/lwtunnel.h>
33#include <net/fib_notifier.h>
34
35#include <net/ip6_fib.h>
36#include <net/ip6_route.h>
37
38static struct kmem_cache *fib6_node_kmem __read_mostly;
39
40struct fib6_cleaner {
41 struct fib6_walker w;
42 struct net *net;
43 int (*func)(struct fib6_info *, void *arg);
44 int sernum;
45 void *arg;
46 bool skip_notify;
47};
48
49#ifdef CONFIG_IPV6_SUBTREES
50#define FWS_INIT FWS_S
51#else
52#define FWS_INIT FWS_L
53#endif
54
55static struct fib6_info *fib6_find_prefix(struct net *net,
56 struct fib6_table *table,
57 struct fib6_node *fn);
58static struct fib6_node *fib6_repair_tree(struct net *net,
59 struct fib6_table *table,
60 struct fib6_node *fn);
61static int fib6_walk(struct net *net, struct fib6_walker *w);
62static int fib6_walk_continue(struct fib6_walker *w);
63
64/*
65 * A routing update causes an increase of the serial number on the
66 * affected subtree. This allows for cached routes to be asynchronously
67 * tested when modifications are made to the destination cache as a
68 * result of redirects, path MTU changes, etc.
69 */
70
71static void fib6_gc_timer_cb(struct timer_list *t);
72
73#define FOR_WALKERS(net, w) \
74 list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
75
76static void fib6_walker_link(struct net *net, struct fib6_walker *w)
77{
78 write_lock_bh(&net->ipv6.fib6_walker_lock);
79 list_add(&w->lh, &net->ipv6.fib6_walkers);
80 write_unlock_bh(&net->ipv6.fib6_walker_lock);
81}
82
83static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
84{
85 write_lock_bh(&net->ipv6.fib6_walker_lock);
86 list_del(&w->lh);
87 write_unlock_bh(&net->ipv6.fib6_walker_lock);
88}
89
90static int fib6_new_sernum(struct net *net)
91{
92 int new, old;
93
94 do {
95 old = atomic_read(&net->ipv6.fib6_sernum);
96 new = old < INT_MAX ? old + 1 : 1;
97 } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
98 old, new) != old);
99 return new;
100}
101
102enum {
103 FIB6_NO_SERNUM_CHANGE = 0,
104};
105
106void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
107{
108 struct fib6_node *fn;
109
110 fn = rcu_dereference_protected(f6i->fib6_node,
111 lockdep_is_held(&f6i->fib6_table->tb6_lock));
112 if (fn)
113 WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
114}
115
116/*
117 * Auxiliary address test functions for the radix tree.
118 *
119 * These assume a 32bit processor (although it will work on
120 * 64bit processors)
121 */
122
123/*
124 * test bit
125 */
126#if defined(__LITTLE_ENDIAN)
127# define BITOP_BE32_SWIZZLE (0x1F & ~7)
128#else
129# define BITOP_BE32_SWIZZLE 0
130#endif
131
132static __be32 addr_bit_set(const void *token, int fn_bit)
133{
134 const __be32 *addr = token;
135 /*
136 * Here,
137 * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
138 * is optimized version of
139 * htonl(1 << ((~fn_bit)&0x1F))
140 * See include/asm-generic/bitops/le.h.
141 */
142 return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
143 addr[fn_bit >> 5];
144}
145
146struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
147{
148 struct fib6_info *f6i;
149 size_t sz = sizeof(*f6i);
150
151 if (with_fib6_nh)
152 sz += sizeof(struct fib6_nh);
153
154 f6i = kzalloc(sz, gfp_flags);
155 if (!f6i)
156 return NULL;
157
158 /* fib6_siblings is a union with nh_list, so this initializes both */
159 INIT_LIST_HEAD(&f6i->fib6_siblings);
160 refcount_set(&f6i->fib6_ref, 1);
161
162 return f6i;
163}
164
165void fib6_info_destroy_rcu(struct rcu_head *head)
166{
167 struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
168
169 WARN_ON(f6i->fib6_node);
170
171 if (f6i->nh)
172 nexthop_put(f6i->nh);
173 else
174 fib6_nh_release(f6i->fib6_nh);
175
176 ip_fib_metrics_put(f6i->fib6_metrics);
177 kfree(f6i);
178}
179EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
180
181static struct fib6_node *node_alloc(struct net *net)
182{
183 struct fib6_node *fn;
184
185 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
186 if (fn)
187 net->ipv6.rt6_stats->fib_nodes++;
188
189 return fn;
190}
191
192static void node_free_immediate(struct net *net, struct fib6_node *fn)
193{
194 kmem_cache_free(fib6_node_kmem, fn);
195 net->ipv6.rt6_stats->fib_nodes--;
196}
197
198static void node_free_rcu(struct rcu_head *head)
199{
200 struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
201
202 kmem_cache_free(fib6_node_kmem, fn);
203}
204
205static void node_free(struct net *net, struct fib6_node *fn)
206{
207 call_rcu(&fn->rcu, node_free_rcu);
208 net->ipv6.rt6_stats->fib_nodes--;
209}
210
211static void fib6_free_table(struct fib6_table *table)
212{
213 inetpeer_invalidate_tree(&table->tb6_peers);
214 kfree(table);
215}
216
217static void fib6_link_table(struct net *net, struct fib6_table *tb)
218{
219 unsigned int h;
220
221 /*
222 * Initialize table lock at a single place to give lockdep a key,
223 * tables aren't visible prior to being linked to the list.
224 */
225 spin_lock_init(&tb->tb6_lock);
226 h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
227
228 /*
229 * No protection necessary, this is the only list mutatation
230 * operation, tables never disappear once they exist.
231 */
232 hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
233}
234
235#ifdef CONFIG_IPV6_MULTIPLE_TABLES
236
237static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
238{
239 struct fib6_table *table;
240
241 table = kzalloc(sizeof(*table), GFP_ATOMIC);
242 if (table) {
243 table->tb6_id = id;
244 rcu_assign_pointer(table->tb6_root.leaf,
245 net->ipv6.fib6_null_entry);
246 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
247 inet_peer_base_init(&table->tb6_peers);
248 }
249
250 return table;
251}
252
253struct fib6_table *fib6_new_table(struct net *net, u32 id)
254{
255 struct fib6_table *tb;
256
257 if (id == 0)
258 id = RT6_TABLE_MAIN;
259 tb = fib6_get_table(net, id);
260 if (tb)
261 return tb;
262
263 tb = fib6_alloc_table(net, id);
264 if (tb)
265 fib6_link_table(net, tb);
266
267 return tb;
268}
269EXPORT_SYMBOL_GPL(fib6_new_table);
270
271struct fib6_table *fib6_get_table(struct net *net, u32 id)
272{
273 struct fib6_table *tb;
274 struct hlist_head *head;
275 unsigned int h;
276
277 if (id == 0)
278 id = RT6_TABLE_MAIN;
279 h = id & (FIB6_TABLE_HASHSZ - 1);
280 rcu_read_lock();
281 head = &net->ipv6.fib_table_hash[h];
282 hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
283 if (tb->tb6_id == id) {
284 rcu_read_unlock();
285 return tb;
286 }
287 }
288 rcu_read_unlock();
289
290 return NULL;
291}
292EXPORT_SYMBOL_GPL(fib6_get_table);
293
294static void __net_init fib6_tables_init(struct net *net)
295{
296 fib6_link_table(net, net->ipv6.fib6_main_tbl);
297 fib6_link_table(net, net->ipv6.fib6_local_tbl);
298}
299#else
300
301struct fib6_table *fib6_new_table(struct net *net, u32 id)
302{
303 return fib6_get_table(net, id);
304}
305
306struct fib6_table *fib6_get_table(struct net *net, u32 id)
307{
308 return net->ipv6.fib6_main_tbl;
309}
310
311struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
312 const struct sk_buff *skb,
313 int flags, pol_lookup_t lookup)
314{
315 struct rt6_info *rt;
316
317 rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
318 if (rt->dst.error == -EAGAIN) {
319 ip6_rt_put_flags(rt, flags);
320 rt = net->ipv6.ip6_null_entry;
321 if (!(flags & RT6_LOOKUP_F_DST_NOREF))
322 dst_hold(&rt->dst);
323 }
324
325 return &rt->dst;
326}
327
328/* called with rcu lock held; no reference taken on fib6_info */
329int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
330 struct fib6_result *res, int flags)
331{
332 return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
333 res, flags);
334}
335
336static void __net_init fib6_tables_init(struct net *net)
337{
338 fib6_link_table(net, net->ipv6.fib6_main_tbl);
339}
340
341#endif
342
343unsigned int fib6_tables_seq_read(struct net *net)
344{
345 unsigned int h, fib_seq = 0;
346
347 rcu_read_lock();
348 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
349 struct hlist_head *head = &net->ipv6.fib_table_hash[h];
350 struct fib6_table *tb;
351
352 hlist_for_each_entry_rcu(tb, head, tb6_hlist)
353 fib_seq += tb->fib_seq;
354 }
355 rcu_read_unlock();
356
357 return fib_seq;
358}
359
360static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
361 enum fib_event_type event_type,
362 struct fib6_info *rt)
363{
364 struct fib6_entry_notifier_info info = {
365 .rt = rt,
366 };
367
368 return call_fib6_notifier(nb, net, event_type, &info.info);
369}
370
371int call_fib6_entry_notifiers(struct net *net,
372 enum fib_event_type event_type,
373 struct fib6_info *rt,
374 struct netlink_ext_ack *extack)
375{
376 struct fib6_entry_notifier_info info = {
377 .info.extack = extack,
378 .rt = rt,
379 };
380
381 rt->fib6_table->fib_seq++;
382 return call_fib6_notifiers(net, event_type, &info.info);
383}
384
385int call_fib6_multipath_entry_notifiers(struct net *net,
386 enum fib_event_type event_type,
387 struct fib6_info *rt,
388 unsigned int nsiblings,
389 struct netlink_ext_ack *extack)
390{
391 struct fib6_entry_notifier_info info = {
392 .info.extack = extack,
393 .rt = rt,
394 .nsiblings = nsiblings,
395 };
396
397 rt->fib6_table->fib_seq++;
398 return call_fib6_notifiers(net, event_type, &info.info);
399}
400
401struct fib6_dump_arg {
402 struct net *net;
403 struct notifier_block *nb;
404};
405
406static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
407{
408 if (rt == arg->net->ipv6.fib6_null_entry)
409 return;
410 call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
411}
412
413static int fib6_node_dump(struct fib6_walker *w)
414{
415 struct fib6_info *rt;
416
417 for_each_fib6_walker_rt(w)
418 fib6_rt_dump(rt, w->args);
419 w->leaf = NULL;
420 return 0;
421}
422
423static void fib6_table_dump(struct net *net, struct fib6_table *tb,
424 struct fib6_walker *w)
425{
426 w->root = &tb->tb6_root;
427 spin_lock_bh(&tb->tb6_lock);
428 fib6_walk(net, w);
429 spin_unlock_bh(&tb->tb6_lock);
430}
431
432/* Called with rcu_read_lock() */
433int fib6_tables_dump(struct net *net, struct notifier_block *nb)
434{
435 struct fib6_dump_arg arg;
436 struct fib6_walker *w;
437 unsigned int h;
438
439 w = kzalloc(sizeof(*w), GFP_ATOMIC);
440 if (!w)
441 return -ENOMEM;
442
443 w->func = fib6_node_dump;
444 arg.net = net;
445 arg.nb = nb;
446 w->args = &arg;
447
448 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
449 struct hlist_head *head = &net->ipv6.fib_table_hash[h];
450 struct fib6_table *tb;
451
452 hlist_for_each_entry_rcu(tb, head, tb6_hlist)
453 fib6_table_dump(net, tb, w);
454 }
455
456 kfree(w);
457
458 return 0;
459}
460
461static int fib6_dump_node(struct fib6_walker *w)
462{
463 int res;
464 struct fib6_info *rt;
465
466 for_each_fib6_walker_rt(w) {
467 res = rt6_dump_route(rt, w->args, w->skip_in_node);
468 if (res >= 0) {
469 /* Frame is full, suspend walking */
470 w->leaf = rt;
471
472 /* We'll restart from this node, so if some routes were
473 * already dumped, skip them next time.
474 */
475 w->skip_in_node += res;
476
477 return 1;
478 }
479 w->skip_in_node = 0;
480
481 /* Multipath routes are dumped in one route with the
482 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
483 * last sibling of this route (no need to dump the
484 * sibling routes again)
485 */
486 if (rt->fib6_nsiblings)
487 rt = list_last_entry(&rt->fib6_siblings,
488 struct fib6_info,
489 fib6_siblings);
490 }
491 w->leaf = NULL;
492 return 0;
493}
494
495static void fib6_dump_end(struct netlink_callback *cb)
496{
497 struct net *net = sock_net(cb->skb->sk);
498 struct fib6_walker *w = (void *)cb->args[2];
499
500 if (w) {
501 if (cb->args[4]) {
502 cb->args[4] = 0;
503 fib6_walker_unlink(net, w);
504 }
505 cb->args[2] = 0;
506 kfree(w);
507 }
508 cb->done = (void *)cb->args[3];
509 cb->args[1] = 3;
510}
511
512static int fib6_dump_done(struct netlink_callback *cb)
513{
514 fib6_dump_end(cb);
515 return cb->done ? cb->done(cb) : 0;
516}
517
518static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
519 struct netlink_callback *cb)
520{
521 struct net *net = sock_net(skb->sk);
522 struct fib6_walker *w;
523 int res;
524
525 w = (void *)cb->args[2];
526 w->root = &table->tb6_root;
527
528 if (cb->args[4] == 0) {
529 w->count = 0;
530 w->skip = 0;
531 w->skip_in_node = 0;
532
533 spin_lock_bh(&table->tb6_lock);
534 res = fib6_walk(net, w);
535 spin_unlock_bh(&table->tb6_lock);
536 if (res > 0) {
537 cb->args[4] = 1;
538 cb->args[5] = READ_ONCE(w->root->fn_sernum);
539 }
540 } else {
541 int sernum = READ_ONCE(w->root->fn_sernum);
542 if (cb->args[5] != sernum) {
543 /* Begin at the root if the tree changed */
544 cb->args[5] = sernum;
545 w->state = FWS_INIT;
546 w->node = w->root;
547 w->skip = w->count;
548 w->skip_in_node = 0;
549 } else
550 w->skip = 0;
551
552 spin_lock_bh(&table->tb6_lock);
553 res = fib6_walk_continue(w);
554 spin_unlock_bh(&table->tb6_lock);
555 if (res <= 0) {
556 fib6_walker_unlink(net, w);
557 cb->args[4] = 0;
558 }
559 }
560
561 return res;
562}
563
564static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
565{
566 struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
567 .filter.dump_routes = true };
568 const struct nlmsghdr *nlh = cb->nlh;
569 struct net *net = sock_net(skb->sk);
570 unsigned int h, s_h;
571 unsigned int e = 0, s_e;
572 struct fib6_walker *w;
573 struct fib6_table *tb;
574 struct hlist_head *head;
575 int res = 0;
576
577 if (cb->strict_check) {
578 int err;
579
580 err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
581 if (err < 0)
582 return err;
583 } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
584 struct rtmsg *rtm = nlmsg_data(nlh);
585
586 if (rtm->rtm_flags & RTM_F_PREFIX)
587 arg.filter.flags = RTM_F_PREFIX;
588 }
589
590 w = (void *)cb->args[2];
591 if (!w) {
592 /* New dump:
593 *
594 * 1. allocate and initialize walker.
595 */
596 w = kzalloc(sizeof(*w), GFP_ATOMIC);
597 if (!w)
598 return -ENOMEM;
599 w->func = fib6_dump_node;
600 cb->args[2] = (long)w;
601
602 /* 2. hook callback destructor.
603 */
604 cb->args[3] = (long)cb->done;
605 cb->done = fib6_dump_done;
606
607 }
608
609 arg.skb = skb;
610 arg.cb = cb;
611 arg.net = net;
612 w->args = &arg;
613
614 if (arg.filter.table_id) {
615 tb = fib6_get_table(net, arg.filter.table_id);
616 if (!tb) {
617 if (rtnl_msg_family(cb->nlh) != PF_INET6)
618 goto out;
619
620 NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
621 return -ENOENT;
622 }
623
624 if (!cb->args[0]) {
625 res = fib6_dump_table(tb, skb, cb);
626 if (!res)
627 cb->args[0] = 1;
628 }
629 goto out;
630 }
631
632 s_h = cb->args[0];
633 s_e = cb->args[1];
634
635 rcu_read_lock();
636 for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
637 e = 0;
638 head = &net->ipv6.fib_table_hash[h];
639 hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
640 if (e < s_e)
641 goto next;
642 res = fib6_dump_table(tb, skb, cb);
643 if (res != 0)
644 goto out_unlock;
645next:
646 e++;
647 }
648 }
649out_unlock:
650 rcu_read_unlock();
651 cb->args[1] = e;
652 cb->args[0] = h;
653out:
654 res = res < 0 ? res : skb->len;
655 if (res <= 0)
656 fib6_dump_end(cb);
657 return res;
658}
659
660void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
661{
662 if (!f6i)
663 return;
664
665 if (f6i->fib6_metrics == &dst_default_metrics) {
666 struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
667
668 if (!p)
669 return;
670
671 refcount_set(&p->refcnt, 1);
672 f6i->fib6_metrics = p;
673 }
674
675 f6i->fib6_metrics->metrics[metric - 1] = val;
676}
677
678/*
679 * Routing Table
680 *
681 * return the appropriate node for a routing tree "add" operation
682 * by either creating and inserting or by returning an existing
683 * node.
684 */
685
686static struct fib6_node *fib6_add_1(struct net *net,
687 struct fib6_table *table,
688 struct fib6_node *root,
689 struct in6_addr *addr, int plen,
690 int offset, int allow_create,
691 int replace_required,
692 struct netlink_ext_ack *extack)
693{
694 struct fib6_node *fn, *in, *ln;
695 struct fib6_node *pn = NULL;
696 struct rt6key *key;
697 int bit;
698 __be32 dir = 0;
699
700 RT6_TRACE("fib6_add_1\n");
701
702 /* insert node in tree */
703
704 fn = root;
705
706 do {
707 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
708 lockdep_is_held(&table->tb6_lock));
709 key = (struct rt6key *)((u8 *)leaf + offset);
710
711 /*
712 * Prefix match
713 */
714 if (plen < fn->fn_bit ||
715 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
716 if (!allow_create) {
717 if (replace_required) {
718 NL_SET_ERR_MSG(extack,
719 "Can not replace route - no match found");
720 pr_warn("Can't replace route, no match found\n");
721 return ERR_PTR(-ENOENT);
722 }
723 pr_warn("NLM_F_CREATE should be set when creating new route\n");
724 }
725 goto insert_above;
726 }
727
728 /*
729 * Exact match ?
730 */
731
732 if (plen == fn->fn_bit) {
733 /* clean up an intermediate node */
734 if (!(fn->fn_flags & RTN_RTINFO)) {
735 RCU_INIT_POINTER(fn->leaf, NULL);
736 fib6_info_release(leaf);
737 /* remove null_entry in the root node */
738 } else if (fn->fn_flags & RTN_TL_ROOT &&
739 rcu_access_pointer(fn->leaf) ==
740 net->ipv6.fib6_null_entry) {
741 RCU_INIT_POINTER(fn->leaf, NULL);
742 }
743
744 return fn;
745 }
746
747 /*
748 * We have more bits to go
749 */
750
751 /* Try to walk down on tree. */
752 dir = addr_bit_set(addr, fn->fn_bit);
753 pn = fn;
754 fn = dir ?
755 rcu_dereference_protected(fn->right,
756 lockdep_is_held(&table->tb6_lock)) :
757 rcu_dereference_protected(fn->left,
758 lockdep_is_held(&table->tb6_lock));
759 } while (fn);
760
761 if (!allow_create) {
762 /* We should not create new node because
763 * NLM_F_REPLACE was specified without NLM_F_CREATE
764 * I assume it is safe to require NLM_F_CREATE when
765 * REPLACE flag is used! Later we may want to remove the
766 * check for replace_required, because according
767 * to netlink specification, NLM_F_CREATE
768 * MUST be specified if new route is created.
769 * That would keep IPv6 consistent with IPv4
770 */
771 if (replace_required) {
772 NL_SET_ERR_MSG(extack,
773 "Can not replace route - no match found");
774 pr_warn("Can't replace route, no match found\n");
775 return ERR_PTR(-ENOENT);
776 }
777 pr_warn("NLM_F_CREATE should be set when creating new route\n");
778 }
779 /*
780 * We walked to the bottom of tree.
781 * Create new leaf node without children.
782 */
783
784 ln = node_alloc(net);
785
786 if (!ln)
787 return ERR_PTR(-ENOMEM);
788 ln->fn_bit = plen;
789 RCU_INIT_POINTER(ln->parent, pn);
790
791 if (dir)
792 rcu_assign_pointer(pn->right, ln);
793 else
794 rcu_assign_pointer(pn->left, ln);
795
796 return ln;
797
798
799insert_above:
800 /*
801 * split since we don't have a common prefix anymore or
802 * we have a less significant route.
803 * we've to insert an intermediate node on the list
804 * this new node will point to the one we need to create
805 * and the current
806 */
807
808 pn = rcu_dereference_protected(fn->parent,
809 lockdep_is_held(&table->tb6_lock));
810
811 /* find 1st bit in difference between the 2 addrs.
812
813 See comment in __ipv6_addr_diff: bit may be an invalid value,
814 but if it is >= plen, the value is ignored in any case.
815 */
816
817 bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
818
819 /*
820 * (intermediate)[in]
821 * / \
822 * (new leaf node)[ln] (old node)[fn]
823 */
824 if (plen > bit) {
825 in = node_alloc(net);
826 ln = node_alloc(net);
827
828 if (!in || !ln) {
829 if (in)
830 node_free_immediate(net, in);
831 if (ln)
832 node_free_immediate(net, ln);
833 return ERR_PTR(-ENOMEM);
834 }
835
836 /*
837 * new intermediate node.
838 * RTN_RTINFO will
839 * be off since that an address that chooses one of
840 * the branches would not match less specific routes
841 * in the other branch
842 */
843
844 in->fn_bit = bit;
845
846 RCU_INIT_POINTER(in->parent, pn);
847 in->leaf = fn->leaf;
848 fib6_info_hold(rcu_dereference_protected(in->leaf,
849 lockdep_is_held(&table->tb6_lock)));
850
851 /* update parent pointer */
852 if (dir)
853 rcu_assign_pointer(pn->right, in);
854 else
855 rcu_assign_pointer(pn->left, in);
856
857 ln->fn_bit = plen;
858
859 RCU_INIT_POINTER(ln->parent, in);
860 rcu_assign_pointer(fn->parent, in);
861
862 if (addr_bit_set(addr, bit)) {
863 rcu_assign_pointer(in->right, ln);
864 rcu_assign_pointer(in->left, fn);
865 } else {
866 rcu_assign_pointer(in->left, ln);
867 rcu_assign_pointer(in->right, fn);
868 }
869 } else { /* plen <= bit */
870
871 /*
872 * (new leaf node)[ln]
873 * / \
874 * (old node)[fn] NULL
875 */
876
877 ln = node_alloc(net);
878
879 if (!ln)
880 return ERR_PTR(-ENOMEM);
881
882 ln->fn_bit = plen;
883
884 RCU_INIT_POINTER(ln->parent, pn);
885
886 if (addr_bit_set(&key->addr, plen))
887 RCU_INIT_POINTER(ln->right, fn);
888 else
889 RCU_INIT_POINTER(ln->left, fn);
890
891 rcu_assign_pointer(fn->parent, ln);
892
893 if (dir)
894 rcu_assign_pointer(pn->right, ln);
895 else
896 rcu_assign_pointer(pn->left, ln);
897 }
898 return ln;
899}
900
901static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
902 const struct fib6_info *match,
903 const struct fib6_table *table)
904{
905 int cpu;
906
907 if (!fib6_nh->rt6i_pcpu)
908 return;
909
910 rcu_read_lock();
911 /* release the reference to this fib entry from
912 * all of its cached pcpu routes
913 */
914 for_each_possible_cpu(cpu) {
915 struct rt6_info **ppcpu_rt;
916 struct rt6_info *pcpu_rt;
917
918 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
919
920 /* Paired with xchg() in rt6_get_pcpu_route() */
921 pcpu_rt = READ_ONCE(*ppcpu_rt);
922
923 /* only dropping the 'from' reference if the cached route
924 * is using 'match'. The cached pcpu_rt->from only changes
925 * from a fib6_info to NULL (ip6_dst_destroy); it can never
926 * change from one fib6_info reference to another
927 */
928 if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
929 struct fib6_info *from;
930
931 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
932 fib6_info_release(from);
933 }
934 }
935 rcu_read_unlock();
936}
937
938struct fib6_nh_pcpu_arg {
939 struct fib6_info *from;
940 const struct fib6_table *table;
941};
942
943static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
944{
945 struct fib6_nh_pcpu_arg *arg = _arg;
946
947 __fib6_drop_pcpu_from(nh, arg->from, arg->table);
948 return 0;
949}
950
951static void fib6_drop_pcpu_from(struct fib6_info *f6i,
952 const struct fib6_table *table)
953{
954 /* Make sure rt6_make_pcpu_route() wont add other percpu routes
955 * while we are cleaning them here.
956 */
957 f6i->fib6_destroying = 1;
958 mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
959
960 if (f6i->nh) {
961 struct fib6_nh_pcpu_arg arg = {
962 .from = f6i,
963 .table = table
964 };
965
966 nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
967 &arg);
968 } else {
969 struct fib6_nh *fib6_nh;
970
971 fib6_nh = f6i->fib6_nh;
972 __fib6_drop_pcpu_from(fib6_nh, f6i, table);
973 }
974}
975
976static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
977 struct net *net)
978{
979 struct fib6_table *table = rt->fib6_table;
980
981 /* Flush all cached dst in exception table */
982 rt6_flush_exceptions(rt);
983 fib6_drop_pcpu_from(rt, table);
984
985 if (rt->nh && !list_empty(&rt->nh_list))
986 list_del_init(&rt->nh_list);
987
988 if (refcount_read(&rt->fib6_ref) != 1) {
989 /* This route is used as dummy address holder in some split
990 * nodes. It is not leaked, but it still holds other resources,
991 * which must be released in time. So, scan ascendant nodes
992 * and replace dummy references to this route with references
993 * to still alive ones.
994 */
995 while (fn) {
996 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
997 lockdep_is_held(&table->tb6_lock));
998 struct fib6_info *new_leaf;
999 if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
1000 new_leaf = fib6_find_prefix(net, table, fn);
1001 fib6_info_hold(new_leaf);
1002
1003 rcu_assign_pointer(fn->leaf, new_leaf);
1004 fib6_info_release(rt);
1005 }
1006 fn = rcu_dereference_protected(fn->parent,
1007 lockdep_is_held(&table->tb6_lock));
1008 }
1009 }
1010}
1011
1012/*
1013 * Insert routing information in a node.
1014 */
1015
1016static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1017 struct nl_info *info,
1018 struct netlink_ext_ack *extack)
1019{
1020 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1021 lockdep_is_held(&rt->fib6_table->tb6_lock));
1022 struct fib6_info *iter = NULL;
1023 struct fib6_info __rcu **ins;
1024 struct fib6_info __rcu **fallback_ins = NULL;
1025 int replace = (info->nlh &&
1026 (info->nlh->nlmsg_flags & NLM_F_REPLACE));
1027 int add = (!info->nlh ||
1028 (info->nlh->nlmsg_flags & NLM_F_CREATE));
1029 int found = 0;
1030 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1031 u16 nlflags = NLM_F_EXCL;
1032 int err;
1033
1034 if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
1035 nlflags |= NLM_F_APPEND;
1036
1037 ins = &fn->leaf;
1038
1039 for (iter = leaf; iter;
1040 iter = rcu_dereference_protected(iter->fib6_next,
1041 lockdep_is_held(&rt->fib6_table->tb6_lock))) {
1042 /*
1043 * Search for duplicates
1044 */
1045
1046 if (iter->fib6_metric == rt->fib6_metric) {
1047 /*
1048 * Same priority level
1049 */
1050 if (info->nlh &&
1051 (info->nlh->nlmsg_flags & NLM_F_EXCL))
1052 return -EEXIST;
1053
1054 nlflags &= ~NLM_F_EXCL;
1055 if (replace) {
1056 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
1057 found++;
1058 break;
1059 }
1060 fallback_ins = fallback_ins ?: ins;
1061 goto next_iter;
1062 }
1063
1064 if (rt6_duplicate_nexthop(iter, rt)) {
1065 if (rt->fib6_nsiblings)
1066 rt->fib6_nsiblings = 0;
1067 if (!(iter->fib6_flags & RTF_EXPIRES))
1068 return -EEXIST;
1069 if (!(rt->fib6_flags & RTF_EXPIRES))
1070 fib6_clean_expires(iter);
1071 else
1072 fib6_set_expires(iter, rt->expires);
1073
1074 if (rt->fib6_pmtu)
1075 fib6_metric_set(iter, RTAX_MTU,
1076 rt->fib6_pmtu);
1077 return -EEXIST;
1078 }
1079 /* If we have the same destination and the same metric,
1080 * but not the same gateway, then the route we try to
1081 * add is sibling to this route, increment our counter
1082 * of siblings, and later we will add our route to the
1083 * list.
1084 * Only static routes (which don't have flag
1085 * RTF_EXPIRES) are used for ECMPv6.
1086 *
1087 * To avoid long list, we only had siblings if the
1088 * route have a gateway.
1089 */
1090 if (rt_can_ecmp &&
1091 rt6_qualify_for_ecmp(iter))
1092 rt->fib6_nsiblings++;
1093 }
1094
1095 if (iter->fib6_metric > rt->fib6_metric)
1096 break;
1097
1098next_iter:
1099 ins = &iter->fib6_next;
1100 }
1101
1102 if (fallback_ins && !found) {
1103 /* No matching route with same ecmp-able-ness found, replace
1104 * first matching route
1105 */
1106 ins = fallback_ins;
1107 iter = rcu_dereference_protected(*ins,
1108 lockdep_is_held(&rt->fib6_table->tb6_lock));
1109 found++;
1110 }
1111
1112 /* Reset round-robin state, if necessary */
1113 if (ins == &fn->leaf)
1114 fn->rr_ptr = NULL;
1115
1116 /* Link this route to others same route. */
1117 if (rt->fib6_nsiblings) {
1118 unsigned int fib6_nsiblings;
1119 struct fib6_info *sibling, *temp_sibling;
1120
1121 /* Find the first route that have the same metric */
1122 sibling = leaf;
1123 while (sibling) {
1124 if (sibling->fib6_metric == rt->fib6_metric &&
1125 rt6_qualify_for_ecmp(sibling)) {
1126 list_add_tail(&rt->fib6_siblings,
1127 &sibling->fib6_siblings);
1128 break;
1129 }
1130 sibling = rcu_dereference_protected(sibling->fib6_next,
1131 lockdep_is_held(&rt->fib6_table->tb6_lock));
1132 }
1133 /* For each sibling in the list, increment the counter of
1134 * siblings. BUG() if counters does not match, list of siblings
1135 * is broken!
1136 */
1137 fib6_nsiblings = 0;
1138 list_for_each_entry_safe(sibling, temp_sibling,
1139 &rt->fib6_siblings, fib6_siblings) {
1140 sibling->fib6_nsiblings++;
1141 BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1142 fib6_nsiblings++;
1143 }
1144 BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1145 rt6_multipath_rebalance(temp_sibling);
1146 }
1147
1148 /*
1149 * insert node
1150 */
1151 if (!replace) {
1152 if (!add)
1153 pr_warn("NLM_F_CREATE should be set when creating new route\n");
1154
1155add:
1156 nlflags |= NLM_F_CREATE;
1157
1158 if (!info->skip_notify_kernel) {
1159 err = call_fib6_entry_notifiers(info->nl_net,
1160 FIB_EVENT_ENTRY_ADD,
1161 rt, extack);
1162 if (err) {
1163 struct fib6_info *sibling, *next_sibling;
1164
1165 /* If the route has siblings, then it first
1166 * needs to be unlinked from them.
1167 */
1168 if (!rt->fib6_nsiblings)
1169 return err;
1170
1171 list_for_each_entry_safe(sibling, next_sibling,
1172 &rt->fib6_siblings,
1173 fib6_siblings)
1174 sibling->fib6_nsiblings--;
1175 rt->fib6_nsiblings = 0;
1176 list_del_init(&rt->fib6_siblings);
1177 rt6_multipath_rebalance(next_sibling);
1178 return err;
1179 }
1180 }
1181
1182 rcu_assign_pointer(rt->fib6_next, iter);
1183 fib6_info_hold(rt);
1184 rcu_assign_pointer(rt->fib6_node, fn);
1185 rcu_assign_pointer(*ins, rt);
1186 if (!info->skip_notify)
1187 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1188 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1189
1190 if (!(fn->fn_flags & RTN_RTINFO)) {
1191 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1192 fn->fn_flags |= RTN_RTINFO;
1193 }
1194
1195 } else {
1196 int nsiblings;
1197
1198 if (!found) {
1199 if (add)
1200 goto add;
1201 pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1202 return -ENOENT;
1203 }
1204
1205 if (!info->skip_notify_kernel) {
1206 err = call_fib6_entry_notifiers(info->nl_net,
1207 FIB_EVENT_ENTRY_REPLACE,
1208 rt, extack);
1209 if (err)
1210 return err;
1211 }
1212
1213 fib6_info_hold(rt);
1214 rcu_assign_pointer(rt->fib6_node, fn);
1215 rt->fib6_next = iter->fib6_next;
1216 rcu_assign_pointer(*ins, rt);
1217 if (!info->skip_notify)
1218 inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1219 if (!(fn->fn_flags & RTN_RTINFO)) {
1220 info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1221 fn->fn_flags |= RTN_RTINFO;
1222 }
1223 nsiblings = iter->fib6_nsiblings;
1224 iter->fib6_node = NULL;
1225 fib6_purge_rt(iter, fn, info->nl_net);
1226 if (rcu_access_pointer(fn->rr_ptr) == iter)
1227 fn->rr_ptr = NULL;
1228 fib6_info_release(iter);
1229
1230 if (nsiblings) {
1231 /* Replacing an ECMP route, remove all siblings */
1232 ins = &rt->fib6_next;
1233 iter = rcu_dereference_protected(*ins,
1234 lockdep_is_held(&rt->fib6_table->tb6_lock));
1235 while (iter) {
1236 if (iter->fib6_metric > rt->fib6_metric)
1237 break;
1238 if (rt6_qualify_for_ecmp(iter)) {
1239 *ins = iter->fib6_next;
1240 iter->fib6_node = NULL;
1241 fib6_purge_rt(iter, fn, info->nl_net);
1242 if (rcu_access_pointer(fn->rr_ptr) == iter)
1243 fn->rr_ptr = NULL;
1244 fib6_info_release(iter);
1245 nsiblings--;
1246 info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1247 } else {
1248 ins = &iter->fib6_next;
1249 }
1250 iter = rcu_dereference_protected(*ins,
1251 lockdep_is_held(&rt->fib6_table->tb6_lock));
1252 }
1253 WARN_ON(nsiblings != 0);
1254 }
1255 }
1256
1257 return 0;
1258}
1259
1260static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1261{
1262 if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1263 (rt->fib6_flags & RTF_EXPIRES))
1264 mod_timer(&net->ipv6.ip6_fib_timer,
1265 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1266}
1267
1268void fib6_force_start_gc(struct net *net)
1269{
1270 if (!timer_pending(&net->ipv6.ip6_fib_timer))
1271 mod_timer(&net->ipv6.ip6_fib_timer,
1272 jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1273}
1274
1275static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1276 int sernum)
1277{
1278 struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1279 lockdep_is_held(&rt->fib6_table->tb6_lock));
1280
1281 /* paired with smp_rmb() in rt6_get_cookie_safe() */
1282 smp_wmb();
1283 while (fn) {
1284 WRITE_ONCE(fn->fn_sernum, sernum);
1285 fn = rcu_dereference_protected(fn->parent,
1286 lockdep_is_held(&rt->fib6_table->tb6_lock));
1287 }
1288}
1289
1290void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
1291{
1292 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
1293}
1294
1295/* allow ipv4 to update sernum via ipv6_stub */
1296void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1297{
1298 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1299 fib6_update_sernum_upto_root(net, f6i);
1300 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1301}
1302
1303/*
1304 * Add routing information to the routing tree.
1305 * <destination addr>/<source addr>
1306 * with source addr info in sub-trees
1307 * Need to own table->tb6_lock
1308 */
1309
1310int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1311 struct nl_info *info, struct netlink_ext_ack *extack)
1312{
1313 struct fib6_table *table = rt->fib6_table;
1314 struct fib6_node *fn;
1315#ifdef CONFIG_IPV6_SUBTREES
1316 struct fib6_node *pn = NULL;
1317#endif
1318 int err = -ENOMEM;
1319 int allow_create = 1;
1320 int replace_required = 0;
1321
1322 if (info->nlh) {
1323 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1324 allow_create = 0;
1325 if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
1326 replace_required = 1;
1327 }
1328 if (!allow_create && !replace_required)
1329 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1330
1331 fn = fib6_add_1(info->nl_net, table, root,
1332 &rt->fib6_dst.addr, rt->fib6_dst.plen,
1333 offsetof(struct fib6_info, fib6_dst), allow_create,
1334 replace_required, extack);
1335 if (IS_ERR(fn)) {
1336 err = PTR_ERR(fn);
1337 fn = NULL;
1338 goto out;
1339 }
1340
1341#ifdef CONFIG_IPV6_SUBTREES
1342 pn = fn;
1343
1344 if (rt->fib6_src.plen) {
1345 struct fib6_node *sn;
1346
1347 if (!rcu_access_pointer(fn->subtree)) {
1348 struct fib6_node *sfn;
1349
1350 /*
1351 * Create subtree.
1352 *
1353 * fn[main tree]
1354 * |
1355 * sfn[subtree root]
1356 * \
1357 * sn[new leaf node]
1358 */
1359
1360 /* Create subtree root node */
1361 sfn = node_alloc(info->nl_net);
1362 if (!sfn)
1363 goto failure;
1364
1365 fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
1366 rcu_assign_pointer(sfn->leaf,
1367 info->nl_net->ipv6.fib6_null_entry);
1368 sfn->fn_flags = RTN_ROOT;
1369
1370 /* Now add the first leaf node to new subtree */
1371
1372 sn = fib6_add_1(info->nl_net, table, sfn,
1373 &rt->fib6_src.addr, rt->fib6_src.plen,
1374 offsetof(struct fib6_info, fib6_src),
1375 allow_create, replace_required, extack);
1376
1377 if (IS_ERR(sn)) {
1378 /* If it is failed, discard just allocated
1379 root, and then (in failure) stale node
1380 in main tree.
1381 */
1382 node_free_immediate(info->nl_net, sfn);
1383 err = PTR_ERR(sn);
1384 goto failure;
1385 }
1386
1387 /* Now link new subtree to main tree */
1388 rcu_assign_pointer(sfn->parent, fn);
1389 rcu_assign_pointer(fn->subtree, sfn);
1390 } else {
1391 sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1392 &rt->fib6_src.addr, rt->fib6_src.plen,
1393 offsetof(struct fib6_info, fib6_src),
1394 allow_create, replace_required, extack);
1395
1396 if (IS_ERR(sn)) {
1397 err = PTR_ERR(sn);
1398 goto failure;
1399 }
1400 }
1401
1402 if (!rcu_access_pointer(fn->leaf)) {
1403 if (fn->fn_flags & RTN_TL_ROOT) {
1404 /* put back null_entry for root node */
1405 rcu_assign_pointer(fn->leaf,
1406 info->nl_net->ipv6.fib6_null_entry);
1407 } else {
1408 fib6_info_hold(rt);
1409 rcu_assign_pointer(fn->leaf, rt);
1410 }
1411 }
1412 fn = sn;
1413 }
1414#endif
1415
1416 err = fib6_add_rt2node(fn, rt, info, extack);
1417 if (!err) {
1418 if (rt->nh)
1419 list_add(&rt->nh_list, &rt->nh->f6i_list);
1420 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
1421 fib6_start_gc(info->nl_net, rt);
1422 }
1423
1424out:
1425 if (err) {
1426#ifdef CONFIG_IPV6_SUBTREES
1427 /*
1428 * If fib6_add_1 has cleared the old leaf pointer in the
1429 * super-tree leaf node we have to find a new one for it.
1430 */
1431 if (pn != fn) {
1432 struct fib6_info *pn_leaf =
1433 rcu_dereference_protected(pn->leaf,
1434 lockdep_is_held(&table->tb6_lock));
1435 if (pn_leaf == rt) {
1436 pn_leaf = NULL;
1437 RCU_INIT_POINTER(pn->leaf, NULL);
1438 fib6_info_release(rt);
1439 }
1440 if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1441 pn_leaf = fib6_find_prefix(info->nl_net, table,
1442 pn);
1443 if (!pn_leaf)
1444 pn_leaf =
1445 info->nl_net->ipv6.fib6_null_entry;
1446 fib6_info_hold(pn_leaf);
1447 rcu_assign_pointer(pn->leaf, pn_leaf);
1448 }
1449 }
1450#endif
1451 goto failure;
1452 }
1453 return err;
1454
1455failure:
1456 /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1457 * 1. fn is an intermediate node and we failed to add the new
1458 * route to it in both subtree creation failure and fib6_add_rt2node()
1459 * failure case.
1460 * 2. fn is the root node in the table and we fail to add the first
1461 * default route to it.
1462 */
1463 if (fn &&
1464 (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
1465 (fn->fn_flags & RTN_TL_ROOT &&
1466 !rcu_access_pointer(fn->leaf))))
1467 fib6_repair_tree(info->nl_net, table, fn);
1468 return err;
1469}
1470
1471/*
1472 * Routing tree lookup
1473 *
1474 */
1475
1476struct lookup_args {
1477 int offset; /* key offset on fib6_info */
1478 const struct in6_addr *addr; /* search key */
1479};
1480
1481static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
1482 struct lookup_args *args)
1483{
1484 struct fib6_node *fn;
1485 __be32 dir;
1486
1487 if (unlikely(args->offset == 0))
1488 return NULL;
1489
1490 /*
1491 * Descend on a tree
1492 */
1493
1494 fn = root;
1495
1496 for (;;) {
1497 struct fib6_node *next;
1498
1499 dir = addr_bit_set(args->addr, fn->fn_bit);
1500
1501 next = dir ? rcu_dereference(fn->right) :
1502 rcu_dereference(fn->left);
1503
1504 if (next) {
1505 fn = next;
1506 continue;
1507 }
1508 break;
1509 }
1510
1511 while (fn) {
1512 struct fib6_node *subtree = FIB6_SUBTREE(fn);
1513
1514 if (subtree || fn->fn_flags & RTN_RTINFO) {
1515 struct fib6_info *leaf = rcu_dereference(fn->leaf);
1516 struct rt6key *key;
1517
1518 if (!leaf)
1519 goto backtrack;
1520
1521 key = (struct rt6key *) ((u8 *)leaf + args->offset);
1522
1523 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1524#ifdef CONFIG_IPV6_SUBTREES
1525 if (subtree) {
1526 struct fib6_node *sfn;
1527 sfn = fib6_node_lookup_1(subtree,
1528 args + 1);
1529 if (!sfn)
1530 goto backtrack;
1531 fn = sfn;
1532 }
1533#endif
1534 if (fn->fn_flags & RTN_RTINFO)
1535 return fn;
1536 }
1537 }
1538backtrack:
1539 if (fn->fn_flags & RTN_ROOT)
1540 break;
1541
1542 fn = rcu_dereference(fn->parent);
1543 }
1544
1545 return NULL;
1546}
1547
1548/* called with rcu_read_lock() held
1549 */
1550struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1551 const struct in6_addr *daddr,
1552 const struct in6_addr *saddr)
1553{
1554 struct fib6_node *fn;
1555 struct lookup_args args[] = {
1556 {
1557 .offset = offsetof(struct fib6_info, fib6_dst),
1558 .addr = daddr,
1559 },
1560#ifdef CONFIG_IPV6_SUBTREES
1561 {
1562 .offset = offsetof(struct fib6_info, fib6_src),
1563 .addr = saddr,
1564 },
1565#endif
1566 {
1567 .offset = 0, /* sentinel */
1568 }
1569 };
1570
1571 fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1572 if (!fn || fn->fn_flags & RTN_TL_ROOT)
1573 fn = root;
1574
1575 return fn;
1576}
1577
1578/*
1579 * Get node with specified destination prefix (and source prefix,
1580 * if subtrees are used)
1581 * exact_match == true means we try to find fn with exact match of
1582 * the passed in prefix addr
1583 * exact_match == false means we try to find fn with longest prefix
1584 * match of the passed in prefix addr. This is useful for finding fn
1585 * for cached route as it will be stored in the exception table under
1586 * the node with longest prefix length.
1587 */
1588
1589
1590static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1591 const struct in6_addr *addr,
1592 int plen, int offset,
1593 bool exact_match)
1594{
1595 struct fib6_node *fn, *prev = NULL;
1596
1597 for (fn = root; fn ; ) {
1598 struct fib6_info *leaf = rcu_dereference(fn->leaf);
1599 struct rt6key *key;
1600
1601 /* This node is being deleted */
1602 if (!leaf) {
1603 if (plen <= fn->fn_bit)
1604 goto out;
1605 else
1606 goto next;
1607 }
1608
1609 key = (struct rt6key *)((u8 *)leaf + offset);
1610
1611 /*
1612 * Prefix match
1613 */
1614 if (plen < fn->fn_bit ||
1615 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1616 goto out;
1617
1618 if (plen == fn->fn_bit)
1619 return fn;
1620
1621 if (fn->fn_flags & RTN_RTINFO)
1622 prev = fn;
1623
1624next:
1625 /*
1626 * We have more bits to go
1627 */
1628 if (addr_bit_set(addr, fn->fn_bit))
1629 fn = rcu_dereference(fn->right);
1630 else
1631 fn = rcu_dereference(fn->left);
1632 }
1633out:
1634 if (exact_match)
1635 return NULL;
1636 else
1637 return prev;
1638}
1639
1640struct fib6_node *fib6_locate(struct fib6_node *root,
1641 const struct in6_addr *daddr, int dst_len,
1642 const struct in6_addr *saddr, int src_len,
1643 bool exact_match)
1644{
1645 struct fib6_node *fn;
1646
1647 fn = fib6_locate_1(root, daddr, dst_len,
1648 offsetof(struct fib6_info, fib6_dst),
1649 exact_match);
1650
1651#ifdef CONFIG_IPV6_SUBTREES
1652 if (src_len) {
1653 WARN_ON(saddr == NULL);
1654 if (fn) {
1655 struct fib6_node *subtree = FIB6_SUBTREE(fn);
1656
1657 if (subtree) {
1658 fn = fib6_locate_1(subtree, saddr, src_len,
1659 offsetof(struct fib6_info, fib6_src),
1660 exact_match);
1661 }
1662 }
1663 }
1664#endif
1665
1666 if (fn && fn->fn_flags & RTN_RTINFO)
1667 return fn;
1668
1669 return NULL;
1670}
1671
1672
1673/*
1674 * Deletion
1675 *
1676 */
1677
1678static struct fib6_info *fib6_find_prefix(struct net *net,
1679 struct fib6_table *table,
1680 struct fib6_node *fn)
1681{
1682 struct fib6_node *child_left, *child_right;
1683
1684 if (fn->fn_flags & RTN_ROOT)
1685 return net->ipv6.fib6_null_entry;
1686
1687 while (fn) {
1688 child_left = rcu_dereference_protected(fn->left,
1689 lockdep_is_held(&table->tb6_lock));
1690 child_right = rcu_dereference_protected(fn->right,
1691 lockdep_is_held(&table->tb6_lock));
1692 if (child_left)
1693 return rcu_dereference_protected(child_left->leaf,
1694 lockdep_is_held(&table->tb6_lock));
1695 if (child_right)
1696 return rcu_dereference_protected(child_right->leaf,
1697 lockdep_is_held(&table->tb6_lock));
1698
1699 fn = FIB6_SUBTREE(fn);
1700 }
1701 return NULL;
1702}
1703
1704/*
1705 * Called to trim the tree of intermediate nodes when possible. "fn"
1706 * is the node we want to try and remove.
1707 * Need to own table->tb6_lock
1708 */
1709
1710static struct fib6_node *fib6_repair_tree(struct net *net,
1711 struct fib6_table *table,
1712 struct fib6_node *fn)
1713{
1714 int children;
1715 int nstate;
1716 struct fib6_node *child;
1717 struct fib6_walker *w;
1718 int iter = 0;
1719
1720 /* Set fn->leaf to null_entry for root node. */
1721 if (fn->fn_flags & RTN_TL_ROOT) {
1722 rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1723 return fn;
1724 }
1725
1726 for (;;) {
1727 struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
1728 lockdep_is_held(&table->tb6_lock));
1729 struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
1730 lockdep_is_held(&table->tb6_lock));
1731 struct fib6_node *pn = rcu_dereference_protected(fn->parent,
1732 lockdep_is_held(&table->tb6_lock));
1733 struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
1734 lockdep_is_held(&table->tb6_lock));
1735 struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1736 lockdep_is_held(&table->tb6_lock));
1737 struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1738 lockdep_is_held(&table->tb6_lock));
1739 struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1740 lockdep_is_held(&table->tb6_lock));
1741 struct fib6_info *new_fn_leaf;
1742
1743 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1744 iter++;
1745
1746 WARN_ON(fn->fn_flags & RTN_RTINFO);
1747 WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1748 WARN_ON(fn_leaf);
1749
1750 children = 0;
1751 child = NULL;
1752 if (fn_r)
1753 child = fn_r, children |= 1;
1754 if (fn_l)
1755 child = fn_l, children |= 2;
1756
1757 if (children == 3 || FIB6_SUBTREE(fn)
1758#ifdef CONFIG_IPV6_SUBTREES
1759 /* Subtree root (i.e. fn) may have one child */
1760 || (children && fn->fn_flags & RTN_ROOT)
1761#endif
1762 ) {
1763 new_fn_leaf = fib6_find_prefix(net, table, fn);
1764#if RT6_DEBUG >= 2
1765 if (!new_fn_leaf) {
1766 WARN_ON(!new_fn_leaf);
1767 new_fn_leaf = net->ipv6.fib6_null_entry;
1768 }
1769#endif
1770 fib6_info_hold(new_fn_leaf);
1771 rcu_assign_pointer(fn->leaf, new_fn_leaf);
1772 return pn;
1773 }
1774
1775#ifdef CONFIG_IPV6_SUBTREES
1776 if (FIB6_SUBTREE(pn) == fn) {
1777 WARN_ON(!(fn->fn_flags & RTN_ROOT));
1778 RCU_INIT_POINTER(pn->subtree, NULL);
1779 nstate = FWS_L;
1780 } else {
1781 WARN_ON(fn->fn_flags & RTN_ROOT);
1782#endif
1783 if (pn_r == fn)
1784 rcu_assign_pointer(pn->right, child);
1785 else if (pn_l == fn)
1786 rcu_assign_pointer(pn->left, child);
1787#if RT6_DEBUG >= 2
1788 else
1789 WARN_ON(1);
1790#endif
1791 if (child)
1792 rcu_assign_pointer(child->parent, pn);
1793 nstate = FWS_R;
1794#ifdef CONFIG_IPV6_SUBTREES
1795 }
1796#endif
1797
1798 read_lock(&net->ipv6.fib6_walker_lock);
1799 FOR_WALKERS(net, w) {
1800 if (!child) {
1801 if (w->node == fn) {
1802 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1803 w->node = pn;
1804 w->state = nstate;
1805 }
1806 } else {
1807 if (w->node == fn) {
1808 w->node = child;
1809 if (children&2) {
1810 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1811 w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1812 } else {
1813 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1814 w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1815 }
1816 }
1817 }
1818 }
1819 read_unlock(&net->ipv6.fib6_walker_lock);
1820
1821 node_free(net, fn);
1822 if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1823 return pn;
1824
1825 RCU_INIT_POINTER(pn->leaf, NULL);
1826 fib6_info_release(pn_leaf);
1827 fn = pn;
1828 }
1829}
1830
1831static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1832 struct fib6_info __rcu **rtp, struct nl_info *info)
1833{
1834 struct fib6_walker *w;
1835 struct fib6_info *rt = rcu_dereference_protected(*rtp,
1836 lockdep_is_held(&table->tb6_lock));
1837 struct net *net = info->nl_net;
1838
1839 RT6_TRACE("fib6_del_route\n");
1840
1841 /* Unlink it */
1842 *rtp = rt->fib6_next;
1843 rt->fib6_node = NULL;
1844 net->ipv6.rt6_stats->fib_rt_entries--;
1845 net->ipv6.rt6_stats->fib_discarded_routes++;
1846
1847 /* Reset round-robin state, if necessary */
1848 if (rcu_access_pointer(fn->rr_ptr) == rt)
1849 fn->rr_ptr = NULL;
1850
1851 /* Remove this entry from other siblings */
1852 if (rt->fib6_nsiblings) {
1853 struct fib6_info *sibling, *next_sibling;
1854
1855 list_for_each_entry_safe(sibling, next_sibling,
1856 &rt->fib6_siblings, fib6_siblings)
1857 sibling->fib6_nsiblings--;
1858 rt->fib6_nsiblings = 0;
1859 list_del_init(&rt->fib6_siblings);
1860 rt6_multipath_rebalance(next_sibling);
1861 }
1862
1863 /* Adjust walkers */
1864 read_lock(&net->ipv6.fib6_walker_lock);
1865 FOR_WALKERS(net, w) {
1866 if (w->state == FWS_C && w->leaf == rt) {
1867 RT6_TRACE("walker %p adjusted by delroute\n", w);
1868 w->leaf = rcu_dereference_protected(rt->fib6_next,
1869 lockdep_is_held(&table->tb6_lock));
1870 if (!w->leaf)
1871 w->state = FWS_U;
1872 }
1873 }
1874 read_unlock(&net->ipv6.fib6_walker_lock);
1875
1876 /* If it was last route, call fib6_repair_tree() to:
1877 * 1. For root node, put back null_entry as how the table was created.
1878 * 2. For other nodes, expunge its radix tree node.
1879 */
1880 if (!rcu_access_pointer(fn->leaf)) {
1881 if (!(fn->fn_flags & RTN_TL_ROOT)) {
1882 fn->fn_flags &= ~RTN_RTINFO;
1883 net->ipv6.rt6_stats->fib_route_nodes--;
1884 }
1885 fn = fib6_repair_tree(net, table, fn);
1886 }
1887
1888 fib6_purge_rt(rt, fn, net);
1889
1890 if (!info->skip_notify_kernel)
1891 call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
1892 if (!info->skip_notify)
1893 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1894
1895 fib6_info_release(rt);
1896}
1897
1898/* Need to own table->tb6_lock */
1899int fib6_del(struct fib6_info *rt, struct nl_info *info)
1900{
1901 struct net *net = info->nl_net;
1902 struct fib6_info __rcu **rtp;
1903 struct fib6_info __rcu **rtp_next;
1904 struct fib6_table *table;
1905 struct fib6_node *fn;
1906
1907 if (rt == net->ipv6.fib6_null_entry)
1908 return -ENOENT;
1909
1910 table = rt->fib6_table;
1911 fn = rcu_dereference_protected(rt->fib6_node,
1912 lockdep_is_held(&table->tb6_lock));
1913 if (!fn)
1914 return -ENOENT;
1915
1916 WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1917
1918 /*
1919 * Walk the leaf entries looking for ourself
1920 */
1921
1922 for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
1923 struct fib6_info *cur = rcu_dereference_protected(*rtp,
1924 lockdep_is_held(&table->tb6_lock));
1925 if (rt == cur) {
1926 fib6_del_route(table, fn, rtp, info);
1927 return 0;
1928 }
1929 rtp_next = &cur->fib6_next;
1930 }
1931 return -ENOENT;
1932}
1933
1934/*
1935 * Tree traversal function.
1936 *
1937 * Certainly, it is not interrupt safe.
1938 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1939 * It means, that we can modify tree during walking
1940 * and use this function for garbage collection, clone pruning,
1941 * cleaning tree when a device goes down etc. etc.
1942 *
1943 * It guarantees that every node will be traversed,
1944 * and that it will be traversed only once.
1945 *
1946 * Callback function w->func may return:
1947 * 0 -> continue walking.
1948 * positive value -> walking is suspended (used by tree dumps,
1949 * and probably by gc, if it will be split to several slices)
1950 * negative value -> terminate walking.
1951 *
1952 * The function itself returns:
1953 * 0 -> walk is complete.
1954 * >0 -> walk is incomplete (i.e. suspended)
1955 * <0 -> walk is terminated by an error.
1956 *
1957 * This function is called with tb6_lock held.
1958 */
1959
1960static int fib6_walk_continue(struct fib6_walker *w)
1961{
1962 struct fib6_node *fn, *pn, *left, *right;
1963
1964 /* w->root should always be table->tb6_root */
1965 WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
1966
1967 for (;;) {
1968 fn = w->node;
1969 if (!fn)
1970 return 0;
1971
1972 switch (w->state) {
1973#ifdef CONFIG_IPV6_SUBTREES
1974 case FWS_S:
1975 if (FIB6_SUBTREE(fn)) {
1976 w->node = FIB6_SUBTREE(fn);
1977 continue;
1978 }
1979 w->state = FWS_L;
1980#endif
1981 /* fall through */
1982 case FWS_L:
1983 left = rcu_dereference_protected(fn->left, 1);
1984 if (left) {
1985 w->node = left;
1986 w->state = FWS_INIT;
1987 continue;
1988 }
1989 w->state = FWS_R;
1990 /* fall through */
1991 case FWS_R:
1992 right = rcu_dereference_protected(fn->right, 1);
1993 if (right) {
1994 w->node = right;
1995 w->state = FWS_INIT;
1996 continue;
1997 }
1998 w->state = FWS_C;
1999 w->leaf = rcu_dereference_protected(fn->leaf, 1);
2000 /* fall through */
2001 case FWS_C:
2002 if (w->leaf && fn->fn_flags & RTN_RTINFO) {
2003 int err;
2004
2005 if (w->skip) {
2006 w->skip--;
2007 goto skip;
2008 }
2009
2010 err = w->func(w);
2011 if (err)
2012 return err;
2013
2014 w->count++;
2015 continue;
2016 }
2017skip:
2018 w->state = FWS_U;
2019 /* fall through */
2020 case FWS_U:
2021 if (fn == w->root)
2022 return 0;
2023 pn = rcu_dereference_protected(fn->parent, 1);
2024 left = rcu_dereference_protected(pn->left, 1);
2025 right = rcu_dereference_protected(pn->right, 1);
2026 w->node = pn;
2027#ifdef CONFIG_IPV6_SUBTREES
2028 if (FIB6_SUBTREE(pn) == fn) {
2029 WARN_ON(!(fn->fn_flags & RTN_ROOT));
2030 w->state = FWS_L;
2031 continue;
2032 }
2033#endif
2034 if (left == fn) {
2035 w->state = FWS_R;
2036 continue;
2037 }
2038 if (right == fn) {
2039 w->state = FWS_C;
2040 w->leaf = rcu_dereference_protected(w->node->leaf, 1);
2041 continue;
2042 }
2043#if RT6_DEBUG >= 2
2044 WARN_ON(1);
2045#endif
2046 }
2047 }
2048}
2049
2050static int fib6_walk(struct net *net, struct fib6_walker *w)
2051{
2052 int res;
2053
2054 w->state = FWS_INIT;
2055 w->node = w->root;
2056
2057 fib6_walker_link(net, w);
2058 res = fib6_walk_continue(w);
2059 if (res <= 0)
2060 fib6_walker_unlink(net, w);
2061 return res;
2062}
2063
2064static int fib6_clean_node(struct fib6_walker *w)
2065{
2066 int res;
2067 struct fib6_info *rt;
2068 struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
2069 struct nl_info info = {
2070 .nl_net = c->net,
2071 .skip_notify = c->skip_notify,
2072 };
2073
2074 if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
2075 READ_ONCE(w->node->fn_sernum) != c->sernum)
2076 WRITE_ONCE(w->node->fn_sernum, c->sernum);
2077
2078 if (!c->func) {
2079 WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
2080 w->leaf = NULL;
2081 return 0;
2082 }
2083
2084 for_each_fib6_walker_rt(w) {
2085 res = c->func(rt, c->arg);
2086 if (res == -1) {
2087 w->leaf = rt;
2088 res = fib6_del(rt, &info);
2089 if (res) {
2090#if RT6_DEBUG >= 2
2091 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
2092 __func__, rt,
2093 rcu_access_pointer(rt->fib6_node),
2094 res);
2095#endif
2096 continue;
2097 }
2098 return 0;
2099 } else if (res == -2) {
2100 if (WARN_ON(!rt->fib6_nsiblings))
2101 continue;
2102 rt = list_last_entry(&rt->fib6_siblings,
2103 struct fib6_info, fib6_siblings);
2104 continue;
2105 }
2106 WARN_ON(res != 0);
2107 }
2108 w->leaf = rt;
2109 return 0;
2110}
2111
2112/*
2113 * Convenient frontend to tree walker.
2114 *
2115 * func is called on each route.
2116 * It may return -2 -> skip multipath route.
2117 * -1 -> delete this route.
2118 * 0 -> continue walking
2119 */
2120
2121static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2122 int (*func)(struct fib6_info *, void *arg),
2123 int sernum, void *arg, bool skip_notify)
2124{
2125 struct fib6_cleaner c;
2126
2127 c.w.root = root;
2128 c.w.func = fib6_clean_node;
2129 c.w.count = 0;
2130 c.w.skip = 0;
2131 c.w.skip_in_node = 0;
2132 c.func = func;
2133 c.sernum = sernum;
2134 c.arg = arg;
2135 c.net = net;
2136 c.skip_notify = skip_notify;
2137
2138 fib6_walk(net, &c.w);
2139}
2140
2141static void __fib6_clean_all(struct net *net,
2142 int (*func)(struct fib6_info *, void *),
2143 int sernum, void *arg, bool skip_notify)
2144{
2145 struct fib6_table *table;
2146 struct hlist_head *head;
2147 unsigned int h;
2148
2149 rcu_read_lock();
2150 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2151 head = &net->ipv6.fib_table_hash[h];
2152 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2153 spin_lock_bh(&table->tb6_lock);
2154 fib6_clean_tree(net, &table->tb6_root,
2155 func, sernum, arg, skip_notify);
2156 spin_unlock_bh(&table->tb6_lock);
2157 }
2158 }
2159 rcu_read_unlock();
2160}
2161
2162void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2163 void *arg)
2164{
2165 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2166}
2167
2168void fib6_clean_all_skip_notify(struct net *net,
2169 int (*func)(struct fib6_info *, void *),
2170 void *arg)
2171{
2172 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2173}
2174
2175static void fib6_flush_trees(struct net *net)
2176{
2177 int new_sernum = fib6_new_sernum(net);
2178
2179 __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2180}
2181
2182/*
2183 * Garbage collection
2184 */
2185
2186static int fib6_age(struct fib6_info *rt, void *arg)
2187{
2188 struct fib6_gc_args *gc_args = arg;
2189 unsigned long now = jiffies;
2190
2191 /*
2192 * check addrconf expiration here.
2193 * Routes are expired even if they are in use.
2194 */
2195
2196 if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2197 if (time_after(now, rt->expires)) {
2198 RT6_TRACE("expiring %p\n", rt);
2199 return -1;
2200 }
2201 gc_args->more++;
2202 }
2203
2204 /* Also age clones in the exception table.
2205 * Note, that clones are aged out
2206 * only if they are not in use now.
2207 */
2208 rt6_age_exceptions(rt, gc_args, now);
2209
2210 return 0;
2211}
2212
2213void fib6_run_gc(unsigned long expires, struct net *net, bool force)
2214{
2215 struct fib6_gc_args gc_args;
2216 unsigned long now;
2217
2218 if (force) {
2219 spin_lock_bh(&net->ipv6.fib6_gc_lock);
2220 } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
2221 mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
2222 return;
2223 }
2224 gc_args.timeout = expires ? (int)expires :
2225 net->ipv6.sysctl.ip6_rt_gc_interval;
2226 gc_args.more = 0;
2227
2228 fib6_clean_all(net, fib6_age, &gc_args);
2229 now = jiffies;
2230 net->ipv6.ip6_rt_last_gc = now;
2231
2232 if (gc_args.more)
2233 mod_timer(&net->ipv6.ip6_fib_timer,
2234 round_jiffies(now
2235 + net->ipv6.sysctl.ip6_rt_gc_interval));
2236 else
2237 del_timer(&net->ipv6.ip6_fib_timer);
2238 spin_unlock_bh(&net->ipv6.fib6_gc_lock);
2239}
2240
2241static void fib6_gc_timer_cb(struct timer_list *t)
2242{
2243 struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
2244
2245 fib6_run_gc(0, arg, true);
2246}
2247
2248static int __net_init fib6_net_init(struct net *net)
2249{
2250 size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
2251 int err;
2252
2253 err = fib6_notifier_init(net);
2254 if (err)
2255 return err;
2256
2257 spin_lock_init(&net->ipv6.fib6_gc_lock);
2258 rwlock_init(&net->ipv6.fib6_walker_lock);
2259 INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
2260 timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
2261
2262 net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
2263 if (!net->ipv6.rt6_stats)
2264 goto out_timer;
2265
2266 /* Avoid false sharing : Use at least a full cache line */
2267 size = max_t(size_t, size, L1_CACHE_BYTES);
2268
2269 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
2270 if (!net->ipv6.fib_table_hash)
2271 goto out_rt6_stats;
2272
2273 net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
2274 GFP_KERNEL);
2275 if (!net->ipv6.fib6_main_tbl)
2276 goto out_fib_table_hash;
2277
2278 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2279 rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2280 net->ipv6.fib6_null_entry);
2281 net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2282 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2283 inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
2284
2285#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2286 net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
2287 GFP_KERNEL);
2288 if (!net->ipv6.fib6_local_tbl)
2289 goto out_fib6_main_tbl;
2290 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2291 rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2292 net->ipv6.fib6_null_entry);
2293 net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2294 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2295 inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
2296#endif
2297 fib6_tables_init(net);
2298
2299 return 0;
2300
2301#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2302out_fib6_main_tbl:
2303 kfree(net->ipv6.fib6_main_tbl);
2304#endif
2305out_fib_table_hash:
2306 kfree(net->ipv6.fib_table_hash);
2307out_rt6_stats:
2308 kfree(net->ipv6.rt6_stats);
2309out_timer:
2310 fib6_notifier_exit(net);
2311 return -ENOMEM;
2312}
2313
2314static void fib6_net_exit(struct net *net)
2315{
2316 unsigned int i;
2317
2318 del_timer_sync(&net->ipv6.ip6_fib_timer);
2319
2320 for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2321 struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2322 struct hlist_node *tmp;
2323 struct fib6_table *tb;
2324
2325 hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2326 hlist_del(&tb->tb6_hlist);
2327 fib6_free_table(tb);
2328 }
2329 }
2330
2331 kfree(net->ipv6.fib_table_hash);
2332 kfree(net->ipv6.rt6_stats);
2333 fib6_notifier_exit(net);
2334}
2335
2336static struct pernet_operations fib6_net_ops = {
2337 .init = fib6_net_init,
2338 .exit = fib6_net_exit,
2339};
2340
2341int __init fib6_init(void)
2342{
2343 int ret = -ENOMEM;
2344
2345 fib6_node_kmem = kmem_cache_create("fib6_nodes",
2346 sizeof(struct fib6_node),
2347 0, SLAB_HWCACHE_ALIGN,
2348 NULL);
2349 if (!fib6_node_kmem)
2350 goto out;
2351
2352 ret = register_pernet_subsys(&fib6_net_ops);
2353 if (ret)
2354 goto out_kmem_cache_create;
2355
2356 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
2357 inet6_dump_fib, 0);
2358 if (ret)
2359 goto out_unregister_subsys;
2360
2361 __fib6_flush_trees = fib6_flush_trees;
2362out:
2363 return ret;
2364
2365out_unregister_subsys:
2366 unregister_pernet_subsys(&fib6_net_ops);
2367out_kmem_cache_create:
2368 kmem_cache_destroy(fib6_node_kmem);
2369 goto out;
2370}
2371
2372void fib6_gc_cleanup(void)
2373{
2374 unregister_pernet_subsys(&fib6_net_ops);
2375 kmem_cache_destroy(fib6_node_kmem);
2376}
2377
2378#ifdef CONFIG_PROC_FS
2379static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2380{
2381 struct fib6_info *rt = v;
2382 struct ipv6_route_iter *iter = seq->private;
2383 struct fib6_nh *fib6_nh = rt->fib6_nh;
2384 unsigned int flags = rt->fib6_flags;
2385 const struct net_device *dev;
2386
2387 if (rt->nh)
2388 fib6_nh = nexthop_fib6_nh_bh(rt->nh);
2389
2390 seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2391
2392#ifdef CONFIG_IPV6_SUBTREES
2393 seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2394#else
2395 seq_puts(seq, "00000000000000000000000000000000 00 ");
2396#endif
2397 if (fib6_nh->fib_nh_gw_family) {
2398 flags |= RTF_GATEWAY;
2399 seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2400 } else {
2401 seq_puts(seq, "00000000000000000000000000000000");
2402 }
2403
2404 dev = fib6_nh->fib_nh_dev;
2405 seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2406 rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2407 flags, dev ? dev->name : "");
2408 iter->w.leaf = NULL;
2409 return 0;
2410}
2411
2412static int ipv6_route_yield(struct fib6_walker *w)
2413{
2414 struct ipv6_route_iter *iter = w->args;
2415
2416 if (!iter->skip)
2417 return 1;
2418
2419 do {
2420 iter->w.leaf = rcu_dereference_protected(
2421 iter->w.leaf->fib6_next,
2422 lockdep_is_held(&iter->tbl->tb6_lock));
2423 iter->skip--;
2424 if (!iter->skip && iter->w.leaf)
2425 return 1;
2426 } while (iter->w.leaf);
2427
2428 return 0;
2429}
2430
2431static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2432 struct net *net)
2433{
2434 memset(&iter->w, 0, sizeof(iter->w));
2435 iter->w.func = ipv6_route_yield;
2436 iter->w.root = &iter->tbl->tb6_root;
2437 iter->w.state = FWS_INIT;
2438 iter->w.node = iter->w.root;
2439 iter->w.args = iter;
2440 iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
2441 INIT_LIST_HEAD(&iter->w.lh);
2442 fib6_walker_link(net, &iter->w);
2443}
2444
2445static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2446 struct net *net)
2447{
2448 unsigned int h;
2449 struct hlist_node *node;
2450
2451 if (tbl) {
2452 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2453 node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
2454 } else {
2455 h = 0;
2456 node = NULL;
2457 }
2458
2459 while (!node && h < FIB6_TABLE_HASHSZ) {
2460 node = rcu_dereference_bh(
2461 hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2462 }
2463 return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2464}
2465
2466static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2467{
2468 int sernum = READ_ONCE(iter->w.root->fn_sernum);
2469
2470 if (iter->sernum != sernum) {
2471 iter->sernum = sernum;
2472 iter->w.state = FWS_INIT;
2473 iter->w.node = iter->w.root;
2474 WARN_ON(iter->w.skip);
2475 iter->w.skip = iter->w.count;
2476 }
2477}
2478
2479static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2480{
2481 int r;
2482 struct fib6_info *n;
2483 struct net *net = seq_file_net(seq);
2484 struct ipv6_route_iter *iter = seq->private;
2485
2486 ++(*pos);
2487 if (!v)
2488 goto iter_table;
2489
2490 n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
2491 if (n)
2492 return n;
2493
2494iter_table:
2495 ipv6_route_check_sernum(iter);
2496 spin_lock_bh(&iter->tbl->tb6_lock);
2497 r = fib6_walk_continue(&iter->w);
2498 spin_unlock_bh(&iter->tbl->tb6_lock);
2499 if (r > 0) {
2500 return iter->w.leaf;
2501 } else if (r < 0) {
2502 fib6_walker_unlink(net, &iter->w);
2503 return NULL;
2504 }
2505 fib6_walker_unlink(net, &iter->w);
2506
2507 iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
2508 if (!iter->tbl)
2509 return NULL;
2510
2511 ipv6_route_seq_setup_walk(iter, net);
2512 goto iter_table;
2513}
2514
2515static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2516 __acquires(RCU_BH)
2517{
2518 struct net *net = seq_file_net(seq);
2519 struct ipv6_route_iter *iter = seq->private;
2520
2521 rcu_read_lock_bh();
2522 iter->tbl = ipv6_route_seq_next_table(NULL, net);
2523 iter->skip = *pos;
2524
2525 if (iter->tbl) {
2526 loff_t p = 0;
2527
2528 ipv6_route_seq_setup_walk(iter, net);
2529 return ipv6_route_seq_next(seq, NULL, &p);
2530 } else {
2531 return NULL;
2532 }
2533}
2534
2535static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2536{
2537 struct fib6_walker *w = &iter->w;
2538 return w->node && !(w->state == FWS_U && w->node == w->root);
2539}
2540
2541static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2542 __releases(RCU_BH)
2543{
2544 struct net *net = seq_file_net(seq);
2545 struct ipv6_route_iter *iter = seq->private;
2546
2547 if (ipv6_route_iter_active(iter))
2548 fib6_walker_unlink(net, &iter->w);
2549
2550 rcu_read_unlock_bh();
2551}
2552
2553const struct seq_operations ipv6_route_seq_ops = {
2554 .start = ipv6_route_seq_start,
2555 .next = ipv6_route_seq_next,
2556 .stop = ipv6_route_seq_stop,
2557 .show = ipv6_route_seq_show
2558};
2559#endif /* CONFIG_PROC_FS */