/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_policy_failed(struct sk_buff *skb);
static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu,
			       bool confirm_neigh);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

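/* Per-cpu lists of dst entries that live outside the fib6 tree (e.g.
 * RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH lookups).  Tracking
 * them here lets rt6_uncached_list_flush_dev() rehome them to the
 * loopback device when their original device is unregistered.
 */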
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

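/* On device unregistration, walk every cpu's uncached list and point the
 * surviving entries (both their dst device and their inet6_dev reference)
 * at the loopback device so the outgoing device can be released.
 */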
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

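/* Pick the address to use for the neighbour lookup: the configured
 * gateway if there is one, else the destination from the skb, else the
 * caller-supplied daddr.
 */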
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

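/* Confirm reachability of the neighbour entry for this dst, skipping
 * devices that do not do neighbour discovery and multicast destinations.
 */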
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu,
					 bool confirm_neigh)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_policy_failed_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_policy_failed,
		.output		= ip6_pkt_policy_failed_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

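/* Zero everything in the rt6_info that follows the embedded dst_entry;
 * dst_alloc() has already initialised the dst itself.
 */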
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

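/* A cached route is expired if it carries RTF_EXPIRES and its timer has
 * run out, or if the fib6 entry it was cloned from is stale or itself
 * expired.
 */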
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

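/* Multipath selection: hash the flow and walk the siblings' hash upper
 * bounds to pick one nexthop consistently for a given flow.
 */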
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

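/* Walk the leaf's route list and return the first entry whose nexthop
 * device matches oif (or whose device owns saddr when no oif is given),
 * skipping dead nexthops.
 */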
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	unsigned long last_probe;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/* Router Reachability Probing: send a neighbour solicitation to a
	 * gateway whose neighbour entry is not known to be valid.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	last_probe = READ_ONCE(rt->last_probe);
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (!work || cmpxchg(&rt->last_probe,
			     last_probe, jiffies) != last_probe) {
		kfree(work);
	} else {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

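/* Grade the gateway's neighbour cache state: usable (NUD_VALID or, with
 * router preference enabled, merely not failed), worth probing, or a
 * candidate for round-robin fallback.
 */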
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

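/* Score a route for default-router selection: rt6_check_dev() contributes
 * 2 for a device match and, with CONFIG_IPV6_ROUTER_PREF, the decoded
 * route preference lands in the higher bits; a negative RT6_NUD_* value
 * is returned when reachability is required but not satisfied.
 */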
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

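/* Compare one route against the current best match; remember the score
 * in *mpri and flag round-robin if the neighbour state asked for it.
 */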
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

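/* Round-robin router selection (see the changelog at the top of this
 * file): prefer the remembered rr_ptr entry, scan its same-metric
 * siblings, and rotate rr_ptr when the current head stops being a
 * usable match.
 */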
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf.
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is an L3 master device, the
		 * master device if the device is enslaved, and the
		 * loopback device otherwise
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* in the remaining case netif_is_l3_master(dev) is true
		 * and dev itself is what we want returned
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]		= 0,
	[RTN_UNICAST]		= 0,
	[RTN_LOCAL]		= 0,
	[RTN_BROADCAST]		= 0,
	[RTN_ANYCAST]		= 0,
	[RTN_MULTICAST]		= 0,
	[RTN_BLACKHOLE]		= -EINVAL,
	[RTN_UNREACHABLE]	= -EHOSTUNREACH,
	[RTN_PROHIBIT]		= -EACCES,
	[RTN_POLICY_FAILED]	= -EACCES,
	[RTN_THROW]		= -EAGAIN,
	[RTN_NAT]		= -EINVAL,
	[RTN_XRESOLVE]		= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_POLICY_FAILED:
		rt->dst.output = ip6_pkt_policy_failed_out;
		rt->dst.input = ip6_pkt_policy_failed;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
}

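/* Walk back up the fib6 trie (descending into source-routing subtrees on
 * the way) until a node carrying routes is found, or give up at the root.
 */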
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

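/* Flow-based lookup without creating per-cpu or cached clones: find the
 * best fib6 entry, check the exception table, and wrap the result in a
 * (possibly shared null) rt6_info.
 */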
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold a dst reference before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (rt->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* completely purge the exception so the held resources can be
	 * released: some [sk] cache may keep the dst around for an
	 * unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

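/* Hash (daddr, saddr) into one of the FIB6_EXCEPTION_BUCKET_SIZE buckets;
 * the source address only participates with CONFIG_IPV6_SUBTREES.
 */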
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

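/* Insert a cached clone into its parent route's exception table, evicting
 * any existing entry for the same (daddr, saddr) and the oldest entry if
 * the bucket grows past FIB6_MAX_DEPTH.
 */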
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(rt->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!res && src_key && src_key != &rt->fib6_src.addr) {
		src_key = &rt->fib6_src.addr;
		goto find_ex;
	}
#endif

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We prune and obsolete aged-out and non-gateway exceptions even if
	 * others still hold references to them, so that such references can
	 * be dropped at the next dst_check().
	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

1991static struct rt6_info *ip6_pol_route_input(struct net *net,
1992 struct fib6_table *table,
1993 struct flowi6 *fl6,
1994 const struct sk_buff *skb,
1995 int flags)
1996{
1997 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1998}
1999
2000struct dst_entry *ip6_route_input_lookup(struct net *net,
2001 struct net_device *dev,
2002 struct flowi6 *fl6,
2003 const struct sk_buff *skb,
2004 int flags)
2005{
2006 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2007 flags |= RT6_LOOKUP_F_IFACE;
2008
2009 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2010}
2011EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2012
2013static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2014 struct flow_keys *keys,
2015 struct flow_keys *flkeys)
2016{
2017 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2018 const struct ipv6hdr *key_iph = outer_iph;
2019 struct flow_keys *_flkeys = flkeys;
2020 const struct ipv6hdr *inner_iph;
2021 const struct icmp6hdr *icmph;
2022 struct ipv6hdr _inner_iph;
2023 struct icmp6hdr _icmph;
2024
2025 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2026 goto out;
2027
2028 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2029 sizeof(_icmph), &_icmph);
2030 if (!icmph)
2031 goto out;
2032
2033 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2034 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2035 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2036 icmph->icmp6_type != ICMPV6_PARAMPROB)
2037 goto out;
2038
2039 inner_iph = skb_header_pointer(skb,
2040 skb_transport_offset(skb) + sizeof(*icmph),
2041 sizeof(_inner_iph), &_inner_iph);
2042 if (!inner_iph)
2043 goto out;
2044
2045 key_iph = inner_iph;
2046 _flkeys = NULL;
2047out:
2048 if (_flkeys) {
2049 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2050 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2051 keys->tags.flow_label = _flkeys->tags.flow_label;
2052 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2053 } else {
2054 keys->addrs.v6addrs.src = key_iph->saddr;
2055 keys->addrs.v6addrs.dst = key_iph->daddr;
2056 keys->tags.flow_label = ip6_flowlabel(key_iph);
2057 keys->basic.ip_proto = key_iph->nexthdr;
2058 }
2059}
2060
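/* Compute the 31-bit multipath hash for a flow, according to the policy
 * returned by ip6_multipath_hash_policy() (the fib_multipath_hash_policy
 * sysctl): policy 0 hashes on L3 keys (addresses, flow label, protocol),
 * policy 1 on the L4 five-tuple. The result is later compared against
 * the per-nexthop upper bounds set by rt6_multipath_rebalance().
 */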
2061/* If skb is set it will be used and fl6 can be NULL. */
2062u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2063 const struct sk_buff *skb, struct flow_keys *flkeys)
2064{
2065 struct flow_keys hash_keys;
2066 u32 mhash;
2067
2068 switch (ip6_multipath_hash_policy(net)) {
2069 case 0:
2070 memset(&hash_keys, 0, sizeof(hash_keys));
2071 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2072 if (skb) {
2073 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2074 } else {
2075 hash_keys.addrs.v6addrs.src = fl6->saddr;
2076 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2077 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2078 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2079 }
2080 break;
2081 case 1:
2082 if (skb) {
2083 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2084 struct flow_keys keys;
2085
2086 /* short-circuit if we already have L4 hash present */
2087 if (skb->l4_hash)
2088 return skb_get_hash_raw(skb) >> 1;
2089
2090 memset(&hash_keys, 0, sizeof(hash_keys));
2091
2092 if (!flkeys) {
2093 skb_flow_dissect_flow_keys(skb, &keys, flag);
2094 flkeys = &keys;
2095 }
2096 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2097 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2098 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2099 hash_keys.ports.src = flkeys->ports.src;
2100 hash_keys.ports.dst = flkeys->ports.dst;
2101 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2102 } else {
2103 memset(&hash_keys, 0, sizeof(hash_keys));
2104 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2105 hash_keys.addrs.v6addrs.src = fl6->saddr;
2106 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2107 hash_keys.ports.src = fl6->fl6_sport;
2108 hash_keys.ports.dst = fl6->fl6_dport;
2109 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2110 }
2111 break;
2112 }
2113 mhash = flow_hash_from_keys(&hash_keys);
2114
2115 return mhash >> 1;
2116}
2117
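/* Route an incoming packet: build a flowi6 from the IPv6 header (plus
 * any tunnel key and early-dissected flow keys), compute the multipath
 * hash up front for ICMPv6 so errors follow the original flow, and
 * attach the resulting dst to the skb.
 */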
2118void ip6_route_input(struct sk_buff *skb)
2119{
2120 const struct ipv6hdr *iph = ipv6_hdr(skb);
2121 struct net *net = dev_net(skb->dev);
2122 int flags = RT6_LOOKUP_F_HAS_SADDR;
2123 struct ip_tunnel_info *tun_info;
2124 struct flowi6 fl6 = {
2125 .flowi6_iif = skb->dev->ifindex,
2126 .daddr = iph->daddr,
2127 .saddr = iph->saddr,
2128 .flowlabel = ip6_flowinfo(iph),
2129 .flowi6_mark = skb->mark,
2130 .flowi6_proto = iph->nexthdr,
2131 };
2132 struct flow_keys *flkeys = NULL, _flkeys;
2133
2134 tun_info = skb_tunnel_info(skb);
2135 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2136 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2137
2138 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2139 flkeys = &_flkeys;
2140
2141 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2142 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2143 skb_dst_drop(skb);
2144 skb_dst_set(skb,
2145 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2146}
2147
2148static struct rt6_info *ip6_pol_route_output(struct net *net,
2149 struct fib6_table *table,
2150 struct flowi6 *fl6,
2151 const struct sk_buff *skb,
2152 int flags)
2153{
2154 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2155}
2156
2157struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2158 struct flowi6 *fl6, int flags)
2159{
2160 bool any_src;
2161
2162 if (rt6_need_strict(&fl6->daddr)) {
2163 struct dst_entry *dst;
2164
2165 dst = l3mdev_link_scope_lookup(net, fl6);
2166 if (dst)
2167 return dst;
2168 }
2169
2170 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2171
2172 any_src = ipv6_addr_any(&fl6->saddr);
2173 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2174 (fl6->flowi6_oif && any_src))
2175 flags |= RT6_LOOKUP_F_IFACE;
2176
2177 if (!any_src)
2178 flags |= RT6_LOOKUP_F_HAS_SADDR;
2179 else if (sk)
2180 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2181
2182 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2183}
2184EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2185
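/* Clone dst_orig into a standalone blackhole entry: metrics and
 * addresses are copied, but input/output simply discard packets and the
 * device is pinned to loopback. The original dst is released; callers
 * get the new entry (or -ENOMEM) with a reference held.
 */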
2186struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2187{
2188 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2189 struct net_device *loopback_dev = net->loopback_dev;
2190 struct dst_entry *new = NULL;
2191
2192 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2193 DST_OBSOLETE_DEAD, 0);
2194 if (rt) {
2195 rt6_info_init(rt);
2196 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2197
2198 new = &rt->dst;
2199 new->__use = 1;
2200 new->input = dst_discard;
2201 new->output = dst_discard_out;
2202
2203 dst_copy_metrics(new, &ort->dst);
2204
2205 rt->rt6i_idev = in6_dev_get(loopback_dev);
2206 rt->rt6i_gateway = ort->rt6i_gateway;
2207 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2208
2209 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2210#ifdef CONFIG_IPV6_SUBTREES
2211 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2212#endif
2213 }
2214
2215 dst_release(dst_orig);
2216 return new ? new : ERR_PTR(-ENOMEM);
2217}
2218
2219/*
2220 * Destination cache support functions
2221 */
2222
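/* A dst handed out to users carries a cookie: the serial number of the
 * fib6 node it was derived from. The checks below declare a cached dst
 * stale when the cookie no longer matches (the tree has changed) or
 * when the entry has expired, forcing callers to relookup.
 */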
2223static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2224{
2225 u32 rt_cookie = 0;
2226
2227 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2228 return false;
2229
2230 if (fib6_check_expired(f6i))
2231 return false;
2232
2233 return true;
2234}
2235
2236static struct dst_entry *rt6_check(struct rt6_info *rt,
2237 struct fib6_info *from,
2238 u32 cookie)
2239{
2240 u32 rt_cookie = 0;
2241
2242 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2243 rt_cookie != cookie)
2244 return NULL;
2245
2246 if (rt6_check_expired(rt))
2247 return NULL;
2248
2249 return &rt->dst;
2250}
2251
2252static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2253 struct fib6_info *from,
2254 u32 cookie)
2255{
2256 if (!__rt6_check_expired(rt) &&
2257 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2258 fib6_check(from, cookie))
2259 return &rt->dst;
2260 else
2261 return NULL;
2262}
2263
2264static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2265{
2266 struct dst_entry *dst_ret;
2267 struct fib6_info *from;
2268 struct rt6_info *rt;
2269
2270 rt = container_of(dst, struct rt6_info, dst);
2271
2272 rcu_read_lock();
2273
2274	/* All IPv6 dsts are created with ->obsolete set to
2275	 * DST_OBSOLETE_FORCE_CHK, which always forces validation
2276	 * calls down into this function.
2277	 */
2278
2279 from = rcu_dereference(rt->from);
2280
2281 if (from && (rt->rt6i_flags & RTF_PCPU ||
2282 unlikely(!list_empty(&rt->rt6i_uncached))))
2283 dst_ret = rt6_dst_from_check(rt, from, cookie);
2284 else
2285 dst_ret = rt6_check(rt, from, cookie);
2286
2287 rcu_read_unlock();
2288
2289 return dst_ret;
2290}
2291
2292static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2293{
2294 struct rt6_info *rt = (struct rt6_info *) dst;
2295
2296 if (rt) {
2297 if (rt->rt6i_flags & RTF_CACHE) {
2298 rcu_read_lock();
2299 if (rt6_check_expired(rt)) {
2300 rt6_remove_exception_rt(rt);
2301 dst = NULL;
2302 }
2303 rcu_read_unlock();
2304 } else {
2305 dst_release(dst);
2306 dst = NULL;
2307 }
2308 }
2309 return dst;
2310}
2311
2312static void ip6_link_failure(struct sk_buff *skb)
2313{
2314 struct rt6_info *rt;
2315
2316 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2317
2318 rt = (struct rt6_info *) skb_dst(skb);
2319 if (rt) {
2320 rcu_read_lock();
2321 if (rt->rt6i_flags & RTF_CACHE) {
2322 rt6_remove_exception_rt(rt);
2323 } else {
2324 struct fib6_info *from;
2325 struct fib6_node *fn;
2326
2327 from = rcu_dereference(rt->from);
2328 if (from) {
2329 fn = rcu_dereference(from->fib6_node);
2330 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2331 fn->fn_sernum = -1;
2332 }
2333 }
2334 rcu_read_unlock();
2335 }
2336}
2337
2338static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2339{
2340 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2341 struct fib6_info *from;
2342
2343 rcu_read_lock();
2344 from = rcu_dereference(rt0->from);
2345 if (from)
2346 rt0->dst.expires = from->expires;
2347 rcu_read_unlock();
2348 }
2349
2350 dst_set_expires(&rt0->dst, timeout);
2351 rt0->rt6i_flags |= RTF_EXPIRES;
2352}
2353
2354static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2355{
2356 struct net *net = dev_net(rt->dst.dev);
2357
2358 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2359 rt->rt6i_flags |= RTF_MODIFIED;
2360 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2361}
2362
2363static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2364{
2365 bool from_set;
2366
2367 rcu_read_lock();
2368 from_set = !!rcu_dereference(rt->from);
2369 rcu_read_unlock();
2370
2371 return !(rt->rt6i_flags & RTF_CACHE) &&
2372 (rt->rt6i_flags & RTF_PCPU || from_set);
2373}
2374
2375static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2376 const struct ipv6hdr *iph, u32 mtu,
2377 bool confirm_neigh)
2378{
2379 const struct in6_addr *daddr, *saddr;
2380 struct rt6_info *rt6 = (struct rt6_info *)dst;
2381
2382 if (dst_metric_locked(dst, RTAX_MTU))
2383 return;
2384
2385 if (iph) {
2386 daddr = &iph->daddr;
2387 saddr = &iph->saddr;
2388 } else if (sk) {
2389 daddr = &sk->sk_v6_daddr;
2390 saddr = &inet6_sk(sk)->saddr;
2391 } else {
2392 daddr = NULL;
2393 saddr = NULL;
2394 }
2395
2396 if (confirm_neigh)
2397 dst_confirm_neigh(dst, daddr);
2398
2399 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2400 if (mtu >= dst_mtu(dst))
2401 return;
2402
2403 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2404 rt6_do_update_pmtu(rt6, mtu);
2405 /* update rt6_ex->stamp for cache */
2406 if (rt6->rt6i_flags & RTF_CACHE)
2407 rt6_update_exception_stamp_rt(rt6);
2408 } else if (daddr) {
2409 struct fib6_info *from;
2410 struct rt6_info *nrt6;
2411
2412 rcu_read_lock();
2413 from = rcu_dereference(rt6->from);
2414 if (!from) {
2415 rcu_read_unlock();
2416 return;
2417 }
2418 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2419 if (nrt6) {
2420 rt6_do_update_pmtu(nrt6, mtu);
2421 if (rt6_insert_exception(nrt6, from))
2422 dst_release_immediate(&nrt6->dst);
2423 }
2424 rcu_read_unlock();
2425 }
2426}
2427
2428static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2429 struct sk_buff *skb, u32 mtu,
2430 bool confirm_neigh)
2431{
2432 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2433 confirm_neigh);
2434}
2435
2436void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2437 int oif, u32 mark, kuid_t uid)
2438{
2439 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2440 struct dst_entry *dst;
2441 struct flowi6 fl6;
2442
2443 memset(&fl6, 0, sizeof(fl6));
2444 fl6.flowi6_oif = oif;
2445 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2446 fl6.daddr = iph->daddr;
2447 fl6.saddr = iph->saddr;
2448 fl6.flowlabel = ip6_flowinfo(iph);
2449 fl6.flowi6_uid = uid;
2450
2451 dst = ip6_route_output(net, NULL, &fl6);
2452 if (!dst->error)
2453 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2454 dst_release(dst);
2455}
2456EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2457
2458void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2459{
2460 int oif = sk->sk_bound_dev_if;
2461 struct dst_entry *dst;
2462
2463 if (!oif && skb->dev)
2464 oif = l3mdev_master_ifindex(skb->dev);
2465
2466 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2467
2468 dst = __sk_dst_get(sk);
2469 if (!dst || !dst->obsolete ||
2470 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2471 return;
2472
2473 bh_lock_sock(sk);
2474 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2475 ip6_datagram_dst_update(sk, false);
2476 bh_unlock_sock(sk);
2477}
2478EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2479
2480void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2481 const struct flowi6 *fl6)
2482{
2483#ifdef CONFIG_IPV6_SUBTREES
2484 struct ipv6_pinfo *np = inet6_sk(sk);
2485#endif
2486
2487 ip6_dst_store(sk, dst,
2488 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2489 &sk->sk_v6_daddr : NULL,
2490#ifdef CONFIG_IPV6_SUBTREES
2491 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2492 &np->saddr :
2493#endif
2494 NULL);
2495}
2496
2497/* Handle redirects */
2498struct ip6rd_flowi {
2499 struct flowi6 fl6;
2500 struct in6_addr gateway;
2501};
2502
2503static struct rt6_info *__ip6_route_redirect(struct net *net,
2504 struct fib6_table *table,
2505 struct flowi6 *fl6,
2506 const struct sk_buff *skb,
2507 int flags)
2508{
2509 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2510 struct rt6_info *ret = NULL, *rt_cache;
2511 struct fib6_info *rt;
2512 struct fib6_node *fn;
2513
2514 /* l3mdev_update_flow overrides oif if the device is enslaved; in
2515 * this case we must match on the real ingress device, so reset it
2516 */
2517 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2518 fl6->flowi6_oif = skb->dev->ifindex;
2519
2520 /* Get the "current" route for this destination and
2521	 * check if the redirect has come from the appropriate router.
2522 *
2523 * RFC 4861 specifies that redirects should only be
2524 * accepted if they come from the nexthop to the target.
2525 * Due to the way the routes are chosen, this notion
2526 * is a bit fuzzy and one might need to check all possible
2527 * routes.
2528 */
2529
2530 rcu_read_lock();
2531 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2532restart:
2533 for_each_fib6_node_rt_rcu(fn) {
2534 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2535 continue;
2536 if (fib6_check_expired(rt))
2537 continue;
2538 if (rt->fib6_flags & RTF_REJECT)
2539 break;
2540 if (!(rt->fib6_flags & RTF_GATEWAY))
2541 continue;
2542 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2543 continue;
2544 /* rt_cache's gateway might be different from its 'parent'
2545 * in the case of an ip redirect.
2546 * So we keep searching in the exception table if the gateway
2547 * is different.
2548 */
2549 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2550 rt_cache = rt6_find_cached_rt(rt,
2551 &fl6->daddr,
2552 &fl6->saddr);
2553 if (rt_cache &&
2554 ipv6_addr_equal(&rdfl->gateway,
2555 &rt_cache->rt6i_gateway)) {
2556 ret = rt_cache;
2557 break;
2558 }
2559 continue;
2560 }
2561 break;
2562 }
2563
2564 if (!rt)
2565 rt = net->ipv6.fib6_null_entry;
2566 else if (rt->fib6_flags & RTF_REJECT) {
2567 ret = net->ipv6.ip6_null_entry;
2568 goto out;
2569 }
2570
2571 if (rt == net->ipv6.fib6_null_entry) {
2572 fn = fib6_backtrack(fn, &fl6->saddr);
2573 if (fn)
2574 goto restart;
2575 }
2576
2577out:
2578 if (ret)
2579 ip6_hold_safe(net, &ret, true);
2580 else
2581 ret = ip6_create_rt_rcu(rt);
2582
2583 rcu_read_unlock();
2584
2585 trace_fib6_table_lookup(net, rt, table, fl6);
2586 return ret;
2587}
2588
2589static struct dst_entry *ip6_route_redirect(struct net *net,
2590 const struct flowi6 *fl6,
2591 const struct sk_buff *skb,
2592 const struct in6_addr *gateway)
2593{
2594 int flags = RT6_LOOKUP_F_HAS_SADDR;
2595 struct ip6rd_flowi rdfl;
2596
2597 rdfl.fl6 = *fl6;
2598 rdfl.gateway = *gateway;
2599
2600 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2601 flags, __ip6_route_redirect);
2602}
2603
2604void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2605 kuid_t uid)
2606{
2607 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2608 struct dst_entry *dst;
2609 struct flowi6 fl6;
2610
2611 memset(&fl6, 0, sizeof(fl6));
2612 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2613 fl6.flowi6_oif = oif;
2614 fl6.flowi6_mark = mark;
2615 fl6.daddr = iph->daddr;
2616 fl6.saddr = iph->saddr;
2617 fl6.flowlabel = ip6_flowinfo(iph);
2618 fl6.flowi6_uid = uid;
2619
2620 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2621 rt6_do_redirect(dst, NULL, skb);
2622 dst_release(dst);
2623}
2624EXPORT_SYMBOL_GPL(ip6_redirect);
2625
2626void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2627 u32 mark)
2628{
2629 const struct ipv6hdr *iph = ipv6_hdr(skb);
2630 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2631 struct dst_entry *dst;
2632 struct flowi6 fl6;
2633
2634 memset(&fl6, 0, sizeof(fl6));
2635 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2636 fl6.flowi6_oif = oif;
2637 fl6.flowi6_mark = mark;
2638 fl6.daddr = msg->dest;
2639 fl6.saddr = iph->daddr;
2640 fl6.flowi6_uid = sock_net_uid(net, NULL);
2641
2642 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2643 rt6_do_redirect(dst, NULL, skb);
2644 dst_release(dst);
2645}
2646
2647void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2648{
2649 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2650 sk->sk_uid);
2651}
2652EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2653
2654static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2655{
2656 struct net_device *dev = dst->dev;
2657 unsigned int mtu = dst_mtu(dst);
2658 struct net *net = dev_net(dev);
2659
2660 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2661
2662 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2663 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2664
2665 /*
2666 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2667 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2668 * IPV6_MAXPLEN is also valid and means: "any MSS,
2669 * rely only on pmtu discovery"
2670 */
2671 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2672 mtu = IPV6_MAXPLEN;
2673 return mtu;
2674}
2675
2676static unsigned int ip6_mtu(const struct dst_entry *dst)
2677{
2678 struct inet6_dev *idev;
2679 unsigned int mtu;
2680
2681 mtu = dst_metric_raw(dst, RTAX_MTU);
2682 if (mtu)
2683 goto out;
2684
2685 mtu = IPV6_MIN_MTU;
2686
2687 rcu_read_lock();
2688 idev = __in6_dev_get(dst->dev);
2689 if (idev)
2690 mtu = idev->cnf.mtu6;
2691 rcu_read_unlock();
2692
2693out:
2694 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2695
2696 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2697}
2698
2699/* MTU selection:
2700 * 1. mtu on route is locked - use it
2701 * 2. mtu from nexthop exception
2702 * 3. mtu from egress device
2703 *
2704 * based on ip6_dst_mtu_forward and exception logic of
2705 * rt6_find_cached_rt; called with rcu_read_lock
2706 */
2707u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2708 struct in6_addr *saddr)
2709{
2710 struct inet6_dev *idev;
2711 struct rt6_info *rt;
2712 u32 mtu = 0;
2713
2714 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2715 mtu = f6i->fib6_pmtu;
2716 if (mtu)
2717 goto out;
2718 }
2719
2720 rt = rt6_find_cached_rt(f6i, daddr, saddr);
2721 if (unlikely(rt)) {
2722 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2723 } else {
2724 struct net_device *dev = fib6_info_nh_dev(f6i);
2725
2726 mtu = IPV6_MIN_MTU;
2727 idev = __in6_dev_get(dev);
2728 if (idev && idev->cnf.mtu6 > mtu)
2729 mtu = idev->cnf.mtu6;
2730 }
2731
2732 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2733out:
2734 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2735}
2736
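/* Allocate a one-off host route for ICMPv6 output. The entry never
 * enters the fib6 tree; it is placed on the uncached list (so
 * rt6_disable_ip() can release the device) and run through
 * xfrm_lookup() before being handed back.
 */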
2737struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2738 struct flowi6 *fl6)
2739{
2740 struct dst_entry *dst;
2741 struct rt6_info *rt;
2742 struct inet6_dev *idev = in6_dev_get(dev);
2743 struct net *net = dev_net(dev);
2744
2745 if (unlikely(!idev))
2746 return ERR_PTR(-ENODEV);
2747
2748 rt = ip6_dst_alloc(net, dev, 0);
2749 if (unlikely(!rt)) {
2750 in6_dev_put(idev);
2751 dst = ERR_PTR(-ENOMEM);
2752 goto out;
2753 }
2754
2755 rt->dst.flags |= DST_HOST;
2756 rt->dst.input = ip6_input;
2757 rt->dst.output = ip6_output;
2758 rt->rt6i_gateway = fl6->daddr;
2759 rt->rt6i_dst.addr = fl6->daddr;
2760 rt->rt6i_dst.plen = 128;
2761 rt->rt6i_idev = idev;
2762 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2763
2764 /* Add this dst into uncached_list so that rt6_disable_ip() can
2765	 * release the net_device properly
2766 */
2767 rt6_uncached_list_add(rt);
2768 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2769
2770 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2771
2772out:
2773 return dst;
2774}
2775
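/* dst garbage collection: do nothing while we are inside the configured
 * minimum interval and under max_size. Otherwise run fib6_run_gc() with
 * a timeout that grows on every pass and decays by gc_elasticity, and
 * report whether we are still over the limit.
 */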
2776static int ip6_dst_gc(struct dst_ops *ops)
2777{
2778 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2779 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2780 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2781 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2782 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2783 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2784 int entries;
2785
2786 entries = dst_entries_get_fast(ops);
2787 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2788 entries <= rt_max_size)
2789 goto out;
2790
2791 net->ipv6.ip6_rt_gc_expire++;
2792 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2793 entries = dst_entries_get_slow(ops);
2794 if (entries < ops->gc_thresh)
2795 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2796out:
2797 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2798 return entries > rt_max_size;
2799}
2800
2801static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2802 struct fib6_config *cfg)
2803{
2804 struct dst_metrics *p;
2805
2806 if (!cfg->fc_mx)
2807 return 0;
2808
2809 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2810 if (unlikely(!p))
2811 return -ENOMEM;
2812
2813 refcount_set(&p->refcnt, 1);
2814 rt->fib6_metrics = p;
2815
2816 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2817}
2818
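/* Look up a nexthop gateway in one specific table, ignoring link state.
 * Returns NULL rather than the null entry when the table does not exist
 * or the lookup fails, so callers can fall back to a full lookup.
 */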
2819static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2820 struct fib6_config *cfg,
2821 const struct in6_addr *gw_addr,
2822 u32 tbid, int flags)
2823{
2824 struct flowi6 fl6 = {
2825 .flowi6_oif = cfg->fc_ifindex,
2826 .daddr = *gw_addr,
2827 .saddr = cfg->fc_prefsrc,
2828 };
2829 struct fib6_table *table;
2830 struct rt6_info *rt;
2831
2832 table = fib6_get_table(net, tbid);
2833 if (!table)
2834 return NULL;
2835
2836 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2837 flags |= RT6_LOOKUP_F_HAS_SADDR;
2838
2839 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2840 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2841
2842 /* if table lookup failed, fall back to full lookup */
2843 if (rt == net->ipv6.ip6_null_entry) {
2844 ip6_rt_put(rt);
2845 rt = NULL;
2846 }
2847
2848 return rt;
2849}
2850
2851static int ip6_route_check_nh_onlink(struct net *net,
2852 struct fib6_config *cfg,
2853 const struct net_device *dev,
2854 struct netlink_ext_ack *extack)
2855{
2856 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2857 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2858 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2859 struct fib6_info *from;
2860 struct rt6_info *grt;
2861 int err;
2862
2863 err = 0;
2864 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2865 if (grt) {
2866 rcu_read_lock();
2867 from = rcu_dereference(grt->from);
2868 if (!grt->dst.error &&
2869 /* ignore match if it is the default route */
2870 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2871 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2872 NL_SET_ERR_MSG(extack,
2873 "Nexthop has invalid gateway or device mismatch");
2874 err = -EINVAL;
2875 }
2876 rcu_read_unlock();
2877
2878 ip6_rt_put(grt);
2879 }
2880
2881 return err;
2882}
2883
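/* Resolve the device/idev for a gateway nexthop by routing towards the
 * gateway address. Succeeds only if the gateway is reached by a
 * connected (non-RTF_GATEWAY) route on the expected device; when no
 * device was given, the one found by the lookup is handed back with
 * references held.
 */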
2884static int ip6_route_check_nh(struct net *net,
2885 struct fib6_config *cfg,
2886 struct net_device **_dev,
2887 struct inet6_dev **idev)
2888{
2889 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2890 struct net_device *dev = _dev ? *_dev : NULL;
2891 struct rt6_info *grt = NULL;
2892 int err = -EHOSTUNREACH;
2893
2894 if (cfg->fc_table) {
2895 int flags = RT6_LOOKUP_F_IFACE;
2896
2897 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2898 cfg->fc_table, flags);
2899 if (grt) {
2900 if (grt->rt6i_flags & RTF_GATEWAY ||
2901 (dev && dev != grt->dst.dev)) {
2902 ip6_rt_put(grt);
2903 grt = NULL;
2904 }
2905 }
2906 }
2907
2908 if (!grt)
2909 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2910
2911 if (!grt)
2912 goto out;
2913
2914 if (dev) {
2915 if (dev != grt->dst.dev) {
2916 ip6_rt_put(grt);
2917 goto out;
2918 }
2919 } else {
2920 *_dev = dev = grt->dst.dev;
2921 *idev = grt->rt6i_idev;
2922 dev_hold(dev);
2923 in6_dev_hold(grt->rt6i_idev);
2924 }
2925
2926 if (!(grt->rt6i_flags & RTF_GATEWAY))
2927 err = 0;
2928
2929 ip6_rt_put(grt);
2930
2931out:
2932 return err;
2933}
2934
2935static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2936 struct net_device **_dev, struct inet6_dev **idev,
2937 struct netlink_ext_ack *extack)
2938{
2939 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2940 int gwa_type = ipv6_addr_type(gw_addr);
2941 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2942 const struct net_device *dev = *_dev;
2943 bool need_addr_check = !dev;
2944 int err = -EINVAL;
2945
2946	/* If gw_addr is local we may fail to detect this while the
2947	 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2948	 * will return the already-added prefix route via the interface
2949	 * the prefix route was assigned to, which might be non-loopback.
2950	 */
2951 if (dev &&
2952 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2953 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2954 goto out;
2955 }
2956
2957 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2958		/* IPv6 strictly inhibits using non-link-local
2959		 * addresses as a nexthop address.
2960		 * Otherwise, a router will not be able to send redirects.
2961		 * That is very good, but in some (rare!) circumstances
2962		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2963		 * some exceptions. --ANK
2964		 * We allow IPv4-mapped nexthops to support RFC 4798-type
2965		 * addressing.
2966		 */
2967 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2968 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2969 goto out;
2970 }
2971
2972 if (cfg->fc_flags & RTNH_F_ONLINK)
2973 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2974 else
2975 err = ip6_route_check_nh(net, cfg, _dev, idev);
2976
2977 if (err)
2978 goto out;
2979 }
2980
2981 /* reload in case device was changed */
2982 dev = *_dev;
2983
2984 err = -EINVAL;
2985 if (!dev) {
2986 NL_SET_ERR_MSG(extack, "Egress device not specified");
2987 goto out;
2988 } else if (dev->flags & IFF_LOOPBACK) {
2989 NL_SET_ERR_MSG(extack,
2990 "Egress device can not be loopback device for this route");
2991 goto out;
2992 }
2993
2994 /* if we did not check gw_addr above, do so now that the
2995 * egress device has been resolved.
2996 */
2997 if (need_addr_check &&
2998 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2999 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3000 goto out;
3001 }
3002
3003 err = 0;
3004out:
3005 return err;
3006}
3007
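/* Build a fib6_info from a fib6_config: validate the user-supplied
 * flags, prefix lengths and gateway, resolve the egress device and
 * table, and fill in metrics, lwtunnel state and nexthop. Returns the
 * new entry (not yet inserted into the tree) or an ERR_PTR.
 */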
3008static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3009 gfp_t gfp_flags,
3010 struct netlink_ext_ack *extack)
3011{
3012 struct net *net = cfg->fc_nlinfo.nl_net;
3013 struct fib6_info *rt = NULL;
3014 struct net_device *dev = NULL;
3015 struct inet6_dev *idev = NULL;
3016 struct fib6_table *table;
3017 int addr_type;
3018 int err = -EINVAL;
3019
3020 /* RTF_PCPU is an internal flag; can not be set by userspace */
3021 if (cfg->fc_flags & RTF_PCPU) {
3022 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3023 goto out;
3024 }
3025
3026 /* RTF_CACHE is an internal flag; can not be set by userspace */
3027 if (cfg->fc_flags & RTF_CACHE) {
3028 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3029 goto out;
3030 }
3031
3032 if (cfg->fc_type > RTN_MAX) {
3033 NL_SET_ERR_MSG(extack, "Invalid route type");
3034 goto out;
3035 }
3036
3037 if (cfg->fc_dst_len > 128) {
3038 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3039 goto out;
3040 }
3041 if (cfg->fc_src_len > 128) {
3042 NL_SET_ERR_MSG(extack, "Invalid source address length");
3043 goto out;
3044 }
3045#ifndef CONFIG_IPV6_SUBTREES
3046 if (cfg->fc_src_len) {
3047 NL_SET_ERR_MSG(extack,
3048 "Specifying source address requires IPV6_SUBTREES to be enabled");
3049 goto out;
3050 }
3051#endif
3052 if (cfg->fc_ifindex) {
3053 err = -ENODEV;
3054 dev = dev_get_by_index(net, cfg->fc_ifindex);
3055 if (!dev)
3056 goto out;
3057 idev = in6_dev_get(dev);
3058 if (!idev)
3059 goto out;
3060 }
3061
3062 if (cfg->fc_metric == 0)
3063 cfg->fc_metric = IP6_RT_PRIO_USER;
3064
3065 if (cfg->fc_flags & RTNH_F_ONLINK) {
3066 if (!dev) {
3067 NL_SET_ERR_MSG(extack,
3068 "Nexthop device required for onlink");
3069 err = -ENODEV;
3070 goto out;
3071 }
3072
3073 if (!(dev->flags & IFF_UP)) {
3074 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3075 err = -ENETDOWN;
3076 goto out;
3077 }
3078 }
3079
3080 err = -ENOBUFS;
3081 if (cfg->fc_nlinfo.nlh &&
3082 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3083 table = fib6_get_table(net, cfg->fc_table);
3084 if (!table) {
3085 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3086 table = fib6_new_table(net, cfg->fc_table);
3087 }
3088 } else {
3089 table = fib6_new_table(net, cfg->fc_table);
3090 }
3091
3092 if (!table)
3093 goto out;
3094
3095 err = -ENOMEM;
3096 rt = fib6_info_alloc(gfp_flags);
3097 if (!rt)
3098 goto out;
3099
3100#ifdef CONFIG_IPV6_ROUTER_PREF
3101 rt->last_probe = jiffies;
3102#endif
3103 if (cfg->fc_flags & RTF_ADDRCONF)
3104 rt->dst_nocount = true;
3105
3106 err = ip6_convert_metrics(net, rt, cfg);
3107 if (err < 0)
3108 goto out;
3109
3110 if (cfg->fc_flags & RTF_EXPIRES)
3111 fib6_set_expires(rt, jiffies +
3112 clock_t_to_jiffies(cfg->fc_expires));
3113 else
3114 fib6_clean_expires(rt);
3115
3116 if (cfg->fc_protocol == RTPROT_UNSPEC)
3117 cfg->fc_protocol = RTPROT_BOOT;
3118 rt->fib6_protocol = cfg->fc_protocol;
3119
3120 addr_type = ipv6_addr_type(&cfg->fc_dst);
3121
3122 if (cfg->fc_encap) {
3123 struct lwtunnel_state *lwtstate;
3124
3125 err = lwtunnel_build_state(cfg->fc_encap_type,
3126 cfg->fc_encap, AF_INET6, cfg,
3127 &lwtstate, extack);
3128 if (err)
3129 goto out;
3130 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3131 }
3132
3133 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3134 rt->fib6_dst.plen = cfg->fc_dst_len;
3135 if (rt->fib6_dst.plen == 128)
3136 rt->dst_host = true;
3137
3138#ifdef CONFIG_IPV6_SUBTREES
3139 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3140 rt->fib6_src.plen = cfg->fc_src_len;
3141#endif
3142
3143 rt->fib6_metric = cfg->fc_metric;
3144 rt->fib6_nh.nh_weight = 1;
3145
3146 rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3147
3148	/* We cannot add true routes via loopback here; they would
3149	 * result in kernel looping. Promote them to reject routes.
3150	 */
3151 if ((cfg->fc_flags & RTF_REJECT) ||
3152 (dev && (dev->flags & IFF_LOOPBACK) &&
3153 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3154 !(cfg->fc_flags & RTF_LOCAL))) {
3155 /* hold loopback dev/idev if we haven't done so. */
3156 if (dev != net->loopback_dev) {
3157 if (dev) {
3158 dev_put(dev);
3159 in6_dev_put(idev);
3160 }
3161 dev = net->loopback_dev;
3162 dev_hold(dev);
3163 idev = in6_dev_get(dev);
3164 if (!idev) {
3165 err = -ENODEV;
3166 goto out;
3167 }
3168 }
3169 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3170 goto install_route;
3171 }
3172
3173 if (cfg->fc_flags & RTF_GATEWAY) {
3174 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3175 if (err)
3176 goto out;
3177
3178 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3179 }
3180
3181 err = -ENODEV;
3182 if (!dev)
3183 goto out;
3184
3185 if (idev->cnf.disable_ipv6) {
3186 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3187 err = -EACCES;
3188 goto out;
3189 }
3190
3191 if (!(dev->flags & IFF_UP)) {
3192 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3193 err = -ENETDOWN;
3194 goto out;
3195 }
3196
3197 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3198 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3199 NL_SET_ERR_MSG(extack, "Invalid source address");
3200 err = -EINVAL;
3201 goto out;
3202 }
3203 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3204 rt->fib6_prefsrc.plen = 128;
3205 } else
3206 rt->fib6_prefsrc.plen = 0;
3207
3208 rt->fib6_flags = cfg->fc_flags;
3209
3210install_route:
3211 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3212 !netif_carrier_ok(dev))
3213 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3214 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3215 rt->fib6_nh.nh_dev = dev;
3216 rt->fib6_table = table;
3217
3218 cfg->fc_nlinfo.nl_net = dev_net(dev);
3219
3220 if (idev)
3221 in6_dev_put(idev);
3222
3223 return rt;
3224out:
3225 if (dev)
3226 dev_put(dev);
3227 if (idev)
3228 in6_dev_put(idev);
3229
3230 fib6_info_release(rt);
3231 return ERR_PTR(err);
3232}
3233
3234int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3235 struct netlink_ext_ack *extack)
3236{
3237 struct fib6_info *rt;
3238 int err;
3239
3240 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3241 if (IS_ERR(rt))
3242 return PTR_ERR(rt);
3243
3244 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3245 fib6_info_release(rt);
3246
3247 return err;
3248}
3249
3250static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3251{
3252 struct net *net = info->nl_net;
3253 struct fib6_table *table;
3254 int err;
3255
3256 if (rt == net->ipv6.fib6_null_entry) {
3257 err = -ENOENT;
3258 goto out;
3259 }
3260
3261 table = rt->fib6_table;
3262 spin_lock_bh(&table->tb6_lock);
3263 err = fib6_del(rt, info);
3264 spin_unlock_bh(&table->tb6_lock);
3265
3266out:
3267 fib6_info_release(rt);
3268 return err;
3269}
3270
3271int ip6_del_rt(struct net *net, struct fib6_info *rt)
3272{
3273 struct nl_info info = { .nl_net = net };
3274
3275 return __ip6_del_rt(rt, &info);
3276}
3277
3278static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3279{
3280 struct nl_info *info = &cfg->fc_nlinfo;
3281 struct net *net = info->nl_net;
3282 struct sk_buff *skb = NULL;
3283 struct fib6_table *table;
3284 int err = -ENOENT;
3285
3286 if (rt == net->ipv6.fib6_null_entry)
3287 goto out_put;
3288 table = rt->fib6_table;
3289 spin_lock_bh(&table->tb6_lock);
3290
3291 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3292 struct fib6_info *sibling, *next_sibling;
3293
3294 /* prefer to send a single notification with all hops */
3295 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3296 if (skb) {
3297 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3298
3299 if (rt6_fill_node(net, skb, rt, NULL,
3300 NULL, NULL, 0, RTM_DELROUTE,
3301 info->portid, seq, 0) < 0) {
3302 kfree_skb(skb);
3303 skb = NULL;
3304 } else
3305 info->skip_notify = 1;
3306 }
3307
3308 list_for_each_entry_safe(sibling, next_sibling,
3309 &rt->fib6_siblings,
3310 fib6_siblings) {
3311 err = fib6_del(sibling, info);
3312 if (err)
3313 goto out_unlock;
3314 }
3315 }
3316
3317 err = fib6_del(rt, info);
3318out_unlock:
3319 spin_unlock_bh(&table->tb6_lock);
3320out_put:
3321 fib6_info_release(rt);
3322
3323 if (skb) {
3324 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3325 info->nlh, gfp_any());
3326 }
3327 return err;
3328}
3329
3330static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3331{
3332 int rc = -ESRCH;
3333
3334 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3335 goto out;
3336
3337 if (cfg->fc_flags & RTF_GATEWAY &&
3338 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3339 goto out;
3340
3341 rc = rt6_remove_exception_rt(rt);
3342out:
3343 return rc;
3344}
3345
3346static int ip6_route_del(struct fib6_config *cfg,
3347 struct netlink_ext_ack *extack)
3348{
3349 struct rt6_info *rt_cache;
3350 struct fib6_table *table;
3351 struct fib6_info *rt;
3352 struct fib6_node *fn;
3353 int err = -ESRCH;
3354
3355 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3356 if (!table) {
3357 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3358 return err;
3359 }
3360
3361 rcu_read_lock();
3362
3363 fn = fib6_locate(&table->tb6_root,
3364 &cfg->fc_dst, cfg->fc_dst_len,
3365 &cfg->fc_src, cfg->fc_src_len,
3366 !(cfg->fc_flags & RTF_CACHE));
3367
3368 if (fn) {
3369 for_each_fib6_node_rt_rcu(fn) {
3370 if (cfg->fc_flags & RTF_CACHE) {
3371 int rc;
3372
3373 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3374 &cfg->fc_src);
3375 if (rt_cache) {
3376 rc = ip6_del_cached_rt(rt_cache, cfg);
3377 if (rc != -ESRCH) {
3378 rcu_read_unlock();
3379 return rc;
3380 }
3381 }
3382 continue;
3383 }
3384 if (cfg->fc_ifindex &&
3385 (!rt->fib6_nh.nh_dev ||
3386 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3387 continue;
3388 if (cfg->fc_flags & RTF_GATEWAY &&
3389 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3390 continue;
3391 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3392 continue;
3393 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3394 continue;
3395 if (!fib6_info_hold_safe(rt))
3396 continue;
3397 rcu_read_unlock();
3398
3399			/* if a gateway was specified, only delete that one hop */
3400 if (cfg->fc_flags & RTF_GATEWAY)
3401 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3402
3403 return __ip6_del_rt_siblings(rt, cfg);
3404 }
3405 }
3406 rcu_read_unlock();
3407
3408 return err;
3409}
3410
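/* Process a received ND Redirect: validate it per RFC 4861 (length,
 * multicast destination, link-local target, ND options), update the
 * neighbour cache, then install an RTF_CACHE exception clone whose
 * gateway is the redirect target and raise a netevent notification.
 */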
3411static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3412{
3413 struct netevent_redirect netevent;
3414 struct rt6_info *rt, *nrt = NULL;
3415 struct ndisc_options ndopts;
3416 struct inet6_dev *in6_dev;
3417 struct neighbour *neigh;
3418 struct fib6_info *from;
3419 struct rd_msg *msg;
3420 int optlen, on_link;
3421 u8 *lladdr;
3422
3423 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3424 optlen -= sizeof(*msg);
3425
3426 if (optlen < 0) {
3427 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3428 return;
3429 }
3430
3431 msg = (struct rd_msg *)icmp6_hdr(skb);
3432
3433 if (ipv6_addr_is_multicast(&msg->dest)) {
3434 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3435 return;
3436 }
3437
3438 on_link = 0;
3439 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3440 on_link = 1;
3441 } else if (ipv6_addr_type(&msg->target) !=
3442 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3443 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3444 return;
3445 }
3446
3447 in6_dev = __in6_dev_get(skb->dev);
3448 if (!in6_dev)
3449 return;
3450 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3451 return;
3452
3453 /* RFC2461 8.1:
3454 * The IP source address of the Redirect MUST be the same as the current
3455 * first-hop router for the specified ICMP Destination Address.
3456 */
3457
3458 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3459 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3460 return;
3461 }
3462
3463 lladdr = NULL;
3464 if (ndopts.nd_opts_tgt_lladdr) {
3465 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3466 skb->dev);
3467 if (!lladdr) {
3468 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3469 return;
3470 }
3471 }
3472
3473 rt = (struct rt6_info *) dst;
3474 if (rt->rt6i_flags & RTF_REJECT) {
3475 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3476 return;
3477 }
3478
3479 /* Redirect received -> path was valid.
3480 * Look, redirects are sent only in response to data packets,
3481	 * so this nexthop apparently is reachable. --ANK
3482 */
3483 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3484
3485 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3486 if (!neigh)
3487 return;
3488
3489 /*
3490 * We have finally decided to accept it.
3491 */
3492
3493 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3494 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3495 NEIGH_UPDATE_F_OVERRIDE|
3496 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3497 NEIGH_UPDATE_F_ISROUTER)),
3498 NDISC_REDIRECT, &ndopts);
3499
3500 rcu_read_lock();
3501 from = rcu_dereference(rt->from);
3502 if (!from)
3503 goto out;
3504
3505 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3506 if (!nrt)
3507 goto out;
3508
3509 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3510 if (on_link)
3511 nrt->rt6i_flags &= ~RTF_GATEWAY;
3512
3513 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3514
3515 /* rt6_insert_exception() will take care of duplicated exceptions */
3516 if (rt6_insert_exception(nrt, from)) {
3517 dst_release_immediate(&nrt->dst);
3518 goto out;
3519 }
3520
3521 netevent.old = &rt->dst;
3522 netevent.new = &nrt->dst;
3523 netevent.daddr = &msg->dest;
3524 netevent.neigh = neigh;
3525 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3526
3527out:
3528 rcu_read_unlock();
3529 neigh_release(neigh);
3530}
3531
3532#ifdef CONFIG_IPV6_ROUTE_INFO
3533static struct fib6_info *rt6_get_route_info(struct net *net,
3534 const struct in6_addr *prefix, int prefixlen,
3535 const struct in6_addr *gwaddr,
3536 struct net_device *dev)
3537{
3538 u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
3539 int ifindex = dev->ifindex;
3540 struct fib6_node *fn;
3541 struct fib6_info *rt = NULL;
3542 struct fib6_table *table;
3543
3544 table = fib6_get_table(net, tb_id);
3545 if (!table)
3546 return NULL;
3547
3548 rcu_read_lock();
3549 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3550 if (!fn)
3551 goto out;
3552
3553 for_each_fib6_node_rt_rcu(fn) {
3554 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3555 continue;
3556 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3557 continue;
3558 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3559 continue;
3560 if (!fib6_info_hold_safe(rt))
3561 continue;
3562 break;
3563 }
3564out:
3565 rcu_read_unlock();
3566 return rt;
3567}
3568
3569static struct fib6_info *rt6_add_route_info(struct net *net,
3570 const struct in6_addr *prefix, int prefixlen,
3571 const struct in6_addr *gwaddr,
3572 struct net_device *dev,
3573 unsigned int pref)
3574{
3575 struct fib6_config cfg = {
3576 .fc_metric = IP6_RT_PRIO_USER,
3577 .fc_ifindex = dev->ifindex,
3578 .fc_dst_len = prefixlen,
3579 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3580 RTF_UP | RTF_PREF(pref),
3581 .fc_protocol = RTPROT_RA,
3582 .fc_type = RTN_UNICAST,
3583 .fc_nlinfo.portid = 0,
3584 .fc_nlinfo.nlh = NULL,
3585 .fc_nlinfo.nl_net = net,
3586 };
3587
3588	cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
3589 cfg.fc_dst = *prefix;
3590 cfg.fc_gateway = *gwaddr;
3591
3592 /* We should treat it as a default route if prefix length is 0. */
3593 if (!prefixlen)
3594 cfg.fc_flags |= RTF_DEFAULT;
3595
3596 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3597
3598 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3599}
3600#endif
3601
3602struct fib6_info *rt6_get_dflt_router(struct net *net,
3603 const struct in6_addr *addr,
3604 struct net_device *dev)
3605{
3606 u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT);
3607 struct fib6_info *rt;
3608 struct fib6_table *table;
3609
3610 table = fib6_get_table(net, tb_id);
3611 if (!table)
3612 return NULL;
3613
3614 rcu_read_lock();
3615 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3616 if (dev == rt->fib6_nh.nh_dev &&
3617 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3618 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3619 break;
3620 }
3621 if (rt && !fib6_info_hold_safe(rt))
3622 rt = NULL;
3623 rcu_read_unlock();
3624 return rt;
3625}
3626
3627struct fib6_info *rt6_get_dflt_router_expires(struct net_device *dev)
3628{
3629 struct fib6_info *rt;
3630 struct fib6_table *table;
3631 #define RTF_ADGE (RTF_ADDRCONF | RTF_DEFAULT \
3632 | RTF_GATEWAY | RTF_EXPIRES)
3633
3634 table = fib6_get_table(dev_net(dev),
3635 addrconf_rt_table(dev, RT6_TABLE_MAIN));
3636 if (!table)
3637 return NULL;
3638
3639 rcu_read_lock();
3640 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3641 if (dev == rt->fib6_nh.nh_dev &&
3642 ((rt->fib6_flags & RTF_ADGE) == RTF_ADGE))
3643 break;
3644 }
3645 if (rt && !fib6_info_hold_safe(rt))
3646 rt = NULL;
3647 rcu_read_unlock();
3648 return rt;
3649}
3650
3651struct fib6_info *rt6_add_dflt_router(struct net *net,
3652 const struct in6_addr *gwaddr,
3653 struct net_device *dev,
3654 unsigned int pref)
3655{
3656 struct fib6_config cfg = {
3657 .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
3658 .fc_metric = IP6_RT_PRIO_USER,
3659 .fc_ifindex = dev->ifindex,
3660 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3661 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3662 .fc_protocol = RTPROT_RA,
3663 .fc_type = RTN_UNICAST,
3664 .fc_nlinfo.portid = 0,
3665 .fc_nlinfo.nlh = NULL,
3666 .fc_nlinfo.nl_net = net,
3667 };
3668
3669 cfg.fc_gateway = *gwaddr;
3670
3671 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3672 struct fib6_table *table;
3673
3674 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3675 if (table)
3676 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3677 }
3678
3679 return rt6_get_dflt_router(net, gwaddr, dev);
3680}
3681
3682static int rt6_addrconf_purge(struct fib6_info *rt, void *arg)
3683{
3684 struct net_device *dev = fib6_info_nh_dev(rt);
3685 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3686
3687 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3688 (!idev || idev->cnf.accept_ra != 2)) {
3689 /* Delete this route. See fib6_clean_tree() */
3690 return -1;
3691 }
3692
3693 /* Continue walking */
3694 return 0;
3695}
3696
3697void rt6_purge_dflt_routers(struct net *net)
3698{
3699 fib6_clean_all(net, rt6_addrconf_purge, NULL);
3700}
3701
3702static void rtmsg_to_fib6_config(struct net *net,
3703 struct in6_rtmsg *rtmsg,
3704 struct fib6_config *cfg)
3705{
3706 memset(cfg, 0, sizeof(*cfg));
3707
3708 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3709 : RT6_TABLE_MAIN;
3710 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3711 cfg->fc_metric = rtmsg->rtmsg_metric;
3712 cfg->fc_expires = rtmsg->rtmsg_info;
3713 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3714 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3715 cfg->fc_flags = rtmsg->rtmsg_flags;
3716 cfg->fc_type = rtmsg->rtmsg_type;
3717
3718 cfg->fc_nlinfo.nl_net = net;
3719
3720 cfg->fc_dst = rtmsg->rtmsg_dst;
3721 cfg->fc_src = rtmsg->rtmsg_src;
3722 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3723}
3724
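/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN,
 * converts the in6_rtmsg to a fib6_config and adds or deletes the route
 * under the RTNL lock.
 */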
3725int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3726{
3727 struct fib6_config cfg;
3728 struct in6_rtmsg rtmsg;
3729 int err;
3730
3731 switch (cmd) {
3732 case SIOCADDRT: /* Add a route */
3733 case SIOCDELRT: /* Delete a route */
3734 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3735 return -EPERM;
3736 err = copy_from_user(&rtmsg, arg,
3737 sizeof(struct in6_rtmsg));
3738 if (err)
3739 return -EFAULT;
3740
3741 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3742
3743 rtnl_lock();
3744 switch (cmd) {
3745 case SIOCADDRT:
3746 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3747 break;
3748 case SIOCDELRT:
3749 err = ip6_route_del(&cfg, NULL);
3750 break;
3751 default:
3752 err = -EINVAL;
3753 }
3754 rtnl_unlock();
3755
3756 return err;
3757 }
3758
3759 return -EINVAL;
3760}
3761
3762/*
3763 * Drop the packet on the floor
3764 */
3765
3766static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3767{
3768 int type;
3769 struct dst_entry *dst = skb_dst(skb);
3770 switch (ipstats_mib_noroutes) {
3771 case IPSTATS_MIB_INNOROUTES:
3772 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3773 if (type == IPV6_ADDR_ANY) {
3774 IP6_INC_STATS(dev_net(dst->dev),
3775 __in6_dev_get_safely(skb->dev),
3776 IPSTATS_MIB_INADDRERRORS);
3777 break;
3778 }
3779 /* FALLTHROUGH */
3780 case IPSTATS_MIB_OUTNOROUTES:
3781 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3782 ipstats_mib_noroutes);
3783 break;
3784 }
3785 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3786 kfree_skb(skb);
3787 return 0;
3788}
3789
3790static int ip6_pkt_discard(struct sk_buff *skb)
3791{
3792 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3793}
3794
3795static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3796{
3797 skb->dev = skb_dst(skb)->dev;
3798 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3799}
3800
3801static int ip6_pkt_prohibit(struct sk_buff *skb)
3802{
3803 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3804}
3805
3806static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3807{
3808 skb->dev = skb_dst(skb)->dev;
3809 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3810}
3811
3812static int ip6_pkt_policy_failed(struct sk_buff *skb)
3813{
3814 return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_INNOROUTES);
3815}
3816
3817static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3818{
3819 skb->dev = skb_dst(skb)->dev;
3820 return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_OUTNOROUTES);
3821}
3822
3823/*
3824 * Allocate a dst for local (unicast / anycast) address.
3825 */
3826
3827struct fib6_info *addrconf_f6i_alloc(struct net *net,
3828 struct inet6_dev *idev,
3829 const struct in6_addr *addr,
3830 bool anycast, gfp_t gfp_flags)
3831{
3832 u32 tb_id;
3833 struct net_device *dev = idev->dev;
3834 struct fib6_info *f6i;
3835
3836 f6i = fib6_info_alloc(gfp_flags);
3837 if (!f6i)
3838 return ERR_PTR(-ENOMEM);
3839
3840 f6i->dst_nocount = true;
3841 f6i->dst_host = true;
3842 f6i->fib6_protocol = RTPROT_KERNEL;
3843 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3844 if (anycast) {
3845 f6i->fib6_type = RTN_ANYCAST;
3846 f6i->fib6_flags |= RTF_ANYCAST;
3847 } else {
3848 f6i->fib6_type = RTN_LOCAL;
3849 f6i->fib6_flags |= RTF_LOCAL;
3850 }
3851
3852 f6i->fib6_nh.nh_gw = *addr;
3853 dev_hold(dev);
3854 f6i->fib6_nh.nh_dev = dev;
3855 f6i->fib6_dst.addr = *addr;
3856 f6i->fib6_dst.plen = 128;
3857 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3858 f6i->fib6_table = fib6_get_table(net, tb_id);
3859
3860 return f6i;
3861}
3862
3863/* remove deleted ip from prefsrc entries */
3864struct arg_dev_net_ip {
3865 struct net_device *dev;
3866 struct net *net;
3867 struct in6_addr *addr;
3868};
3869
3870static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3871{
3872 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3873 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3874 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3875
3876 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3877 rt != net->ipv6.fib6_null_entry &&
3878 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3879 spin_lock_bh(&rt6_exception_lock);
3880 /* remove prefsrc entry */
3881 rt->fib6_prefsrc.plen = 0;
3882 /* need to update cache as well */
3883 rt6_exceptions_remove_prefsrc(rt);
3884 spin_unlock_bh(&rt6_exception_lock);
3885 }
3886 return 0;
3887}
3888
3889void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3890{
3891 struct net *net = dev_net(ifp->idev->dev);
3892 struct arg_dev_net_ip adni = {
3893 .dev = ifp->idev->dev,
3894 .net = net,
3895 .addr = &ifp->addr,
3896 };
3897 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3898}
3899
3900#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3901
3902/* Remove routers and update dst entries when a gateway turns into a host. */
3903static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3904{
3905 struct in6_addr *gateway = (struct in6_addr *)arg;
3906
3907 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3908 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3909 return -1;
3910 }
3911
3912	/* Further clean up cached routes in the exception table.
3913	 * This is needed because a cached route may have a different
3914	 * gateway than its 'parent' in the case of an IP redirect.
3915	 */
3916 rt6_exceptions_clean_tohost(rt, gateway);
3917
3918 return 0;
3919}
3920
3921void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3922{
3923 fib6_clean_all(net, fib6_clean_tohost, gateway);
3924}
3925
3926struct arg_netdev_event {
3927 const struct net_device *dev;
3928 union {
3929 unsigned int nh_flags;
3930 unsigned long event;
3931 };
3932};
3933
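/* Find the first route in this fib6 node's leaf list that belongs to
 * rt's multipath group (same metric, ECMP-qualified). Must be called
 * with the table lock held, as the lockdep annotations assert.
 */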
3934static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3935{
3936 struct fib6_info *iter;
3937 struct fib6_node *fn;
3938
3939 fn = rcu_dereference_protected(rt->fib6_node,
3940 lockdep_is_held(&rt->fib6_table->tb6_lock));
3941 iter = rcu_dereference_protected(fn->leaf,
3942 lockdep_is_held(&rt->fib6_table->tb6_lock));
3943 while (iter) {
3944 if (iter->fib6_metric == rt->fib6_metric &&
3945 rt6_qualify_for_ecmp(iter))
3946 return iter;
3947 iter = rcu_dereference_protected(iter->fib6_next,
3948 lockdep_is_held(&rt->fib6_table->tb6_lock));
3949 }
3950
3951 return NULL;
3952}
3953
3954static bool rt6_is_dead(const struct fib6_info *rt)
3955{
3956 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3957 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3958 fib6_ignore_linkdown(rt)))
3959 return true;
3960
3961 return false;
3962}
3963
3964static int rt6_multipath_total_weight(const struct fib6_info *rt)
3965{
3966 struct fib6_info *iter;
3967 int total = 0;
3968
3969 if (!rt6_is_dead(rt))
3970 total += rt->fib6_nh.nh_weight;
3971
3972 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3973 if (!rt6_is_dead(iter))
3974 total += iter->fib6_nh.nh_weight;
3975 }
3976
3977 return total;
3978}
3979
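/* Hash-threshold multipath (cf. RFC 2992): each live nexthop is
 * assigned an upper bound of roughly
 * (cumulative_weight / total_weight) << 31, minus one; dead nexthops
 * get -1 so they can never match. Lookup then picks the first nexthop
 * whose bound is >= the 31-bit flow hash from rt6_multipath_hash().
 */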
3980static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3981{
3982 int upper_bound = -1;
3983
3984 if (!rt6_is_dead(rt)) {
3985 *weight += rt->fib6_nh.nh_weight;
3986 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3987 total) - 1;
3988 }
3989 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3990}
3991
3992static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3993{
3994 struct fib6_info *iter;
3995 int weight = 0;
3996
3997 rt6_upper_bound_set(rt, &weight, total);
3998
3999 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4000 rt6_upper_bound_set(iter, &weight, total);
4001}
4002
4003void rt6_multipath_rebalance(struct fib6_info *rt)
4004{
4005 struct fib6_info *first;
4006 int total;
4007
4008	/* If the entire multipath route was marked for flushing, there
4009	 * is no need to rebalance upon the removal of every sibling
4010	 * route.
4011	 */
4012 if (!rt->fib6_nsiblings || rt->should_flush)
4013 return;
4014
4015 /* During lookup routes are evaluated in order, so we need to
4016 * make sure upper bounds are assigned from the first sibling
4017 * onwards.
4018 */
4019 first = rt6_multipath_first_sibling(rt);
4020 if (WARN_ON_ONCE(!first))
4021 return;
4022
4023 total = rt6_multipath_total_weight(first);
4024 rt6_multipath_upper_bound_set(first, total);
4025}
4026
4027static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4028{
4029 const struct arg_netdev_event *arg = p_arg;
4030 struct net *net = dev_net(arg->dev);
4031
4032 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
4033 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
4034 fib6_update_sernum_upto_root(net, rt);
4035 rt6_multipath_rebalance(rt);
4036 }
4037
4038 return 0;
4039}
4040
4041void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
4042{
4043 struct arg_netdev_event arg = {
4044 .dev = dev,
4045 {
4046 .nh_flags = nh_flags,
4047 },
4048 };
4049
4050 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4051 arg.nh_flags |= RTNH_F_LINKDOWN;
4052
4053 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4054}
4055
4056static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4057 const struct net_device *dev)
4058{
4059 struct fib6_info *iter;
4060
4061 if (rt->fib6_nh.nh_dev == dev)
4062 return true;
4063 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4064 if (iter->fib6_nh.nh_dev == dev)
4065 return true;
4066
4067 return false;
4068}
4069
4070static void rt6_multipath_flush(struct fib6_info *rt)
4071{
4072 struct fib6_info *iter;
4073
4074 rt->should_flush = 1;
4075 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 iter->should_flush = 1;
4077}
4078
4079static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4080 const struct net_device *down_dev)
4081{
4082 struct fib6_info *iter;
4083 unsigned int dead = 0;
4084
4085 if (rt->fib6_nh.nh_dev == down_dev ||
4086 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4087 dead++;
4088 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4089 if (iter->fib6_nh.nh_dev == down_dev ||
4090 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4091 dead++;
4092
4093 return dead;
4094}
4095
4096static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4097 const struct net_device *dev,
4098 unsigned int nh_flags)
4099{
4100 struct fib6_info *iter;
4101
4102 if (rt->fib6_nh.nh_dev == dev)
4103 rt->fib6_nh.nh_flags |= nh_flags;
4104 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4105 if (iter->fib6_nh.nh_dev == dev)
4106 iter->fib6_nh.nh_flags |= nh_flags;
4107}
4108
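/* Walker callback for rt6_sync_down_dev(). Return value convention:
 * -1 asks fib6_clean_node() to delete the route, -2 makes the walker
 * skip the remaining siblings of a multipath route that has already
 * been handled here, and 0 keeps the route in place.
 */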
4109/* called with write lock held for table with rt */
4110static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4111{
4112 const struct arg_netdev_event *arg = p_arg;
4113 const struct net_device *dev = arg->dev;
4114 struct net *net = dev_net(dev);
4115
4116 if (rt == net->ipv6.fib6_null_entry)
4117 return 0;
4118
4119 switch (arg->event) {
4120 case NETDEV_UNREGISTER:
4121 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4122 case NETDEV_DOWN:
4123 if (rt->should_flush)
4124 return -1;
4125 if (!rt->fib6_nsiblings)
4126 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4127 if (rt6_multipath_uses_dev(rt, dev)) {
4128 unsigned int count;
4129
4130 count = rt6_multipath_dead_count(rt, dev);
4131 if (rt->fib6_nsiblings + 1 == count) {
4132 rt6_multipath_flush(rt);
4133 return -1;
4134 }
4135 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4136 RTNH_F_LINKDOWN);
4137 fib6_update_sernum(net, rt);
4138 rt6_multipath_rebalance(rt);
4139 }
4140 return -2;
4141 case NETDEV_CHANGE:
4142 if (rt->fib6_nh.nh_dev != dev ||
4143 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4144 break;
4145 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4146 rt6_multipath_rebalance(rt);
4147 break;
4148 }
4149
4150 return 0;
4151}
4152
4153void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4154{
4155 struct arg_netdev_event arg = {
4156 .dev = dev,
4157 {
4158 .event = event,
4159 },
4160 };
4161
4162 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4163}
4164
4165void rt6_disable_ip(struct net_device *dev, unsigned long event)
4166{
4167 rt6_sync_down_dev(dev, event);
4168 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4169 neigh_ifdown(&nd_tbl, dev);
4170}
4171
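/* Argument block for the rt6_mtu_change() walk over the FIB. */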
4172struct rt6_mtu_change_arg {
4173 struct net_device *dev;
4174 unsigned int mtu;
4175};
4176
4177static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4178{
4179 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4180 struct inet6_dev *idev;
4181
4182	/* In IPv6, PMTU discovery is not optional,
4183	   so an RTAX_MTU lock cannot disable it.
4184	   We still use the lock to block changes
4185	   caused by addrconf/ndisc.
4186	*/
4187
4188 idev = __in6_dev_get(arg->dev);
4189 if (!idev)
4190 return 0;
4191
4192	/* For an administrative MTU increase there is no way to discover
4193	   the IPv6 PMTU increase, so the PMTU has to be updated here.
4194	   Since RFC 1981 doesn't cover administrative MTU increases,
4195	   updating the PMTU on increase is a MUST (e.g. for jumbo frames).
4196	*/
4197 if (rt->fib6_nh.nh_dev == arg->dev &&
4198 !fib6_metric_locked(rt, RTAX_MTU)) {
4199 u32 mtu = rt->fib6_pmtu;
4200
4201 if (mtu >= arg->mtu ||
4202 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4203 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4204
4205 spin_lock_bh(&rt6_exception_lock);
4206 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4207 spin_unlock_bh(&rt6_exception_lock);
4208 }
4209 return 0;
4210}
4211
4212void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4213{
4214 struct rt6_mtu_change_arg arg = {
4215 .dev = dev,
4216 .mtu = mtu,
4217 };
4218
4219 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4220}
4221
4222static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4223 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4224 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4225 [RTA_OIF] = { .type = NLA_U32 },
4226 [RTA_IIF] = { .type = NLA_U32 },
4227 [RTA_PRIORITY] = { .type = NLA_U32 },
4228 [RTA_METRICS] = { .type = NLA_NESTED },
4229 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4230 [RTA_PREF] = { .type = NLA_U8 },
4231 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4232 [RTA_ENCAP] = { .type = NLA_NESTED },
4233 [RTA_EXPIRES] = { .type = NLA_U32 },
4234 [RTA_UID] = { .type = NLA_U32 },
4235 [RTA_MARK] = { .type = NLA_U32 },
4236 [RTA_TABLE] = { .type = NLA_U32 },
4237 [RTA_IP_PROTO] = { .type = NLA_U8 },
4238 [RTA_SPORT] = { .type = NLA_U16 },
4239 [RTA_DPORT] = { .type = NLA_U16 },
4240};
4241
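/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config:
 * validate the attributes against rtm_ipv6_policy, map reject route
 * types to RTF_REJECT, and copy the netlink attributes the rest of
 * this file consumes (gateway, table, metrics, encap, expiry, ...).
 */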
4242static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4243 struct fib6_config *cfg,
4244 struct netlink_ext_ack *extack)
4245{
4246 struct rtmsg *rtm;
4247 struct nlattr *tb[RTA_MAX+1];
4248 unsigned int pref;
4249 int err;
4250
4251 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4252 NULL);
4253 if (err < 0)
4254 goto errout;
4255
4256 err = -EINVAL;
4257 rtm = nlmsg_data(nlh);
4258 memset(cfg, 0, sizeof(*cfg));
4259
4260 cfg->fc_table = rtm->rtm_table;
4261 cfg->fc_dst_len = rtm->rtm_dst_len;
4262 cfg->fc_src_len = rtm->rtm_src_len;
4263 cfg->fc_flags = RTF_UP;
4264 cfg->fc_protocol = rtm->rtm_protocol;
4265 cfg->fc_type = rtm->rtm_type;
4266
4267 if (rtm->rtm_type == RTN_UNREACHABLE ||
4268 rtm->rtm_type == RTN_BLACKHOLE ||
4269 rtm->rtm_type == RTN_PROHIBIT ||
4270 rtm->rtm_type == RTN_THROW ||
4271 rtm->rtm_type == RTN_POLICY_FAILED)
4272 cfg->fc_flags |= RTF_REJECT;
4273
4274 if (rtm->rtm_type == RTN_LOCAL)
4275 cfg->fc_flags |= RTF_LOCAL;
4276
4277 if (rtm->rtm_flags & RTM_F_CLONED)
4278 cfg->fc_flags |= RTF_CACHE;
4279
4280 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4281
4282 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4283 cfg->fc_nlinfo.nlh = nlh;
4284 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4285
4286 if (tb[RTA_GATEWAY]) {
4287 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4288 cfg->fc_flags |= RTF_GATEWAY;
4289 }
4290 if (tb[RTA_VIA]) {
4291 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4292 goto errout;
4293 }
4294
4295 if (tb[RTA_DST]) {
4296 int plen = (rtm->rtm_dst_len + 7) >> 3;
4297
4298 if (nla_len(tb[RTA_DST]) < plen)
4299 goto errout;
4300
4301 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4302 }
4303
4304 if (tb[RTA_SRC]) {
4305 int plen = (rtm->rtm_src_len + 7) >> 3;
4306
4307 if (nla_len(tb[RTA_SRC]) < plen)
4308 goto errout;
4309
4310 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4311 }
4312
4313 if (tb[RTA_PREFSRC])
4314 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4315
4316 if (tb[RTA_OIF])
4317 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4318
4319 if (tb[RTA_PRIORITY])
4320 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4321
4322 if (tb[RTA_METRICS]) {
4323 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4324 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4325 }
4326
4327 if (tb[RTA_TABLE])
4328 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4329
4330 if (tb[RTA_MULTIPATH]) {
4331 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4332 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4333
4334 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4335 cfg->fc_mp_len, extack);
4336 if (err < 0)
4337 goto errout;
4338 }
4339
4340 if (tb[RTA_PREF]) {
4341 pref = nla_get_u8(tb[RTA_PREF]);
4342 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4343 pref != ICMPV6_ROUTER_PREF_HIGH)
4344 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4345 cfg->fc_flags |= RTF_PREF(pref);
4346 }
4347
4348 if (tb[RTA_ENCAP])
4349 cfg->fc_encap = tb[RTA_ENCAP];
4350
4351 if (tb[RTA_ENCAP_TYPE]) {
4352 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4353
4354 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4355 if (err < 0)
4356 goto errout;
4357 }
4358
4359 if (tb[RTA_EXPIRES]) {
4360 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4361
4362 if (addrconf_finite_timeout(timeout)) {
4363 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4364 cfg->fc_flags |= RTF_EXPIRES;
4365 }
4366 }
4367
4368 err = 0;
4369errout:
4370 return err;
4371}
4372
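/* One entry per nexthop of a multipath request, queued on
 * rt6_nh_list while ip6_route_multipath_add() builds and then
 * inserts the sibling routes.
 */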
4373struct rt6_nh {
4374 struct fib6_info *fib6_info;
4375 struct fib6_config r_cfg;
4376 struct list_head next;
4377};
4378
4379static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4380{
4381 struct rt6_nh *nh;
4382
4383 list_for_each_entry(nh, rt6_nh_list, next) {
4384 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4385 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4386 nh->r_cfg.fc_ifindex);
4387 }
4388}
4389
4390static int ip6_route_info_append(struct net *net,
4391 struct list_head *rt6_nh_list,
4392 struct fib6_info *rt,
4393 struct fib6_config *r_cfg)
4394{
4395 struct rt6_nh *nh;
4396 int err = -EEXIST;
4397
4398 list_for_each_entry(nh, rt6_nh_list, next) {
4399 /* check if fib6_info already exists */
4400 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4401 return err;
4402 }
4403
4404 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4405 if (!nh)
4406 return -ENOMEM;
4407 nh->fib6_info = rt;
4408 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4409 list_add_tail(&nh->next, rt6_nh_list);
4410
4411 return 0;
4412}
4413
4414static void ip6_route_mpath_notify(struct fib6_info *rt,
4415 struct fib6_info *rt_last,
4416 struct nl_info *info,
4417 __u16 nlflags)
4418{
4419	/* If this is an APPEND route, then rt points to the first route
4420	 * inserted and rt_last points to the last route inserted. Userspace
4421	 * wants a consistent dump of the route which starts at the first
4422	 * nexthop. Since sibling routes are always added at the end of
4423	 * the list, find the first sibling of the last route appended.
4424	 */
4425 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4426 rt = list_first_entry(&rt_last->fib6_siblings,
4427 struct fib6_info,
4428 fib6_siblings);
4429 }
4430
4431 if (rt)
4432 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4433}
4434
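/* Add a multipath route: parse each rtnexthop entry into a
 * fib6_info, insert the resulting siblings one by one with
 * notifications suppressed, then send a single RTM_NEWROUTE covering
 * the whole route. On a partial failure the routes that were already
 * inserted are deleted again so userspace sees a coherent add/delete
 * sequence.
 */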
4435static int ip6_route_multipath_add(struct fib6_config *cfg,
4436 struct netlink_ext_ack *extack)
4437{
4438 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4439 struct nl_info *info = &cfg->fc_nlinfo;
4440 struct fib6_config r_cfg;
4441 struct rtnexthop *rtnh;
4442 struct fib6_info *rt;
4443 struct rt6_nh *err_nh;
4444 struct rt6_nh *nh, *nh_safe;
4445 __u16 nlflags;
4446 int remaining;
4447 int attrlen;
4448 int err = 1;
4449 int nhn = 0;
4450 int replace = (cfg->fc_nlinfo.nlh &&
4451 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4452 LIST_HEAD(rt6_nh_list);
4453
4454 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4455 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4456 nlflags |= NLM_F_APPEND;
4457
4458 remaining = cfg->fc_mp_len;
4459 rtnh = (struct rtnexthop *)cfg->fc_mp;
4460
4461 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4462 * fib6_info structs per nexthop
4463 */
4464 while (rtnh_ok(rtnh, remaining)) {
4465 memcpy(&r_cfg, cfg, sizeof(*cfg));
4466 if (rtnh->rtnh_ifindex)
4467 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4468
4469 attrlen = rtnh_attrlen(rtnh);
4470 if (attrlen > 0) {
4471 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4472
4473 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4474 if (nla) {
4475 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4476 r_cfg.fc_flags |= RTF_GATEWAY;
4477 }
4478 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4479 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4480 if (nla)
4481 r_cfg.fc_encap_type = nla_get_u16(nla);
4482 }
4483
4484 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4485 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4486 if (IS_ERR(rt)) {
4487 err = PTR_ERR(rt);
4488 rt = NULL;
4489 goto cleanup;
4490 }
4491 if (!rt6_qualify_for_ecmp(rt)) {
4492 err = -EINVAL;
4493 NL_SET_ERR_MSG(extack,
4494				       "Device-only routes cannot be added for IPv6 using the multipath API.");
4495 fib6_info_release(rt);
4496 goto cleanup;
4497 }
4498
4499 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4500
4501 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4502 rt, &r_cfg);
4503 if (err) {
4504 fib6_info_release(rt);
4505 goto cleanup;
4506 }
4507
4508 rtnh = rtnh_next(rtnh, &remaining);
4509 }
4510
4511	/* For add and replace, send one notification with all nexthops.
4512	 * Skip the notification in fib6_add_rt2node and send one with
4513	 * the full route when done.
4514	 */
4515 info->skip_notify = 1;
4516
4517 err_nh = NULL;
4518 list_for_each_entry(nh, &rt6_nh_list, next) {
4519 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4520 fib6_info_release(nh->fib6_info);
4521
4522 if (!err) {
4523 /* save reference to last route successfully inserted */
4524 rt_last = nh->fib6_info;
4525
4526 /* save reference to first route for notification */
4527 if (!rt_notif)
4528 rt_notif = nh->fib6_info;
4529 }
4530
4531		/* nh->fib6_info is used or freed at this point, reset to NULL */
4532 nh->fib6_info = NULL;
4533 if (err) {
4534 if (replace && nhn)
4535 ip6_print_replace_route_err(&rt6_nh_list);
4536 err_nh = nh;
4537 goto add_errout;
4538 }
4539
4540		/* Because each route is added as if it were a single route, we
4541		 * remove these flags after the first nexthop: if there is a
4542		 * collision, we have already failed to add the first nexthop,
4543		 * since fib6_add_rt2node() rejected it; when replacing, the old
4544		 * nexthops have been replaced by the first new one, and the
4545		 * rest should be appended to it.
4546		 */
4547 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4548 NLM_F_REPLACE);
4549 nhn++;
4550 }
4551
4552 /* success ... tell user about new route */
4553 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4554 goto cleanup;
4555
4556add_errout:
4557	/* Send a notification for the routes that were added, so that
4558	 * the delete notifications sent by ip6_route_del are
4559	 * coherent.
4560	 */
4561 if (rt_notif)
4562 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4563
4564 /* Delete routes that were already added */
4565 list_for_each_entry(nh, &rt6_nh_list, next) {
4566 if (err_nh == nh)
4567 break;
4568 ip6_route_del(&nh->r_cfg, extack);
4569 }
4570
4571cleanup:
4572 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4573 if (nh->fib6_info)
4574 fib6_info_release(nh->fib6_info);
4575 list_del(&nh->next);
4576 kfree(nh);
4577 }
4578
4579 return err;
4580}
4581
4582static int ip6_route_multipath_del(struct fib6_config *cfg,
4583 struct netlink_ext_ack *extack)
4584{
4585 struct fib6_config r_cfg;
4586 struct rtnexthop *rtnh;
4587 int remaining;
4588 int attrlen;
4589 int err = 1, last_err = 0;
4590
4591 remaining = cfg->fc_mp_len;
4592 rtnh = (struct rtnexthop *)cfg->fc_mp;
4593
4594 /* Parse a Multipath Entry */
4595 while (rtnh_ok(rtnh, remaining)) {
4596 memcpy(&r_cfg, cfg, sizeof(*cfg));
4597 if (rtnh->rtnh_ifindex)
4598 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4599
4600 attrlen = rtnh_attrlen(rtnh);
4601 if (attrlen > 0) {
4602 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4603
4604 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4605 if (nla) {
4606 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4607 r_cfg.fc_flags |= RTF_GATEWAY;
4608 }
4609 }
4610 err = ip6_route_del(&r_cfg, extack);
4611 if (err)
4612 last_err = err;
4613
4614 rtnh = rtnh_next(rtnh, &remaining);
4615 }
4616
4617 return last_err;
4618}
4619
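/* RTM_DELROUTE handler: with RTA_MULTIPATH each listed nexthop is
 * deleted individually, otherwise the whole route (all nexthops) is
 * removed via fc_delete_all_nh.
 */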
4620static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4621 struct netlink_ext_ack *extack)
4622{
4623 struct fib6_config cfg;
4624 int err;
4625
4626 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4627 if (err < 0)
4628 return err;
4629
4630 if (cfg.fc_mp)
4631 return ip6_route_multipath_del(&cfg, extack);
4632 else {
4633 cfg.fc_delete_all_nh = 1;
4634 return ip6_route_del(&cfg, extack);
4635 }
4636}
4637
4638static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4639 struct netlink_ext_ack *extack)
4640{
4641 struct fib6_config cfg;
4642 int err;
4643
4644 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4645 if (err < 0)
4646 return err;
4647
4648 if (cfg.fc_mp)
4649 return ip6_route_multipath_add(&cfg, extack);
4650 else
4651 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4652}
4653
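/* Worst-case netlink message size for this route, used to size the
 * skb in inet6_rt_notify(); multipath siblings each add an
 * RTA_MULTIPATH nexthop entry on top of the fixed attribute set.
 */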
4654static size_t rt6_nlmsg_size(struct fib6_info *rt)
4655{
4656 int nexthop_len = 0;
4657
4658 if (rt->fib6_nsiblings) {
4659 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4660 + NLA_ALIGN(sizeof(struct rtnexthop))
4661 + nla_total_size(16) /* RTA_GATEWAY */
4662 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4663
4664 nexthop_len *= rt->fib6_nsiblings;
4665 }
4666
4667 return NLMSG_ALIGN(sizeof(struct rtmsg))
4668 + nla_total_size(16) /* RTA_SRC */
4669 + nla_total_size(16) /* RTA_DST */
4670 + nla_total_size(16) /* RTA_GATEWAY */
4671 + nla_total_size(16) /* RTA_PREFSRC */
4672 + nla_total_size(4) /* RTA_TABLE */
4673 + nla_total_size(4) /* RTA_IIF */
4674 + nla_total_size(4) /* RTA_OIF */
4675 + nla_total_size(4) /* RTA_PRIORITY */
4676 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4677 + nla_total_size(sizeof(struct rta_cacheinfo))
4678 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4679 + nla_total_size(1) /* RTA_PREF */
4680 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4681 + nexthop_len;
4682}
4683
4684static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4685 unsigned int *flags, bool skip_oif)
4686{
4687 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4688 *flags |= RTNH_F_DEAD;
4689
4690 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4691 *flags |= RTNH_F_LINKDOWN;
4692
4693 rcu_read_lock();
4694 if (fib6_ignore_linkdown(rt))
4695 *flags |= RTNH_F_DEAD;
4696 rcu_read_unlock();
4697 }
4698
4699 if (rt->fib6_flags & RTF_GATEWAY) {
4700 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4701 goto nla_put_failure;
4702 }
4703
4704 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4705 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4706 *flags |= RTNH_F_OFFLOAD;
4707
4708	/* not needed for multipath encoding because it has an rtnexthop struct */
4709 if (!skip_oif && rt->fib6_nh.nh_dev &&
4710 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4711 goto nla_put_failure;
4712
4713 if (rt->fib6_nh.nh_lwtstate &&
4714 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4715 goto nla_put_failure;
4716
4717 return 0;
4718
4719nla_put_failure:
4720 return -EMSGSIZE;
4721}
4722
4723/* add multipath next hop */
4724static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4725{
4726 const struct net_device *dev = rt->fib6_nh.nh_dev;
4727 struct rtnexthop *rtnh;
4728 unsigned int flags = 0;
4729
4730 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4731 if (!rtnh)
4732 goto nla_put_failure;
4733
4734 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4735 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4736
4737 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4738 goto nla_put_failure;
4739
4740 rtnh->rtnh_flags = flags;
4741
4742 /* length of rtnetlink header + attributes */
4743 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4744
4745 return 0;
4746
4747nla_put_failure:
4748 return -EMSGSIZE;
4749}
4750
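/* Fill one RTM_NEWROUTE message. @rt is the FIB entry; @dst, when
 * non-NULL, is the dst_entry from an actual lookup (RTM_GETROUTE), in
 * which case the rt6_info keys/flags and the dst metrics are reported
 * instead of the raw FIB values.
 */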
4751static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4752 struct fib6_info *rt, struct dst_entry *dst,
4753 struct in6_addr *dest, struct in6_addr *src,
4754 int iif, int type, u32 portid, u32 seq,
4755 unsigned int flags)
4756{
4757 struct rt6_info *rt6 = (struct rt6_info *)dst;
4758 struct rt6key *rt6_dst, *rt6_src;
4759 u32 *pmetrics, table, rt6_flags;
4760 struct nlmsghdr *nlh;
4761 struct rtmsg *rtm;
4762 long expires = 0;
4763
4764 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4765 if (!nlh)
4766 return -EMSGSIZE;
4767
4768 if (rt6) {
4769 rt6_dst = &rt6->rt6i_dst;
4770 rt6_src = &rt6->rt6i_src;
4771 rt6_flags = rt6->rt6i_flags;
4772 } else {
4773 rt6_dst = &rt->fib6_dst;
4774 rt6_src = &rt->fib6_src;
4775 rt6_flags = rt->fib6_flags;
4776 }
4777
4778 rtm = nlmsg_data(nlh);
4779 rtm->rtm_family = AF_INET6;
4780 rtm->rtm_dst_len = rt6_dst->plen;
4781 rtm->rtm_src_len = rt6_src->plen;
4782 rtm->rtm_tos = 0;
4783 if (rt->fib6_table)
4784 table = rt->fib6_table->tb6_id;
4785 else
4786 table = RT6_TABLE_UNSPEC;
4787 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4788 if (nla_put_u32(skb, RTA_TABLE, table))
4789 goto nla_put_failure;
4790
4791 rtm->rtm_type = rt->fib6_type;
4792 rtm->rtm_flags = 0;
4793 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4794 rtm->rtm_protocol = rt->fib6_protocol;
4795
4796 if (rt6_flags & RTF_CACHE)
4797 rtm->rtm_flags |= RTM_F_CLONED;
4798
4799 if (dest) {
4800 if (nla_put_in6_addr(skb, RTA_DST, dest))
4801 goto nla_put_failure;
4802 rtm->rtm_dst_len = 128;
4803 } else if (rtm->rtm_dst_len)
4804 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4805 goto nla_put_failure;
4806#ifdef CONFIG_IPV6_SUBTREES
4807 if (src) {
4808 if (nla_put_in6_addr(skb, RTA_SRC, src))
4809 goto nla_put_failure;
4810 rtm->rtm_src_len = 128;
4811 } else if (rtm->rtm_src_len &&
4812 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4813 goto nla_put_failure;
4814#endif
4815 if (iif) {
4816#ifdef CONFIG_IPV6_MROUTE
4817 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4818 int err = ip6mr_get_route(net, skb, rtm, portid);
4819
4820 if (err == 0)
4821 return 0;
4822 if (err < 0)
4823 goto nla_put_failure;
4824 } else
4825#endif
4826 if (nla_put_u32(skb, RTA_IIF, iif))
4827 goto nla_put_failure;
4828 } else if (dest) {
4829 struct in6_addr saddr_buf;
4830 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4831 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4832 goto nla_put_failure;
4833 }
4834
4835 if (rt->fib6_prefsrc.plen) {
4836 struct in6_addr saddr_buf;
4837 saddr_buf = rt->fib6_prefsrc.addr;
4838 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4839 goto nla_put_failure;
4840 }
4841
4842 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4843 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4844 goto nla_put_failure;
4845
4846 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4847 goto nla_put_failure;
4848
4849 /* For multipath routes, walk the siblings list and add
4850 * each as a nexthop within RTA_MULTIPATH.
4851 */
4852 if (rt6) {
4853 if (rt6_flags & RTF_GATEWAY &&
4854 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4855 goto nla_put_failure;
4856
4857 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4858 goto nla_put_failure;
4859 } else if (rt->fib6_nsiblings) {
4860 struct fib6_info *sibling, *next_sibling;
4861 struct nlattr *mp;
4862
4863 mp = nla_nest_start(skb, RTA_MULTIPATH);
4864 if (!mp)
4865 goto nla_put_failure;
4866
4867 if (rt6_add_nexthop(skb, rt) < 0)
4868 goto nla_put_failure;
4869
4870 list_for_each_entry_safe(sibling, next_sibling,
4871 &rt->fib6_siblings, fib6_siblings) {
4872 if (rt6_add_nexthop(skb, sibling) < 0)
4873 goto nla_put_failure;
4874 }
4875
4876 nla_nest_end(skb, mp);
4877 } else {
4878 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4879 goto nla_put_failure;
4880 }
4881
4882 if (rt6_flags & RTF_EXPIRES) {
4883 expires = dst ? dst->expires : rt->expires;
4884 expires -= jiffies;
4885 }
4886
4887 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4888 goto nla_put_failure;
4889
4890 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4891 goto nla_put_failure;
4892
4894 nlmsg_end(skb, nlh);
4895 return 0;
4896
4897nla_put_failure:
4898 nlmsg_cancel(skb, nlh);
4899 return -EMSGSIZE;
4900}
4901
4902int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4903{
4904 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4905 struct net *net = arg->net;
4906
4907 if (rt == net->ipv6.fib6_null_entry)
4908 return 0;
4909
4910 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4911 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4912
4913 /* user wants prefix routes only */
4914 if (rtm->rtm_flags & RTM_F_PREFIX &&
4915 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4916 /* success since this is not a prefix route */
4917 return 1;
4918 }
4919 }
4920
4921 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4922 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4923 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4924}
4925
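/* RTM_GETROUTE handler: build a flowi6 from the request attributes,
 * perform an input lookup (under RCU) when RTA_IIF is given or an
 * output lookup otherwise, and report either the resulting dst or,
 * with RTM_F_FIB_MATCH, the FIB entry the lookup matched.
 */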
4926static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4927 struct netlink_ext_ack *extack)
4928{
4929 struct net *net = sock_net(in_skb->sk);
4930 struct nlattr *tb[RTA_MAX+1];
4931 int err, iif = 0, oif = 0;
4932 struct fib6_info *from;
4933 struct dst_entry *dst;
4934 struct rt6_info *rt;
4935 struct sk_buff *skb;
4936 struct rtmsg *rtm;
4937 struct flowi6 fl6;
4938 bool fibmatch;
4939
4940 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4941 extack);
4942 if (err < 0)
4943 goto errout;
4944
4945 err = -EINVAL;
4946 memset(&fl6, 0, sizeof(fl6));
4947 rtm = nlmsg_data(nlh);
4948 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4949 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4950
4951 if (tb[RTA_SRC]) {
4952 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4953 goto errout;
4954
4955 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4956 }
4957
4958 if (tb[RTA_DST]) {
4959 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4960 goto errout;
4961
4962 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4963 }
4964
4965 if (tb[RTA_IIF])
4966 iif = nla_get_u32(tb[RTA_IIF]);
4967
4968 if (tb[RTA_OIF])
4969 oif = nla_get_u32(tb[RTA_OIF]);
4970
4971 if (tb[RTA_MARK])
4972 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4973
4974 if (tb[RTA_UID])
4975 fl6.flowi6_uid = make_kuid(current_user_ns(),
4976 nla_get_u32(tb[RTA_UID]));
4977 else
4978 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4979
4980 if (tb[RTA_SPORT])
4981 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4982
4983 if (tb[RTA_DPORT])
4984 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4985
4986 if (tb[RTA_IP_PROTO]) {
4987 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4988 &fl6.flowi6_proto, AF_INET6,
4989 extack);
4990 if (err)
4991 goto errout;
4992 }
4993
4994 if (iif) {
4995 struct net_device *dev;
4996 int flags = 0;
4997
4998 rcu_read_lock();
4999
5000 dev = dev_get_by_index_rcu(net, iif);
5001 if (!dev) {
5002 rcu_read_unlock();
5003 err = -ENODEV;
5004 goto errout;
5005 }
5006
5007 fl6.flowi6_iif = iif;
5008
5009 if (!ipv6_addr_any(&fl6.saddr))
5010 flags |= RT6_LOOKUP_F_HAS_SADDR;
5011
5012 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5013
5014 rcu_read_unlock();
5015 } else {
5016 fl6.flowi6_oif = oif;
5017
5018 dst = ip6_route_output(net, NULL, &fl6);
5019 }
5020
5022 rt = container_of(dst, struct rt6_info, dst);
5023 if (rt->dst.error) {
5024 err = rt->dst.error;
5025 ip6_rt_put(rt);
5026 goto errout;
5027 }
5028
5029 if (rt == net->ipv6.ip6_null_entry) {
5030 err = rt->dst.error;
5031 ip6_rt_put(rt);
5032 goto errout;
5033 }
5034
5035 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5036 if (!skb) {
5037 ip6_rt_put(rt);
5038 err = -ENOBUFS;
5039 goto errout;
5040 }
5041
5042 skb_dst_set(skb, &rt->dst);
5043
5044 rcu_read_lock();
5045 from = rcu_dereference(rt->from);
5046 if (from) {
5047 if (fibmatch)
5048 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5049 iif, RTM_NEWROUTE,
5050 NETLINK_CB(in_skb).portid,
5051 nlh->nlmsg_seq, 0);
5052 else
5053 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5054 &fl6.saddr, iif, RTM_NEWROUTE,
5055 NETLINK_CB(in_skb).portid,
5056 nlh->nlmsg_seq, 0);
5057 } else {
5058 err = -ENETUNREACH;
5059 }
5060 rcu_read_unlock();
5061
5062 if (err < 0) {
5063 kfree_skb(skb);
5064 goto errout;
5065 }
5066
5067 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5068errout:
5069 return err;
5070}
5071
5072void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5073 unsigned int nlm_flags)
5074{
5075 struct sk_buff *skb;
5076 struct net *net = info->nl_net;
5077 u32 seq;
5078 int err;
5079
5080 err = -ENOBUFS;
5081 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5082
5083 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5084 if (!skb)
5085 goto errout;
5086
5087 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5088 event, info->portid, seq, nlm_flags);
5089 if (err < 0) {
5090 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5091 WARN_ON(err == -EMSGSIZE);
5092 kfree_skb(skb);
5093 goto errout;
5094 }
5095 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5096 info->nlh, gfp_any());
5097 return;
5098errout:
5099 if (err < 0)
5100 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5101}
5102
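/* Keep the special null/prohibit/blackhole/policy-failed dst entries
 * bound to the per-namespace loopback device as it registers and
 * unregisters.
 */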
5103static int ip6_route_dev_notify(struct notifier_block *this,
5104 unsigned long event, void *ptr)
5105{
5106 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5107 struct net *net = dev_net(dev);
5108
5109 if (!(dev->flags & IFF_LOOPBACK))
5110 return NOTIFY_OK;
5111
5112 if (event == NETDEV_REGISTER) {
5113 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5114 net->ipv6.ip6_null_entry->dst.dev = dev;
5115 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5116#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5117 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5118 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5119 net->ipv6.ip6_policy_failed_entry->dst.dev = dev;
5120 net->ipv6.ip6_policy_failed_entry->rt6i_idev = in6_dev_get(dev);
5121 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5122 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5123#endif
5124 } else if (event == NETDEV_UNREGISTER &&
5125 dev->reg_state != NETREG_UNREGISTERED) {
5126		/* NETDEV_UNREGISTER can be fired multiple times by
5127		 * netdev_wait_allrefs(). Make sure we only do this once.
5128		 */
5129 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5130#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5131 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5132 in6_dev_put_clear(&net->ipv6.ip6_policy_failed_entry->rt6i_idev);
5133 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5134#endif
5135 }
5136
5137 return NOTIFY_OK;
5138}
5139
5140/*
5141 * /proc
5142 */
5143
5144#ifdef CONFIG_PROC_FS
5145static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5146{
5147	struct net *net = (struct net *)seq->private;

5148	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5149 net->ipv6.rt6_stats->fib_nodes,
5150 net->ipv6.rt6_stats->fib_route_nodes,
5151 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5152 net->ipv6.rt6_stats->fib_rt_entries,
5153 net->ipv6.rt6_stats->fib_rt_cache,
5154 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5155 net->ipv6.rt6_stats->fib_discarded_routes);
5156
5157 return 0;
5158}
5159#endif /* CONFIG_PROC_FS */
5160
5161#ifdef CONFIG_SYSCTL
5162
5163static
5164int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5165 void __user *buffer, size_t *lenp, loff_t *ppos)
5166{
5167 struct net *net;
5168	int delay;

5169	if (!write)
5170 return -EINVAL;
5171
5172 net = (struct net *)ctl->extra1;
5173 delay = net->ipv6.sysctl.flush_delay;
5174 proc_dointvec(ctl, write, buffer, lenp, ppos);
5175 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5176 return 0;
5177}
5178
5179struct ctl_table ipv6_route_table_template[] = {
5180 {
5181 .procname = "flush",
5182 .data = &init_net.ipv6.sysctl.flush_delay,
5183 .maxlen = sizeof(int),
5184 .mode = 0200,
5185 .proc_handler = ipv6_sysctl_rtcache_flush
5186 },
5187 {
5188 .procname = "gc_thresh",
5189 .data = &ip6_dst_ops_template.gc_thresh,
5190 .maxlen = sizeof(int),
5191 .mode = 0644,
5192 .proc_handler = proc_dointvec,
5193 },
5194 {
5195 .procname = "max_size",
5196 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
5197 .maxlen = sizeof(int),
5198 .mode = 0644,
5199 .proc_handler = proc_dointvec,
5200 },
5201 {
5202 .procname = "gc_min_interval",
5203 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5204 .maxlen = sizeof(int),
5205 .mode = 0644,
5206 .proc_handler = proc_dointvec_jiffies,
5207 },
5208 {
5209 .procname = "gc_timeout",
5210 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5211 .maxlen = sizeof(int),
5212 .mode = 0644,
5213 .proc_handler = proc_dointvec_jiffies,
5214 },
5215 {
5216 .procname = "gc_interval",
5217 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
5218 .maxlen = sizeof(int),
5219 .mode = 0644,
5220 .proc_handler = proc_dointvec_jiffies,
5221 },
5222 {
5223 .procname = "gc_elasticity",
5224 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5225 .maxlen = sizeof(int),
5226 .mode = 0644,
5227 .proc_handler = proc_dointvec,
5228 },
5229 {
5230 .procname = "mtu_expires",
5231 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5232 .maxlen = sizeof(int),
5233 .mode = 0644,
5234 .proc_handler = proc_dointvec_jiffies,
5235 },
5236 {
5237 .procname = "min_adv_mss",
5238 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
5239 .maxlen = sizeof(int),
5240 .mode = 0644,
5241 .proc_handler = proc_dointvec,
5242 },
5243 {
5244 .procname = "gc_min_interval_ms",
5245 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5246 .maxlen = sizeof(int),
5247 .mode = 0644,
5248 .proc_handler = proc_dointvec_ms_jiffies,
5249 },
5250 { }
5251};
5252
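/* Duplicate the template for this namespace and point each entry's
 * .data at the per-netns field; the indices below must stay in sync
 * with the template order above. Clearing the first procname
 * truncates the table, hiding it from non-init user namespaces.
 */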
5253struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5254{
5255 struct ctl_table *table;
5256
5257 table = kmemdup(ipv6_route_table_template,
5258 sizeof(ipv6_route_table_template),
5259 GFP_KERNEL);
5260
5261 if (table) {
5262 table[0].data = &net->ipv6.sysctl.flush_delay;
5263 table[0].extra1 = net;
5264 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5265 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5266 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5267 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5268 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5269 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5270 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5271 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5272 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5273
5274 /* Don't export sysctls to unprivileged users */
5275 if (net->user_ns != &init_user_ns)
5276 table[0].procname = NULL;
5277 }
5278
5279 return table;
5280}
5281#endif
5282
5283static int __net_init ip6_route_net_init(struct net *net)
5284{
5285 int ret = -ENOMEM;
5286
5287 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5288 sizeof(net->ipv6.ip6_dst_ops));
5289
5290 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5291 goto out_ip6_dst_ops;
5292
5293 net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5294 sizeof(*net->ipv6.fib6_null_entry),
5295 GFP_KERNEL);
5296 if (!net->ipv6.fib6_null_entry)
5297 goto out_ip6_dst_entries;
5298
5299 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5300 sizeof(*net->ipv6.ip6_null_entry),
5301 GFP_KERNEL);
5302 if (!net->ipv6.ip6_null_entry)
5303 goto out_fib6_null_entry;
5304 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5305 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5306 ip6_template_metrics, true);
5307
5308#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5309 net->ipv6.fib6_has_custom_rules = false;
5310 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5311 sizeof(*net->ipv6.ip6_prohibit_entry),
5312 GFP_KERNEL);
5313 if (!net->ipv6.ip6_prohibit_entry)
5314 goto out_ip6_null_entry;
5315 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5316 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5317 ip6_template_metrics, true);
5318
5319 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5320 sizeof(*net->ipv6.ip6_blk_hole_entry),
5321 GFP_KERNEL);
5322 if (!net->ipv6.ip6_blk_hole_entry)
5323 goto out_ip6_prohibit_entry;
5324 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5325 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5326 ip6_template_metrics, true);
5327
5328 net->ipv6.ip6_policy_failed_entry =
5329 kmemdup(&ip6_policy_failed_entry_template,
5330 sizeof(*net->ipv6.ip6_policy_failed_entry), GFP_KERNEL);
5331 if (!net->ipv6.ip6_policy_failed_entry)
5332 goto out_ip6_blk_hole_entry;
5333 net->ipv6.ip6_policy_failed_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5334 dst_init_metrics(&net->ipv6.ip6_policy_failed_entry->dst,
5335 ip6_template_metrics, true);
5336#endif
5337
5338 net->ipv6.sysctl.flush_delay = 0;
5339 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5340 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5341 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5342 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5343 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5344 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5345 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5346
5347 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5348
5349 ret = 0;
5350out:
5351 return ret;
5352
5353#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5354out_ip6_blk_hole_entry:
5355 kfree(net->ipv6.ip6_blk_hole_entry);
5356out_ip6_prohibit_entry:
5357 kfree(net->ipv6.ip6_prohibit_entry);
5358out_ip6_null_entry:
5359 kfree(net->ipv6.ip6_null_entry);
5360#endif
5361out_fib6_null_entry:
5362 kfree(net->ipv6.fib6_null_entry);
5363out_ip6_dst_entries:
5364 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5365out_ip6_dst_ops:
5366 goto out;
5367}
5368
5369static void __net_exit ip6_route_net_exit(struct net *net)
5370{
5371 kfree(net->ipv6.fib6_null_entry);
5372 kfree(net->ipv6.ip6_null_entry);
5373#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5374 kfree(net->ipv6.ip6_prohibit_entry);
5375 kfree(net->ipv6.ip6_blk_hole_entry);
5376 kfree(net->ipv6.ip6_policy_failed_entry);
5377#endif
5378 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5379}
5380
5381static int __net_init ip6_route_net_init_late(struct net *net)
5382{
5383#ifdef CONFIG_PROC_FS
5384 proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5385 sizeof(struct ipv6_route_iter));
5386 proc_create_net_single("rt6_stats", 0444, net->proc_net,
5387 rt6_stats_seq_show, NULL);
5388#endif
5389 return 0;
5390}
5391
5392static void __net_exit ip6_route_net_exit_late(struct net *net)
5393{
5394#ifdef CONFIG_PROC_FS
5395 remove_proc_entry("ipv6_route", net->proc_net);
5396 remove_proc_entry("rt6_stats", net->proc_net);
5397#endif
5398}
5399
5400static struct pernet_operations ip6_route_net_ops = {
5401 .init = ip6_route_net_init,
5402 .exit = ip6_route_net_exit,
5403};
5404
5405static int __net_init ipv6_inetpeer_init(struct net *net)
5406{
5407 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5408
5409 if (!bp)
5410 return -ENOMEM;
5411 inet_peer_base_init(bp);
5412 net->ipv6.peers = bp;
5413 return 0;
5414}
5415
5416static void __net_exit ipv6_inetpeer_exit(struct net *net)
5417{
5418 struct inet_peer_base *bp = net->ipv6.peers;
5419
5420 net->ipv6.peers = NULL;
5421 inetpeer_invalidate_tree(bp);
5422 kfree(bp);
5423}
5424
5425static struct pernet_operations ipv6_inetpeer_ops = {
5426 .init = ipv6_inetpeer_init,
5427 .exit = ipv6_inetpeer_exit,
5428};
5429
5430static struct pernet_operations ip6_route_net_late_ops = {
5431 .init = ip6_route_net_init_late,
5432 .exit = ip6_route_net_exit_late,
5433};
5434
5435static struct notifier_block ip6_route_dev_notifier = {
5436 .notifier_call = ip6_route_dev_notify,
5437 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5438};
5439
5440void __init ip6_route_init_special_entries(void)
5441{
5442	/* The loopback device is registered before this code runs, so the
5443	 * loopback reference in rt6_info is not taken automatically; take
5444	 * it manually for init_net. */
5445 init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5446 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5447 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5448 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5449 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5450 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5451 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5452 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5453 init_net.ipv6.ip6_policy_failed_entry->dst.dev = init_net.loopback_dev;
5454 init_net.ipv6.ip6_policy_failed_entry->rt6i_idev =
5455 in6_dev_get(init_net.loopback_dev);
5456 #endif
5457}
5458
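/* Module init: set up the dst kmem cache and pernet subsystems,
 * register the RTM_NEWROUTE/DELROUTE/GETROUTE handlers and the
 * netdevice notifier, and initialize the per-CPU uncached route
 * lists. Failures unwind in reverse order through the labels below.
 */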
5459int __init ip6_route_init(void)
5460{
5461 int ret;
5462 int cpu;
5463
5464 ret = -ENOMEM;
5465 ip6_dst_ops_template.kmem_cachep =
5466 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5467 SLAB_HWCACHE_ALIGN, NULL);
5468 if (!ip6_dst_ops_template.kmem_cachep)
5469 goto out;
5470
5471 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5472 if (ret)
5473 goto out_kmem_cache;
5474
5475 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5476 if (ret)
5477 goto out_dst_entries;
5478
5479 ret = register_pernet_subsys(&ip6_route_net_ops);
5480 if (ret)
5481 goto out_register_inetpeer;
5482
5483 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5484
5485 ret = fib6_init();
5486 if (ret)
5487 goto out_register_subsys;
5488
5489 ret = xfrm6_init();
5490 if (ret)
5491 goto out_fib6_init;
5492
5493 ret = fib6_rules_init();
5494 if (ret)
5495 goto xfrm6_init;
5496
5497 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5498 if (ret)
5499 goto fib6_rules_init;
5500
5501 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5502 inet6_rtm_newroute, NULL, 0);
5503 if (ret < 0)
5504 goto out_register_late_subsys;
5505
5506 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5507 inet6_rtm_delroute, NULL, 0);
5508 if (ret < 0)
5509 goto out_register_late_subsys;
5510
5511 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5512 inet6_rtm_getroute, NULL,
5513 RTNL_FLAG_DOIT_UNLOCKED);
5514 if (ret < 0)
5515 goto out_register_late_subsys;
5516
5517 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5518 if (ret)
5519 goto out_register_late_subsys;
5520
5521 for_each_possible_cpu(cpu) {
5522 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5523
5524 INIT_LIST_HEAD(&ul->head);
5525 spin_lock_init(&ul->lock);
5526 }
5527
5528out:
5529 return ret;
5530
5531out_register_late_subsys:
5532 rtnl_unregister_all(PF_INET6);
5533 unregister_pernet_subsys(&ip6_route_net_late_ops);
5534fib6_rules_init:
5535 fib6_rules_cleanup();
5536xfrm6_init:
5537 xfrm6_fini();
5538out_fib6_init:
5539 fib6_gc_cleanup();
5540out_register_subsys:
5541 unregister_pernet_subsys(&ip6_route_net_ops);
5542out_register_inetpeer:
5543 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5544out_dst_entries:
5545 dst_entries_destroy(&ip6_dst_blackhole_ops);
5546out_kmem_cache:
5547 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5548 goto out;
5549}
5550
5551void ip6_route_cleanup(void)
5552{
5553 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5554 unregister_pernet_subsys(&ip6_route_net_late_ops);
5555 fib6_rules_cleanup();
5556 xfrm6_fini();
5557 fib6_gc_cleanup();
5558 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5559 unregister_pernet_subsys(&ip6_route_net_ops);
5560 dst_entries_destroy(&ip6_dst_blackhole_ops);
5561 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5562}