blob: 98ab1564aade672434147e0b698b912294b350d5 [file] [log] [blame]
rjw1f884582022-01-06 17:20:42 +08001/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14/* Changes:
15 *
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
23 * Ville Nuorvala
24 * Fixed routing subtrees.
25 */
26
27#define pr_fmt(fmt) "IPv6: " fmt
28
29#include <linux/capability.h>
30#include <linux/errno.h>
31#include <linux/export.h>
32#include <linux/types.h>
33#include <linux/times.h>
34#include <linux/socket.h>
35#include <linux/sockios.h>
36#include <linux/net.h>
37#include <linux/route.h>
38#include <linux/netdevice.h>
39#include <linux/in6.h>
40#include <linux/mroute6.h>
41#include <linux/init.h>
42#include <linux/if_arp.h>
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
45#include <linux/nsproxy.h>
46#include <linux/slab.h>
47#include <net/net_namespace.h>
48#include <net/snmp.h>
49#include <net/ipv6.h>
50#include <net/ip6_fib.h>
51#include <net/ip6_route.h>
52#include <net/ndisc.h>
53#include <net/addrconf.h>
54#include <net/tcp.h>
55#include <linux/rtnetlink.h>
56#include <net/dst.h>
57#include <net/dst_metadata.h>
58#include <net/xfrm.h>
59#include <net/netevent.h>
60#include <net/netlink.h>
61#include <net/nexthop.h>
62#include <net/lwtunnel.h>
63#include <net/ip_tunnels.h>
64#include <net/l3mdev.h>
65#include <trace/events/fib6.h>
66
67#include <linux/uaccess.h>
68
69#ifdef CONFIG_SYSCTL
70#include <linux/sysctl.h>
71#endif
72
/*
 * Result of the neighbour check performed while scoring a route.
 * Failures are negative so that callers can simply test "< 0".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE	= -2,	/* neighbour is in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR	= -1,	/* find_match() asks for round-robin */
	RT6_NUD_SUCCEED		= 1
};
79
80static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83static unsigned int ip6_mtu(const struct dst_entry *dst);
84static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85static void ip6_dst_destroy(struct dst_entry *);
86static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88static int ip6_dst_gc(struct dst_ops *ops);
89
90static int ip6_pkt_discard(struct sk_buff *skb);
91static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92static int ip6_pkt_prohibit(struct sk_buff *skb);
93static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94static void ip6_link_failure(struct sk_buff *skb);
95static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu,
97 bool confirm_neigh);
98static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 struct sk_buff *skb);
100static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102static size_t rt6_nlmsg_size(struct rt6_info *rt);
103static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
107 unsigned int flags);
108
109#ifdef CONFIG_IPV6_ROUTE_INFO
/* Route-information helpers used by rt6_route_rcv() before their definition */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
119#endif
120
121struct uncached_list {
122 spinlock_t lock;
123 struct list_head head;
124};
125
126static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
127
128static void rt6_uncached_list_add(struct rt6_info *rt)
129{
130 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
131
132 rt->rt6i_uncached_list = ul;
133
134 spin_lock_bh(&ul->lock);
135 list_add_tail(&rt->rt6i_uncached, &ul->head);
136 spin_unlock_bh(&ul->lock);
137}
138
139static void rt6_uncached_list_del(struct rt6_info *rt)
140{
141 if (!list_empty(&rt->rt6i_uncached)) {
142 struct uncached_list *ul = rt->rt6i_uncached_list;
143
144 spin_lock_bh(&ul->lock);
145 list_del(&rt->rt6i_uncached);
146 spin_unlock_bh(&ul->lock);
147 }
148}
149
150static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
151{
152 struct net_device *loopback_dev = net->loopback_dev;
153 int cpu;
154
155 if (dev == loopback_dev)
156 return;
157
158 for_each_possible_cpu(cpu) {
159 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
160 struct rt6_info *rt;
161
162 spin_lock_bh(&ul->lock);
163 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164 struct inet6_dev *rt_idev = rt->rt6i_idev;
165 struct net_device *rt_dev = rt->dst.dev;
166
167 if (rt_idev->dev == dev) {
168 rt->rt6i_idev = in6_dev_get(loopback_dev);
169 in6_dev_put(rt_idev);
170 }
171
172 if (rt_dev == dev) {
173 rt->dst.dev = loopback_dev;
174 dev_hold(rt->dst.dev);
175 dev_put(rt_dev);
176 }
177 }
178 spin_unlock_bh(&ul->lock);
179 }
180}
181
182static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
183{
184 return dst_metrics_write_ptr(rt->dst.from);
185}
186
187static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
188{
189 struct rt6_info *rt = (struct rt6_info *)dst;
190
191 if (rt->rt6i_flags & RTF_PCPU)
192 return rt6_pcpu_cow_metrics(rt);
193 else if (rt->rt6i_flags & RTF_CACHE)
194 return NULL;
195 else
196 return dst_cow_metrics_generic(dst, old);
197}
198
199static inline const void *choose_neigh_daddr(struct rt6_info *rt,
200 struct sk_buff *skb,
201 const void *daddr)
202{
203 struct in6_addr *p = &rt->rt6i_gateway;
204
205 if (!ipv6_addr_any(p))
206 return (const void *) p;
207 else if (skb)
208 return &ipv6_hdr(skb)->daddr;
209 return daddr;
210}
211
212static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
213 struct sk_buff *skb,
214 const void *daddr)
215{
216 struct rt6_info *rt = (struct rt6_info *) dst;
217 struct neighbour *n;
218
219 daddr = choose_neigh_daddr(rt, skb, daddr);
220 n = __ipv6_neigh_lookup(dst->dev, daddr);
221 if (n)
222 return n;
223 return neigh_create(&nd_tbl, daddr, dst->dev);
224}
225
226static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
227{
228 struct net_device *dev = dst->dev;
229 struct rt6_info *rt = (struct rt6_info *)dst;
230
231 daddr = choose_neigh_daddr(rt, NULL, daddr);
232 if (!daddr)
233 return;
234 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
235 return;
236 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
237 return;
238 __ipv6_confirm_neigh(dev, daddr);
239}
240
241static struct dst_ops ip6_dst_ops_template = {
242 .family = AF_INET6,
243 .gc = ip6_dst_gc,
244 .gc_thresh = 1024,
245 .check = ip6_dst_check,
246 .default_advmss = ip6_default_advmss,
247 .mtu = ip6_mtu,
248 .cow_metrics = ipv6_cow_metrics,
249 .destroy = ip6_dst_destroy,
250 .ifdown = ip6_dst_ifdown,
251 .negative_advice = ip6_negative_advice,
252 .link_failure = ip6_link_failure,
253 .update_pmtu = ip6_rt_update_pmtu,
254 .redirect = rt6_do_redirect,
255 .local_out = __ip6_local_out,
256 .neigh_lookup = ip6_neigh_lookup,
257 .confirm_neigh = ip6_confirm_neigh,
258};
259
260static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
261{
262 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
263
264 return mtu ? : dst->dev->mtu;
265}
266
267static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268 struct sk_buff *skb, u32 mtu,
269 bool confirm_neigh)
270{
271}
272
/* dst_ops->redirect for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
}
277
278static struct dst_ops ip6_dst_blackhole_ops = {
279 .family = AF_INET6,
280 .destroy = ip6_dst_destroy,
281 .check = ip6_dst_check,
282 .mtu = ip6_blackhole_mtu,
283 .default_advmss = ip6_default_advmss,
284 .update_pmtu = ip6_rt_blackhole_update_pmtu,
285 .redirect = ip6_rt_blackhole_redirect,
286 .cow_metrics = dst_cow_metrics_generic,
287 .neigh_lookup = ip6_neigh_lookup,
288};
289
290static const u32 ip6_template_metrics[RTAX_MAX] = {
291 [RTAX_HOPLIMIT - 1] = 0,
292};
293
294static const struct rt6_info ip6_null_entry_template = {
295 .dst = {
296 .__refcnt = ATOMIC_INIT(1),
297 .__use = 1,
298 .obsolete = DST_OBSOLETE_FORCE_CHK,
299 .error = -ENETUNREACH,
300 .input = ip6_pkt_discard,
301 .output = ip6_pkt_discard_out,
302 },
303 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
304 .rt6i_protocol = RTPROT_KERNEL,
305 .rt6i_metric = ~(u32) 0,
306 .rt6i_ref = ATOMIC_INIT(1),
307};
308
309#ifdef CONFIG_IPV6_MULTIPLE_TABLES
310
311static const struct rt6_info ip6_prohibit_entry_template = {
312 .dst = {
313 .__refcnt = ATOMIC_INIT(1),
314 .__use = 1,
315 .obsolete = DST_OBSOLETE_FORCE_CHK,
316 .error = -EACCES,
317 .input = ip6_pkt_prohibit,
318 .output = ip6_pkt_prohibit_out,
319 },
320 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
321 .rt6i_protocol = RTPROT_KERNEL,
322 .rt6i_metric = ~(u32) 0,
323 .rt6i_ref = ATOMIC_INIT(1),
324};
325
326static const struct rt6_info ip6_blk_hole_entry_template = {
327 .dst = {
328 .__refcnt = ATOMIC_INIT(1),
329 .__use = 1,
330 .obsolete = DST_OBSOLETE_FORCE_CHK,
331 .error = -EINVAL,
332 .input = dst_discard,
333 .output = dst_discard_out,
334 },
335 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
336 .rt6i_protocol = RTPROT_KERNEL,
337 .rt6i_metric = ~(u32) 0,
338 .rt6i_ref = ATOMIC_INIT(1),
339};
340
341#endif
342
343static void rt6_info_init(struct rt6_info *rt)
344{
345 struct dst_entry *dst = &rt->dst;
346
347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348 INIT_LIST_HEAD(&rt->rt6i_siblings);
349 INIT_LIST_HEAD(&rt->rt6i_uncached);
350}
351
352/* allocate dst with ip6_dst_ops */
353static struct rt6_info *__ip6_dst_alloc(struct net *net,
354 struct net_device *dev,
355 int flags)
356{
357 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
358 1, DST_OBSOLETE_FORCE_CHK, flags);
359
360 if (rt)
361 rt6_info_init(rt);
362
363 return rt;
364}
365
366struct rt6_info *ip6_dst_alloc(struct net *net,
367 struct net_device *dev,
368 int flags)
369{
370 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
371
372 if (rt) {
373 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
374 if (rt->rt6i_pcpu) {
375 int cpu;
376
377 for_each_possible_cpu(cpu) {
378 struct rt6_info **p;
379
380 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
381 /* no one shares rt */
382 *p = NULL;
383 }
384 } else {
385 dst_release_immediate(&rt->dst);
386 return NULL;
387 }
388 }
389
390 return rt;
391}
392EXPORT_SYMBOL(ip6_dst_alloc);
393
394static void ip6_dst_destroy(struct dst_entry *dst)
395{
396 struct rt6_info *rt = (struct rt6_info *)dst;
397 struct dst_entry *from = dst->from;
398 struct inet6_dev *idev;
399
400 dst_destroy_metrics_generic(dst);
401 free_percpu(rt->rt6i_pcpu);
402 rt6_uncached_list_del(rt);
403
404 idev = rt->rt6i_idev;
405 if (idev) {
406 rt->rt6i_idev = NULL;
407 in6_dev_put(idev);
408 }
409
410 dst->from = NULL;
411 dst_release(from);
412}
413
414static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
415 int how)
416{
417 struct rt6_info *rt = (struct rt6_info *)dst;
418 struct inet6_dev *idev = rt->rt6i_idev;
419 struct net_device *loopback_dev =
420 dev_net(dev)->loopback_dev;
421
422 if (idev && idev->dev != loopback_dev) {
423 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
424 if (loopback_idev) {
425 rt->rt6i_idev = loopback_idev;
426 in6_dev_put(idev);
427 }
428 }
429}
430
431static bool __rt6_check_expired(const struct rt6_info *rt)
432{
433 if (rt->rt6i_flags & RTF_EXPIRES)
434 return time_after(jiffies, rt->dst.expires);
435 else
436 return false;
437}
438
439static bool rt6_check_expired(const struct rt6_info *rt)
440{
441 if (rt->rt6i_flags & RTF_EXPIRES) {
442 if (time_after(jiffies, rt->dst.expires))
443 return true;
444 } else if (rt->dst.from) {
445 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
446 rt6_check_expired((struct rt6_info *)rt->dst.from);
447 }
448 return false;
449}
450
451static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
453 int strict)
454{
455 struct rt6_info *sibling, *next_sibling;
456 int route_choosen;
457
458 /* We might have already computed the hash for ICMPv6 errors. In such
459 * case it will always be non-zero. Otherwise now is the time to do it.
460 */
461 if (!fl6->mp_hash)
462 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
463
464 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
465 /* Don't change the route, if route_choosen == 0
466 * (siblings does not include ourself)
467 */
468 if (route_choosen)
469 list_for_each_entry_safe(sibling, next_sibling,
470 &match->rt6i_siblings, rt6i_siblings) {
471 route_choosen--;
472 if (route_choosen == 0) {
473 if (rt6_score_route(sibling, oif, strict) < 0)
474 break;
475 match = sibling;
476 break;
477 }
478 }
479 return match;
480}
481
482/*
483 * Route lookup. Any table->tb6_lock is implied.
484 */
485
486static inline struct rt6_info *rt6_device_match(struct net *net,
487 struct rt6_info *rt,
488 const struct in6_addr *saddr,
489 int oif,
490 int flags)
491{
492 struct rt6_info *local = NULL;
493 struct rt6_info *sprt;
494
495 if (!oif && ipv6_addr_any(saddr))
496 goto out;
497
498 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
499 struct net_device *dev = sprt->dst.dev;
500
501 if (oif) {
502 if (dev->ifindex == oif)
503 return sprt;
504 if (dev->flags & IFF_LOOPBACK) {
505 if (!sprt->rt6i_idev ||
506 sprt->rt6i_idev->dev->ifindex != oif) {
507 if (flags & RT6_LOOKUP_F_IFACE)
508 continue;
509 if (local &&
510 local->rt6i_idev->dev->ifindex == oif)
511 continue;
512 }
513 local = sprt;
514 }
515 } else {
516 if (ipv6_chk_addr(net, saddr, dev,
517 flags & RT6_LOOKUP_F_IFACE))
518 return sprt;
519 }
520 }
521
522 if (oif) {
523 if (local)
524 return local;
525
526 if (flags & RT6_LOOKUP_F_IFACE)
527 return net->ipv6.ip6_null_entry;
528 }
529out:
530 return rt;
531}
532
533#ifdef CONFIG_IPV6_ROUTER_PREF
534struct __rt6_probe_work {
535 struct work_struct work;
536 struct in6_addr target;
537 struct net_device *dev;
538};
539
540static void rt6_probe_deferred(struct work_struct *w)
541{
542 struct in6_addr mcaddr;
543 struct __rt6_probe_work *work =
544 container_of(w, struct __rt6_probe_work, work);
545
546 addrconf_addr_solict_mult(&work->target, &mcaddr);
547 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
548 dev_put(work->dev);
549 kfree(work);
550}
551
552static void rt6_probe(struct rt6_info *rt)
553{
554 struct __rt6_probe_work *work;
555 struct neighbour *neigh;
556 /*
557 * Okay, this does not seem to be appropriate
558 * for now, however, we need to check if it
559 * is really so; aka Router Reachability Probing.
560 *
561 * Router Reachability Probe MUST be rate-limited
562 * to no more than one per minute.
563 */
564 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
565 return;
566 rcu_read_lock_bh();
567 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
568 if (neigh) {
569 if (neigh->nud_state & NUD_VALID)
570 goto out;
571
572 work = NULL;
573 write_lock(&neigh->lock);
574 if (!(neigh->nud_state & NUD_VALID) &&
575 time_after(jiffies,
576 neigh->updated +
577 rt->rt6i_idev->cnf.rtr_probe_interval)) {
578 work = kmalloc(sizeof(*work), GFP_ATOMIC);
579 if (work)
580 __neigh_set_probe_once(neigh);
581 }
582 write_unlock(&neigh->lock);
583 } else {
584 work = kmalloc(sizeof(*work), GFP_ATOMIC);
585 }
586
587 if (work) {
588 INIT_WORK(&work->work, rt6_probe_deferred);
589 work->target = rt->rt6i_gateway;
590 dev_hold(rt->dst.dev);
591 work->dev = rt->dst.dev;
592 schedule_work(&work->work);
593 }
594
595out:
596 rcu_read_unlock_bh();
597}
598#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
602#endif
603
604/*
605 * Default Router Selection (RFC 2461 6.3.6)
606 */
607static inline int rt6_check_dev(struct rt6_info *rt, int oif)
608{
609 struct net_device *dev = rt->dst.dev;
610 if (!oif || dev->ifindex == oif)
611 return 2;
612 if ((dev->flags & IFF_LOOPBACK) &&
613 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
614 return 1;
615 return 0;
616}
617
618static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
619{
620 struct neighbour *neigh;
621 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
622
623 if (rt->rt6i_flags & RTF_NONEXTHOP ||
624 !(rt->rt6i_flags & RTF_GATEWAY))
625 return RT6_NUD_SUCCEED;
626
627 rcu_read_lock_bh();
628 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
629 if (neigh) {
630 read_lock(&neigh->lock);
631 if (neigh->nud_state & NUD_VALID)
632 ret = RT6_NUD_SUCCEED;
633#ifdef CONFIG_IPV6_ROUTER_PREF
634 else if (!(neigh->nud_state & NUD_FAILED))
635 ret = RT6_NUD_SUCCEED;
636 else
637 ret = RT6_NUD_FAIL_PROBE;
638#endif
639 read_unlock(&neigh->lock);
640 } else {
641 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
642 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
643 }
644 rcu_read_unlock_bh();
645
646 return ret;
647}
648
649static int rt6_score_route(struct rt6_info *rt, int oif,
650 int strict)
651{
652 int m;
653
654 m = rt6_check_dev(rt, oif);
655 if (!m && (strict & RT6_LOOKUP_F_IFACE))
656 return RT6_NUD_FAIL_HARD;
657#ifdef CONFIG_IPV6_ROUTER_PREF
658 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
659#endif
660 if (strict & RT6_LOOKUP_F_REACHABLE) {
661 int n = rt6_check_neigh(rt);
662 if (n < 0)
663 return n;
664 }
665 return m;
666}
667
668static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
669 int *mpri, struct rt6_info *match,
670 bool *do_rr)
671{
672 int m;
673 bool match_do_rr = false;
674 struct inet6_dev *idev = rt->rt6i_idev;
675 struct net_device *dev = rt->dst.dev;
676
677 if (dev && !netif_carrier_ok(dev) &&
678 idev->cnf.ignore_routes_with_linkdown &&
679 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
680 goto out;
681
682 if (rt6_check_expired(rt))
683 goto out;
684
685 m = rt6_score_route(rt, oif, strict);
686 if (m == RT6_NUD_FAIL_DO_RR) {
687 match_do_rr = true;
688 m = 0; /* lowest valid score */
689 } else if (m == RT6_NUD_FAIL_HARD) {
690 goto out;
691 }
692
693 if (strict & RT6_LOOKUP_F_REACHABLE)
694 rt6_probe(rt);
695
696 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
697 if (m > *mpri) {
698 *do_rr = match_do_rr;
699 *mpri = m;
700 match = rt;
701 }
702out:
703 return match;
704}
705
706static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
707 struct rt6_info *rr_head,
708 u32 metric, int oif, int strict,
709 bool *do_rr)
710{
711 struct rt6_info *rt, *match, *cont;
712 int mpri = -1;
713
714 match = NULL;
715 cont = NULL;
716 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
717 if (rt->rt6i_metric != metric) {
718 cont = rt;
719 break;
720 }
721
722 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723 }
724
725 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
726 if (rt->rt6i_metric != metric) {
727 cont = rt;
728 break;
729 }
730
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
732 }
733
734 if (match || !cont)
735 return match;
736
737 for (rt = cont; rt; rt = rt->dst.rt6_next)
738 match = find_match(rt, oif, strict, &mpri, match, do_rr);
739
740 return match;
741}
742
743static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
744{
745 struct rt6_info *match, *rt0;
746 struct net *net;
747 bool do_rr = false;
748
749 rt0 = fn->rr_ptr;
750 if (!rt0)
751 fn->rr_ptr = rt0 = fn->leaf;
752
753 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
754 &do_rr);
755
756 if (do_rr) {
757 struct rt6_info *next = rt0->dst.rt6_next;
758
759 /* no entries matched; do round-robin */
760 if (!next || next->rt6i_metric != rt0->rt6i_metric)
761 next = fn->leaf;
762
763 if (next != rt0)
764 fn->rr_ptr = next;
765 }
766
767 net = dev_net(rt0->dst.dev);
768 return match ? match : net->ipv6.ip6_null_entry;
769}
770
771static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
772{
773 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
774}
775
776#ifdef CONFIG_IPV6_ROUTE_INFO
777int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
778 const struct in6_addr *gwaddr)
779{
780 struct net *net = dev_net(dev);
781 struct route_info *rinfo = (struct route_info *) opt;
782 struct in6_addr prefix_buf, *prefix;
783 unsigned int pref;
784 unsigned long lifetime;
785 struct rt6_info *rt;
786
787 if (len < sizeof(struct route_info)) {
788 return -EINVAL;
789 }
790
791 /* Sanity check for prefix_len and length */
792 if (rinfo->length > 3) {
793 return -EINVAL;
794 } else if (rinfo->prefix_len > 128) {
795 return -EINVAL;
796 } else if (rinfo->prefix_len > 64) {
797 if (rinfo->length < 2) {
798 return -EINVAL;
799 }
800 } else if (rinfo->prefix_len > 0) {
801 if (rinfo->length < 1) {
802 return -EINVAL;
803 }
804 }
805
806 pref = rinfo->route_pref;
807 if (pref == ICMPV6_ROUTER_PREF_INVALID)
808 return -EINVAL;
809
810 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
811
812 if (rinfo->length == 3)
813 prefix = (struct in6_addr *)rinfo->prefix;
814 else {
815 /* this function is safe */
816 ipv6_addr_prefix(&prefix_buf,
817 (struct in6_addr *)rinfo->prefix,
818 rinfo->prefix_len);
819 prefix = &prefix_buf;
820 }
821
822 if (rinfo->prefix_len == 0)
823 rt = rt6_get_dflt_router(gwaddr, dev);
824 else
825 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
826 gwaddr, dev);
827
828 if (rt && !lifetime) {
829 ip6_del_rt(rt);
830 rt = NULL;
831 }
832
833 if (!rt && lifetime)
834 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
835 dev, pref);
836 else if (rt)
837 rt->rt6i_flags = RTF_ROUTEINFO |
838 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
839
840 if (rt) {
841 if (!addrconf_finite_timeout(lifetime))
842 rt6_clean_expires(rt);
843 else
844 rt6_set_expires(rt, jiffies + HZ * lifetime);
845
846 ip6_rt_put(rt);
847 }
848 return 0;
849}
850#endif
851
852static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
853 struct in6_addr *saddr)
854{
855 struct fib6_node *pn;
856 while (1) {
857 if (fn->fn_flags & RTN_TL_ROOT)
858 return NULL;
859 pn = fn->parent;
860 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
861 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
862 else
863 fn = pn;
864 if (fn->fn_flags & RTN_RTINFO)
865 return fn;
866 }
867}
868
869static struct rt6_info *ip6_pol_route_lookup(struct net *net,
870 struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
872{
873 struct fib6_node *fn;
874 struct rt6_info *rt;
875
876 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
877 flags &= ~RT6_LOOKUP_F_IFACE;
878
879 read_lock_bh(&table->tb6_lock);
880 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
881restart:
882 rt = fn->leaf;
883 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
884 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
885 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
886 if (rt == net->ipv6.ip6_null_entry) {
887 fn = fib6_backtrack(fn, &fl6->saddr);
888 if (fn)
889 goto restart;
890 }
891 dst_use(&rt->dst, jiffies);
892 read_unlock_bh(&table->tb6_lock);
893
894 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
895
896 return rt;
897
898}
899
900struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
901 int flags)
902{
903 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
904}
905EXPORT_SYMBOL_GPL(ip6_route_lookup);
906
907struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
908 const struct in6_addr *saddr, int oif, int strict)
909{
910 struct flowi6 fl6 = {
911 .flowi6_oif = oif,
912 .daddr = *daddr,
913 };
914 struct dst_entry *dst;
915 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
916
917 if (saddr) {
918 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
919 flags |= RT6_LOOKUP_F_HAS_SADDR;
920 }
921
922 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
923 if (dst->error == 0)
924 return (struct rt6_info *) dst;
925
926 dst_release(dst);
927
928 return NULL;
929}
930EXPORT_SYMBOL(rt6_lookup);
931
932/* ip6_ins_rt is called with FREE table->tb6_lock.
933 * It takes new route entry, the addition fails by any reason the
934 * route is released.
935 * Caller must hold dst before calling it.
936 */
937
938static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
939 struct mx6_config *mxc,
940 struct netlink_ext_ack *extack)
941{
942 int err;
943 struct fib6_table *table;
944
945 table = rt->rt6i_table;
946 write_lock_bh(&table->tb6_lock);
947 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
948 write_unlock_bh(&table->tb6_lock);
949
950 return err;
951}
952
953int ip6_ins_rt(struct rt6_info *rt)
954{
955 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
956 struct mx6_config mxc = { .mx = NULL, };
957
958 /* Hold dst to account for the reference from the fib6 tree */
959 dst_hold(&rt->dst);
960 return __ip6_ins_rt(rt, &info, &mxc, NULL);
961}
962
963/* called with rcu_lock held */
964static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
965{
966 struct net_device *dev = rt->dst.dev;
967
968 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
969 /* for copies of local routes, dst->dev needs to be the
970 * device if it is a master device, the master device if
971 * device is enslaved, and the loopback as the default
972 */
973 if (netif_is_l3_slave(dev) &&
974 !rt6_need_strict(&rt->rt6i_dst.addr))
975 dev = l3mdev_master_dev_rcu(dev);
976 else if (!netif_is_l3_master(dev))
977 dev = dev_net(dev)->loopback_dev;
978 /* last case is netif_is_l3_master(dev) is true in which
979 * case we want dev returned to be dev
980 */
981 }
982
983 return dev;
984}
985
986static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
987 const struct in6_addr *daddr,
988 const struct in6_addr *saddr)
989{
990 struct net_device *dev;
991 struct rt6_info *rt;
992
993 /*
994 * Clone the route.
995 */
996
997 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
998 ort = (struct rt6_info *)ort->dst.from;
999
1000 rcu_read_lock();
1001 dev = ip6_rt_get_dev_rcu(ort);
1002 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1003 rcu_read_unlock();
1004 if (!rt)
1005 return NULL;
1006
1007 ip6_rt_copy_init(rt, ort);
1008 rt->rt6i_flags |= RTF_CACHE;
1009 rt->rt6i_metric = 0;
1010 rt->dst.flags |= DST_HOST;
1011 rt->rt6i_dst.addr = *daddr;
1012 rt->rt6i_dst.plen = 128;
1013
1014 if (!rt6_is_gw_or_nonexthop(ort)) {
1015 if (ort->rt6i_dst.plen != 128 &&
1016 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1017 rt->rt6i_flags |= RTF_ANYCAST;
1018#ifdef CONFIG_IPV6_SUBTREES
1019 if (rt->rt6i_src.plen && saddr) {
1020 rt->rt6i_src.addr = *saddr;
1021 rt->rt6i_src.plen = 128;
1022 }
1023#endif
1024 }
1025
1026 return rt;
1027}
1028
1029static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1030{
1031 struct net_device *dev;
1032 struct rt6_info *pcpu_rt;
1033
1034 rcu_read_lock();
1035 dev = ip6_rt_get_dev_rcu(rt);
1036 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1037 rcu_read_unlock();
1038 if (!pcpu_rt)
1039 return NULL;
1040 ip6_rt_copy_init(pcpu_rt, rt);
1041 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1042 pcpu_rt->rt6i_flags |= RTF_PCPU;
1043 return pcpu_rt;
1044}
1045
1046/* It should be called with read_lock_bh(&tb6_lock) acquired */
1047static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1048{
1049 struct rt6_info *pcpu_rt, **p;
1050
1051 p = this_cpu_ptr(rt->rt6i_pcpu);
1052 pcpu_rt = *p;
1053
1054 if (pcpu_rt) {
1055 dst_hold(&pcpu_rt->dst);
1056 rt6_dst_from_metrics_check(pcpu_rt);
1057 }
1058 return pcpu_rt;
1059}
1060
1061static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1062{
1063 struct fib6_table *table = rt->rt6i_table;
1064 struct rt6_info *pcpu_rt, *prev, **p;
1065
1066 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1067 if (!pcpu_rt) {
1068 struct net *net = dev_net(rt->dst.dev);
1069
1070 dst_hold(&net->ipv6.ip6_null_entry->dst);
1071 return net->ipv6.ip6_null_entry;
1072 }
1073
1074 read_lock_bh(&table->tb6_lock);
1075 if (rt->rt6i_pcpu) {
1076 p = this_cpu_ptr(rt->rt6i_pcpu);
1077 prev = cmpxchg(p, NULL, pcpu_rt);
1078 if (prev) {
1079 /* If someone did it before us, return prev instead */
1080 dst_release_immediate(&pcpu_rt->dst);
1081 pcpu_rt = prev;
1082 }
1083 } else {
1084 /* rt has been removed from the fib6 tree
1085 * before we have a chance to acquire the read_lock.
1086 * In this case, don't brother to create a pcpu rt
1087 * since rt is going away anyway. The next
1088 * dst_check() will trigger a re-lookup.
1089 */
1090 dst_release_immediate(&pcpu_rt->dst);
1091 pcpu_rt = rt;
1092 }
1093 dst_hold(&pcpu_rt->dst);
1094 rt6_dst_from_metrics_check(pcpu_rt);
1095 read_unlock_bh(&table->tb6_lock);
1096 return pcpu_rt;
1097}
1098
1099struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1100 int oif, struct flowi6 *fl6, int flags)
1101{
1102 struct fib6_node *fn, *saved_fn;
1103 struct rt6_info *rt;
1104 int strict = 0;
1105
1106 strict |= flags & RT6_LOOKUP_F_IFACE;
1107 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1108 if (net->ipv6.devconf_all->forwarding == 0)
1109 strict |= RT6_LOOKUP_F_REACHABLE;
1110
1111 read_lock_bh(&table->tb6_lock);
1112
1113 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1114 saved_fn = fn;
1115
1116 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1117 oif = 0;
1118
1119redo_rt6_select:
1120 rt = rt6_select(fn, oif, strict);
1121 if (rt->rt6i_nsiblings)
1122 rt = rt6_multipath_select(rt, fl6, oif, strict);
1123 if (rt == net->ipv6.ip6_null_entry) {
1124 fn = fib6_backtrack(fn, &fl6->saddr);
1125 if (fn)
1126 goto redo_rt6_select;
1127 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1128 /* also consider unreachable route */
1129 strict &= ~RT6_LOOKUP_F_REACHABLE;
1130 fn = saved_fn;
1131 goto redo_rt6_select;
1132 }
1133 }
1134
1135
1136 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1137 dst_use(&rt->dst, jiffies);
1138 read_unlock_bh(&table->tb6_lock);
1139
1140 rt6_dst_from_metrics_check(rt);
1141
1142 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1143 return rt;
1144 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1145 !(rt->rt6i_flags & RTF_GATEWAY))) {
1146 /* Create a RTF_CACHE clone which will not be
1147 * owned by the fib6 tree. It is for the special case where
1148 * the daddr in the skb during the neighbor look-up is different
1149 * from the fl6->daddr used to look-up route here.
1150 */
1151
1152 struct rt6_info *uncached_rt;
1153
1154 dst_use(&rt->dst, jiffies);
1155 read_unlock_bh(&table->tb6_lock);
1156
1157 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1158 dst_release(&rt->dst);
1159
1160 if (uncached_rt) {
1161 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1162 * No need for another dst_hold()
1163 */
1164 rt6_uncached_list_add(uncached_rt);
1165 } else {
1166 uncached_rt = net->ipv6.ip6_null_entry;
1167 dst_hold(&uncached_rt->dst);
1168 }
1169
1170 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1171 return uncached_rt;
1172
1173 } else {
1174 /* Get a percpu copy */
1175
1176 struct rt6_info *pcpu_rt;
1177
1178 rt->dst.lastuse = jiffies;
1179 rt->dst.__use++;
1180 pcpu_rt = rt6_get_pcpu_route(rt);
1181
1182 if (pcpu_rt) {
1183 read_unlock_bh(&table->tb6_lock);
1184 } else {
1185 /* We have to do the read_unlock first
1186 * because rt6_make_pcpu_route() may trigger
1187 * ip6_dst_gc() which will take the write_lock.
1188 */
1189 dst_hold(&rt->dst);
1190 read_unlock_bh(&table->tb6_lock);
1191 pcpu_rt = rt6_make_pcpu_route(rt);
1192 dst_release(&rt->dst);
1193 }
1194
1195 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1196 return pcpu_rt;
1197
1198 }
1199}
1200EXPORT_SYMBOL_GPL(ip6_pol_route);
1201
1202static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1203 struct flowi6 *fl6, int flags)
1204{
1205 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1206}
1207
1208struct dst_entry *ip6_route_input_lookup(struct net *net,
1209 struct net_device *dev,
1210 struct flowi6 *fl6, int flags)
1211{
1212 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1213 flags |= RT6_LOOKUP_F_IFACE;
1214
1215 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1216}
1217EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1218
1219static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1220 struct flow_keys *keys)
1221{
1222 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1223 const struct ipv6hdr *key_iph = outer_iph;
1224 const struct ipv6hdr *inner_iph;
1225 const struct icmp6hdr *icmph;
1226 struct ipv6hdr _inner_iph;
1227 struct icmp6hdr _icmph;
1228
1229 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1230 goto out;
1231
1232 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1233 sizeof(_icmph), &_icmph);
1234 if (!icmph)
1235 goto out;
1236
1237 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1238 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1239 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1240 icmph->icmp6_type != ICMPV6_PARAMPROB)
1241 goto out;
1242
1243 inner_iph = skb_header_pointer(skb,
1244 skb_transport_offset(skb) + sizeof(*icmph),
1245 sizeof(_inner_iph), &_inner_iph);
1246 if (!inner_iph)
1247 goto out;
1248
1249 key_iph = inner_iph;
1250out:
1251 memset(keys, 0, sizeof(*keys));
1252 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1253 keys->addrs.v6addrs.src = key_iph->saddr;
1254 keys->addrs.v6addrs.dst = key_iph->daddr;
1255 keys->tags.flow_label = ip6_flowlabel(key_iph);
1256 keys->basic.ip_proto = key_iph->nexthdr;
1257}
1258
1259/* if skb is set it will be used and fl6 can be NULL */
1260u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1261{
1262 struct flow_keys hash_keys;
1263
1264 if (skb) {
1265 ip6_multipath_l3_keys(skb, &hash_keys);
1266 return flow_hash_from_keys(&hash_keys);
1267 }
1268
1269 return get_hash_from_flowi6(fl6);
1270}
1271
1272void ip6_route_input(struct sk_buff *skb)
1273{
1274 const struct ipv6hdr *iph = ipv6_hdr(skb);
1275 struct net *net = dev_net(skb->dev);
1276 int flags = RT6_LOOKUP_F_HAS_SADDR;
1277 struct ip_tunnel_info *tun_info;
1278 struct flowi6 fl6 = {
1279 .flowi6_iif = skb->dev->ifindex,
1280 .daddr = iph->daddr,
1281 .saddr = iph->saddr,
1282 .flowlabel = ip6_flowinfo(iph),
1283 .flowi6_mark = skb->mark,
1284 .flowi6_proto = iph->nexthdr,
1285 };
1286
1287 tun_info = skb_tunnel_info(skb);
1288 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1289 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1290 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1291 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1292 skb_dst_drop(skb);
1293 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1294}
1295
1296static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1297 struct flowi6 *fl6, int flags)
1298{
1299 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1300}
1301
1302struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1303 struct flowi6 *fl6, int flags)
1304{
1305 bool any_src;
1306
1307 if (rt6_need_strict(&fl6->daddr)) {
1308 struct dst_entry *dst;
1309
1310 dst = l3mdev_link_scope_lookup(net, fl6);
1311 if (dst)
1312 return dst;
1313 }
1314
1315 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1316
1317 any_src = ipv6_addr_any(&fl6->saddr);
1318 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1319 (fl6->flowi6_oif && any_src))
1320 flags |= RT6_LOOKUP_F_IFACE;
1321
1322 if (!any_src)
1323 flags |= RT6_LOOKUP_F_HAS_SADDR;
1324 else if (sk)
1325 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1326
1327 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1328}
1329EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1330
1331struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1332{
1333 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1334 struct net_device *loopback_dev = net->loopback_dev;
1335 struct dst_entry *new = NULL;
1336
1337 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1338 DST_OBSOLETE_DEAD, 0);
1339 if (rt) {
1340 rt6_info_init(rt);
1341
1342 new = &rt->dst;
1343 new->__use = 1;
1344 new->input = dst_discard;
1345 new->output = dst_discard_out;
1346
1347 dst_copy_metrics(new, &ort->dst);
1348
1349 rt->rt6i_idev = in6_dev_get(loopback_dev);
1350 rt->rt6i_gateway = ort->rt6i_gateway;
1351 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1352 rt->rt6i_metric = 0;
1353
1354 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1355#ifdef CONFIG_IPV6_SUBTREES
1356 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1357#endif
1358 }
1359
1360 dst_release(dst_orig);
1361 return new ? new : ERR_PTR(-ENOMEM);
1362}
1363
1364/*
1365 * Destination cache support functions
1366 */
1367
1368static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1369{
1370 if (rt->dst.from &&
1371 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1372 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1373}
1374
1375static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1376{
1377 u32 rt_cookie = 0;
1378
1379 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1380 return NULL;
1381
1382 if (rt6_check_expired(rt))
1383 return NULL;
1384
1385 return &rt->dst;
1386}
1387
1388static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1389{
1390 if (!__rt6_check_expired(rt) &&
1391 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1392 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1393 return &rt->dst;
1394 else
1395 return NULL;
1396}
1397
1398static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1399{
1400 struct rt6_info *rt;
1401
1402 rt = (struct rt6_info *) dst;
1403
1404 /* All IPV6 dsts are created with ->obsolete set to the value
1405 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1406 * into this function always.
1407 */
1408
1409 rt6_dst_from_metrics_check(rt);
1410
1411 if (rt->rt6i_flags & RTF_PCPU ||
1412 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1413 return rt6_dst_from_check(rt, cookie);
1414 else
1415 return rt6_check(rt, cookie);
1416}
1417
1418static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1419{
1420 struct rt6_info *rt = (struct rt6_info *) dst;
1421
1422 if (rt) {
1423 if (rt->rt6i_flags & RTF_CACHE) {
1424 if (rt6_check_expired(rt)) {
1425 ip6_del_rt(rt);
1426 dst = NULL;
1427 }
1428 } else {
1429 dst_release(dst);
1430 dst = NULL;
1431 }
1432 }
1433 return dst;
1434}
1435
1436static void ip6_link_failure(struct sk_buff *skb)
1437{
1438 struct rt6_info *rt;
1439
1440 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1441
1442 rt = (struct rt6_info *) skb_dst(skb);
1443 if (rt) {
1444 if (rt->rt6i_flags & RTF_CACHE) {
1445 if (dst_hold_safe(&rt->dst))
1446 ip6_del_rt(rt);
1447 } else {
1448 struct fib6_node *fn;
1449
1450 rcu_read_lock();
1451 fn = rcu_dereference(rt->rt6i_node);
1452 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1453 fn->fn_sernum = -1;
1454 rcu_read_unlock();
1455 }
1456 }
1457}
1458
1459static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1460{
1461 struct net *net = dev_net(rt->dst.dev);
1462
1463 rt->rt6i_flags |= RTF_MODIFIED;
1464 rt->rt6i_pmtu = mtu;
1465 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1466}
1467
1468static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1469{
1470 return !(rt->rt6i_flags & RTF_CACHE) &&
1471 (rt->rt6i_flags & RTF_PCPU ||
1472 rcu_access_pointer(rt->rt6i_node));
1473}
1474
1475static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1476 const struct ipv6hdr *iph, u32 mtu,
1477 bool confirm_neigh)
1478{
1479 const struct in6_addr *daddr, *saddr;
1480 struct rt6_info *rt6 = (struct rt6_info *)dst;
1481
1482 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
1483 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
1484 * [see also comment in rt6_mtu_change_route()]
1485 */
1486
1487 if (iph) {
1488 daddr = &iph->daddr;
1489 saddr = &iph->saddr;
1490 } else if (sk) {
1491 daddr = &sk->sk_v6_daddr;
1492 saddr = &inet6_sk(sk)->saddr;
1493 } else {
1494 daddr = NULL;
1495 saddr = NULL;
1496 }
1497
1498 if (confirm_neigh)
1499 dst_confirm_neigh(dst, daddr);
1500
1501 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1502 if (mtu >= dst_mtu(dst))
1503 return;
1504
1505 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1506 rt6_do_update_pmtu(rt6, mtu);
1507 } else if (daddr) {
1508 struct rt6_info *nrt6;
1509
1510 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1511 if (nrt6) {
1512 rt6_do_update_pmtu(nrt6, mtu);
1513
1514 /* ip6_ins_rt(nrt6) will bump the
1515 * rt6->rt6i_node->fn_sernum
1516 * which will fail the next rt6_check() and
1517 * invalidate the sk->sk_dst_cache.
1518 */
1519 ip6_ins_rt(nrt6);
1520 /* Release the reference taken in
1521 * ip6_rt_cache_alloc()
1522 */
1523 dst_release(&nrt6->dst);
1524 }
1525 }
1526}
1527
1528static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1529 struct sk_buff *skb, u32 mtu,
1530 bool confirm_neigh)
1531{
1532 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
1533 confirm_neigh);
1534}
1535
1536void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1537 int oif, u32 mark, kuid_t uid)
1538{
1539 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1540 struct dst_entry *dst;
1541 struct flowi6 fl6;
1542
1543 memset(&fl6, 0, sizeof(fl6));
1544 fl6.flowi6_oif = oif;
1545 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1546 fl6.daddr = iph->daddr;
1547 fl6.saddr = iph->saddr;
1548 fl6.flowlabel = ip6_flowinfo(iph);
1549 fl6.flowi6_uid = uid;
1550
1551 dst = ip6_route_output(net, NULL, &fl6);
1552 if (!dst->error)
1553 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
1554 dst_release(dst);
1555}
1556EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1557
1558void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1559{
1560 int oif = sk->sk_bound_dev_if;
1561 struct dst_entry *dst;
1562
1563 if (!oif && skb->dev)
1564 oif = l3mdev_master_ifindex(skb->dev);
1565
1566 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
1567
1568 dst = __sk_dst_get(sk);
1569 if (!dst || !dst->obsolete ||
1570 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1571 return;
1572
1573 bh_lock_sock(sk);
1574 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1575 ip6_datagram_dst_update(sk, false);
1576 bh_unlock_sock(sk);
1577}
1578EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1579
1580/* Handle redirects */
1581struct ip6rd_flowi {
1582 struct flowi6 fl6;
1583 struct in6_addr gateway;
1584};
1585
1586static struct rt6_info *__ip6_route_redirect(struct net *net,
1587 struct fib6_table *table,
1588 struct flowi6 *fl6,
1589 int flags)
1590{
1591 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1592 struct rt6_info *rt;
1593 struct fib6_node *fn;
1594
1595 /* Get the "current" route for this destination and
1596 * check if the redirect has come from appropriate router.
1597 *
1598 * RFC 4861 specifies that redirects should only be
1599 * accepted if they come from the nexthop to the target.
1600 * Due to the way the routes are chosen, this notion
1601 * is a bit fuzzy and one might need to check all possible
1602 * routes.
1603 */
1604
1605 read_lock_bh(&table->tb6_lock);
1606 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1607restart:
1608 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1609 if (rt6_check_expired(rt))
1610 continue;
1611 if (rt->dst.error)
1612 break;
1613 if (!(rt->rt6i_flags & RTF_GATEWAY))
1614 continue;
1615 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1616 continue;
1617 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1618 continue;
1619 break;
1620 }
1621
1622 if (!rt)
1623 rt = net->ipv6.ip6_null_entry;
1624 else if (rt->dst.error) {
1625 rt = net->ipv6.ip6_null_entry;
1626 goto out;
1627 }
1628
1629 if (rt == net->ipv6.ip6_null_entry) {
1630 fn = fib6_backtrack(fn, &fl6->saddr);
1631 if (fn)
1632 goto restart;
1633 }
1634
1635out:
1636 dst_hold(&rt->dst);
1637
1638 read_unlock_bh(&table->tb6_lock);
1639
1640 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1641 return rt;
1642};
1643
1644static struct dst_entry *ip6_route_redirect(struct net *net,
1645 const struct flowi6 *fl6,
1646 const struct in6_addr *gateway)
1647{
1648 int flags = RT6_LOOKUP_F_HAS_SADDR;
1649 struct ip6rd_flowi rdfl;
1650
1651 rdfl.fl6 = *fl6;
1652 rdfl.gateway = *gateway;
1653
1654 return fib6_rule_lookup(net, &rdfl.fl6,
1655 flags, __ip6_route_redirect);
1656}
1657
1658void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1659 kuid_t uid)
1660{
1661 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1662 struct dst_entry *dst;
1663 struct flowi6 fl6;
1664
1665 memset(&fl6, 0, sizeof(fl6));
1666 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1667 fl6.flowi6_oif = oif;
1668 fl6.flowi6_mark = mark;
1669 fl6.daddr = iph->daddr;
1670 fl6.saddr = iph->saddr;
1671 fl6.flowlabel = ip6_flowinfo(iph);
1672 fl6.flowi6_uid = uid;
1673
1674 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1675 rt6_do_redirect(dst, NULL, skb);
1676 dst_release(dst);
1677}
1678EXPORT_SYMBOL_GPL(ip6_redirect);
1679
1680void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1681 u32 mark)
1682{
1683 const struct ipv6hdr *iph = ipv6_hdr(skb);
1684 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1685 struct dst_entry *dst;
1686 struct flowi6 fl6;
1687
1688 memset(&fl6, 0, sizeof(fl6));
1689 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1690 fl6.flowi6_oif = oif;
1691 fl6.flowi6_mark = mark;
1692 fl6.daddr = msg->dest;
1693 fl6.saddr = iph->daddr;
1694 fl6.flowi6_uid = sock_net_uid(net, NULL);
1695
1696 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1697 rt6_do_redirect(dst, NULL, skb);
1698 dst_release(dst);
1699}
1700
1701void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1702{
1703 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1704 sk->sk_uid);
1705}
1706EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1707
1708static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1709{
1710 struct net_device *dev = dst->dev;
1711 unsigned int mtu = dst_mtu(dst);
1712 struct net *net = dev_net(dev);
1713
1714 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1715
1716 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1717 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1718
1719 /*
1720 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1721 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1722 * IPV6_MAXPLEN is also valid and means: "any MSS,
1723 * rely only on pmtu discovery"
1724 */
1725 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1726 mtu = IPV6_MAXPLEN;
1727 return mtu;
1728}
1729
1730static unsigned int ip6_mtu(const struct dst_entry *dst)
1731{
1732 const struct rt6_info *rt = (const struct rt6_info *)dst;
1733 unsigned int mtu = rt->rt6i_pmtu;
1734 struct inet6_dev *idev;
1735
1736 if (mtu)
1737 goto out;
1738
1739 mtu = dst_metric_raw(dst, RTAX_MTU);
1740 if (mtu)
1741 goto out;
1742
1743 mtu = IPV6_MIN_MTU;
1744
1745 rcu_read_lock();
1746 idev = __in6_dev_get(dst->dev);
1747 if (idev)
1748 mtu = idev->cnf.mtu6;
1749 rcu_read_unlock();
1750
1751out:
1752 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1753
1754 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1755}
1756
1757struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1758 struct flowi6 *fl6)
1759{
1760 struct dst_entry *dst;
1761 struct rt6_info *rt;
1762 struct inet6_dev *idev = in6_dev_get(dev);
1763 struct net *net = dev_net(dev);
1764
1765 if (unlikely(!idev))
1766 return ERR_PTR(-ENODEV);
1767
1768 rt = ip6_dst_alloc(net, dev, 0);
1769 if (unlikely(!rt)) {
1770 in6_dev_put(idev);
1771 dst = ERR_PTR(-ENOMEM);
1772 goto out;
1773 }
1774
1775 rt->dst.flags |= DST_HOST;
1776 rt->dst.input = ip6_input;
1777 rt->dst.output = ip6_output;
1778 rt->rt6i_gateway = fl6->daddr;
1779 rt->rt6i_dst.addr = fl6->daddr;
1780 rt->rt6i_dst.plen = 128;
1781 rt->rt6i_idev = idev;
1782 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1783
1784 /* Add this dst into uncached_list so that rt6_ifdown() can
1785 * do proper release of the net_device
1786 */
1787 rt6_uncached_list_add(rt);
1788
1789 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1790
1791out:
1792 return dst;
1793}
1794
1795static int ip6_dst_gc(struct dst_ops *ops)
1796{
1797 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1798 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1799 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1800 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1801 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1802 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1803 int entries;
1804
1805 entries = dst_entries_get_fast(ops);
1806 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1807 entries <= rt_max_size)
1808 goto out;
1809
1810 net->ipv6.ip6_rt_gc_expire++;
1811 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1812 entries = dst_entries_get_slow(ops);
1813 if (entries < ops->gc_thresh)
1814 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1815out:
1816 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1817 return entries > rt_max_size;
1818}
1819
1820static int ip6_convert_metrics(struct mx6_config *mxc,
1821 const struct fib6_config *cfg)
1822{
1823 bool ecn_ca = false;
1824 struct nlattr *nla;
1825 int remaining;
1826 u32 *mp;
1827
1828 if (!cfg->fc_mx)
1829 return 0;
1830
1831 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1832 if (unlikely(!mp))
1833 return -ENOMEM;
1834
1835 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1836 int type = nla_type(nla);
1837 u32 val;
1838
1839 if (!type)
1840 continue;
1841 if (unlikely(type > RTAX_MAX))
1842 goto err;
1843
1844 if (type == RTAX_CC_ALGO) {
1845 char tmp[TCP_CA_NAME_MAX];
1846
1847 nla_strlcpy(tmp, nla, sizeof(tmp));
1848 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1849 if (val == TCP_CA_UNSPEC)
1850 goto err;
1851 } else {
1852 val = nla_get_u32(nla);
1853 }
1854 if (type == RTAX_HOPLIMIT && val > 255)
1855 val = 255;
1856 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1857 goto err;
1858
1859 mp[type - 1] = val;
1860 __set_bit(type - 1, mxc->mx_valid);
1861 }
1862
1863 if (ecn_ca) {
1864 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1865 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1866 }
1867
1868 mxc->mx = mp;
1869 return 0;
1870 err:
1871 kfree(mp);
1872 return -EINVAL;
1873}
1874
1875static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1876 struct fib6_config *cfg,
1877 const struct in6_addr *gw_addr)
1878{
1879 struct flowi6 fl6 = {
1880 .flowi6_oif = cfg->fc_ifindex,
1881 .daddr = *gw_addr,
1882 .saddr = cfg->fc_prefsrc,
1883 };
1884 struct fib6_table *table;
1885 struct rt6_info *rt;
1886 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1887
1888 table = fib6_get_table(net, cfg->fc_table);
1889 if (!table)
1890 return NULL;
1891
1892 if (!ipv6_addr_any(&cfg->fc_prefsrc))
1893 flags |= RT6_LOOKUP_F_HAS_SADDR;
1894
1895 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1896
1897 /* if table lookup failed, fall back to full lookup */
1898 if (rt == net->ipv6.ip6_null_entry) {
1899 ip6_rt_put(rt);
1900 rt = NULL;
1901 }
1902
1903 return rt;
1904}
1905
1906static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1907 struct netlink_ext_ack *extack)
1908{
1909 struct net *net = cfg->fc_nlinfo.nl_net;
1910 struct rt6_info *rt = NULL;
1911 struct net_device *dev = NULL;
1912 struct inet6_dev *idev = NULL;
1913 struct fib6_table *table;
1914 int addr_type;
1915 int err = -EINVAL;
1916
1917 /* RTF_PCPU is an internal flag; can not be set by userspace */
1918 if (cfg->fc_flags & RTF_PCPU) {
1919 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1920 goto out;
1921 }
1922
1923 if (cfg->fc_dst_len > 128) {
1924 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1925 goto out;
1926 }
1927 if (cfg->fc_src_len > 128) {
1928 NL_SET_ERR_MSG(extack, "Invalid source address length");
1929 goto out;
1930 }
1931#ifndef CONFIG_IPV6_SUBTREES
1932 if (cfg->fc_src_len) {
1933 NL_SET_ERR_MSG(extack,
1934 "Specifying source address requires IPV6_SUBTREES to be enabled");
1935 goto out;
1936 }
1937#endif
1938 if (cfg->fc_ifindex) {
1939 err = -ENODEV;
1940 dev = dev_get_by_index(net, cfg->fc_ifindex);
1941 if (!dev)
1942 goto out;
1943 idev = in6_dev_get(dev);
1944 if (!idev)
1945 goto out;
1946 }
1947
1948 if (cfg->fc_metric == 0)
1949 cfg->fc_metric = IP6_RT_PRIO_USER;
1950
1951 err = -ENOBUFS;
1952 if (cfg->fc_nlinfo.nlh &&
1953 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1954 table = fib6_get_table(net, cfg->fc_table);
1955 if (!table) {
1956 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1957 table = fib6_new_table(net, cfg->fc_table);
1958 }
1959 } else {
1960 table = fib6_new_table(net, cfg->fc_table);
1961 }
1962
1963 if (!table)
1964 goto out;
1965
1966 rt = ip6_dst_alloc(net, NULL,
1967 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1968
1969 if (!rt) {
1970 err = -ENOMEM;
1971 goto out;
1972 }
1973
1974 if (cfg->fc_flags & RTF_EXPIRES)
1975 rt6_set_expires(rt, jiffies +
1976 clock_t_to_jiffies(cfg->fc_expires));
1977 else
1978 rt6_clean_expires(rt);
1979
1980 if (cfg->fc_protocol == RTPROT_UNSPEC)
1981 cfg->fc_protocol = RTPROT_BOOT;
1982 rt->rt6i_protocol = cfg->fc_protocol;
1983
1984 addr_type = ipv6_addr_type(&cfg->fc_dst);
1985
1986 if (addr_type & IPV6_ADDR_MULTICAST)
1987 rt->dst.input = ip6_mc_input;
1988 else if (cfg->fc_flags & RTF_LOCAL)
1989 rt->dst.input = ip6_input;
1990 else
1991 rt->dst.input = ip6_forward;
1992
1993 rt->dst.output = ip6_output;
1994
1995 if (cfg->fc_encap) {
1996 struct lwtunnel_state *lwtstate;
1997
1998 err = lwtunnel_build_state(cfg->fc_encap_type,
1999 cfg->fc_encap, AF_INET6, cfg,
2000 &lwtstate, extack);
2001 if (err)
2002 goto out;
2003 rt->dst.lwtstate = lwtstate_get(lwtstate);
2004 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2005 rt->dst.lwtstate->orig_output = rt->dst.output;
2006 rt->dst.output = lwtunnel_output;
2007 }
2008 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2009 rt->dst.lwtstate->orig_input = rt->dst.input;
2010 rt->dst.input = lwtunnel_input;
2011 }
2012 }
2013
2014 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2015 rt->rt6i_dst.plen = cfg->fc_dst_len;
2016 if (rt->rt6i_dst.plen == 128)
2017 rt->dst.flags |= DST_HOST;
2018
2019#ifdef CONFIG_IPV6_SUBTREES
2020 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2021 rt->rt6i_src.plen = cfg->fc_src_len;
2022#endif
2023
2024 rt->rt6i_metric = cfg->fc_metric;
2025
2026 /* We cannot add true routes via loopback here,
2027 they would result in kernel looping; promote them to reject routes
2028 */
2029 if ((cfg->fc_flags & RTF_REJECT) ||
2030 (dev && (dev->flags & IFF_LOOPBACK) &&
2031 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2032 !(cfg->fc_flags & RTF_LOCAL))) {
2033 /* hold loopback dev/idev if we haven't done so. */
2034 if (dev != net->loopback_dev) {
2035 if (dev) {
2036 dev_put(dev);
2037 in6_dev_put(idev);
2038 }
2039 dev = net->loopback_dev;
2040 dev_hold(dev);
2041 idev = in6_dev_get(dev);
2042 if (!idev) {
2043 err = -ENODEV;
2044 goto out;
2045 }
2046 }
2047 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2048 switch (cfg->fc_type) {
2049 case RTN_BLACKHOLE:
2050 rt->dst.error = -EINVAL;
2051 rt->dst.output = dst_discard_out;
2052 rt->dst.input = dst_discard;
2053 break;
2054 case RTN_PROHIBIT:
2055 rt->dst.error = -EACCES;
2056 rt->dst.output = ip6_pkt_prohibit_out;
2057 rt->dst.input = ip6_pkt_prohibit;
2058 break;
2059 case RTN_THROW:
2060 case RTN_UNREACHABLE:
2061 default:
2062 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2063 : (cfg->fc_type == RTN_UNREACHABLE)
2064 ? -EHOSTUNREACH : -ENETUNREACH;
2065 rt->dst.output = ip6_pkt_discard_out;
2066 rt->dst.input = ip6_pkt_discard;
2067 break;
2068 }
2069 goto install_route;
2070 }
2071
2072 if (cfg->fc_flags & RTF_GATEWAY) {
2073 const struct in6_addr *gw_addr;
2074 int gwa_type;
2075
2076 gw_addr = &cfg->fc_gateway;
2077 gwa_type = ipv6_addr_type(gw_addr);
2078
2079 /* if gw_addr is local we will fail to detect this in case
2080 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2081 * will return already-added prefix route via interface that
2082 * prefix route was assigned to, which might be non-loopback.
2083 */
2084 err = -EINVAL;
2085 if (ipv6_chk_addr_and_flags(net, gw_addr,
2086 gwa_type & IPV6_ADDR_LINKLOCAL ?
2087 dev : NULL, 0, 0)) {
2088 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2089 goto out;
2090 }
2091 rt->rt6i_gateway = *gw_addr;
2092
2093 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2094 struct rt6_info *grt = NULL;
2095
2096 /* IPv6 strictly inhibits using not link-local
2097 addresses as nexthop address.
2098 Otherwise, router will not able to send redirects.
2099 It is very good, but in some (rare!) circumstances
2100 (SIT, PtP, NBMA NOARP links) it is handy to allow
2101 some exceptions. --ANK
2102 We allow IPv4-mapped nexthops to support RFC4798-type
2103 addressing
2104 */
2105 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2106 IPV6_ADDR_MAPPED))) {
2107 NL_SET_ERR_MSG(extack,
2108 "Invalid gateway address");
2109 goto out;
2110 }
2111
2112 if (cfg->fc_table) {
2113 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2114
2115 if (grt) {
2116 if (grt->rt6i_flags & RTF_GATEWAY ||
2117 (dev && dev != grt->dst.dev)) {
2118 ip6_rt_put(grt);
2119 grt = NULL;
2120 }
2121 }
2122 }
2123
2124 if (!grt)
2125 grt = rt6_lookup(net, gw_addr, NULL,
2126 cfg->fc_ifindex, 1);
2127
2128 err = -EHOSTUNREACH;
2129 if (!grt)
2130 goto out;
2131 if (dev) {
2132 if (dev != grt->dst.dev) {
2133 ip6_rt_put(grt);
2134 goto out;
2135 }
2136 } else {
2137 dev = grt->dst.dev;
2138 idev = grt->rt6i_idev;
2139 dev_hold(dev);
2140 in6_dev_hold(grt->rt6i_idev);
2141 }
2142 if (!(grt->rt6i_flags & RTF_GATEWAY))
2143 err = 0;
2144 ip6_rt_put(grt);
2145
2146 if (err)
2147 goto out;
2148 }
2149 err = -EINVAL;
2150 if (!dev) {
2151 NL_SET_ERR_MSG(extack, "Egress device not specified");
2152 goto out;
2153 } else if (dev->flags & IFF_LOOPBACK) {
2154 NL_SET_ERR_MSG(extack,
2155 "Egress device can not be loopback device for this route");
2156 goto out;
2157 }
2158 }
2159
2160 err = -ENODEV;
2161 if (!dev)
2162 goto out;
2163
2164 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2165 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2166 NL_SET_ERR_MSG(extack, "Invalid source address");
2167 err = -EINVAL;
2168 goto out;
2169 }
2170 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2171 rt->rt6i_prefsrc.plen = 128;
2172 } else
2173 rt->rt6i_prefsrc.plen = 0;
2174
2175 rt->rt6i_flags = cfg->fc_flags;
2176
2177install_route:
2178 rt->dst.dev = dev;
2179 rt->rt6i_idev = idev;
2180 rt->rt6i_table = table;
2181
2182 cfg->fc_nlinfo.nl_net = dev_net(dev);
2183
2184 return rt;
2185out:
2186 if (dev)
2187 dev_put(dev);
2188 if (idev)
2189 in6_dev_put(idev);
2190 if (rt)
2191 dst_release_immediate(&rt->dst);
2192
2193 return ERR_PTR(err);
2194}
2195
2196int ip6_route_add(struct fib6_config *cfg,
2197 struct netlink_ext_ack *extack)
2198{
2199 struct mx6_config mxc = { .mx = NULL, };
2200 struct rt6_info *rt;
2201 int err;
2202
2203 rt = ip6_route_info_create(cfg, extack);
2204 if (IS_ERR(rt)) {
2205 err = PTR_ERR(rt);
2206 rt = NULL;
2207 goto out;
2208 }
2209
2210 err = ip6_convert_metrics(&mxc, cfg);
2211 if (err)
2212 goto out;
2213
2214 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2215
2216 kfree(mxc.mx);
2217
2218 return err;
2219out:
2220 if (rt)
2221 dst_release_immediate(&rt->dst);
2222
2223 return err;
2224}
2225
2226static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2227{
2228 int err;
2229 struct fib6_table *table;
2230 struct net *net = dev_net(rt->dst.dev);
2231
2232 if (rt == net->ipv6.ip6_null_entry) {
2233 err = -ENOENT;
2234 goto out;
2235 }
2236
2237 table = rt->rt6i_table;
2238 write_lock_bh(&table->tb6_lock);
2239 err = fib6_del(rt, info);
2240 write_unlock_bh(&table->tb6_lock);
2241
2242out:
2243 ip6_rt_put(rt);
2244 return err;
2245}
2246
2247int ip6_del_rt(struct rt6_info *rt)
2248{
2249 struct nl_info info = {
2250 .nl_net = dev_net(rt->dst.dev),
2251 };
2252 return __ip6_del_rt(rt, &info);
2253}
2254
2255static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2256{
2257 struct nl_info *info = &cfg->fc_nlinfo;
2258 struct net *net = info->nl_net;
2259 struct sk_buff *skb = NULL;
2260 struct fib6_table *table;
2261 int err = -ENOENT;
2262
2263 if (rt == net->ipv6.ip6_null_entry)
2264 goto out_put;
2265 table = rt->rt6i_table;
2266 write_lock_bh(&table->tb6_lock);
2267
2268 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2269 struct rt6_info *sibling, *next_sibling;
2270
2271 /* prefer to send a single notification with all hops */
2272 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2273 if (skb) {
2274 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2275
2276 if (rt6_fill_node(net, skb, rt,
2277 NULL, NULL, 0, RTM_DELROUTE,
2278 info->portid, seq, 0) < 0) {
2279 kfree_skb(skb);
2280 skb = NULL;
2281 } else
2282 info->skip_notify = 1;
2283 }
2284
2285 list_for_each_entry_safe(sibling, next_sibling,
2286 &rt->rt6i_siblings,
2287 rt6i_siblings) {
2288 err = fib6_del(sibling, info);
2289 if (err)
2290 goto out_unlock;
2291 }
2292 }
2293
2294 err = fib6_del(rt, info);
2295out_unlock:
2296 write_unlock_bh(&table->tb6_lock);
2297out_put:
2298 ip6_rt_put(rt);
2299
2300 if (skb) {
2301 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2302 info->nlh, gfp_any());
2303 }
2304 return err;
2305}
2306
2307static int ip6_route_del(struct fib6_config *cfg,
2308 struct netlink_ext_ack *extack)
2309{
2310 struct fib6_table *table;
2311 struct fib6_node *fn;
2312 struct rt6_info *rt;
2313 int err = -ESRCH;
2314
2315 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2316 if (!table) {
2317 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2318 return err;
2319 }
2320
2321 read_lock_bh(&table->tb6_lock);
2322
2323 fn = fib6_locate(&table->tb6_root,
2324 &cfg->fc_dst, cfg->fc_dst_len,
2325 &cfg->fc_src, cfg->fc_src_len);
2326
2327 if (fn) {
2328 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2329 if ((rt->rt6i_flags & RTF_CACHE) &&
2330 !(cfg->fc_flags & RTF_CACHE))
2331 continue;
2332 if (cfg->fc_ifindex &&
2333 (!rt->dst.dev ||
2334 rt->dst.dev->ifindex != cfg->fc_ifindex))
2335 continue;
2336 if (cfg->fc_flags & RTF_GATEWAY &&
2337 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2338 continue;
2339 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2340 continue;
2341 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2342 continue;
2343 dst_hold(&rt->dst);
2344 read_unlock_bh(&table->tb6_lock);
2345
2346 /* if gateway was specified only delete the one hop */
2347 if (cfg->fc_flags & RTF_GATEWAY)
2348 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2349
2350 return __ip6_del_rt_siblings(rt, cfg);
2351 }
2352 }
2353 read_unlock_bh(&table->tb6_lock);
2354
2355 return err;
2356}
2357
2358static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2359{
2360 struct netevent_redirect netevent;
2361 struct rt6_info *rt, *nrt = NULL;
2362 struct ndisc_options ndopts;
2363 struct inet6_dev *in6_dev;
2364 struct neighbour *neigh;
2365 struct rd_msg *msg;
2366 int optlen, on_link;
2367 u8 *lladdr;
2368
2369 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2370 optlen -= sizeof(*msg);
2371
2372 if (optlen < 0) {
2373 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2374 return;
2375 }
2376
2377 msg = (struct rd_msg *)icmp6_hdr(skb);
2378
2379 if (ipv6_addr_is_multicast(&msg->dest)) {
2380 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2381 return;
2382 }
2383
2384 on_link = 0;
2385 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2386 on_link = 1;
2387 } else if (ipv6_addr_type(&msg->target) !=
2388 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2389 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2390 return;
2391 }
2392
2393 in6_dev = __in6_dev_get(skb->dev);
2394 if (!in6_dev)
2395 return;
2396 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2397 return;
2398
2399 /* RFC2461 8.1:
2400 * The IP source address of the Redirect MUST be the same as the current
2401 * first-hop router for the specified ICMP Destination Address.
2402 */
2403
2404 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2405 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2406 return;
2407 }
2408
2409 lladdr = NULL;
2410 if (ndopts.nd_opts_tgt_lladdr) {
2411 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2412 skb->dev);
2413 if (!lladdr) {
2414 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2415 return;
2416 }
2417 }
2418
2419 rt = (struct rt6_info *) dst;
2420 if (rt->rt6i_flags & RTF_REJECT) {
2421 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2422 return;
2423 }
2424
2425 /* Redirect received -> path was valid.
2426 * Look, redirects are sent only in response to data packets,
2427 * so that this nexthop apparently is reachable. --ANK
2428 */
2429 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2430
2431 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2432 if (!neigh)
2433 return;
2434
2435 /*
2436 * We have finally decided to accept it.
2437 */
2438
2439 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2440 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2441 NEIGH_UPDATE_F_OVERRIDE|
2442 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2443 NEIGH_UPDATE_F_ISROUTER)),
2444 NDISC_REDIRECT, &ndopts);
2445
2446 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2447 if (!nrt)
2448 goto out;
2449
2450 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2451 if (on_link)
2452 nrt->rt6i_flags &= ~RTF_GATEWAY;
2453
2454 nrt->rt6i_protocol = RTPROT_REDIRECT;
2455 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2456
2457 if (ip6_ins_rt(nrt))
2458 goto out_release;
2459
2460 netevent.old = &rt->dst;
2461 netevent.new = &nrt->dst;
2462 netevent.daddr = &msg->dest;
2463 netevent.neigh = neigh;
2464 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2465
2466 if (rt->rt6i_flags & RTF_CACHE) {
2467 rt = (struct rt6_info *) dst_clone(&rt->dst);
2468 ip6_del_rt(rt);
2469 }
2470
2471out_release:
2472 /* Release the reference taken in
2473 * ip6_rt_cache_alloc()
2474 */
2475 dst_release(&nrt->dst);
2476
2477out:
2478 neigh_release(neigh);
2479}
2480
2481/*
2482 * Misc support functions
2483 */
2484
2485static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2486{
2487 BUG_ON(from->dst.from);
2488
2489 rt->rt6i_flags &= ~RTF_EXPIRES;
2490 dst_hold(&from->dst);
2491 rt->dst.from = &from->dst;
2492 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2493}
2494
2495static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2496{
2497 rt->dst.input = ort->dst.input;
2498 rt->dst.output = ort->dst.output;
2499 rt->rt6i_dst = ort->rt6i_dst;
2500 rt->dst.error = ort->dst.error;
2501 rt->rt6i_idev = ort->rt6i_idev;
2502 if (rt->rt6i_idev)
2503 in6_dev_hold(rt->rt6i_idev);
2504 rt->dst.lastuse = jiffies;
2505 rt->rt6i_gateway = ort->rt6i_gateway;
2506 rt->rt6i_flags = ort->rt6i_flags;
2507 rt6_set_from(rt, ort);
2508 rt->rt6i_metric = ort->rt6i_metric;
2509#ifdef CONFIG_IPV6_SUBTREES
2510 rt->rt6i_src = ort->rt6i_src;
2511#endif
2512 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2513 rt->rt6i_table = ort->rt6i_table;
2514 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2515}
2516
2517#ifdef CONFIG_IPV6_ROUTE_INFO
2518static struct rt6_info *rt6_get_route_info(struct net *net,
2519 const struct in6_addr *prefix, int prefixlen,
2520 const struct in6_addr *gwaddr,
2521 struct net_device *dev)
2522{
2523 u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
2524 struct fib6_node *fn;
2525 struct rt6_info *rt = NULL;
2526 struct fib6_table *table;
2527
2528 table = fib6_get_table(net, tb_id);
2529 if (!table)
2530 return NULL;
2531
2532 read_lock_bh(&table->tb6_lock);
2533 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2534 if (!fn)
2535 goto out;
2536
2537 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2538 if (rt->dst.dev->ifindex != dev->ifindex)
2539 continue;
2540 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2541 continue;
2542 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2543 continue;
2544 dst_hold(&rt->dst);
2545 break;
2546 }
2547out:
2548 read_unlock_bh(&table->tb6_lock);
2549 return rt;
2550}
2551
2552static struct rt6_info *rt6_add_route_info(struct net *net,
2553 const struct in6_addr *prefix, int prefixlen,
2554 const struct in6_addr *gwaddr,
2555 struct net_device *dev,
2556 unsigned int pref)
2557{
2558 struct fib6_config cfg = {
2559 .fc_metric = IP6_RT_PRIO_USER,
2560 .fc_ifindex = dev->ifindex,
2561 .fc_dst_len = prefixlen,
2562 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2563 RTF_UP | RTF_PREF(pref),
2564 .fc_protocol = RTPROT_RA,
2565 .fc_nlinfo.portid = 0,
2566 .fc_nlinfo.nlh = NULL,
2567 .fc_nlinfo.nl_net = net,
2568 };
2569
2570 cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
2571 cfg.fc_dst = *prefix;
2572 cfg.fc_gateway = *gwaddr;
2573
2574 /* We should treat it as a default route if prefix length is 0. */
2575 if (!prefixlen)
2576 cfg.fc_flags |= RTF_DEFAULT;
2577
2578 ip6_route_add(&cfg, NULL);
2579
2580 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2581}
2582#endif
2583
2584struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2585{
2586 u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN);
2587 struct rt6_info *rt;
2588 struct fib6_table *table;
2589
2590 table = fib6_get_table(dev_net(dev), tb_id);
2591 if (!table)
2592 return NULL;
2593
2594 read_lock_bh(&table->tb6_lock);
2595 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2596 if (dev == rt->dst.dev &&
2597 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2598 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2599 break;
2600 }
2601 if (rt)
2602 dst_hold(&rt->dst);
2603 read_unlock_bh(&table->tb6_lock);
2604 return rt;
2605}
2606
2607struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2608 struct net_device *dev,
2609 unsigned int pref)
2610{
2611 struct fib6_config cfg = {
2612 .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
2613 .fc_metric = IP6_RT_PRIO_USER,
2614 .fc_ifindex = dev->ifindex,
2615 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2616 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2617 .fc_protocol = RTPROT_RA,
2618 .fc_nlinfo.portid = 0,
2619 .fc_nlinfo.nlh = NULL,
2620 .fc_nlinfo.nl_net = dev_net(dev),
2621 };
2622
2623 cfg.fc_gateway = *gwaddr;
2624
2625 if (!ip6_route_add(&cfg, NULL)) {
2626 struct fib6_table *table;
2627
2628 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2629 if (table)
2630 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2631 }
2632
2633 return rt6_get_dflt_router(gwaddr, dev);
2634}
2635
2636int rt6_addrconf_purge(struct rt6_info *rt, void *arg) {
2637 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2638 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
2639 return -1;
2640 return 0;
2641}
2642
2643void rt6_purge_dflt_routers(struct net *net)
2644{
2645 fib6_clean_all(net, rt6_addrconf_purge, NULL);
2646}
2647
2648static void rtmsg_to_fib6_config(struct net *net,
2649 struct in6_rtmsg *rtmsg,
2650 struct fib6_config *cfg)
2651{
2652 memset(cfg, 0, sizeof(*cfg));
2653
2654 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2655 : RT6_TABLE_MAIN;
2656 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2657 cfg->fc_metric = rtmsg->rtmsg_metric;
2658 cfg->fc_expires = rtmsg->rtmsg_info;
2659 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2660 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2661 cfg->fc_flags = rtmsg->rtmsg_flags;
2662
2663 cfg->fc_nlinfo.nl_net = net;
2664
2665 cfg->fc_dst = rtmsg->rtmsg_dst;
2666 cfg->fc_src = rtmsg->rtmsg_src;
2667 cfg->fc_gateway = rtmsg->rtmsg_gateway;
2668}
2669
2670int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2671{
2672 struct fib6_config cfg;
2673 struct in6_rtmsg rtmsg;
2674 int err;
2675
2676 switch (cmd) {
2677 case SIOCADDRT: /* Add a route */
2678 case SIOCDELRT: /* Delete a route */
2679 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2680 return -EPERM;
2681 err = copy_from_user(&rtmsg, arg,
2682 sizeof(struct in6_rtmsg));
2683 if (err)
2684 return -EFAULT;
2685
2686 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2687
2688 rtnl_lock();
2689 switch (cmd) {
2690 case SIOCADDRT:
2691 err = ip6_route_add(&cfg, NULL);
2692 break;
2693 case SIOCDELRT:
2694 err = ip6_route_del(&cfg, NULL);
2695 break;
2696 default:
2697 err = -EINVAL;
2698 }
2699 rtnl_unlock();
2700
2701 return err;
2702 }
2703
2704 return -EINVAL;
2705}
2706
2707/*
2708 * Drop the packet on the floor
2709 */
2710
2711static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2712{
2713 int type;
2714 struct dst_entry *dst = skb_dst(skb);
2715 switch (ipstats_mib_noroutes) {
2716 case IPSTATS_MIB_INNOROUTES:
2717 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2718 if (type == IPV6_ADDR_ANY) {
2719 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2720 IPSTATS_MIB_INADDRERRORS);
2721 break;
2722 }
2723 /* FALLTHROUGH */
2724 case IPSTATS_MIB_OUTNOROUTES:
2725 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2726 ipstats_mib_noroutes);
2727 break;
2728 }
2729 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2730 kfree_skb(skb);
2731 return 0;
2732}
2733
2734static int ip6_pkt_discard(struct sk_buff *skb)
2735{
2736 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2737}
2738
2739static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2740{
2741 skb->dev = skb_dst(skb)->dev;
2742 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2743}
2744
2745static int ip6_pkt_prohibit(struct sk_buff *skb)
2746{
2747 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2748}
2749
2750static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2751{
2752 skb->dev = skb_dst(skb)->dev;
2753 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2754}
2755
2756/*
2757 * Allocate a dst for local (unicast / anycast) address.
2758 */
2759
2760struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2761 const struct in6_addr *addr,
2762 bool anycast)
2763{
2764 u32 tb_id;
2765 struct net *net = dev_net(idev->dev);
2766 struct net_device *dev = idev->dev;
2767 struct rt6_info *rt;
2768
2769 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2770 if (!rt)
2771 return ERR_PTR(-ENOMEM);
2772
2773 in6_dev_hold(idev);
2774
2775 rt->dst.flags |= DST_HOST;
2776 rt->dst.input = ip6_input;
2777 rt->dst.output = ip6_output;
2778 rt->rt6i_idev = idev;
2779
2780 rt->rt6i_protocol = RTPROT_KERNEL;
2781 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2782 if (anycast)
2783 rt->rt6i_flags |= RTF_ANYCAST;
2784 else
2785 rt->rt6i_flags |= RTF_LOCAL;
2786
2787 rt->rt6i_gateway = *addr;
2788 rt->rt6i_dst.addr = *addr;
2789 rt->rt6i_dst.plen = 128;
2790 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2791 rt->rt6i_table = fib6_get_table(net, tb_id);
2792
2793 return rt;
2794}
2795
/*
 * Argument bundle for fib6_remove_prefsrc(): identifies the deleted
 * address whose prefsrc entries have to be removed.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace, used to skip its null entry */
	struct in6_addr *addr;		/* the address being removed */
};
2802
2803static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2804{
2805 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2806 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2807 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2808
2809 if (((void *)rt->dst.dev == dev || !dev) &&
2810 rt != net->ipv6.ip6_null_entry &&
2811 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2812 /* remove prefsrc entry */
2813 rt->rt6i_prefsrc.plen = 0;
2814 }
2815 return 0;
2816}
2817
2818void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2819{
2820 struct net *net = dev_net(ifp->idev->dev);
2821 struct arg_dev_net_ip adni = {
2822 .dev = ifp->idev->dev,
2823 .net = net,
2824 .addr = &ifp->addr,
2825 };
2826 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2827}
2828
/* Flag sets tested (all bits required) by fib6_clean_tohost(). */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2831
2832/* Remove routers and update dst entries when gateway turn into host. */
2833static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2834{
2835 struct in6_addr *gateway = (struct in6_addr *)arg;
2836
2837 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2838 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2839 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2840 return -1;
2841 }
2842 return 0;
2843}
2844
2845void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2846{
2847 fib6_clean_all(net, fib6_clean_tohost, gateway);
2848}
2849
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace, used to skip its null entry */
};
2854
2855/* called with write lock held for table with rt */
2856static int fib6_ifdown(struct rt6_info *rt, void *arg)
2857{
2858 const struct arg_dev_net *adn = arg;
2859 const struct net_device *dev = adn->dev;
2860
2861 if ((rt->dst.dev == dev || !dev) &&
2862 rt != adn->net->ipv6.ip6_null_entry &&
2863 (rt->rt6i_nsiblings == 0 ||
2864 (dev && netdev_unregistering(dev)) ||
2865 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2866 return -1;
2867
2868 return 0;
2869}
2870
2871void rt6_ifdown(struct net *net, struct net_device *dev)
2872{
2873 struct arg_dev_net adn = {
2874 .dev = dev,
2875 .net = net,
2876 };
2877
2878 fib6_clean_all(net, fib6_ifdown, &adn);
2879 if (dev)
2880 rt6_uncached_list_flush_dev(net, dev);
2881}
2882
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2887
2888static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2889{
2890 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2891 struct inet6_dev *idev;
2892
2893 /* In IPv6 pmtu discovery is not optional,
2894 so that RTAX_MTU lock cannot disable it.
2895 We still use this lock to block changes
2896 caused by addrconf/ndisc.
2897 */
2898
2899 idev = __in6_dev_get(arg->dev);
2900 if (!idev)
2901 return 0;
2902
2903 /* For administrative MTU increase, there is no way to discover
2904 IPv6 PMTU increase, so PMTU increase should be updated here.
2905 Since RFC 1981 doesn't include administrative MTU increase
2906 update PMTU increase is a MUST. (i.e. jumbo frame)
2907 */
2908 /*
2909 If new MTU is less than route PMTU, this new MTU will be the
2910 lowest MTU in the path, update the route PMTU to reflect PMTU
2911 decreases; if new MTU is greater than route PMTU, and the
2912 old MTU is the lowest MTU in the path, update the route PMTU
2913 to reflect the increase. In this case if the other nodes' MTU
2914 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2915 PMTU discovery.
2916 */
2917 if (rt->dst.dev == arg->dev &&
2918 dst_metric_raw(&rt->dst, RTAX_MTU) &&
2919 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2920 if (rt->rt6i_flags & RTF_CACHE) {
2921 /* For RTF_CACHE with rt6i_pmtu == 0
2922 * (i.e. a redirected route),
2923 * the metrics of its rt->dst.from has already
2924 * been updated.
2925 */
2926 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2927 rt->rt6i_pmtu = arg->mtu;
2928 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2929 (dst_mtu(&rt->dst) < arg->mtu &&
2930 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2931 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2932 }
2933 }
2934 return 0;
2935}
2936
2937void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2938{
2939 struct rt6_mtu_change_arg arg = {
2940 .dev = dev,
2941 .mtu = mtu,
2942 };
2943
2944 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2945}
2946
2947static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2948 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2949 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
2950 [RTA_OIF] = { .type = NLA_U32 },
2951 [RTA_IIF] = { .type = NLA_U32 },
2952 [RTA_PRIORITY] = { .type = NLA_U32 },
2953 [RTA_METRICS] = { .type = NLA_NESTED },
2954 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2955 [RTA_PREF] = { .type = NLA_U8 },
2956 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2957 [RTA_ENCAP] = { .type = NLA_NESTED },
2958 [RTA_EXPIRES] = { .type = NLA_U32 },
2959 [RTA_UID] = { .type = NLA_U32 },
2960 [RTA_MARK] = { .type = NLA_U32 },
2961 [RTA_TABLE] = { .type = NLA_U32 },
2962};
2963
2964static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2965 struct fib6_config *cfg,
2966 struct netlink_ext_ack *extack)
2967{
2968 struct rtmsg *rtm;
2969 struct nlattr *tb[RTA_MAX+1];
2970 unsigned int pref;
2971 int err;
2972
2973 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
2974 NULL);
2975 if (err < 0)
2976 goto errout;
2977
2978 err = -EINVAL;
2979 rtm = nlmsg_data(nlh);
2980 memset(cfg, 0, sizeof(*cfg));
2981
2982 cfg->fc_table = rtm->rtm_table;
2983 cfg->fc_dst_len = rtm->rtm_dst_len;
2984 cfg->fc_src_len = rtm->rtm_src_len;
2985 cfg->fc_flags = RTF_UP;
2986 cfg->fc_protocol = rtm->rtm_protocol;
2987 cfg->fc_type = rtm->rtm_type;
2988
2989 if (rtm->rtm_type == RTN_UNREACHABLE ||
2990 rtm->rtm_type == RTN_BLACKHOLE ||
2991 rtm->rtm_type == RTN_PROHIBIT ||
2992 rtm->rtm_type == RTN_THROW)
2993 cfg->fc_flags |= RTF_REJECT;
2994
2995 if (rtm->rtm_type == RTN_LOCAL)
2996 cfg->fc_flags |= RTF_LOCAL;
2997
2998 if (rtm->rtm_flags & RTM_F_CLONED)
2999 cfg->fc_flags |= RTF_CACHE;
3000
3001 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3002 cfg->fc_nlinfo.nlh = nlh;
3003 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3004
3005 if (tb[RTA_GATEWAY]) {
3006 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3007 cfg->fc_flags |= RTF_GATEWAY;
3008 }
3009 if (tb[RTA_VIA]) {
3010 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
3011 goto errout;
3012 }
3013
3014 if (tb[RTA_DST]) {
3015 int plen = (rtm->rtm_dst_len + 7) >> 3;
3016
3017 if (nla_len(tb[RTA_DST]) < plen)
3018 goto errout;
3019
3020 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3021 }
3022
3023 if (tb[RTA_SRC]) {
3024 int plen = (rtm->rtm_src_len + 7) >> 3;
3025
3026 if (nla_len(tb[RTA_SRC]) < plen)
3027 goto errout;
3028
3029 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3030 }
3031
3032 if (tb[RTA_PREFSRC])
3033 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3034
3035 if (tb[RTA_OIF])
3036 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3037
3038 if (tb[RTA_PRIORITY])
3039 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3040
3041 if (tb[RTA_METRICS]) {
3042 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3043 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3044 }
3045
3046 if (tb[RTA_TABLE])
3047 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3048
3049 if (tb[RTA_MULTIPATH]) {
3050 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3051 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3052
3053 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3054 cfg->fc_mp_len, extack);
3055 if (err < 0)
3056 goto errout;
3057 }
3058
3059 if (tb[RTA_PREF]) {
3060 pref = nla_get_u8(tb[RTA_PREF]);
3061 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3062 pref != ICMPV6_ROUTER_PREF_HIGH)
3063 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3064 cfg->fc_flags |= RTF_PREF(pref);
3065 }
3066
3067 if (tb[RTA_ENCAP])
3068 cfg->fc_encap = tb[RTA_ENCAP];
3069
3070 if (tb[RTA_ENCAP_TYPE]) {
3071 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3072
3073 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3074 if (err < 0)
3075 goto errout;
3076 }
3077
3078 if (tb[RTA_EXPIRES]) {
3079 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3080
3081 if (addrconf_finite_timeout(timeout)) {
3082 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3083 cfg->fc_flags |= RTF_EXPIRES;
3084 }
3085 }
3086
3087 err = 0;
3088errout:
3089 return err;
3090}
3091
3092struct rt6_nh {
3093 struct rt6_info *rt6_info;
3094 struct fib6_config r_cfg;
3095 struct mx6_config mxc;
3096 struct list_head next;
3097};
3098
3099static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3100{
3101 struct rt6_nh *nh;
3102
3103 list_for_each_entry(nh, rt6_nh_list, next) {
3104 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3105 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3106 nh->r_cfg.fc_ifindex);
3107 }
3108}
3109
3110static int ip6_route_info_append(struct list_head *rt6_nh_list,
3111 struct rt6_info *rt, struct fib6_config *r_cfg)
3112{
3113 struct rt6_nh *nh;
3114 int err = -EEXIST;
3115
3116 list_for_each_entry(nh, rt6_nh_list, next) {
3117 /* check if rt6_info already exists */
3118 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3119 return err;
3120 }
3121
3122 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3123 if (!nh)
3124 return -ENOMEM;
3125 nh->rt6_info = rt;
3126 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3127 if (err) {
3128 kfree(nh);
3129 return err;
3130 }
3131 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3132 list_add_tail(&nh->next, rt6_nh_list);
3133
3134 return 0;
3135}
3136
3137static void ip6_route_mpath_notify(struct rt6_info *rt,
3138 struct rt6_info *rt_last,
3139 struct nl_info *info,
3140 __u16 nlflags)
3141{
3142 /* if this is an APPEND route, then rt points to the first route
3143 * inserted and rt_last points to last route inserted. Userspace
3144 * wants a consistent dump of the route which starts at the first
3145 * nexthop. Since sibling routes are always added at the end of
3146 * the list, find the first sibling of the last route appended
3147 */
3148 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3149 rt = list_first_entry(&rt_last->rt6i_siblings,
3150 struct rt6_info,
3151 rt6i_siblings);
3152 }
3153
3154 if (rt)
3155 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3156}
3157
3158static int ip6_route_multipath_add(struct fib6_config *cfg,
3159 struct netlink_ext_ack *extack)
3160{
3161 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3162 struct nl_info *info = &cfg->fc_nlinfo;
3163 struct fib6_config r_cfg;
3164 struct rtnexthop *rtnh;
3165 struct rt6_info *rt;
3166 struct rt6_nh *err_nh;
3167 struct rt6_nh *nh, *nh_safe;
3168 __u16 nlflags;
3169 int remaining;
3170 int attrlen;
3171 int err = 1;
3172 int nhn = 0;
3173 int replace = (cfg->fc_nlinfo.nlh &&
3174 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3175 LIST_HEAD(rt6_nh_list);
3176
3177 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3178 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3179 nlflags |= NLM_F_APPEND;
3180
3181 remaining = cfg->fc_mp_len;
3182 rtnh = (struct rtnexthop *)cfg->fc_mp;
3183
3184 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3185 * rt6_info structs per nexthop
3186 */
3187 while (rtnh_ok(rtnh, remaining)) {
3188 memcpy(&r_cfg, cfg, sizeof(*cfg));
3189 if (rtnh->rtnh_ifindex)
3190 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3191
3192 attrlen = rtnh_attrlen(rtnh);
3193 if (attrlen > 0) {
3194 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3195
3196 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3197 if (nla) {
3198 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3199 r_cfg.fc_flags |= RTF_GATEWAY;
3200 }
3201 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3202 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3203 if (nla)
3204 r_cfg.fc_encap_type = nla_get_u16(nla);
3205 }
3206
3207 rt = ip6_route_info_create(&r_cfg, extack);
3208 if (IS_ERR(rt)) {
3209 err = PTR_ERR(rt);
3210 rt = NULL;
3211 goto cleanup;
3212 }
3213
3214 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3215 if (err) {
3216 dst_release_immediate(&rt->dst);
3217 goto cleanup;
3218 }
3219
3220 rtnh = rtnh_next(rtnh, &remaining);
3221 }
3222
3223 /* for add and replace send one notification with all nexthops.
3224 * Skip the notification in fib6_add_rt2node and send one with
3225 * the full route when done
3226 */
3227 info->skip_notify = 1;
3228
3229 err_nh = NULL;
3230 list_for_each_entry(nh, &rt6_nh_list, next) {
3231 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3232
3233 if (!err) {
3234 /* save reference to last route successfully inserted */
3235 rt_last = nh->rt6_info;
3236
3237 /* save reference to first route for notification */
3238 if (!rt_notif)
3239 rt_notif = nh->rt6_info;
3240 }
3241
3242 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3243 nh->rt6_info = NULL;
3244 if (err) {
3245 if (replace && nhn)
3246 ip6_print_replace_route_err(&rt6_nh_list);
3247 err_nh = nh;
3248 goto add_errout;
3249 }
3250
3251 /* Because each route is added like a single route we remove
3252 * these flags after the first nexthop: if there is a collision,
3253 * we have already failed to add the first nexthop:
3254 * fib6_add_rt2node() has rejected it; when replacing, old
3255 * nexthops have been replaced by first new, the rest should
3256 * be added to it.
3257 */
3258 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3259 NLM_F_REPLACE);
3260 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
3261 nhn++;
3262 }
3263
3264 /* success ... tell user about new route */
3265 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3266 goto cleanup;
3267
3268add_errout:
3269 /* send notification for routes that were added so that
3270 * the delete notifications sent by ip6_route_del are
3271 * coherent
3272 */
3273 if (rt_notif)
3274 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3275
3276 /* Delete routes that were already added */
3277 list_for_each_entry(nh, &rt6_nh_list, next) {
3278 if (err_nh == nh)
3279 break;
3280 ip6_route_del(&nh->r_cfg, extack);
3281 }
3282
3283cleanup:
3284 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3285 if (nh->rt6_info)
3286 dst_release_immediate(&nh->rt6_info->dst);
3287 kfree(nh->mxc.mx);
3288 list_del(&nh->next);
3289 kfree(nh);
3290 }
3291
3292 return err;
3293}
3294
3295static int ip6_route_multipath_del(struct fib6_config *cfg,
3296 struct netlink_ext_ack *extack)
3297{
3298 struct fib6_config r_cfg;
3299 struct rtnexthop *rtnh;
3300 int remaining;
3301 int attrlen;
3302 int err = 1, last_err = 0;
3303
3304 remaining = cfg->fc_mp_len;
3305 rtnh = (struct rtnexthop *)cfg->fc_mp;
3306
3307 /* Parse a Multipath Entry */
3308 while (rtnh_ok(rtnh, remaining)) {
3309 memcpy(&r_cfg, cfg, sizeof(*cfg));
3310 if (rtnh->rtnh_ifindex)
3311 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3312
3313 attrlen = rtnh_attrlen(rtnh);
3314 if (attrlen > 0) {
3315 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3316
3317 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3318 if (nla) {
3319 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3320 r_cfg.fc_flags |= RTF_GATEWAY;
3321 }
3322 }
3323 err = ip6_route_del(&r_cfg, extack);
3324 if (err)
3325 last_err = err;
3326
3327 rtnh = rtnh_next(rtnh, &remaining);
3328 }
3329
3330 return last_err;
3331}
3332
3333static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3334 struct netlink_ext_ack *extack)
3335{
3336 struct fib6_config cfg;
3337 int err;
3338
3339 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3340 if (err < 0)
3341 return err;
3342
3343 if (cfg.fc_mp)
3344 return ip6_route_multipath_del(&cfg, extack);
3345 else {
3346 cfg.fc_delete_all_nh = 1;
3347 return ip6_route_del(&cfg, extack);
3348 }
3349}
3350
3351static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3352 struct netlink_ext_ack *extack)
3353{
3354 struct fib6_config cfg;
3355 int err;
3356
3357 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3358 if (err < 0)
3359 return err;
3360
3361 if (cfg.fc_mp)
3362 return ip6_route_multipath_add(&cfg, extack);
3363 else
3364 return ip6_route_add(&cfg, extack);
3365}
3366
3367static size_t rt6_nlmsg_size(struct rt6_info *rt)
3368{
3369 int nexthop_len = 0;
3370
3371 if (rt->rt6i_nsiblings) {
3372 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3373 + NLA_ALIGN(sizeof(struct rtnexthop))
3374 + nla_total_size(16) /* RTA_GATEWAY */
3375 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3376
3377 nexthop_len *= rt->rt6i_nsiblings;
3378 }
3379
3380 return NLMSG_ALIGN(sizeof(struct rtmsg))
3381 + nla_total_size(16) /* RTA_SRC */
3382 + nla_total_size(16) /* RTA_DST */
3383 + nla_total_size(16) /* RTA_GATEWAY */
3384 + nla_total_size(16) /* RTA_PREFSRC */
3385 + nla_total_size(4) /* RTA_TABLE */
3386 + nla_total_size(4) /* RTA_IIF */
3387 + nla_total_size(4) /* RTA_OIF */
3388 + nla_total_size(4) /* RTA_PRIORITY */
3389 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3390 + nla_total_size(sizeof(struct rta_cacheinfo))
3391 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3392 + nla_total_size(1) /* RTA_PREF */
3393 + lwtunnel_get_encap_size(rt->dst.lwtstate)
3394 + nexthop_len;
3395}
3396
3397static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3398 unsigned int *flags, bool skip_oif)
3399{
3400 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3401 *flags |= RTNH_F_LINKDOWN;
3402 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3403 *flags |= RTNH_F_DEAD;
3404 }
3405
3406 if (rt->rt6i_flags & RTF_GATEWAY) {
3407 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3408 goto nla_put_failure;
3409 }
3410
3411 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3412 *flags |= RTNH_F_OFFLOAD;
3413
3414 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3415 if (!skip_oif && rt->dst.dev &&
3416 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3417 goto nla_put_failure;
3418
3419 if (rt->dst.lwtstate &&
3420 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3421 goto nla_put_failure;
3422
3423 return 0;
3424
3425nla_put_failure:
3426 return -EMSGSIZE;
3427}
3428
3429/* add multipath next hop */
3430static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3431{
3432 struct rtnexthop *rtnh;
3433 unsigned int flags = 0;
3434
3435 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3436 if (!rtnh)
3437 goto nla_put_failure;
3438
3439 rtnh->rtnh_hops = 0;
3440 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3441
3442 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3443 goto nla_put_failure;
3444
3445 rtnh->rtnh_flags = flags;
3446
3447 /* length of rtnetlink header + attributes */
3448 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3449
3450 return 0;
3451
3452nla_put_failure:
3453 return -EMSGSIZE;
3454}
3455
3456static int rt6_fill_node(struct net *net,
3457 struct sk_buff *skb, struct rt6_info *rt,
3458 struct in6_addr *dst, struct in6_addr *src,
3459 int iif, int type, u32 portid, u32 seq,
3460 unsigned int flags)
3461{
3462 u32 metrics[RTAX_MAX];
3463 struct rtmsg *rtm;
3464 struct nlmsghdr *nlh;
3465 long expires;
3466 u32 table;
3467
3468 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3469 if (!nlh)
3470 return -EMSGSIZE;
3471
3472 rtm = nlmsg_data(nlh);
3473 rtm->rtm_family = AF_INET6;
3474 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3475 rtm->rtm_src_len = rt->rt6i_src.plen;
3476 rtm->rtm_tos = 0;
3477 if (rt->rt6i_table)
3478 table = rt->rt6i_table->tb6_id;
3479 else
3480 table = RT6_TABLE_UNSPEC;
3481 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
3482 if (nla_put_u32(skb, RTA_TABLE, table))
3483 goto nla_put_failure;
3484 if (rt->rt6i_flags & RTF_REJECT) {
3485 switch (rt->dst.error) {
3486 case -EINVAL:
3487 rtm->rtm_type = RTN_BLACKHOLE;
3488 break;
3489 case -EACCES:
3490 rtm->rtm_type = RTN_PROHIBIT;
3491 break;
3492 case -EAGAIN:
3493 rtm->rtm_type = RTN_THROW;
3494 break;
3495 default:
3496 rtm->rtm_type = RTN_UNREACHABLE;
3497 break;
3498 }
3499 }
3500 else if (rt->rt6i_flags & RTF_LOCAL)
3501 rtm->rtm_type = RTN_LOCAL;
3502 else if (rt->rt6i_flags & RTF_ANYCAST)
3503 rtm->rtm_type = RTN_ANYCAST;
3504 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3505 rtm->rtm_type = RTN_LOCAL;
3506 else
3507 rtm->rtm_type = RTN_UNICAST;
3508 rtm->rtm_flags = 0;
3509 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3510 rtm->rtm_protocol = rt->rt6i_protocol;
3511
3512 if (rt->rt6i_flags & RTF_CACHE)
3513 rtm->rtm_flags |= RTM_F_CLONED;
3514
3515 if (dst) {
3516 if (nla_put_in6_addr(skb, RTA_DST, dst))
3517 goto nla_put_failure;
3518 rtm->rtm_dst_len = 128;
3519 } else if (rtm->rtm_dst_len)
3520 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3521 goto nla_put_failure;
3522#ifdef CONFIG_IPV6_SUBTREES
3523 if (src) {
3524 if (nla_put_in6_addr(skb, RTA_SRC, src))
3525 goto nla_put_failure;
3526 rtm->rtm_src_len = 128;
3527 } else if (rtm->rtm_src_len &&
3528 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3529 goto nla_put_failure;
3530#endif
3531 if (iif) {
3532#ifdef CONFIG_IPV6_MROUTE
3533 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3534 int err = ip6mr_get_route(net, skb, rtm, portid);
3535
3536 if (err == 0)
3537 return 0;
3538 if (err < 0)
3539 goto nla_put_failure;
3540 } else
3541#endif
3542 if (nla_put_u32(skb, RTA_IIF, iif))
3543 goto nla_put_failure;
3544 } else if (dst) {
3545 struct in6_addr saddr_buf;
3546 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3547 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3548 goto nla_put_failure;
3549 }
3550
3551 if (rt->rt6i_prefsrc.plen) {
3552 struct in6_addr saddr_buf;
3553 saddr_buf = rt->rt6i_prefsrc.addr;
3554 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3555 goto nla_put_failure;
3556 }
3557
3558 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3559 if (rt->rt6i_pmtu)
3560 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3561 if (rtnetlink_put_metrics(skb, metrics) < 0)
3562 goto nla_put_failure;
3563
3564 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3565 goto nla_put_failure;
3566
3567 /* For multipath routes, walk the siblings list and add
3568 * each as a nexthop within RTA_MULTIPATH.
3569 */
3570 if (rt->rt6i_nsiblings) {
3571 struct rt6_info *sibling, *next_sibling;
3572 struct nlattr *mp;
3573
3574 mp = nla_nest_start(skb, RTA_MULTIPATH);
3575 if (!mp)
3576 goto nla_put_failure;
3577
3578 if (rt6_add_nexthop(skb, rt) < 0)
3579 goto nla_put_failure;
3580
3581 list_for_each_entry_safe(sibling, next_sibling,
3582 &rt->rt6i_siblings, rt6i_siblings) {
3583 if (rt6_add_nexthop(skb, sibling) < 0)
3584 goto nla_put_failure;
3585 }
3586
3587 nla_nest_end(skb, mp);
3588 } else {
3589 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3590 goto nla_put_failure;
3591 }
3592
3593 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3594
3595 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3596 goto nla_put_failure;
3597
3598 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3599 goto nla_put_failure;
3600
3601
3602 nlmsg_end(skb, nlh);
3603 return 0;
3604
3605nla_put_failure:
3606 nlmsg_cancel(skb, nlh);
3607 return -EMSGSIZE;
3608}
3609
3610int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3611{
3612 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3613 struct net *net = arg->net;
3614
3615 if (rt == net->ipv6.ip6_null_entry)
3616 return 0;
3617
3618 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3619 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3620
3621 /* user wants prefix routes only */
3622 if (rtm->rtm_flags & RTM_F_PREFIX &&
3623 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3624 /* success since this is not a prefix route */
3625 return 1;
3626 }
3627 }
3628
3629 return rt6_fill_node(net,
3630 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3631 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3632 NLM_F_MULTI);
3633}
3634
3635static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3636 struct netlink_ext_ack *extack)
3637{
3638 struct net *net = sock_net(in_skb->sk);
3639 struct nlattr *tb[RTA_MAX+1];
3640 int err, iif = 0, oif = 0;
3641 struct dst_entry *dst;
3642 struct rt6_info *rt;
3643 struct sk_buff *skb;
3644 struct rtmsg *rtm;
3645 struct flowi6 fl6;
3646 bool fibmatch;
3647
3648 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3649 extack);
3650 if (err < 0)
3651 goto errout;
3652
3653 err = -EINVAL;
3654 memset(&fl6, 0, sizeof(fl6));
3655 rtm = nlmsg_data(nlh);
3656 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3657 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3658
3659 if (tb[RTA_SRC]) {
3660 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3661 goto errout;
3662
3663 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3664 }
3665
3666 if (tb[RTA_DST]) {
3667 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3668 goto errout;
3669
3670 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3671 }
3672
3673 if (tb[RTA_IIF])
3674 iif = nla_get_u32(tb[RTA_IIF]);
3675
3676 if (tb[RTA_OIF])
3677 oif = nla_get_u32(tb[RTA_OIF]);
3678
3679 if (tb[RTA_MARK])
3680 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3681
3682 if (tb[RTA_UID])
3683 fl6.flowi6_uid = make_kuid(current_user_ns(),
3684 nla_get_u32(tb[RTA_UID]));
3685 else
3686 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3687
3688 if (iif) {
3689 struct net_device *dev;
3690 int flags = 0;
3691
3692 rcu_read_lock();
3693
3694 dev = dev_get_by_index_rcu(net, iif);
3695 if (!dev) {
3696 rcu_read_unlock();
3697 err = -ENODEV;
3698 goto errout;
3699 }
3700
3701 fl6.flowi6_iif = iif;
3702
3703 if (!ipv6_addr_any(&fl6.saddr))
3704 flags |= RT6_LOOKUP_F_HAS_SADDR;
3705
3706 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3707
3708 rcu_read_unlock();
3709 } else {
3710 fl6.flowi6_oif = oif;
3711
3712 dst = ip6_route_output(net, NULL, &fl6);
3713 }
3714
3715
3716 rt = container_of(dst, struct rt6_info, dst);
3717 if (rt->dst.error) {
3718 err = rt->dst.error;
3719 ip6_rt_put(rt);
3720 goto errout;
3721 }
3722
3723 if (rt == net->ipv6.ip6_null_entry) {
3724 err = rt->dst.error;
3725 ip6_rt_put(rt);
3726 goto errout;
3727 }
3728
3729 if (fibmatch && rt->dst.from) {
3730 struct rt6_info *ort = container_of(rt->dst.from,
3731 struct rt6_info, dst);
3732
3733 dst_hold(&ort->dst);
3734 ip6_rt_put(rt);
3735 rt = ort;
3736 }
3737
3738 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3739 if (!skb) {
3740 ip6_rt_put(rt);
3741 err = -ENOBUFS;
3742 goto errout;
3743 }
3744
3745 skb_dst_set(skb, &rt->dst);
3746 if (fibmatch)
3747 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3748 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3749 nlh->nlmsg_seq, 0);
3750 else
3751 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3752 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3753 nlh->nlmsg_seq, 0);
3754 if (err < 0) {
3755 kfree_skb(skb);
3756 goto errout;
3757 }
3758
3759 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3760errout:
3761 return err;
3762}
3763
3764void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3765 unsigned int nlm_flags)
3766{
3767 struct sk_buff *skb;
3768 struct net *net = info->nl_net;
3769 u32 seq;
3770 int err;
3771
3772 err = -ENOBUFS;
3773 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3774
3775 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3776 if (!skb)
3777 goto errout;
3778
3779 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3780 event, info->portid, seq, nlm_flags);
3781 if (err < 0) {
3782 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3783 WARN_ON(err == -EMSGSIZE);
3784 kfree_skb(skb);
3785 goto errout;
3786 }
3787 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3788 info->nlh, gfp_any());
3789 return;
3790errout:
3791 if (err < 0)
3792 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3793}
3794
3795static int ip6_route_dev_notify(struct notifier_block *this,
3796 unsigned long event, void *ptr)
3797{
3798 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3799 struct net *net = dev_net(dev);
3800
3801 if (!(dev->flags & IFF_LOOPBACK))
3802 return NOTIFY_OK;
3803
3804 if (event == NETDEV_REGISTER) {
3805 net->ipv6.ip6_null_entry->dst.dev = dev;
3806 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3807#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3808 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3809 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3810 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3811 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3812#endif
3813 } else if (event == NETDEV_UNREGISTER &&
3814 dev->reg_state != NETREG_UNREGISTERED) {
3815 /* NETDEV_UNREGISTER could be fired for multiple times by
3816 * netdev_wait_allrefs(). Make sure we only call this once.
3817 */
3818 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3819#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3820 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3821 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3822#endif
3823 }
3824
3825 return NOTIFY_OK;
3826}
3827
3828/*
3829 * /proc
3830 */
3831
3832#ifdef CONFIG_PROC_FS
3833
3834static const struct file_operations ipv6_route_proc_fops = {
3835 .owner = THIS_MODULE,
3836 .open = ipv6_route_open,
3837 .read = seq_read,
3838 .llseek = seq_lseek,
3839 .release = seq_release_net,
3840};
3841
3842static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3843{
3844 struct net *net = (struct net *)seq->private;
3845 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3846 net->ipv6.rt6_stats->fib_nodes,
3847 net->ipv6.rt6_stats->fib_route_nodes,
3848 net->ipv6.rt6_stats->fib_rt_alloc,
3849 net->ipv6.rt6_stats->fib_rt_entries,
3850 net->ipv6.rt6_stats->fib_rt_cache,
3851 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3852 net->ipv6.rt6_stats->fib_discarded_routes);
3853
3854 return 0;
3855}
3856
3857static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3858{
3859 return single_open_net(inode, file, rt6_stats_seq_show);
3860}
3861
3862static const struct file_operations rt6_stats_seq_fops = {
3863 .owner = THIS_MODULE,
3864 .open = rt6_stats_seq_open,
3865 .read = seq_read,
3866 .llseek = seq_lseek,
3867 .release = single_release_net,
3868};
3869#endif /* CONFIG_PROC_FS */
3870
3871#ifdef CONFIG_SYSCTL
3872
3873static
3874int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3875 void __user *buffer, size_t *lenp, loff_t *ppos)
3876{
3877 struct net *net;
3878 int delay;
3879 if (!write)
3880 return -EINVAL;
3881
3882 net = (struct net *)ctl->extra1;
3883 delay = net->ipv6.sysctl.flush_delay;
3884 proc_dointvec(ctl, write, buffer, lenp, ppos);
3885 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3886 return 0;
3887}
3888
3889struct ctl_table ipv6_route_table_template[] = {
3890 {
3891 .procname = "flush",
3892 .data = &init_net.ipv6.sysctl.flush_delay,
3893 .maxlen = sizeof(int),
3894 .mode = 0200,
3895 .proc_handler = ipv6_sysctl_rtcache_flush
3896 },
3897 {
3898 .procname = "gc_thresh",
3899 .data = &ip6_dst_ops_template.gc_thresh,
3900 .maxlen = sizeof(int),
3901 .mode = 0644,
3902 .proc_handler = proc_dointvec,
3903 },
3904 {
3905 .procname = "max_size",
3906 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3907 .maxlen = sizeof(int),
3908 .mode = 0644,
3909 .proc_handler = proc_dointvec,
3910 },
3911 {
3912 .procname = "gc_min_interval",
3913 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3914 .maxlen = sizeof(int),
3915 .mode = 0644,
3916 .proc_handler = proc_dointvec_jiffies,
3917 },
3918 {
3919 .procname = "gc_timeout",
3920 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3921 .maxlen = sizeof(int),
3922 .mode = 0644,
3923 .proc_handler = proc_dointvec_jiffies,
3924 },
3925 {
3926 .procname = "gc_interval",
3927 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3928 .maxlen = sizeof(int),
3929 .mode = 0644,
3930 .proc_handler = proc_dointvec_jiffies,
3931 },
3932 {
3933 .procname = "gc_elasticity",
3934 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3935 .maxlen = sizeof(int),
3936 .mode = 0644,
3937 .proc_handler = proc_dointvec,
3938 },
3939 {
3940 .procname = "mtu_expires",
3941 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3942 .maxlen = sizeof(int),
3943 .mode = 0644,
3944 .proc_handler = proc_dointvec_jiffies,
3945 },
3946 {
3947 .procname = "min_adv_mss",
3948 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3949 .maxlen = sizeof(int),
3950 .mode = 0644,
3951 .proc_handler = proc_dointvec,
3952 },
3953 {
3954 .procname = "gc_min_interval_ms",
3955 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3956 .maxlen = sizeof(int),
3957 .mode = 0644,
3958 .proc_handler = proc_dointvec_ms_jiffies,
3959 },
3960 { }
3961};
3962
3963struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3964{
3965 struct ctl_table *table;
3966
3967 table = kmemdup(ipv6_route_table_template,
3968 sizeof(ipv6_route_table_template),
3969 GFP_KERNEL);
3970
3971 if (table) {
3972 table[0].data = &net->ipv6.sysctl.flush_delay;
3973 table[0].extra1 = net;
3974 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3975 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3976 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3977 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3978 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3979 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3980 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3981 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3982 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3983
3984 /* Don't export sysctls to unprivileged users */
3985 if (net->user_ns != &init_user_ns)
3986 table[0].procname = NULL;
3987 }
3988
3989 return table;
3990}
3991#endif
3992
3993static int __net_init ip6_route_net_init(struct net *net)
3994{
3995 int ret = -ENOMEM;
3996
3997 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3998 sizeof(net->ipv6.ip6_dst_ops));
3999
4000 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4001 goto out_ip6_dst_ops;
4002
4003 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4004 sizeof(*net->ipv6.ip6_null_entry),
4005 GFP_KERNEL);
4006 if (!net->ipv6.ip6_null_entry)
4007 goto out_ip6_dst_entries;
4008 net->ipv6.ip6_null_entry->dst.path =
4009 (struct dst_entry *)net->ipv6.ip6_null_entry;
4010 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4011 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4012 ip6_template_metrics, true);
4013
4014#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4015 net->ipv6.fib6_has_custom_rules = false;
4016 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4017 sizeof(*net->ipv6.ip6_prohibit_entry),
4018 GFP_KERNEL);
4019 if (!net->ipv6.ip6_prohibit_entry)
4020 goto out_ip6_null_entry;
4021 net->ipv6.ip6_prohibit_entry->dst.path =
4022 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4023 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4024 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4025 ip6_template_metrics, true);
4026
4027 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4028 sizeof(*net->ipv6.ip6_blk_hole_entry),
4029 GFP_KERNEL);
4030 if (!net->ipv6.ip6_blk_hole_entry)
4031 goto out_ip6_prohibit_entry;
4032 net->ipv6.ip6_blk_hole_entry->dst.path =
4033 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4034 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4035 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4036 ip6_template_metrics, true);
4037#endif
4038
4039 net->ipv6.sysctl.flush_delay = 0;
4040 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4041 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4042 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4043 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4044 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4045 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4046 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4047
4048 net->ipv6.ip6_rt_gc_expire = 30*HZ;
4049
4050 ret = 0;
4051out:
4052 return ret;
4053
4054#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4055out_ip6_prohibit_entry:
4056 kfree(net->ipv6.ip6_prohibit_entry);
4057out_ip6_null_entry:
4058 kfree(net->ipv6.ip6_null_entry);
4059#endif
4060out_ip6_dst_entries:
4061 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4062out_ip6_dst_ops:
4063 goto out;
4064}
4065
4066static void __net_exit ip6_route_net_exit(struct net *net)
4067{
4068 kfree(net->ipv6.ip6_null_entry);
4069#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4070 kfree(net->ipv6.ip6_prohibit_entry);
4071 kfree(net->ipv6.ip6_blk_hole_entry);
4072#endif
4073 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4074}
4075
4076static int __net_init ip6_route_net_init_late(struct net *net)
4077{
4078#ifdef CONFIG_PROC_FS
4079 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4080 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4081#endif
4082 return 0;
4083}
4084
4085static void __net_exit ip6_route_net_exit_late(struct net *net)
4086{
4087#ifdef CONFIG_PROC_FS
4088 remove_proc_entry("ipv6_route", net->proc_net);
4089 remove_proc_entry("rt6_stats", net->proc_net);
4090#endif
4091}
4092
4093static struct pernet_operations ip6_route_net_ops = {
4094 .init = ip6_route_net_init,
4095 .exit = ip6_route_net_exit,
4096};
4097
4098static int __net_init ipv6_inetpeer_init(struct net *net)
4099{
4100 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4101
4102 if (!bp)
4103 return -ENOMEM;
4104 inet_peer_base_init(bp);
4105 net->ipv6.peers = bp;
4106 return 0;
4107}
4108
4109static void __net_exit ipv6_inetpeer_exit(struct net *net)
4110{
4111 struct inet_peer_base *bp = net->ipv6.peers;
4112
4113 net->ipv6.peers = NULL;
4114 inetpeer_invalidate_tree(bp);
4115 kfree(bp);
4116}
4117
4118static struct pernet_operations ipv6_inetpeer_ops = {
4119 .init = ipv6_inetpeer_init,
4120 .exit = ipv6_inetpeer_exit,
4121};
4122
4123static struct pernet_operations ip6_route_net_late_ops = {
4124 .init = ip6_route_net_init_late,
4125 .exit = ip6_route_net_exit_late,
4126};
4127
4128static struct notifier_block ip6_route_dev_notifier = {
4129 .notifier_call = ip6_route_dev_notify,
4130 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4131};
4132
4133void __init ip6_route_init_special_entries(void)
4134{
4135 /* Registering of the loopback is done before this portion of code,
4136 * the loopback reference in rt6_info will not be taken, do it
4137 * manually for init_net */
4138 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4139 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4140 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4141 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4142 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4143 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4144 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4145 #endif
4146}
4147
4148int __init ip6_route_init(void)
4149{
4150 int ret;
4151 int cpu;
4152
4153 ret = -ENOMEM;
4154 ip6_dst_ops_template.kmem_cachep =
4155 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4156 SLAB_HWCACHE_ALIGN, NULL);
4157 if (!ip6_dst_ops_template.kmem_cachep)
4158 goto out;
4159
4160 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4161 if (ret)
4162 goto out_kmem_cache;
4163
4164 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4165 if (ret)
4166 goto out_dst_entries;
4167
4168 ret = register_pernet_subsys(&ip6_route_net_ops);
4169 if (ret)
4170 goto out_register_inetpeer;
4171
4172 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4173
4174 ret = fib6_init();
4175 if (ret)
4176 goto out_register_subsys;
4177
4178 ret = xfrm6_init();
4179 if (ret)
4180 goto out_fib6_init;
4181
4182 ret = fib6_rules_init();
4183 if (ret)
4184 goto xfrm6_init;
4185
4186 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4187 if (ret)
4188 goto fib6_rules_init;
4189
4190 ret = -ENOBUFS;
4191 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4192 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4193 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4194 RTNL_FLAG_DOIT_UNLOCKED))
4195 goto out_register_late_subsys;
4196
4197 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4198 if (ret)
4199 goto out_register_late_subsys;
4200
4201 for_each_possible_cpu(cpu) {
4202 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4203
4204 INIT_LIST_HEAD(&ul->head);
4205 spin_lock_init(&ul->lock);
4206 }
4207
4208out:
4209 return ret;
4210
4211out_register_late_subsys:
4212 unregister_pernet_subsys(&ip6_route_net_late_ops);
4213fib6_rules_init:
4214 fib6_rules_cleanup();
4215xfrm6_init:
4216 xfrm6_fini();
4217out_fib6_init:
4218 fib6_gc_cleanup();
4219out_register_subsys:
4220 unregister_pernet_subsys(&ip6_route_net_ops);
4221out_register_inetpeer:
4222 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4223out_dst_entries:
4224 dst_entries_destroy(&ip6_dst_blackhole_ops);
4225out_kmem_cache:
4226 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4227 goto out;
4228}
4229
4230void ip6_route_cleanup(void)
4231{
4232 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4233 unregister_pernet_subsys(&ip6_route_net_late_ops);
4234 fib6_rules_cleanup();
4235 xfrm6_fini();
4236 fib6_gc_cleanup();
4237 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4238 unregister_pernet_subsys(&ip6_route_net_ops);
4239 dst_entries_destroy(&ip6_dst_blackhole_ops);
4240 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4241}