1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * ROUTE - implementation of the IP router.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14 *
15 * Fixes:
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
25 * clamper.
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
40 *
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
59 */
60
61#define pr_fmt(fmt) "IPv4: " fmt
62
63#include <linux/module.h>
64#include <linux/uaccess.h>
65#include <linux/bitops.h>
66#include <linux/types.h>
67#include <linux/kernel.h>
68#include <linux/mm.h>
69#include <linux/memblock.h>
70#include <linux/string.h>
71#include <linux/socket.h>
72#include <linux/sockios.h>
73#include <linux/errno.h>
74#include <linux/in.h>
75#include <linux/inet.h>
76#include <linux/netdevice.h>
77#include <linux/proc_fs.h>
78#include <linux/init.h>
79#include <linux/skbuff.h>
80#include <linux/inetdevice.h>
81#include <linux/igmp.h>
82#include <linux/pkt_sched.h>
83#include <linux/mroute.h>
84#include <linux/netfilter_ipv4.h>
85#include <linux/random.h>
86#include <linux/rcupdate.h>
87#include <linux/times.h>
88#include <linux/slab.h>
89#include <linux/jhash.h>
90#include <net/dst.h>
91#include <net/dst_metadata.h>
92#include <net/net_namespace.h>
93#include <net/protocol.h>
94#include <net/ip.h>
95#include <net/route.h>
96#include <net/inetpeer.h>
97#include <net/sock.h>
98#include <net/ip_fib.h>
99#include <net/nexthop.h>
100#include <net/arp.h>
101#include <net/tcp.h>
102#include <net/icmp.h>
103#include <net/xfrm.h>
104#include <net/lwtunnel.h>
105#include <net/netevent.h>
106#include <net/rtnetlink.h>
107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110#include <net/secure_seq.h>
111#include <net/ip_tunnels.h>
112#include <net/l3mdev.h>
113
114#include "fib_lookup.h"
115
116#define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119#define RT_GC_TIMEOUT (300*HZ)
120
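/* Defaults for the ip_rt_* tunables below; most of them are also exposed
 * as sysctls under /proc/sys/net/ipv4/route/ (registered later in this
 * file), so the values here are only boot-time defaults.
 */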
121static int ip_rt_max_size;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
129static int ip_rt_min_advmss __read_mostly = 256;
130
131static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
132
133/*
134 * Interface to generic destination cache.
135 */
136
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst);
140static void ipv4_negative_advice(struct sock *sk,
141 struct dst_entry *dst);
142static void ipv4_link_failure(struct sk_buff *skb);
143static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
144 struct sk_buff *skb, u32 mtu,
145 bool confirm_neigh);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149
150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151{
152 WARN_ON(1);
153 return NULL;
154}
155
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161static struct dst_ops ipv4_dst_ops = {
162 .family = AF_INET,
163 .check = ipv4_dst_check,
164 .default_advmss = ipv4_default_advmss,
165 .mtu = ipv4_mtu,
166 .cow_metrics = ipv4_cow_metrics,
167 .destroy = ipv4_dst_destroy,
168 .negative_advice = ipv4_negative_advice,
169 .link_failure = ipv4_link_failure,
170 .update_pmtu = ip_rt_update_pmtu,
171 .redirect = ip_do_redirect,
172 .local_out = __ip_local_out,
173 .neigh_lookup = ipv4_neigh_lookup,
174 .confirm_neigh = ipv4_confirm_neigh,
175};
176
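/* ip_tos2prio[] maps the four legacy RFC 1349 TOS bits to a queueing
 * priority; rt_tos2priority() indexes it with IPTOS_TOS(tos) >> 1, so e.g.
 * IPTOS_LOWDELAY (0x10) selects TC_PRIO_INTERACTIVE.
 */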
177#define ECN_OR_COST(class) TC_PRIO_##class
178
179const __u8 ip_tos2prio[16] = {
180 TC_PRIO_BESTEFFORT,
181 ECN_OR_COST(BESTEFFORT),
182 TC_PRIO_BESTEFFORT,
183 ECN_OR_COST(BESTEFFORT),
184 TC_PRIO_BULK,
185 ECN_OR_COST(BULK),
186 TC_PRIO_BULK,
187 ECN_OR_COST(BULK),
188 TC_PRIO_INTERACTIVE,
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE,
191 ECN_OR_COST(INTERACTIVE),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK)
196};
197EXPORT_SYMBOL(ip_tos2prio);
198
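/* Per-cpu counters, dumped as /proc/net/stat/rt_cache by rt_cpu_seq_show()
 * below.
 */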
199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202#ifdef CONFIG_PROC_FS
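/* /proc/net/rt_cache is a header-only stub these days: the per-flow IPv4
 * routing cache is long gone (removed around v3.6), but the file is kept so
 * that existing tools which parse it keep working.
 */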
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
205 if (*pos)
206 return NULL;
207 return SEQ_START_TOKEN;
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
212 ++*pos;
213 return NULL;
214}
215
216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217{
218}
219
220static int rt_cache_seq_show(struct seq_file *seq, void *v)
221{
222 if (v == SEQ_START_TOKEN)
223 seq_printf(seq, "%-127s\n",
224 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 "HHUptod\tSpecDst");
227 return 0;
228}
229
230static const struct seq_operations rt_cache_seq_ops = {
231 .start = rt_cache_seq_start,
232 .next = rt_cache_seq_next,
233 .stop = rt_cache_seq_stop,
234 .show = rt_cache_seq_show,
235};
236
237static int rt_cache_seq_open(struct inode *inode, struct file *file)
238{
239 return seq_open(file, &rt_cache_seq_ops);
240}
241
242static const struct file_operations rt_cache_seq_fops = {
243 .open = rt_cache_seq_open,
244 .read = seq_read,
245 .llseek = seq_lseek,
246 .release = seq_release,
247};
248
249
250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251{
252 int cpu;
253
254 if (*pos == 0)
255 return SEQ_START_TOKEN;
256
257 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258 if (!cpu_possible(cpu))
259 continue;
260 *pos = cpu+1;
261 return &per_cpu(rt_cache_stat, cpu);
262 }
263 return NULL;
264}
265
266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267{
268 int cpu;
269
270 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271 if (!cpu_possible(cpu))
272 continue;
273 *pos = cpu+1;
274 return &per_cpu(rt_cache_stat, cpu);
275 }
276 (*pos)++;
277 return NULL;
278
279}
280
281static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282{
283
284}
285
286static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287{
288 struct rt_cache_stat *st = v;
289
290 if (v == SEQ_START_TOKEN) {
291 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292 return 0;
293 }
294
295 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
296 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297 dst_entries_get_slow(&ipv4_dst_ops),
298 0, /* st->in_hit */
299 st->in_slow_tot,
300 st->in_slow_mc,
301 st->in_no_route,
302 st->in_brd,
303 st->in_martian_dst,
304 st->in_martian_src,
305
306 0, /* st->out_hit */
307 st->out_slow_tot,
308 st->out_slow_mc,
309
310 0, /* st->gc_total */
311 0, /* st->gc_ignored */
312 0, /* st->gc_goal_miss */
313 0, /* st->gc_dst_overflow */
314 0, /* st->in_hlist_search */
315 0 /* st->out_hlist_search */
316 );
317 return 0;
318}
319
320static const struct seq_operations rt_cpu_seq_ops = {
321 .start = rt_cpu_seq_start,
322 .next = rt_cpu_seq_next,
323 .stop = rt_cpu_seq_stop,
324 .show = rt_cpu_seq_show,
325};
326
327
328static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329{
330 return seq_open(file, &rt_cpu_seq_ops);
331}
332
333static const struct file_operations rt_cpu_seq_fops = {
334 .open = rt_cpu_seq_open,
335 .read = seq_read,
336 .llseek = seq_lseek,
337 .release = seq_release,
338};
339
340#ifdef CONFIG_IP_ROUTE_CLASSID
341static int rt_acct_proc_show(struct seq_file *m, void *v)
342{
343 struct ip_rt_acct *dst, *src;
344 unsigned int i, j;
345
346 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 if (!dst)
348 return -ENOMEM;
349
350 for_each_possible_cpu(i) {
351 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
352 for (j = 0; j < 256; j++) {
353 dst[j].o_bytes += src[j].o_bytes;
354 dst[j].o_packets += src[j].o_packets;
355 dst[j].i_bytes += src[j].i_bytes;
356 dst[j].i_packets += src[j].i_packets;
357 }
358 }
359
360 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
361 kfree(dst);
362 return 0;
363}
364#endif
365
366static int __net_init ip_rt_do_proc_init(struct net *net)
367{
368 struct proc_dir_entry *pde;
369
370 pde = proc_create("rt_cache", 0444, net->proc_net,
371 &rt_cache_seq_fops);
372 if (!pde)
373 goto err1;
374
375 pde = proc_create("rt_cache", 0444,
376 net->proc_net_stat, &rt_cpu_seq_fops);
377 if (!pde)
378 goto err2;
379
380#ifdef CONFIG_IP_ROUTE_CLASSID
381 pde = proc_create_single("rt_acct", 0, net->proc_net,
382 rt_acct_proc_show);
383 if (!pde)
384 goto err3;
385#endif
386 return 0;
387
388#ifdef CONFIG_IP_ROUTE_CLASSID
389err3:
390 remove_proc_entry("rt_cache", net->proc_net_stat);
391#endif
392err2:
393 remove_proc_entry("rt_cache", net->proc_net);
394err1:
395 return -ENOMEM;
396}
397
398static void __net_exit ip_rt_do_proc_exit(struct net *net)
399{
400 remove_proc_entry("rt_cache", net->proc_net_stat);
401 remove_proc_entry("rt_cache", net->proc_net);
402#ifdef CONFIG_IP_ROUTE_CLASSID
403 remove_proc_entry("rt_acct", net->proc_net);
404#endif
405}
406
407static struct pernet_operations ip_rt_proc_ops __net_initdata = {
408 .init = ip_rt_do_proc_init,
409 .exit = ip_rt_do_proc_exit,
410};
411
412static int __init ip_rt_proc_init(void)
413{
414 if (IS_ENABLED(CONFIG_PROC_STRIPPED))
415 return 0;
416
417 return register_pernet_subsys(&ip_rt_proc_ops);
418}
419
420#else
421static inline int ip_rt_proc_init(void)
422{
423 return 0;
424}
425#endif /* CONFIG_PROC_FS */
426
427static inline bool rt_is_expired(const struct rtable *rth)
428{
429 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
430}
431
432void rt_cache_flush(struct net *net)
433{
434 rt_genid_bump_ipv4(net);
435}
436
437static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
438 struct sk_buff *skb,
439 const void *daddr)
440{
441 const struct rtable *rt = container_of(dst, struct rtable, dst);
442 struct net_device *dev = dst->dev;
443 struct neighbour *n;
444
445 rcu_read_lock_bh();
446
447 if (likely(rt->rt_gw_family == AF_INET)) {
448 n = ip_neigh_gw4(dev, rt->rt_gw4);
449 } else if (rt->rt_gw_family == AF_INET6) {
450 n = ip_neigh_gw6(dev, &rt->rt_gw6);
451 } else {
452 __be32 pkey;
453
454 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
455 n = ip_neigh_gw4(dev, pkey);
456 }
457
458 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
459 n = NULL;
460
461 rcu_read_unlock_bh();
462
463 return n;
464}
465
466static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
467{
468 const struct rtable *rt = container_of(dst, struct rtable, dst);
469 struct net_device *dev = dst->dev;
470 const __be32 *pkey = daddr;
471
472 if (rt->rt_gw_family == AF_INET) {
473 pkey = (const __be32 *)&rt->rt_gw4;
474 } else if (rt->rt_gw_family == AF_INET6) {
475 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
476 } else if (!daddr ||
477 (rt->rt_flags &
478 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
479 return;
480 }
481 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
482}
483
484/* Hash tables of size 2048..262144 depending on RAM size.
485 * Each bucket uses 8 bytes.
486 */
487static u32 ip_idents_mask __read_mostly;
488static atomic_t *ip_idents __read_mostly;
489static u32 *ip_tstamps __read_mostly;
490
491/* In order to protect privacy, we add a perturbation to identifiers
492 * if one generator is seldom used. This makes it hard for an attacker
493 * to infer how many packets were sent between two points in time.
494 */
495u32 ip_idents_reserve(u32 hash, int segs)
496{
497 u32 bucket, old, now = (u32)jiffies;
498 atomic_t *p_id;
499 u32 *p_tstamp;
500 u32 delta = 0;
501
502 bucket = hash & ip_idents_mask;
503 p_tstamp = ip_tstamps + bucket;
504 p_id = ip_idents + bucket;
505 old = READ_ONCE(*p_tstamp);
506
507 if (old != now && cmpxchg(p_tstamp, old, now) == old)
508 delta = prandom_u32_max(now - old);
509
510 /* If UBSAN reports an error there, please make sure your compiler
511 * supports -fno-strict-overflow before reporting it: that was a bug
512 * in UBSAN, and it has been fixed in GCC-8.
513 */
514 return atomic_add_return(segs + delta, p_id) - segs;
515}
516EXPORT_SYMBOL(ip_idents_reserve);
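/* Example of the arithmetic above: with segs == 3, delta == 0 and a bucket
 * currently holding 100, the call returns 100 and leaves the counter at 103,
 * i.e. the caller owns IDs 100..102.  A bucket that was idle gets an extra
 * random delta added first, hiding how many IDs were consumed meanwhile.
 */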
517
518void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
519{
520 u32 hash, id;
521
522 /* Note the following code is not safe, but this is okay. */
523 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
524 get_random_bytes(&net->ipv4.ip_id_key,
525 sizeof(net->ipv4.ip_id_key));
526
527 hash = siphash_3u32((__force u32)iph->daddr,
528 (__force u32)iph->saddr,
529 iph->protocol,
530 &net->ipv4.ip_id_key);
531 id = ip_idents_reserve(hash, segs);
532 iph->id = htons(id);
533}
534EXPORT_SYMBOL(__ip_select_ident);
535
536static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
537 const struct sock *sk,
538 const struct iphdr *iph,
539 int oif, u8 tos,
540 u8 prot, u32 mark, int flow_flags)
541{
542 if (sk) {
543 const struct inet_sock *inet = inet_sk(sk);
544
545 oif = sk->sk_bound_dev_if;
546 mark = sk->sk_mark;
547 tos = RT_CONN_FLAGS(sk);
548 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
549 }
550 flowi4_init_output(fl4, oif, mark, tos,
551 RT_SCOPE_UNIVERSE, prot,
552 flow_flags,
553 iph->daddr, iph->saddr, 0, 0,
554 sock_net_uid(net, sk));
555}
556
557static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
558 const struct sock *sk)
559{
560 const struct net *net = dev_net(skb->dev);
561 const struct iphdr *iph = ip_hdr(skb);
562 int oif = skb->dev->ifindex;
563 u8 tos = RT_TOS(iph->tos);
564 u8 prot = iph->protocol;
565 u32 mark = skb->mark;
566
567 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
568}
569
570static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
571{
572 const struct inet_sock *inet = inet_sk(sk);
573 const struct ip_options_rcu *inet_opt;
574 __be32 daddr = inet->inet_daddr;
575
576 rcu_read_lock();
577 inet_opt = rcu_dereference(inet->inet_opt);
578 if (inet_opt && inet_opt->opt.srr)
579 daddr = inet_opt->opt.faddr;
580 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
581 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
582 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
583 inet_sk_flowi_flags(sk),
584 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
585 rcu_read_unlock();
586}
587
588static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
589 const struct sk_buff *skb)
590{
591 if (skb)
592 build_skb_flow_key(fl4, skb, sk);
593 else
594 build_sk_flow_key(fl4, sk);
595}
596
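/* fnhe_lock serializes all writers of the per-nexthop exception hash
 * (update_or_create_fnhe(), ip_del_fnhe(), rt_bind_exception()); readers
 * such as find_exception() walk the chains under RCU only.
 */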
597static DEFINE_SPINLOCK(fnhe_lock);
598
599static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
600{
601 struct rtable *rt;
602
603 rt = rcu_dereference(fnhe->fnhe_rth_input);
604 if (rt) {
605 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
606 dst_dev_put(&rt->dst);
607 dst_release(&rt->dst);
608 }
609 rt = rcu_dereference(fnhe->fnhe_rth_output);
610 if (rt) {
611 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
612 dst_dev_put(&rt->dst);
613 dst_release(&rt->dst);
614 }
615}
616
617static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
618{
619 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
620 struct fib_nh_exception *fnhe, *oldest = NULL;
621
622 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
623 fnhe = rcu_dereference_protected(*fnhe_p,
624 lockdep_is_held(&fnhe_lock));
625 if (!fnhe)
626 break;
627 if (!oldest ||
628 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
629 oldest = fnhe;
630 oldest_p = fnhe_p;
631 }
632 }
633 fnhe_flush_routes(oldest);
634 *oldest_p = oldest->fnhe_next;
635 kfree_rcu(oldest, rcu);
636}
637
638static u32 fnhe_hashfun(__be32 daddr)
639{
640 static siphash_key_t fnhe_hash_key __read_mostly;
641 u64 hval;
642
643 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
644 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
645 return hash_64(hval, FNHE_HASH_SHIFT);
646}
647
648static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
649{
650 rt->rt_pmtu = fnhe->fnhe_pmtu;
651 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
652 rt->dst.expires = fnhe->fnhe_expires;
653
654 if (fnhe->fnhe_gw) {
655 rt->rt_flags |= RTCF_REDIRECTED;
656 rt->rt_uses_gateway = 1;
657 rt->rt_gw_family = AF_INET;
658 rt->rt_gw4 = fnhe->fnhe_gw;
659 }
660}
661
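/* Create or refresh the exception entry for @daddr on this nexthop: learned
 * gateways (ICMP redirects) and learned path MTUs both land here.  Cached
 * routes hanging off the nexthop are marked DST_OBSOLETE_KILL so the next
 * lookup re-evaluates the exception.
 */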
662static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
663 __be32 gw, u32 pmtu, bool lock,
664 unsigned long expires)
665{
666 struct fnhe_hash_bucket *hash;
667 struct fib_nh_exception *fnhe;
668 struct rtable *rt;
669 u32 genid, hval;
670 unsigned int i;
671 int depth;
672
673 genid = fnhe_genid(dev_net(nhc->nhc_dev));
674 hval = fnhe_hashfun(daddr);
675
676 spin_lock_bh(&fnhe_lock);
677
678 hash = rcu_dereference(nhc->nhc_exceptions);
679 if (!hash) {
680 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
681 if (!hash)
682 goto out_unlock;
683 rcu_assign_pointer(nhc->nhc_exceptions, hash);
684 }
685
686 hash += hval;
687
688 depth = 0;
689 for (fnhe = rcu_dereference(hash->chain); fnhe;
690 fnhe = rcu_dereference(fnhe->fnhe_next)) {
691 if (fnhe->fnhe_daddr == daddr)
692 break;
693 depth++;
694 }
695
696 if (fnhe) {
697 if (fnhe->fnhe_genid != genid)
698 fnhe->fnhe_genid = genid;
699 if (gw)
700 fnhe->fnhe_gw = gw;
701 if (pmtu) {
702 fnhe->fnhe_pmtu = pmtu;
703 fnhe->fnhe_mtu_locked = lock;
704 }
705 fnhe->fnhe_expires = max(1UL, expires);
706 /* Update all cached dsts too */
707 rt = rcu_dereference(fnhe->fnhe_rth_input);
708 if (rt)
709 fill_route_from_fnhe(rt, fnhe);
710 rt = rcu_dereference(fnhe->fnhe_rth_output);
711 if (rt)
712 fill_route_from_fnhe(rt, fnhe);
713 } else {
714 /* Randomize max depth to avoid some side channels attacks. */
715 int max_depth = FNHE_RECLAIM_DEPTH +
716 prandom_u32_max(FNHE_RECLAIM_DEPTH);
717
718 while (depth > max_depth) {
719 fnhe_remove_oldest(hash);
720 depth--;
721 }
722
723 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
724 if (!fnhe)
725 goto out_unlock;
726
727 fnhe->fnhe_next = hash->chain;
728
729 fnhe->fnhe_genid = genid;
730 fnhe->fnhe_daddr = daddr;
731 fnhe->fnhe_gw = gw;
732 fnhe->fnhe_pmtu = pmtu;
733 fnhe->fnhe_mtu_locked = lock;
734 fnhe->fnhe_expires = max(1UL, expires);
735
736 rcu_assign_pointer(hash->chain, fnhe);
737
738 /* Exception created; mark the cached routes for the nexthop
739 * stale, so that anyone caching them rechecks whether this
740 * exception applies.
741 */
742 rt = rcu_dereference(nhc->nhc_rth_input);
743 if (rt)
744 rt->dst.obsolete = DST_OBSOLETE_KILL;
745
746 for_each_possible_cpu(i) {
747 struct rtable __rcu **prt;
748 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
749 rt = rcu_dereference(*prt);
750 if (rt)
751 rt->dst.obsolete = DST_OBSOLETE_KILL;
752 }
753 }
754
755 fnhe->fnhe_stamp = jiffies;
756
757out_unlock:
758 spin_unlock_bh(&fnhe_lock);
759}
760
761static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
762 bool kill_route)
763{
764 __be32 new_gw = icmp_hdr(skb)->un.gateway;
765 __be32 old_gw = ip_hdr(skb)->saddr;
766 struct net_device *dev = skb->dev;
767 struct in_device *in_dev;
768 struct fib_result res;
769 struct neighbour *n;
770 struct net *net;
771
772 switch (icmp_hdr(skb)->code & 7) {
773 case ICMP_REDIR_NET:
774 case ICMP_REDIR_NETTOS:
775 case ICMP_REDIR_HOST:
776 case ICMP_REDIR_HOSTTOS:
777 break;
778
779 default:
780 return;
781 }
782
783 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
784 return;
785
786 in_dev = __in_dev_get_rcu(dev);
787 if (!in_dev)
788 return;
789
790 net = dev_net(dev);
791 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
792 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
793 ipv4_is_zeronet(new_gw))
794 goto reject_redirect;
795
796 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
797 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
798 goto reject_redirect;
799 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
800 goto reject_redirect;
801 } else {
802 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
803 goto reject_redirect;
804 }
805
806 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
807 if (!n)
808 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
809 if (!IS_ERR(n)) {
810 if (!(n->nud_state & NUD_VALID)) {
811 neigh_event_send(n, NULL);
812 } else {
813 if (fib_lookup(net, fl4, &res, 0) == 0) {
814 struct fib_nh_common *nhc;
815
816 fib_select_path(net, &res, fl4, skb);
817 nhc = FIB_RES_NHC(res);
818 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
819 0, false,
820 jiffies + ip_rt_gc_timeout);
821 }
822 if (kill_route)
823 rt->dst.obsolete = DST_OBSOLETE_KILL;
824 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
825 }
826 neigh_release(n);
827 }
828 return;
829
830reject_redirect:
831#ifdef CONFIG_IP_ROUTE_VERBOSE
832 if (IN_DEV_LOG_MARTIANS(in_dev)) {
833 const struct iphdr *iph = (const struct iphdr *) skb->data;
834 __be32 daddr = iph->daddr;
835 __be32 saddr = iph->saddr;
836
837 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
838 " Advised path = %pI4 -> %pI4\n",
839 &old_gw, dev->name, &new_gw,
840 &saddr, &daddr);
841 }
842#endif
843 ;
844}
845
846static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
847{
848 struct rtable *rt;
849 struct flowi4 fl4;
850 const struct iphdr *iph = (const struct iphdr *) skb->data;
851 struct net *net = dev_net(skb->dev);
852 int oif = skb->dev->ifindex;
853 u8 tos = RT_TOS(iph->tos);
854 u8 prot = iph->protocol;
855 u32 mark = skb->mark;
856
857 rt = (struct rtable *) dst;
858
859 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
860 __ip_do_redirect(rt, skb, &fl4, true);
861}
862
863static void ipv4_negative_advice(struct sock *sk,
864 struct dst_entry *dst)
865{
866 struct rtable *rt = (struct rtable *)dst;
867
868 if ((dst->obsolete > 0) ||
869 (rt->rt_flags & RTCF_REDIRECTED) ||
870 rt->dst.expires)
871 sk_dst_reset(sk);
872}
873
874/*
875 * Algorithm:
876 * 1. The first ip_rt_redirect_number redirects are sent
877 * with exponential backoff, then we stop sending them at all,
878 * assuming that the host ignores our redirects.
879 * 2. If we did not see packets requiring redirects
880 * during ip_rt_redirect_silence, we assume that the host
881 * forgot the redirected route, and we start sending redirects again.
882 *
883 * This algorithm is much cheaper and more intelligent than dumb load limiting
884 * in icmp.c.
885 *
886 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
887 * and "frag. need" (breaks PMTU discovery) in icmp.c.
888 */
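/* With the defaults above, roughly: successive redirects to one peer are
 * spaced ip_rt_redirect_load << n_redirects apart (exponential backoff from
 * ~40 ms), sending stops once ip_rt_redirect_number (9) of them have been
 * ignored, and the state resets after ip_rt_redirect_silence (~20 s) with no
 * redirect-worthy traffic.
 */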
889
890void ip_rt_send_redirect(struct sk_buff *skb)
891{
892 struct rtable *rt = skb_rtable(skb);
893 struct in_device *in_dev;
894 struct inet_peer *peer;
895 struct net *net;
896 int log_martians;
897 int vif;
898
899 rcu_read_lock();
900 in_dev = __in_dev_get_rcu(rt->dst.dev);
901 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
902 rcu_read_unlock();
903 return;
904 }
905 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
906 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
907 rcu_read_unlock();
908
909 net = dev_net(rt->dst.dev);
910 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
911 if (!peer) {
912 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
913 rt_nexthop(rt, ip_hdr(skb)->daddr));
914 return;
915 }
916
917 /* No redirected packets during ip_rt_redirect_silence;
918 * reset the algorithm.
919 */
920 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
921 peer->rate_tokens = 0;
922 peer->n_redirects = 0;
923 }
924
925 /* Too many ignored redirects; do not send anything and
926 * set dst.rate_last to the last seen redirected packet.
927 */
928 if (peer->n_redirects >= ip_rt_redirect_number) {
929 peer->rate_last = jiffies;
930 goto out_put_peer;
931 }
932
933 /* Check for load limit; set rate_last to the latest sent
934 * redirect.
935 */
936 if (peer->n_redirects == 0 ||
937 time_after(jiffies,
938 (peer->rate_last +
939 (ip_rt_redirect_load << peer->n_redirects)))) {
940 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
941
942 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
943 peer->rate_last = jiffies;
944 ++peer->n_redirects;
945 if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
946 peer->n_redirects == ip_rt_redirect_number)
947 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
948 &ip_hdr(skb)->saddr, inet_iif(skb),
949 &ip_hdr(skb)->daddr, &gw);
950 }
951out_put_peer:
952 inet_putpeer(peer);
953}
954
955static int ip_error(struct sk_buff *skb)
956{
957 struct rtable *rt = skb_rtable(skb);
958 struct net_device *dev = skb->dev;
959 struct in_device *in_dev;
960 struct inet_peer *peer;
961 unsigned long now;
962 struct net *net;
963 bool send;
964 int code;
965
966 if (netif_is_l3_master(skb->dev)) {
967 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
968 if (!dev)
969 goto out;
970 }
971
972 in_dev = __in_dev_get_rcu(dev);
973
974 /* IP on this device is disabled. */
975 if (!in_dev)
976 goto out;
977
978 net = dev_net(rt->dst.dev);
979 if (!IN_DEV_FORWARD(in_dev)) {
980 switch (rt->dst.error) {
981 case EHOSTUNREACH:
982 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
983 break;
984
985 case ENETUNREACH:
986 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
987 break;
988 }
989 goto out;
990 }
991
992 switch (rt->dst.error) {
993 case EINVAL:
994 default:
995 goto out;
996 case EHOSTUNREACH:
997 code = ICMP_HOST_UNREACH;
998 break;
999 case ENETUNREACH:
1000 code = ICMP_NET_UNREACH;
1001 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1002 break;
1003 case EACCES:
1004 code = ICMP_PKT_FILTERED;
1005 break;
1006 }
1007
1008 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1009 l3mdev_master_ifindex(skb->dev), 1);
1010
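 /* Token bucket on the inet_peer: tokens accrue one per jiffy since the
 * last error, capped at ip_rt_error_burst (5 * HZ), and each ICMP costs
 * ip_rt_error_cost (HZ) - so with the defaults roughly one ICMP error per
 * second per source, with a burst of five.
 */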
1011 send = true;
1012 if (peer) {
1013 now = jiffies;
1014 peer->rate_tokens += now - peer->rate_last;
1015 if (peer->rate_tokens > ip_rt_error_burst)
1016 peer->rate_tokens = ip_rt_error_burst;
1017 peer->rate_last = now;
1018 if (peer->rate_tokens >= ip_rt_error_cost)
1019 peer->rate_tokens -= ip_rt_error_cost;
1020 else
1021 send = false;
1022 inet_putpeer(peer);
1023 }
1024 if (send)
1025 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1026
1027out: kfree_skb(skb);
1028 return 0;
1029}
1030
1031static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1032{
1033 struct dst_entry *dst = &rt->dst;
1034 struct net *net = dev_net(dst->dev);
1035 u32 old_mtu = ipv4_mtu(dst);
1036 struct fib_result res;
1037 bool lock = false;
1038
1039 if (ip_mtu_locked(dst))
1040 return;
1041
1042 if (old_mtu < mtu)
1043 return;
1044
1045 if (mtu < ip_rt_min_pmtu) {
1046 lock = true;
1047 mtu = min(old_mtu, ip_rt_min_pmtu);
1048 }
1049
1050 if (rt->rt_pmtu == mtu && !lock &&
1051 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1052 return;
1053
1054 rcu_read_lock();
1055 if (fib_lookup(net, fl4, &res, 0) == 0) {
1056 struct fib_nh_common *nhc;
1057
1058 fib_select_path(net, &res, fl4, NULL);
1059 nhc = FIB_RES_NHC(res);
1060 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1061 jiffies + ip_rt_mtu_expires);
1062 }
1063 rcu_read_unlock();
1064}
1065
1066static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1067 struct sk_buff *skb, u32 mtu,
1068 bool confirm_neigh)
1069{
1070 struct rtable *rt = (struct rtable *) dst;
1071 struct flowi4 fl4;
1072
1073 ip_rt_build_flow_key(&fl4, sk, skb);
1074 __ip_rt_update_pmtu(rt, &fl4, mtu);
1075}
1076
1077void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1078 int oif, u8 protocol)
1079{
1080 const struct iphdr *iph = (const struct iphdr *) skb->data;
1081 struct flowi4 fl4;
1082 struct rtable *rt;
1083 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1084
1085 __build_flow_key(net, &fl4, NULL, iph, oif,
1086 RT_TOS(iph->tos), protocol, mark, 0);
1087 rt = __ip_route_output_key(net, &fl4);
1088 if (!IS_ERR(rt)) {
1089 __ip_rt_update_pmtu(rt, &fl4, mtu);
1090 ip_rt_put(rt);
1091 }
1092}
1093EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1094
1095static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1096{
1097 const struct iphdr *iph = (const struct iphdr *) skb->data;
1098 struct flowi4 fl4;
1099 struct rtable *rt;
1100
1101 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1102
1103 if (!fl4.flowi4_mark)
1104 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1105
1106 rt = __ip_route_output_key(sock_net(sk), &fl4);
1107 if (!IS_ERR(rt)) {
1108 __ip_rt_update_pmtu(rt, &fl4, mtu);
1109 ip_rt_put(rt);
1110 }
1111}
1112
1113void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1114{
1115 const struct iphdr *iph = (const struct iphdr *) skb->data;
1116 struct flowi4 fl4;
1117 struct rtable *rt;
1118 struct dst_entry *odst = NULL;
1119 bool new = false;
1120 struct net *net = sock_net(sk);
1121
1122 bh_lock_sock(sk);
1123
1124 if (!ip_sk_accept_pmtu(sk))
1125 goto out;
1126
1127 odst = sk_dst_get(sk);
1128
1129 if (sock_owned_by_user(sk) || !odst) {
1130 __ipv4_sk_update_pmtu(skb, sk, mtu);
1131 goto out;
1132 }
1133
1134 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1135
1136 rt = (struct rtable *)odst;
1137 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1138 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1139 if (IS_ERR(rt))
1140 goto out;
1141
1142 new = true;
1143 }
1144
1145 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1146
1147 if (!dst_check(&rt->dst, 0)) {
1148 if (new)
1149 dst_release(&rt->dst);
1150
1151 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1152 if (IS_ERR(rt))
1153 goto out;
1154
1155 new = true;
1156 }
1157
1158 if (new)
1159 sk_dst_set(sk, &rt->dst);
1160
1161out:
1162 bh_unlock_sock(sk);
1163 dst_release(odst);
1164}
1165EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1166
1167void ipv4_redirect(struct sk_buff *skb, struct net *net,
1168 int oif, u8 protocol)
1169{
1170 const struct iphdr *iph = (const struct iphdr *) skb->data;
1171 struct flowi4 fl4;
1172 struct rtable *rt;
1173
1174 __build_flow_key(net, &fl4, NULL, iph, oif,
1175 RT_TOS(iph->tos), protocol, 0, 0);
1176 rt = __ip_route_output_key(net, &fl4);
1177 if (!IS_ERR(rt)) {
1178 __ip_do_redirect(rt, skb, &fl4, false);
1179 ip_rt_put(rt);
1180 }
1181}
1182EXPORT_SYMBOL_GPL(ipv4_redirect);
1183
1184void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1185{
1186 const struct iphdr *iph = (const struct iphdr *) skb->data;
1187 struct flowi4 fl4;
1188 struct rtable *rt;
1189 struct net *net = sock_net(sk);
1190
1191 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1192 rt = __ip_route_output_key(net, &fl4);
1193 if (!IS_ERR(rt)) {
1194 __ip_do_redirect(rt, skb, &fl4, false);
1195 ip_rt_put(rt);
1196 }
1197}
1198EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1199
1200static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1201{
1202 struct rtable *rt = (struct rtable *) dst;
1203
1204 /* All IPV4 dsts are created with ->obsolete set to the value
1205 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1206 * into this function always.
1207 *
1208 * When a PMTU/redirect information update invalidates a route,
1209 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1210 * DST_OBSOLETE_DEAD.
1211 */
1212 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1213 return NULL;
1214 return dst;
1215}
1216
1217static void ipv4_send_dest_unreach(struct sk_buff *skb)
1218{
1219 struct net_device *dev;
1220 struct ip_options opt;
1221 int res;
1222
1223 /* Recompile ip options since IPCB may not be valid anymore.
1224 * Also check we have a reasonable ipv4 header.
1225 */
1226 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1227 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1228 return;
1229
1230 memset(&opt, 0, sizeof(opt));
1231 if (ip_hdr(skb)->ihl > 5) {
1232 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1233 return;
1234 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1235
1236 rcu_read_lock();
1237 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1238 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1239 rcu_read_unlock();
1240
1241 if (res)
1242 return;
1243 }
1244 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1245}
1246
1247static void ipv4_link_failure(struct sk_buff *skb)
1248{
1249 struct rtable *rt;
1250
1251 ipv4_send_dest_unreach(skb);
1252
1253 rt = skb_rtable(skb);
1254 if (rt)
1255 dst_set_expires(&rt->dst, 0);
1256}
1257
1258static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1259{
1260 pr_debug("%s: %pI4 -> %pI4, %s\n",
1261 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1262 skb->dev ? skb->dev->name : "?");
1263 kfree_skb(skb);
1264 WARN_ON(1);
1265 return 0;
1266}
1267
1268/*
1269 We do not cache the source address of the outgoing interface,
1270 because it is used only by the IP RR, TS and SRR options,
1271 so it is out of the fast path.
1272
1273 BTW remember: "addr" is allowed to be unaligned
1274 in IP options!
1275 */
1276
1277void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1278{
1279 __be32 src;
1280
1281 if (rt_is_output_route(rt))
1282 src = ip_hdr(skb)->saddr;
1283 else {
1284 struct fib_result res;
1285 struct iphdr *iph = ip_hdr(skb);
1286 struct flowi4 fl4 = {
1287 .daddr = iph->daddr,
1288 .saddr = iph->saddr,
1289 .flowi4_tos = iph->tos & IPTOS_RT_MASK,
1290 .flowi4_oif = rt->dst.dev->ifindex,
1291 .flowi4_iif = skb->dev->ifindex,
1292 .flowi4_mark = skb->mark,
1293 };
1294
1295 rcu_read_lock();
1296 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1297 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1298 else
1299 src = inet_select_addr(rt->dst.dev,
1300 rt_nexthop(rt, iph->daddr),
1301 RT_SCOPE_UNIVERSE);
1302 rcu_read_unlock();
1303 }
1304 memcpy(addr, &src, 4);
1305}
1306
1307#ifdef CONFIG_IP_ROUTE_CLASSID
1308static void set_class_tag(struct rtable *rt, u32 tag)
1309{
1310 if (!(rt->dst.tclassid & 0xFFFF))
1311 rt->dst.tclassid |= tag & 0xFFFF;
1312 if (!(rt->dst.tclassid & 0xFFFF0000))
1313 rt->dst.tclassid |= tag & 0xFFFF0000;
1314}
1315#endif
1316
1317static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1318{
1319 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1320 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1321 ip_rt_min_advmss);
1322
1323 return min(advmss, IPV4_MAX_PMTU - header_size);
1324}
1325
1326static unsigned int ipv4_mtu(const struct dst_entry *dst)
1327{
1328 const struct rtable *rt = (const struct rtable *) dst;
1329 unsigned int mtu = rt->rt_pmtu;
1330
1331 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1332 mtu = dst_metric_raw(dst, RTAX_MTU);
1333
1334 if (mtu)
1335 goto out;
1336
1337 mtu = READ_ONCE(dst->dev->mtu);
1338
1339 if (unlikely(ip_mtu_locked(dst))) {
1340 if (rt->rt_uses_gateway && mtu > 576)
1341 mtu = 576;
1342 }
1343
1344out:
1345 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1346
1347 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1348}
1349
1350static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1351{
1352 struct fnhe_hash_bucket *hash;
1353 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1354 u32 hval = fnhe_hashfun(daddr);
1355
1356 spin_lock_bh(&fnhe_lock);
1357
1358 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1359 lockdep_is_held(&fnhe_lock));
1360 hash += hval;
1361
1362 fnhe_p = &hash->chain;
1363 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1364 while (fnhe) {
1365 if (fnhe->fnhe_daddr == daddr) {
1366 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1367 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1368 /* set fnhe_daddr to 0 to ensure it won't bind with
1369 * new dsts in rt_bind_exception().
1370 */
1371 fnhe->fnhe_daddr = 0;
1372 fnhe_flush_routes(fnhe);
1373 kfree_rcu(fnhe, rcu);
1374 break;
1375 }
1376 fnhe_p = &fnhe->fnhe_next;
1377 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1378 lockdep_is_held(&fnhe_lock));
1379 }
1380
1381 spin_unlock_bh(&fnhe_lock);
1382}
1383
1384static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1385 __be32 daddr)
1386{
1387 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1388 struct fib_nh_exception *fnhe;
1389 u32 hval;
1390
1391 if (!hash)
1392 return NULL;
1393
1394 hval = fnhe_hashfun(daddr);
1395
1396 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1397 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1398 if (fnhe->fnhe_daddr == daddr) {
1399 if (fnhe->fnhe_expires &&
1400 time_after(jiffies, fnhe->fnhe_expires)) {
1401 ip_del_fnhe(nhc, daddr);
1402 break;
1403 }
1404 return fnhe;
1405 }
1406 }
1407 return NULL;
1408}
1409
1410/* MTU selection:
1411 * 1. mtu on route is locked - use it
1412 * 2. mtu from nexthop exception
1413 * 3. mtu from egress device
1414 */
1415
1416u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1417{
1418 struct fib_nh_common *nhc = res->nhc;
1419 struct net_device *dev = nhc->nhc_dev;
1420 struct fib_info *fi = res->fi;
1421 u32 mtu = 0;
1422
1423 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1424 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1425 mtu = fi->fib_mtu;
1426
1427 if (likely(!mtu)) {
1428 struct fib_nh_exception *fnhe;
1429
1430 fnhe = find_exception(nhc, daddr);
1431 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1432 mtu = fnhe->fnhe_pmtu;
1433 }
1434
1435 if (likely(!mtu))
1436 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1437
1438 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1439}
1440
1441static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1442 __be32 daddr, const bool do_cache)
1443{
1444 bool ret = false;
1445
1446 spin_lock_bh(&fnhe_lock);
1447
1448 if (daddr == fnhe->fnhe_daddr) {
1449 struct rtable __rcu **porig;
1450 struct rtable *orig;
1451 int genid = fnhe_genid(dev_net(rt->dst.dev));
1452
1453 if (rt_is_input_route(rt))
1454 porig = &fnhe->fnhe_rth_input;
1455 else
1456 porig = &fnhe->fnhe_rth_output;
1457 orig = rcu_dereference(*porig);
1458
1459 if (fnhe->fnhe_genid != genid) {
1460 fnhe->fnhe_genid = genid;
1461 fnhe->fnhe_gw = 0;
1462 fnhe->fnhe_pmtu = 0;
1463 fnhe->fnhe_expires = 0;
1464 fnhe->fnhe_mtu_locked = false;
1465 fnhe_flush_routes(fnhe);
1466 orig = NULL;
1467 }
1468 fill_route_from_fnhe(rt, fnhe);
1469 if (!rt->rt_gw4) {
1470 rt->rt_gw4 = daddr;
1471 rt->rt_gw_family = AF_INET;
1472 }
1473
1474 if (do_cache) {
1475 dst_hold(&rt->dst);
1476 rcu_assign_pointer(*porig, rt);
1477 if (orig) {
1478 dst_dev_put(&orig->dst);
1479 dst_release(&orig->dst);
1480 }
1481 ret = true;
1482 }
1483
1484 fnhe->fnhe_stamp = jiffies;
1485 }
1486 spin_unlock_bh(&fnhe_lock);
1487
1488 return ret;
1489}
1490
1491static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1492{
1493 struct rtable *orig, *prev, **p;
1494 bool ret = true;
1495
1496 if (rt_is_input_route(rt)) {
1497 p = (struct rtable **)&nhc->nhc_rth_input;
1498 } else {
1499 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1500 }
1501 orig = *p;
1502
1503 /* hold dst before doing cmpxchg() to avoid race condition
1504 * on this dst
1505 */
1506 dst_hold(&rt->dst);
1507 prev = cmpxchg(p, orig, rt);
1508 if (prev == orig) {
1509 if (orig) {
1510 rt_add_uncached_list(orig);
1511 dst_release(&orig->dst);
1512 }
1513 } else {
1514 dst_release(&rt->dst);
1515 ret = false;
1516 }
1517
1518 return ret;
1519}
1520
1521struct uncached_list {
1522 spinlock_t lock;
1523 struct list_head head;
1524};
1525
1526static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1527
1528void rt_add_uncached_list(struct rtable *rt)
1529{
1530 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1531
1532 rt->rt_uncached_list = ul;
1533
1534 spin_lock_bh(&ul->lock);
1535 list_add_tail(&rt->rt_uncached, &ul->head);
1536 spin_unlock_bh(&ul->lock);
1537}
1538
1539void rt_del_uncached_list(struct rtable *rt)
1540{
1541 if (!list_empty(&rt->rt_uncached)) {
1542 struct uncached_list *ul = rt->rt_uncached_list;
1543
1544 spin_lock_bh(&ul->lock);
1545 list_del(&rt->rt_uncached);
1546 spin_unlock_bh(&ul->lock);
1547 }
1548}
1549
1550static void ipv4_dst_destroy(struct dst_entry *dst)
1551{
1552 struct rtable *rt = (struct rtable *)dst;
1553
1554 ip_dst_metrics_put(dst);
1555 rt_del_uncached_list(rt);
1556}
1557
1558void rt_flush_dev(struct net_device *dev)
1559{
1560 struct rtable *rt;
1561 int cpu;
1562
1563 for_each_possible_cpu(cpu) {
1564 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1565
1566 spin_lock_bh(&ul->lock);
1567 list_for_each_entry(rt, &ul->head, rt_uncached) {
1568 if (rt->dst.dev != dev)
1569 continue;
1570 rt->dst.dev = blackhole_netdev;
1571 dev_hold(rt->dst.dev);
1572 dev_put(dev);
1573 }
1574 spin_unlock_bh(&ul->lock);
1575 }
1576}
1577
1578static bool rt_cache_valid(const struct rtable *rt)
1579{
1580 return rt &&
1581 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1582 !rt_is_expired(rt);
1583}
1584
1585static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1586 const struct fib_result *res,
1587 struct fib_nh_exception *fnhe,
1588 struct fib_info *fi, u16 type, u32 itag,
1589 const bool do_cache)
1590{
1591 bool cached = false;
1592
1593 if (fi) {
1594 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1595
1596 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1597 rt->rt_uses_gateway = 1;
1598 rt->rt_gw_family = nhc->nhc_gw_family;
1599 /* only INET and INET6 are supported */
1600 if (likely(nhc->nhc_gw_family == AF_INET))
1601 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1602 else
1603 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1604 }
1605
1606 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1607
1608#ifdef CONFIG_IP_ROUTE_CLASSID
1609 if (nhc->nhc_family == AF_INET) {
1610 struct fib_nh *nh;
1611
1612 nh = container_of(nhc, struct fib_nh, nh_common);
1613 rt->dst.tclassid = nh->nh_tclassid;
1614 }
1615#endif
1616 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1617 if (unlikely(fnhe))
1618 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1619 else if (do_cache)
1620 cached = rt_cache_route(nhc, rt);
1621 if (unlikely(!cached)) {
1622 /* Routes we intend to cache in nexthop exception or
1623 * FIB nexthop have the DST_NOCACHE bit clear.
1624 * However, if we are unsuccessful at storing this
1625 * route into the cache we really need to set it.
1626 */
1627 if (!rt->rt_gw4) {
1628 rt->rt_gw_family = AF_INET;
1629 rt->rt_gw4 = daddr;
1630 }
1631 rt_add_uncached_list(rt);
1632 }
1633 } else
1634 rt_add_uncached_list(rt);
1635
1636#ifdef CONFIG_IP_ROUTE_CLASSID
1637#ifdef CONFIG_IP_MULTIPLE_TABLES
1638 set_class_tag(rt, res->tclassid);
1639#endif
1640 set_class_tag(rt, itag);
1641#endif
1642}
1643
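/* Allocate a dst/rtable pair and fill in the generic fields.  ->output is
 * set to ip_output(), and ->input to ip_local_deliver() when RTCF_LOCAL is
 * set; callers fill in the rest (gateway, metrics, caching), typically via
 * rt_set_nexthop().
 */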
1644struct rtable *rt_dst_alloc(struct net_device *dev,
1645 unsigned int flags, u16 type,
1646 bool nopolicy, bool noxfrm, bool will_cache)
1647{
1648 struct rtable *rt;
1649
1650 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1651 (will_cache ? 0 : DST_HOST) |
1652 (nopolicy ? DST_NOPOLICY : 0) |
1653 (noxfrm ? DST_NOXFRM : 0));
1654
1655 if (rt) {
1656 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1657 rt->rt_flags = flags;
1658 rt->rt_type = type;
1659 rt->rt_is_input = 0;
1660 rt->rt_iif = 0;
1661 rt->rt_pmtu = 0;
1662 rt->rt_mtu_locked = 0;
1663 rt->rt_uses_gateway = 0;
1664 rt->rt_gw_family = 0;
1665 rt->rt_gw4 = 0;
1666 INIT_LIST_HEAD(&rt->rt_uncached);
1667
1668 rt->dst.output = ip_output;
1669 if (flags & RTCF_LOCAL)
1670 rt->dst.input = ip_local_deliver;
1671 }
1672
1673 return rt;
1674}
1675EXPORT_SYMBOL(rt_dst_alloc);
1676
1677struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1678{
1679 struct rtable *new_rt;
1680
1681 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1682 rt->dst.flags);
1683
1684 if (new_rt) {
1685 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1686 new_rt->rt_flags = rt->rt_flags;
1687 new_rt->rt_type = rt->rt_type;
1688 new_rt->rt_is_input = rt->rt_is_input;
1689 new_rt->rt_iif = rt->rt_iif;
1690 new_rt->rt_pmtu = rt->rt_pmtu;
1691 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1692 new_rt->rt_gw_family = rt->rt_gw_family;
1693 if (rt->rt_gw_family == AF_INET)
1694 new_rt->rt_gw4 = rt->rt_gw4;
1695 else if (rt->rt_gw_family == AF_INET6)
1696 new_rt->rt_gw6 = rt->rt_gw6;
1697 INIT_LIST_HEAD(&new_rt->rt_uncached);
1698
1699 new_rt->dst.flags |= DST_HOST;
1700 new_rt->dst.input = rt->dst.input;
1701 new_rt->dst.output = rt->dst.output;
1702 new_rt->dst.error = rt->dst.error;
1703 new_rt->dst.lastuse = jiffies;
1704 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1705 }
1706 return new_rt;
1707}
1708EXPORT_SYMBOL(rt_dst_clone);
1709
1710/* called in rcu_read_lock() section */
1711int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712 u8 tos, struct net_device *dev,
1713 struct in_device *in_dev, u32 *itag)
1714{
1715 int err;
1716
1717 /* Primary sanity checks. */
1718 if (!in_dev)
1719 return -EINVAL;
1720
1721 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1722 skb->protocol != htons(ETH_P_IP))
1723 return -EINVAL;
1724
1725 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1726 return -EINVAL;
1727
1728 if (ipv4_is_zeronet(saddr)) {
1729 if (!ipv4_is_local_multicast(daddr) &&
1730 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1731 return -EINVAL;
1732 } else {
1733 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1734 in_dev, itag);
1735 if (err < 0)
1736 return err;
1737 }
1738 return 0;
1739}
1740
1741/* called in rcu_read_lock() section */
1742static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1743 u8 tos, struct net_device *dev, int our)
1744{
1745 struct in_device *in_dev = __in_dev_get_rcu(dev);
1746 unsigned int flags = RTCF_MULTICAST;
1747 struct rtable *rth;
1748 u32 itag = 0;
1749 int err;
1750
1751 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1752 if (err)
1753 return err;
1754
1755 if (our)
1756 flags |= RTCF_LOCAL;
1757
1758 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1759 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1760 if (!rth)
1761 return -ENOBUFS;
1762
1763#ifdef CONFIG_IP_ROUTE_CLASSID
1764 rth->dst.tclassid = itag;
1765#endif
1766 rth->dst.output = ip_rt_bug;
1767 rth->rt_is_input= 1;
1768
1769#ifdef CONFIG_IP_MROUTE
1770 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1771 rth->dst.input = ip_mr_input;
1772#endif
1773 RT_CACHE_STAT_INC(in_slow_mc);
1774
1775 skb_dst_drop(skb);
1776 skb_dst_set(skb, &rth->dst);
1777 return 0;
1778}
1779
1780
1781static void ip_handle_martian_source(struct net_device *dev,
1782 struct in_device *in_dev,
1783 struct sk_buff *skb,
1784 __be32 daddr,
1785 __be32 saddr)
1786{
1787 RT_CACHE_STAT_INC(in_martian_src);
1788#ifdef CONFIG_IP_ROUTE_VERBOSE
1789 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1790 /*
1791 * RFC1812 recommendation: if the source is martian,
1792 * the only hint is the MAC header.
1793 */
1794 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1795 &daddr, &saddr, dev->name);
1796 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1797 print_hex_dump(KERN_WARNING, "ll header: ",
1798 DUMP_PREFIX_OFFSET, 16, 1,
1799 skb_mac_header(skb),
1800 dev->hard_header_len, false);
1801 }
1802 }
1803#endif
1804}
1805
1806/* called in rcu_read_lock() section */
1807static int __mkroute_input(struct sk_buff *skb,
1808 const struct fib_result *res,
1809 struct in_device *in_dev,
1810 __be32 daddr, __be32 saddr, u32 tos)
1811{
1812 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1813 struct net_device *dev = nhc->nhc_dev;
1814 struct fib_nh_exception *fnhe;
1815 struct rtable *rth;
1816 int err;
1817 struct in_device *out_dev;
1818 bool do_cache;
1819 u32 itag = 0;
1820
1821 /* get a working reference to the output device */
1822 out_dev = __in_dev_get_rcu(dev);
1823 if (!out_dev) {
1824 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1825 return -EINVAL;
1826 }
1827
1828 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1829 in_dev->dev, in_dev, &itag);
1830 if (err < 0) {
1831 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1832 saddr);
1833
1834 goto cleanup;
1835 }
1836
1837 do_cache = res->fi && !itag;
1838 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1839 skb->protocol == htons(ETH_P_IP)) {
1840 __be32 gw;
1841
1842 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1843 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1844 inet_addr_onlink(out_dev, saddr, gw))
1845 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1846 }
1847
1848 if (skb->protocol != htons(ETH_P_IP)) {
1849 /* Not IP (i.e. ARP). Do not create a route if it is
1850 * invalid for proxy arp. DNAT routes are always valid.
1851 *
1852 * The proxy arp feature has been extended to allow ARP
1853 * replies back out the same interface, to support
1854 * Private VLAN switch technologies. See arp.c.
1855 */
1856 if (out_dev == in_dev &&
1857 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1858 err = -EINVAL;
1859 goto cleanup;
1860 }
1861 }
1862
1863 fnhe = find_exception(nhc, daddr);
1864 if (do_cache) {
1865 if (fnhe)
1866 rth = rcu_dereference(fnhe->fnhe_rth_input);
1867 else
1868 rth = rcu_dereference(nhc->nhc_rth_input);
1869 if (rt_cache_valid(rth)) {
1870 skb_dst_set_noref(skb, &rth->dst);
1871 goto out;
1872 }
1873 }
1874
1875 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1876 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1877 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1878 if (!rth) {
1879 err = -ENOBUFS;
1880 goto cleanup;
1881 }
1882
1883 rth->rt_is_input = 1;
1884 RT_CACHE_STAT_INC(in_slow_tot);
1885
1886 rth->dst.input = ip_forward;
1887
1888 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1889 do_cache);
1890 lwtunnel_set_redirect(&rth->dst);
1891 skb_dst_set(skb, &rth->dst);
1892out:
1893 err = 0;
1894 cleanup:
1895 return err;
1896}
1897
1898#ifdef CONFIG_IP_ROUTE_MULTIPATH
1899/* To make ICMP packets follow the right flow, the multipath hash is
1900 * calculated from the inner IP addresses.
1901 */
1902static void ip_multipath_l3_keys(const struct sk_buff *skb,
1903 struct flow_keys *hash_keys)
1904{
1905 const struct iphdr *outer_iph = ip_hdr(skb);
1906 const struct iphdr *key_iph = outer_iph;
1907 const struct iphdr *inner_iph;
1908 const struct icmphdr *icmph;
1909 struct iphdr _inner_iph;
1910 struct icmphdr _icmph;
1911
1912 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1913 goto out;
1914
1915 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1916 goto out;
1917
1918 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1919 &_icmph);
1920 if (!icmph)
1921 goto out;
1922
1923 if (icmph->type != ICMP_DEST_UNREACH &&
1924 icmph->type != ICMP_REDIRECT &&
1925 icmph->type != ICMP_TIME_EXCEEDED &&
1926 icmph->type != ICMP_PARAMETERPROB)
1927 goto out;
1928
1929 inner_iph = skb_header_pointer(skb,
1930 outer_iph->ihl * 4 + sizeof(_icmph),
1931 sizeof(_inner_iph), &_inner_iph);
1932 if (!inner_iph)
1933 goto out;
1934
1935 key_iph = inner_iph;
1936out:
1937 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1938 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1939}
1940
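/* net.ipv4.fib_multipath_hash_policy selects what feeds the hash:
 *   0 - L3 addresses (ICMP errors are hashed on the inner/offending header),
 *   1 - L4 five-tuple,
 *   2 - L3 of the inner packet for encapsulated traffic, outer L3 otherwise.
 */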
1941/* if skb is set it will be used and fl4 can be NULL */
1942int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1943 const struct sk_buff *skb, struct flow_keys *flkeys)
1944{
1945 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1946 struct flow_keys hash_keys;
1947 u32 mhash;
1948
1949 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1950 case 0:
1951 memset(&hash_keys, 0, sizeof(hash_keys));
1952 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1953 if (skb) {
1954 ip_multipath_l3_keys(skb, &hash_keys);
1955 } else {
1956 hash_keys.addrs.v4addrs.src = fl4->saddr;
1957 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1958 }
1959 break;
1960 case 1:
1961 /* skb is currently provided only when forwarding */
1962 if (skb) {
1963 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1964 struct flow_keys keys;
1965
1966 /* short-circuit if we already have L4 hash present */
1967 if (skb->l4_hash)
1968 return skb_get_hash_raw(skb) >> 1;
1969
1970 memset(&hash_keys, 0, sizeof(hash_keys));
1971
1972 if (!flkeys) {
1973 skb_flow_dissect_flow_keys(skb, &keys, flag);
1974 flkeys = &keys;
1975 }
1976
1977 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1979 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1980 hash_keys.ports.src = flkeys->ports.src;
1981 hash_keys.ports.dst = flkeys->ports.dst;
1982 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1983 } else {
1984 memset(&hash_keys, 0, sizeof(hash_keys));
1985 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1986 hash_keys.addrs.v4addrs.src = fl4->saddr;
1987 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1988 hash_keys.ports.src = fl4->fl4_sport;
1989 hash_keys.ports.dst = fl4->fl4_dport;
1990 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1991 }
1992 break;
1993 case 2:
1994 memset(&hash_keys, 0, sizeof(hash_keys));
1995 /* skb is currently provided only when forwarding */
1996 if (skb) {
1997 struct flow_keys keys;
1998
1999 skb_flow_dissect_flow_keys(skb, &keys, 0);
2000 /* Inner can be v4 or v6 */
2001 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2002 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2004 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2005 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2006 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2007 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2008 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2009 hash_keys.tags.flow_label = keys.tags.flow_label;
2010 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2011 } else {
2012 /* Same as case 0 */
2013 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2014 ip_multipath_l3_keys(skb, &hash_keys);
2015 }
2016 } else {
2017 /* Same as case 0 */
2018 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2019 hash_keys.addrs.v4addrs.src = fl4->saddr;
2020 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2021 }
2022 break;
2023 }
2024 mhash = flow_hash_from_keys(&hash_keys);
2025
2026 if (multipath_hash)
2027 mhash = jhash_2words(mhash, multipath_hash, 0);
2028
2029 return mhash >> 1;
2030}
2031#endif /* CONFIG_IP_ROUTE_MULTIPATH */
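
/* Illustrative sketch, not part of route.c: how a multipath-aware caller
 * is expected to combine fib_multipath_hash() with fib_select_multipath().
 * With a NULL skb the hash is derived from the flowi4 key (addresses, and
 * ports/protocol under hash policy 1); with an skb the packet headers are
 * dissected instead. The helper name below is an assumption and the block
 * is kept under "#if 0" so it is never built.
 */
#if 0
static void example_select_nexthop(struct net *net, struct flowi4 *fl4,
				   struct fib_result *res)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(net, fl4, NULL, NULL);

		fib_select_multipath(res, h);
	}
#endif
}
#endif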
2032
2033static int ip_mkroute_input(struct sk_buff *skb,
2034 struct fib_result *res,
2035 struct in_device *in_dev,
2036 __be32 daddr, __be32 saddr, u32 tos,
2037 struct flow_keys *hkeys)
2038{
2039#ifdef CONFIG_IP_ROUTE_MULTIPATH
2040 if (res->fi && fib_info_num_path(res->fi) > 1) {
2041 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2042
2043 fib_select_multipath(res, h);
2044 }
2045#endif
2046
2047 /* create a routing cache entry */
2048 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2049}
2050
2051/*
2052 * NOTE. We drop all packets that have a local source
2053 * address, because every properly looped-back packet
2054 * must already have the correct destination attached by the output routine.
2055 *
2056 * This approach solves two big problems:
2057 * 1. Non-simplex devices are handled properly.
2058 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2059 * called with rcu_read_lock()
2060 */
2061
2062static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2063 u8 tos, struct net_device *dev,
2064 struct fib_result *res)
2065{
2066 struct in_device *in_dev = __in_dev_get_rcu(dev);
2067 struct flow_keys *flkeys = NULL, _flkeys;
2068 struct net *net = dev_net(dev);
2069 struct ip_tunnel_info *tun_info;
2070 int err = -EINVAL;
2071 unsigned int flags = 0;
2072 u32 itag = 0;
2073 struct rtable *rth;
2074 struct flowi4 fl4;
2075 bool do_cache = true;
2076
2077 /* IP on this device is disabled. */
2078
2079 if (!in_dev)
2080 goto out;
2081
2082 /* Check for the most weird martians, which cannot be detected
2083 by fib_lookup.
2084 */
2085
2086 tun_info = skb_tunnel_info(skb);
2087 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2088 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2089 else
2090 fl4.flowi4_tun_key.tun_id = 0;
2091 skb_dst_drop(skb);
2092
2093 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2094 goto martian_source;
2095
2096 res->fi = NULL;
2097 res->table = NULL;
2098 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2099 goto brd_input;
2100
2101 /* Accept zero addresses only for limited broadcast;
2102 * I do not even know whether to fix this or not. Waiting for complaints :-)
2103 */
2104 if (ipv4_is_zeronet(saddr))
2105 goto martian_source;
2106
2107 if (ipv4_is_zeronet(daddr))
2108 goto martian_destination;
2109
2110 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2111 * and to call it at most once when daddr and/or saddr are loopback addresses
2112 */
2113 if (ipv4_is_loopback(daddr)) {
2114 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2115 goto martian_destination;
2116 } else if (ipv4_is_loopback(saddr)) {
2117 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2118 goto martian_source;
2119 }
2120
2121 /*
2122 * Now we are ready to route packet.
2123 */
2124 fl4.flowi4_oif = 0;
2125 fl4.flowi4_iif = dev->ifindex;
2126 fl4.flowi4_mark = skb->mark;
2127 fl4.flowi4_tos = tos;
2128 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2129 fl4.flowi4_flags = 0;
2130 fl4.daddr = daddr;
2131 fl4.saddr = saddr;
2132 fl4.flowi4_uid = sock_net_uid(net, NULL);
2133 fl4.flowi4_multipath_hash = 0;
2134
2135 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2136 flkeys = &_flkeys;
2137 } else {
2138 fl4.flowi4_proto = 0;
2139 fl4.fl4_sport = 0;
2140 fl4.fl4_dport = 0;
2141 }
2142
2143 err = fib_lookup(net, &fl4, res, 0);
2144 if (err != 0) {
2145 if (!IN_DEV_FORWARD(in_dev))
2146 err = -EHOSTUNREACH;
2147 goto no_route;
2148 }
2149
2150 if (res->type == RTN_BROADCAST) {
2151 if (IN_DEV_BFORWARD(in_dev))
2152 goto make_route;
2153 /* do not cache if bc_forwarding is enabled */
2154 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2155 do_cache = false;
2156 goto brd_input;
2157 }
2158
2159 if (res->type == RTN_LOCAL) {
2160 err = fib_validate_source(skb, saddr, daddr, tos,
2161 0, dev, in_dev, &itag);
2162 if (err < 0)
2163 goto martian_source;
2164 goto local_input;
2165 }
2166
2167 if (!IN_DEV_FORWARD(in_dev)) {
2168 err = -EHOSTUNREACH;
2169 goto no_route;
2170 }
2171 if (res->type != RTN_UNICAST)
2172 goto martian_destination;
2173
2174make_route:
2175 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2176out: return err;
2177
2178brd_input:
2179 if (skb->protocol != htons(ETH_P_IP))
2180 goto e_inval;
2181
2182 if (!ipv4_is_zeronet(saddr)) {
2183 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2184 in_dev, &itag);
2185 if (err < 0)
2186 goto martian_source;
2187 }
2188 flags |= RTCF_BROADCAST;
2189 res->type = RTN_BROADCAST;
2190 RT_CACHE_STAT_INC(in_brd);
2191
2192local_input:
2193 do_cache &= res->fi && !itag;
2194 if (do_cache) {
2195 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2196
2197 rth = rcu_dereference(nhc->nhc_rth_input);
2198 if (rt_cache_valid(rth)) {
2199 skb_dst_set_noref(skb, &rth->dst);
2200 err = 0;
2201 goto out;
2202 }
2203 }
2204
2205 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2206 flags | RTCF_LOCAL, res->type,
2207 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2208 if (!rth)
2209 goto e_nobufs;
2210
2211 rth->dst.output = ip_rt_bug;
2212#ifdef CONFIG_IP_ROUTE_CLASSID
2213 rth->dst.tclassid = itag;
2214#endif
2215 rth->rt_is_input = 1;
2216
2217 RT_CACHE_STAT_INC(in_slow_tot);
2218 if (res->type == RTN_UNREACHABLE) {
2219 rth->dst.input = ip_error;
2220 rth->dst.error = -err;
2221 rth->rt_flags &= ~RTCF_LOCAL;
2222 }
2223
2224 if (do_cache) {
2225 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2226
2227 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2228 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2229 WARN_ON(rth->dst.input == lwtunnel_input);
2230 rth->dst.lwtstate->orig_input = rth->dst.input;
2231 rth->dst.input = lwtunnel_input;
2232 }
2233
2234 if (unlikely(!rt_cache_route(nhc, rth)))
2235 rt_add_uncached_list(rth);
2236 }
2237 skb_dst_set(skb, &rth->dst);
2238 err = 0;
2239 goto out;
2240
2241no_route:
2242 RT_CACHE_STAT_INC(in_no_route);
2243 res->type = RTN_UNREACHABLE;
2244 res->fi = NULL;
2245 res->table = NULL;
2246 goto local_input;
2247
2248 /*
2249 * Do not cache martian addresses: they should be logged (RFC1812)
2250 */
2251martian_destination:
2252 RT_CACHE_STAT_INC(in_martian_dst);
2253#ifdef CONFIG_IP_ROUTE_VERBOSE
2254 if (IN_DEV_LOG_MARTIANS(in_dev))
2255 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2256 &daddr, &saddr, dev->name);
2257#endif
2258
2259e_inval:
2260 err = -EINVAL;
2261 goto out;
2262
2263e_nobufs:
2264 err = -ENOBUFS;
2265 goto out;
2266
2267martian_source:
2268 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2269 goto out;
2270}
2271
2272int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2273 u8 tos, struct net_device *dev)
2274{
2275 struct fib_result res;
2276 int err;
2277
2278 tos &= IPTOS_RT_MASK;
2279 rcu_read_lock();
2280 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2281 rcu_read_unlock();
2282
2283 return err;
2284}
2285EXPORT_SYMBOL(ip_route_input_noref);
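
/* Illustrative sketch, not part of route.c: the typical input-path use of
 * ip_route_input_noref() as seen from a receive handler, in the spirit of
 * ip_rcv_finish(). The daddr/saddr/tos arguments come straight from the IP
 * header of the freshly received skb, and on success a noref dst is attached
 * to the skb. The helper name is an assumption; kept under "#if 0" so it is
 * never built.
 */
#if 0
static int example_route_incoming(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}
#endif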
2286
2287/* called with rcu_read_lock held */
2288int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2289 u8 tos, struct net_device *dev, struct fib_result *res)
2290{
2291 /* Multicast recognition logic was moved from the route cache to here.
2292 The problem was that too many Ethernet cards have broken/missing
2293 hardware multicast filters :-( As a result, a host on a multicast
2294 network acquires a lot of useless route cache entries, e.g. for
2295 SDR messages from all over the world. Now we try to get rid of them.
2296 Really, provided the software IP multicast filter is organized
2297 reasonably (at least, hashed), it does not result in a slowdown
2298 compared with route cache reject entries.
2299 Note that multicast routers are not affected, because a
2300 route cache entry is created eventually.
2301 */
2302 if (ipv4_is_multicast(daddr)) {
2303 struct in_device *in_dev = __in_dev_get_rcu(dev);
2304 int our = 0;
2305 int err = -EINVAL;
2306
2307 if (!in_dev)
2308 return err;
2309 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2310 ip_hdr(skb)->protocol);
2311
2312 /* check l3 master if no match yet */
2313 if (!our && netif_is_l3_slave(dev)) {
2314 struct in_device *l3_in_dev;
2315
2316 l3_in_dev = __in_dev_get_rcu(skb->dev);
2317 if (l3_in_dev)
2318 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2319 ip_hdr(skb)->protocol);
2320 }
2321
2322 if (our
2323#ifdef CONFIG_IP_MROUTE
2324 ||
2325 (!ipv4_is_local_multicast(daddr) &&
2326 IN_DEV_MFORWARD(in_dev))
2327#endif
2328 ) {
2329 err = ip_route_input_mc(skb, daddr, saddr,
2330 tos, dev, our);
2331 }
2332 return err;
2333 }
2334
2335 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2336}
2337
2338/* called with rcu_read_lock() */
2339static struct rtable *__mkroute_output(const struct fib_result *res,
2340 const struct flowi4 *fl4, int orig_oif,
2341 struct net_device *dev_out,
2342 unsigned int flags)
2343{
2344 struct fib_info *fi = res->fi;
2345 struct fib_nh_exception *fnhe;
2346 struct in_device *in_dev;
2347 u16 type = res->type;
2348 struct rtable *rth;
2349 bool do_cache;
2350
2351 in_dev = __in_dev_get_rcu(dev_out);
2352 if (!in_dev)
2353 return ERR_PTR(-EINVAL);
2354
2355 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2356 if (ipv4_is_loopback(fl4->saddr) &&
2357 !(dev_out->flags & IFF_LOOPBACK) &&
2358 !netif_is_l3_master(dev_out))
2359 return ERR_PTR(-EINVAL);
2360
2361 if (ipv4_is_lbcast(fl4->daddr))
2362 type = RTN_BROADCAST;
2363 else if (ipv4_is_multicast(fl4->daddr))
2364 type = RTN_MULTICAST;
2365 else if (ipv4_is_zeronet(fl4->daddr))
2366 return ERR_PTR(-EINVAL);
2367
2368 if (dev_out->flags & IFF_LOOPBACK)
2369 flags |= RTCF_LOCAL;
2370
2371 do_cache = true;
2372 if (type == RTN_BROADCAST) {
2373 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2374 fi = NULL;
2375 } else if (type == RTN_MULTICAST) {
2376 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2377 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2378 fl4->flowi4_proto))
2379 flags &= ~RTCF_LOCAL;
2380 else
2381 do_cache = false;
2382 /* If a multicast route does not exist, use the
2383 * default one, but do not use a gateway in this case.
2384 * Yes, it is a hack.
2385 */
2386 if (fi && res->prefixlen < 4)
2387 fi = NULL;
2388 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2389 (orig_oif != dev_out->ifindex)) {
2390 /* For local routes that require a particular output interface
2391 * we do not want to cache the result. Caching the result
2392 * causes incorrect behaviour when there are multiple source
2393 * addresses on the interface, the end result being that if the
2394 * intended recipient is waiting on that interface for the
2395 * packet, he won't receive it because it will be delivered on
2396 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2397 * be set to the loopback interface as well.
2398 */
2399 do_cache = false;
2400 }
2401
2402 fnhe = NULL;
2403 do_cache &= fi != NULL;
2404 if (fi) {
2405 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2406 struct rtable __rcu **prth;
2407
2408 fnhe = find_exception(nhc, fl4->daddr);
2409 if (!do_cache)
2410 goto add;
2411 if (fnhe) {
2412 prth = &fnhe->fnhe_rth_output;
2413 } else {
2414 if (unlikely(fl4->flowi4_flags &
2415 FLOWI_FLAG_KNOWN_NH &&
2416 !(nhc->nhc_gw_family &&
2417 nhc->nhc_scope == RT_SCOPE_LINK))) {
2418 do_cache = false;
2419 goto add;
2420 }
2421 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2422 }
2423 rth = rcu_dereference(*prth);
2424 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2425 return rth;
2426 }
2427
2428add:
2429 rth = rt_dst_alloc(dev_out, flags, type,
2430 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2431 IN_DEV_CONF_GET(in_dev, NOXFRM),
2432 do_cache);
2433 if (!rth)
2434 return ERR_PTR(-ENOBUFS);
2435
2436 rth->rt_iif = orig_oif;
2437
2438 RT_CACHE_STAT_INC(out_slow_tot);
2439
2440 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2441 if (flags & RTCF_LOCAL &&
2442 !(dev_out->flags & IFF_LOOPBACK)) {
2443 rth->dst.output = ip_mc_output;
2444 RT_CACHE_STAT_INC(out_slow_mc);
2445 }
2446#ifdef CONFIG_IP_MROUTE
2447 if (type == RTN_MULTICAST) {
2448 if (IN_DEV_MFORWARD(in_dev) &&
2449 !ipv4_is_local_multicast(fl4->daddr)) {
2450 rth->dst.input = ip_mr_input;
2451 rth->dst.output = ip_mc_output;
2452 }
2453 }
2454#endif
2455 }
2456
2457 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2458 lwtunnel_set_redirect(&rth->dst);
2459
2460 return rth;
2461}
2462
2463/*
2464 * Major route resolver routine.
2465 */
2466
2467struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2468 const struct sk_buff *skb)
2469{
2470 __u8 tos = RT_FL_TOS(fl4);
2471 struct fib_result res = {
2472 .type = RTN_UNSPEC,
2473 .fi = NULL,
2474 .table = NULL,
2475 .tclassid = 0,
2476 };
2477 struct rtable *rth;
2478
2479 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2480 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2481 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2482 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2483
2484 rcu_read_lock();
2485 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2486 rcu_read_unlock();
2487
2488 return rth;
2489}
2490EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2491
2492struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2493 struct fib_result *res,
2494 const struct sk_buff *skb)
2495{
2496 struct net_device *dev_out = NULL;
2497 int orig_oif = fl4->flowi4_oif;
2498 unsigned int flags = 0;
2499 struct rtable *rth;
2500 int err;
2501
2502 if (fl4->saddr) {
2503 if (ipv4_is_multicast(fl4->saddr) ||
2504 ipv4_is_lbcast(fl4->saddr) ||
2505 ipv4_is_zeronet(fl4->saddr)) {
2506 rth = ERR_PTR(-EINVAL);
2507 goto out;
2508 }
2509
2510 rth = ERR_PTR(-ENETUNREACH);
2511
2512 /* I removed the check for oif == dev_out->oif here.
2513 It was wrong for two reasons:
2514 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2515 is assigned to multiple interfaces.
2516 2. Moreover, we are allowed to send packets with saddr
2517 of another iface. --ANK
2518 */
2519
2520 if (fl4->flowi4_oif == 0 &&
2521 (ipv4_is_multicast(fl4->daddr) ||
2522 ipv4_is_lbcast(fl4->daddr))) {
2523 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2524 dev_out = __ip_dev_find(net, fl4->saddr, false);
2525 if (!dev_out)
2526 goto out;
2527
2528 /* Special hack: the user can direct multicasts
2529 and limited broadcasts via the necessary interface
2530 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2531 This hack is not just for fun, it allows
2532 vic, vat and friends to work.
2533 They bind a socket to loopback, set ttl to zero
2534 and expect that it will work.
2535 From the viewpoint of the routing cache they are broken,
2536 because we are not allowed to build a multicast path
2537 with a loopback source addr (look, the routing cache
2538 cannot know that ttl is zero, so that the packet
2539 will not leave this host and the route is valid).
2540 Luckily, this hack is a good workaround.
2541 */
2542
2543 fl4->flowi4_oif = dev_out->ifindex;
2544 goto make_route;
2545 }
2546
2547 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2548 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2549 if (!__ip_dev_find(net, fl4->saddr, false))
2550 goto out;
2551 }
2552 }
2553
2554
2555 if (fl4->flowi4_oif) {
2556 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2557 rth = ERR_PTR(-ENODEV);
2558 if (!dev_out)
2559 goto out;
2560
2561 /* RACE: Check return value of inet_select_addr instead. */
2562 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2563 rth = ERR_PTR(-ENETUNREACH);
2564 goto out;
2565 }
2566 if (ipv4_is_local_multicast(fl4->daddr) ||
2567 ipv4_is_lbcast(fl4->daddr) ||
2568 fl4->flowi4_proto == IPPROTO_IGMP) {
2569 if (!fl4->saddr)
2570 fl4->saddr = inet_select_addr(dev_out, 0,
2571 RT_SCOPE_LINK);
2572 goto make_route;
2573 }
2574 if (!fl4->saddr) {
2575 if (ipv4_is_multicast(fl4->daddr))
2576 fl4->saddr = inet_select_addr(dev_out, 0,
2577 fl4->flowi4_scope);
2578 else if (!fl4->daddr)
2579 fl4->saddr = inet_select_addr(dev_out, 0,
2580 RT_SCOPE_HOST);
2581 }
2582 }
2583
2584 if (!fl4->daddr) {
2585 fl4->daddr = fl4->saddr;
2586 if (!fl4->daddr)
2587 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2588 dev_out = net->loopback_dev;
2589 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2590 res->type = RTN_LOCAL;
2591 flags |= RTCF_LOCAL;
2592 goto make_route;
2593 }
2594
2595 err = fib_lookup(net, fl4, res, 0);
2596 if (err) {
2597 res->fi = NULL;
2598 res->table = NULL;
2599 if (fl4->flowi4_oif &&
2600 (ipv4_is_multicast(fl4->daddr) ||
2601 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2602 /* Apparently, the routing tables are wrong. Assume
2603 that the destination is on link.
2604
2605 WHY? DW.
2606 Because we are allowed to send to an iface
2607 even if it has NO routes and NO assigned
2608 addresses. When oif is specified, the routing
2609 tables are looked up with only one purpose:
2610 to check whether the destination is gatewayed rather than
2611 direct. Moreover, if MSG_DONTROUTE is set,
2612 we send the packet, ignoring both routing tables
2613 and ifaddr state. --ANK
2614
2615
2616 We could do this even when oif is unknown,
2617 as IPv6 likely does, but we do not.
2618 */
2619
2620 if (fl4->saddr == 0)
2621 fl4->saddr = inet_select_addr(dev_out, 0,
2622 RT_SCOPE_LINK);
2623 res->type = RTN_UNICAST;
2624 goto make_route;
2625 }
2626 rth = ERR_PTR(err);
2627 goto out;
2628 }
2629
2630 if (res->type == RTN_LOCAL) {
2631 if (!fl4->saddr) {
2632 if (res->fi->fib_prefsrc)
2633 fl4->saddr = res->fi->fib_prefsrc;
2634 else
2635 fl4->saddr = fl4->daddr;
2636 }
2637
2638 /* L3 master device is the loopback for that domain */
2639 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2640 net->loopback_dev;
2641
2642 /* make sure orig_oif points to fib result device even
2643 * though packet rx/tx happens over loopback or l3mdev
2644 */
2645 orig_oif = FIB_RES_OIF(*res);
2646
2647 fl4->flowi4_oif = dev_out->ifindex;
2648 flags |= RTCF_LOCAL;
2649 goto make_route;
2650 }
2651
2652 fib_select_path(net, res, fl4, skb);
2653
2654 dev_out = FIB_RES_DEV(*res);
2655
2656make_route:
2657 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2658
2659out:
2660 return rth;
2661}
2662
2663static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2664{
2665 return NULL;
2666}
2667
2668static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2669{
2670 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2671
2672 return mtu ? : dst->dev->mtu;
2673}
2674
2675static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2676 struct sk_buff *skb, u32 mtu,
2677 bool confirm_neigh)
2678{
2679}
2680
2681static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2682 struct sk_buff *skb)
2683{
2684}
2685
2686static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2687 unsigned long old)
2688{
2689 return NULL;
2690}
2691
2692static struct dst_ops ipv4_dst_blackhole_ops = {
2693 .family = AF_INET,
2694 .check = ipv4_blackhole_dst_check,
2695 .mtu = ipv4_blackhole_mtu,
2696 .default_advmss = ipv4_default_advmss,
2697 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2698 .redirect = ipv4_rt_blackhole_redirect,
2699 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2700 .neigh_lookup = ipv4_neigh_lookup,
2701};
2702
2703struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2704{
2705 struct rtable *ort = (struct rtable *) dst_orig;
2706 struct rtable *rt;
2707
2708 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2709 if (rt) {
2710 struct dst_entry *new = &rt->dst;
2711
2712 new->__use = 1;
2713 new->input = dst_discard;
2714 new->output = dst_discard_out;
2715
2716 new->dev = net->loopback_dev;
2717 if (new->dev)
2718 dev_hold(new->dev);
2719
2720 rt->rt_is_input = ort->rt_is_input;
2721 rt->rt_iif = ort->rt_iif;
2722 rt->rt_pmtu = ort->rt_pmtu;
2723 rt->rt_mtu_locked = ort->rt_mtu_locked;
2724
2725 rt->rt_genid = rt_genid_ipv4(net);
2726 rt->rt_flags = ort->rt_flags;
2727 rt->rt_type = ort->rt_type;
2728 rt->rt_uses_gateway = ort->rt_uses_gateway;
2729 rt->rt_gw_family = ort->rt_gw_family;
2730 if (rt->rt_gw_family == AF_INET)
2731 rt->rt_gw4 = ort->rt_gw4;
2732 else if (rt->rt_gw_family == AF_INET6)
2733 rt->rt_gw6 = ort->rt_gw6;
2734
2735 INIT_LIST_HEAD(&rt->rt_uncached);
2736 }
2737
2738 dst_release(dst_orig);
2739
2740 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2741}
2742
2743struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2744 const struct sock *sk)
2745{
2746 struct rtable *rt = __ip_route_output_key(net, flp4);
2747
2748 if (IS_ERR(rt))
2749 return rt;
2750
2751 if (flp4->flowi4_proto) {
2752 flp4->flowi4_oif = rt->dst.dev->ifindex;
2753 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2754 flowi4_to_flowi(flp4),
2755 sk, 0);
2756 }
2757
2758 return rt;
2759}
2760EXPORT_SYMBOL_GPL(ip_route_output_flow);
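
/* Illustrative sketch, not part of route.c: a minimal output lookup. The
 * caller fills a flowi4 key, resolves it and drops the route reference when
 * done; error handling and the actual transmit are elided. IPPROTO_UDP and
 * the helper name are example assumptions, and the block is kept under
 * "#if 0" so it is never built.
 */
#if 0
static int example_route_outgoing(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.saddr		= saddr,
		.flowi4_proto	= IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ... transmit via rt->dst.dev ... */

	ip_rt_put(rt);
	return 0;
}
#endif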
2761
2762/* called with rcu_read_lock held */
2763static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2764 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2765 struct sk_buff *skb, u32 portid, u32 seq,
2766 unsigned int flags)
2767{
2768 struct rtmsg *r;
2769 struct nlmsghdr *nlh;
2770 unsigned long expires = 0;
2771 u32 error;
2772 u32 metrics[RTAX_MAX];
2773
2774 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2775 if (!nlh)
2776 return -EMSGSIZE;
2777
2778 r = nlmsg_data(nlh);
2779 r->rtm_family = AF_INET;
2780 r->rtm_dst_len = 32;
2781 r->rtm_src_len = 0;
2782 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2783 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2784 if (nla_put_u32(skb, RTA_TABLE, table_id))
2785 goto nla_put_failure;
2786 r->rtm_type = rt->rt_type;
2787 r->rtm_scope = RT_SCOPE_UNIVERSE;
2788 r->rtm_protocol = RTPROT_UNSPEC;
2789 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2790 if (rt->rt_flags & RTCF_NOTIFY)
2791 r->rtm_flags |= RTM_F_NOTIFY;
2792 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2793 r->rtm_flags |= RTCF_DOREDIRECT;
2794
2795 if (nla_put_in_addr(skb, RTA_DST, dst))
2796 goto nla_put_failure;
2797 if (src) {
2798 r->rtm_src_len = 32;
2799 if (nla_put_in_addr(skb, RTA_SRC, src))
2800 goto nla_put_failure;
2801 }
2802 if (rt->dst.dev &&
2803 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2804 goto nla_put_failure;
2805#ifdef CONFIG_IP_ROUTE_CLASSID
2806 if (rt->dst.tclassid &&
2807 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2808 goto nla_put_failure;
2809#endif
2810 if (fl4 && !rt_is_input_route(rt) &&
2811 fl4->saddr != src) {
2812 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2813 goto nla_put_failure;
2814 }
2815 if (rt->rt_uses_gateway) {
2816 if (rt->rt_gw_family == AF_INET &&
2817 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2818 goto nla_put_failure;
2819 } else if (rt->rt_gw_family == AF_INET6) {
2820 int alen = sizeof(struct in6_addr);
2821 struct nlattr *nla;
2822 struct rtvia *via;
2823
2824 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2825 if (!nla)
2826 goto nla_put_failure;
2827
2828 via = nla_data(nla);
2829 via->rtvia_family = AF_INET6;
2830 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2831 }
2832 }
2833
2834 expires = rt->dst.expires;
2835 if (expires) {
2836 unsigned long now = jiffies;
2837
2838 if (time_before(now, expires))
2839 expires -= now;
2840 else
2841 expires = 0;
2842 }
2843
2844 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2845 if (rt->rt_pmtu && expires)
2846 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2847 if (rt->rt_mtu_locked && expires)
2848 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2849 if (rtnetlink_put_metrics(skb, metrics) < 0)
2850 goto nla_put_failure;
2851
2852 if (fl4) {
2853 if (fl4->flowi4_mark &&
2854 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2855 goto nla_put_failure;
2856
2857 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2858 nla_put_u32(skb, RTA_UID,
2859 from_kuid_munged(current_user_ns(),
2860 fl4->flowi4_uid)))
2861 goto nla_put_failure;
2862
2863 if (rt_is_input_route(rt)) {
2864#ifdef CONFIG_IP_MROUTE
2865 if (ipv4_is_multicast(dst) &&
2866 !ipv4_is_local_multicast(dst) &&
2867 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2868 int err = ipmr_get_route(net, skb,
2869 fl4->saddr, fl4->daddr,
2870 r, portid);
2871
2872 if (err <= 0) {
2873 if (err == 0)
2874 return 0;
2875 goto nla_put_failure;
2876 }
2877 } else
2878#endif
2879 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2880 goto nla_put_failure;
2881 }
2882 }
2883
2884 error = rt->dst.error;
2885
2886 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2887 goto nla_put_failure;
2888
2889 nlmsg_end(skb, nlh);
2890 return 0;
2891
2892nla_put_failure:
2893 nlmsg_cancel(skb, nlh);
2894 return -EMSGSIZE;
2895}
2896
2897static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2898 struct netlink_callback *cb, u32 table_id,
2899 struct fnhe_hash_bucket *bucket, int genid,
2900 int *fa_index, int fa_start, unsigned int flags)
2901{
2902 int i;
2903
2904 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2905 struct fib_nh_exception *fnhe;
2906
2907 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2908 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2909 struct rtable *rt;
2910 int err;
2911
2912 if (*fa_index < fa_start)
2913 goto next;
2914
2915 if (fnhe->fnhe_genid != genid)
2916 goto next;
2917
2918 if (fnhe->fnhe_expires &&
2919 time_after(jiffies, fnhe->fnhe_expires))
2920 goto next;
2921
2922 rt = rcu_dereference(fnhe->fnhe_rth_input);
2923 if (!rt)
2924 rt = rcu_dereference(fnhe->fnhe_rth_output);
2925 if (!rt)
2926 goto next;
2927
2928 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2929 table_id, NULL, skb,
2930 NETLINK_CB(cb->skb).portid,
2931 cb->nlh->nlmsg_seq, flags);
2932 if (err)
2933 return err;
2934next:
2935 (*fa_index)++;
2936 }
2937 }
2938
2939 return 0;
2940}
2941
2942int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2943 u32 table_id, struct fib_info *fi,
2944 int *fa_index, int fa_start, unsigned int flags)
2945{
2946 struct net *net = sock_net(cb->skb->sk);
2947 int nhsel, genid = fnhe_genid(net);
2948
2949 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2950 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2951 struct fnhe_hash_bucket *bucket;
2952 int err;
2953
2954 if (nhc->nhc_flags & RTNH_F_DEAD)
2955 continue;
2956
2957 rcu_read_lock();
2958 bucket = rcu_dereference(nhc->nhc_exceptions);
2959 err = 0;
2960 if (bucket)
2961 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2962 genid, fa_index, fa_start,
2963 flags);
2964 rcu_read_unlock();
2965 if (err)
2966 return err;
2967 }
2968
2969 return 0;
2970}
2971
2972static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2973 u8 ip_proto, __be16 sport,
2974 __be16 dport)
2975{
2976 struct sk_buff *skb;
2977 struct iphdr *iph;
2978
2979 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2980 if (!skb)
2981 return NULL;
2982
2983 /* Reserve room for dummy headers; this skb can pass
2984 * through a good chunk of the routing engine.
2985 */
2986 skb_reset_mac_header(skb);
2987 skb_reset_network_header(skb);
2988 skb->protocol = htons(ETH_P_IP);
2989 iph = skb_put(skb, sizeof(struct iphdr));
2990 iph->protocol = ip_proto;
2991 iph->saddr = src;
2992 iph->daddr = dst;
2993 iph->version = 0x4;
2994 iph->frag_off = 0;
2995 iph->ihl = 0x5;
2996 skb_set_transport_header(skb, skb->len);
2997
2998 switch (iph->protocol) {
2999 case IPPROTO_UDP: {
3000 struct udphdr *udph;
3001
3002 udph = skb_put_zero(skb, sizeof(struct udphdr));
3003 udph->source = sport;
3004 udph->dest = dport;
3005 udph->len = htons(sizeof(struct udphdr));
3006 udph->check = 0;
3007 break;
3008 }
3009 case IPPROTO_TCP: {
3010 struct tcphdr *tcph;
3011
3012 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3013 tcph->source = sport;
3014 tcph->dest = dport;
3015 tcph->doff = sizeof(struct tcphdr) / 4;
3016 tcph->rst = 1;
3017 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3018 src, dst, 0);
3019 break;
3020 }
3021 case IPPROTO_ICMP: {
3022 struct icmphdr *icmph;
3023
3024 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3025 icmph->type = ICMP_ECHO;
3026 icmph->code = 0;
3027 }
3028 }
3029
3030 return skb;
3031}
3032
3033static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3034 const struct nlmsghdr *nlh,
3035 struct nlattr **tb,
3036 struct netlink_ext_ack *extack)
3037{
3038 struct rtmsg *rtm;
3039 int i, err;
3040
3041 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3042 NL_SET_ERR_MSG(extack,
3043 "ipv4: Invalid header for route get request");
3044 return -EINVAL;
3045 }
3046
3047 if (!netlink_strict_get_check(skb))
3048 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3049 rtm_ipv4_policy, extack);
3050
3051 rtm = nlmsg_data(nlh);
3052 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3053 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3054 rtm->rtm_table || rtm->rtm_protocol ||
3055 rtm->rtm_scope || rtm->rtm_type) {
3056 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3057 return -EINVAL;
3058 }
3059
3060 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3061 RTM_F_LOOKUP_TABLE |
3062 RTM_F_FIB_MATCH)) {
3063 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3064 return -EINVAL;
3065 }
3066
3067 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3068 rtm_ipv4_policy, extack);
3069 if (err)
3070 return err;
3071
3072 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3073 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3074 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3075 return -EINVAL;
3076 }
3077
3078 for (i = 0; i <= RTA_MAX; i++) {
3079 if (!tb[i])
3080 continue;
3081
3082 switch (i) {
3083 case RTA_IIF:
3084 case RTA_OIF:
3085 case RTA_SRC:
3086 case RTA_DST:
3087 case RTA_IP_PROTO:
3088 case RTA_SPORT:
3089 case RTA_DPORT:
3090 case RTA_MARK:
3091 case RTA_UID:
3092 break;
3093 default:
3094 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3095 return -EINVAL;
3096 }
3097 }
3098
3099 return 0;
3100}
3101
3102static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3103 struct netlink_ext_ack *extack)
3104{
3105 struct net *net = sock_net(in_skb->sk);
3106 struct nlattr *tb[RTA_MAX+1];
3107 u32 table_id = RT_TABLE_MAIN;
3108 __be16 sport = 0, dport = 0;
3109 struct fib_result res = {};
3110 u8 ip_proto = IPPROTO_UDP;
3111 struct rtable *rt = NULL;
3112 struct sk_buff *skb;
3113 struct rtmsg *rtm;
3114 struct flowi4 fl4 = {};
3115 __be32 dst = 0;
3116 __be32 src = 0;
3117 kuid_t uid;
3118 u32 iif;
3119 int err;
3120 int mark;
3121
3122 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3123 if (err < 0)
3124 return err;
3125
3126 rtm = nlmsg_data(nlh);
3127 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3128 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3129 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3130 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3131 if (tb[RTA_UID])
3132 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3133 else
3134 uid = (iif ? INVALID_UID : current_uid());
3135
3136 if (tb[RTA_IP_PROTO]) {
3137 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3138 &ip_proto, AF_INET, extack);
3139 if (err)
3140 return err;
3141 }
3142
3143 if (tb[RTA_SPORT])
3144 sport = nla_get_be16(tb[RTA_SPORT]);
3145
3146 if (tb[RTA_DPORT])
3147 dport = nla_get_be16(tb[RTA_DPORT]);
3148
3149 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3150 if (!skb)
3151 return -ENOBUFS;
3152
3153 fl4.daddr = dst;
3154 fl4.saddr = src;
3155 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3156 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3157 fl4.flowi4_mark = mark;
3158 fl4.flowi4_uid = uid;
3159 if (sport)
3160 fl4.fl4_sport = sport;
3161 if (dport)
3162 fl4.fl4_dport = dport;
3163 fl4.flowi4_proto = ip_proto;
3164
3165 rcu_read_lock();
3166
3167 if (iif) {
3168 struct net_device *dev;
3169
3170 dev = dev_get_by_index_rcu(net, iif);
3171 if (!dev) {
3172 err = -ENODEV;
3173 goto errout_rcu;
3174 }
3175
3176 fl4.flowi4_iif = iif; /* for rt_fill_info */
3177 skb->dev = dev;
3178 skb->mark = mark;
3179 err = ip_route_input_rcu(skb, dst, src,
3180 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3181 &res);
3182
3183 rt = skb_rtable(skb);
3184 if (err == 0 && rt->dst.error)
3185 err = -rt->dst.error;
3186 } else {
3187 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3188 skb->dev = net->loopback_dev;
3189 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3190 err = 0;
3191 if (IS_ERR(rt))
3192 err = PTR_ERR(rt);
3193 else
3194 skb_dst_set(skb, &rt->dst);
3195 }
3196
3197 if (err)
3198 goto errout_rcu;
3199
3200 if (rtm->rtm_flags & RTM_F_NOTIFY)
3201 rt->rt_flags |= RTCF_NOTIFY;
3202
3203 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3204 table_id = res.table ? res.table->tb_id : 0;
3205
3206 /* reset skb for netlink reply msg */
3207 skb_trim(skb, 0);
3208 skb_reset_network_header(skb);
3209 skb_reset_transport_header(skb);
3210 skb_reset_mac_header(skb);
3211
3212 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3213 if (!res.fi) {
3214 err = fib_props[res.type].error;
3215 if (!err)
3216 err = -EHOSTUNREACH;
3217 goto errout_rcu;
3218 }
3219 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3220 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3221 rt->rt_type, res.prefix, res.prefixlen,
3222 fl4.flowi4_tos, res.fi, 0);
3223 } else {
3224 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3225 NETLINK_CB(in_skb).portid,
3226 nlh->nlmsg_seq, 0);
3227 }
3228 if (err < 0)
3229 goto errout_rcu;
3230
3231 rcu_read_unlock();
3232
3233 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3234
3235errout_free:
3236 return err;
3237errout_rcu:
3238 rcu_read_unlock();
3239 kfree_skb(skb);
3240 goto errout_free;
3241}
3242
3243void ip_rt_multicast_event(struct in_device *in_dev)
3244{
3245 rt_cache_flush(dev_net(in_dev->dev));
3246}
3247
3248#ifdef CONFIG_SYSCTL
3249static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3250static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3251static int ip_rt_gc_elasticity __read_mostly = 8;
3252static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3253
3254static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3255 void __user *buffer,
3256 size_t *lenp, loff_t *ppos)
3257{
3258 struct net *net = (struct net *)__ctl->extra1;
3259
3260 if (write) {
3261 rt_cache_flush(net);
3262 fnhe_genid_bump(net);
3263 return 0;
3264 }
3265
3266 return -EINVAL;
3267}
3268
3269static struct ctl_table ipv4_route_table[] = {
3270 {
3271 .procname = "gc_thresh",
3272 .data = &ipv4_dst_ops.gc_thresh,
3273 .maxlen = sizeof(int),
3274 .mode = 0644,
3275 .proc_handler = proc_dointvec,
3276 },
3277 {
3278 .procname = "max_size",
3279 .data = &ip_rt_max_size,
3280 .maxlen = sizeof(int),
3281 .mode = 0644,
3282 .proc_handler = proc_dointvec,
3283 },
3284 {
3285 /* Deprecated. Use gc_min_interval_ms */
3286
3287 .procname = "gc_min_interval",
3288 .data = &ip_rt_gc_min_interval,
3289 .maxlen = sizeof(int),
3290 .mode = 0644,
3291 .proc_handler = proc_dointvec_jiffies,
3292 },
3293 {
3294 .procname = "gc_min_interval_ms",
3295 .data = &ip_rt_gc_min_interval,
3296 .maxlen = sizeof(int),
3297 .mode = 0644,
3298 .proc_handler = proc_dointvec_ms_jiffies,
3299 },
3300 {
3301 .procname = "gc_timeout",
3302 .data = &ip_rt_gc_timeout,
3303 .maxlen = sizeof(int),
3304 .mode = 0644,
3305 .proc_handler = proc_dointvec_jiffies,
3306 },
3307 {
3308 .procname = "gc_interval",
3309 .data = &ip_rt_gc_interval,
3310 .maxlen = sizeof(int),
3311 .mode = 0644,
3312 .proc_handler = proc_dointvec_jiffies,
3313 },
3314 {
3315 .procname = "redirect_load",
3316 .data = &ip_rt_redirect_load,
3317 .maxlen = sizeof(int),
3318 .mode = 0644,
3319 .proc_handler = proc_dointvec,
3320 },
3321 {
3322 .procname = "redirect_number",
3323 .data = &ip_rt_redirect_number,
3324 .maxlen = sizeof(int),
3325 .mode = 0644,
3326 .proc_handler = proc_dointvec,
3327 },
3328 {
3329 .procname = "redirect_silence",
3330 .data = &ip_rt_redirect_silence,
3331 .maxlen = sizeof(int),
3332 .mode = 0644,
3333 .proc_handler = proc_dointvec,
3334 },
3335 {
3336 .procname = "error_cost",
3337 .data = &ip_rt_error_cost,
3338 .maxlen = sizeof(int),
3339 .mode = 0644,
3340 .proc_handler = proc_dointvec,
3341 },
3342 {
3343 .procname = "error_burst",
3344 .data = &ip_rt_error_burst,
3345 .maxlen = sizeof(int),
3346 .mode = 0644,
3347 .proc_handler = proc_dointvec,
3348 },
3349 {
3350 .procname = "gc_elasticity",
3351 .data = &ip_rt_gc_elasticity,
3352 .maxlen = sizeof(int),
3353 .mode = 0644,
3354 .proc_handler = proc_dointvec,
3355 },
3356 {
3357 .procname = "mtu_expires",
3358 .data = &ip_rt_mtu_expires,
3359 .maxlen = sizeof(int),
3360 .mode = 0644,
3361 .proc_handler = proc_dointvec_jiffies,
3362 },
3363 {
3364 .procname = "min_pmtu",
3365 .data = &ip_rt_min_pmtu,
3366 .maxlen = sizeof(int),
3367 .mode = 0644,
3368 .proc_handler = proc_dointvec_minmax,
3369 .extra1 = &ip_min_valid_pmtu,
3370 },
3371 {
3372 .procname = "min_adv_mss",
3373 .data = &ip_rt_min_advmss,
3374 .maxlen = sizeof(int),
3375 .mode = 0644,
3376 .proc_handler = proc_dointvec,
3377 },
3378 { }
3379};
3380
3381static const char ipv4_route_flush_procname[] = "flush";
3382
3383static struct ctl_table ipv4_route_flush_table[] = {
3384 {
3385 .procname = ipv4_route_flush_procname,
3386 .maxlen = sizeof(int),
3387 .mode = 0200,
3388 .proc_handler = ipv4_sysctl_rtcache_flush,
3389 },
3390 { },
3391};
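
/* Illustrative userspace sketch, not part of route.c: both tables above are
 * registered under "net/ipv4/route", so the tunables appear as files such as
 * /proc/sys/net/ipv4/route/min_pmtu, and any write to
 * /proc/sys/net/ipv4/route/flush triggers ipv4_sysctl_rtcache_flush(). The
 * program below is an example only and is kept under "#if 0".
 */
#if 0
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/min_pmtu", "r");
	int min_pmtu;

	if (f && fscanf(f, "%d", &min_pmtu) == 1)
		printf("min_pmtu = %d\n", min_pmtu);
	if (f)
		fclose(f);

	/* flushing the route cache requires root privileges */
	f = fopen("/proc/sys/net/ipv4/route/flush", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}
#endif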
3392
3393static __net_init int sysctl_route_net_init(struct net *net)
3394{
3395 struct ctl_table *tbl;
3396
3397 tbl = ipv4_route_flush_table;
3398 if (!net_eq(net, &init_net)) {
3399 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3400 if (!tbl)
3401 goto err_dup;
3402
3403 /* Don't export non-whitelisted sysctls to unprivileged users */
3404 if (net->user_ns != &init_user_ns) {
3405 if (tbl[0].procname != ipv4_route_flush_procname)
3406 tbl[0].procname = NULL;
3407 }
3408 }
3409 tbl[0].extra1 = net;
3410
3411 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3412 if (!net->ipv4.route_hdr)
3413 goto err_reg;
3414 return 0;
3415
3416err_reg:
3417 if (tbl != ipv4_route_flush_table)
3418 kfree(tbl);
3419err_dup:
3420 return -ENOMEM;
3421}
3422
3423static __net_exit void sysctl_route_net_exit(struct net *net)
3424{
3425 struct ctl_table *tbl;
3426
3427 tbl = net->ipv4.route_hdr->ctl_table_arg;
3428 unregister_net_sysctl_table(net->ipv4.route_hdr);
3429 BUG_ON(tbl == ipv4_route_flush_table);
3430 kfree(tbl);
3431}
3432
3433static __net_initdata struct pernet_operations sysctl_route_ops = {
3434 .init = sysctl_route_net_init,
3435 .exit = sysctl_route_net_exit,
3436};
3437#endif
3438
3439static __net_init int rt_genid_init(struct net *net)
3440{
3441 atomic_set(&net->ipv4.rt_genid, 0);
3442 atomic_set(&net->fnhe_genid, 0);
3443 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3444 return 0;
3445}
3446
3447static __net_initdata struct pernet_operations rt_genid_ops = {
3448 .init = rt_genid_init,
3449};
3450
3451static int __net_init ipv4_inetpeer_init(struct net *net)
3452{
3453 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3454
3455 if (!bp)
3456 return -ENOMEM;
3457 inet_peer_base_init(bp);
3458 net->ipv4.peers = bp;
3459 return 0;
3460}
3461
3462static void __net_exit ipv4_inetpeer_exit(struct net *net)
3463{
3464 struct inet_peer_base *bp = net->ipv4.peers;
3465
3466 net->ipv4.peers = NULL;
3467 inetpeer_invalidate_tree(bp);
3468 kfree(bp);
3469}
3470
3471static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3472 .init = ipv4_inetpeer_init,
3473 .exit = ipv4_inetpeer_exit,
3474};
3475
3476#ifdef CONFIG_IP_ROUTE_CLASSID
3477struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3478#endif /* CONFIG_IP_ROUTE_CLASSID */
3479
3480int __init ip_rt_init(void)
3481{
3482 void *idents_hash;
3483 int cpu;
3484
3485 /* For modern hosts, this will use 2 MB of memory */
3486 idents_hash = alloc_large_system_hash("IP idents",
3487 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3488 0,
3489 16, /* one bucket per 64 KB */
3490 HASH_ZERO,
3491 NULL,
3492 &ip_idents_mask,
3493 2048,
3494 256*1024);
3495
3496 ip_idents = idents_hash;
3497
3498 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3499
3500 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3501
3502 for_each_possible_cpu(cpu) {
3503 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3504
3505 INIT_LIST_HEAD(&ul->head);
3506 spin_lock_init(&ul->lock);
3507 }
3508#ifdef CONFIG_IP_ROUTE_CLASSID
3509 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3510 if (!ip_rt_acct)
3511 panic("IP: failed to allocate ip_rt_acct\n");
3512#endif
3513
3514 ipv4_dst_ops.kmem_cachep =
3515 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3516 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3517
3518 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3519
3520 if (dst_entries_init(&ipv4_dst_ops) < 0)
3521 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3522
3523 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3524 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3525
3526 ipv4_dst_ops.gc_thresh = ~0;
3527 ip_rt_max_size = INT_MAX;
3528
3529 devinet_init();
3530 ip_fib_init();
3531
3532 if (ip_rt_proc_init())
3533 pr_err("Unable to create route proc files\n");
3534#ifdef CONFIG_XFRM
3535 xfrm_init();
3536 xfrm4_init();
3537#endif
3538 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3539 RTNL_FLAG_DOIT_UNLOCKED);
3540
3541#ifdef CONFIG_SYSCTL
3542 register_pernet_subsys(&sysctl_route_ops);
3543#endif
3544 register_pernet_subsys(&rt_genid_ops);
3545 register_pernet_subsys(&ipv4_inetpeer_ops);
3546 return 0;
3547}
3548
3549#ifdef CONFIG_SYSCTL
3550/*
3551 * We really need to sanitize the damn ipv4 init order, then all
3552 * this nonsense will go away.
3553 */
3554void __init ip_static_sysctl_init(void)
3555{
3556 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3557}
3558#endif