blob: 6fcb12e083d99cca99c9e0bb2beb8cee45420276 [file] [log] [blame]
rjw1f884582022-01-06 17:20:42 +08001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h>
68#include <linux/uaccess.h>
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
89#include <linux/rcupdate.h>
90#include <linux/times.h>
91#include <linux/slab.h>
92#include <linux/jhash.h>
93#include <net/dst.h>
94#include <net/dst_metadata.h>
95#include <net/net_namespace.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/lwtunnel.h>
107#include <net/netevent.h>
108#include <net/rtnetlink.h>
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#include <linux/kmemleak.h>
112#endif
113#include <net/secure_seq.h>
114#include <net/ip_tunnels.h>
115#include <net/l3mdev.h>
116
117#include "fib_lookup.h"
118
119#define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122#define RT_GC_TIMEOUT (300*HZ)
123
/* Routing tunables; boot-time defaults, adjustable via sysctl elsewhere
 * in this file.  ip_rt_max_size is filled in at init time.
 */
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;	/* redirects sent before giving up */
static int ip_rt_redirect_load __read_mostly = HZ / 50;	/* base backoff between redirects */
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;		/* token cost per ICMP error */
static int ip_rt_error_burst __read_mostly = 5 * HZ;	/* token-bucket capacity */
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;	/* learned-PMTU lifetime */
static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;	/* floor for learned PMTU */
static int ip_rt_min_advmss __read_mostly = 256;

static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
135
136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
142static unsigned int ipv4_mtu(const struct dst_entry *dst);
143static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144static void ipv4_link_failure(struct sk_buff *skb);
145static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb, u32 mtu,
147 bool confirm_neigh);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb);
150static void ipv4_dst_destroy(struct dst_entry *dst);
151
/* dst_ops::cow_metrics hook.  IPv4 routes are never expected to need
 * copy-on-write metrics through this path, so reaching it is a bug:
 * warn once per call and refuse by returning NULL.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
157
158static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
159 struct sk_buff *skb,
160 const void *daddr);
161static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
162
/* Destination-cache operations for IPv4 unicast routes; invoked by the
 * protocol-independent dst layer.
 */
static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.redirect = ip_do_redirect,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
	.confirm_neigh = ipv4_confirm_neigh,
};
178
#define ECN_OR_COST(class) TC_PRIO_##class

/* 16-entry lookup table mapping IPv4 TOS values to traffic-control
 * priority bands (TC_PRIO_*).  Odd slots carry the ECN_OR_COST()
 * variant of the class.  Exported for use by other protocols.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
200
201static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
202#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
203
204#ifdef CONFIG_PROC_FS
205static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
206{
207 if (*pos)
208 return NULL;
209 return SEQ_START_TOKEN;
210}
211
212static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
213{
214 ++*pos;
215 return NULL;
216}
217
/* No iterator state or locks were taken; nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
221
222static int rt_cache_seq_show(struct seq_file *seq, void *v)
223{
224 if (v == SEQ_START_TOKEN)
225 seq_printf(seq, "%-127s\n",
226 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
227 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
228 "HHUptod\tSpecDst");
229 return 0;
230}
231
/* seq_file iterator for the (empty) /proc/net/rt_cache listing. */
static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};
238
/* open() handler for /proc/net/rt_cache: attach the iterator above. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
243
/* File operations for /proc/net/rt_cache (standard seq_file plumbing). */
static const struct file_operations rt_cache_seq_fops = {
	.owner = THIS_MODULE,
	.open = rt_cache_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};
251
252
253static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
254{
255 int cpu;
256
257 if (*pos == 0)
258 return SEQ_START_TOKEN;
259
260 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
261 if (!cpu_possible(cpu))
262 continue;
263 *pos = cpu+1;
264 return &per_cpu(rt_cache_stat, cpu);
265 }
266 return NULL;
267}
268
269static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
270{
271 int cpu;
272
273 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
274 if (!cpu_possible(cpu))
275 continue;
276 *pos = cpu+1;
277 return &per_cpu(rt_cache_stat, cpu);
278 }
279 (*pos)++;
280 return NULL;
281
282}
283
/* No iterator state or locks were taken; nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
288
/* Print one CPU's rt_cache_stat record (or the header).  Several
 * columns are hard-coded to zero: they correspond to counters of the
 * removed route cache, kept only for output-format compatibility.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
322
/* seq_file iterator for /proc/net/stat/rt_cache (per-CPU statistics). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next = rt_cpu_seq_next,
	.stop = rt_cpu_seq_stop,
	.show = rt_cpu_seq_show,
};
329
330
/* open() handler for /proc/net/stat/rt_cache: attach the iterator. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
335
/* File operations for /proc/net/stat/rt_cache (seq_file plumbing). */
static const struct file_operations rt_cpu_seq_fops = {
	.owner = THIS_MODULE,
	.open = rt_cpu_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};
343
344#ifdef CONFIG_IP_ROUTE_CLASSID
345static int rt_acct_proc_show(struct seq_file *m, void *v)
346{
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367}
368
/* open() handler for /proc/net/rt_acct (single-record seq_file). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
373
/* File operations for /proc/net/rt_acct. */
static const struct file_operations rt_acct_proc_fops = {
	.owner = THIS_MODULE,
	.open = rt_acct_proc_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
381#endif
382
/* Create the per-namespace proc entries: /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with CONFIG_IP_ROUTE_CLASSID)
 * /proc/net/rt_acct.  On failure, entries created so far are removed
 * in reverse order and -ENOMEM is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
413
/* Tear down the proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
422
/* Per-network-namespace init/exit hooks for the proc entries above. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
427
/* Register the pernet proc hooks at boot (CONFIG_PROC_FS builds). */
static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
432
433#else
/* No-op stub when CONFIG_PROC_FS is disabled. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
438#endif /* CONFIG_PROC_FS */
439
440static inline bool rt_is_expired(const struct rtable *rth)
441{
442 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
443}
444
/* Invalidate every cached route in @net by bumping the IPv4 route
 * generation id; rt_is_expired() then rejects all older routes.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
449
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453{
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469}
470
/* dst_ops::confirm_neigh hook: confirm reachability of the neighbour
 * used by this route.  With a gateway, confirm the gateway's entry;
 * otherwise confirm @daddr itself, unless no address was supplied or
 * the route is multicast/broadcast/local, where per-address
 * confirmation is meaningless and we bail out.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
487
488#define IP_IDENTS_SZ 2048u
489
490static atomic_t *ip_idents __read_mostly;
491static u32 *ip_tstamps __read_mostly;
492
493/* In order to protect privacy, we add a perturbation to identifiers
494 * if one generator is seldom used. This makes hard for an attacker
495 * to infer how many packets were sent between two points in time.
496 */
/* Reserve @segs consecutive IP ids from the generator selected by
 * @hash and return the first one.  Lockless: the timestamp slot is
 * claimed with cmpxchg so only one CPU applies the random perturbation
 * for an idle period (see the privacy comment above).
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	/* Winner of the cmpxchg perturbs the id by up to the number of
	 * jiffies the generator sat idle.
	 */
	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error there, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
515
/* Pick an IP ID for @iph covering @segs segments.  The generator is
 * selected by a siphash of (daddr, saddr, protocol) keyed with a
 * lazily-initialised per-netns secret, so distinct flows draw from
 * distinct id sequences.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
533
/* Initialise @fl4 for a route lookup from an IP header.  When @sk is
 * given, the socket's bound device, mark, TOS and protocol override
 * the caller-supplied values derived from the packet.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		/* raw sockets with IP_HDRINCL route as IPPROTO_RAW */
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
554
555static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
556 const struct sock *sk)
557{
558 const struct net *net = dev_net(skb->dev);
559 const struct iphdr *iph = ip_hdr(skb);
560 int oif = skb->dev->ifindex;
561 u8 tos = RT_TOS(iph->tos);
562 u8 prot = iph->protocol;
563 u32 mark = skb->mark;
564
565 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
566}
567
/* Derive the flow key purely from connected-socket state (no skb).
 * With a strict source-route option the first hop (faddr) replaces the
 * destination; inet_opt is read under RCU.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
585
/* Build a flow key from the skb when one is available, falling back to
 * socket state otherwise.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
594
595static DEFINE_SPINLOCK(fnhe_lock);
596
/* Detach the cached input and output routes hanging off a
 * fib_nh_exception and drop their references.  The pointer is cleared
 * before the dst is released so concurrent RCU readers stop seeing it.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
614
615static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
616{
617 struct fib_nh_exception *fnhe, *oldest;
618
619 oldest = rcu_dereference(hash->chain);
620 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
621 fnhe = rcu_dereference(fnhe->fnhe_next)) {
622 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
623 oldest = fnhe;
624 }
625 fnhe_flush_routes(oldest);
626 return oldest;
627}
628
629static inline u32 fnhe_hashfun(__be32 daddr)
630{
631 static u32 fnhe_hashrnd __read_mostly;
632 u32 hval;
633
634 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
635 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
636 return hash_32(hval, FNHE_HASH_SHIFT);
637}
638
639static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640{
641 rt->rt_pmtu = fnhe->fnhe_pmtu;
642 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
643 rt->dst.expires = fnhe->fnhe_expires;
644
645 if (fnhe->fnhe_gw) {
646 rt->rt_flags |= RTCF_REDIRECTED;
647 rt->rt_gateway = fnhe->fnhe_gw;
648 rt->rt_uses_gateway = 1;
649 }
650}
651
/* Record (or refresh) a next-hop exception for @daddr on @nh: a
 * redirect gateway (@gw) and/or a learned PMTU (@pmtu, with @lock
 * mirrored into fnhe_mtu_locked), expiring at @expires.
 *
 * Serialised by fnhe_lock.  The per-nexthop hash table is allocated
 * lazily; when a bucket chain exceeds FNHE_RECLAIM_DEPTH the entry
 * with the oldest stamp is recycled instead of allocating a new one.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	/* Look for an existing exception for this destination. */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		/* max(1UL, ...) keeps expires non-zero, which elsewhere
		 * distinguishes "set" from "unset".
		 */
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
742
743static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
744 bool kill_route)
745{
746 __be32 new_gw = icmp_hdr(skb)->un.gateway;
747 __be32 old_gw = ip_hdr(skb)->saddr;
748 struct net_device *dev = skb->dev;
749 struct in_device *in_dev;
750 struct fib_result res;
751 struct neighbour *n;
752 struct net *net;
753
754 switch (icmp_hdr(skb)->code & 7) {
755 case ICMP_REDIR_NET:
756 case ICMP_REDIR_NETTOS:
757 case ICMP_REDIR_HOST:
758 case ICMP_REDIR_HOSTTOS:
759 break;
760
761 default:
762 return;
763 }
764
765 if (rt->rt_gateway != old_gw)
766 return;
767
768 in_dev = __in_dev_get_rcu(dev);
769 if (!in_dev)
770 return;
771
772 net = dev_net(dev);
773 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
774 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
775 ipv4_is_zeronet(new_gw))
776 goto reject_redirect;
777
778 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
779 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
780 goto reject_redirect;
781 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
782 goto reject_redirect;
783 } else {
784 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
785 goto reject_redirect;
786 }
787
788 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
789 if (!n)
790 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
791 if (!IS_ERR(n)) {
792 if (!(n->nud_state & NUD_VALID)) {
793 neigh_event_send(n, NULL);
794 } else {
795 if (fib_lookup(net, fl4, &res, 0) == 0) {
796 struct fib_nh *nh = &FIB_RES_NH(res);
797
798 fib_select_path(net, &res, fl4, skb);
799 nh = &FIB_RES_NH(res);
800 update_or_create_fnhe(nh, fl4->daddr, new_gw,
801 0, false,
802 jiffies + ip_rt_gc_timeout);
803 }
804 if (kill_route)
805 rt->dst.obsolete = DST_OBSOLETE_KILL;
806 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
807 }
808 neigh_release(n);
809 }
810 return;
811
812reject_redirect:
813#ifdef CONFIG_IP_ROUTE_VERBOSE
814 if (IN_DEV_LOG_MARTIANS(in_dev)) {
815 const struct iphdr *iph = (const struct iphdr *) skb->data;
816 __be32 daddr = iph->daddr;
817 __be32 saddr = iph->saddr;
818
819 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
820 " Advised path = %pI4 -> %pI4\n",
821 &old_gw, dev->name, &new_gw,
822 &saddr, &daddr);
823 }
824#endif
825 ;
826}
827
828static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
829{
830 struct rtable *rt;
831 struct flowi4 fl4;
832 const struct iphdr *iph = (const struct iphdr *) skb->data;
833 struct net *net = dev_net(skb->dev);
834 int oif = skb->dev->ifindex;
835 u8 tos = RT_TOS(iph->tos);
836 u8 prot = iph->protocol;
837 u32 mark = skb->mark;
838
839 rt = (struct rtable *) dst;
840
841 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
842 __ip_do_redirect(rt, skb, &fl4, true);
843}
844
845static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
846{
847 struct rtable *rt = (struct rtable *)dst;
848 struct dst_entry *ret = dst;
849
850 if (rt) {
851 if (dst->obsolete > 0) {
852 ip_rt_put(rt);
853 ret = NULL;
854 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
855 rt->dst.expires) {
856 ip_rt_put(rt);
857 ret = NULL;
858 }
859 }
860 return ret;
861}
862
863/*
864 * Algorithm:
865 * 1. The first ip_rt_redirect_number redirects are sent
866 * with exponential backoff, then we stop sending them at all,
867 * assuming that the host ignores our redirects.
868 * 2. If we did not see packets requiring redirects
869 * during ip_rt_redirect_silence, we assume that the host
870 * forgot redirected route and start to send redirects again.
871 *
872 * This algorithm is much cheaper and more intelligent than dumb load limiting
873 * in icmp.c.
874 *
875 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
876 * and "frag. need" (breaks PMTU discovery) in icmp.c.
877 */
878
/* Send an ICMP redirect for @skb, rate limited per source host through
 * its inet_peer entry (algorithm described in the comment above).
 * Without a peer entry the redirect is sent unthrottled.  Interface
 * flags are sampled under rcu_read_lock and the lock is dropped before
 * icmp_send().
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  Backoff doubles with each redirect already sent.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
945
/* Input handler for packets that hit an error route (rt->dst.error).
 * Sends an ICMP destination-unreachable whose code is derived from the
 * error, rate limited by a token bucket kept in the source host's
 * inet_peer entry, then frees the skb.  Always returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only count the drop, never send ICMP. */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		/* Token bucket: refill with elapsed jiffies, capped at
		 * ip_rt_error_burst; each ICMP costs ip_rt_error_cost.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1012
/* Record a learned path MTU for flow @fl4 as a next-hop exception.
 *
 * Skipped when the route's MTU is locked, the new value would not
 * shrink the current one, or the same value is already cached and not
 * close to expiry.  Values below ip_rt_min_pmtu are clamped and the
 * resulting entry is locked (fnhe_mtu_locked).  The exception lives
 * for ip_rt_mtu_expires jiffies.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	/* Same value already recorded and still in the first half of
	 * its lifetime: nothing to refresh.
	 */
	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh *nh;

		fib_select_path(net, &res, fl4, NULL);
		nh = &FIB_RES_NH(res);
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1047
1048static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1049 struct sk_buff *skb, u32 mtu,
1050 bool confirm_neigh)
1051{
1052 struct rtable *rt = (struct rtable *) dst;
1053 struct flowi4 fl4;
1054
1055 ip_rt_build_flow_key(&fl4, sk, skb);
1056 __ip_rt_update_pmtu(rt, &fl4, mtu);
1057}
1058
1059void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1060 int oif, u32 mark, u8 protocol, int flow_flags)
1061{
1062 const struct iphdr *iph = (const struct iphdr *) skb->data;
1063 struct flowi4 fl4;
1064 struct rtable *rt;
1065
1066 if (!mark)
1067 mark = IP4_REPLY_MARK(net, skb->mark);
1068
1069 __build_flow_key(net, &fl4, NULL, iph, oif,
1070 RT_TOS(iph->tos), protocol, mark, flow_flags);
1071 rt = __ip_route_output_key(net, &fl4);
1072 if (!IS_ERR(rt)) {
1073 __ip_rt_update_pmtu(rt, &fl4, mtu);
1074 ip_rt_put(rt);
1075 }
1076}
1077EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1078
1079static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1080{
1081 const struct iphdr *iph = (const struct iphdr *) skb->data;
1082 struct flowi4 fl4;
1083 struct rtable *rt;
1084
1085 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1086
1087 if (!fl4.flowi4_mark)
1088 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1089
1090 rt = __ip_route_output_key(sock_net(sk), &fl4);
1091 if (!IS_ERR(rt)) {
1092 __ip_rt_update_pmtu(rt, &fl4, mtu);
1093 ip_rt_put(rt);
1094 }
1095}
1096
/* Socket-aware PMTU update.  When the socket lock is owned by user
 * context (or there is no cached route) fall back to a plain
 * flow-based update; otherwise update the socket's cached route in
 * place, re-routing and refreshing sk->sk_dst_cache if the old entry
 * has become obsolete.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);	/* takes a reference; released at out */

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Apply the update to the innermost dst; dst.path can differ
	 * from dst itself — NOTE(review): presumably for xfrm bundles,
	 * confirm against the dst_entry definition.
	 */
	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1150
1151void ipv4_redirect(struct sk_buff *skb, struct net *net,
1152 int oif, u32 mark, u8 protocol, int flow_flags)
1153{
1154 const struct iphdr *iph = (const struct iphdr *) skb->data;
1155 struct flowi4 fl4;
1156 struct rtable *rt;
1157
1158 __build_flow_key(net, &fl4, NULL, iph, oif,
1159 RT_TOS(iph->tos), protocol, mark, flow_flags);
1160 rt = __ip_route_output_key(net, &fl4);
1161 if (!IS_ERR(rt)) {
1162 __ip_do_redirect(rt, skb, &fl4, false);
1163 ip_rt_put(rt);
1164 }
1165}
1166EXPORT_SYMBOL_GPL(ipv4_redirect);
1167
1168void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1169{
1170 const struct iphdr *iph = (const struct iphdr *) skb->data;
1171 struct flowi4 fl4;
1172 struct rtable *rt;
1173 struct net *net = sock_net(sk);
1174
1175 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1176 rt = __ip_route_output_key(net, &fl4);
1177 if (!IS_ERR(rt)) {
1178 __ip_do_redirect(rt, skb, &fl4, false);
1179 ip_rt_put(rt);
1180 }
1181}
1182EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1183
1184static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1185{
1186 struct rtable *rt = (struct rtable *) dst;
1187
1188 /* All IPV4 dsts are created with ->obsolete set to the value
1189 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1190 * into this function always.
1191 *
1192 * When a PMTU/redirect information update invalidates a route,
1193 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1194 * DST_OBSOLETE_DEAD by dst_free().
1195 */
1196 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1197 return NULL;
1198 return dst;
1199}
1200
1201static void ipv4_send_dest_unreach(struct sk_buff *skb)
1202{
1203 struct ip_options opt;
1204 int res;
1205
1206 /* Recompile ip options since IPCB may not be valid anymore.
1207 * Also check we have a reasonable ipv4 header.
1208 */
1209 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1210 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1211 return;
1212
1213 memset(&opt, 0, sizeof(opt));
1214 if (ip_hdr(skb)->ihl > 5) {
1215 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1216 return;
1217 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1218
1219 rcu_read_lock();
1220 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1221 rcu_read_unlock();
1222
1223 if (res)
1224 return;
1225 }
1226 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1227}
1228
/* Handle a link failure for @skb: notify the sender with an ICMP
 * host-unreachable and expire the route attached to the skb, if any.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		/* expire immediately so the route is not reused */
		dst_set_expires(&rt->dst, 0);
}
1239
1240static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1241{
1242 pr_debug("%s: %pI4 -> %pI4, %s\n",
1243 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1244 skb->dev ? skb->dev->name : "?");
1245 kfree_skb(skb);
1246 WARN_ON(1);
1247 return 0;
1248}
1249
1250/*
1251 We do not cache source address of outgoing interface,
1252 because it is used only by IP RR, TS and SRR options,
1253 so that it out of fast path.
1254
1255 BTW remember: "addr" is allowed to be not aligned
1256 in IP options!
1257 */
1258
1259void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1260{
1261 __be32 src;
1262
1263 if (rt_is_output_route(rt))
1264 src = ip_hdr(skb)->saddr;
1265 else {
1266 struct fib_result res;
1267 struct flowi4 fl4;
1268 struct iphdr *iph;
1269
1270 iph = ip_hdr(skb);
1271
1272 memset(&fl4, 0, sizeof(fl4));
1273 fl4.daddr = iph->daddr;
1274 fl4.saddr = iph->saddr;
1275 fl4.flowi4_tos = RT_TOS(iph->tos);
1276 fl4.flowi4_oif = rt->dst.dev->ifindex;
1277 fl4.flowi4_iif = skb->dev->ifindex;
1278 fl4.flowi4_mark = skb->mark;
1279
1280 rcu_read_lock();
1281 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1282 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1283 else
1284 src = inet_select_addr(rt->dst.dev,
1285 rt_nexthop(rt, iph->daddr),
1286 RT_SCOPE_UNIVERSE);
1287 rcu_read_unlock();
1288 }
1289 memcpy(addr, &src, 4);
1290}
1291
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Merge @tag into the route's tclassid: each 16-bit half is taken from
 * @tag only if it is still unset in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 *tclassid = &rt->dst.tclassid;

	if (!(*tclassid & 0xFFFF))
		*tclassid |= tag & 0xFFFF;
	if (!(*tclassid & 0xFFFF0000))
		*tclassid |= tag & 0xFFFF0000;
}
#endif
1301
1302static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1303{
1304 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1305 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1306 ip_rt_min_advmss);
1307
1308 return min(advmss, IPV4_MAX_PMTU - header_size);
1309}
1310
1311static unsigned int ipv4_mtu(const struct dst_entry *dst)
1312{
1313 const struct rtable *rt = (const struct rtable *) dst;
1314 unsigned int mtu = rt->rt_pmtu;
1315
1316 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1317 mtu = dst_metric_raw(dst, RTAX_MTU);
1318
1319 if (mtu)
1320 return mtu;
1321
1322 mtu = READ_ONCE(dst->dev->mtu);
1323
1324 if (unlikely(ip_mtu_locked(dst))) {
1325 if (rt->rt_uses_gateway && mtu > 576)
1326 mtu = 576;
1327 }
1328
1329 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1330
1331 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1332}
1333
/* Remove the exception entry for @daddr from @nh's exception hash.
 *
 * All writers serialize on fnhe_lock; readers traverse the chain under
 * RCU, hence the rcu_assign_pointer() unlink and kfree_rcu() free.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* walk the bucket chain keeping a pointer to the link to patch */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1367
/* Look up the nexthop exception for @daddr on @nh.
 *
 * Called under RCU.  Returns NULL when no valid exception exists; an
 * entry whose fnhe_expires deadline passed is removed on the spot via
 * ip_del_fnhe() and treated as absent.
 */
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				/* stale: delete and report "no exception" */
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
1392
/* Bind route @rt to nexthop exception @fnhe for destination @daddr.
 *
 * Copies the exception's PMTU/gateway data into @rt and, when
 * @do_cache is set, stores @rt in the exception's per-direction cache
 * slot.  Returns true only if @rt was cached.  Runs under fnhe_lock to
 * serialize against ip_del_fnhe() and concurrent binders.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* fnhe_daddr is zeroed by ip_del_fnhe(); a mismatch means the
	 * exception was deleted (or is for another address) — do nothing.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		/* input and output routes use separate cache slots */
		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		/* generation changed: the learned data is stale, reset it */
		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				/* drop the previously cached route */
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1439
/* Store @rt in @nh's route cache (shared input slot, or this CPU's
 * output slot).  Returns false when a concurrent update won the
 * cmpxchg and @rt was not cached.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		/* we replaced orig: drop its references */
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		/* lost the race: undo the speculative hold */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1469
/* Per-CPU list of routes that could not be (or are not) stored in a
 * nexthop cache slot; rt_flush_dev() walks these lists to repoint
 * routes away from a disappearing device.
 */
struct uncached_list {
	spinlock_t lock;		/* protects head */
	struct list_head head;		/* of rtable.rt_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1476
1477static void rt_add_uncached_list(struct rtable *rt)
1478{
1479 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1480
1481 rt->rt_uncached_list = ul;
1482
1483 spin_lock_bh(&ul->lock);
1484 list_add_tail(&rt->rt_uncached, &ul->head);
1485 spin_unlock_bh(&ul->lock);
1486}
1487
1488static void ipv4_dst_destroy(struct dst_entry *dst)
1489{
1490 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1491 struct rtable *rt = (struct rtable *) dst;
1492
1493 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1494 kfree(p);
1495
1496 if (!list_empty(&rt->rt_uncached)) {
1497 struct uncached_list *ul = rt->rt_uncached_list;
1498
1499 spin_lock_bh(&ul->lock);
1500 list_del(&rt->rt_uncached);
1501 spin_unlock_bh(&ul->lock);
1502 }
1503}
1504
1505void rt_flush_dev(struct net_device *dev)
1506{
1507 struct net *net = dev_net(dev);
1508 struct rtable *rt;
1509 int cpu;
1510
1511 for_each_possible_cpu(cpu) {
1512 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1513
1514 spin_lock_bh(&ul->lock);
1515 list_for_each_entry(rt, &ul->head, rt_uncached) {
1516 if (rt->dst.dev != dev)
1517 continue;
1518 rt->dst.dev = net->loopback_dev;
1519 dev_hold(rt->dst.dev);
1520 dev_put(dev);
1521 }
1522 spin_unlock_bh(&ul->lock);
1523 }
1524}
1525
1526static bool rt_cache_valid(const struct rtable *rt)
1527{
1528 return rt &&
1529 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1530 !rt_is_expired(rt);
1531}
1532
/* Finalize @rt from FIB lookup result @res: copy gateway/metrics/
 * classid/lwtstate from the nexthop, then either cache the route (in
 * the exception @fnhe or the nexthop cache) or queue it on the
 * uncached list so device teardown can still find it.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		/* shared (non-default) metrics are refcounted */
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1581
1582struct rtable *rt_dst_alloc(struct net_device *dev,
1583 unsigned int flags, u16 type,
1584 bool nopolicy, bool noxfrm, bool will_cache)
1585{
1586 struct rtable *rt;
1587
1588 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1589 (will_cache ? 0 : DST_HOST) |
1590 (nopolicy ? DST_NOPOLICY : 0) |
1591 (noxfrm ? DST_NOXFRM : 0));
1592
1593 if (rt) {
1594 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1595 rt->rt_flags = flags;
1596 rt->rt_type = type;
1597 rt->rt_is_input = 0;
1598 rt->rt_iif = 0;
1599 rt->rt_pmtu = 0;
1600 rt->rt_mtu_locked = 0;
1601 rt->rt_gateway = 0;
1602 rt->rt_uses_gateway = 0;
1603 rt->rt_table_id = 0;
1604 INIT_LIST_HEAD(&rt->rt_uncached);
1605
1606 rt->dst.output = ip_output;
1607 if (flags & RTCF_LOCAL)
1608 rt->dst.input = ip_local_deliver;
1609 }
1610
1611 return rt;
1612}
1613EXPORT_SYMBOL(rt_dst_alloc);
1614
1615/* called in rcu_read_lock() section */
1616int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1617 u8 tos, struct net_device *dev,
1618 struct in_device *in_dev, u32 *itag)
1619{
1620 int err;
1621
1622 /* Primary sanity checks. */
1623 if (!in_dev)
1624 return -EINVAL;
1625
1626 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1627 skb->protocol != htons(ETH_P_IP))
1628 return -EINVAL;
1629
1630 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1631 return -EINVAL;
1632
1633 if (ipv4_is_zeronet(saddr)) {
1634 if (!ipv4_is_local_multicast(daddr))
1635 return -EINVAL;
1636 } else {
1637 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1638 in_dev, itag);
1639 if (err < 0)
1640 return err;
1641 }
1642 return 0;
1643}
1644
1645/* called in rcu_read_lock() section */
1646static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1647 u8 tos, struct net_device *dev, int our)
1648{
1649 struct in_device *in_dev = __in_dev_get_rcu(dev);
1650 unsigned int flags = RTCF_MULTICAST;
1651 struct rtable *rth;
1652 u32 itag = 0;
1653 int err;
1654
1655 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1656 if (err)
1657 return err;
1658
1659 if (our)
1660 flags |= RTCF_LOCAL;
1661
1662 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1663 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1664 if (!rth)
1665 return -ENOBUFS;
1666
1667#ifdef CONFIG_IP_ROUTE_CLASSID
1668 rth->dst.tclassid = itag;
1669#endif
1670 rth->dst.output = ip_rt_bug;
1671 rth->rt_is_input= 1;
1672
1673#ifdef CONFIG_IP_MROUTE
1674 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1675 rth->dst.input = ip_mr_input;
1676#endif
1677 RT_CACHE_STAT_INC(in_slow_mc);
1678
1679 skb_dst_set(skb, &rth->dst);
1680 return 0;
1681}
1682
1683
1684static void ip_handle_martian_source(struct net_device *dev,
1685 struct in_device *in_dev,
1686 struct sk_buff *skb,
1687 __be32 daddr,
1688 __be32 saddr)
1689{
1690 RT_CACHE_STAT_INC(in_martian_src);
1691#ifdef CONFIG_IP_ROUTE_VERBOSE
1692 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1693 /*
1694 * RFC1812 recommendation, if source is martian,
1695 * the only hint is MAC header.
1696 */
1697 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1698 &daddr, &saddr, dev->name);
1699 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1700 print_hex_dump(KERN_WARNING, "ll header: ",
1701 DUMP_PREFIX_OFFSET, 16, 1,
1702 skb_mac_header(skb),
1703 dev->hard_header_len, true);
1704 }
1705 }
1706#endif
1707}
1708
1709static void set_lwt_redirect(struct rtable *rth)
1710{
1711 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1712 rth->dst.lwtstate->orig_output = rth->dst.output;
1713 rth->dst.output = lwtunnel_output;
1714 }
1715
1716 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1717 rth->dst.lwtstate->orig_input = rth->dst.input;
1718 rth->dst.input = lwtunnel_input;
1719 }
1720}
1721
/* called in rcu_read_lock() section */
/* Build (or reuse from cache) the forwarding route for an input packet
 * whose FIB lookup produced @res.  On success the dst is attached to
 * @skb and 0 is returned; negative errno otherwise.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	/* packet goes back out the interface it came in on: candidate
	 * for an ICMP redirect (err > 0 here means fib_validate_source
	 * signalled a non-fatal condition)
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* fast path: reuse a still-valid cached route if we may cache */
	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	set_lwt_redirect(rth);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1809
1810#ifdef CONFIG_IP_ROUTE_MULTIPATH
1811/* To make ICMP packets follow the right flow, the multipath hash is
1812 * calculated from the inner IP addresses.
1813 */
1814static void ip_multipath_l3_keys(const struct sk_buff *skb,
1815 struct flow_keys *hash_keys)
1816{
1817 const struct iphdr *outer_iph = ip_hdr(skb);
1818 const struct iphdr *inner_iph;
1819 const struct icmphdr *icmph;
1820 struct iphdr _inner_iph;
1821 struct icmphdr _icmph;
1822
1823 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1824 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1825 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1826 return;
1827
1828 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1829 return;
1830
1831 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1832 &_icmph);
1833 if (!icmph)
1834 return;
1835
1836 if (icmph->type != ICMP_DEST_UNREACH &&
1837 icmph->type != ICMP_REDIRECT &&
1838 icmph->type != ICMP_TIME_EXCEEDED &&
1839 icmph->type != ICMP_PARAMETERPROB)
1840 return;
1841
1842 inner_iph = skb_header_pointer(skb,
1843 outer_iph->ihl * 4 + sizeof(_icmph),
1844 sizeof(_inner_iph), &_inner_iph);
1845 if (!inner_iph)
1846 return;
1847 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1848 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1849}
1850
/* Compute the multipath hash used to select a nexthop.
 * If skb is set it will be used and fl4 can be NULL.
 * Policy 0 hashes L3 addresses only; policy 1 adds L4 ports/protocol.
 * The top bit is reserved, hence the final >> 1.
 */
int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
		       const struct sk_buff *skb)
{
	struct net *net = fi->fib_net;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		/* L3 only */
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* L3 + L4 */
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;
			memset(&hash_keys, 0, sizeof(hash_keys));
			skb_flow_dissect_flow_keys(skb, &keys, flag);

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			hash_keys.ports.src = keys.ports.src;
			hash_keys.ports.dst = keys.ports.dst;
			hash_keys.basic.ip_proto = keys.basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
EXPORT_SYMBOL_GPL(fib_multipath_hash);
1904#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1905
1906static int ip_mkroute_input(struct sk_buff *skb,
1907 struct fib_result *res,
1908 struct in_device *in_dev,
1909 __be32 daddr, __be32 saddr, u32 tos)
1910{
1911#ifdef CONFIG_IP_ROUTE_MULTIPATH
1912 if (res->fi && res->fi->fib_nhs > 1) {
1913 int h = fib_multipath_hash(res->fi, NULL, skb);
1914
1915 fib_select_multipath(res, h);
1916 }
1917#endif
1918
1919 /* create a routing cache entry */
1920 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1921}
1922
1923/*
1924 * NOTE. We drop all the packets that has local source
1925 * addresses, because every properly looped back packet
1926 * must have correct destination already attached by output routine.
1927 *
1928 * Such approach solves two big problems:
1929 * 1. Not simplex devices are handled properly.
1930 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1931 * called with rcu_read_lock()
1932 */
1933
/* Slow-path input routing: classify @skb's addresses (martian checks),
 * run the FIB lookup, and attach the proper dst — forwarding, local
 * delivery, broadcast, or an error route.  Returns 0 or negative errno.
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complains :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and call it once if daddr or/and saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST)
		goto brd_input;

	if (res->type == RTN_LOCAL) {
		/* destined for us: verify the source before delivering */
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* try the per-nexthop input route cache first */
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2128
2129int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2130 u8 tos, struct net_device *dev)
2131{
2132 struct fib_result res;
2133 int err;
2134
2135 tos &= IPTOS_RT_MASK;
2136 rcu_read_lock();
2137 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2138 rcu_read_unlock();
2139
2140 return err;
2141}
2142EXPORT_SYMBOL(ip_route_input_noref);
2143
/* called with rcu_read_lock held */
/* Dispatch input routing: multicast destinations go through the
 * membership check and ip_route_input_mc(); everything else falls
 * through to ip_route_input_slow().
 */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (!in_dev)
			return err;
		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
2194
/* called with rcu_read_lock() */
/* Build the output route for lookup result @res / flow @fl4 on
 * @dev_out, reusing a cached route (per-CPU nexthop slot or exception
 * slot) when possible.  Returns the route or an ERR_PTR.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* loopback sources may only leave via loopback/l3-master unless
	 * route_localnet is enabled on the device
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			/* KNOWN_NH flows without a link-scope gateway must
			 * not share the per-CPU cached route
			 */
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	set_lwt_redirect(rth);

	return rth;
}
2321
2322/*
2323 * Major route resolver routine.
2324 */
2325
2326struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2327 const struct sk_buff *skb)
2328{
2329 __u8 tos = RT_FL_TOS(fl4);
2330 struct fib_result res = {
2331 .type = RTN_UNSPEC,
2332 .fi = NULL,
2333 .table = NULL,
2334 .tclassid = 0,
2335 };
2336 struct rtable *rth;
2337
2338 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2339 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2340 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2341 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2342
2343 rcu_read_lock();
2344 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2345 rcu_read_unlock();
2346
2347 return rth;
2348}
2349EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2350
/* Output route resolver proper.  Validates the flow's source address,
 * selects the output device (honoring flowi4_oif), picks a source
 * address when none was supplied, performs the FIB lookup, and finally
 * builds the rtable via __mkroute_output().
 *
 * Must be called under rcu_read_lock().  Returns the new rtable or an
 * ERR_PTR() on failure; *res is filled in for the caller.
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err;

	if (fl4->saddr) {
		/* The source address must be unicast and non-zero. */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr)) {
			rth = ERR_PTR(-EINVAL);
			goto out;
		}

		rth = ERR_PTR(-ENETUNREACH);

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		/* Caller pinned the output interface: it must exist, be up
		 * and have IPv4 configured.
		 */
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: route to ourselves via loopback. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
2521
/* Blackhole dsts are never revalidated: always report "stale" (NULL). */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2526
2527static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2528{
2529 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2530
2531 return mtu ? : dst->dev->mtu;
2532}
2533
/* PMTU updates are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu,
					  bool confirm_neigh)
{
}
2539
/* Redirects are deliberately ignored on blackhole routes. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2544
/* Blackhole routes never get writable metrics; report failure (NULL). */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2550
/* dst_ops for blackhole routes created by ipv4_blackhole_route(): the
 * no-op/NULL handlers above, plus the regular advmss/neigh helpers.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2561
/* Build a blackhole replacement for @dst_orig: a dst that discards all
 * input and output but preserves the identifying fields of the original
 * route (flags, type, gateway, pmtu, ...).  Consumes the caller's
 * reference on @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions silently drop traffic. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the fields that identify the original route. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	/* Drop the reference we were handed on the original. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2597
2598struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2599 const struct sock *sk)
2600{
2601 struct rtable *rt = __ip_route_output_key(net, flp4);
2602
2603 if (IS_ERR(rt))
2604 return rt;
2605
2606 if (flp4->flowi4_proto)
2607 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2608 flowi4_to_flowi(flp4),
2609 sk, 0);
2610
2611 return rt;
2612}
2613EXPORT_SYMBOL_GPL(ip_route_output_flow);
2614
/* Fill a RTM_NEWROUTE netlink message describing the route attached to
 * @skb, for the lookup (@dst, @src) resolved into @fl4.
 *
 * Called with rcu_read_lock held.  Returns 0 on success or -EMSGSIZE
 * when the attributes do not fit into the message.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	/* rtm_table is only a u8; larger ids go in the RTA_TABLE attr. */
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry to a remaining-time delta. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	/* Overlay the learned PMTU on the metrics while it is valid. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		/* Forwarded multicast: defer to the multicast router state;
		 * ipmr_get_route() may complete the message itself (0).
		 */
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2730
/* RTM_GETROUTE handler: resolve the route described by the netlink
 * request (input route when RTA_IIF is given, output route otherwise)
 * and unicast the answer back to the requester.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct fib_result res = {};
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
			  extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	/* Bugfix: need to give ip_route_input enough of an IP header to
	 * not gag.
	 */
	ip_hdr(skb)->protocol = IPPROTO_UDP;
	ip_hdr(skb)->saddr = src;
	ip_hdr(skb)->daddr = dst;

	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	rcu_read_lock();

	if (iif) {
		/* Input route: run the packet through the input path as if
		 * it had arrived on the given interface.
		 */
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		/* On success the input path attached the rtable to skb. */
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_free;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		/* Report the matching FIB entry instead of the dst. */
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_free;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	}
	if (err < 0)
		goto errout_free;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout;
}
2862
/* Multicast configuration changed on @in_dev: flush the netns route cache. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2867
2868#ifdef CONFIG_SYSCTL
/* Defaults backing the garbage-collection and PMTU sysctls below. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2873
2874static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2875 void __user *buffer,
2876 size_t *lenp, loff_t *ppos)
2877{
2878 struct net *net = (struct net *)__ctl->extra1;
2879
2880 if (write) {
2881 rt_cache_flush(net);
2882 fnhe_genid_bump(net);
2883 return 0;
2884 }
2885
2886 return -EINVAL;
2887}
2888
/* Static (non-per-netns) route sysctls under net/ipv4/route/,
 * registered by ip_static_sysctl_init().
 */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Clamped from below so PMTU cannot drop under IPV4_MIN_MTU. */
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
3000
/* Template for the per-netns write-only "flush" sysctl; extra1 is set
 * to the owning netns in sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3010
/* Per-netns init: register the route "flush" sysctl.  init_net uses the
 * static template directly; other namespaces get a private copy so each
 * can carry its own ->extra1 (the owning netns).
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	/* The flush handler recovers the netns from extra1. */
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
3038
/* Per-netns teardown: unregister the flush sysctl and free the private
 * table copy (init_net's static table must never reach kfree()).
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}
3048
/* pernet hooks for the per-netns route sysctls above. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
3053#endif
3054
/* Per-netns init of the route/fnhe generation counters; dev_addr_genid
 * is randomized so it differs across namespaces/boots.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}
3062
/* pernet hook for the generation-counter init above (no exit needed). */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3066
/* Per-netns init: allocate and initialize the IPv4 inetpeer base. */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
3077
/* Per-netns teardown: detach, invalidate and free the inetpeer base. */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	/* Clear the pointer before tearing the tree down. */
	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
3086
/* pernet hooks for the inetpeer base lifetime above. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};
3091
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu route classification accounting table (allocated in ip_rt_init). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3095
3096int __init ip_rt_init(void)
3097{
3098 int rc = 0;
3099 int cpu;
3100
3101 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3102 if (!ip_idents)
3103 panic("IP: failed to allocate ip_idents\n");
3104
3105 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3106
3107 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3108 if (!ip_tstamps)
3109 panic("IP: failed to allocate ip_tstamps\n");
3110
3111 for_each_possible_cpu(cpu) {
3112 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3113
3114 INIT_LIST_HEAD(&ul->head);
3115 spin_lock_init(&ul->lock);
3116 }
3117#ifdef CONFIG_IP_ROUTE_CLASSID
3118 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3119 if (!ip_rt_acct)
3120 panic("IP: failed to allocate ip_rt_acct\n");
3121#endif
3122
3123 ipv4_dst_ops.kmem_cachep =
3124 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3125 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3126
3127 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3128
3129 if (dst_entries_init(&ipv4_dst_ops) < 0)
3130 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3131
3132 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3133 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3134
3135 ipv4_dst_ops.gc_thresh = ~0;
3136 ip_rt_max_size = INT_MAX;
3137
3138 devinet_init();
3139 ip_fib_init();
3140
3141 if (ip_rt_proc_init())
3142 pr_err("Unable to create route proc files\n");
3143#ifdef CONFIG_XFRM
3144 xfrm_init();
3145 xfrm4_init();
3146#endif
3147 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3148 RTNL_FLAG_DOIT_UNLOCKED);
3149
3150#ifdef CONFIG_SYSCTL
3151 register_pernet_subsys(&sysctl_route_ops);
3152#endif
3153 register_pernet_subsys(&rt_genid_ops);
3154 register_pernet_subsys(&ipv4_inetpeer_ops);
3155 return rc;
3156}
3157
3158#ifdef CONFIG_SYSCTL
3159/*
3160 * We really need to sanitize the damn ipv4 init order, then all
3161 * this nonsense will go away.
3162 */
void __init ip_static_sysctl_init(void)
{
	/* Register the static (non-per-netns) net/ipv4/route sysctls early. */
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
3167#endif