Blame - ap/os/linux/linux-3.4.x/net/ipv4/route.c - T106_DC

blob: 29e211daf255232195e0900b3e556c0ecf7c04c6 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* ROUTE - implementation of the IP router.
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				11	* Linus Torvalds, <Linus.Torvalds@helsinki.fi>
				12	* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
				13	*
				14	* Fixes:
				15	* Alan Cox : Verify area fixes.
				16	* Alan Cox : cli() protects routing changes
				17	* Rui Oliveira : ICMP routing table updates
				18	* (rco@di.uminho.pt) Routing table insertion and update
				19	* Linus Torvalds : Rewrote bits to be sensible
				20	* Alan Cox : Added BSD route gw semantics
				21	* Alan Cox : Super /proc >4K
				22	* Alan Cox : MTU in route table
				23	* Alan Cox : MSS actually. Also added the window
				24	* clamper.
				25	* Sam Lantinga : Fixed route matching in rt_del()
				26	* Alan Cox : Routing cache support.
				27	* Alan Cox : Removed compatibility cruft.
				28	* Alan Cox : RTF_REJECT support.
				29	* Alan Cox : TCP irtt support.
				30	* Jonathan Naylor : Added Metric support.
				31	* Miquel van Smoorenburg : BSD API fixes.
				32	* Miquel van Smoorenburg : Metrics.
				33	* Alan Cox : Use __u32 properly
				34	* Alan Cox : Aligned routing errors more closely with BSD
				35	* our system is still very different.
				36	* Alan Cox : Faster /proc handling
				37	* Alexey Kuznetsov : Massive rework to support tree based routing,
				38	* routing caches and better behaviour.
				39	*
				40	* Olaf Erb : irtt wasn't being copied right.
				41	* Bjorn Ekwall : Kerneld route support.
				42	* Alan Cox : Multicast fixed (I hope)
				43	* Pavel Krauz : Limited broadcast fixed
				44	* Mike McLagan : Routing by source
				45	* Alexey Kuznetsov : End of old history. Split to fib.c and
				46	* route.c and rewritten from scratch.
				47	* Andi Kleen : Load-limit warning messages.
				48	* Vitaly E. Lavrov : Transparent proxy revived after year coma.
				49	* Vitaly E. Lavrov : Race condition in ip_route_input_slow.
				50	* Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
				51	* Vladimir V. Ivanov : IP rule info (flowid) is really useful.
				52	* Marc Boucher : routing by fwmark
				53	* Robert Olsson : Added rt_cache statistics
				54	* Arnaldo C. Melo : Convert proc stuff to seq_file
				55	* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
				56	* Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
				57	* Ilia Sotnikov : Removed TOS from hash calculations
				58	*
				59	* This program is free software; you can redistribute it and/or
				60	* modify it under the terms of the GNU General Public License
				61	* as published by the Free Software Foundation; either version
				62	* 2 of the License, or (at your option) any later version.
				63	*/
				64
				65	#define pr_fmt(fmt) "IPv4: " fmt
				66
				67	#include <linux/module.h>
				68	#include <asm/uaccess.h>
				69	#include <linux/bitops.h>
				70	#include <linux/types.h>
				71	#include <linux/kernel.h>
				72	#include <linux/mm.h>
				73	#include <linux/bootmem.h>
				74	#include <linux/string.h>
				75	#include <linux/socket.h>
				76	#include <linux/sockios.h>
				77	#include <linux/errno.h>
				78	#include <linux/in.h>
				79	#include <linux/inet.h>
				80	#include <linux/netdevice.h>
				81	#include <linux/proc_fs.h>
				82	#include <linux/init.h>
				83	#include <linux/workqueue.h>
				84	#include <linux/skbuff.h>
				85	#include <linux/inetdevice.h>
				86	#include <linux/igmp.h>
				87	#include <linux/pkt_sched.h>
				88	#include <linux/mroute.h>
				89	#include <linux/netfilter_ipv4.h>
				90	#include <linux/random.h>
				91	#include <linux/jhash.h>
				92	#include <linux/rcupdate.h>
				93	#include <linux/times.h>
				94	#include <linux/slab.h>
				95	#include <linux/prefetch.h>
				96	#include <net/dst.h>
				97	#include <net/net_namespace.h>
				98	#include <net/protocol.h>
				99	#include <net/ip.h>
				100	#include <net/route.h>
				101	#include <net/inetpeer.h>
				102	#include <net/sock.h>
				103	#include <net/ip_fib.h>
				104	#include <net/arp.h>
				105	#include <net/tcp.h>
				106	#include <net/icmp.h>
				107	#include <net/xfrm.h>
				108	#include <net/netevent.h>
				109	#include <net/rtnetlink.h>
				110	#ifdef CONFIG_SYSCTL
				111	#include <linux/sysctl.h>
				112	#endif
				113	#include <net/secure_seq.h>
				114	#include <net/SI/print_sun.h>
				115
				116
				117	#define RT_FL_TOS(oldflp4) \
				118	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK \| RTO_ONLINK))
				119
				120	#define IP_MAX_MTU 0xFFF0
				121
				122	#define RT_GC_TIMEOUT (300*HZ)
				123
				124	static int ip_rt_max_size;
				125	static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
				126	static int ip_rt_gc_interval __read_mostly = 60 * HZ;
				127	static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
				128	static int ip_rt_redirect_number __read_mostly = 9;
				129	static int ip_rt_redirect_load __read_mostly = HZ / 50;
				130	static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
				131	static int ip_rt_error_cost __read_mostly = HZ;
				132	static int ip_rt_error_burst __read_mostly = 5 * HZ;
				133	static int ip_rt_gc_elasticity __read_mostly = 8;
				134	static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
				135	static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
				136	static int ip_rt_min_advmss __read_mostly = 256;
				137	static int rt_chain_length_max __read_mostly = 20;
				138
				139	static struct delayed_work expires_work;
				140	static unsigned long expires_ljiffies;
				141
				142	/*
				143	* Interface to generic destination cache.
				144	*/
				145
				146	static struct dst_entry ipv4_dst_check(struct dst_entry dst, u32 cookie);
				147	static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
				148	static unsigned int ipv4_mtu(const struct dst_entry *dst);
				149	static void ipv4_dst_destroy(struct dst_entry *dst);
				150	static struct dst_entry ipv4_negative_advice(struct dst_entry dst);
				151	static void ipv4_link_failure(struct sk_buff *skb);
				152	static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
				153	static int rt_garbage_collect(struct dst_ops *ops);
				154
				155	static void __rt_garbage_collect(struct work_struct *w);
				156	static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
				157
				158	static void ipv4_dst_ifdown(struct dst_entry dst, struct net_device dev,
				159	int how)
				160	{
				161	}
				162
				163	static u32 ipv4_cow_metrics(struct dst_entry dst, unsigned long old)
				164	{
				165	struct rtable rt = (struct rtable ) dst;
				166	struct inet_peer *peer;
				167	u32 *p = NULL;
				168
				169	if (!rt->peer)
				170	rt_bind_peer(rt, rt->rt_dst, 1);
				171
				172	peer = rt->peer;
				173	if (peer) {
				174	u32 *old_p = __DST_METRICS_PTR(old);
				175	unsigned long prev, new;
				176
				177	p = peer->metrics;
				178	if (inet_metrics_new(peer))
				179	memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
				180
				181	new = (unsigned long) p;
				182	prev = cmpxchg(&dst->_metrics, old, new);
				183
				184	if (prev != old) {
				185	p = __DST_METRICS_PTR(prev);
				186	if (prev & DST_METRICS_READ_ONLY)
				187	p = NULL;
				188	} else {
				189	if (rt->fi) {
				190	fib_info_put(rt->fi);
				191	rt->fi = NULL;
				192	}
				193	}
				194	}
				195	return p;
				196	}
				197
				198	static struct neighbour ipv4_neigh_lookup(const struct dst_entry dst, const void *daddr);
				199
				200	static struct dst_ops ipv4_dst_ops = {
				201	.family = AF_INET,
				202	.protocol = cpu_to_be16(ETH_P_IP),
				203	.gc = rt_garbage_collect,
				204	.check = ipv4_dst_check,
				205	.default_advmss = ipv4_default_advmss,
				206	.mtu = ipv4_mtu,
				207	.cow_metrics = ipv4_cow_metrics,
				208	.destroy = ipv4_dst_destroy,
				209	.ifdown = ipv4_dst_ifdown,
				210	.negative_advice = ipv4_negative_advice,
				211	.link_failure = ipv4_link_failure,
				212	.update_pmtu = ip_rt_update_pmtu,
				213	.local_out = __ip_local_out,
				214	.neigh_lookup = ipv4_neigh_lookup,
				215	};
				216
				217	#define ECN_OR_COST(class) TC_PRIO_##class
				218
				219	const __u8 ip_tos2prio[16] = {
				220	TC_PRIO_BESTEFFORT,
				221	ECN_OR_COST(BESTEFFORT),
				222	TC_PRIO_BESTEFFORT,
				223	ECN_OR_COST(BESTEFFORT),
				224	TC_PRIO_BULK,
				225	ECN_OR_COST(BULK),
				226	TC_PRIO_BULK,
				227	ECN_OR_COST(BULK),
				228	TC_PRIO_INTERACTIVE,
				229	ECN_OR_COST(INTERACTIVE),
				230	TC_PRIO_INTERACTIVE,
				231	ECN_OR_COST(INTERACTIVE),
				232	TC_PRIO_INTERACTIVE_BULK,
				233	ECN_OR_COST(INTERACTIVE_BULK),
				234	TC_PRIO_INTERACTIVE_BULK,
				235	ECN_OR_COST(INTERACTIVE_BULK)
				236	};
				237
				238
				239	/*
				240	* Route cache.
				241	*/
				242
				243	/* The locking scheme is rather straight forward:
				244	*
				245	* 1) Read-Copy Update protects the buckets of the central route hash.
				246	* 2) Only writers remove entries, and they hold the lock
				247	* as they look at rtable reference counts.
				248	* 3) Only readers acquire references to rtable entries,
				249	* they do so with atomic increments and with the
				250	* lock held.
				251	*/
				252
				253	struct rt_hash_bucket {
				254	struct rtable __rcu *chain;
				255	};
				256
				257	#if defined(CONFIG_SMP) \|\| defined(CONFIG_DEBUG_SPINLOCK) \|\| \
				258	defined(CONFIG_PROVE_LOCKING) \|\| defined(CONFIG_PREEMPT_RT_FULL)
				259	/*
				260	* Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
				261	* The size of this table is a power of two and depends on the number of CPUS.
				262	* (on lockdep we have a quite big spinlock_t, so keep the size down there)
				263	*/
				264	#ifdef CONFIG_LOCKDEP
				265	# define RT_HASH_LOCK_SZ 256
				266	#else
				267	# if NR_CPUS >= 32
				268	# define RT_HASH_LOCK_SZ 4096
				269	# elif NR_CPUS >= 16
				270	# define RT_HASH_LOCK_SZ 2048
				271	# elif NR_CPUS >= 8
				272	# define RT_HASH_LOCK_SZ 1024
				273	# elif NR_CPUS >= 4
				274	# define RT_HASH_LOCK_SZ 512
				275	# else
				276	# define RT_HASH_LOCK_SZ 256
				277	# endif
				278	#endif
				279
				280	static spinlock_t *rt_hash_locks;
				281	# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
				282
				283	static __init void rt_hash_lock_init(void)
				284	{
				285	int i;
				286
				287	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				288	GFP_KERNEL);
				289	if (!rt_hash_locks)
				290	panic("IP: failed to allocate rt_hash_locks\n");
				291
				292	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
				293	spin_lock_init(&rt_hash_locks[i]);
				294	}
				295	#else
				296	# define rt_hash_lock_addr(slot) NULL
				297
				298	static inline void rt_hash_lock_init(void)
				299	{
				300	}
				301	#endif
				302
				303	static struct rt_hash_bucket *rt_hash_table __read_mostly;
				304	static unsigned rt_hash_mask __read_mostly;
				305	static unsigned int rt_hash_log __read_mostly;
				306
				307	static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
				308	#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
				309
				310	static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				311	int genid)
				312	{
				313	return jhash_3words((__force u32)daddr, (__force u32)saddr,
				314	idx, genid)
				315	& rt_hash_mask;
				316	}
				317
				318	static inline int rt_genid(struct net *net)
				319	{
				320	return atomic_read(&net->ipv4.rt_genid);
				321	}
				322
				323	#ifdef CONFIG_PROC_FS
				324	struct rt_cache_iter_state {
				325	struct seq_net_private p;
				326	int bucket;
				327	int genid;
				328	};
				329
				330	static struct rtable rt_cache_get_first(struct seq_file seq)
				331	{
				332	struct rt_cache_iter_state *st = seq->private;
				333	struct rtable *r = NULL;
				334
				335	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
				336	if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
				337	continue;
				338	rcu_read_lock_bh();
				339	r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
				340	while (r) {
				341	if (dev_net(r->dst.dev) == seq_file_net(seq) &&
				342	r->rt_genid == st->genid)
				343	return r;
				344	r = rcu_dereference_bh(r->dst.rt_next);
				345	}
				346	rcu_read_unlock_bh();
				347	}
				348	net_run_track(PRT_ROUTE," route");
				349	return r;
				350	}
				351
				352	static struct rtable __rt_cache_get_next(struct seq_file seq,
				353	struct rtable *r)
				354	{
				355	struct rt_cache_iter_state *st = seq->private;
				356
				357	r = rcu_dereference_bh(r->dst.rt_next);
				358	while (!r) {
				359	rcu_read_unlock_bh();
				360	do {
				361	if (--st->bucket < 0)
				362	return NULL;
				363	} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
				364	rcu_read_lock_bh();
				365	r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
				366	}
				367	return r;
				368	}
				369
				370	static struct rtable rt_cache_get_next(struct seq_file seq,
				371	struct rtable *r)
				372	{
				373	struct rt_cache_iter_state *st = seq->private;
				374	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
				375	if (dev_net(r->dst.dev) != seq_file_net(seq))
				376	continue;
				377	if (r->rt_genid == st->genid)
				378	break;
				379	}
				380	return r;
				381	}
				382
				383	static struct rtable rt_cache_get_idx(struct seq_file seq, loff_t pos)
				384	{
				385	struct rtable *r = rt_cache_get_first(seq);
				386
				387	if (r)
				388	while (pos && (r = rt_cache_get_next(seq, r)))
				389	--pos;
				390	return pos ? NULL : r;
				391	}
				392
				393	static void rt_cache_seq_start(struct seq_file seq, loff_t *pos)
				394	{
				395	struct rt_cache_iter_state *st = seq->private;
				396	if (*pos)
				397	return rt_cache_get_idx(seq, *pos - 1);
				398	st->genid = rt_genid(seq_file_net(seq));
				399	return SEQ_START_TOKEN;
				400	}
				401
				402	static void rt_cache_seq_next(struct seq_file seq, void v, loff_t pos)
				403	{
				404	struct rtable *r;
				405
				406	if (v == SEQ_START_TOKEN)
				407	r = rt_cache_get_first(seq);
				408	else
				409	r = rt_cache_get_next(seq, v);
				410	++*pos;
				411	return r;
				412	}
				413
				414	static void rt_cache_seq_stop(struct seq_file seq, void v)
				415	{
				416	if (v && v != SEQ_START_TOKEN)
				417	rcu_read_unlock_bh();
				418	}
				419
				420	static int rt_cache_seq_show(struct seq_file seq, void v)
				421	{
				422	if (v == SEQ_START_TOKEN)
				423	seq_printf(seq, "%-127s\n",
				424	"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
				425	"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
				426	"HHUptod\tSpecDst");
				427	else {
				428	struct rtable *r = v;
				429	struct neighbour *n;
				430	int len, HHUptod;
				431
				432	rcu_read_lock();
				433	n = dst_get_neighbour_noref(&r->dst);
				434	HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
				435	rcu_read_unlock();
				436
				437	seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
				438	"%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
				439	r->dst.dev ? r->dst.dev->name : "*",
				440	(__force u32)r->rt_dst,
				441	(__force u32)r->rt_gateway,
				442	r->rt_flags, atomic_read(&r->dst.__refcnt),
				443	r->dst.__use, 0, (__force u32)r->rt_src,
				444	dst_metric_advmss(&r->dst) + 40,
				445	dst_metric(&r->dst, RTAX_WINDOW),
				446	(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				447	dst_metric(&r->dst, RTAX_RTTVAR)),
				448	r->rt_key_tos,
				449	-1,
				450	HHUptod,
				451	r->rt_spec_dst, &len);
				452
				453	seq_printf(seq, "%*s\n", 127 - len, "");
				454	}
				455	return 0;
				456	}
				457
				458	static const struct seq_operations rt_cache_seq_ops = {
				459	.start = rt_cache_seq_start,
				460	.next = rt_cache_seq_next,
				461	.stop = rt_cache_seq_stop,
				462	.show = rt_cache_seq_show,
				463	};
				464
				465	static int rt_cache_seq_open(struct inode inode, struct file file)
				466	{
				467	return seq_open_net(inode, file, &rt_cache_seq_ops,
				468	sizeof(struct rt_cache_iter_state));
				469	}
				470
				471	static const struct file_operations rt_cache_seq_fops = {
				472	.owner = THIS_MODULE,
				473	.open = rt_cache_seq_open,
				474	.read = seq_read,
				475	.llseek = seq_lseek,
				476	.release = seq_release_net,
				477	};
				478
				479
				480	static void rt_cpu_seq_start(struct seq_file seq, loff_t *pos)
				481	{
				482	int cpu;
				483
				484	if (*pos == 0)
				485	return SEQ_START_TOKEN;
				486
				487	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
				488	if (!cpu_possible(cpu))
				489	continue;
				490	*pos = cpu+1;
				491	return &per_cpu(rt_cache_stat, cpu);
				492	}
				493	return NULL;
				494	}
				495
				496	static void rt_cpu_seq_next(struct seq_file seq, void v, loff_t pos)
				497	{
				498	int cpu;
				499
				500	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
				501	if (!cpu_possible(cpu))
				502	continue;
				503	*pos = cpu+1;
				504	return &per_cpu(rt_cache_stat, cpu);
				505	}
				506	return NULL;
				507
				508	}
				509
				510	static void rt_cpu_seq_stop(struct seq_file seq, void v)
				511	{
				512
				513	}
				514
				515	static int rt_cpu_seq_show(struct seq_file seq, void v)
				516	{
				517	struct rt_cache_stat *st = v;
				518
				519	if (v == SEQ_START_TOKEN) {
				520	seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
				521	return 0;
				522	}
				523
				524	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
				525	" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
				526	dst_entries_get_slow(&ipv4_dst_ops),
				527	st->in_hit,
				528	st->in_slow_tot,
				529	st->in_slow_mc,
				530	st->in_no_route,
				531	st->in_brd,
				532	st->in_martian_dst,
				533	st->in_martian_src,
				534
				535	st->out_hit,
				536	st->out_slow_tot,
				537	st->out_slow_mc,
				538
				539	st->gc_total,
				540	st->gc_ignored,
				541	st->gc_goal_miss,
				542	st->gc_dst_overflow,
				543	st->in_hlist_search,
				544	st->out_hlist_search
				545	);
				546	return 0;
				547	}
				548
				549	static const struct seq_operations rt_cpu_seq_ops = {
				550	.start = rt_cpu_seq_start,
				551	.next = rt_cpu_seq_next,
				552	.stop = rt_cpu_seq_stop,
				553	.show = rt_cpu_seq_show,
				554	};
				555
				556
				557	static int rt_cpu_seq_open(struct inode inode, struct file file)
				558	{
				559	return seq_open(file, &rt_cpu_seq_ops);
				560	}
				561
				562	static const struct file_operations rt_cpu_seq_fops = {
				563	.owner = THIS_MODULE,
				564	.open = rt_cpu_seq_open,
				565	.read = seq_read,
				566	.llseek = seq_lseek,
				567	.release = seq_release,
				568	};
				569
				570	#ifdef CONFIG_IP_ROUTE_CLASSID
				571	static int rt_acct_proc_show(struct seq_file m, void v)
				572	{
				573	struct ip_rt_acct dst, src;
				574	unsigned int i, j;
				575
				576	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
				577	if (!dst)
				578	return -ENOMEM;
				579
				580	for_each_possible_cpu(i) {
				581	src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
				582	for (j = 0; j < 256; j++) {
				583	dst[j].o_bytes += src[j].o_bytes;
				584	dst[j].o_packets += src[j].o_packets;
				585	dst[j].i_bytes += src[j].i_bytes;
				586	dst[j].i_packets += src[j].i_packets;
				587	}
				588	}
				589
				590	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
				591	kfree(dst);
				592	return 0;
				593	}
				594
				595	static int rt_acct_proc_open(struct inode inode, struct file file)
				596	{
				597	return single_open(file, rt_acct_proc_show, NULL);
				598	}
				599
				600	static const struct file_operations rt_acct_proc_fops = {
				601	.owner = THIS_MODULE,
				602	.open = rt_acct_proc_open,
				603	.read = seq_read,
				604	.llseek = seq_lseek,
				605	.release = single_release,
				606	};
				607	#endif
				608
				609	static int __net_init ip_rt_do_proc_init(struct net *net)
				610	{
				611	struct proc_dir_entry *pde;
				612
				613	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				614	&rt_cache_seq_fops);
				615	if (!pde)
				616	goto err1;
				617
				618	pde = proc_create("rt_cache", S_IRUGO,
				619	net->proc_net_stat, &rt_cpu_seq_fops);
				620	if (!pde)
				621	goto err2;
				622
				623	#ifdef CONFIG_IP_ROUTE_CLASSID
				624	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
				625	if (!pde)
				626	goto err3;
				627	#endif
				628	return 0;
				629
				630	#ifdef CONFIG_IP_ROUTE_CLASSID
				631	err3:
				632	remove_proc_entry("rt_cache", net->proc_net_stat);
				633	#endif
				634	err2:
				635	remove_proc_entry("rt_cache", net->proc_net);
				636	err1:
				637	return -ENOMEM;
				638	}
				639
				640	static void __net_exit ip_rt_do_proc_exit(struct net *net)
				641	{
				642	remove_proc_entry("rt_cache", net->proc_net_stat);
				643	remove_proc_entry("rt_cache", net->proc_net);
				644	#ifdef CONFIG_IP_ROUTE_CLASSID
				645	remove_proc_entry("rt_acct", net->proc_net);
				646	#endif
				647	}
				648
				649	static struct pernet_operations ip_rt_proc_ops __net_initdata = {
				650	.init = ip_rt_do_proc_init,
				651	.exit = ip_rt_do_proc_exit,
				652	};
				653
				654	static int __init ip_rt_proc_init(void)
				655	{
				656	if (IS_ENABLED(CONFIG_PROC_STRIPPED))
				657	return 0;
				658
				659	return register_pernet_subsys(&ip_rt_proc_ops);
				660	}
				661
				662	#else
				663	static inline int ip_rt_proc_init(void)
				664	{
				665	return 0;
				666	}
				667	#endif /* CONFIG_PROC_FS */
				668
				669	static inline void rt_free(struct rtable *rt)
				670	{
				671	net_run_track(PRT_ROUTE," free");
				672	netruninfo_add(NULL, RT_HASH_DEL);
				673	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
				674	}
				675
				676	static inline void rt_drop(struct rtable *rt)
				677	{
				678	ip_rt_put(rt);
				679	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
				680	}
				681
				682	static inline int rt_fast_clean(struct rtable *rth)
				683	{
				684	/* Kill broadcast/multicast entries very aggresively, if they
				685	collide in hash table with more useful entries */
				686	return (rth->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST)) &&
				687	rt_is_input_route(rth) && rth->dst.rt_next;
				688	}
				689
				690	static inline int rt_valuable(struct rtable *rth)
				691	{
				692	return (rth->rt_flags & (RTCF_REDIRECTED \| RTCF_NOTIFY)) \|\|
				693	(rth->peer && rth->peer->pmtu_expires);
				694	}
				695
				696	static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
				697	{
				698	unsigned long age;
				699	int ret = 0;
				700
				701	if (atomic_read(&rth->dst.__refcnt))
				702	goto out;
				703
				704	age = jiffies - rth->dst.lastuse;
				705	if ((age <= tmo1 && !rt_fast_clean(rth)) \|\|
				706	(age <= tmo2 && rt_valuable(rth)))
				707	goto out;
				708	ret = 1;
				709	out: return ret;
				710	}
				711
				712	/* Bits of score are:
				713	* 31: very valuable
				714	* 30: not quite useless
				715	* 29..0: usage counter
				716	*/
				717	static inline u32 rt_score(struct rtable *rt)
				718	{
				719	u32 score = jiffies - rt->dst.lastuse;
				720
				721	score = ~score & ~(3<<30);
				722
				723	if (rt_valuable(rt))
				724	score \|= (1<<31);
				725
				726	if (rt_is_output_route(rt) \|\|
				727	!(rt->rt_flags & (RTCF_BROADCAST\|RTCF_MULTICAST\|RTCF_LOCAL)))
				728	score \|= (1<<30);
				729
				730	return score;
				731	}
				732
				733	static inline bool rt_caching(const struct net *net)
				734	{
				735	return net->ipv4.current_rt_cache_rebuild_count <=
				736	net->ipv4.sysctl_rt_cache_rebuild_count;
				737	}
				738
				739	static inline bool compare_hash_inputs(const struct rtable *rt1,
				740	const struct rtable *rt2)
				741	{
				742	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) \|
				743	((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) \|
				744	(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
				745	}
				746
				747	static inline int compare_keys(struct rtable rt1, struct rtable rt2)
				748	{
				749	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) \|
				750	((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) \|
				751	(rt1->rt_mark ^ rt2->rt_mark) \|
				752	(rt1->rt_key_tos ^ rt2->rt_key_tos) \|
				753	(rt1->rt_route_iif ^ rt2->rt_route_iif) \|
				754	(rt1->rt_oif ^ rt2->rt_oif)) == 0;
				755	}
				756
				757	static inline int compare_netns(struct rtable rt1, struct rtable rt2)
				758	{
				759	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
				760	}
				761
				762	static inline int rt_is_expired(struct rtable *rth)
				763	{
				764	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
				765	}
				766
				767	/*
				768	* Perform a full scan of hash table and free all entries.
				769	* Can be called by a softirq or a process.
				770	* In the later case, we want to be reschedule if necessary
				771	*/
				772	static void rt_do_flush(struct net *net, int process_context)
				773	{
				774	unsigned int i;
				775	struct rtable rth, next;
				776
				777	for (i = 0; i <= rt_hash_mask; i++) {
				778	struct rtable __rcu **pprev;
				779	struct rtable *list;
				780
				781	if (process_context && need_resched())
				782	cond_resched();
				783	rth = rcu_access_pointer(rt_hash_table[i].chain);
				784	if (!rth)
				785	continue;
				786
				787	spin_lock_bh(rt_hash_lock_addr(i));
				788
				789	list = NULL;
				790	pprev = &rt_hash_table[i].chain;
				791	rth = rcu_dereference_protected(*pprev,
				792	lockdep_is_held(rt_hash_lock_addr(i)));
				793
				794	while (rth) {
				795	next = rcu_dereference_protected(rth->dst.rt_next,
				796	lockdep_is_held(rt_hash_lock_addr(i)));
				797
				798	if (!net \|\|
				799	net_eq(dev_net(rth->dst.dev), net)) {
				800	rcu_assign_pointer(*pprev, next);
				801	rcu_assign_pointer(rth->dst.rt_next, list);
				802	list = rth;
				803	} else {
				804	pprev = &rth->dst.rt_next;
				805	}
				806	rth = next;
				807	}
				808
				809	spin_unlock_bh(rt_hash_lock_addr(i));
				810
				811	for (; list; list = next) {
				812	next = rcu_dereference_protected(list->dst.rt_next, 1);
				813	rt_free(list);
				814	}
				815	net_run_track(PRT_ROUTE,"flush route:%d",process_context);
				816	}
				817	}
				818
				819	/*
				820	* While freeing expired entries, we compute average chain length
				821	* and standard deviation, using fixed-point arithmetic.
				822	* This to have an estimation of rt_chain_length_max
				823	* rt_chain_length_max = max(elasticity, AVG + 4*SD)
				824	* We use 3 bits for frational part, and 29 (or 61) for magnitude.
				825	*/
				826
				827	#define FRACT_BITS 3
				828	#define ONE (1UL << FRACT_BITS)
				829
				830	/*
				831	* Given a hash chain and an item in this hash chain,
				832	* find if a previous entry has the same hash_inputs
				833	* (but differs on tos, mark or oif)
				834	* Returns 0 if an alias is found.
				835	* Returns ONE if rth has no alias before itself.
				836	*/
				837	static int has_noalias(const struct rtable head, const struct rtable rth)
				838	{
				839	const struct rtable *aux = head;
				840
				841	while (aux != rth) {
				842	if (compare_hash_inputs(aux, rth))
				843	return 0;
				844	aux = rcu_dereference_protected(aux->dst.rt_next, 1);
				845	}
				846	return ONE;
				847	}
				848
				849	static void rt_check_expire(void)
				850	{
				851	static unsigned int rover;
				852	unsigned int i = rover, goal;
				853	struct rtable *rth;
				854	struct rtable __rcu **rthp;
				855	unsigned long samples = 0;
				856	unsigned long sum = 0, sum2 = 0;
				857	unsigned long delta;
				858	u64 mult;
				859
				860	delta = jiffies - expires_ljiffies;
				861	expires_ljiffies = jiffies;
				862	mult = ((u64)delta) << rt_hash_log;
				863	if (ip_rt_gc_timeout > 1)
				864	do_div(mult, ip_rt_gc_timeout);
				865	goal = (unsigned int)mult;
				866	if (goal > rt_hash_mask)
				867	goal = rt_hash_mask + 1;
				868	for (; goal > 0; goal--) {
				869	unsigned long tmo = ip_rt_gc_timeout;
				870	unsigned long length;
				871
				872	i = (i + 1) & rt_hash_mask;
				873	rthp = &rt_hash_table[i].chain;
				874
				875	if (need_resched())
				876	cond_resched();
				877
				878	samples++;
				879
				880	if (rcu_dereference_raw(*rthp) == NULL)
				881	continue;
				882	length = 0;
				883	spin_lock_bh(rt_hash_lock_addr(i));
				884	while ((rth = rcu_dereference_protected(*rthp,
				885	lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
				886	prefetch(rth->dst.rt_next);
				887	if (rt_is_expired(rth)) {
				888	*rthp = rth->dst.rt_next;
				889	rt_free(rth);
				890	continue;
				891	}
				892	if (rth->dst.expires) {
				893	/* Entry is expired even if it is in use */
				894	if (time_before_eq(jiffies, rth->dst.expires)) {
				895	nofree:
				896	tmo >>= 1;
				897	rthp = &rth->dst.rt_next;
				898	/*
				899	* We only count entries on
				900	* a chain with equal hash inputs once
				901	* so that entries for different QOS
				902	* levels, and other non-hash input
				903	* attributes don't unfairly skew
				904	* the length computation
				905	*/
				906	length += has_noalias(rt_hash_table[i].chain, rth);
				907	continue;
				908	}
				909	} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				910	goto nofree;
				911
				912	/* Cleanup aged off entries. */
				913	*rthp = rth->dst.rt_next;
				914	rt_free(rth);
				915	}
				916	spin_unlock_bh(rt_hash_lock_addr(i));
				917	sum += length;
				918	sum2 += length*length;
				919	}
				920	if (samples) {
				921	unsigned long avg = sum / samples;
				922	unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
				923	rt_chain_length_max = max_t(unsigned long,
				924	ip_rt_gc_elasticity,
				925	(avg + 4*sd) >> FRACT_BITS);
				926	}
				927	rover = i;
				928	}
				929
				930	/*
				931	* rt_worker_func() is run in process context.
				932	* we call rt_check_expire() to scan part of the hash table
				933	*/
				934	static void rt_worker_func(struct work_struct *work)
				935	{
				936	rt_check_expire();
				937	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
				938	}
				939
				940	/*
				941	* Perturbation of rt_genid by a small quantity [1..256]
				942	* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
				943	* many times (2^24) without giving recent rt_genid.
				944	* Jenkins hash is strong enough that litle changes of rt_genid are OK.
				945	*/
				946	static void rt_cache_invalidate(struct net *net)
				947	{
				948	unsigned char shuffle;
				949	netruninfo_add(NULL, RT_CACHE_INVALID);
				950	get_random_bytes(&shuffle, sizeof(shuffle));
				951	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
				952	inetpeer_invalidate_tree(AF_INET);
				953	}
				954
				955	/*
				956	* delay < 0 : invalidate cache (fast : entries will be deleted later)
				957	* delay >= 0 : invalidate & flush cache (can be long)
				958	*/
				959	void rt_cache_flush(struct net *net, int delay)
				960	{
				961	rt_cache_invalidate(net);
				962	if (delay >= 0)
				963	rt_do_flush(net, !in_softirq());
				964	}
				965
				966	/* Flush previous cache invalidated entries from the cache */
				967	void rt_cache_flush_batch(struct net *net)
				968	{
				969	rt_do_flush(net, !in_softirq());
				970	}
				971
				972	static void rt_emergency_hash_rebuild(struct net *net)
				973	{
				974	if (net_ratelimit())
				975	pr_warn("Route hash chain too long!\n");
				976	rt_cache_invalidate(net);
				977	}
				978
				979	/*
				980	Short description of GC goals.
				981
				982	We want to build algorithm, which will keep routing cache
				983	at some equilibrium point, when number of aged off entries
				984	is kept approximately equal to newly generated ones.
				985
				986	Current expiration strength is variable "expire".
				987	We try to adjust it dynamically, so that if networking
				988	is idle expires is large enough to keep enough of warm entries,
				989	and when load increases it reduces to limit cache size.
				990	*/
				991
				992	static void __do_rt_garbage_collect(int elasticity, int min_interval)
				993	{
				994	static unsigned long expire = RT_GC_TIMEOUT;
				995	static unsigned long last_gc;
				996	static int rover;
				997	static int equilibrium;
				998	static DEFINE_SPINLOCK(rt_gc_lock);
				999	struct rtable *rth;
				1000	struct rtable __rcu **rthp;
				1001	unsigned long now = jiffies;
				1002	int goal;
				1003	int entries = dst_entries_get_fast(&ipv4_dst_ops);
				1004
				1005	/*
				1006	* Garbage collection is pretty expensive,
				1007	* do not make it too frequently.
				1008	*/
				1009
				1010	spin_lock_bh(&rt_gc_lock);
				1011
				1012	RT_CACHE_STAT_INC(gc_total);
				1013
				1014	if (now - last_gc < min_interval &&
				1015	entries < ip_rt_max_size) {
				1016	RT_CACHE_STAT_INC(gc_ignored);
				1017	goto out;
				1018	}
				1019
				1020	entries = dst_entries_get_slow(&ipv4_dst_ops);
				1021	/* Calculate number of entries, which we want to expire now. */
				1022	goal = entries - (elasticity << rt_hash_log);
				1023	if (goal <= 0) {
				1024	if (equilibrium < ipv4_dst_ops.gc_thresh)
				1025	equilibrium = ipv4_dst_ops.gc_thresh;
				1026	goal = entries - equilibrium;
				1027	if (goal > 0) {
				1028	equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
				1029	goal = entries - equilibrium;
				1030	}
				1031	} else {
				1032	/* We are in dangerous area. Try to reduce cache really
				1033	* aggressively.
				1034	*/
				1035	goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
				1036	equilibrium = entries - goal;
				1037	}
				1038
				1039	if (now - last_gc >= min_interval)
				1040	last_gc = now;
				1041
				1042	if (goal <= 0) {
				1043	equilibrium += goal;
				1044	goto work_done;
				1045	}
				1046
				1047	do {
				1048	int i, k;
				1049
				1050	for (i = rt_hash_mask, k = rover; i >= 0; i--) {
				1051	unsigned long tmo = expire;
				1052
				1053	k = (k + 1) & rt_hash_mask;
				1054	rthp = &rt_hash_table[k].chain;
				1055	spin_lock_bh(rt_hash_lock_addr(k));
				1056	while ((rth = rcu_dereference_protected(*rthp,
				1057	lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				1058	if (!rt_is_expired(rth) &&
				1059	!rt_may_expire(rth, tmo, expire)) {
				1060	tmo >>= 1;
				1061	rthp = &rth->dst.rt_next;
				1062	continue;
				1063	}
				1064	*rthp = rth->dst.rt_next;
				1065	rt_free(rth);
				1066	goal--;
				1067	}
				1068	spin_unlock_bh(rt_hash_lock_addr(k));
				1069	if (goal <= 0)
				1070	break;
				1071	}
				1072	rover = k;
				1073
				1074	if (goal <= 0)
				1075	goto work_done;
				1076
				1077	/* Goal is not achieved. We stop process if:
				1078
				1079	- if expire reduced to zero. Otherwise, expire is halfed.
				1080	- if table is not full.
				1081	- if we are called from interrupt.
				1082	- jiffies check is just fallback/debug loop breaker.
				1083	We will not spin here for long time in any case.
				1084	*/
				1085
				1086	RT_CACHE_STAT_INC(gc_goal_miss);
				1087
				1088	if (expire == 0)
				1089	break;
				1090
				1091	expire >>= 1;
				1092
				1093	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
				1094	goto out;
				1095	} while (!in_softirq() && time_before_eq(jiffies, now));
				1096
				1097	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
				1098	goto out;
				1099	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
				1100	goto out;
				1101	if (net_ratelimit())
				1102	pr_warn("dst cache overflow\n");
				1103	RT_CACHE_STAT_INC(gc_dst_overflow);
				1104	goto out;
				1105
				1106	work_done:
				1107	expire += min_interval;
				1108	if (expire > ip_rt_gc_timeout \|\|
				1109	dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh \|\|
				1110	dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
				1111	expire = ip_rt_gc_timeout;
				1112	out:
				1113	spin_unlock_bh(&rt_gc_lock);
				1114	}
				1115
				1116	static void __rt_garbage_collect(struct work_struct *w)
				1117	{
				1118	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
				1119	}
				1120
				1121	static int rt_garbage_collect(struct dst_ops *ops)
				1122	{
				1123	if (!work_pending(&rt_gc_worker))
				1124	schedule_work(&rt_gc_worker);
				1125
				1126	if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size \|\|
				1127	dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
				1128	RT_CACHE_STAT_INC(gc_dst_overflow);
				1129	return 1;
				1130	}
				1131	return 0;
				1132	}
				1133
				1134	/*
				1135	* Returns number of entries in a hash chain that have different hash_inputs
				1136	*/
				1137	static int slow_chain_length(const struct rtable *head)
				1138	{
				1139	int length = 0;
				1140	const struct rtable *rth = head;
				1141
				1142	while (rth) {
				1143	length += has_noalias(head, rth);
				1144	rth = rcu_dereference_protected(rth->dst.rt_next, 1);
				1145	}
				1146	return length >> FRACT_BITS;
				1147	}
				1148
				1149	static struct neighbour ipv4_neigh_lookup(const struct dst_entry dst, const void *daddr)
				1150	{
				1151	static const __be32 inaddr_any = 0;
				1152	struct net_device *dev = dst->dev;
				1153	const __be32 *pkey = daddr;
				1154	const struct rtable *rt;
				1155	struct neighbour *n;
				1156
				1157	rt = (const struct rtable *) dst;
				1158
				1159	if (dev->flags & (IFF_LOOPBACK \| IFF_POINTOPOINT))
				1160	pkey = &inaddr_any;
				1161	else if (rt->rt_gateway)
				1162	pkey = (const __be32 *) &rt->rt_gateway;
				1163
				1164	n = __ipv4_neigh_lookup(dev, (__force u32 )pkey);
				1165	if (n)
				1166	return n;
				1167	return neigh_create(&arp_tbl, pkey, dev);
				1168	}
				1169
				1170	static int rt_bind_neighbour(struct rtable *rt)
				1171	{
				1172	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
				1173	if (IS_ERR(n))
				1174	return PTR_ERR(n);
				1175	dst_set_neighbour(&rt->dst, n);
				1176
				1177	return 0;
				1178	}
				1179
				1180	static struct rtable rt_intern_hash(unsigned hash, struct rtable rt,
				1181	struct sk_buff *skb, int ifindex)
				1182	{
				1183	struct rtable rth, cand;
				1184	struct rtable __rcu rthp, candp;
				1185	unsigned long now;
				1186	u32 min_score;
				1187	int chain_length;
				1188	int attempts = 1;
				1189
				1190	restart:
				1191	chain_length = 0;
				1192	min_score = ~(u32)0;
				1193	cand = NULL;
				1194	candp = NULL;
				1195	now = jiffies;
				1196
				1197	if (!rt_caching(dev_net(rt->dst.dev))) {
				1198	/*
				1199	* If we're not caching, just tell the caller we
				1200	* were successful and don't touch the route. The
				1201	* caller hold the sole reference to the cache entry, and
				1202	* it will be released when the caller is done with it.
				1203	* If we drop it here, the callers have no way to resolve routes
				1204	* when we're not caching. Instead, just point *rp at rt, so
				1205	* the caller gets a single use out of the route
				1206	* Note that we do rt_free on this new route entry, so that
				1207	* once its refcount hits zero, we are still able to reap it
				1208	* (Thanks Alexey)
				1209	* Note: To avoid expensive rcu stuff for this uncached dst,
				1210	* we set DST_NOCACHE so that dst_release() can free dst without
				1211	* waiting a grace period.
				1212	*/
				1213
				1214	rt->dst.flags \|= DST_NOCACHE;
				1215	if (rt->rt_type == RTN_UNICAST \|\| rt_is_output_route(rt)) {
				1216	int err = rt_bind_neighbour(rt);
				1217	if (err) {
				1218	if (net_ratelimit())
				1219	pr_warn("Neighbour table failure & not caching routes\n");
				1220	ip_rt_put(rt);
				1221	return ERR_PTR(err);
				1222	}
				1223	}
				1224
				1225	goto skip_hashing;
				1226	}
				1227
				1228	rthp = &rt_hash_table[hash].chain;
				1229
				1230	spin_lock_bh(rt_hash_lock_addr(hash));
				1231	while ((rth = rcu_dereference_protected(*rthp,
				1232	lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
				1233	if (rt_is_expired(rth)) {
				1234	*rthp = rth->dst.rt_next;
				1235	rt_free(rth);
				1236	continue;
				1237	}
				1238	if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
				1239	/* Put it first */
				1240	*rthp = rth->dst.rt_next;
				1241	/*
				1242	* Since lookup is lockfree, the deletion
				1243	* must be visible to another weakly ordered CPU before
				1244	* the insertion at the start of the hash chain.
				1245	*/
				1246	rcu_assign_pointer(rth->dst.rt_next,
				1247	rt_hash_table[hash].chain);
				1248	/*
				1249	* Since lookup is lockfree, the update writes
				1250	* must be ordered for consistency on SMP.
				1251	*/
				1252	rcu_assign_pointer(rt_hash_table[hash].chain, rth);
				1253
				1254	dst_use(&rth->dst, now);
				1255	spin_unlock_bh(rt_hash_lock_addr(hash));
				1256
				1257	rt_drop(rt);
				1258	if (skb)
				1259	skb_dst_set(skb, &rth->dst);
				1260	net_run_track(PRT_ROUTE," route");
				1261	return rth;
				1262	}
				1263
				1264	if (!atomic_read(&rth->dst.__refcnt)) {
				1265	u32 score = rt_score(rth);
				1266
				1267	if (score <= min_score) {
				1268	cand = rth;
				1269	candp = rthp;
				1270	min_score = score;
				1271	}
				1272	}
				1273
				1274	chain_length++;
				1275
				1276	rthp = &rth->dst.rt_next;
				1277	}
				1278
				1279	if (cand) {
				1280	/* ip_rt_gc_elasticity used to be average length of chain
				1281	* length, when exceeded gc becomes really aggressive.
				1282	*
				1283	* The second limit is less certain. At the moment it allows
				1284	* only 2 entries per bucket. We will see.
				1285	*/
				1286	if (chain_length > ip_rt_gc_elasticity) {
				1287	*candp = cand->dst.rt_next;
				1288	rt_free(cand);
				1289	}
				1290	} else {
				1291	if (chain_length > rt_chain_length_max &&
				1292	slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
				1293	struct net *net = dev_net(rt->dst.dev);
				1294	int num = ++net->ipv4.current_rt_cache_rebuild_count;
				1295	if (!rt_caching(net)) {
				1296	pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
				1297	rt->dst.dev->name, num);
				1298	}
				1299	rt_emergency_hash_rebuild(net);
				1300	spin_unlock_bh(rt_hash_lock_addr(hash));
				1301
				1302	hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
				1303	ifindex, rt_genid(net));
				1304	goto restart;
				1305	}
				1306	}
				1307
				1308	/* Try to bind route to arp only if it is output
				1309	route or unicast forwarding path.
				1310	*/
				1311	if (rt->rt_type == RTN_UNICAST \|\| rt_is_output_route(rt)) {
				1312	int err = rt_bind_neighbour(rt);
				1313	if (err) {
				1314	spin_unlock_bh(rt_hash_lock_addr(hash));
				1315
				1316	if (err != -ENOBUFS) {
				1317	rt_drop(rt);
				1318	return ERR_PTR(err);
				1319	}
				1320
				1321	/* Neighbour tables are full and nothing
				1322	can be released. Try to shrink route cache,
				1323	it is most likely it holds some neighbour records.
				1324	*/
				1325	if (!in_softirq() && attempts-- > 0) {
				1326	static DEFINE_SPINLOCK(lock);
				1327
				1328	if (spin_trylock(&lock)) {
				1329	__do_rt_garbage_collect(1, 0);
				1330	spin_unlock(&lock);
				1331	} else {
				1332	spin_unlock_wait(&lock);
				1333	}
				1334	goto restart;
				1335	}
				1336
				1337	if (net_ratelimit())
				1338	pr_warn("Neighbour table overflow\n");
				1339	rt_drop(rt);
				1340	return ERR_PTR(-ENOBUFS);
				1341	}
				1342	}
				1343
				1344	rt->dst.rt_next = rt_hash_table[hash].chain;
				1345
				1346	/*
				1347	* Since lookup is lockfree, we must make sure
				1348	* previous writes to rt are committed to memory
				1349	* before making rt visible to other CPUS.
				1350	*/
				1351	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
				1352	netruninfo_add(NULL, RT_HASH_ADD);
				1353
				1354	spin_unlock_bh(rt_hash_lock_addr(hash));
				1355
				1356	skip_hashing:
				1357	if (skb)
				1358	skb_dst_set(skb, &rt->dst);
				1359	net_run_track(PRT_ROUTE," rt_intern_hash");
				1360	return rt;
				1361	}
				1362
				1363	static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
				1364
				1365	static u32 rt_peer_genid(void)
				1366	{
				1367	return atomic_read(&__rt_peer_genid);
				1368	}
				1369
				1370	void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
				1371	{
				1372	struct inet_peer *peer;
				1373
				1374	peer = inet_getpeer_v4(daddr, create);
				1375
				1376	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
				1377	inet_putpeer(peer);
				1378	else
				1379	rt->rt_peer_genid = rt_peer_genid();
				1380	}
				1381
				1382	#define IP_IDENTS_SZ 2048u
				1383	struct ip_ident_bucket {
				1384	atomic_t id;
				1385	u32 stamp32;
				1386	};
				1387
				1388	static struct ip_ident_bucket *ip_idents __read_mostly;
				1389
				1390	/* In order to protect privacy, we add a perturbation to identifiers
				1391	* if one generator is seldom used. This makes hard for an attacker
				1392	* to infer how many packets were sent between two points in time.
				1393	*/
				1394	u32 ip_idents_reserve(u32 hash, int segs)
				1395	{
				1396	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
				1397	u32 old = ACCESS_ONCE(bucket->stamp32);
				1398	u32 now = (u32)jiffies;
				1399	u32 delta = 0;
				1400
				1401	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
				1402	u64 x = random32();
				1403
				1404	x *= (now - old);
				1405	delta = (u32)(x >> 32);
				1406	}
				1407
				1408	return atomic_add_return(segs + delta, &bucket->id) - segs;
				1409	}
				1410	EXPORT_SYMBOL(ip_idents_reserve);
				1411
				1412	void __ip_select_ident(struct iphdr *iph, int segs)
				1413	{
				1414	static u32 ip_idents_hashrnd __read_mostly;
				1415	static bool hashrnd_initialized = false;
				1416	u32 hash, id;
				1417
				1418	if (unlikely(!hashrnd_initialized)) {
				1419	hashrnd_initialized = true;
				1420	get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
				1421	}
				1422
				1423	hash = jhash_3words((__force u32)iph->daddr,
				1424	(__force u32)iph->saddr,
				1425	iph->protocol,
				1426	ip_idents_hashrnd);
				1427	id = ip_idents_reserve(hash, segs);
				1428	iph->id = htons(id);
				1429	}
				1430	EXPORT_SYMBOL(__ip_select_ident);
				1431
				1432	static void rt_del(unsigned hash, struct rtable *rt)
				1433	{
				1434	struct rtable __rcu **rthp;
				1435	struct rtable *aux;
				1436
				1437
				1438	rthp = &rt_hash_table[hash].chain;
				1439	spin_lock_bh(rt_hash_lock_addr(hash));
				1440	ip_rt_put(rt);
				1441	while ((aux = rcu_dereference_protected(*rthp,
				1442	lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
				1443	if (aux == rt \|\| rt_is_expired(aux)) {
				1444	*rthp = aux->dst.rt_next;
				1445	rt_free(aux);
				1446	continue;
				1447	}
				1448	rthp = &aux->dst.rt_next;
				1449	}
				1450	spin_unlock_bh(rt_hash_lock_addr(hash));
				1451	}
				1452
				1453	static void check_peer_redir(struct dst_entry dst, struct inet_peer peer)
				1454	{
				1455	struct rtable rt = (struct rtable ) dst;
				1456	__be32 orig_gw = rt->rt_gateway;
				1457	struct neighbour n, old_n;
				1458
				1459	dst_confirm(&rt->dst);
				1460
				1461	rt->rt_gateway = peer->redirect_learned.a4;
				1462
				1463	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
				1464	if (IS_ERR(n)) {
				1465	rt->rt_gateway = orig_gw;
				1466	return;
				1467	}
				1468	net_run_track(PRT_ROUTE," redirect");
				1469	old_n = xchg(&rt->dst._neighbour, n);
				1470	if (old_n)
				1471	neigh_release(old_n);
				1472	if (!(n->nud_state & NUD_VALID)) {
				1473	neigh_event_send(n, NULL);
				1474	} else {
				1475	rt->rt_flags \|= RTCF_REDIRECTED;
				1476	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
				1477	}
				1478	}
				1479
				1480	/* called in rcu_read_lock() section */
				1481	void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
				1482	__be32 saddr, struct net_device *dev)
				1483	{
				1484	int s, i;
				1485	struct in_device *in_dev = __in_dev_get_rcu(dev);
				1486	__be32 skeys[2] = { saddr, 0 };
				1487	int ikeys[2] = { dev->ifindex, 0 };
				1488	struct inet_peer *peer;
				1489	struct net *net;
				1490
				1491	if (!in_dev)
				1492	return;
				1493
				1494	net = dev_net(dev);
				1495	if (new_gw == old_gw \|\| !IN_DEV_RX_REDIRECTS(in_dev) \|\|
				1496	ipv4_is_multicast(new_gw) \|\| ipv4_is_lbcast(new_gw) \|\|
				1497	ipv4_is_zeronet(new_gw))
				1498	goto reject_redirect;
				1499
				1500	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
				1501	if (!inet_addr_onlink(in_dev, new_gw, old_gw))
				1502	goto reject_redirect;
				1503	if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
				1504	goto reject_redirect;
				1505	} else {
				1506	if (inet_addr_type(net, new_gw) != RTN_UNICAST)
				1507	goto reject_redirect;
				1508	}
				1509
				1510	for (s = 0; s < 2; s++) {
				1511	for (i = 0; i < 2; i++) {
				1512	unsigned int hash;
				1513	struct rtable __rcu **rthp;
				1514	struct rtable *rt;
				1515
				1516	hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
				1517
				1518	rthp = &rt_hash_table[hash].chain;
				1519
				1520	while ((rt = rcu_dereference(*rthp)) != NULL) {
				1521	rthp = &rt->dst.rt_next;
				1522
				1523	if (rt->rt_key_dst != daddr \|\|
				1524	rt->rt_key_src != skeys[s] \|\|
				1525	rt->rt_oif != ikeys[i] \|\|
				1526	rt_is_input_route(rt) \|\|
				1527	rt_is_expired(rt) \|\|
				1528	!net_eq(dev_net(rt->dst.dev), net) \|\|
				1529	rt->dst.error \|\|
				1530	rt->dst.dev != dev \|\|
				1531	rt->rt_gateway != old_gw)
				1532	continue;
				1533
				1534	if (!rt->peer)
				1535	rt_bind_peer(rt, rt->rt_dst, 1);
				1536
				1537	peer = rt->peer;
				1538	if (peer) {
				1539	if (peer->redirect_learned.a4 != new_gw) {
				1540	peer->redirect_learned.a4 = new_gw;
				1541	atomic_inc(&__rt_peer_genid);
				1542	}
				1543	check_peer_redir(&rt->dst, peer);
				1544	}
				1545	}
				1546	}
				1547	}
				1548	return;
				1549
				1550	reject_redirect:
				1551	#ifdef CONFIG_IP_ROUTE_VERBOSE
				1552	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
				1553	pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
				1554	" Advised path = %pI4 -> %pI4\n",
				1555	&old_gw, dev->name, &new_gw,
				1556	&saddr, &daddr);
				1557	#endif
				1558	;
				1559	}
				1560
				1561	static bool peer_pmtu_expired(struct inet_peer *peer)
				1562	{
				1563	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
				1564
				1565	return orig &&
				1566	time_after_eq(jiffies, orig) &&
				1567	cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
				1568	}
				1569
				1570	static bool peer_pmtu_cleaned(struct inet_peer *peer)
				1571	{
				1572	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
				1573
				1574	return orig &&
				1575	cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
				1576	}
				1577
				1578	static struct dst_entry ipv4_negative_advice(struct dst_entry dst)
				1579	{
				1580	struct rtable rt = (struct rtable )dst;
				1581	struct dst_entry *ret = dst;
				1582
				1583	if (rt) {
				1584	if (dst->obsolete > 0) {
				1585	ip_rt_put(rt);
				1586	ret = NULL;
				1587	} else if (rt->rt_flags & RTCF_REDIRECTED) {
				1588	unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
				1589	rt->rt_oif,
				1590	rt_genid(dev_net(dst->dev)));
				1591	rt_del(hash, rt);
				1592	ret = NULL;
				1593	} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
				1594	dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
				1595	}
				1596	}
				1597	return ret;
				1598	}
				1599
				1600	/*
				1601	* Algorithm:
				1602	* 1. The first ip_rt_redirect_number redirects are sent
				1603	* with exponential backoff, then we stop sending them at all,
				1604	* assuming that the host ignores our redirects.
				1605	* 2. If we did not see packets requiring redirects
				1606	* during ip_rt_redirect_silence, we assume that the host
				1607	* forgot redirected route and start to send redirects again.
				1608	*
				1609	* This algorithm is much cheaper and more intelligent than dumb load limiting
				1610	* in icmp.c.
				1611	*
				1612	* NOTE. Do not forget to inhibit load limiting for redirects (redundant)
				1613	* and "frag. need" (breaks PMTU discovery) in icmp.c.
				1614	*/
				1615
				1616	void ip_rt_send_redirect(struct sk_buff *skb)
				1617	{
				1618	struct rtable *rt = skb_rtable(skb);
				1619	struct in_device *in_dev;
				1620	struct inet_peer *peer;
				1621	int log_martians;
				1622
				1623	rcu_read_lock();
				1624	in_dev = __in_dev_get_rcu(rt->dst.dev);
				1625	if (!in_dev \|\| !IN_DEV_TX_REDIRECTS(in_dev)) {
				1626	rcu_read_unlock();
				1627	return;
				1628	}
				1629	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
				1630	rcu_read_unlock();
				1631
				1632	if (!rt->peer)
				1633	rt_bind_peer(rt, rt->rt_dst, 1);
				1634	peer = rt->peer;
				1635	if (!peer) {
				1636	icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
				1637	return;
				1638	}
				1639
				1640	/* No redirected packets during ip_rt_redirect_silence;
				1641	* reset the algorithm.
				1642	*/
				1643	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
				1644	peer->rate_tokens = 0;
				1645
				1646	/* Too many ignored redirects; do not send anything
				1647	* set dst.rate_last to the last seen redirected packet.
				1648	*/
				1649	if (peer->rate_tokens >= ip_rt_redirect_number) {
				1650	peer->rate_last = jiffies;
				1651	return;
				1652	}
				1653
				1654	/* Check for load limit; set rate_last to the latest sent
				1655	* redirect.
				1656	*/
				1657	if (peer->rate_tokens == 0 \|\|
				1658	time_after(jiffies,
				1659	(peer->rate_last +
				1660	(ip_rt_redirect_load << peer->rate_tokens)))) {
				1661	icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
				1662	peer->rate_last = jiffies;
				1663	++peer->rate_tokens;
				1664	#ifdef CONFIG_IP_ROUTE_VERBOSE
				1665	if (log_martians &&
				1666	peer->rate_tokens == ip_rt_redirect_number &&
				1667	net_ratelimit())
				1668	pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
				1669	&ip_hdr(skb)->saddr, rt->rt_iif,
				1670	&rt->rt_dst, &rt->rt_gateway);
				1671	#endif
				1672	}
				1673	}
				1674
				1675	static int ip_error(struct sk_buff *skb)
				1676	{
				1677	struct rtable *rt = skb_rtable(skb);
				1678	struct inet_peer *peer;
				1679	unsigned long now;
				1680	bool send;
				1681	int code;
				1682
				1683	switch (rt->dst.error) {
				1684	case EINVAL:
				1685	default:
				1686	goto out;
				1687	case EHOSTUNREACH:
				1688	code = ICMP_HOST_UNREACH;
				1689	break;
				1690	case ENETUNREACH:
				1691	code = ICMP_NET_UNREACH;
				1692	IP_INC_STATS_BH(dev_net(rt->dst.dev),
				1693	IPSTATS_MIB_INNOROUTES);
				1694	break;
				1695	case EACCES:
				1696	code = ICMP_PKT_FILTERED;
				1697	break;
				1698	}
				1699
				1700	if (!rt->peer)
				1701	rt_bind_peer(rt, rt->rt_dst, 1);
				1702	peer = rt->peer;
				1703
				1704	send = true;
				1705	if (peer) {
				1706	now = jiffies;
				1707	peer->rate_tokens += now - peer->rate_last;
				1708	if (peer->rate_tokens > ip_rt_error_burst)
				1709	peer->rate_tokens = ip_rt_error_burst;
				1710	peer->rate_last = now;
				1711	if (peer->rate_tokens >= ip_rt_error_cost)
				1712	peer->rate_tokens -= ip_rt_error_cost;
				1713	else
				1714	send = false;
				1715	}
				1716	if (send)
				1717	icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
				1718
				1719	out: kfree_skb(skb);
				1720	return 0;
				1721	}
				1722
				1723	/*
				1724	* The last two values are not from the RFC but
				1725	* are needed for AMPRnet AX.25 paths.
				1726	*/
				1727
				1728	static const unsigned short mtu_plateau[] =
				1729	{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
				1730
				1731	static inline unsigned short guess_mtu(unsigned short old_mtu)
				1732	{
				1733	int i;
				1734
				1735	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
				1736	if (old_mtu > mtu_plateau[i])
				1737	return mtu_plateau[i];
				1738	return 68;
				1739	}
				1740
				1741	unsigned short ip_rt_frag_needed(struct net net, const struct iphdr iph,
				1742	unsigned short new_mtu,
				1743	struct net_device *dev)
				1744	{
				1745	unsigned short old_mtu = ntohs(iph->tot_len);
				1746	unsigned short est_mtu = 0;
				1747	struct inet_peer *peer;
				1748
				1749	peer = inet_getpeer_v4(iph->daddr, 1);
				1750	if (peer) {
				1751	unsigned short mtu = new_mtu;
				1752
				1753	if (new_mtu < 68 \|\| new_mtu >= old_mtu) {
				1754	/* BSD 4.2 derived systems incorrectly adjust
				1755	* tot_len by the IP header length, and report
				1756	* a zero MTU in the ICMP message.
				1757	*/
				1758	if (mtu == 0 &&
				1759	old_mtu >= 68 + (iph->ihl << 2))
				1760	old_mtu -= iph->ihl << 2;
				1761	mtu = guess_mtu(old_mtu);
				1762	}
				1763
				1764	if (mtu < ip_rt_min_pmtu)
				1765	mtu = ip_rt_min_pmtu;
				1766	if (!peer->pmtu_expires \|\| mtu < peer->pmtu_learned) {
				1767	unsigned long pmtu_expires;
				1768
				1769	pmtu_expires = jiffies + ip_rt_mtu_expires;
				1770	if (!pmtu_expires)
				1771	pmtu_expires = 1UL;
				1772
				1773	est_mtu = mtu;
				1774	peer->pmtu_learned = mtu;
				1775	peer->pmtu_expires = pmtu_expires;
				1776	atomic_inc(&__rt_peer_genid);
				1777	}
				1778
				1779	inet_putpeer(peer);
				1780	}
				1781	return est_mtu ? : new_mtu;
				1782	}
				1783
				1784	static void check_peer_pmtu(struct dst_entry dst, struct inet_peer peer)
				1785	{
				1786	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
				1787
				1788	if (!expires)
				1789	return;
				1790	if (time_before(jiffies, expires)) {
				1791	u32 orig_dst_mtu = dst_mtu(dst);
				1792	if (peer->pmtu_learned < orig_dst_mtu) {
				1793	if (!peer->pmtu_orig)
				1794	peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
				1795	dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
				1796	}
				1797	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
				1798	dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
				1799	}
				1800
				1801	static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
				1802	{
				1803	struct rtable rt = (struct rtable ) dst;
				1804	struct inet_peer *peer;
				1805
				1806	dst_confirm(dst);
				1807
				1808	if (!rt->peer)
				1809	rt_bind_peer(rt, rt->rt_dst, 1);
				1810	peer = rt->peer;
				1811	if (peer) {
				1812	unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
				1813
				1814	if (mtu < ip_rt_min_pmtu)
				1815	mtu = ip_rt_min_pmtu;
				1816	if (!pmtu_expires \|\| mtu < peer->pmtu_learned) {
				1817
				1818	pmtu_expires = jiffies + ip_rt_mtu_expires;
				1819	if (!pmtu_expires)
				1820	pmtu_expires = 1UL;
				1821
				1822	peer->pmtu_learned = mtu;
				1823	peer->pmtu_expires = pmtu_expires;
				1824
				1825	atomic_inc(&__rt_peer_genid);
				1826	rt->rt_peer_genid = rt_peer_genid();
				1827	}
				1828	check_peer_pmtu(dst, peer);
				1829	}
				1830	}
				1831
				1832
				1833	static void ipv4_validate_peer(struct rtable *rt)
				1834	{
				1835	if (rt->rt_peer_genid != rt_peer_genid()) {
				1836	struct inet_peer *peer;
				1837
				1838	if (!rt->peer)
				1839	rt_bind_peer(rt, rt->rt_dst, 0);
				1840
				1841	peer = rt->peer;
				1842	if (peer) {
				1843	check_peer_pmtu(&rt->dst, peer);
				1844
				1845	if (peer->redirect_learned.a4 &&
				1846	peer->redirect_learned.a4 != rt->rt_gateway)
				1847	check_peer_redir(&rt->dst, peer);
				1848	}
				1849
				1850	rt->rt_peer_genid = rt_peer_genid();
				1851	}
				1852	}
				1853
				1854	static struct dst_entry ipv4_dst_check(struct dst_entry dst, u32 cookie)
				1855	{
				1856	struct rtable rt = (struct rtable ) dst;
				1857
				1858	if (rt_is_expired(rt))
				1859	return NULL;
				1860	ipv4_validate_peer(rt);
				1861	return dst;
				1862	}
				1863
				1864	static void ipv4_dst_destroy(struct dst_entry *dst)
				1865	{
				1866	struct rtable rt = (struct rtable ) dst;
				1867	struct inet_peer *peer = rt->peer;
				1868
				1869	if (rt->fi) {
				1870	fib_info_put(rt->fi);
				1871	rt->fi = NULL;
				1872	}
				1873	if (peer) {
				1874	rt->peer = NULL;
				1875	inet_putpeer(peer);
				1876	}
				1877	}
				1878
				1879
				1880	static void ipv4_link_failure(struct sk_buff *skb)
				1881	{
				1882	struct rtable *rt;
				1883
				1884	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
				1885
				1886	rt = skb_rtable(skb);
				1887	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
				1888	dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
				1889	}
				1890
				1891	static int ip_rt_bug(struct sk_buff *skb)
				1892	{
				1893	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
				1894	&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
				1895	skb->dev ? skb->dev->name : "?");
				1896	kfree_skb(skb);
				1897	WARN_ON(1);
				1898	return 0;
				1899	}
				1900
				1901	/*
				1902	We do not cache source address of outgoing interface,
				1903	because it is used only by IP RR, TS and SRR options,
				1904	so that it out of fast path.
				1905
				1906	BTW remember: "addr" is allowed to be not aligned
				1907	in IP options!
				1908	*/
				1909
				1910	void ip_rt_get_source(u8 addr, struct sk_buff skb, struct rtable *rt)
				1911	{
				1912	__be32 src;
				1913
				1914	if (rt_is_output_route(rt))
				1915	src = ip_hdr(skb)->saddr;
				1916	else {
				1917	struct fib_result res;
				1918	struct flowi4 fl4;
				1919	struct iphdr *iph;
				1920
				1921	iph = ip_hdr(skb);
				1922
				1923	memset(&fl4, 0, sizeof(fl4));
				1924	fl4.daddr = iph->daddr;
				1925	fl4.saddr = iph->saddr;
				1926	fl4.flowi4_tos = RT_TOS(iph->tos);
				1927	fl4.flowi4_oif = rt->dst.dev->ifindex;
				1928	fl4.flowi4_iif = skb->dev->ifindex;
				1929	fl4.flowi4_mark = skb->mark;
				1930
				1931	rcu_read_lock();
				1932	if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
				1933	src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
				1934	else
				1935	src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
				1936	RT_SCOPE_UNIVERSE);
				1937	rcu_read_unlock();
				1938	}
				1939	memcpy(addr, &src, 4);
				1940	}
				1941
				1942	#ifdef CONFIG_IP_ROUTE_CLASSID
				1943	static void set_class_tag(struct rtable *rt, u32 tag)
				1944	{
				1945	if (!(rt->dst.tclassid & 0xFFFF))
				1946	rt->dst.tclassid \|= tag & 0xFFFF;
				1947	if (!(rt->dst.tclassid & 0xFFFF0000))
				1948	rt->dst.tclassid \|= tag & 0xFFFF0000;
				1949	}
				1950	#endif
				1951
				1952	static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
				1953	{
				1954	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
				1955
				1956	if (advmss == 0) {
				1957	advmss = max_t(unsigned int, dst->dev->mtu - 40,
				1958	ip_rt_min_advmss);
				1959	if (advmss > 65535 - 40)
				1960	advmss = 65535 - 40;
				1961	}
				1962	return advmss;
				1963	}
				1964
				1965	static unsigned int ipv4_mtu(const struct dst_entry *dst)
				1966	{
				1967	const struct rtable rt = (const struct rtable ) dst;
				1968	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
				1969
				1970	if (mtu && rt_is_output_route(rt))
				1971	return mtu;
				1972
				1973	mtu = dst->dev->mtu;
				1974
				1975	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
				1976
				1977	if (rt->rt_gateway != rt->rt_dst && mtu > 576)
				1978	mtu = 576;
				1979	}
				1980
				1981	if (mtu > IP_MAX_MTU)
				1982	mtu = IP_MAX_MTU;
				1983
				1984	return mtu;
				1985	}
				1986
				1987	static void rt_init_metrics(struct rtable rt, const struct flowi4 fl4,
				1988	struct fib_info *fi)
				1989	{
				1990	struct inet_peer *peer;
				1991	int create = 0;
				1992
				1993	/* If a peer entry exists for this destination, we must hook
				1994	* it up in order to get at cached metrics.
				1995	*/
				1996	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
				1997	create = 1;
				1998
				1999	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
				2000	if (peer) {
				2001	rt->rt_peer_genid = rt_peer_genid();
				2002	if (inet_metrics_new(peer))
				2003	memcpy(peer->metrics, fi->fib_metrics,
				2004	sizeof(u32) * RTAX_MAX);
				2005	dst_init_metrics(&rt->dst, peer->metrics, false);
				2006
				2007	check_peer_pmtu(&rt->dst, peer);
				2008
				2009	if (peer->redirect_learned.a4 &&
				2010	peer->redirect_learned.a4 != rt->rt_gateway) {
				2011	rt->rt_gateway = peer->redirect_learned.a4;
				2012	rt->rt_flags \|= RTCF_REDIRECTED;
				2013	}
				2014	} else {
				2015	if (fi->fib_metrics != (u32 *) dst_default_metrics) {
				2016	rt->fi = fi;
				2017	atomic_inc(&fi->fib_clntref);
				2018	}
				2019	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
				2020	}
				2021	}
				2022
				2023	static void rt_set_nexthop(struct rtable rt, const struct flowi4 fl4,
				2024	const struct fib_result *res,
				2025	struct fib_info *fi, u16 type, u32 itag)
				2026	{
				2027	struct dst_entry *dst = &rt->dst;
				2028
				2029	if (fi) {
				2030	if (FIB_RES_GW(*res) &&
				2031	FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
				2032	rt->rt_gateway = FIB_RES_GW(*res);
				2033	net_run_track(PRT_NEXTHOP,"rt_set_nexthop");
				2034	rt_init_metrics(rt, fl4, fi);
				2035	#ifdef CONFIG_IP_ROUTE_CLASSID
				2036	dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
				2037	#endif
				2038	}
				2039
				2040	if (dst_mtu(dst) > IP_MAX_MTU)
				2041	dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
				2042	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
				2043	dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
				2044
				2045	#ifdef CONFIG_IP_ROUTE_CLASSID
				2046	#ifdef CONFIG_IP_MULTIPLE_TABLES
				2047	set_class_tag(rt, fib_rules_tclass(res));
				2048	#endif
				2049	set_class_tag(rt, itag);
				2050	#endif
				2051	}
				2052
				2053	static struct rtable rt_dst_alloc(struct net_device dev,
				2054	bool nopolicy, bool noxfrm)
				2055	{
				2056	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
				2057	DST_HOST \|
				2058	(nopolicy ? DST_NOPOLICY : 0) \|
				2059	(noxfrm ? DST_NOXFRM : 0));
				2060	}
				2061
				2062	/* called in rcu_read_lock() section */
				2063	static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				2064	u8 tos, struct net_device *dev, int our)
				2065	{
				2066	unsigned int hash;
				2067	struct rtable *rth;
				2068	__be32 spec_dst;
				2069	struct in_device *in_dev = __in_dev_get_rcu(dev);
				2070	u32 itag = 0;
				2071	int err;
				2072
				2073	/* Primary sanity checks. */
				2074
				2075	if (in_dev == NULL)
				2076	return -EINVAL;
				2077
				2078	if (ipv4_is_multicast(saddr) \|\| ipv4_is_lbcast(saddr) \|\|
				2079	ipv4_is_loopback(saddr) \|\| skb->protocol != htons(ETH_P_IP))
				2080	goto e_inval;
				2081
				2082	if (ipv4_is_zeronet(saddr)) {
				2083	if (!ipv4_is_local_multicast(daddr))
				2084	goto e_inval;
				2085	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
				2086	} else {
				2087	err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
				2088	&itag);
				2089	if (err < 0)
				2090	goto e_err;
				2091	}
				2092	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
				2093	IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
				2094	if (!rth)
				2095	goto e_nobufs;
				2096
				2097	#ifdef CONFIG_IP_ROUTE_CLASSID
				2098	rth->dst.tclassid = itag;
				2099	#endif
				2100	rth->dst.output = ip_rt_bug;
				2101
				2102	rth->rt_key_dst = daddr;
				2103	rth->rt_key_src = saddr;
				2104	rth->rt_genid = rt_genid(dev_net(dev));
				2105	rth->rt_flags = RTCF_MULTICAST;
				2106	rth->rt_type = RTN_MULTICAST;
				2107	rth->rt_key_tos = tos;
				2108	rth->rt_dst = daddr;
				2109	rth->rt_src = saddr;
				2110	rth->rt_route_iif = dev->ifindex;
				2111	rth->rt_iif = dev->ifindex;
				2112	rth->rt_oif = 0;
				2113	rth->rt_mark = skb->mark;
				2114	rth->rt_gateway = daddr;
				2115	rth->rt_spec_dst= spec_dst;
				2116	rth->rt_peer_genid = 0;
				2117	rth->peer = NULL;
				2118	rth->fi = NULL;
				2119	if (our) {
				2120	rth->dst.input= ip_local_deliver;
				2121	rth->rt_flags \|= RTCF_LOCAL;
				2122	}
				2123
				2124	#ifdef CONFIG_IP_MROUTE
				2125	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
				2126	rth->dst.input = ip_mr_input;
				2127	#endif
				2128	RT_CACHE_STAT_INC(in_slow_mc);
				2129
				2130	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
				2131	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
				2132	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
				2133
				2134	e_nobufs:
				2135	return -ENOBUFS;
				2136	e_inval:
				2137	return -EINVAL;
				2138	e_err:
				2139	return err;
				2140	}
				2141
				2142
				2143	static void ip_handle_martian_source(struct net_device *dev,
				2144	struct in_device *in_dev,
				2145	struct sk_buff *skb,
				2146	__be32 daddr,
				2147	__be32 saddr)
				2148	{
				2149	RT_CACHE_STAT_INC(in_martian_src);
				2150	#ifdef CONFIG_IP_ROUTE_VERBOSE
				2151	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
				2152	/*
				2153	* RFC1812 recommendation, if source is martian,
				2154	* the only hint is MAC header.
				2155	*/
				2156	pr_warn("martian source %pI4 from %pI4, on dev %s\n",
				2157	&daddr, &saddr, dev->name);
				2158	if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
				2159	print_hex_dump(KERN_WARNING, "ll header: ",
				2160	DUMP_PREFIX_OFFSET, 16, 1,
				2161	skb_mac_header(skb),
				2162	dev->hard_header_len, true);
				2163	}
				2164	}
				2165	#endif
				2166	}
				2167
				2168	/* called in rcu_read_lock() section */
				2169	static int __mkroute_input(struct sk_buff *skb,
				2170	const struct fib_result *res,
				2171	struct in_device *in_dev,
				2172	__be32 daddr, __be32 saddr, u32 tos,
				2173	struct rtable **result)
				2174	{
				2175	struct rtable *rth;
				2176	int err;
				2177	struct in_device *out_dev;
				2178	unsigned int flags = 0;
				2179	__be32 spec_dst;
				2180	u32 itag = 0;
				2181
				2182	/* get a working reference to the output device */
				2183	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
				2184	if (out_dev == NULL) {
				2185	if (net_ratelimit())
				2186	pr_crit("Bug in ip_route_input_slow(). Please report.\n");
				2187	return -EINVAL;
				2188	}
				2189
				2190
				2191	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				2192	in_dev->dev, &spec_dst, &itag);
				2193	if (err < 0) {
				2194	ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
				2195	saddr);
				2196
				2197	goto cleanup;
				2198	}
				2199
				2200	if (err)
				2201	flags \|= RTCF_DIRECTSRC;
				2202
				2203	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
				2204	skb->protocol == htons(ETH_P_IP) &&
				2205	(IN_DEV_SHARED_MEDIA(out_dev) \|\|
				2206	inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
				2207	IPCB(skb)->flags \|= IPSKB_DOREDIRECT;
				2208
				2209	if (skb->protocol != htons(ETH_P_IP)) {
				2210	/* Not IP (i.e. ARP). Do not create route, if it is
				2211	* invalid for proxy arp. DNAT routes are always valid.
				2212	*
				2213	* Proxy arp feature have been extended to allow, ARP
				2214	* replies back to the same interface, to support
				2215	* Private VLAN switch technologies. See arp.c.
				2216	*/
				2217	if (out_dev == in_dev &&
				2218	IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
				2219	err = -EINVAL;
				2220	goto cleanup;
				2221	}
				2222	}
				2223
				2224	rth = rt_dst_alloc(out_dev->dev,
				2225	IN_DEV_CONF_GET(in_dev, NOPOLICY),
				2226	IN_DEV_CONF_GET(out_dev, NOXFRM));
				2227	if (!rth) {
				2228	err = -ENOBUFS;
				2229	goto cleanup;
				2230	}
				2231
				2232	rth->rt_key_dst = daddr;
				2233	rth->rt_key_src = saddr;
				2234	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
				2235	rth->rt_flags = flags;
				2236	rth->rt_type = res->type;
				2237	rth->rt_key_tos = tos;
				2238	rth->rt_dst = daddr;
				2239	rth->rt_src = saddr;
				2240	rth->rt_route_iif = in_dev->dev->ifindex;
				2241	rth->rt_iif = in_dev->dev->ifindex;
				2242	rth->rt_oif = 0;
				2243	rth->rt_mark = skb->mark;
				2244	rth->rt_gateway = daddr;
				2245	rth->rt_spec_dst= spec_dst;
				2246	rth->rt_peer_genid = 0;
				2247	rth->peer = NULL;
				2248	rth->fi = NULL;
				2249
				2250	rth->dst.input = ip_forward;
				2251	rth->dst.output = ip_output;
				2252
				2253	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
				2254
				2255	*result = rth;
				2256	err = 0;
				2257	cleanup:
				2258	return err;
				2259	}
				2260
				2261	static int ip_mkroute_input(struct sk_buff *skb,
				2262	struct fib_result *res,
				2263	const struct flowi4 *fl4,
				2264	struct in_device *in_dev,
				2265	__be32 daddr, __be32 saddr, u32 tos)
				2266	{
				2267	struct rtable* rth = NULL;
				2268	int err;
				2269	unsigned hash;
				2270
				2271	#ifdef CONFIG_IP_ROUTE_MULTIPATH
				2272	if (res->fi && res->fi->fib_nhs > 1)
				2273	fib_select_multipath(res);
				2274	#endif
				2275
				2276	/* create a routing cache entry */
				2277	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
				2278	if (err)
				2279	return err;
				2280
				2281	/* put it into the cache */
				2282	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
				2283	rt_genid(dev_net(rth->dst.dev)));
				2284	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
				2285	if (IS_ERR(rth))
				2286	return PTR_ERR(rth);
				2287	return 0;
				2288	}
				2289
				2290	/*
				2291	* NOTE. We drop all the packets that has local source
				2292	* addresses, because every properly looped back packet
				2293	* must have correct destination already attached by output routine.
				2294	*
				2295	* Such approach solves two big problems:
				2296	* 1. Not simplex devices are handled properly.
				2297	* 2. IP spoofing attempts are filtered with 100% of guarantee.
				2298	* called with rcu_read_lock()
				2299	*/
				2300
				2301	static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				2302	u8 tos, struct net_device *dev)
				2303	{
				2304	struct fib_result res;
				2305	struct in_device *in_dev = __in_dev_get_rcu(dev);
				2306	struct flowi4 fl4;
				2307	unsigned flags = 0;
				2308	u32 itag = 0;
				2309	struct rtable * rth;
				2310	unsigned hash;
				2311	__be32 spec_dst;
				2312	int err = -EINVAL;
				2313	struct net * net = dev_net(dev);
				2314
				2315	/* IP on this device is disabled. */
				2316
				2317	if (!in_dev)
				2318	goto out;
				2319
				2320	/* Check for the most weird martians, which can be not detected
				2321	by fib_lookup.
				2322	*/
				2323
				2324	if (ipv4_is_multicast(saddr) \|\| ipv4_is_lbcast(saddr) \|\|
				2325	ipv4_is_loopback(saddr))
				2326	goto martian_source;
				2327
				2328	if (ipv4_is_lbcast(daddr) \|\| (saddr == 0 && daddr == 0))
				2329	goto brd_input;
				2330
				2331	/* Accept zero addresses only to limited broadcast;
				2332	* I even do not know to fix it or not. Waiting for complains :-)
				2333	*/
				2334	if (ipv4_is_zeronet(saddr))
				2335	goto martian_source;
				2336
				2337	if (ipv4_is_zeronet(daddr) \|\| ipv4_is_loopback(daddr))
				2338	goto martian_destination;
				2339
				2340	/*
				2341	* Now we are ready to route packet.
				2342	*/
				2343	fl4.flowi4_oif = 0;
				2344	fl4.flowi4_iif = dev->ifindex;
				2345	fl4.flowi4_mark = skb->mark;
				2346	fl4.flowi4_tos = tos;
				2347	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
				2348	fl4.daddr = daddr;
				2349	fl4.saddr = saddr;
				2350	err = fib_lookup(net, &fl4, &res);
				2351	if (err != 0) {
				2352	if (!IN_DEV_FORWARD(in_dev))
				2353	goto e_hostunreach;
				2354	goto no_route;
				2355	}
				2356
				2357	RT_CACHE_STAT_INC(in_slow_tot);
				2358
				2359	if (res.type == RTN_BROADCAST)
				2360	goto brd_input;
				2361
				2362	if (res.type == RTN_LOCAL) {
				2363	err = fib_validate_source(skb, saddr, daddr, tos,
				2364	net->loopback_dev->ifindex,
				2365	dev, &spec_dst, &itag);
				2366	if (err < 0)
				2367	goto martian_source_keep_err;
				2368	if (err)
				2369	flags \|= RTCF_DIRECTSRC;
				2370	spec_dst = daddr;
				2371	goto local_input;
				2372	}
				2373
				2374	if (!IN_DEV_FORWARD(in_dev))
				2375	goto e_hostunreach;
				2376	if (res.type != RTN_UNICAST)
				2377	goto martian_destination;
				2378
				2379	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
				2380	out:
				2381	net_run_track(PRT_ROUTE," route err = %d",err);
				2382	return err;
				2383
				2384	brd_input:
				2385	if (skb->protocol != htons(ETH_P_IP))
				2386	goto e_inval;
				2387
				2388	if (ipv4_is_zeronet(saddr))
				2389	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
				2390	else {
				2391	err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
				2392	&itag);
				2393	if (err < 0)
				2394	goto martian_source_keep_err;
				2395	if (err)
				2396	flags \|= RTCF_DIRECTSRC;
				2397	}
				2398	flags \|= RTCF_BROADCAST;
				2399	res.type = RTN_BROADCAST;
				2400	RT_CACHE_STAT_INC(in_brd);
				2401
				2402	local_input:
				2403	rth = rt_dst_alloc(net->loopback_dev,
				2404	IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
				2405	if (!rth)
				2406	goto e_nobufs;
				2407
				2408	rth->dst.input= ip_local_deliver;
				2409	rth->dst.output= ip_rt_bug;
				2410	#ifdef CONFIG_IP_ROUTE_CLASSID
				2411	rth->dst.tclassid = itag;
				2412	#endif
				2413
				2414	rth->rt_key_dst = daddr;
				2415	rth->rt_key_src = saddr;
				2416	rth->rt_genid = rt_genid(net);
				2417	rth->rt_flags = flags\|RTCF_LOCAL;
				2418	rth->rt_type = res.type;
				2419	rth->rt_key_tos = tos;
				2420	rth->rt_dst = daddr;
				2421	rth->rt_src = saddr;
				2422	#ifdef CONFIG_IP_ROUTE_CLASSID
				2423	rth->dst.tclassid = itag;
				2424	#endif
				2425	rth->rt_route_iif = dev->ifindex;
				2426	rth->rt_iif = dev->ifindex;
				2427	rth->rt_oif = 0;
				2428	rth->rt_mark = skb->mark;
				2429	rth->rt_gateway = daddr;
				2430	rth->rt_spec_dst= spec_dst;
				2431	rth->rt_peer_genid = 0;
				2432	rth->peer = NULL;
				2433	rth->fi = NULL;
				2434	if (res.type == RTN_UNREACHABLE) {
				2435	rth->dst.input= ip_error;
				2436	rth->dst.error= -err;
				2437	rth->rt_flags &= ~RTCF_LOCAL;
				2438	}
				2439	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
				2440	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
				2441	err = 0;
				2442	if (IS_ERR(rth))
				2443	err = PTR_ERR(rth);
				2444	goto out;
				2445
				2446	no_route:
				2447	RT_CACHE_STAT_INC(in_no_route);
				2448	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
				2449	res.type = RTN_UNREACHABLE;
				2450	if (err == -ESRCH)
				2451	err = -ENETUNREACH;
				2452	goto local_input;
				2453
				2454	/*
				2455	* Do not cache martian addresses: they should be logged (RFC1812)
				2456	*/
				2457	martian_destination:
				2458	RT_CACHE_STAT_INC(in_martian_dst);
				2459	#ifdef CONFIG_IP_ROUTE_VERBOSE
				2460	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
				2461	pr_warn("martian destination %pI4 from %pI4, dev %s\n",
				2462	&daddr, &saddr, dev->name);
				2463	#endif
				2464
				2465	e_hostunreach:
				2466	err = -EHOSTUNREACH;
				2467	goto out;
				2468
				2469	e_inval:
				2470	err = -EINVAL;
				2471	goto out;
				2472
				2473	e_nobufs:
				2474	err = -ENOBUFS;
				2475	goto out;
				2476
				2477	martian_source:
				2478	err = -EINVAL;
				2479	martian_source_keep_err:
				2480	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
				2481	goto out;
				2482	}
				2483
				2484	int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				2485	u8 tos, struct net_device *dev, bool noref)
				2486	{
				2487	struct rtable * rth;
				2488	unsigned hash;
				2489	int iif = dev->ifindex;
				2490	struct net *net;
				2491	int res;
				2492
				2493	net = dev_net(dev);
				2494
				2495	rcu_read_lock();
				2496
				2497	if (!rt_caching(net))
				2498	goto skip_cache;
				2499
				2500	tos &= IPTOS_RT_MASK;
				2501	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
				2502
				2503	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
				2504	rth = rcu_dereference(rth->dst.rt_next)) {
				2505	if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) \|
				2506	((__force u32)rth->rt_key_src ^ (__force u32)saddr) \|
				2507	(rth->rt_route_iif ^ iif) \|
				2508	(rth->rt_key_tos ^ tos)) == 0 &&
				2509	rth->rt_mark == skb->mark &&
				2510	net_eq(dev_net(rth->dst.dev), net) &&
				2511	!rt_is_expired(rth)) {
				2512	ipv4_validate_peer(rth);
				2513	if (noref) {
				2514	dst_use_noref(&rth->dst, jiffies);
				2515	skb_dst_set_noref(skb, &rth->dst);
				2516	} else {
				2517	dst_use(&rth->dst, jiffies);
				2518	skb_dst_set(skb, &rth->dst);
				2519	}
				2520	RT_CACHE_STAT_INC(in_hit);
				2521	rcu_read_unlock();
				2522	return 0;
				2523	}
				2524	RT_CACHE_STAT_INC(in_hlist_search);
				2525	}
				2526
				2527	skip_cache:
				2528	/* Multicast recognition logic is moved from route cache to here.
				2529	The problem was that too many Ethernet cards have broken/missing
				2530	hardware multicast filters :-( As result the host on multicasting
				2531	network acquires a lot of useless route cache entries, sort of
				2532	SDR messages from all the world. Now we try to get rid of them.
				2533	Really, provided software IP multicast filter is organized
				2534	reasonably (at least, hashed), it does not result in a slowdown
				2535	comparing with route cache reject entries.
				2536	Note, that multicast routers are not affected, because
				2537	route cache entry is created eventually.
				2538	*/
				2539	if (ipv4_is_multicast(daddr)) {
				2540	struct in_device *in_dev = __in_dev_get_rcu(dev);
				2541
				2542	if (in_dev) {
				2543	int our = ip_check_mc_rcu(in_dev, daddr, saddr,
				2544	ip_hdr(skb)->protocol);
				2545	if (our
				2546	#ifdef CONFIG_IP_MROUTE
				2547	\|\|
				2548	(!ipv4_is_local_multicast(daddr) &&
				2549	IN_DEV_MFORWARD(in_dev))
				2550	#endif
				2551	) {
				2552	int res = ip_route_input_mc(skb, daddr, saddr,
				2553	tos, dev, our);
				2554	rcu_read_unlock();
				2555	net_run_track(PRT_ROUTE," route");
				2556	return res;
				2557	}
				2558	}
				2559	rcu_read_unlock();
				2560	return -EINVAL;
				2561	}
				2562	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
				2563	rcu_read_unlock();
				2564	net_run_track(PRT_ROUTE," route");
				2565	return res;
				2566	}
				2567	EXPORT_SYMBOL(ip_route_input_common);
				2568
				2569	/* called with rcu_read_lock() */
				2570	static struct rtable __mkroute_output(const struct fib_result res,
				2571	const struct flowi4 *fl4,
				2572	__be32 orig_daddr, __be32 orig_saddr,
				2573	int orig_oif, __u8 orig_rtos,
				2574	struct net_device *dev_out,
				2575	unsigned int flags)
				2576	{
				2577	struct fib_info *fi = res->fi;
				2578	struct in_device *in_dev;
				2579	u16 type = res->type;
				2580	struct rtable *rth;
				2581
				2582	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
				2583	return ERR_PTR(-EINVAL);
				2584
				2585	if (ipv4_is_lbcast(fl4->daddr))
				2586	type = RTN_BROADCAST;
				2587	else if (ipv4_is_multicast(fl4->daddr))
				2588	type = RTN_MULTICAST;
				2589	else if (ipv4_is_zeronet(fl4->daddr))
				2590	return ERR_PTR(-EINVAL);
				2591
				2592	if (dev_out->flags & IFF_LOOPBACK)
				2593	flags \|= RTCF_LOCAL;
				2594
				2595	in_dev = __in_dev_get_rcu(dev_out);
				2596	if (!in_dev)
				2597	return ERR_PTR(-EINVAL);
				2598
				2599	if (type == RTN_BROADCAST) {
				2600	flags \|= RTCF_BROADCAST \| RTCF_LOCAL;
				2601	fi = NULL;
				2602	} else if (type == RTN_MULTICAST) {
				2603	flags \|= RTCF_MULTICAST \| RTCF_LOCAL;
				2604	if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				2605	fl4->flowi4_proto))
				2606	flags &= ~RTCF_LOCAL;
				2607	/* If multicast route do not exist use
				2608	* default one, but do not gateway in this case.
				2609	* Yes, it is hack.
				2610	*/
				2611	if (fi && res->prefixlen < 4)
				2612	fi = NULL;
				2613	}
				2614
				2615	rth = rt_dst_alloc(dev_out,
				2616	IN_DEV_CONF_GET(in_dev, NOPOLICY),
				2617	IN_DEV_CONF_GET(in_dev, NOXFRM));
				2618	if (!rth)
				2619	return ERR_PTR(-ENOBUFS);
				2620
				2621	rth->dst.output = ip_output;
				2622
				2623	rth->rt_key_dst = orig_daddr;
				2624	rth->rt_key_src = orig_saddr;
				2625	rth->rt_genid = rt_genid(dev_net(dev_out));
				2626	rth->rt_flags = flags;
				2627	rth->rt_type = type;
				2628	rth->rt_key_tos = orig_rtos;
				2629	rth->rt_dst = fl4->daddr;
				2630	rth->rt_src = fl4->saddr;
				2631	rth->rt_route_iif = 0;
				2632	rth->rt_iif = orig_oif ? : dev_out->ifindex;
				2633	rth->rt_oif = orig_oif;
				2634	rth->rt_mark = fl4->flowi4_mark;
				2635	rth->rt_gateway = fl4->daddr;
				2636	rth->rt_spec_dst= fl4->saddr;
				2637	rth->rt_peer_genid = 0;
				2638	rth->peer = NULL;
				2639	rth->fi = NULL;
				2640
				2641	RT_CACHE_STAT_INC(out_slow_tot);
				2642
				2643	if (flags & RTCF_LOCAL) {
				2644	rth->dst.input = ip_local_deliver;
				2645	rth->rt_spec_dst = fl4->daddr;
				2646	}
				2647	if (flags & (RTCF_BROADCAST \| RTCF_MULTICAST)) {
				2648	rth->rt_spec_dst = fl4->saddr;
				2649	if (flags & RTCF_LOCAL &&
				2650	!(dev_out->flags & IFF_LOOPBACK)) {
				2651	rth->dst.output = ip_mc_output;
				2652	RT_CACHE_STAT_INC(out_slow_mc);
				2653	}
				2654	#ifdef CONFIG_IP_MROUTE
				2655	if (type == RTN_MULTICAST) {
				2656	if (IN_DEV_MFORWARD(in_dev) &&
				2657	!ipv4_is_local_multicast(fl4->daddr)) {
				2658	rth->dst.input = ip_mr_input;
				2659	rth->dst.output = ip_mc_output;
				2660	}
				2661	}
				2662	#endif
				2663	}
				2664
				2665	rt_set_nexthop(rth, fl4, res, fi, type, 0);
				2666
				2667	return rth;
				2668	}
				2669
				2670	/*
				2671	* Major route resolver routine.
				2672	* called with rcu_read_lock();
				2673	*/
				2674
				2675	static struct rtable ip_route_output_slow(struct net net, struct flowi4 *fl4)
				2676	{
				2677	struct net_device *dev_out = NULL;
				2678	__u8 tos = RT_FL_TOS(fl4);
				2679	unsigned int flags = 0;
				2680	struct fib_result res;
				2681	struct rtable *rth;
				2682	__be32 orig_daddr;
				2683	__be32 orig_saddr;
				2684	int orig_oif;
				2685
				2686	res.fi = NULL;
				2687	#ifdef CONFIG_IP_MULTIPLE_TABLES
				2688	res.r = NULL;
				2689	#endif
				2690
				2691	orig_daddr = fl4->daddr;
				2692	orig_saddr = fl4->saddr;
				2693	orig_oif = fl4->flowi4_oif;
				2694
				2695	fl4->flowi4_iif = net->loopback_dev->ifindex;
				2696	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
				2697	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
				2698	RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
				2699
				2700	rcu_read_lock();
				2701	if (fl4->saddr) {
				2702	rth = ERR_PTR(-EINVAL);
				2703	if (ipv4_is_multicast(fl4->saddr) \|\|
				2704	ipv4_is_lbcast(fl4->saddr) \|\|
				2705	ipv4_is_zeronet(fl4->saddr))
				2706	goto out;
				2707
				2708	/* I removed check for oif == dev_out->oif here.
				2709	It was wrong for two reasons:
				2710	1. ip_dev_find(net, saddr) can return wrong iface, if saddr
				2711	is assigned to multiple interfaces.
				2712	2. Moreover, we are allowed to send packets with saddr
				2713	of another iface. --ANK
				2714	*/
				2715
				2716	if (fl4->flowi4_oif == 0 &&
				2717	(ipv4_is_multicast(fl4->daddr) \|\|
				2718	ipv4_is_lbcast(fl4->daddr))) {
				2719	/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
				2720	dev_out = __ip_dev_find(net, fl4->saddr, false);
				2721	if (dev_out == NULL)
				2722	goto out;
				2723
				2724	/* Special hack: user can direct multicasts
				2725	and limited broadcast via necessary interface
				2726	without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
				2727	This hack is not just for fun, it allows
				2728	vic,vat and friends to work.
				2729	They bind socket to loopback, set ttl to zero
				2730	and expect that it will work.
				2731	From the viewpoint of routing cache they are broken,
				2732	because we are not allowed to build multicast path
				2733	with loopback source addr (look, routing cache
				2734	cannot know, that ttl is zero, so that packet
				2735	will not leave this host and route is valid).
				2736	Luckily, this hack is good workaround.
				2737	*/
				2738
				2739	fl4->flowi4_oif = dev_out->ifindex;
				2740	goto make_route;
				2741	}
				2742
				2743	if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
				2744	/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
				2745	if (!__ip_dev_find(net, fl4->saddr, false))
				2746	goto out;
				2747	}
				2748	}
				2749
				2750
				2751	if (fl4->flowi4_oif) {
				2752	dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
				2753	rth = ERR_PTR(-ENODEV);
				2754	if (dev_out == NULL)
				2755	goto out;
				2756
				2757	/* RACE: Check return value of inet_select_addr instead. */
				2758	if (!(dev_out->flags & IFF_UP) \|\| !__in_dev_get_rcu(dev_out)) {
				2759	rth = ERR_PTR(-ENETUNREACH);
				2760	goto out;
				2761	}
				2762	if (ipv4_is_local_multicast(fl4->daddr) \|\|
				2763	ipv4_is_lbcast(fl4->daddr)) {
				2764	if (!fl4->saddr)
				2765	fl4->saddr = inet_select_addr(dev_out, 0,
				2766	RT_SCOPE_LINK);
				2767	goto make_route;
				2768	}
				2769	if (!fl4->saddr) {
				2770	if (ipv4_is_multicast(fl4->daddr))
				2771	fl4->saddr = inet_select_addr(dev_out, 0,
				2772	fl4->flowi4_scope);
				2773	else if (!fl4->daddr)
				2774	fl4->saddr = inet_select_addr(dev_out, 0,
				2775	RT_SCOPE_HOST);
				2776	}
				2777	}
				2778
				2779	if (!fl4->daddr) {
				2780	fl4->daddr = fl4->saddr;
				2781	if (!fl4->daddr)
				2782	fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
				2783	dev_out = net->loopback_dev;
				2784	fl4->flowi4_oif = net->loopback_dev->ifindex;
				2785	res.type = RTN_LOCAL;
				2786	net_run_track(PRT_ROUTE,"local route");
				2787	flags \|= RTCF_LOCAL;
				2788	goto make_route;
				2789	}
				2790
				2791	if (fib_lookup(net, fl4, &res)) {
				2792	res.fi = NULL;
				2793	if (fl4->flowi4_oif) {
				2794	/* Apparently, routing tables are wrong. Assume,
				2795	that the destination is on link.
				2796
				2797	WHY? DW.
				2798	Because we are allowed to send to iface
				2799	even if it has NO routes and NO assigned
				2800	addresses. When oif is specified, routing
				2801	tables are looked up with only one purpose:
				2802	to catch if destination is gatewayed, rather than
				2803	direct. Moreover, if MSG_DONTROUTE is set,
				2804	we send packet, ignoring both routing tables
				2805	and ifaddr state. --ANK
				2806
				2807
				2808	We could make it even if oif is unknown,
				2809	likely IPv6, but we do not.
				2810	*/
				2811
				2812	if (fl4->saddr == 0)
				2813	fl4->saddr = inet_select_addr(dev_out, 0,
				2814	RT_SCOPE_LINK);
				2815	res.type = RTN_UNICAST;
				2816	goto make_route;
				2817	}
				2818	rth = ERR_PTR(-ENETUNREACH);
				2819	goto out;
				2820	}
				2821
				2822	if (res.type == RTN_LOCAL) {
				2823	if (!fl4->saddr) {
				2824	if (res.fi->fib_prefsrc)
				2825	fl4->saddr = res.fi->fib_prefsrc;
				2826	else
				2827	fl4->saddr = fl4->daddr;
				2828	}
				2829	dev_out = net->loopback_dev;
				2830	fl4->flowi4_oif = dev_out->ifindex;
				2831	res.fi = NULL;
				2832	flags \|= RTCF_LOCAL;
				2833	goto make_route;
				2834	}
				2835
				2836	#ifdef CONFIG_IP_ROUTE_MULTIPATH
				2837	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
				2838	fib_select_multipath(&res);
				2839	else
				2840	#endif
				2841	if (!res.prefixlen &&
				2842	res.table->tb_num_default > 1 &&
				2843	res.type == RTN_UNICAST && !fl4->flowi4_oif)
				2844	fib_select_default(&res);
				2845
				2846	if (!fl4->saddr)
				2847	fl4->saddr = FIB_RES_PREFSRC(net, res);
				2848
				2849	dev_out = FIB_RES_DEV(res);
				2850	fl4->flowi4_oif = dev_out->ifindex;
				2851
				2852
				2853	make_route:
				2854	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
				2855	tos, dev_out, flags);
				2856	if (!IS_ERR(rth)) {
				2857	unsigned int hash;
				2858
				2859	hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
				2860	rt_genid(dev_net(dev_out)));
				2861	rth = rt_intern_hash(hash, rth, NULL, orig_oif);
				2862	}
				2863
				2864	out:
				2865	rcu_read_unlock();
				2866	return rth;
				2867	}
				2868
				2869	struct rtable __ip_route_output_key(struct net net, struct flowi4 *flp4)
				2870	{
				2871	struct rtable *rth;
				2872	unsigned int hash;
				2873
				2874	if (!rt_caching(net))
				2875	goto slow_output;
				2876
				2877	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
				2878
				2879	rcu_read_lock_bh();
				2880	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
				2881	rth = rcu_dereference_bh(rth->dst.rt_next)) {
				2882	if (rth->rt_key_dst == flp4->daddr &&
				2883	rth->rt_key_src == flp4->saddr &&
				2884	rt_is_output_route(rth) &&
				2885	rth->rt_oif == flp4->flowi4_oif &&
				2886	rth->rt_mark == flp4->flowi4_mark &&
				2887	!((rth->rt_key_tos ^ flp4->flowi4_tos) &
				2888	(IPTOS_RT_MASK \| RTO_ONLINK)) &&
				2889	net_eq(dev_net(rth->dst.dev), net) &&
				2890	!rt_is_expired(rth)) {
				2891	ipv4_validate_peer(rth);
				2892	dst_use(&rth->dst, jiffies);
				2893	RT_CACHE_STAT_INC(out_hit);
				2894	rcu_read_unlock_bh();
				2895	if (!flp4->saddr)
				2896	flp4->saddr = rth->rt_src;
				2897	if (!flp4->daddr)
				2898	flp4->daddr = rth->rt_dst;
				2899	net_run_track(PRT_ROUTE," route");
				2900	return rth;
				2901	}
				2902	RT_CACHE_STAT_INC(out_hlist_search);
				2903	}
				2904	rcu_read_unlock_bh();
				2905
				2906	slow_output:
				2907	return ip_route_output_slow(net, flp4);
				2908	}
				2909	EXPORT_SYMBOL_GPL(__ip_route_output_key);
				2910
				2911	static struct dst_entry ipv4_blackhole_dst_check(struct dst_entry dst, u32 cookie)
				2912	{
				2913	return NULL;
				2914	}
				2915
				2916	static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
				2917	{
				2918	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
				2919
				2920	return mtu ? : dst->dev->mtu;
				2921	}
				2922
				2923	static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
				2924	{
				2925	}
				2926
				2927	static u32 ipv4_rt_blackhole_cow_metrics(struct dst_entry dst,
				2928	unsigned long old)
				2929	{
				2930	return NULL;
				2931	}
				2932
				2933	static struct dst_ops ipv4_dst_blackhole_ops = {
				2934	.family = AF_INET,
				2935	.protocol = cpu_to_be16(ETH_P_IP),
				2936	.destroy = ipv4_dst_destroy,
				2937	.check = ipv4_blackhole_dst_check,
				2938	.mtu = ipv4_blackhole_mtu,
				2939	.default_advmss = ipv4_default_advmss,
				2940	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
				2941	.cow_metrics = ipv4_rt_blackhole_cow_metrics,
				2942	.neigh_lookup = ipv4_neigh_lookup,
				2943	};
				2944
				2945	struct dst_entry ipv4_blackhole_route(struct net net, struct dst_entry *dst_orig)
				2946	{
				2947	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
				2948	struct rtable ort = (struct rtable ) dst_orig;
				2949
				2950	if (rt) {
				2951	struct dst_entry *new = &rt->dst;
				2952
				2953	new->__use = 1;
				2954	new->input = dst_discard;
				2955	new->output = dst_discard;
				2956	dst_copy_metrics(new, &ort->dst);
				2957
				2958	new->dev = ort->dst.dev;
				2959	if (new->dev)
				2960	dev_hold(new->dev);
				2961
				2962	rt->rt_key_dst = ort->rt_key_dst;
				2963	rt->rt_key_src = ort->rt_key_src;
				2964	rt->rt_key_tos = ort->rt_key_tos;
				2965	rt->rt_route_iif = ort->rt_route_iif;
				2966	rt->rt_iif = ort->rt_iif;
				2967	rt->rt_oif = ort->rt_oif;
				2968	rt->rt_mark = ort->rt_mark;
				2969
				2970	rt->rt_genid = rt_genid(net);
				2971	rt->rt_flags = ort->rt_flags;
				2972	rt->rt_type = ort->rt_type;
				2973	rt->rt_dst = ort->rt_dst;
				2974	rt->rt_src = ort->rt_src;
				2975	rt->rt_gateway = ort->rt_gateway;
				2976	rt->rt_spec_dst = ort->rt_spec_dst;
				2977	rt->peer = ort->peer;
				2978	if (rt->peer)
				2979	atomic_inc(&rt->peer->refcnt);
				2980	rt->fi = ort->fi;
				2981	if (rt->fi)
				2982	atomic_inc(&rt->fi->fib_clntref);
				2983
				2984	dst_free(new);
				2985	}
				2986
				2987	dst_release(dst_orig);
				2988
				2989	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
				2990	}
				2991
				2992	struct rtable ip_route_output_flow(struct net net, struct flowi4 *flp4,
				2993	struct sock *sk)
				2994	{
				2995	struct rtable *rt = __ip_route_output_key(net, flp4);
				2996
				2997	if (IS_ERR(rt))
				2998	return rt;
				2999
				3000	if (flp4->flowi4_proto)
				3001	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
				3002	flowi4_to_flowi(flp4),
				3003	sk, 0);
				3004
				3005	return rt;
				3006	}
				3007	EXPORT_SYMBOL_GPL(ip_route_output_flow);
				3008
				3009	static int rt_fill_info(struct net *net,
				3010	struct sk_buff *skb, u32 pid, u32 seq, int event,
				3011	int nowait, unsigned int flags)
				3012	{
				3013	struct rtable *rt = skb_rtable(skb);
				3014	struct rtmsg *r;
				3015	struct nlmsghdr *nlh;
				3016	unsigned long expires = 0;
				3017	const struct inet_peer *peer = rt->peer;
				3018	u32 id = 0, ts = 0, tsage = 0, error;
				3019
				3020	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
				3021	if (nlh == NULL)
				3022	return -EMSGSIZE;
				3023
				3024	r = nlmsg_data(nlh);
				3025	r->rtm_family = AF_INET;
				3026	r->rtm_dst_len = 32;
				3027	r->rtm_src_len = 0;
				3028	r->rtm_tos = rt->rt_key_tos;
				3029	r->rtm_table = RT_TABLE_MAIN;
				3030	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
				3031	r->rtm_type = rt->rt_type;
				3032	r->rtm_scope = RT_SCOPE_UNIVERSE;
				3033	r->rtm_protocol = RTPROT_UNSPEC;
				3034	r->rtm_flags = (rt->rt_flags & ~0xFFFF) \| RTM_F_CLONED;
				3035	if (rt->rt_flags & RTCF_NOTIFY)
				3036	r->rtm_flags \|= RTM_F_NOTIFY;
				3037	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
				3038	r->rtm_flags \|= RTCF_DOREDIRECT;
				3039
				3040	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
				3041
				3042	if (rt->rt_key_src) {
				3043	r->rtm_src_len = 32;
				3044	NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
				3045	}
				3046	if (rt->dst.dev)
				3047	NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
				3048	#ifdef CONFIG_IP_ROUTE_CLASSID
				3049	if (rt->dst.tclassid)
				3050	NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
				3051	#endif
				3052	if (rt_is_input_route(rt))
				3053	NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
				3054	else if (rt->rt_src != rt->rt_key_src)
				3055	NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
				3056
				3057	if (rt->rt_dst != rt->rt_gateway)
				3058	NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
				3059
				3060	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
				3061	goto nla_put_failure;
				3062
				3063	if (rt->rt_mark)
				3064	NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
				3065
				3066	error = rt->dst.error;
				3067	if (peer) {
				3068	inet_peer_refcheck(rt->peer);
				3069	if (peer->tcp_ts_stamp) {
				3070	ts = peer->tcp_ts;
				3071	tsage = get_seconds() - peer->tcp_ts_stamp;
				3072	}
				3073	expires = ACCESS_ONCE(peer->pmtu_expires);
				3074	if (expires) {
				3075	if (time_before(jiffies, expires))
				3076	expires -= jiffies;
				3077	else
				3078	expires = 0;
				3079	}
				3080	}
				3081
				3082	if (rt_is_input_route(rt)) {
				3083	#ifdef CONFIG_IP_MROUTE
				3084	__be32 dst = rt->rt_dst;
				3085
				3086	if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
				3087	IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				3088	int err = ipmr_get_route(net, skb,
				3089	rt->rt_src, rt->rt_dst,
				3090	r, nowait);
				3091	if (err <= 0) {
				3092	if (!nowait) {
				3093	if (err == 0)
				3094	return 0;
				3095	goto nla_put_failure;
				3096	} else {
				3097	if (err == -EMSGSIZE)
				3098	goto nla_put_failure;
				3099	error = err;
				3100	}
				3101	}
				3102	} else
				3103	#endif
				3104	NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
				3105	}
				3106
				3107	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
				3108	expires, error) < 0)
				3109	goto nla_put_failure;
				3110
				3111	return nlmsg_end(skb, nlh);
				3112
				3113	nla_put_failure:
				3114	nlmsg_cancel(skb, nlh);
				3115	return -EMSGSIZE;
				3116	}
				3117
				3118	static int inet_rtm_getroute(struct sk_buff in_skb, struct nlmsghdr nlh, void *arg)
				3119	{
				3120	struct net *net = sock_net(in_skb->sk);
				3121	struct rtmsg *rtm;
				3122	struct nlattr *tb[RTA_MAX+1];
				3123	struct rtable *rt = NULL;
				3124	__be32 dst = 0;
				3125	__be32 src = 0;
				3126	u32 iif;
				3127	int err;
				3128	int mark;
				3129	struct sk_buff *skb;
				3130
				3131	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
				3132	if (err < 0)
				3133	goto errout;
				3134
				3135	rtm = nlmsg_data(nlh);
				3136
				3137	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
				3138	if (skb == NULL) {
				3139	err = -ENOBUFS;
				3140	goto errout;
				3141	}
				3142
				3143	/* Reserve room for dummy headers, this skb can pass
				3144	through good chunk of routing engine.
				3145	*/
				3146	skb_reset_mac_header(skb);
				3147	skb_reset_network_header(skb);
				3148
				3149	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
				3150	ip_hdr(skb)->protocol = IPPROTO_ICMP;
				3151	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
				3152
				3153	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
				3154	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
				3155	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
				3156	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
				3157
				3158	if (iif) {
				3159	struct net_device *dev;
				3160
				3161	dev = __dev_get_by_index(net, iif);
				3162	if (dev == NULL) {
				3163	err = -ENODEV;
				3164	goto errout_free;
				3165	}
				3166
				3167	skb->protocol = htons(ETH_P_IP);
				3168	skb->dev = dev;
				3169	skb->mark = mark;
				3170	local_bh_disable();
				3171	err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
				3172	local_bh_enable();
				3173
				3174	rt = skb_rtable(skb);
				3175	if (err == 0 && rt->dst.error)
				3176	err = -rt->dst.error;
				3177	} else {
				3178	struct flowi4 fl4 = {
				3179	.daddr = dst,
				3180	.saddr = src,
				3181	.flowi4_tos = rtm->rtm_tos,
				3182	.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
				3183	.flowi4_mark = mark,
				3184	};
				3185	rt = ip_route_output_key(net, &fl4);
				3186
				3187	err = 0;
				3188	if (IS_ERR(rt))
				3189	err = PTR_ERR(rt);
				3190	}
				3191
				3192	if (err)
				3193	goto errout_free;
				3194
				3195	skb_dst_set(skb, &rt->dst);
				3196	if (rtm->rtm_flags & RTM_F_NOTIFY)
				3197	rt->rt_flags \|= RTCF_NOTIFY;
				3198
				3199	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
				3200	RTM_NEWROUTE, 0, 0);
				3201	if (err <= 0)
				3202	goto errout_free;
				3203
				3204	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
				3205	errout:
				3206	return err;
				3207
				3208	errout_free:
				3209	kfree_skb(skb);
				3210	goto errout;
				3211	}
				3212
				3213	int ip_rt_dump(struct sk_buff skb, struct netlink_callback cb)
				3214	{
				3215	struct rtable *rt;
				3216	int h, s_h;
				3217	int idx, s_idx;
				3218	struct net *net;
				3219
				3220	net = sock_net(skb->sk);
				3221
				3222	s_h = cb->args[0];
				3223	if (s_h < 0)
				3224	s_h = 0;
				3225	s_idx = idx = cb->args[1];
				3226	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
				3227	if (!rt_hash_table[h].chain)
				3228	continue;
				3229	rcu_read_lock_bh();
				3230	for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
				3231	rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
				3232	if (!net_eq(dev_net(rt->dst.dev), net) \|\| idx < s_idx)
				3233	continue;
				3234	if (rt_is_expired(rt))
				3235	continue;
				3236	skb_dst_set_noref(skb, &rt->dst);
				3237	if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
				3238	cb->nlh->nlmsg_seq, RTM_NEWROUTE,
				3239	1, NLM_F_MULTI) <= 0) {
				3240	skb_dst_drop(skb);
				3241	rcu_read_unlock_bh();
				3242	goto done;
				3243	}
				3244	skb_dst_drop(skb);
				3245	}
				3246	rcu_read_unlock_bh();
				3247	}
				3248
				3249	done:
				3250	cb->args[0] = h;
				3251	cb->args[1] = idx;
				3252	return skb->len;
				3253	}
				3254
				3255	void ip_rt_multicast_event(struct in_device *in_dev)
				3256	{
				3257	rt_cache_flush(dev_net(in_dev->dev), 0);
				3258	}
				3259
				3260	#ifdef CONFIG_SYSCTL
				3261	static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				3262	void __user *buffer,
				3263	size_t lenp, loff_t ppos)
				3264	{
				3265	if (write) {
				3266	int flush_delay;
				3267	ctl_table ctl;
				3268	struct net *net;
				3269
				3270	memcpy(&ctl, __ctl, sizeof(ctl));
				3271	ctl.data = &flush_delay;
				3272	proc_dointvec(&ctl, write, buffer, lenp, ppos);
				3273
				3274	net = (struct net *)__ctl->extra1;
				3275	rt_cache_flush(net, flush_delay);
				3276	return 0;
				3277	}
				3278
				3279	return -EINVAL;
				3280	}
				3281
				3282	static ctl_table ipv4_route_table[] = {
				3283	{
				3284	.procname = "gc_thresh",
				3285	.data = &ipv4_dst_ops.gc_thresh,
				3286	.maxlen = sizeof(int),
				3287	.mode = 0644,
				3288	.proc_handler = proc_dointvec,
				3289	},
				3290	{
				3291	.procname = "max_size",
				3292	.data = &ip_rt_max_size,
				3293	.maxlen = sizeof(int),
				3294	.mode = 0644,
				3295	.proc_handler = proc_dointvec,
				3296	},
				3297	{
				3298	/* Deprecated. Use gc_min_interval_ms */
				3299
				3300	.procname = "gc_min_interval",
				3301	.data = &ip_rt_gc_min_interval,
				3302	.maxlen = sizeof(int),
				3303	.mode = 0644,
				3304	.proc_handler = proc_dointvec_jiffies,
				3305	},
				3306	{
				3307	.procname = "gc_min_interval_ms",
				3308	.data = &ip_rt_gc_min_interval,
				3309	.maxlen = sizeof(int),
				3310	.mode = 0644,
				3311	.proc_handler = proc_dointvec_ms_jiffies,
				3312	},
				3313	{
				3314	.procname = "gc_timeout",
				3315	.data = &ip_rt_gc_timeout,
				3316	.maxlen = sizeof(int),
				3317	.mode = 0644,
				3318	.proc_handler = proc_dointvec_jiffies,
				3319	},
				3320	{
				3321	.procname = "gc_interval",
				3322	.data = &ip_rt_gc_interval,
				3323	.maxlen = sizeof(int),
				3324	.mode = 0644,
				3325	.proc_handler = proc_dointvec_jiffies,
				3326	},
				3327	{
				3328	.procname = "redirect_load",
				3329	.data = &ip_rt_redirect_load,
				3330	.maxlen = sizeof(int),
				3331	.mode = 0644,
				3332	.proc_handler = proc_dointvec,
				3333	},
				3334	{
				3335	.procname = "redirect_number",
				3336	.data = &ip_rt_redirect_number,
				3337	.maxlen = sizeof(int),
				3338	.mode = 0644,
				3339	.proc_handler = proc_dointvec,
				3340	},
				3341	{
				3342	.procname = "redirect_silence",
				3343	.data = &ip_rt_redirect_silence,
				3344	.maxlen = sizeof(int),
				3345	.mode = 0644,
				3346	.proc_handler = proc_dointvec,
				3347	},
				3348	{
				3349	.procname = "error_cost",
				3350	.data = &ip_rt_error_cost,
				3351	.maxlen = sizeof(int),
				3352	.mode = 0644,
				3353	.proc_handler = proc_dointvec,
				3354	},
				3355	{
				3356	.procname = "error_burst",
				3357	.data = &ip_rt_error_burst,
				3358	.maxlen = sizeof(int),
				3359	.mode = 0644,
				3360	.proc_handler = proc_dointvec,
				3361	},
				3362	{
				3363	.procname = "gc_elasticity",
				3364	.data = &ip_rt_gc_elasticity,
				3365	.maxlen = sizeof(int),
				3366	.mode = 0644,
				3367	.proc_handler = proc_dointvec,
				3368	},
				3369	{
				3370	.procname = "mtu_expires",
				3371	.data = &ip_rt_mtu_expires,
				3372	.maxlen = sizeof(int),
				3373	.mode = 0644,
				3374	.proc_handler = proc_dointvec_jiffies,
				3375	},
				3376	{
				3377	.procname = "min_pmtu",
				3378	.data = &ip_rt_min_pmtu,
				3379	.maxlen = sizeof(int),
				3380	.mode = 0644,
				3381	.proc_handler = proc_dointvec,
				3382	},
				3383	{
				3384	.procname = "min_adv_mss",
				3385	.data = &ip_rt_min_advmss,
				3386	.maxlen = sizeof(int),
				3387	.mode = 0644,
				3388	.proc_handler = proc_dointvec,
				3389	},
				3390	{ }
				3391	};
				3392
				3393	static struct ctl_table empty[1];
				3394
				3395	static struct ctl_table ipv4_skeleton[] =
				3396	{
				3397	{ .procname = "route",
				3398	.mode = 0555, .child = ipv4_route_table},
				3399	{ .procname = "neigh",
				3400	.mode = 0555, .child = empty},
				3401	{ }
				3402	};
				3403
				3404	static __net_initdata struct ctl_path ipv4_path[] = {
				3405	{ .procname = "net", },
				3406	{ .procname = "ipv4", },
				3407	{ },
				3408	};
				3409
				3410	static struct ctl_table ipv4_route_flush_table[] = {
				3411	{
				3412	.procname = "flush",
				3413	.maxlen = sizeof(int),
				3414	.mode = 0200,
				3415	.proc_handler = ipv4_sysctl_rtcache_flush,
				3416	},
				3417	{ },
				3418	};
				3419
				3420	static __net_initdata struct ctl_path ipv4_route_path[] = {
				3421	{ .procname = "net", },
				3422	{ .procname = "ipv4", },
				3423	{ .procname = "route", },
				3424	{ },
				3425	};
				3426
				3427	static __net_init int sysctl_route_net_init(struct net *net)
				3428	{
				3429	struct ctl_table *tbl;
				3430
				3431	tbl = ipv4_route_flush_table;
				3432	if (!net_eq(net, &init_net)) {
				3433	tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
				3434	if (tbl == NULL)
				3435	goto err_dup;
				3436	}
				3437	tbl[0].extra1 = net;
				3438
				3439	net->ipv4.route_hdr =
				3440	register_net_sysctl_table(net, ipv4_route_path, tbl);
				3441	if (net->ipv4.route_hdr == NULL)
				3442	goto err_reg;
				3443	return 0;
				3444
				3445	err_reg:
				3446	if (tbl != ipv4_route_flush_table)
				3447	kfree(tbl);
				3448	err_dup:
				3449	return -ENOMEM;
				3450	}
				3451
				3452	static __net_exit void sysctl_route_net_exit(struct net *net)
				3453	{
				3454	struct ctl_table *tbl;
				3455
				3456	tbl = net->ipv4.route_hdr->ctl_table_arg;
				3457	unregister_net_sysctl_table(net->ipv4.route_hdr);
				3458	BUG_ON(tbl == ipv4_route_flush_table);
				3459	kfree(tbl);
				3460	}
				3461
				3462	static __net_initdata struct pernet_operations sysctl_route_ops = {
				3463	.init = sysctl_route_net_init,
				3464	.exit = sysctl_route_net_exit,
				3465	};
				3466	#endif
				3467
				3468	static __net_init int rt_genid_init(struct net *net)
				3469	{
				3470	get_random_bytes(&net->ipv4.rt_genid,
				3471	sizeof(net->ipv4.rt_genid));
				3472	get_random_bytes(&net->ipv4.dev_addr_genid,
				3473	sizeof(net->ipv4.dev_addr_genid));
				3474	return 0;
				3475	}
				3476
				3477	static __net_initdata struct pernet_operations rt_genid_ops = {
				3478	.init = rt_genid_init,
				3479	};
				3480
				3481
				3482	#ifdef CONFIG_IP_ROUTE_CLASSID
				3483	struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
				3484	#endif /* CONFIG_IP_ROUTE_CLASSID */
				3485
				3486	static __initdata unsigned long rhash_entries;
				3487	static int __init set_rhash_entries(char *str)
				3488	{
				3489	if (!str)
				3490	return 0;
				3491	rhash_entries = simple_strtoul(str, &str, 0);
				3492	return 1;
				3493	}
				3494	__setup("rhash_entries=", set_rhash_entries);
				3495
				3496	int __init ip_rt_init(void)
				3497	{
				3498	int rc = 0;
				3499
				3500	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
				3501	if (!ip_idents)
				3502	panic("IP: failed to allocate ip_idents\n");
				3503
				3504	get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
				3505
				3506	#ifdef CONFIG_IP_ROUTE_CLASSID
				3507	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
				3508	if (!ip_rt_acct)
				3509	panic("IP: failed to allocate ip_rt_acct\n");
				3510	#endif
				3511
				3512	ipv4_dst_ops.kmem_cachep =
				3513	kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				3514	SLAB_HWCACHE_ALIGN\|SLAB_PANIC, NULL);
				3515
				3516	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
				3517
				3518	if (dst_entries_init(&ipv4_dst_ops) < 0)
				3519	panic("IP: failed to allocate ipv4_dst_ops counter\n");
				3520
				3521	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
				3522	panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
				3523
				3524	rt_hash_table = (struct rt_hash_bucket *)
				3525	alloc_large_system_hash("IP route cache",
				3526	sizeof(struct rt_hash_bucket),
				3527	rhash_entries,
				3528	(totalram_pages >= 128 * 1024) ?
				3529	15 : 17,
				3530	0,
				3531	&rt_hash_log,
				3532	&rt_hash_mask,
				3533	rhash_entries ? 0 : 512 * 1024);
				3534	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
				3535	rt_hash_lock_init();
				3536
				3537	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
				3538	ip_rt_max_size = (rt_hash_mask + 1) * 16;
				3539
				3540	devinet_init();
				3541	ip_fib_init();
				3542
				3543	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
				3544	expires_ljiffies = jiffies;
				3545	schedule_delayed_work(&expires_work,
				3546	net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
				3547
				3548	if (ip_rt_proc_init())
				3549	pr_err("Unable to create route proc files\n");
				3550	#ifdef CONFIG_XFRM
				3551	xfrm_init();
				3552	xfrm4_init(ip_rt_max_size);
				3553	#endif
				3554	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
				3555
				3556	#ifdef CONFIG_SYSCTL
				3557	register_pernet_subsys(&sysctl_route_ops);
				3558	#endif
				3559	register_pernet_subsys(&rt_genid_ops);
				3560	return rc;
				3561	}
				3562
				3563	#ifdef CONFIG_SYSCTL
				3564	/*
				3565	* We really need to sanitize the damn ipv4 init order, then all
				3566	* this nonsense will go away.
				3567	*/
				3568	void __init ip_static_sysctl_init(void)
				3569	{
				3570	register_sysctl_paths(ipv4_path, ipv4_skeleton);
				3571	}
				3572	#endif