Blame - src/kernel/linux/v4.19/net/ipv4/icmp.c - T800

blob: 4efa5e33513e3b5e04521877209bf3de07a9e5ed [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* NET3: Implementation of the ICMP protocol layer.
				3	*
				4	* Alan Cox, <alan@lxorguk.ukuu.org.uk>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Some of the function names and the icmp unreach table for this
				12	* module were derived from [icmp.c 1.0.11 06/02/93] by
				13	* Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
				14	* Other than that this module is a complete rewrite.
				15	*
				16	* Fixes:
				17	* Clemens Fruhwirth : introduce global icmp rate limiting
				18	* with icmp type masking ability instead
				19	* of broken per type icmp timeouts.
				20	* Mike Shaver : RFC1122 checks.
				21	* Alan Cox : Multicast ping reply as self.
				22	* Alan Cox : Fix atomicity lockup in ip_build_xmit
				23	* call.
				24	* Alan Cox : Added 216,128 byte paths to the MTU
				25	* code.
				26	* Martin Mares : RFC1812 checks.
				27	* Martin Mares : Can be configured to follow redirects
				28	* if acting as a router _without_ a
				29	* routing protocol (RFC 1812).
				30	* Martin Mares : Echo requests may be configured to
				31	* be ignored (RFC 1812).
				32	* Martin Mares : Limitation of ICMP error message
				33	* transmit rate (RFC 1812).
				34	* Martin Mares : TOS and Precedence set correctly
				35	* (RFC 1812).
				36	* Martin Mares : Now copying as much data from the
				37	* original packet as we can without
				38	* exceeding 576 bytes (RFC 1812).
				39	* Willy Konynenberg : Transparent proxying support.
				40	* Keith Owens : RFC1191 correction for 4.2BSD based
				41	* path MTU bug.
				42	* Thomas Quinot : ICMP Dest Unreach codes up to 15 are
				43	* valid (RFC 1812).
				44	* Andi Kleen : Check all packet lengths properly
				45	* and moved all kfree_skb() up to
				46	* icmp_rcv.
				47	* Andi Kleen : Move the rate limit bookkeeping
				48	* into the dest entry and use a token
				49	* bucket filter (thanks to ANK). Make
				50	* the rates sysctl configurable.
				51	* Yu Tianli : Fixed two ugly bugs in icmp_send
				52	* - IP option length was accounted wrongly
				53	* - ICMP header length was not accounted
				54	* at all.
				55	* Tristan Greaves : Added sysctl option to ignore bogus
				56	* broadcast responses from broken routers.
				57	*
				58	* To Fix:
				59	*
				60	* - Should use skb_pull() instead of all the manual checking.
				61	* This would also greatly simply some upper layer error handlers. --AK
				62	*
				63	*/
				64
				65	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				66
				67	#include <linux/module.h>
				68	#include <linux/types.h>
				69	#include <linux/jiffies.h>
				70	#include <linux/kernel.h>
				71	#include <linux/fcntl.h>
				72	#include <linux/socket.h>
				73	#include <linux/in.h>
				74	#include <linux/inet.h>
				75	#include <linux/inetdevice.h>
				76	#include <linux/netdevice.h>
				77	#include <linux/string.h>
				78	#include <linux/netfilter_ipv4.h>
				79	#include <linux/slab.h>
				80	#include <net/snmp.h>
				81	#include <net/ip.h>
				82	#include <net/route.h>
				83	#include <net/protocol.h>
				84	#include <net/icmp.h>
				85	#include <net/tcp.h>
				86	#include <net/udp.h>
				87	#include <net/raw.h>
				88	#include <net/ping.h>
				89	#include <linux/skbuff.h>
				90	#include <net/sock.h>
				91	#include <linux/errno.h>
				92	#include <linux/timer.h>
				93	#include <linux/init.h>
				94	#include <linux/uaccess.h>
				95	#include <net/checksum.h>
				96	#include <net/xfrm.h>
				97	#include <net/inet_common.h>
				98	#include <net/ip_fib.h>
				99	#include <net/l3mdev.h>
				100
				101	/*
				102	* Build xmit assembly blocks
				103	*/
				104
				105	struct icmp_bxm {
				106	struct sk_buff *skb;
				107	int offset;
				108	int data_len;
				109
				110	struct {
				111	struct icmphdr icmph;
				112	__be32 times[3];
				113	} data;
				114	int head_len;
				115	struct ip_options_data replyopts;
				116	};
				117
				118	/* An array of errno for error messages from dest unreach. */
				119	/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
				120
				121	const struct icmp_err icmp_err_convert[] = {
				122	{
				123	.errno = ENETUNREACH, /* ICMP_NET_UNREACH */
				124	.fatal = 0,
				125	},
				126	{
				127	.errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
				128	.fatal = 0,
				129	},
				130	{
				131	.errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
				132	.fatal = 1,
				133	},
				134	{
				135	.errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
				136	.fatal = 1,
				137	},
				138	{
				139	.errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
				140	.fatal = 0,
				141	},
				142	{
				143	.errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
				144	.fatal = 0,
				145	},
				146	{
				147	.errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
				148	.fatal = 1,
				149	},
				150	{
				151	.errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
				152	.fatal = 1,
				153	},
				154	{
				155	.errno = ENONET, /* ICMP_HOST_ISOLATED */
				156	.fatal = 1,
				157	},
				158	{
				159	.errno = ENETUNREACH, /* ICMP_NET_ANO */
				160	.fatal = 1,
				161	},
				162	{
				163	.errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
				164	.fatal = 1,
				165	},
				166	{
				167	.errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
				168	.fatal = 0,
				169	},
				170	{
				171	.errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
				172	.fatal = 0,
				173	},
				174	{
				175	.errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
				176	.fatal = 1,
				177	},
				178	{
				179	.errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
				180	.fatal = 1,
				181	},
				182	{
				183	.errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
				184	.fatal = 1,
				185	},
				186	};
				187	EXPORT_SYMBOL(icmp_err_convert);
				188
				189	/*
				190	* ICMP control array. This specifies what to do with each ICMP.
				191	*/
				192
				193	struct icmp_control {
				194	bool (handler)(struct sk_buff skb);
				195	short error; /* This ICMP is classed as an error message */
				196	};
				197
				198	static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
				199
				200	/*
				201	* The ICMP socket(s). This is the most convenient way to flow control
				202	* our ICMP output as well as maintain a clean interface throughout
				203	* all layers. All Socketless IP sends will soon be gone.
				204	*
				205	* On SMP we have one ICMP socket per-cpu.
				206	*/
				207	static struct sock icmp_sk(struct net net)
				208	{
				209	return *this_cpu_ptr(net->ipv4.icmp_sk);
				210	}
				211
				212	/* Called with BH disabled */
				213	static inline struct sock icmp_xmit_lock(struct net net)
				214	{
				215	struct sock *sk;
				216
				217	sk = icmp_sk(net);
				218
				219	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
				220	/* This can happen if the output path signals a
				221	* dst_link_failure() for an outgoing ICMP packet.
				222	*/
				223	return NULL;
				224	}
				225	return sk;
				226	}
				227
				228	static inline void icmp_xmit_unlock(struct sock *sk)
				229	{
				230	spin_unlock(&sk->sk_lock.slock);
				231	}
				232
				233	int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
				234	int sysctl_icmp_msgs_burst __read_mostly = 50;
				235
				236	static struct {
				237	spinlock_t lock;
				238	u32 credit;
				239	u32 stamp;
				240	} icmp_global = {
				241	.lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
				242	};
				243
				244	/**
				245	* icmp_global_allow - Are we allowed to send one more ICMP message ?
				246	*
				247	* Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
				248	* Returns false if we reached the limit and can not send another packet.
				249	* Note: called with BH disabled
				250	*/
				251	bool icmp_global_allow(void)
				252	{
				253	u32 credit, delta, incr = 0, now = (u32)jiffies;
				254	bool rc = false;
				255
				256	/* Check if token bucket is empty and cannot be refilled
				257	* without taking the spinlock. The READ_ONCE() are paired
				258	* with the following WRITE_ONCE() in this same function.
				259	*/
				260	if (!READ_ONCE(icmp_global.credit)) {
				261	delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
				262	if (delta < HZ / 50)
				263	return false;
				264	}
				265
				266	spin_lock(&icmp_global.lock);
				267	delta = min_t(u32, now - icmp_global.stamp, HZ);
				268	if (delta >= HZ / 50) {
				269	incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
				270	if (incr)
				271	WRITE_ONCE(icmp_global.stamp, now);
				272	}
				273	credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
				274	if (credit) {
				275	credit--;
				276	rc = true;
				277	}
				278	WRITE_ONCE(icmp_global.credit, credit);
				279	spin_unlock(&icmp_global.lock);
				280	return rc;
				281	}
				282	EXPORT_SYMBOL(icmp_global_allow);
				283
				284	static bool icmpv4_mask_allow(struct net *net, int type, int code)
				285	{
				286	if (type > NR_ICMP_TYPES)
				287	return true;
				288
				289	/* Don't limit PMTU discovery. */
				290	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
				291	return true;
				292
				293	/* Limit if icmp type is enabled in ratemask. */
				294	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
				295	return true;
				296
				297	return false;
				298	}
				299
				300	static bool icmpv4_global_allow(struct net *net, int type, int code)
				301	{
				302	if (icmpv4_mask_allow(net, type, code))
				303	return true;
				304
				305	if (icmp_global_allow())
				306	return true;
				307
				308	return false;
				309	}
				310
				311	/*
				312	* Send an ICMP frame.
				313	*/
				314
				315	static bool icmpv4_xrlim_allow(struct net net, struct rtable rt,
				316	struct flowi4 *fl4, int type, int code)
				317	{
				318	struct dst_entry *dst = &rt->dst;
				319	struct inet_peer *peer;
				320	bool rc = true;
				321	int vif;
				322
				323	if (icmpv4_mask_allow(net, type, code))
				324	goto out;
				325
				326	/* No rate limit on loopback */
				327	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
				328	goto out;
				329
				330	vif = l3mdev_master_ifindex(dst->dev);
				331	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
				332	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
				333	if (peer)
				334	inet_putpeer(peer);
				335	out:
				336	return rc;
				337	}
				338
				339	/*
				340	* Maintain the counters used in the SNMP statistics for outgoing ICMP
				341	*/
				342	void icmp_out_count(struct net *net, unsigned char type)
				343	{
				344	ICMPMSGOUT_INC_STATS(net, type);
				345	ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
				346	}
				347
				348	/*
				349	* Checksum each fragment, and on the first include the headers and final
				350	* checksum.
				351	*/
				352	static int icmp_glue_bits(void from, char to, int offset, int len, int odd,
				353	struct sk_buff *skb)
				354	{
				355	struct icmp_bxm icmp_param = (struct icmp_bxm )from;
				356	__wsum csum;
				357
				358	csum = skb_copy_and_csum_bits(icmp_param->skb,
				359	icmp_param->offset + offset,
				360	to, len, 0);
				361
				362	skb->csum = csum_block_add(skb->csum, csum, odd);
				363	if (icmp_pointers[icmp_param->data.icmph.type].error)
				364	nf_ct_attach(skb, icmp_param->skb);
				365	return 0;
				366	}
				367
				368	static void icmp_push_reply(struct icmp_bxm *icmp_param,
				369	struct flowi4 *fl4,
				370	struct ipcm_cookie ipc, struct rtable *rt)
				371	{
				372	struct sock *sk;
				373	struct sk_buff *skb;
				374
				375	sk = icmp_sk(dev_net((*rt)->dst.dev));
				376	if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
				377	icmp_param->data_len+icmp_param->head_len,
				378	icmp_param->head_len,
				379	ipc, rt, MSG_DONTWAIT) < 0) {
				380	__ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
				381	ip_flush_pending_frames(sk);
				382	} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
				383	struct icmphdr *icmph = icmp_hdr(skb);
				384	__wsum csum = 0;
				385	struct sk_buff *skb1;
				386
				387	skb_queue_walk(&sk->sk_write_queue, skb1) {
				388	csum = csum_add(csum, skb1->csum);
				389	}
				390	csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
				391	(char *)icmph,
				392	icmp_param->head_len, csum);
				393	icmph->checksum = csum_fold(csum);
				394	skb->ip_summed = CHECKSUM_NONE;
				395	ip_push_pending_frames(sk, fl4);
				396	}
				397	}
				398
				399	/*
				400	* Driving logic for building and sending ICMP messages.
				401	*/
				402
				403	static void icmp_reply(struct icmp_bxm icmp_param, struct sk_buff skb)
				404	{
				405	struct ipcm_cookie ipc;
				406	struct rtable *rt = skb_rtable(skb);
				407	struct net *net = dev_net(rt->dst.dev);
				408	struct flowi4 fl4;
				409	struct sock *sk;
				410	struct inet_sock *inet;
				411	__be32 daddr, saddr;
				412	u32 mark = IP4_REPLY_MARK(net, skb->mark);
				413	int type = icmp_param->data.icmph.type;
				414	int code = icmp_param->data.icmph.code;
				415
				416	if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
				417	return;
				418
				419	/* Needed by both icmp_global_allow and icmp_xmit_lock */
				420	local_bh_disable();
				421
				422	/* global icmp_msgs_per_sec */
				423	if (!icmpv4_global_allow(net, type, code))
				424	goto out_bh_enable;
				425
				426	sk = icmp_xmit_lock(net);
				427	if (!sk)
				428	goto out_bh_enable;
				429	inet = inet_sk(sk);
				430
				431	icmp_param->data.icmph.checksum = 0;
				432
				433	ipcm_init(&ipc);
				434	inet->tos = ip_hdr(skb)->tos;
				435	sk->sk_mark = mark;
				436	daddr = ipc.addr = ip_hdr(skb)->saddr;
				437	saddr = fib_compute_spec_dst(skb);
				438
				439	if (icmp_param->replyopts.opt.opt.optlen) {
				440	ipc.opt = &icmp_param->replyopts.opt;
				441	if (ipc.opt->opt.srr)
				442	daddr = icmp_param->replyopts.opt.opt.faddr;
				443	}
				444	memset(&fl4, 0, sizeof(fl4));
				445	fl4.daddr = daddr;
				446	fl4.saddr = saddr;
				447	fl4.flowi4_mark = mark;
				448	fl4.flowi4_uid = sock_net_uid(net, NULL);
				449	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
				450	fl4.flowi4_proto = IPPROTO_ICMP;
				451	fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
				452	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
				453	rt = ip_route_output_key(net, &fl4);
				454	if (IS_ERR(rt))
				455	goto out_unlock;
				456	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
				457	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
				458	ip_rt_put(rt);
				459	out_unlock:
				460	icmp_xmit_unlock(sk);
				461	out_bh_enable:
				462	local_bh_enable();
				463	}
				464
				465	static struct rtable icmp_route_lookup(struct net net,
				466	struct flowi4 *fl4,
				467	struct sk_buff *skb_in,
				468	const struct iphdr *iph,
				469	__be32 saddr, u8 tos, u32 mark,
				470	int type, int code,
				471	struct icmp_bxm *param)
				472	{
				473	struct rtable rt, rt2;
				474	struct flowi4 fl4_dec;
				475	int err;
				476
				477	memset(fl4, 0, sizeof(*fl4));
				478	fl4->daddr = (param->replyopts.opt.opt.srr ?
				479	param->replyopts.opt.opt.faddr : iph->saddr);
				480	fl4->saddr = saddr;
				481	fl4->flowi4_mark = mark;
				482	fl4->flowi4_uid = sock_net_uid(net, NULL);
				483	fl4->flowi4_tos = RT_TOS(tos);
				484	fl4->flowi4_proto = IPPROTO_ICMP;
				485	fl4->fl4_icmp_type = type;
				486	fl4->fl4_icmp_code = code;
				487	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
				488
				489	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
				490	rt = ip_route_output_key_hash(net, fl4, skb_in);
				491	if (IS_ERR(rt))
				492	return rt;
				493
				494	/* No need to clone since we're just using its address. */
				495	rt2 = rt;
				496
				497	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
				498	flowi4_to_flowi(fl4), NULL, 0);
				499	if (!IS_ERR(rt)) {
				500	if (rt != rt2)
				501	return rt;
				502	} else if (PTR_ERR(rt) == -EPERM) {
				503	rt = NULL;
				504	} else
				505	return rt;
				506
				507	err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
				508	if (err)
				509	goto relookup_failed;
				510
				511	if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
				512	fl4_dec.saddr) == RTN_LOCAL) {
				513	rt2 = __ip_route_output_key(net, &fl4_dec);
				514	if (IS_ERR(rt2))
				515	err = PTR_ERR(rt2);
				516	} else {
				517	struct flowi4 fl4_2 = {};
				518	unsigned long orefdst;
				519
				520	fl4_2.daddr = fl4_dec.saddr;
				521	rt2 = ip_route_output_key(net, &fl4_2);
				522	if (IS_ERR(rt2)) {
				523	err = PTR_ERR(rt2);
				524	goto relookup_failed;
				525	}
				526	/* Ugh! */
				527	orefdst = skb_in->_skb_refdst; /* save old refdst */
				528	skb_dst_set(skb_in, NULL);
				529	err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
				530	RT_TOS(tos), rt2->dst.dev);
				531
				532	dst_release(&rt2->dst);
				533	rt2 = skb_rtable(skb_in);
				534	skb_in->_skb_refdst = orefdst; /* restore old refdst */
				535	}
				536
				537	if (err)
				538	goto relookup_failed;
				539
				540	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
				541	flowi4_to_flowi(&fl4_dec), NULL,
				542	XFRM_LOOKUP_ICMP);
				543	if (!IS_ERR(rt2)) {
				544	dst_release(&rt->dst);
				545	memcpy(fl4, &fl4_dec, sizeof(*fl4));
				546	rt = rt2;
				547	} else if (PTR_ERR(rt2) == -EPERM) {
				548	if (rt)
				549	dst_release(&rt->dst);
				550	return rt2;
				551	} else {
				552	err = PTR_ERR(rt2);
				553	goto relookup_failed;
				554	}
				555	return rt;
				556
				557	relookup_failed:
				558	if (rt)
				559	return rt;
				560	return ERR_PTR(err);
				561	}
				562
				563	/*
				564	* Send an ICMP message in response to a situation
				565	*
				566	* RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
				567	* MAY send more (we do).
				568	* MUST NOT change this header information.
				569	* MUST NOT reply to a multicast/broadcast IP address.
				570	* MUST NOT reply to a multicast/broadcast MAC address.
				571	* MUST reply to only the first fragment.
				572	*/
				573
				574	void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
				575	const struct ip_options *opt)
				576	{
				577	struct iphdr *iph;
				578	int room;
				579	struct icmp_bxm icmp_param;
				580	struct rtable *rt = skb_rtable(skb_in);
				581	struct ipcm_cookie ipc;
				582	struct flowi4 fl4;
				583	__be32 saddr;
				584	u8 tos;
				585	u32 mark;
				586	struct net *net;
				587	struct sock *sk;
				588
				589	if (!rt)
				590	goto out;
				591
				592	if (rt->dst.dev)
				593	net = dev_net(rt->dst.dev);
				594	else if (skb_in->dev)
				595	net = dev_net(skb_in->dev);
				596	else
				597	goto out;
				598
				599	/*
				600	* Find the original header. It is expected to be valid, of course.
				601	* Check this, icmp_send is called from the most obscure devices
				602	* sometimes.
				603	*/
				604	iph = ip_hdr(skb_in);
				605
				606	if ((u8 *)iph < skb_in->head \|\|
				607	(skb_network_header(skb_in) + sizeof(*iph)) >
				608	skb_tail_pointer(skb_in))
				609	goto out;
				610
				611	/*
				612	* No replies to physical multicast/broadcast
				613	*/
				614	if (skb_in->pkt_type != PACKET_HOST)
				615	goto out;
				616
				617	/*
				618	* Now check at the protocol level
				619	*/
				620	if (rt->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST))
				621	goto out;
				622
				623	/*
				624	* Only reply to fragment 0. We byte re-order the constant
				625	* mask for efficiency.
				626	*/
				627	if (iph->frag_off & htons(IP_OFFSET))
				628	goto out;
				629
				630	/*
				631	* If we send an ICMP error to an ICMP error a mess would result..
				632	*/
				633	if (icmp_pointers[type].error) {
				634	/*
				635	* We are an error, check if we are replying to an
				636	* ICMP error
				637	*/
				638	if (iph->protocol == IPPROTO_ICMP) {
				639	u8 _inner_type, *itp;
				640
				641	itp = skb_header_pointer(skb_in,
				642	skb_network_header(skb_in) +
				643	(iph->ihl << 2) +
				644	offsetof(struct icmphdr,
				645	type) -
				646	skb_in->data,
				647	sizeof(_inner_type),
				648	&_inner_type);
				649	if (!itp)
				650	goto out;
				651
				652	/*
				653	* Assume any unknown ICMP type is an error. This
				654	* isn't specified by the RFC, but think about it..
				655	*/
				656	if (*itp > NR_ICMP_TYPES \|\|
				657	icmp_pointers[*itp].error)
				658	goto out;
				659	}
				660	}
				661
				662	/* Needed by both icmp_global_allow and icmp_xmit_lock */
				663	local_bh_disable();
				664
				665	/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
				666	* incoming dev is loopback. If outgoing dev change to not be
				667	* loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
				668	*/
				669	if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
				670	!icmpv4_global_allow(net, type, code))
				671	goto out_bh_enable;
				672
				673	sk = icmp_xmit_lock(net);
				674	if (!sk)
				675	goto out_bh_enable;
				676
				677	/*
				678	* Construct source address and options.
				679	*/
				680
				681	saddr = iph->daddr;
				682	if (!(rt->rt_flags & RTCF_LOCAL)) {
				683	struct net_device *dev = NULL;
				684
				685	rcu_read_lock();
				686	if (rt_is_input_route(rt) &&
				687	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
				688	dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
				689
				690	if (dev)
				691	saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
				692	else
				693	saddr = 0;
				694	rcu_read_unlock();
				695	}
				696
				697	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) \|
				698	IPTOS_PREC_INTERNETCONTROL) :
				699	iph->tos;
				700	mark = IP4_REPLY_MARK(net, skb_in->mark);
				701
				702	if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
				703	goto out_unlock;
				704
				705
				706	/*
				707	* Prepare data for ICMP header.
				708	*/
				709
				710	icmp_param.data.icmph.type = type;
				711	icmp_param.data.icmph.code = code;
				712	icmp_param.data.icmph.un.gateway = info;
				713	icmp_param.data.icmph.checksum = 0;
				714	icmp_param.skb = skb_in;
				715	icmp_param.offset = skb_network_offset(skb_in);
				716	inet_sk(sk)->tos = tos;
				717	sk->sk_mark = mark;
				718	ipcm_init(&ipc);
				719	ipc.addr = iph->saddr;
				720	ipc.opt = &icmp_param.replyopts.opt;
				721
				722	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
				723	type, code, &icmp_param);
				724	if (IS_ERR(rt))
				725	goto out_unlock;
				726
				727	/* peer icmp_ratelimit */
				728	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
				729	goto ende;
				730
				731	/* RFC says return as much as we can without exceeding 576 bytes. */
				732
				733	room = dst_mtu(&rt->dst);
				734	if (room > 576)
				735	room = 576;
				736	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
				737	room -= sizeof(struct icmphdr);
				738
				739	icmp_param.data_len = skb_in->len - icmp_param.offset;
				740	if (icmp_param.data_len > room)
				741	icmp_param.data_len = room;
				742	icmp_param.head_len = sizeof(struct icmphdr);
				743
				744	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
				745	ende:
				746	ip_rt_put(rt);
				747	out_unlock:
				748	icmp_xmit_unlock(sk);
				749	out_bh_enable:
				750	local_bh_enable();
				751	out:;
				752	}
				753	EXPORT_SYMBOL(__icmp_send);
				754
				755
				756	static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
				757	{
				758	const struct iphdr iph = (const struct iphdr ) skb->data;
				759	const struct net_protocol *ipprot;
				760	int protocol = iph->protocol;
				761
				762	/* Checkin full IP header plus 8 bytes of protocol to
				763	* avoid additional coding at protocol handlers.
				764	*/
				765	if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
				766	__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
				767	return;
				768	}
				769
				770	raw_icmp_error(skb, protocol, info);
				771
				772	ipprot = rcu_dereference(inet_protos[protocol]);
				773	if (ipprot && ipprot->err_handler)
				774	ipprot->err_handler(skb, info);
				775	}
				776
				777	static bool icmp_tag_validation(int proto)
				778	{
				779	bool ok;
				780
				781	rcu_read_lock();
				782	ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
				783	rcu_read_unlock();
				784	return ok;
				785	}
				786
				787	/*
				788	* Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and
				789	* ICMP_PARAMETERPROB.
				790	*/
				791
				792	static bool icmp_unreach(struct sk_buff *skb)
				793	{
				794	const struct iphdr *iph;
				795	struct icmphdr *icmph;
				796	struct net *net;
				797	u32 info = 0;
				798
				799	net = dev_net(skb_dst(skb)->dev);
				800
				801	/*
				802	* Incomplete header ?
				803	* Only checks for the IP header, there should be an
				804	* additional check for longer headers in upper levels.
				805	*/
				806
				807	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
				808	goto out_err;
				809
				810	icmph = icmp_hdr(skb);
				811	iph = (const struct iphdr *)skb->data;
				812
				813	if (iph->ihl < 5) /* Mangled header, drop. */
				814	goto out_err;
				815
				816	switch (icmph->type) {
				817	case ICMP_DEST_UNREACH:
				818	switch (icmph->code & 15) {
				819	case ICMP_NET_UNREACH:
				820	case ICMP_HOST_UNREACH:
				821	case ICMP_PROT_UNREACH:
				822	case ICMP_PORT_UNREACH:
				823	break;
				824	case ICMP_FRAG_NEEDED:
				825	/* for documentation of the ip_no_pmtu_disc
				826	* values please see
				827	* Documentation/networking/ip-sysctl.txt
				828	*/
				829	switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
				830	default:
				831	net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
				832	&iph->daddr);
				833	break;
				834	case 2:
				835	goto out;
				836	case 3:
				837	if (!icmp_tag_validation(iph->protocol))
				838	goto out;
				839	/* fall through */
				840	case 0:
				841	info = ntohs(icmph->un.frag.mtu);
				842	}
				843	break;
				844	case ICMP_SR_FAILED:
				845	net_dbg_ratelimited("%pI4: Source Route Failed\n",
				846	&iph->daddr);
				847	break;
				848	default:
				849	break;
				850	}
				851	if (icmph->code > NR_ICMP_UNREACH)
				852	goto out;
				853	break;
				854	case ICMP_PARAMETERPROB:
				855	info = ntohl(icmph->un.gateway) >> 24;
				856	break;
				857	case ICMP_TIME_EXCEEDED:
				858	__ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
				859	if (icmph->code == ICMP_EXC_FRAGTIME)
				860	goto out;
				861	break;
				862	}
				863
				864	/*
				865	* Throw it at our lower layers
				866	*
				867	* RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
				868	* header.
				869	* RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
				870	* transport layer.
				871	* RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
				872	* transport layer.
				873	*/
				874
				875	/*
				876	* Check the other end isn't violating RFC 1122. Some routers send
				877	* bogus responses to broadcast frames. If you see this message
				878	* first check your netmask matches at both ends, if it does then
				879	* get the other vendor to fix their kit.
				880	*/
				881
				882	if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
				883	inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
				884	net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
				885	&ip_hdr(skb)->saddr,
				886	icmph->type, icmph->code,
				887	&iph->daddr, skb->dev->name);
				888	goto out;
				889	}
				890
				891	icmp_socket_deliver(skb, info);
				892
				893	out:
				894	return true;
				895	out_err:
				896	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
				897	return false;
				898	}
				899
				900
				901	/*
				902	* Handle ICMP_REDIRECT.
				903	*/
				904
				905	static bool icmp_redirect(struct sk_buff *skb)
				906	{
				907	if (skb->len < sizeof(struct iphdr)) {
				908	__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
				909	return false;
				910	}
				911
				912	if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
				913	/* there aught to be a stat */
				914	return false;
				915	}
				916
				917	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
				918	return true;
				919	}
				920
				921	/*
				922	* Handle ICMP_ECHO ("ping") requests.
				923	*
				924	* RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
				925	* requests.
				926	* RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
				927	* included in the reply.
				928	* RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
				929	* echo requests, MUST have default=NOT.
				930	* See also WRT handling of options once they are done and working.
				931	*/
				932
				933	static bool icmp_echo(struct sk_buff *skb)
				934	{
				935	struct net *net;
				936
				937	net = dev_net(skb_dst(skb)->dev);
				938	if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
				939	struct icmp_bxm icmp_param;
				940
				941	icmp_param.data.icmph = *icmp_hdr(skb);
				942	icmp_param.data.icmph.type = ICMP_ECHOREPLY;
				943	icmp_param.skb = skb;
				944	icmp_param.offset = 0;
				945	icmp_param.data_len = skb->len;
				946	icmp_param.head_len = sizeof(struct icmphdr);
				947	icmp_reply(&icmp_param, skb);
				948	}
				949	/* should there be an ICMP stat for ignored echos? */
				950	return true;
				951	}
				952
				953	/*
				954	* Handle ICMP Timestamp requests.
				955	* RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
				956	* SHOULD be in the kernel for minimum random latency.
				957	* MUST be accurate to a few minutes.
				958	* MUST be updated at least at 15Hz.
				959	*/
				960	static bool icmp_timestamp(struct sk_buff *skb)
				961	{
				962	struct icmp_bxm icmp_param;
				963	/*
				964	* Too short.
				965	*/
				966	if (skb->len < 4)
				967	goto out_err;
				968
				969	/*
				970	* Fill in the current time as ms since midnight UT:
				971	*/
				972	icmp_param.data.times[1] = inet_current_timestamp();
				973	icmp_param.data.times[2] = icmp_param.data.times[1];
				974
				975	BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
				976
				977	icmp_param.data.icmph = *icmp_hdr(skb);
				978	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
				979	icmp_param.data.icmph.code = 0;
				980	icmp_param.skb = skb;
				981	icmp_param.offset = 0;
				982	icmp_param.data_len = 0;
				983	icmp_param.head_len = sizeof(struct icmphdr) + 12;
				984	icmp_reply(&icmp_param, skb);
				985	return true;
				986
				987	out_err:
				988	__ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
				989	return false;
				990	}
				991
				992	static bool icmp_discard(struct sk_buff *skb)
				993	{
				994	/* pretend it was a success */
				995	return true;
				996	}
				997
				998	/*
				999	* Deal with incoming ICMP packets.
				1000	*/
				1001	int icmp_rcv(struct sk_buff *skb)
				1002	{
				1003	struct icmphdr *icmph;
				1004	struct rtable *rt = skb_rtable(skb);
				1005	struct net *net = dev_net(rt->dst.dev);
				1006	bool success;
				1007
				1008	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1009	struct sec_path *sp = skb_sec_path(skb);
				1010	int nh;
				1011
				1012	if (!(sp && sp->xvec[sp->len - 1]->props.flags &
				1013	XFRM_STATE_ICMP))
				1014	goto drop;
				1015
				1016	if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
				1017	goto drop;
				1018
				1019	nh = skb_network_offset(skb);
				1020	skb_set_network_header(skb, sizeof(*icmph));
				1021
				1022	if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
				1023	goto drop;
				1024
				1025	skb_set_network_header(skb, nh);
				1026	}
				1027
				1028	__ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
				1029
				1030	if (skb_checksum_simple_validate(skb))
				1031	goto csum_error;
				1032
				1033	if (!pskb_pull(skb, sizeof(*icmph)))
				1034	goto error;
				1035
				1036	icmph = icmp_hdr(skb);
				1037
				1038	ICMPMSGIN_INC_STATS(net, icmph->type);
				1039	/*
				1040	* 18 is the highest 'known' ICMP type. Anything else is a mystery
				1041	*
				1042	* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
				1043	* discarded.
				1044	*/
				1045	if (icmph->type > NR_ICMP_TYPES)
				1046	goto error;
				1047
				1048
				1049	/*
				1050	* Parse the ICMP message
				1051	*/
				1052
				1053	if (rt->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST)) {
				1054	/*
				1055	* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
				1056	* silently ignored (we let user decide with a sysctl).
				1057	* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
				1058	* discarded if to broadcast/multicast.
				1059	*/
				1060	if ((icmph->type == ICMP_ECHO \|\|
				1061	icmph->type == ICMP_TIMESTAMP) &&
				1062	net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
				1063	goto error;
				1064	}
				1065	if (icmph->type != ICMP_ECHO &&
				1066	icmph->type != ICMP_TIMESTAMP &&
				1067	icmph->type != ICMP_ADDRESS &&
				1068	icmph->type != ICMP_ADDRESSREPLY) {
				1069	goto error;
				1070	}
				1071	}
				1072
				1073	success = icmp_pointers[icmph->type].handler(skb);
				1074
				1075	if (success) {
				1076	consume_skb(skb);
				1077	return NET_RX_SUCCESS;
				1078	}
				1079
				1080	drop:
				1081	kfree_skb(skb);
				1082	return NET_RX_DROP;
				1083	csum_error:
				1084	__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
				1085	error:
				1086	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
				1087	goto drop;
				1088	}
				1089
				1090	void icmp_err(struct sk_buff *skb, u32 info)
				1091	{
				1092	struct iphdr iph = (struct iphdr )skb->data;
				1093	int offset = iph->ihl<<2;
				1094	struct icmphdr icmph = (struct icmphdr )(skb->data + offset);
				1095	int type = icmp_hdr(skb)->type;
				1096	int code = icmp_hdr(skb)->code;
				1097	struct net *net = dev_net(skb->dev);
				1098
				1099	/*
				1100	* Use ping_err to handle all icmp errors except those
				1101	* triggered by ICMP_ECHOREPLY which sent from kernel.
				1102	*/
				1103	if (icmph->type != ICMP_ECHOREPLY) {
				1104	ping_err(skb, offset, info);
				1105	return;
				1106	}
				1107
				1108	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
				1109	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
				1110	else if (type == ICMP_REDIRECT)
				1111	ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
				1112	}
				1113
				1114	/*
				1115	* This table is the definition of how we handle ICMP.
				1116	*/
				1117	static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
				1118	[ICMP_ECHOREPLY] = {
				1119	.handler = ping_rcv,
				1120	},
				1121	[1] = {
				1122	.handler = icmp_discard,
				1123	.error = 1,
				1124	},
				1125	[2] = {
				1126	.handler = icmp_discard,
				1127	.error = 1,
				1128	},
				1129	[ICMP_DEST_UNREACH] = {
				1130	.handler = icmp_unreach,
				1131	.error = 1,
				1132	},
				1133	[ICMP_SOURCE_QUENCH] = {
				1134	.handler = icmp_unreach,
				1135	.error = 1,
				1136	},
				1137	[ICMP_REDIRECT] = {
				1138	.handler = icmp_redirect,
				1139	.error = 1,
				1140	},
				1141	[6] = {
				1142	.handler = icmp_discard,
				1143	.error = 1,
				1144	},
				1145	[7] = {
				1146	.handler = icmp_discard,
				1147	.error = 1,
				1148	},
				1149	[ICMP_ECHO] = {
				1150	.handler = icmp_echo,
				1151	},
				1152	[9] = {
				1153	.handler = icmp_discard,
				1154	.error = 1,
				1155	},
				1156	[10] = {
				1157	.handler = icmp_discard,
				1158	.error = 1,
				1159	},
				1160	[ICMP_TIME_EXCEEDED] = {
				1161	.handler = icmp_unreach,
				1162	.error = 1,
				1163	},
				1164	[ICMP_PARAMETERPROB] = {
				1165	.handler = icmp_unreach,
				1166	.error = 1,
				1167	},
				1168	[ICMP_TIMESTAMP] = {
				1169	.handler = icmp_timestamp,
				1170	},
				1171	[ICMP_TIMESTAMPREPLY] = {
				1172	.handler = icmp_discard,
				1173	},
				1174	[ICMP_INFO_REQUEST] = {
				1175	.handler = icmp_discard,
				1176	},
				1177	[ICMP_INFO_REPLY] = {
				1178	.handler = icmp_discard,
				1179	},
				1180	[ICMP_ADDRESS] = {
				1181	.handler = icmp_discard,
				1182	},
				1183	[ICMP_ADDRESSREPLY] = {
				1184	.handler = icmp_discard,
				1185	},
				1186	};
				1187
				1188	static void __net_exit icmp_sk_exit(struct net *net)
				1189	{
				1190	int i;
				1191
				1192	for_each_possible_cpu(i)
				1193	inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
				1194	free_percpu(net->ipv4.icmp_sk);
				1195	net->ipv4.icmp_sk = NULL;
				1196	}
				1197
				1198	static int __net_init icmp_sk_init(struct net *net)
				1199	{
				1200	int i, err;
				1201
				1202	net->ipv4.icmp_sk = alloc_percpu(struct sock *);
				1203	if (!net->ipv4.icmp_sk)
				1204	return -ENOMEM;
				1205
				1206	for_each_possible_cpu(i) {
				1207	struct sock *sk;
				1208
				1209	err = inet_ctl_sock_create(&sk, PF_INET,
				1210	SOCK_RAW, IPPROTO_ICMP, net);
				1211	if (err < 0)
				1212	goto fail;
				1213
				1214	*per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
				1215
				1216	/* Enough space for 2 64K ICMP packets, including
				1217	* sk_buff/skb_shared_info struct overhead.
				1218	*/
				1219	sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
				1220
				1221	/*
				1222	* Speedup sock_wfree()
				1223	*/
				1224	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				1225	inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
				1226	}
				1227
				1228	/* Control parameters for ECHO replies. */
				1229	net->ipv4.sysctl_icmp_echo_ignore_all = 0;
				1230	net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
				1231
				1232	/* Control parameter - ignore bogus broadcast responses? */
				1233	net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
				1234
				1235	/*
				1236	* Configurable global rate limit.
				1237	*
				1238	* ratelimit defines tokens/packet consumed for dst->rate_token
				1239	* bucket ratemask defines which icmp types are ratelimited by
				1240	* setting it's bit position.
				1241	*
				1242	* default:
				1243	* dest unreachable (3), source quench (4),
				1244	* time exceeded (11), parameter problem (12)
				1245	*/
				1246
				1247	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
				1248	net->ipv4.sysctl_icmp_ratemask = 0x1818;
				1249	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
				1250
				1251	return 0;
				1252
				1253	fail:
				1254	for_each_possible_cpu(i)
				1255	inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
				1256	free_percpu(net->ipv4.icmp_sk);
				1257	return err;
				1258	}
				1259
				1260	static struct pernet_operations __net_initdata icmp_sk_ops = {
				1261	.init = icmp_sk_init,
				1262	.exit = icmp_sk_exit,
				1263	};
				1264
				1265	int __init icmp_init(void)
				1266	{
				1267	return register_pernet_subsys(&icmp_sk_ops);
				1268	}