Blame - src/kernel/linux/v4.14/net/ipv4/icmp.c - T103

blob: 995ef3d23368987c0fe2e1964081552647f68a04 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* NET3: Implementation of the ICMP protocol layer.
				3	*
				4	* Alan Cox, <alan@lxorguk.ukuu.org.uk>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	* Some of the function names and the icmp unreach table for this
				12	* module were derived from [icmp.c 1.0.11 06/02/93] by
				13	* Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
				14	* Other than that this module is a complete rewrite.
				15	*
				16	* Fixes:
				17	* Clemens Fruhwirth : introduce global icmp rate limiting
				18	* with icmp type masking ability instead
				19	* of broken per type icmp timeouts.
				20	* Mike Shaver : RFC1122 checks.
				21	* Alan Cox : Multicast ping reply as self.
				22	* Alan Cox : Fix atomicity lockup in ip_build_xmit
				23	* call.
				24	* Alan Cox : Added 216,128 byte paths to the MTU
				25	* code.
				26	* Martin Mares : RFC1812 checks.
				27	* Martin Mares : Can be configured to follow redirects
				28	* if acting as a router _without_ a
				29	* routing protocol (RFC 1812).
				30	* Martin Mares : Echo requests may be configured to
				31	* be ignored (RFC 1812).
				32	* Martin Mares : Limitation of ICMP error message
				33	* transmit rate (RFC 1812).
				34	* Martin Mares : TOS and Precedence set correctly
				35	* (RFC 1812).
				36	* Martin Mares : Now copying as much data from the
				37	* original packet as we can without
				38	* exceeding 576 bytes (RFC 1812).
				39	* Willy Konynenberg : Transparent proxying support.
				40	* Keith Owens : RFC1191 correction for 4.2BSD based
				41	* path MTU bug.
				42	* Thomas Quinot : ICMP Dest Unreach codes up to 15 are
				43	* valid (RFC 1812).
				44	* Andi Kleen : Check all packet lengths properly
				45	* and moved all kfree_skb() up to
				46	* icmp_rcv.
				47	* Andi Kleen : Move the rate limit bookkeeping
				48	* into the dest entry and use a token
				49	* bucket filter (thanks to ANK). Make
				50	* the rates sysctl configurable.
				51	* Yu Tianli : Fixed two ugly bugs in icmp_send
				52	* - IP option length was accounted wrongly
				53	* - ICMP header length was not accounted
				54	* at all.
				55	* Tristan Greaves : Added sysctl option to ignore bogus
				56	* broadcast responses from broken routers.
				57	*
				58	* To Fix:
				59	*
				60	* - Should use skb_pull() instead of all the manual checking.
				61	* This would also greatly simplify some upper layer error handlers. --AK
				62	*
				63	*/
				64
				65	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				66
				67	#include <linux/module.h>
				68	#include <linux/types.h>
				69	#include <linux/jiffies.h>
				70	#include <linux/kernel.h>
				71	#include <linux/fcntl.h>
				72	#include <linux/socket.h>
				73	#include <linux/in.h>
				74	#include <linux/inet.h>
				75	#include <linux/inetdevice.h>
				76	#include <linux/netdevice.h>
				77	#include <linux/string.h>
				78	#include <linux/netfilter_ipv4.h>
				79	#include <linux/slab.h>
				80	#include <net/snmp.h>
				81	#include <net/ip.h>
				82	#include <net/route.h>
				83	#include <net/protocol.h>
				84	#include <net/icmp.h>
				85	#include <net/tcp.h>
				86	#include <net/udp.h>
				87	#include <net/raw.h>
				88	#include <net/ping.h>
				89	#include <linux/skbuff.h>
				90	#include <net/sock.h>
				91	#include <linux/errno.h>
				92	#include <linux/timer.h>
				93	#include <linux/init.h>
				94	#include <linux/uaccess.h>
				95	#include <net/checksum.h>
				96	#include <net/xfrm.h>
				97	#include <net/inet_common.h>
				98	#include <net/ip_fib.h>
				99	#include <net/l3mdev.h>
				100
				101	/*
				102	* Build xmit assembly blocks
				103	*/
				104
				105	struct icmp_bxm {
				106	struct sk_buff *skb;
				107	int offset;
				108	int data_len;
				109
				110	struct {
				111	struct icmphdr icmph;
				112	__be32 times[3];
				113	} data;
				114	int head_len;
				115	struct ip_options_data replyopts;
				116	};
				117
				118	/* An array of errno for error messages from dest unreach. */
				119	/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
				120
				121	const struct icmp_err icmp_err_convert[] = {
				122	{
				123	.errno = ENETUNREACH, /* ICMP_NET_UNREACH */
				124	.fatal = 0,
				125	},
				126	{
				127	.errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */
				128	.fatal = 0,
				129	},
				130	{
				131	.errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */,
				132	.fatal = 1,
				133	},
				134	{
				135	.errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */
				136	.fatal = 1,
				137	},
				138	{
				139	.errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */
				140	.fatal = 0,
				141	},
				142	{
				143	.errno = EOPNOTSUPP, /* ICMP_SR_FAILED */
				144	.fatal = 0,
				145	},
				146	{
				147	.errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */
				148	.fatal = 1,
				149	},
				150	{
				151	.errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */
				152	.fatal = 1,
				153	},
				154	{
				155	.errno = ENONET, /* ICMP_HOST_ISOLATED */
				156	.fatal = 1,
				157	},
				158	{
				159	.errno = ENETUNREACH, /* ICMP_NET_ANO */
				160	.fatal = 1,
				161	},
				162	{
				163	.errno = EHOSTUNREACH, /* ICMP_HOST_ANO */
				164	.fatal = 1,
				165	},
				166	{
				167	.errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */
				168	.fatal = 0,
				169	},
				170	{
				171	.errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */
				172	.fatal = 0,
				173	},
				174	{
				175	.errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */
				176	.fatal = 1,
				177	},
				178	{
				179	.errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */
				180	.fatal = 1,
				181	},
				182	{
				183	.errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */
				184	.fatal = 1,
				185	},
				186	};
				187	EXPORT_SYMBOL(icmp_err_convert);
				188
				189	/*
				190	* ICMP control array. This specifies what to do with each ICMP.
				191	*/
				192
				193	struct icmp_control {
				194	bool (handler)(struct sk_buff skb);
				195	short error; /* This ICMP is classed as an error message */
				196	};
				197
				198	static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
				199
				200	/*
				201	* The ICMP socket(s). This is the most convenient way to flow control
				202	* our ICMP output as well as maintain a clean interface throughout
				203	* all layers. All Socketless IP sends will soon be gone.
				204	*
				205	* On SMP we have one ICMP socket per-cpu.
				206	*/
				207	static struct sock icmp_sk(struct net net)
				208	{
				209	return *this_cpu_ptr(net->ipv4.icmp_sk);
				210	}
				211
				212	/* Called with BH disabled */
				213	static inline struct sock icmp_xmit_lock(struct net net)
				214	{
				215	struct sock *sk;
				216
				217	sk = icmp_sk(net);
				218
				219	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
				220	/* This can happen if the output path signals a
				221	* dst_link_failure() for an outgoing ICMP packet.
				222	*/
				223	return NULL;
				224	}
				225	return sk;
				226	}
				227
				228	static inline void icmp_xmit_unlock(struct sock *sk)
				229	{
				230	spin_unlock(&sk->sk_lock.slock);
				231	}
				232
				233	int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
				234	int sysctl_icmp_msgs_burst __read_mostly = 50;
				235
				236	static struct {
				237	spinlock_t lock;
				238	u32 credit;
				239	u32 stamp;
				240	} icmp_global = {
				241	.lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
				242	};
				243
				244	/**
				245	* icmp_global_allow - Are we allowed to send one more ICMP message ?
				246	*
				247	* Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
				248	* Returns false if we reached the limit and can not send another packet.
				249	* Note: called with BH disabled
				250	*/
				251	bool icmp_global_allow(void)
				252	{
				253	u32 credit, delta, incr = 0, now = (u32)jiffies;
				254	bool rc = false;
				255
				256	/* Check if token bucket is empty and cannot be refilled
				257	* without taking the spinlock. The READ_ONCE() are paired
				258	* with the following WRITE_ONCE() in this same function.
				259	*/
				260	if (!READ_ONCE(icmp_global.credit)) {
				261	delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
				262	if (delta < HZ / 50)
				263	return false;
				264	}
				265
				266	spin_lock(&icmp_global.lock);
				267	delta = min_t(u32, now - icmp_global.stamp, HZ);
				268	if (delta >= HZ / 50) {
				269	incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
				270	if (incr)
				271	WRITE_ONCE(icmp_global.stamp, now);
				272	}
				273	credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
				274	if (credit) {
				275	credit--;
				276	rc = true;
				277	}
				278	WRITE_ONCE(icmp_global.credit, credit);
				279	spin_unlock(&icmp_global.lock);
				280	return rc;
				281	}
				282	EXPORT_SYMBOL(icmp_global_allow);
				283
				284	static bool icmpv4_mask_allow(struct net *net, int type, int code)
				285	{
				286	if (type > NR_ICMP_TYPES)
				287	return true;
				288
				289	/* Don't limit PMTU discovery. */
				290	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
				291	return true;
				292
				293	/* Limit if icmp type is enabled in ratemask. */
				294	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
				295	return true;
				296
				297	return false;
				298	}
				299
				300	static bool icmpv4_global_allow(struct net *net, int type, int code)
				301	{
				302	if (icmpv4_mask_allow(net, type, code))
				303	return true;
				304
				305	if (icmp_global_allow())
				306	return true;
				307
				308	return false;
				309	}
				310
				311	/*
				312	* Send an ICMP frame.
				313	*/
				314
				315	static bool icmpv4_xrlim_allow(struct net net, struct rtable rt,
				316	struct flowi4 *fl4, int type, int code)
				317	{
				318	struct dst_entry *dst = &rt->dst;
				319	struct inet_peer *peer;
				320	bool rc = true;
				321	int vif;
				322
				323	if (icmpv4_mask_allow(net, type, code))
				324	goto out;
				325
				326	/* No rate limit on loopback */
				327	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
				328	goto out;
				329
				330	vif = l3mdev_master_ifindex(dst->dev);
				331	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
				332	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
				333	if (peer)
				334	inet_putpeer(peer);
				335	out:
				336	return rc;
				337	}
				338
				339	/*
				340	* Maintain the counters used in the SNMP statistics for outgoing ICMP
				341	*/
				342	void icmp_out_count(struct net *net, unsigned char type)
				343	{
				344	ICMPMSGOUT_INC_STATS(net, type);
				345	ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS);
				346	}
				347
				348	/*
				349	* Checksum each fragment, and on the first include the headers and final
				350	* checksum.
				351	*/
				352	static int icmp_glue_bits(void from, char to, int offset, int len, int odd,
				353	struct sk_buff *skb)
				354	{
				355	struct icmp_bxm icmp_param = (struct icmp_bxm )from;
				356	__wsum csum;
				357
				358	csum = skb_copy_and_csum_bits(icmp_param->skb,
				359	icmp_param->offset + offset,
				360	to, len, 0);
				361
				362	skb->csum = csum_block_add(skb->csum, csum, odd);
				363	if (icmp_pointers[icmp_param->data.icmph.type].error)
				364	nf_ct_attach(skb, icmp_param->skb);
				365	return 0;
				366	}
				367
				368	static void icmp_push_reply(struct icmp_bxm *icmp_param,
				369	struct flowi4 *fl4,
				370	struct ipcm_cookie ipc, struct rtable *rt)
				371	{
				372	struct sock *sk;
				373	struct sk_buff *skb;
				374
				375	sk = icmp_sk(dev_net((*rt)->dst.dev));
				376	if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
				377	icmp_param->data_len+icmp_param->head_len,
				378	icmp_param->head_len,
				379	ipc, rt, MSG_DONTWAIT) < 0) {
				380	__ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
				381	ip_flush_pending_frames(sk);
				382	} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
				383	struct icmphdr *icmph = icmp_hdr(skb);
				384	__wsum csum = 0;
				385	struct sk_buff *skb1;
				386
				387	skb_queue_walk(&sk->sk_write_queue, skb1) {
				388	csum = csum_add(csum, skb1->csum);
				389	}
				390	csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
				391	(char *)icmph,
				392	icmp_param->head_len, csum);
				393	icmph->checksum = csum_fold(csum);
				394	skb->ip_summed = CHECKSUM_NONE;
				395	ip_push_pending_frames(sk, fl4);
				396	}
				397	}
				398
				399	/*
				400	* Driving logic for building and sending ICMP messages.
				401	*/
				402
				403	static void icmp_reply(struct icmp_bxm icmp_param, struct sk_buff skb)
				404	{
				405	struct ipcm_cookie ipc;
				406	struct rtable *rt = skb_rtable(skb);
				407	struct net *net = dev_net(rt->dst.dev);
				408	struct flowi4 fl4;
				409	struct sock *sk;
				410	struct inet_sock *inet;
				411	__be32 daddr, saddr;
				412	u32 mark = IP4_REPLY_MARK(net, skb->mark);
				413	int type = icmp_param->data.icmph.type;
				414	int code = icmp_param->data.icmph.code;
				415
				416	if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
				417	return;
				418
				419	/* Needed by both icmp_global_allow and icmp_xmit_lock */
				420	local_bh_disable();
				421
				422	/* global icmp_msgs_per_sec */
				423	if (!icmpv4_global_allow(net, type, code))
				424	goto out_bh_enable;
				425
				426	sk = icmp_xmit_lock(net);
				427	if (!sk)
				428	goto out_bh_enable;
				429	inet = inet_sk(sk);
				430
				431	icmp_param->data.icmph.checksum = 0;
				432
				433	inet->tos = ip_hdr(skb)->tos;
				434	sk->sk_mark = mark;
				435	daddr = ipc.addr = ip_hdr(skb)->saddr;
				436	saddr = fib_compute_spec_dst(skb);
				437	ipc.opt = NULL;
				438	ipc.tx_flags = 0;
				439	ipc.ttl = 0;
				440	ipc.tos = -1;
				441
				442	if (icmp_param->replyopts.opt.opt.optlen) {
				443	ipc.opt = &icmp_param->replyopts.opt;
				444	if (ipc.opt->opt.srr)
				445	daddr = icmp_param->replyopts.opt.opt.faddr;
				446	}
				447	memset(&fl4, 0, sizeof(fl4));
				448	fl4.daddr = daddr;
				449	fl4.saddr = saddr;
				450	fl4.flowi4_mark = mark;
				451	fl4.flowi4_uid = sock_net_uid(net, NULL);
				452	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
				453	fl4.flowi4_proto = IPPROTO_ICMP;
				454	fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
				455	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
				456	rt = ip_route_output_key(net, &fl4);
				457	if (IS_ERR(rt))
				458	goto out_unlock;
				459	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
				460	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
				461	ip_rt_put(rt);
				462	out_unlock:
				463	icmp_xmit_unlock(sk);
				464	out_bh_enable:
				465	local_bh_enable();
				466	}
				467
				468	static struct rtable icmp_route_lookup(struct net net,
				469	struct flowi4 *fl4,
				470	struct sk_buff *skb_in,
				471	const struct iphdr *iph,
				472	__be32 saddr, u8 tos, u32 mark,
				473	int type, int code,
				474	struct icmp_bxm *param)
				475	{
				476	struct rtable rt, rt2;
				477	struct flowi4 fl4_dec;
				478	int err;
				479
				480	memset(fl4, 0, sizeof(*fl4));
				481	fl4->daddr = (param->replyopts.opt.opt.srr ?
				482	param->replyopts.opt.opt.faddr : iph->saddr);
				483	fl4->saddr = saddr;
				484	fl4->flowi4_mark = mark;
				485	fl4->flowi4_uid = sock_net_uid(net, NULL);
				486	fl4->flowi4_tos = RT_TOS(tos);
				487	fl4->flowi4_proto = IPPROTO_ICMP;
				488	fl4->fl4_icmp_type = type;
				489	fl4->fl4_icmp_code = code;
				490	fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
				491
				492	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
				493	rt = ip_route_output_key_hash(net, fl4, skb_in);
				494	if (IS_ERR(rt))
				495	return rt;
				496
				497	/* No need to clone since we're just using its address. */
				498	rt2 = rt;
				499
				500	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
				501	flowi4_to_flowi(fl4), NULL, 0);
				502	if (!IS_ERR(rt)) {
				503	if (rt != rt2)
				504	return rt;
				505	} else if (PTR_ERR(rt) == -EPERM) {
				506	rt = NULL;
				507	} else
				508	return rt;
				509
				510	err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
				511	if (err)
				512	goto relookup_failed;
				513
				514	if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
				515	fl4_dec.saddr) == RTN_LOCAL) {
				516	rt2 = __ip_route_output_key(net, &fl4_dec);
				517	if (IS_ERR(rt2))
				518	err = PTR_ERR(rt2);
				519	} else {
				520	struct flowi4 fl4_2 = {};
				521	unsigned long orefdst;
				522
				523	fl4_2.daddr = fl4_dec.saddr;
				524	rt2 = ip_route_output_key(net, &fl4_2);
				525	if (IS_ERR(rt2)) {
				526	err = PTR_ERR(rt2);
				527	goto relookup_failed;
				528	}
				529	/* Ugh! */
				530	orefdst = skb_in->_skb_refdst; /* save old refdst */
				531	skb_dst_set(skb_in, NULL);
				532	err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
				533	RT_TOS(tos), rt2->dst.dev);
				534
				535	dst_release(&rt2->dst);
				536	rt2 = skb_rtable(skb_in);
				537	skb_in->_skb_refdst = orefdst; /* restore old refdst */
				538	}
				539
				540	if (err)
				541	goto relookup_failed;
				542
				543	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
				544	flowi4_to_flowi(&fl4_dec), NULL,
				545	XFRM_LOOKUP_ICMP);
				546	if (!IS_ERR(rt2)) {
				547	dst_release(&rt->dst);
				548	memcpy(fl4, &fl4_dec, sizeof(*fl4));
				549	rt = rt2;
				550	} else if (PTR_ERR(rt2) == -EPERM) {
				551	if (rt)
				552	dst_release(&rt->dst);
				553	return rt2;
				554	} else {
				555	err = PTR_ERR(rt2);
				556	goto relookup_failed;
				557	}
				558	return rt;
				559
				560	relookup_failed:
				561	if (rt)
				562	return rt;
				563	return ERR_PTR(err);
				564	}
				565
				566	/*
				567	* Send an ICMP message in response to a situation
				568	*
				569	* RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header.
				570	* MAY send more (we do).
				571	* MUST NOT change this header information.
				572	* MUST NOT reply to a multicast/broadcast IP address.
				573	* MUST NOT reply to a multicast/broadcast MAC address.
				574	* MUST reply to only the first fragment.
				575	*/
				576
				577	void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
				578	const struct ip_options *opt)
				579	{
				580	struct iphdr *iph;
				581	int room;
				582	struct icmp_bxm icmp_param;
				583	struct rtable *rt = skb_rtable(skb_in);
				584	struct ipcm_cookie ipc;
				585	struct flowi4 fl4;
				586	__be32 saddr;
				587	u8 tos;
				588	u32 mark;
				589	struct net *net;
				590	struct sock *sk;
				591
				592	if (!rt)
				593	goto out;
				594	net = dev_net(rt->dst.dev);
				595
				596	/*
				597	* Find the original header. It is expected to be valid, of course.
				598	* Check this, icmp_send is called from the most obscure devices
				599	* sometimes.
				600	*/
				601	iph = ip_hdr(skb_in);
				602
				603	if ((u8 *)iph < skb_in->head \|\|
				604	(skb_network_header(skb_in) + sizeof(*iph)) >
				605	skb_tail_pointer(skb_in))
				606	goto out;
				607
				608	/*
				609	* No replies to physical multicast/broadcast
				610	*/
				611	if (skb_in->pkt_type != PACKET_HOST)
				612	goto out;
				613
				614	/*
				615	* Now check at the protocol level
				616	*/
				617	if (rt->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST))
				618	goto out;
				619
				620	/*
				621	* Only reply to fragment 0. We byte re-order the constant
				622	* mask for efficiency.
				623	*/
				624	if (iph->frag_off & htons(IP_OFFSET))
				625	goto out;
				626
				627	/*
				628	* If we send an ICMP error to an ICMP error a mess would result..
				629	*/
				630	if (icmp_pointers[type].error) {
				631	/*
				632	* We are an error, check if we are replying to an
				633	* ICMP error
				634	*/
				635	if (iph->protocol == IPPROTO_ICMP) {
				636	u8 _inner_type, *itp;
				637
				638	itp = skb_header_pointer(skb_in,
				639	skb_network_header(skb_in) +
				640	(iph->ihl << 2) +
				641	offsetof(struct icmphdr,
				642	type) -
				643	skb_in->data,
				644	sizeof(_inner_type),
				645	&_inner_type);
				646	if (!itp)
				647	goto out;
				648
				649	/*
				650	* Assume any unknown ICMP type is an error. This
				651	* isn't specified by the RFC, but think about it..
				652	*/
				653	if (*itp > NR_ICMP_TYPES \|\|
				654	icmp_pointers[*itp].error)
				655	goto out;
				656	}
				657	}
				658
				659	/* Needed by both icmp_global_allow and icmp_xmit_lock */
				660	local_bh_disable();
				661
				662	/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
				663	* incoming dev is loopback. If outgoing dev change to not be
				664	* loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
				665	*/
				666	if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
				667	!icmpv4_global_allow(net, type, code))
				668	goto out_bh_enable;
				669
				670	sk = icmp_xmit_lock(net);
				671	if (!sk)
				672	goto out_bh_enable;
				673
				674	/*
				675	* Construct source address and options.
				676	*/
				677
				678	saddr = iph->daddr;
				679	if (!(rt->rt_flags & RTCF_LOCAL)) {
				680	struct net_device *dev = NULL;
				681
				682	rcu_read_lock();
				683	if (rt_is_input_route(rt) &&
				684	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
				685	dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
				686
				687	if (dev)
				688	saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
				689	else
				690	saddr = 0;
				691	rcu_read_unlock();
				692	}
				693
				694	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) \|
				695	IPTOS_PREC_INTERNETCONTROL) :
				696	iph->tos;
				697	mark = IP4_REPLY_MARK(net, skb_in->mark);
				698
				699	if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
				700	goto out_unlock;
				701
				702
				703	/*
				704	* Prepare data for ICMP header.
				705	*/
				706
				707	icmp_param.data.icmph.type = type;
				708	icmp_param.data.icmph.code = code;
				709	icmp_param.data.icmph.un.gateway = info;
				710	icmp_param.data.icmph.checksum = 0;
				711	icmp_param.skb = skb_in;
				712	icmp_param.offset = skb_network_offset(skb_in);
				713	inet_sk(sk)->tos = tos;
				714	sk->sk_mark = mark;
				715	ipc.addr = iph->saddr;
				716	ipc.opt = &icmp_param.replyopts.opt;
				717	ipc.tx_flags = 0;
				718	ipc.ttl = 0;
				719	ipc.tos = -1;
				720
				721	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
				722	type, code, &icmp_param);
				723	if (IS_ERR(rt))
				724	goto out_unlock;
				725
				726	/* peer icmp_ratelimit */
				727	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
				728	goto ende;
				729
				730	/* RFC says return as much as we can without exceeding 576 bytes. */
				731
				732	room = dst_mtu(&rt->dst);
				733	if (room > 576)
				734	room = 576;
				735	room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
				736	room -= sizeof(struct icmphdr);
				737
				738	icmp_param.data_len = skb_in->len - icmp_param.offset;
				739	if (icmp_param.data_len > room)
				740	icmp_param.data_len = room;
				741	icmp_param.head_len = sizeof(struct icmphdr);
				742
				743	icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
				744	ende:
				745	ip_rt_put(rt);
				746	out_unlock:
				747	icmp_xmit_unlock(sk);
				748	out_bh_enable:
				749	local_bh_enable();
				750	out:;
				751	}
				752	EXPORT_SYMBOL(__icmp_send);
				753
				754
				755	static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
				756	{
				757	const struct iphdr iph = (const struct iphdr ) skb->data;
				758	const struct net_protocol *ipprot;
				759	int protocol = iph->protocol;
				760
				761	/* Checkin full IP header plus 8 bytes of protocol to
				762	* avoid additional coding at protocol handlers.
				763	*/
				764	if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
				765	__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
				766	return;
				767	}
				768
				769	raw_icmp_error(skb, protocol, info);
				770
				771	ipprot = rcu_dereference(inet_protos[protocol]);
				772	if (ipprot && ipprot->err_handler)
				773	ipprot->err_handler(skb, info);
				774	}
				775
				776	static bool icmp_tag_validation(int proto)
				777	{
				778	bool ok;
				779
				780	rcu_read_lock();
				781	ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation;
				782	rcu_read_unlock();
				783	return ok;
				784	}
				785
				786	/*
				787	* Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and
				788	* ICMP_PARAMETERPROB.
				789	*/
				790
				791	static bool icmp_unreach(struct sk_buff *skb)
				792	{
				793	const struct iphdr *iph;
				794	struct icmphdr *icmph;
				795	struct net *net;
				796	u32 info = 0;
				797
				798	net = dev_net(skb_dst(skb)->dev);
				799
				800	/*
				801	* Incomplete header ?
				802	* Only checks for the IP header, there should be an
				803	* additional check for longer headers in upper levels.
				804	*/
				805
				806	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
				807	goto out_err;
				808
				809	icmph = icmp_hdr(skb);
				810	iph = (const struct iphdr *)skb->data;
				811
				812	if (iph->ihl < 5) /* Mangled header, drop. */
				813	goto out_err;
				814
				815	switch (icmph->type) {
				816	case ICMP_DEST_UNREACH:
				817	switch (icmph->code & 15) {
				818	case ICMP_NET_UNREACH:
				819	case ICMP_HOST_UNREACH:
				820	case ICMP_PROT_UNREACH:
				821	case ICMP_PORT_UNREACH:
				822	break;
				823	case ICMP_FRAG_NEEDED:
				824	/* for documentation of the ip_no_pmtu_disc
				825	* values please see
				826	* Documentation/networking/ip-sysctl.txt
				827	*/
				828	switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
				829	default:
				830	net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
				831	&iph->daddr);
				832	break;
				833	case 2:
				834	goto out;
				835	case 3:
				836	if (!icmp_tag_validation(iph->protocol))
				837	goto out;
				838	/* fall through */
				839	case 0:
				840	info = ntohs(icmph->un.frag.mtu);
				841	}
				842	break;
				843	case ICMP_SR_FAILED:
				844	net_dbg_ratelimited("%pI4: Source Route Failed\n",
				845	&iph->daddr);
				846	break;
				847	default:
				848	break;
				849	}
				850	if (icmph->code > NR_ICMP_UNREACH)
				851	goto out;
				852	break;
				853	case ICMP_PARAMETERPROB:
				854	info = ntohl(icmph->un.gateway) >> 24;
				855	break;
				856	case ICMP_TIME_EXCEEDED:
				857	__ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS);
				858	if (icmph->code == ICMP_EXC_FRAGTIME)
				859	goto out;
				860	break;
				861	}
				862
				863	/*
				864	* Throw it at our lower layers
				865	*
				866	* RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
				867	* header.
				868	* RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
				869	* transport layer.
				870	* RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
				871	* transport layer.
				872	*/
				873
				874	/*
				875	* Check the other end isn't violating RFC 1122. Some routers send
				876	* bogus responses to broadcast frames. If you see this message
				877	* first check your netmask matches at both ends, if it does then
				878	* get the other vendor to fix their kit.
				879	*/
				880
				881	if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
				882	inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
				883	net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
				884	&ip_hdr(skb)->saddr,
				885	icmph->type, icmph->code,
				886	&iph->daddr, skb->dev->name);
				887	goto out;
				888	}
				889
				890	icmp_socket_deliver(skb, info);
				891
				892	out:
				893	return true;
				894	out_err:
				895	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
				896	return false;
				897	}
				898
				899
				900	/*
				901	* Handle ICMP_REDIRECT.
				902	*/
				903
				904	static bool icmp_redirect(struct sk_buff *skb)
				905	{
				906	if (skb->len < sizeof(struct iphdr)) {
				907	__ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
				908	return false;
				909	}
				910
				911	if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
				912	/* there aught to be a stat */
				913	return false;
				914	}
				915
				916	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
				917	return true;
				918	}
				919
				920	/*
				921	* Handle ICMP_ECHO ("ping") requests.
				922	*
				923	* RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
				924	* requests.
				925	* RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
				926	* included in the reply.
				927	* RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
				928	* echo requests, MUST have default=NOT.
				929	* See also WRT handling of options once they are done and working.
				930	*/
				931
				932	static bool icmp_echo(struct sk_buff *skb)
				933	{
				934	struct net *net;
				935
				936	net = dev_net(skb_dst(skb)->dev);
				937	if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
				938	struct icmp_bxm icmp_param;
				939
				940	icmp_param.data.icmph = *icmp_hdr(skb);
				941	icmp_param.data.icmph.type = ICMP_ECHOREPLY;
				942	icmp_param.skb = skb;
				943	icmp_param.offset = 0;
				944	icmp_param.data_len = skb->len;
				945	icmp_param.head_len = sizeof(struct icmphdr);
				946	icmp_reply(&icmp_param, skb);
				947	}
				948	/* should there be an ICMP stat for ignored echos? */
				949	return true;
				950	}
				951
				952	/*
				953	* Handle ICMP Timestamp requests.
				954	* RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
				955	* SHOULD be in the kernel for minimum random latency.
				956	* MUST be accurate to a few minutes.
				957	* MUST be updated at least at 15Hz.
				958	*/
				959	static bool icmp_timestamp(struct sk_buff *skb)
				960	{
				961	struct icmp_bxm icmp_param;
				962	/*
				963	* Too short.
				964	*/
				965	if (skb->len < 4)
				966	goto out_err;
				967
				968	/*
				969	* Fill in the current time as ms since midnight UT:
				970	*/
				971	icmp_param.data.times[1] = inet_current_timestamp();
				972	icmp_param.data.times[2] = icmp_param.data.times[1];
				973	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
				974	BUG();
				975	icmp_param.data.icmph = *icmp_hdr(skb);
				976	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
				977	icmp_param.data.icmph.code = 0;
				978	icmp_param.skb = skb;
				979	icmp_param.offset = 0;
				980	icmp_param.data_len = 0;
				981	icmp_param.head_len = sizeof(struct icmphdr) + 12;
				982	icmp_reply(&icmp_param, skb);
				983	return true;
				984
				985	out_err:
				986	__ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
				987	return false;
				988	}
				989
				990	static bool icmp_discard(struct sk_buff *skb)
				991	{
				992	/* pretend it was a success */
				993	return true;
				994	}
				995
				996	/*
				997	* Deal with incoming ICMP packets.
				998	*/
				999	int icmp_rcv(struct sk_buff *skb)
				1000	{
				1001	struct icmphdr *icmph;
				1002	struct rtable *rt = skb_rtable(skb);
				1003	struct net *net = dev_net(rt->dst.dev);
				1004	bool success;
				1005
				1006	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				1007	struct sec_path *sp = skb_sec_path(skb);
				1008	int nh;
				1009
				1010	if (!(sp && sp->xvec[sp->len - 1]->props.flags &
				1011	XFRM_STATE_ICMP))
				1012	goto drop;
				1013
				1014	if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
				1015	goto drop;
				1016
				1017	nh = skb_network_offset(skb);
				1018	skb_set_network_header(skb, sizeof(*icmph));
				1019
				1020	if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
				1021	goto drop;
				1022
				1023	skb_set_network_header(skb, nh);
				1024	}
				1025
				1026	__ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
				1027
				1028	if (skb_checksum_simple_validate(skb))
				1029	goto csum_error;
				1030
				1031	if (!pskb_pull(skb, sizeof(*icmph)))
				1032	goto error;
				1033
				1034	icmph = icmp_hdr(skb);
				1035
				1036	ICMPMSGIN_INC_STATS(net, icmph->type);
				1037	/*
				1038	* 18 is the highest 'known' ICMP type. Anything else is a mystery
				1039	*
				1040	* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
				1041	* discarded.
				1042	*/
				1043	if (icmph->type > NR_ICMP_TYPES)
				1044	goto error;
				1045
				1046
				1047	/*
				1048	* Parse the ICMP message
				1049	*/
				1050
				1051	if (rt->rt_flags & (RTCF_BROADCAST \| RTCF_MULTICAST)) {
				1052	/*
				1053	* RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
				1054	* silently ignored (we let user decide with a sysctl).
				1055	* RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
				1056	* discarded if to broadcast/multicast.
				1057	*/
				1058	if ((icmph->type == ICMP_ECHO \|\|
				1059	icmph->type == ICMP_TIMESTAMP) &&
				1060	net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
				1061	goto error;
				1062	}
				1063	if (icmph->type != ICMP_ECHO &&
				1064	icmph->type != ICMP_TIMESTAMP &&
				1065	icmph->type != ICMP_ADDRESS &&
				1066	icmph->type != ICMP_ADDRESSREPLY) {
				1067	goto error;
				1068	}
				1069	}
				1070
				1071	success = icmp_pointers[icmph->type].handler(skb);
				1072
				1073	if (success) {
				1074	consume_skb(skb);
				1075	return NET_RX_SUCCESS;
				1076	}
				1077
				1078	drop:
				1079	kfree_skb(skb);
				1080	return NET_RX_DROP;
				1081	csum_error:
				1082	__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
				1083	error:
				1084	__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
				1085	goto drop;
				1086	}
				1087
				1088	void icmp_err(struct sk_buff *skb, u32 info)
				1089	{
				1090	struct iphdr iph = (struct iphdr )skb->data;
				1091	int offset = iph->ihl<<2;
				1092	struct icmphdr icmph = (struct icmphdr )(skb->data + offset);
				1093	int type = icmp_hdr(skb)->type;
				1094	int code = icmp_hdr(skb)->code;
				1095	struct net *net = dev_net(skb->dev);
				1096
				1097	/*
				1098	* Use ping_err to handle all icmp errors except those
				1099	* triggered by ICMP_ECHOREPLY which sent from kernel.
				1100	*/
				1101	if (icmph->type != ICMP_ECHOREPLY) {
				1102	ping_err(skb, offset, info);
				1103	return;
				1104	}
				1105
				1106	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
				1107	ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
				1108	else if (type == ICMP_REDIRECT)
				1109	ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
				1110	}
				1111
				1112	/*
				1113	* This table is the definition of how we handle ICMP.
				1114	*/
				1115	static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
				1116	[ICMP_ECHOREPLY] = {
				1117	.handler = ping_rcv,
				1118	},
				1119	[1] = {
				1120	.handler = icmp_discard,
				1121	.error = 1,
				1122	},
				1123	[2] = {
				1124	.handler = icmp_discard,
				1125	.error = 1,
				1126	},
				1127	[ICMP_DEST_UNREACH] = {
				1128	.handler = icmp_unreach,
				1129	.error = 1,
				1130	},
				1131	[ICMP_SOURCE_QUENCH] = {
				1132	.handler = icmp_unreach,
				1133	.error = 1,
				1134	},
				1135	[ICMP_REDIRECT] = {
				1136	.handler = icmp_redirect,
				1137	.error = 1,
				1138	},
				1139	[6] = {
				1140	.handler = icmp_discard,
				1141	.error = 1,
				1142	},
				1143	[7] = {
				1144	.handler = icmp_discard,
				1145	.error = 1,
				1146	},
				1147	[ICMP_ECHO] = {
				1148	.handler = icmp_echo,
				1149	},
				1150	[9] = {
				1151	.handler = icmp_discard,
				1152	.error = 1,
				1153	},
				1154	[10] = {
				1155	.handler = icmp_discard,
				1156	.error = 1,
				1157	},
				1158	[ICMP_TIME_EXCEEDED] = {
				1159	.handler = icmp_unreach,
				1160	.error = 1,
				1161	},
				1162	[ICMP_PARAMETERPROB] = {
				1163	.handler = icmp_unreach,
				1164	.error = 1,
				1165	},
				1166	[ICMP_TIMESTAMP] = {
				1167	.handler = icmp_timestamp,
				1168	},
				1169	[ICMP_TIMESTAMPREPLY] = {
				1170	.handler = icmp_discard,
				1171	},
				1172	[ICMP_INFO_REQUEST] = {
				1173	.handler = icmp_discard,
				1174	},
				1175	[ICMP_INFO_REPLY] = {
				1176	.handler = icmp_discard,
				1177	},
				1178	[ICMP_ADDRESS] = {
				1179	.handler = icmp_discard,
				1180	},
				1181	[ICMP_ADDRESSREPLY] = {
				1182	.handler = icmp_discard,
				1183	},
				1184	};
				1185
				1186	static void __net_exit icmp_sk_exit(struct net *net)
				1187	{
				1188	int i;
				1189
				1190	for_each_possible_cpu(i)
				1191	inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
				1192	free_percpu(net->ipv4.icmp_sk);
				1193	net->ipv4.icmp_sk = NULL;
				1194	}
				1195
				1196	static int __net_init icmp_sk_init(struct net *net)
				1197	{
				1198	int i, err;
				1199
				1200	net->ipv4.icmp_sk = alloc_percpu(struct sock *);
				1201	if (!net->ipv4.icmp_sk)
				1202	return -ENOMEM;
				1203
				1204	for_each_possible_cpu(i) {
				1205	struct sock *sk;
				1206
				1207	err = inet_ctl_sock_create(&sk, PF_INET,
				1208	SOCK_RAW, IPPROTO_ICMP, net);
				1209	if (err < 0)
				1210	goto fail;
				1211
				1212	*per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
				1213
				1214	/* Enough space for 2 64K ICMP packets, including
				1215	* sk_buff/skb_shared_info struct overhead.
				1216	*/
				1217	sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
				1218
				1219	/*
				1220	* Speedup sock_wfree()
				1221	*/
				1222	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
				1223	inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
				1224	}
				1225
				1226	/* Control parameters for ECHO replies. */
				1227	net->ipv4.sysctl_icmp_echo_ignore_all = 0;
				1228	net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
				1229
				1230	/* Control parameter - ignore bogus broadcast responses? */
				1231	net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
				1232
				1233	/*
				1234	* Configurable global rate limit.
				1235	*
				1236	* ratelimit defines tokens/packet consumed for dst->rate_token
				1237	* bucket ratemask defines which icmp types are ratelimited by
				1238	* setting it's bit position.
				1239	*
				1240	* default:
				1241	* dest unreachable (3), source quench (4),
				1242	* time exceeded (11), parameter problem (12)
				1243	*/
				1244
				1245	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
				1246	net->ipv4.sysctl_icmp_ratemask = 0x1818;
				1247	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
				1248
				1249	return 0;
				1250
				1251	fail:
				1252	for_each_possible_cpu(i)
				1253	inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
				1254	free_percpu(net->ipv4.icmp_sk);
				1255	return err;
				1256	}
				1257
				1258	static struct pernet_operations __net_initdata icmp_sk_ops = {
				1259	.init = icmp_sk_init,
				1260	.exit = icmp_sk_exit,
				1261	};
				1262
				1263	int __init icmp_init(void)
				1264	{
				1265	return register_pernet_subsys(&icmp_sk_ops);
				1266	}