Blame - src/kernel/linux/v4.14/net/ipv4/ip_output.c - T103

blob: 819d51101cbd91dc8fae22ef7404a47766c894e0 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* INET An implementation of the TCP/IP protocol suite for the LINUX
				3	* operating system. INET is implemented using the BSD Socket
				4	* interface as the means of communication with the user level.
				5	*
				6	* The Internet Protocol (IP) output module.
				7	*
				8	* Authors: Ross Biro
				9	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				10	* Donald Becker, <becker@super.org>
				11	* Alan Cox, <Alan.Cox@linux.org>
				12	* Richard Underwood
				13	* Stefan Becker, <stefanb@yello.ping.de>
				14	* Jorge Cwik, <jorge@laser.satlink.net>
				15	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
				16	* Hirokazu Takahashi, <taka@valinux.co.jp>
				17	*
				18	* See ip_input.c for original log
				19	*
				20	* Fixes:
				21	* Alan Cox : Missing nonblock feature in ip_build_xmit.
				22	* Mike Kilburn : htons() missing in ip_build_xmit.
				23	* Bradford Johnson: Fix faulty handling of some frames when
				24	* no route is found.
				25	* Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
				26	* (in case if packet not accepted by
				27	* output firewall rules)
				28	* Mike McLagan : Routing by source
				29	* Alexey Kuznetsov: use new route cache
				30	* Andi Kleen: Fix broken PMTU recovery and remove
				31	* some redundant tests.
				32	* Vitaly E. Lavrov : Transparent proxy revived after year coma.
				33	* Andi Kleen : Replace ip_reply with ip_send_reply.
				34	* Andi Kleen : Split fast and slow ip_build_xmit path
				35	* for decreased register pressure on x86
				36	* and more readability.
				37	* Marc Boucher : When call_out_firewall returns FW_QUEUE,
				38	* silently drop skb instead of failing with -EPERM.
				39	* Detlev Wengorz : Copy protocol for fragments.
				40	* Hirokazu Takahashi: HW checksumming for outgoing UDP
				41	* datagrams.
				42	* Hirokazu Takahashi: sendfile() on UDP works now.
				43	*/
				44
				45	#include <linux/uaccess.h>
				46	#include <linux/module.h>
				47	#include <linux/types.h>
				48	#include <linux/kernel.h>
				49	#include <linux/mm.h>
				50	#include <linux/string.h>
				51	#include <linux/errno.h>
				52	#include <linux/highmem.h>
				53	#include <linux/slab.h>
				54
				55	#include <linux/socket.h>
				56	#include <linux/sockios.h>
				57	#include <linux/in.h>
				58	#include <linux/inet.h>
				59	#include <linux/netdevice.h>
				60	#include <linux/etherdevice.h>
				61	#include <linux/proc_fs.h>
				62	#include <linux/stat.h>
				63	#include <linux/init.h>
				64
				65	#include <net/snmp.h>
				66	#include <net/ip.h>
				67	#include <net/protocol.h>
				68	#include <net/route.h>
				69	#include <net/xfrm.h>
				70	#include <linux/skbuff.h>
				71	#include <net/sock.h>
				72	#include <net/arp.h>
				73	#include <net/icmp.h>
				74	#include <net/checksum.h>
				75	#include <net/inetpeer.h>
				76	#include <net/inet_ecn.h>
				77	#include <net/lwtunnel.h>
				78	#include <linux/bpf-cgroup.h>
				79	#include <linux/igmp.h>
				80	#include <linux/netfilter_ipv4.h>
				81	#include <linux/netfilter_bridge.h>
				82	#include <linux/netlink.h>
				83	#include <linux/tcp.h>
				84
				85	static int
				86	ip_fragment(struct net net, struct sock sk, struct sk_buff *skb,
				87	unsigned int mtu,
				88	int (output)(struct net , struct sock , struct sk_buff ));
				89
				90	/* Generate a checksum for an outgoing IP datagram. */
				91	void ip_send_check(struct iphdr *iph)
				92	{
				93	iph->check = 0;
				94	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
				95	}
				96	EXPORT_SYMBOL(ip_send_check);
				97
				98	int __ip_local_out(struct net net, struct sock sk, struct sk_buff *skb)
				99	{
				100	struct iphdr *iph = ip_hdr(skb);
				101
				102	iph->tot_len = htons(skb->len);
				103	ip_send_check(iph);
				104
				105	/* if egress device is enslaved to an L3 master device pass the
				106	* skb to its handler for processing
				107	*/
				108	skb = l3mdev_ip_out(sk, skb);
				109	if (unlikely(!skb))
				110	return 0;
				111
				112	skb->protocol = htons(ETH_P_IP);
				113
				114	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
				115	net, sk, skb, NULL, skb_dst(skb)->dev,
				116	dst_output);
				117	}
				118
				119	int ip_local_out(struct net net, struct sock sk, struct sk_buff *skb)
				120	{
				121	int err;
				122
				123	err = __ip_local_out(net, sk, skb);
				124	if (likely(err == 1))
				125	err = dst_output(net, sk, skb);
				126
				127	return err;
				128	}
				129	EXPORT_SYMBOL_GPL(ip_local_out);
				130
				131	static inline int ip_select_ttl(struct inet_sock inet, struct dst_entry dst)
				132	{
				133	int ttl = inet->uc_ttl;
				134
				135	if (ttl < 0)
				136	ttl = ip4_dst_hoplimit(dst);
				137	return ttl;
				138	}
				139
				140	/*
				141	* Add an ip header to a skbuff and send it out.
				142	*
				143	*/
				144	int ip_build_and_send_pkt(struct sk_buff skb, const struct sock sk,
				145	__be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
				146	{
				147	struct inet_sock *inet = inet_sk(sk);
				148	struct rtable *rt = skb_rtable(skb);
				149	struct net *net = sock_net(sk);
				150	struct iphdr *iph;
				151
				152	/* Build the IP header. */
				153	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
				154	skb_reset_network_header(skb);
				155	iph = ip_hdr(skb);
				156	iph->version = 4;
				157	iph->ihl = 5;
				158	iph->tos = inet->tos;
				159	iph->ttl = ip_select_ttl(inet, &rt->dst);
				160	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
				161	iph->saddr = saddr;
				162	iph->protocol = sk->sk_protocol;
				163	if (ip_dont_fragment(sk, &rt->dst)) {
				164	iph->frag_off = htons(IP_DF);
				165	iph->id = 0;
				166	} else {
				167	iph->frag_off = 0;
				168	__ip_select_ident(net, iph, 1);
				169	}
				170
				171	if (opt && opt->opt.optlen) {
				172	iph->ihl += opt->opt.optlen>>2;
				173	ip_options_build(skb, &opt->opt, daddr, rt, 0);
				174	}
				175
				176	skb->priority = sk->sk_priority;
				177	if (!skb->mark)
				178	skb->mark = sk->sk_mark;
				179
				180	/* Send it out. */
				181	return ip_local_out(net, skb->sk, skb);
				182	}
				183	EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
				184
				185	static int ip_finish_output2(struct net net, struct sock sk, struct sk_buff *skb)
				186	{
				187	struct dst_entry *dst = skb_dst(skb);
				188	struct rtable rt = (struct rtable )dst;
				189	struct net_device *dev = dst->dev;
				190	unsigned int hh_len = LL_RESERVED_SPACE(dev);
				191	struct neighbour *neigh;
				192	u32 nexthop;
				193
				194	if (rt->rt_type == RTN_MULTICAST) {
				195	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
				196	} else if (rt->rt_type == RTN_BROADCAST)
				197	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
				198
				199	/* Be paranoid, rather than too clever. */
				200	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
				201	struct sk_buff *skb2;
				202
				203	skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
				204	if (!skb2) {
				205	kfree_skb(skb);
				206	return -ENOMEM;
				207	}
				208	if (skb->sk)
				209	skb_set_owner_w(skb2, skb->sk);
				210	consume_skb(skb);
				211	skb = skb2;
				212	}
				213
				214	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
				215	int res = lwtunnel_xmit(skb);
				216
				217	if (res < 0 \|\| res == LWTUNNEL_XMIT_DONE)
				218	return res;
				219	}
				220
				221	rcu_read_lock_bh();
				222	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
				223	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
				224	if (unlikely(!neigh))
				225	neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
				226	if (!IS_ERR(neigh)) {
				227	int res;
				228
				229	sock_confirm_neigh(skb, neigh);
				230	res = neigh_output(neigh, skb);
				231
				232	rcu_read_unlock_bh();
				233	return res;
				234	}
				235	rcu_read_unlock_bh();
				236
				237	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
				238	__func__);
				239	kfree_skb(skb);
				240	return -EINVAL;
				241	}
				242
				243	static int ip_finish_output_gso(struct net net, struct sock sk,
				244	struct sk_buff *skb, unsigned int mtu)
				245	{
				246	netdev_features_t features;
				247	struct sk_buff *segs;
				248	int ret = 0;
				249
				250	/* common case: seglen is <= mtu
				251	*/
				252	if (skb_gso_validate_mtu(skb, mtu))
				253	return ip_finish_output2(net, sk, skb);
				254
				255	/* Slowpath - GSO segment length exceeds the egress MTU.
				256	*
				257	* This can happen in several cases:
				258	* - Forwarding of a TCP GRO skb, when DF flag is not set.
				259	* - Forwarding of an skb that arrived on a virtualization interface
				260	* (virtio-net/vhost/tap) with TSO/GSO size set by other network
				261	* stack.
				262	* - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
				263	* interface with a smaller MTU.
				264	* - Arriving GRO skb (or GSO skb in a virtualized environment) that is
				265	* bridged to a NETIF_F_TSO tunnel stacked over an interface with an
				266	* insufficent MTU.
				267	*/
				268	features = netif_skb_features(skb);
				269	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
				270	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
				271	if (IS_ERR_OR_NULL(segs)) {
				272	kfree_skb(skb);
				273	return -ENOMEM;
				274	}
				275
				276	consume_skb(skb);
				277
				278	do {
				279	struct sk_buff *nskb = segs->next;
				280	int err;
				281
				282	segs->next = NULL;
				283	err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
				284
				285	if (err && ret == 0)
				286	ret = err;
				287	segs = nskb;
				288	} while (segs);
				289
				290	return ret;
				291	}
				292
				293	static int ip_finish_output(struct net net, struct sock sk, struct sk_buff *skb)
				294	{
				295	unsigned int mtu;
				296	int ret;
				297
				298	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
				299	if (ret) {
				300	kfree_skb(skb);
				301	return ret;
				302	}
				303
				304	#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
				305	/* Policy lookup after SNAT yielded a new policy */
				306	if (skb_dst(skb)->xfrm) {
				307	IPCB(skb)->flags \|= IPSKB_REROUTED;
				308	return dst_output(net, sk, skb);
				309	}
				310	#endif
				311	mtu = ip_skb_dst_mtu(sk, skb);
				312	if (skb_is_gso(skb))
				313	return ip_finish_output_gso(net, sk, skb, mtu);
				314
				315	if (skb->len > mtu \|\| (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
				316	return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
				317
				318	return ip_finish_output2(net, sk, skb);
				319	}
				320
				321	static int ip_mc_finish_output(struct net net, struct sock sk,
				322	struct sk_buff *skb)
				323	{
				324	int ret;
				325
				326	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
				327	if (ret) {
				328	kfree_skb(skb);
				329	return ret;
				330	}
				331
				332	return dev_loopback_xmit(net, sk, skb);
				333	}
				334
				335	int ip_mc_output(struct net net, struct sock sk, struct sk_buff *skb)
				336	{
				337	struct rtable *rt = skb_rtable(skb);
				338	struct net_device *dev = rt->dst.dev;
				339
				340	/*
				341	* If the indicated interface is up and running, send the packet.
				342	*/
				343	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
				344
				345	skb->dev = dev;
				346	skb->protocol = htons(ETH_P_IP);
				347
				348	/*
				349	* Multicasts are looped back for other local users
				350	*/
				351
				352	if (rt->rt_flags&RTCF_MULTICAST) {
				353	if (sk_mc_loop(sk)
				354	#ifdef CONFIG_IP_MROUTE
				355	/* Small optimization: do not loopback not local frames,
				356	which returned after forwarding; they will be dropped
				357	by ip_mr_input in any case.
				358	Note, that local frames are looped back to be delivered
				359	to local recipients.
				360
				361	This check is duplicated in ip_mr_input at the moment.
				362	*/
				363	&&
				364	((rt->rt_flags & RTCF_LOCAL) \|\|
				365	!(IPCB(skb)->flags & IPSKB_FORWARDED))
				366	#endif
				367	) {
				368	struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
				369	if (newskb)
				370	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				371	net, sk, newskb, NULL, newskb->dev,
				372	ip_mc_finish_output);
				373	}
				374
				375	/* Multicasts with ttl 0 must not go beyond the host */
				376
				377	if (ip_hdr(skb)->ttl == 0) {
				378	kfree_skb(skb);
				379	return 0;
				380	}
				381	}
				382
				383	if (rt->rt_flags&RTCF_BROADCAST) {
				384	struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
				385	if (newskb)
				386	NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				387	net, sk, newskb, NULL, newskb->dev,
				388	ip_mc_finish_output);
				389	}
				390
				391	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				392	net, sk, skb, NULL, skb->dev,
				393	ip_finish_output,
				394	!(IPCB(skb)->flags & IPSKB_REROUTED));
				395	}
				396
				397	int ip_output(struct net net, struct sock sk, struct sk_buff *skb)
				398	{
				399	struct net_device *dev = skb_dst(skb)->dev;
				400
				401	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
				402
				403	skb->dev = dev;
				404	skb->protocol = htons(ETH_P_IP);
				405
				406	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				407	net, sk, skb, NULL, dev,
				408	ip_finish_output,
				409	!(IPCB(skb)->flags & IPSKB_REROUTED));
				410	}
				411
				412	/*
				413	* copy saddr and daddr, possibly using 64bit load/stores
				414	* Equivalent to :
				415	* iph->saddr = fl4->saddr;
				416	* iph->daddr = fl4->daddr;
				417	*/
				418	static void ip_copy_addrs(struct iphdr iph, const struct flowi4 fl4)
				419	{
				420	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
				421	offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
				422	memcpy(&iph->saddr, &fl4->saddr,
				423	sizeof(fl4->saddr) + sizeof(fl4->daddr));
				424	}
				425
				426	/* Note: skb->sk can be different from sk, in case of tunnels */
				427	int ip_queue_xmit(struct sock sk, struct sk_buff skb, struct flowi *fl)
				428	{
				429	struct inet_sock *inet = inet_sk(sk);
				430	struct net *net = sock_net(sk);
				431	struct ip_options_rcu *inet_opt;
				432	struct flowi4 *fl4;
				433	struct rtable *rt;
				434	struct iphdr *iph;
				435	int res;
				436
				437	/* Skip all of this if the packet is already routed,
				438	* f.e. by something like SCTP.
				439	*/
				440	rcu_read_lock();
				441	inet_opt = rcu_dereference(inet->inet_opt);
				442	fl4 = &fl->u.ip4;
				443	rt = skb_rtable(skb);
				444	if (rt)
				445	goto packet_routed;
				446
				447	/* Make sure we can route this packet. */
				448	rt = (struct rtable *)__sk_dst_check(sk, 0);
				449	if (!rt) {
				450	__be32 daddr;
				451
				452	/* Use correct destination address if we have options. */
				453	daddr = inet->inet_daddr;
				454	if (inet_opt && inet_opt->opt.srr)
				455	daddr = inet_opt->opt.faddr;
				456
				457	/* If this fails, retransmit mechanism of transport layer will
				458	* keep trying until route appears or the connection times
				459	* itself out.
				460	*/
				461	rt = ip_route_output_ports(net, fl4, sk,
				462	daddr, inet->inet_saddr,
				463	inet->inet_dport,
				464	inet->inet_sport,
				465	sk->sk_protocol,
				466	RT_CONN_FLAGS(sk),
				467	sk->sk_bound_dev_if);
				468	if (IS_ERR(rt))
				469	goto no_route;
				470	sk_setup_caps(sk, &rt->dst);
				471	}
				472	skb_dst_set_noref(skb, &rt->dst);
				473
				474	packet_routed:
				475	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
				476	goto no_route;
				477
				478	/* OK, we know where to send it, allocate and build IP header. */
				479	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
				480	skb_reset_network_header(skb);
				481	iph = ip_hdr(skb);
				482	((__be16 )iph) = htons((4 << 12) \| (5 << 8) \| (inet->tos & 0xff));
				483	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
				484	iph->frag_off = htons(IP_DF);
				485	else
				486	iph->frag_off = 0;
				487	iph->ttl = ip_select_ttl(inet, &rt->dst);
				488	iph->protocol = sk->sk_protocol;
				489	ip_copy_addrs(iph, fl4);
				490
				491	/* Transport layer set skb->h.foo itself. */
				492
				493	if (inet_opt && inet_opt->opt.optlen) {
				494	iph->ihl += inet_opt->opt.optlen >> 2;
				495	ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
				496	}
				497
				498	ip_select_ident_segs(net, skb, sk,
				499	skb_shinfo(skb)->gso_segs ?: 1);
				500
				501	/* TODO : should we use skb->sk here instead of sk ? */
				502	skb->priority = sk->sk_priority;
				503	skb->mark = sk->sk_mark;
				504
				505	res = ip_local_out(net, sk, skb);
				506	rcu_read_unlock();
				507	return res;
				508
				509	no_route:
				510	rcu_read_unlock();
				511	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
				512	kfree_skb(skb);
				513	return -EHOSTUNREACH;
				514	}
				515	EXPORT_SYMBOL(ip_queue_xmit);
				516
				517	static void ip_copy_metadata(struct sk_buff to, struct sk_buff from)
				518	{
				519	to->pkt_type = from->pkt_type;
				520	to->priority = from->priority;
				521	to->protocol = from->protocol;
				522	to->skb_iif = from->skb_iif;
				523	skb_dst_drop(to);
				524	skb_dst_copy(to, from);
				525	to->dev = from->dev;
				526	to->mark = from->mark;
				527
				528	skb_copy_hash(to, from);
				529
				530	/* Copy the flags to each fragment. */
				531	IPCB(to)->flags = IPCB(from)->flags;
				532
				533	#ifdef CONFIG_NET_SCHED
				534	to->tc_index = from->tc_index;
				535	#endif
				536	nf_copy(to, from);
				537	#if IS_ENABLED(CONFIG_IP_VS)
				538	to->ipvs_property = from->ipvs_property;
				539	#endif
				540	skb_copy_secmark(to, from);
				541	}
				542
				543	static int ip_fragment(struct net net, struct sock sk, struct sk_buff *skb,
				544	unsigned int mtu,
				545	int (output)(struct net , struct sock , struct sk_buff ))
				546	{
				547	struct iphdr *iph = ip_hdr(skb);
				548
				549	if ((iph->frag_off & htons(IP_DF)) == 0)
				550	return ip_do_fragment(net, sk, skb, output);
				551
				552	if (unlikely(!skb->ignore_df \|\|
				553	(IPCB(skb)->frag_max_size &&
				554	IPCB(skb)->frag_max_size > mtu))) {
				555	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
				556	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				557	htonl(mtu));
				558	kfree_skb(skb);
				559	return -EMSGSIZE;
				560	}
				561
				562	return ip_do_fragment(net, sk, skb, output);
				563	}
				564
				565	/*
				566	* This IP datagram is too large to be sent in one piece. Break it up into
				567	* smaller pieces (each of size equal to IP header plus
				568	* a block of the data of the original IP data part) that will yet fit in a
				569	* single device frame, and queue such a frame for sending.
				570	*/
				571
				572	int ip_do_fragment(struct net net, struct sock sk, struct sk_buff *skb,
				573	int (output)(struct net , struct sock , struct sk_buff ))
				574	{
				575	struct iphdr *iph;
				576	int ptr;
				577	struct sk_buff *skb2;
				578	unsigned int mtu, hlen, left, len, ll_rs;
				579	int offset;
				580	__be16 not_last_frag;
				581	struct rtable *rt = skb_rtable(skb);
				582	int err = 0;
				583
				584	/* for offloaded checksums cleanup checksum before fragmentation */
				585	if (skb->ip_summed == CHECKSUM_PARTIAL &&
				586	(err = skb_checksum_help(skb)))
				587	goto fail;
				588
				589	/*
				590	* Point into the IP datagram header.
				591	*/
				592
				593	iph = ip_hdr(skb);
				594
				595	mtu = ip_skb_dst_mtu(sk, skb);
				596	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
				597	mtu = IPCB(skb)->frag_max_size;
				598
				599	/*
				600	* Setup starting values.
				601	*/
				602
				603	hlen = iph->ihl * 4;
				604	mtu = mtu - hlen; /* Size of data space */
				605	IPCB(skb)->flags \|= IPSKB_FRAG_COMPLETE;
				606	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
				607
				608	/* When frag_list is given, use it. First, check its validity:
				609	* some transformers could create wrong frag_list or break existing
				610	* one, it is not prohibited. In this case fall back to copying.
				611	*
				612	* LATER: this step can be merged to real generation of fragments,
				613	* we can switch to copy when see the first bad fragment.
				614	*/
				615	if (skb_has_frag_list(skb)) {
				616	struct sk_buff frag, frag2;
				617	unsigned int first_len = skb_pagelen(skb);
				618
				619	if (first_len - hlen > mtu \|\|
				620	((first_len - hlen) & 7) \|\|
				621	ip_is_fragment(iph) \|\|
				622	skb_cloned(skb) \|\|
				623	skb_headroom(skb) < ll_rs)
				624	goto slow_path;
				625
				626	skb_walk_frags(skb, frag) {
				627	/* Correct geometry. */
				628	if (frag->len > mtu \|\|
				629	((frag->len & 7) && frag->next) \|\|
				630	skb_headroom(frag) < hlen + ll_rs)
				631	goto slow_path_clean;
				632
				633	/* Partially cloned skb? */
				634	if (skb_shared(frag))
				635	goto slow_path_clean;
				636
				637	BUG_ON(frag->sk);
				638	if (skb->sk) {
				639	frag->sk = skb->sk;
				640	frag->destructor = sock_wfree;
				641	}
				642	skb->truesize -= frag->truesize;
				643	}
				644
				645	/* Everything is OK. Generate! */
				646
				647	err = 0;
				648	offset = 0;
				649	frag = skb_shinfo(skb)->frag_list;
				650	skb_frag_list_init(skb);
				651	skb->data_len = first_len - skb_headlen(skb);
				652	skb->len = first_len;
				653	iph->tot_len = htons(first_len);
				654	iph->frag_off = htons(IP_MF);
				655	ip_send_check(iph);
				656
				657	for (;;) {
				658	/* Prepare header of the next frame,
				659	* before previous one went down. */
				660	if (frag) {
				661	frag->ip_summed = CHECKSUM_NONE;
				662	skb_reset_transport_header(frag);
				663	__skb_push(frag, hlen);
				664	skb_reset_network_header(frag);
				665	memcpy(skb_network_header(frag), iph, hlen);
				666	iph = ip_hdr(frag);
				667	iph->tot_len = htons(frag->len);
				668	ip_copy_metadata(frag, skb);
				669	if (offset == 0)
				670	ip_options_fragment(frag);
				671	offset += skb->len - hlen;
				672	iph->frag_off = htons(offset>>3);
				673	if (frag->next)
				674	iph->frag_off \|= htons(IP_MF);
				675	/* Ready, complete checksum */
				676	ip_send_check(iph);
				677	}
				678
				679	err = output(net, sk, skb);
				680
				681	if (!err)
				682	IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
				683	if (err \|\| !frag)
				684	break;
				685
				686	skb = frag;
				687	frag = skb->next;
				688	skb->next = NULL;
				689	}
				690
				691	if (err == 0) {
				692	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
				693	return 0;
				694	}
				695
				696	while (frag) {
				697	skb = frag->next;
				698	kfree_skb(frag);
				699	frag = skb;
				700	}
				701	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
				702	return err;
				703
				704	slow_path_clean:
				705	skb_walk_frags(skb, frag2) {
				706	if (frag2 == frag)
				707	break;
				708	frag2->sk = NULL;
				709	frag2->destructor = NULL;
				710	skb->truesize += frag2->truesize;
				711	}
				712	}
				713
				714	slow_path:
				715	iph = ip_hdr(skb);
				716
				717	left = skb->len - hlen; /* Space per frame */
				718	ptr = hlen; /* Where to start from */
				719
				720	/*
				721	* Fragment the datagram.
				722	*/
				723
				724	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
				725	not_last_frag = iph->frag_off & htons(IP_MF);
				726
				727	/*
				728	* Keep copying data until we run out.
				729	*/
				730
				731	while (left > 0) {
				732	len = left;
				733	/* IF: it doesn't fit, use 'mtu' - the data space left */
				734	if (len > mtu)
				735	len = mtu;
				736	/* IF: we are not sending up to and including the packet end
				737	then align the next start on an eight byte boundary */
				738	if (len < left) {
				739	len &= ~7;
				740	}
				741
				742	/* Allocate buffer */
				743	skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
				744	if (!skb2) {
				745	err = -ENOMEM;
				746	goto fail;
				747	}
				748
				749	/*
				750	* Set up data on packet
				751	*/
				752
				753	ip_copy_metadata(skb2, skb);
				754	skb_reserve(skb2, ll_rs);
				755	skb_put(skb2, len + hlen);
				756	skb_reset_network_header(skb2);
				757	skb2->transport_header = skb2->network_header + hlen;
				758
				759	/*
				760	* Charge the memory for the fragment to any owner
				761	* it might possess
				762	*/
				763
				764	if (skb->sk)
				765	skb_set_owner_w(skb2, skb->sk);
				766
				767	/*
				768	* Copy the packet header into the new buffer.
				769	*/
				770
				771	skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
				772
				773	/*
				774	* Copy a block of the IP datagram.
				775	*/
				776	if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
				777	BUG();
				778	left -= len;
				779
				780	/*
				781	* Fill in the new header fields.
				782	*/
				783	iph = ip_hdr(skb2);
				784	iph->frag_off = htons((offset >> 3));
				785
				786	if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
				787	iph->frag_off \|= htons(IP_DF);
				788
				789	/* ANK: dirty, but effective trick. Upgrade options only if
				790	* the segment to be fragmented was THE FIRST (otherwise,
				791	* options are already fixed) and make it ONCE
				792	* on the initial skb, so that all the following fragments
				793	* will inherit fixed options.
				794	*/
				795	if (offset == 0)
				796	ip_options_fragment(skb);
				797
				798	/*
				799	* Added AC : If we are fragmenting a fragment that's not the
				800	* last fragment then keep MF on each bit
				801	*/
				802	if (left > 0 \|\| not_last_frag)
				803	iph->frag_off \|= htons(IP_MF);
				804	ptr += len;
				805	offset += len;
				806
				807	/*
				808	* Put this fragment into the sending queue.
				809	*/
				810	iph->tot_len = htons(len + hlen);
				811
				812	ip_send_check(iph);
				813
				814	err = output(net, sk, skb2);
				815	if (err)
				816	goto fail;
				817
				818	IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
				819	}
				820	consume_skb(skb);
				821	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
				822	return err;
				823
				824	fail:
				825	kfree_skb(skb);
				826	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
				827	return err;
				828	}
				829	EXPORT_SYMBOL(ip_do_fragment);
				830
				831	int
				832	ip_generic_getfrag(void from, char to, int offset, int len, int odd, struct sk_buff *skb)
				833	{
				834	struct msghdr *msg = from;
				835
				836	if (skb->ip_summed == CHECKSUM_PARTIAL) {
				837	if (!copy_from_iter_full(to, len, &msg->msg_iter))
				838	return -EFAULT;
				839	} else {
				840	__wsum csum = 0;
				841	if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
				842	return -EFAULT;
				843	skb->csum = csum_block_add(skb->csum, csum, odd);
				844	}
				845	return 0;
				846	}
				847	EXPORT_SYMBOL(ip_generic_getfrag);
				848
				849	static inline __wsum
				850	csum_page(struct page *page, int offset, int copy)
				851	{
				852	char *kaddr;
				853	__wsum csum;
				854	kaddr = kmap(page);
				855	csum = csum_partial(kaddr + offset, copy, 0);
				856	kunmap(page);
				857	return csum;
				858	}
				859
				860	static int __ip_append_data(struct sock *sk,
				861	struct flowi4 *fl4,
				862	struct sk_buff_head *queue,
				863	struct inet_cork *cork,
				864	struct page_frag *pfrag,
				865	int getfrag(void from, char to, int offset,
				866	int len, int odd, struct sk_buff *skb),
				867	void *from, int length, int transhdrlen,
				868	unsigned int flags)
				869	{
				870	struct inet_sock *inet = inet_sk(sk);
				871	struct sk_buff *skb;
				872
				873	struct ip_options *opt = cork->opt;
				874	int hh_len;
				875	int exthdrlen;
				876	int mtu;
				877	int copy;
				878	int err;
				879	int offset = 0;
				880	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
				881	int csummode = CHECKSUM_NONE;
				882	struct rtable rt = (struct rtable )cork->dst;
				883	u32 tskey = 0;
				884
				885	skb = skb_peek_tail(queue);
				886
				887	exthdrlen = !skb ? rt->dst.header_len : 0;
				888	mtu = cork->fragsize;
				889	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
				890	sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
				891	tskey = sk->sk_tskey++;
				892
				893	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
				894
				895	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
				896	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
				897	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
				898
				899	if (cork->length + length > maxnonfragsize - fragheaderlen) {
				900	ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
				901	mtu - (opt ? opt->optlen : 0));
				902	return -EMSGSIZE;
				903	}
				904
				905	/*
				906	* transhdrlen > 0 means that this is the first fragment and we wish
				907	* it won't be fragmented in the future.
				908	*/
				909	if (transhdrlen &&
				910	length + fragheaderlen <= mtu &&
				911	rt->dst.dev->features & (NETIF_F_HW_CSUM \| NETIF_F_IP_CSUM) &&
				912	!(flags & MSG_MORE) &&
				913	!exthdrlen)
				914	csummode = CHECKSUM_PARTIAL;
				915
				916	cork->length += length;
				917
				918	/* So, what's going on in the loop below?
				919	*
				920	* We use calculated fragment length to generate chained skb,
				921	* each of segments is IP fragment ready for sending to network after
				922	* adding appropriate IP header.
				923	*/
				924
				925	if (!skb)
				926	goto alloc_new_skb;
				927
				928	while (length > 0) {
				929	/* Check if the remaining data fits into current packet. */
				930	copy = mtu - skb->len;
				931	if (copy < length)
				932	copy = maxfraglen - skb->len;
				933	if (copy <= 0) {
				934	char *data;
				935	unsigned int datalen;
				936	unsigned int fraglen;
				937	unsigned int fraggap;
				938	unsigned int alloclen;
				939	struct sk_buff *skb_prev;
				940	alloc_new_skb:
				941	skb_prev = skb;
				942	if (skb_prev)
				943	fraggap = skb_prev->len - maxfraglen;
				944	else
				945	fraggap = 0;
				946
				947	/*
				948	* If remaining data exceeds the mtu,
				949	* we know we need more fragment(s).
				950	*/
				951	datalen = length + fraggap;
				952	if (datalen > mtu - fragheaderlen)
				953	datalen = maxfraglen - fragheaderlen;
				954	fraglen = datalen + fragheaderlen;
				955
				956	if ((flags & MSG_MORE) &&
				957	!(rt->dst.dev->features&NETIF_F_SG))
				958	alloclen = mtu;
				959	else
				960	alloclen = fraglen;
				961
				962	alloclen += exthdrlen;
				963
				964	/* The last fragment gets additional space at tail.
				965	* Note, with MSG_MORE we overallocate on fragments,
				966	* because we have no idea what fragment will be
				967	* the last.
				968	*/
				969	if (datalen == length + fraggap)
				970	alloclen += rt->dst.trailer_len;
				971
				972	if (transhdrlen) {
				973	skb = sock_alloc_send_skb(sk,
				974	alloclen + hh_len + 15,
				975	(flags & MSG_DONTWAIT), &err);
				976	} else {
				977	skb = NULL;
				978	if (refcount_read(&sk->sk_wmem_alloc) <=
				979	2 * sk->sk_sndbuf)
				980	skb = sock_wmalloc(sk,
				981	alloclen + hh_len + 15, 1,
				982	sk->sk_allocation);
				983	if (unlikely(!skb))
				984	err = -ENOBUFS;
				985	}
				986	if (!skb)
				987	goto error;
				988
				989	/*
				990	* Fill in the control structures
				991	*/
				992	skb->ip_summed = csummode;
				993	skb->csum = 0;
				994	skb_reserve(skb, hh_len);
				995
				996	/* only the initial fragment is time stamped */
				997	skb_shinfo(skb)->tx_flags = cork->tx_flags;
				998	cork->tx_flags = 0;
				999	skb_shinfo(skb)->tskey = tskey;
				1000	tskey = 0;
				1001
				1002	/*
				1003	* Find where to start putting bytes.
				1004	*/
				1005	data = skb_put(skb, fraglen + exthdrlen);
				1006	skb_set_network_header(skb, exthdrlen);
				1007	skb->transport_header = (skb->network_header +
				1008	fragheaderlen);
				1009	data += fragheaderlen + exthdrlen;
				1010
				1011	if (fraggap) {
				1012	skb->csum = skb_copy_and_csum_bits(
				1013	skb_prev, maxfraglen,
				1014	data + transhdrlen, fraggap, 0);
				1015	skb_prev->csum = csum_sub(skb_prev->csum,
				1016	skb->csum);
				1017	data += fraggap;
				1018	pskb_trim_unique(skb_prev, maxfraglen);
				1019	}
				1020
				1021	copy = datalen - transhdrlen - fraggap;
				1022	if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				1023	err = -EFAULT;
				1024	kfree_skb(skb);
				1025	goto error;
				1026	}
				1027
				1028	offset += copy;
				1029	length -= datalen - fraggap;
				1030	transhdrlen = 0;
				1031	exthdrlen = 0;
				1032	csummode = CHECKSUM_NONE;
				1033
				1034	if ((flags & MSG_CONFIRM) && !skb_prev)
				1035	skb_set_dst_pending_confirm(skb, 1);
				1036
				1037	/*
				1038	* Put the packet on the pending queue.
				1039	*/
				1040	__skb_queue_tail(queue, skb);
				1041	continue;
				1042	}
				1043
				1044	if (copy > length)
				1045	copy = length;
				1046
				1047	if (!(rt->dst.dev->features&NETIF_F_SG) &&
				1048	skb_tailroom(skb) >= copy) {
				1049	unsigned int off;
				1050
				1051	off = skb->len;
				1052	if (getfrag(from, skb_put(skb, copy),
				1053	offset, copy, off, skb) < 0) {
				1054	__skb_trim(skb, off);
				1055	err = -EFAULT;
				1056	goto error;
				1057	}
				1058	} else {
				1059	int i = skb_shinfo(skb)->nr_frags;
				1060
				1061	err = -ENOMEM;
				1062	if (!sk_page_frag_refill(sk, pfrag))
				1063	goto error;
				1064
				1065	if (!skb_can_coalesce(skb, i, pfrag->page,
				1066	pfrag->offset)) {
				1067	err = -EMSGSIZE;
				1068	if (i == MAX_SKB_FRAGS)
				1069	goto error;
				1070
				1071	__skb_fill_page_desc(skb, i, pfrag->page,
				1072	pfrag->offset, 0);
				1073	skb_shinfo(skb)->nr_frags = ++i;
				1074	get_page(pfrag->page);
				1075	}
				1076	copy = min_t(int, copy, pfrag->size - pfrag->offset);
				1077	if (getfrag(from,
				1078	page_address(pfrag->page) + pfrag->offset,
				1079	offset, copy, skb->len, skb) < 0)
				1080	goto error_efault;
				1081
				1082	pfrag->offset += copy;
				1083	skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
				1084	skb->len += copy;
				1085	skb->data_len += copy;
				1086	skb->truesize += copy;
				1087	refcount_add(copy, &sk->sk_wmem_alloc);
				1088	}
				1089	offset += copy;
				1090	length -= copy;
				1091	}
				1092
				1093	return 0;
				1094
				1095	error_efault:
				1096	err = -EFAULT;
				1097	error:
				1098	cork->length -= length;
				1099	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
				1100	return err;
				1101	}
				1102
				1103	static int ip_setup_cork(struct sock sk, struct inet_cork cork,
				1104	struct ipcm_cookie ipc, struct rtable *rtp)
				1105	{
				1106	struct ip_options_rcu *opt;
				1107	struct rtable *rt;
				1108
				1109	/*
				1110	* setup for corking.
				1111	*/
				1112	opt = ipc->opt;
				1113	if (opt) {
				1114	if (!cork->opt) {
				1115	cork->opt = kmalloc(sizeof(struct ip_options) + 40,
				1116	sk->sk_allocation);
				1117	if (unlikely(!cork->opt))
				1118	return -ENOBUFS;
				1119	}
				1120	memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
				1121	cork->flags \|= IPCORK_OPT;
				1122	cork->addr = ipc->addr;
				1123	}
				1124	rt = *rtp;
				1125	if (unlikely(!rt))
				1126	return -EFAULT;
				1127
				1128	cork->fragsize = ip_sk_use_pmtu(sk) ?
				1129	dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
				1130
				1131	if (!inetdev_valid_mtu(cork->fragsize))
				1132	return -ENETUNREACH;
				1133
				1134	cork->dst = &rt->dst;
				1135	/* We stole this route, caller should not release it. */
				1136	*rtp = NULL;
				1137
				1138	cork->length = 0;
				1139	cork->ttl = ipc->ttl;
				1140	cork->tos = ipc->tos;
				1141	cork->priority = ipc->priority;
				1142	cork->tx_flags = ipc->tx_flags;
				1143
				1144	return 0;
				1145	}
				1146
				1147	/*
				1148	* ip_append_data() and ip_append_page() can make one large IP datagram
				1149	* from many pieces of data. Each pieces will be holded on the socket
				1150	* until ip_push_pending_frames() is called. Each piece can be a page
				1151	* or non-page data.
				1152	*
				1153	* Not only UDP, other transport protocols - e.g. raw sockets - can use
				1154	* this interface potentially.
				1155	*
				1156	* LATER: length must be adjusted by pad at tail, when it is required.
				1157	*/
				1158	int ip_append_data(struct sock sk, struct flowi4 fl4,
				1159	int getfrag(void from, char to, int offset, int len,
				1160	int odd, struct sk_buff *skb),
				1161	void *from, int length, int transhdrlen,
				1162	struct ipcm_cookie ipc, struct rtable *rtp,
				1163	unsigned int flags)
				1164	{
				1165	struct inet_sock *inet = inet_sk(sk);
				1166	int err;
				1167
				1168	if (flags&MSG_PROBE)
				1169	return 0;
				1170
				1171	if (skb_queue_empty(&sk->sk_write_queue)) {
				1172	err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
				1173	if (err)
				1174	return err;
				1175	} else {
				1176	transhdrlen = 0;
				1177	}
				1178
				1179	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				1180	sk_page_frag(sk), getfrag,
				1181	from, length, transhdrlen, flags);
				1182	}
				1183
				1184	ssize_t ip_append_page(struct sock sk, struct flowi4 fl4, struct page *page,
				1185	int offset, size_t size, int flags)
				1186	{
				1187	struct inet_sock *inet = inet_sk(sk);
				1188	struct sk_buff *skb;
				1189	struct rtable *rt;
				1190	struct ip_options *opt = NULL;
				1191	struct inet_cork *cork;
				1192	int hh_len;
				1193	int mtu;
				1194	int len;
				1195	int err;
				1196	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
				1197
				1198	if (inet->hdrincl)
				1199	return -EPERM;
				1200
				1201	if (flags&MSG_PROBE)
				1202	return 0;
				1203
				1204	if (skb_queue_empty(&sk->sk_write_queue))
				1205	return -EINVAL;
				1206
				1207	cork = &inet->cork.base;
				1208	rt = (struct rtable *)cork->dst;
				1209	if (cork->flags & IPCORK_OPT)
				1210	opt = cork->opt;
				1211
				1212	if (!(rt->dst.dev->features&NETIF_F_SG))
				1213	return -EOPNOTSUPP;
				1214
				1215	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
				1216	mtu = cork->fragsize;
				1217
				1218	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
				1219	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
				1220	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
				1221
				1222	if (cork->length + size > maxnonfragsize - fragheaderlen) {
				1223	ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
				1224	mtu - (opt ? opt->optlen : 0));
				1225	return -EMSGSIZE;
				1226	}
				1227
				1228	skb = skb_peek_tail(&sk->sk_write_queue);
				1229	if (!skb)
				1230	return -EINVAL;
				1231
				1232	cork->length += size;
				1233
				1234	while (size > 0) {
				1235	/* Check if the remaining data fits into current packet. */
				1236	len = mtu - skb->len;
				1237	if (len < size)
				1238	len = maxfraglen - skb->len;
				1239
				1240	if (len <= 0) {
				1241	struct sk_buff *skb_prev;
				1242	int alloclen;
				1243
				1244	skb_prev = skb;
				1245	fraggap = skb_prev->len - maxfraglen;
				1246
				1247	alloclen = fragheaderlen + hh_len + fraggap + 15;
				1248	skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
				1249	if (unlikely(!skb)) {
				1250	err = -ENOBUFS;
				1251	goto error;
				1252	}
				1253
				1254	/*
				1255	* Fill in the control structures
				1256	*/
				1257	skb->ip_summed = CHECKSUM_NONE;
				1258	skb->csum = 0;
				1259	skb_reserve(skb, hh_len);
				1260
				1261	/*
				1262	* Find where to start putting bytes.
				1263	*/
				1264	skb_put(skb, fragheaderlen + fraggap);
				1265	skb_reset_network_header(skb);
				1266	skb->transport_header = (skb->network_header +
				1267	fragheaderlen);
				1268	if (fraggap) {
				1269	skb->csum = skb_copy_and_csum_bits(skb_prev,
				1270	maxfraglen,
				1271	skb_transport_header(skb),
				1272	fraggap, 0);
				1273	skb_prev->csum = csum_sub(skb_prev->csum,
				1274	skb->csum);
				1275	pskb_trim_unique(skb_prev, maxfraglen);
				1276	}
				1277
				1278	/*
				1279	* Put the packet on the pending queue.
				1280	*/
				1281	__skb_queue_tail(&sk->sk_write_queue, skb);
				1282	continue;
				1283	}
				1284
				1285	if (len > size)
				1286	len = size;
				1287
				1288	if (skb_append_pagefrags(skb, page, offset, len)) {
				1289	err = -EMSGSIZE;
				1290	goto error;
				1291	}
				1292
				1293	if (skb->ip_summed == CHECKSUM_NONE) {
				1294	__wsum csum;
				1295	csum = csum_page(page, offset, len);
				1296	skb->csum = csum_block_add(skb->csum, csum, skb->len);
				1297	}
				1298
				1299	skb->len += len;
				1300	skb->data_len += len;
				1301	skb->truesize += len;
				1302	refcount_add(len, &sk->sk_wmem_alloc);
				1303	offset += len;
				1304	size -= len;
				1305	}
				1306	return 0;
				1307
				1308	error:
				1309	cork->length -= size;
				1310	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
				1311	return err;
				1312	}
				1313
				1314	static void ip_cork_release(struct inet_cork *cork)
				1315	{
				1316	cork->flags &= ~IPCORK_OPT;
				1317	kfree(cork->opt);
				1318	cork->opt = NULL;
				1319	dst_release(cork->dst);
				1320	cork->dst = NULL;
				1321	}
				1322
				1323	/*
				1324	* Combined all pending IP fragments on the socket as one IP datagram
				1325	* and push them out.
				1326	*/
				1327	struct sk_buff __ip_make_skb(struct sock sk,
				1328	struct flowi4 *fl4,
				1329	struct sk_buff_head *queue,
				1330	struct inet_cork *cork)
				1331	{
				1332	struct sk_buff skb, tmp_skb;
				1333	struct sk_buff **tail_skb;
				1334	struct inet_sock *inet = inet_sk(sk);
				1335	struct net *net = sock_net(sk);
				1336	struct ip_options *opt = NULL;
				1337	struct rtable rt = (struct rtable )cork->dst;
				1338	struct iphdr *iph;
				1339	__be16 df = 0;
				1340	__u8 ttl;
				1341
				1342	skb = __skb_dequeue(queue);
				1343	if (!skb)
				1344	goto out;
				1345	tail_skb = &(skb_shinfo(skb)->frag_list);
				1346
				1347	/* move skb->data to ip header from ext header */
				1348	if (skb->data < skb_network_header(skb))
				1349	__skb_pull(skb, skb_network_offset(skb));
				1350	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
				1351	__skb_pull(tmp_skb, skb_network_header_len(skb));
				1352	*tail_skb = tmp_skb;
				1353	tail_skb = &(tmp_skb->next);
				1354	skb->len += tmp_skb->len;
				1355	skb->data_len += tmp_skb->len;
				1356	skb->truesize += tmp_skb->truesize;
				1357	tmp_skb->destructor = NULL;
				1358	tmp_skb->sk = NULL;
				1359	}
				1360
				1361	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
				1362	* to fragment the frame generated here. No matter, what transforms
				1363	* how transforms change size of the packet, it will come out.
				1364	*/
				1365	skb->ignore_df = ip_sk_ignore_df(sk);
				1366
				1367	/* DF bit is set when we want to see DF on outgoing frames.
				1368	* If ignore_df is set too, we still allow to fragment this frame
				1369	* locally. */
				1370	if (inet->pmtudisc == IP_PMTUDISC_DO \|\|
				1371	inet->pmtudisc == IP_PMTUDISC_PROBE \|\|
				1372	(skb->len <= dst_mtu(&rt->dst) &&
				1373	ip_dont_fragment(sk, &rt->dst)))
				1374	df = htons(IP_DF);
				1375
				1376	if (cork->flags & IPCORK_OPT)
				1377	opt = cork->opt;
				1378
				1379	if (cork->ttl != 0)
				1380	ttl = cork->ttl;
				1381	else if (rt->rt_type == RTN_MULTICAST)
				1382	ttl = inet->mc_ttl;
				1383	else
				1384	ttl = ip_select_ttl(inet, &rt->dst);
				1385
				1386	iph = ip_hdr(skb);
				1387	iph->version = 4;
				1388	iph->ihl = 5;
				1389	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
				1390	iph->frag_off = df;
				1391	iph->ttl = ttl;
				1392	iph->protocol = sk->sk_protocol;
				1393	ip_copy_addrs(iph, fl4);
				1394	ip_select_ident(net, skb, sk);
				1395
				1396	if (opt) {
				1397	iph->ihl += opt->optlen>>2;
				1398	ip_options_build(skb, opt, cork->addr, rt, 0);
				1399	}
				1400
				1401	skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
				1402	skb->mark = sk->sk_mark;
				1403	/*
				1404	* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
				1405	* on dst refcount
				1406	*/
				1407	cork->dst = NULL;
				1408	skb_dst_set(skb, &rt->dst);
				1409
				1410	if (iph->protocol == IPPROTO_ICMP)
				1411	icmp_out_count(net, ((struct icmphdr *)
				1412	skb_transport_header(skb))->type);
				1413
				1414	ip_cork_release(cork);
				1415	out:
				1416	return skb;
				1417	}
				1418
				1419	int ip_send_skb(struct net net, struct sk_buff skb)
				1420	{
				1421	int err;
				1422
				1423	err = ip_local_out(net, skb->sk, skb);
				1424	if (err) {
				1425	if (err > 0)
				1426	err = net_xmit_errno(err);
				1427	if (err)
				1428	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
				1429	}
				1430
				1431	return err;
				1432	}
				1433
				1434	int ip_push_pending_frames(struct sock sk, struct flowi4 fl4)
				1435	{
				1436	struct sk_buff *skb;
				1437
				1438	skb = ip_finish_skb(sk, fl4);
				1439	if (!skb)
				1440	return 0;
				1441
				1442	/* Netfilter gets whole the not fragmented skb. */
				1443	return ip_send_skb(sock_net(sk), skb);
				1444	}
				1445
				1446	/*
				1447	* Throw away all pending data on the socket.
				1448	*/
				1449	static void __ip_flush_pending_frames(struct sock *sk,
				1450	struct sk_buff_head *queue,
				1451	struct inet_cork *cork)
				1452	{
				1453	struct sk_buff *skb;
				1454
				1455	while ((skb = __skb_dequeue_tail(queue)) != NULL)
				1456	kfree_skb(skb);
				1457
				1458	ip_cork_release(cork);
				1459	}
				1460
				1461	void ip_flush_pending_frames(struct sock *sk)
				1462	{
				1463	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
				1464	}
				1465
				1466	struct sk_buff ip_make_skb(struct sock sk,
				1467	struct flowi4 *fl4,
				1468	int getfrag(void from, char to, int offset,
				1469	int len, int odd, struct sk_buff *skb),
				1470	void *from, int length, int transhdrlen,
				1471	struct ipcm_cookie ipc, struct rtable *rtp,
				1472	unsigned int flags)
				1473	{
				1474	struct inet_cork cork;
				1475	struct sk_buff_head queue;
				1476	int err;
				1477
				1478	if (flags & MSG_PROBE)
				1479	return NULL;
				1480
				1481	__skb_queue_head_init(&queue);
				1482
				1483	cork.flags = 0;
				1484	cork.addr = 0;
				1485	cork.opt = NULL;
				1486	err = ip_setup_cork(sk, &cork, ipc, rtp);
				1487	if (err)
				1488	return ERR_PTR(err);
				1489
				1490	err = __ip_append_data(sk, fl4, &queue, &cork,
				1491	&current->task_frag, getfrag,
				1492	from, length, transhdrlen, flags);
				1493	if (err) {
				1494	__ip_flush_pending_frames(sk, &queue, &cork);
				1495	return ERR_PTR(err);
				1496	}
				1497
				1498	return __ip_make_skb(sk, fl4, &queue, &cork);
				1499	}
				1500
				1501	/*
				1502	* Fetch data from kernel space and fill in checksum if needed.
				1503	*/
				1504	static int ip_reply_glue_bits(void dptr, char to, int offset,
				1505	int len, int odd, struct sk_buff *skb)
				1506	{
				1507	__wsum csum;
				1508
				1509	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
				1510	skb->csum = csum_block_add(skb->csum, csum, odd);
				1511	return 0;
				1512	}
				1513
				1514	/*
				1515	* Generic function to send a packet as reply to another packet.
				1516	* Used to send some TCP resets/acks so far.
				1517	*/
				1518	void ip_send_unicast_reply(struct sock sk, struct sk_buff skb,
				1519	const struct ip_options *sopt,
				1520	__be32 daddr, __be32 saddr,
				1521	const struct ip_reply_arg *arg,
				1522	unsigned int len)
				1523	{
				1524	struct ip_options_data replyopts;
				1525	struct ipcm_cookie ipc;
				1526	struct flowi4 fl4;
				1527	struct rtable *rt = skb_rtable(skb);
				1528	struct net *net = sock_net(sk);
				1529	struct sk_buff *nskb;
				1530	int err;
				1531	int oif;
				1532
				1533	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
				1534	return;
				1535
				1536	ipc.addr = daddr;
				1537	ipc.opt = NULL;
				1538	ipc.tx_flags = 0;
				1539	ipc.ttl = 0;
				1540	ipc.tos = -1;
				1541
				1542	if (replyopts.opt.opt.optlen) {
				1543	ipc.opt = &replyopts.opt;
				1544
				1545	if (replyopts.opt.opt.srr)
				1546	daddr = replyopts.opt.opt.faddr;
				1547	}
				1548
				1549	oif = arg->bound_dev_if;
				1550	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
				1551	oif = skb->skb_iif;
				1552
				1553	flowi4_init_output(&fl4, oif,
				1554	IP4_REPLY_MARK(net, skb->mark),
				1555	RT_TOS(arg->tos),
				1556	RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
				1557	ip_reply_arg_flowi_flags(arg),
				1558	daddr, saddr,
				1559	tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
				1560	arg->uid);
				1561	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
				1562	rt = ip_route_output_key(net, &fl4);
				1563	if (IS_ERR(rt))
				1564	return;
				1565
				1566	inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
				1567
				1568	sk->sk_priority = skb->priority;
				1569	sk->sk_protocol = ip_hdr(skb)->protocol;
				1570	sk->sk_bound_dev_if = arg->bound_dev_if;
				1571	sk->sk_sndbuf = sysctl_wmem_default;
				1572	sk->sk_mark = fl4.flowi4_mark;
				1573	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
				1574	len, 0, &ipc, &rt, MSG_DONTWAIT);
				1575	if (unlikely(err)) {
				1576	ip_flush_pending_frames(sk);
				1577	goto out;
				1578	}
				1579
				1580	nskb = skb_peek(&sk->sk_write_queue);
				1581	if (nskb) {
				1582	if (arg->csumoffset >= 0)
				1583	((__sum16 )skb_transport_header(nskb) +
				1584	arg->csumoffset) = csum_fold(csum_add(nskb->csum,
				1585	arg->csum));
				1586	nskb->ip_summed = CHECKSUM_NONE;
				1587	ip_push_pending_frames(sk, &fl4);
				1588	}
				1589	out:
				1590	ip_rt_put(rt);
				1591	}
				1592
				1593	void __init ip_init(void)
				1594	{
				1595	ip_rt_init();
				1596	inet_initpeers();
				1597
				1598	#if defined(CONFIG_IP_MULTICAST)
				1599	igmp_mc_init();
				1600	#endif
				1601	}