/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted
 *					by output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
#include <net/ra_nat.h>

/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

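/*
 * Finalize a locally generated IPv4 packet (total length and header
 * checksum) and run it through the NF_INET_LOCAL_OUT netfilter hook.
 * By the nf_hook() convention, a return value of 1 means the packet was
 * accepted and the caller should pass it on to dst_output(); 0 or a
 * negative errno means it was consumed or dropped.
 */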
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* if the egress device is enslaved to an L3 master device, pass
	 * the skb to its handler for processing
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(net, sk, skb);
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

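/*
 * Pick the TTL for an outgoing packet: the socket's unicast TTL if one
 * was set with the IP_TTL socket option, otherwise the route default.
 */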
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 * Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct net *net = sock_net(sk);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	if (ip_dont_fragment(sk, &rt->dst)) {
		iph->frag_off = htons(IP_DF);
		iph->id = 0;
	} else {
		iph->frag_off = 0;
		__ip_select_ident(net, iph, 1);
	}

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	if (!skb->mark)
		skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

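/*
 * Final transmit step: make sure there is enough headroom for the
 * device's link-layer header, resolve the next-hop neighbour (creating
 * an ARP entry if needed) and hand the skb to neigh_output() for
 * transmission on the wire.
 */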
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (!skb2) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res;

		sock_confirm_neigh(skb, neigh);
		res = neigh_output(neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip_finish_output_gso(struct net *net, struct sock *sk,
				struct sk_buff *skb, unsigned int mtu)
{
	netdev_features_t features;
	struct sk_buff *segs;
	int ret = 0;

	/* common case: seglen is <= mtu */
	if (skb_gso_validate_network_len(skb, mtu))
		return ip_finish_output2(net, sk, skb);

	/* Slowpath - GSO segment length exceeds the egress MTU.
	 *
	 * This can happen in several cases:
	 * - Forwarding of a TCP GRO skb, when DF flag is not set.
	 * - Forwarding of an skb that arrived on a virtualization interface
	 *   (virtio-net/vhost/tap) with TSO/GSO size set by other network
	 *   stack.
	 * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
	 *   interface with a smaller MTU.
	 * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
	 *   bridged to a NETIF_F_TSO tunnel stacked over an interface with an
	 *   insufficient MTU.
	 */
	features = netif_skb_features(skb);
	BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	do {
		struct sk_buff *nskb = segs->next;
		int err;

		segs->next = NULL;
		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);

		if (err && ret == 0)
			ret = err;
		segs = nskb;
	} while (segs);

	return ret;
}

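/*
 * Runs after NF_INET_POST_ROUTING: applies the cgroup egress BPF
 * program, re-enters dst_output() when a policy lookup after SNAT
 * attached an xfrm transform, and otherwise segments (GSO) or
 * fragments the packet as needed before ip_finish_output2().
 */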
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif
	mtu = ip_skb_dst_mtu(sk, skb);
	if (skb_is_gso(skb))
		return ip_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);

	return ip_finish_output2(net, sk, skb);
}

static int ip_mc_finish_output(struct net *net, struct sock *sk,
			       struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

	return dev_loopback_xmit(net, sk, skb);
}

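/*
 * Output path for multicast (and broadcast) packets.  Clones that need
 * local delivery are looped back through dev_loopback_xmit(); the
 * original skb then continues through POST_ROUTING to the device.
 */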
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					ip_mc_finish_output);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
				net, sk, newskb, NULL, newskb->dev,
				ip_mc_finish_output);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, skb->dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

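/*
 * Standard unicast output path: account the packet, set the outgoing
 * device and protocol, then traverse NF_INET_POST_ROUTING before
 * ip_finish_output() (skipped for packets already rerouted by xfrm).
 */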
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *	iph->saddr = fl4->saddr;
 *	iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}

/* Note: skb->sk can be different from sk, in case of tunnels */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
		    __u8 tos)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (!rt) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, the retransmit mechanism of the transport
		 * layer will keep trying until the route appears or the
		 * connection times itself out.
		 */
		rt = ip_route_output_ports(net, fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS_TOS(sk, tos),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* The transport layer sets skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_segs(net, skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);

	/* TODO : should we use skb->sk here instead of sk ? */
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* hw_nat use */
	hwnat_set_l2tp_unhit(iph, skb);
	hwnat_check_magic_tag(skb);

	res = ip_local_out(net, sk, skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(__ip_queue_xmit);

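/*
 * Copy the per-packet metadata (priority, dst, mark, netfilter and
 * scheduler state, ...) from the original skb to a freshly allocated
 * fragment, so that every fragment is handled like the original.
 */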
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->skb_iif = from->skb_iif;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

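/*
 * Entry point for fragmentation.  If the DF bit is set and the packet
 * may not be locally fragmented (or is known to exceed the path MTU),
 * answer with ICMP_FRAG_NEEDED carrying the MTU and drop the packet;
 * otherwise defer to ip_do_fragment().
 */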
int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		unsigned int mtu,
		int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct iphdr *iph = ip_hdr(skb);

	if ((iph->frag_off & htons(IP_DF)) == 0)
		return ip_do_fragment(net, sk, skb, output);

	if (unlikely(!skb->ignore_df ||
		     (IPCB(skb)->frag_max_size &&
		      IPCB(skb)->frag_max_size > mtu))) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	return ip_do_fragment(net, sk, skb, output);
}

/*
 * This IP datagram is too large to be sent in one piece.  Break it up
 * into smaller pieces (each one consisting of an IP header plus a block
 * of the data of the original IP datagram) that will still fit in a
 * single device frame, and queue such a frame for sending.
 */

int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		   int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	/* for offloaded checksums, clean up the checksum before fragmentation */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	mtu = ip_skb_dst_mtu(sk, skb);
	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
		mtu = IPCB(skb)->frag_max_size;

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = mtu - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	ll_rs = LL_RESERVED_SPACE(rt->dst.dev);

	/* When frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; this is not prohibited.  In that case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		unsigned int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < ll_rs)
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen + ll_rs)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(net, sk, skb);

			if (!err)
				IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	iph = ip_hdr(skb);

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (!skb2) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
			iph->frag_off |= htons(IP_DF);

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each fragment
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(net, sk, skb2);
		if (err)
			goto fail;

		IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_do_fragment);

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct msghdr *msg = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (!copy_from_iter_full(to, len, &msg->msg_iter))
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

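/*
 * Append user data to the pending queue, building a chain of skbs that
 * are each at most one (8-byte aligned) fragment long.  getfrag() copies
 * the caller's data (and computes the checksum when the device cannot);
 * the IP headers themselves are only added later by __ip_make_skb().
 */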
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    struct page_frag *pfrag,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;
	unsigned int wmem_alloc_delta = 0;
	u32 tskey = 0;
	bool paged;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
	paged = !!cork->gso_size;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + length > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we
	 * want it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each segment is an IP fragment ready for sending to the network
	 * after adding the appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen = 0;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len + 15,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue.
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

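/*
 * Initialise the cork from the per-call cookie: stash the IP options,
 * pick the fragment size from the path MTU (or the device MTU when PMTU
 * discovery is off) and take over the caller's route reference.
 */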
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct ip_options_rcu *opt;
	struct rtable *rt;

	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (!cork->opt) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(!cork->opt))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}

	cork->fragsize = ip_sk_use_pmtu(sk) ?
			 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);

	if (!inetdev_valid_mtu(cork->fragsize))
		return -ENETUNREACH;

	cork->gso_size = ipc->gso_size;

	cork->dst = &rt->dst;
	/* We stole this route, caller should not release it. */
	*rtp = NULL;

	cork->length = 0;
	cork->ttl = ipc->ttl;
	cork->tos = ipc->tos;
	cork->priority = ipc->priority;
	cork->transmit_time = ipc->sockc.transmit_time;
	cork->tx_flags = 0;
	sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);

	return 0;
}

/*
 * ip_append_data() and ip_append_page() can make one large IP datagram
 * from many pieces of data.  Each piece will be held on the socket
 * until ip_push_pending_frames() is called.  Each piece can be a page
 * or non-page data.
 *
 * Besides UDP, other transport protocols - e.g. raw sockets - can
 * potentially use this interface.
 *
 * LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
				sk_page_frag(sk), getfrag,
				from, length, transhdrlen, flags);
}

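/*
 * Zero-copy variant of ip_append_data(): attach page fragments to the
 * pending queue instead of copying the payload.  Requires a
 * scatter-gather capable device and an already started datagram
 * (non-empty write queue).
 */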
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;

	if (cork->length + size > maxnonfragsize - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - (opt ? opt->optlen : 0));
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	if (!skb)
		return -EINVAL;

	cork->length += size;

	while (size > 0) {
		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;

		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
								   skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (len > size)
			len = size;

		if (skb_append_pagefrags(skb, page, offset, len)) {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		refcount_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 * Combine all pending IP fragments on the socket into one IP datagram
 * and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here.  No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	skb->ignore_df = ip_sk_ignore_df(sk);

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If ignore_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    inet->pmtudisc == IP_PMTUDISC_PROBE ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (cork->ttl != 0)
		ttl = cork->ttl;
	else if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
	ip_select_ident(net, skb, sk);

	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = (cork->tos != -1) ? cork->priority : sk->sk_priority;
	skb->mark = sk->sk_mark;
	skb->tstamp = cork->transmit_time;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

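/*
 * Hand a fully built datagram to the local output path, translating
 * positive NET_XMIT codes into errnos and accounting any drop.
 */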
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
	int err;

	err = ip_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, unfragmented skb. */
	return ip_send_skb(sock_net(sk), skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    struct inet_cork *cork, unsigned int flags)
{
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->flags = 0;
	cork->addr = 0;
	cork->opt = NULL;
	err = ip_setup_cork(sk, cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, cork,
			       &current->task_frag, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, cork);
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send some TCP resets/acks so far.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
			   const struct ip_options *sopt,
			   __be32 daddr, __be32 saddr,
			   const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);
	struct net *net = sock_net(sk);
	struct sk_buff *nskb;
	int err;
	int oif;

	if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
		return;

	ipcm_init(&ipc);
	ipc.addr = daddr;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	oif = arg->bound_dev_if;
	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
		oif = skb->skb_iif;

	flowi4_init_output(&fl4, oif,
			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
			   arg->uid);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	inet_sk(sk)->tos = arg->tos;

	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_mark = fl4.flowi4_mark;
	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
			     len, 0, &ipc, &rt, MSG_DONTWAIT);
	if (unlikely(err)) {
		ip_flush_pending_frames(sk);
		goto out;
	}

	nskb = skb_peek(&sk->sk_write_queue);
	if (nskb) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(nskb) +
			  arg->csumoffset) = csum_fold(csum_add(nskb->csum,
								arg->csum));
		nskb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}
out:
	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST)
	igmp_mc_init();
#endif
}
1619}