blob: 166bc0c5e8af55a0393511bf09af4c7014798702 [file] [log] [blame]
yuezonghe824eb0c2024-06-27 02:32:26 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
 36 *					and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83#include <net/SI/fast_common.h>
84
85
86int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
87EXPORT_SYMBOL(sysctl_ip_default_ttl);
88
89/* Generate a checksum for an outgoing IP datagram. */
90__inline__ void ip_send_check(struct iphdr *iph)
91{
92 iph->check = 0;
93 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
94}
95EXPORT_SYMBOL(ip_send_check);
96
97int __ip_local_out(struct sk_buff *skb)
98{
99 struct iphdr *iph = ip_hdr(skb);
100
101 iph->tot_len = htons(skb->len);
102 ip_send_check(iph);
103 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
104 skb_dst(skb)->dev, dst_output);
105}
106
/* Send a locally generated packet: run the LOCAL_OUT hook and, when it
 * returns 1 (accepted, okfn not invoked), finish via dst_output().
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
118
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	/* Rewind the skb so it looks like a freshly received frame. */
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated data does not need re-verification. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	/* Take a real dst reference before queueing to softirq context. */
	skb_dst_force(newskb);
	netif_rx_ni(newskb);
	return 0;
}
131
132static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
133{
134 int ttl = inet->uc_ttl;
135
136 if (ttl < 0)
137 ttl = ip4_dst_hoplimit(dst);
138 return ttl;
139}
140
/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
/* Builds a complete IPv4 header (optionally with IP options) in front of
 * the payload already in @skb, using addresses supplied by the caller,
 * then sends the packet through ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	/* Set DF according to the socket's PMTU-discovery policy. */
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	/* With strict source routing, the wire destination is the first hop. */
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(skb, sk);

	if (opt && opt->opt.optlen) {
		/* Options extend the header; ihl counts 32-bit words. */
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
181
/* Final step of IP output: account mcast/bcast bytes, guarantee enough
 * headroom for the link-layer header, then hand the packet to the
 * neighbour layer for transmission.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		/* Too little headroom for the hardware header: reallocate,
		 * preserving ownership, and drop the original skb.
		 */
		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	/* Neighbour lookup must stay inside the RCU read section. */
	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	/* No neighbour entry: nothing to transmit through; drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
225
226static inline int ip_skb_dst_mtu(struct sk_buff *skb)
227{
228 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
229
230 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
231 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
232}
233
234static int ip_finish_output(struct sk_buff *skb)
235{
236#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
237 /* Policy lookup after SNAT yielded a new policy */
238 if (skb_dst(skb)->xfrm != NULL) {
239 IPCB(skb)->flags |= IPSKB_REROUTED;
240 return dst_output(skb);
241 }
242#endif
243 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
244 return ip_fragment(skb, ip_finish_output2);
245 else
246 return ip_finish_output2(skb);
247}
248
/* Output routine for multicast/broadcast routes: loops a clone back to
 * local listeners where required, then sends the original through the
 * POST_ROUTING hook.
 */
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			/* Clone for local delivery; the original continues
			 * out to the wire below.
			 */
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	/* Skip POST_ROUTING if the packet was already rerouted once. */
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
309
310int ip_output(struct sk_buff *skb)
311{
312 struct net_device *dev = skb_dst(skb)->dev;
313
314 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
315
316 skb->dev = dev;
317 skb->protocol = htons(ETH_P_IP);
318
319 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
320 ip_finish_output,
321 !(IPCB(skb)->flags & IPSKB_REROUTED));
322}
323
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	/* The single memcpy relies on daddr immediately following saddr
	 * in struct flowi4 (as in struct iphdr); the BUILD_BUG_ON breaks
	 * the build if that layout ever changes.
	 */
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
337
338extern int fast_local4_output_num;
339
340int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
341{
342 struct sock *sk = skb->sk;
343 struct inet_sock *inet = inet_sk(sk);
344 struct ip_options_rcu *inet_opt;
345 struct flowi4 *fl4;
346 struct rtable *rt;
347 struct iphdr *iph;
348 int res;
349 int fast_flag = 0;
350 struct nf_conn *ct;
351
352 /* Skip all of this if the packet is already routed,
353 * f.e. by something like SCTP.
354 */
355 rcu_read_lock();
356 inet_opt = rcu_dereference(inet->inet_opt);
357 fl4 = &fl->u.ip4;
358 rt = skb_rtable(skb);
359 if (rt != NULL)
360 goto packet_routed;
361
362 /* Make sure we can route this packet. */
363 rt = (struct rtable *)__sk_dst_check(sk, 0);
364 if (rt == NULL) {
365 __be32 daddr;
366
367 /* Use correct destination address if we have options. */
368 daddr = inet->inet_daddr;
369 if (inet_opt && inet_opt->opt.srr)
370 daddr = inet_opt->opt.faddr;
371
372 /* If this fails, retransmit mechanism of transport layer will
373 * keep trying until route appears or the connection times
374 * itself out.
375 */
376 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
377 daddr, inet->inet_saddr,
378 inet->inet_dport,
379 inet->inet_sport,
380 sk->sk_protocol,
381 RT_CONN_FLAGS(sk),
382 sk->sk_bound_dev_if);
383 if (IS_ERR(rt))
384 goto no_route;
385 sk_setup_caps(sk, &rt->dst);
386 }
387 skb_dst_set_noref(skb, &rt->dst);
388
389packet_routed:
390 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
391 goto no_route;
392
393 /* OK, we know where to send it, allocate and build IP header. */
394 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
395 skb_reset_network_header(skb);
396 iph = ip_hdr(skb);
397 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
398 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
399 iph->frag_off = htons(IP_DF);
400 else
401 iph->frag_off = 0;
402 iph->ttl = ip_select_ttl(inet, &rt->dst);
403 iph->protocol = sk->sk_protocol;
404 ip_copy_addrs(iph, fl4);
405
406 /* Transport layer set skb->h.foo itself. */
407
408 if (inet_opt && inet_opt->opt.optlen) {
409 iph->ihl += inet_opt->opt.optlen >> 2;
410 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
411 }
412
413 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
414
415 skb->priority = sk->sk_priority;
416 skb->mark = sk->sk_mark;
417
418 //Èç¹ûÒѾ­·¢Ëͳ¬¹ýãÐÖµÁË£¬Ö±½Ó¿ìËÙÌí¼ÓMACÍ·£¬Ìø¹ýËùÓеÄIP²ãHOOK¹³×Óº¯Êý
419 if (fast_local4_output_proc && fast_local4_output_proc(skb))
420 {
421 fast_local4_output_num++;
422 res = ip_finish_output(skb);
423 }
424 else
425 {
426 sk->sk_send_sum++;
427 res = ip_local_out(skb);
428 }
429
430 rcu_read_unlock();
431 return res;
432
433no_route:
434 rcu_read_unlock();
435 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
436 kfree_skb(skb);
437 return -EHOSTUNREACH;
438}
439EXPORT_SYMBOL(ip_queue_xmit);
440
441
/* Propagate per-packet metadata from the original skb onto a freshly
 * created fragment so it is routed, scheduled and tracked identically.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Release any stale dst on the fragment before copying. */
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
	defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
468
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

/* Fragments @skb and sends each fragment via @output.
 *
 * Two strategies: a zero-copy "fast path" that reuses an existing,
 * correctly shaped frag_list, and a copying "slow path" that allocates
 * a fresh skb per fragment.  Sends ICMP FRAG_NEEDED and fails with
 * -EMSGSIZE when DF is set and local_df is not.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	/* DF set and fragmentation not locally overridden: report PMTU. */
	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}
			net_run_track(PRT_FRAGMENT, "ip_fragment!\n");
			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed part-way: free the remaining fragments. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done in the validity walk. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		net_run_track(PRT_FRAGMENT, "ip_fragment slow path!\n");
		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
740
741int
742ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
743{
744 struct iovec *iov = from;
745
746 if (skb->ip_summed == CHECKSUM_PARTIAL) {
747 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
748 return -EFAULT;
749 } else {
750 __wsum csum = 0;
751 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
752 return -EFAULT;
753 skb->csum = csum_block_add(skb->csum, csum, odd);
754 }
755 return 0;
756}
757EXPORT_SYMBOL(ip_generic_getfrag);
758
759static inline __wsum
760csum_page(struct page *page, int offset, int copy)
761{
762 char *kaddr;
763 __wsum csum;
764 kaddr = kmap(page);
765 csum = csum_partial(kaddr + offset, copy, 0);
766 kunmap(page);
767 return csum;
768}
769
/* Append data for UDP fragmentation offload: build (or extend) a single
 * large skb that the NIC will segment into IP fragments itself.
 * Returns 0 on success or a negative errno.
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		/* +20 slack on top of headers — see alloc sizing below. */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Device computes the checksum during segmentation. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	/* Payload (minus the transport header already accounted) lands in
	 * page frags of the queued skb.
	 */
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
816
/* Core of ip_append_data(): append @length bytes (fetched via @getfrag)
 * to the pending datagram on @queue, creating MTU-sized skbs so each
 * can later become one IP fragment.  @transhdrlen > 0 marks the first
 * chunk (it includes the transport header).  Returns 0 or a negative
 * errno; on error the bytes not appended are subtracted from the cork.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	/* Extension (e.g. IPsec) header space only matters for the first skb. */
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Fragment payloads must be multiples of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* IPv4 total length is a 16-bit field: cap the datagram size. */
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
/*
	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
*/
	/* Once a GSO skb is queued, later appends must also take the UFO
	 * path (backported guard for CVE-2017-1000112).
	 */
	if ((skb && skb_is_gso(skb)) ||//CVE-2017-1000112
	    (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/* Bytes past the 8-byte boundary in the previous skb
			 * must migrate to the new one (fraggap).
			 */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later chunks: only a soft sndbuf limit
				 * (2x) and a non-blocking allocation.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				/* Move the trailing bytes (and their checksum
				 * contribution) from the previous skb here.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: copy into (possibly new) page frags
			 * tracked by the cork's current page/offset.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				netslab_inc(IP_OUTPUT_ALLOC_PAGES);
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1079
/* Initialize the socket's cork state for a new pending datagram:
 * capture the caller's IP options, steal the route reference from
 * *rtp (set to NULL on success), and record the fragmentation size.
 * Returns 0, -ENOBUFS on allocation failure, or -EFAULT when no route
 * was supplied.
 */
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			/* 40 bytes covers the maximum IPv4 options size. */
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	/* Probing sockets use the raw device MTU, others the dst MTU. */
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
1119
1120/*
1121 * ip_append_data() and ip_append_page() can make one large IP datagram
1122 * from many pieces of data. Each pieces will be holded on the socket
1123 * until ip_push_pending_frames() is called. Each piece can be a page
1124 * or non-page data.
1125 *
1126 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1127 * this interface potentially.
1128 *
1129 * LATER: length must be adjusted by pad at tail, when it is required.
1130 */
1131int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1132 int getfrag(void *from, char *to, int offset, int len,
1133 int odd, struct sk_buff *skb),
1134 void *from, int length, int transhdrlen,
1135 struct ipcm_cookie *ipc, struct rtable **rtp,
1136 unsigned int flags)
1137{
1138 struct inet_sock *inet = inet_sk(sk);
1139 int err;
1140
1141 if (flags&MSG_PROBE)
1142 return 0;
1143
1144 if (skb_queue_empty(&sk->sk_write_queue)) {
1145 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1146 if (err)
1147 return err;
1148 } else {
1149 transhdrlen = 0;
1150 }
1151
1152 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1153 from, length, transhdrlen, flags);
1154}
1155
/*
 * ip_append_page - append @size bytes at @offset within @page to the
 * pending datagram on sk->sk_write_queue (zero-copy sendpage path).
 *
 * Requires that ip_append_data() has already set up the cork state (the
 * write queue must be non-empty), that the socket is not IP_HDRINCL, and
 * that the output device supports scatter/gather (NETIF_F_SG).
 *
 * Returns 0 on success or a negative errno; on error the bytes not yet
 * queued are subtracted back out of cork->length.
 */
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	/* Raw-header sockets build their own IP header; no append path. */
	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* Cork state must have been set up by a prior ip_append_data(). */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Page frags are useless if the device cannot do scatter/gather. */
	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	/* Per-fragment header cost; fragment payloads are 8-byte aligned. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* Total IP datagram length field is 16 bits. */
	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	/* CVE-2017-1000112 fix: only switch the tail skb to UFO when it is
	 * the sole skb on the queue, so a previously non-UFO-built skb can
	 * never be retagged as GSO. */
	if ((size + skb->len > mtu) &&
	    (skb_queue_len(&sk->sk_write_queue) == 1) &&//CVE-2017-1000112
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		/* A GSO skb absorbs everything; hardware segments later. */
		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		/* Current fragment is full: start a new fragment skb,
		 * moving any bytes past the 8-byte boundary (fraggap)
		 * from the previous fragment into the new one. */
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail bytes (and their checksum
				 * contribution) from the previous fragment. */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the last page frag if contiguous, else add a new
		 * frag slot (taking a page reference). */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		/* Software checksum: fold the new bytes in now. */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* Account the appended bytes on the skb and the socket. */
		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	/* Undo the optimistic length accounting for the unqueued remainder. */
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1301
1302static void ip_cork_release(struct inet_cork *cork)
1303{
1304 cork->flags &= ~IPCORK_OPT;
1305 kfree(cork->opt);
1306 cork->opt = NULL;
1307 dst_release(cork->dst);
1308 cork->dst = NULL;
1309}
1310
1311/*
1312 * Combined all pending IP fragments on the socket as one IP datagram
1313 * and push them out.
1314 */
/*
 * __ip_make_skb - collapse all skbs on @queue into one datagram skb
 * (first skb head + the rest chained on frag_list), build its IP header
 * from @cork/@fl4, and release the cork state.
 *
 * Returns the finished skb, or NULL if the queue was empty.  The route
 * reference held by the cork is transferred to the returned skb.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs on frag_list, stripping their network
	 * headers and disowning them from the socket (the head skb now
	 * accounts for all of the memory). */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Multicast uses the socket's multicast TTL; otherwise derive
	 * the TTL from the route. */
	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	/* Fill in the IP header (space was reserved by the append path). */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
	ip_select_ident(skb, sk);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
1403
1404int ip_send_skb(struct sk_buff *skb)
1405{
1406 struct net *net = sock_net(skb->sk);
1407 int err;
1408
1409 if (fast_local4_output_proc && fast_local4_output_proc(skb))
1410 {
1411 fast_local4_output_num++;
1412 err = ip_finish_output(skb);
1413 }
1414 else
1415 {
1416 skb->sk->sk_send_sum++;
1417 err = ip_local_out(skb);
1418 }
1419 if (err) {
1420 if (err > 0)
1421 err = net_xmit_errno(err);
1422 if (err)
1423 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1424 }
1425
1426 return err;
1427}
1428
1429int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1430{
1431 struct sk_buff *skb;
1432
1433 skb = ip_finish_skb(sk, fl4);
1434 if (!skb)
1435 return 0;
1436
1437 /* Netfilter gets whole the not fragmented skb. */
1438 return ip_send_skb(skb);
1439}
1440
/*
 * Throw away all pending data on the socket: free every queued
 * fragment skb and release the cork state.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	for (;;) {
		struct sk_buff *skb = __skb_dequeue_tail(queue);

		if (!skb)
			break;
		kfree_skb(skb);
	}

	ip_cork_release(cork);
}
1455
1456void ip_flush_pending_frames(struct sock *sk)
1457{
1458 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1459}
1460
1461struct sk_buff *ip_make_skb(struct sock *sk,
1462 struct flowi4 *fl4,
1463 int getfrag(void *from, char *to, int offset,
1464 int len, int odd, struct sk_buff *skb),
1465 void *from, int length, int transhdrlen,
1466 struct ipcm_cookie *ipc, struct rtable **rtp,
1467 unsigned int flags)
1468{
1469 struct inet_cork cork;
1470 struct sk_buff_head queue;
1471 int err;
1472
1473 if (flags & MSG_PROBE)
1474 return NULL;
1475
1476 __skb_queue_head_init(&queue);
1477
1478 cork.flags = 0;
1479 cork.addr = 0;
1480 cork.opt = NULL;
1481 err = ip_setup_cork(sk, &cork, ipc, rtp);
1482 if (err)
1483 return ERR_PTR(err);
1484
1485 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1486 from, length, transhdrlen, flags);
1487 if (err) {
1488 __ip_flush_pending_frames(sk, &queue, &cork);
1489 return ERR_PTR(err);
1490 }
1491
1492 return __ip_make_skb(sk, fl4, &queue, &cork);
1493}
1494
1495/*
1496 * Fetch data from kernel space and fill in checksum if needed.
1497 */
1498static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1499 int len, int odd, struct sk_buff *skb)
1500{
1501 __wsum csum;
1502
1503 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1504 skb->csum = csum_block_add(skb->csum, csum, odd);
1505 return 0;
1506}
1507
/*
 * Generic function to send a packet as reply to another packet.
 * Used to send TCP resets so far. ICMP should use this function too.
 *
 * Should run single threaded per socket because it uses the sock
 * structure to pass arguments.
 *
 * @sk: a per-CPU control socket (its tos/priority/protocol/bound_dev_if
 *      fields are temporarily overwritten under bh_lock_sock below)
 * @skb: the packet being replied to
 * @daddr: destination address for the reply
 * @arg: reply parameters (payload iov, checksum seed/offset, tos, flags)
 * @len: reply payload length
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   const struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	/* Mirror the incoming packet's IP options; bail if that fails. */
	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* With source routing, send to the first-hop address. */
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	/* Route the reply back out; ports are taken from the TCP header
	 * of the packet being answered (swapped source/dest). */
	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		/* Fold the precomputed pseudo-header checksum into the
		 * transport header at the caller-specified offset. */
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
1575
/*
 * ip_init - boot-time initialization of the IPv4 layer: routing tables,
 * the inet peer cache, and (when configured) the IGMP procfs entries.
 * Call order matters; runs once from the networking init path.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}