blob: 166bc0c5e8af55a0393511bf09af4c7014798702 [file] [log] [blame]
yuezonghe824eb0c2024-06-27 02:32:26 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
 36 *					and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83#include <net/SI/fast_common.h>
84
85
86int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
87EXPORT_SYMBOL(sysctl_ip_default_ttl);
88
89/* Generate a checksum for an outgoing IP datagram. */
90__inline__ void ip_send_check(struct iphdr *iph)
91{
92 iph->check = 0;
93 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
94}
95EXPORT_SYMBOL(ip_send_check);
96
97int __ip_local_out(struct sk_buff *skb)
98{
99 struct iphdr *iph = ip_hdr(skb);
100
101 iph->tot_len = htons(skb->len);
102 ip_send_check(iph);
103 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
104 skb_dst(skb)->dev, dst_output);
105}
106
/* Send a locally generated packet: run the LOCAL_OUT hook and, when it
 * returns 1 (accepted, okfn not invoked), finish via dst_output().
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
118
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	/* Rewind the skb so it looks like a freshly received frame. */
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated data does not need re-verification. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	/* Take a real dst reference before queueing to softirq context. */
	skb_dst_force(newskb);
	netif_rx_ni(newskb);
	return 0;
}
131
132static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
133{
134 int ttl = inet->uc_ttl;
135
136 if (ttl < 0)
137 ttl = ip4_dst_hoplimit(dst);
138 return ttl;
139}
140
/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
/* Builds a complete IPv4 header (optionally with IP options) in front of
 * the payload already in @skb, using addresses supplied by the caller,
 * then sends the packet through ip_local_out().
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	/* Set DF according to the socket's PMTU-discovery policy. */
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	/* With strict source routing, the wire destination is the first hop. */
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(skb, sk);

	if (opt && opt->opt.optlen) {
		/* Options extend the header; ihl counts 32-bit words. */
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
181
/* Final step of IP output: account mcast/bcast bytes, guarantee enough
 * headroom for the link-layer header, then hand the packet to the
 * neighbour layer for transmission.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		/* Too little headroom for the hardware header: reallocate,
		 * preserving ownership, and drop the original skb.
		 */
		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	/* Neighbour lookup must stay inside the RCU read section. */
	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	/* No neighbour entry: nothing to transmit through; drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
225
226static inline int ip_skb_dst_mtu(struct sk_buff *skb)
227{
228 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
229
230 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
231 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
232}
233
234static int ip_finish_output(struct sk_buff *skb)
235{
236#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
237 /* Policy lookup after SNAT yielded a new policy */
238 if (skb_dst(skb)->xfrm != NULL) {
239 IPCB(skb)->flags |= IPSKB_REROUTED;
240 return dst_output(skb);
241 }
242#endif
243 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
244 return ip_fragment(skb, ip_finish_output2);
245 else
246 return ip_finish_output2(skb);
247}
248
/* Output routine for multicast/broadcast routes: loops a clone back to
 * local listeners where required, then sends the original through the
 * POST_ROUTING hook.
 */
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			/* Clone for local delivery; the original continues
			 * out to the wire below.
			 */
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	/* Skip POST_ROUTING if the packet was already rerouted once. */
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
309
310int ip_output(struct sk_buff *skb)
311{
312 struct net_device *dev = skb_dst(skb)->dev;
313
314 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
315
316 skb->dev = dev;
317 skb->protocol = htons(ETH_P_IP);
318
319 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
320 ip_finish_output,
321 !(IPCB(skb)->flags & IPSKB_REROUTED));
322}
323
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	/* The single memcpy relies on daddr immediately following saddr
	 * in struct flowi4 (as in struct iphdr); the BUILD_BUG_ON breaks
	 * the build if that layout ever changes.
	 */
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
337
338extern int fast_local4_output_num;
339
340int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
341{
342 struct sock *sk = skb->sk;
343 struct inet_sock *inet = inet_sk(sk);
344 struct ip_options_rcu *inet_opt;
345 struct flowi4 *fl4;
346 struct rtable *rt;
347 struct iphdr *iph;
348 int res;
349 int fast_flag = 0;
350 struct nf_conn *ct;
351
352 /* Skip all of this if the packet is already routed,
353 * f.e. by something like SCTP.
354 */
355 rcu_read_lock();
356 inet_opt = rcu_dereference(inet->inet_opt);
357 fl4 = &fl->u.ip4;
358 rt = skb_rtable(skb);
359 if (rt != NULL)
360 goto packet_routed;
361
362 /* Make sure we can route this packet. */
363 rt = (struct rtable *)__sk_dst_check(sk, 0);
364 if (rt == NULL) {
365 __be32 daddr;
366
367 /* Use correct destination address if we have options. */
368 daddr = inet->inet_daddr;
369 if (inet_opt && inet_opt->opt.srr)
370 daddr = inet_opt->opt.faddr;
371
372 /* If this fails, retransmit mechanism of transport layer will
373 * keep trying until route appears or the connection times
374 * itself out.
375 */
376 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
377 daddr, inet->inet_saddr,
378 inet->inet_dport,
379 inet->inet_sport,
380 sk->sk_protocol,
381 RT_CONN_FLAGS(sk),
382 sk->sk_bound_dev_if);
383 if (IS_ERR(rt))
384 goto no_route;
385 sk_setup_caps(sk, &rt->dst);
386 }
387 skb_dst_set_noref(skb, &rt->dst);
388
389packet_routed:
390 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
391 goto no_route;
392
393 /* OK, we know where to send it, allocate and build IP header. */
394 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
395 skb_reset_network_header(skb);
396 iph = ip_hdr(skb);
397 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
398 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
399 iph->frag_off = htons(IP_DF);
400 else
401 iph->frag_off = 0;
402 iph->ttl = ip_select_ttl(inet, &rt->dst);
403 iph->protocol = sk->sk_protocol;
404 ip_copy_addrs(iph, fl4);
405
406 /* Transport layer set skb->h.foo itself. */
407
408 if (inet_opt && inet_opt->opt.optlen) {
409 iph->ihl += inet_opt->opt.optlen >> 2;
410 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
411 }
412
413 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
414
415 skb->priority = sk->sk_priority;
416 skb->mark = sk->sk_mark;
417
418 //Èç¹ûÒѾ­·¢Ëͳ¬¹ýãÐÖµÁË£¬Ö±½Ó¿ìËÙÌí¼ÓMACÍ·£¬Ìø¹ýËùÓеÄIP²ãHOOK¹³×Óº¯Êý
419 if (fast_local4_output_proc && fast_local4_output_proc(skb))
420 {
421 fast_local4_output_num++;
422 res = ip_finish_output(skb);
423 }
424 else
425 {
426 sk->sk_send_sum++;
427 res = ip_local_out(skb);
428 }
429
430 rcu_read_unlock();
431 return res;
432
433no_route:
434 rcu_read_unlock();
435 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
436 kfree_skb(skb);
437 return -EHOSTUNREACH;
438}
439EXPORT_SYMBOL(ip_queue_xmit);
440
441
/* Propagate per-packet metadata from the original skb onto a freshly
 * created fragment so it is routed, scheduled and tracked identically.
 */
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Release any stale dst on the fragment before copying. */
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
	defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
468
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

/* Fragments @skb and sends each fragment via @output.
 *
 * Two strategies: a zero-copy "fast path" that reuses an existing,
 * correctly shaped frag_list, and a copying "slow path" that allocates
 * a fresh skb per fragment.  Sends ICMP FRAG_NEEDED and fails with
 * -EMSGSIZE when DF is set and local_df is not.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	/* DF set and fragmentation not locally overridden: report PMTU. */
	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}
			net_run_track(PRT_FRAGMENT, "ip_fragment!\n");
			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* Output failed part-way: free the remaining fragments. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done in the validity walk. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		net_run_track(PRT_FRAGMENT, "ip_fragment slow path!\n");
		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
740
741int
742ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
743{
744 struct iovec *iov = from;
745
746 if (skb->ip_summed == CHECKSUM_PARTIAL) {
747 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
748 return -EFAULT;
749 } else {
750 __wsum csum = 0;
751 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
752 return -EFAULT;
753 skb->csum = csum_block_add(skb->csum, csum, odd);
754 }
755 return 0;
756}
757EXPORT_SYMBOL(ip_generic_getfrag);
758
759static inline __wsum
760csum_page(struct page *page, int offset, int copy)
761{
762 char *kaddr;
763 __wsum csum;
764 kaddr = kmap(page);
765 csum = csum_partial(kaddr + offset, copy, 0);
766 kunmap(page);
767 return csum;
768}
769
/* Append data for UDP fragmentation offload: build (or extend) a single
 * large skb that the NIC will segment into IP fragments itself.
 * Returns 0 on success or a negative errno.
 */
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		/* +20 slack on top of headers — see alloc sizing below. */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Device computes the checksum during segmentation. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	/* Payload (minus the transport header already accounted) lands in
	 * page frags of the queued skb.
	 */
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
816
/* Core of ip_append_data(): append @length bytes (fetched via @getfrag)
 * to the pending datagram on @queue, creating MTU-sized skbs so each
 * can later become one IP fragment.  @transhdrlen > 0 marks the first
 * chunk (it includes the transport header).  Returns 0 or a negative
 * errno; on error the bytes not appended are subtracted from the cork.
 */
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	/* Extension (e.g. IPsec) header space only matters for the first skb. */
	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	/* Fragment payloads must be multiples of 8 bytes. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* IPv4 total length is a 16-bit field: cap the datagram size. */
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
/*
	if (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
*/
	/* Once a GSO skb is queued, later appends must also take the UFO
	 * path (backported guard for CVE-2017-1000112).
	 */
	if ((skb && skb_is_gso(skb)) ||//CVE-2017-1000112
	    (((length > mtu) || (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len)) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			/* Bytes past the 8-byte boundary in the previous skb
			 * must migrate to the new one (fraggap).
			 */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later chunks: only a soft sndbuf limit
				 * (2x) and a non-blocking allocation.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				/* Move the trailing bytes (and their checksum
				 * contribution) from the previous skb here.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: copy into (possibly new) page frags
			 * tracked by the cork's current page/offset.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				netslab_inc(IP_OUTPUT_ALLOC_PAGES);
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1079
/* Initialize the socket's cork state for a new pending datagram:
 * capture the caller's IP options, steal the route reference from
 * *rtp (set to NULL on success), and record the fragmentation size.
 * Returns 0, -ENOBUFS on allocation failure, or -EFAULT when no route
 * was supplied.
 */
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			/* 40 bytes covers the maximum IPv4 options size. */
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	/* Probing sockets use the raw device MTU, others the dst MTU. */
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
1119
1120/*
1121 * ip_append_data() and ip_append_page() can make one large IP datagram
1122 * from many pieces of data. Each pieces will be holded on the socket
1123 * until ip_push_pending_frames() is called. Each piece can be a page
1124 * or non-page data.
1125 *
1126 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1127 * this interface potentially.
1128 *
1129 * LATER: length must be adjusted by pad at tail, when it is required.
1130 */
1131int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1132 int getfrag(void *from, char *to, int offset, int len,
1133 int odd, struct sk_buff *skb),
1134 void *from, int length, int transhdrlen,
1135 struct ipcm_cookie *ipc, struct rtable **rtp,
1136 unsigned int flags)
1137{
1138 struct inet_sock *inet = inet_sk(sk);
1139 int err;
1140
1141 if (flags&MSG_PROBE)
1142 return 0;
1143
1144 if (skb_queue_empty(&sk->sk_write_queue)) {
1145 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1146 if (err)
1147 return err;
1148 } else {
1149 transhdrlen = 0;
1150 }
1151
1152 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1153 from, length, transhdrlen, flags);
1154}
1155
/*
 * ip_append_page - append @size bytes at @offset within @page to the
 * pending datagram on sk->sk_write_queue (zero-copy sendpage path).
 *
 * Requires that ip_append_data() has already set up the cork state (the
 * write queue must be non-empty), that the socket is not IP_HDRINCL, and
 * that the output device supports scatter/gather (NETIF_F_SG).
 *
 * Returns 0 on success or a negative errno; on error the bytes not yet
 * queued are subtracted back out of cork->length.
 */
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	/* Raw-header sockets build their own IP header; no append path. */
	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* Cork state must have been set up by a prior ip_append_data(). */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Page frags are useless if the device cannot do scatter/gather. */
	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	/* Per-fragment header cost; fragment payloads are 8-byte aligned. */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	/* Total IP datagram length field is 16 bits. */
	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	/* CVE-2017-1000112 fix: only switch the tail skb to UFO when it is
	 * the sole skb on the queue, so a previously non-UFO-built skb can
	 * never be retagged as GSO. */
	if ((size + skb->len > mtu) &&
	    (skb_queue_len(&sk->sk_write_queue) == 1) &&//CVE-2017-1000112
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		/* A GSO skb absorbs everything; hardware segments later. */
		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		/* Current fragment is full: start a new fragment skb,
		 * moving any bytes past the 8-byte boundary (fraggap)
		 * from the previous fragment into the new one. */
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail bytes (and their checksum
				 * contribution) from the previous fragment. */
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the last page frag if contiguous, else add a new
		 * frag slot (taking a page reference). */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		/* Software checksum: fold the new bytes in now. */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* Account the appended bytes on the skb and the socket. */
		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	/* Undo the optimistic length accounting for the unqueued remainder. */
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1301
1302static void ip_cork_release(struct inet_cork *cork)
1303{
1304 cork->flags &= ~IPCORK_OPT;
1305 kfree(cork->opt);
1306 cork->opt = NULL;
1307 dst_release(cork->dst);
1308 cork->dst = NULL;
1309}
1310
1311/*
1312 * Combined all pending IP fragments on the socket as one IP datagram
1313 * and push them out.
1314 */
/*
 * __ip_make_skb - collapse all skbs on @queue into one datagram skb
 * (first skb head + the rest chained on frag_list), build its IP header
 * from @cork/@fl4, and release the cork state.
 *
 * Returns the finished skb, or NULL if the queue was empty.  The route
 * reference held by the cork is transferred to the returned skb.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs on frag_list, stripping their network
	 * headers and disowning them from the socket (the head skb now
	 * accounts for all of the memory). */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	/* Multicast uses the socket's multicast TTL; otherwise derive
	 * the TTL from the route. */
	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	/* Fill in the IP header (space was reserved by the append path). */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);
	ip_select_ident(skb, sk);

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
1403
1404int ip_send_skb(struct sk_buff *skb)
1405{
1406 struct net *net = sock_net(skb->sk);
1407 int err;
1408
1409 if (fast_local4_output_proc && fast_local4_output_proc(skb))
1410 {
1411 fast_local4_output_num++;
1412 err = ip_finish_output(skb);
1413 }
1414 else
1415 {
1416 skb->sk->sk_send_sum++;
1417 err = ip_local_out(skb);
1418 }
1419 if (err) {
1420 if (err > 0)
1421 err = net_xmit_errno(err);
1422 if (err)
1423 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1424 }
1425
1426 return err;
1427}
1428
1429int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1430{
1431 struct sk_buff *skb;
1432
1433 skb = ip_finish_skb(sk, fl4);
1434 if (!skb)
1435 return 0;
1436
1437 /* Netfilter gets whole the not fragmented skb. */
1438 return ip_send_skb(skb);
1439}
1440
/*
 * Throw away all pending data on the socket: free every queued
 * fragment skb and release the cork state.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	for (;;) {
		struct sk_buff *skb = __skb_dequeue_tail(queue);

		if (!skb)
			break;
		kfree_skb(skb);
	}

	ip_cork_release(cork);
}
1455
1456void ip_flush_pending_frames(struct sock *sk)
1457{
1458 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1459}
1460
1461struct sk_buff *ip_make_skb(struct sock *sk,
1462 struct flowi4 *fl4,
1463 int getfrag(void *from, char *to, int offset,
1464 int len, int odd, struct sk_buff *skb),
1465 void *from, int length, int transhdrlen,
1466 struct ipcm_cookie *ipc, struct rtable **rtp,
1467 unsigned int flags)
1468{
1469 struct inet_cork cork;
1470 struct sk_buff_head queue;
1471 int err;
1472
1473 if (flags & MSG_PROBE)
1474 return NULL;
1475
1476 __skb_queue_head_init(&queue);
1477
1478 cork.flags = 0;
1479 cork.addr = 0;
1480 cork.opt = NULL;
1481 err = ip_setup_cork(sk, &cork, ipc, rtp);
1482 if (err)
1483 return ERR_PTR(err);
1484
1485 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1486 from, length, transhdrlen, flags);
1487 if (err) {
1488 __ip_flush_pending_frames(sk, &queue, &cork);
1489 return ERR_PTR(err);
1490 }
1491
1492 return __ip_make_skb(sk, fl4, &queue, &cork);
1493}
1494
1495/*
1496 * Fetch data from kernel space and fill in checksum if needed.
1497 */
1498static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1499 int len, int odd, struct sk_buff *skb)
1500{
1501 __wsum csum;
1502
1503 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1504 skb->csum = csum_block_add(skb->csum, csum, odd);
1505 return 0;
1506}
1507
/*
 * Generic function to send a packet as reply to another packet.
 * Used to send TCP resets so far. ICMP should use this function too.
 *
 * Should run single threaded per socket because it uses the sock
 * structure to pass arguments.
 *
 * @sk: a per-CPU control socket (its tos/priority/protocol/bound_dev_if
 *      fields are temporarily overwritten under bh_lock_sock below)
 * @skb: the packet being replied to
 * @daddr: destination address for the reply
 * @arg: reply parameters (payload iov, checksum seed/offset, tos, flags)
 * @len: reply payload length
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   const struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	/* Mirror the incoming packet's IP options; bail if that fails. */
	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		/* With source routing, send to the first-hop address. */
		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	/* Route the reply back out; ports are taken from the TCP header
	 * of the packet being answered (swapped source/dest). */
	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		/* Fold the precomputed pseudo-header checksum into the
		 * transport header at the caller-specified offset. */
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
1575
/*
 * ip_init - boot-time initialization of the IPv4 layer: routing tables,
 * the inet peer cache, and (when configured) the IGMP procfs entries.
 * Call order matters; runs once from the networking init path.
 */
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}