1/*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
 16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29#include <linux/errno.h>
30#include <linux/kernel.h>
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
39#include <linux/module.h>
40#include <linux/slab.h>
41
42#include <linux/bpf-cgroup.h>
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58#include <linux/mroute6.h>
59#include <net/l3mdev.h>
60#include <net/lwtunnel.h>
61
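/* Resolve the nexthop neighbour for the skb's dst and hand the packet to
 * neigh_output().  Multicast destinations are first looped back to local
 * listeners (and to the multicast router socket) when required, and
 * node-local scoped packets are only allowed out on the loopback device.
 */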
62static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63{
64 struct dst_entry *dst = skb_dst(skb);
65 struct net_device *dev = dst->dev;
66 struct neighbour *neigh;
67 struct in6_addr *nexthop;
68 int ret;
69
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 ((mroute6_is_socket(net, skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 is not supported in any case.
82 */
83 if (newskb)
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 net, sk, newskb, NULL, newskb->dev,
86 dev_loopback_xmit);
87
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(net, idev,
90 IPSTATS_MIB_OUTDISCARDS);
91 kfree_skb(skb);
92 return 0;
93 }
94 }
95
96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 IPV6_ADDR_SCOPE_NODELOCAL &&
100 !(dev->flags & IFF_LOOPBACK)) {
101 kfree_skb(skb);
102 return 0;
103 }
104 }
105
106 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 int res = lwtunnel_xmit(skb);
108
109 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 return res;
111 }
112
113 rcu_read_lock_bh();
114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 if (unlikely(!neigh))
117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 if (!IS_ERR(neigh)) {
119 sock_confirm_neigh(skb, neigh);
120 ret = neigh_output(neigh, skb);
121 rcu_read_unlock_bh();
122 return ret;
123 }
124 rcu_read_unlock_bh();
125
126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 kfree_skb(skb);
128 return -EINVAL;
129}
130
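/* Post-routing finish step: drop the packet if the egress cgroup BPF
 * program rejects it, restart via dst_output() when SNAT attached a new
 * xfrm policy, and fragment when the packet exceeds the path MTU (or the
 * recorded frag_max_size) and is not GSO.
 */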
131static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132{
133 int ret;
134
135 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 if (ret) {
137 kfree_skb(skb);
138 return ret;
139 }
140
141#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 /* Policy lookup after SNAT yielded a new policy */
143 if (skb_dst(skb)->xfrm) {
144 IPCB(skb)->flags |= IPSKB_REROUTED;
145 return dst_output(net, sk, skb);
146 }
147#endif
148
149 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 dst_allfrag(skb_dst(skb)) ||
151 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 else
154 return ip6_finish_output2(net, sk, skb);
155}
156
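/* Main IPv6 output entry point invoked through dst_output().  Drops the
 * packet when IPv6 is administratively disabled on the egress device,
 * otherwise runs the NF_INET_POST_ROUTING hook (skipped for skbs flagged
 * IP6SKB_REROUTED) before ip6_finish_output().
 */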
157int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158{
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162 skb->protocol = htons(ETH_P_IPV6);
163 skb->dev = dev;
164
165 if (unlikely(idev->cnf.disable_ipv6)) {
166 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 kfree_skb(skb);
168 return 0;
169 }
170
171 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 net, sk, skb, NULL, dev,
173 ip6_finish_output,
174 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175}
176
177bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178{
179 if (!np->autoflowlabel_set)
180 return ip6_default_np_autolabel(net);
181 else
182 return np->autoflowlabel;
183}
184
185/*
186 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 187 * Note: the socket lock is not held for SYNACK packets, but the skb may
 188 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 189 * which use proper atomic operations or spinlocks.
190 */
191int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 __u32 mark, struct ipv6_txoptions *opt, int tclass)
193{
194 struct net *net = sock_net(sk);
195 const struct ipv6_pinfo *np = inet6_sk(sk);
196 struct in6_addr *first_hop = &fl6->daddr;
197 struct dst_entry *dst = skb_dst(skb);
198 unsigned int head_room;
199 struct ipv6hdr *hdr;
200 u8 proto = fl6->flowi6_proto;
201 int seg_len = skb->len;
202 int hlimit = -1;
203 u32 mtu;
204
205 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
206 if (opt)
207 head_room += opt->opt_nflen + opt->opt_flen;
208
209 if (unlikely(skb_headroom(skb) < head_room)) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
211 if (!skb2) {
212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
213 IPSTATS_MIB_OUTDISCARDS);
214 kfree_skb(skb);
215 return -ENOBUFS;
216 }
217 if (skb->sk)
218 skb_set_owner_w(skb2, skb->sk);
219 consume_skb(skb);
220 skb = skb2;
221 }
222
223 if (opt) {
224 seg_len += opt->opt_nflen + opt->opt_flen;
225
226 if (opt->opt_flen)
227 ipv6_push_frag_opts(skb, opt, &proto);
228
229 if (opt->opt_nflen)
230 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
231 &fl6->saddr);
232 }
233
234 skb_push(skb, sizeof(struct ipv6hdr));
235 skb_reset_network_header(skb);
236 hdr = ipv6_hdr(skb);
237
238 /*
239 * Fill in the IPv6 header
240 */
241 if (np)
242 hlimit = np->hop_limit;
243 if (hlimit < 0)
244 hlimit = ip6_dst_hoplimit(dst);
245
246 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
247 ip6_autoflowlabel(net, np), fl6));
248
249 hdr->payload_len = htons(seg_len);
250 hdr->nexthdr = proto;
251 hdr->hop_limit = hlimit;
252
253 hdr->saddr = fl6->saddr;
254 hdr->daddr = *first_hop;
255
256 skb->protocol = htons(ETH_P_IPV6);
257 skb->priority = sk->sk_priority;
258 skb->mark = mark;
259
260 mtu = dst_mtu(dst);
261 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
262 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263 IPSTATS_MIB_OUT, skb->len);
264
265 /* if egress device is enslaved to an L3 master device pass the
266 * skb to its handler for processing
267 */
268 skb = l3mdev_ip6_out((struct sock *)sk, skb);
269 if (unlikely(!skb))
270 return 0;
271
272 /* hooks should never assume socket lock is held.
273 * we promote our socket to non const
274 */
275 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
276 net, (struct sock *)sk, skb, NULL, dst->dev,
277 dst_output);
278 }
279
280 skb->dev = dst->dev;
281 /* ipv6_local_error() does not require socket lock,
282 * we promote our socket to non const
283 */
284 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
285
286 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
287 kfree_skb(skb);
288 return -EMSGSIZE;
289}
290EXPORT_SYMBOL(ip6_xmit);
291
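/* Deliver a Router Alert packet to every raw socket that registered for
 * this alert value via IPV6_ROUTER_ALERT (and, if bound, on the right
 * device).  Returns 1 if at least one socket consumed the skb, 0 if the
 * caller still owns it.
 */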
292static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
293{
294 struct ip6_ra_chain *ra;
295 struct sock *last = NULL;
296
297 read_lock(&ip6_ra_lock);
298 for (ra = ip6_ra_chain; ra; ra = ra->next) {
299 struct sock *sk = ra->sk;
300 if (sk && ra->sel == sel &&
301 (!sk->sk_bound_dev_if ||
302 sk->sk_bound_dev_if == skb->dev->ifindex)) {
303 if (last) {
304 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
305 if (skb2)
306 rawv6_rcv(last, skb2);
307 }
308 last = sk;
309 }
310 }
311
312 if (last) {
313 rawv6_rcv(last, skb);
314 read_unlock(&ip6_ra_lock);
315 return 1;
316 }
317 read_unlock(&ip6_ra_lock);
318 return 0;
319}
320
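/* Decide what to do with a packet whose destination address is being
 * proxied: return 1 for NDISC messages that must be handed to local
 * input, 0 to forward normally, and -1 (after signalling a link failure)
 * for link-local destinations that cannot be forwarded.
 */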
321static int ip6_forward_proxy_check(struct sk_buff *skb)
322{
323 struct ipv6hdr *hdr = ipv6_hdr(skb);
324 u8 nexthdr = hdr->nexthdr;
325 __be16 frag_off;
326 int offset;
327
328 if (ipv6_ext_hdr(nexthdr)) {
329 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
330 if (offset < 0)
331 return 0;
332 } else
333 offset = sizeof(struct ipv6hdr);
334
335 if (nexthdr == IPPROTO_ICMPV6) {
336 struct icmp6hdr *icmp6;
337
338 if (!pskb_may_pull(skb, (skb_network_header(skb) +
339 offset + 1 - skb->data)))
340 return 0;
341
342 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
343
344 switch (icmp6->icmp6_type) {
345 case NDISC_ROUTER_SOLICITATION:
346 case NDISC_ROUTER_ADVERTISEMENT:
347 case NDISC_NEIGHBOUR_SOLICITATION:
348 case NDISC_NEIGHBOUR_ADVERTISEMENT:
349 case NDISC_REDIRECT:
 350 /* When a unicast neighbour discovery message is
 351 * destined to the proxied address, pass it to
 352 * the input function.
353 */
354 return 1;
355 default:
356 break;
357 }
358 }
359
360 /*
361 * The proxying router can't forward traffic sent to a link-local
362 * address, so signal the sender and discard the packet. This
363 * behavior is clarified by the MIPv6 specification.
364 */
365 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
366 dst_link_failure(skb);
367 return -1;
368 }
369
370 return 0;
371}
372
373static inline int ip6_forward_finish(struct net *net, struct sock *sk,
374 struct sk_buff *skb)
375{
376 struct dst_entry *dst = skb_dst(skb);
377
378 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
379 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
380
381 skb->tstamp = 0;
382 return dst_output(net, sk, skb);
383}
384
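/* Decide whether a forwarded packet must trigger an ICMPV6_PKT_TOOBIG:
 * true when skb->len exceeds the MTU, unless the conntrack-defrag
 * frag_max_size still fits the MTU, ignore_df is set, or the GSO
 * segments individually fit the MTU.
 */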
385static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
386{
387 if (skb->len <= mtu)
388 return false;
389
390 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
391 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
392 return true;
393
394 if (skb->ignore_df)
395 return false;
396
397 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
398 return false;
399
400 return true;
401}
402
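/* Forward a packet on behalf of another host: enforce the forwarding
 * sysctl, hop limit and xfrm policy, honour NDP proxy and Router Alert
 * handling, emit redirects or PKT_TOOBIG where appropriate, decrement
 * the hop limit and run the NF_INET_FORWARD hook.
 */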
403int ip6_forward(struct sk_buff *skb)
404{
405 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
406 struct dst_entry *dst = skb_dst(skb);
407 struct ipv6hdr *hdr = ipv6_hdr(skb);
408 struct inet6_skb_parm *opt = IP6CB(skb);
409 struct net *net = dev_net(dst->dev);
410 u32 mtu;
411
412 if (net->ipv6.devconf_all->forwarding == 0)
413 goto error;
414
415 if (skb->pkt_type != PACKET_HOST)
416 goto drop;
417
418 if (unlikely(skb->sk))
419 goto drop;
420
421 if (skb_warn_if_lro(skb))
422 goto drop;
423
424 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
425 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
426 goto drop;
427 }
428
429 skb_forward_csum(skb);
430
431 /*
 432 * We DO NOT do any processing on
 433 * RA packets, pushing them to user level AS IS
 434 * without any WARRANTY that the application will be able
 435 * to interpret them. The reason is that we
 436 * cannot make anything clever here.
 437 *
 438 * We are not an end node, so if the packet contains
 439 * AH/ESP, we cannot do anything with it.
 440 * Defragmentation would also be a mistake; RA packets
 441 * cannot be fragmented, because there is no guarantee
 442 * that different fragments will go along one path. --ANK
443 */
444 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
445 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
446 return 0;
447 }
448
449 /*
450 * check and decrement ttl
451 */
452 if (hdr->hop_limit <= 1) {
453 /* Force OUTPUT device used as source address */
454 skb->dev = dst->dev;
455 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
456 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
457
458 kfree_skb(skb);
459 return -ETIMEDOUT;
460 }
461
462 /* XXX: idev->cnf.proxy_ndp? */
463 if (net->ipv6.devconf_all->proxy_ndp &&
464 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
465 int proxied = ip6_forward_proxy_check(skb);
466 if (proxied > 0)
467 return ip6_input(skb);
468 else if (proxied < 0) {
469 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
470 goto drop;
471 }
472 }
473
474 if (!xfrm6_route_forward(skb)) {
475 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
476 goto drop;
477 }
478 dst = skb_dst(skb);
479
480 /* IPv6 specs say nothing about it, but it is clear that we cannot
481 send redirects to source routed frames.
482 We don't send redirects to frames decapsulated from IPsec.
483 */
484 if (IP6CB(skb)->iif == dst->dev->ifindex &&
485 opt->srcrt == 0 && !skb_sec_path(skb)) {
486 struct in6_addr *target = NULL;
487 struct inet_peer *peer;
488 struct rt6_info *rt;
489
490 /*
 491 * incoming and outgoing devices are the same,
 492 * so send a redirect.
493 */
494
495 rt = (struct rt6_info *) dst;
496 if (rt->rt6i_flags & RTF_GATEWAY)
497 target = &rt->rt6i_gateway;
498 else
499 target = &hdr->daddr;
500
501 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
502
503 /* Limit redirects both by destination (here)
504 and by source (inside ndisc_send_redirect)
505 */
506 if (inet_peer_xrlim_allow(peer, 1*HZ))
507 ndisc_send_redirect(skb, target);
508 if (peer)
509 inet_putpeer(peer);
510 } else {
511 int addrtype = ipv6_addr_type(&hdr->saddr);
512
513 /* This check is security critical. */
514 if (addrtype == IPV6_ADDR_ANY ||
515 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
516 goto error;
517 if (addrtype & IPV6_ADDR_LINKLOCAL) {
518 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
519 ICMPV6_NOT_NEIGHBOUR, 0);
520 goto error;
521 }
522 }
523
524 mtu = ip6_dst_mtu_forward(dst);
525 if (mtu < IPV6_MIN_MTU)
526 mtu = IPV6_MIN_MTU;
527
528 if (ip6_pkt_too_big(skb, mtu)) {
529 /* Again, force OUTPUT device used as source address */
530 skb->dev = dst->dev;
531 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
532 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
533 __IP6_INC_STATS(net, ip6_dst_idev(dst),
534 IPSTATS_MIB_FRAGFAILS);
535 kfree_skb(skb);
536 return -EMSGSIZE;
537 }
538
539 if (skb_cow(skb, dst->dev->hard_header_len)) {
540 __IP6_INC_STATS(net, ip6_dst_idev(dst),
541 IPSTATS_MIB_OUTDISCARDS);
542 goto drop;
543 }
544
545 hdr = ipv6_hdr(skb);
546
547 /* Mangling hops number delayed to point after skb COW */
548
549 hdr->hop_limit--;
550
551 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
552 net, NULL, skb, skb->dev, dst->dev,
553 ip6_forward_finish);
554
555error:
556 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
557drop:
558 kfree_skb(skb);
559 return -EINVAL;
560}
561
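/* Copy the per-packet metadata a newly built fragment must inherit from
 * the original skb: packet type, priority, protocol, dst, device, mark,
 * hash, tc index, netfilter state and security mark.
 */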
562static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
563{
564 to->pkt_type = from->pkt_type;
565 to->priority = from->priority;
566 to->protocol = from->protocol;
567 skb_dst_drop(to);
568 skb_dst_set(to, dst_clone(skb_dst(from)));
569 to->dev = from->dev;
570 to->mark = from->mark;
571
572 skb_copy_hash(to, from);
573
574#ifdef CONFIG_NET_SCHED
575 to->tc_index = from->tc_index;
576#endif
577 nf_copy(to, from);
578 skb_copy_secmark(to, from);
579}
580
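/* Vendor (mtk_net) helper: returns 1 when the packet already starts with
 * a Fragment header whose next header is ESP, so ip6_fragment() can pass
 * it through unchanged instead of fragmenting it a second time.
 */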
581static int ignore_double_fragment(struct sk_buff *skb)
582{
583 struct frag_hdr *fh;
584 u8 prevhdr = ipv6_hdr(skb)->nexthdr;
585
586 if (prevhdr != NEXTHDR_FRAGMENT)
587 return 0;
588 fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr));
589 if (fh->nexthdr == NEXTHDR_ESP)
590 return 1;
591 return 0;
592}
593
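/* Split an oversized packet into fragments and feed each one to @output.
 * A fast path reuses the existing frag_list members as fragments when
 * their geometry already matches; otherwise the slow path copies the
 * payload into freshly allocated skbs, up to mtu bytes at a time
 * (rounded down to a multiple of 8 for all but the last fragment).
 */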
594int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
595 int (*output)(struct net *, struct sock *, struct sk_buff *))
596{
597 struct sk_buff *frag;
598 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
599 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
600 inet6_sk(skb->sk) : NULL;
601 struct ipv6hdr *tmp_hdr;
602 struct frag_hdr *fh;
603 unsigned int mtu, hlen, left, len, nexthdr_offset;
604 int hroom, troom;
605 __be32 frag_id;
606 int ptr, offset = 0, err = 0;
607 u8 *prevhdr, nexthdr = 0;
608
609 err = ip6_find_1stfragopt(skb, &prevhdr);
610 if (err < 0)
611 goto fail;
612 hlen = err;
613 nexthdr = *prevhdr;
614 nexthdr_offset = prevhdr - skb_network_header(skb);
615
616 mtu = ip6_skb_dst_mtu(skb);
617 if (ignore_double_fragment(skb) && skb->len > mtu) {
618 pr_info_ratelimited("[mtk_net] %s ignore to avoid double fragment\n",
619 __func__);
620 err = output(net, sk, skb);
621 return err;
622 }
623
624 /* We must not fragment if the socket is set to force MTU discovery
 625 * or if the skb is not generated by a local socket.
626 */
627 if (unlikely(!skb->ignore_df && skb->len > mtu)) {
628 if (ipv6_hdr(skb)->nexthdr != NEXTHDR_ESP)
629 goto fail_toobig;
630 pr_info_ratelimited("[mtk_net] fix tcp packet_too_big\n");
631 }
632
633 if (IP6CB(skb)->frag_max_size) {
634 if (IP6CB(skb)->frag_max_size > mtu)
635 goto fail_toobig;
636
637 /* don't send fragments larger than what we received */
638 mtu = IP6CB(skb)->frag_max_size;
639 if (mtu < IPV6_MIN_MTU)
640 mtu = IPV6_MIN_MTU;
641 }
642
643 if (np && np->frag_size < mtu) {
644 if (np->frag_size)
645 mtu = np->frag_size;
646 }
647 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
648 goto fail_toobig;
649 mtu -= hlen + sizeof(struct frag_hdr);
650
651 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
652 &ipv6_hdr(skb)->saddr);
653
654 if (skb->ip_summed == CHECKSUM_PARTIAL &&
655 (err = skb_checksum_help(skb)))
656 goto fail;
657
658 prevhdr = skb_network_header(skb) + nexthdr_offset;
659 hroom = LL_RESERVED_SPACE(rt->dst.dev);
660 if (skb_has_frag_list(skb)) {
661 unsigned int first_len = skb_pagelen(skb);
662 struct sk_buff *frag2;
663
664 if (first_len - hlen > mtu ||
665 ((first_len - hlen) & 7) ||
666 skb_cloned(skb) ||
667 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
668 goto slow_path;
669
670 skb_walk_frags(skb, frag) {
671 /* Correct geometry. */
672 if (frag->len > mtu ||
673 ((frag->len & 7) && frag->next) ||
674 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
675 goto slow_path_clean;
676
677 /* Partially cloned skb? */
678 if (skb_shared(frag))
679 goto slow_path_clean;
680
681 BUG_ON(frag->sk);
682 if (skb->sk) {
683 frag->sk = skb->sk;
684 frag->destructor = sock_wfree;
685 }
686 skb->truesize -= frag->truesize;
687 }
688
689 err = 0;
690 offset = 0;
691 /* BUILD HEADER */
692
693 *prevhdr = NEXTHDR_FRAGMENT;
694 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
695 if (!tmp_hdr) {
696 err = -ENOMEM;
697 goto fail;
698 }
699 frag = skb_shinfo(skb)->frag_list;
700 skb_frag_list_init(skb);
701
702 __skb_pull(skb, hlen);
703 fh = __skb_push(skb, sizeof(struct frag_hdr));
704 __skb_push(skb, hlen);
705 skb_reset_network_header(skb);
706 memcpy(skb_network_header(skb), tmp_hdr, hlen);
707
708 fh->nexthdr = nexthdr;
709 fh->reserved = 0;
710 fh->frag_off = htons(IP6_MF);
711 fh->identification = frag_id;
712
713 first_len = skb_pagelen(skb);
714 skb->data_len = first_len - skb_headlen(skb);
715 skb->len = first_len;
716 ipv6_hdr(skb)->payload_len = htons(first_len -
717 sizeof(struct ipv6hdr));
718
719 for (;;) {
 720 /* Prepare the header of the next frame,
 721 * before the previous one goes down. */
722 if (frag) {
723 frag->ip_summed = CHECKSUM_NONE;
724 skb_reset_transport_header(frag);
725 fh = __skb_push(frag, sizeof(struct frag_hdr));
726 __skb_push(frag, hlen);
727 skb_reset_network_header(frag);
728 memcpy(skb_network_header(frag), tmp_hdr,
729 hlen);
730 offset += skb->len - hlen - sizeof(struct frag_hdr);
731 fh->nexthdr = nexthdr;
732 fh->reserved = 0;
733 fh->frag_off = htons(offset);
734 if (frag->next)
735 fh->frag_off |= htons(IP6_MF);
736 fh->identification = frag_id;
737 ipv6_hdr(frag)->payload_len =
738 htons(frag->len -
739 sizeof(struct ipv6hdr));
740 ip6_copy_metadata(frag, skb);
741 }
742
743 err = output(net, sk, skb);
744 if (!err)
745 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746 IPSTATS_MIB_FRAGCREATES);
747
748 if (err || !frag)
749 break;
750
751 skb = frag;
752 frag = skb->next;
753 skb->next = NULL;
754 }
755
756 kfree(tmp_hdr);
757
758 if (err == 0) {
759 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760 IPSTATS_MIB_FRAGOKS);
761 return 0;
762 }
763
764 kfree_skb_list(frag);
765
766 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767 IPSTATS_MIB_FRAGFAILS);
768 return err;
769
770slow_path_clean:
771 skb_walk_frags(skb, frag2) {
772 if (frag2 == frag)
773 break;
774 frag2->sk = NULL;
775 frag2->destructor = NULL;
776 skb->truesize += frag2->truesize;
777 }
778 }
779
780slow_path:
781 left = skb->len - hlen; /* Space per frame */
782 ptr = hlen; /* Where to start from */
783
784 /*
785 * Fragment the datagram.
786 */
787
788 troom = rt->dst.dev->needed_tailroom;
789
790 /*
791 * Keep copying data until we run out.
792 */
793 while (left > 0) {
794 u8 *fragnexthdr_offset;
795
796 len = left;
797 /* IF: it doesn't fit, use 'mtu' - the data space left */
798 if (len > mtu)
799 len = mtu;
 800 /* IF: we are not sending up to and including the packet end,
 801 then align the next start on an eight-byte boundary */
802 if (len < left) {
803 len &= ~7;
804 }
805
806 /* Allocate buffer */
807 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
808 hroom + troom, GFP_ATOMIC);
809 if (!frag) {
810 err = -ENOMEM;
811 goto fail;
812 }
813
814 /*
815 * Set up data on packet
816 */
817
818 ip6_copy_metadata(frag, skb);
819 skb_reserve(frag, hroom);
820 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
821 skb_reset_network_header(frag);
822 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
823 frag->transport_header = (frag->network_header + hlen +
824 sizeof(struct frag_hdr));
825
826 /*
827 * Charge the memory for the fragment to any owner
828 * it might possess
829 */
830 if (skb->sk)
831 skb_set_owner_w(frag, skb->sk);
832
833 /*
834 * Copy the packet header into the new buffer.
835 */
836 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
837
838 fragnexthdr_offset = skb_network_header(frag);
839 fragnexthdr_offset += prevhdr - skb_network_header(skb);
840 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
841
842 /*
843 * Build fragment header.
844 */
845 fh->nexthdr = nexthdr;
846 fh->reserved = 0;
847 fh->identification = frag_id;
848
849 /*
850 * Copy a block of the IP datagram.
851 */
852 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
853 len));
854 left -= len;
855
856 fh->frag_off = htons(offset);
857 if (left > 0)
858 fh->frag_off |= htons(IP6_MF);
859 ipv6_hdr(frag)->payload_len = htons(frag->len -
860 sizeof(struct ipv6hdr));
861
862 ptr += len;
863 offset += len;
864
865 /*
866 * Put this fragment into the sending queue.
867 */
868 err = output(net, sk, frag);
869 if (err)
870 goto fail;
871
872 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873 IPSTATS_MIB_FRAGCREATES);
874 }
875 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
876 IPSTATS_MIB_FRAGOKS);
877 consume_skb(skb);
878 return err;
879
880fail_toobig:
881 if (skb->sk && dst_allfrag(skb_dst(skb)))
882 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
883
884 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
885 err = -EMSGSIZE;
886
887fail:
888 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
889 IPSTATS_MIB_FRAGFAILS);
890 kfree_skb(skb);
891 return err;
892}
893
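/* Returns non-zero when the cached route can no longer be trusted for
 * @fl_addr: it is neither a /128 host route for that address nor
 * validated by the socket's cached destination address.
 */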
894static inline int ip6_rt_check(const struct rt6key *rt_key,
895 const struct in6_addr *fl_addr,
896 const struct in6_addr *addr_cache)
897{
898 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
899 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
900}
901
902static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
903 struct dst_entry *dst,
904 const struct flowi6 *fl6)
905{
906 struct ipv6_pinfo *np = inet6_sk(sk);
907 struct rt6_info *rt;
908
909 if (!dst)
910 goto out;
911
912 if (dst->ops->family != AF_INET6) {
913 dst_release(dst);
914 return NULL;
915 }
916
917 rt = (struct rt6_info *)dst;
 918 /* Yes, checking route validity in the not-connected
 919 * case is not very simple. Take into account
 920 * that we do not support routing by source, TOS,
 921 * and MSG_DONTROUTE --ANK (980726)
 922 *
 923 * 1. ip6_rt_check(): If the route was a host route,
 924 * check that the cached destination is current.
 925 * If it is a network route, we may still
 926 * check its validity using a saved pointer
 927 * to the last used address: daddr_cache.
 928 * We do not want to save the whole address now
 929 * (because the main consumer of this service
 930 * is TCP, which does not have this problem),
 931 * so the last trick works only on connected
 932 * sockets.
 933 * 2. oif should also be the same.
934 */
935 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
936#ifdef CONFIG_IPV6_SUBTREES
937 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
938#endif
939 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
940 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
941 dst_release(dst);
942 dst = NULL;
943 }
944
945out:
946 return dst;
947}
948
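/* Common tail of the ip6_dst_lookup*() helpers: pick a source address
 * when the flow still has saddr == ::, redo the route lookup with the
 * chosen source, and (with CONFIG_IPV6_OPTIMISTIC_DAD) fall back to the
 * default router's dst when the nexthop neighbour is unresolved and the
 * source address is optimistic.
 */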
949static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
950 struct dst_entry **dst, struct flowi6 *fl6)
951{
952#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
953 struct neighbour *n;
954 struct rt6_info *rt;
955#endif
956 int err;
957 int flags = 0;
958
959 /* The correct way to handle this would be to do
960 * ip6_route_get_saddr, and then ip6_route_output; however,
961 * the route-specific preferred source forces the
962 * ip6_route_output call _before_ ip6_route_get_saddr.
963 *
964 * In source specific routing (no src=any default route),
965 * ip6_route_output will fail given src=any saddr, though, so
966 * that's why we try it again later.
967 */
968 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
969 struct fib6_info *from;
970 struct rt6_info *rt;
971 bool had_dst = *dst != NULL;
972
973 if (!had_dst)
974 *dst = ip6_route_output(net, sk, fl6);
975 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
976
977 rcu_read_lock();
978 from = rt ? rcu_dereference(rt->from) : NULL;
979 err = ip6_route_get_saddr(net, from, &fl6->daddr,
980 sk ? inet6_sk(sk)->srcprefs : 0,
981 &fl6->saddr);
982 rcu_read_unlock();
983
984 if (err)
985 goto out_err_release;
986
987 /* If we had an erroneous initial result, pretend it
988 * never existed and let the SA-enabled version take
989 * over.
990 */
991 if (!had_dst && (*dst)->error) {
992 dst_release(*dst);
993 *dst = NULL;
994 }
995
996 if (fl6->flowi6_oif)
997 flags |= RT6_LOOKUP_F_IFACE;
998 }
999
1000 if (!*dst)
1001 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1002
1003 err = (*dst)->error;
1004 if (err)
1005 goto out_err_release;
1006
1007#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1008 /*
 1009 * If the dst entry we've looked up
 1010 * has a neighbour entry that is in the INCOMPLETE
 1011 * state and the src address from the flow is
 1012 * marked as OPTIMISTIC, we release the found
 1013 * dst entry and replace it with the
 1014 * dst entry of the nexthop router.
1015 */
1016 rt = (struct rt6_info *) *dst;
1017 rcu_read_lock_bh();
1018 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1019 rt6_nexthop(rt, &fl6->daddr));
1020 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1021 rcu_read_unlock_bh();
1022
1023 if (err) {
1024 struct inet6_ifaddr *ifp;
1025 struct flowi6 fl_gw6;
1026 int redirect;
1027
1028 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1029 (*dst)->dev, 1);
1030
1031 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1032 if (ifp)
1033 in6_ifa_put(ifp);
1034
1035 if (redirect) {
1036 /*
1037 * We need to get the dst entry for the
1038 * default router instead
1039 */
1040 dst_release(*dst);
1041 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1042 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1043 *dst = ip6_route_output(net, sk, &fl_gw6);
1044 err = (*dst)->error;
1045 if (err)
1046 goto out_err_release;
1047 }
1048 }
1049#endif
1050 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1051 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1052 err = -EAFNOSUPPORT;
1053 goto out_err_release;
1054 }
1055
1056 return 0;
1057
1058out_err_release:
1059 dst_release(*dst);
1060 *dst = NULL;
1061
1062 if (err == -ENETUNREACH)
1063 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1064 return err;
1065}
1066
1067/**
1068 * ip6_dst_lookup - perform route lookup on flow
1069 * @sk: socket which provides route info
1070 * @dst: pointer to dst_entry * for result
1071 * @fl6: flow to lookup
1072 *
1073 * This function performs a route lookup on the given flow.
1074 *
1075 * It returns zero on success, or a standard errno code on error.
1076 */
1077int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1078 struct flowi6 *fl6)
1079{
1080 *dst = NULL;
1081 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1082}
1083EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1084
1085/**
1086 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1087 * @sk: socket which provides route info
1088 * @fl6: flow to lookup
1089 * @final_dst: final destination address for ipsec lookup
1090 *
1091 * This function performs a route lookup on the given flow.
1092 *
1093 * It returns a valid dst pointer on success, or a pointer encoded
1094 * error code.
1095 */
1096struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1097 const struct in6_addr *final_dst)
1098{
1099 struct dst_entry *dst = NULL;
1100 int err;
1101
1102 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1103 if (err)
1104 return ERR_PTR(err);
1105 if (final_dst)
1106 fl6->daddr = *final_dst;
1107
1108 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1109}
1110EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1111
1112/**
1113 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1114 * @sk: socket which provides the dst cache and route info
1115 * @fl6: flow to lookup
1116 * @final_dst: final destination address for ipsec lookup
1117 * @connected: whether @sk is connected or not
1118 *
1119 * This function performs a route lookup on the given flow with the
1120 * possibility of using the cached route in the socket if it is valid.
1121 * It will take the socket dst lock when operating on the dst cache.
1122 * As a result, this function can only be used in process context.
1123 *
1124 * In addition, for a connected socket, cache the dst in the socket
1125 * if the current cache is not valid.
1126 *
1127 * It returns a valid dst pointer on success, or a pointer encoded
1128 * error code.
1129 */
1130struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1131 const struct in6_addr *final_dst,
1132 bool connected)
1133{
1134 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1135
1136 dst = ip6_sk_dst_check(sk, dst, fl6);
1137 if (dst)
1138 return dst;
1139
1140 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1141 if (connected && !IS_ERR(dst))
1142 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1143
1144 return dst;
1145}
1146EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1147
1148static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1149 gfp_t gfp)
1150{
1151 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1152}
1153
1154static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1155 gfp_t gfp)
1156{
1157 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1158}
1159
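/* Recompute mtu/maxfraglen for appends past the first fragment: outside
 * an xfrm tunnel the dst header_len is only reserved on the first
 * fragment; afterwards that space counts as data space.
 */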
1160static void ip6_append_data_mtu(unsigned int *mtu,
1161 int *maxfraglen,
1162 unsigned int fragheaderlen,
1163 struct sk_buff *skb,
1164 struct rt6_info *rt,
1165 unsigned int orig_mtu)
1166{
1167 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1168 if (!skb) {
1169 /* first fragment, reserve header_len */
1170 *mtu = orig_mtu - rt->dst.header_len;
1171
1172 } else {
1173 /*
 1174 * this fragment is not the first, so the header
 1175 * space is regarded as data space.
1176 */
1177 *mtu = orig_mtu;
1178 }
1179 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1180 + fragheaderlen - sizeof(struct frag_hdr);
1181 }
1182}
1183
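/* Prepare the per-socket cork state for ip6_append_data(): duplicate the
 * supplied tx options, take a reference on the route, and record the hop
 * limit, traffic class, fragment size (MTU or socket frag_size), GSO
 * size and timestamping flags that later appends and __ip6_make_skb()
 * will use.
 */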
1184static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1185 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1186 struct rt6_info *rt, struct flowi6 *fl6)
1187{
1188 struct ipv6_pinfo *np = inet6_sk(sk);
1189 unsigned int mtu;
1190 struct ipv6_txoptions *opt = ipc6->opt;
1191
1192 /*
1193 * setup for corking
1194 */
1195 if (opt) {
1196 if (WARN_ON(v6_cork->opt))
1197 return -EINVAL;
1198
1199 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1200 if (unlikely(!v6_cork->opt))
1201 return -ENOBUFS;
1202
1203 v6_cork->opt->tot_len = sizeof(*opt);
1204 v6_cork->opt->opt_flen = opt->opt_flen;
1205 v6_cork->opt->opt_nflen = opt->opt_nflen;
1206
1207 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1208 sk->sk_allocation);
1209 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1210 return -ENOBUFS;
1211
1212 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1213 sk->sk_allocation);
1214 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1215 return -ENOBUFS;
1216
1217 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1218 sk->sk_allocation);
1219 if (opt->hopopt && !v6_cork->opt->hopopt)
1220 return -ENOBUFS;
1221
1222 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1223 sk->sk_allocation);
1224 if (opt->srcrt && !v6_cork->opt->srcrt)
1225 return -ENOBUFS;
1226
 1227 /* need source address above --miyazawa */
1228 }
1229 dst_hold(&rt->dst);
1230 cork->base.dst = &rt->dst;
1231 cork->fl.u.ip6 = *fl6;
1232 v6_cork->hop_limit = ipc6->hlimit;
1233 v6_cork->tclass = ipc6->tclass;
1234 if (rt->dst.flags & DST_XFRM_TUNNEL)
1235 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1236 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1237 else
1238 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1239 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1240 if (np->frag_size < mtu) {
1241 if (np->frag_size)
1242 mtu = np->frag_size;
1243 }
1244 if (mtu < IPV6_MIN_MTU)
1245 return -EINVAL;
1246 cork->base.fragsize = mtu;
1247 cork->base.gso_size = ipc6->gso_size;
1248 cork->base.tx_flags = 0;
1249 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1250
1251 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1252 cork->base.flags |= IPCORK_ALLFRAG;
1253 cork->base.length = 0;
1254
1255 cork->base.transmit_time = ipc6->sockc.transmit_time;
1256
1257 return 0;
1258}
1259
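/* Core of ip6_append_data()/ip6_make_skb(): append @length bytes fetched
 * through @getfrag to the queued packet, growing the tail skb, adding
 * page frags, or starting a new skb whenever the current one would
 * exceed the (possibly GSO-sized) fragment length.
 */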
1260static int __ip6_append_data(struct sock *sk,
1261 struct flowi6 *fl6,
1262 struct sk_buff_head *queue,
1263 struct inet_cork *cork,
1264 struct inet6_cork *v6_cork,
1265 struct page_frag *pfrag,
1266 int getfrag(void *from, char *to, int offset,
1267 int len, int odd, struct sk_buff *skb),
1268 void *from, int length, int transhdrlen,
1269 unsigned int flags, struct ipcm6_cookie *ipc6)
1270{
1271 struct sk_buff *skb, *skb_prev = NULL;
1272 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1273 int exthdrlen = 0;
1274 int dst_exthdrlen = 0;
1275 int hh_len;
1276 int copy;
1277 int err;
1278 int offset = 0;
1279 u32 tskey = 0;
1280 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1281 struct ipv6_txoptions *opt = v6_cork->opt;
1282 int csummode = CHECKSUM_NONE;
1283 unsigned int maxnonfragsize, headersize;
1284 unsigned int wmem_alloc_delta = 0;
1285 bool paged;
1286
1287 skb = skb_peek_tail(queue);
1288 if (!skb) {
1289 exthdrlen = opt ? opt->opt_flen : 0;
1290 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1291 }
1292
1293 paged = !!cork->gso_size;
1294 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1295 orig_mtu = mtu;
1296
1297 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1298 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1299 tskey = sk->sk_tskey++;
1300
1301 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1302
1303 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1304 (opt ? opt->opt_nflen : 0);
1305 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1306 sizeof(struct frag_hdr);
1307
1308 headersize = sizeof(struct ipv6hdr) +
1309 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1310 (dst_allfrag(&rt->dst) ?
1311 sizeof(struct frag_hdr) : 0) +
1312 rt->rt6i_nfheader_len;
1313
1314 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
 1315 * in the first fragment
1316 */
1317 if (headersize + transhdrlen > mtu)
1318 goto emsgsize;
1319
1320 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1321 (sk->sk_protocol == IPPROTO_UDP ||
1322 sk->sk_protocol == IPPROTO_RAW)) {
1323 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1324 sizeof(struct ipv6hdr));
1325 goto emsgsize;
1326 }
1327
1328 if (ip6_sk_ignore_df(sk))
1329 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1330 else
1331 maxnonfragsize = mtu;
1332
1333 if (cork->length + length > maxnonfragsize - headersize) {
1334emsgsize:
1335 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1336 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1337 return -EMSGSIZE;
1338 }
1339
1340 /* CHECKSUM_PARTIAL only with no extension headers and when
1341 * we are not going to fragment
1342 */
1343 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1344 headersize == sizeof(struct ipv6hdr) &&
1345 length <= mtu - headersize &&
1346 (!(flags & MSG_MORE) || cork->gso_size) &&
1347 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1348 csummode = CHECKSUM_PARTIAL;
1349
1350 /*
1351 * Let's try using as much space as possible.
1352 * Use MTU if total length of the message fits into the MTU.
1353 * Otherwise, we need to reserve fragment header and
 1354 * fragment alignment (= 8-15 octets, in total).
 1355 *
 1356 * Note that we may need to "move" the data from the tail
 1357 * of the buffer to the new fragment when we split
1358 * the message.
1359 *
1360 * FIXME: It may be fragmented into multiple chunks
1361 * at once if non-fragmentable extension headers
1362 * are too large.
1363 * --yoshfuji
1364 */
1365
1366 cork->length += length;
1367 if (!skb)
1368 goto alloc_new_skb;
1369
1370 while (length > 0) {
1371 /* Check if the remaining data fits into current packet. */
1372 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1373 if (copy < length)
1374 copy = maxfraglen - skb->len;
1375
1376 if (copy <= 0) {
1377 char *data;
1378 unsigned int datalen;
1379 unsigned int fraglen;
1380 unsigned int fraggap;
1381 unsigned int alloclen;
1382 unsigned int pagedlen = 0;
1383alloc_new_skb:
1384 /* There's no room in the current skb */
1385 if (skb)
1386 fraggap = skb->len - maxfraglen;
1387 else
1388 fraggap = 0;
1389 /* update mtu and maxfraglen if necessary */
1390 if (!skb || !skb_prev)
1391 ip6_append_data_mtu(&mtu, &maxfraglen,
1392 fragheaderlen, skb, rt,
1393 orig_mtu);
1394
1395 skb_prev = skb;
1396
1397 /*
1398 * If remaining data exceeds the mtu,
1399 * we know we need more fragment(s).
1400 */
1401 datalen = length + fraggap;
1402
1403 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1404 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1405 fraglen = datalen + fragheaderlen;
1406
1407 if ((flags & MSG_MORE) &&
1408 !(rt->dst.dev->features&NETIF_F_SG))
1409 alloclen = mtu;
1410 else if (!paged)
1411 alloclen = fraglen;
1412 else {
1413 alloclen = min_t(int, fraglen, MAX_HEADER);
1414 pagedlen = fraglen - alloclen;
1415 }
1416
1417 alloclen += dst_exthdrlen;
1418
1419 if (datalen != length + fraggap) {
1420 /*
1421 * this is not the last fragment, the trailer
1422 * space is regarded as data space.
1423 */
1424 datalen += rt->dst.trailer_len;
1425 }
1426
1427 alloclen += rt->dst.trailer_len;
1428 fraglen = datalen + fragheaderlen;
1429
1430 /*
1431 * We just reserve space for fragment header.
1432 * Note: this may be overallocation if the message
1433 * (without MSG_MORE) fits into the MTU.
1434 */
1435 alloclen += sizeof(struct frag_hdr);
1436
1437 copy = datalen - transhdrlen - fraggap - pagedlen;
1438 if (copy < 0) {
1439 err = -EINVAL;
1440 goto error;
1441 }
1442 if (transhdrlen) {
1443 skb = sock_alloc_send_skb(sk,
1444 alloclen + hh_len,
1445 (flags & MSG_DONTWAIT), &err);
1446 } else {
1447 skb = NULL;
1448 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1449 2 * sk->sk_sndbuf)
1450 skb = alloc_skb(alloclen + hh_len,
1451 sk->sk_allocation);
1452 if (unlikely(!skb))
1453 err = -ENOBUFS;
1454 }
1455 if (!skb)
1456 goto error;
1457 /*
1458 * Fill in the control structures
1459 */
1460 skb->protocol = htons(ETH_P_IPV6);
1461 skb->ip_summed = csummode;
1462 skb->csum = 0;
1463 /* reserve for fragmentation and ipsec header */
1464 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1465 dst_exthdrlen);
1466
1467 /* Only the initial fragment is time stamped */
1468 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1469 cork->tx_flags = 0;
1470 skb_shinfo(skb)->tskey = tskey;
1471 tskey = 0;
1472
1473 /*
1474 * Find where to start putting bytes
1475 */
1476 data = skb_put(skb, fraglen - pagedlen);
1477 skb_set_network_header(skb, exthdrlen);
1478 data += fragheaderlen;
1479 skb->transport_header = (skb->network_header +
1480 fragheaderlen);
1481 if (fraggap) {
1482 skb->csum = skb_copy_and_csum_bits(
1483 skb_prev, maxfraglen,
1484 data + transhdrlen, fraggap, 0);
1485 skb_prev->csum = csum_sub(skb_prev->csum,
1486 skb->csum);
1487 data += fraggap;
1488 pskb_trim_unique(skb_prev, maxfraglen);
1489 }
1490 if (copy > 0 &&
1491 getfrag(from, data + transhdrlen, offset,
1492 copy, fraggap, skb) < 0) {
1493 err = -EFAULT;
1494 kfree_skb(skb);
1495 goto error;
1496 }
1497
1498 offset += copy;
1499 length -= copy + transhdrlen;
1500 transhdrlen = 0;
1501 exthdrlen = 0;
1502 dst_exthdrlen = 0;
1503
1504 if ((flags & MSG_CONFIRM) && !skb_prev)
1505 skb_set_dst_pending_confirm(skb, 1);
1506
1507 /*
1508 * Put the packet on the pending queue
1509 */
1510 if (!skb->destructor) {
1511 skb->destructor = sock_wfree;
1512 skb->sk = sk;
1513 wmem_alloc_delta += skb->truesize;
1514 }
1515 __skb_queue_tail(queue, skb);
1516 continue;
1517 }
1518
1519 if (copy > length)
1520 copy = length;
1521
1522 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1523 skb_tailroom(skb) >= copy) {
1524 unsigned int off;
1525
1526 off = skb->len;
1527 if (getfrag(from, skb_put(skb, copy),
1528 offset, copy, off, skb) < 0) {
1529 __skb_trim(skb, off);
1530 err = -EFAULT;
1531 goto error;
1532 }
1533 } else {
1534 int i = skb_shinfo(skb)->nr_frags;
1535
1536 err = -ENOMEM;
1537 if (!sk_page_frag_refill(sk, pfrag))
1538 goto error;
1539
1540 if (!skb_can_coalesce(skb, i, pfrag->page,
1541 pfrag->offset)) {
1542 err = -EMSGSIZE;
1543 if (i == MAX_SKB_FRAGS)
1544 goto error;
1545
1546 __skb_fill_page_desc(skb, i, pfrag->page,
1547 pfrag->offset, 0);
1548 skb_shinfo(skb)->nr_frags = ++i;
1549 get_page(pfrag->page);
1550 }
1551 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1552 if (getfrag(from,
1553 page_address(pfrag->page) + pfrag->offset,
1554 offset, copy, skb->len, skb) < 0)
1555 goto error_efault;
1556
1557 pfrag->offset += copy;
1558 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1559 skb->len += copy;
1560 skb->data_len += copy;
1561 skb->truesize += copy;
1562 wmem_alloc_delta += copy;
1563 }
1564 offset += copy;
1565 length -= copy;
1566 }
1567
1568 if (wmem_alloc_delta)
1569 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1570 return 0;
1571
1572error_efault:
1573 err = -EFAULT;
1574error:
1575 cork->length -= length;
1576 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1577 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1578 return err;
1579}
1580
1581int ip6_append_data(struct sock *sk,
1582 int getfrag(void *from, char *to, int offset, int len,
1583 int odd, struct sk_buff *skb),
1584 void *from, int length, int transhdrlen,
1585 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1586 struct rt6_info *rt, unsigned int flags)
1587{
1588 struct inet_sock *inet = inet_sk(sk);
1589 struct ipv6_pinfo *np = inet6_sk(sk);
1590 int exthdrlen;
1591 int err;
1592
1593 if (flags&MSG_PROBE)
1594 return 0;
1595 if (skb_queue_empty(&sk->sk_write_queue)) {
1596 /*
1597 * setup for corking
1598 */
1599 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1600 ipc6, rt, fl6);
1601 if (err)
1602 return err;
1603
1604 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1605 length += exthdrlen;
1606 transhdrlen += exthdrlen;
1607 } else {
1608 fl6 = &inet->cork.fl.u.ip6;
1609 transhdrlen = 0;
1610 }
1611
1612 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1613 &np->cork, sk_page_frag(sk), getfrag,
1614 from, length, transhdrlen, flags, ipc6);
1615}
1616EXPORT_SYMBOL_GPL(ip6_append_data);
1617
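/* Release everything ip6_setup_cork() pinned: the duplicated tx options,
 * the route reference and the cached flow/ALLFRAG state.
 */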
1618static void ip6_cork_release(struct inet_cork_full *cork,
1619 struct inet6_cork *v6_cork)
1620{
1621 if (v6_cork->opt) {
1622 kfree(v6_cork->opt->dst0opt);
1623 kfree(v6_cork->opt->dst1opt);
1624 kfree(v6_cork->opt->hopopt);
1625 kfree(v6_cork->opt->srcrt);
1626 kfree(v6_cork->opt);
1627 v6_cork->opt = NULL;
1628 }
1629
1630 if (cork->base.dst) {
1631 dst_release(cork->base.dst);
1632 cork->base.dst = NULL;
1633 cork->base.flags &= ~IPCORK_ALLFRAG;
1634 }
1635 memset(&cork->fl, 0, sizeof(cork->fl));
1636}
1637
1638struct sk_buff *__ip6_make_skb(struct sock *sk,
1639 struct sk_buff_head *queue,
1640 struct inet_cork_full *cork,
1641 struct inet6_cork *v6_cork)
1642{
1643 struct sk_buff *skb, *tmp_skb;
1644 struct sk_buff **tail_skb;
1645 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1646 struct ipv6_pinfo *np = inet6_sk(sk);
1647 struct net *net = sock_net(sk);
1648 struct ipv6hdr *hdr;
1649 struct ipv6_txoptions *opt = v6_cork->opt;
1650 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1651 struct flowi6 *fl6 = &cork->fl.u.ip6;
1652 unsigned char proto = fl6->flowi6_proto;
1653
1654 skb = __skb_dequeue(queue);
1655 if (!skb)
1656 goto out;
1657 tail_skb = &(skb_shinfo(skb)->frag_list);
1658
1659 /* move skb->data to ip header from ext header */
1660 if (skb->data < skb_network_header(skb))
1661 __skb_pull(skb, skb_network_offset(skb));
1662 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1663 __skb_pull(tmp_skb, skb_network_header_len(skb));
1664 *tail_skb = tmp_skb;
1665 tail_skb = &(tmp_skb->next);
1666 skb->len += tmp_skb->len;
1667 skb->data_len += tmp_skb->len;
1668 skb->truesize += tmp_skb->truesize;
1669 tmp_skb->destructor = NULL;
1670 tmp_skb->sk = NULL;
1671 }
1672
1673 /* Allow local fragmentation. */
1674 skb->ignore_df = ip6_sk_ignore_df(sk);
1675
1676 *final_dst = fl6->daddr;
1677 __skb_pull(skb, skb_network_header_len(skb));
1678 if (opt && opt->opt_flen)
1679 ipv6_push_frag_opts(skb, opt, &proto);
1680 if (opt && opt->opt_nflen)
1681 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1682
1683 skb_push(skb, sizeof(struct ipv6hdr));
1684 skb_reset_network_header(skb);
1685 hdr = ipv6_hdr(skb);
1686
1687 ip6_flow_hdr(hdr, v6_cork->tclass,
1688 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1689 ip6_autoflowlabel(net, np), fl6));
1690 hdr->hop_limit = v6_cork->hop_limit;
1691 hdr->nexthdr = proto;
1692 hdr->saddr = fl6->saddr;
1693 hdr->daddr = *final_dst;
1694
1695 skb->priority = sk->sk_priority;
1696 skb->mark = sk->sk_mark;
1697
1698 skb->tstamp = cork->base.transmit_time;
1699
1700 skb_dst_set(skb, dst_clone(&rt->dst));
1701 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1702 if (proto == IPPROTO_ICMPV6) {
1703 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1704
1705 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1706 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1707 }
1708
1709 ip6_cork_release(cork, v6_cork);
1710out:
1711 return skb;
1712}
1713
1714int ip6_send_skb(struct sk_buff *skb)
1715{
1716 struct net *net = sock_net(skb->sk);
1717 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1718 int err;
1719
1720 err = ip6_local_out(net, skb->sk, skb);
1721 if (err) {
1722 if (err > 0)
1723 err = net_xmit_errno(err);
1724 if (err)
1725 IP6_INC_STATS(net, rt->rt6i_idev,
1726 IPSTATS_MIB_OUTDISCARDS);
1727 }
1728
1729 return err;
1730}
1731
1732int ip6_push_pending_frames(struct sock *sk)
1733{
1734 struct sk_buff *skb;
1735
1736 skb = ip6_finish_skb(sk);
1737 if (!skb)
1738 return 0;
1739
1740 return ip6_send_skb(skb);
1741}
1742EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1743
1744static void __ip6_flush_pending_frames(struct sock *sk,
1745 struct sk_buff_head *queue,
1746 struct inet_cork_full *cork,
1747 struct inet6_cork *v6_cork)
1748{
1749 struct sk_buff *skb;
1750
1751 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1752 if (skb_dst(skb))
1753 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1754 IPSTATS_MIB_OUTDISCARDS);
1755 kfree_skb(skb);
1756 }
1757
1758 ip6_cork_release(cork, v6_cork);
1759}
1760
1761void ip6_flush_pending_frames(struct sock *sk)
1762{
1763 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1764 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1765}
1766EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1767
1768struct sk_buff *ip6_make_skb(struct sock *sk,
1769 int getfrag(void *from, char *to, int offset,
1770 int len, int odd, struct sk_buff *skb),
1771 void *from, int length, int transhdrlen,
1772 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1773 struct rt6_info *rt, unsigned int flags,
1774 struct inet_cork_full *cork)
1775{
1776 struct inet6_cork v6_cork;
1777 struct sk_buff_head queue;
1778 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1779 int err;
1780
1781 if (flags & MSG_PROBE)
1782 return NULL;
1783
1784 __skb_queue_head_init(&queue);
1785
1786 cork->base.flags = 0;
1787 cork->base.addr = 0;
1788 cork->base.opt = NULL;
1789 cork->base.dst = NULL;
1790 v6_cork.opt = NULL;
1791 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1792 if (err) {
1793 ip6_cork_release(cork, &v6_cork);
1794 return ERR_PTR(err);
1795 }
1796 if (ipc6->dontfrag < 0)
1797 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1798
1799 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1800 &current->task_frag, getfrag, from,
1801 length + exthdrlen, transhdrlen + exthdrlen,
1802 flags, ipc6);
1803 if (err) {
1804 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1805 return ERR_PTR(err);
1806 }
1807
1808 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1809}