/*
2 * Copyright (c) 2013 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/capability.h>
22#include <linux/module.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/init.h>
34#include <linux/in6.h>
35#include <linux/inetdevice.h>
36#include <linux/igmp.h>
37#include <linux/netfilter_ipv4.h>
38#include <linux/etherdevice.h>
39#include <linux/if_ether.h>
40#include <linux/if_vlan.h>
41#include <linux/rculist.h>
42#include <linux/err.h>
43
44#include <net/sock.h>
45#include <net/ip.h>
46#include <net/icmp.h>
47#include <net/protocol.h>
48#include <net/ip_tunnels.h>
49#include <net/arp.h>
50#include <net/checksum.h>
51#include <net/dsfield.h>
52#include <net/inet_ecn.h>
53#include <net/xfrm.h>
54#include <net/net_namespace.h>
55#include <net/netns/generic.h>
56#include <net/rtnetlink.h>
57#include <net/udp.h>
58#include <net/dst_metadata.h>
59
60#if IS_ENABLED(CONFIG_IPV6)
61#include <net/ipv6.h>
62#include <net/ip6_fib.h>
63#include <net/ip6_route.h>
64#endif
65
66static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67{
68 return hash_32((__force u32)key ^ (__force u32)remote,
69 IP_TNL_HASH_BITS);
70}
71
72static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 __be16 flags, __be32 key)
74{
75 if (p->i_flags & TUNNEL_KEY) {
76 if (flags & TUNNEL_KEY)
77 return key == p->i_key;
78 else
79 /* key expected, none present */
80 return false;
81 } else
82 return !(flags & TUNNEL_KEY);
83}
84
/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched against configured keyless tunnels,
 * will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for input.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (local, remote) endpoint match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		/* Prefer a tunnel bound to the ingress link; otherwise keep
		 * the first hit as a candidate and continue looking.
		 */
		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches and the tunnel's source is wildcard (0). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3-4 use the bucket for a wildcard (0) remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: our local address is the tunnel's source, with wildcard
	 * remote — or the packet's local address is a multicast group the
	 * tunnel points at.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: fully wildcard tunnels (no saddr, no daddr); only the
	 * key still has to match unless the caller asked for no-key lookup.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* Metadata-mode tunnel collects anything that did not match. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* Last resort: the per-netns fallback device, if up. */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
185EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
186
187static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
188 struct ip_tunnel_parm *parms)
189{
190 unsigned int h;
191 __be32 remote;
192 __be32 i_key = parms->i_key;
193
194 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
195 remote = parms->iph.daddr;
196 else
197 remote = 0;
198
199 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
200 i_key = 0;
201
202 h = ip_tunnel_hash(i_key, remote);
203 return &itn->tunnels[h];
204}
205
/* Insert tunnel @t into its hash bucket (RCU-safe, so lookups may run
 * concurrently); a collect_md tunnel is additionally published as the
 * netns-wide metadata tunnel.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
214
/* Remove tunnel @t from the hash table (RCU-safe) and retire it as the
 * netns metadata tunnel if it was one.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
221
/* Look up a configured tunnel whose parameters exactly match @parms
 * (both addresses, bound link, key semantics) and whose device type is
 * @type.  Returns the tunnel, or NULL if none matches.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}
244
/* Allocate and register a new tunnel net_device in @net using @ops.
 * If @parms->name is empty, the name is generated from ops->kind plus
 * a "%d" template.  Returns the device or an ERR_PTR() on failure.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Need room for the kind plus "%d" and the terminator. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
291
292static inline void init_tunnel_flow(struct flowi4 *fl4,
293 int proto,
294 __be32 daddr, __be32 saddr,
295 __be32 key, __u8 tos, int oif,
296 __u32 mark)
297{
298 memset(fl4, 0, sizeof(*fl4));
299 fl4->flowi4_oif = oif;
300 fl4->daddr = daddr;
301 fl4->saddr = saddr;
302 fl4->flowi4_tos = tos;
303 fl4->flowi4_proto = proto;
304 fl4->fl4_gre_key = key;
305 fl4->flowi4_mark = mark;
306}
307
/* Guess the egress device for the tunnel and derive a reasonable MTU
 * and needed_headroom for @dev.  Returns the MTU to use, floored at
 * IPV4_MIN_MTU.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link,
				 tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Parameters may have changed; drop any cached route. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	/* No route found: fall back to the link the tunnel is bound to. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	/* Reserve space for our own headers plus the lower device's. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
356
/* Create and register a tunnel device for an ioctl-style request, bind
 * it to derive its MTU, set the MTU limits and hash it into @itn.
 * Returns the new tunnel or an ERR_PTR().
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	/* Leave room for the link-layer and tunnel headers. */
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;
}
379
/* Common receive path for decapsulated tunnel packets: validate the
 * checksum/sequence expectations against the tunnel config, decapsulate
 * ECN, account stats and hand the skb to GRO.  Consumes @skb (and the
 * tun_dst reference on the drop path); always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet and tunnel must agree on whether a checksum is present. */
	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Enforce in-order delivery when sequence numbers are enabled. */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the ECN combination is invalid: drop. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub state when crossing a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
453EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
454
/* Register @ops as the encapsulation handler in slot @num.  Returns 0
 * on success, -ERANGE for an out-of-range slot, or -1 when the slot is
 * already occupied (cmpxchg succeeds only on a NULL slot).
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
465EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
466
/* Unregister @ops from encapsulation slot @num.  Returns 0 on success,
 * -ERANGE for a bad slot, or -1 if the slot did not hold @ops.  The
 * synchronize_net() ensures no reader still uses the ops afterwards.
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
483EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
484
/* Copy the requested encapsulation parameters into tunnel @t and
 * recompute the cached header length (encap + tunnel).  Returns 0 or
 * the negative error from ip_encap_hlen().
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
506EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
507
/* Check the packet against the tunnel path MTU, propagate the MTU to
 * the inner dst, and send ICMP / ICMPv6 "packet too big" errors when a
 * non-GSO packet exceeds it.  Returns 0 if the packet may proceed,
 * -E2BIG if it was rejected.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF, the usable MTU is the route MTU minus all tunnel
	 * overhead; otherwise take the inner dst's MTU (or the device's).
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on the IPv6 route for host routes
		 * or when the tunnel has a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
556
/* Transmit @skb over a metadata (collect_md) tunnel: all encapsulation
 * parameters come from the skb's tunnel metadata rather than from the
 * device configuration.  Consumes @skb in all cases.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		/* tos 1 selects the inner TOS/dsfield (mirrors the
		 * "inherit" handling in ip_tunnel_xmit below).
		 */
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	/* Extra encapsulation is not supported on md tunnels. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* A route back through ourselves would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		/* ttl 0: inherit from the inner packet, or the route. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
630EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
631
/* Transmit @skb over a configured tunnel using the template IP header
 * @tnl_params and outer protocol @protocol.  Handles NBMA (wildcard
 * destination) tunnels, TOS/TTL inheritance, route caching, PMTU and
 * error-rate limiting.  Consumes @skb in all cases.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8 tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination per packet
		 * from tunnel metadata, the inner route, or an IPv6
		 * v4-compatible neighbour address.
		 */
		struct ip_tunnel_info *tun_info;

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst)
			dst = tun_info->key.u.ipv4.dst;
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible IPv6 addresses carry an
			 * embedded IPv4 destination in the last word.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low bit set means "inherit" TOS from the inner packet;
		 * that also makes the route per-packet (not connected).
		 */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			 tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Only "connected" tunnels may use the per-tunnel route cache. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* A route back through ourselves would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Report pending ICMP-induced link failures back to the sender,
	 * rate-limited by err_count within IPTUNNEL_ERR_TIMEO.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl 0: inherit from the inner packet, or the route. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
811EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
812
/* Apply new parameters @p to existing tunnel @t.  The tunnel is
 * unhashed and re-hashed because the bucket depends on daddr/i_key.
 * Re-binds the device (and optionally its MTU) when the underlying
 * link or fwmark changed, then invalidates cached routes.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose their endpoints as the
		 * device hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
847
/* Handle the classic tunnel ioctls (SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL, SIOCDELTUNNEL) against the per-netns tunnel table.
 * Returns 0 on success or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device look up the tunnel described by
		 * @p; fall back to the device's own parameters otherwise.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A non-zero TTL implies DF so path-MTU discovery works. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Refuse changes that would flip the device
				 * between broadcast and point-to-point.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			/* The fallback device itself cannot be deleted. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
949EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
950
951int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
952{
953 struct ip_tunnel *tunnel = netdev_priv(dev);
954 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
955 int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
956
957 if (new_mtu < ETH_MIN_MTU)
958 return -EINVAL;
959
960 if (new_mtu > max_mtu) {
961 if (strict)
962 return -EINVAL;
963
964 new_mtu = max_mtu;
965 }
966
967 dev->mtu = new_mtu;
968 return 0;
969}
970EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
971
/* ndo_change_mtu helper: strict variant that rejects out-of-range MTUs
 * instead of capping them.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
976EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
977
/* priv_destructor (set in ip_tunnel_init): release the per-device GRO
 * cells, dst cache and percpu stats before the netdev itself is freed.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}
986
/* rtnl dellink: unhash the tunnel and queue @dev for unregistration.
 * The per-netns fallback device is never deleted this way.
 */
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
999EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1000
1001struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1002{
1003 struct ip_tunnel *tunnel = netdev_priv(dev);
1004
1005 return tunnel->net;
1006}
1007EXPORT_SYMBOL(ip_tunnel_get_link_net);
1008
1009int ip_tunnel_get_iflink(const struct net_device *dev)
1010{
1011 struct ip_tunnel *tunnel = netdev_priv(dev);
1012
1013 return tunnel->parms.link;
1014}
1015EXPORT_SYMBOL(ip_tunnel_get_iflink);
1016
/* Per-netns init for a tunnel driver: initialise the hash table and,
 * when @ops is given, create the netns fallback device named @devname.
 * Returns 0 or the error from fallback-device creation.
 */
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		/* Driver requested no fallback device for this netns. */
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
1050EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1051
/* Queue every device of this tunnel type for unregistration: all
 * devices owned by @ops in the fallback device's netns, plus hashed
 * tunnels whose device lives in a different netns.
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1076
1077void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1078{
1079 LIST_HEAD(list);
1080
1081 rtnl_lock();
1082 ip_tunnel_destroy(itn, &list, ops);
1083 unregister_netdevice_many(&list);
1084 rtnl_unlock();
1085}
1086EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1087
/* rtnl newlink: register @dev with parameters @p and insert it into
 * the per-netns hash.  Fails with -EEXIST when an equivalent tunnel
 * (or, for collect_md, any metadata tunnel) already exists.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* Only one metadata-mode tunnel is allowed per netns. */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		/* Honour the user-supplied MTU but keep it within the
		 * limits imposed by the tunnel overhead.
		 */
		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;

		dev->mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
				 (unsigned int)(max - sizeof(struct iphdr)));
	} else {
		dev->mtu = mtu;
	}

	ip_tunnel_add(itn, nt);
out:
	return err;
}
1132EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1133
/* rtnl changelink: validate that @p does not collide with another
 * tunnel and does not flip a non-Ethernet device between broadcast and
 * point-to-point mode, then apply the update.  Returns 0 or -errno.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	/* The fallback device cannot be reconfigured. */
	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	/* Keep the current MTU when userspace supplied one explicitly. */
	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
1170EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1171
/* ndo_init for tunnel devices: allocate percpu stats, the dst cache and
 * GRO cells, and seed the tunnel's IPv4 header template.  Returns 0 or
 * a negative errno (allocations are unwound on failure).
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	/* Fixed parts of the outer IPv4 header template. */
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
1207EXPORT_SYMBOL_GPL(ip_tunnel_init);
1208
/* ndo_uninit: unhash the tunnel, clear the netns fallback pointer when
 * this was the fallback device, and drop cached routes.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
1222EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1223
1224/* Do least required initialization, rest of init is done in tunnel_init call */
1225void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1226{
1227 struct ip_tunnel *tunnel = netdev_priv(dev);
1228 tunnel->ip_tnl_net_id = net_id;
1229}
1230EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1231
1232MODULE_LICENSE("GPL");