1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * TUN - Universal TUN/TAP device driver.
4 * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
5 *
6 * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
7 */
8
9/*
10 * Changes:
11 *
12 * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
13 * Add TUNSETLINK ioctl to set the link encapsulation
14 *
15 * Mark Smith <markzzzsmith@yahoo.com.au>
16 * Use eth_random_addr() for tap MAC address.
17 *
18 * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
19 * Fixes in packet dropping, queue length setting and queue wakeup.
20 * Increased default tx queue length.
21 * Added ethtool API.
22 * Minor cleanups
23 *
24 * Daniel Podlejski <underley@underley.eu.org>
25 * Modifications for 2.3.99-pre5 kernel.
26 */
27
28#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
29
30#define DRV_NAME "tun"
31#define DRV_VERSION "1.6"
32#define DRV_DESCRIPTION "Universal TUN/TAP device driver"
33#define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
34
35#include <linux/module.h>
36#include <linux/errno.h>
37#include <linux/kernel.h>
38#include <linux/sched/signal.h>
39#include <linux/major.h>
40#include <linux/slab.h>
41#include <linux/poll.h>
42#include <linux/fcntl.h>
43#include <linux/init.h>
44#include <linux/skbuff.h>
45#include <linux/netdevice.h>
46#include <linux/etherdevice.h>
47#include <linux/miscdevice.h>
48#include <linux/ethtool.h>
49#include <linux/rtnetlink.h>
50#include <linux/compat.h>
51#include <linux/if.h>
52#include <linux/if_arp.h>
53#include <linux/if_ether.h>
54#include <linux/if_tun.h>
55#include <linux/if_vlan.h>
56#include <linux/crc32.h>
57#include <linux/nsproxy.h>
58#include <linux/virtio_net.h>
59#include <linux/rcupdate.h>
60#include <net/net_namespace.h>
61#include <net/netns/generic.h>
62#include <net/rtnetlink.h>
63#include <net/sock.h>
64#include <net/xdp.h>
65#include <linux/seq_file.h>
66#include <linux/uio.h>
67#include <linux/skb_array.h>
68#include <linux/bpf.h>
69#include <linux/bpf_trace.h>
70#include <linux/mutex.h>
71#include <linux/ieee802154.h>
72#include <linux/if_ltalk.h>
73#include <uapi/linux/if_fddi.h>
74#include <uapi/linux/if_hippi.h>
75#include <uapi/linux/if_fc.h>
76#include <net/ax25.h>
77#include <net/rose.h>
78#include <net/6lowpan.h>
79
80#include <linux/uaccess.h>
81#include <linux/proc_fs.h>
82
83static void tun_default_link_ksettings(struct net_device *dev,
84 struct ethtool_link_ksettings *cmd);
85
86/* Uncomment to enable debugging */
87/* #define TUN_DEBUG 1 */
88
89#ifdef TUN_DEBUG
90static int debug;
91
92#define tun_debug(level, tun, fmt, args...) \
93do { \
94 if (tun->debug) \
95 netdev_printk(level, tun->dev, fmt, ##args); \
96} while (0)
97#define DBG1(level, fmt, args...) \
98do { \
99 if (debug == 2) \
100 printk(level fmt, ##args); \
101} while (0)
102#else
103#define tun_debug(level, tun, fmt, args...) \
104do { \
105 if (0) \
106 netdev_printk(level, tun->dev, fmt, ##args); \
107} while (0)
108#define DBG1(level, fmt, args...) \
109do { \
110 if (0) \
111 printk(level fmt, ##args); \
112} while (0)
113#endif
114
115#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
116
117/* TUN device flags */
118
119/* IFF_ATTACH_QUEUE is never stored in the device flags; it is
120 * overloaded to mean fasync when stored in tun_file->flags.
121 */
122#define TUN_FASYNC IFF_ATTACH_QUEUE
123/* High bits in flags field are unused. */
124#define TUN_VNET_LE 0x80000000
125#define TUN_VNET_BE 0x40000000
126
127#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
128 IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
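/*
 * Illustrative sketch (not part of this driver): how userspace typically
 * requests the TUN_FEATURES flags above. A TUNSETIFF ioctl on /dev/net/tun
 * creates (or attaches to) the interface; the helper name, interface name
 * and trimmed error handling are assumptions of the example.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/if.h>
 *	#include <linux/if_tun.h>
 *
 *	int tun_alloc(const char *name)
 *	{
 *		struct ifreq ifr;
 *		int fd = open("/dev/net/tun", O_RDWR);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&ifr, 0, sizeof(ifr));
 *		ifr.ifr_flags = IFF_TUN | IFF_NO_PI;	// or IFF_TAP, IFF_VNET_HDR, ...
 *		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
 *		if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;	// read()/write() now carry raw packets
 *	}
 */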
129
130#define GOODCOPY_LEN 128
131
132#define FLT_EXACT_COUNT 8
133struct tap_filter {
134 unsigned int count; /* Number of addrs. Zero means disabled */
135 u32 mask[2]; /* Mask of the hashed addrs */
136 unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
137};
138
139/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
140 * to the maximum number of vCPUs in a guest. */
141#define MAX_TAP_QUEUES 256
142#define MAX_TAP_FLOWS 4096
143
144#define TUN_FLOW_EXPIRE (3 * HZ)
145
146struct tun_pcpu_stats {
147 u64 rx_packets;
148 u64 rx_bytes;
149 u64 tx_packets;
150 u64 tx_bytes;
151 struct u64_stats_sync syncp;
152 u32 rx_dropped;
153 u32 tx_dropped;
154 u32 rx_frame_errors;
155};
156
157/* A tun_file connects an open character device to a tuntap netdevice. It
158 * also contains all socket-related structures (except sock_fprog and tap_filter)
159 * so that it can serve as one transmit queue for the tuntap device. The sock_fprog
160 * and tap_filter are kept in tun_struct since they are used to filter on the
161 * netdevice, not on a specific queue (at least I didn't see a requirement for
162 * this).
163 *
164 * RCU usage:
165 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
166 * other can only be read while rcu_read_lock or rtnl_lock is held.
167 */
168struct tun_file {
169 struct sock sk;
170 struct socket socket;
171 struct tun_struct __rcu *tun;
172 struct fasync_struct *fasync;
173 /* only used for fasync */
174 unsigned int flags;
175 union {
176 u16 queue_index;
177 unsigned int ifindex;
178 };
179 struct napi_struct napi;
180 bool napi_enabled;
181 bool napi_frags_enabled;
182 struct mutex napi_mutex; /* Protects access to the above napi */
183 struct list_head next;
184 struct tun_struct *detached;
185 struct ptr_ring tx_ring;
186 struct xdp_rxq_info xdp_rxq;
187};
188
189struct tun_page {
190 struct page *page;
191 int count;
192};
193
194struct tun_flow_entry {
195 struct hlist_node hash_link;
196 struct rcu_head rcu;
197 struct tun_struct *tun;
198
199 u32 rxhash;
200 u32 rps_rxhash;
201 int queue_index;
202 unsigned long updated ____cacheline_aligned_in_smp;
203};
204
205#define TUN_NUM_FLOW_ENTRIES 1024
206#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
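/*
 * Illustrative note: TUN_NUM_FLOW_ENTRIES must remain a power of two so that
 * the mask above is all ones below it; tun_hashfn() relies on
 * "rxhash & TUN_MASK_FLOW_ENTRIES" (i.e. rxhash & 0x3ff) picking one of the
 * 1024 hash buckets.
 */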
207
208struct tun_prog {
209 struct rcu_head rcu;
210 struct bpf_prog *prog;
211};
212
213/* Since the socket was moved to tun_file, to preserve the behavior of a
214 * persistent device, the socket filter, sndbuf and vnet header size are
215 * restored when a file is attached to a persistent device.
216 */
217struct tun_struct {
218 struct tun_file __rcu *tfiles[MAX_TAP_QUEUES];
219 unsigned int numqueues;
220 unsigned int flags;
221 kuid_t owner;
222 kgid_t group;
223
224 struct net_device *dev;
225 netdev_features_t set_features;
226#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
227 NETIF_F_TSO6)
228
229 int align;
230 int vnet_hdr_sz;
231 int sndbuf;
232 struct tap_filter txflt;
233 struct sock_fprog fprog;
234 /* protected by rtnl lock */
235 bool filter_attached;
236#ifdef TUN_DEBUG
237 int debug;
238#endif
239 spinlock_t lock;
240 struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
241 struct timer_list flow_gc_timer;
242 unsigned long ageing_time;
243 unsigned int numdisabled;
244 struct list_head disabled;
245 void *security;
246 u32 flow_count;
247 u32 rx_batched;
248 struct tun_pcpu_stats __percpu *pcpu_stats;
249 struct bpf_prog __rcu *xdp_prog;
250 struct tun_prog __rcu *steering_prog;
251 struct tun_prog __rcu *filter_prog;
252 struct ethtool_link_ksettings link_ksettings;
253 /* init args */
254 struct file *file;
255 struct ifreq *ifr;
256};
257
258struct veth {
259 __be16 h_vlan_proto;
260 __be16 h_vlan_TCI;
261};
262
263bool tun_is_xdp_frame(void *ptr)
264{
265 return (unsigned long)ptr & TUN_XDP_FLAG;
266}
267EXPORT_SYMBOL(tun_is_xdp_frame);
268
269void *tun_xdp_to_ptr(void *ptr)
270{
271 return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
272}
273EXPORT_SYMBOL(tun_xdp_to_ptr);
274
275void *tun_ptr_to_xdp(void *ptr)
276{
277 return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
278}
279EXPORT_SYMBOL(tun_ptr_to_xdp);
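/*
 * Illustrative sketch (not part of this driver): how the helpers above tag
 * tx_ring entries. TUN_XDP_FLAG is the low pointer bit, so the consumer can
 * tell an xdp_frame from an sk_buff when draining the ring:
 *
 *	void *entry = tun_xdp_to_ptr(xdpf);		// frame pointer | TUN_XDP_FLAG
 *	...
 *	if (tun_is_xdp_frame(entry))			// low bit set?
 *		xdpf = tun_ptr_to_xdp(entry);		// clear the bit again
 *	else
 *		skb = (struct sk_buff *)entry;		// untagged sk_buff pointer
 */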
280
281static void tun_flow_init(struct tun_struct *tun);
282static void tun_flow_uninit(struct tun_struct *tun);
283
284static int tun_napi_receive(struct napi_struct *napi, int budget)
285{
286 struct tun_file *tfile = container_of(napi, struct tun_file, napi);
287 struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
288 struct sk_buff_head process_queue;
289 struct sk_buff *skb;
290 int received = 0;
291
292 __skb_queue_head_init(&process_queue);
293
294 spin_lock(&queue->lock);
295 skb_queue_splice_tail_init(queue, &process_queue);
296 spin_unlock(&queue->lock);
297
298 while (received < budget && (skb = __skb_dequeue(&process_queue))) {
299 napi_gro_receive(napi, skb);
300 ++received;
301 }
302
303 if (!skb_queue_empty(&process_queue)) {
304 spin_lock(&queue->lock);
305 skb_queue_splice(&process_queue, queue);
306 spin_unlock(&queue->lock);
307 }
308
309 return received;
310}
311
312static int tun_napi_poll(struct napi_struct *napi, int budget)
313{
314 unsigned int received;
315
316 received = tun_napi_receive(napi, budget);
317
318 if (received < budget)
319 napi_complete_done(napi, received);
320
321 return received;
322}
323
324static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
325 bool napi_en, bool napi_frags)
326{
327 tfile->napi_enabled = napi_en;
328 tfile->napi_frags_enabled = napi_en && napi_frags;
329 if (napi_en) {
330 netif_tx_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
331 NAPI_POLL_WEIGHT);
332 napi_enable(&tfile->napi);
333 }
334}
335
336static void tun_napi_enable(struct tun_file *tfile)
337{
338 if (tfile->napi_enabled)
339 napi_enable(&tfile->napi);
340}
341
342static void tun_napi_disable(struct tun_file *tfile)
343{
344 if (tfile->napi_enabled)
345 napi_disable(&tfile->napi);
346}
347
348static void tun_napi_del(struct tun_file *tfile)
349{
350 if (tfile->napi_enabled)
351 netif_napi_del(&tfile->napi);
352}
353
354static bool tun_napi_frags_enabled(const struct tun_file *tfile)
355{
356 return tfile->napi_frags_enabled;
357}
358
359#ifdef CONFIG_TUN_VNET_CROSS_LE
360static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
361{
362 return tun->flags & TUN_VNET_BE ? false :
363 virtio_legacy_is_little_endian();
364}
365
366static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
367{
368 int be = !!(tun->flags & TUN_VNET_BE);
369
370 if (put_user(be, argp))
371 return -EFAULT;
372
373 return 0;
374}
375
376static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
377{
378 int be;
379
380 if (get_user(be, argp))
381 return -EFAULT;
382
383 if (be)
384 tun->flags |= TUN_VNET_BE;
385 else
386 tun->flags &= ~TUN_VNET_BE;
387
388 return 0;
389}
390#else
391static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
392{
393 return virtio_legacy_is_little_endian();
394}
395
396static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
397{
398 return -EINVAL;
399}
400
401static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
402{
403 return -EINVAL;
404}
405#endif /* CONFIG_TUN_VNET_CROSS_LE */
406
407static inline bool tun_is_little_endian(struct tun_struct *tun)
408{
409 return tun->flags & TUN_VNET_LE ||
410 tun_legacy_is_little_endian(tun);
411}
412
413static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
414{
415 return __virtio16_to_cpu(tun_is_little_endian(tun), val);
416}
417
418static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
419{
420 return __cpu_to_virtio16(tun_is_little_endian(tun), val);
421}
422
423static inline u32 tun_hashfn(u32 rxhash)
424{
425 return rxhash & TUN_MASK_FLOW_ENTRIES;
426}
427
428static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
429{
430 struct tun_flow_entry *e;
431
432 hlist_for_each_entry_rcu(e, head, hash_link) {
433 if (e->rxhash == rxhash)
434 return e;
435 }
436 return NULL;
437}
438
439static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
440 struct hlist_head *head,
441 u32 rxhash, u16 queue_index)
442{
443 struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
444
445 if (e) {
446 tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
447 rxhash, queue_index);
448 e->updated = jiffies;
449 e->rxhash = rxhash;
450 e->rps_rxhash = 0;
451 e->queue_index = queue_index;
452 e->tun = tun;
453 hlist_add_head_rcu(&e->hash_link, head);
454 ++tun->flow_count;
455 }
456 return e;
457}
458
459static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
460{
461 tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
462 e->rxhash, e->queue_index);
463 hlist_del_rcu(&e->hash_link);
464 kfree_rcu(e, rcu);
465 --tun->flow_count;
466}
467
468static void tun_flow_flush(struct tun_struct *tun)
469{
470 int i;
471
472 spin_lock_bh(&tun->lock);
473 for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
474 struct tun_flow_entry *e;
475 struct hlist_node *n;
476
477 hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
478 tun_flow_delete(tun, e);
479 }
480 spin_unlock_bh(&tun->lock);
481}
482
483static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
484{
485 int i;
486
487 spin_lock_bh(&tun->lock);
488 for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
489 struct tun_flow_entry *e;
490 struct hlist_node *n;
491
492 hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
493 if (e->queue_index == queue_index)
494 tun_flow_delete(tun, e);
495 }
496 }
497 spin_unlock_bh(&tun->lock);
498}
499
500static void tun_flow_cleanup(struct timer_list *t)
501{
502 struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
503 unsigned long delay = tun->ageing_time;
504 unsigned long next_timer = jiffies + delay;
505 unsigned long count = 0;
506 int i;
507
508 tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
509
510 spin_lock(&tun->lock);
511 for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
512 struct tun_flow_entry *e;
513 struct hlist_node *n;
514
515 hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
516 unsigned long this_timer;
517
518 this_timer = e->updated + delay;
519 if (time_before_eq(this_timer, jiffies)) {
520 tun_flow_delete(tun, e);
521 continue;
522 }
523 count++;
524 if (time_before(this_timer, next_timer))
525 next_timer = this_timer;
526 }
527 }
528
529 if (count)
530 mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
531 spin_unlock(&tun->lock);
532}
533
534static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
535 struct tun_file *tfile)
536{
537 struct hlist_head *head;
538 struct tun_flow_entry *e;
539 unsigned long delay = tun->ageing_time;
540 u16 queue_index = tfile->queue_index;
541
542 head = &tun->flows[tun_hashfn(rxhash)];
543
544 rcu_read_lock();
545
546 e = tun_flow_find(head, rxhash);
547 if (likely(e)) {
548 /* TODO: keep queueing to old queue until it's empty? */
549 if (READ_ONCE(e->queue_index) != queue_index)
550 WRITE_ONCE(e->queue_index, queue_index);
551 if (e->updated != jiffies)
552 e->updated = jiffies;
553 sock_rps_record_flow_hash(e->rps_rxhash);
554 } else {
555 spin_lock_bh(&tun->lock);
556 if (!tun_flow_find(head, rxhash) &&
557 tun->flow_count < MAX_TAP_FLOWS)
558 tun_flow_create(tun, head, rxhash, queue_index);
559
560 if (!timer_pending(&tun->flow_gc_timer))
561 mod_timer(&tun->flow_gc_timer,
562 round_jiffies_up(jiffies + delay));
563 spin_unlock_bh(&tun->lock);
564 }
565
566 rcu_read_unlock();
567}
568
569/**
570 * Save the hash received in the stack receive path and update the
571 * flow_hash table accordingly.
572 */
573static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
574{
575 if (unlikely(e->rps_rxhash != hash))
576 e->rps_rxhash = hash;
577}
578
579/* We try to identify a flow through its rxhash. The reason we do not
580 * check the rxq number is that some NICs (e.g. the 82599) choose the
581 * rxq based on the txq on which the last packet of the flow was sent.
582 * As the userspace application moves between processors, we may get a
583 * different rxq number here.
584 */
585static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
586{
587 struct tun_flow_entry *e;
588 u32 txq = 0;
589 u32 numqueues = 0;
590
591 numqueues = READ_ONCE(tun->numqueues);
592
593 txq = __skb_get_hash_symmetric(skb);
594 e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
595 if (e) {
596 tun_flow_save_rps_rxhash(e, txq);
597 txq = e->queue_index;
598 } else {
599 /* use multiply and shift instead of expensive divide */
600 txq = ((u64)txq * numqueues) >> 32;
601 }
602
603 return txq;
604}
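/*
 * Illustrative note on the multiply-and-shift above: ((u64)hash * n) >> 32
 * maps a 32-bit hash uniformly onto [0, n) without a division, since it
 * computes floor(hash / 2^32 * n). For example, hash 0x80000000 with
 * numqueues == 8 selects queue 4.
 */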
605
606static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
607{
608 struct tun_prog *prog;
609 u32 numqueues;
610 u16 ret = 0;
611
612 numqueues = READ_ONCE(tun->numqueues);
613 if (!numqueues)
614 return 0;
615
616 prog = rcu_dereference(tun->steering_prog);
617 if (prog)
618 ret = bpf_prog_run_clear_cb(prog->prog, skb);
619
620 return ret % numqueues;
621}
622
623static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
624 struct net_device *sb_dev)
625{
626 struct tun_struct *tun = netdev_priv(dev);
627 u16 ret;
628
629 rcu_read_lock();
630 if (rcu_dereference(tun->steering_prog))
631 ret = tun_ebpf_select_queue(tun, skb);
632 else
633 ret = tun_automq_select_queue(tun, skb);
634 rcu_read_unlock();
635
636 return ret;
637}
638
639static inline bool tun_not_capable(struct tun_struct *tun)
640{
641 const struct cred *cred = current_cred();
642 struct net *net = dev_net(tun->dev);
643
644 return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
645 (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
646 !ns_capable(net->user_ns, CAP_NET_ADMIN);
647}
648
649static void tun_set_real_num_queues(struct tun_struct *tun)
650{
651 netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
652 netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
653}
654
655static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
656{
657 tfile->detached = tun;
658 list_add_tail(&tfile->next, &tun->disabled);
659 ++tun->numdisabled;
660}
661
662static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
663{
664 struct tun_struct *tun = tfile->detached;
665
666 tfile->detached = NULL;
667 list_del_init(&tfile->next);
668 --tun->numdisabled;
669 return tun;
670}
671
672void tun_ptr_free(void *ptr)
673{
674 if (!ptr)
675 return;
676 if (tun_is_xdp_frame(ptr)) {
677 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
678
679 xdp_return_frame(xdpf);
680 } else {
681 __skb_array_destroy_skb(ptr);
682 }
683}
684EXPORT_SYMBOL_GPL(tun_ptr_free);
685
686static void tun_queue_purge(struct tun_file *tfile)
687{
688 void *ptr;
689
690 while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
691 tun_ptr_free(ptr);
692
693 skb_queue_purge(&tfile->sk.sk_write_queue);
694 skb_queue_purge(&tfile->sk.sk_error_queue);
695}
696
697static void __tun_detach(struct tun_file *tfile, bool clean)
698{
699 struct tun_file *ntfile;
700 struct tun_struct *tun;
701
702 tun = rtnl_dereference(tfile->tun);
703
704 if (tun && clean) {
705 if (!tfile->detached)
706 tun_napi_disable(tfile);
707 tun_napi_del(tfile);
708 }
709
710 if (tun && !tfile->detached) {
711 u16 index = tfile->queue_index;
712 BUG_ON(index >= tun->numqueues);
713
714 rcu_assign_pointer(tun->tfiles[index],
715 tun->tfiles[tun->numqueues - 1]);
716 ntfile = rtnl_dereference(tun->tfiles[index]);
717 ntfile->queue_index = index;
718 ntfile->xdp_rxq.queue_index = index;
719 rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
720 NULL);
721
722 --tun->numqueues;
723 if (clean) {
724 RCU_INIT_POINTER(tfile->tun, NULL);
725 sock_put(&tfile->sk);
726 } else {
727 tun_disable_queue(tun, tfile);
728 tun_napi_disable(tfile);
729 }
730
731 synchronize_net();
732 tun_flow_delete_by_queue(tun, tun->numqueues + 1);
733 /* Drop read queue */
734 tun_queue_purge(tfile);
735 tun_set_real_num_queues(tun);
736 } else if (tfile->detached && clean) {
737 tun = tun_enable_queue(tfile);
738 sock_put(&tfile->sk);
739 }
740
741 if (clean) {
742 if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
743 netif_carrier_off(tun->dev);
744
745 if (!(tun->flags & IFF_PERSIST) &&
746 tun->dev->reg_state == NETREG_REGISTERED)
747 unregister_netdevice(tun->dev);
748 }
749 if (tun)
750 xdp_rxq_info_unreg(&tfile->xdp_rxq);
751 ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
752 }
753}
754
755static void tun_detach(struct tun_file *tfile, bool clean)
756{
757 struct tun_struct *tun;
758 struct net_device *dev;
759
760 rtnl_lock();
761 tun = rtnl_dereference(tfile->tun);
762 dev = tun ? tun->dev : NULL;
763 __tun_detach(tfile, clean);
764 if (dev)
765 netdev_state_change(dev);
766 rtnl_unlock();
767
768 if (clean)
769 sock_put(&tfile->sk);
770}
771
772static void tun_detach_all(struct net_device *dev)
773{
774 struct tun_struct *tun = netdev_priv(dev);
775 struct tun_file *tfile, *tmp;
776 int i, n = tun->numqueues;
777
778 for (i = 0; i < n; i++) {
779 tfile = rtnl_dereference(tun->tfiles[i]);
780 BUG_ON(!tfile);
781 tun_napi_disable(tfile);
782 tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
783 tfile->socket.sk->sk_data_ready(tfile->socket.sk);
784 RCU_INIT_POINTER(tfile->tun, NULL);
785 --tun->numqueues;
786 }
787 list_for_each_entry(tfile, &tun->disabled, next) {
788 tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
789 tfile->socket.sk->sk_data_ready(tfile->socket.sk);
790 RCU_INIT_POINTER(tfile->tun, NULL);
791 }
792 BUG_ON(tun->numqueues != 0);
793
794 synchronize_net();
795 for (i = 0; i < n; i++) {
796 tfile = rtnl_dereference(tun->tfiles[i]);
797 tun_napi_del(tfile);
798 /* Drop read queue */
799 tun_queue_purge(tfile);
800 xdp_rxq_info_unreg(&tfile->xdp_rxq);
801 sock_put(&tfile->sk);
802 }
803 list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
804 tun_napi_del(tfile);
805 tun_enable_queue(tfile);
806 tun_queue_purge(tfile);
807 xdp_rxq_info_unreg(&tfile->xdp_rxq);
808 sock_put(&tfile->sk);
809 }
810 BUG_ON(tun->numdisabled != 0);
811
812 if (tun->flags & IFF_PERSIST)
813 module_put(THIS_MODULE);
814}
815
816static int tun_attach(struct tun_struct *tun, struct file *file,
817 bool skip_filter, bool napi, bool napi_frags,
818 bool publish_tun)
819{
820 struct tun_file *tfile = file->private_data;
821 struct net_device *dev = tun->dev;
822 int err;
823
824 err = security_tun_dev_attach(tfile->socket.sk, tun->security);
825 if (err < 0)
826 goto out;
827
828 err = -EINVAL;
829 if (rtnl_dereference(tfile->tun) && !tfile->detached)
830 goto out;
831
832 err = -EBUSY;
833 if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
834 goto out;
835
836 err = -E2BIG;
837 if (!tfile->detached &&
838 tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
839 goto out;
840
841 err = 0;
842
843 /* Re-attach the filter to the persistent device */
844 if (!skip_filter && (tun->filter_attached == true)) {
845 lock_sock(tfile->socket.sk);
846 err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
847 release_sock(tfile->socket.sk);
848 if (err < 0)
849 goto out;
850 }
851
852 if (!tfile->detached &&
853 ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
854 GFP_KERNEL, tun_ptr_free)) {
855 err = -ENOMEM;
856 goto out;
857 }
858
859 tfile->queue_index = tun->numqueues;
860 tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
861
862 if (tfile->detached) {
863 /* Re-attach detached tfile, updating XDP queue_index */
864 WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
865
866 if (tfile->xdp_rxq.queue_index != tfile->queue_index)
867 tfile->xdp_rxq.queue_index = tfile->queue_index;
868 } else {
869 /* Setup XDP RX-queue info, for new tfile getting attached */
870 err = xdp_rxq_info_reg(&tfile->xdp_rxq,
871 tun->dev, tfile->queue_index);
872 if (err < 0)
873 goto out;
874 err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
875 MEM_TYPE_PAGE_SHARED, NULL);
876 if (err < 0) {
877 xdp_rxq_info_unreg(&tfile->xdp_rxq);
878 goto out;
879 }
880 err = 0;
881 }
882
883 if (tfile->detached) {
884 tun_enable_queue(tfile);
885 tun_napi_enable(tfile);
886 } else {
887 sock_hold(&tfile->sk);
888 tun_napi_init(tun, tfile, napi, napi_frags);
889 }
890
891 if (rtnl_dereference(tun->xdp_prog))
892 sock_set_flag(&tfile->sk, SOCK_XDP);
893
894 /* device is allowed to go away first, so no need to hold extra
895 * refcnt.
896 */
897
898 /* Publish tfile->tun and tun->tfiles only after we've fully
899 * initialized tfile; otherwise we risk using half-initialized
900 * object.
901 */
902 if (publish_tun)
903 rcu_assign_pointer(tfile->tun, tun);
904 rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
905 tun->numqueues++;
906 tun_set_real_num_queues(tun);
907out:
908 return err;
909}
910
911static struct tun_struct *tun_get(struct tun_file *tfile)
912{
913 struct tun_struct *tun;
914
915 rcu_read_lock();
916 tun = rcu_dereference(tfile->tun);
917 if (tun)
918 dev_hold(tun->dev);
919 rcu_read_unlock();
920
921 return tun;
922}
923
924static void tun_put(struct tun_struct *tun)
925{
926 dev_put(tun->dev);
927}
928
929/* TAP filtering */
930static void addr_hash_set(u32 *mask, const u8 *addr)
931{
932 int n = ether_crc(ETH_ALEN, addr) >> 26;
933 mask[n >> 5] |= (1 << (n & 31));
934}
935
936static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
937{
938 int n = ether_crc(ETH_ALEN, addr) >> 26;
939 return mask[n >> 5] & (1 << (n & 31));
940}
941
942static int update_filter(struct tap_filter *filter, void __user *arg)
943{
944 struct { u8 u[ETH_ALEN]; } *addr;
945 struct tun_filter uf;
946 int err, alen, n, nexact;
947
948 if (copy_from_user(&uf, arg, sizeof(uf)))
949 return -EFAULT;
950
951 if (!uf.count) {
952 /* Disabled */
953 filter->count = 0;
954 return 0;
955 }
956
957 alen = ETH_ALEN * uf.count;
958 addr = memdup_user(arg + sizeof(uf), alen);
959 if (IS_ERR(addr))
960 return PTR_ERR(addr);
961
962 /* The filter is updated without holding any locks, which is
963 * perfectly safe: we disable it first, and in the worst
964 * case we'll accept a few undesired packets. */
965 filter->count = 0;
966 wmb();
967
968 /* Use first set of addresses as an exact filter */
969 for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
970 memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
971
972 nexact = n;
973
974 /* Remaining multicast addresses are hashed; a
975 * unicast address will leave the filter disabled. */
976 memset(filter->mask, 0, sizeof(filter->mask));
977 for (; n < uf.count; n++) {
978 if (!is_multicast_ether_addr(addr[n].u)) {
979 err = 0; /* no filter */
980 goto free_addr;
981 }
982 addr_hash_set(filter->mask, addr[n].u);
983 }
984
985 /* For ALLMULTI just set the mask to all ones.
986 * This overrides the mask populated above. */
987 if ((uf.flags & TUN_FLT_ALLMULTI))
988 memset(filter->mask, ~0, sizeof(filter->mask));
989
990 /* Now enable the filter */
991 wmb();
992 filter->count = nexact;
993
994 /* Return the number of exact filters */
995 err = nexact;
996free_addr:
997 kfree(addr);
998 return err;
999}
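/*
 * Illustrative sketch (not part of this driver): programming this TAP MAC
 * filter from userspace with TUNSETTXFILTER. The first FLT_EXACT_COUNT
 * addresses become exact-match entries; remaining multicast addresses go
 * into the hash, as update_filter() above implements. The helper name is an
 * assumption of the example, and "fd" is assumed to be a TAP fd already set
 * up with TUNSETIFF.
 *
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/if_tun.h>
 *	#include <linux/if_ether.h>
 *
 *	int set_tap_filter(int fd, const unsigned char (*macs)[ETH_ALEN],
 *			   unsigned int count)
 *	{
 *		size_t len = sizeof(struct tun_filter) + count * ETH_ALEN;
 *		struct tun_filter *flt = calloc(1, len);
 *		int ret;
 *
 *		if (!flt)
 *			return -1;
 *		flt->count = count;		// flt->flags may carry TUN_FLT_ALLMULTI
 *		memcpy(flt->addr, macs, count * ETH_ALEN);
 *		ret = ioctl(fd, TUNSETTXFILTER, flt);
 *		free(flt);
 *		return ret;
 *	}
 */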
1000
1001/* Returns: 0 - drop, !=0 - accept */
1002static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
1003{
1004 /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
1005 * at this point. */
1006 struct ethhdr *eh = (struct ethhdr *) skb->data;
1007 int i;
1008
1009 /* Exact match */
1010 for (i = 0; i < filter->count; i++)
1011 if (ether_addr_equal(eh->h_dest, filter->addr[i]))
1012 return 1;
1013
1014 /* Inexact match (multicast only) */
1015 if (is_multicast_ether_addr(eh->h_dest))
1016 return addr_hash_test(filter->mask, eh->h_dest);
1017
1018 return 0;
1019}
1020
1021/*
1022 * Checks whether the packet is accepted or not.
1023 * Returns: 0 - drop, !=0 - accept
1024 */
1025static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
1026{
1027 if (!filter->count)
1028 return 1;
1029
1030 return run_filter(filter, skb);
1031}
1032
1033/* Network device part of the driver */
1034
1035static const struct ethtool_ops tun_ethtool_ops;
1036
1037static int tun_net_init(struct net_device *dev)
1038{
1039 struct tun_struct *tun = netdev_priv(dev);
1040 struct ifreq *ifr = tun->ifr;
1041 int err;
1042
1043 tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
1044 if (!tun->pcpu_stats)
1045 return -ENOMEM;
1046
1047 spin_lock_init(&tun->lock);
1048
1049 err = security_tun_dev_alloc_security(&tun->security);
1050 if (err < 0) {
1051 free_percpu(tun->pcpu_stats);
1052 return err;
1053 }
1054
1055 tun_flow_init(tun);
1056
1057 dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
1058 TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
1059 NETIF_F_HW_VLAN_STAG_TX;
1060 dev->features = dev->hw_features | NETIF_F_LLTX;
1061 dev->vlan_features = dev->features &
1062 ~(NETIF_F_HW_VLAN_CTAG_TX |
1063 NETIF_F_HW_VLAN_STAG_TX);
1064
1065 tun->flags = (tun->flags & ~TUN_FEATURES) |
1066 (ifr->ifr_flags & TUN_FEATURES);
1067
1068 INIT_LIST_HEAD(&tun->disabled);
1069 err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
1070 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
1071 if (err < 0) {
1072 tun_flow_uninit(tun);
1073 security_tun_dev_free_security(tun->security);
1074 free_percpu(tun->pcpu_stats);
1075 return err;
1076 }
1077 return 0;
1078}
1079
1080/* Net device detach from fd. */
1081static void tun_net_uninit(struct net_device *dev)
1082{
1083 tun_detach_all(dev);
1084}
1085
1086/* Net device open. */
1087static int tun_net_open(struct net_device *dev)
1088{
1089 netif_tx_start_all_queues(dev);
1090
1091 return 0;
1092}
1093
1094/* Net device close. */
1095static int tun_net_close(struct net_device *dev)
1096{
1097 netif_tx_stop_all_queues(dev);
1098 return 0;
1099}
1100
1101/* Net device start xmit */
1102static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
1103{
1104#ifdef CONFIG_RPS
1105 if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
1106 /* Select queue was not called for the skbuff, so we extract the
1107 * RPS hash and save it into the flow_table here.
1108 */
1109 struct tun_flow_entry *e;
1110 __u32 rxhash;
1111
1112 rxhash = __skb_get_hash_symmetric(skb);
1113 e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
1114 if (e)
1115 tun_flow_save_rps_rxhash(e, rxhash);
1116 }
1117#endif
1118}
1119
1120static unsigned int run_ebpf_filter(struct tun_struct *tun,
1121 struct sk_buff *skb,
1122 int len)
1123{
1124 struct tun_prog *prog = rcu_dereference(tun->filter_prog);
1125
1126 if (prog)
1127 len = bpf_prog_run_clear_cb(prog->prog, skb);
1128
1129 return len;
1130}
1131
1132/* Net device start xmit */
1133static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
1134{
1135 struct tun_struct *tun = netdev_priv(dev);
1136 int txq = skb->queue_mapping;
1137 struct netdev_queue *queue;
1138 struct tun_file *tfile;
1139 int len = skb->len;
1140
1141 rcu_read_lock();
1142 tfile = rcu_dereference(tun->tfiles[txq]);
1143
1144 /* Drop packet if interface is not attached */
1145 if (!tfile)
1146 goto drop;
1147
1148 if (!rcu_dereference(tun->steering_prog))
1149 tun_automq_xmit(tun, skb);
1150
1151 tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
1152
1153 BUG_ON(!tfile);
1154
1155 /* Drop if the filter does not like it.
1156 * This is a noop if the filter is disabled.
1157 * The filter can be enabled only for TAP devices. */
1158 if (!check_filter(&tun->txflt, skb))
1159 goto drop;
1160
1161 if (tfile->socket.sk->sk_filter &&
1162 sk_filter(tfile->socket.sk, skb))
1163 goto drop;
1164
1165 len = run_ebpf_filter(tun, skb, len);
1166 if (len == 0 || pskb_trim(skb, len))
1167 goto drop;
1168
1169 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1170 goto drop;
1171
1172 skb_tx_timestamp(skb);
1173
1174 /* Orphan the skb - required as we might hang on to it
1175 * for an indefinite time.
1176 */
1177 skb_orphan(skb);
1178
1179 nf_reset_ct(skb);
1180
1181 if (ptr_ring_produce(&tfile->tx_ring, skb))
1182 goto drop;
1183
1184 /* NETIF_F_LLTX requires to do our own update of trans_start */
1185 queue = netdev_get_tx_queue(dev, txq);
1186 queue->trans_start = jiffies;
1187
1188 /* Notify and wake up reader process */
1189 if (tfile->flags & TUN_FASYNC)
1190 kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1191 tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1192
1193 rcu_read_unlock();
1194 return NETDEV_TX_OK;
1195
1196drop:
1197 this_cpu_inc(tun->pcpu_stats->tx_dropped);
1198 skb_tx_error(skb);
1199 kfree_skb(skb);
1200 rcu_read_unlock();
1201 return NET_XMIT_DROP;
1202}
1203
1204static void tun_net_mclist(struct net_device *dev)
1205{
1206 /*
1207 * This callback is supposed to deal with mc filter in
1208 * _rx_ path and has nothing to do with the _tx_ path.
1209 * In rx path we always accept everything userspace gives us.
1210 */
1211}
1212
1213static netdev_features_t tun_net_fix_features(struct net_device *dev,
1214 netdev_features_t features)
1215{
1216 struct tun_struct *tun = netdev_priv(dev);
1217
1218 return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
1219}
1220
1221static void tun_set_headroom(struct net_device *dev, int new_hr)
1222{
1223 struct tun_struct *tun = netdev_priv(dev);
1224
1225 if (new_hr < NET_SKB_PAD)
1226 new_hr = NET_SKB_PAD;
1227
1228 tun->align = new_hr;
1229}
1230
1231static void
1232tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1233{
1234 u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
1235 struct tun_struct *tun = netdev_priv(dev);
1236 struct tun_pcpu_stats *p;
1237 int i;
1238
1239 for_each_possible_cpu(i) {
1240 u64 rxpackets, rxbytes, txpackets, txbytes;
1241 unsigned int start;
1242
1243 p = per_cpu_ptr(tun->pcpu_stats, i);
1244 do {
1245 start = u64_stats_fetch_begin(&p->syncp);
1246 rxpackets = p->rx_packets;
1247 rxbytes = p->rx_bytes;
1248 txpackets = p->tx_packets;
1249 txbytes = p->tx_bytes;
1250 } while (u64_stats_fetch_retry(&p->syncp, start));
1251
1252 stats->rx_packets += rxpackets;
1253 stats->rx_bytes += rxbytes;
1254 stats->tx_packets += txpackets;
1255 stats->tx_bytes += txbytes;
1256
1257 /* u32 counters */
1258 rx_dropped += p->rx_dropped;
1259 rx_frame_errors += p->rx_frame_errors;
1260 tx_dropped += p->tx_dropped;
1261 }
1262 stats->rx_dropped = rx_dropped;
1263 stats->rx_frame_errors = rx_frame_errors;
1264 stats->tx_dropped = tx_dropped;
1265}
1266
1267static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1268 struct netlink_ext_ack *extack)
1269{
1270 struct tun_struct *tun = netdev_priv(dev);
1271 struct tun_file *tfile;
1272 struct bpf_prog *old_prog;
1273 int i;
1274
1275 old_prog = rtnl_dereference(tun->xdp_prog);
1276 rcu_assign_pointer(tun->xdp_prog, prog);
1277 if (old_prog)
1278 bpf_prog_put(old_prog);
1279
1280 for (i = 0; i < tun->numqueues; i++) {
1281 tfile = rtnl_dereference(tun->tfiles[i]);
1282 if (prog)
1283 sock_set_flag(&tfile->sk, SOCK_XDP);
1284 else
1285 sock_reset_flag(&tfile->sk, SOCK_XDP);
1286 }
1287 list_for_each_entry(tfile, &tun->disabled, next) {
1288 if (prog)
1289 sock_set_flag(&tfile->sk, SOCK_XDP);
1290 else
1291 sock_reset_flag(&tfile->sk, SOCK_XDP);
1292 }
1293
1294 return 0;
1295}
1296
1297static u32 tun_xdp_query(struct net_device *dev)
1298{
1299 struct tun_struct *tun = netdev_priv(dev);
1300 const struct bpf_prog *xdp_prog;
1301
1302 xdp_prog = rtnl_dereference(tun->xdp_prog);
1303 if (xdp_prog)
1304 return xdp_prog->aux->id;
1305
1306 return 0;
1307}
1308
1309static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1310{
1311 switch (xdp->command) {
1312 case XDP_SETUP_PROG:
1313 return tun_xdp_set(dev, xdp->prog, xdp->extack);
1314 case XDP_QUERY_PROG:
1315 xdp->prog_id = tun_xdp_query(dev);
1316 return 0;
1317 default:
1318 return -EINVAL;
1319 }
1320}
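/*
 * Illustrative note (not part of this driver): XDP_SETUP_PROG above is what
 * ultimately runs when userspace attaches a program to the tun/tap ifindex,
 * e.g. with "ip link set dev tun0 xdp obj prog.o" or libbpf's older
 * bpf_set_link_xdp_fd(ifindex, prog_fd, 0) helper (the device and object
 * names here are assumptions of the example).
 */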
1321
1322static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
1323{
1324 if (new_carrier) {
1325 struct tun_struct *tun = netdev_priv(dev);
1326
1327 if (!tun->numqueues)
1328 return -EPERM;
1329
1330 netif_carrier_on(dev);
1331 } else {
1332 netif_carrier_off(dev);
1333 }
1334 return 0;
1335}
1336
1337static const struct net_device_ops tun_netdev_ops = {
1338 .ndo_init = tun_net_init,
1339 .ndo_uninit = tun_net_uninit,
1340 .ndo_open = tun_net_open,
1341 .ndo_stop = tun_net_close,
1342 .ndo_start_xmit = tun_net_xmit,
1343 .ndo_fix_features = tun_net_fix_features,
1344 .ndo_select_queue = tun_select_queue,
1345 .ndo_set_rx_headroom = tun_set_headroom,
1346 .ndo_get_stats64 = tun_net_get_stats64,
1347 .ndo_change_carrier = tun_net_change_carrier,
1348};
1349
1350static void __tun_xdp_flush_tfile(struct tun_file *tfile)
1351{
1352 /* Notify and wake up reader process */
1353 if (tfile->flags & TUN_FASYNC)
1354 kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1355 tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1356}
1357
1358static int tun_xdp_xmit(struct net_device *dev, int n,
1359 struct xdp_frame **frames, u32 flags)
1360{
1361 struct tun_struct *tun = netdev_priv(dev);
1362 struct tun_file *tfile;
1363 u32 numqueues;
1364 int drops = 0;
1365 int cnt = n;
1366 int i;
1367
1368 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1369 return -EINVAL;
1370
1371 rcu_read_lock();
1372
1373resample:
1374 numqueues = READ_ONCE(tun->numqueues);
1375 if (!numqueues) {
1376 rcu_read_unlock();
1377 return -ENXIO; /* Caller will free/return all frames */
1378 }
1379
1380 tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
1381 numqueues]);
1382 if (unlikely(!tfile))
1383 goto resample;
1384
1385 spin_lock(&tfile->tx_ring.producer_lock);
1386 for (i = 0; i < n; i++) {
1387 struct xdp_frame *xdp = frames[i];
1388 /* Encode the XDP flag into the lowest bit so the consumer can
1389 * distinguish an XDP frame from an sk_buff.
1390 */
1391 void *frame = tun_xdp_to_ptr(xdp);
1392
1393 if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
1394 this_cpu_inc(tun->pcpu_stats->tx_dropped);
1395 xdp_return_frame_rx_napi(xdp);
1396 drops++;
1397 }
1398 }
1399 spin_unlock(&tfile->tx_ring.producer_lock);
1400
1401 if (flags & XDP_XMIT_FLUSH)
1402 __tun_xdp_flush_tfile(tfile);
1403
1404 rcu_read_unlock();
1405 return cnt - drops;
1406}
1407
1408static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
1409{
1410 struct xdp_frame *frame = convert_to_xdp_frame(xdp);
1411
1412 if (unlikely(!frame))
1413 return -EOVERFLOW;
1414
1415 return tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
1416}
1417
1418static const struct net_device_ops tap_netdev_ops = {
1419 .ndo_init = tun_net_init,
1420 .ndo_uninit = tun_net_uninit,
1421 .ndo_open = tun_net_open,
1422 .ndo_stop = tun_net_close,
1423 .ndo_start_xmit = tun_net_xmit,
1424 .ndo_fix_features = tun_net_fix_features,
1425 .ndo_set_rx_mode = tun_net_mclist,
1426 .ndo_set_mac_address = eth_mac_addr,
1427 .ndo_validate_addr = eth_validate_addr,
1428 .ndo_select_queue = tun_select_queue,
1429 .ndo_features_check = passthru_features_check,
1430 .ndo_set_rx_headroom = tun_set_headroom,
1431 .ndo_get_stats64 = tun_net_get_stats64,
1432 .ndo_bpf = tun_xdp,
1433 .ndo_xdp_xmit = tun_xdp_xmit,
1434 .ndo_change_carrier = tun_net_change_carrier,
1435};
1436
1437static void tun_flow_init(struct tun_struct *tun)
1438{
1439 int i;
1440
1441 for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
1442 INIT_HLIST_HEAD(&tun->flows[i]);
1443
1444 tun->ageing_time = TUN_FLOW_EXPIRE;
1445 timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
1446 mod_timer(&tun->flow_gc_timer,
1447 round_jiffies_up(jiffies + tun->ageing_time));
1448}
1449
1450static void tun_flow_uninit(struct tun_struct *tun)
1451{
1452 del_timer_sync(&tun->flow_gc_timer);
1453 tun_flow_flush(tun);
1454}
1455
1456#define MIN_MTU 68
1457#define MAX_MTU 65535
1458
1459/* Initialize net device. */
1460static void tun_net_initialize(struct net_device *dev)
1461{
1462 struct tun_struct *tun = netdev_priv(dev);
1463
1464 switch (tun->flags & TUN_TYPE_MASK) {
1465 case IFF_TUN:
1466 dev->netdev_ops = &tun_netdev_ops;
1467
1468 /* Point-to-Point TUN Device */
1469 dev->hard_header_len = 0;
1470 dev->addr_len = 0;
1471 dev->mtu = 1500;
1472
1473 /* Zero header length */
1474 dev->type = ARPHRD_NONE;
1475 dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
1476 break;
1477
1478 case IFF_TAP:
1479 dev->netdev_ops = &tap_netdev_ops;
1480 /* Ethernet TAP Device */
1481 ether_setup(dev);
1482 dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1483 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1484
1485 eth_hw_addr_random(dev);
1486
1487 break;
1488 }
1489
1490 dev->min_mtu = MIN_MTU;
1491 dev->max_mtu = MAX_MTU - dev->hard_header_len;
1492}
1493
1494static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
1495{
1496 struct sock *sk = tfile->socket.sk;
1497
1498 return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
1499}
1500
1501/* Character device part */
1502
1503/* Poll */
1504static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
1505{
1506 struct tun_file *tfile = file->private_data;
1507 struct tun_struct *tun = tun_get(tfile);
1508 struct sock *sk;
1509 __poll_t mask = 0;
1510
1511 if (!tun)
1512 return EPOLLERR;
1513
1514 sk = tfile->socket.sk;
1515
1516 tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
1517
1518 poll_wait(file, sk_sleep(sk), wait);
1519
1520 if (!ptr_ring_empty(&tfile->tx_ring))
1521 mask |= EPOLLIN | EPOLLRDNORM;
1522
1523 /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable so that
1524 * EPOLLOUT is guaranteed to be raised either here or by
1525 * tun_sock_write_space(). Then the process can get a notification
1526 * after it writes to a down device and meets -EIO.
1527 */
1528 if (tun_sock_writeable(tun, tfile) ||
1529 (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
1530 tun_sock_writeable(tun, tfile)))
1531 mask |= EPOLLOUT | EPOLLWRNORM;
1532
1533 if (tun->dev->reg_state != NETREG_REGISTERED)
1534 mask = EPOLLERR;
1535
1536 tun_put(tun);
1537 return mask;
1538}
1539
1540static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1541 size_t len,
1542 const struct iov_iter *it)
1543{
1544 struct sk_buff *skb;
1545 size_t linear;
1546 int err;
1547 int i;
1548
1549 if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
1550 len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
1551 return ERR_PTR(-EMSGSIZE);
1552
1553 local_bh_disable();
1554 skb = napi_get_frags(&tfile->napi);
1555 local_bh_enable();
1556 if (!skb)
1557 return ERR_PTR(-ENOMEM);
1558
1559 linear = iov_iter_single_seg_count(it);
1560 err = __skb_grow(skb, linear);
1561 if (err)
1562 goto free;
1563
1564 skb->len = len;
1565 skb->data_len = len - linear;
1566 skb->truesize += skb->data_len;
1567
1568 for (i = 1; i < it->nr_segs; i++) {
1569 size_t fragsz = it->iov[i].iov_len;
1570 struct page *page;
1571 void *frag;
1572
1573 if (fragsz == 0 || fragsz > PAGE_SIZE) {
1574 err = -EINVAL;
1575 goto free;
1576 }
1577 frag = netdev_alloc_frag(fragsz);
1578 if (!frag) {
1579 err = -ENOMEM;
1580 goto free;
1581 }
1582 page = virt_to_head_page(frag);
1583 skb_fill_page_desc(skb, i - 1, page,
1584 frag - page_address(page), fragsz);
1585 }
1586
1587 return skb;
1588free:
1589 /* frees skb and all frags allocated with napi_alloc_frag() */
1590 napi_free_frags(&tfile->napi);
1591 return ERR_PTR(err);
1592}
1593
1594/* prepad is the amount to reserve at front. len is length after that.
1595 * linear is a hint as to how much to copy (usually headers). */
1596static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1597 size_t prepad, size_t len,
1598 size_t linear, int noblock)
1599{
1600 struct sock *sk = tfile->socket.sk;
1601 struct sk_buff *skb;
1602 int err;
1603
1604 /* Under a page? Don't bother with paged skb. */
1605 if (prepad + len < PAGE_SIZE || !linear)
1606 linear = len;
1607
1608 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1609 &err, 0);
1610 if (!skb)
1611 return ERR_PTR(err);
1612
1613 skb_reserve(skb, prepad);
1614 skb_put(skb, linear);
1615 skb->data_len = len - linear;
1616 skb->len += len - linear;
1617
1618 return skb;
1619}
1620
1621static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
1622 struct sk_buff *skb, int more)
1623{
1624 struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1625 struct sk_buff_head process_queue;
1626 u32 rx_batched = tun->rx_batched;
1627 bool rcv = false;
1628
1629 if (!rx_batched || (!more && skb_queue_empty(queue))) {
1630 local_bh_disable();
1631 skb_record_rx_queue(skb, tfile->queue_index);
1632 netif_receive_skb(skb);
1633 local_bh_enable();
1634 return;
1635 }
1636
1637 spin_lock(&queue->lock);
1638 if (!more || skb_queue_len(queue) == rx_batched) {
1639 __skb_queue_head_init(&process_queue);
1640 skb_queue_splice_tail_init(queue, &process_queue);
1641 rcv = true;
1642 } else {
1643 __skb_queue_tail(queue, skb);
1644 }
1645 spin_unlock(&queue->lock);
1646
1647 if (rcv) {
1648 struct sk_buff *nskb;
1649
1650 local_bh_disable();
1651 while ((nskb = __skb_dequeue(&process_queue))) {
1652 skb_record_rx_queue(nskb, tfile->queue_index);
1653 netif_receive_skb(nskb);
1654 }
1655 skb_record_rx_queue(skb, tfile->queue_index);
1656 netif_receive_skb(skb);
1657 local_bh_enable();
1658 }
1659}
1660
1661static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
1662 int len, int noblock, bool zerocopy)
1663{
1664 if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
1665 return false;
1666
1667 if (tfile->socket.sk->sk_sndbuf != INT_MAX)
1668 return false;
1669
1670 if (!noblock)
1671 return false;
1672
1673 if (zerocopy)
1674 return false;
1675
1676 if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
1677 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
1678 return false;
1679
1680 return true;
1681}
1682
1683static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
1684 struct page_frag *alloc_frag, char *buf,
1685 int buflen, int len, int pad)
1686{
1687 struct sk_buff *skb = build_skb(buf, buflen);
1688
1689 if (!skb)
1690 return ERR_PTR(-ENOMEM);
1691
1692 skb_reserve(skb, pad);
1693 skb_put(skb, len);
1694 skb_set_owner_w(skb, tfile->socket.sk);
1695
1696 get_page(alloc_frag->page);
1697 alloc_frag->offset += buflen;
1698
1699 return skb;
1700}
1701
1702static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
1703 struct xdp_buff *xdp, u32 act)
1704{
1705 int err;
1706
1707 switch (act) {
1708 case XDP_REDIRECT:
1709 err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
1710 if (err)
1711 return err;
1712 break;
1713 case XDP_TX:
1714 err = tun_xdp_tx(tun->dev, xdp);
1715 if (err < 0)
1716 return err;
1717 break;
1718 case XDP_PASS:
1719 break;
1720 default:
1721 bpf_warn_invalid_xdp_action(act);
1722 /* fall through */
1723 case XDP_ABORTED:
1724 trace_xdp_exception(tun->dev, xdp_prog, act);
1725 /* fall through */
1726 case XDP_DROP:
1727 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1728 break;
1729 }
1730
1731 return act;
1732}
1733
1734static struct sk_buff *tun_build_skb(struct tun_struct *tun,
1735 struct tun_file *tfile,
1736 struct iov_iter *from,
1737 struct virtio_net_hdr *hdr,
1738 int len, int *skb_xdp)
1739{
1740 struct page_frag *alloc_frag = &current->task_frag;
1741 struct bpf_prog *xdp_prog;
1742 int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1743 char *buf;
1744 size_t copied;
1745 int pad = TUN_RX_PAD;
1746 int err = 0;
1747
1748 rcu_read_lock();
1749 xdp_prog = rcu_dereference(tun->xdp_prog);
1750 if (xdp_prog)
1751 pad += XDP_PACKET_HEADROOM;
1752 buflen += SKB_DATA_ALIGN(len + pad);
1753 rcu_read_unlock();
1754
1755 alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
1756 if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
1757 return ERR_PTR(-ENOMEM);
1758
1759 buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1760 copied = copy_page_from_iter(alloc_frag->page,
1761 alloc_frag->offset + pad,
1762 len, from);
1763 if (copied != len)
1764 return ERR_PTR(-EFAULT);
1765
1766 /* There's a small window in which an XDP program may be attached
1767 * after the check of xdp_prog above; this should be rare, and for
1768 * simplicity we run XDP on the skb in case the headroom is not enough.
1769 */
1770 if (hdr->gso_type || !xdp_prog) {
1771 *skb_xdp = 1;
1772 return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
1773 pad);
1774 }
1775
1776 *skb_xdp = 0;
1777
1778 local_bh_disable();
1779 rcu_read_lock();
1780 xdp_prog = rcu_dereference(tun->xdp_prog);
1781 if (xdp_prog) {
1782 struct xdp_buff xdp;
1783 u32 act;
1784
1785 xdp.data_hard_start = buf;
1786 xdp.data = buf + pad;
1787 xdp_set_data_meta_invalid(&xdp);
1788 xdp.data_end = xdp.data + len;
1789 xdp.rxq = &tfile->xdp_rxq;
1790
1791 act = bpf_prog_run_xdp(xdp_prog, &xdp);
1792 if (act == XDP_REDIRECT || act == XDP_TX) {
1793 get_page(alloc_frag->page);
1794 alloc_frag->offset += buflen;
1795 }
1796 err = tun_xdp_act(tun, xdp_prog, &xdp, act);
1797 if (err < 0) {
1798 if (act == XDP_REDIRECT || act == XDP_TX)
1799 put_page(alloc_frag->page);
1800 goto out;
1801 }
1802
1803 if (err == XDP_REDIRECT)
1804 xdp_do_flush_map();
1805 if (err != XDP_PASS)
1806 goto out;
1807
1808 pad = xdp.data - xdp.data_hard_start;
1809 len = xdp.data_end - xdp.data;
1810 }
1811 rcu_read_unlock();
1812 local_bh_enable();
1813
1814 return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
1815
1816out:
1817 rcu_read_unlock();
1818 local_bh_enable();
1819 return NULL;
1820}
1821
1822/* Get packet from user space buffer */
1823static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1824 void *msg_control, struct iov_iter *from,
1825 int noblock, bool more)
1826{
1827 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1828 struct sk_buff *skb;
1829 size_t total_len = iov_iter_count(from);
1830 size_t len = total_len, align = tun->align, linear;
1831 struct virtio_net_hdr gso = { 0 };
1832 struct tun_pcpu_stats *stats;
1833 int good_linear;
1834 int copylen;
1835 bool zerocopy = false;
1836 int err;
1837 u32 rxhash = 0;
1838 int skb_xdp = 1;
1839 bool frags = tun_napi_frags_enabled(tfile);
1840
1841 if (!(tun->flags & IFF_NO_PI)) {
1842 if (len < sizeof(pi))
1843 return -EINVAL;
1844 len -= sizeof(pi);
1845
1846 if (!copy_from_iter_full(&pi, sizeof(pi), from))
1847 return -EFAULT;
1848 }
1849
1850 if (tun->flags & IFF_VNET_HDR) {
1851 int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1852
1853 if (len < vnet_hdr_sz)
1854 return -EINVAL;
1855 len -= vnet_hdr_sz;
1856
1857 if (!copy_from_iter_full(&gso, sizeof(gso), from))
1858 return -EFAULT;
1859
1860 if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1861 tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
1862 gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
1863
1864 if (tun16_to_cpu(tun, gso.hdr_len) > len)
1865 return -EINVAL;
1866 iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
1867 }
1868
1869 if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1870 align += NET_IP_ALIGN;
1871 if (unlikely(len < ETH_HLEN ||
1872 (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1873 return -EINVAL;
1874 }
1875
1876 good_linear = SKB_MAX_HEAD(align);
1877
1878 if (msg_control) {
1879 struct iov_iter i = *from;
1880
1881 /* There are 256 bytes to be copied into the skb, so there is
1882 * enough room to expand the skb head in case it is needed.
1883 * The rest of the buffer is mapped from userspace.
1884 */
1885 copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
1886 if (copylen > good_linear)
1887 copylen = good_linear;
1888 linear = copylen;
1889 iov_iter_advance(&i, copylen);
1890 if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1891 zerocopy = true;
1892 }
1893
1894 if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1895 /* Packets that are not easy to process here (e.g. GSO or
1896 * jumbo packets) are handled by the generic XDP routine
1897 * after the skb has been created.
1898 */
1899 skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
1900 if (IS_ERR(skb)) {
1901 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1902 return PTR_ERR(skb);
1903 }
1904 if (!skb)
1905 return total_len;
1906 } else {
1907 if (!zerocopy) {
1908 copylen = len;
1909 if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1910 linear = good_linear;
1911 else
1912 linear = tun16_to_cpu(tun, gso.hdr_len);
1913 }
1914
1915 if (frags) {
1916 mutex_lock(&tfile->napi_mutex);
1917 skb = tun_napi_alloc_frags(tfile, copylen, from);
1918 /* tun_napi_alloc_frags() enforces a layout for the skb.
1919 * If zerocopy is enabled, then this layout will be
1920 * overwritten by zerocopy_sg_from_iter().
1921 */
1922 zerocopy = false;
1923 } else {
1924 skb = tun_alloc_skb(tfile, align, copylen, linear,
1925 noblock);
1926 }
1927
1928 if (IS_ERR(skb)) {
1929 if (PTR_ERR(skb) != -EAGAIN)
1930 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1931 if (frags)
1932 mutex_unlock(&tfile->napi_mutex);
1933 return PTR_ERR(skb);
1934 }
1935
1936 if (zerocopy)
1937 err = zerocopy_sg_from_iter(skb, from);
1938 else
1939 err = skb_copy_datagram_from_iter(skb, 0, from, len);
1940
1941 if (err) {
1942 err = -EFAULT;
1943drop:
1944 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1945 kfree_skb(skb);
1946 if (frags) {
1947 tfile->napi.skb = NULL;
1948 mutex_unlock(&tfile->napi_mutex);
1949 }
1950
1951 return err;
1952 }
1953 }
1954
1955 if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
1956 this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
1957 kfree_skb(skb);
1958 if (frags) {
1959 tfile->napi.skb = NULL;
1960 mutex_unlock(&tfile->napi_mutex);
1961 }
1962
1963 return -EINVAL;
1964 }
1965
1966 switch (tun->flags & TUN_TYPE_MASK) {
1967 case IFF_TUN:
1968 if (tun->flags & IFF_NO_PI) {
1969 u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;
1970
1971 switch (ip_version) {
1972 case 4:
1973 pi.proto = htons(ETH_P_IP);
1974 break;
1975 case 6:
1976 pi.proto = htons(ETH_P_IPV6);
1977 break;
1978 default:
1979 this_cpu_inc(tun->pcpu_stats->rx_dropped);
1980 kfree_skb(skb);
1981 return -EINVAL;
1982 }
1983 }
1984
1985 skb_reset_mac_header(skb);
1986 skb->protocol = pi.proto;
1987 skb->dev = tun->dev;
1988 break;
1989 case IFF_TAP:
1990 if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
1991 err = -ENOMEM;
1992 goto drop;
1993 }
1994 skb->protocol = eth_type_trans(skb, tun->dev);
1995 break;
1996 }
1997
1998 /* copy skb_ubuf_info for callback when skb has no error */
1999 if (zerocopy) {
2000 skb_shinfo(skb)->destructor_arg = msg_control;
2001 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
2002 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
2003 } else if (msg_control) {
2004 struct ubuf_info *uarg = msg_control;
2005 uarg->callback(uarg, false);
2006 }
2007
2008 skb_reset_network_header(skb);
2009 skb_probe_transport_header(skb);
2010 skb_record_rx_queue(skb, tfile->queue_index);
2011
2012 if (skb_xdp) {
2013 struct bpf_prog *xdp_prog;
2014 int ret;
2015
2016 local_bh_disable();
2017 rcu_read_lock();
2018 xdp_prog = rcu_dereference(tun->xdp_prog);
2019 if (xdp_prog) {
2020 ret = do_xdp_generic(xdp_prog, skb);
2021 if (ret != XDP_PASS) {
2022 rcu_read_unlock();
2023 local_bh_enable();
2024 if (frags) {
2025 tfile->napi.skb = NULL;
2026 mutex_unlock(&tfile->napi_mutex);
2027 }
2028 return total_len;
2029 }
2030 }
2031 rcu_read_unlock();
2032 local_bh_enable();
2033 }
2034
2035 /* Compute the costly rx hash only if needed for flow updates.
2036 * There is a very small possibility of out-of-order delivery while
2037 * switching queues; it is not worth optimizing for.
2038 */
2039 if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
2040 !tfile->detached)
2041 rxhash = __skb_get_hash_symmetric(skb);
2042
2043 rcu_read_lock();
2044 if (unlikely(!(tun->dev->flags & IFF_UP))) {
2045 err = -EIO;
2046 rcu_read_unlock();
2047 goto drop;
2048 }
2049
2050 if (frags) {
2051 u32 headlen;
2052
2053 /* Exercise flow dissector code path. */
2054 skb_push(skb, ETH_HLEN);
2055 headlen = eth_get_headlen(tun->dev, skb->data,
2056 skb_headlen(skb));
2057
2058 if (unlikely(headlen > skb_headlen(skb))) {
2059 WARN_ON_ONCE(1);
2060 err = -ENOMEM;
2061 this_cpu_inc(tun->pcpu_stats->rx_dropped);
2062napi_busy:
2063 napi_free_frags(&tfile->napi);
2064 rcu_read_unlock();
2065 mutex_unlock(&tfile->napi_mutex);
2066 return err;
2067 }
2068
2069 if (likely(napi_schedule_prep(&tfile->napi))) {
2070 local_bh_disable();
2071 napi_gro_frags(&tfile->napi);
2072 napi_complete(&tfile->napi);
2073 local_bh_enable();
2074 } else {
2075 err = -EBUSY;
2076 goto napi_busy;
2077 }
2078 mutex_unlock(&tfile->napi_mutex);
2079 } else if (tfile->napi_enabled) {
2080 struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
2081 int queue_len;
2082
2083 spin_lock_bh(&queue->lock);
2084 __skb_queue_tail(queue, skb);
2085 queue_len = skb_queue_len(queue);
2086 spin_unlock(&queue->lock);
2087
2088 if (!more || queue_len > NAPI_POLL_WEIGHT)
2089 napi_schedule(&tfile->napi);
2090
2091 local_bh_enable();
2092 } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
2093 tun_rx_batched(tun, tfile, skb, more);
2094 } else {
2095 netif_rx_ni(skb);
2096 }
2097 rcu_read_unlock();
2098
2099 stats = get_cpu_ptr(tun->pcpu_stats);
2100 u64_stats_update_begin(&stats->syncp);
2101 stats->rx_packets++;
2102 stats->rx_bytes += len;
2103 u64_stats_update_end(&stats->syncp);
2104 put_cpu_ptr(stats);
2105
2106 if (rxhash)
2107 tun_flow_update(tun, rxhash, tfile);
2108
2109 return total_len;
2110}
2111
2112static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
2113{
2114 struct file *file = iocb->ki_filp;
2115 struct tun_file *tfile = file->private_data;
2116 struct tun_struct *tun = tun_get(tfile);
2117 ssize_t result;
2118 int noblock = 0;
2119
2120 if (!tun)
2121 return -EBADFD;
2122
2123 if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
2124 noblock = 1;
2125
2126 result = tun_get_user(tun, tfile, NULL, from, noblock, false);
2127
2128 tun_put(tun);
2129 return result;
2130}
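/*
 * Illustrative userspace sketch (not part of the driver): a write() on the
 * tun/tap fd reaches tun_get_user() through tun_chr_write_iter().  When
 * IFF_NO_PI is clear, each packet must be preceded by struct tun_pi.  The
 * fd and packet buffer below are assumptions made for the example; error
 * handling is omitted.
 *
 *	struct tun_pi pi = { .flags = 0, .proto = htons(ETH_P_IP) };
 *	struct iovec iov[2] = {
 *		{ .iov_base = &pi,  .iov_len = sizeof(pi) },
 *		{ .iov_base = pkt,  .iov_len = pkt_len },
 *	};
 *	writev(fd, iov, 2);	   one packet per writev() call
 */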
2131
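/*
 * Copy one xdp_frame to user space, prefixed by a zeroed virtio_net_hdr
 * when IFF_VNET_HDR is enabled, and account it in the per-cpu tx stats.
 */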
2132static ssize_t tun_put_user_xdp(struct tun_struct *tun,
2133 struct tun_file *tfile,
2134 struct xdp_frame *xdp_frame,
2135 struct iov_iter *iter)
2136{
2137 int vnet_hdr_sz = 0;
2138 size_t size = xdp_frame->len;
2139 struct tun_pcpu_stats *stats;
2140 size_t ret;
2141
2142 if (tun->flags & IFF_VNET_HDR) {
2143 struct virtio_net_hdr gso = { 0 };
2144
2145 vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2146 if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
2147 return -EINVAL;
2148 if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
2149 sizeof(gso)))
2150 return -EFAULT;
2151 iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2152 }
2153
2154 ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
2155
2156 stats = get_cpu_ptr(tun->pcpu_stats);
2157 u64_stats_update_begin(&stats->syncp);
2158 stats->tx_packets++;
2159 stats->tx_bytes += ret;
2160 u64_stats_update_end(&stats->syncp);
2161 put_cpu_ptr(tun->pcpu_stats);
2162
2163 return ret;
2164}
2165
2166/* Put packet to the user space buffer */
2167static ssize_t tun_put_user(struct tun_struct *tun,
2168 struct tun_file *tfile,
2169 struct sk_buff *skb,
2170 struct iov_iter *iter)
2171{
2172 struct tun_pi pi = { 0, skb->protocol };
2173 struct tun_pcpu_stats *stats;
2174 ssize_t total;
2175 int vlan_offset = 0;
2176 int vlan_hlen = 0;
2177 int vnet_hdr_sz = 0;
2178
2179 if (skb_vlan_tag_present(skb))
2180 vlan_hlen = VLAN_HLEN;
2181
2182 if (tun->flags & IFF_VNET_HDR)
2183 vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2184
2185 total = skb->len + vlan_hlen + vnet_hdr_sz;
2186
2187 if (!(tun->flags & IFF_NO_PI)) {
2188 if (iov_iter_count(iter) < sizeof(pi))
2189 return -EINVAL;
2190
2191 total += sizeof(pi);
2192 if (iov_iter_count(iter) < total) {
2193			/* Packet will be stripped */
2194 pi.flags |= TUN_PKT_STRIP;
2195 }
2196
2197 if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
2198 return -EFAULT;
2199 }
2200
2201 if (vnet_hdr_sz) {
2202 struct virtio_net_hdr gso;
2203
2204 if (iov_iter_count(iter) < vnet_hdr_sz)
2205 return -EINVAL;
2206
2207 if (virtio_net_hdr_from_skb(skb, &gso,
2208 tun_is_little_endian(tun), true,
2209 vlan_hlen)) {
2210 struct skb_shared_info *sinfo = skb_shinfo(skb);
2211
2212 if (net_ratelimit()) {
2213 netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
2214 sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
2215 tun16_to_cpu(tun, gso.hdr_len));
2216 print_hex_dump(KERN_ERR, "tun: ",
2217 DUMP_PREFIX_NONE,
2218 16, 1, skb->head,
2219 min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
2220 }
2221 WARN_ON_ONCE(1);
2222 return -EINVAL;
2223 }
2224
2225 if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
2226 return -EFAULT;
2227
2228 iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2229 }
2230
2231 if (vlan_hlen) {
2232 int ret;
2233 struct veth veth;
2234
2235 veth.h_vlan_proto = skb->vlan_proto;
2236 veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
2237
2238 vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
2239
2240 ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
2241 if (ret || !iov_iter_count(iter))
2242 goto done;
2243
2244 ret = copy_to_iter(&veth, sizeof(veth), iter);
2245 if (ret != sizeof(veth) || !iov_iter_count(iter))
2246 goto done;
2247 }
2248
2249 skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
2250
2251done:
2252	/* caller is in process context */
2253 stats = get_cpu_ptr(tun->pcpu_stats);
2254 u64_stats_update_begin(&stats->syncp);
2255 stats->tx_packets++;
2256 stats->tx_bytes += skb->len + vlan_hlen;
2257 u64_stats_update_end(&stats->syncp);
2258 put_cpu_ptr(tun->pcpu_stats);
2259
2260 return total;
2261}
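/*
 * Illustrative sketch of the layout a reader sees (not part of the driver):
 * tun_put_user() emits, in order, an optional struct tun_pi (unless
 * IFF_NO_PI), an optional virtio_net_hdr padded out to tun->vnet_hdr_sz
 * (if IFF_VNET_HDR), a reinserted VLAN tag for tagged skbs, and then the
 * packet data.  A minimal userspace reader, assuming IFF_NO_PI and no vnet
 * header:
 *
 *	char buf[2048];
 *	ssize_t n = read(fd, buf, sizeof(buf));	   one full packet, or -1
 */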
2262
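/*
 * Pop one pointer (an skb or an xdp_frame) from the per-queue tx_ring,
 * sleeping interruptibly until data arrives unless @noblock is set.
 */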
2263static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
2264{
2265 DECLARE_WAITQUEUE(wait, current);
2266 void *ptr = NULL;
2267 int error = 0;
2268
2269 ptr = ptr_ring_consume(&tfile->tx_ring);
2270 if (ptr)
2271 goto out;
2272 if (noblock) {
2273 error = -EAGAIN;
2274 goto out;
2275 }
2276
2277 add_wait_queue(&tfile->socket.wq.wait, &wait);
2278
2279 while (1) {
2280 set_current_state(TASK_INTERRUPTIBLE);
2281 ptr = ptr_ring_consume(&tfile->tx_ring);
2282 if (ptr)
2283 break;
2284 if (signal_pending(current)) {
2285 error = -ERESTARTSYS;
2286 break;
2287 }
2288 if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
2289 error = -EFAULT;
2290 break;
2291 }
2292
2293 schedule();
2294 }
2295
2296 __set_current_state(TASK_RUNNING);
2297 remove_wait_queue(&tfile->socket.wq.wait, &wait);
2298
2299out:
2300 *err = error;
2301 return ptr;
2302}
2303
2304static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
2305 struct iov_iter *to,
2306 int noblock, void *ptr)
2307{
2308 ssize_t ret;
2309 int err;
2310
2311 tun_debug(KERN_INFO, tun, "tun_do_read\n");
2312
2313 if (!iov_iter_count(to)) {
2314 tun_ptr_free(ptr);
2315 return 0;
2316 }
2317
2318 if (!ptr) {
2319 /* Read frames from ring */
2320 ptr = tun_ring_recv(tfile, noblock, &err);
2321 if (!ptr)
2322 return err;
2323 }
2324
2325 if (tun_is_xdp_frame(ptr)) {
2326 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2327
2328 ret = tun_put_user_xdp(tun, tfile, xdpf, to);
2329 xdp_return_frame(xdpf);
2330 } else {
2331 struct sk_buff *skb = ptr;
2332
2333 ret = tun_put_user(tun, tfile, skb, to);
2334 if (unlikely(ret < 0))
2335 kfree_skb(skb);
2336 else
2337 consume_skb(skb);
2338 }
2339
2340 return ret;
2341}
2342
2343static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
2344{
2345 struct file *file = iocb->ki_filp;
2346 struct tun_file *tfile = file->private_data;
2347 struct tun_struct *tun = tun_get(tfile);
2348 ssize_t len = iov_iter_count(to), ret;
2349 int noblock = 0;
2350
2351 if (!tun)
2352 return -EBADFD;
2353
2354 if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
2355 noblock = 1;
2356
2357 ret = tun_do_read(tun, tfile, to, noblock, NULL);
2358 ret = min_t(ssize_t, ret, len);
2359 if (ret > 0)
2360 iocb->ki_pos = ret;
2361 tun_put(tun);
2362 return ret;
2363}
2364
2365static void tun_prog_free(struct rcu_head *rcu)
2366{
2367 struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
2368
2369 bpf_prog_destroy(prog->prog);
2370 kfree(prog);
2371}
2372
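/*
 * Publish a new steering/filter program under tun->lock and free the old
 * one only after an RCU grace period, so concurrent readers never see a
 * program that is being torn down.
 */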
2373static int __tun_set_ebpf(struct tun_struct *tun,
2374 struct tun_prog __rcu **prog_p,
2375 struct bpf_prog *prog)
2376{
2377 struct tun_prog *old, *new = NULL;
2378
2379 if (prog) {
2380 new = kmalloc(sizeof(*new), GFP_KERNEL);
2381 if (!new)
2382 return -ENOMEM;
2383 new->prog = prog;
2384 }
2385
2386 spin_lock_bh(&tun->lock);
2387 old = rcu_dereference_protected(*prog_p,
2388 lockdep_is_held(&tun->lock));
2389 rcu_assign_pointer(*prog_p, new);
2390 spin_unlock_bh(&tun->lock);
2391
2392 if (old)
2393 call_rcu(&old->rcu, tun_prog_free);
2394
2395 return 0;
2396}
2397
2398static void tun_free_netdev(struct net_device *dev)
2399{
2400 struct tun_struct *tun = netdev_priv(dev);
2401
2402 BUG_ON(!(list_empty(&tun->disabled)));
2403 free_percpu(tun->pcpu_stats);
2404 tun_flow_uninit(tun);
2405 security_tun_dev_free_security(tun->security);
2406 __tun_set_ebpf(tun, &tun->steering_prog, NULL);
2407 __tun_set_ebpf(tun, &tun->filter_prog, NULL);
2408}
2409
2410static void tun_setup(struct net_device *dev)
2411{
2412 struct tun_struct *tun = netdev_priv(dev);
2413
2414 tun->owner = INVALID_UID;
2415 tun->group = INVALID_GID;
2416 tun_default_link_ksettings(dev, &tun->link_ksettings);
2417
2418 dev->ethtool_ops = &tun_ethtool_ops;
2419 dev->needs_free_netdev = true;
2420 dev->priv_destructor = tun_free_netdev;
2421 /* We prefer our own queue length */
2422 dev->tx_queue_len = TUN_READQ_SIZE;
2423}
2424
2425/* Trivial set of netlink ops to allow deleting a tun or tap
2426 * device with netlink.
2427 */
2428static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
2429 struct netlink_ext_ack *extack)
2430{
2431 NL_SET_ERR_MSG(extack,
2432 "tun/tap creation via rtnetlink is not supported.");
2433 return -EOPNOTSUPP;
2434}
2435
2436static size_t tun_get_size(const struct net_device *dev)
2437{
2438 BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
2439 BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
2440
2441 return nla_total_size(sizeof(uid_t)) + /* OWNER */
2442 nla_total_size(sizeof(gid_t)) + /* GROUP */
2443 nla_total_size(sizeof(u8)) + /* TYPE */
2444 nla_total_size(sizeof(u8)) + /* PI */
2445 nla_total_size(sizeof(u8)) + /* VNET_HDR */
2446 nla_total_size(sizeof(u8)) + /* PERSIST */
2447 nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
2448 nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
2449 nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
2450 0;
2451}
2452
2453static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
2454{
2455 struct tun_struct *tun = netdev_priv(dev);
2456
2457 if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
2458 goto nla_put_failure;
2459 if (uid_valid(tun->owner) &&
2460 nla_put_u32(skb, IFLA_TUN_OWNER,
2461 from_kuid_munged(current_user_ns(), tun->owner)))
2462 goto nla_put_failure;
2463 if (gid_valid(tun->group) &&
2464 nla_put_u32(skb, IFLA_TUN_GROUP,
2465 from_kgid_munged(current_user_ns(), tun->group)))
2466 goto nla_put_failure;
2467 if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
2468 goto nla_put_failure;
2469 if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
2470 goto nla_put_failure;
2471 if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
2472 goto nla_put_failure;
2473 if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
2474 !!(tun->flags & IFF_MULTI_QUEUE)))
2475 goto nla_put_failure;
2476 if (tun->flags & IFF_MULTI_QUEUE) {
2477 if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
2478 goto nla_put_failure;
2479 if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
2480 tun->numdisabled))
2481 goto nla_put_failure;
2482 }
2483
2484 return 0;
2485
2486nla_put_failure:
2487 return -EMSGSIZE;
2488}
2489
2490static struct rtnl_link_ops tun_link_ops __read_mostly = {
2491 .kind = DRV_NAME,
2492 .priv_size = sizeof(struct tun_struct),
2493 .setup = tun_setup,
2494 .validate = tun_validate,
2495 .get_size = tun_get_size,
2496 .fill_info = tun_fill_info,
2497};
2498
2499static void tun_sock_write_space(struct sock *sk)
2500{
2501 struct tun_file *tfile;
2502 wait_queue_head_t *wqueue;
2503
2504 if (!sock_writeable(sk))
2505 return;
2506
2507 if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
2508 return;
2509
2510 wqueue = sk_sleep(sk);
2511 if (wqueue && waitqueue_active(wqueue))
2512 wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
2513 EPOLLWRNORM | EPOLLWRBAND);
2514
2515 tfile = container_of(sk, struct tun_file, sk);
2516 kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
2517}
2518
2519static void tun_put_page(struct tun_page *tpage)
2520{
2521 if (tpage->page)
2522 __page_frag_cache_drain(tpage->page, tpage->count);
2523}
2524
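/*
 * Handle one xdp_buff submitted through sendmsg(): run the attached XDP
 * program if there is one and, on XDP_PASS (or when the vnet header carries
 * GSO state), build an skb and hand it to the stack.
 */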
2525static int tun_xdp_one(struct tun_struct *tun,
2526 struct tun_file *tfile,
2527 struct xdp_buff *xdp, int *flush,
2528 struct tun_page *tpage)
2529{
2530 unsigned int datasize = xdp->data_end - xdp->data;
2531 struct tun_xdp_hdr *hdr = xdp->data_hard_start;
2532 struct virtio_net_hdr *gso = &hdr->gso;
2533 struct tun_pcpu_stats *stats;
2534 struct bpf_prog *xdp_prog;
2535 struct sk_buff *skb = NULL;
2536 u32 rxhash = 0, act;
2537 int buflen = hdr->buflen;
2538 int err = 0;
2539 bool skb_xdp = false;
2540 struct page *page;
2541
2542 if (unlikely(datasize < ETH_HLEN))
2543 return -EINVAL;
2544
2545 xdp_prog = rcu_dereference(tun->xdp_prog);
2546 if (xdp_prog) {
2547 if (gso->gso_type) {
2548 skb_xdp = true;
2549 goto build;
2550 }
2551 xdp_set_data_meta_invalid(xdp);
2552 xdp->rxq = &tfile->xdp_rxq;
2553
2554 act = bpf_prog_run_xdp(xdp_prog, xdp);
2555 err = tun_xdp_act(tun, xdp_prog, xdp, act);
2556 if (err < 0) {
2557 put_page(virt_to_head_page(xdp->data));
2558 return err;
2559 }
2560
2561 switch (err) {
2562 case XDP_REDIRECT:
2563 *flush = true;
2564 /* fall through */
2565 case XDP_TX:
2566 return 0;
2567 case XDP_PASS:
2568 break;
2569 default:
2570 page = virt_to_head_page(xdp->data);
2571 if (tpage->page == page) {
2572 ++tpage->count;
2573 } else {
2574 tun_put_page(tpage);
2575 tpage->page = page;
2576 tpage->count = 1;
2577 }
2578 return 0;
2579 }
2580 }
2581
2582build:
2583 skb = build_skb(xdp->data_hard_start, buflen);
2584 if (!skb) {
2585 err = -ENOMEM;
2586 goto out;
2587 }
2588
2589 skb_reserve(skb, xdp->data - xdp->data_hard_start);
2590 skb_put(skb, xdp->data_end - xdp->data);
2591
2592 if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
2593 this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
2594 kfree_skb(skb);
2595 err = -EINVAL;
2596 goto out;
2597 }
2598
2599 skb->protocol = eth_type_trans(skb, tun->dev);
2600 skb_reset_network_header(skb);
2601 skb_probe_transport_header(skb);
2602 skb_record_rx_queue(skb, tfile->queue_index);
2603
2604 if (skb_xdp) {
2605 err = do_xdp_generic(xdp_prog, skb);
2606 if (err != XDP_PASS)
2607 goto out;
2608 }
2609
2610 if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
2611 !tfile->detached)
2612 rxhash = __skb_get_hash_symmetric(skb);
2613
2614 netif_receive_skb(skb);
2615
2616 /* No need for get_cpu_ptr() here since this function is
2617 * always called with bh disabled
2618 */
2619 stats = this_cpu_ptr(tun->pcpu_stats);
2620 u64_stats_update_begin(&stats->syncp);
2621 stats->rx_packets++;
2622 stats->rx_bytes += datasize;
2623 u64_stats_update_end(&stats->syncp);
2624
2625 if (rxhash)
2626 tun_flow_update(tun, rxhash, tfile);
2627
2628out:
2629 return err;
2630}
2631
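/*
 * sendmsg() on the tun socket (typically from vhost-net): a TUN_MSG_PTR
 * control block carries a batch of xdp_buffs that are processed directly
 * by tun_xdp_one(); everything else falls back to tun_get_user().
 */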
2632static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
2633{
2634 int ret, i;
2635 struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2636 struct tun_struct *tun = tun_get(tfile);
2637 struct tun_msg_ctl *ctl = m->msg_control;
2638 struct xdp_buff *xdp;
2639
2640 if (!tun)
2641 return -EBADFD;
2642
2643 if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
2644 ctl && ctl->type == TUN_MSG_PTR) {
2645 struct tun_page tpage;
2646 int n = ctl->num;
2647 int flush = 0;
2648
2649 memset(&tpage, 0, sizeof(tpage));
2650
2651 local_bh_disable();
2652 rcu_read_lock();
2653
2654 for (i = 0; i < n; i++) {
2655 xdp = &((struct xdp_buff *)ctl->ptr)[i];
2656 tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
2657 }
2658
2659 if (flush)
2660 xdp_do_flush_map();
2661
2662 rcu_read_unlock();
2663 local_bh_enable();
2664
2665 tun_put_page(&tpage);
2666
2667 ret = total_len;
2668 goto out;
2669 }
2670
2671 ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
2672 m->msg_flags & MSG_DONTWAIT,
2673 m->msg_flags & MSG_MORE);
2674out:
2675 tun_put(tun);
2676 return ret;
2677}
2678
2679static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
2680 int flags)
2681{
2682 struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2683 struct tun_struct *tun = tun_get(tfile);
2684 void *ptr = m->msg_control;
2685 int ret;
2686
2687 if (!tun) {
2688 ret = -EBADFD;
2689 goto out_free;
2690 }
2691
2692 if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
2693 ret = -EINVAL;
2694 goto out_put_tun;
2695 }
2696 if (flags & MSG_ERRQUEUE) {
2697 ret = sock_recv_errqueue(sock->sk, m, total_len,
2698 SOL_PACKET, TUN_TX_TIMESTAMP);
2699 goto out;
2700 }
2701 ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
2702 if (ret > (ssize_t)total_len) {
2703 m->msg_flags |= MSG_TRUNC;
2704 ret = flags & MSG_TRUNC ? ret : total_len;
2705 }
2706out:
2707 tun_put(tun);
2708 return ret;
2709
2710out_put_tun:
2711 tun_put(tun);
2712out_free:
2713 tun_ptr_free(ptr);
2714 return ret;
2715}
2716
2717static int tun_ptr_peek_len(void *ptr)
2718{
2719 if (likely(ptr)) {
2720 if (tun_is_xdp_frame(ptr)) {
2721 struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2722
2723 return xdpf->len;
2724 }
2725 return __skb_array_len_with_tag(ptr);
2726 } else {
2727 return 0;
2728 }
2729}
2730
2731static int tun_peek_len(struct socket *sock)
2732{
2733 struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2734 struct tun_struct *tun;
2735 int ret = 0;
2736
2737 tun = tun_get(tfile);
2738 if (!tun)
2739 return 0;
2740
2741 ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
2742 tun_put(tun);
2743
2744 return ret;
2745}
2746
2747/* Ops structure to mimic raw sockets with tun */
2748static const struct proto_ops tun_socket_ops = {
2749 .peek_len = tun_peek_len,
2750 .sendmsg = tun_sendmsg,
2751 .recvmsg = tun_recvmsg,
2752};
2753
2754static struct proto tun_proto = {
2755 .name = "tun",
2756 .owner = THIS_MODULE,
2757 .obj_size = sizeof(struct tun_file),
2758};
2759
2760static int tun_flags(struct tun_struct *tun)
2761{
2762 return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
2763}
2764
2765static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
2766 char *buf)
2767{
2768 struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2769 return sprintf(buf, "0x%x\n", tun_flags(tun));
2770}
2771
2772static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
2773 char *buf)
2774{
2775 struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2776 return uid_valid(tun->owner)?
2777 sprintf(buf, "%u\n",
2778 from_kuid_munged(current_user_ns(), tun->owner)):
2779 sprintf(buf, "-1\n");
2780}
2781
2782static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
2783 char *buf)
2784{
2785 struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2786 return gid_valid(tun->group) ?
2787 sprintf(buf, "%u\n",
2788 from_kgid_munged(current_user_ns(), tun->group)):
2789 sprintf(buf, "-1\n");
2790}
2791
2792static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
2793static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
2794static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
2795
2796static struct attribute *tun_dev_attrs[] = {
2797 &dev_attr_tun_flags.attr,
2798 &dev_attr_owner.attr,
2799 &dev_attr_group.attr,
2800 NULL
2801};
2802
2803static const struct attribute_group tun_attr_group = {
2804 .attrs = tun_dev_attrs
2805};
2806
2807static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
2808{
2809 struct tun_struct *tun;
2810 struct tun_file *tfile = file->private_data;
2811 struct net_device *dev;
2812 int err;
2813
2814 if (tfile->detached)
2815 return -EINVAL;
2816
2817 if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2818 if (!capable(CAP_NET_ADMIN))
2819 return -EPERM;
2820
2821 if (!(ifr->ifr_flags & IFF_NAPI) ||
2822 (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2823 return -EINVAL;
2824 }
2825
2826 dev = __dev_get_by_name(net, ifr->ifr_name);
2827 if (dev) {
2828 if (ifr->ifr_flags & IFF_TUN_EXCL)
2829 return -EBUSY;
2830 if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2831 tun = netdev_priv(dev);
2832 else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2833 tun = netdev_priv(dev);
2834 else
2835 return -EINVAL;
2836
2837 if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
2838 !!(tun->flags & IFF_MULTI_QUEUE))
2839 return -EINVAL;
2840
2841 if (tun_not_capable(tun))
2842 return -EPERM;
2843 err = security_tun_dev_open(tun->security);
2844 if (err < 0)
2845 return err;
2846
2847 err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
2848 ifr->ifr_flags & IFF_NAPI,
2849 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
2850 if (err < 0)
2851 return err;
2852
2853 if (tun->flags & IFF_MULTI_QUEUE &&
2854 (tun->numqueues + tun->numdisabled > 1)) {
2855			/* One or more queues have already been attached, so there
2856			 * is no need to initialize the device again.
2857 */
2858 netdev_state_change(dev);
2859 return 0;
2860 }
2861
2862 tun->flags = (tun->flags & ~TUN_FEATURES) |
2863 (ifr->ifr_flags & TUN_FEATURES);
2864
2865 netdev_state_change(dev);
2866 } else {
2867 char *name;
2868 unsigned long flags = 0;
2869 int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
2870 MAX_TAP_QUEUES : 1;
2871
2872 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2873 return -EPERM;
2874 err = security_tun_dev_create();
2875 if (err < 0)
2876 return err;
2877
2878 /* Set dev type */
2879 if (ifr->ifr_flags & IFF_TUN) {
2880 /* TUN device */
2881 flags |= IFF_TUN;
2882 name = "tun%d";
2883 } else if (ifr->ifr_flags & IFF_TAP) {
2884 /* TAP device */
2885 flags |= IFF_TAP;
2886 name = "tap%d";
2887 } else
2888 return -EINVAL;
2889
2890 if (*ifr->ifr_name)
2891 name = ifr->ifr_name;
2892
2893 dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
2894 NET_NAME_UNKNOWN, tun_setup, queues,
2895 queues);
2896
2897 if (!dev)
2898 return -ENOMEM;
2899
2900 dev_net_set(dev, net);
2901 dev->rtnl_link_ops = &tun_link_ops;
2902 dev->ifindex = tfile->ifindex;
2903 dev->sysfs_groups[0] = &tun_attr_group;
2904
2905 tun = netdev_priv(dev);
2906 tun->dev = dev;
2907 tun->flags = flags;
2908 tun->txflt.count = 0;
2909 tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
2910
2911 tun->align = NET_SKB_PAD;
2912 tun->filter_attached = false;
2913 tun->sndbuf = tfile->socket.sk->sk_sndbuf;
2914 tun->rx_batched = 0;
2915 RCU_INIT_POINTER(tun->steering_prog, NULL);
2916
2917 tun->ifr = ifr;
2918 tun->file = file;
2919
2920 tun_net_initialize(dev);
2921
2922 err = register_netdevice(tun->dev);
2923 if (err < 0) {
2924 free_netdev(dev);
2925 return err;
2926 }
2927		/* free_netdev() won't check the refcount; to avoid a race
2928		 * with dev_put() we must publish tun only after registration.
2929 */
2930 rcu_assign_pointer(tfile->tun, tun);
2931 }
2932
2933 netif_carrier_on(tun->dev);
2934
2935 tun_debug(KERN_INFO, tun, "tun_set_iff\n");
2936
2937 /* Make sure persistent devices do not get stuck in
2938 * xoff state.
2939 */
2940 if (netif_running(tun->dev))
2941 netif_tx_wake_all_queues(tun->dev);
2942
2943 strcpy(ifr->ifr_name, tun->dev->name);
2944 return 0;
2945}
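/*
 * Illustrative userspace sketch (not part of the driver): creating or
 * attaching to a tap device via TUNSETIFF, which lands in tun_set_iff()
 * above.  The device name "tap0" is an assumption for the example; error
 * handling is omitted.
 *
 *	struct ifreq ifr = { 0 };
 *	int fd = open("/dev/net/tun", O_RDWR);
 *
 *	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
 *	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1);
 *	ioctl(fd, TUNSETIFF, &ifr);	   ifr.ifr_name returns the real name
 */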
2946
2947static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
2948{
2949 tun_debug(KERN_INFO, tun, "tun_get_iff\n");
2950
2951 strcpy(ifr->ifr_name, tun->dev->name);
2952
2953 ifr->ifr_flags = tun_flags(tun);
2954
2955}
2956
2957/* This is like a cut-down set of ethtool ops, except it is done via the
2958 * tun fd, so no privileges are required. */
2959static int set_offload(struct tun_struct *tun, unsigned long arg)
2960{
2961 netdev_features_t features = 0;
2962
2963 if (arg & TUN_F_CSUM) {
2964 features |= NETIF_F_HW_CSUM;
2965 arg &= ~TUN_F_CSUM;
2966
2967 if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
2968 if (arg & TUN_F_TSO_ECN) {
2969 features |= NETIF_F_TSO_ECN;
2970 arg &= ~TUN_F_TSO_ECN;
2971 }
2972 if (arg & TUN_F_TSO4)
2973 features |= NETIF_F_TSO;
2974 if (arg & TUN_F_TSO6)
2975 features |= NETIF_F_TSO6;
2976 arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
2977 }
2978
2979 arg &= ~TUN_F_UFO;
2980 }
2981
2982	/* This gives the user a way to test for new features in the future by
2983	 * trying to set them. */
2984 if (arg)
2985 return -EINVAL;
2986
2987 tun->set_features = features;
2988 tun->dev->wanted_features &= ~TUN_USER_FEATURES;
2989 tun->dev->wanted_features |= features;
2990 netdev_update_features(tun->dev);
2991
2992 return 0;
2993}
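/*
 * Illustrative userspace sketch (not part of the driver): TUNSETOFFLOAD takes
 * a TUN_F_* bitmask, which set_offload() above maps onto NETIF_F_* features.
 * Unknown bits are rejected, so callers can probe for support:
 *
 *	unsigned long offloads = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;
 *
 *	if (ioctl(fd, TUNSETOFFLOAD, offloads) < 0)
 *		 the kernel rejected one of the requested bits
 */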
2994
2995static void tun_detach_filter(struct tun_struct *tun, int n)
2996{
2997 int i;
2998 struct tun_file *tfile;
2999
3000 for (i = 0; i < n; i++) {
3001 tfile = rtnl_dereference(tun->tfiles[i]);
3002 lock_sock(tfile->socket.sk);
3003 sk_detach_filter(tfile->socket.sk);
3004 release_sock(tfile->socket.sk);
3005 }
3006
3007 tun->filter_attached = false;
3008}
3009
3010static int tun_attach_filter(struct tun_struct *tun)
3011{
3012 int i, ret = 0;
3013 struct tun_file *tfile;
3014
3015 for (i = 0; i < tun->numqueues; i++) {
3016 tfile = rtnl_dereference(tun->tfiles[i]);
3017 lock_sock(tfile->socket.sk);
3018 ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
3019 release_sock(tfile->socket.sk);
3020 if (ret) {
3021 tun_detach_filter(tun, i);
3022 return ret;
3023 }
3024 }
3025
3026 tun->filter_attached = true;
3027 return ret;
3028}
3029
3030static void tun_set_sndbuf(struct tun_struct *tun)
3031{
3032 struct tun_file *tfile;
3033 int i;
3034
3035 for (i = 0; i < tun->numqueues; i++) {
3036 tfile = rtnl_dereference(tun->tfiles[i]);
3037 tfile->socket.sk->sk_sndbuf = tun->sndbuf;
3038 }
3039}
3040
3041static int tun_set_queue(struct file *file, struct ifreq *ifr)
3042{
3043 struct tun_file *tfile = file->private_data;
3044 struct tun_struct *tun;
3045 int ret = 0;
3046
3047 rtnl_lock();
3048
3049 if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
3050 tun = tfile->detached;
3051 if (!tun) {
3052 ret = -EINVAL;
3053 goto unlock;
3054 }
3055 ret = security_tun_dev_attach_queue(tun->security);
3056 if (ret < 0)
3057 goto unlock;
3058 ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
3059 tun->flags & IFF_NAPI_FRAGS, true);
3060 } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
3061 tun = rtnl_dereference(tfile->tun);
3062 if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
3063 ret = -EINVAL;
3064 else
3065 __tun_detach(tfile, false);
3066 } else
3067 ret = -EINVAL;
3068
3069 if (ret >= 0)
3070 netdev_state_change(tun->dev);
3071
3072unlock:
3073 rtnl_unlock();
3074 return ret;
3075}
3076
3077static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
3078 void __user *data)
3079{
3080 struct bpf_prog *prog;
3081 int fd;
3082
3083 if (copy_from_user(&fd, data, sizeof(fd)))
3084 return -EFAULT;
3085
3086 if (fd == -1) {
3087 prog = NULL;
3088 } else {
3089 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
3090 if (IS_ERR(prog))
3091 return PTR_ERR(prog);
3092 }
3093
3094 return __tun_set_ebpf(tun, prog_p, prog);
3095}
3096
3097/* Return correct value for tun->dev->addr_len based on tun->dev->type. */
3098static unsigned char tun_get_addr_len(unsigned short type)
3099{
3100 switch (type) {
3101 case ARPHRD_IP6GRE:
3102 case ARPHRD_TUNNEL6:
3103 return sizeof(struct in6_addr);
3104 case ARPHRD_IPGRE:
3105 case ARPHRD_TUNNEL:
3106 case ARPHRD_SIT:
3107 return 4;
3108 case ARPHRD_ETHER:
3109 return ETH_ALEN;
3110 case ARPHRD_IEEE802154:
3111 case ARPHRD_IEEE802154_MONITOR:
3112 return IEEE802154_EXTENDED_ADDR_LEN;
3113 case ARPHRD_PHONET_PIPE:
3114 case ARPHRD_PPP:
3115 case ARPHRD_NONE:
3116 return 0;
3117 case ARPHRD_6LOWPAN:
3118 return EUI64_ADDR_LEN;
3119 case ARPHRD_FDDI:
3120 return FDDI_K_ALEN;
3121 case ARPHRD_HIPPI:
3122 return HIPPI_ALEN;
3123 case ARPHRD_IEEE802:
3124 return FC_ALEN;
3125 case ARPHRD_ROSE:
3126 return ROSE_ADDR_LEN;
3127 case ARPHRD_NETROM:
3128 return AX25_ADDR_LEN;
3129 case ARPHRD_LOCALTLK:
3130 return LTALK_ALEN;
3131 default:
3132 return 0;
3133 }
3134}
3135
3136static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
3137 unsigned long arg, int ifreq_len)
3138{
3139 struct tun_file *tfile = file->private_data;
3140 struct net *net = sock_net(&tfile->sk);
3141 struct tun_struct *tun;
3142 void __user* argp = (void __user*)arg;
3143 unsigned int carrier;
3144 struct ifreq ifr;
3145 kuid_t owner;
3146 kgid_t group;
3147 int ifindex;
3148 int sndbuf;
3149 int vnet_hdr_sz;
3150 int le;
3151 int ret;
3152 bool do_notify = false;
3153
3154 if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
3155 (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
3156 if (copy_from_user(&ifr, argp, ifreq_len))
3157 return -EFAULT;
3158 } else {
3159 memset(&ifr, 0, sizeof(ifr));
3160 }
3161 if (cmd == TUNGETFEATURES) {
3162 /* Currently this just means: "what IFF flags are valid?".
3163 * This is needed because we never checked for invalid flags on
3164 * TUNSETIFF.
3165 */
3166 return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
3167 (unsigned int __user*)argp);
3168 } else if (cmd == TUNSETQUEUE) {
3169 return tun_set_queue(file, &ifr);
3170 } else if (cmd == SIOCGSKNS) {
3171 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3172 return -EPERM;
3173 return open_related_ns(&net->ns, get_net_ns);
3174 }
3175
3176 ret = 0;
3177 rtnl_lock();
3178
3179 tun = tun_get(tfile);
3180 if (cmd == TUNSETIFF) {
3181 ret = -EEXIST;
3182 if (tun)
3183 goto unlock;
3184
3185 ifr.ifr_name[IFNAMSIZ-1] = '\0';
3186
3187 ret = tun_set_iff(net, file, &ifr);
3188
3189 if (ret)
3190 goto unlock;
3191
3192 if (copy_to_user(argp, &ifr, ifreq_len))
3193 ret = -EFAULT;
3194 goto unlock;
3195 }
3196 if (cmd == TUNSETIFINDEX) {
3197 ret = -EPERM;
3198 if (tun)
3199 goto unlock;
3200
3201 ret = -EFAULT;
3202 if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
3203 goto unlock;
3204 ret = -EINVAL;
3205 if (ifindex < 0)
3206 goto unlock;
3207 ret = 0;
3208 tfile->ifindex = ifindex;
3209 goto unlock;
3210 }
3211
3212 ret = -EBADFD;
3213 if (!tun)
3214 goto unlock;
3215
3216 tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
3217
3218 net = dev_net(tun->dev);
3219 ret = 0;
3220 switch (cmd) {
3221 case TUNGETIFF:
3222 tun_get_iff(tun, &ifr);
3223
3224 if (tfile->detached)
3225 ifr.ifr_flags |= IFF_DETACH_QUEUE;
3226 if (!tfile->socket.sk->sk_filter)
3227 ifr.ifr_flags |= IFF_NOFILTER;
3228
3229 if (copy_to_user(argp, &ifr, ifreq_len))
3230 ret = -EFAULT;
3231 break;
3232
3233 case TUNSETNOCSUM:
3234 /* Disable/Enable checksum */
3235
3236 /* [unimplemented] */
3237 tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
3238 arg ? "disabled" : "enabled");
3239 break;
3240
3241 case TUNSETPERSIST:
3242 /* Disable/Enable persist mode. Keep an extra reference to the
3243		 * module to prevent it from being unloaded.
3244 */
3245 if (arg && !(tun->flags & IFF_PERSIST)) {
3246 tun->flags |= IFF_PERSIST;
3247 __module_get(THIS_MODULE);
3248 do_notify = true;
3249 }
3250 if (!arg && (tun->flags & IFF_PERSIST)) {
3251 tun->flags &= ~IFF_PERSIST;
3252 module_put(THIS_MODULE);
3253 do_notify = true;
3254 }
3255
3256 tun_debug(KERN_INFO, tun, "persist %s\n",
3257 arg ? "enabled" : "disabled");
3258 break;
3259
3260 case TUNSETOWNER:
3261 /* Set owner of the device */
3262 owner = make_kuid(current_user_ns(), arg);
3263 if (!uid_valid(owner)) {
3264 ret = -EINVAL;
3265 break;
3266 }
3267 tun->owner = owner;
3268 do_notify = true;
3269 tun_debug(KERN_INFO, tun, "owner set to %u\n",
3270 from_kuid(&init_user_ns, tun->owner));
3271 break;
3272
3273 case TUNSETGROUP:
3274 /* Set group of the device */
3275 group = make_kgid(current_user_ns(), arg);
3276 if (!gid_valid(group)) {
3277 ret = -EINVAL;
3278 break;
3279 }
3280 tun->group = group;
3281 do_notify = true;
3282 tun_debug(KERN_INFO, tun, "group set to %u\n",
3283 from_kgid(&init_user_ns, tun->group));
3284 break;
3285
3286 case TUNSETLINK:
3287 /* Only allow setting the type when the interface is down */
3288 if (tun->dev->flags & IFF_UP) {
3289 tun_debug(KERN_INFO, tun,
3290 "Linktype set failed because interface is up\n");
3291 ret = -EBUSY;
3292 } else {
3293 tun->dev->type = (int) arg;
3294 tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
3295 tun_debug(KERN_INFO, tun, "linktype set to %d\n",
3296 tun->dev->type);
3297 ret = 0;
3298 }
3299 break;
3300
3301#ifdef TUN_DEBUG
3302 case TUNSETDEBUG:
3303 tun->debug = arg;
3304 break;
3305#endif
3306 case TUNSETOFFLOAD:
3307 ret = set_offload(tun, arg);
3308 break;
3309
3310 case TUNSETTXFILTER:
3311 /* Can be set only for TAPs */
3312 ret = -EINVAL;
3313 if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3314 break;
3315 ret = update_filter(&tun->txflt, (void __user *)arg);
3316 break;
3317
3318 case SIOCGIFHWADDR:
3319 /* Get hw address */
3320 memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
3321 dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
3322 if (copy_to_user(argp, &ifr, ifreq_len))
3323 ret = -EFAULT;
3324 break;
3325
3326 case SIOCSIFHWADDR:
3327 /* Set hw address */
3328 tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
3329 ifr.ifr_hwaddr.sa_data);
3330
3331 ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
3332 break;
3333
3334 case TUNGETSNDBUF:
3335 sndbuf = tfile->socket.sk->sk_sndbuf;
3336 if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
3337 ret = -EFAULT;
3338 break;
3339
3340 case TUNSETSNDBUF:
3341 if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
3342 ret = -EFAULT;
3343 break;
3344 }
3345 if (sndbuf <= 0) {
3346 ret = -EINVAL;
3347 break;
3348 }
3349
3350 tun->sndbuf = sndbuf;
3351 tun_set_sndbuf(tun);
3352 break;
3353
3354 case TUNGETVNETHDRSZ:
3355 vnet_hdr_sz = tun->vnet_hdr_sz;
3356 if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
3357 ret = -EFAULT;
3358 break;
3359
3360 case TUNSETVNETHDRSZ:
3361 if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
3362 ret = -EFAULT;
3363 break;
3364 }
3365 if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
3366 ret = -EINVAL;
3367 break;
3368 }
3369
3370 tun->vnet_hdr_sz = vnet_hdr_sz;
3371 break;
3372
3373 case TUNGETVNETLE:
3374 le = !!(tun->flags & TUN_VNET_LE);
3375 if (put_user(le, (int __user *)argp))
3376 ret = -EFAULT;
3377 break;
3378
3379 case TUNSETVNETLE:
3380 if (get_user(le, (int __user *)argp)) {
3381 ret = -EFAULT;
3382 break;
3383 }
3384 if (le)
3385 tun->flags |= TUN_VNET_LE;
3386 else
3387 tun->flags &= ~TUN_VNET_LE;
3388 break;
3389
3390 case TUNGETVNETBE:
3391 ret = tun_get_vnet_be(tun, argp);
3392 break;
3393
3394 case TUNSETVNETBE:
3395 ret = tun_set_vnet_be(tun, argp);
3396 break;
3397
3398 case TUNATTACHFILTER:
3399 /* Can be set only for TAPs */
3400 ret = -EINVAL;
3401 if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3402 break;
3403 ret = -EFAULT;
3404 if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
3405 break;
3406
3407 ret = tun_attach_filter(tun);
3408 break;
3409
3410 case TUNDETACHFILTER:
3411 /* Can be set only for TAPs */
3412 ret = -EINVAL;
3413 if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3414 break;
3415 ret = 0;
3416 tun_detach_filter(tun, tun->numqueues);
3417 break;
3418
3419 case TUNGETFILTER:
3420 ret = -EINVAL;
3421 if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3422 break;
3423 ret = -EFAULT;
3424 if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
3425 break;
3426 ret = 0;
3427 break;
3428
3429 case TUNSETSTEERINGEBPF:
3430 ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
3431 break;
3432
3433 case TUNSETFILTEREBPF:
3434 ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
3435 break;
3436
3437 case TUNSETCARRIER:
3438 ret = -EFAULT;
3439 if (copy_from_user(&carrier, argp, sizeof(carrier)))
3440 goto unlock;
3441
3442 ret = tun_net_change_carrier(tun->dev, (bool)carrier);
3443 break;
3444
3445 case TUNGETDEVNETNS:
3446 ret = -EPERM;
3447 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3448 goto unlock;
3449 ret = open_related_ns(&net->ns, get_net_ns);
3450 break;
3451
3452 default:
3453 ret = -EINVAL;
3454 break;
3455 }
3456
3457 if (do_notify)
3458 netdev_state_change(tun->dev);
3459
3460unlock:
3461 rtnl_unlock();
3462 if (tun)
3463 tun_put(tun);
3464 return ret;
3465}
3466
3467static long tun_chr_ioctl(struct file *file,
3468 unsigned int cmd, unsigned long arg)
3469{
3470 return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
3471}
3472
3473#ifdef CONFIG_COMPAT
3474static long tun_chr_compat_ioctl(struct file *file,
3475 unsigned int cmd, unsigned long arg)
3476{
3477 switch (cmd) {
3478 case TUNSETIFF:
3479 case TUNGETIFF:
3480 case TUNSETTXFILTER:
3481 case TUNGETSNDBUF:
3482 case TUNSETSNDBUF:
3483 case SIOCGIFHWADDR:
3484 case SIOCSIFHWADDR:
3485 arg = (unsigned long)compat_ptr(arg);
3486 break;
3487 default:
3488 arg = (compat_ulong_t)arg;
3489 break;
3490 }
3491
3492 /*
3493 * compat_ifreq is shorter than ifreq, so we must not access beyond
3494 * the end of that structure. All fields that are used in this
3495	 * driver are compatible though, so we don't need to convert the
3496 * contents.
3497 */
3498 return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
3499}
3500#endif /* CONFIG_COMPAT */
3501
3502static int tun_chr_fasync(int fd, struct file *file, int on)
3503{
3504 struct tun_file *tfile = file->private_data;
3505 int ret;
3506
3507 if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
3508 goto out;
3509
3510 if (on) {
3511 __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
3512 tfile->flags |= TUN_FASYNC;
3513 } else
3514 tfile->flags &= ~TUN_FASYNC;
3515 ret = 0;
3516out:
3517 return ret;
3518}
3519
3520static int tun_chr_open(struct inode *inode, struct file * file)
3521{
3522 struct net *net = current->nsproxy->net_ns;
3523 struct tun_file *tfile;
3524
3525 DBG1(KERN_INFO, "tunX: tun_chr_open\n");
3526
3527 tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
3528 &tun_proto, 0);
3529 if (!tfile)
3530 return -ENOMEM;
3531 if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
3532 sk_free(&tfile->sk);
3533 return -ENOMEM;
3534 }
3535
3536 mutex_init(&tfile->napi_mutex);
3537 RCU_INIT_POINTER(tfile->tun, NULL);
3538 tfile->flags = 0;
3539 tfile->ifindex = 0;
3540
3541 init_waitqueue_head(&tfile->socket.wq.wait);
3542
3543 tfile->socket.file = file;
3544 tfile->socket.ops = &tun_socket_ops;
3545
3546 sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid());
3547
3548 tfile->sk.sk_write_space = tun_sock_write_space;
3549 tfile->sk.sk_sndbuf = INT_MAX;
3550
3551 file->private_data = tfile;
3552 INIT_LIST_HEAD(&tfile->next);
3553
3554 sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
3555
3556 return 0;
3557}
3558
3559static int tun_chr_close(struct inode *inode, struct file *file)
3560{
3561 struct tun_file *tfile = file->private_data;
3562
3563 tun_detach(tfile, true);
3564
3565 return 0;
3566}
3567
3568#ifdef CONFIG_PROC_FS
3569static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
3570{
3571 struct tun_file *tfile = file->private_data;
3572 struct tun_struct *tun;
3573 struct ifreq ifr;
3574
3575 memset(&ifr, 0, sizeof(ifr));
3576
3577 rtnl_lock();
3578 tun = tun_get(tfile);
3579 if (tun)
3580 tun_get_iff(tun, &ifr);
3581 rtnl_unlock();
3582
3583 if (tun)
3584 tun_put(tun);
3585
3586 seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
3587}
3588#endif
3589
3590static const struct file_operations tun_fops = {
3591 .owner = THIS_MODULE,
3592 .llseek = no_llseek,
3593 .read_iter = tun_chr_read_iter,
3594 .write_iter = tun_chr_write_iter,
3595 .poll = tun_chr_poll,
3596 .unlocked_ioctl = tun_chr_ioctl,
3597#ifdef CONFIG_COMPAT
3598 .compat_ioctl = tun_chr_compat_ioctl,
3599#endif
3600 .open = tun_chr_open,
3601 .release = tun_chr_close,
3602 .fasync = tun_chr_fasync,
3603#ifdef CONFIG_PROC_FS
3604 .show_fdinfo = tun_chr_show_fdinfo,
3605#endif
3606};
3607
3608static struct miscdevice tun_miscdev = {
3609 .minor = TUN_MINOR,
3610 .name = "tun",
3611 .nodename = "net/tun",
3612 .fops = &tun_fops,
3613};
3614
3615/* ethtool interface */
3616
3617static void tun_default_link_ksettings(struct net_device *dev,
3618 struct ethtool_link_ksettings *cmd)
3619{
3620 ethtool_link_ksettings_zero_link_mode(cmd, supported);
3621 ethtool_link_ksettings_zero_link_mode(cmd, advertising);
3622 cmd->base.speed = SPEED_10;
3623 cmd->base.duplex = DUPLEX_FULL;
3624 cmd->base.port = PORT_TP;
3625 cmd->base.phy_address = 0;
3626 cmd->base.autoneg = AUTONEG_DISABLE;
3627}
3628
3629static int tun_get_link_ksettings(struct net_device *dev,
3630 struct ethtool_link_ksettings *cmd)
3631{
3632 struct tun_struct *tun = netdev_priv(dev);
3633
3634 memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
3635 return 0;
3636}
3637
3638static int tun_set_link_ksettings(struct net_device *dev,
3639 const struct ethtool_link_ksettings *cmd)
3640{
3641 struct tun_struct *tun = netdev_priv(dev);
3642
3643 memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
3644 return 0;
3645}
3646
3647static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
3648{
3649 struct tun_struct *tun = netdev_priv(dev);
3650
3651 strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
3652 strlcpy(info->version, DRV_VERSION, sizeof(info->version));
3653
3654 switch (tun->flags & TUN_TYPE_MASK) {
3655 case IFF_TUN:
3656 strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
3657 break;
3658 case IFF_TAP:
3659 strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
3660 break;
3661 }
3662}
3663
3664static u32 tun_get_msglevel(struct net_device *dev)
3665{
3666#ifdef TUN_DEBUG
3667 struct tun_struct *tun = netdev_priv(dev);
3668 return tun->debug;
3669#else
3670 return -EOPNOTSUPP;
3671#endif
3672}
3673
3674static void tun_set_msglevel(struct net_device *dev, u32 value)
3675{
3676#ifdef TUN_DEBUG
3677 struct tun_struct *tun = netdev_priv(dev);
3678 tun->debug = value;
3679#endif
3680}
3681
3682static int tun_get_coalesce(struct net_device *dev,
3683 struct ethtool_coalesce *ec)
3684{
3685 struct tun_struct *tun = netdev_priv(dev);
3686
3687 ec->rx_max_coalesced_frames = tun->rx_batched;
3688
3689 return 0;
3690}
3691
3692static int tun_set_coalesce(struct net_device *dev,
3693 struct ethtool_coalesce *ec)
3694{
3695 struct tun_struct *tun = netdev_priv(dev);
3696
3697 if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
3698 tun->rx_batched = NAPI_POLL_WEIGHT;
3699 else
3700 tun->rx_batched = ec->rx_max_coalesced_frames;
3701
3702 return 0;
3703}
3704
3705static const struct ethtool_ops tun_ethtool_ops = {
3706 .get_drvinfo = tun_get_drvinfo,
3707 .get_msglevel = tun_get_msglevel,
3708 .set_msglevel = tun_set_msglevel,
3709 .get_link = ethtool_op_get_link,
3710 .get_ts_info = ethtool_op_get_ts_info,
3711 .get_coalesce = tun_get_coalesce,
3712 .set_coalesce = tun_set_coalesce,
3713 .get_link_ksettings = tun_get_link_ksettings,
3714 .set_link_ksettings = tun_set_link_ksettings,
3715};
3716
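/*
 * Resize the tx_ring of every attached and detached queue to the new
 * dev->tx_queue_len; ptr_ring_resize_multiple() migrates the queued
 * pointers and frees, via tun_ptr_free(), any that no longer fit.
 */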
3717static int tun_queue_resize(struct tun_struct *tun)
3718{
3719 struct net_device *dev = tun->dev;
3720 struct tun_file *tfile;
3721 struct ptr_ring **rings;
3722 int n = tun->numqueues + tun->numdisabled;
3723 int ret, i;
3724
3725 rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
3726 if (!rings)
3727 return -ENOMEM;
3728
3729 for (i = 0; i < tun->numqueues; i++) {
3730 tfile = rtnl_dereference(tun->tfiles[i]);
3731 rings[i] = &tfile->tx_ring;
3732 }
3733 list_for_each_entry(tfile, &tun->disabled, next)
3734 rings[i++] = &tfile->tx_ring;
3735
3736 ret = ptr_ring_resize_multiple(rings, n,
3737 dev->tx_queue_len, GFP_KERNEL,
3738 tun_ptr_free);
3739
3740 kfree(rings);
3741 return ret;
3742}
3743
3744static int tun_device_event(struct notifier_block *unused,
3745 unsigned long event, void *ptr)
3746{
3747 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3748 struct tun_struct *tun = netdev_priv(dev);
3749 int i;
3750
3751 if (dev->rtnl_link_ops != &tun_link_ops)
3752 return NOTIFY_DONE;
3753
3754 switch (event) {
3755 case NETDEV_CHANGE_TX_QUEUE_LEN:
3756 if (tun_queue_resize(tun))
3757 return NOTIFY_BAD;
3758 break;
3759 case NETDEV_UP:
3760 for (i = 0; i < tun->numqueues; i++) {
3761 struct tun_file *tfile;
3762
3763 tfile = rtnl_dereference(tun->tfiles[i]);
3764 tfile->socket.sk->sk_write_space(tfile->socket.sk);
3765 }
3766 break;
3767 default:
3768 break;
3769 }
3770
3771 return NOTIFY_DONE;
3772}
3773
3774static struct notifier_block tun_notifier_block __read_mostly = {
3775 .notifier_call = tun_device_event,
3776};
3777
3778static int __init tun_init(void)
3779{
3780 int ret = 0;
3781
3782 pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
3783
3784 ret = rtnl_link_register(&tun_link_ops);
3785 if (ret) {
3786 pr_err("Can't register link_ops\n");
3787 goto err_linkops;
3788 }
3789
3790 ret = misc_register(&tun_miscdev);
3791 if (ret) {
3792 pr_err("Can't register misc device %d\n", TUN_MINOR);
3793 goto err_misc;
3794 }
3795
3796 ret = register_netdevice_notifier(&tun_notifier_block);
3797 if (ret) {
3798 pr_err("Can't register netdevice notifier\n");
3799 goto err_notifier;
3800 }
3801
3802 return 0;
3803
3804err_notifier:
3805 misc_deregister(&tun_miscdev);
3806err_misc:
3807 rtnl_link_unregister(&tun_link_ops);
3808err_linkops:
3809 return ret;
3810}
3811
3812static void tun_cleanup(void)
3813{
3814 misc_deregister(&tun_miscdev);
3815 rtnl_link_unregister(&tun_link_ops);
3816 unregister_netdevice_notifier(&tun_notifier_block);
3817}
3818
3819/* Get the underlying socket object from a tun file. Returns an error unless the
3820 * file is attached to a device. The returned object works like a packet socket;
3821 * it can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
3822 * holding a reference to the file for as long as the socket is in use. */
3823struct socket *tun_get_socket(struct file *file)
3824{
3825 struct tun_file *tfile;
3826 if (file->f_op != &tun_fops)
3827 return ERR_PTR(-EINVAL);
3828 tfile = file->private_data;
3829 if (!tfile)
3830 return ERR_PTR(-EBADFD);
3831 return &tfile->socket;
3832}
3833EXPORT_SYMBOL_GPL(tun_get_socket);
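/*
 * Illustrative in-kernel sketch (the vhost-net backend is the typical user;
 * simplified, not a complete caller): resolve the socket behind a tun fd and
 * transmit through it with the usual socket helpers.
 *
 *	struct socket *sock = tun_get_socket(file);
 *
 *	if (!IS_ERR(sock))
 *		sock_sendmsg(sock, &msg);
 */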
3834
3835struct ptr_ring *tun_get_tx_ring(struct file *file)
3836{
3837 struct tun_file *tfile;
3838
3839 if (file->f_op != &tun_fops)
3840 return ERR_PTR(-EINVAL);
3841 tfile = file->private_data;
3842 if (!tfile)
3843 return ERR_PTR(-EBADFD);
3844 return &tfile->tx_ring;
3845}
3846EXPORT_SYMBOL_GPL(tun_get_tx_ring);
3847
3848module_init(tun_init);
3849module_exit(tun_cleanup);
3850MODULE_DESCRIPTION(DRV_DESCRIPTION);
3851MODULE_AUTHOR(DRV_COPYRIGHT);
3852MODULE_LICENSE("GPL");
3853MODULE_ALIAS_MISCDEV(TUN_MINOR);
3854MODULE_ALIAS("devname:net/tun");