Blame - marvell/linux/net/packet/af_packet.c - T108

blob: 556c21c10415609b9900975cecb6ef0cec36bf7f [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-or-later
				2	/*
				3	* INET An implementation of the TCP/IP protocol suite for the LINUX
				4	* operating system. INET is implemented using the BSD Socket
				5	* interface as the means of communication with the user level.
				6	*
				7	* PACKET - implements raw packet sockets.
				8	*
				9	* Authors: Ross Biro
				10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
				11	* Alan Cox, <gw4pts@gw4pts.ampr.org>
				12	*
				13	* Fixes:
				14	* Alan Cox : verify_area() now used correctly
				15	* Alan Cox : new skbuff lists, look ma no backlogs!
				16	* Alan Cox : tidied skbuff lists.
				17	* Alan Cox : Now uses generic datagram routines I
				18	* added. Also fixed the peek/read crash
				19	* from all old Linux datagram code.
				20	* Alan Cox : Uses the improved datagram code.
				21	* Alan Cox : Added NULL's for socket options.
				22	* Alan Cox : Re-commented the code.
				23	* Alan Cox : Use new kernel side addressing
				24	* Rob Janssen : Correct MTU usage.
				25	* Dave Platt : Counter leaks caused by incorrect
				26	* interrupt locking and some slightly
				27	* dubious gcc output. Can you read
				28	* compiler: it said _VOLATILE_
				29	* Richard Kooijman : Timestamp fixes.
				30	* Alan Cox : New buffers. Use sk->mac.raw.
				31	* Alan Cox : sendmsg/recvmsg support.
				32	* Alan Cox : Protocol setting support
				33	* Alexey Kuznetsov : Untied from IPv4 stack.
				34	* Cyrus Durgin : Fixed kerneld for kmod.
				35	* Michal Ostrowski : Module initialization cleanup.
				36	* Ulises Alonso : Frame number limit removal and
				37	* packet_set_ring memory leak.
				38	* Eric Biederman : Allow for > 8 byte hardware addresses.
				39	* The convention is that longer addresses
				40	* will simply extend the hardware address
				41	* byte arrays at the end of sockaddr_ll
				42	* and packet_mreq.
				43	* Johann Baudy : Added TX RING.
				44	* Chetan Loke : Implemented TPACKET_V3 block abstraction
				45	* layer.
				46	* Copyright (C) 2011, <lokec@ccs.neu.edu>
				47	*/
				48
				49	#include <linux/types.h>
				50	#include <linux/mm.h>
				51	#include <linux/capability.h>
				52	#include <linux/fcntl.h>
				53	#include <linux/socket.h>
				54	#include <linux/in.h>
				55	#include <linux/inet.h>
				56	#include <linux/netdevice.h>
				57	#include <linux/if_packet.h>
				58	#include <linux/wireless.h>
				59	#include <linux/kernel.h>
				60	#include <linux/kmod.h>
				61	#include <linux/slab.h>
				62	#include <linux/vmalloc.h>
				63	#include <net/net_namespace.h>
				64	#include <net/ip.h>
				65	#include <net/protocol.h>
				66	#include <linux/skbuff.h>
				67	#include <net/sock.h>
				68	#include <linux/errno.h>
				69	#include <linux/timer.h>
				70	#include <linux/uaccess.h>
				71	#include <asm/ioctls.h>
				72	#include <asm/page.h>
				73	#include <asm/cacheflush.h>
				74	#include <asm/io.h>
				75	#include <linux/proc_fs.h>
				76	#include <linux/seq_file.h>
				77	#include <linux/poll.h>
				78	#include <linux/module.h>
				79	#include <linux/init.h>
				80	#include <linux/mutex.h>
				81	#include <linux/if_vlan.h>
				82	#include <linux/virtio_net.h>
				83	#include <linux/errqueue.h>
				84	#include <linux/net_tstamp.h>
				85	#include <linux/percpu.h>
				86	#ifdef CONFIG_INET
				87	#include <net/inet_common.h>
				88	#endif
				89	#include <linux/bpf.h>
				90	#include <net/compat.h>
				91
				92	#include "internal.h"
				93
				94	/*
				95	Assumptions:
				96	- if device has no dev->hard_header routine, it adds and removes ll header
				97	inside itself. In this case ll header is invisible outside of device,
				98	but higher levels still should reserve dev->hard_header_len.
				99	Some devices are clever enough to reallocate the skb when the header
				100	will not fit in the reserved space (tunnel); others are silly
				101	(PPP).
				102	- packet socket receives packets with pulled ll header,
				103	so that SOCK_RAW should push it back.
				104
				105	On receive:
				106	-----------
				107
				108	Incoming, dev->hard_header!=NULL
				109	mac_header -> ll header
				110	data -> data
				111
				112	Outgoing, dev->hard_header!=NULL
				113	mac_header -> ll header
				114	data -> ll header
				115
				116	Incoming, dev->hard_header==NULL
				117	mac_header -> UNKNOWN position. It is very likely, that it points to ll
				118	header. PPP does this, which is wrong because it introduces
				119	asymmetry between rx and tx paths.
				120	data -> data
				121
				122	Outgoing, dev->hard_header==NULL
				123	mac_header -> data. ll header is still not built!
				124	data -> data
				125
				126	Resume
				127	If dev->hard_header==NULL we are unlikely to restore sensible ll header.
				128
				129
				130	On transmit:
				131	------------
				132
				133	dev->hard_header != NULL
				134	mac_header -> ll header
				135	data -> ll header
				136
				137	dev->hard_header == NULL (ll header is added by device, we cannot control it)
				138	mac_header -> data
				139	data -> data
				140
				141	We should set nh.raw on output to the correct position;
				142	packet classifier depends on it.
				143	*/
				144
				145	/* Private packet socket structures. */
				146
				147	/* identical to struct packet_mreq except it has
				148	* a longer address field.
				149	*/
				150	struct packet_mreq_max {
				151	int mr_ifindex;
				152	unsigned short mr_type;
				153	unsigned short mr_alen;
				154	unsigned char mr_address[MAX_ADDR_LEN];
				155	};
				156
				157	union tpacket_uhdr {
				158	struct tpacket_hdr *h1;
				159	struct tpacket2_hdr *h2;
				160	struct tpacket3_hdr *h3;
				161	void *raw;
				162	};
				163
				164	static int packet_set_ring(struct sock sk, union tpacket_req_u req_u,
				165	int closing, int tx_ring);
				166
				167	#define V3_ALIGNMENT (8)
				168
				169	#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
				170
				171	#define BLK_PLUS_PRIV(sz_of_priv) \
				172	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
				173
				174	#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
				175	#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
				176	#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
				177	#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
				178	#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
				179	#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
				180	#define BLOCK_PRIV(x) ((void )((char )(x) + BLOCK_O2PRIV(x)))
				181
				182	struct packet_sock;
				183	static int tpacket_rcv(struct sk_buff skb, struct net_device dev,
				184	struct packet_type pt, struct net_device orig_dev);
				185
				186	static void packet_previous_frame(struct packet_sock po,
				187	struct packet_ring_buffer *rb,
				188	int status);
				189	static void packet_increment_head(struct packet_ring_buffer *buff);
				190	static int prb_curr_blk_in_use(struct tpacket_block_desc *);
				191	static void prb_dispatch_next_block(struct tpacket_kbdq_core ,
				192	struct packet_sock *);
				193	static void prb_retire_current_block(struct tpacket_kbdq_core *,
				194	struct packet_sock *, unsigned int status);
				195	static int prb_queue_frozen(struct tpacket_kbdq_core *);
				196	static void prb_open_block(struct tpacket_kbdq_core *,
				197	struct tpacket_block_desc *);
				198	static void prb_retire_rx_blk_timer_expired(struct timer_list *);
				199	static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
				200	static void prb_fill_rxhash(struct tpacket_kbdq_core , struct tpacket3_hdr );
				201	static void prb_clear_rxhash(struct tpacket_kbdq_core *,
				202	struct tpacket3_hdr *);
				203	static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
				204	struct tpacket3_hdr *);
				205	static void packet_flush_mclist(struct sock *sk);
				206	static u16 packet_pick_tx_queue(struct sk_buff *skb);
				207
				208	struct packet_skb_cb {
				209	union {
				210	struct sockaddr_pkt pkt;
				211	union {
				212	/* Trick: alias skb original length with
				213	* ll.sll_family and ll.protocol in order
				214	* to save room.
				215	*/
				216	unsigned int origlen;
				217	struct sockaddr_ll ll;
				218	};
				219	} sa;
				220	};
				221
				222	#define vio_le() virtio_legacy_is_little_endian()
				223
				224	#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
				225
				226	#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
				227	#define GET_PBLOCK_DESC(x, bid) \
				228	((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
				229	#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
				230	((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
				231	#define GET_NEXT_PRB_BLK_NUM(x) \
				232	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
				233	((x)->kactive_blk_num+1) : 0)
				234
				235	static void __fanout_unlink(struct sock sk, struct packet_sock po);
				236	static void __fanout_link(struct sock sk, struct packet_sock po);
				237
				238	static int packet_direct_xmit(struct sk_buff *skb)
				239	{
				240	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
				241	}
				242
				243	static struct net_device packet_cached_dev_get(struct packet_sock po)
				244	{
				245	struct net_device *dev;
				246
				247	rcu_read_lock();
				248	dev = rcu_dereference(po->cached_dev);
				249	if (likely(dev))
				250	dev_hold(dev);
				251	rcu_read_unlock();
				252
				253	return dev;
				254	}
				255
				256	static void packet_cached_dev_assign(struct packet_sock *po,
				257	struct net_device *dev)
				258	{
				259	rcu_assign_pointer(po->cached_dev, dev);
				260	}
				261
				262	static void packet_cached_dev_reset(struct packet_sock *po)
				263	{
				264	RCU_INIT_POINTER(po->cached_dev, NULL);
				265	}
				266
				267	static bool packet_use_direct_xmit(const struct packet_sock *po)
				268	{
				269	/* Paired with WRITE_ONCE() in packet_setsockopt() */
				270	return READ_ONCE(po->xmit) == packet_direct_xmit;
				271	}
				272
				273	static u16 packet_pick_tx_queue(struct sk_buff *skb)
				274	{
				275	struct net_device *dev = skb->dev;
				276	const struct net_device_ops *ops = dev->netdev_ops;
				277	int cpu = raw_smp_processor_id();
				278	u16 queue_index;
				279
				280	#ifdef CONFIG_XPS
				281	skb->sender_cpu = cpu + 1;
				282	#endif
				283	skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
				284	if (ops->ndo_select_queue) {
				285	queue_index = ops->ndo_select_queue(dev, skb, NULL);
				286	queue_index = netdev_cap_txqueue(dev, queue_index);
				287	} else {
				288	queue_index = netdev_pick_tx(dev, skb, NULL);
				289	}
				290
				291	return queue_index;
				292	}
				293
				294	/* __register_prot_hook must be invoked through register_prot_hook
				295	* or from a context in which asynchronous accesses to the packet
				296	* socket is not possible (packet_create()).
				297	*/
				298	static void __register_prot_hook(struct sock *sk)
				299	{
				300	struct packet_sock *po = pkt_sk(sk);
				301
				302	if (!po->running) {
				303	if (po->fanout)
				304	__fanout_link(sk, po);
				305	else
				306	dev_add_pack(&po->prot_hook);
				307
				308	sock_hold(sk);
				309	po->running = 1;
				310	}
				311	}
				312
				313	static void register_prot_hook(struct sock *sk)
				314	{
				315	lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
				316	__register_prot_hook(sk);
				317	}
				318
				319	/* If the sync parameter is true, we will temporarily drop
				320	* the po->bind_lock and do a synchronize_net to make sure no
				321	* asynchronous packet processing paths still refer to the elements
				322	* of po->prot_hook. If the sync parameter is false, it is the
				323	* callers responsibility to take care of this.
				324	*/
				325	static void __unregister_prot_hook(struct sock *sk, bool sync)
				326	{
				327	struct packet_sock *po = pkt_sk(sk);
				328
				329	lockdep_assert_held_once(&po->bind_lock);
				330
				331	po->running = 0;
				332
				333	if (po->fanout)
				334	__fanout_unlink(sk, po);
				335	else
				336	__dev_remove_pack(&po->prot_hook);
				337
				338	__sock_put(sk);
				339
				340	if (sync) {
				341	spin_unlock(&po->bind_lock);
				342	synchronize_net();
				343	spin_lock(&po->bind_lock);
				344	}
				345	}
				346
				347	static void unregister_prot_hook(struct sock *sk, bool sync)
				348	{
				349	struct packet_sock *po = pkt_sk(sk);
				350
				351	if (po->running)
				352	__unregister_prot_hook(sk, sync);
				353	}
				354
				355	static inline struct page * __pure pgv_to_page(void *addr)
				356	{
				357	if (is_vmalloc_addr(addr))
				358	return vmalloc_to_page(addr);
				359	return virt_to_page(addr);
				360	}
				361
				362	static void __packet_set_status(struct packet_sock po, void frame, int status)
				363	{
				364	union tpacket_uhdr h;
				365
				366	/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
				367
				368	h.raw = frame;
				369	switch (po->tp_version) {
				370	case TPACKET_V1:
				371	WRITE_ONCE(h.h1->tp_status, status);
				372	flush_dcache_page(pgv_to_page(&h.h1->tp_status));
				373	break;
				374	case TPACKET_V2:
				375	WRITE_ONCE(h.h2->tp_status, status);
				376	flush_dcache_page(pgv_to_page(&h.h2->tp_status));
				377	break;
				378	case TPACKET_V3:
				379	WRITE_ONCE(h.h3->tp_status, status);
				380	flush_dcache_page(pgv_to_page(&h.h3->tp_status));
				381	break;
				382	default:
				383	WARN(1, "TPACKET version not supported.\n");
				384	BUG();
				385	}
				386
				387	smp_wmb();
				388	}
				389
				390	static int __packet_get_status(const struct packet_sock po, void frame)
				391	{
				392	union tpacket_uhdr h;
				393
				394	smp_rmb();
				395
				396	/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
				397
				398	h.raw = frame;
				399	switch (po->tp_version) {
				400	case TPACKET_V1:
				401	flush_dcache_page(pgv_to_page(&h.h1->tp_status));
				402	return READ_ONCE(h.h1->tp_status);
				403	case TPACKET_V2:
				404	flush_dcache_page(pgv_to_page(&h.h2->tp_status));
				405	return READ_ONCE(h.h2->tp_status);
				406	case TPACKET_V3:
				407	flush_dcache_page(pgv_to_page(&h.h3->tp_status));
				408	return READ_ONCE(h.h3->tp_status);
				409	default:
				410	WARN(1, "TPACKET version not supported.\n");
				411	BUG();
				412	return 0;
				413	}
				414	}
				415
				416	static __u32 tpacket_get_timestamp(struct sk_buff skb, struct timespec64 ts,
				417	unsigned int flags)
				418	{
				419	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
				420
				421	if (shhwtstamps &&
				422	(flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
				423	ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
				424	return TP_STATUS_TS_RAW_HARDWARE;
				425
				426	if (ktime_to_timespec64_cond(skb->tstamp, ts))
				427	return TP_STATUS_TS_SOFTWARE;
				428
				429	return 0;
				430	}
				431
				432	static __u32 __packet_set_timestamp(struct packet_sock po, void frame,
				433	struct sk_buff *skb)
				434	{
				435	union tpacket_uhdr h;
				436	struct timespec64 ts;
				437	__u32 ts_status;
				438
				439	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
				440	return 0;
				441
				442	h.raw = frame;
				443	/*
				444	* versions 1 through 3 overflow the timestamps in y2106, since they
				445	* all store the seconds in a 32-bit unsigned integer.
				446	* If we create a version 4, that should have a 64-bit timestamp,
				447	* either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
				448	* nanoseconds.
				449	*/
				450	switch (po->tp_version) {
				451	case TPACKET_V1:
				452	h.h1->tp_sec = ts.tv_sec;
				453	h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
				454	break;
				455	case TPACKET_V2:
				456	h.h2->tp_sec = ts.tv_sec;
				457	h.h2->tp_nsec = ts.tv_nsec;
				458	break;
				459	case TPACKET_V3:
				460	h.h3->tp_sec = ts.tv_sec;
				461	h.h3->tp_nsec = ts.tv_nsec;
				462	break;
				463	default:
				464	WARN(1, "TPACKET version not supported.\n");
				465	BUG();
				466	}
				467
				468	/* one flush is safe, as both fields always lie on the same cacheline */
				469	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
				470	smp_wmb();
				471
				472	return ts_status;
				473	}
				474
				475	static void packet_lookup_frame(const struct packet_sock po,
				476	const struct packet_ring_buffer *rb,
				477	unsigned int position,
				478	int status)
				479	{
				480	unsigned int pg_vec_pos, frame_offset;
				481	union tpacket_uhdr h;
				482
				483	pg_vec_pos = position / rb->frames_per_block;
				484	frame_offset = position % rb->frames_per_block;
				485
				486	h.raw = rb->pg_vec[pg_vec_pos].buffer +
				487	(frame_offset * rb->frame_size);
				488
				489	if (status != __packet_get_status(po, h.raw))
				490	return NULL;
				491
				492	return h.raw;
				493	}
				494
				495	static void packet_current_frame(struct packet_sock po,
				496	struct packet_ring_buffer *rb,
				497	int status)
				498	{
				499	return packet_lookup_frame(po, rb, rb->head, status);
				500	}
				501
				502	static u16 vlan_get_tci(const struct sk_buff skb, struct net_device dev)
				503	{
				504	struct vlan_hdr vhdr, *vh;
				505	unsigned int header_len;
				506
				507	if (!dev)
				508	return 0;
				509
				510	/* In the SOCK_DGRAM scenario, skb data starts at the network
				511	* protocol, which is after the VLAN headers. The outer VLAN
				512	* header is at the hard_header_len offset in non-variable
				513	* length link layer headers. If it's a VLAN device, the
				514	* min_header_len should be used to exclude the VLAN header
				515	* size.
				516	*/
				517	if (dev->min_header_len == dev->hard_header_len)
				518	header_len = dev->hard_header_len;
				519	else if (is_vlan_dev(dev))
				520	header_len = dev->min_header_len;
				521	else
				522	return 0;
				523
				524	vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len,
				525	sizeof(vhdr), &vhdr);
				526	if (unlikely(!vh))
				527	return 0;
				528
				529	return ntohs(vh->h_vlan_TCI);
				530	}
				531
				532	static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
				533	{
				534	__be16 proto = skb->protocol;
				535
				536	if (unlikely(eth_type_vlan(proto)))
				537	proto = __vlan_get_protocol_offset(skb, proto,
				538	skb_mac_offset(skb), NULL);
				539
				540	return proto;
				541	}
				542
				543	static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
				544	{
				545	del_timer_sync(&pkc->retire_blk_timer);
				546	}
				547
				548	static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
				549	struct sk_buff_head *rb_queue)
				550	{
				551	struct tpacket_kbdq_core *pkc;
				552
				553	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
				554
				555	spin_lock_bh(&rb_queue->lock);
				556	pkc->delete_blk_timer = 1;
				557	spin_unlock_bh(&rb_queue->lock);
				558
				559	prb_del_retire_blk_timer(pkc);
				560	}
				561
				562	static void prb_setup_retire_blk_timer(struct packet_sock *po)
				563	{
				564	struct tpacket_kbdq_core *pkc;
				565
				566	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
				567	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
				568	0);
				569	pkc->retire_blk_timer.expires = jiffies;
				570	}
				571
				572	static int prb_calc_retire_blk_tmo(struct packet_sock *po,
				573	int blk_size_in_bytes)
				574	{
				575	struct net_device *dev;
				576	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
				577	struct ethtool_link_ksettings ecmd;
				578	int err;
				579
				580	rtnl_lock();
				581	dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
				582	if (unlikely(!dev)) {
				583	rtnl_unlock();
				584	return DEFAULT_PRB_RETIRE_TOV;
				585	}
				586	err = __ethtool_get_link_ksettings(dev, &ecmd);
				587	rtnl_unlock();
				588	if (!err) {
				589	/*
				590	* If the link speed is so slow you don't really
				591	* need to worry about perf anyways
				592	*/
				593	if (ecmd.base.speed < SPEED_1000 \|\|
				594	ecmd.base.speed == SPEED_UNKNOWN) {
				595	return DEFAULT_PRB_RETIRE_TOV;
				596	} else {
				597	msec = 1;
				598	div = ecmd.base.speed / 1000;
				599	}
				600	} else
				601	return DEFAULT_PRB_RETIRE_TOV;
				602
				603	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
				604
				605	if (div)
				606	mbits /= div;
				607
				608	tmo = mbits * msec;
				609
				610	if (div)
				611	return tmo+1;
				612	return tmo;
				613	}
				614
				615	static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
				616	union tpacket_req_u *req_u)
				617	{
				618	p1->feature_req_word = req_u->req3.tp_feature_req_word;
				619	}
				620
				621	static void init_prb_bdqc(struct packet_sock *po,
				622	struct packet_ring_buffer *rb,
				623	struct pgv *pg_vec,
				624	union tpacket_req_u *req_u)
				625	{
				626	struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
				627	struct tpacket_block_desc *pbd;
				628
				629	memset(p1, 0x0, sizeof(*p1));
				630
				631	p1->knxt_seq_num = 1;
				632	p1->pkbdq = pg_vec;
				633	pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
				634	p1->pkblk_start = pg_vec[0].buffer;
				635	p1->kblk_size = req_u->req3.tp_block_size;
				636	p1->knum_blocks = req_u->req3.tp_block_nr;
				637	p1->hdrlen = po->tp_hdrlen;
				638	p1->version = po->tp_version;
				639	p1->last_kactive_blk_num = 0;
				640	po->stats.stats3.tp_freeze_q_cnt = 0;
				641	if (req_u->req3.tp_retire_blk_tov)
				642	p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
				643	else
				644	p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
				645	req_u->req3.tp_block_size);
				646	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
				647	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
				648
				649	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
				650	prb_init_ft_ops(p1, req_u);
				651	prb_setup_retire_blk_timer(po);
				652	prb_open_block(p1, pbd);
				653	}
				654
				655	/* Do NOT update the last_blk_num first.
				656	* Assumes sk_buff_head lock is held.
				657	*/
				658	static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
				659	{
				660	mod_timer(&pkc->retire_blk_timer,
				661	jiffies + pkc->tov_in_jiffies);
				662	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
				663	}
				664
				665	/*
				666	* Timer logic:
				667	* 1) We refresh the timer only when we open a block.
				668	* By doing this we don't waste cycles refreshing the timer
				669	* on packet-by-packet basis.
				670	*
				671	* With a 1MB block-size, on a 1Gbps line, it will take
				672	* i) ~8 ms to fill a block + ii) memcpy etc.
				673	* In this cut we are not accounting for the memcpy time.
				674	*
				675	* So, if the user sets the 'tmo' to 10ms then the timer
				676	* will never fire while the block is still getting filled
				677	* (which is what we want). However, the user could choose
				678	* to close a block early and that's fine.
				679	*
				680	* But when the timer does fire, we check whether or not to refresh it.
				681	* Since the tmo granularity is in msecs, it is not too expensive
				682	* to refresh the timer, lets say every '8' msecs.
				683	* Either the user can set the 'tmo' or we can derive it based on
				684	* a) line-speed and b) block-size.
				685	* prb_calc_retire_blk_tmo() calculates the tmo.
				686	*
				687	*/
				688	static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
				689	{
				690	struct packet_sock *po =
				691	from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
				692	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
				693	unsigned int frozen;
				694	struct tpacket_block_desc *pbd;
				695
				696	spin_lock(&po->sk.sk_receive_queue.lock);
				697
				698	frozen = prb_queue_frozen(pkc);
				699	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
				700
				701	if (unlikely(pkc->delete_blk_timer))
				702	goto out;
				703
				704	/* We only need to plug the race when the block is partially filled.
				705	* tpacket_rcv:
				706	* lock(); increment BLOCK_NUM_PKTS; unlock()
				707	* copy_bits() is in progress ...
				708	* timer fires on other cpu:
				709	* we can't retire the current block because copy_bits
				710	* is in progress.
				711	*
				712	*/
				713	if (BLOCK_NUM_PKTS(pbd)) {
				714	while (atomic_read(&pkc->blk_fill_in_prog)) {
				715	/* Waiting for skb_copy_bits to finish... */
				716	cpu_relax();
				717	}
				718	}
				719
				720	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
				721	if (!frozen) {
				722	if (!BLOCK_NUM_PKTS(pbd)) {
				723	/* An empty block. Just refresh the timer. */
				724	goto refresh_timer;
				725	}
				726	prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
				727	if (!prb_dispatch_next_block(pkc, po))
				728	goto refresh_timer;
				729	else
				730	goto out;
				731	} else {
				732	/* Case 1. Queue was frozen because user-space was
				733	* lagging behind.
				734	*/
				735	if (prb_curr_blk_in_use(pbd)) {
				736	/*
				737	* Ok, user-space is still behind.
				738	* So just refresh the timer.
				739	*/
				740	goto refresh_timer;
				741	} else {
				742	/* Case 2. queue was frozen,user-space caught up,
				743	* now the link went idle && the timer fired.
				744	* We don't have a block to close.So we open this
				745	* block and restart the timer.
				746	* opening a block thaws the queue,restarts timer
				747	* Thawing/timer-refresh is a side effect.
				748	*/
				749	prb_open_block(pkc, pbd);
				750	goto out;
				751	}
				752	}
				753	}
				754
				755	refresh_timer:
				756	_prb_refresh_rx_retire_blk_timer(pkc);
				757
				758	out:
				759	spin_unlock(&po->sk.sk_receive_queue.lock);
				760	}
				761
				762	static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
				763	struct tpacket_block_desc *pbd1, __u32 status)
				764	{
				765	/* Flush everything minus the block header */
				766
				767	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
				768	u8 start, end;
				769
				770	start = (u8 *)pbd1;
				771
				772	/* Skip the block header(we know header WILL fit in 4K) */
				773	start += PAGE_SIZE;
				774
				775	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
				776	for (; start < end; start += PAGE_SIZE)
				777	flush_dcache_page(pgv_to_page(start));
				778
				779	smp_wmb();
				780	#endif
				781
				782	/* Now update the block status. */
				783
				784	BLOCK_STATUS(pbd1) = status;
				785
				786	/* Flush the block header */
				787
				788	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
				789	start = (u8 *)pbd1;
				790	flush_dcache_page(pgv_to_page(start));
				791
				792	smp_wmb();
				793	#endif
				794	}
				795
				796	/*
				797	* Side effect:
				798	*
				799	* 1) flush the block
				800	* 2) Increment active_blk_num
				801	*
				802	* Note:We DONT refresh the timer on purpose.
				803	* Because almost always the next block will be opened.
				804	*/
				805	static void prb_close_block(struct tpacket_kbdq_core *pkc1,
				806	struct tpacket_block_desc *pbd1,
				807	struct packet_sock *po, unsigned int stat)
				808	{
				809	__u32 status = TP_STATUS_USER \| stat;
				810
				811	struct tpacket3_hdr *last_pkt;
				812	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
				813	struct sock *sk = &po->sk;
				814
				815	if (atomic_read(&po->tp_drops))
				816	status \|= TP_STATUS_LOSING;
				817
				818	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
				819	last_pkt->tp_next_offset = 0;
				820
				821	/* Get the ts of the last pkt */
				822	if (BLOCK_NUM_PKTS(pbd1)) {
				823	h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
				824	h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
				825	} else {
				826	/* Ok, we tmo'd - so get the current time.
				827	*
				828	* It shouldn't really happen as we don't close empty
				829	* blocks. See prb_retire_rx_blk_timer_expired().
				830	*/
				831	struct timespec64 ts;
				832	ktime_get_real_ts64(&ts);
				833	h1->ts_last_pkt.ts_sec = ts.tv_sec;
				834	h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
				835	}
				836
				837	smp_wmb();
				838
				839	/* Flush the block */
				840	prb_flush_block(pkc1, pbd1, status);
				841
				842	sk->sk_data_ready(sk);
				843
				844	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
				845	}
				846
				847	static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
				848	{
				849	pkc->reset_pending_on_curr_blk = 0;
				850	}
				851
				852	/*
				853	* Side effect of opening a block:
				854	*
				855	* 1) prb_queue is thawed.
				856	* 2) retire_blk_timer is refreshed.
				857	*
				858	*/
				859	static void prb_open_block(struct tpacket_kbdq_core *pkc1,
				860	struct tpacket_block_desc *pbd1)
				861	{
				862	struct timespec64 ts;
				863	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
				864
				865	smp_rmb();
				866
				867	/* We could have just memset this but we will lose the
				868	* flexibility of making the priv area sticky
				869	*/
				870
				871	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
				872	BLOCK_NUM_PKTS(pbd1) = 0;
				873	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
				874
				875	ktime_get_real_ts64(&ts);
				876
				877	h1->ts_first_pkt.ts_sec = ts.tv_sec;
				878	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
				879
				880	pkc1->pkblk_start = (char *)pbd1;
				881	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
				882
				883	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
				884	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
				885
				886	pbd1->version = pkc1->version;
				887	pkc1->prev = pkc1->nxt_offset;
				888	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
				889
				890	prb_thaw_queue(pkc1);
				891	_prb_refresh_rx_retire_blk_timer(pkc1);
				892
				893	smp_wmb();
				894	}
				895
				896	/*
				897	* Queue freeze logic:
				898	* 1) Assume tp_block_nr = 8 blocks.
				899	* 2) At time 't0', user opens Rx ring.
				900	* 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
				901	* 4) user-space is either sleeping or processing block '0'.
				902	* 5) tpacket_rcv is currently filling block '7', since there is no space left,
				903	* it will close block-7,loop around and try to fill block '0'.
				904	* call-flow:
				905	* __packet_lookup_frame_in_block
				906	* prb_retire_current_block()
				907	* prb_dispatch_next_block()
				908	* \|->(BLOCK_STATUS == USER) evaluates to true
				909	* 5.1) Since block-0 is currently in-use, we just freeze the queue.
				910	* 6) Now there are two cases:
				911	* 6.1) Link goes idle right after the queue is frozen.
				912	* But remember, the last open_block() refreshed the timer.
				913	* When this timer expires,it will refresh itself so that we can
				914	* re-open block-0 in near future.
				915	* 6.2) Link is busy and keeps on receiving packets. This is a simple
				916	* case and __packet_lookup_frame_in_block will check if block-0
				917	* is free and can now be re-used.
				918	*/
				919	static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
				920	struct packet_sock *po)
				921	{
				922	pkc->reset_pending_on_curr_blk = 1;
				923	po->stats.stats3.tp_freeze_q_cnt++;
				924	}
				925
				926	#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
				927
				928	/*
				929	* If the next block is free then we will dispatch it
				930	* and return a good offset.
				931	* Else, we will freeze the queue.
				932	* So, caller must check the return value.
				933	*/
				934	static void prb_dispatch_next_block(struct tpacket_kbdq_core pkc,
				935	struct packet_sock *po)
				936	{
				937	struct tpacket_block_desc *pbd;
				938
				939	smp_rmb();
				940
				941	/* 1. Get current block num */
				942	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
				943
				944	/* 2. If this block is currently in_use then freeze the queue */
				945	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
				946	prb_freeze_queue(pkc, po);
				947	return NULL;
				948	}
				949
				950	/*
				951	* 3.
				952	* open this block and return the offset where the first packet
				953	* needs to get stored.
				954	*/
				955	prb_open_block(pkc, pbd);
				956	return (void *)pkc->nxt_offset;
				957	}
				958
				959	static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
				960	struct packet_sock *po, unsigned int status)
				961	{
				962	struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
				963
				964	/* retire/close the current block */
				965	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
				966	/*
				967	* Plug the case where copy_bits() is in progress on
				968	* cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
				969	* have space to copy the pkt in the current block and
				970	* called prb_retire_current_block()
				971	*
				972	* We don't need to worry about the TMO case because
				973	* the timer-handler already handled this case.
				974	*/
				975	if (!(status & TP_STATUS_BLK_TMO)) {
				976	while (atomic_read(&pkc->blk_fill_in_prog)) {
				977	/* Waiting for skb_copy_bits to finish... */
				978	cpu_relax();
				979	}
				980	}
				981	prb_close_block(pkc, pbd, po, status);
				982	return;
				983	}
				984	}
				985
				986	static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
				987	{
				988	return TP_STATUS_USER & BLOCK_STATUS(pbd);
				989	}
				990
				991	static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
				992	{
				993	return pkc->reset_pending_on_curr_blk;
				994	}
				995
				996	static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
				997	__releases(&pkc->blk_fill_in_prog_lock)
				998	{
				999	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
				1000	atomic_dec(&pkc->blk_fill_in_prog);
				1001	}
				1002
				1003	static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
				1004	struct tpacket3_hdr *ppd)
				1005	{
				1006	ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
				1007	}
				1008
				1009	static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
				1010	struct tpacket3_hdr *ppd)
				1011	{
				1012	ppd->hv1.tp_rxhash = 0;
				1013	}
				1014
				1015	static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
				1016	struct tpacket3_hdr *ppd)
				1017	{
				1018	struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);
				1019
				1020	if (skb_vlan_tag_present(pkc->skb)) {
				1021	ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
				1022	ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
				1023	ppd->tp_status = TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				1024	} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
				1025	ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
				1026	ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
				1027	ppd->tp_status = TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				1028	} else {
				1029	ppd->hv1.tp_vlan_tci = 0;
				1030	ppd->hv1.tp_vlan_tpid = 0;
				1031	ppd->tp_status = TP_STATUS_AVAILABLE;
				1032	}
				1033	}
				1034
				1035	static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
				1036	struct tpacket3_hdr *ppd)
				1037	{
				1038	ppd->hv1.tp_padding = 0;
				1039	prb_fill_vlan_info(pkc, ppd);
				1040
				1041	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
				1042	prb_fill_rxhash(pkc, ppd);
				1043	else
				1044	prb_clear_rxhash(pkc, ppd);
				1045	}
				1046
				1047	static void prb_fill_curr_block(char *curr,
				1048	struct tpacket_kbdq_core *pkc,
				1049	struct tpacket_block_desc *pbd,
				1050	unsigned int len)
				1051	__acquires(&pkc->blk_fill_in_prog_lock)
				1052	{
				1053	struct tpacket3_hdr *ppd;
				1054
				1055	ppd = (struct tpacket3_hdr *)curr;
				1056	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
				1057	pkc->prev = curr;
				1058	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
				1059	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
				1060	BLOCK_NUM_PKTS(pbd) += 1;
				1061	atomic_inc(&pkc->blk_fill_in_prog);
				1062	prb_run_all_ft_ops(pkc, ppd);
				1063	}
				1064
				1065	/* Assumes caller has the sk->rx_queue.lock */
				1066	static void __packet_lookup_frame_in_block(struct packet_sock po,
				1067	struct sk_buff *skb,
				1068	unsigned int len
				1069	)
				1070	{
				1071	struct tpacket_kbdq_core *pkc;
				1072	struct tpacket_block_desc *pbd;
				1073	char curr, end;
				1074
				1075	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
				1076	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
				1077
				1078	/* Queue is frozen when user space is lagging behind */
				1079	if (prb_queue_frozen(pkc)) {
				1080	/*
				1081	* Check if that last block which caused the queue to freeze,
				1082	* is still in_use by user-space.
				1083	*/
				1084	if (prb_curr_blk_in_use(pbd)) {
				1085	/* Can't record this packet */
				1086	return NULL;
				1087	} else {
				1088	/*
				1089	* Ok, the block was released by user-space.
				1090	* Now let's open that block.
				1091	* opening a block also thaws the queue.
				1092	* Thawing is a side effect.
				1093	*/
				1094	prb_open_block(pkc, pbd);
				1095	}
				1096	}
				1097
				1098	smp_mb();
				1099	curr = pkc->nxt_offset;
				1100	pkc->skb = skb;
				1101	end = (char *)pbd + pkc->kblk_size;
				1102
				1103	/* first try the current block */
				1104	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
				1105	prb_fill_curr_block(curr, pkc, pbd, len);
				1106	return (void *)curr;
				1107	}
				1108
				1109	/* Ok, close the current block */
				1110	prb_retire_current_block(pkc, po, 0);
				1111
				1112	/* Now, try to dispatch the next block */
				1113	curr = (char *)prb_dispatch_next_block(pkc, po);
				1114	if (curr) {
				1115	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
				1116	prb_fill_curr_block(curr, pkc, pbd, len);
				1117	return (void *)curr;
				1118	}
				1119
				1120	/*
				1121	* No free blocks are available.user_space hasn't caught up yet.
				1122	* Queue was just frozen and now this packet will get dropped.
				1123	*/
				1124	return NULL;
				1125	}
				1126
				1127	static void packet_current_rx_frame(struct packet_sock po,
				1128	struct sk_buff *skb,
				1129	int status, unsigned int len)
				1130	{
				1131	char *curr = NULL;
				1132	switch (po->tp_version) {
				1133	case TPACKET_V1:
				1134	case TPACKET_V2:
				1135	curr = packet_lookup_frame(po, &po->rx_ring,
				1136	po->rx_ring.head, status);
				1137	return curr;
				1138	case TPACKET_V3:
				1139	return __packet_lookup_frame_in_block(po, skb, len);
				1140	default:
				1141	WARN(1, "TPACKET version not supported\n");
				1142	BUG();
				1143	return NULL;
				1144	}
				1145	}
				1146
				1147	static void prb_lookup_block(const struct packet_sock po,
				1148	const struct packet_ring_buffer *rb,
				1149	unsigned int idx,
				1150	int status)
				1151	{
				1152	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
				1153	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
				1154
				1155	if (status != BLOCK_STATUS(pbd))
				1156	return NULL;
				1157	return pbd;
				1158	}
				1159
				1160	static int prb_previous_blk_num(struct packet_ring_buffer *rb)
				1161	{
				1162	unsigned int prev;
				1163	if (rb->prb_bdqc.kactive_blk_num)
				1164	prev = rb->prb_bdqc.kactive_blk_num-1;
				1165	else
				1166	prev = rb->prb_bdqc.knum_blocks-1;
				1167	return prev;
				1168	}
				1169
				/*
				 * Look up the block preceding the active one and return its descriptor if
				 * it has the given @status, NULL otherwise.
				 *
				 * Assumes caller has held the rx_queue.lock.
				 */
				static void *__prb_previous_block(struct packet_sock *po,
								  struct packet_ring_buffer *rb,
								  int status)
				{
					unsigned int previous = prb_previous_blk_num(rb);

					return prb_lookup_block(po, rb, previous, status);
				}
				1178
				1179	static void packet_previous_rx_frame(struct packet_sock po,
				1180	struct packet_ring_buffer *rb,
				1181	int status)
				1182	{
				1183	if (po->tp_version <= TPACKET_V2)
				1184	return packet_previous_frame(po, rb, status);
				1185
				1186	return __prb_previous_block(po, rb, status);
				1187	}
				1188
				1189	static void packet_increment_rx_head(struct packet_sock *po,
				1190	struct packet_ring_buffer *rb)
				1191	{
				1192	switch (po->tp_version) {
				1193	case TPACKET_V1:
				1194	case TPACKET_V2:
				1195	return packet_increment_head(rb);
				1196	case TPACKET_V3:
				1197	default:
				1198	WARN(1, "TPACKET version not supported.\n");
				1199	BUG();
				1200	return;
				1201	}
				1202	}
				1203
				1204	static void packet_previous_frame(struct packet_sock po,
				1205	struct packet_ring_buffer *rb,
				1206	int status)
				1207	{
				1208	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
				1209	return packet_lookup_frame(po, rb, previous, status);
				1210	}
				1211
				1212	static void packet_increment_head(struct packet_ring_buffer *buff)
				1213	{
				1214	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
				1215	}
				1216
				1217	static void packet_inc_pending(struct packet_ring_buffer *rb)
				1218	{
				1219	this_cpu_inc(*rb->pending_refcnt);
				1220	}
				1221
				1222	static void packet_dec_pending(struct packet_ring_buffer *rb)
				1223	{
				1224	this_cpu_dec(*rb->pending_refcnt);
				1225	}
				1226
				1227	static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
				1228	{
				1229	unsigned int refcnt = 0;
				1230	int cpu;
				1231
				1232	/* We don't use pending refcount in rx_ring. */
				1233	if (rb->pending_refcnt == NULL)
				1234	return 0;
				1235
				1236	for_each_possible_cpu(cpu)
				1237	refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
				1238
				1239	return refcnt;
				1240	}
				1241
				1242	static int packet_alloc_pending(struct packet_sock *po)
				1243	{
				1244	po->rx_ring.pending_refcnt = NULL;
				1245
				1246	po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
				1247	if (unlikely(po->tx_ring.pending_refcnt == NULL))
				1248	return -ENOBUFS;
				1249
				1250	return 0;
				1251	}
				1252
				1253	static void packet_free_pending(struct packet_sock *po)
				1254	{
				1255	free_percpu(po->tx_ring.pending_refcnt);
				1256	}
				1257
				/*
				 * Receive room levels returned by the *_has_room() helpers.
				 * ROOM_POW_OFF is the shift that defines the "normal" threshold: a
				 * quarter of the ring or of the receive buffer must still be free.
				 */
				#define ROOM_POW_OFF	2
				#define ROOM_NONE	0x0
				#define ROOM_LOW	0x1
				#define ROOM_NORMAL	0x2
				1262
				1263	static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
				1264	{
				1265	int idx, len;
				1266
				1267	len = READ_ONCE(po->rx_ring.frame_max) + 1;
				1268	idx = READ_ONCE(po->rx_ring.head);
				1269	if (pow_off)
				1270	idx += len >> pow_off;
				1271	if (idx >= len)
				1272	idx -= len;
				1273	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
				1274	}
				1275
				1276	static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
				1277	{
				1278	int idx, len;
				1279
				1280	len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
				1281	idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
				1282	if (pow_off)
				1283	idx += len >> pow_off;
				1284	if (idx >= len)
				1285	idx -= len;
				1286	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
				1287	}
				1288
				1289	static int __packet_rcv_has_room(const struct packet_sock *po,
				1290	const struct sk_buff *skb)
				1291	{
				1292	const struct sock *sk = &po->sk;
				1293	int ret = ROOM_NONE;
				1294
				1295	if (po->prot_hook.func != tpacket_rcv) {
				1296	int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
				1297	int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
				1298	- (skb ? skb->truesize : 0);
				1299
				1300	if (avail > (rcvbuf >> ROOM_POW_OFF))
				1301	return ROOM_NORMAL;
				1302	else if (avail > 0)
				1303	return ROOM_LOW;
				1304	else
				1305	return ROOM_NONE;
				1306	}
				1307
				1308	if (po->tp_version == TPACKET_V3) {
				1309	if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
				1310	ret = ROOM_NORMAL;
				1311	else if (__tpacket_v3_has_room(po, 0))
				1312	ret = ROOM_LOW;
				1313	} else {
				1314	if (__tpacket_has_room(po, ROOM_POW_OFF))
				1315	ret = ROOM_NORMAL;
				1316	else if (__tpacket_has_room(po, 0))
				1317	ret = ROOM_LOW;
				1318	}
				1319
				1320	return ret;
				1321	}
				1322
				1323	static int packet_rcv_has_room(struct packet_sock po, struct sk_buff skb)
				1324	{
				1325	int pressure, ret;
				1326
				1327	ret = __packet_rcv_has_room(po, skb);
				1328	pressure = ret != ROOM_NORMAL;
				1329
				1330	if (READ_ONCE(po->pressure) != pressure)
				1331	WRITE_ONCE(po->pressure, pressure);
				1332
				1333	return ret;
				1334	}
				1335
				1336	static void packet_rcv_try_clear_pressure(struct packet_sock *po)
				1337	{
				1338	if (READ_ONCE(po->pressure) &&
				1339	__packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
				1340	WRITE_ONCE(po->pressure, 0);
				1341	}
				1342
				1343	static void packet_sock_destruct(struct sock *sk)
				1344	{
				1345	skb_queue_purge(&sk->sk_error_queue);
				1346
				1347	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
				1348	WARN_ON(refcount_read(&sk->sk_wmem_alloc));
				1349
				1350	if (!sock_flag(sk, SOCK_DEAD)) {
				1351	pr_err("Attempt to release alive packet socket: %p\n", sk);
				1352	return;
				1353	}
				1354
				1355	sk_refcnt_debug_dec(sk);
				1356	}
				1357
				1358	static bool fanout_flow_is_huge(struct packet_sock po, struct sk_buff skb)
				1359	{
				1360	u32 *history = po->rollover->history;
				1361	u32 victim, rxhash;
				1362	int i, count = 0;
				1363
				1364	rxhash = skb_get_hash(skb);
				1365	for (i = 0; i < ROLLOVER_HLEN; i++)
				1366	if (READ_ONCE(history[i]) == rxhash)
				1367	count++;
				1368
				1369	victim = prandom_u32() % ROLLOVER_HLEN;
				1370
				1371	/* Avoid dirtying the cache line if possible */
				1372	if (READ_ONCE(history[victim]) != rxhash)
				1373	WRITE_ONCE(history[victim], rxhash);
				1374
				1375	return count > (ROLLOVER_HLEN >> 1);
				1376	}
				1377
				/* PACKET_FANOUT_HASH: scale the symmetric flow hash into [0, @num). */
				static unsigned int fanout_demux_hash(struct packet_fanout *f,
								      struct sk_buff *skb,
								      unsigned int num)
				{
					return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
				}
				1384
				1385	static unsigned int fanout_demux_lb(struct packet_fanout *f,
				1386	struct sk_buff *skb,
				1387	unsigned int num)
				1388	{
				1389	unsigned int val = atomic_inc_return(&f->rr_cur);
				1390
				1391	return val % num;
				1392	}
				1393
				/* PACKET_FANOUT_CPU: pick the member by the id of the executing cpu. */
				static unsigned int fanout_demux_cpu(struct packet_fanout *f,
								     struct sk_buff *skb,
								     unsigned int num)
				{
					return smp_processor_id() % num;
				}
				1400
				/* PACKET_FANOUT_RND: pick a pseudo-random member below @num. */
				static unsigned int fanout_demux_rnd(struct packet_fanout *f,
								     struct sk_buff *skb,
								     unsigned int num)
				{
					return prandom_u32_max(num);
				}
				1407
				1408	static unsigned int fanout_demux_rollover(struct packet_fanout *f,
				1409	struct sk_buff *skb,
				1410	unsigned int idx, bool try_self,
				1411	unsigned int num)
				1412	{
				1413	struct packet_sock po, po_next, *po_skip = NULL;
				1414	unsigned int i, j, room = ROOM_NONE;
				1415
				1416	po = pkt_sk(f->arr[idx]);
				1417
				1418	if (try_self) {
				1419	room = packet_rcv_has_room(po, skb);
				1420	if (room == ROOM_NORMAL \|\|
				1421	(room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
				1422	return idx;
				1423	po_skip = po;
				1424	}
				1425
				1426	i = j = min_t(int, po->rollover->sock, num - 1);
				1427	do {
				1428	po_next = pkt_sk(f->arr[i]);
				1429	if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
				1430	packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
				1431	if (i != j)
				1432	po->rollover->sock = i;
				1433	atomic_long_inc(&po->rollover->num);
				1434	if (room == ROOM_LOW)
				1435	atomic_long_inc(&po->rollover->num_huge);
				1436	return i;
				1437	}
				1438
				1439	if (++i == num)
				1440	i = 0;
				1441	} while (i != j);
				1442
				1443	atomic_long_inc(&po->rollover->num_failed);
				1444	return idx;
				1445	}
				1446
				/* PACKET_FANOUT_QM: pick the member by the skb queue mapping. */
				static unsigned int fanout_demux_qm(struct packet_fanout *f,
								    struct sk_buff *skb,
								    unsigned int num)
				{
					return skb_get_queue_mapping(skb) % num;
				}
				1453
				1454	static unsigned int fanout_demux_bpf(struct packet_fanout *f,
				1455	struct sk_buff *skb,
				1456	unsigned int num)
				1457	{
				1458	struct bpf_prog *prog;
				1459	unsigned int ret = 0;
				1460
				1461	rcu_read_lock();
				1462	prog = rcu_dereference(f->bpf_prog);
				1463	if (prog)
				1464	ret = bpf_prog_run_clear_cb(prog, skb) % num;
				1465	rcu_read_unlock();
				1466
				1467	return ret;
				1468	}
				1469
				1470	static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
				1471	{
				1472	return f->flags & (flag >> 8);
				1473	}
				1474
				1475	static int packet_rcv_fanout(struct sk_buff skb, struct net_device dev,
				1476	struct packet_type pt, struct net_device orig_dev)
				1477	{
				1478	struct packet_fanout *f = pt->af_packet_priv;
				1479	unsigned int num = READ_ONCE(f->num_members);
				1480	struct net *net = read_pnet(&f->net);
				1481	struct packet_sock *po;
				1482	unsigned int idx;
				1483
				1484	if (!net_eq(dev_net(dev), net) \|\| !num) {
				1485	kfree_skb(skb);
				1486	return 0;
				1487	}
				1488
				1489	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
				1490	skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
				1491	if (!skb)
				1492	return 0;
				1493	}
				1494	switch (f->type) {
				1495	case PACKET_FANOUT_HASH:
				1496	default:
				1497	idx = fanout_demux_hash(f, skb, num);
				1498	break;
				1499	case PACKET_FANOUT_LB:
				1500	idx = fanout_demux_lb(f, skb, num);
				1501	break;
				1502	case PACKET_FANOUT_CPU:
				1503	idx = fanout_demux_cpu(f, skb, num);
				1504	break;
				1505	case PACKET_FANOUT_RND:
				1506	idx = fanout_demux_rnd(f, skb, num);
				1507	break;
				1508	case PACKET_FANOUT_QM:
				1509	idx = fanout_demux_qm(f, skb, num);
				1510	break;
				1511	case PACKET_FANOUT_ROLLOVER:
				1512	idx = fanout_demux_rollover(f, skb, 0, false, num);
				1513	break;
				1514	case PACKET_FANOUT_CBPF:
				1515	case PACKET_FANOUT_EBPF:
				1516	idx = fanout_demux_bpf(f, skb, num);
				1517	break;
				1518	}
				1519
				1520	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
				1521	idx = fanout_demux_rollover(f, skb, idx, true, num);
				1522
				1523	po = pkt_sk(f->arr[idx]);
				1524	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
				1525	}
				1526
				1527	DEFINE_MUTEX(fanout_mutex);
				1528	EXPORT_SYMBOL_GPL(fanout_mutex);
				1529	static LIST_HEAD(fanout_list);
				1530	static u16 fanout_next_id;
				1531
				1532	static void __fanout_link(struct sock sk, struct packet_sock po)
				1533	{
				1534	struct packet_fanout *f = po->fanout;
				1535
				1536	spin_lock(&f->lock);
				1537	f->arr[f->num_members] = sk;
				1538	smp_wmb();
				1539	f->num_members++;
				1540	if (f->num_members == 1)
				1541	dev_add_pack(&f->prot_hook);
				1542	spin_unlock(&f->lock);
				1543	}
				1544
				1545	static void __fanout_unlink(struct sock sk, struct packet_sock po)
				1546	{
				1547	struct packet_fanout *f = po->fanout;
				1548	int i;
				1549
				1550	spin_lock(&f->lock);
				1551	for (i = 0; i < f->num_members; i++) {
				1552	if (f->arr[i] == sk)
				1553	break;
				1554	}
				1555	BUG_ON(i >= f->num_members);
				1556	f->arr[i] = f->arr[f->num_members - 1];
				1557	f->num_members--;
				1558	if (f->num_members == 0)
				1559	__dev_remove_pack(&f->prot_hook);
				1560	spin_unlock(&f->lock);
				1561	}
				1562
				1563	static bool match_fanout_group(struct packet_type ptype, struct sock sk)
				1564	{
				1565	if (sk->sk_family != PF_PACKET)
				1566	return false;
				1567
				1568	return ptype->af_packet_priv == pkt_sk(sk)->fanout;
				1569	}
				1570
				1571	static void fanout_init_data(struct packet_fanout *f)
				1572	{
				1573	switch (f->type) {
				1574	case PACKET_FANOUT_LB:
				1575	atomic_set(&f->rr_cur, 0);
				1576	break;
				1577	case PACKET_FANOUT_CBPF:
				1578	case PACKET_FANOUT_EBPF:
				1579	RCU_INIT_POINTER(f->bpf_prog, NULL);
				1580	break;
				1581	}
				1582	}
				1583
				1584	static void __fanout_set_data_bpf(struct packet_fanout f, struct bpf_prog new)
				1585	{
				1586	struct bpf_prog *old;
				1587
				1588	spin_lock(&f->lock);
				1589	old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
				1590	rcu_assign_pointer(f->bpf_prog, new);
				1591	spin_unlock(&f->lock);
				1592
				1593	if (old) {
				1594	synchronize_net();
				1595	bpf_prog_destroy(old);
				1596	}
				1597	}
				1598
				1599	static int fanout_set_data_cbpf(struct packet_sock po, char __user data,
				1600	unsigned int len)
				1601	{
				1602	struct bpf_prog *new;
				1603	struct sock_fprog fprog;
				1604	int ret;
				1605
				1606	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
				1607	return -EPERM;
				1608	if (len != sizeof(fprog))
				1609	return -EINVAL;
				1610	if (copy_from_user(&fprog, data, len))
				1611	return -EFAULT;
				1612
				1613	ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
				1614	if (ret)
				1615	return ret;
				1616
				1617	__fanout_set_data_bpf(po->fanout, new);
				1618	return 0;
				1619	}
				1620
				1621	static int fanout_set_data_ebpf(struct packet_sock po, char __user data,
				1622	unsigned int len)
				1623	{
				1624	struct bpf_prog *new;
				1625	u32 fd;
				1626
				1627	if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
				1628	return -EPERM;
				1629	if (len != sizeof(fd))
				1630	return -EINVAL;
				1631	if (copy_from_user(&fd, data, len))
				1632	return -EFAULT;
				1633
				1634	new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
				1635	if (IS_ERR(new))
				1636	return PTR_ERR(new);
				1637
				1638	__fanout_set_data_bpf(po->fanout, new);
				1639	return 0;
				1640	}
				1641
				1642	static int fanout_set_data(struct packet_sock po, char __user data,
				1643	unsigned int len)
				1644	{
				1645	switch (po->fanout->type) {
				1646	case PACKET_FANOUT_CBPF:
				1647	return fanout_set_data_cbpf(po, data, len);
				1648	case PACKET_FANOUT_EBPF:
				1649	return fanout_set_data_ebpf(po, data, len);
				1650	default:
				1651	return -EINVAL;
				1652	}
				1653	}
				1654
				1655	static void fanout_release_data(struct packet_fanout *f)
				1656	{
				1657	switch (f->type) {
				1658	case PACKET_FANOUT_CBPF:
				1659	case PACKET_FANOUT_EBPF:
				1660	__fanout_set_data_bpf(f, NULL);
				1661	}
				1662	}
				1663
				1664	static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
				1665	{
				1666	struct packet_fanout *f;
				1667
				1668	list_for_each_entry(f, &fanout_list, list) {
				1669	if (f->id == candidate_id &&
				1670	read_pnet(&f->net) == sock_net(sk)) {
				1671	return false;
				1672	}
				1673	}
				1674	return true;
				1675	}
				1676
				1677	static bool fanout_find_new_id(struct sock sk, u16 new_id)
				1678	{
				1679	u16 id = fanout_next_id;
				1680
				1681	do {
				1682	if (__fanout_id_is_free(sk, id)) {
				1683	*new_id = id;
				1684	fanout_next_id = id + 1;
				1685	return true;
				1686	}
				1687
				1688	id++;
				1689	} while (id != fanout_next_id);
				1690
				1691	return false;
				1692	}
				1693
				1694	static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
				1695	{
				1696	struct packet_rollover *rollover = NULL;
				1697	struct packet_sock *po = pkt_sk(sk);
				1698	struct packet_fanout f, match;
				1699	u8 type = type_flags & 0xff;
				1700	u8 flags = type_flags >> 8;
				1701	int err;
				1702
				1703	switch (type) {
				1704	case PACKET_FANOUT_ROLLOVER:
				1705	if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
				1706	return -EINVAL;
				1707	case PACKET_FANOUT_HASH:
				1708	case PACKET_FANOUT_LB:
				1709	case PACKET_FANOUT_CPU:
				1710	case PACKET_FANOUT_RND:
				1711	case PACKET_FANOUT_QM:
				1712	case PACKET_FANOUT_CBPF:
				1713	case PACKET_FANOUT_EBPF:
				1714	break;
				1715	default:
				1716	return -EINVAL;
				1717	}
				1718
				1719	mutex_lock(&fanout_mutex);
				1720
				1721	err = -EALREADY;
				1722	if (po->fanout)
				1723	goto out;
				1724
				1725	if (type == PACKET_FANOUT_ROLLOVER \|\|
				1726	(type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
				1727	err = -ENOMEM;
				1728	rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
				1729	if (!rollover)
				1730	goto out;
				1731	atomic_long_set(&rollover->num, 0);
				1732	atomic_long_set(&rollover->num_huge, 0);
				1733	atomic_long_set(&rollover->num_failed, 0);
				1734	}
				1735
				1736	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
				1737	if (id != 0) {
				1738	err = -EINVAL;
				1739	goto out;
				1740	}
				1741	if (!fanout_find_new_id(sk, &id)) {
				1742	err = -ENOMEM;
				1743	goto out;
				1744	}
				1745	/* ephemeral flag for the first socket in the group: drop it */
				1746	flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
				1747	}
				1748
				1749	match = NULL;
				1750	list_for_each_entry(f, &fanout_list, list) {
				1751	if (f->id == id &&
				1752	read_pnet(&f->net) == sock_net(sk)) {
				1753	match = f;
				1754	break;
				1755	}
				1756	}
				1757	err = -EINVAL;
				1758	if (match && match->flags != flags)
				1759	goto out;
				1760	if (!match) {
				1761	err = -ENOMEM;
				1762	match = kzalloc(sizeof(*match), GFP_KERNEL);
				1763	if (!match)
				1764	goto out;
				1765	write_pnet(&match->net, sock_net(sk));
				1766	match->id = id;
				1767	match->type = type;
				1768	match->flags = flags;
				1769	INIT_LIST_HEAD(&match->list);
				1770	spin_lock_init(&match->lock);
				1771	refcount_set(&match->sk_ref, 0);
				1772	fanout_init_data(match);
				1773	match->prot_hook.type = po->prot_hook.type;
				1774	match->prot_hook.dev = po->prot_hook.dev;
				1775	match->prot_hook.func = packet_rcv_fanout;
				1776	match->prot_hook.af_packet_priv = match;
				1777	match->prot_hook.af_packet_net = read_pnet(&match->net);
				1778	match->prot_hook.id_match = match_fanout_group;
				1779	list_add(&match->list, &fanout_list);
				1780	}
				1781	err = -EINVAL;
				1782
				1783	spin_lock(&po->bind_lock);
				1784	if (po->running &&
				1785	match->type == type &&
				1786	match->prot_hook.type == po->prot_hook.type &&
				1787	match->prot_hook.dev == po->prot_hook.dev) {
				1788	err = -ENOSPC;
				1789	if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
				1790	__dev_remove_pack(&po->prot_hook);
				1791
				1792	/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
				1793	WRITE_ONCE(po->fanout, match);
				1794
				1795	po->rollover = rollover;
				1796	rollover = NULL;
				1797	refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
				1798	__fanout_link(sk, po);
				1799	err = 0;
				1800	}
				1801	}
				1802	spin_unlock(&po->bind_lock);
				1803
				1804	if (err && !refcount_read(&match->sk_ref)) {
				1805	list_del(&match->list);
				1806	kfree(match);
				1807	}
				1808
				1809	out:
				1810	kfree(rollover);
				1811	mutex_unlock(&fanout_mutex);
				1812	return err;
				1813	}
				1814
				1815	/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
				1816	* pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
				1817	* It is the responsibility of the caller to call fanout_release_data() and
				1818	* free the returned packet_fanout (after synchronize_net())
				1819	*/
				1820	static struct packet_fanout fanout_release(struct sock sk)
				1821	{
				1822	struct packet_sock *po = pkt_sk(sk);
				1823	struct packet_fanout *f;
				1824
				1825	mutex_lock(&fanout_mutex);
				1826	f = po->fanout;
				1827	if (f) {
				1828	po->fanout = NULL;
				1829
				1830	if (refcount_dec_and_test(&f->sk_ref))
				1831	list_del(&f->list);
				1832	else
				1833	f = NULL;
				1834	}
				1835	mutex_unlock(&fanout_mutex);
				1836
				1837	return f;
				1838	}
				1839
				1840	static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
				1841	struct sk_buff *skb)
				1842	{
				1843	/* Earlier code assumed this would be a VLAN pkt, double-check
				1844	* this now that we have the actual packet in hand. We can only
				1845	* do this check on Ethernet devices.
				1846	*/
				1847	if (unlikely(dev->type != ARPHRD_ETHER))
				1848	return false;
				1849
				1850	skb_reset_mac_header(skb);
				1851	return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
				1852	}
				1853
				1854	static const struct proto_ops packet_ops;
				1855
				1856	static const struct proto_ops packet_ops_spkt;
				1857
				1858	static int packet_rcv_spkt(struct sk_buff skb, struct net_device dev,
				1859	struct packet_type pt, struct net_device orig_dev)
				1860	{
				1861	struct sock *sk;
				1862	struct sockaddr_pkt *spkt;
				1863	struct packet_sock *po;
				1864
				1865	/*
				1866	* When we registered the protocol we saved the socket in the data
				1867	* field for just this event.
				1868	*/
				1869
				1870	sk = pt->af_packet_priv;
				1871	po = pkt_sk(sk);
				1872
				1873	/*
				1874	* Yank back the headers [hope the device set this
				1875	* right or kerboom...]
				1876	*
				1877	* Incoming packets have ll header pulled,
				1878	* push it back.
				1879	*
				1880	* For outgoing ones skb->data == skb_mac_header(skb)
				1881	* so that this procedure is noop.
				1882	*/
				1883
				1884	if (!(po->pkt_type & (1 << skb->pkt_type)))
				1885	goto out;
				1886
				1887	if (!net_eq(dev_net(dev), sock_net(sk)))
				1888	goto out;
				1889
				1890	skb = skb_share_check(skb, GFP_ATOMIC);
				1891	if (skb == NULL)
				1892	goto oom;
				1893
				1894	/* drop any routing info */
				1895	skb_dst_drop(skb);
				1896
				1897	/* drop conntrack reference */
				1898	nf_reset_ct(skb);
				1899
				1900	spkt = &PACKET_SKB_CB(skb)->sa.pkt;
				1901
				1902	skb_push(skb, skb->data - skb_mac_header(skb));
				1903
				1904	/*
				1905	* The SOCK_PACKET socket receives _all_ frames.
				1906	*/
				1907
				1908	spkt->spkt_family = dev->type;
				1909	strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
				1910	spkt->spkt_protocol = skb->protocol;
				1911
				1912	/*
				1913	* Charge the memory to the socket. This is done specifically
				1914	* to prevent sockets using all the memory up.
				1915	*/
				1916
				1917	if (sock_queue_rcv_skb(sk, skb) == 0)
				1918	return 0;
				1919
				1920	out:
				1921	kfree_skb(skb);
				1922	oom:
				1923	return 0;
				1924	}
				1925
				1926	static void packet_parse_headers(struct sk_buff skb, struct socket sock)
				1927	{
				1928	int depth;
				1929
				1930	if ((!skb->protocol \|\| skb->protocol == htons(ETH_P_ALL)) &&
				1931	sock->type == SOCK_RAW) {
				1932	skb_reset_mac_header(skb);
				1933	skb->protocol = dev_parse_header_protocol(skb);
				1934	}
				1935
				1936	/* Move network header to the right position for VLAN tagged packets */
				1937	if (likely(skb->dev->type == ARPHRD_ETHER) &&
				1938	eth_type_vlan(skb->protocol) &&
				1939	vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
				1940	skb_set_network_header(skb, depth);
				1941
				1942	skb_probe_transport_header(skb);
				1943	}
				1944
				1945	/*
				1946	* Output a raw packet to a device layer. This bypasses all the other
				1947	* protocol layers and you must therefore supply it with a complete frame
				1948	*/
				1949
				1950	static int packet_sendmsg_spkt(struct socket sock, struct msghdr msg,
				1951	size_t len)
				1952	{
				1953	struct sock *sk = sock->sk;
				1954	DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
				1955	struct sk_buff *skb = NULL;
				1956	struct net_device *dev;
				1957	struct sockcm_cookie sockc;
				1958	__be16 proto = 0;
				1959	int err;
				1960	int extra_len = 0;
				1961
				1962	/*
				1963	* Get and verify the address.
				1964	*/
				1965
				1966	if (saddr) {
				1967	if (msg->msg_namelen < sizeof(struct sockaddr))
				1968	return -EINVAL;
				1969	if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
				1970	proto = saddr->spkt_protocol;
				1971	} else
				1972	return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
				1973
				1974	/*
				1975	* Find the device first to size check it
				1976	*/
				1977
				1978	saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
				1979	retry:
				1980	rcu_read_lock();
				1981	dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
				1982	err = -ENODEV;
				1983	if (dev == NULL)
				1984	goto out_unlock;
				1985
				1986	err = -ENETDOWN;
				1987	if (!(dev->flags & IFF_UP))
				1988	goto out_unlock;
				1989
				1990	/*
				1991	* You may not queue a frame bigger than the mtu. This is the lowest level
				1992	* raw protocol and you must do your own fragmentation at this level.
				1993	*/
				1994
				1995	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
				1996	if (!netif_supports_nofcs(dev)) {
				1997	err = -EPROTONOSUPPORT;
				1998	goto out_unlock;
				1999	}
				2000	extra_len = 4; /* We're doing our own CRC */
				2001	}
				2002
				2003	err = -EMSGSIZE;
				2004	if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
				2005	goto out_unlock;
				2006
				2007	if (!skb) {
				2008	size_t reserved = LL_RESERVED_SPACE(dev);
				2009	int tlen = dev->needed_tailroom;
				2010	unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
				2011
				2012	rcu_read_unlock();
				2013	skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
				2014	if (skb == NULL)
				2015	return -ENOBUFS;
				2016	/* FIXME: Save some space for broken drivers that write a hard
				2017	* header at transmission time by themselves. PPP is the notable
				2018	* one here. This should really be fixed at the driver level.
				2019	*/
				2020	skb_reserve(skb, reserved);
				2021	skb_reset_network_header(skb);
				2022
				2023	/* Try to align data part correctly */
				2024	if (hhlen) {
				2025	skb->data -= hhlen;
				2026	skb->tail -= hhlen;
				2027	if (len < hhlen)
				2028	skb_reset_network_header(skb);
				2029	}
				2030	err = memcpy_from_msg(skb_put(skb, len), msg, len);
				2031	if (err)
				2032	goto out_free;
				2033	goto retry;
				2034	}
				2035
				2036	if (!dev_validate_header(dev, skb->data, len) \|\| !skb->len) {
				2037	err = -EINVAL;
				2038	goto out_unlock;
				2039	}
				2040	if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
				2041	!packet_extra_vlan_len_allowed(dev, skb)) {
				2042	err = -EMSGSIZE;
				2043	goto out_unlock;
				2044	}
				2045
				2046	sockcm_init(&sockc, sk);
				2047	if (msg->msg_controllen) {
				2048	err = sock_cmsg_send(sk, msg, &sockc);
				2049	if (unlikely(err))
				2050	goto out_unlock;
				2051	}
				2052
				2053	skb->protocol = proto;
				2054	skb->dev = dev;
				2055	skb->priority = sk->sk_priority;
				2056	skb->mark = sk->sk_mark;
				2057	skb->tstamp = sockc.transmit_time;
				2058
				2059	skb_setup_tx_timestamp(skb, sockc.tsflags);
				2060
				2061	if (unlikely(extra_len == 4))
				2062	skb->no_fcs = 1;
				2063
				2064	packet_parse_headers(skb, sock);
				2065
				2066	dev_queue_xmit(skb);
				2067	rcu_read_unlock();
				2068	return len;
				2069
				2070	out_unlock:
				2071	rcu_read_unlock();
				2072	out_free:
				2073	kfree_skb(skb);
				2074	return err;
				2075	}
				2076
				2077	static unsigned int run_filter(struct sk_buff *skb,
				2078	const struct sock *sk,
				2079	unsigned int res)
				2080	{
				2081	struct sk_filter *filter;
				2082
				2083	rcu_read_lock();
				2084	filter = rcu_dereference(sk->sk_filter);
				2085	if (filter != NULL)
				2086	res = bpf_prog_run_clear_cb(filter->prog, skb);
				2087	rcu_read_unlock();
				2088
				2089	return res;
				2090	}
				2091
				2092	static int packet_rcv_vnet(struct msghdr msg, const struct sk_buff skb,
				2093	size_t *len)
				2094	{
				2095	struct virtio_net_hdr vnet_hdr;
				2096
				2097	if (*len < sizeof(vnet_hdr))
				2098	return -EINVAL;
				2099	*len -= sizeof(vnet_hdr);
				2100
				2101	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
				2102	return -EINVAL;
				2103
				2104	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
				2105	}
				2106
				2107	/*
				2108	* This function makes lazy skb cloning in hope that most of packets
				2109	* are discarded by BPF.
				2110	*
				2111	* Note tricky part: we DO mangle shared skb! skb->data, skb->len
				2112	* and skb->cb are mangled. It works because (and until) packets
				2113	* falling here are owned by current CPU. Output packets are cloned
				2114	* by dev_queue_xmit_nit(), input packets are processed by net_bh
				2115	* sequentially, so that if we return skb to original state on exit,
				2116	* we will not harm anyone.
				2117	*/
				2118
				2119	static int packet_rcv(struct sk_buff skb, struct net_device dev,
				2120	struct packet_type pt, struct net_device orig_dev)
				2121	{
				2122	struct sock *sk;
				2123	struct sockaddr_ll *sll;
				2124	struct packet_sock *po;
				2125	u8 *skb_head = skb->data;
				2126	int skb_len = skb->len;
				2127	unsigned int snaplen, res;
				2128	bool is_drop_n_account = false;
				2129
				2130	sk = pt->af_packet_priv;
				2131	po = pkt_sk(sk);
				2132
				2133	if (!(po->pkt_type & (1 << skb->pkt_type)))
				2134	goto drop;
				2135
				2136	if (!net_eq(dev_net(dev), sock_net(sk)))
				2137	goto drop;
				2138
				2139	skb->dev = dev;
				2140
				2141	if (dev->header_ops) {
				2142	/* The device has an explicit notion of ll header,
				2143	* exported to higher levels.
				2144	*
				2145	* Otherwise, the device hides details of its frame
				2146	* structure, so that corresponding packet head is
				2147	* never delivered to user.
				2148	*/
				2149	if (sk->sk_type != SOCK_DGRAM)
				2150	skb_push(skb, skb->data - skb_mac_header(skb));
				2151	else if (skb->pkt_type == PACKET_OUTGOING) {
				2152	/* Special case: outgoing packets have ll header at head */
				2153	skb_pull(skb, skb_network_offset(skb));
				2154	}
				2155	}
				2156
				2157	snaplen = skb->len;
				2158
				2159	res = run_filter(skb, sk, snaplen);
				2160	if (!res)
				2161	goto drop_n_restore;
				2162	if (snaplen > res)
				2163	snaplen = res;
				2164
				2165	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
				2166	goto drop_n_acct;
				2167
				2168	if (skb_shared(skb)) {
				2169	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
				2170	if (nskb == NULL)
				2171	goto drop_n_acct;
				2172
				2173	if (skb_head != skb->data) {
				2174	skb->data = skb_head;
				2175	skb->len = skb_len;
				2176	}
				2177	consume_skb(skb);
				2178	skb = nskb;
				2179	}
				2180
				2181	sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
				2182
				2183	sll = &PACKET_SKB_CB(skb)->sa.ll;
				2184	sll->sll_hatype = dev->type;
				2185	sll->sll_pkttype = skb->pkt_type;
				2186	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
				2187	sll->sll_ifindex = orig_dev->ifindex;
				2188	else
				2189	sll->sll_ifindex = dev->ifindex;
				2190
				2191	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
				2192
				2193	/* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
				2194	* Use their space for storing the original skb length.
				2195	*/
				2196	PACKET_SKB_CB(skb)->sa.origlen = skb->len;
				2197
				2198	if (pskb_trim(skb, snaplen))
				2199	goto drop_n_acct;
				2200
				2201	skb_set_owner_r(skb, sk);
				2202	skb->dev = NULL;
				2203	skb_dst_drop(skb);
				2204
				2205	/* drop conntrack reference */
				2206	nf_reset_ct(skb);
				2207
				2208	spin_lock(&sk->sk_receive_queue.lock);
				2209	po->stats.stats1.tp_packets++;
				2210	sock_skb_set_dropcount(sk, skb);
				2211	__skb_queue_tail(&sk->sk_receive_queue, skb);
				2212	spin_unlock(&sk->sk_receive_queue.lock);
				2213	sk->sk_data_ready(sk);
				2214	return 0;
				2215
				2216	drop_n_acct:
				2217	is_drop_n_account = true;
				2218	atomic_inc(&po->tp_drops);
				2219	atomic_inc(&sk->sk_drops);
				2220
				2221	drop_n_restore:
				2222	if (skb_head != skb->data && skb_shared(skb)) {
				2223	skb->data = skb_head;
				2224	skb->len = skb_len;
				2225	}
				2226	drop:
				2227	if (!is_drop_n_account)
				2228	consume_skb(skb);
				2229	else
				2230	kfree_skb(skb);
				2231	return 0;
				2232	}
				2233
				2234	static int tpacket_rcv(struct sk_buff skb, struct net_device dev,
				2235	struct packet_type pt, struct net_device orig_dev)
				2236	{
				2237	struct sock *sk;
				2238	struct packet_sock *po;
				2239	struct sockaddr_ll *sll;
				2240	union tpacket_uhdr h;
				2241	u8 *skb_head = skb->data;
				2242	int skb_len = skb->len;
				2243	unsigned int snaplen, res;
				2244	unsigned long status = TP_STATUS_USER;
				2245	unsigned short macoff, hdrlen;
				2246	unsigned int netoff;
				2247	struct sk_buff *copy_skb = NULL;
				2248	struct timespec64 ts;
				2249	__u32 ts_status;
				2250	bool is_drop_n_account = false;
				2251	unsigned int slot_id = 0;
				2252	bool do_vnet = false;
				2253
				2254	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
				2255	* We may add members to them until current aligned size without forcing
				2256	* userspace to call getsockopt(..., PACKET_HDRLEN, ...).
				2257	*/
				2258	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
				2259	BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
				2260
				2261	sk = pt->af_packet_priv;
				2262	po = pkt_sk(sk);
				2263
				2264	if (!(po->pkt_type & (1 << skb->pkt_type)))
				2265	goto drop;
				2266
				2267	if (!net_eq(dev_net(dev), sock_net(sk)))
				2268	goto drop;
				2269
				2270	if (dev->header_ops) {
				2271	if (sk->sk_type != SOCK_DGRAM)
				2272	skb_push(skb, skb->data - skb_mac_header(skb));
				2273	else if (skb->pkt_type == PACKET_OUTGOING) {
				2274	/* Special case: outgoing packets have ll header at head */
				2275	skb_pull(skb, skb_network_offset(skb));
				2276	}
				2277	}
				2278
				2279	snaplen = skb->len;
				2280
				2281	res = run_filter(skb, sk, snaplen);
				2282	if (!res)
				2283	goto drop_n_restore;
				2284
				2285	/* If we are flooded, just give up */
				2286	if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
				2287	atomic_inc(&po->tp_drops);
				2288	goto drop_n_restore;
				2289	}
				2290
				2291	if (skb->ip_summed == CHECKSUM_PARTIAL)
				2292	status \|= TP_STATUS_CSUMNOTREADY;
				2293	else if (skb->pkt_type != PACKET_OUTGOING &&
				2294	skb_csum_unnecessary(skb))
				2295	status \|= TP_STATUS_CSUM_VALID;
				2296
				2297	if (snaplen > res)
				2298	snaplen = res;
				2299
				2300	if (sk->sk_type == SOCK_DGRAM) {
				2301	macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				2302	po->tp_reserve;
				2303	} else {
				2304	unsigned int maclen = skb_network_offset(skb);
				2305	netoff = TPACKET_ALIGN(po->tp_hdrlen +
				2306	(maclen < 16 ? 16 : maclen)) +
				2307	po->tp_reserve;
				2308	if (po->has_vnet_hdr) {
				2309	netoff += sizeof(struct virtio_net_hdr);
				2310	do_vnet = true;
				2311	}
				2312	macoff = netoff - maclen;
				2313	}
				2314	if (netoff > USHRT_MAX) {
				2315	atomic_inc(&po->tp_drops);
				2316	goto drop_n_restore;
				2317	}
				2318	if (po->tp_version <= TPACKET_V2) {
				2319	if (macoff + snaplen > po->rx_ring.frame_size) {
				2320	if (po->copy_thresh &&
				2321	atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
				2322	if (skb_shared(skb)) {
				2323	copy_skb = skb_clone(skb, GFP_ATOMIC);
				2324	} else {
				2325	copy_skb = skb_get(skb);
				2326	skb_head = skb->data;
				2327	}
				2328	if (copy_skb) {
				2329	memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
				2330	sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
				2331	skb_set_owner_r(copy_skb, sk);
				2332	}
				2333	}
				2334	snaplen = po->rx_ring.frame_size - macoff;
				2335	if ((int)snaplen < 0) {
				2336	snaplen = 0;
				2337	do_vnet = false;
				2338	}
				2339	}
				2340	} else if (unlikely(macoff + snaplen >
				2341	GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
				2342	u32 nval;
				2343
				2344	nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
				2345	pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
				2346	snaplen, nval, macoff);
				2347	snaplen = nval;
				2348	if (unlikely((int)snaplen < 0)) {
				2349	snaplen = 0;
				2350	macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
				2351	do_vnet = false;
				2352	}
				2353	}
				2354	spin_lock(&sk->sk_receive_queue.lock);
				2355	h.raw = packet_current_rx_frame(po, skb,
				2356	TP_STATUS_KERNEL, (macoff+snaplen));
				2357	if (!h.raw)
				2358	goto drop_n_account;
				2359
				2360	if (po->tp_version <= TPACKET_V2) {
				2361	slot_id = po->rx_ring.head;
				2362	if (test_bit(slot_id, po->rx_ring.rx_owner_map))
				2363	goto drop_n_account;
				2364	__set_bit(slot_id, po->rx_ring.rx_owner_map);
				2365	}
				2366
				2367	if (do_vnet &&
				2368	virtio_net_hdr_from_skb(skb, h.raw + macoff -
				2369	sizeof(struct virtio_net_hdr),
				2370	vio_le(), true, 0)) {
				2371	if (po->tp_version == TPACKET_V3)
				2372	prb_clear_blk_fill_status(&po->rx_ring);
				2373	goto drop_n_account;
				2374	}
				2375
				2376	if (po->tp_version <= TPACKET_V2) {
				2377	packet_increment_rx_head(po, &po->rx_ring);
				2378	/*
				2379	* LOSING will be reported till you read the stats,
				2380	* because it's COR - Clear On Read.
				2381	* Anyways, moving it for V1/V2 only as V3 doesn't need this
				2382	* at packet level.
				2383	*/
				2384	if (atomic_read(&po->tp_drops))
				2385	status \|= TP_STATUS_LOSING;
				2386	}
				2387
				2388	po->stats.stats1.tp_packets++;
				2389	if (copy_skb) {
				2390	status \|= TP_STATUS_COPY;
				2391	__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
				2392	}
				2393	spin_unlock(&sk->sk_receive_queue.lock);
				2394
				2395	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
				2396
				2397	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
				2398	ktime_get_real_ts64(&ts);
				2399
				2400	status \|= ts_status;
				2401
				2402	switch (po->tp_version) {
				2403	case TPACKET_V1:
				2404	h.h1->tp_len = skb->len;
				2405	h.h1->tp_snaplen = snaplen;
				2406	h.h1->tp_mac = macoff;
				2407	h.h1->tp_net = netoff;
				2408	h.h1->tp_sec = ts.tv_sec;
				2409	h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
				2410	hdrlen = sizeof(*h.h1);
				2411	break;
				2412	case TPACKET_V2:
				2413	h.h2->tp_len = skb->len;
				2414	h.h2->tp_snaplen = snaplen;
				2415	h.h2->tp_mac = macoff;
				2416	h.h2->tp_net = netoff;
				2417	h.h2->tp_sec = ts.tv_sec;
				2418	h.h2->tp_nsec = ts.tv_nsec;
				2419	if (skb_vlan_tag_present(skb)) {
				2420	h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
				2421	h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
				2422	status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				2423	} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
				2424	h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
				2425	h.h2->tp_vlan_tpid = ntohs(skb->protocol);
				2426	status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				2427	} else {
				2428	h.h2->tp_vlan_tci = 0;
				2429	h.h2->tp_vlan_tpid = 0;
				2430	}
				2431	memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
				2432	hdrlen = sizeof(*h.h2);
				2433	break;
				2434	case TPACKET_V3:
				2435	/* tp_nxt_offset,vlan are already populated above.
				2436	* So DONT clear those fields here
				2437	*/
				2438	h.h3->tp_status \|= status;
				2439	h.h3->tp_len = skb->len;
				2440	h.h3->tp_snaplen = snaplen;
				2441	h.h3->tp_mac = macoff;
				2442	h.h3->tp_net = netoff;
				2443	h.h3->tp_sec = ts.tv_sec;
				2444	h.h3->tp_nsec = ts.tv_nsec;
				2445	memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
				2446	hdrlen = sizeof(*h.h3);
				2447	break;
				2448	default:
				2449	BUG();
				2450	}
				2451
				2452	sll = h.raw + TPACKET_ALIGN(hdrlen);
				2453	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
				2454	sll->sll_family = AF_PACKET;
				2455	sll->sll_hatype = dev->type;
				2456	sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
				2457	vlan_get_protocol_dgram(skb) : skb->protocol;
				2458	sll->sll_pkttype = skb->pkt_type;
				2459	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
				2460	sll->sll_ifindex = orig_dev->ifindex;
				2461	else
				2462	sll->sll_ifindex = dev->ifindex;
				2463
				2464	smp_mb();
				2465
				2466	#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
				2467	if (po->tp_version <= TPACKET_V2) {
				2468	u8 start, end;
				2469
				2470	end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
				2471	macoff + snaplen);
				2472
				2473	for (start = h.raw; start < end; start += PAGE_SIZE)
				2474	flush_dcache_page(pgv_to_page(start));
				2475	}
				2476	smp_wmb();
				2477	#endif
				2478
				2479	if (po->tp_version <= TPACKET_V2) {
				2480	spin_lock(&sk->sk_receive_queue.lock);
				2481	__packet_set_status(po, h.raw, status);
				2482	__clear_bit(slot_id, po->rx_ring.rx_owner_map);
				2483	spin_unlock(&sk->sk_receive_queue.lock);
				2484	sk->sk_data_ready(sk);
				2485	} else if (po->tp_version == TPACKET_V3) {
				2486	prb_clear_blk_fill_status(&po->rx_ring);
				2487	}
				2488
				2489	drop_n_restore:
				2490	if (skb_head != skb->data && skb_shared(skb)) {
				2491	skb->data = skb_head;
				2492	skb->len = skb_len;
				2493	}
				2494	drop:
				2495	if (!is_drop_n_account)
				2496	consume_skb(skb);
				2497	else
				2498	kfree_skb(skb);
				2499	return 0;
				2500
				2501	drop_n_account:
				2502	spin_unlock(&sk->sk_receive_queue.lock);
				2503	atomic_inc(&po->tp_drops);
				2504	is_drop_n_account = true;
				2505
				2506	sk->sk_data_ready(sk);
				2507	kfree_skb(copy_skb);
				2508	goto drop_n_restore;
				2509	}
				2510
				2511	static void tpacket_destruct_skb(struct sk_buff *skb)
				2512	{
				2513	struct packet_sock *po = pkt_sk(skb->sk);
				2514
				2515	if (likely(po->tx_ring.pg_vec)) {
				2516	void *ph;
				2517	__u32 ts;
				2518
				2519	ph = skb_zcopy_get_nouarg(skb);
				2520	packet_dec_pending(&po->tx_ring);
				2521
				2522	ts = __packet_set_timestamp(po, ph, skb);
				2523	__packet_set_status(po, ph, TP_STATUS_AVAILABLE \| ts);
				2524
				2525	complete(&po->skb_completion);
				2526	}
				2527
				2528	sock_wfree(skb);
				2529	}
				2530
				2531	static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
				2532	{
				2533	if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
				2534	(__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
				2535	__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
				2536	__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
				2537	vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
				2538	__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
				2539	__virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
				2540
				2541	if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
				2542	return -EINVAL;
				2543
				2544	return 0;
				2545	}
				2546
				2547	static int packet_snd_vnet_parse(struct msghdr msg, size_t len,
				2548	struct virtio_net_hdr *vnet_hdr)
				2549	{
				2550	if (len < sizeof(vnet_hdr))
				2551	return -EINVAL;
				2552	len -= sizeof(vnet_hdr);
				2553
				2554	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
				2555	return -EFAULT;
				2556
				2557	return __packet_snd_vnet_parse(vnet_hdr, *len);
				2558	}
				2559
				2560	static int tpacket_fill_skb(struct packet_sock po, struct sk_buff skb,
				2561	void frame, struct net_device dev, void *data, int tp_len,
				2562	__be16 proto, unsigned char *addr, int hlen, int copylen,
				2563	const struct sockcm_cookie *sockc)
				2564	{
				2565	union tpacket_uhdr ph;
				2566	int to_write, offset, len, nr_frags, len_max;
				2567	struct socket *sock = po->sk.sk_socket;
				2568	struct page *page;
				2569	int err;
				2570
				2571	ph.raw = frame;
				2572
				2573	skb->protocol = proto;
				2574	skb->dev = dev;
				2575	skb->priority = po->sk.sk_priority;
				2576	skb->mark = po->sk.sk_mark;
				2577	skb->tstamp = sockc->transmit_time;
				2578	skb_setup_tx_timestamp(skb, sockc->tsflags);
				2579	skb_zcopy_set_nouarg(skb, ph.raw);
				2580
				2581	skb_reserve(skb, hlen);
				2582	skb_reset_network_header(skb);
				2583
				2584	to_write = tp_len;
				2585
				2586	if (sock->type == SOCK_DGRAM) {
				2587	err = dev_hard_header(skb, dev, ntohs(proto), addr,
				2588	NULL, tp_len);
				2589	if (unlikely(err < 0))
				2590	return -EINVAL;
				2591	} else if (copylen) {
				2592	int hdrlen = min_t(int, copylen, tp_len);
				2593
				2594	skb_push(skb, dev->hard_header_len);
				2595	skb_put(skb, copylen - dev->hard_header_len);
				2596	err = skb_store_bits(skb, 0, data, hdrlen);
				2597	if (unlikely(err))
				2598	return err;
				2599	if (!dev_validate_header(dev, skb->data, hdrlen))
				2600	return -EINVAL;
				2601
				2602	data += hdrlen;
				2603	to_write -= hdrlen;
				2604	}
				2605
				2606	offset = offset_in_page(data);
				2607	len_max = PAGE_SIZE - offset;
				2608	len = ((to_write > len_max) ? len_max : to_write);
				2609
				2610	skb->data_len = to_write;
				2611	skb->len += to_write;
				2612	skb->truesize += to_write;
				2613	refcount_add(to_write, &po->sk.sk_wmem_alloc);
				2614
				2615	while (likely(to_write)) {
				2616	nr_frags = skb_shinfo(skb)->nr_frags;
				2617
				2618	if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
				2619	pr_err("Packet exceed the number of skb frags(%lu)\n",
				2620	MAX_SKB_FRAGS);
				2621	return -EFAULT;
				2622	}
				2623
				2624	page = pgv_to_page(data);
				2625	data += len;
				2626	flush_dcache_page(page);
				2627	get_page(page);
				2628	skb_fill_page_desc(skb, nr_frags, page, offset, len);
				2629	to_write -= len;
				2630	offset = 0;
				2631	len_max = PAGE_SIZE;
				2632	len = ((to_write > len_max) ? len_max : to_write);
				2633	}
				2634
				2635	packet_parse_headers(skb, sock);
				2636
				2637	return tp_len;
				2638	}
				2639
				2640	static int tpacket_parse_header(struct packet_sock po, void frame,
				2641	int size_max, void **data)
				2642	{
				2643	union tpacket_uhdr ph;
				2644	int tp_len, off;
				2645
				2646	ph.raw = frame;
				2647
				2648	switch (po->tp_version) {
				2649	case TPACKET_V3:
				2650	if (ph.h3->tp_next_offset != 0) {
				2651	pr_warn_once("variable sized slot not supported");
				2652	return -EINVAL;
				2653	}
				2654	tp_len = ph.h3->tp_len;
				2655	break;
				2656	case TPACKET_V2:
				2657	tp_len = ph.h2->tp_len;
				2658	break;
				2659	default:
				2660	tp_len = ph.h1->tp_len;
				2661	break;
				2662	}
				2663	if (unlikely(tp_len > size_max)) {
				2664	pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
				2665	return -EMSGSIZE;
				2666	}
				2667
				2668	if (unlikely(po->tp_tx_has_off)) {
				2669	int off_min, off_max;
				2670
				2671	off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
				2672	off_max = po->tx_ring.frame_size - tp_len;
				2673	if (po->sk.sk_type == SOCK_DGRAM) {
				2674	switch (po->tp_version) {
				2675	case TPACKET_V3:
				2676	off = ph.h3->tp_net;
				2677	break;
				2678	case TPACKET_V2:
				2679	off = ph.h2->tp_net;
				2680	break;
				2681	default:
				2682	off = ph.h1->tp_net;
				2683	break;
				2684	}
				2685	} else {
				2686	switch (po->tp_version) {
				2687	case TPACKET_V3:
				2688	off = ph.h3->tp_mac;
				2689	break;
				2690	case TPACKET_V2:
				2691	off = ph.h2->tp_mac;
				2692	break;
				2693	default:
				2694	off = ph.h1->tp_mac;
				2695	break;
				2696	}
				2697	}
				2698	if (unlikely((off < off_min) \|\| (off_max < off)))
				2699	return -EINVAL;
				2700	} else {
				2701	off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
				2702	}
				2703
				2704	*data = frame + off;
				2705	return tp_len;
				2706	}
				2707
				2708	static int tpacket_snd(struct packet_sock po, struct msghdr msg)
				2709	{
				2710	struct sk_buff *skb = NULL;
				2711	struct net_device *dev;
				2712	struct virtio_net_hdr *vnet_hdr = NULL;
				2713	struct sockcm_cookie sockc;
				2714	__be16 proto;
				2715	int err, reserve = 0;
				2716	void *ph;
				2717	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
				2718	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
				2719	unsigned char *addr = NULL;
				2720	int tp_len, size_max;
				2721	void *data;
				2722	int len_sum = 0;
				2723	int status = TP_STATUS_AVAILABLE;
				2724	int hlen, tlen, copylen = 0;
				2725	long timeo = 0;
				2726
				2727	mutex_lock(&po->pg_vec_lock);
				2728
				2729	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
				2730	* we need to confirm it under protection of pg_vec_lock.
				2731	*/
				2732	if (unlikely(!po->tx_ring.pg_vec)) {
				2733	err = -EBUSY;
				2734	goto out;
				2735	}
				2736	if (likely(saddr == NULL)) {
				2737	dev = packet_cached_dev_get(po);
				2738	proto = READ_ONCE(po->num);
				2739	} else {
				2740	err = -EINVAL;
				2741	if (msg->msg_namelen < sizeof(struct sockaddr_ll))
				2742	goto out;
				2743	if (msg->msg_namelen < (saddr->sll_halen
				2744	+ offsetof(struct sockaddr_ll,
				2745	sll_addr)))
				2746	goto out;
				2747	proto = saddr->sll_protocol;
				2748	dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
				2749	if (po->sk.sk_socket->type == SOCK_DGRAM) {
				2750	if (dev && msg->msg_namelen < dev->addr_len +
				2751	offsetof(struct sockaddr_ll, sll_addr))
				2752	goto out_put;
				2753	addr = saddr->sll_addr;
				2754	}
				2755	}
				2756
				2757	err = -ENXIO;
				2758	if (unlikely(dev == NULL))
				2759	goto out;
				2760	err = -ENETDOWN;
				2761	if (unlikely(!(dev->flags & IFF_UP)))
				2762	goto out_put;
				2763
				2764	sockcm_init(&sockc, &po->sk);
				2765	if (msg->msg_controllen) {
				2766	err = sock_cmsg_send(&po->sk, msg, &sockc);
				2767	if (unlikely(err))
				2768	goto out_put;
				2769	}
				2770
				2771	if (po->sk.sk_socket->type == SOCK_RAW)
				2772	reserve = dev->hard_header_len;
				2773	size_max = po->tx_ring.frame_size
				2774	- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
				2775
				2776	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
				2777	size_max = dev->mtu + reserve + VLAN_HLEN;
				2778
				2779	reinit_completion(&po->skb_completion);
				2780
				2781	do {
				2782	ph = packet_current_frame(po, &po->tx_ring,
				2783	TP_STATUS_SEND_REQUEST);
				2784	if (unlikely(ph == NULL)) {
				2785	if (need_wait && skb) {
				2786	timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
				2787	timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
				2788	if (timeo <= 0) {
				2789	err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
				2790	goto out_put;
				2791	}
				2792	}
				2793	/* check for additional frames */
				2794	continue;
				2795	}
				2796
				2797	skb = NULL;
				2798	tp_len = tpacket_parse_header(po, ph, size_max, &data);
				2799	if (tp_len < 0)
				2800	goto tpacket_error;
				2801
				2802	status = TP_STATUS_SEND_REQUEST;
				2803	hlen = LL_RESERVED_SPACE(dev);
				2804	tlen = dev->needed_tailroom;
				2805	if (po->has_vnet_hdr) {
				2806	vnet_hdr = data;
				2807	data += sizeof(*vnet_hdr);
				2808	tp_len -= sizeof(*vnet_hdr);
				2809	if (tp_len < 0 \|\|
				2810	__packet_snd_vnet_parse(vnet_hdr, tp_len)) {
				2811	tp_len = -EINVAL;
				2812	goto tpacket_error;
				2813	}
				2814	copylen = __virtio16_to_cpu(vio_le(),
				2815	vnet_hdr->hdr_len);
				2816	}
				2817	copylen = max_t(int, copylen, dev->hard_header_len);
				2818	skb = sock_alloc_send_skb(&po->sk,
				2819	hlen + tlen + sizeof(struct sockaddr_ll) +
				2820	(copylen - dev->hard_header_len),
				2821	!need_wait, &err);
				2822
				2823	if (unlikely(skb == NULL)) {
				2824	/* we assume the socket was initially writeable ... */
				2825	if (likely(len_sum > 0))
				2826	err = len_sum;
				2827	goto out_status;
				2828	}
				2829	tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
				2830	addr, hlen, copylen, &sockc);
				2831	if (likely(tp_len >= 0) &&
				2832	tp_len > dev->mtu + reserve &&
				2833	!po->has_vnet_hdr &&
				2834	!packet_extra_vlan_len_allowed(dev, skb))
				2835	tp_len = -EMSGSIZE;
				2836
				2837	if (unlikely(tp_len < 0)) {
				2838	tpacket_error:
				2839	if (po->tp_loss) {
				2840	__packet_set_status(po, ph,
				2841	TP_STATUS_AVAILABLE);
				2842	packet_increment_head(&po->tx_ring);
				2843	kfree_skb(skb);
				2844	continue;
				2845	} else {
				2846	status = TP_STATUS_WRONG_FORMAT;
				2847	err = tp_len;
				2848	goto out_status;
				2849	}
				2850	}
				2851
				2852	if (po->has_vnet_hdr) {
				2853	if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
				2854	tp_len = -EINVAL;
				2855	goto tpacket_error;
				2856	}
				2857	virtio_net_hdr_set_proto(skb, vnet_hdr);
				2858	}
				2859
				2860	skb->destructor = tpacket_destruct_skb;
				2861	__packet_set_status(po, ph, TP_STATUS_SENDING);
				2862	packet_inc_pending(&po->tx_ring);
				2863
				2864	status = TP_STATUS_SEND_REQUEST;
				2865	/* Paired with WRITE_ONCE() in packet_setsockopt() */
				2866	err = READ_ONCE(po->xmit)(skb);
				2867	if (unlikely(err != 0)) {
				2868	if (err > 0)
				2869	err = net_xmit_errno(err);
				2870	if (err && __packet_get_status(po, ph) ==
				2871	TP_STATUS_AVAILABLE) {
				2872	/* skb was destructed already */
				2873	skb = NULL;
				2874	goto out_status;
				2875	}
				2876	/*
				2877	* skb was dropped but not destructed yet;
				2878	* let's treat it like congestion or err < 0
				2879	*/
				2880	err = 0;
				2881	}
				2882	packet_increment_head(&po->tx_ring);
				2883	len_sum += tp_len;
				2884	} while (likely((ph != NULL) \|\|
				2885	/* Note: packet_read_pending() might be slow if we have
				2886	* to call it as it's per_cpu variable, but in fast-path
				2887	* we already short-circuit the loop with the first
				2888	* condition, and luckily don't have to go that path
				2889	* anyway.
				2890	*/
				2891	(need_wait && packet_read_pending(&po->tx_ring))));
				2892
				2893	err = len_sum;
				2894	goto out_put;
				2895
				2896	out_status:
				2897	__packet_set_status(po, ph, status);
				2898	kfree_skb(skb);
				2899	out_put:
				2900	dev_put(dev);
				2901	out:
				2902	mutex_unlock(&po->pg_vec_lock);
				2903	return err;
				2904	}
				2905
				2906	static struct sk_buff packet_alloc_skb(struct sock sk, size_t prepad,
				2907	size_t reserve, size_t len,
				2908	size_t linear, int noblock,
				2909	int *err)
				2910	{
				2911	struct sk_buff *skb;
				2912
				2913	/* Under a page? Don't bother with paged skb. */
				2914	if (prepad + len < PAGE_SIZE \|\| !linear)
				2915	linear = len;
				2916
				2917	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
				2918	err, 0);
				2919	if (!skb)
				2920	return NULL;
				2921
				2922	skb_reserve(skb, reserve);
				2923	skb_put(skb, linear);
				2924	skb->data_len = len - linear;
				2925	skb->len += len - linear;
				2926
				2927	return skb;
				2928	}
				2929
				2930	static int packet_snd(struct socket sock, struct msghdr msg, size_t len)
				2931	{
				2932	struct sock *sk = sock->sk;
				2933	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
				2934	struct sk_buff *skb;
				2935	struct net_device *dev;
				2936	__be16 proto;
				2937	unsigned char *addr = NULL;
				2938	int err, reserve = 0;
				2939	struct sockcm_cookie sockc;
				2940	struct virtio_net_hdr vnet_hdr = { 0 };
				2941	int offset = 0;
				2942	struct packet_sock *po = pkt_sk(sk);
				2943	bool has_vnet_hdr = false;
				2944	int hlen, tlen, linear;
				2945	int extra_len = 0;
				2946
				2947	/*
				2948	* Get and verify the address.
				2949	*/
				2950
				2951	if (likely(saddr == NULL)) {
				2952	dev = packet_cached_dev_get(po);
				2953	proto = READ_ONCE(po->num);
				2954	} else {
				2955	err = -EINVAL;
				2956	if (msg->msg_namelen < sizeof(struct sockaddr_ll))
				2957	goto out;
				2958	if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
				2959	goto out;
				2960	proto = saddr->sll_protocol;
				2961	dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
				2962	if (sock->type == SOCK_DGRAM) {
				2963	if (dev && msg->msg_namelen < dev->addr_len +
				2964	offsetof(struct sockaddr_ll, sll_addr))
				2965	goto out_unlock;
				2966	addr = saddr->sll_addr;
				2967	}
				2968	}
				2969
				2970	err = -ENXIO;
				2971	if (unlikely(dev == NULL))
				2972	goto out_unlock;
				2973	err = -ENETDOWN;
				2974	if (unlikely(!(dev->flags & IFF_UP)))
				2975	goto out_unlock;
				2976
				2977	sockcm_init(&sockc, sk);
				2978	sockc.mark = sk->sk_mark;
				2979	if (msg->msg_controllen) {
				2980	err = sock_cmsg_send(sk, msg, &sockc);
				2981	if (unlikely(err))
				2982	goto out_unlock;
				2983	}
				2984
				2985	if (sock->type == SOCK_RAW)
				2986	reserve = dev->hard_header_len;
				2987	if (po->has_vnet_hdr) {
				2988	err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
				2989	if (err)
				2990	goto out_unlock;
				2991	has_vnet_hdr = true;
				2992	}
				2993
				2994	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
				2995	if (!netif_supports_nofcs(dev)) {
				2996	err = -EPROTONOSUPPORT;
				2997	goto out_unlock;
				2998	}
				2999	extra_len = 4; /* We're doing our own CRC */
				3000	}
				3001
				3002	err = -EMSGSIZE;
				3003	if (!vnet_hdr.gso_type &&
				3004	(len > dev->mtu + reserve + VLAN_HLEN + extra_len))
				3005	goto out_unlock;
				3006
				3007	err = -ENOBUFS;
				3008	hlen = LL_RESERVED_SPACE(dev);
				3009	tlen = dev->needed_tailroom;
				3010	linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
				3011	linear = max(linear, min_t(int, len, dev->hard_header_len));
				3012	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
				3013	msg->msg_flags & MSG_DONTWAIT, &err);
				3014	if (skb == NULL)
				3015	goto out_unlock;
				3016
				3017	skb_reset_network_header(skb);
				3018
				3019	err = -EINVAL;
				3020	if (sock->type == SOCK_DGRAM) {
				3021	offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
				3022	if (unlikely(offset < 0))
				3023	goto out_free;
				3024	} else if (reserve) {
				3025	skb_reserve(skb, -reserve);
				3026	if (len < reserve + sizeof(struct ipv6hdr) &&
				3027	dev->min_header_len != dev->hard_header_len)
				3028	skb_reset_network_header(skb);
				3029	}
				3030
				3031	/* Returns -EFAULT on error */
				3032	err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
				3033	if (err)
				3034	goto out_free;
				3035
				3036	if ((sock->type == SOCK_RAW &&
				3037	!dev_validate_header(dev, skb->data, len)) \|\| !skb->len) {
				3038	err = -EINVAL;
				3039	goto out_free;
				3040	}
				3041
				3042	skb_setup_tx_timestamp(skb, sockc.tsflags);
				3043
				3044	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
				3045	!packet_extra_vlan_len_allowed(dev, skb)) {
				3046	err = -EMSGSIZE;
				3047	goto out_free;
				3048	}
				3049
				3050	skb->protocol = proto;
				3051	skb->dev = dev;
				3052	skb->priority = sk->sk_priority;
				3053	skb->mark = sockc.mark;
				3054	skb->tstamp = sockc.transmit_time;
				3055
				3056	if (unlikely(extra_len == 4))
				3057	skb->no_fcs = 1;
				3058
				3059	packet_parse_headers(skb, sock);
				3060
				3061	if (has_vnet_hdr) {
				3062	err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
				3063	if (err)
				3064	goto out_free;
				3065	len += sizeof(vnet_hdr);
				3066	virtio_net_hdr_set_proto(skb, &vnet_hdr);
				3067	}
				3068
				3069	/* Paired with WRITE_ONCE() in packet_setsockopt() */
				3070	err = READ_ONCE(po->xmit)(skb);
				3071	if (unlikely(err != 0)) {
				3072	if (err > 0)
				3073	err = net_xmit_errno(err);
				3074	if (err)
				3075	goto out_unlock;
				3076	}
				3077
				3078	dev_put(dev);
				3079
				3080	return len;
				3081
				3082	out_free:
				3083	kfree_skb(skb);
				3084	out_unlock:
				3085	if (dev)
				3086	dev_put(dev);
				3087	out:
				3088	return err;
				3089	}
				3090
				3091	static int packet_sendmsg(struct socket sock, struct msghdr msg, size_t len)
				3092	{
				3093	struct sock *sk = sock->sk;
				3094	struct packet_sock *po = pkt_sk(sk);
				3095
				3096	if (po->tx_ring.pg_vec)
				3097	return tpacket_snd(po, msg);
				3098	else
				3099	return packet_snd(sock, msg, len);
				3100	}
				3101
				3102	/*
				3103	* Close a PACKET socket. This is fairly simple. We immediately go
				3104	* to 'closed' state and remove our protocol entry in the device list.
				3105	*/
				3106
				3107	static int packet_release(struct socket *sock)
				3108	{
				3109	struct sock *sk = sock->sk;
				3110	struct packet_sock *po;
				3111	struct packet_fanout *f;
				3112	struct net *net;
				3113	union tpacket_req_u req_u;
				3114
				3115	if (!sk)
				3116	return 0;
				3117
				3118	net = sock_net(sk);
				3119	po = pkt_sk(sk);
				3120
				3121	mutex_lock(&net->packet.sklist_lock);
				3122	sk_del_node_init_rcu(sk);
				3123	mutex_unlock(&net->packet.sklist_lock);
				3124
				3125	preempt_disable();
				3126	sock_prot_inuse_add(net, sk->sk_prot, -1);
				3127	preempt_enable();
				3128
				3129	spin_lock(&po->bind_lock);
				3130	unregister_prot_hook(sk, false);
				3131	packet_cached_dev_reset(po);
				3132
				3133	if (po->prot_hook.dev) {
				3134	dev_put(po->prot_hook.dev);
				3135	po->prot_hook.dev = NULL;
				3136	}
				3137	spin_unlock(&po->bind_lock);
				3138
				3139	packet_flush_mclist(sk);
				3140
				3141	lock_sock(sk);
				3142	if (po->rx_ring.pg_vec) {
				3143	memset(&req_u, 0, sizeof(req_u));
				3144	packet_set_ring(sk, &req_u, 1, 0);
				3145	}
				3146
				3147	if (po->tx_ring.pg_vec) {
				3148	memset(&req_u, 0, sizeof(req_u));
				3149	packet_set_ring(sk, &req_u, 1, 1);
				3150	}
				3151	release_sock(sk);
				3152
				3153	f = fanout_release(sk);
				3154
				3155	synchronize_net();
				3156
				3157	kfree(po->rollover);
				3158	if (f) {
				3159	fanout_release_data(f);
				3160	kfree(f);
				3161	}
				3162	/*
				3163	* Now the socket is dead. No more input will appear.
				3164	*/
				3165	sock_orphan(sk);
				3166	sock->sk = NULL;
				3167
				3168	/* Purge queues */
				3169
				3170	skb_queue_purge(&sk->sk_receive_queue);
				3171	packet_free_pending(po);
				3172	sk_refcnt_debug_release(sk);
				3173
				3174	sock_put(sk);
				3175	return 0;
				3176	}
				3177
				3178	/*
				3179	* Attach a packet hook.
				3180	*/
				3181
				3182	static int packet_do_bind(struct sock sk, const char name, int ifindex,
				3183	__be16 proto)
				3184	{
				3185	struct packet_sock *po = pkt_sk(sk);
				3186	struct net_device *dev_curr;
				3187	__be16 proto_curr;
				3188	bool need_rehook;
				3189	struct net_device *dev = NULL;
				3190	int ret = 0;
				3191	bool unlisted = false;
				3192
				3193	lock_sock(sk);
				3194	spin_lock(&po->bind_lock);
				3195	if (!proto)
				3196	proto = po->num;
				3197
				3198	rcu_read_lock();
				3199
				3200	if (po->fanout) {
				3201	ret = -EINVAL;
				3202	goto out_unlock;
				3203	}
				3204
				3205	if (name) {
				3206	dev = dev_get_by_name_rcu(sock_net(sk), name);
				3207	if (!dev) {
				3208	ret = -ENODEV;
				3209	goto out_unlock;
				3210	}
				3211	} else if (ifindex) {
				3212	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
				3213	if (!dev) {
				3214	ret = -ENODEV;
				3215	goto out_unlock;
				3216	}
				3217	}
				3218
				3219	if (dev)
				3220	dev_hold(dev);
				3221
				3222	proto_curr = po->prot_hook.type;
				3223	dev_curr = po->prot_hook.dev;
				3224
				3225	need_rehook = proto_curr != proto \|\| dev_curr != dev;
				3226
				3227	if (need_rehook) {
				3228	if (po->running) {
				3229	rcu_read_unlock();
				3230	/* prevents packet_notifier() from calling
				3231	* register_prot_hook()
				3232	*/
				3233	WRITE_ONCE(po->num, 0);
				3234	__unregister_prot_hook(sk, true);
				3235	rcu_read_lock();
				3236	dev_curr = po->prot_hook.dev;
				3237	if (dev)
				3238	unlisted = !dev_get_by_index_rcu(sock_net(sk),
				3239	dev->ifindex);
				3240	}
				3241
				3242	BUG_ON(po->running);
				3243	WRITE_ONCE(po->num, proto);
				3244	po->prot_hook.type = proto;
				3245
				3246	if (unlikely(unlisted)) {
				3247	dev_put(dev);
				3248	po->prot_hook.dev = NULL;
				3249	WRITE_ONCE(po->ifindex, -1);
				3250	packet_cached_dev_reset(po);
				3251	} else {
				3252	po->prot_hook.dev = dev;
				3253	WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
				3254	packet_cached_dev_assign(po, dev);
				3255	}
				3256	}
				3257	if (dev_curr)
				3258	dev_put(dev_curr);
				3259
				3260	if (proto == 0 \|\| !need_rehook)
				3261	goto out_unlock;
				3262
				3263	if (!unlisted && (!dev \|\| (dev->flags & IFF_UP))) {
				3264	register_prot_hook(sk);
				3265	} else {
				3266	sk->sk_err = ENETDOWN;
				3267	if (!sock_flag(sk, SOCK_DEAD))
				3268	sk->sk_error_report(sk);
				3269	}
				3270
				3271	out_unlock:
				3272	rcu_read_unlock();
				3273	spin_unlock(&po->bind_lock);
				3274	release_sock(sk);
				3275	return ret;
				3276	}
				3277
				3278	/*
				3279	* Bind a packet socket to a device
				3280	*/
				3281
				3282	static int packet_bind_spkt(struct socket sock, struct sockaddr uaddr,
				3283	int addr_len)
				3284	{
				3285	struct sock *sk = sock->sk;
				3286	char name[sizeof(uaddr->sa_data) + 1];
				3287
				3288	/*
				3289	* Check legality
				3290	*/
				3291
				3292	if (addr_len != sizeof(struct sockaddr))
				3293	return -EINVAL;
				3294	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
				3295	* zero-terminated.
				3296	*/
				3297	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
				3298	name[sizeof(uaddr->sa_data)] = 0;
				3299
				3300	return packet_do_bind(sk, name, 0, 0);
				3301	}
				3302
				3303	static int packet_bind(struct socket sock, struct sockaddr uaddr, int addr_len)
				3304	{
				3305	struct sockaddr_ll sll = (struct sockaddr_ll )uaddr;
				3306	struct sock *sk = sock->sk;
				3307
				3308	/*
				3309	* Check legality
				3310	*/
				3311
				3312	if (addr_len < sizeof(struct sockaddr_ll))
				3313	return -EINVAL;
				3314	if (sll->sll_family != AF_PACKET)
				3315	return -EINVAL;
				3316
				3317	return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
				3318	}
				3319
				3320	static struct proto packet_proto = {
				3321	.name = "PACKET",
				3322	.owner = THIS_MODULE,
				3323	.obj_size = sizeof(struct packet_sock),
				3324	};
				3325
				3326	/*
				3327	* Create a packet of type SOCK_PACKET.
				3328	*/
				3329
				3330	static int packet_create(struct net net, struct socket sock, int protocol,
				3331	int kern)
				3332	{
				3333	struct sock *sk;
				3334	struct packet_sock *po;
				3335	__be16 proto = (__force __be16)protocol; /* weird, but documented */
				3336	int err;
				3337
				3338	if (!ns_capable(net->user_ns, CAP_NET_RAW))
				3339	return -EPERM;
				3340	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
				3341	sock->type != SOCK_PACKET)
				3342	return -ESOCKTNOSUPPORT;
				3343
				3344	sock->state = SS_UNCONNECTED;
				3345
				3346	err = -ENOBUFS;
				3347	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
				3348	if (sk == NULL)
				3349	goto out;
				3350
				3351	sock->ops = &packet_ops;
				3352	if (sock->type == SOCK_PACKET)
				3353	sock->ops = &packet_ops_spkt;
				3354
				3355	po = pkt_sk(sk);
				3356	err = packet_alloc_pending(po);
				3357	if (err)
				3358	goto out_sk_free;
				3359
				3360	sock_init_data(sock, sk);
				3361
				3362	init_completion(&po->skb_completion);
				3363	sk->sk_family = PF_PACKET;
				3364	po->num = proto;
				3365	po->xmit = dev_queue_xmit;
				3366
				3367	packet_cached_dev_reset(po);
				3368
				3369	sk->sk_destruct = packet_sock_destruct;
				3370	sk_refcnt_debug_inc(sk);
				3371
				3372	/*
				3373	* Attach a protocol block
				3374	*/
				3375
				3376	spin_lock_init(&po->bind_lock);
				3377	mutex_init(&po->pg_vec_lock);
				3378	po->rollover = NULL;
				3379	po->prot_hook.func = packet_rcv;
				3380	po->pkt_type = PACKET_MASK_ANY & ~(1 << PACKET_LOOPBACK);
				3381
				3382	if (sock->type == SOCK_PACKET)
				3383	po->prot_hook.func = packet_rcv_spkt;
				3384
				3385	po->prot_hook.af_packet_priv = sk;
				3386	po->prot_hook.af_packet_net = sock_net(sk);
				3387
				3388	if (proto) {
				3389	po->prot_hook.type = proto;
				3390	__register_prot_hook(sk);
				3391	}
				3392
				3393	mutex_lock(&net->packet.sklist_lock);
				3394	sk_add_node_tail_rcu(sk, &net->packet.sklist);
				3395	mutex_unlock(&net->packet.sklist_lock);
				3396
				3397	preempt_disable();
				3398	sock_prot_inuse_add(net, &packet_proto, 1);
				3399	preempt_enable();
				3400
				3401	return 0;
				3402	out_sk_free:
				3403	sk_free(sk);
				3404	out:
				3405	return err;
				3406	}
				3407
				3408	/*
				3409	* Pull a packet from our receive queue and hand it to the user.
				3410	* If necessary we block.
				3411	*/
				3412
				3413	static int packet_recvmsg(struct socket sock, struct msghdr msg, size_t len,
				3414	int flags)
				3415	{
				3416	struct sock *sk = sock->sk;
				3417	struct sk_buff *skb;
				3418	int copied, err;
				3419	int vnet_hdr_len = 0;
				3420	unsigned int origlen = 0;
				3421
				3422	err = -EINVAL;
				3423	if (flags & ~(MSG_PEEK\|MSG_DONTWAIT\|MSG_TRUNC\|MSG_CMSG_COMPAT\|MSG_ERRQUEUE))
				3424	goto out;
				3425
				3426	#if 0
				3427	/* What error should we return now? EUNATTACH? */
				3428	if (pkt_sk(sk)->ifindex < 0)
				3429	return -ENODEV;
				3430	#endif
				3431
				3432	if (flags & MSG_ERRQUEUE) {
				3433	err = sock_recv_errqueue(sk, msg, len,
				3434	SOL_PACKET, PACKET_TX_TIMESTAMP);
				3435	goto out;
				3436	}
				3437
				3438	/*
				3439	* Call the generic datagram receiver. This handles all sorts
				3440	* of horrible races and re-entrancy so we can forget about it
				3441	* in the protocol layers.
				3442	*
				3443	* Now it will return ENETDOWN, if device have just gone down,
				3444	* but then it will block.
				3445	*/
				3446
				3447	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
				3448
				3449	/*
				3450	* An error occurred so return it. Because skb_recv_datagram()
				3451	* handles the blocking we don't see and worry about blocking
				3452	* retries.
				3453	*/
				3454
				3455	if (skb == NULL)
				3456	goto out;
				3457
				3458	packet_rcv_try_clear_pressure(pkt_sk(sk));
				3459
				3460	if (pkt_sk(sk)->has_vnet_hdr) {
				3461	err = packet_rcv_vnet(msg, skb, &len);
				3462	if (err)
				3463	goto out_free;
				3464	vnet_hdr_len = sizeof(struct virtio_net_hdr);
				3465	}
				3466
				3467	/* You lose any data beyond the buffer you gave. If it worries
				3468	* a user program they can ask the device for its MTU
				3469	* anyway.
				3470	*/
				3471	copied = skb->len;
				3472	if (copied > len) {
				3473	copied = len;
				3474	msg->msg_flags \|= MSG_TRUNC;
				3475	}
				3476
				3477	err = skb_copy_datagram_msg(skb, 0, msg, copied);
				3478	if (err)
				3479	goto out_free;
				3480
				3481	if (sock->type != SOCK_PACKET) {
				3482	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
				3483
				3484	/* Original length was stored in sockaddr_ll fields */
				3485	origlen = PACKET_SKB_CB(skb)->sa.origlen;
				3486	sll->sll_family = AF_PACKET;
				3487	sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
				3488	vlan_get_protocol_dgram(skb) : skb->protocol;
				3489	}
				3490
				3491	sock_recv_ts_and_drops(msg, sk, skb);
				3492
				3493	if (msg->msg_name) {
				3494	const size_t max_len = min(sizeof(skb->cb),
				3495	sizeof(struct sockaddr_storage));
				3496	int copy_len;
				3497
				3498	/* If the address length field is there to be filled
				3499	* in, we fill it in now.
				3500	*/
				3501	if (sock->type == SOCK_PACKET) {
				3502	__sockaddr_check_size(sizeof(struct sockaddr_pkt));
				3503	msg->msg_namelen = sizeof(struct sockaddr_pkt);
				3504	copy_len = msg->msg_namelen;
				3505	} else {
				3506	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
				3507
				3508	msg->msg_namelen = sll->sll_halen +
				3509	offsetof(struct sockaddr_ll, sll_addr);
				3510	copy_len = msg->msg_namelen;
				3511	if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
				3512	memset(msg->msg_name +
				3513	offsetof(struct sockaddr_ll, sll_addr),
				3514	0, sizeof(sll->sll_addr));
				3515	msg->msg_namelen = sizeof(struct sockaddr_ll);
				3516	}
				3517	}
				3518	if (WARN_ON_ONCE(copy_len > max_len)) {
				3519	copy_len = max_len;
				3520	msg->msg_namelen = copy_len;
				3521	}
				3522	memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
				3523	}
				3524
				3525	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
				3526	struct tpacket_auxdata aux;
				3527
				3528	aux.tp_status = TP_STATUS_USER;
				3529	if (skb->ip_summed == CHECKSUM_PARTIAL)
				3530	aux.tp_status \|= TP_STATUS_CSUMNOTREADY;
				3531	else if (skb->pkt_type != PACKET_OUTGOING &&
				3532	skb_csum_unnecessary(skb))
				3533	aux.tp_status \|= TP_STATUS_CSUM_VALID;
				3534
				3535	aux.tp_len = origlen;
				3536	aux.tp_snaplen = skb->len;
				3537	aux.tp_mac = 0;
				3538	aux.tp_net = skb_network_offset(skb);
				3539	if (skb_vlan_tag_present(skb)) {
				3540	aux.tp_vlan_tci = skb_vlan_tag_get(skb);
				3541	aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
				3542	aux.tp_status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				3543	} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
				3544	struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
				3545	struct net_device *dev;
				3546
				3547	rcu_read_lock();
				3548	dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
				3549	if (dev) {
				3550	aux.tp_vlan_tci = vlan_get_tci(skb, dev);
				3551	aux.tp_vlan_tpid = ntohs(skb->protocol);
				3552	aux.tp_status \|= TP_STATUS_VLAN_VALID \| TP_STATUS_VLAN_TPID_VALID;
				3553	} else {
				3554	aux.tp_vlan_tci = 0;
				3555	aux.tp_vlan_tpid = 0;
				3556	}
				3557	rcu_read_unlock();
				3558	} else {
				3559	aux.tp_vlan_tci = 0;
				3560	aux.tp_vlan_tpid = 0;
				3561	}
				3562	put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
				3563	}
				3564
				3565	/*
				3566	* Free or return the buffer as appropriate. Again this
				3567	* hides all the races and re-entrancy issues from us.
				3568	*/
				3569	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
				3570
				3571	out_free:
				3572	skb_free_datagram(sk, skb);
				3573	out:
				3574	return err;
				3575	}
				3576
				3577	static int packet_getname_spkt(struct socket sock, struct sockaddr uaddr,
				3578	int peer)
				3579	{
				3580	struct net_device *dev;
				3581	struct sock *sk = sock->sk;
				3582
				3583	if (peer)
				3584	return -EOPNOTSUPP;
				3585
				3586	uaddr->sa_family = AF_PACKET;
				3587	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
				3588	rcu_read_lock();
				3589	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
				3590	if (dev)
				3591	strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
				3592	rcu_read_unlock();
				3593
				3594	return sizeof(*uaddr);
				3595	}
				3596
				3597	static int packet_getname(struct socket sock, struct sockaddr uaddr,
				3598	int peer)
				3599	{
				3600	struct net_device *dev;
				3601	struct sock *sk = sock->sk;
				3602	struct packet_sock *po = pkt_sk(sk);
				3603	DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
				3604	int ifindex;
				3605
				3606	if (peer)
				3607	return -EOPNOTSUPP;
				3608
				3609	ifindex = READ_ONCE(po->ifindex);
				3610	sll->sll_family = AF_PACKET;
				3611	sll->sll_ifindex = ifindex;
				3612	sll->sll_protocol = READ_ONCE(po->num);
				3613	sll->sll_pkttype = 0;
				3614	rcu_read_lock();
				3615	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
				3616	if (dev) {
				3617	sll->sll_hatype = dev->type;
				3618	sll->sll_halen = dev->addr_len;
				3619	memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
				3620	} else {
				3621	sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
				3622	sll->sll_halen = 0;
				3623	}
				3624	rcu_read_unlock();
				3625
				3626	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
				3627	}
				3628
				3629	static int packet_dev_mc(struct net_device dev, struct packet_mclist i,
				3630	int what)
				3631	{
				3632	switch (i->type) {
				3633	case PACKET_MR_MULTICAST:
				3634	if (i->alen != dev->addr_len)
				3635	return -EINVAL;
				3636	if (what > 0)
				3637	return dev_mc_add(dev, i->addr);
				3638	else
				3639	return dev_mc_del(dev, i->addr);
				3640	break;
				3641	case PACKET_MR_PROMISC:
				3642	return dev_set_promiscuity(dev, what);
				3643	case PACKET_MR_ALLMULTI:
				3644	return dev_set_allmulti(dev, what);
				3645	case PACKET_MR_UNICAST:
				3646	if (i->alen != dev->addr_len)
				3647	return -EINVAL;
				3648	if (what > 0)
				3649	return dev_uc_add(dev, i->addr);
				3650	else
				3651	return dev_uc_del(dev, i->addr);
				3652	break;
				3653	default:
				3654	break;
				3655	}
				3656	return 0;
				3657	}
				3658
				3659	static void packet_dev_mclist_delete(struct net_device *dev,
				3660	struct packet_mclist **mlp)
				3661	{
				3662	struct packet_mclist *ml;
				3663
				3664	while ((ml = *mlp) != NULL) {
				3665	if (ml->ifindex == dev->ifindex) {
				3666	packet_dev_mc(dev, ml, -1);
				3667	*mlp = ml->next;
				3668	kfree(ml);
				3669	} else
				3670	mlp = &ml->next;
				3671	}
				3672	}
				3673
				3674	static int packet_mc_add(struct sock sk, struct packet_mreq_max mreq)
				3675	{
				3676	struct packet_sock *po = pkt_sk(sk);
				3677	struct packet_mclist ml, i;
				3678	struct net_device *dev;
				3679	int err;
				3680
				3681	rtnl_lock();
				3682
				3683	err = -ENODEV;
				3684	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
				3685	if (!dev)
				3686	goto done;
				3687
				3688	err = -EINVAL;
				3689	if (mreq->mr_alen > dev->addr_len)
				3690	goto done;
				3691
				3692	err = -ENOBUFS;
				3693	i = kmalloc(sizeof(*i), GFP_KERNEL);
				3694	if (i == NULL)
				3695	goto done;
				3696
				3697	err = 0;
				3698	for (ml = po->mclist; ml; ml = ml->next) {
				3699	if (ml->ifindex == mreq->mr_ifindex &&
				3700	ml->type == mreq->mr_type &&
				3701	ml->alen == mreq->mr_alen &&
				3702	memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
				3703	ml->count++;
				3704	/* Free the new element ... */
				3705	kfree(i);
				3706	goto done;
				3707	}
				3708	}
				3709
				3710	i->type = mreq->mr_type;
				3711	i->ifindex = mreq->mr_ifindex;
				3712	i->alen = mreq->mr_alen;
				3713	memcpy(i->addr, mreq->mr_address, i->alen);
				3714	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
				3715	i->count = 1;
				3716	i->next = po->mclist;
				3717	po->mclist = i;
				3718	err = packet_dev_mc(dev, i, 1);
				3719	if (err) {
				3720	po->mclist = i->next;
				3721	kfree(i);
				3722	}
				3723
				3724	done:
				3725	rtnl_unlock();
				3726	return err;
				3727	}
				3728
				3729	static int packet_mc_drop(struct sock sk, struct packet_mreq_max mreq)
				3730	{
				3731	struct packet_mclist ml, *mlp;
				3732
				3733	rtnl_lock();
				3734
				3735	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
				3736	if (ml->ifindex == mreq->mr_ifindex &&
				3737	ml->type == mreq->mr_type &&
				3738	ml->alen == mreq->mr_alen &&
				3739	memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
				3740	if (--ml->count == 0) {
				3741	struct net_device *dev;
				3742	*mlp = ml->next;
				3743	dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				3744	if (dev)
				3745	packet_dev_mc(dev, ml, -1);
				3746	kfree(ml);
				3747	}
				3748	break;
				3749	}
				3750	}
				3751	rtnl_unlock();
				3752	return 0;
				3753	}
				3754
				3755	static void packet_flush_mclist(struct sock *sk)
				3756	{
				3757	struct packet_sock *po = pkt_sk(sk);
				3758	struct packet_mclist *ml;
				3759
				3760	if (!po->mclist)
				3761	return;
				3762
				3763	rtnl_lock();
				3764	while ((ml = po->mclist) != NULL) {
				3765	struct net_device *dev;
				3766
				3767	po->mclist = ml->next;
				3768	dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
				3769	if (dev != NULL)
				3770	packet_dev_mc(dev, ml, -1);
				3771	kfree(ml);
				3772	}
				3773	rtnl_unlock();
				3774	}
				3775
				3776	static int
				3777	packet_setsockopt(struct socket sock, int level, int optname, char __user optval, unsigned int optlen)
				3778	{
				3779	struct sock *sk = sock->sk;
				3780	struct packet_sock *po = pkt_sk(sk);
				3781	int ret;
				3782
				3783	if (level != SOL_PACKET)
				3784	return -ENOPROTOOPT;
				3785
				3786	switch (optname) {
				3787	case PACKET_ADD_MEMBERSHIP:
				3788	case PACKET_DROP_MEMBERSHIP:
				3789	{
				3790	struct packet_mreq_max mreq;
				3791	int len = optlen;
				3792	memset(&mreq, 0, sizeof(mreq));
				3793	if (len < sizeof(struct packet_mreq))
				3794	return -EINVAL;
				3795	if (len > sizeof(mreq))
				3796	len = sizeof(mreq);
				3797	if (copy_from_user(&mreq, optval, len))
				3798	return -EFAULT;
				3799	if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
				3800	return -EINVAL;
				3801	if (optname == PACKET_ADD_MEMBERSHIP)
				3802	ret = packet_mc_add(sk, &mreq);
				3803	else
				3804	ret = packet_mc_drop(sk, &mreq);
				3805	return ret;
				3806	}
				3807
				3808	case PACKET_RX_RING:
				3809	case PACKET_TX_RING:
				3810	{
				3811	union tpacket_req_u req_u;
				3812	int len;
				3813
				3814	lock_sock(sk);
				3815	switch (po->tp_version) {
				3816	case TPACKET_V1:
				3817	case TPACKET_V2:
				3818	len = sizeof(req_u.req);
				3819	break;
				3820	case TPACKET_V3:
				3821	default:
				3822	len = sizeof(req_u.req3);
				3823	break;
				3824	}
				3825	if (optlen < len) {
				3826	ret = -EINVAL;
				3827	} else {
				3828	if (copy_from_user(&req_u.req, optval, len))
				3829	ret = -EFAULT;
				3830	else
				3831	ret = packet_set_ring(sk, &req_u, 0,
				3832	optname == PACKET_TX_RING);
				3833	}
				3834	release_sock(sk);
				3835	return ret;
				3836	}
				3837	case PACKET_COPY_THRESH:
				3838	{
				3839	int val;
				3840
				3841	if (optlen != sizeof(val))
				3842	return -EINVAL;
				3843	if (copy_from_user(&val, optval, sizeof(val)))
				3844	return -EFAULT;
				3845
				3846	pkt_sk(sk)->copy_thresh = val;
				3847	return 0;
				3848	}
				3849	case PACKET_VERSION:
				3850	{
				3851	int val;
				3852
				3853	if (optlen != sizeof(val))
				3854	return -EINVAL;
				3855	if (copy_from_user(&val, optval, sizeof(val)))
				3856	return -EFAULT;
				3857	switch (val) {
				3858	case TPACKET_V1:
				3859	case TPACKET_V2:
				3860	case TPACKET_V3:
				3861	break;
				3862	default:
				3863	return -EINVAL;
				3864	}
				3865	lock_sock(sk);
				3866	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
				3867	ret = -EBUSY;
				3868	} else {
				3869	po->tp_version = val;
				3870	ret = 0;
				3871	}
				3872	release_sock(sk);
				3873	return ret;
				3874	}
				3875	case PACKET_RESERVE:
				3876	{
				3877	unsigned int val;
				3878
				3879	if (optlen != sizeof(val))
				3880	return -EINVAL;
				3881	if (copy_from_user(&val, optval, sizeof(val)))
				3882	return -EFAULT;
				3883	if (val > INT_MAX)
				3884	return -EINVAL;
				3885	lock_sock(sk);
				3886	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
				3887	ret = -EBUSY;
				3888	} else {
				3889	po->tp_reserve = val;
				3890	ret = 0;
				3891	}
				3892	release_sock(sk);
				3893	return ret;
				3894	}
				3895	case PACKET_LOSS:
				3896	{
				3897	unsigned int val;
				3898
				3899	if (optlen != sizeof(val))
				3900	return -EINVAL;
				3901	if (copy_from_user(&val, optval, sizeof(val)))
				3902	return -EFAULT;
				3903
				3904	lock_sock(sk);
				3905	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
				3906	ret = -EBUSY;
				3907	} else {
				3908	po->tp_loss = !!val;
				3909	ret = 0;
				3910	}
				3911	release_sock(sk);
				3912	return ret;
				3913	}
				3914	case PACKET_AUXDATA:
				3915	{
				3916	int val;
				3917
				3918	if (optlen < sizeof(val))
				3919	return -EINVAL;
				3920	if (copy_from_user(&val, optval, sizeof(val)))
				3921	return -EFAULT;
				3922
				3923	packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
				3924	return 0;
				3925	}
				3926	case PACKET_ORIGDEV:
				3927	{
				3928	int val;
				3929
				3930	if (optlen < sizeof(val))
				3931	return -EINVAL;
				3932	if (copy_from_user(&val, optval, sizeof(val)))
				3933	return -EFAULT;
				3934
				3935	packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
				3936	return 0;
				3937	}
				3938	case PACKET_VNET_HDR:
				3939	{
				3940	int val;
				3941
				3942	if (sock->type != SOCK_RAW)
				3943	return -EINVAL;
				3944	if (optlen < sizeof(val))
				3945	return -EINVAL;
				3946	if (copy_from_user(&val, optval, sizeof(val)))
				3947	return -EFAULT;
				3948
				3949	lock_sock(sk);
				3950	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
				3951	ret = -EBUSY;
				3952	} else {
				3953	po->has_vnet_hdr = !!val;
				3954	ret = 0;
				3955	}
				3956	release_sock(sk);
				3957	return ret;
				3958	}
				3959	case PACKET_TIMESTAMP:
				3960	{
				3961	int val;
				3962
				3963	if (optlen != sizeof(val))
				3964	return -EINVAL;
				3965	if (copy_from_user(&val, optval, sizeof(val)))
				3966	return -EFAULT;
				3967
				3968	po->tp_tstamp = val;
				3969	return 0;
				3970	}
				3971	case PACKET_FANOUT:
				3972	{
				3973	int val;
				3974
				3975	if (optlen != sizeof(val))
				3976	return -EINVAL;
				3977	if (copy_from_user(&val, optval, sizeof(val)))
				3978	return -EFAULT;
				3979
				3980	return fanout_add(sk, val & 0xffff, val >> 16);
				3981	}
				3982	case PACKET_FANOUT_DATA:
				3983	{
				3984	/* Paired with the WRITE_ONCE() in fanout_add() */
				3985	if (!READ_ONCE(po->fanout))
				3986	return -EINVAL;
				3987
				3988	return fanout_set_data(po, optval, optlen);
				3989	}
				3990	case PACKET_IGNORE_OUTGOING:
				3991	{
				3992	int val;
				3993
				3994	if (optlen != sizeof(val))
				3995	return -EINVAL;
				3996	if (copy_from_user(&val, optval, sizeof(val)))
				3997	return -EFAULT;
				3998	if (val < 0 \|\| val > 1)
				3999	return -EINVAL;
				4000
				4001	WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
				4002	return 0;
				4003	}
				4004	case PACKET_TX_HAS_OFF:
				4005	{
				4006	unsigned int val;
				4007
				4008	if (optlen != sizeof(val))
				4009	return -EINVAL;
				4010	if (copy_from_user(&val, optval, sizeof(val)))
				4011	return -EFAULT;
				4012
				4013	lock_sock(sk);
				4014	if (po->rx_ring.pg_vec \|\| po->tx_ring.pg_vec) {
				4015	ret = -EBUSY;
				4016	} else {
				4017	po->tp_tx_has_off = !!val;
				4018	ret = 0;
				4019	}
				4020	release_sock(sk);
				4021	return 0;
				4022	}
				4023	case PACKET_QDISC_BYPASS:
				4024	{
				4025	int val;
				4026
				4027	if (optlen != sizeof(val))
				4028	return -EINVAL;
				4029	if (copy_from_user(&val, optval, sizeof(val)))
				4030	return -EFAULT;
				4031
				4032	/* Paired with all lockless reads of po->xmit */
				4033	WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
				4034	return 0;
				4035	}
				4036	case PACKET_RECV_TYPE:
				4037	{
				4038	unsigned int val;
				4039	if (optlen != sizeof(val))
				4040	return -EINVAL;
				4041	if (copy_from_user(&val, optval, sizeof(val)))
				4042	return -EFAULT;
				4043	po->pkt_type = val & ~BIT(PACKET_LOOPBACK);
				4044	return 0;
				4045	}
				4046	default:
				4047	return -ENOPROTOOPT;
				4048	}
				4049	}
				4050
				4051	static int packet_getsockopt(struct socket *sock, int level, int optname,
				4052	char __user optval, int __user optlen)
				4053	{
				4054	int len;
				4055	int val, lv = sizeof(val);
				4056	struct sock *sk = sock->sk;
				4057	struct packet_sock *po = pkt_sk(sk);
				4058	void *data = &val;
				4059	union tpacket_stats_u st;
				4060	struct tpacket_rollover_stats rstats;
				4061	int drops;
				4062
				4063	if (level != SOL_PACKET)
				4064	return -ENOPROTOOPT;
				4065
				4066	if (get_user(len, optlen))
				4067	return -EFAULT;
				4068
				4069	if (len < 0)
				4070	return -EINVAL;
				4071
				4072	switch (optname) {
				4073	case PACKET_STATISTICS:
				4074	spin_lock_bh(&sk->sk_receive_queue.lock);
				4075	memcpy(&st, &po->stats, sizeof(st));
				4076	memset(&po->stats, 0, sizeof(po->stats));
				4077	spin_unlock_bh(&sk->sk_receive_queue.lock);
				4078	drops = atomic_xchg(&po->tp_drops, 0);
				4079
				4080	if (po->tp_version == TPACKET_V3) {
				4081	lv = sizeof(struct tpacket_stats_v3);
				4082	st.stats3.tp_drops = drops;
				4083	st.stats3.tp_packets += drops;
				4084	data = &st.stats3;
				4085	} else {
				4086	lv = sizeof(struct tpacket_stats);
				4087	st.stats1.tp_drops = drops;
				4088	st.stats1.tp_packets += drops;
				4089	data = &st.stats1;
				4090	}
				4091
				4092	break;
				4093	case PACKET_AUXDATA:
				4094	val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
				4095	break;
				4096	case PACKET_ORIGDEV:
				4097	val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
				4098	break;
				4099	case PACKET_VNET_HDR:
				4100	val = po->has_vnet_hdr;
				4101	break;
				4102	case PACKET_RECV_TYPE:
				4103	if (len > sizeof(unsigned int))
				4104	len = sizeof(unsigned int);
				4105	val = po->pkt_type;
				4106
				4107	data = &val;
				4108	break;
				4109	case PACKET_VERSION:
				4110	val = po->tp_version;
				4111	break;
				4112	case PACKET_HDRLEN:
				4113	if (len > sizeof(int))
				4114	len = sizeof(int);
				4115	if (len < sizeof(int))
				4116	return -EINVAL;
				4117	if (copy_from_user(&val, optval, len))
				4118	return -EFAULT;
				4119	switch (val) {
				4120	case TPACKET_V1:
				4121	val = sizeof(struct tpacket_hdr);
				4122	break;
				4123	case TPACKET_V2:
				4124	val = sizeof(struct tpacket2_hdr);
				4125	break;
				4126	case TPACKET_V3:
				4127	val = sizeof(struct tpacket3_hdr);
				4128	break;
				4129	default:
				4130	return -EINVAL;
				4131	}
				4132	break;
				4133	case PACKET_RESERVE:
				4134	val = po->tp_reserve;
				4135	break;
				4136	case PACKET_LOSS:
				4137	val = po->tp_loss;
				4138	break;
				4139	case PACKET_TIMESTAMP:
				4140	val = po->tp_tstamp;
				4141	break;
				4142	case PACKET_FANOUT:
				4143	val = (po->fanout ?
				4144	((u32)po->fanout->id \|
				4145	((u32)po->fanout->type << 16) \|
				4146	((u32)po->fanout->flags << 24)) :
				4147	0);
				4148	break;
				4149	case PACKET_IGNORE_OUTGOING:
				4150	val = READ_ONCE(po->prot_hook.ignore_outgoing);
				4151	break;
				4152	case PACKET_ROLLOVER_STATS:
				4153	if (!po->rollover)
				4154	return -EINVAL;
				4155	rstats.tp_all = atomic_long_read(&po->rollover->num);
				4156	rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
				4157	rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
				4158	data = &rstats;
				4159	lv = sizeof(rstats);
				4160	break;
				4161	case PACKET_TX_HAS_OFF:
				4162	val = po->tp_tx_has_off;
				4163	break;
				4164	case PACKET_QDISC_BYPASS:
				4165	val = packet_use_direct_xmit(po);
				4166	break;
				4167	default:
				4168	return -ENOPROTOOPT;
				4169	}
				4170
				4171	if (len > lv)
				4172	len = lv;
				4173	if (put_user(len, optlen))
				4174	return -EFAULT;
				4175	if (copy_to_user(optval, data, len))
				4176	return -EFAULT;
				4177	return 0;
				4178	}
				4179
				4180
				4181	#ifdef CONFIG_COMPAT
				4182	static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
				4183	char __user *optval, unsigned int optlen)
				4184	{
				4185	struct packet_sock *po = pkt_sk(sock->sk);
				4186
				4187	if (level != SOL_PACKET)
				4188	return -ENOPROTOOPT;
				4189
				4190	if (optname == PACKET_FANOUT_DATA &&
				4191	po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
				4192	optval = (char __user *)get_compat_bpf_fprog(optval);
				4193	if (!optval)
				4194	return -EFAULT;
				4195	optlen = sizeof(struct sock_fprog);
				4196	}
				4197
				4198	return packet_setsockopt(sock, level, optname, optval, optlen);
				4199	}
				4200	#endif
				4201
				4202	static int packet_notifier(struct notifier_block *this,
				4203	unsigned long msg, void *ptr)
				4204	{
				4205	struct sock *sk;
				4206	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
				4207	struct net *net = dev_net(dev);
				4208
				4209	rcu_read_lock();
				4210	sk_for_each_rcu(sk, &net->packet.sklist) {
				4211	struct packet_sock *po = pkt_sk(sk);
				4212
				4213	switch (msg) {
				4214	case NETDEV_UNREGISTER:
				4215	if (po->mclist)
				4216	packet_dev_mclist_delete(dev, &po->mclist);
				4217	/* fallthrough */
				4218
				4219	case NETDEV_DOWN:
				4220	if (dev->ifindex == po->ifindex) {
				4221	spin_lock(&po->bind_lock);
				4222	if (po->running) {
				4223	__unregister_prot_hook(sk, false);
				4224	sk->sk_err = ENETDOWN;
				4225	if (!sock_flag(sk, SOCK_DEAD))
				4226	sk->sk_error_report(sk);
				4227	}
				4228	if (msg == NETDEV_UNREGISTER) {
				4229	packet_cached_dev_reset(po);
				4230	WRITE_ONCE(po->ifindex, -1);
				4231	if (po->prot_hook.dev)
				4232	dev_put(po->prot_hook.dev);
				4233	po->prot_hook.dev = NULL;
				4234	}
				4235	spin_unlock(&po->bind_lock);
				4236	}
				4237	break;
				4238	case NETDEV_UP:
				4239	if (dev->ifindex == po->ifindex) {
				4240	spin_lock(&po->bind_lock);
				4241	if (po->num)
				4242	register_prot_hook(sk);
				4243	spin_unlock(&po->bind_lock);
				4244	}
				4245	break;
				4246	}
				4247	}
				4248	rcu_read_unlock();
				4249	return NOTIFY_DONE;
				4250	}
				4251
				4252
				4253	static int packet_ioctl(struct socket *sock, unsigned int cmd,
				4254	unsigned long arg)
				4255	{
				4256	struct sock *sk = sock->sk;
				4257
				4258	switch (cmd) {
				4259	case SIOCOUTQ:
				4260	{
				4261	int amount = sk_wmem_alloc_get(sk);
				4262
				4263	return put_user(amount, (int __user *)arg);
				4264	}
				4265	case SIOCINQ:
				4266	{
				4267	struct sk_buff *skb;
				4268	int amount = 0;
				4269
				4270	spin_lock_bh(&sk->sk_receive_queue.lock);
				4271	skb = skb_peek(&sk->sk_receive_queue);
				4272	if (skb)
				4273	amount = skb->len;
				4274	spin_unlock_bh(&sk->sk_receive_queue.lock);
				4275	return put_user(amount, (int __user *)arg);
				4276	}
				4277	#ifdef CONFIG_INET
				4278	case SIOCADDRT:
				4279	case SIOCDELRT:
				4280	case SIOCDARP:
				4281	case SIOCGARP:
				4282	case SIOCSARP:
				4283	case SIOCGIFADDR:
				4284	case SIOCSIFADDR:
				4285	case SIOCGIFBRDADDR:
				4286	case SIOCSIFBRDADDR:
				4287	case SIOCGIFNETMASK:
				4288	case SIOCSIFNETMASK:
				4289	case SIOCGIFDSTADDR:
				4290	case SIOCSIFDSTADDR:
				4291	case SIOCSIFFLAGS:
				4292	return inet_dgram_ops.ioctl(sock, cmd, arg);
				4293	#endif
				4294
				4295	default:
				4296	return -ENOIOCTLCMD;
				4297	}
				4298	return 0;
				4299	}
				4300
				4301	static __poll_t packet_poll(struct file file, struct socket sock,
				4302	poll_table *wait)
				4303	{
				4304	struct sock *sk = sock->sk;
				4305	struct packet_sock *po = pkt_sk(sk);
				4306	__poll_t mask = datagram_poll(file, sock, wait);
				4307
				4308	spin_lock_bh(&sk->sk_receive_queue.lock);
				4309	if (po->rx_ring.pg_vec) {
				4310	if (!packet_previous_rx_frame(po, &po->rx_ring,
				4311	TP_STATUS_KERNEL))
				4312	mask \|= EPOLLIN \| EPOLLRDNORM;
				4313	}
				4314	packet_rcv_try_clear_pressure(po);
				4315	spin_unlock_bh(&sk->sk_receive_queue.lock);
				4316	spin_lock_bh(&sk->sk_write_queue.lock);
				4317	if (po->tx_ring.pg_vec) {
				4318	if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
				4319	mask \|= EPOLLOUT \| EPOLLWRNORM;
				4320	}
				4321	spin_unlock_bh(&sk->sk_write_queue.lock);
				4322	return mask;
				4323	}
				4324
				4325
				/* Dirty? Well, I have not yet found a better way to account
				 * for user mmaps.
				 */
				4329
				4330	static void packet_mm_open(struct vm_area_struct *vma)
				4331	{
				4332	struct file *file = vma->vm_file;
				4333	struct socket *sock = file->private_data;
				4334	struct sock *sk = sock->sk;
				4335
				4336	if (sk)
				4337	atomic_long_inc(&pkt_sk(sk)->mapped);
				4338	}
				4339
				4340	static void packet_mm_close(struct vm_area_struct *vma)
				4341	{
				4342	struct file *file = vma->vm_file;
				4343	struct socket *sock = file->private_data;
				4344	struct sock *sk = sock->sk;
				4345
				4346	if (sk)
				4347	atomic_long_dec(&pkt_sk(sk)->mapped);
				4348	}
				4349
				4350	static const struct vm_operations_struct packet_mmap_ops = {
				4351	.open = packet_mm_open,
				4352	.close = packet_mm_close,
				4353	};
				4354
				4355	static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
				4356	unsigned int len)
				4357	{
				4358	int i;
				4359
				4360	for (i = 0; i < len; i++) {
				4361	if (likely(pg_vec[i].buffer)) {
				4362	if (is_vmalloc_addr(pg_vec[i].buffer))
				4363	vfree(pg_vec[i].buffer);
				4364	else
				4365	free_pages((unsigned long)pg_vec[i].buffer,
				4366	order);
				4367	pg_vec[i].buffer = NULL;
				4368	}
				4369	}
				4370	kfree(pg_vec);
				4371	}
				4372
				4373	static char *alloc_one_pg_vec_page(unsigned long order)
				4374	{
				4375	char *buffer;
				4376	gfp_t gfp_flags = GFP_KERNEL \| __GFP_COMP \|
				4377	__GFP_ZERO \| __GFP_NOWARN \| __GFP_NORETRY;
				4378
				4379	buffer = (char *) __get_free_pages(gfp_flags, order);
				4380	if (buffer)
				4381	return buffer;
				4382
				4383	/* __get_free_pages failed, fall back to vmalloc */
				4384	buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
				4385	if (buffer)
				4386	return buffer;
				4387
				4388	/* vmalloc failed, lets dig into swap here */
				4389	gfp_flags &= ~__GFP_NORETRY;
				4390	buffer = (char *) __get_free_pages(gfp_flags, order);
				4391	if (buffer)
				4392	return buffer;
				4393
				4394	/* complete and utter failure */
				4395	return NULL;
				4396	}
				4397
				4398	static struct pgv alloc_pg_vec(struct tpacket_req req, int order)
				4399	{
				4400	unsigned int block_nr = req->tp_block_nr;
				4401	struct pgv *pg_vec;
				4402	int i;
				4403
				4404	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL \| __GFP_NOWARN);
				4405	if (unlikely(!pg_vec))
				4406	goto out;
				4407
				4408	for (i = 0; i < block_nr; i++) {
				4409	pg_vec[i].buffer = alloc_one_pg_vec_page(order);
				4410	if (unlikely(!pg_vec[i].buffer))
				4411	goto out_free_pgvec;
				4412	}
				4413
				4414	out:
				4415	return pg_vec;
				4416
				4417	out_free_pgvec:
				4418	free_pg_vec(pg_vec, order, block_nr);
				4419	pg_vec = NULL;
				4420	goto out;
				4421	}
				4422
				4423	static int packet_set_ring(struct sock sk, union tpacket_req_u req_u,
				4424	int closing, int tx_ring)
				4425	{
				4426	struct pgv *pg_vec = NULL;
				4427	struct packet_sock *po = pkt_sk(sk);
				4428	unsigned long *rx_owner_map = NULL;
				4429	int was_running, order = 0;
				4430	struct packet_ring_buffer *rb;
				4431	struct sk_buff_head *rb_queue;
				4432	__be16 num;
				4433	int err = -EINVAL;
				4434	/* Added to avoid minimal code churn */
				4435	struct tpacket_req *req = &req_u->req;
				4436
				4437	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
				4438	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
				4439
				4440	err = -EBUSY;
				4441	if (!closing) {
				4442	if (atomic_long_read(&po->mapped))
				4443	goto out;
				4444	if (packet_read_pending(rb))
				4445	goto out;
				4446	}
				4447
				4448	if (req->tp_block_nr) {
				4449	unsigned int min_frame_size;
				4450
				4451	/* Sanity tests and some calculations */
				4452	err = -EBUSY;
				4453	if (unlikely(rb->pg_vec))
				4454	goto out;
				4455
				4456	switch (po->tp_version) {
				4457	case TPACKET_V1:
				4458	po->tp_hdrlen = TPACKET_HDRLEN;
				4459	break;
				4460	case TPACKET_V2:
				4461	po->tp_hdrlen = TPACKET2_HDRLEN;
				4462	break;
				4463	case TPACKET_V3:
				4464	po->tp_hdrlen = TPACKET3_HDRLEN;
				4465	break;
				4466	}
				4467
				4468	err = -EINVAL;
				4469	if (unlikely((int)req->tp_block_size <= 0))
				4470	goto out;
				4471	if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
				4472	goto out;
				4473	min_frame_size = po->tp_hdrlen + po->tp_reserve;
				4474	if (po->tp_version >= TPACKET_V3 &&
				4475	req->tp_block_size <
				4476	BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
				4477	goto out;
				4478	if (unlikely(req->tp_frame_size < min_frame_size))
				4479	goto out;
				4480	if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
				4481	goto out;
				4482
				4483	rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
				4484	if (unlikely(rb->frames_per_block == 0))
				4485	goto out;
				4486	if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
				4487	goto out;
				4488	if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
				4489	req->tp_frame_nr))
				4490	goto out;
				4491
				4492	err = -ENOMEM;
				4493	order = get_order(req->tp_block_size);
				4494	pg_vec = alloc_pg_vec(req, order);
				4495	if (unlikely(!pg_vec))
				4496	goto out;
				4497	switch (po->tp_version) {
				4498	case TPACKET_V3:
				4499	/* Block transmit is not supported yet */
				4500	if (!tx_ring) {
				4501	init_prb_bdqc(po, rb, pg_vec, req_u);
				4502	} else {
				4503	struct tpacket_req3 *req3 = &req_u->req3;
				4504
				4505	if (req3->tp_retire_blk_tov \|\|
				4506	req3->tp_sizeof_priv \|\|
				4507	req3->tp_feature_req_word) {
				4508	err = -EINVAL;
				4509	goto out_free_pg_vec;
				4510	}
				4511	}
				4512	break;
				4513	default:
				4514	if (!tx_ring) {
				4515	rx_owner_map = bitmap_alloc(req->tp_frame_nr,
				4516	GFP_KERNEL \| __GFP_NOWARN \| __GFP_ZERO);
				4517	if (!rx_owner_map)
				4518	goto out_free_pg_vec;
				4519	}
				4520	break;
				4521	}
				4522	}
				4523	/* Done */
				4524	else {
				4525	err = -EINVAL;
				4526	if (unlikely(req->tp_frame_nr))
				4527	goto out;
				4528	}
				4529
				4530
				4531	/* Detach socket from network */
				4532	spin_lock(&po->bind_lock);
				4533	was_running = po->running;
				4534	num = po->num;
				4535	if (was_running) {
				4536	WRITE_ONCE(po->num, 0);
				4537	__unregister_prot_hook(sk, false);
				4538	}
				4539	spin_unlock(&po->bind_lock);
				4540
				4541	synchronize_net();
				4542
				4543	err = -EBUSY;
				4544	mutex_lock(&po->pg_vec_lock);
				4545	if (closing \|\| atomic_long_read(&po->mapped) == 0) {
				4546	err = 0;
				4547	spin_lock_bh(&rb_queue->lock);
				4548	swap(rb->pg_vec, pg_vec);
				4549	if (po->tp_version <= TPACKET_V2)
				4550	swap(rb->rx_owner_map, rx_owner_map);
				4551	rb->frame_max = (req->tp_frame_nr - 1);
				4552	rb->head = 0;
				4553	rb->frame_size = req->tp_frame_size;
				4554	spin_unlock_bh(&rb_queue->lock);
				4555
				4556	swap(rb->pg_vec_order, order);
				4557	swap(rb->pg_vec_len, req->tp_block_nr);
				4558
				4559	rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
				4560	po->prot_hook.func = (po->rx_ring.pg_vec) ?
				4561	tpacket_rcv : packet_rcv;
				4562	skb_queue_purge(rb_queue);
				4563	if (atomic_long_read(&po->mapped))
				4564	pr_err("packet_mmap: vma is busy: %ld\n",
				4565	atomic_long_read(&po->mapped));
				4566	}
				4567	mutex_unlock(&po->pg_vec_lock);
				4568
				4569	spin_lock(&po->bind_lock);
				4570	if (was_running) {
				4571	WRITE_ONCE(po->num, num);
				4572	register_prot_hook(sk);
				4573	}
				4574	spin_unlock(&po->bind_lock);
				4575	if (pg_vec && (po->tp_version > TPACKET_V2)) {
				4576	/* Because we don't support block-based V3 on tx-ring */
				4577	if (!tx_ring)
				4578	prb_shutdown_retire_blk_timer(po, rb_queue);
				4579	}
				4580
				4581	out_free_pg_vec:
				4582	if (pg_vec) {
				4583	bitmap_free(rx_owner_map);
				4584	free_pg_vec(pg_vec, order, req->tp_block_nr);
				4585	}
				4586	out:
				4587	return err;
				4588	}
				4589
				4590	static int packet_mmap(struct file file, struct socket sock,
				4591	struct vm_area_struct *vma)
				4592	{
				4593	struct sock *sk = sock->sk;
				4594	struct packet_sock *po = pkt_sk(sk);
				4595	unsigned long size, expected_size;
				4596	struct packet_ring_buffer *rb;
				4597	unsigned long start;
				4598	int err = -EINVAL;
				4599	int i;
				4600
				4601	if (vma->vm_pgoff)
				4602	return -EINVAL;
				4603
				4604	mutex_lock(&po->pg_vec_lock);
				4605
				4606	expected_size = 0;
				4607	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
				4608	if (rb->pg_vec) {
				4609	expected_size += rb->pg_vec_len
				4610	* rb->pg_vec_pages
				4611	* PAGE_SIZE;
				4612	}
				4613	}
				4614
				4615	if (expected_size == 0)
				4616	goto out;
				4617
				4618	size = vma->vm_end - vma->vm_start;
				4619	if (size != expected_size)
				4620	goto out;
				4621
				4622	start = vma->vm_start;
				4623	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
				4624	if (rb->pg_vec == NULL)
				4625	continue;
				4626
				4627	for (i = 0; i < rb->pg_vec_len; i++) {
				4628	struct page *page;
				4629	void *kaddr = rb->pg_vec[i].buffer;
				4630	int pg_num;
				4631
				4632	for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				4633	page = pgv_to_page(kaddr);
				4634	err = vm_insert_page(vma, start, page);
				4635	if (unlikely(err))
				4636	goto out;
				4637	start += PAGE_SIZE;
				4638	kaddr += PAGE_SIZE;
				4639	}
				4640	}
				4641	}
				4642
				4643	atomic_long_inc(&po->mapped);
				4644	vma->vm_ops = &packet_mmap_ops;
				4645	err = 0;
				4646
				4647	out:
				4648	mutex_unlock(&po->pg_vec_lock);
				4649	return err;
				4650	}
				4651
				4652	static const struct proto_ops packet_ops_spkt = {
				4653	.family = PF_PACKET,
				4654	.owner = THIS_MODULE,
				4655	.release = packet_release,
				4656	.bind = packet_bind_spkt,
				4657	.connect = sock_no_connect,
				4658	.socketpair = sock_no_socketpair,
				4659	.accept = sock_no_accept,
				4660	.getname = packet_getname_spkt,
				4661	.poll = datagram_poll,
				4662	.ioctl = packet_ioctl,
				4663	.gettstamp = sock_gettstamp,
				4664	.listen = sock_no_listen,
				4665	.shutdown = sock_no_shutdown,
				4666	.setsockopt = sock_no_setsockopt,
				4667	.getsockopt = sock_no_getsockopt,
				4668	.sendmsg = packet_sendmsg_spkt,
				4669	.recvmsg = packet_recvmsg,
				4670	.mmap = sock_no_mmap,
				4671	.sendpage = sock_no_sendpage,
				4672	};
				4673
				4674	static const struct proto_ops packet_ops = {
				4675	.family = PF_PACKET,
				4676	.owner = THIS_MODULE,
				4677	.release = packet_release,
				4678	.bind = packet_bind,
				4679	.connect = sock_no_connect,
				4680	.socketpair = sock_no_socketpair,
				4681	.accept = sock_no_accept,
				4682	.getname = packet_getname,
				4683	.poll = packet_poll,
				4684	.ioctl = packet_ioctl,
				4685	.gettstamp = sock_gettstamp,
				4686	.listen = sock_no_listen,
				4687	.shutdown = sock_no_shutdown,
				4688	.setsockopt = packet_setsockopt,
				4689	.getsockopt = packet_getsockopt,
				4690	#ifdef CONFIG_COMPAT
				4691	.compat_setsockopt = compat_packet_setsockopt,
				4692	#endif
				4693	.sendmsg = packet_sendmsg,
				4694	.recvmsg = packet_recvmsg,
				4695	.mmap = packet_mmap,
				4696	.sendpage = sock_no_sendpage,
				4697	};
				4698
				4699	static const struct net_proto_family packet_family_ops = {
				4700	.family = PF_PACKET,
				4701	.create = packet_create,
				4702	.owner = THIS_MODULE,
				4703	};
				4704
				4705	static struct notifier_block packet_netdev_notifier = {
				4706	.notifier_call = packet_notifier,
				4707	};
				4708
				4709	#ifdef CONFIG_PROC_FS
				4710
				4711	static void packet_seq_start(struct seq_file seq, loff_t *pos)
				4712	__acquires(RCU)
				4713	{
				4714	struct net *net = seq_file_net(seq);
				4715
				4716	rcu_read_lock();
				4717	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
				4718	}
				4719
				4720	static void packet_seq_next(struct seq_file seq, void v, loff_t pos)
				4721	{
				4722	struct net *net = seq_file_net(seq);
				4723	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
				4724	}
				4725
				4726	static void packet_seq_stop(struct seq_file seq, void v)
				4727	__releases(RCU)
				4728	{
				4729	rcu_read_unlock();
				4730	}
				4731
				4732	static int packet_seq_show(struct seq_file seq, void v)
				4733	{
				4734	if (v == SEQ_START_TOKEN)
				4735	seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
				4736	else {
				4737	struct sock *s = sk_entry(v);
				4738	const struct packet_sock *po = pkt_sk(s);
				4739
				4740	seq_printf(seq,
				4741	"%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
				4742	s,
				4743	refcount_read(&s->sk_refcnt),
				4744	s->sk_type,
				4745	ntohs(READ_ONCE(po->num)),
				4746	READ_ONCE(po->ifindex),
				4747	po->running,
				4748	atomic_read(&s->sk_rmem_alloc),
				4749	from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
				4750	sock_i_ino(s));
				4751	}
				4752
				4753	return 0;
				4754	}
				4755
				4756	static const struct seq_operations packet_seq_ops = {
				4757	.start = packet_seq_start,
				4758	.next = packet_seq_next,
				4759	.stop = packet_seq_stop,
				4760	.show = packet_seq_show,
				4761	};
				4762	#endif
				4763
				4764	static int __net_init packet_net_init(struct net *net)
				4765	{
				4766	mutex_init(&net->packet.sklist_lock);
				4767	INIT_HLIST_HEAD(&net->packet.sklist);
				4768
				4769	if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
				4770	sizeof(struct seq_net_private)))
				4771	return -ENOMEM;
				4772
				4773	return 0;
				4774	}
				4775
				4776	static void __net_exit packet_net_exit(struct net *net)
				4777	{
				4778	remove_proc_entry("packet", net->proc_net);
				4779	WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
				4780	}
				4781
				4782	static struct pernet_operations packet_net_ops = {
				4783	.init = packet_net_init,
				4784	.exit = packet_net_exit,
				4785	};
				4786
				4787
				4788	static void __exit packet_exit(void)
				4789	{
				4790	unregister_netdevice_notifier(&packet_netdev_notifier);
				4791	unregister_pernet_subsys(&packet_net_ops);
				4792	sock_unregister(PF_PACKET);
				4793	proto_unregister(&packet_proto);
				4794	}
				4795
				4796	static int __init packet_init(void)
				4797	{
				4798	int rc;
				4799
				4800	rc = proto_register(&packet_proto, 0);
				4801	if (rc)
				4802	goto out;
				4803	rc = sock_register(&packet_family_ops);
				4804	if (rc)
				4805	goto out_proto;
				4806	rc = register_pernet_subsys(&packet_net_ops);
				4807	if (rc)
				4808	goto out_sock;
				4809	rc = register_netdevice_notifier(&packet_netdev_notifier);
				4810	if (rc)
				4811	goto out_pernet;
				4812
				4813	return 0;
				4814
				4815	out_pernet:
				4816	unregister_pernet_subsys(&packet_net_ops);
				4817	out_sock:
				4818	sock_unregister(PF_PACKET);
				4819	out_proto:
				4820	proto_unregister(&packet_proto);
				4821	out:
				4822	return rc;
				4823	}
				4824
				4825	module_init(packet_init);
				4826	module_exit(packet_exit);
				4827	MODULE_LICENSE("GPL");
				4828	MODULE_ALIAS_NETPROTO(PF_PACKET);