1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * PACKET - implements raw packet sockets.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 *
12 * Fixes:
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
52 *
53 */
54
55#include <linux/types.h>
56#include <linux/mm.h>
57#include <linux/capability.h>
58#include <linux/fcntl.h>
59#include <linux/socket.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_packet.h>
64#include <linux/wireless.h>
65#include <linux/kernel.h>
66#include <linux/kmod.h>
67#include <linux/slab.h>
68#include <linux/vmalloc.h>
69#include <net/net_namespace.h>
70#include <net/ip.h>
71#include <net/protocol.h>
72#include <linux/skbuff.h>
73#include <net/sock.h>
74#include <linux/errno.h>
75#include <linux/timer.h>
76#include <linux/uaccess.h>
77#include <asm/ioctls.h>
78#include <asm/page.h>
79#include <asm/cacheflush.h>
80#include <asm/io.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/poll.h>
84#include <linux/module.h>
85#include <linux/init.h>
86#include <linux/mutex.h>
87#include <linux/if_vlan.h>
88#include <linux/virtio_net.h>
89#include <linux/errqueue.h>
90#include <linux/net_tstamp.h>
91#include <linux/percpu.h>
92#ifdef CONFIG_INET
93#include <net/inet_common.h>
94#endif
95#include <linux/bpf.h>
96#include <net/compat.h>
97
98#include "internal.h"
99
100/*
101 Assumptions:
 102 - if a device has no dev->hard_header routine, it adds and removes the ll header
 103 itself. In this case the ll header is invisible outside of the device,
 104 but higher levels should still reserve dev->hard_header_len.
 105 Some devices are clever enough to reallocate the skb when the header
 106 will not fit into the reserved space (tunnel), others are silly
 107 (PPP).
 108 - packet sockets receive packets with the ll header already pulled,
 109 so SOCK_RAW should push it back.
110
111On receive:
112-----------
113
114Incoming, dev->hard_header!=NULL
115 mac_header -> ll header
116 data -> data
117
118Outgoing, dev->hard_header!=NULL
119 mac_header -> ll header
120 data -> ll header
121
122Incoming, dev->hard_header==NULL
 123 mac_header -> UNKNOWN position. It is very likely that it points to the ll
 124 header. PPP does this, which is wrong, because it introduces
 125 asymmetry between the rx and tx paths.
126 data -> data
127
128Outgoing, dev->hard_header==NULL
129 mac_header -> data. ll header is still not built!
130 data -> data
131
 132 Summary
 133 If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
134
135
136On transmit:
137------------
138
139dev->hard_header != NULL
140 mac_header -> ll header
141 data -> ll header
142
143dev->hard_header == NULL (ll header is added by device, we cannot control it)
144 mac_header -> data
145 data -> data
146
 147 We should set nh.raw on output to the correct position,
 148 the packet classifier depends on it.
149 */
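
/* Illustration, not kernel code: a minimal user-space sketch of the two
 * receive flavours described above (user-space includes and error handling
 * omitted). With SOCK_RAW the ll (Ethernet) header is part of the buffer;
 * with SOCK_DGRAM it is stripped and only described by the sockaddr_ll
 * filled in by recvfrom().
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	char buf[2048];
 *	struct sockaddr_ll sll;
 *	socklen_t slen = sizeof(sll);
 *	recvfrom(raw, buf, sizeof(buf), 0, NULL, NULL);
 *	recvfrom(dgr, buf, sizeof(buf), 0, (struct sockaddr *)&sll, &slen);
 *
 * After the first call buf starts at the ll header; after the second it
 * starts at the payload and the ll addressing is only in sll.
 */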
150
151/* Private packet socket structures. */
152
153/* identical to struct packet_mreq except it has
154 * a longer address field.
155 */
156struct packet_mreq_max {
157 int mr_ifindex;
158 unsigned short mr_type;
159 unsigned short mr_alen;
160 unsigned char mr_address[MAX_ADDR_LEN];
161};
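
/* packet_mreq_max only widens the address buffer (MAX_ADDR_LEN instead of
 * the 8 bytes in the uapi struct packet_mreq), per the Eric Biederman note
 * in the header above. User space keeps passing the ordinary struct; a
 * hedged usage sketch, error handling omitted:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */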
162
163union tpacket_uhdr {
164 struct tpacket_hdr *h1;
165 struct tpacket2_hdr *h2;
166 struct tpacket3_hdr *h3;
167 void *raw;
168};
169
170static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
171 int closing, int tx_ring);
172
173#define V3_ALIGNMENT (8)
174
175#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
176
177#define BLK_PLUS_PRIV(sz_of_priv) \
178 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
179
180#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
187
188struct packet_sock;
189static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
190 struct packet_type *pt, struct net_device *orig_dev);
191
192static void *packet_previous_frame(struct packet_sock *po,
193 struct packet_ring_buffer *rb,
194 int status);
195static void packet_increment_head(struct packet_ring_buffer *buff);
196static int prb_curr_blk_in_use(struct tpacket_block_desc *);
197static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
198 struct packet_sock *);
199static void prb_retire_current_block(struct tpacket_kbdq_core *,
200 struct packet_sock *, unsigned int status);
201static int prb_queue_frozen(struct tpacket_kbdq_core *);
202static void prb_open_block(struct tpacket_kbdq_core *,
203 struct tpacket_block_desc *);
204static void prb_retire_rx_blk_timer_expired(unsigned long);
205static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206static void prb_init_blk_timer(struct packet_sock *,
207 struct tpacket_kbdq_core *,
208 void (*func) (unsigned long));
209static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
210static void prb_clear_rxhash(struct tpacket_kbdq_core *,
211 struct tpacket3_hdr *);
212static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214static void packet_flush_mclist(struct sock *sk);
215static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
216
217struct packet_skb_cb {
218 union {
219 struct sockaddr_pkt pkt;
220 union {
 221 /* Trick: alias the skb's original length with
 222 * ll.sll_family and ll.sll_protocol in order
 223 * to save room.
 224 */
225 unsigned int origlen;
226 struct sockaddr_ll ll;
227 };
228 } sa;
229};
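
/* The aliasing above works because sll_family (unsigned short) and
 * sll_protocol (__be16) are the first two members of struct sockaddr_ll and
 * together cover the same four bytes as origlen: packet_rcv() stashes the
 * original skb length there, and packet_recvmsg() reads it back before
 * overwriting the two fields with the real family/protocol values.
 */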
230
231#define vio_le() virtio_legacy_is_little_endian()
232
233#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
234
235#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
236#define GET_PBLOCK_DESC(x, bid) \
237 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
238#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
239 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
240#define GET_NEXT_PRB_BLK_NUM(x) \
241 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
242 ((x)->kactive_blk_num+1) : 0)
243
244static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
245static void __fanout_link(struct sock *sk, struct packet_sock *po);
246
247static int packet_direct_xmit(struct sk_buff *skb)
248{
249 struct net_device *dev = skb->dev;
250 struct sk_buff *orig_skb = skb;
251 struct netdev_queue *txq;
252 int ret = NETDEV_TX_BUSY;
253
254 if (unlikely(!netif_running(dev) ||
255 !netif_carrier_ok(dev)))
256 goto drop;
257
258 skb = validate_xmit_skb_list(skb, dev);
259 if (skb != orig_skb)
260 goto drop;
261
262 packet_pick_tx_queue(dev, skb);
263 txq = skb_get_tx_queue(dev, skb);
264
265 local_bh_disable();
266
267 HARD_TX_LOCK(dev, txq, smp_processor_id());
268 if (!netif_xmit_frozen_or_drv_stopped(txq))
269 ret = netdev_start_xmit(skb, dev, txq, false);
270 HARD_TX_UNLOCK(dev, txq);
271
272 local_bh_enable();
273
274 if (!dev_xmit_complete(ret))
275 kfree_skb(skb);
276
277 return ret;
278drop:
279 atomic_long_inc(&dev->tx_dropped);
280 kfree_skb_list(skb);
281 return NET_XMIT_DROP;
282}
283
284static struct net_device *packet_cached_dev_get(struct packet_sock *po)
285{
286 struct net_device *dev;
287
288 rcu_read_lock();
289 dev = rcu_dereference(po->cached_dev);
290 if (likely(dev))
291 dev_hold(dev);
292 rcu_read_unlock();
293
294 return dev;
295}
296
297static void packet_cached_dev_assign(struct packet_sock *po,
298 struct net_device *dev)
299{
300 rcu_assign_pointer(po->cached_dev, dev);
301}
302
303static void packet_cached_dev_reset(struct packet_sock *po)
304{
305 RCU_INIT_POINTER(po->cached_dev, NULL);
306}
307
308static bool packet_use_direct_xmit(const struct packet_sock *po)
309{
310 return po->xmit == packet_direct_xmit;
311}
312
313static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
314{
315 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
316}
317
318static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
319{
320 const struct net_device_ops *ops = dev->netdev_ops;
321 u16 queue_index;
322
323 if (ops->ndo_select_queue) {
324 queue_index = ops->ndo_select_queue(dev, skb, NULL,
325 __packet_pick_tx_queue);
326 queue_index = netdev_cap_txqueue(dev, queue_index);
327 } else {
328 queue_index = __packet_pick_tx_queue(dev, skb);
329 }
330
331 skb_set_queue_mapping(skb, queue_index);
332}
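
/* Example of the default mapping above: on a device with 4 real TX queues a
 * sender running on CPU 6 is mapped to queue 6 % 4 = 2. A driver that
 * implements ndo_select_queue() can pick differently; that result is then
 * clamped by netdev_cap_txqueue() before being stored in the skb.
 */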
333
334/* __register_prot_hook must be invoked through register_prot_hook
335 * or from a context in which asynchronous accesses to the packet
 336 * socket are not possible (packet_create()).
337 */
338static void __register_prot_hook(struct sock *sk)
339{
340 struct packet_sock *po = pkt_sk(sk);
341
342 if (!po->running) {
343 if (po->fanout)
344 __fanout_link(sk, po);
345 else
346 dev_add_pack(&po->prot_hook);
347
348 sock_hold(sk);
349 po->running = 1;
350 }
351}
352
353static void register_prot_hook(struct sock *sk)
354{
355 lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
356 __register_prot_hook(sk);
357}
358
359/* If the sync parameter is true, we will temporarily drop
360 * the po->bind_lock and do a synchronize_net to make sure no
361 * asynchronous packet processing paths still refer to the elements
362 * of po->prot_hook. If the sync parameter is false, it is the
 363 * caller's responsibility to take care of this.
364 */
365static void __unregister_prot_hook(struct sock *sk, bool sync)
366{
367 struct packet_sock *po = pkt_sk(sk);
368
369 lockdep_assert_held_once(&po->bind_lock);
370
371 po->running = 0;
372
373 if (po->fanout)
374 __fanout_unlink(sk, po);
375 else
376 __dev_remove_pack(&po->prot_hook);
377
378 __sock_put(sk);
379
380 if (sync) {
381 spin_unlock(&po->bind_lock);
382 synchronize_net();
383 spin_lock(&po->bind_lock);
384 }
385}
386
387static void unregister_prot_hook(struct sock *sk, bool sync)
388{
389 struct packet_sock *po = pkt_sk(sk);
390
391 if (po->running)
392 __unregister_prot_hook(sk, sync);
393}
394
395static inline struct page * __pure pgv_to_page(void *addr)
396{
397 if (is_vmalloc_addr(addr))
398 return vmalloc_to_page(addr);
399 return virt_to_page(addr);
400}
401
402static void __packet_set_status(struct packet_sock *po, void *frame, int status)
403{
404 union tpacket_uhdr h;
405
406 h.raw = frame;
407 switch (po->tp_version) {
408 case TPACKET_V1:
409 h.h1->tp_status = status;
410 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
411 break;
412 case TPACKET_V2:
413 h.h2->tp_status = status;
414 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
415 break;
416 case TPACKET_V3:
417 h.h3->tp_status = status;
418 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
419 break;
420 default:
421 WARN(1, "TPACKET version not supported.\n");
422 BUG();
423 }
424
425 smp_wmb();
426}
427
428static int __packet_get_status(struct packet_sock *po, void *frame)
429{
430 union tpacket_uhdr h;
431
432 smp_rmb();
433
434 h.raw = frame;
435 switch (po->tp_version) {
436 case TPACKET_V1:
437 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
438 return h.h1->tp_status;
439 case TPACKET_V2:
440 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
441 return h.h2->tp_status;
442 case TPACKET_V3:
443 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
444 return h.h3->tp_status;
445 default:
446 WARN(1, "TPACKET version not supported.\n");
447 BUG();
448 return 0;
449 }
450}
451
452static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
453 unsigned int flags)
454{
455 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
456
457 if (shhwtstamps &&
458 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
459 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
460 return TP_STATUS_TS_RAW_HARDWARE;
461
462 if (ktime_to_timespec_cond(skb->tstamp, ts))
463 return TP_STATUS_TS_SOFTWARE;
464
465 return 0;
466}
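
/* po->tp_tstamp is whatever user space last wrote via the PACKET_TIMESTAMP
 * socket option. A hedged sketch requesting NIC timestamps on the ring
 * (falling back to the software stamp when no hardware stamp is present):
 *
 *	int req = SOF_TIMESTAMPING_RAW_HARDWARE;
 *	setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, &req, sizeof(req));
 *
 * The matching status bit (TP_STATUS_TS_RAW_HARDWARE or
 * TP_STATUS_TS_SOFTWARE) is then folded into the frame's tp_status by the
 * ring receive path together with the tp_sec/tp_nsec values written below.
 */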
467
468static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
469 struct sk_buff *skb)
470{
471 union tpacket_uhdr h;
472 struct timespec ts;
473 __u32 ts_status;
474
475 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
476 return 0;
477
478 h.raw = frame;
479 switch (po->tp_version) {
480 case TPACKET_V1:
481 h.h1->tp_sec = ts.tv_sec;
482 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
483 break;
484 case TPACKET_V2:
485 h.h2->tp_sec = ts.tv_sec;
486 h.h2->tp_nsec = ts.tv_nsec;
487 break;
488 case TPACKET_V3:
489 h.h3->tp_sec = ts.tv_sec;
490 h.h3->tp_nsec = ts.tv_nsec;
491 break;
492 default:
493 WARN(1, "TPACKET version not supported.\n");
494 BUG();
495 }
496
497 /* one flush is safe, as both fields always lie on the same cacheline */
498 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
499 smp_wmb();
500
501 return ts_status;
502}
503
504static void *packet_lookup_frame(struct packet_sock *po,
505 struct packet_ring_buffer *rb,
506 unsigned int position,
507 int status)
508{
509 unsigned int pg_vec_pos, frame_offset;
510 union tpacket_uhdr h;
511
512 pg_vec_pos = position / rb->frames_per_block;
513 frame_offset = position % rb->frames_per_block;
514
515 h.raw = rb->pg_vec[pg_vec_pos].buffer +
516 (frame_offset * rb->frame_size);
517
518 if (status != __packet_get_status(po, h.raw))
519 return NULL;
520
521 return h.raw;
522}
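
/* Worked example of the lookup arithmetic above: with 4 KiB blocks and
 * frame_size = 2048 (frames_per_block = 2), position 5 resolves to
 * pg_vec_pos = 5 / 2 = 2 and frame_offset = 5 % 2 = 1, i.e. the frame sits
 * at pg_vec[2].buffer + 1 * 2048. The frame is only returned when its
 * current tp_status matches the status the caller asked for.
 */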
523
524static void *packet_current_frame(struct packet_sock *po,
525 struct packet_ring_buffer *rb,
526 int status)
527{
528 return packet_lookup_frame(po, rb, rb->head, status);
529}
530
531static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
532{
533 del_timer_sync(&pkc->retire_blk_timer);
534}
535
536static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
537 struct sk_buff_head *rb_queue)
538{
539 struct tpacket_kbdq_core *pkc;
540
541 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
542
543 spin_lock_bh(&rb_queue->lock);
544 pkc->delete_blk_timer = 1;
545 spin_unlock_bh(&rb_queue->lock);
546
547 prb_del_retire_blk_timer(pkc);
548}
549
550static void prb_init_blk_timer(struct packet_sock *po,
551 struct tpacket_kbdq_core *pkc,
552 void (*func) (unsigned long))
553{
554 init_timer(&pkc->retire_blk_timer);
555 pkc->retire_blk_timer.data = (long)po;
556 pkc->retire_blk_timer.function = func;
557 pkc->retire_blk_timer.expires = jiffies;
558}
559
560static void prb_setup_retire_blk_timer(struct packet_sock *po)
561{
562 struct tpacket_kbdq_core *pkc;
563
564 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
565 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
566}
567
568static int prb_calc_retire_blk_tmo(struct packet_sock *po,
569 int blk_size_in_bytes)
570{
571 struct net_device *dev;
572 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
573 struct ethtool_link_ksettings ecmd;
574 int err;
575
576 rtnl_lock();
577 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
578 if (unlikely(!dev)) {
579 rtnl_unlock();
580 return DEFAULT_PRB_RETIRE_TOV;
581 }
582 err = __ethtool_get_link_ksettings(dev, &ecmd);
583 rtnl_unlock();
584 if (!err) {
585 /*
586 * If the link speed is so slow you don't really
587 * need to worry about perf anyways
588 */
589 if (ecmd.base.speed < SPEED_1000 ||
590 ecmd.base.speed == SPEED_UNKNOWN) {
591 return DEFAULT_PRB_RETIRE_TOV;
592 } else {
593 msec = 1;
594 div = ecmd.base.speed / 1000;
595 }
596 } else
597 return DEFAULT_PRB_RETIRE_TOV;
598
599 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
600
601 if (div)
602 mbits /= div;
603
604 tmo = mbits * msec;
605
606 if (div)
607 return tmo+1;
608 return tmo;
609}
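
/* Worked example of the heuristic above, matching the "~8 ms per 1 MiB
 * block at 1 Gbit/s" note further down: blk_size_in_bytes = 1 MiB gives
 * mbits = (1048576 * 8) / (1024 * 1024) = 8; at SPEED_1000, div = 1 and
 * msec = 1, so tmo = 8 and the function returns 8 + 1 = 9 ms. Links slower
 * than 1 Gbit/s (or with unknown speed) fall back to
 * DEFAULT_PRB_RETIRE_TOV.
 */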
610
611static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
612 union tpacket_req_u *req_u)
613{
614 p1->feature_req_word = req_u->req3.tp_feature_req_word;
615}
616
617static void init_prb_bdqc(struct packet_sock *po,
618 struct packet_ring_buffer *rb,
619 struct pgv *pg_vec,
620 union tpacket_req_u *req_u)
621{
622 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
623 struct tpacket_block_desc *pbd;
624
625 memset(p1, 0x0, sizeof(*p1));
626
627 p1->knxt_seq_num = 1;
628 p1->pkbdq = pg_vec;
629 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
630 p1->pkblk_start = pg_vec[0].buffer;
631 p1->kblk_size = req_u->req3.tp_block_size;
632 p1->knum_blocks = req_u->req3.tp_block_nr;
633 p1->hdrlen = po->tp_hdrlen;
634 p1->version = po->tp_version;
635 p1->last_kactive_blk_num = 0;
636 po->stats.stats3.tp_freeze_q_cnt = 0;
637 if (req_u->req3.tp_retire_blk_tov)
638 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
639 else
640 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
641 req_u->req3.tp_block_size);
642 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
643 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
644
645 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
646 prb_init_ft_ops(p1, req_u);
647 prb_setup_retire_blk_timer(po);
648 prb_open_block(p1, pbd);
649}
650
651/* Do NOT update the last_blk_num first.
652 * Assumes sk_buff_head lock is held.
653 */
654static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
655{
656 mod_timer(&pkc->retire_blk_timer,
657 jiffies + pkc->tov_in_jiffies);
658 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
659}
660
661/*
662 * Timer logic:
663 * 1) We refresh the timer only when we open a block.
664 * By doing this we don't waste cycles refreshing the timer
 665 * on a packet-by-packet basis.
666 *
667 * With a 1MB block-size, on a 1Gbps line, it will take
668 * i) ~8 ms to fill a block + ii) memcpy etc.
669 * In this cut we are not accounting for the memcpy time.
670 *
671 * So, if the user sets the 'tmo' to 10ms then the timer
672 * will never fire while the block is still getting filled
673 * (which is what we want). However, the user could choose
674 * to close a block early and that's fine.
675 *
676 * But when the timer does fire, we check whether or not to refresh it.
677 * Since the tmo granularity is in msecs, it is not too expensive
 678 * to refresh the timer, let's say every '8' msecs.
679 * Either the user can set the 'tmo' or we can derive it based on
680 * a) line-speed and b) block-size.
681 * prb_calc_retire_blk_tmo() calculates the tmo.
682 *
683 */
684static void prb_retire_rx_blk_timer_expired(unsigned long data)
685{
686 struct packet_sock *po = (struct packet_sock *)data;
687 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
688 unsigned int frozen;
689 struct tpacket_block_desc *pbd;
690
691 spin_lock(&po->sk.sk_receive_queue.lock);
692
693 frozen = prb_queue_frozen(pkc);
694 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
695
696 if (unlikely(pkc->delete_blk_timer))
697 goto out;
698
699 /* We only need to plug the race when the block is partially filled.
700 * tpacket_rcv:
701 * lock(); increment BLOCK_NUM_PKTS; unlock()
702 * copy_bits() is in progress ...
703 * timer fires on other cpu:
704 * we can't retire the current block because copy_bits
705 * is in progress.
706 *
707 */
708 if (BLOCK_NUM_PKTS(pbd)) {
709 while (atomic_read(&pkc->blk_fill_in_prog)) {
710 /* Waiting for skb_copy_bits to finish... */
711 cpu_relax();
712 }
713 }
714
715 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
716 if (!frozen) {
717 if (!BLOCK_NUM_PKTS(pbd)) {
718 /* An empty block. Just refresh the timer. */
719 goto refresh_timer;
720 }
721 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
722 if (!prb_dispatch_next_block(pkc, po))
723 goto refresh_timer;
724 else
725 goto out;
726 } else {
727 /* Case 1. Queue was frozen because user-space was
728 * lagging behind.
729 */
730 if (prb_curr_blk_in_use(pbd)) {
731 /*
732 * Ok, user-space is still behind.
733 * So just refresh the timer.
734 */
735 goto refresh_timer;
736 } else {
 737 /* Case 2. The queue was frozen, user-space caught up,
 738 * now the link went idle && the timer fired.
 739 * We don't have a block to close. So we open this
 740 * block and restart the timer.
 741 * Opening a block thaws the queue and restarts the timer.
 742 * Thawing/timer-refresh is a side effect.
743 */
744 prb_open_block(pkc, pbd);
745 goto out;
746 }
747 }
748 }
749
750refresh_timer:
751 _prb_refresh_rx_retire_blk_timer(pkc);
752
753out:
754 spin_unlock(&po->sk.sk_receive_queue.lock);
755}
756
757static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
758 struct tpacket_block_desc *pbd1, __u32 status)
759{
760 /* Flush everything minus the block header */
761
762#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
763 u8 *start, *end;
764
765 start = (u8 *)pbd1;
766
 767 /* Skip the block header (we know the header WILL fit in 4K) */
768 start += PAGE_SIZE;
769
770 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
771 for (; start < end; start += PAGE_SIZE)
772 flush_dcache_page(pgv_to_page(start));
773
774 smp_wmb();
775#endif
776
777 /* Now update the block status. */
778
779 BLOCK_STATUS(pbd1) = status;
780
781 /* Flush the block header */
782
783#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
784 start = (u8 *)pbd1;
785 flush_dcache_page(pgv_to_page(start));
786
787 smp_wmb();
788#endif
789}
790
791/*
792 * Side effect:
793 *
794 * 1) flush the block
795 * 2) Increment active_blk_num
796 *
 797 * Note: We DON'T refresh the timer on purpose,
 798 * because almost always the next block will be opened.
799 */
800static void prb_close_block(struct tpacket_kbdq_core *pkc1,
801 struct tpacket_block_desc *pbd1,
802 struct packet_sock *po, unsigned int stat)
803{
804 __u32 status = TP_STATUS_USER | stat;
805
806 struct tpacket3_hdr *last_pkt;
807 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
808 struct sock *sk = &po->sk;
809
810 if (po->stats.stats3.tp_drops)
811 status |= TP_STATUS_LOSING;
812
813 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
814 last_pkt->tp_next_offset = 0;
815
816 /* Get the ts of the last pkt */
817 if (BLOCK_NUM_PKTS(pbd1)) {
818 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
819 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
820 } else {
821 /* Ok, we tmo'd - so get the current time.
822 *
823 * It shouldn't really happen as we don't close empty
824 * blocks. See prb_retire_rx_blk_timer_expired().
825 */
826 struct timespec ts;
827 getnstimeofday(&ts);
828 h1->ts_last_pkt.ts_sec = ts.tv_sec;
829 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
830 }
831
832 smp_wmb();
833
834 /* Flush the block */
835 prb_flush_block(pkc1, pbd1, status);
836
837 sk->sk_data_ready(sk);
838
839 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
840}
841
842static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
843{
844 pkc->reset_pending_on_curr_blk = 0;
845}
846
847/*
848 * Side effect of opening a block:
849 *
850 * 1) prb_queue is thawed.
851 * 2) retire_blk_timer is refreshed.
852 *
853 */
854static void prb_open_block(struct tpacket_kbdq_core *pkc1,
855 struct tpacket_block_desc *pbd1)
856{
857 struct timespec ts;
858 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
859
860 smp_rmb();
861
 862 /* We could have just memset this, but we would lose the
 863 * flexibility of making the priv area sticky.
864 */
865
866 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
867 BLOCK_NUM_PKTS(pbd1) = 0;
868 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
869
870 getnstimeofday(&ts);
871
872 h1->ts_first_pkt.ts_sec = ts.tv_sec;
873 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
874
875 pkc1->pkblk_start = (char *)pbd1;
876 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
877
878 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
879 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
880
881 pbd1->version = pkc1->version;
882 pkc1->prev = pkc1->nxt_offset;
883 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
884
885 prb_thaw_queue(pkc1);
886 _prb_refresh_rx_retire_blk_timer(pkc1);
887
888 smp_wmb();
889}
890
891/*
892 * Queue freeze logic:
893 * 1) Assume tp_block_nr = 8 blocks.
894 * 2) At time 't0', user opens Rx ring.
895 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
896 * 4) user-space is either sleeping or processing block '0'.
897 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
 898 * it will close block-7, loop around and try to fill block '0'.
899 * call-flow:
900 * __packet_lookup_frame_in_block
901 * prb_retire_current_block()
902 * prb_dispatch_next_block()
903 * |->(BLOCK_STATUS == USER) evaluates to true
904 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
905 * 6) Now there are two cases:
906 * 6.1) Link goes idle right after the queue is frozen.
907 * But remember, the last open_block() refreshed the timer.
 908 * When this timer expires, it will refresh itself so that we can
909 * re-open block-0 in near future.
910 * 6.2) Link is busy and keeps on receiving packets. This is a simple
911 * case and __packet_lookup_frame_in_block will check if block-0
912 * is free and can now be re-used.
913 */
914static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
915 struct packet_sock *po)
916{
917 pkc->reset_pending_on_curr_blk = 1;
918 po->stats.stats3.tp_freeze_q_cnt++;
919}
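
/* The thaw depends on user space handing the blocking block back. A hedged
 * sketch of the user-space side (pbd points at a mapped
 * struct tpacket_block_desc; walk_block() is illustrative):
 *
 *	if (pbd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		walk_block(pbd);
 *		pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
 *	}
 *
 * Once the status is TP_STATUS_KERNEL again, prb_curr_blk_in_use() reports
 * the block as free and either the retire timer or the next received packet
 * re-opens it via prb_open_block(), which thaws the queue as a side effect.
 */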
920
921#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
922
923/*
924 * If the next block is free then we will dispatch it
925 * and return a good offset.
926 * Else, we will freeze the queue.
927 * So, caller must check the return value.
928 */
929static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
930 struct packet_sock *po)
931{
932 struct tpacket_block_desc *pbd;
933
934 smp_rmb();
935
936 /* 1. Get current block num */
937 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
938
939 /* 2. If this block is currently in_use then freeze the queue */
940 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
941 prb_freeze_queue(pkc, po);
942 return NULL;
943 }
944
945 /*
946 * 3.
947 * open this block and return the offset where the first packet
948 * needs to get stored.
949 */
950 prb_open_block(pkc, pbd);
951 return (void *)pkc->nxt_offset;
952}
953
954static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
955 struct packet_sock *po, unsigned int status)
956{
957 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
958
959 /* retire/close the current block */
960 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
961 /*
962 * Plug the case where copy_bits() is in progress on
963 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
964 * have space to copy the pkt in the current block and
965 * called prb_retire_current_block()
966 *
967 * We don't need to worry about the TMO case because
968 * the timer-handler already handled this case.
969 */
970 if (!(status & TP_STATUS_BLK_TMO)) {
971 while (atomic_read(&pkc->blk_fill_in_prog)) {
972 /* Waiting for skb_copy_bits to finish... */
973 cpu_relax();
974 }
975 }
976 prb_close_block(pkc, pbd, po, status);
977 return;
978 }
979}
980
981static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
982{
983 return TP_STATUS_USER & BLOCK_STATUS(pbd);
984}
985
986static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
987{
988 return pkc->reset_pending_on_curr_blk;
989}
990
991static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
992 __releases(&pkc->blk_fill_in_prog_lock)
993{
994 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
995 atomic_dec(&pkc->blk_fill_in_prog);
996}
997
998static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
999 struct tpacket3_hdr *ppd)
1000{
1001 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
1002}
1003
1004static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
1005 struct tpacket3_hdr *ppd)
1006{
1007 ppd->hv1.tp_rxhash = 0;
1008}
1009
1010static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1011 struct tpacket3_hdr *ppd)
1012{
1013 if (skb_vlan_tag_present(pkc->skb)) {
1014 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1015 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1016 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1017 } else {
1018 ppd->hv1.tp_vlan_tci = 0;
1019 ppd->hv1.tp_vlan_tpid = 0;
1020 ppd->tp_status = TP_STATUS_AVAILABLE;
1021 }
1022}
1023
1024static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1025 struct tpacket3_hdr *ppd)
1026{
1027 ppd->hv1.tp_padding = 0;
1028 prb_fill_vlan_info(pkc, ppd);
1029
1030 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1031 prb_fill_rxhash(pkc, ppd);
1032 else
1033 prb_clear_rxhash(pkc, ppd);
1034}
1035
1036static void prb_fill_curr_block(char *curr,
1037 struct tpacket_kbdq_core *pkc,
1038 struct tpacket_block_desc *pbd,
1039 unsigned int len)
1040 __acquires(&pkc->blk_fill_in_prog_lock)
1041{
1042 struct tpacket3_hdr *ppd;
1043
1044 ppd = (struct tpacket3_hdr *)curr;
1045 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1046 pkc->prev = curr;
1047 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1048 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1049 BLOCK_NUM_PKTS(pbd) += 1;
1050 atomic_inc(&pkc->blk_fill_in_prog);
1051 prb_run_all_ft_ops(pkc, ppd);
1052}
1053
1054/* Assumes caller has the sk->rx_queue.lock */
1055static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1056 struct sk_buff *skb,
1057 int status,
1058 unsigned int len
1059 )
1060{
1061 struct tpacket_kbdq_core *pkc;
1062 struct tpacket_block_desc *pbd;
1063 char *curr, *end;
1064
1065 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1066 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1067
1068 /* Queue is frozen when user space is lagging behind */
1069 if (prb_queue_frozen(pkc)) {
1070 /*
 1071 * Check if the last block, which caused the queue to freeze,
 1072 * is still in_use by user-space.
1073 */
1074 if (prb_curr_blk_in_use(pbd)) {
1075 /* Can't record this packet */
1076 return NULL;
1077 } else {
1078 /*
1079 * Ok, the block was released by user-space.
1080 * Now let's open that block.
 1081 * Opening a block also thaws the queue.
1082 * Thawing is a side effect.
1083 */
1084 prb_open_block(pkc, pbd);
1085 }
1086 }
1087
1088 smp_mb();
1089 curr = pkc->nxt_offset;
1090 pkc->skb = skb;
1091 end = (char *)pbd + pkc->kblk_size;
1092
1093 /* first try the current block */
1094 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1095 prb_fill_curr_block(curr, pkc, pbd, len);
1096 return (void *)curr;
1097 }
1098
1099 /* Ok, close the current block */
1100 prb_retire_current_block(pkc, po, 0);
1101
1102 /* Now, try to dispatch the next block */
1103 curr = (char *)prb_dispatch_next_block(pkc, po);
1104 if (curr) {
1105 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1106 prb_fill_curr_block(curr, pkc, pbd, len);
1107 return (void *)curr;
1108 }
1109
1110 /*
 1111 * No free blocks are available. User-space hasn't caught up yet.
1112 * Queue was just frozen and now this packet will get dropped.
1113 */
1114 return NULL;
1115}
1116
1117static void *packet_current_rx_frame(struct packet_sock *po,
1118 struct sk_buff *skb,
1119 int status, unsigned int len)
1120{
1121 char *curr = NULL;
1122 switch (po->tp_version) {
1123 case TPACKET_V1:
1124 case TPACKET_V2:
1125 curr = packet_lookup_frame(po, &po->rx_ring,
1126 po->rx_ring.head, status);
1127 return curr;
1128 case TPACKET_V3:
1129 return __packet_lookup_frame_in_block(po, skb, status, len);
1130 default:
1131 WARN(1, "TPACKET version not supported\n");
1132 BUG();
1133 return NULL;
1134 }
1135}
1136
1137static void *prb_lookup_block(struct packet_sock *po,
1138 struct packet_ring_buffer *rb,
1139 unsigned int idx,
1140 int status)
1141{
1142 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1143 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1144
1145 if (status != BLOCK_STATUS(pbd))
1146 return NULL;
1147 return pbd;
1148}
1149
1150static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1151{
1152 unsigned int prev;
1153 if (rb->prb_bdqc.kactive_blk_num)
1154 prev = rb->prb_bdqc.kactive_blk_num-1;
1155 else
1156 prev = rb->prb_bdqc.knum_blocks-1;
1157 return prev;
1158}
1159
1160/* Assumes caller has held the rx_queue.lock */
1161static void *__prb_previous_block(struct packet_sock *po,
1162 struct packet_ring_buffer *rb,
1163 int status)
1164{
1165 unsigned int previous = prb_previous_blk_num(rb);
1166 return prb_lookup_block(po, rb, previous, status);
1167}
1168
1169static void *packet_previous_rx_frame(struct packet_sock *po,
1170 struct packet_ring_buffer *rb,
1171 int status)
1172{
1173 if (po->tp_version <= TPACKET_V2)
1174 return packet_previous_frame(po, rb, status);
1175
1176 return __prb_previous_block(po, rb, status);
1177}
1178
1179static void packet_increment_rx_head(struct packet_sock *po,
1180 struct packet_ring_buffer *rb)
1181{
1182 switch (po->tp_version) {
1183 case TPACKET_V1:
1184 case TPACKET_V2:
1185 return packet_increment_head(rb);
1186 case TPACKET_V3:
1187 default:
1188 WARN(1, "TPACKET version not supported.\n");
1189 BUG();
1190 return;
1191 }
1192}
1193
1194static void *packet_previous_frame(struct packet_sock *po,
1195 struct packet_ring_buffer *rb,
1196 int status)
1197{
1198 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1199 return packet_lookup_frame(po, rb, previous, status);
1200}
1201
1202static void packet_increment_head(struct packet_ring_buffer *buff)
1203{
1204 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1205}
1206
1207static void packet_inc_pending(struct packet_ring_buffer *rb)
1208{
1209 this_cpu_inc(*rb->pending_refcnt);
1210}
1211
1212static void packet_dec_pending(struct packet_ring_buffer *rb)
1213{
1214 this_cpu_dec(*rb->pending_refcnt);
1215}
1216
1217static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1218{
1219 unsigned int refcnt = 0;
1220 int cpu;
1221
1222 /* We don't use pending refcount in rx_ring. */
1223 if (rb->pending_refcnt == NULL)
1224 return 0;
1225
1226 for_each_possible_cpu(cpu)
1227 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1228
1229 return refcnt;
1230}
1231
1232static int packet_alloc_pending(struct packet_sock *po)
1233{
1234 po->rx_ring.pending_refcnt = NULL;
1235
1236 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1237 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1238 return -ENOBUFS;
1239
1240 return 0;
1241}
1242
1243static void packet_free_pending(struct packet_sock *po)
1244{
1245 free_percpu(po->tx_ring.pending_refcnt);
1246}
1247
1248#define ROOM_POW_OFF 2
1249#define ROOM_NONE 0x0
1250#define ROOM_LOW 0x1
1251#define ROOM_NORMAL 0x2
1252
1253static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1254{
1255 int idx, len;
1256
1257 len = po->rx_ring.frame_max + 1;
1258 idx = po->rx_ring.head;
1259 if (pow_off)
1260 idx += len >> pow_off;
1261 if (idx >= len)
1262 idx -= len;
1263 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1264}
1265
1266static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1267{
1268 int idx, len;
1269
1270 len = po->rx_ring.prb_bdqc.knum_blocks;
1271 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1272 if (pow_off)
1273 idx += len >> pow_off;
1274 if (idx >= len)
1275 idx -= len;
1276 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1277}
1278
1279static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1280{
1281 struct sock *sk = &po->sk;
1282 int ret = ROOM_NONE;
1283
1284 if (po->prot_hook.func != tpacket_rcv) {
1285 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1286 - (skb ? skb->truesize : 0);
1287 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1288 return ROOM_NORMAL;
1289 else if (avail > 0)
1290 return ROOM_LOW;
1291 else
1292 return ROOM_NONE;
1293 }
1294
1295 if (po->tp_version == TPACKET_V3) {
1296 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1297 ret = ROOM_NORMAL;
1298 else if (__tpacket_v3_has_room(po, 0))
1299 ret = ROOM_LOW;
1300 } else {
1301 if (__tpacket_has_room(po, ROOM_POW_OFF))
1302 ret = ROOM_NORMAL;
1303 else if (__tpacket_has_room(po, 0))
1304 ret = ROOM_LOW;
1305 }
1306
1307 return ret;
1308}
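
/* Worked example of the look-ahead above: for a TPACKET_V1/V2 ring with 256
 * frames (frame_max = 255) and head = 10, ROOM_POW_OFF = 2 probes frame
 * 10 + (256 >> 2) = 74; if that frame is still TP_STATUS_KERNEL, roughly a
 * quarter of the ring is still free and the result is ROOM_NORMAL.
 * Otherwise a second probe with pow_off = 0 just checks the head frame
 * itself to tell ROOM_LOW apart from ROOM_NONE. TPACKET_V3 does the same
 * per block instead of per frame.
 */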
1309
1310static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1311{
1312 int ret;
1313 bool has_room;
1314
1315 spin_lock_bh(&po->sk.sk_receive_queue.lock);
1316 ret = __packet_rcv_has_room(po, skb);
1317 has_room = ret == ROOM_NORMAL;
1318 if (po->pressure == has_room)
1319 po->pressure = !has_room;
1320 spin_unlock_bh(&po->sk.sk_receive_queue.lock);
1321
1322 return ret;
1323}
1324
1325static void packet_sock_destruct(struct sock *sk)
1326{
1327 skb_queue_purge(&sk->sk_error_queue);
1328
1329 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1330 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
1331
1332 if (!sock_flag(sk, SOCK_DEAD)) {
1333 pr_err("Attempt to release alive packet socket: %p\n", sk);
1334 return;
1335 }
1336
1337 sk_refcnt_debug_dec(sk);
1338}
1339
1340static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1341{
1342 u32 *history = po->rollover->history;
1343 u32 victim, rxhash;
1344 int i, count = 0;
1345
1346 rxhash = skb_get_hash(skb);
1347 for (i = 0; i < ROLLOVER_HLEN; i++)
1348 if (READ_ONCE(history[i]) == rxhash)
1349 count++;
1350
1351 victim = prandom_u32() % ROLLOVER_HLEN;
1352
1353 /* Avoid dirtying the cache line if possible */
1354 if (READ_ONCE(history[victim]) != rxhash)
1355 WRITE_ONCE(history[victim], rxhash);
1356
1357 return count > (ROLLOVER_HLEN >> 1);
1358}
1359
1360static unsigned int fanout_demux_hash(struct packet_fanout *f,
1361 struct sk_buff *skb,
1362 unsigned int num)
1363{
1364 return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
1365}
1366
1367static unsigned int fanout_demux_lb(struct packet_fanout *f,
1368 struct sk_buff *skb,
1369 unsigned int num)
1370{
1371 unsigned int val = atomic_inc_return(&f->rr_cur);
1372
1373 return val % num;
1374}
1375
1376static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1377 struct sk_buff *skb,
1378 unsigned int num)
1379{
1380 return smp_processor_id() % num;
1381}
1382
1383static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1384 struct sk_buff *skb,
1385 unsigned int num)
1386{
1387 return prandom_u32_max(num);
1388}
1389
1390static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1391 struct sk_buff *skb,
1392 unsigned int idx, bool try_self,
1393 unsigned int num)
1394{
1395 struct packet_sock *po, *po_next, *po_skip = NULL;
1396 unsigned int i, j, room = ROOM_NONE;
1397
1398 po = pkt_sk(f->arr[idx]);
1399
1400 if (try_self) {
1401 room = packet_rcv_has_room(po, skb);
1402 if (room == ROOM_NORMAL ||
1403 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1404 return idx;
1405 po_skip = po;
1406 }
1407
1408 i = j = min_t(int, po->rollover->sock, num - 1);
1409 do {
1410 po_next = pkt_sk(f->arr[i]);
1411 if (po_next != po_skip && !po_next->pressure &&
1412 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1413 if (i != j)
1414 po->rollover->sock = i;
1415 atomic_long_inc(&po->rollover->num);
1416 if (room == ROOM_LOW)
1417 atomic_long_inc(&po->rollover->num_huge);
1418 return i;
1419 }
1420
1421 if (++i == num)
1422 i = 0;
1423 } while (i != j);
1424
1425 atomic_long_inc(&po->rollover->num_failed);
1426 return idx;
1427}
1428
1429static unsigned int fanout_demux_qm(struct packet_fanout *f,
1430 struct sk_buff *skb,
1431 unsigned int num)
1432{
1433 return skb_get_queue_mapping(skb) % num;
1434}
1435
1436static unsigned int fanout_demux_bpf(struct packet_fanout *f,
1437 struct sk_buff *skb,
1438 unsigned int num)
1439{
1440 struct bpf_prog *prog;
1441 unsigned int ret = 0;
1442
1443 rcu_read_lock();
1444 prog = rcu_dereference(f->bpf_prog);
1445 if (prog)
1446 ret = bpf_prog_run_clear_cb(prog, skb) % num;
1447 rcu_read_unlock();
1448
1449 return ret;
1450}
1451
1452static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1453{
1454 return f->flags & (flag >> 8);
1455}
1456
1457static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1458 struct packet_type *pt, struct net_device *orig_dev)
1459{
1460 struct packet_fanout *f = pt->af_packet_priv;
1461 unsigned int num = READ_ONCE(f->num_members);
1462 struct net *net = read_pnet(&f->net);
1463 struct packet_sock *po;
1464 unsigned int idx;
1465
1466 if (!net_eq(dev_net(dev), net) || !num) {
1467 kfree_skb(skb);
1468 return 0;
1469 }
1470
1471 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1472 skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
1473 if (!skb)
1474 return 0;
1475 }
1476 switch (f->type) {
1477 case PACKET_FANOUT_HASH:
1478 default:
1479 idx = fanout_demux_hash(f, skb, num);
1480 break;
1481 case PACKET_FANOUT_LB:
1482 idx = fanout_demux_lb(f, skb, num);
1483 break;
1484 case PACKET_FANOUT_CPU:
1485 idx = fanout_demux_cpu(f, skb, num);
1486 break;
1487 case PACKET_FANOUT_RND:
1488 idx = fanout_demux_rnd(f, skb, num);
1489 break;
1490 case PACKET_FANOUT_QM:
1491 idx = fanout_demux_qm(f, skb, num);
1492 break;
1493 case PACKET_FANOUT_ROLLOVER:
1494 idx = fanout_demux_rollover(f, skb, 0, false, num);
1495 break;
1496 case PACKET_FANOUT_CBPF:
1497 case PACKET_FANOUT_EBPF:
1498 idx = fanout_demux_bpf(f, skb, num);
1499 break;
1500 }
1501
1502 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1503 idx = fanout_demux_rollover(f, skb, idx, true, num);
1504
1505 po = pkt_sk(f->arr[idx]);
1506 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1507}
1508
1509DEFINE_MUTEX(fanout_mutex);
1510EXPORT_SYMBOL_GPL(fanout_mutex);
1511static LIST_HEAD(fanout_list);
1512static u16 fanout_next_id;
1513
1514static void __fanout_link(struct sock *sk, struct packet_sock *po)
1515{
1516 struct packet_fanout *f = po->fanout;
1517
1518 spin_lock(&f->lock);
1519 f->arr[f->num_members] = sk;
1520 smp_wmb();
1521 f->num_members++;
1522 if (f->num_members == 1)
1523 dev_add_pack(&f->prot_hook);
1524 spin_unlock(&f->lock);
1525}
1526
1527static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1528{
1529 struct packet_fanout *f = po->fanout;
1530 int i;
1531
1532 spin_lock(&f->lock);
1533 for (i = 0; i < f->num_members; i++) {
1534 if (f->arr[i] == sk)
1535 break;
1536 }
1537 BUG_ON(i >= f->num_members);
1538 f->arr[i] = f->arr[f->num_members - 1];
1539 f->num_members--;
1540 if (f->num_members == 0)
1541 __dev_remove_pack(&f->prot_hook);
1542 spin_unlock(&f->lock);
1543}
1544
1545static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1546{
1547 if (sk->sk_family != PF_PACKET)
1548 return false;
1549
1550 return ptype->af_packet_priv == pkt_sk(sk)->fanout;
1551}
1552
1553static void fanout_init_data(struct packet_fanout *f)
1554{
1555 switch (f->type) {
1556 case PACKET_FANOUT_LB:
1557 atomic_set(&f->rr_cur, 0);
1558 break;
1559 case PACKET_FANOUT_CBPF:
1560 case PACKET_FANOUT_EBPF:
1561 RCU_INIT_POINTER(f->bpf_prog, NULL);
1562 break;
1563 }
1564}
1565
1566static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
1567{
1568 struct bpf_prog *old;
1569
1570 spin_lock(&f->lock);
1571 old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
1572 rcu_assign_pointer(f->bpf_prog, new);
1573 spin_unlock(&f->lock);
1574
1575 if (old) {
1576 synchronize_net();
1577 bpf_prog_destroy(old);
1578 }
1579}
1580
1581static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data,
1582 unsigned int len)
1583{
1584 struct bpf_prog *new;
1585 struct sock_fprog fprog;
1586 int ret;
1587
1588 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1589 return -EPERM;
1590 if (len != sizeof(fprog))
1591 return -EINVAL;
1592 if (copy_from_user(&fprog, data, len))
1593 return -EFAULT;
1594
1595 ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
1596 if (ret)
1597 return ret;
1598
1599 __fanout_set_data_bpf(po->fanout, new);
1600 return 0;
1601}
1602
1603static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data,
1604 unsigned int len)
1605{
1606 struct bpf_prog *new;
1607 u32 fd;
1608
1609 if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
1610 return -EPERM;
1611 if (len != sizeof(fd))
1612 return -EINVAL;
1613 if (copy_from_user(&fd, data, len))
1614 return -EFAULT;
1615
1616 new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
1617 if (IS_ERR(new))
1618 return PTR_ERR(new);
1619
1620 __fanout_set_data_bpf(po->fanout, new);
1621 return 0;
1622}
1623
1624static int fanout_set_data(struct packet_sock *po, char __user *data,
1625 unsigned int len)
1626{
1627 switch (po->fanout->type) {
1628 case PACKET_FANOUT_CBPF:
1629 return fanout_set_data_cbpf(po, data, len);
1630 case PACKET_FANOUT_EBPF:
1631 return fanout_set_data_ebpf(po, data, len);
1632 default:
1633 return -EINVAL;
 1634 }
1635}
1636
1637static void fanout_release_data(struct packet_fanout *f)
1638{
1639 switch (f->type) {
1640 case PACKET_FANOUT_CBPF:
1641 case PACKET_FANOUT_EBPF:
1642 __fanout_set_data_bpf(f, NULL);
 1643 }
1644}
1645
1646static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
1647{
1648 struct packet_fanout *f;
1649
1650 list_for_each_entry(f, &fanout_list, list) {
1651 if (f->id == candidate_id &&
1652 read_pnet(&f->net) == sock_net(sk)) {
1653 return false;
1654 }
1655 }
1656 return true;
1657}
1658
1659static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
1660{
1661 u16 id = fanout_next_id;
1662
1663 do {
1664 if (__fanout_id_is_free(sk, id)) {
1665 *new_id = id;
1666 fanout_next_id = id + 1;
1667 return true;
1668 }
1669
1670 id++;
1671 } while (id != fanout_next_id);
1672
1673 return false;
1674}
1675
1676static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1677{
1678 struct packet_rollover *rollover = NULL;
1679 struct packet_sock *po = pkt_sk(sk);
1680 struct packet_fanout *f, *match;
1681 u8 type = type_flags & 0xff;
1682 u8 flags = type_flags >> 8;
1683 int err;
1684
1685 switch (type) {
1686 case PACKET_FANOUT_ROLLOVER:
1687 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1688 return -EINVAL;
1689 case PACKET_FANOUT_HASH:
1690 case PACKET_FANOUT_LB:
1691 case PACKET_FANOUT_CPU:
1692 case PACKET_FANOUT_RND:
1693 case PACKET_FANOUT_QM:
1694 case PACKET_FANOUT_CBPF:
1695 case PACKET_FANOUT_EBPF:
1696 break;
1697 default:
1698 return -EINVAL;
1699 }
1700
1701 mutex_lock(&fanout_mutex);
1702
1703 err = -EALREADY;
1704 if (po->fanout)
1705 goto out;
1706
1707 if (type == PACKET_FANOUT_ROLLOVER ||
1708 (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
1709 err = -ENOMEM;
1710 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
1711 if (!rollover)
1712 goto out;
1713 atomic_long_set(&rollover->num, 0);
1714 atomic_long_set(&rollover->num_huge, 0);
1715 atomic_long_set(&rollover->num_failed, 0);
1716 }
1717
1718 if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
1719 if (id != 0) {
1720 err = -EINVAL;
1721 goto out;
1722 }
1723 if (!fanout_find_new_id(sk, &id)) {
1724 err = -ENOMEM;
1725 goto out;
1726 }
1727 /* ephemeral flag for the first socket in the group: drop it */
1728 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
1729 }
1730
1731 match = NULL;
1732 list_for_each_entry(f, &fanout_list, list) {
1733 if (f->id == id &&
1734 read_pnet(&f->net) == sock_net(sk)) {
1735 match = f;
1736 break;
1737 }
1738 }
1739 err = -EINVAL;
1740 if (match && match->flags != flags)
1741 goto out;
1742 if (!match) {
1743 err = -ENOMEM;
1744 match = kzalloc(sizeof(*match), GFP_KERNEL);
1745 if (!match)
1746 goto out;
1747 write_pnet(&match->net, sock_net(sk));
1748 match->id = id;
1749 match->type = type;
1750 match->flags = flags;
1751 INIT_LIST_HEAD(&match->list);
1752 spin_lock_init(&match->lock);
1753 refcount_set(&match->sk_ref, 0);
1754 fanout_init_data(match);
1755 match->prot_hook.type = po->prot_hook.type;
1756 match->prot_hook.dev = po->prot_hook.dev;
1757 match->prot_hook.func = packet_rcv_fanout;
1758 match->prot_hook.af_packet_priv = match;
1759 match->prot_hook.id_match = match_fanout_group;
1760 list_add(&match->list, &fanout_list);
1761 }
1762 err = -EINVAL;
1763
1764 spin_lock(&po->bind_lock);
1765 if (po->running &&
1766 match->type == type &&
1767 match->prot_hook.type == po->prot_hook.type &&
1768 match->prot_hook.dev == po->prot_hook.dev) {
1769 err = -ENOSPC;
1770 if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1771 __dev_remove_pack(&po->prot_hook);
1772 po->fanout = match;
1773 po->rollover = rollover;
1774 rollover = NULL;
1775 refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
1776 __fanout_link(sk, po);
1777 err = 0;
1778 }
1779 }
1780 spin_unlock(&po->bind_lock);
1781
1782 if (err && !refcount_read(&match->sk_ref)) {
1783 list_del(&match->list);
1784 kfree(match);
1785 }
1786
1787out:
1788 kfree(rollover);
1789 mutex_unlock(&fanout_mutex);
1790 return err;
1791}
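
/* A hedged user-space sketch of joining a fanout group (the socket must
 * already be bound, and all members must use the same protocol and device
 * for the group match above to succeed; error handling omitted):
 *
 *	int id  = 42;
 *	int val = id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * The group id sits in the low 16 bits of val and the type/flags in the
 * high 16 bits, mirroring how fanout_add() receives them. Every socket that
 * joins with the same id in the same netns is then served by
 * packet_rcv_fanout(), which picks one member per packet via the demux
 * function selected by the fanout type.
 */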
1792
1793/* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
1794 * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
1795 * It is the responsibility of the caller to call fanout_release_data() and
1796 * free the returned packet_fanout (after synchronize_net())
1797 */
1798static struct packet_fanout *fanout_release(struct sock *sk)
1799{
1800 struct packet_sock *po = pkt_sk(sk);
1801 struct packet_fanout *f;
1802
1803 mutex_lock(&fanout_mutex);
1804 f = po->fanout;
1805 if (f) {
1806 po->fanout = NULL;
1807
1808 if (refcount_dec_and_test(&f->sk_ref))
1809 list_del(&f->list);
1810 else
1811 f = NULL;
1812 }
1813 mutex_unlock(&fanout_mutex);
1814
1815 return f;
1816}
1817
1818static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
1819 struct sk_buff *skb)
1820{
1821 /* Earlier code assumed this would be a VLAN pkt, double-check
1822 * this now that we have the actual packet in hand. We can only
1823 * do this check on Ethernet devices.
1824 */
1825 if (unlikely(dev->type != ARPHRD_ETHER))
1826 return false;
1827
1828 skb_reset_mac_header(skb);
1829 return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
1830}
1831
1832static const struct proto_ops packet_ops;
1833
1834static const struct proto_ops packet_ops_spkt;
1835
1836static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1837 struct packet_type *pt, struct net_device *orig_dev)
1838{
1839 struct sock *sk;
1840 struct sockaddr_pkt *spkt;
1841
1842 /*
1843 * When we registered the protocol we saved the socket in the data
1844 * field for just this event.
1845 */
1846
1847 sk = pt->af_packet_priv;
1848
1849 /*
1850 * Yank back the headers [hope the device set this
1851 * right or kerboom...]
1852 *
1853 * Incoming packets have ll header pulled,
1854 * push it back.
1855 *
1856 * For outgoing ones skb->data == skb_mac_header(skb)
1857 * so that this procedure is noop.
1858 */
1859
1860 if (skb->pkt_type == PACKET_LOOPBACK)
1861 goto out;
1862
1863 if (!net_eq(dev_net(dev), sock_net(sk)))
1864 goto out;
1865
1866 skb = skb_share_check(skb, GFP_ATOMIC);
1867 if (skb == NULL)
1868 goto oom;
1869
1870 /* drop any routing info */
1871 skb_dst_drop(skb);
1872
1873 /* drop conntrack reference */
1874 nf_reset(skb);
1875
1876 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1877
1878 skb_push(skb, skb->data - skb_mac_header(skb));
1879
1880 /*
1881 * The SOCK_PACKET socket receives _all_ frames.
1882 */
1883
1884 spkt->spkt_family = dev->type;
1885 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1886 spkt->spkt_protocol = skb->protocol;
1887
1888 /*
1889 * Charge the memory to the socket. This is done specifically
1890 * to prevent sockets using all the memory up.
1891 */
1892
1893 if (sock_queue_rcv_skb(sk, skb) == 0)
1894 return 0;
1895
1896out:
1897 kfree_skb(skb);
1898oom:
1899 return 0;
1900}
1901
1902
1903/*
1904 * Output a raw packet to a device layer. This bypasses all the other
1905 * protocol layers and you must therefore supply it with a complete frame
1906 */
1907
1908static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1909 size_t len)
1910{
1911 struct sock *sk = sock->sk;
1912 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1913 struct sk_buff *skb = NULL;
1914 struct net_device *dev;
1915 struct sockcm_cookie sockc;
1916 __be16 proto = 0;
1917 int err;
1918 int extra_len = 0;
1919
1920 /*
1921 * Get and verify the address.
1922 */
1923
1924 if (saddr) {
1925 if (msg->msg_namelen < sizeof(struct sockaddr))
1926 return -EINVAL;
1927 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1928 proto = saddr->spkt_protocol;
1929 } else
1930 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1931
1932 /*
1933 * Find the device first to size check it
1934 */
1935
1936 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1937retry:
1938 rcu_read_lock();
1939 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1940 err = -ENODEV;
1941 if (dev == NULL)
1942 goto out_unlock;
1943
1944 err = -ENETDOWN;
1945 if (!(dev->flags & IFF_UP))
1946 goto out_unlock;
1947
1948 /*
1949 * You may not queue a frame bigger than the mtu. This is the lowest level
1950 * raw protocol and you must do your own fragmentation at this level.
1951 */
1952
1953 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1954 if (!netif_supports_nofcs(dev)) {
1955 err = -EPROTONOSUPPORT;
1956 goto out_unlock;
1957 }
1958 extra_len = 4; /* We're doing our own CRC */
1959 }
1960
1961 err = -EMSGSIZE;
1962 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1963 goto out_unlock;
1964
1965 if (!skb) {
1966 size_t reserved = LL_RESERVED_SPACE(dev);
1967 int tlen = dev->needed_tailroom;
1968 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1969
1970 rcu_read_unlock();
1971 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1972 if (skb == NULL)
1973 return -ENOBUFS;
1974 /* FIXME: Save some space for broken drivers that write a hard
1975 * header at transmission time by themselves. PPP is the notable
1976 * one here. This should really be fixed at the driver level.
1977 */
1978 skb_reserve(skb, reserved);
1979 skb_reset_network_header(skb);
1980
1981 /* Try to align data part correctly */
1982 if (hhlen) {
1983 skb->data -= hhlen;
1984 skb->tail -= hhlen;
1985 if (len < hhlen)
1986 skb_reset_network_header(skb);
1987 }
1988 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1989 if (err)
1990 goto out_free;
1991 goto retry;
1992 }
1993
1994 if (!dev_validate_header(dev, skb->data, len)) {
1995 err = -EINVAL;
1996 goto out_unlock;
1997 }
1998 if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
1999 !packet_extra_vlan_len_allowed(dev, skb)) {
2000 err = -EMSGSIZE;
2001 goto out_unlock;
2002 }
2003
2004 sockc.tsflags = sk->sk_tsflags;
2005 if (msg->msg_controllen) {
2006 err = sock_cmsg_send(sk, msg, &sockc);
2007 if (unlikely(err))
2008 goto out_unlock;
2009 }
2010
2011 skb->protocol = proto;
2012 skb->dev = dev;
2013 skb->priority = sk->sk_priority;
2014 skb->mark = sk->sk_mark;
2015
2016 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2017
2018 if (unlikely(extra_len == 4))
2019 skb->no_fcs = 1;
2020
2021 skb_probe_transport_header(skb, 0);
2022
2023 dev_queue_xmit(skb);
2024 rcu_read_unlock();
2025 return len;
2026
2027out_unlock:
2028 rcu_read_unlock();
2029out_free:
2030 kfree_skb(skb);
2031 return err;
2032}
2033
2034static unsigned int run_filter(struct sk_buff *skb,
2035 const struct sock *sk,
2036 unsigned int res)
2037{
2038 struct sk_filter *filter;
2039
2040 rcu_read_lock();
2041 filter = rcu_dereference(sk->sk_filter);
2042 if (filter != NULL)
2043 res = bpf_prog_run_clear_cb(filter->prog, skb);
2044 rcu_read_unlock();
2045
2046 return res;
2047}
2048
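/* Copy a virtio_net_hdr describing the skb (checksum/GSO metadata) to the
 * start of the receive buffer, consuming sizeof(struct virtio_net_hdr)
 * from the caller-supplied length.
 */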
2049static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
2050 size_t *len)
2051{
2052 struct virtio_net_hdr vnet_hdr;
2053
2054 if (*len < sizeof(vnet_hdr))
2055 return -EINVAL;
2056 *len -= sizeof(vnet_hdr);
2057
2058 if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
2059 return -EINVAL;
2060
2061 return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
2062}
2063
2064/*
2065 * This function does lazy skb cloning in the hope that most packets
2066 * are discarded by BPF.
2067 *
2068 * Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
2069 * and skb->cb are mangled. This works because (and until) packets
2070 * falling here are owned by the current CPU. Output packets are cloned
2071 * by dev_queue_xmit_nit(), input packets are processed by net_bh
2072 * sequentially, so that if we return the skb to its original state on
2073 * exit, we will not harm anyone.
2074 */
2075
2076static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
2077 struct packet_type *pt, struct net_device *orig_dev)
2078{
2079 struct sock *sk;
2080 struct sockaddr_ll *sll;
2081 struct packet_sock *po;
2082 u8 *skb_head = skb->data;
2083 int skb_len = skb->len;
2084 unsigned int snaplen, res;
2085 bool is_drop_n_account = false;
2086
2087 if (skb->pkt_type == PACKET_LOOPBACK)
2088 goto drop;
2089
2090 sk = pt->af_packet_priv;
2091 po = pkt_sk(sk);
2092
2093 if (!net_eq(dev_net(dev), sock_net(sk)))
2094 goto drop;
2095
2096 skb->dev = dev;
2097
2098 if (dev->header_ops) {
2099 /* The device has an explicit notion of ll header,
2100 * exported to higher levels.
2101 *
2102		 * Otherwise, the device hides the details of its frame
2103		 * structure, so that the corresponding packet head is
2104		 * never delivered to the user.
2105 */
2106 if (sk->sk_type != SOCK_DGRAM)
2107 skb_push(skb, skb->data - skb_mac_header(skb));
2108 else if (skb->pkt_type == PACKET_OUTGOING) {
2109 /* Special case: outgoing packets have ll header at head */
2110 skb_pull(skb, skb_network_offset(skb));
2111 }
2112 }
2113
2114 snaplen = skb->len;
2115
2116 res = run_filter(skb, sk, snaplen);
2117 if (!res)
2118 goto drop_n_restore;
2119 if (snaplen > res)
2120 snaplen = res;
2121
2122 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
2123 goto drop_n_acct;
2124
2125 if (skb_shared(skb)) {
2126 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
2127 if (nskb == NULL)
2128 goto drop_n_acct;
2129
2130 if (skb_head != skb->data) {
2131 skb->data = skb_head;
2132 skb->len = skb_len;
2133 }
2134 consume_skb(skb);
2135 skb = nskb;
2136 }
2137
2138 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
2139
2140 sll = &PACKET_SKB_CB(skb)->sa.ll;
2141 sll->sll_hatype = dev->type;
2142 sll->sll_pkttype = skb->pkt_type;
2143 if (unlikely(po->origdev))
2144 sll->sll_ifindex = orig_dev->ifindex;
2145 else
2146 sll->sll_ifindex = dev->ifindex;
2147
2148 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2149
2150 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
2151 * Use their space for storing the original skb length.
2152 */
2153 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
2154
2155 if (pskb_trim(skb, snaplen))
2156 goto drop_n_acct;
2157
2158 skb_set_owner_r(skb, sk);
2159 skb->dev = NULL;
2160 skb_dst_drop(skb);
2161
2162 /* drop conntrack reference */
2163 nf_reset(skb);
2164
2165 spin_lock(&sk->sk_receive_queue.lock);
2166 po->stats.stats1.tp_packets++;
2167 sock_skb_set_dropcount(sk, skb);
2168 __skb_queue_tail(&sk->sk_receive_queue, skb);
2169 spin_unlock(&sk->sk_receive_queue.lock);
2170 sk->sk_data_ready(sk);
2171 return 0;
2172
2173drop_n_acct:
2174 is_drop_n_account = true;
2175 spin_lock(&sk->sk_receive_queue.lock);
2176 po->stats.stats1.tp_drops++;
2177 atomic_inc(&sk->sk_drops);
2178 spin_unlock(&sk->sk_receive_queue.lock);
2179
2180drop_n_restore:
2181 if (skb_head != skb->data && skb_shared(skb)) {
2182 skb->data = skb_head;
2183 skb->len = skb_len;
2184 }
2185drop:
2186 if (!is_drop_n_account)
2187 consume_skb(skb);
2188 else
2189 kfree_skb(skb);
2190 return 0;
2191}
2192
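/* Receive path for sockets with a PACKET_RX_RING mapped: run the filter,
 * reserve the next free frame (V1/V2) or block space (V3) in the ring,
 * copy the packet and fill in the tpacket header, then flip the slot
 * status so userspace can consume it without a recvmsg() copy.
 */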
2193static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
2194 struct packet_type *pt, struct net_device *orig_dev)
2195{
2196 struct sock *sk;
2197 struct packet_sock *po;
2198 struct sockaddr_ll *sll;
2199 union tpacket_uhdr h;
2200 u8 *skb_head = skb->data;
2201 int skb_len = skb->len;
2202 unsigned int snaplen, res;
2203 unsigned long status = TP_STATUS_USER;
2204 unsigned short macoff, netoff, hdrlen;
2205 struct sk_buff *copy_skb = NULL;
2206 struct timespec ts;
2207 __u32 ts_status;
2208 bool is_drop_n_account = false;
2209 unsigned int slot_id = 0;
2210 bool do_vnet = false;
2211
2212 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
2213	 * We may add members to them up to the current aligned size without forcing
2214 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
2215 */
2216 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
2217 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
2218
2219 if (skb->pkt_type == PACKET_LOOPBACK)
2220 goto drop;
2221
2222 sk = pt->af_packet_priv;
2223 po = pkt_sk(sk);
2224
2225 if (!net_eq(dev_net(dev), sock_net(sk)))
2226 goto drop;
2227
2228 if (dev->header_ops) {
2229 if (sk->sk_type != SOCK_DGRAM)
2230 skb_push(skb, skb->data - skb_mac_header(skb));
2231 else if (skb->pkt_type == PACKET_OUTGOING) {
2232 /* Special case: outgoing packets have ll header at head */
2233 skb_pull(skb, skb_network_offset(skb));
2234 }
2235 }
2236
2237 snaplen = skb->len;
2238
2239 res = run_filter(skb, sk, snaplen);
2240 if (!res)
2241 goto drop_n_restore;
2242
2243 if (skb->ip_summed == CHECKSUM_PARTIAL)
2244 status |= TP_STATUS_CSUMNOTREADY;
2245 else if (skb->pkt_type != PACKET_OUTGOING &&
2246 (skb->ip_summed == CHECKSUM_COMPLETE ||
2247 skb_csum_unnecessary(skb)))
2248 status |= TP_STATUS_CSUM_VALID;
2249
2250 if (snaplen > res)
2251 snaplen = res;
2252
2253 if (sk->sk_type == SOCK_DGRAM) {
2254 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2255 po->tp_reserve;
2256 } else {
2257 unsigned int maclen = skb_network_offset(skb);
2258 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2259 (maclen < 16 ? 16 : maclen)) +
2260 po->tp_reserve;
2261 if (po->has_vnet_hdr) {
2262 netoff += sizeof(struct virtio_net_hdr);
2263 do_vnet = true;
2264 }
2265 macoff = netoff - maclen;
2266 }
2267 if (po->tp_version <= TPACKET_V2) {
2268 if (macoff + snaplen > po->rx_ring.frame_size) {
2269 if (po->copy_thresh &&
2270 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2271 if (skb_shared(skb)) {
2272 copy_skb = skb_clone(skb, GFP_ATOMIC);
2273 } else {
2274 copy_skb = skb_get(skb);
2275 skb_head = skb->data;
2276 }
2277 if (copy_skb)
2278 skb_set_owner_r(copy_skb, sk);
2279 }
2280 snaplen = po->rx_ring.frame_size - macoff;
2281 if ((int)snaplen < 0) {
2282 snaplen = 0;
2283 do_vnet = false;
2284 }
2285 }
2286 } else if (unlikely(macoff + snaplen >
2287 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2288 u32 nval;
2289
2290 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2291 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2292 snaplen, nval, macoff);
2293 snaplen = nval;
2294 if (unlikely((int)snaplen < 0)) {
2295 snaplen = 0;
2296 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2297 do_vnet = false;
2298 }
2299 }
2300 spin_lock(&sk->sk_receive_queue.lock);
2301 h.raw = packet_current_rx_frame(po, skb,
2302 TP_STATUS_KERNEL, (macoff+snaplen));
2303 if (!h.raw)
2304 goto drop_n_account;
2305
2306 if (po->tp_version <= TPACKET_V2) {
2307 slot_id = po->rx_ring.head;
2308 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
2309 goto drop_n_account;
2310 __set_bit(slot_id, po->rx_ring.rx_owner_map);
2311 }
2312
2313 if (do_vnet &&
2314 virtio_net_hdr_from_skb(skb, h.raw + macoff -
2315 sizeof(struct virtio_net_hdr),
2316 vio_le(), true, 0)) {
2317 if (po->tp_version == TPACKET_V3)
2318 prb_clear_blk_fill_status(&po->rx_ring);
2319 goto drop_n_account;
2320 }
2321
2322 if (po->tp_version <= TPACKET_V2) {
2323 packet_increment_rx_head(po, &po->rx_ring);
2324 /*
2325		 * LOSING will be reported until you read the stats,
2326		 * because it's COR - Clear On Read.
2327		 * Anyway, this is done for V1/V2 only, as V3 doesn't need it
2328		 * at the packet level.
2329 */
2330 if (po->stats.stats1.tp_drops)
2331 status |= TP_STATUS_LOSING;
2332 }
2333
2334 po->stats.stats1.tp_packets++;
2335 if (copy_skb) {
2336 status |= TP_STATUS_COPY;
2337 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2338 }
2339 spin_unlock(&sk->sk_receive_queue.lock);
2340
2341 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2342
2343 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2344 getnstimeofday(&ts);
2345
2346 status |= ts_status;
2347
2348 switch (po->tp_version) {
2349 case TPACKET_V1:
2350 h.h1->tp_len = skb->len;
2351 h.h1->tp_snaplen = snaplen;
2352 h.h1->tp_mac = macoff;
2353 h.h1->tp_net = netoff;
2354 h.h1->tp_sec = ts.tv_sec;
2355 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2356 hdrlen = sizeof(*h.h1);
2357 break;
2358 case TPACKET_V2:
2359 h.h2->tp_len = skb->len;
2360 h.h2->tp_snaplen = snaplen;
2361 h.h2->tp_mac = macoff;
2362 h.h2->tp_net = netoff;
2363 h.h2->tp_sec = ts.tv_sec;
2364 h.h2->tp_nsec = ts.tv_nsec;
2365 if (skb_vlan_tag_present(skb)) {
2366 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2367 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2368 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2369 } else {
2370 h.h2->tp_vlan_tci = 0;
2371 h.h2->tp_vlan_tpid = 0;
2372 }
2373 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2374 hdrlen = sizeof(*h.h2);
2375 break;
2376 case TPACKET_V3:
2377		/* tp_next_offset and the vlan fields are already populated above,
2378		 * so don't clear those fields here.
2379 */
2380 h.h3->tp_status |= status;
2381 h.h3->tp_len = skb->len;
2382 h.h3->tp_snaplen = snaplen;
2383 h.h3->tp_mac = macoff;
2384 h.h3->tp_net = netoff;
2385 h.h3->tp_sec = ts.tv_sec;
2386 h.h3->tp_nsec = ts.tv_nsec;
2387 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2388 hdrlen = sizeof(*h.h3);
2389 break;
2390 default:
2391 BUG();
2392 }
2393
2394 sll = h.raw + TPACKET_ALIGN(hdrlen);
2395 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2396 sll->sll_family = AF_PACKET;
2397 sll->sll_hatype = dev->type;
2398 sll->sll_protocol = skb->protocol;
2399 sll->sll_pkttype = skb->pkt_type;
2400 if (unlikely(po->origdev))
2401 sll->sll_ifindex = orig_dev->ifindex;
2402 else
2403 sll->sll_ifindex = dev->ifindex;
2404
2405 smp_mb();
2406
2407#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2408 if (po->tp_version <= TPACKET_V2) {
2409 u8 *start, *end;
2410
2411 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2412 macoff + snaplen);
2413
2414 for (start = h.raw; start < end; start += PAGE_SIZE)
2415 flush_dcache_page(pgv_to_page(start));
2416 }
2417 smp_wmb();
2418#endif
2419
2420 if (po->tp_version <= TPACKET_V2) {
2421 spin_lock(&sk->sk_receive_queue.lock);
2422 __packet_set_status(po, h.raw, status);
2423 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
2424 spin_unlock(&sk->sk_receive_queue.lock);
2425 sk->sk_data_ready(sk);
2426 } else if (po->tp_version == TPACKET_V3) {
2427 prb_clear_blk_fill_status(&po->rx_ring);
2428 }
2429
2430drop_n_restore:
2431 if (skb_head != skb->data && skb_shared(skb)) {
2432 skb->data = skb_head;
2433 skb->len = skb_len;
2434 }
2435drop:
2436 if (!is_drop_n_account)
2437 consume_skb(skb);
2438 else
2439 kfree_skb(skb);
2440 return 0;
2441
2442drop_n_account:
2443 is_drop_n_account = true;
2444 po->stats.stats1.tp_drops++;
2445 spin_unlock(&sk->sk_receive_queue.lock);
2446
2447 sk->sk_data_ready(sk);
2448 kfree_skb(copy_skb);
2449 goto drop_n_restore;
2450}
2451
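/* skb destructor for TX ring frames: return the slot to userspace by
 * setting TP_STATUS_AVAILABLE (plus timestamp status bits) and wake up
 * a sender waiting in tpacket_snd() once no transmissions are pending.
 */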
2452static void tpacket_destruct_skb(struct sk_buff *skb)
2453{
2454 struct packet_sock *po = pkt_sk(skb->sk);
2455
2456 if (likely(po->tx_ring.pg_vec)) {
2457 void *ph;
2458 __u32 ts;
2459
2460 ph = skb_zcopy_get_nouarg(skb);
2461 packet_dec_pending(&po->tx_ring);
2462
2463 ts = __packet_set_timestamp(po, ph, skb);
2464 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2465
2466 if (!packet_read_pending(&po->tx_ring))
2467 complete(&po->skb_completion);
2468 }
2469
2470 sock_wfree(skb);
2471}
2472
2473static void tpacket_set_protocol(const struct net_device *dev,
2474 struct sk_buff *skb)
2475{
2476 if (dev->type == ARPHRD_ETHER) {
2477 skb_reset_mac_header(skb);
2478 skb->protocol = eth_hdr(skb)->h_proto;
2479 }
2480}
2481
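/* Sanity-check a userspace-supplied virtio_net_hdr: grow hdr_len so it
 * covers the checksum fields if necessary, and reject headers that claim
 * more bytes than the packet actually contains.
 */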
2482static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
2483{
2484 if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2485 (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2486 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
2487 __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
2488 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
2489 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
2490 __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
2491
2492 if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
2493 return -EINVAL;
2494
2495 return 0;
2496}
2497
2498static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
2499 struct virtio_net_hdr *vnet_hdr)
2500{
2501 if (*len < sizeof(*vnet_hdr))
2502 return -EINVAL;
2503 *len -= sizeof(*vnet_hdr);
2504
2505 if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
2506 return -EFAULT;
2507
2508 return __packet_snd_vnet_parse(vnet_hdr, *len);
2509}
2510
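/* Build an skb for one TX ring frame.  The link-layer header (or the
 * first copylen bytes) is copied into the linear area; the remaining
 * payload is attached as page fragments that reference the ring memory
 * directly, so no extra data copy is needed.
 */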
2511static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2512 void *frame, struct net_device *dev, void *data, int tp_len,
2513 __be16 proto, unsigned char *addr, int hlen, int copylen,
2514 const struct sockcm_cookie *sockc)
2515{
2516 union tpacket_uhdr ph;
2517 int to_write, offset, len, nr_frags, len_max;
2518 struct socket *sock = po->sk.sk_socket;
2519 struct page *page;
2520 int err;
2521
2522 ph.raw = frame;
2523
2524 skb->protocol = proto;
2525 skb->dev = dev;
2526 skb->priority = po->sk.sk_priority;
2527 skb->mark = po->sk.sk_mark;
2528 sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
2529 skb_zcopy_set_nouarg(skb, ph.raw);
2530
2531 skb_reserve(skb, hlen);
2532 skb_reset_network_header(skb);
2533
2534 to_write = tp_len;
2535
2536 if (sock->type == SOCK_DGRAM) {
2537 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2538 NULL, tp_len);
2539 if (unlikely(err < 0))
2540 return -EINVAL;
2541 } else if (copylen) {
2542 int hdrlen = min_t(int, copylen, tp_len);
2543
2544 skb_push(skb, dev->hard_header_len);
2545 skb_put(skb, copylen - dev->hard_header_len);
2546 err = skb_store_bits(skb, 0, data, hdrlen);
2547 if (unlikely(err))
2548 return err;
2549 if (!dev_validate_header(dev, skb->data, hdrlen))
2550 return -EINVAL;
2551 if (!skb->protocol)
2552 tpacket_set_protocol(dev, skb);
2553
2554 data += hdrlen;
2555 to_write -= hdrlen;
2556 }
2557
2558 offset = offset_in_page(data);
2559 len_max = PAGE_SIZE - offset;
2560 len = ((to_write > len_max) ? len_max : to_write);
2561
2562 skb->data_len = to_write;
2563 skb->len += to_write;
2564 skb->truesize += to_write;
2565 refcount_add(to_write, &po->sk.sk_wmem_alloc);
2566
2567 while (likely(to_write)) {
2568 nr_frags = skb_shinfo(skb)->nr_frags;
2569
2570 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2571			pr_err("Packet exceeds the number of skb frags (%lu)\n",
2572 MAX_SKB_FRAGS);
2573 return -EFAULT;
2574 }
2575
2576 page = pgv_to_page(data);
2577 data += len;
2578 flush_dcache_page(page);
2579 get_page(page);
2580 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2581 to_write -= len;
2582 offset = 0;
2583 len_max = PAGE_SIZE;
2584 len = ((to_write > len_max) ? len_max : to_write);
2585 }
2586
2587 skb_probe_transport_header(skb, 0);
2588
2589 return tp_len;
2590}
2591
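/* Parse one TX ring frame header: read tp_len for the configured
 * TPACKET version, check it against size_max, and work out where the
 * packet data starts within the frame (honouring PACKET_TX_HAS_OFF).
 */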
2592static int tpacket_parse_header(struct packet_sock *po, void *frame,
2593 int size_max, void **data)
2594{
2595 union tpacket_uhdr ph;
2596 int tp_len, off;
2597
2598 ph.raw = frame;
2599
2600 switch (po->tp_version) {
2601 case TPACKET_V3:
2602 if (ph.h3->tp_next_offset != 0) {
2603 pr_warn_once("variable sized slot not supported");
2604 return -EINVAL;
2605 }
2606 tp_len = ph.h3->tp_len;
2607 break;
2608 case TPACKET_V2:
2609 tp_len = ph.h2->tp_len;
2610 break;
2611 default:
2612 tp_len = ph.h1->tp_len;
2613 break;
2614 }
2615 if (unlikely(tp_len > size_max)) {
2616		pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
2617 return -EMSGSIZE;
2618 }
2619
2620 if (unlikely(po->tp_tx_has_off)) {
2621 int off_min, off_max;
2622
2623 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2624 off_max = po->tx_ring.frame_size - tp_len;
2625 if (po->sk.sk_type == SOCK_DGRAM) {
2626 switch (po->tp_version) {
2627 case TPACKET_V3:
2628 off = ph.h3->tp_net;
2629 break;
2630 case TPACKET_V2:
2631 off = ph.h2->tp_net;
2632 break;
2633 default:
2634 off = ph.h1->tp_net;
2635 break;
2636 }
2637 } else {
2638 switch (po->tp_version) {
2639 case TPACKET_V3:
2640 off = ph.h3->tp_mac;
2641 break;
2642 case TPACKET_V2:
2643 off = ph.h2->tp_mac;
2644 break;
2645 default:
2646 off = ph.h1->tp_mac;
2647 break;
2648 }
2649 }
2650 if (unlikely((off < off_min) || (off_max < off)))
2651 return -EINVAL;
2652 } else {
2653 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2654 }
2655
2656 *data = frame + off;
2657 return tp_len;
2658}
2659
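/* Transmit loop for sockets with a PACKET_TX_RING mapped: walk the ring,
 * build an skb for every frame marked TP_STATUS_SEND_REQUEST and hand it
 * to po->xmit().  Unless MSG_DONTWAIT is set, wait for outstanding frames
 * to complete before returning.
 */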
2660static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2661{
2662 struct sk_buff *skb = NULL;
2663 struct net_device *dev;
2664 struct virtio_net_hdr *vnet_hdr = NULL;
2665 struct sockcm_cookie sockc;
2666 __be16 proto;
2667 int err, reserve = 0;
2668 void *ph;
2669 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2670 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2671 unsigned char *addr = NULL;
2672 int tp_len, size_max;
2673 void *data;
2674 int len_sum = 0;
2675 int status = TP_STATUS_AVAILABLE;
2676 int hlen, tlen, copylen = 0;
2677 long timeo = 0;
2678
2679 mutex_lock(&po->pg_vec_lock);
2680
2681	/* The packet_sendmsg() check on tx_ring.pg_vec was lockless;
2682	 * we need to confirm it under the protection of pg_vec_lock.
2683 */
2684 if (unlikely(!po->tx_ring.pg_vec)) {
2685 err = -EBUSY;
2686 goto out;
2687 }
2688 if (likely(saddr == NULL)) {
2689 dev = packet_cached_dev_get(po);
2690 proto = po->num;
2691 } else {
2692 err = -EINVAL;
2693 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2694 goto out;
2695 if (msg->msg_namelen < (saddr->sll_halen
2696 + offsetof(struct sockaddr_ll,
2697 sll_addr)))
2698 goto out;
2699 proto = saddr->sll_protocol;
2700 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2701 if (po->sk.sk_socket->type == SOCK_DGRAM) {
2702 if (dev && msg->msg_namelen < dev->addr_len +
2703 offsetof(struct sockaddr_ll, sll_addr))
2704 goto out_put;
2705 addr = saddr->sll_addr;
2706 }
2707 }
2708
2709 err = -ENXIO;
2710 if (unlikely(dev == NULL))
2711 goto out;
2712 err = -ENETDOWN;
2713 if (unlikely(!(dev->flags & IFF_UP)))
2714 goto out_put;
2715
2716 sockc.tsflags = po->sk.sk_tsflags;
2717 if (msg->msg_controllen) {
2718 err = sock_cmsg_send(&po->sk, msg, &sockc);
2719 if (unlikely(err))
2720 goto out_put;
2721 }
2722
2723 if (po->sk.sk_socket->type == SOCK_RAW)
2724 reserve = dev->hard_header_len;
2725 size_max = po->tx_ring.frame_size
2726 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2727
2728 if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
2729 size_max = dev->mtu + reserve + VLAN_HLEN;
2730
2731 reinit_completion(&po->skb_completion);
2732
2733 do {
2734 ph = packet_current_frame(po, &po->tx_ring,
2735 TP_STATUS_SEND_REQUEST);
2736 if (unlikely(ph == NULL)) {
2737 if (need_wait && skb) {
2738 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
2739 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
2740 if (timeo <= 0) {
2741 err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
2742 goto out_put;
2743 }
2744 }
2745 /* check for additional frames */
2746 continue;
2747 }
2748
2749 skb = NULL;
2750 tp_len = tpacket_parse_header(po, ph, size_max, &data);
2751 if (tp_len < 0)
2752 goto tpacket_error;
2753
2754 status = TP_STATUS_SEND_REQUEST;
2755 hlen = LL_RESERVED_SPACE(dev);
2756 tlen = dev->needed_tailroom;
2757 if (po->has_vnet_hdr) {
2758 vnet_hdr = data;
2759 data += sizeof(*vnet_hdr);
2760 tp_len -= sizeof(*vnet_hdr);
2761 if (tp_len < 0 ||
2762 __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
2763 tp_len = -EINVAL;
2764 goto tpacket_error;
2765 }
2766 copylen = __virtio16_to_cpu(vio_le(),
2767 vnet_hdr->hdr_len);
2768 }
2769 copylen = max_t(int, copylen, dev->hard_header_len);
2770 skb = sock_alloc_send_skb(&po->sk,
2771 hlen + tlen + sizeof(struct sockaddr_ll) +
2772 (copylen - dev->hard_header_len),
2773 !need_wait, &err);
2774
2775 if (unlikely(skb == NULL)) {
2776 /* we assume the socket was initially writeable ... */
2777 if (likely(len_sum > 0))
2778 err = len_sum;
2779 goto out_status;
2780 }
2781 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
2782 addr, hlen, copylen, &sockc);
2783 if (likely(tp_len >= 0) &&
2784 tp_len > dev->mtu + reserve &&
2785 !po->has_vnet_hdr &&
2786 !packet_extra_vlan_len_allowed(dev, skb))
2787 tp_len = -EMSGSIZE;
2788
2789 if (unlikely(tp_len < 0)) {
2790tpacket_error:
2791 if (po->tp_loss) {
2792 __packet_set_status(po, ph,
2793 TP_STATUS_AVAILABLE);
2794 packet_increment_head(&po->tx_ring);
2795 kfree_skb(skb);
2796 continue;
2797 } else {
2798 status = TP_STATUS_WRONG_FORMAT;
2799 err = tp_len;
2800 goto out_status;
2801 }
2802 }
2803
2804 if (po->has_vnet_hdr) {
2805 if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
2806 tp_len = -EINVAL;
2807 goto tpacket_error;
2808 }
2809 virtio_net_hdr_set_proto(skb, vnet_hdr);
2810 }
2811
2812 skb->destructor = tpacket_destruct_skb;
2813 __packet_set_status(po, ph, TP_STATUS_SENDING);
2814 packet_inc_pending(&po->tx_ring);
2815
2816 status = TP_STATUS_SEND_REQUEST;
2817 err = po->xmit(skb);
2818 if (unlikely(err > 0)) {
2819 err = net_xmit_errno(err);
2820 if (err && __packet_get_status(po, ph) ==
2821 TP_STATUS_AVAILABLE) {
2822 /* skb was destructed already */
2823 skb = NULL;
2824 goto out_status;
2825 }
2826 /*
2827 * skb was dropped but not destructed yet;
2828 * let's treat it like congestion or err < 0
2829 */
2830 err = 0;
2831 }
2832 packet_increment_head(&po->tx_ring);
2833 len_sum += tp_len;
2834 } while (likely((ph != NULL) ||
2835		/* Note: packet_read_pending() might be slow if we have
2836		 * to call it, as it's a per-cpu variable, but in the fast path
2837		 * we already short-circuit the loop with the first
2838		 * condition, and luckily don't have to take that path
2839		 * anyway.
2840 */
2841 (need_wait && packet_read_pending(&po->tx_ring))));
2842
2843 err = len_sum;
2844 goto out_put;
2845
2846out_status:
2847 __packet_set_status(po, ph, status);
2848 kfree_skb(skb);
2849out_put:
2850 dev_put(dev);
2851out:
2852 mutex_unlock(&po->pg_vec_lock);
2853 return err;
2854}
2855
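/* Allocate an skb for packet_snd(): small packets are kept fully linear,
 * larger ones get @linear bytes of linear data plus paged data for the
 * remainder.
 */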
2856static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2857 size_t reserve, size_t len,
2858 size_t linear, int noblock,
2859 int *err)
2860{
2861 struct sk_buff *skb;
2862
2863 /* Under a page? Don't bother with paged skb. */
2864 if (prepad + len < PAGE_SIZE || !linear)
2865 linear = len;
2866
2867 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2868 err, 0);
2869 if (!skb)
2870 return NULL;
2871
2872 skb_reserve(skb, reserve);
2873 skb_put(skb, linear);
2874 skb->data_len = len - linear;
2875 skb->len += len - linear;
2876
2877 return skb;
2878}
2879
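/* Plain (non-ring) sendmsg path: resolve the destination device, copy the
 * packet from the message into a freshly allocated skb, apply the optional
 * virtio_net_hdr, timestamping and SOCK_NOFCS settings, and transmit it.
 */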
2880static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2881{
2882 struct sock *sk = sock->sk;
2883 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2884 struct sk_buff *skb;
2885 struct net_device *dev;
2886 __be16 proto;
2887 unsigned char *addr = NULL;
2888 int err, reserve = 0;
2889 struct sockcm_cookie sockc;
2890 struct virtio_net_hdr vnet_hdr = { 0 };
2891 int offset = 0;
2892 struct packet_sock *po = pkt_sk(sk);
2893 bool has_vnet_hdr = false;
2894 int hlen, tlen, linear;
2895 int extra_len = 0;
2896
2897 /*
2898 * Get and verify the address.
2899 */
2900
2901 if (likely(saddr == NULL)) {
2902 dev = packet_cached_dev_get(po);
2903 proto = po->num;
2904 } else {
2905 err = -EINVAL;
2906 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2907 goto out;
2908 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2909 goto out;
2910 proto = saddr->sll_protocol;
2911 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2912 if (sock->type == SOCK_DGRAM) {
2913 if (dev && msg->msg_namelen < dev->addr_len +
2914 offsetof(struct sockaddr_ll, sll_addr))
2915 goto out_unlock;
2916 addr = saddr->sll_addr;
2917 }
2918 }
2919
2920 err = -ENXIO;
2921 if (unlikely(dev == NULL))
2922 goto out_unlock;
2923 err = -ENETDOWN;
2924 if (unlikely(!(dev->flags & IFF_UP)))
2925 goto out_unlock;
2926
2927 sockc.tsflags = sk->sk_tsflags;
2928 sockc.mark = sk->sk_mark;
2929 if (msg->msg_controllen) {
2930 err = sock_cmsg_send(sk, msg, &sockc);
2931 if (unlikely(err))
2932 goto out_unlock;
2933 }
2934
2935 if (sock->type == SOCK_RAW)
2936 reserve = dev->hard_header_len;
2937 if (po->has_vnet_hdr) {
2938 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
2939 if (err)
2940 goto out_unlock;
2941 has_vnet_hdr = true;
2942 }
2943
2944 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2945 if (!netif_supports_nofcs(dev)) {
2946 err = -EPROTONOSUPPORT;
2947 goto out_unlock;
2948 }
2949 extra_len = 4; /* We're doing our own CRC */
2950 }
2951
2952 err = -EMSGSIZE;
2953 if (!vnet_hdr.gso_type &&
2954 (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2955 goto out_unlock;
2956
2957 err = -ENOBUFS;
2958 hlen = LL_RESERVED_SPACE(dev);
2959 tlen = dev->needed_tailroom;
2960 linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
2961 linear = max(linear, min_t(int, len, dev->hard_header_len));
2962 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
2963 msg->msg_flags & MSG_DONTWAIT, &err);
2964 if (skb == NULL)
2965 goto out_unlock;
2966
2967 skb_reset_network_header(skb);
2968
2969 err = -EINVAL;
2970 if (sock->type == SOCK_DGRAM) {
2971 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2972 if (unlikely(offset < 0))
2973 goto out_free;
2974 } else if (reserve) {
2975 skb_reserve(skb, -reserve);
2976 if (len < reserve)
2977 skb_reset_network_header(skb);
2978 }
2979
2980 /* Returns -EFAULT on error */
2981 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2982 if (err)
2983 goto out_free;
2984
2985 if (sock->type == SOCK_RAW &&
2986 !dev_validate_header(dev, skb->data, len)) {
2987 err = -EINVAL;
2988 goto out_free;
2989 }
2990
2991 sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
2992
2993 if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
2994 !packet_extra_vlan_len_allowed(dev, skb)) {
2995 err = -EMSGSIZE;
2996 goto out_free;
2997 }
2998
2999 skb->protocol = proto;
3000 skb->dev = dev;
3001 skb->priority = sk->sk_priority;
3002 skb->mark = sockc.mark;
3003
3004 if (has_vnet_hdr) {
3005 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
3006 if (err)
3007 goto out_free;
3008 len += sizeof(vnet_hdr);
3009 virtio_net_hdr_set_proto(skb, &vnet_hdr);
3010 }
3011
3012 skb_probe_transport_header(skb, reserve);
3013
3014 if (unlikely(extra_len == 4))
3015 skb->no_fcs = 1;
3016
3017 err = po->xmit(skb);
3018 if (err > 0 && (err = net_xmit_errno(err)) != 0)
3019 goto out_unlock;
3020
3021 dev_put(dev);
3022
3023 return len;
3024
3025out_free:
3026 kfree_skb(skb);
3027out_unlock:
3028 if (dev)
3029 dev_put(dev);
3030out:
3031 return err;
3032}
3033
3034static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
3035{
3036 struct sock *sk = sock->sk;
3037 struct packet_sock *po = pkt_sk(sk);
3038
3039 if (po->tx_ring.pg_vec)
3040 return tpacket_snd(po, msg);
3041 else
3042 return packet_snd(sock, msg, len);
3043}
3044
3045/*
3046 * Close a PACKET socket. This is fairly simple. We immediately go
3047 * to 'closed' state and remove our protocol entry in the device list.
3048 */
3049
3050static int packet_release(struct socket *sock)
3051{
3052 struct sock *sk = sock->sk;
3053 struct packet_sock *po;
3054 struct packet_fanout *f;
3055 struct net *net;
3056 union tpacket_req_u req_u;
3057
3058 if (!sk)
3059 return 0;
3060
3061 net = sock_net(sk);
3062 po = pkt_sk(sk);
3063
3064 mutex_lock(&net->packet.sklist_lock);
3065 sk_del_node_init_rcu(sk);
3066 mutex_unlock(&net->packet.sklist_lock);
3067
3068 preempt_disable();
3069 sock_prot_inuse_add(net, sk->sk_prot, -1);
3070 preempt_enable();
3071
3072 spin_lock(&po->bind_lock);
3073 unregister_prot_hook(sk, false);
3074 packet_cached_dev_reset(po);
3075
3076 if (po->prot_hook.dev) {
3077 dev_put(po->prot_hook.dev);
3078 po->prot_hook.dev = NULL;
3079 }
3080 spin_unlock(&po->bind_lock);
3081
3082 packet_flush_mclist(sk);
3083
3084 lock_sock(sk);
3085 if (po->rx_ring.pg_vec) {
3086 memset(&req_u, 0, sizeof(req_u));
3087 packet_set_ring(sk, &req_u, 1, 0);
3088 }
3089
3090 if (po->tx_ring.pg_vec) {
3091 memset(&req_u, 0, sizeof(req_u));
3092 packet_set_ring(sk, &req_u, 1, 1);
3093 }
3094 release_sock(sk);
3095
3096 f = fanout_release(sk);
3097
3098 synchronize_net();
3099
3100 if (f) {
3101 kfree(po->rollover);
3102 fanout_release_data(f);
3103 kfree(f);
3104 }
3105 /*
3106 * Now the socket is dead. No more input will appear.
3107 */
3108 sock_orphan(sk);
3109 sock->sk = NULL;
3110
3111 /* Purge queues */
3112
3113 skb_queue_purge(&sk->sk_receive_queue);
3114 packet_free_pending(po);
3115 sk_refcnt_debug_release(sk);
3116
3117 sock_put(sk);
3118 return 0;
3119}
3120
3121/*
3122 * Attach a packet hook.
3123 */
3124
3125static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
3126 __be16 proto)
3127{
3128 struct packet_sock *po = pkt_sk(sk);
3129 struct net_device *dev_curr;
3130 __be16 proto_curr;
3131 bool need_rehook;
3132 struct net_device *dev = NULL;
3133 int ret = 0;
3134 bool unlisted = false;
3135
3136 lock_sock(sk);
3137 spin_lock(&po->bind_lock);
3138 rcu_read_lock();
3139
3140 if (po->fanout) {
3141 ret = -EINVAL;
3142 goto out_unlock;
3143 }
3144
3145 if (name) {
3146 dev = dev_get_by_name_rcu(sock_net(sk), name);
3147 if (!dev) {
3148 ret = -ENODEV;
3149 goto out_unlock;
3150 }
3151 } else if (ifindex) {
3152 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
3153 if (!dev) {
3154 ret = -ENODEV;
3155 goto out_unlock;
3156 }
3157 }
3158
3159 if (dev)
3160 dev_hold(dev);
3161
3162 proto_curr = po->prot_hook.type;
3163 dev_curr = po->prot_hook.dev;
3164
3165 need_rehook = proto_curr != proto || dev_curr != dev;
3166
3167 if (need_rehook) {
3168 if (po->running) {
3169 rcu_read_unlock();
3170 /* prevents packet_notifier() from calling
3171 * register_prot_hook()
3172 */
3173 po->num = 0;
3174 __unregister_prot_hook(sk, true);
3175 rcu_read_lock();
3176 dev_curr = po->prot_hook.dev;
3177 if (dev)
3178 unlisted = !dev_get_by_index_rcu(sock_net(sk),
3179 dev->ifindex);
3180 }
3181
3182 BUG_ON(po->running);
3183 po->num = proto;
3184 po->prot_hook.type = proto;
3185
3186 if (unlikely(unlisted)) {
3187 dev_put(dev);
3188 po->prot_hook.dev = NULL;
3189 po->ifindex = -1;
3190 packet_cached_dev_reset(po);
3191 } else {
3192 po->prot_hook.dev = dev;
3193 po->ifindex = dev ? dev->ifindex : 0;
3194 packet_cached_dev_assign(po, dev);
3195 }
3196 }
3197 if (dev_curr)
3198 dev_put(dev_curr);
3199
3200 if (proto == 0 || !need_rehook)
3201 goto out_unlock;
3202
3203 if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
3204 register_prot_hook(sk);
3205 } else {
3206 sk->sk_err = ENETDOWN;
3207 if (!sock_flag(sk, SOCK_DEAD))
3208 sk->sk_error_report(sk);
3209 }
3210
3211out_unlock:
3212 rcu_read_unlock();
3213 spin_unlock(&po->bind_lock);
3214 release_sock(sk);
3215 return ret;
3216}
3217
3218/*
3219 * Bind a packet socket to a device
3220 */
3221
3222static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
3223 int addr_len)
3224{
3225 struct sock *sk = sock->sk;
3226 char name[sizeof(uaddr->sa_data) + 1];
3227
3228 /*
3229 * Check legality
3230 */
3231
3232 if (addr_len != sizeof(struct sockaddr))
3233 return -EINVAL;
3234	/* uaddr->sa_data comes from userspace; it's not guaranteed to be
3235 * zero-terminated.
3236 */
3237 memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
3238 name[sizeof(uaddr->sa_data)] = 0;
3239
3240 return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
3241}
3242
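/* Bind an AF_PACKET socket to an interface and protocol via sockaddr_ll.
 * A typical userspace call looks roughly like:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),	/- example interface -/
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */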
3243static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
3244{
3245 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
3246 struct sock *sk = sock->sk;
3247
3248 /*
3249 * Check legality
3250 */
3251
3252 if (addr_len < sizeof(struct sockaddr_ll))
3253 return -EINVAL;
3254 if (sll->sll_family != AF_PACKET)
3255 return -EINVAL;
3256
3257 return packet_do_bind(sk, NULL, sll->sll_ifindex,
3258 sll->sll_protocol ? : pkt_sk(sk)->num);
3259}
3260
3261static struct proto packet_proto = {
3262 .name = "PACKET",
3263 .owner = THIS_MODULE,
3264 .obj_size = sizeof(struct packet_sock),
3265};
3266
3267/*
3268 * Create a packet socket (SOCK_RAW, SOCK_DGRAM, or the legacy SOCK_PACKET).
3269 */
3270
3271static int packet_create(struct net *net, struct socket *sock, int protocol,
3272 int kern)
3273{
3274 struct sock *sk;
3275 struct packet_sock *po;
3276 __be16 proto = (__force __be16)protocol; /* weird, but documented */
3277 int err;
3278
3279 if (!ns_capable(net->user_ns, CAP_NET_RAW))
3280 return -EPERM;
3281 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
3282 sock->type != SOCK_PACKET)
3283 return -ESOCKTNOSUPPORT;
3284
3285 sock->state = SS_UNCONNECTED;
3286
3287 err = -ENOBUFS;
3288 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
3289 if (sk == NULL)
3290 goto out;
3291
3292 sock->ops = &packet_ops;
3293 if (sock->type == SOCK_PACKET)
3294 sock->ops = &packet_ops_spkt;
3295
3296 sock_init_data(sock, sk);
3297
3298 po = pkt_sk(sk);
3299 init_completion(&po->skb_completion);
3300 sk->sk_family = PF_PACKET;
3301 po->num = proto;
3302 po->xmit = dev_queue_xmit;
3303
3304 err = packet_alloc_pending(po);
3305 if (err)
3306 goto out2;
3307
3308 packet_cached_dev_reset(po);
3309
3310 sk->sk_destruct = packet_sock_destruct;
3311 sk_refcnt_debug_inc(sk);
3312
3313 /*
3314 * Attach a protocol block
3315 */
3316
3317 spin_lock_init(&po->bind_lock);
3318 mutex_init(&po->pg_vec_lock);
3319 po->rollover = NULL;
3320 po->prot_hook.func = packet_rcv;
3321
3322 if (sock->type == SOCK_PACKET)
3323 po->prot_hook.func = packet_rcv_spkt;
3324
3325 po->prot_hook.af_packet_priv = sk;
3326
3327 if (proto) {
3328 po->prot_hook.type = proto;
3329 __register_prot_hook(sk);
3330 }
3331
3332 mutex_lock(&net->packet.sklist_lock);
3333 sk_add_node_tail_rcu(sk, &net->packet.sklist);
3334 mutex_unlock(&net->packet.sklist_lock);
3335
3336 preempt_disable();
3337 sock_prot_inuse_add(net, &packet_proto, 1);
3338 preempt_enable();
3339
3340 return 0;
3341out2:
3342 sk_free(sk);
3343out:
3344 return err;
3345}
3346
3347/*
3348 * Pull a packet from our receive queue and hand it to the user.
3349 * If necessary we block.
3350 */
3351
3352static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
3353 int flags)
3354{
3355 struct sock *sk = sock->sk;
3356 struct sk_buff *skb;
3357 int copied, err;
3358 int vnet_hdr_len = 0;
3359 unsigned int origlen = 0;
3360
3361 err = -EINVAL;
3362 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3363 goto out;
3364
3365#if 0
3366 /* What error should we return now? EUNATTACH? */
3367 if (pkt_sk(sk)->ifindex < 0)
3368 return -ENODEV;
3369#endif
3370
3371 if (flags & MSG_ERRQUEUE) {
3372 err = sock_recv_errqueue(sk, msg, len,
3373 SOL_PACKET, PACKET_TX_TIMESTAMP);
3374 goto out;
3375 }
3376
3377 /*
3378 * Call the generic datagram receiver. This handles all sorts
3379 * of horrible races and re-entrancy so we can forget about it
3380 * in the protocol layers.
3381 *
3382	 * Now it will return ENETDOWN if the device has just gone down,
3383 * but then it will block.
3384 */
3385
3386 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3387
3388 /*
3389	 * An error occurred, so return it. Because skb_recv_datagram()
3390	 * handles the blocking, we don't need to see or worry about
3391	 * blocking retries.
3392 */
3393
3394 if (skb == NULL)
3395 goto out;
3396
3397 if (pkt_sk(sk)->pressure)
3398 packet_rcv_has_room(pkt_sk(sk), NULL);
3399
3400 if (pkt_sk(sk)->has_vnet_hdr) {
3401 err = packet_rcv_vnet(msg, skb, &len);
3402 if (err)
3403 goto out_free;
3404 vnet_hdr_len = sizeof(struct virtio_net_hdr);
3405 }
3406
3407 /* You lose any data beyond the buffer you gave. If it worries
3408	 * a user program, it can ask the device for its MTU
3409 * anyway.
3410 */
3411 copied = skb->len;
3412 if (copied > len) {
3413 copied = len;
3414 msg->msg_flags |= MSG_TRUNC;
3415 }
3416
3417 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3418 if (err)
3419 goto out_free;
3420
3421 if (sock->type != SOCK_PACKET) {
3422 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3423
3424 /* Original length was stored in sockaddr_ll fields */
3425 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3426 sll->sll_family = AF_PACKET;
3427 sll->sll_protocol = skb->protocol;
3428 }
3429
3430 sock_recv_ts_and_drops(msg, sk, skb);
3431
3432 if (msg->msg_name) {
3433 int copy_len;
3434
3435 /* If the address length field is there to be filled
3436 * in, we fill it in now.
3437 */
3438 if (sock->type == SOCK_PACKET) {
3439 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3440 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3441 copy_len = msg->msg_namelen;
3442 } else {
3443 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3444
3445 msg->msg_namelen = sll->sll_halen +
3446 offsetof(struct sockaddr_ll, sll_addr);
3447 copy_len = msg->msg_namelen;
3448 if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
3449 memset(msg->msg_name +
3450 offsetof(struct sockaddr_ll, sll_addr),
3451 0, sizeof(sll->sll_addr));
3452 msg->msg_namelen = sizeof(struct sockaddr_ll);
3453 }
3454 }
3455 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
3456 }
3457
3458 if (pkt_sk(sk)->auxdata) {
3459 struct tpacket_auxdata aux;
3460
3461 aux.tp_status = TP_STATUS_USER;
3462 if (skb->ip_summed == CHECKSUM_PARTIAL)
3463 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3464 else if (skb->pkt_type != PACKET_OUTGOING &&
3465 (skb->ip_summed == CHECKSUM_COMPLETE ||
3466 skb_csum_unnecessary(skb)))
3467 aux.tp_status |= TP_STATUS_CSUM_VALID;
3468
3469 aux.tp_len = origlen;
3470 aux.tp_snaplen = skb->len;
3471 aux.tp_mac = 0;
3472 aux.tp_net = skb_network_offset(skb);
3473 if (skb_vlan_tag_present(skb)) {
3474 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3475 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3476 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3477 } else {
3478 aux.tp_vlan_tci = 0;
3479 aux.tp_vlan_tpid = 0;
3480 }
3481 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3482 }
3483
3484 /*
3485 * Free or return the buffer as appropriate. Again this
3486 * hides all the races and re-entrancy issues from us.
3487 */
3488 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3489
3490out_free:
3491 skb_free_datagram(sk, skb);
3492out:
3493 return err;
3494}
3495
3496static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3497 int *uaddr_len, int peer)
3498{
3499 struct net_device *dev;
3500 struct sock *sk = sock->sk;
3501
3502 if (peer)
3503 return -EOPNOTSUPP;
3504
3505 uaddr->sa_family = AF_PACKET;
3506 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3507 rcu_read_lock();
3508 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3509 if (dev)
3510 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3511 rcu_read_unlock();
3512 *uaddr_len = sizeof(*uaddr);
3513
3514 return 0;
3515}
3516
3517static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3518 int *uaddr_len, int peer)
3519{
3520 struct net_device *dev;
3521 struct sock *sk = sock->sk;
3522 struct packet_sock *po = pkt_sk(sk);
3523 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3524
3525 if (peer)
3526 return -EOPNOTSUPP;
3527
3528 sll->sll_family = AF_PACKET;
3529 sll->sll_ifindex = po->ifindex;
3530 sll->sll_protocol = po->num;
3531 sll->sll_pkttype = 0;
3532 rcu_read_lock();
3533 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3534 if (dev) {
3535 sll->sll_hatype = dev->type;
3536 sll->sll_halen = dev->addr_len;
3537 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3538 } else {
3539 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3540 sll->sll_halen = 0;
3541 }
3542 rcu_read_unlock();
3543 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3544
3545 return 0;
3546}
3547
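/* Apply one packet_mclist entry to a device: add or remove a multicast or
 * unicast address, or adjust the promiscuity/allmulti count, with @what
 * passed as the delta (> 0 to add, < 0 to remove).
 */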
3548static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3549 int what)
3550{
3551 switch (i->type) {
3552 case PACKET_MR_MULTICAST:
3553 if (i->alen != dev->addr_len)
3554 return -EINVAL;
3555 if (what > 0)
3556 return dev_mc_add(dev, i->addr);
3557 else
3558 return dev_mc_del(dev, i->addr);
3559 break;
3560 case PACKET_MR_PROMISC:
3561 return dev_set_promiscuity(dev, what);
3562 case PACKET_MR_ALLMULTI:
3563 return dev_set_allmulti(dev, what);
3564 case PACKET_MR_UNICAST:
3565 if (i->alen != dev->addr_len)
3566 return -EINVAL;
3567 if (what > 0)
3568 return dev_uc_add(dev, i->addr);
3569 else
3570 return dev_uc_del(dev, i->addr);
3571 break;
3572 default:
3573 break;
3574 }
3575 return 0;
3576}
3577
3578static void packet_dev_mclist_delete(struct net_device *dev,
3579 struct packet_mclist **mlp)
3580{
3581 struct packet_mclist *ml;
3582
3583 while ((ml = *mlp) != NULL) {
3584 if (ml->ifindex == dev->ifindex) {
3585 packet_dev_mc(dev, ml, -1);
3586 *mlp = ml->next;
3587 kfree(ml);
3588 } else
3589 mlp = &ml->next;
3590 }
3591}
3592
3593static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3594{
3595 struct packet_sock *po = pkt_sk(sk);
3596 struct packet_mclist *ml, *i;
3597 struct net_device *dev;
3598 int err;
3599
3600 rtnl_lock();
3601
3602 err = -ENODEV;
3603 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3604 if (!dev)
3605 goto done;
3606
3607 err = -EINVAL;
3608 if (mreq->mr_alen > dev->addr_len)
3609 goto done;
3610
3611 err = -ENOBUFS;
3612 i = kmalloc(sizeof(*i), GFP_KERNEL);
3613 if (i == NULL)
3614 goto done;
3615
3616 err = 0;
3617 for (ml = po->mclist; ml; ml = ml->next) {
3618 if (ml->ifindex == mreq->mr_ifindex &&
3619 ml->type == mreq->mr_type &&
3620 ml->alen == mreq->mr_alen &&
3621 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3622 ml->count++;
3623 /* Free the new element ... */
3624 kfree(i);
3625 goto done;
3626 }
3627 }
3628
3629 i->type = mreq->mr_type;
3630 i->ifindex = mreq->mr_ifindex;
3631 i->alen = mreq->mr_alen;
3632 memcpy(i->addr, mreq->mr_address, i->alen);
3633 memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
3634 i->count = 1;
3635 i->next = po->mclist;
3636 po->mclist = i;
3637 err = packet_dev_mc(dev, i, 1);
3638 if (err) {
3639 po->mclist = i->next;
3640 kfree(i);
3641 }
3642
3643done:
3644 rtnl_unlock();
3645 return err;
3646}
3647
3648static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3649{
3650 struct packet_mclist *ml, **mlp;
3651
3652 rtnl_lock();
3653
3654 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3655 if (ml->ifindex == mreq->mr_ifindex &&
3656 ml->type == mreq->mr_type &&
3657 ml->alen == mreq->mr_alen &&
3658 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3659 if (--ml->count == 0) {
3660 struct net_device *dev;
3661 *mlp = ml->next;
3662 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3663 if (dev)
3664 packet_dev_mc(dev, ml, -1);
3665 kfree(ml);
3666 }
3667 break;
3668 }
3669 }
3670 rtnl_unlock();
3671 return 0;
3672}
3673
3674static void packet_flush_mclist(struct sock *sk)
3675{
3676 struct packet_sock *po = pkt_sk(sk);
3677 struct packet_mclist *ml;
3678
3679 if (!po->mclist)
3680 return;
3681
3682 rtnl_lock();
3683 while ((ml = po->mclist) != NULL) {
3684 struct net_device *dev;
3685
3686 po->mclist = ml->next;
3687 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3688 if (dev != NULL)
3689 packet_dev_mc(dev, ml, -1);
3690 kfree(ml);
3691 }
3692 rtnl_unlock();
3693}
3694
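/* Handle SOL_PACKET socket options.  For reference, a typical userspace
 * sequence for setting up an mmap'ed RX ring is roughly:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = { ... block/frame geometry ... };
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */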
3695static int
3696packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3697{
3698 struct sock *sk = sock->sk;
3699 struct packet_sock *po = pkt_sk(sk);
3700 int ret;
3701
3702 if (level != SOL_PACKET)
3703 return -ENOPROTOOPT;
3704
3705 switch (optname) {
3706 case PACKET_ADD_MEMBERSHIP:
3707 case PACKET_DROP_MEMBERSHIP:
3708 {
3709 struct packet_mreq_max mreq;
3710 int len = optlen;
3711 memset(&mreq, 0, sizeof(mreq));
3712 if (len < sizeof(struct packet_mreq))
3713 return -EINVAL;
3714 if (len > sizeof(mreq))
3715 len = sizeof(mreq);
3716 if (copy_from_user(&mreq, optval, len))
3717 return -EFAULT;
3718 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3719 return -EINVAL;
3720 if (optname == PACKET_ADD_MEMBERSHIP)
3721 ret = packet_mc_add(sk, &mreq);
3722 else
3723 ret = packet_mc_drop(sk, &mreq);
3724 return ret;
3725 }
3726
3727 case PACKET_RX_RING:
3728 case PACKET_TX_RING:
3729 {
3730 union tpacket_req_u req_u;
3731 int len;
3732
3733 lock_sock(sk);
3734 switch (po->tp_version) {
3735 case TPACKET_V1:
3736 case TPACKET_V2:
3737 len = sizeof(req_u.req);
3738 break;
3739 case TPACKET_V3:
3740 default:
3741 len = sizeof(req_u.req3);
3742 break;
3743 }
3744 if (optlen < len) {
3745 ret = -EINVAL;
3746 } else {
3747 if (copy_from_user(&req_u.req, optval, len))
3748 ret = -EFAULT;
3749 else
3750 ret = packet_set_ring(sk, &req_u, 0,
3751 optname == PACKET_TX_RING);
3752 }
3753 release_sock(sk);
3754 return ret;
3755 }
3756 case PACKET_COPY_THRESH:
3757 {
3758 int val;
3759
3760 if (optlen != sizeof(val))
3761 return -EINVAL;
3762 if (copy_from_user(&val, optval, sizeof(val)))
3763 return -EFAULT;
3764
3765 pkt_sk(sk)->copy_thresh = val;
3766 return 0;
3767 }
3768 case PACKET_VERSION:
3769 {
3770 int val;
3771
3772 if (optlen != sizeof(val))
3773 return -EINVAL;
3774 if (copy_from_user(&val, optval, sizeof(val)))
3775 return -EFAULT;
3776 switch (val) {
3777 case TPACKET_V1:
3778 case TPACKET_V2:
3779 case TPACKET_V3:
3780 break;
3781 default:
3782 return -EINVAL;
3783 }
3784 lock_sock(sk);
3785 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3786 ret = -EBUSY;
3787 } else {
3788 po->tp_version = val;
3789 ret = 0;
3790 }
3791 release_sock(sk);
3792 return ret;
3793 }
3794 case PACKET_RESERVE:
3795 {
3796 unsigned int val;
3797
3798 if (optlen != sizeof(val))
3799 return -EINVAL;
3800 if (copy_from_user(&val, optval, sizeof(val)))
3801 return -EFAULT;
3802 if (val > INT_MAX)
3803 return -EINVAL;
3804 lock_sock(sk);
3805 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3806 ret = -EBUSY;
3807 } else {
3808 po->tp_reserve = val;
3809 ret = 0;
3810 }
3811 release_sock(sk);
3812 return ret;
3813 }
3814 case PACKET_LOSS:
3815 {
3816 unsigned int val;
3817
3818 if (optlen != sizeof(val))
3819 return -EINVAL;
3820 if (copy_from_user(&val, optval, sizeof(val)))
3821 return -EFAULT;
3822
3823 lock_sock(sk);
3824 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3825 ret = -EBUSY;
3826 } else {
3827 po->tp_loss = !!val;
3828 ret = 0;
3829 }
3830 release_sock(sk);
3831 return ret;
3832 }
3833 case PACKET_AUXDATA:
3834 {
3835 int val;
3836
3837 if (optlen < sizeof(val))
3838 return -EINVAL;
3839 if (copy_from_user(&val, optval, sizeof(val)))
3840 return -EFAULT;
3841
3842 lock_sock(sk);
3843 po->auxdata = !!val;
3844 release_sock(sk);
3845 return 0;
3846 }
3847 case PACKET_ORIGDEV:
3848 {
3849 int val;
3850
3851 if (optlen < sizeof(val))
3852 return -EINVAL;
3853 if (copy_from_user(&val, optval, sizeof(val)))
3854 return -EFAULT;
3855
3856 lock_sock(sk);
3857 po->origdev = !!val;
3858 release_sock(sk);
3859 return 0;
3860 }
3861 case PACKET_VNET_HDR:
3862 {
3863 int val;
3864
3865 if (sock->type != SOCK_RAW)
3866 return -EINVAL;
3867 if (optlen < sizeof(val))
3868 return -EINVAL;
3869 if (copy_from_user(&val, optval, sizeof(val)))
3870 return -EFAULT;
3871
3872 lock_sock(sk);
3873 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3874 ret = -EBUSY;
3875 } else {
3876 po->has_vnet_hdr = !!val;
3877 ret = 0;
3878 }
3879 release_sock(sk);
3880 return ret;
3881 }
3882 case PACKET_TIMESTAMP:
3883 {
3884 int val;
3885
3886 if (optlen != sizeof(val))
3887 return -EINVAL;
3888 if (copy_from_user(&val, optval, sizeof(val)))
3889 return -EFAULT;
3890
3891 po->tp_tstamp = val;
3892 return 0;
3893 }
3894 case PACKET_FANOUT:
3895 {
3896 int val;
3897
3898 if (optlen != sizeof(val))
3899 return -EINVAL;
3900 if (copy_from_user(&val, optval, sizeof(val)))
3901 return -EFAULT;
3902
3903 return fanout_add(sk, val & 0xffff, val >> 16);
3904 }
3905 case PACKET_FANOUT_DATA:
3906 {
3907 if (!po->fanout)
3908 return -EINVAL;
3909
3910 return fanout_set_data(po, optval, optlen);
3911 }
3912 case PACKET_TX_HAS_OFF:
3913 {
3914 unsigned int val;
3915
3916 if (optlen != sizeof(val))
3917 return -EINVAL;
3918 if (copy_from_user(&val, optval, sizeof(val)))
3919 return -EFAULT;
3920
3921 lock_sock(sk);
3922 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
3923 ret = -EBUSY;
3924 } else {
3925 po->tp_tx_has_off = !!val;
3926 ret = 0;
3927 }
3928 release_sock(sk);
3929		return ret;
3930 }
3931 case PACKET_QDISC_BYPASS:
3932 {
3933 int val;
3934
3935 if (optlen != sizeof(val))
3936 return -EINVAL;
3937 if (copy_from_user(&val, optval, sizeof(val)))
3938 return -EFAULT;
3939
3940 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3941 return 0;
3942 }
3943 default:
3944 return -ENOPROTOOPT;
3945 }
3946}
3947
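/* Read back SOL_PACKET options.  Note that PACKET_STATISTICS is
 * clear-on-read: the counters are zeroed after being copied out.
 */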
3948static int packet_getsockopt(struct socket *sock, int level, int optname,
3949 char __user *optval, int __user *optlen)
3950{
3951 int len;
3952 int val, lv = sizeof(val);
3953 struct sock *sk = sock->sk;
3954 struct packet_sock *po = pkt_sk(sk);
3955 void *data = &val;
3956 union tpacket_stats_u st;
3957 struct tpacket_rollover_stats rstats;
3958
3959 if (level != SOL_PACKET)
3960 return -ENOPROTOOPT;
3961
3962 if (get_user(len, optlen))
3963 return -EFAULT;
3964
3965 if (len < 0)
3966 return -EINVAL;
3967
3968 switch (optname) {
3969 case PACKET_STATISTICS:
3970 spin_lock_bh(&sk->sk_receive_queue.lock);
3971 memcpy(&st, &po->stats, sizeof(st));
3972 memset(&po->stats, 0, sizeof(po->stats));
3973 spin_unlock_bh(&sk->sk_receive_queue.lock);
3974
3975 if (po->tp_version == TPACKET_V3) {
3976 lv = sizeof(struct tpacket_stats_v3);
3977 st.stats3.tp_packets += st.stats3.tp_drops;
3978 data = &st.stats3;
3979 } else {
3980 lv = sizeof(struct tpacket_stats);
3981 st.stats1.tp_packets += st.stats1.tp_drops;
3982 data = &st.stats1;
3983 }
3984
3985 break;
3986 case PACKET_AUXDATA:
3987 val = po->auxdata;
3988 break;
3989 case PACKET_ORIGDEV:
3990 val = po->origdev;
3991 break;
3992 case PACKET_VNET_HDR:
3993 val = po->has_vnet_hdr;
3994 break;
3995 case PACKET_VERSION:
3996 val = po->tp_version;
3997 break;
3998 case PACKET_HDRLEN:
3999 if (len > sizeof(int))
4000 len = sizeof(int);
4001 if (len < sizeof(int))
4002 return -EINVAL;
4003 if (copy_from_user(&val, optval, len))
4004 return -EFAULT;
4005 switch (val) {
4006 case TPACKET_V1:
4007 val = sizeof(struct tpacket_hdr);
4008 break;
4009 case TPACKET_V2:
4010 val = sizeof(struct tpacket2_hdr);
4011 break;
4012 case TPACKET_V3:
4013 val = sizeof(struct tpacket3_hdr);
4014 break;
4015 default:
4016 return -EINVAL;
4017 }
4018 break;
4019 case PACKET_RESERVE:
4020 val = po->tp_reserve;
4021 break;
4022 case PACKET_LOSS:
4023 val = po->tp_loss;
4024 break;
4025 case PACKET_TIMESTAMP:
4026 val = po->tp_tstamp;
4027 break;
4028 case PACKET_FANOUT:
4029 val = (po->fanout ?
4030 ((u32)po->fanout->id |
4031 ((u32)po->fanout->type << 16) |
4032 ((u32)po->fanout->flags << 24)) :
4033 0);
4034 break;
4035 case PACKET_ROLLOVER_STATS:
4036 if (!po->rollover)
4037 return -EINVAL;
4038 rstats.tp_all = atomic_long_read(&po->rollover->num);
4039 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
4040 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
4041 data = &rstats;
4042 lv = sizeof(rstats);
4043 break;
4044 case PACKET_TX_HAS_OFF:
4045 val = po->tp_tx_has_off;
4046 break;
4047 case PACKET_QDISC_BYPASS:
4048 val = packet_use_direct_xmit(po);
4049 break;
4050 default:
4051 return -ENOPROTOOPT;
4052 }
4053
4054 if (len > lv)
4055 len = lv;
4056 if (put_user(len, optlen))
4057 return -EFAULT;
4058 if (copy_to_user(optval, data, len))
4059 return -EFAULT;
4060 return 0;
4061}
4062
4063
4064#ifdef CONFIG_COMPAT
4065static int compat_packet_setsockopt(struct socket *sock, int level, int optname,
4066 char __user *optval, unsigned int optlen)
4067{
4068 struct packet_sock *po = pkt_sk(sock->sk);
4069
4070 if (level != SOL_PACKET)
4071 return -ENOPROTOOPT;
4072
4073 if (optname == PACKET_FANOUT_DATA &&
4074 po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) {
4075 optval = (char __user *)get_compat_bpf_fprog(optval);
4076 if (!optval)
4077 return -EFAULT;
4078 optlen = sizeof(struct sock_fprog);
4079 }
4080
4081 return packet_setsockopt(sock, level, optname, optval, optlen);
4082}
4083#endif
4084
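/* Netdevice notifier: on NETDEV_UNREGISTER/NETDEV_DOWN unregister the
 * protocol hook of any socket bound to that device (the socket then sees
 * ENETDOWN); on NETDEV_UP re-register it if the socket still has a
 * protocol number bound.
 */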
4085static int packet_notifier(struct notifier_block *this,
4086 unsigned long msg, void *ptr)
4087{
4088 struct sock *sk;
4089 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4090 struct net *net = dev_net(dev);
4091
4092 rcu_read_lock();
4093 sk_for_each_rcu(sk, &net->packet.sklist) {
4094 struct packet_sock *po = pkt_sk(sk);
4095
4096 switch (msg) {
4097 case NETDEV_UNREGISTER:
4098 if (po->mclist)
4099 packet_dev_mclist_delete(dev, &po->mclist);
4100 /* fallthrough */
4101
4102 case NETDEV_DOWN:
4103 if (dev->ifindex == po->ifindex) {
4104 spin_lock(&po->bind_lock);
4105 if (po->running) {
4106 __unregister_prot_hook(sk, false);
4107 sk->sk_err = ENETDOWN;
4108 if (!sock_flag(sk, SOCK_DEAD))
4109 sk->sk_error_report(sk);
4110 }
4111 if (msg == NETDEV_UNREGISTER) {
4112 packet_cached_dev_reset(po);
4113 po->ifindex = -1;
4114 if (po->prot_hook.dev)
4115 dev_put(po->prot_hook.dev);
4116 po->prot_hook.dev = NULL;
4117 }
4118 spin_unlock(&po->bind_lock);
4119 }
4120 break;
4121 case NETDEV_UP:
4122 if (dev->ifindex == po->ifindex) {
4123 spin_lock(&po->bind_lock);
4124 if (po->num)
4125 register_prot_hook(sk);
4126 spin_unlock(&po->bind_lock);
4127 }
4128 break;
4129 }
4130 }
4131 rcu_read_unlock();
4132 return NOTIFY_DONE;
4133}
4134
4135
4136static int packet_ioctl(struct socket *sock, unsigned int cmd,
4137 unsigned long arg)
4138{
4139 struct sock *sk = sock->sk;
4140
4141 switch (cmd) {
4142 case SIOCOUTQ:
4143 {
4144 int amount = sk_wmem_alloc_get(sk);
4145
4146 return put_user(amount, (int __user *)arg);
4147 }
4148 case SIOCINQ:
4149 {
4150 struct sk_buff *skb;
4151 int amount = 0;
4152
4153 spin_lock_bh(&sk->sk_receive_queue.lock);
4154 skb = skb_peek(&sk->sk_receive_queue);
4155 if (skb)
4156 amount = skb->len;
4157 spin_unlock_bh(&sk->sk_receive_queue.lock);
4158 return put_user(amount, (int __user *)arg);
4159 }
4160 case SIOCGSTAMP:
4161 return sock_get_timestamp(sk, (struct timeval __user *)arg);
4162 case SIOCGSTAMPNS:
4163 return sock_get_timestampns(sk, (struct timespec __user *)arg);
4164
4165#ifdef CONFIG_INET
4166 case SIOCADDRT:
4167 case SIOCDELRT:
4168 case SIOCDARP:
4169 case SIOCGARP:
4170 case SIOCSARP:
4171 case SIOCGIFADDR:
4172 case SIOCSIFADDR:
4173 case SIOCGIFBRDADDR:
4174 case SIOCSIFBRDADDR:
4175 case SIOCGIFNETMASK:
4176 case SIOCSIFNETMASK:
4177 case SIOCGIFDSTADDR:
4178 case SIOCSIFDSTADDR:
4179 case SIOCSIFFLAGS:
4180 return inet_dgram_ops.ioctl(sock, cmd, arg);
4181#endif
4182
4183 default:
4184 return -ENOIOCTLCMD;
4185 }
4186 return 0;
4187}
4188
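/* poll() for packet sockets: on top of the usual datagram readiness,
 * report POLLIN when the most recently filled RX ring frame has been
 * handed to userspace, and POLLOUT when a TX ring frame is available
 * for filling.
 */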
4189static unsigned int packet_poll(struct file *file, struct socket *sock,
4190 poll_table *wait)
4191{
4192 struct sock *sk = sock->sk;
4193 struct packet_sock *po = pkt_sk(sk);
4194 unsigned int mask = datagram_poll(file, sock, wait);
4195
4196 spin_lock_bh(&sk->sk_receive_queue.lock);
4197 if (po->rx_ring.pg_vec) {
4198 if (!packet_previous_rx_frame(po, &po->rx_ring,
4199 TP_STATUS_KERNEL))
4200 mask |= POLLIN | POLLRDNORM;
4201 }
4202 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
4203 po->pressure = 0;
4204 spin_unlock_bh(&sk->sk_receive_queue.lock);
4205 spin_lock_bh(&sk->sk_write_queue.lock);
4206 if (po->tx_ring.pg_vec) {
4207 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
4208 mask |= POLLOUT | POLLWRNORM;
4209 }
4210 spin_unlock_bh(&sk->sk_write_queue.lock);
4211 return mask;
4212}
4213
4214
4215/* Dirty? Well, I still have not found a better way to account
4216 * for user mmaps.
4217 */
4218
static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,
};

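/*
 * The vm_operations above only maintain po->mapped, a count of live user
 * mappings of the ring buffers.  packet_set_ring() below refuses to resize
 * or free a ring while this count is non-zero, so a ring cannot be torn
 * down underneath an active mmap().
 */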
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
			unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i].buffer)) {
			if (is_vmalloc_addr(pg_vec[i].buffer))
				vfree(pg_vec[i].buffer);
			else
				free_pages((unsigned long)pg_vec[i].buffer,
					   order);
			pg_vec[i].buffer = NULL;
		}
	}
	kfree(pg_vec);
}

static char *alloc_one_pg_vec_page(unsigned long order)
{
	char *buffer;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* __get_free_pages failed, fall back to vmalloc */
	buffer = vzalloc((1 << order) * PAGE_SIZE);
	if (buffer)
		return buffer;

	/* vmalloc failed, let's dig into swap here */
	gfp_flags &= ~__GFP_NORETRY;
	buffer = (char *) __get_free_pages(gfp_flags, order);
	if (buffer)
		return buffer;

	/* complete and utter failure */
	return NULL;
}

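/*
 * Note on the allocation strategy above: each ring block is first requested
 * as physically contiguous high-order pages with __GFP_NORETRY/__GFP_NOWARN
 * so a fragmented system fails fast, then as vmalloc()ed memory, and only
 * then as high-order pages again with reclaim allowed.  free_pg_vec() uses
 * is_vmalloc_addr() to pick the matching free routine for each block.
 */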
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	struct pgv *pg_vec;
	int i;

	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i].buffer))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
		int closing, int tx_ring)
{
	struct pgv *pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long *rx_owner_map = NULL;
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err = -EINVAL;
	/* Alias added to keep code churn minimal */
	struct tpacket_req *req = &req_u->req;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (packet_read_pending(rb))
			goto out;
	}

	if (req->tp_block_nr) {
		unsigned int min_frame_size;

		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		case TPACKET_V3:
			po->tp_hdrlen = TPACKET3_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
			goto out;
		min_frame_size = po->tp_hdrlen + po->tp_reserve;
		if (po->tp_version >= TPACKET_V3 &&
		    req->tp_block_size <
		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
			goto out;
		if (unlikely(req->tp_frame_size < min_frame_size))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(rb->frames_per_block == 0))
			goto out;
		if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
		switch (po->tp_version) {
		case TPACKET_V3:
			/* Block transmit is not supported yet */
			if (!tx_ring) {
				init_prb_bdqc(po, rb, pg_vec, req_u);
			} else {
				struct tpacket_req3 *req3 = &req_u->req3;

				if (req3->tp_retire_blk_tov ||
				    req3->tp_sizeof_priv ||
				    req3->tp_feature_req_word) {
					err = -EINVAL;
					goto out_free_pg_vec;
				}
			}
			break;
		default:
			if (!tx_ring) {
				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
				if (!rx_owner_map)
					goto out_free_pg_vec;
			}
			break;
		}
	}
	/* Done */
	else {
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}


	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		po->num = 0;
		__unregister_prot_hook(sk, false);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
		spin_lock_bh(&rb_queue->lock);
		swap(rb->pg_vec, pg_vec);
		if (po->tp_version <= TPACKET_V2)
			swap(rb->rx_owner_map, rx_owner_map);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		swap(rb->pg_vec_order, order);
		swap(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running) {
		po->num = num;
		register_prot_hook(sk);
	}
	spin_unlock(&po->bind_lock);
	if (pg_vec && (po->tp_version > TPACKET_V2)) {
		/* Because we don't support block-based V3 on tx-ring */
		if (!tx_ring)
			prb_shutdown_retire_blk_timer(po, rb_queue);
	}

out_free_pg_vec:
	bitmap_free(rx_owner_map);
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}

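/*
 * Usage sketch (user space, illustrative only): configuring a TPACKET_V3
 * receive ring that satisfies the sanity checks in packet_set_ring().
 * tp_block_size must be a positive multiple of the page size,
 * tp_frame_size a multiple of TPACKET_ALIGNMENT and at least
 * TPACKET3_HDRLEN plus any PACKET_RESERVE, and tp_frame_nr must equal
 * (tp_block_size / tp_frame_size) * tp_block_nr.  The numeric values are
 * arbitrary examples; tp_sizeof_priv and tp_feature_req_word are left zero.
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size		= 1 << 22,		(4 MiB, page aligned)
 *		.tp_block_nr		= 64,
 *		.tp_frame_size		= 2048,			(multiple of TPACKET_ALIGNMENT)
 *		.tp_frame_nr		= (1 << 22) / 2048 * 64,
 *		.tp_retire_blk_tov	= 60,			(block retire timeout, ms)
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */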
static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page;
			void *kaddr = rb->pg_vec[i].buffer;
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
				page = pgv_to_page(kaddr);
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
				kaddr += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}

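/*
 * Usage sketch (user space, illustrative only): the mapping must start at
 * file offset 0 and cover exactly the combined size of every configured
 * ring; the RX ring pages come first, immediately followed by the TX ring
 * pages.  "req_rx" and "req_tx" are placeholders for the tpacket_req
 * structures passed to PACKET_RX_RING and PACKET_TX_RING earlier.
 *
 *	size_t rx_size = (size_t)req_rx.tp_block_size * req_rx.tp_block_nr;
 *	size_t tx_size = (size_t)req_tx.tp_block_size * req_tx.tp_block_nr;
 *
 *	void *ring = mmap(NULL, rx_size + tx_size, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */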
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_packet_setsockopt,
#endif
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

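/*
 * Two ops tables: packet_ops_spkt backs the legacy SOCK_PACKET sockets and
 * deliberately stubs out mmap and the packet socket options, while
 * packet_ops backs SOCK_RAW/SOCK_DGRAM PF_PACKET sockets and wires up the
 * ring-buffer mmap, poll and {get,set}sockopt paths defined above.
 */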
static const struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct net *net = seq_file_net(seq);

	rcu_read_lock();
	return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
	else {
		struct sock *s = sk_entry(v);
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   refcount_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
			   sock_i_ino(s));
	}

	return 0;
}

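/*
 * Each line of /proc/net/packet therefore describes one packet socket: the
 * socket address, its reference count, socket type, bound protocol (hex,
 * host byte order), bound interface index, whether the protocol hook is
 * currently registered, the receive-queue memory, owning UID and inode
 * number.
 */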
static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int __net_init packet_net_init(struct net *net)
{
	mutex_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void __net_exit packet_net_exit(struct net *net)
{
	remove_proc_entry("packet", net->proc_net);
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

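/*
 * The pernet operations give every network namespace its own socket list
 * and its own /proc/net/packet entry; packet_net_init() runs for each new
 * namespace and packet_net_exit() cleans up when the namespace goes away.
 */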

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc;

	rc = proto_register(&packet_proto, 0);
	if (rc)
		goto out;
	rc = sock_register(&packet_family_ops);
	if (rc)
		goto out_proto;
	rc = register_pernet_subsys(&packet_net_ops);
	if (rc)
		goto out_sock;
	rc = register_netdevice_notifier(&packet_netdev_notifier);
	if (rc)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&packet_net_ops);
out_sock:
	sock_unregister(PF_PACKET);
out_proto:
	proto_unregister(&packet_proto);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);