Blame - marvell/linux/kernel/bpf/devmap.c - T108

blob: 08ff40e3921c05096d4256ab651c077e0a82d017 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
				3	*/
				4
				5	/* Devmap's primary use is as a backend map for XDP BPF helper call
				6	* bpf_redirect_map(). Because XDP is mostly concerned with performance we
				7	* spent some effort to ensure the datapath with redirect maps does not use
				8	* any locking. This is a quick note on the details.
				9	*
				10	* We have three possible paths to get into the devmap control plane bpf
				11	* syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
				12	* will invoke an update, delete, or lookup operation. To ensure updates and
				13	* deletes appear atomic from the datapath side xchg() is used to modify the
				14	* netdev_map array. Then because the datapath does a lookup into the netdev_map
				15	* array (read-only) from an RCU critical section we use call_rcu() to wait for
				16	* an rcu grace period before free'ing the old data structures. This ensures the
				17	* datapath always has a valid copy. However, the datapath does a "flush"
				18	* operation that pushes any pending packets in the driver outside the RCU
				19	* critical section. Each bpf_dtab_netdev tracks these pending operations using
				20	* a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
				21	* this list is empty, indicating outstanding flush operations have completed.
				22	*
				23	* BPF syscalls may race with BPF program calls on any of the update, delete
				24	* or lookup operations. As noted above the xchg() operation also keeps the
				25	* netdev_map consistent in this case. From the devmap side BPF programs
				26	* calling into these operations are the same as multiple user space threads
				27	* making system calls.
				28	*
				29	* Finally, any of the above may race with a netdev_unregister notifier. The
				30	* unregister notifier must search for net devices in the map structure that
				31	* contain a reference to the net device and remove them. This is a two step
				32	* process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
				33	* check to see if the ifindex is the same as the net_device being removed.
				34	* When removing the dev a cmpxchg() is used to ensure the correct dev is
				35	* removed, in the case of a concurrent update or delete operation it is
				36	* possible that the initially referenced dev is no longer in the map. As the
				37	* notifier hook walks the map we know that new dev references can not be
				38	* added by the user because core infrastructure ensures dev_get_by_index()
				39	* calls will fail at this point.
				40	*
				41	* The devmap_hash type is a map type which interprets keys as ifindexes and
				42	* indexes these using a hashmap. This allows maps that use ifindex as key to be
				43	* densely packed instead of having holes in the lookup array for unused
				44	* ifindexes. The setup and packet enqueue/send code is shared between the two
				45	* types of devmap; only the lookup and insertion is different.
				46	*/
				47	#include <linux/bpf.h>
				48	#include <net/xdp.h>
				49	#include <linux/filter.h>
				50	#include <trace/events/xdp.h>
				51
				52	#define DEV_CREATE_FLAG_MASK \
				53	(BPF_F_NUMA_NODE \| BPF_F_RDONLY \| BPF_F_WRONLY)
				54
				55	#define DEV_MAP_BULK_SIZE 16
				56	struct bpf_dtab_netdev;
				57
				58	struct xdp_bulk_queue {
				59	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
				60	struct list_head flush_node;
				61	struct net_device *dev_rx;
				62	struct bpf_dtab_netdev *obj;
				63	unsigned int count;
				64	};
				65
				66	struct bpf_dtab_netdev {
				67	struct net_device dev; / must be first member, due to tracepoint */
				68	struct hlist_node index_hlist;
				69	struct bpf_dtab *dtab;
				70	struct xdp_bulk_queue __percpu *bulkq;
				71	struct rcu_head rcu;
				72	unsigned int idx; /* keep track of map index for tracepoint */
				73	};
				74
				75	struct bpf_dtab {
				76	struct bpf_map map;
				77	struct bpf_dtab_netdev *netdev_map; / DEVMAP type only */
				78	struct list_head __percpu *flush_list;
				79	struct list_head list;
				80
				81	/* these are only used for DEVMAP_HASH type maps */
				82	struct hlist_head *dev_index_head;
				83	spinlock_t index_lock;
				84	unsigned int items;
				85	u32 n_buckets;
				86	};
				87
				88	static DEFINE_SPINLOCK(dev_map_lock);
				89	static LIST_HEAD(dev_map_list);
				90
				91	static struct hlist_head *dev_map_create_hash(unsigned int entries,
				92	int numa_node)
				93	{
				94	int i;
				95	struct hlist_head *hash;
				96
				97	hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
				98	if (hash != NULL)
				99	for (i = 0; i < entries; i++)
				100	INIT_HLIST_HEAD(&hash[i]);
				101
				102	return hash;
				103	}
				104
				105	static inline struct hlist_head dev_map_index_hash(struct bpf_dtab dtab,
				106	int idx)
				107	{
				108	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
				109	}
				110
				111	static int dev_map_init_map(struct bpf_dtab dtab, union bpf_attr attr)
				112	{
				113	int err, cpu;
				114	u64 cost;
				115
				116	/* check sanity of attributes */
				117	if (attr->max_entries == 0 \|\| attr->key_size != 4 \|\|
				118	attr->value_size != 4 \|\| attr->map_flags & ~DEV_CREATE_FLAG_MASK)
				119	return -EINVAL;
				120
				121	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
				122	* verifier prevents writes from the BPF side
				123	*/
				124	attr->map_flags \|= BPF_F_RDONLY_PROG;
				125
				126
				127	bpf_map_init_from_attr(&dtab->map, attr);
				128
				129	/* make sure page count doesn't overflow */
				130	cost = (u64) sizeof(struct list_head) * num_possible_cpus();
				131
				132	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				133	/* hash table size must be power of 2; roundup_pow_of_two() can
				134	* overflow into UB on 32-bit arches, so check that first
				135	*/
				136	if (dtab->map.max_entries > 1UL << 31)
				137	return -EINVAL;
				138
				139	dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
				140	cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
				141	} else {
				142	cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
				143	}
				144
				145	/* if map size is larger than memlock limit, reject it */
				146	err = bpf_map_charge_init(&dtab->map.memory, cost);
				147	if (err)
				148	return -EINVAL;
				149
				150	dtab->flush_list = alloc_percpu(struct list_head);
				151	if (!dtab->flush_list)
				152	goto free_charge;
				153
				154	for_each_possible_cpu(cpu)
				155	INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
				156
				157	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				158	dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
				159	dtab->map.numa_node);
				160	if (!dtab->dev_index_head)
				161	goto free_percpu;
				162
				163	spin_lock_init(&dtab->index_lock);
				164	} else {
				165	dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
				166	sizeof(struct bpf_dtab_netdev *),
				167	dtab->map.numa_node);
				168	if (!dtab->netdev_map)
				169	goto free_percpu;
				170	}
				171
				172	return 0;
				173
				174	free_percpu:
				175	free_percpu(dtab->flush_list);
				176	free_charge:
				177	bpf_map_charge_finish(&dtab->map.memory);
				178	return -ENOMEM;
				179	}
				180
				181	static struct bpf_map dev_map_alloc(union bpf_attr attr)
				182	{
				183	struct bpf_dtab *dtab;
				184	int err;
				185
				186	if (!capable(CAP_NET_ADMIN))
				187	return ERR_PTR(-EPERM);
				188
				189	dtab = kzalloc(sizeof(*dtab), GFP_USER);
				190	if (!dtab)
				191	return ERR_PTR(-ENOMEM);
				192
				193	err = dev_map_init_map(dtab, attr);
				194	if (err) {
				195	kfree(dtab);
				196	return ERR_PTR(err);
				197	}
				198
				199	spin_lock(&dev_map_lock);
				200	list_add_tail_rcu(&dtab->list, &dev_map_list);
				201	spin_unlock(&dev_map_lock);
				202
				203	return &dtab->map;
				204	}
				205
				206	static void dev_map_free(struct bpf_map *map)
				207	{
				208	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				209	u32 i;
				210
				211	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
				212	* so the programs (can be more than one that used this map) were
				213	* disconnected from events. The following synchronize_rcu() guarantees
				214	* both rcu read critical sections complete and waits for
				215	* preempt-disable regions (NAPI being the relevant context here) so we
				216	* are certain there will be no further reads against the netdev_map and
				217	* all flush operations are complete. Flush operations can only be done
				218	* from NAPI context for this reason.
				219	*/
				220
				221	spin_lock(&dev_map_lock);
				222	list_del_rcu(&dtab->list);
				223	spin_unlock(&dev_map_lock);
				224
				225	bpf_clear_redirect_map(map);
				226	synchronize_rcu();
				227
				228	/* Make sure prior __dev_map_entry_free() have completed. */
				229	rcu_barrier();
				230
				231	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				232	for (i = 0; i < dtab->n_buckets; i++) {
				233	struct bpf_dtab_netdev *dev;
				234	struct hlist_head *head;
				235	struct hlist_node *next;
				236
				237	head = dev_map_index_hash(dtab, i);
				238
				239	hlist_for_each_entry_safe(dev, next, head, index_hlist) {
				240	hlist_del_rcu(&dev->index_hlist);
				241	free_percpu(dev->bulkq);
				242	dev_put(dev->dev);
				243	kfree(dev);
				244	}
				245	}
				246
				247	bpf_map_area_free(dtab->dev_index_head);
				248	} else {
				249	for (i = 0; i < dtab->map.max_entries; i++) {
				250	struct bpf_dtab_netdev *dev;
				251
				252	dev = dtab->netdev_map[i];
				253	if (!dev)
				254	continue;
				255
				256	free_percpu(dev->bulkq);
				257	dev_put(dev->dev);
				258	kfree(dev);
				259	}
				260
				261	bpf_map_area_free(dtab->netdev_map);
				262	}
				263
				264	free_percpu(dtab->flush_list);
				265	kfree(dtab);
				266	}
				267
				268	static int dev_map_get_next_key(struct bpf_map map, void key, void *next_key)
				269	{
				270	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				271	u32 index = key ? (u32 )key : U32_MAX;
				272	u32 *next = next_key;
				273
				274	if (index >= dtab->map.max_entries) {
				275	*next = 0;
				276	return 0;
				277	}
				278
				279	if (index == dtab->map.max_entries - 1)
				280	return -ENOENT;
				281	*next = index + 1;
				282	return 0;
				283	}
				284
				285	struct bpf_dtab_netdev __dev_map_hash_lookup_elem(struct bpf_map map, u32 key)
				286	{
				287	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				288	struct hlist_head *head = dev_map_index_hash(dtab, key);
				289	struct bpf_dtab_netdev *dev;
				290
				291	hlist_for_each_entry_rcu(dev, head, index_hlist,
				292	lockdep_is_held(&dtab->index_lock))
				293	if (dev->idx == key)
				294	return dev;
				295
				296	return NULL;
				297	}
				298
				299	static int dev_map_hash_get_next_key(struct bpf_map map, void key,
				300	void *next_key)
				301	{
				302	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				303	u32 idx, *next = next_key;
				304	struct bpf_dtab_netdev dev, next_dev;
				305	struct hlist_head *head;
				306	int i = 0;
				307
				308	if (!key)
				309	goto find_first;
				310
				311	idx = (u32 )key;
				312
				313	dev = __dev_map_hash_lookup_elem(map, idx);
				314	if (!dev)
				315	goto find_first;
				316
				317	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
				318	struct bpf_dtab_netdev, index_hlist);
				319
				320	if (next_dev) {
				321	*next = next_dev->idx;
				322	return 0;
				323	}
				324
				325	i = idx & (dtab->n_buckets - 1);
				326	i++;
				327
				328	find_first:
				329	for (; i < dtab->n_buckets; i++) {
				330	head = dev_map_index_hash(dtab, i);
				331
				332	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
				333	struct bpf_dtab_netdev,
				334	index_hlist);
				335	if (next_dev) {
				336	*next = next_dev->idx;
				337	return 0;
				338	}
				339	}
				340
				341	return -ENOENT;
				342	}
				343
				344	static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags)
				345	{
				346	struct bpf_dtab_netdev *obj = bq->obj;
				347	struct net_device *dev = obj->dev;
				348	int sent = 0, drops = 0, err = 0;
				349	int i;
				350
				351	if (unlikely(!bq->count))
				352	return 0;
				353
				354	for (i = 0; i < bq->count; i++) {
				355	struct xdp_frame *xdpf = bq->q[i];
				356
				357	prefetch(xdpf);
				358	}
				359
				360	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
				361	if (sent < 0) {
				362	err = sent;
				363	sent = 0;
				364	goto error;
				365	}
				366	drops = bq->count - sent;
				367	out:
				368	bq->count = 0;
				369
				370	trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
				371	sent, drops, bq->dev_rx, dev, err);
				372	bq->dev_rx = NULL;
				373	__list_del_clearprev(&bq->flush_node);
				374	return 0;
				375	error:
				376	/* If ndo_xdp_xmit fails with an errno, no frames have been
				377	* xmit'ed and it's our responsibility to them free all.
				378	*/
				379	for (i = 0; i < bq->count; i++) {
				380	struct xdp_frame *xdpf = bq->q[i];
				381
				382	xdp_return_frame_rx_napi(xdpf);
				383	drops++;
				384	}
				385	goto out;
				386	}
				387
				388	/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
				389	* from the driver before returning from its napi->poll() routine. The poll()
				390	* routine is called either from busy_poll context or net_rx_action signaled
				391	* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
				392	* net device can be torn down. On devmap tear down we ensure the flush list
				393	* is empty before completing to ensure all flush operations have completed.
				394	*/
				395	void __dev_map_flush(struct bpf_map *map)
				396	{
				397	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				398	struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
				399	struct xdp_bulk_queue bq, tmp;
				400
				401	rcu_read_lock();
				402	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
				403	bq_xmit_all(bq, XDP_XMIT_FLUSH);
				404	rcu_read_unlock();
				405	}
				406
				407	/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
				408	* update happens in parallel here a dev_put wont happen until after reading the
				409	* ifindex.
				410	*/
				411	struct bpf_dtab_netdev __dev_map_lookup_elem(struct bpf_map map, u32 key)
				412	{
				413	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				414	struct bpf_dtab_netdev *obj;
				415
				416	if (key >= map->max_entries)
				417	return NULL;
				418
				419	obj = READ_ONCE(dtab->netdev_map[key]);
				420	return obj;
				421	}
				422
				423	/* Runs under RCU-read-side, plus in softirq under NAPI protection.
				424	* Thus, safe percpu variable access.
				425	*/
				426	static int bq_enqueue(struct bpf_dtab_netdev obj, struct xdp_frame xdpf,
				427	struct net_device *dev_rx)
				428
				429	{
				430	struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
				431	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
				432
				433	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
				434	bq_xmit_all(bq, 0);
				435
				436	/* Ingress dev_rx will be the same for all xdp_frame's in
				437	* bulk_queue, because bq stored per-CPU and must be flushed
				438	* from net_device drivers NAPI func end.
				439	*/
				440	if (!bq->dev_rx)
				441	bq->dev_rx = dev_rx;
				442
				443	bq->q[bq->count++] = xdpf;
				444
				445	if (!bq->flush_node.prev)
				446	list_add(&bq->flush_node, flush_list);
				447
				448	return 0;
				449	}
				450
				451	int dev_map_enqueue(struct bpf_dtab_netdev dst, struct xdp_buff xdp,
				452	struct net_device *dev_rx)
				453	{
				454	struct net_device *dev = dst->dev;
				455	struct xdp_frame *xdpf;
				456	int err;
				457
				458	if (!dev->netdev_ops->ndo_xdp_xmit)
				459	return -EOPNOTSUPP;
				460
				461	err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
				462	if (unlikely(err))
				463	return err;
				464
				465	xdpf = convert_to_xdp_frame(xdp);
				466	if (unlikely(!xdpf))
				467	return -EOVERFLOW;
				468
				469	return bq_enqueue(dst, xdpf, dev_rx);
				470	}
				471
				472	int dev_map_generic_redirect(struct bpf_dtab_netdev dst, struct sk_buff skb,
				473	struct bpf_prog *xdp_prog)
				474	{
				475	int err;
				476
				477	err = xdp_ok_fwd_dev(dst->dev, skb->len);
				478	if (unlikely(err))
				479	return err;
				480	skb->dev = dst->dev;
				481	generic_xdp_tx(skb, xdp_prog);
				482
				483	return 0;
				484	}
				485
				486	static void dev_map_lookup_elem(struct bpf_map map, void *key)
				487	{
				488	struct bpf_dtab_netdev obj = __dev_map_lookup_elem(map, (u32 *)key);
				489	struct net_device *dev = obj ? obj->dev : NULL;
				490
				491	return dev ? &dev->ifindex : NULL;
				492	}
				493
				494	static void dev_map_hash_lookup_elem(struct bpf_map map, void *key)
				495	{
				496	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
				497	(u32 )key);
				498	struct net_device *dev = obj ? obj->dev : NULL;
				499
				500	return dev ? &dev->ifindex : NULL;
				501	}
				502
				503	static void __dev_map_entry_free(struct rcu_head *rcu)
				504	{
				505	struct bpf_dtab_netdev *dev;
				506
				507	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
				508	free_percpu(dev->bulkq);
				509	dev_put(dev->dev);
				510	kfree(dev);
				511	}
				512
				513	static int dev_map_delete_elem(struct bpf_map map, void key)
				514	{
				515	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				516	struct bpf_dtab_netdev *old_dev;
				517	u32 k = (u32 )key;
				518
				519	if (k >= map->max_entries)
				520	return -EINVAL;
				521
				522	/* Use call_rcu() here to ensure any rcu critical sections have
				523	* completed as well as any flush operations because call_rcu
				524	* will wait for preempt-disable region to complete, NAPI in this
				525	* context. And additionally, the driver tear down ensures all
				526	* soft irqs are complete before removing the net device in the
				527	* case of dev_put equals zero.
				528	*/
				529	old_dev = xchg(&dtab->netdev_map[k], NULL);
				530	if (old_dev)
				531	call_rcu(&old_dev->rcu, __dev_map_entry_free);
				532	return 0;
				533	}
				534
				535	static int dev_map_hash_delete_elem(struct bpf_map map, void key)
				536	{
				537	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				538	struct bpf_dtab_netdev *old_dev;
				539	u32 k = (u32 )key;
				540	unsigned long flags;
				541	int ret = -ENOENT;
				542
				543	spin_lock_irqsave(&dtab->index_lock, flags);
				544
				545	old_dev = __dev_map_hash_lookup_elem(map, k);
				546	if (old_dev) {
				547	dtab->items--;
				548	hlist_del_init_rcu(&old_dev->index_hlist);
				549	call_rcu(&old_dev->rcu, __dev_map_entry_free);
				550	ret = 0;
				551	}
				552	spin_unlock_irqrestore(&dtab->index_lock, flags);
				553
				554	return ret;
				555	}
				556
				557	static struct bpf_dtab_netdev __dev_map_alloc_node(struct net net,
				558	struct bpf_dtab *dtab,
				559	u32 ifindex,
				560	unsigned int idx)
				561	{
				562	gfp_t gfp = GFP_ATOMIC \| __GFP_NOWARN;
				563	struct bpf_dtab_netdev *dev;
				564	struct xdp_bulk_queue *bq;
				565	int cpu;
				566
				567	dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
				568	if (!dev)
				569	return ERR_PTR(-ENOMEM);
				570
				571	dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
				572	sizeof(void *), gfp);
				573	if (!dev->bulkq) {
				574	kfree(dev);
				575	return ERR_PTR(-ENOMEM);
				576	}
				577
				578	for_each_possible_cpu(cpu) {
				579	bq = per_cpu_ptr(dev->bulkq, cpu);
				580	bq->obj = dev;
				581	}
				582
				583	dev->dev = dev_get_by_index(net, ifindex);
				584	if (!dev->dev) {
				585	free_percpu(dev->bulkq);
				586	kfree(dev);
				587	return ERR_PTR(-EINVAL);
				588	}
				589
				590	dev->idx = idx;
				591	dev->dtab = dtab;
				592
				593	return dev;
				594	}
				595
				596	static int __dev_map_update_elem(struct net net, struct bpf_map map,
				597	void key, void value, u64 map_flags)
				598	{
				599	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				600	struct bpf_dtab_netdev dev, old_dev;
				601	u32 ifindex = (u32 )value;
				602	u32 i = (u32 )key;
				603
				604	if (unlikely(map_flags > BPF_EXIST))
				605	return -EINVAL;
				606	if (unlikely(i >= dtab->map.max_entries))
				607	return -E2BIG;
				608	if (unlikely(map_flags == BPF_NOEXIST))
				609	return -EEXIST;
				610
				611	if (!ifindex) {
				612	dev = NULL;
				613	} else {
				614	dev = __dev_map_alloc_node(net, dtab, ifindex, i);
				615	if (IS_ERR(dev))
				616	return PTR_ERR(dev);
				617	}
				618
				619	/* Use call_rcu() here to ensure rcu critical sections have completed
				620	* Remembering the driver side flush operation will happen before the
				621	* net device is removed.
				622	*/
				623	old_dev = xchg(&dtab->netdev_map[i], dev);
				624	if (old_dev)
				625	call_rcu(&old_dev->rcu, __dev_map_entry_free);
				626
				627	return 0;
				628	}
				629
				630	static int dev_map_update_elem(struct bpf_map map, void key, void *value,
				631	u64 map_flags)
				632	{
				633	return __dev_map_update_elem(current->nsproxy->net_ns,
				634	map, key, value, map_flags);
				635	}
				636
				637	static int __dev_map_hash_update_elem(struct net net, struct bpf_map map,
				638	void key, void value, u64 map_flags)
				639	{
				640	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
				641	struct bpf_dtab_netdev dev, old_dev;
				642	u32 ifindex = (u32 )value;
				643	u32 idx = (u32 )key;
				644	unsigned long flags;
				645	int err = -EEXIST;
				646
				647	if (unlikely(map_flags > BPF_EXIST \|\| !ifindex))
				648	return -EINVAL;
				649
				650	spin_lock_irqsave(&dtab->index_lock, flags);
				651
				652	old_dev = __dev_map_hash_lookup_elem(map, idx);
				653	if (old_dev && (map_flags & BPF_NOEXIST))
				654	goto out_err;
				655
				656	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
				657	if (IS_ERR(dev)) {
				658	err = PTR_ERR(dev);
				659	goto out_err;
				660	}
				661
				662	if (old_dev) {
				663	hlist_del_rcu(&old_dev->index_hlist);
				664	} else {
				665	if (dtab->items >= dtab->map.max_entries) {
				666	spin_unlock_irqrestore(&dtab->index_lock, flags);
				667	call_rcu(&dev->rcu, __dev_map_entry_free);
				668	return -E2BIG;
				669	}
				670	dtab->items++;
				671	}
				672
				673	hlist_add_head_rcu(&dev->index_hlist,
				674	dev_map_index_hash(dtab, idx));
				675	spin_unlock_irqrestore(&dtab->index_lock, flags);
				676
				677	if (old_dev)
				678	call_rcu(&old_dev->rcu, __dev_map_entry_free);
				679
				680	return 0;
				681
				682	out_err:
				683	spin_unlock_irqrestore(&dtab->index_lock, flags);
				684	return err;
				685	}
				686
				687	static int dev_map_hash_update_elem(struct bpf_map map, void key, void *value,
				688	u64 map_flags)
				689	{
				690	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
				691	map, key, value, map_flags);
				692	}
				693
				694	const struct bpf_map_ops dev_map_ops = {
				695	.map_alloc = dev_map_alloc,
				696	.map_free = dev_map_free,
				697	.map_get_next_key = dev_map_get_next_key,
				698	.map_lookup_elem = dev_map_lookup_elem,
				699	.map_update_elem = dev_map_update_elem,
				700	.map_delete_elem = dev_map_delete_elem,
				701	.map_check_btf = map_check_no_btf,
				702	};
				703
				704	const struct bpf_map_ops dev_map_hash_ops = {
				705	.map_alloc = dev_map_alloc,
				706	.map_free = dev_map_free,
				707	.map_get_next_key = dev_map_hash_get_next_key,
				708	.map_lookup_elem = dev_map_hash_lookup_elem,
				709	.map_update_elem = dev_map_hash_update_elem,
				710	.map_delete_elem = dev_map_hash_delete_elem,
				711	.map_check_btf = map_check_no_btf,
				712	};
				713
				714	static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
				715	struct net_device *netdev)
				716	{
				717	unsigned long flags;
				718	u32 i;
				719
				720	spin_lock_irqsave(&dtab->index_lock, flags);
				721	for (i = 0; i < dtab->n_buckets; i++) {
				722	struct bpf_dtab_netdev *dev;
				723	struct hlist_head *head;
				724	struct hlist_node *next;
				725
				726	head = dev_map_index_hash(dtab, i);
				727
				728	hlist_for_each_entry_safe(dev, next, head, index_hlist) {
				729	if (netdev != dev->dev)
				730	continue;
				731
				732	dtab->items--;
				733	hlist_del_rcu(&dev->index_hlist);
				734	call_rcu(&dev->rcu, __dev_map_entry_free);
				735	}
				736	}
				737	spin_unlock_irqrestore(&dtab->index_lock, flags);
				738	}
				739
				740	static int dev_map_notification(struct notifier_block *notifier,
				741	ulong event, void *ptr)
				742	{
				743	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
				744	struct bpf_dtab *dtab;
				745	int i;
				746
				747	switch (event) {
				748	case NETDEV_UNREGISTER:
				749	/* This rcu_read_lock/unlock pair is needed because
				750	* dev_map_list is an RCU list AND to ensure a delete
				751	* operation does not free a netdev_map entry while we
				752	* are comparing it against the netdev being unregistered.
				753	*/
				754	rcu_read_lock();
				755	list_for_each_entry_rcu(dtab, &dev_map_list, list) {
				756	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				757	dev_map_hash_remove_netdev(dtab, netdev);
				758	continue;
				759	}
				760
				761	for (i = 0; i < dtab->map.max_entries; i++) {
				762	struct bpf_dtab_netdev dev, odev;
				763
				764	dev = READ_ONCE(dtab->netdev_map[i]);
				765	if (!dev \|\| netdev != dev->dev)
				766	continue;
				767	odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
				768	if (dev == odev)
				769	call_rcu(&dev->rcu,
				770	__dev_map_entry_free);
				771	}
				772	}
				773	rcu_read_unlock();
				774	break;
				775	default:
				776	break;
				777	}
				778	return NOTIFY_OK;
				779	}
				780
				781	static struct notifier_block dev_map_notifier = {
				782	.notifier_call = dev_map_notification,
				783	};
				784
				785	static int __init dev_map_init(void)
				786	{
				787	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
				788	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
				789	offsetof(struct _bpf_dtab_netdev, dev));
				790	register_netdevice_notifier(&dev_map_notifier);
				791	return 0;
				792	}
				793
				794	subsys_initcall(dev_map_init);