Blame - marvell/linux/fs/eventfd.c - T108

blob: e144094c831dfde527db04bc9cc508499f216eca [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* fs/eventfd.c
				4	*
				5	* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
				6	*
				7	*/
				8
				9	#include <linux/file.h>
				10	#include <linux/poll.h>
				11	#include <linux/init.h>
				12	#include <linux/fs.h>
				13	#include <linux/sched/signal.h>
				14	#include <linux/kernel.h>
				15	#include <linux/slab.h>
				16	#include <linux/list.h>
				17	#include <linux/spinlock.h>
				18	#include <linux/anon_inodes.h>
				19	#include <linux/syscalls.h>
				20	#include <linux/export.h>
				21	#include <linux/kref.h>
				22	#include <linux/eventfd.h>
				23	#include <linux/proc_fs.h>
				24	#include <linux/seq_file.h>
				25	#include <linux/idr.h>
				26
				27	DEFINE_PER_CPU(int, eventfd_wake_count);
				28
				29	static DEFINE_IDA(eventfd_ida);
				30
				31	struct eventfd_ctx {
				32	struct kref kref;
				33	wait_queue_head_t wqh;
				34	/*
				35	* Every time that a write(2) is performed on an eventfd, the
				36	* value of the __u64 being written is added to "count" and a
				37	* wakeup is performed on "wqh". A read(2) will return the "count"
				38	* value to userspace, and will reset "count" to zero. The kernel
				39	* side eventfd_signal() also, adds to the "count" counter and
				40	* issue a wakeup.
				41	*/
				42	__u64 count;
				43	unsigned int flags;
				44	int id;
				45	};
				46
				47	/**
				48	* eventfd_signal - Adds @n to the eventfd counter.
				49	* @ctx: [in] Pointer to the eventfd context.
				50	* @n: [in] Value of the counter to be added to the eventfd internal counter.
				51	* The value cannot be negative.
				52	*
				53	* This function is supposed to be called by the kernel in paths that do not
				54	* allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
				55	* value, and we signal this as overflow condition by returning a EPOLLERR
				56	* to poll(2).
				57	*
				58	* Returns the amount by which the counter was incremented. This will be less
				59	* than @n if the counter has overflowed.
				60	*/
				61	__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
				62	{
				63	unsigned long flags;
				64
				65	/*
				66	* Deadlock or stack overflow issues can happen if we recurse here
				67	* through waitqueue wakeup handlers. If the caller users potentially
				68	* nested waitqueues with custom wakeup handlers, then it should
				69	* check eventfd_signal_count() before calling this function. If
				70	* it returns true, the eventfd_signal() call should be deferred to a
				71	* safe context.
				72	*/
				73	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
				74	return 0;
				75
				76	spin_lock_irqsave(&ctx->wqh.lock, flags);
				77	this_cpu_inc(eventfd_wake_count);
				78	if (ULLONG_MAX - ctx->count < n)
				79	n = ULLONG_MAX - ctx->count;
				80	ctx->count += n;
				81	if (waitqueue_active(&ctx->wqh))
				82	wake_up_locked_poll(&ctx->wqh, EPOLLIN);
				83	this_cpu_dec(eventfd_wake_count);
				84	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
				85
				86	return n;
				87	}
				88	EXPORT_SYMBOL_GPL(eventfd_signal);
				89
				90	static void eventfd_free_ctx(struct eventfd_ctx *ctx)
				91	{
				92	if (ctx->id >= 0)
				93	ida_simple_remove(&eventfd_ida, ctx->id);
				94	kfree(ctx);
				95	}
				96
				97	static void eventfd_free(struct kref *kref)
				98	{
				99	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
				100
				101	eventfd_free_ctx(ctx);
				102	}
				103
				104	/**
				105	* eventfd_ctx_put - Releases a reference to the internal eventfd context.
				106	* @ctx: [in] Pointer to eventfd context.
				107	*
				108	* The eventfd context reference must have been previously acquired either
				109	* with eventfd_ctx_fdget() or eventfd_ctx_fileget().
				110	*/
				111	void eventfd_ctx_put(struct eventfd_ctx *ctx)
				112	{
				113	kref_put(&ctx->kref, eventfd_free);
				114	}
				115	EXPORT_SYMBOL_GPL(eventfd_ctx_put);
				116
				117	static int eventfd_release(struct inode inode, struct file file)
				118	{
				119	struct eventfd_ctx *ctx = file->private_data;
				120
				121	wake_up_poll(&ctx->wqh, EPOLLHUP);
				122	eventfd_ctx_put(ctx);
				123	return 0;
				124	}
				125
				126	static __poll_t eventfd_poll(struct file file, poll_table wait)
				127	{
				128	struct eventfd_ctx *ctx = file->private_data;
				129	__poll_t events = 0;
				130	u64 count;
				131
				132	poll_wait(file, &ctx->wqh, wait);
				133
				134	/*
				135	* All writes to ctx->count occur within ctx->wqh.lock. This read
				136	* can be done outside ctx->wqh.lock because we know that poll_wait
				137	* takes that lock (through add_wait_queue) if our caller will sleep.
				138	*
				139	* The read _can_ therefore seep into add_wait_queue's critical
				140	* section, but cannot move above it! add_wait_queue's spin_lock acts
				141	* as an acquire barrier and ensures that the read be ordered properly
				142	* against the writes. The following CAN happen and is safe:
				143	*
				144	* poll write
				145	* ----------------- ------------
				146	* lock ctx->wqh.lock (in poll_wait)
				147	* count = ctx->count
				148	* __add_wait_queue
				149	* unlock ctx->wqh.lock
				150	* lock ctx->qwh.lock
				151	* ctx->count += n
				152	* if (waitqueue_active)
				153	* wake_up_locked_poll
				154	* unlock ctx->qwh.lock
				155	* eventfd_poll returns 0
				156	*
				157	* but the following, which would miss a wakeup, cannot happen:
				158	*
				159	* poll write
				160	* ----------------- ------------
				161	* count = ctx->count (INVALID!)
				162	* lock ctx->qwh.lock
				163	* ctx->count += n
				164	* waitqueue_active is false
				165	* no wake_up_locked_poll!
				166	* unlock ctx->qwh.lock
				167	* lock ctx->wqh.lock (in poll_wait)
				168	* __add_wait_queue
				169	* unlock ctx->wqh.lock
				170	* eventfd_poll returns 0
				171	*/
				172	count = READ_ONCE(ctx->count);
				173
				174	if (count > 0)
				175	events \|= EPOLLIN;
				176	if (count == ULLONG_MAX)
				177	events \|= EPOLLERR;
				178	if (ULLONG_MAX - 1 > count)
				179	events \|= EPOLLOUT;
				180
				181	return events;
				182	}
				183
				184	void eventfd_ctx_do_read(struct eventfd_ctx ctx, __u64 cnt)
				185	{
				186	lockdep_assert_held(&ctx->wqh.lock);
				187
				188	*cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
				189	ctx->count -= *cnt;
				190	}
				191	EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
				192
				193	/**
				194	* eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
				195	* @ctx: [in] Pointer to eventfd context.
				196	* @wait: [in] Wait queue to be removed.
				197	* @cnt: [out] Pointer to the 64-bit counter value.
				198	*
				199	* Returns %0 if successful, or the following error codes:
				200	*
				201	* -EAGAIN : The operation would have blocked.
				202	*
				203	* This is used to atomically remove a wait queue entry from the eventfd wait
				204	* queue head, and read/reset the counter value.
				205	*/
				206	int eventfd_ctx_remove_wait_queue(struct eventfd_ctx ctx, wait_queue_entry_t wait,
				207	__u64 *cnt)
				208	{
				209	unsigned long flags;
				210
				211	spin_lock_irqsave(&ctx->wqh.lock, flags);
				212	eventfd_ctx_do_read(ctx, cnt);
				213	__remove_wait_queue(&ctx->wqh, wait);
				214	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
				215	wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
				216	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
				217
				218	return *cnt != 0 ? 0 : -EAGAIN;
				219	}
				220	EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
				221
				222	static ssize_t eventfd_read(struct file file, char __user buf, size_t count,
				223	loff_t *ppos)
				224	{
				225	struct eventfd_ctx *ctx = file->private_data;
				226	ssize_t res;
				227	__u64 ucnt = 0;
				228	DECLARE_WAITQUEUE(wait, current);
				229
				230	if (count < sizeof(ucnt))
				231	return -EINVAL;
				232
				233	spin_lock_irq(&ctx->wqh.lock);
				234	res = -EAGAIN;
				235	if (ctx->count > 0)
				236	res = sizeof(ucnt);
				237	else if (!(file->f_flags & O_NONBLOCK)) {
				238	__add_wait_queue(&ctx->wqh, &wait);
				239	for (;;) {
				240	set_current_state(TASK_INTERRUPTIBLE);
				241	if (ctx->count > 0) {
				242	res = sizeof(ucnt);
				243	break;
				244	}
				245	if (signal_pending(current)) {
				246	res = -ERESTARTSYS;
				247	break;
				248	}
				249	spin_unlock_irq(&ctx->wqh.lock);
				250	schedule();
				251	spin_lock_irq(&ctx->wqh.lock);
				252	}
				253	__remove_wait_queue(&ctx->wqh, &wait);
				254	__set_current_state(TASK_RUNNING);
				255	}
				256	if (likely(res > 0)) {
				257	eventfd_ctx_do_read(ctx, &ucnt);
				258	if (waitqueue_active(&ctx->wqh))
				259	wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
				260	}
				261	spin_unlock_irq(&ctx->wqh.lock);
				262
				263	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
				264	return -EFAULT;
				265
				266	return res;
				267	}
				268
				269	static ssize_t eventfd_write(struct file file, const char __user buf, size_t count,
				270	loff_t *ppos)
				271	{
				272	struct eventfd_ctx *ctx = file->private_data;
				273	ssize_t res;
				274	__u64 ucnt;
				275	DECLARE_WAITQUEUE(wait, current);
				276
				277	if (count < sizeof(ucnt))
				278	return -EINVAL;
				279	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
				280	return -EFAULT;
				281	if (ucnt == ULLONG_MAX)
				282	return -EINVAL;
				283	spin_lock_irq(&ctx->wqh.lock);
				284	res = -EAGAIN;
				285	if (ULLONG_MAX - ctx->count > ucnt)
				286	res = sizeof(ucnt);
				287	else if (!(file->f_flags & O_NONBLOCK)) {
				288	__add_wait_queue(&ctx->wqh, &wait);
				289	for (res = 0;;) {
				290	set_current_state(TASK_INTERRUPTIBLE);
				291	if (ULLONG_MAX - ctx->count > ucnt) {
				292	res = sizeof(ucnt);
				293	break;
				294	}
				295	if (signal_pending(current)) {
				296	res = -ERESTARTSYS;
				297	break;
				298	}
				299	spin_unlock_irq(&ctx->wqh.lock);
				300	schedule();
				301	spin_lock_irq(&ctx->wqh.lock);
				302	}
				303	__remove_wait_queue(&ctx->wqh, &wait);
				304	__set_current_state(TASK_RUNNING);
				305	}
				306	if (likely(res > 0)) {
				307	ctx->count += ucnt;
				308	if (waitqueue_active(&ctx->wqh))
				309	wake_up_locked_poll(&ctx->wqh, EPOLLIN);
				310	}
				311	spin_unlock_irq(&ctx->wqh.lock);
				312
				313	return res;
				314	}
				315
				#ifdef CONFIG_PROC_FS
				/*
				 * ->show_fdinfo handler: print the current counter (hexadecimal, read under
				 * the wait queue lock) and the context id to @m.
				 */
				static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
				{
					struct eventfd_ctx *ctx = f->private_data;

					spin_lock_irq(&ctx->wqh.lock);
					seq_printf(m, "eventfd-count: %16llx\n",
						   (unsigned long long)ctx->count);
					spin_unlock_irq(&ctx->wqh.lock);
					seq_printf(m, "eventfd-id: %d\n", ctx->id);
				}
				#endif
				328
				329	static const struct file_operations eventfd_fops = {
				330	#ifdef CONFIG_PROC_FS
				331	.show_fdinfo = eventfd_show_fdinfo,
				332	#endif
				333	.release = eventfd_release,
				334	.poll = eventfd_poll,
				335	.read = eventfd_read,
				336	.write = eventfd_write,
				337	.llseek = noop_llseek,
				338	};
				339
				340	/**
				341	* eventfd_fget - Acquire a reference of an eventfd file descriptor.
				342	* @fd: [in] Eventfd file descriptor.
				343	*
				344	* Returns a pointer to the eventfd file structure in case of success, or the
				345	* following error pointer:
				346	*
				347	* -EBADF : Invalid @fd file descriptor.
				348	* -EINVAL : The @fd file descriptor is not an eventfd file.
				349	*/
				350	struct file *eventfd_fget(int fd)
				351	{
				352	struct file *file;
				353
				354	file = fget(fd);
				355	if (!file)
				356	return ERR_PTR(-EBADF);
				357	if (file->f_op != &eventfd_fops) {
				358	fput(file);
				359	return ERR_PTR(-EINVAL);
				360	}
				361
				362	return file;
				363	}
				364	EXPORT_SYMBOL_GPL(eventfd_fget);
				365
				366	/**
				367	* eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
				368	* @fd: [in] Eventfd file descriptor.
				369	*
				370	* Returns a pointer to the internal eventfd context, otherwise the error
				371	* pointers returned by the following functions:
				372	*
				373	* eventfd_fget
				374	*/
				375	struct eventfd_ctx *eventfd_ctx_fdget(int fd)
				376	{
				377	struct eventfd_ctx *ctx;
				378	struct fd f = fdget(fd);
				379	if (!f.file)
				380	return ERR_PTR(-EBADF);
				381	ctx = eventfd_ctx_fileget(f.file);
				382	fdput(f);
				383	return ctx;
				384	}
				385	EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
				386
				387	/**
				388	* eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
				389	* @file: [in] Eventfd file pointer.
				390	*
				391	* Returns a pointer to the internal eventfd context, otherwise the error
				392	* pointer:
				393	*
				394	* -EINVAL : The @fd file descriptor is not an eventfd file.
				395	*/
				396	struct eventfd_ctx eventfd_ctx_fileget(struct file file)
				397	{
				398	struct eventfd_ctx *ctx;
				399
				400	if (file->f_op != &eventfd_fops)
				401	return ERR_PTR(-EINVAL);
				402
				403	ctx = file->private_data;
				404	kref_get(&ctx->kref);
				405	return ctx;
				406	}
				407	EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
				408
				409	static int do_eventfd(unsigned int count, int flags)
				410	{
				411	struct eventfd_ctx *ctx;
				412	int fd;
				413
				414	/* Check the EFD_* constants for consistency. */
				415	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
				416	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
				417
				418	if (flags & ~EFD_FLAGS_SET)
				419	return -EINVAL;
				420
				421	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
				422	if (!ctx)
				423	return -ENOMEM;
				424
				425	kref_init(&ctx->kref);
				426	init_waitqueue_head(&ctx->wqh);
				427	ctx->count = count;
				428	ctx->flags = flags;
				429	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
				430
				431	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
				432	O_RDWR \| (flags & EFD_SHARED_FCNTL_FLAGS));
				433	if (fd < 0)
				434	eventfd_free_ctx(ctx);
				435
				436	return fd;
				437	}
				438
				439	SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
				440	{
				441	return do_eventfd(count, flags);
				442	}
				443
				444	SYSCALL_DEFINE1(eventfd, unsigned int, count)
				445	{
				446	return do_eventfd(count, 0);
				447	}
				448