1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * fs/eventpoll.c (Efficient event retrieval implementation)
4 * Copyright (C) 2001,...,2009 Davide Libenzi
5 *
6 * Davide Libenzi <davidel@xmailserver.org>
7 */
8
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched/signal.h>
12#include <linux/fs.h>
13#include <linux/file.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/mm.h>
17#include <linux/slab.h>
18#include <linux/poll.h>
19#include <linux/string.h>
20#include <linux/list.h>
21#include <linux/hash.h>
22#include <linux/spinlock.h>
23#include <linux/syscalls.h>
24#include <linux/rbtree.h>
25#include <linux/wait.h>
26#include <linux/eventpoll.h>
27#include <linux/mount.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/anon_inodes.h>
31#include <linux/device.h>
32#include <linux/freezer.h>
33#include <linux/uaccess.h>
34#include <asm/io.h>
35#include <asm/mman.h>
36#include <linux/atomic.h>
37#include <linux/proc_fs.h>
38#include <linux/seq_file.h>
39#include <linux/compat.h>
40#include <linux/rculist.h>
41#include <net/busy_poll.h>
42
43/*
44 * LOCKING:
45 * There are three levels of locking required by epoll:
46 *
47 * 1) epmutex (mutex)
48 * 2) ep->mtx (mutex)
49 * 3) ep->lock (rwlock)
50 *
51 * The acquire order is the one listed above, from 1 to 3.
52 * We need a rwlock (ep->lock) because we manipulate objects
53 * from inside the poll callback, that might be triggered from
54 * a wake_up() that in turn might be called from IRQ context.
55 * So we can't sleep inside the poll callback and hence we need
56 * a spinlock. During the event transfer loop (from kernel to
57 * user space) we could end up sleeping due to a copy_to_user(), so
58 * we need a lock that will allow us to sleep. This lock is a
59 * mutex (ep->mtx). It is acquired during the event transfer loop,
60 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
61 * Then we also need a global mutex to serialize eventpoll_release_file()
62 * and ep_free().
63 * This mutex is acquired by ep_free() during the epoll file
64 * cleanup path and it is also acquired by eventpoll_release_file()
65 * if a file has been pushed inside an epoll set and it is then
66 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
67 * It is also acquired when inserting an epoll fd onto another epoll
68 * fd. We do this so that we walk the epoll tree and ensure that this
69 * insertion does not create a cycle of epoll file descriptors, which
70 * could lead to deadlock. We need a global mutex to prevent two
71 * simultaneous inserts (A into B and B into A) from racing and
72 * constructing a cycle without either insert observing that it is
73 * about to do so.
74 * It is necessary to acquire multiple "ep->mtx"es at once in the
75 * case when one epoll fd is added to another. In this case, we
76 * always acquire the locks in the order of nesting (i.e. after
77 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
78 * before e2->mtx). Since we disallow cycles of epoll file
79 * descriptors, this ensures that the mutexes are well-ordered. In
80 * order to communicate this nesting to lockdep, when walking a tree
81 * of epoll file descriptors, we use the current recursion depth as
82 * the lockdep subkey.
83 * It is possible to drop the "ep->mtx" and rely only on the global
84 * mutex "epmutex" (together with "ep->lock") and still have things work,
85 * but having "ep->mtx" makes the interface more scalable.
86 * Events that require holding "epmutex" are very rare, while for
87 * normal operations the epoll private "ep->mtx" will guarantee
88 * a better scalability.
89 */
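/*
 * Illustrative sketch (added for clarity, not part of the original
 * comment): with the nesting rule above, after a userspace sequence like
 *
 *	e1 = epoll_create1(0);
 *	e2 = epoll_create1(0);
 *	epoll_ctl(e1, EPOLL_CTL_ADD, e2, &event);
 *
 * any path that needs both mutexes (e.g. polling e1, which in turn polls
 * e2) takes e1->mtx before e2->mtx, passing the current recursion depth
 * to mutex_lock_nested() as the lockdep subclass.
 */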
90
91/* Epoll private bits inside the event mask */
92#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
93
94#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
95
96#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
97 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
98
99/* Maximum depth of nesting allowed inside epoll sets */
100#define EP_MAX_NESTS 4
101
102#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
103
104#define EP_UNACTIVE_PTR ((void *) -1L)
105
106#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
107
108struct epoll_filefd {
109 struct file *file;
110 int fd;
111} __packed;
112
113/*
114 * Structure used to track possible nested calls, to detect overly
115 * deep recursion and loop cycles.
116 */
117struct nested_call_node {
118 struct list_head llink;
119 void *cookie;
120 void *ctx;
121};
122
123/*
124 * This structure is used as a collector for nested calls, to check for
125 * maximum recursion depth and loop cycles.
126 */
127struct nested_calls {
128 struct list_head tasks_call_list;
129 spinlock_t lock;
130};
131
132/*
133 * Each file descriptor added to the eventpoll interface will
134 * have an entry of this type linked to the "rbr" RB tree.
135 * Avoid increasing the size of this struct, there can be many thousands
136 * of these on a server and we do not want this to take another cache line.
137 */
138struct epitem {
139 union {
140 /* RB tree node links this structure to the eventpoll RB tree */
141 struct rb_node rbn;
142 /* Used to free the struct epitem */
143 struct rcu_head rcu;
144 };
145
146 /* List header used to link this structure to the eventpoll ready list */
147 struct list_head rdllink;
148
149 /*
150 * Works together with "struct eventpoll"->ovflist in keeping the
151 * singly linked chain of items.
152 */
153 struct epitem *next;
154
155 /* The file descriptor information this item refers to */
156 struct epoll_filefd ffd;
157
158 /* Number of active wait queues attached to poll operations */
159 int nwait;
160
161 /* List containing poll wait queues */
162 struct list_head pwqlist;
163
164 /* The "container" of this item */
165 struct eventpoll *ep;
166
167 /* List header used to link this item to the "struct file" items list */
168 struct list_head fllink;
169
170 /* wakeup_source used when EPOLLWAKEUP is set */
171 struct wakeup_source __rcu *ws;
172
173 /* The structure that describes the events of interest and the source fd */
174 struct epoll_event event;
175};
176
177/*
178 * This structure is stored inside the "private_data" member of the file
179 * structure and represents the main data structure for the eventpoll
180 * interface.
181 */
182struct eventpoll {
183 /*
184 * This mutex is used to ensure that files are not removed
185 * while epoll is using them. This is held during the event
186 * collection loop, the file cleanup path, the epoll file exit
187 * code and the ctl operations.
188 */
189 struct mutex mtx;
190
191 /* Wait queue used by sys_epoll_wait() */
192 wait_queue_head_t wq;
193
194 /* Wait queue used by file->poll() */
195 wait_queue_head_t poll_wait;
196
197 /* List of ready file descriptors */
198 struct list_head rdllist;
199
200 /* Lock which protects rdllist and ovflist */
201 rwlock_t lock;
202
203 /* RB tree root used to store monitored fd structs */
204 struct rb_root_cached rbr;
205
206 /*
207 * This is a singly linked list that chains all the "struct epitem" that
208 * got events while we were transferring ready events to userspace,
209 * without holding ->lock.
210 */
211 struct epitem *ovflist;
212
213 /* wakeup_source used when ep_scan_ready_list is running */
214 struct wakeup_source *ws;
215
216 /* The user that created the eventpoll descriptor */
217 struct user_struct *user;
218
219 struct file *file;
220
221 /* used to optimize loop detection check */
222 u64 gen;
223
224#ifdef CONFIG_NET_RX_BUSY_POLL
225 /* used to track busy poll napi_id */
226 unsigned int napi_id;
227#endif
228};
229
230/* Wait structure used by the poll hooks */
231struct eppoll_entry {
232 /* List header used to link this structure to the "struct epitem" */
233 struct list_head llink;
234
235 /* The "base" pointer is set to the container "struct epitem" */
236 struct epitem *base;
237
238 /*
239 * Wait queue item that will be linked to the target file wait
240 * queue head.
241 */
242 wait_queue_entry_t wait;
243
244 /* The wait queue head to which the "wait" wait queue item is linked */
245 wait_queue_head_t *whead;
246};
247
248/* Wrapper struct used by poll queueing */
249struct ep_pqueue {
250 poll_table pt;
251 struct epitem *epi;
252};
253
254/* Used by the ep_send_events() function as callback private data */
255struct ep_send_events_data {
256 int maxevents;
257 struct epoll_event __user *events;
258 int res;
259};
260
261/*
262 * Configuration options available inside /proc/sys/fs/epoll/
263 */
264/* Maximum number of epoll watched descriptors, per user */
265static long max_user_watches __read_mostly;
266
267/*
268 * This mutex is used to serialize ep_free() and eventpoll_release_file().
269 */
270static DEFINE_MUTEX(epmutex);
271
272static u64 loop_check_gen = 0;
273
274/* Used to check for epoll file descriptor inclusion loops */
275static struct nested_calls poll_loop_ncalls;
276
277/* Slab cache used to allocate "struct epitem" */
278static struct kmem_cache *epi_cache __read_mostly;
279
280/* Slab cache used to allocate "struct eppoll_entry" */
281static struct kmem_cache *pwq_cache __read_mostly;
282
283/*
284 * List of files with newly added links, where we may need to limit the number
285 * of emanating paths. Protected by the epmutex.
286 */
287static LIST_HEAD(tfile_check_list);
288
289#ifdef CONFIG_SYSCTL
290
291#include <linux/sysctl.h>
292
293static long long_zero;
294static long long_max = LONG_MAX;
295
296struct ctl_table epoll_table[] = {
297 {
298 .procname = "max_user_watches",
299 .data = &max_user_watches,
300 .maxlen = sizeof(max_user_watches),
301 .mode = 0644,
302 .proc_handler = proc_doulongvec_minmax,
303 .extra1 = &long_zero,
304 .extra2 = &long_max,
305 },
306 { }
307};
308#endif /* CONFIG_SYSCTL */
309
310static const struct file_operations eventpoll_fops;
311
312static inline int is_file_epoll(struct file *f)
313{
314 return f->f_op == &eventpoll_fops;
315}
316
317/* Setup the structure that is used as key for the RB tree */
318static inline void ep_set_ffd(struct epoll_filefd *ffd,
319 struct file *file, int fd)
320{
321 ffd->file = file;
322 ffd->fd = fd;
323}
324
325/* Compare RB tree keys */
326static inline int ep_cmp_ffd(struct epoll_filefd *p1,
327 struct epoll_filefd *p2)
328{
329 return (p1->file > p2->file ? +1:
330 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
331}
332
333/* Tells us if the item is currently linked */
334static inline int ep_is_linked(struct epitem *epi)
335{
336 return !list_empty(&epi->rdllink);
337}
338
339static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
340{
341 return container_of(p, struct eppoll_entry, wait);
342}
343
344/* Get the "struct epitem" from a wait queue pointer */
345static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
346{
347 return container_of(p, struct eppoll_entry, wait)->base;
348}
349
350/* Get the "struct epitem" from an epoll queue wrapper */
351static inline struct epitem *ep_item_from_epqueue(poll_table *p)
352{
353 return container_of(p, struct ep_pqueue, pt)->epi;
354}
355
356/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
357static inline int ep_op_has_event(int op)
358{
359 return op != EPOLL_CTL_DEL;
360}
361
362/* Initialize the poll safe wake up structure */
363static void ep_nested_calls_init(struct nested_calls *ncalls)
364{
365 INIT_LIST_HEAD(&ncalls->tasks_call_list);
366 spin_lock_init(&ncalls->lock);
367}
368
369/**
370 * ep_events_available - Checks if ready events might be available.
371 *
372 * @ep: Pointer to the eventpoll context.
373 *
374 * Returns: a non-zero value if ready events are available,
375 * zero otherwise.
376 */
377static inline int ep_events_available(struct eventpoll *ep)
378{
379 return !list_empty_careful(&ep->rdllist) ||
380 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
381}
382
383#ifdef CONFIG_NET_RX_BUSY_POLL
384static bool ep_busy_loop_end(void *p, unsigned long start_time)
385{
386 struct eventpoll *ep = p;
387
388 return ep_events_available(ep) || busy_loop_timeout(start_time);
389}
390
391/*
392 * Busy poll if busy polling is globally enabled, a supporting socket has
393 * been found, and no events are ready. The busy loop returns once
394 * need_resched() is set or ep_events_available() reports pending events.
395 * We must do our busy polling with irqs enabled.
396 */
397static void ep_busy_loop(struct eventpoll *ep, int nonblock)
398{
399 unsigned int napi_id = READ_ONCE(ep->napi_id);
400
401 if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
402 napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
403}
404
405static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
406{
407 if (ep->napi_id)
408 ep->napi_id = 0;
409}
410
411/*
412 * Set epoll busy poll NAPI ID from sk.
413 */
414static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
415{
416 struct eventpoll *ep;
417 unsigned int napi_id;
418 struct socket *sock;
419 struct sock *sk;
420 int err;
421
422 if (!net_busy_loop_on())
423 return;
424
425 sock = sock_from_file(epi->ffd.file, &err);
426 if (!sock)
427 return;
428
429 sk = sock->sk;
430 if (!sk)
431 return;
432
433 napi_id = READ_ONCE(sk->sk_napi_id);
434 ep = epi->ep;
435
436 /*
437 * Non-NAPI IDs are rejected, and there is nothing to do if we
438 * already have this ID.
439 */
440 if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
441 return;
442
443 /* record NAPI ID for use in next busy poll */
444 ep->napi_id = napi_id;
445}
446
447#else
448
449static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
450{
451}
452
453static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
454{
455}
456
457static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
458{
459}
460
461#endif /* CONFIG_NET_RX_BUSY_POLL */
462
463/**
464 * ep_call_nested - Perform a bounded (possibly) nested call, by checking
465 * that the recursion limit is not exceeded, and that
466 * the same nested call (identified by the same cookie) is
467 * not re-entered.
468 *
469 * @ncalls: Pointer to the nested_calls structure to be used for this call.
470 * @nproc: Nested call core function pointer.
471 * @priv: Opaque data to be passed to the @nproc callback.
472 * @cookie: Cookie to be used to identify this nested call.
473 * @ctx: This instance context.
474 *
475 * Returns: the code returned by the @nproc callback, or -1 if
476 * the maximum recursion limit has been exceeded.
477 */
478static int ep_call_nested(struct nested_calls *ncalls,
479 int (*nproc)(void *, void *, int), void *priv,
480 void *cookie, void *ctx)
481{
482 int error, call_nests = 0;
483 unsigned long flags;
484 struct list_head *lsthead = &ncalls->tasks_call_list;
485 struct nested_call_node *tncur;
486 struct nested_call_node tnode;
487
488 spin_lock_irqsave(&ncalls->lock, flags);
489
490 /*
491 * Try to see if the current task is already inside this wakeup call.
492 * We use a list here, since the population inside this set is always
493 * very much limited.
494 */
495 list_for_each_entry(tncur, lsthead, llink) {
496 if (tncur->ctx == ctx &&
497 (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
498 /*
499 * Oops ... loop detected or maximum nesting level reached.
500 * We abort this wake by breaking the cycle itself.
501 */
502 error = -1;
503 goto out_unlock;
504 }
505 }
506
507 /* Add the current task and cookie to the list */
508 tnode.ctx = ctx;
509 tnode.cookie = cookie;
510 list_add(&tnode.llink, lsthead);
511
512 spin_unlock_irqrestore(&ncalls->lock, flags);
513
514 /* Call the nested function */
515 error = (*nproc)(priv, cookie, call_nests);
516
517 /* Remove the current task from the list */
518 spin_lock_irqsave(&ncalls->lock, flags);
519 list_del(&tnode.llink);
520out_unlock:
521 spin_unlock_irqrestore(&ncalls->lock, flags);
522
523 return error;
524}
525
526/*
527 * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
528 * the use of wait queues by epoll is done in a very controlled
529 * manner. Wake ups can nest inside each other, but are never done
530 * with the same locking. For example:
531 *
532 * dfd = socket(...);
533 * efd1 = epoll_create();
534 * efd2 = epoll_create();
535 * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
536 * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
537 *
538 * When a packet arrives to the device underneath "dfd", the net code will
539 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
540 * callback wakeup entry on that queue, and the wake_up() performed by the
541 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
542 * (efd1) notices that it may have some event ready, so it needs to wake up
543 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
544 * that ends up in another wake_up(), after having checked the
545 * recursion constraints; that is, no more than EP_MAX_NESTS nesting
546 * levels, to avoid stack blasting.
547 *
548 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
549 * this special case of epoll.
550 */
551#ifdef CONFIG_DEBUG_LOCK_ALLOC
552
553static struct nested_calls poll_safewake_ncalls;
554
555static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
556{
557 unsigned long flags;
558 wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
559
560 spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
561 wake_up_locked_poll(wqueue, EPOLLIN);
562 spin_unlock_irqrestore(&wqueue->lock, flags);
563
564 return 0;
565}
566
567static void ep_poll_safewake(wait_queue_head_t *wq)
568{
569 int this_cpu = get_cpu();
570
571 ep_call_nested(&poll_safewake_ncalls,
572 ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
573
574 put_cpu();
575}
576
577#else
578
579static void ep_poll_safewake(wait_queue_head_t *wq)
580{
581 wake_up_poll(wq, EPOLLIN);
582}
583
584#endif
585
586static void ep_remove_wait_queue(struct eppoll_entry *pwq)
587{
588 wait_queue_head_t *whead;
589
590 rcu_read_lock();
591 /*
592 * If it is cleared by POLLFREE, it should be rcu-safe.
593 * If we read NULL we need a barrier paired with
594 * smp_store_release() in ep_poll_callback(), otherwise
595 * we rely on whead->lock.
596 */
597 whead = smp_load_acquire(&pwq->whead);
598 if (whead)
599 remove_wait_queue(whead, &pwq->wait);
600 rcu_read_unlock();
601}
602
603/*
604 * This function unregisters poll callbacks from the associated file
605 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
606 * ep_free).
607 */
608static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
609{
610 struct list_head *lsthead = &epi->pwqlist;
611 struct eppoll_entry *pwq;
612
613 while (!list_empty(lsthead)) {
614 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
615
616 list_del(&pwq->llink);
617 ep_remove_wait_queue(pwq);
618 kmem_cache_free(pwq_cache, pwq);
619 }
620}
621
622/* call only when ep->mtx is held */
623static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
624{
625 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
626}
627
628/* call only when ep->mtx is held */
629static inline void ep_pm_stay_awake(struct epitem *epi)
630{
631 struct wakeup_source *ws = ep_wakeup_source(epi);
632
633 if (ws)
634 __pm_stay_awake(ws);
635}
636
637static inline bool ep_has_wakeup_source(struct epitem *epi)
638{
639 return rcu_access_pointer(epi->ws) ? true : false;
640}
641
642/* call when ep->mtx cannot be held (ep_poll_callback) */
643static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
644{
645 struct wakeup_source *ws;
646
647 rcu_read_lock();
648 ws = rcu_dereference(epi->ws);
649 if (ws)
650 __pm_stay_awake(ws);
651 rcu_read_unlock();
652}
653
654/**
655 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
656 * for the scan code to call f_op->poll(). Also allows for
657 * O(NumReady) performance.
658 *
659 * @ep: Pointer to the epoll private data structure.
660 * @sproc: Pointer to the scan callback.
661 * @priv: Private opaque data passed to the @sproc callback.
662 * @depth: The current depth of recursive f_op->poll calls.
663 * @ep_locked: caller already holds ep->mtx
664 *
665 * Returns: The same integer error code returned by the @sproc callback.
666 */
667static __poll_t ep_scan_ready_list(struct eventpoll *ep,
668 __poll_t (*sproc)(struct eventpoll *,
669 struct list_head *, void *),
670 void *priv, int depth, bool ep_locked)
671{
672 __poll_t res;
673 int pwake = 0;
674 struct epitem *epi, *nepi;
675 LIST_HEAD(txlist);
676
677 lockdep_assert_irqs_enabled();
678
679 /*
680 * We need to lock this because we could be hit by
681 * eventpoll_release_file() and epoll_ctl().
682 */
683
684 if (!ep_locked)
685 mutex_lock_nested(&ep->mtx, depth);
686
687 /*
688 * Steal the ready list, and re-init the original one to the
689 * empty list. Also, set ep->ovflist to NULL so that events
690 * happening while looping without locks are not lost. We cannot
691 * let the poll callback queue directly on ep->rdllist,
692 * because we want the "sproc" callback to be able to work on it
693 * in a lockless way.
694 */
695 write_lock_irq(&ep->lock);
696 list_splice_init(&ep->rdllist, &txlist);
697 WRITE_ONCE(ep->ovflist, NULL);
698 write_unlock_irq(&ep->lock);
699
700 /*
701 * Now call the callback function.
702 */
703 res = (*sproc)(ep, &txlist, priv);
704
705 write_lock_irq(&ep->lock);
706 /*
707 * During the time we spent inside the "sproc" callback, some
708 * other events might have been queued by the poll callback.
709 * We re-insert them inside the main ready-list here.
710 */
711 for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
712 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
713 /*
714 * We need to check if the item is already in the list.
715 * During the "sproc" callback execution time, items are
716 * queued into ->ovflist but the "txlist" might already
717 * contain them, and the list_splice() below takes care of them.
718 */
719 if (!ep_is_linked(epi)) {
720 /*
721 * ->ovflist is LIFO, so we have to reverse it in order
722 * to keep it in FIFO order.
723 */
724 list_add(&epi->rdllink, &ep->rdllist);
725 ep_pm_stay_awake(epi);
726 }
727 }
728 /*
729 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
730 * releasing the lock, events will be queued in the normal way inside
731 * ep->rdllist.
732 */
733 WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
734
735 /*
736 * Quickly re-inject items left on "txlist".
737 */
738 list_splice(&txlist, &ep->rdllist);
739 __pm_relax(ep->ws);
740
741 if (!list_empty(&ep->rdllist)) {
742 /*
743 * Wake up (if active) both the eventpoll wait list and
744 * the ->poll() wait list (delayed after we release the lock).
745 */
746 if (waitqueue_active(&ep->wq))
747 wake_up(&ep->wq);
748 if (waitqueue_active(&ep->poll_wait))
749 pwake++;
750 }
751 write_unlock_irq(&ep->lock);
752
753 if (!ep_locked)
754 mutex_unlock(&ep->mtx);
755
756 /* We have to call this outside the lock */
757 if (pwake)
758 ep_poll_safewake(&ep->poll_wait);
759
760 return res;
761}
762
763static void epi_rcu_free(struct rcu_head *head)
764{
765 struct epitem *epi = container_of(head, struct epitem, rcu);
766 kmem_cache_free(epi_cache, epi);
767}
768
769/*
770 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
771 * all the associated resources. Must be called with "mtx" held.
772 */
773static int ep_remove(struct eventpoll *ep, struct epitem *epi)
774{
775 struct file *file = epi->ffd.file;
776
777 lockdep_assert_irqs_enabled();
778
779 /*
780 * Removes poll wait queue hooks.
781 */
782 ep_unregister_pollwait(ep, epi);
783
784 /* Remove the current item from the list of epoll hooks */
785 spin_lock(&file->f_lock);
786 list_del_rcu(&epi->fllink);
787 spin_unlock(&file->f_lock);
788
789 rb_erase_cached(&epi->rbn, &ep->rbr);
790
791 write_lock_irq(&ep->lock);
792 if (ep_is_linked(epi))
793 list_del_init(&epi->rdllink);
794 write_unlock_irq(&ep->lock);
795
796 wakeup_source_unregister(ep_wakeup_source(epi));
797 /*
798 * At this point it is safe to free the eventpoll item. Use the union
799 * field epi->rcu, since we are trying to minimize the size of
800 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
801 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
802 * use of the rbn field.
803 */
804 call_rcu(&epi->rcu, epi_rcu_free);
805
806 atomic_long_dec(&ep->user->epoll_watches);
807
808 return 0;
809}
810
811static void ep_free(struct eventpoll *ep)
812{
813 struct rb_node *rbp;
814 struct epitem *epi;
815
816 /* We need to release all tasks waiting on this file */
817 if (waitqueue_active(&ep->poll_wait))
818 ep_poll_safewake(&ep->poll_wait);
819
820 /*
821 * We need to lock this because we could be hit by
822 * eventpoll_release_file() while we're freeing the "struct eventpoll".
823 * We do not need to hold "ep->mtx" here because the epoll file
824 * is on the way to be removed and no one has references to it
825 * anymore. The only hit might come from eventpoll_release_file() but
826 * holding "epmutex" is sufficient here.
827 */
828 mutex_lock(&epmutex);
829
830 /*
831 * Walks through the whole tree by unregistering poll callbacks.
832 */
833 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
834 epi = rb_entry(rbp, struct epitem, rbn);
835
836 ep_unregister_pollwait(ep, epi);
837 cond_resched();
838 }
839
840 /*
841 * Walks through the whole tree by freeing each "struct epitem". At this
842 * point we are sure no poll callbacks will be lingering around, and also by
843 * holding "epmutex" we can be sure that no file cleanup code will hit
844 * us during this operation. So we can avoid the lock on "ep->lock".
845 * We do not need to lock ep->mtx, either, we only do it to prevent
846 * a lockdep warning.
847 */
848 mutex_lock(&ep->mtx);
849 while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
850 epi = rb_entry(rbp, struct epitem, rbn);
851 ep_remove(ep, epi);
852 cond_resched();
853 }
854 mutex_unlock(&ep->mtx);
855
856 mutex_unlock(&epmutex);
857 mutex_destroy(&ep->mtx);
858 free_uid(ep->user);
859 wakeup_source_unregister(ep->ws);
860 kfree(ep);
861}
862
863static int ep_eventpoll_release(struct inode *inode, struct file *file)
864{
865 struct eventpoll *ep = file->private_data;
866
867 if (ep)
868 ep_free(ep);
869
870 return 0;
871}
872
873static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
874 void *priv);
875static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
876 poll_table *pt);
877
878/*
879 * Differs from ep_eventpoll_poll() in that internal callers already have
880 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
881 * is correctly annotated.
882 */
883static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
884 int depth)
885{
886 struct eventpoll *ep;
887 bool locked;
888
889 pt->_key = epi->event.events;
890 if (!is_file_epoll(epi->ffd.file))
891 return vfs_poll(epi->ffd.file, pt) & epi->event.events;
892
893 ep = epi->ffd.file->private_data;
894 poll_wait(epi->ffd.file, &ep->poll_wait, pt);
895 locked = pt && (pt->_qproc == ep_ptable_queue_proc);
896
897 return ep_scan_ready_list(epi->ffd.file->private_data,
898 ep_read_events_proc, &depth, depth,
899 locked) & epi->event.events;
900}
901
902static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
903 void *priv)
904{
905 struct epitem *epi, *tmp;
906 poll_table pt;
907 int depth = *(int *)priv;
908
909 init_poll_funcptr(&pt, NULL);
910 depth++;
911
912 list_for_each_entry_safe(epi, tmp, head, rdllink) {
913 if (ep_item_poll(epi, &pt, depth)) {
914 return EPOLLIN | EPOLLRDNORM;
915 } else {
916 /*
917 * Item has been dropped into the ready list by the poll
918 * callback, but it's not actually ready, as far as the
919 * caller-requested events go. We can remove it here.
920 */
921 __pm_relax(ep_wakeup_source(epi));
922 list_del_init(&epi->rdllink);
923 }
924 }
925
926 return 0;
927}
928
929static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
930{
931 struct eventpoll *ep = file->private_data;
932 int depth = 0;
933
934 /* Insert inside our poll wait queue */
935 poll_wait(file, &ep->poll_wait, wait);
936
937 /*
938 * Proceed to find out if wanted events are really available inside
939 * the ready list.
940 */
941 return ep_scan_ready_list(ep, ep_read_events_proc,
942 &depth, depth, false);
943}
944
945#ifdef CONFIG_PROC_FS
946static void ep_show_fdinfo(struct seq_file *m, struct file *f)
947{
948 struct eventpoll *ep = f->private_data;
949 struct rb_node *rbp;
950
951 mutex_lock(&ep->mtx);
952 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
953 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
954 struct inode *inode = file_inode(epi->ffd.file);
955
956 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
957 " pos:%lli ino:%lx sdev:%x\n",
958 epi->ffd.fd, epi->event.events,
959 (long long)epi->event.data,
960 (long long)epi->ffd.file->f_pos,
961 inode->i_ino, inode->i_sb->s_dev);
962 if (seq_has_overflowed(m))
963 break;
964 }
965 mutex_unlock(&ep->mtx);
966}
967#endif
968
969/* File callbacks that implement the eventpoll file behaviour */
970static const struct file_operations eventpoll_fops = {
971#ifdef CONFIG_PROC_FS
972 .show_fdinfo = ep_show_fdinfo,
973#endif
974 .release = ep_eventpoll_release,
975 .poll = ep_eventpoll_poll,
976 .llseek = noop_llseek,
977};
978
979/*
980 * This is called from eventpoll_release() to unlink files from the eventpoll
981 * interface. We need this facility to correctly clean up files that are
982 * closed without being removed from the eventpoll interface.
983 */
984void eventpoll_release_file(struct file *file)
985{
986 struct eventpoll *ep;
987 struct epitem *epi, *next;
988
989 /*
990 * We don't want to get "file->f_lock" because it is not
991 * necessary. It is not necessary because we're in the "struct file"
992 * cleanup path, and this means that no one is using this file anymore.
993 * So, for example, epoll_ctl() cannot hit here since if we reach this
994 * point, the file counter already went to zero and fget() would fail.
995 * The only hit might come from ep_free(), but holding the mutex
996 * will correctly serialize the operation. We do need to acquire
997 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
998 * from anywhere but ep_free().
999 *
1000 * Besides, ep_remove() acquires the lock, so we can't hold it here.
1001 */
1002 mutex_lock(&epmutex);
1003 list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
1004 ep = epi->ep;
1005 mutex_lock_nested(&ep->mtx, 0);
1006 ep_remove(ep, epi);
1007 mutex_unlock(&ep->mtx);
1008 }
1009 mutex_unlock(&epmutex);
1010}
1011
1012static int ep_alloc(struct eventpoll **pep)
1013{
1014 int error;
1015 struct user_struct *user;
1016 struct eventpoll *ep;
1017
1018 user = get_current_user();
1019 error = -ENOMEM;
1020 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
1021 if (unlikely(!ep))
1022 goto free_uid;
1023
1024 mutex_init(&ep->mtx);
1025 rwlock_init(&ep->lock);
1026 init_waitqueue_head(&ep->wq);
1027 init_waitqueue_head(&ep->poll_wait);
1028 INIT_LIST_HEAD(&ep->rdllist);
1029 ep->rbr = RB_ROOT_CACHED;
1030 ep->ovflist = EP_UNACTIVE_PTR;
1031 ep->user = user;
1032
1033 *pep = ep;
1034
1035 return 0;
1036
1037free_uid:
1038 free_uid(user);
1039 return error;
1040}
1041
1042/*
1043 * Search the file inside the eventpoll tree. The RB tree operations
1044 * are protected by the "mtx" mutex, and ep_find() must be called with
1045 * "mtx" held.
1046 */
1047static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
1048{
1049 int kcmp;
1050 struct rb_node *rbp;
1051 struct epitem *epi, *epir = NULL;
1052 struct epoll_filefd ffd;
1053
1054 ep_set_ffd(&ffd, file, fd);
1055 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1056 epi = rb_entry(rbp, struct epitem, rbn);
1057 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1058 if (kcmp > 0)
1059 rbp = rbp->rb_right;
1060 else if (kcmp < 0)
1061 rbp = rbp->rb_left;
1062 else {
1063 epir = epi;
1064 break;
1065 }
1066 }
1067
1068 return epir;
1069}
1070
1071#ifdef CONFIG_CHECKPOINT_RESTORE
1072static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
1073{
1074 struct rb_node *rbp;
1075 struct epitem *epi;
1076
1077 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1078 epi = rb_entry(rbp, struct epitem, rbn);
1079 if (epi->ffd.fd == tfd) {
1080 if (toff == 0)
1081 return epi;
1082 else
1083 toff--;
1084 }
1085 cond_resched();
1086 }
1087
1088 return NULL;
1089}
1090
1091struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
1092 unsigned long toff)
1093{
1094 struct file *file_raw;
1095 struct eventpoll *ep;
1096 struct epitem *epi;
1097
1098 if (!is_file_epoll(file))
1099 return ERR_PTR(-EINVAL);
1100
1101 ep = file->private_data;
1102
1103 mutex_lock(&ep->mtx);
1104 epi = ep_find_tfd(ep, tfd, toff);
1105 if (epi)
1106 file_raw = epi->ffd.file;
1107 else
1108 file_raw = ERR_PTR(-ENOENT);
1109 mutex_unlock(&ep->mtx);
1110
1111 return file_raw;
1112}
1113#endif /* CONFIG_CHECKPOINT_RESTORE */
1114
1115/**
1116 * Adds a new entry to the tail of the list in a lockless way, i.e.
1117 * multiple CPUs are allowed to call this function concurrently.
1118 *
1119 * Beware: it is necessary to prevent any other modifications of the
1120 * existing list until all changes are completed, in other words
1121 * concurrent list_add_tail_lockless() calls should be protected
1122 * with a read lock, where write lock acts as a barrier which
1123 * makes sure all list_add_tail_lockless() calls are fully
1124 * completed.
1125 *
1126 * Also an element can be locklessly added to the list only in one
1127 * direction, i.e. either to the tail or to the head; otherwise
1128 * concurrent access will corrupt the list.
1129 *
1130 * Returns %false if the element has already been added to the list, %true
1131 * otherwise.
1132 */
1133static inline bool list_add_tail_lockless(struct list_head *new,
1134 struct list_head *head)
1135{
1136 struct list_head *prev;
1137
1138 /*
1139 * This is a simple 'new->next = head' operation, but cmpxchg()
1140 * is used in order to detect that the same element has just been
1141 * added to the list from another CPU: the winner observes
1142 * new->next == new.
1143 */
1144 if (cmpxchg(&new->next, new, head) != new)
1145 return false;
1146
1147 /*
1148 * Initially ->next of a new element must be updated with the head
1149 * (we are inserting to the tail) and only then pointers are atomically
1150 * exchanged. XCHG guarantees memory ordering, thus ->next should be
1151 * updated before pointers are actually swapped and pointers are
1152 * swapped before prev->next is updated.
1153 */
1154
1155 prev = xchg(&head->prev, new);
1156
1157 /*
1158 * It is safe to modify prev->next and new->prev, because a new element
1159 * is added only to the tail and new->next is updated before XCHG.
1160 */
1161
1162 prev->next = new;
1163 new->prev = prev;
1164
1165 return true;
1166}
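/*
 * Usage sketch (mirrors the caller in ep_poll_callback(), shown here for
 * clarity): the adder holds only the read side of ep->lock, so several
 * CPUs may race on the very same entry; the cmpxchg() on new->next above
 * elects a single winner, which is the only CPU that accounts the
 * insertion:
 *
 *	read_lock_irqsave(&ep->lock, flags);
 *	if (!ep_is_linked(epi) &&
 *	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
 *		ep_pm_stay_awake_rcu(epi);
 *	read_unlock_irqrestore(&ep->lock, flags);
 */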
1167
1168/**
1169 * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
1170 * i.e. multiple CPUs are allowed to call this function concurrently.
1171 *
1172 * Returns %false if the epi element has already been chained, %true otherwise.
1173 */
1174static inline bool chain_epi_lockless(struct epitem *epi)
1175{
1176 struct eventpoll *ep = epi->ep;
1177
1178 /* Fast preliminary check */
1179 if (epi->next != EP_UNACTIVE_PTR)
1180 return false;
1181
1182 /* Check that the same epi has not been just chained from another CPU */
1183 if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
1184 return false;
1185
1186 /* Atomically exchange tail */
1187 epi->next = xchg(&ep->ovflist, epi);
1188
1189 return true;
1190}
1191
1192/*
1193 * This is the callback that is passed to the wait queue wakeup
1194 * mechanism. It is called by the stored file descriptors when they
1195 * have events to report.
1196 *
1197 * This callback takes a read lock in order not to contend with concurrent
1198 * events from other file descriptors, thus all modifications to ->rdllist
1199 * or ->ovflist are lockless. The read lock is paired with the write lock from
1200 * ep_scan_ready_list(), which stops all list modifications and guarantees
1201 * that the lists' state is seen correctly.
1202 *
1203 * Another thing worth mentioning is that ep_poll_callback() can be called
1204 * concurrently for the same @epi from different CPUs if the poll table was
1205 * initialized with several wait queue entries. Plural wakeups from different
1206 * CPUs of a single wait queue are serialized by wq.lock, but the case when
1207 * multiple wait queues are used has to be detected separately. This is done
1208 * using a cmpxchg() operation.
1209 */
1210static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1211{
1212 int pwake = 0;
1213 struct epitem *epi = ep_item_from_wait(wait);
1214 struct eventpoll *ep = epi->ep;
1215 __poll_t pollflags = key_to_poll(key);
1216 unsigned long flags;
1217 int ewake = 0;
1218
1219 read_lock_irqsave(&ep->lock, flags);
1220
1221 ep_set_busy_poll_napi_id(epi);
1222
1223 /*
1224 * If the event mask does not contain any poll(2) event, we consider the
1225 * descriptor to be disabled. This condition is likely the effect of the
1226 * EPOLLONESHOT bit that disables the descriptor when an event is received,
1227 * until the next EPOLL_CTL_MOD is issued.
1228 */
1229 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1230 goto out_unlock;
1231
1232 /*
1233 * Check the events coming with the callback. At this stage, not
1234 * every device reports the events in the "key" parameter of the
1235 * callback. We need to be able to handle both cases here, hence the
1236 * test for "key" != NULL before the event match test.
1237 */
1238 if (pollflags && !(pollflags & epi->event.events))
1239 goto out_unlock;
1240
1241 /*
1242 * If we are transferring events to userspace, we can hold no locks
1243 * (because we're accessing user memory, and because of linux f_op->poll()
1244 * semantics). All the events that happen during that period of time are
1245 * chained in ep->ovflist and requeued later on.
1246 */
1247 if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
1248 if (chain_epi_lockless(epi))
1249 ep_pm_stay_awake_rcu(epi);
1250 } else if (!ep_is_linked(epi)) {
1251 /* In the usual case, add event to ready list. */
1252 if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
1253 ep_pm_stay_awake_rcu(epi);
1254 }
1255
1256 /*
1257 * Wake up (if active) both the eventpoll wait list and the ->poll()
1258 * wait list.
1259 */
1260 if (waitqueue_active(&ep->wq)) {
1261 if ((epi->event.events & EPOLLEXCLUSIVE) &&
1262 !(pollflags & POLLFREE)) {
1263 switch (pollflags & EPOLLINOUT_BITS) {
1264 case EPOLLIN:
1265 if (epi->event.events & EPOLLIN)
1266 ewake = 1;
1267 break;
1268 case EPOLLOUT:
1269 if (epi->event.events & EPOLLOUT)
1270 ewake = 1;
1271 break;
1272 case 0:
1273 ewake = 1;
1274 break;
1275 }
1276 }
1277 if (sync)
1278 wake_up_sync(&ep->wq);
1279 else
1280 wake_up(&ep->wq);
1281 }
1282 if (waitqueue_active(&ep->poll_wait))
1283 pwake++;
1284
1285out_unlock:
1286 read_unlock_irqrestore(&ep->lock, flags);
1287
1288 /* We have to call this outside the lock */
1289 if (pwake)
1290 ep_poll_safewake(&ep->poll_wait);
1291
1292 if (!(epi->event.events & EPOLLEXCLUSIVE))
1293 ewake = 1;
1294
1295 if (pollflags & POLLFREE) {
1296 /*
1297 * If we race with ep_remove_wait_queue() it can miss
1298 * ->whead = NULL and do another remove_wait_queue() after
1299 * us, so we can't use __remove_wait_queue().
1300 */
1301 list_del_init(&wait->entry);
1302 /*
1303 * ->whead != NULL protects us from the race with ep_free()
1304 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
1305 * held by the caller. Once we nullify it, nothing protects
1306 * ep/epi or even wait.
1307 */
1308 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1309 }
1310
1311 return ewake;
1312}
1313
1314/*
1315 * This is the callback that is used to add our wait queue to the
1316 * target file wakeup lists.
1317 */
1318static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1319 poll_table *pt)
1320{
1321 struct epitem *epi = ep_item_from_epqueue(pt);
1322 struct eppoll_entry *pwq;
1323
1324 if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1325 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1326 pwq->whead = whead;
1327 pwq->base = epi;
1328 if (epi->event.events & EPOLLEXCLUSIVE)
1329 add_wait_queue_exclusive(whead, &pwq->wait);
1330 else
1331 add_wait_queue(whead, &pwq->wait);
1332 list_add_tail(&pwq->llink, &epi->pwqlist);
1333 epi->nwait++;
1334 } else {
1335 /* We have to signal that an error occurred */
1336 epi->nwait = -1;
1337 }
1338}
1339
1340static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1341{
1342 int kcmp;
1343 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1344 struct epitem *epic;
1345 bool leftmost = true;
1346
1347 while (*p) {
1348 parent = *p;
1349 epic = rb_entry(parent, struct epitem, rbn);
1350 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1351 if (kcmp > 0) {
1352 p = &parent->rb_right;
1353 leftmost = false;
1354 } else
1355 p = &parent->rb_left;
1356 }
1357 rb_link_node(&epi->rbn, parent, p);
1358 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1359}
1360
1361
1362
1363#define PATH_ARR_SIZE 5
1364/*
1365 * These are the numbers of paths of length 1 to 5 that we allow to emanate
1366 * from a single file of interest. For example, we allow 1000 paths of length
1367 * 1, to emanate from each file of interest. This essentially represents the
1368 * potential wakeup paths, which need to be limited in order to avoid massive
1369 * uncontrolled wakeup storms. The common use case should be a single ep which
1370 * is connected to n file sources. In this case each file source has 1 path
1371 * of length 1. Thus, the numbers below should be more than sufficient. These
1372 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
1373 * and delete can't add additional paths. Protected by the epmutex.
1374 */
1375static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1376static int path_count[PATH_ARR_SIZE];
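/*
 * Worked example (hypothetical numbers, for illustration only): depth-1
 * paths are never limited by path_count_inc() below, so a file watched
 * directly by thousands of epoll fds is accepted.  But if a proposed link
 * would make the file reachable through more than path_limits[1] == 500
 * two-level chains of epoll fds, reverse_path_check() returns -1 and the
 * offending EPOLL_CTL_ADD fails.
 */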
1377
1378static int path_count_inc(int nests)
1379{
1380 /* Allow an arbitrary number of depth 1 paths */
1381 if (nests == 0)
1382 return 0;
1383
1384 if (++path_count[nests] > path_limits[nests])
1385 return -1;
1386 return 0;
1387}
1388
1389static void path_count_init(void)
1390{
1391 int i;
1392
1393 for (i = 0; i < PATH_ARR_SIZE; i++)
1394 path_count[i] = 0;
1395}
1396
1397static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1398{
1399 int error = 0;
1400 struct file *file = priv;
1401 struct file *child_file;
1402 struct epitem *epi;
1403
1404 /* CTL_DEL can remove links here, but that can't increase our count */
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
1407 child_file = epi->ep->file;
1408 if (is_file_epoll(child_file)) {
1409 if (list_empty(&child_file->f_ep_links)) {
1410 if (path_count_inc(call_nests)) {
1411 error = -1;
1412 break;
1413 }
1414 } else {
1415 error = ep_call_nested(&poll_loop_ncalls,
1416 reverse_path_check_proc,
1417 child_file, child_file,
1418 current);
1419 }
1420 if (error != 0)
1421 break;
1422 } else {
1423 printk(KERN_ERR "reverse_path_check_proc: "
1424 "file is not an ep!\n");
1425 }
1426 }
1427 rcu_read_unlock();
1428 return error;
1429}
1430
1431/**
1432 * reverse_path_check - The tfile_check_list is a list of file *, which have
1433 * links that are proposed to be newly added. We need to
1434 * make sure that those added links don't add too many
1435 * paths such that we will spend all our time waking up
1436 * eventpoll objects.
1437 *
1438 * Returns: zero if the proposed links don't create too many paths,
1439 * -1 otherwise.
1440 */
1441static int reverse_path_check(void)
1442{
1443 int error = 0;
1444 struct file *current_file;
1445
1446 /* let's call this for all tfiles */
1447 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1448 path_count_init();
1449 error = ep_call_nested(&poll_loop_ncalls,
1450 reverse_path_check_proc, current_file,
1451 current_file, current);
1452 if (error)
1453 break;
1454 }
1455 return error;
1456}
1457
1458static int ep_create_wakeup_source(struct epitem *epi)
1459{
1460 struct name_snapshot n;
1461 struct wakeup_source *ws;
1462
1463 if (!epi->ep->ws) {
1464 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
1465 if (!epi->ep->ws)
1466 return -ENOMEM;
1467 }
1468
1469 take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
1470 ws = wakeup_source_register(NULL, n.name.name);
1471 release_dentry_name_snapshot(&n);
1472
1473 if (!ws)
1474 return -ENOMEM;
1475 rcu_assign_pointer(epi->ws, ws);
1476
1477 return 0;
1478}
1479
1480/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
1481static noinline void ep_destroy_wakeup_source(struct epitem *epi)
1482{
1483 struct wakeup_source *ws = ep_wakeup_source(epi);
1484
1485 RCU_INIT_POINTER(epi->ws, NULL);
1486
1487 /*
1488 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
1489 * used internally by wakeup_source_remove, too (called by
1490 * wakeup_source_unregister), so we cannot use call_rcu
1491 */
1492 synchronize_rcu();
1493 wakeup_source_unregister(ws);
1494}
1495
1496/*
1497 * Must be called with "mtx" held.
1498 */
1499static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
1500 struct file *tfile, int fd, int full_check)
1501{
1502 int error, pwake = 0;
1503 __poll_t revents;
1504 long user_watches;
1505 struct epitem *epi;
1506 struct ep_pqueue epq;
1507
1508 lockdep_assert_irqs_enabled();
1509
1510 user_watches = atomic_long_read(&ep->user->epoll_watches);
1511 if (unlikely(user_watches >= max_user_watches))
1512 return -ENOSPC;
1513 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1514 return -ENOMEM;
1515
1516 /* Item initialization follow here ... */
1517 INIT_LIST_HEAD(&epi->rdllink);
1518 INIT_LIST_HEAD(&epi->fllink);
1519 INIT_LIST_HEAD(&epi->pwqlist);
1520 epi->ep = ep;
1521 ep_set_ffd(&epi->ffd, tfile, fd);
1522 epi->event = *event;
1523 epi->nwait = 0;
1524 epi->next = EP_UNACTIVE_PTR;
1525 if (epi->event.events & EPOLLWAKEUP) {
1526 error = ep_create_wakeup_source(epi);
1527 if (error)
1528 goto error_create_wakeup_source;
1529 } else {
1530 RCU_INIT_POINTER(epi->ws, NULL);
1531 }
1532
1533 /* Add the current item to the list of active epoll hooks for this file */
1534 spin_lock(&tfile->f_lock);
1535 list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
1536 spin_unlock(&tfile->f_lock);
1537
1538 /*
1539 * Add the current item to the RB tree. All RB tree operations are
1540 * protected by "mtx", and ep_insert() is called with "mtx" held.
1541 */
1542 ep_rbtree_insert(ep, epi);
1543
1544 /* now check if we've created too many backpaths */
1545 error = -EINVAL;
1546 if (full_check && reverse_path_check())
1547 goto error_remove_epi;
1548
1549 /* Initialize the poll table using the queue callback */
1550 epq.epi = epi;
1551 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1552
1553 /*
1554 * Attach the item to the poll hooks and get current event bits.
1555 * We can safely use the file* here because its usage count has
1556 * been increased by the caller of this function. Note that after
1557 * this operation completes, the poll callback can start hitting
1558 * the new item.
1559 */
1560 revents = ep_item_poll(epi, &epq.pt, 1);
1561
1562 /*
1563 * We have to check if something went wrong during the poll wait queue
1564 * install process. Namely, an allocation for a wait queue failed due
1565 * to high memory pressure.
1566 */
1567 error = -ENOMEM;
1568 if (epi->nwait < 0)
1569 goto error_unregister;
1570
1571 /* We have to drop the new item inside our item list to keep track of it */
1572 write_lock_irq(&ep->lock);
1573
1574 /* record NAPI ID of new item if present */
1575 ep_set_busy_poll_napi_id(epi);
1576
1577 /* If the file is already "ready" we drop it inside the ready list */
1578 if (revents && !ep_is_linked(epi)) {
1579 list_add_tail(&epi->rdllink, &ep->rdllist);
1580 ep_pm_stay_awake(epi);
1581
1582 /* Notify waiting tasks that events are available */
1583 if (waitqueue_active(&ep->wq))
1584 wake_up(&ep->wq);
1585 if (waitqueue_active(&ep->poll_wait))
1586 pwake++;
1587 }
1588
1589 write_unlock_irq(&ep->lock);
1590
1591 atomic_long_inc(&ep->user->epoll_watches);
1592
1593 /* We have to call this outside the lock */
1594 if (pwake)
1595 ep_poll_safewake(&ep->poll_wait);
1596
1597 return 0;
1598
1599error_unregister:
1600 ep_unregister_pollwait(ep, epi);
1601error_remove_epi:
1602 spin_lock(&tfile->f_lock);
1603 list_del_rcu(&epi->fllink);
1604 spin_unlock(&tfile->f_lock);
1605
1606 rb_erase_cached(&epi->rbn, &ep->rbr);
1607
1608 /*
1609 * We need to do this because an event could have arrived on some
1610 * allocated wait queue. Note that we don't care about the ep->ovflist
1611 * list, since that is used/cleaned only inside a section bound by "mtx".
1612 * And ep_insert() is called with "mtx" held.
1613 */
1614 write_lock_irq(&ep->lock);
1615 if (ep_is_linked(epi))
1616 list_del_init(&epi->rdllink);
1617 write_unlock_irq(&ep->lock);
1618
1619 wakeup_source_unregister(ep_wakeup_source(epi));
1620
1621error_create_wakeup_source:
1622 kmem_cache_free(epi_cache, epi);
1623
1624 return error;
1625}
1626
1627/*
1628 * Modify the interest event mask by dropping an event if the new mask
1629 * has a match in the current file status. Must be called with "mtx" held.
1630 */
1631static int ep_modify(struct eventpoll *ep, struct epitem *epi,
1632 const struct epoll_event *event)
1633{
1634 int pwake = 0;
1635 poll_table pt;
1636
1637 lockdep_assert_irqs_enabled();
1638
1639 init_poll_funcptr(&pt, NULL);
1640
1641 /*
1642 * Set the new event interest mask before calling f_op->poll();
1643 * otherwise we might miss an event that happens between the
1644 * f_op->poll() call and the registration of the new event set.
1645 */
1646 epi->event.events = event->events; /* need barrier below */
1647 epi->event.data = event->data; /* protected by mtx */
1648 if (epi->event.events & EPOLLWAKEUP) {
1649 if (!ep_has_wakeup_source(epi))
1650 ep_create_wakeup_source(epi);
1651 } else if (ep_has_wakeup_source(epi)) {
1652 ep_destroy_wakeup_source(epi);
1653 }
1654
1655 /*
1656 * The following barrier has two effects:
1657 *
1658 * 1) Flush epi changes above to other CPUs. This ensures
1659 * we do not miss events from ep_poll_callback if an
1660 * event occurs immediately after we call f_op->poll().
1661 * We need this because we did not take ep->lock while
1662 * changing epi above (but ep_poll_callback does take
1663 * ep->lock).
1664 *
1665 * 2) We also need to ensure we do not miss _past_ events
1666 * when calling f_op->poll(). This barrier also
1667 * pairs with the barrier in wq_has_sleeper (see
1668 * comments for wq_has_sleeper).
1669 *
1670 * This barrier will now guarantee ep_poll_callback or f_op->poll
1671 * (or both) will notice the readiness of an item.
1672 */
1673 smp_mb();
1674
1675 /*
1676 * Get current event bits. We can safely use the file* here because
1677 * its usage count has been increased by the caller of this function.
1678 * If the item is "hot" and it is not registered inside the ready
1679 * list, push it inside.
1680 */
1681 if (ep_item_poll(epi, &pt, 1)) {
1682 write_lock_irq(&ep->lock);
1683 if (!ep_is_linked(epi)) {
1684 list_add_tail(&epi->rdllink, &ep->rdllist);
1685 ep_pm_stay_awake(epi);
1686
1687 /* Notify waiting tasks that events are available */
1688 if (waitqueue_active(&ep->wq))
1689 wake_up(&ep->wq);
1690 if (waitqueue_active(&ep->poll_wait))
1691 pwake++;
1692 }
1693 write_unlock_irq(&ep->lock);
1694 }
1695
1696 /* We have to call this outside the lock */
1697 if (pwake)
1698 ep_poll_safewake(&ep->poll_wait);
1699
1700 return 0;
1701}
1702
1703static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1704 void *priv)
1705{
1706 struct ep_send_events_data *esed = priv;
1707 __poll_t revents;
1708 struct epitem *epi, *tmp;
1709 struct epoll_event __user *uevent = esed->events;
1710 struct wakeup_source *ws;
1711 poll_table pt;
1712
1713 init_poll_funcptr(&pt, NULL);
1714 esed->res = 0;
1715
1716 /*
1717 * We can loop without lock because we are passed a task private list.
1718 * Items cannot vanish during the loop because ep_scan_ready_list() is
1719 * holding "mtx" during this call.
1720 */
1721 lockdep_assert_held(&ep->mtx);
1722
1723 list_for_each_entry_safe(epi, tmp, head, rdllink) {
1724 if (esed->res >= esed->maxevents)
1725 break;
1726
1727 /*
1728 * Activate ep->ws before deactivating epi->ws to prevent
1729 * triggering auto-suspend here (in case we reactivate epi->ws
1730 * below).
1731 *
1732 * This could be rearranged to delay the deactivation of epi->ws
1733 * instead, but then epi->ws would temporarily be out of sync
1734 * with ep_is_linked().
1735 */
1736 ws = ep_wakeup_source(epi);
1737 if (ws) {
1738 if (ws->active)
1739 __pm_stay_awake(ep->ws);
1740 __pm_relax(ws);
1741 }
1742
1743 list_del_init(&epi->rdllink);
1744
1745 /*
1746 * If the event mask intersects the caller-requested one,
1747 * deliver the event to userspace. Again, ep_scan_ready_list()
1748 * is holding ep->mtx, so no operations coming from userspace
1749 * can change the item.
1750 */
1751 revents = ep_item_poll(epi, &pt, 1);
1752 if (!revents)
1753 continue;
1754
1755 if (__put_user(revents, &uevent->events) ||
1756 __put_user(epi->event.data, &uevent->data)) {
1757 list_add(&epi->rdllink, head);
1758 ep_pm_stay_awake(epi);
1759 if (!esed->res)
1760 esed->res = -EFAULT;
1761 return 0;
1762 }
1763 esed->res++;
1764 uevent++;
1765 if (epi->event.events & EPOLLONESHOT)
1766 epi->event.events &= EP_PRIVATE_BITS;
1767 else if (!(epi->event.events & EPOLLET)) {
1768 /*
1769 * If this file has been added with Level
1770 * Trigger mode, we need to insert it back inside
1771 * the ready list, so that the next call to
1772 * epoll_wait() will check again the events
1773 * availability. At this point, no one can insert
1774 * into ep->rdllist besides us. The epoll_ctl()
1775 * callers are locked out by
1776 * ep_scan_ready_list() holding "mtx" and the
1777 * poll callback will queue them in ep->ovflist.
1778 */
1779 list_add_tail(&epi->rdllink, &ep->rdllist);
1780 ep_pm_stay_awake(epi);
1781 }
1782 }
1783
1784 return 0;
1785}
1786
1787static int ep_send_events(struct eventpoll *ep,
1788 struct epoll_event __user *events, int maxevents)
1789{
1790 struct ep_send_events_data esed;
1791
1792 esed.maxevents = maxevents;
1793 esed.events = events;
1794
1795 ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
1796 return esed.res;
1797}
1798
1799static inline struct timespec64 ep_set_mstimeout(long ms)
1800{
1801 struct timespec64 now, ts = {
1802 .tv_sec = ms / MSEC_PER_SEC,
1803 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1804 };
1805
1806 ktime_get_ts64(&now);
1807 return timespec64_add_safe(now, ts);
1808}
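/*
 * Worked example (for illustration): a timeout of 2500 ms becomes
 * { .tv_sec = 2, .tv_nsec = 500 * NSEC_PER_MSEC } and is added to the
 * current monotonic time; timespec64_add_safe() saturates instead of
 * overflowing, so an absurdly large timeout yields the maximum
 * representable expiry rather than a wrapped-around one.
 */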
1809
1810/*
1811 * autoremove_wake_function, but remove even on failure to wake up, because we
1812 * know that default_wake_function/ttwu will only fail if the thread is already
1813 * woken, and in that case the ep_poll loop will remove the entry anyway, not
1814 * try to reuse it.
1815 */
1816static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
1817 unsigned int mode, int sync, void *key)
1818{
1819 int ret = default_wake_function(wq_entry, mode, sync, key);
1820
1821 /*
1822 * Pairs with list_empty_careful in ep_poll, and ensures future loop
1823 * iterations see the cause of this wakeup.
1824 */
1825 list_del_init_careful(&wq_entry->entry);
1826 return ret;
1827}
1828
1829/**
1830 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1831 * event buffer.
1832 *
1833 * @ep: Pointer to the eventpoll context.
1834 * @events: Pointer to the userspace buffer where the ready events should be
1835 * stored.
1836 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1837 * @timeout: Maximum timeout for the ready events fetch operation, in
1838 * milliseconds. If the @timeout is zero, the function will not block,
1839 * while if the @timeout is less than zero, the function will block
1840 * until at least one event has been retrieved (or an error
1841 * occurred).
1842 *
1843 * Returns: the number of ready events which have been fetched, or an
1844 * error code, in case of error.
1845 */
1846static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1847 int maxevents, long timeout)
1848{
1849 int res = 0, eavail, timed_out = 0;
1850 u64 slack = 0;
1851 wait_queue_entry_t wait;
1852 ktime_t expires, *to = NULL;
1853
1854 lockdep_assert_irqs_enabled();
1855
1856 if (timeout > 0) {
1857 struct timespec64 end_time = ep_set_mstimeout(timeout);
1858
1859 slack = select_estimate_accuracy(&end_time);
1860 to = &expires;
1861 *to = timespec64_to_ktime(end_time);
1862 } else if (timeout == 0) {
1863 /*
1864		 * Avoid the unnecessary trip to the wait queue loop if the
1865		 * caller specified a non-blocking operation. We still need
1866		 * the lock because we could race with the irq callback and
1867		 * fail to see an epi being added to the ready list, thus
1868		 * incorrectly returning 0 back to userspace.
1869 */
1870 timed_out = 1;
1871
1872 write_lock_irq(&ep->lock);
1873 eavail = ep_events_available(ep);
1874 write_unlock_irq(&ep->lock);
1875
1876 goto send_events;
1877 }
1878
1879fetch_events:
1880
1881 if (!ep_events_available(ep))
1882 ep_busy_loop(ep, timed_out);
1883
1884 eavail = ep_events_available(ep);
1885 if (eavail)
1886 goto send_events;
1887
1888 /*
1889 * Busy poll timed out. Drop NAPI ID for now, we can add
1890 * it back in when we have moved a socket with a valid NAPI
1891 * ID onto the ready list.
1892 */
1893 ep_reset_busy_poll_napi_id(ep);
1894
1895 do {
1896 /*
1897		 * Internally init_wait() uses autoremove_wake_function(),
1898		 * thus the wait entry is removed from the wait queue on each
1899		 * wakeup. Why is this important? With several waiters, each
1900		 * new wakeup hits the next waiter, giving it the chance to
1901		 * harvest new events; otherwise a wakeup could be lost. This
1902		 * is also good performance-wise, because on the normal wakeup
1903		 * path there is no need to call __remove_wait_queue()
1904		 * explicitly, so ep->lock is not taken, which would stall
1905		 * event delivery.
1906		 *
1907		 * In fact, we now use an even more aggressive function that
1908		 * unconditionally removes the entry, because we don't reuse
1909		 * the wait entry between loop iterations. This also avoids a
1910		 * performance issue when a process is killed, causing all of
1911		 * its threads to wake up without being removed normally.
1912 */
1913 init_wait(&wait);
1914 wait.func = ep_autoremove_wake_function;
1915 write_lock_irq(&ep->lock);
1916 /*
1917 * Barrierless variant, waitqueue_active() is called under
1918 * the same lock on wakeup ep_poll_callback() side, so it
1919 * is safe to avoid an explicit barrier.
1920 */
1921 __set_current_state(TASK_INTERRUPTIBLE);
1922
1923 /*
1924 * Do the final check under the lock. ep_scan_ready_list()
1925 * plays with two lists (->rdllist and ->ovflist) and there
1926		 * is always a race when both lists are empty for a short
1927		 * period of time although events are pending, so the lock is
1928		 * important.
1929 */
1930 eavail = ep_events_available(ep);
1931 if (!eavail) {
1932 if (signal_pending(current))
1933 res = -EINTR;
1934 else
1935 __add_wait_queue_exclusive(&ep->wq, &wait);
1936 }
1937 write_unlock_irq(&ep->lock);
1938
1939 if (!eavail && !res)
1940 timed_out = !freezable_schedule_hrtimeout_range(to, slack,
1941 HRTIMER_MODE_ABS);
1942
1943 /*
1944 * We were woken up, thus go and try to harvest some events.
1945 * If timed out and still on the wait queue, recheck eavail
1946 * carefully under lock, below.
1947 */
1948 eavail = 1;
1949 } while (0);
1950
1951 __set_current_state(TASK_RUNNING);
1952
1953 if (!list_empty_careful(&wait.entry)) {
1954 write_lock_irq(&ep->lock);
1955 /*
1956 * If the thread timed out and is not on the wait queue, it
1957 * means that the thread was woken up after its timeout expired
1958 * before it could reacquire the lock. Thus, when wait.entry is
1959 * empty, it needs to harvest events.
1960 */
1961 if (timed_out)
1962 eavail = list_empty(&wait.entry);
1963 __remove_wait_queue(&ep->wq, &wait);
1964 write_unlock_irq(&ep->lock);
1965 }
1966
1967send_events:
1968 if (fatal_signal_pending(current)) {
1969 /*
1970 * Always short-circuit for fatal signals to allow
1971 * threads to make a timely exit without the chance of
1972 * finding more events available and fetching
1973 * repeatedly.
1974 */
1975 res = -EINTR;
1976 }
1977 /*
1978 * Try to transfer events to user space. In case we get 0 events and
1979 * there's still timeout left over, we go trying again in search of
1980 * more luck.
1981 */
1982 if (!res && eavail &&
1983 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1984 goto fetch_events;
1985
1986 return res;
1987}
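/*
 * Minimal userspace sketch (not part of this file) of the three timeout
 * modes handled by ep_poll() above; "epfd" and "evs" are assumed to be a
 * valid epoll descriptor and event buffer:
 *
 *	epoll_wait(epfd, evs, 8, -1);	// block until an event or a signal
 *	epoll_wait(epfd, evs, 8, 0);	// check once, never block
 *	epoll_wait(epfd, evs, 8, 100);	// block for at most ~100 ms
 */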
1988
1989/**
1990 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1991 * API, to verify that adding an epoll file inside another
1992 *                      epoll structure does not violate the constraints in
1993 *                      terms of closed loops or too-deep chains (which can
1994 *                      result in excessive stack usage).
1995 *
1996 * @priv: Pointer to the epoll file to be currently checked.
1997 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1998 * data structure pointer.
1999 * @call_nests: Current depth of the @ep_call_nested() call stack.
2000 *
2001 * Returns: Returns zero if adding the epoll @file inside current epoll
2002 * structure @ep does not violate the constraints, or -1 otherwise.
2003 */
2004static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
2005{
2006 int error = 0;
2007 struct file *file = priv;
2008 struct eventpoll *ep = file->private_data;
2009 struct eventpoll *ep_tovisit;
2010 struct rb_node *rbp;
2011 struct epitem *epi;
2012
2013 mutex_lock_nested(&ep->mtx, call_nests + 1);
2014 ep->gen = loop_check_gen;
2015 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
2016 epi = rb_entry(rbp, struct epitem, rbn);
2017 if (unlikely(is_file_epoll(epi->ffd.file))) {
2018 ep_tovisit = epi->ffd.file->private_data;
2019 if (ep_tovisit->gen == loop_check_gen)
2020 continue;
2021 error = ep_call_nested(&poll_loop_ncalls,
2022 ep_loop_check_proc, epi->ffd.file,
2023 ep_tovisit, current);
2024 if (error != 0)
2025 break;
2026 } else {
2027 /*
2028 * If we've reached a file that is not associated with
2029 * an ep, then we need to check if the newly added
2030 * links are going to add too many wakeup paths. We do
2031 * this by adding it to the tfile_check_list, if it's
2032 * not already there, and calling reverse_path_check()
2033 * during ep_insert().
2034 */
2035 if (list_empty(&epi->ffd.file->f_tfile_llink)) {
2036 if (get_file_rcu(epi->ffd.file))
2037 list_add(&epi->ffd.file->f_tfile_llink,
2038 &tfile_check_list);
2039 }
2040 }
2041 }
2042 mutex_unlock(&ep->mtx);
2043
2044 return error;
2045}
2046
2047/**
2048 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
2049 *                 inside another epoll file (represented by @ep) does not create
2050 *                 closed loops or too-deep chains.
2051 *
2052 * @ep: Pointer to the epoll private data structure.
2053 * @file: Pointer to the epoll file to be checked.
2054 *
2055 * Returns: Returns zero if adding the epoll @file inside current epoll
2056 * structure @ep does not violate the constraints, or -1 otherwise.
2057 */
2058static int ep_loop_check(struct eventpoll *ep, struct file *file)
2059{
2060 return ep_call_nested(&poll_loop_ncalls,
2061 ep_loop_check_proc, file, ep, current);
2062}
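/*
 * Illustrative sketch (not part of this file): the loop check above is
 * what makes the second epoll_ctl() below fail with -ELOOP, since it
 * would close a cycle of epoll files. Variable names are arbitrary.
 *
 *	int a = epoll_create1(0), b = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = b };
 *
 *	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);	// ok: b is now nested in a
 *	ev.data.fd = a;
 *	epoll_ctl(b, EPOLL_CTL_ADD, a, &ev);	// fails, errno == ELOOP
 */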
2063
2064static void clear_tfile_check_list(void)
2065{
2066 struct file *file;
2067
2068 /* first clear the tfile_check_list */
2069 while (!list_empty(&tfile_check_list)) {
2070 file = list_first_entry(&tfile_check_list, struct file,
2071 f_tfile_llink);
2072 list_del_init(&file->f_tfile_llink);
2073 fput(file);
2074 }
2075 INIT_LIST_HEAD(&tfile_check_list);
2076}
2077
2078/*
2079 * Open an eventpoll file descriptor.
2080 */
2081static int do_epoll_create(int flags)
2082{
2083 int error, fd;
2084 struct eventpoll *ep = NULL;
2085 struct file *file;
2086
2087 /* Check the EPOLL_* constant for consistency. */
2088 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
2089
2090 if (flags & ~EPOLL_CLOEXEC)
2091 return -EINVAL;
2092 /*
2093 * Create the internal data structure ("struct eventpoll").
2094 */
2095 error = ep_alloc(&ep);
2096 if (error < 0)
2097 return error;
2098 /*
2099	 * Creates all the items needed to set up an eventpoll file. That is,
2100 * a file structure and a free file descriptor.
2101 */
2102 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
2103 if (fd < 0) {
2104 error = fd;
2105 goto out_free_ep;
2106 }
2107 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
2108 O_RDWR | (flags & O_CLOEXEC));
2109 if (IS_ERR(file)) {
2110 error = PTR_ERR(file);
2111 goto out_free_fd;
2112 }
2113 ep->file = file;
2114 fd_install(fd, file);
2115 return fd;
2116
2117out_free_fd:
2118 put_unused_fd(fd);
2119out_free_ep:
2120 ep_free(ep);
2121 return error;
2122}
2123
2124SYSCALL_DEFINE1(epoll_create1, int, flags)
2125{
2126 return do_epoll_create(flags);
2127}
2128
2129SYSCALL_DEFINE1(epoll_create, int, size)
2130{
2131 if (size <= 0)
2132 return -EINVAL;
2133
2134 return do_epoll_create(0);
2135}
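/*
 * Userspace usage sketch (not part of this file): epoll_create1() is the
 * preferred interface; the "size" argument of the legacy epoll_create()
 * is only checked for being positive and is otherwise ignored, as the
 * code above shows.
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);	// modern form
 *	int old  = epoll_create(1);			// legacy, size ignored
 */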
2136
2137/*
2138 * The following function implements the controller interface for
2139 * the eventpoll file that enables the insertion/removal/change of
2140 * file descriptors inside the interest set.
2141 */
2142SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
2143 struct epoll_event __user *, event)
2144{
2145 int error;
2146 int full_check = 0;
2147 struct fd f, tf;
2148 struct eventpoll *ep;
2149 struct epitem *epi;
2150 struct epoll_event epds;
2151 struct eventpoll *tep = NULL;
2152
2153 error = -EFAULT;
2154 if (ep_op_has_event(op) &&
2155 copy_from_user(&epds, event, sizeof(struct epoll_event)))
2156 goto error_return;
2157
2158 error = -EBADF;
2159 f = fdget(epfd);
2160 if (!f.file)
2161 goto error_return;
2162
2163 /* Get the "struct file *" for the target file */
2164 tf = fdget(fd);
2165 if (!tf.file)
2166 goto error_fput;
2167
2168 /* The target file descriptor must support poll */
2169 error = -EPERM;
2170 if (!file_can_poll(tf.file))
2171 goto error_tgt_fput;
2172
2173 /* Check if EPOLLWAKEUP is allowed */
2174 if (ep_op_has_event(op))
2175 ep_take_care_of_epollwakeup(&epds);
2176
2177 /*
2178 * We have to check that the file structure underneath the file descriptor
2179	 * the user passed to us _is_ an eventpoll file. Also, we do not permit
2180 * adding an epoll file descriptor inside itself.
2181 */
2182 error = -EINVAL;
2183 if (f.file == tf.file || !is_file_epoll(f.file))
2184 goto error_tgt_fput;
2185
2186 /*
2187 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
2188	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
2189	 * Also, we do not currently support nested exclusive wakeups.
2190 */
2191 if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
2192 if (op == EPOLL_CTL_MOD)
2193 goto error_tgt_fput;
2194 if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
2195 (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
2196 goto error_tgt_fput;
2197 }
2198
2199 /*
2200 * At this point it is safe to assume that the "private_data" contains
2201 * our own data structure.
2202 */
2203 ep = f.file->private_data;
2204
2205 /*
2206	 * When we insert an epoll file descriptor inside another epoll file
2207	 * descriptor, there is the chance of creating closed loops, which are
2208	 * better handled here than in more critical paths. While we are
2209 * checking for loops we also determine the list of files reachable
2210 * and hang them on the tfile_check_list, so we can check that we
2211 * haven't created too many possible wakeup paths.
2212 *
2213	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
2214 * the epoll file descriptor is attaching directly to a wakeup source,
2215 * unless the epoll file descriptor is nested. The purpose of taking the
2216	 * 'epmutex' on add is to prevent complex topologies such as loops and
2217 * deep wakeup paths from forming in parallel through multiple
2218 * EPOLL_CTL_ADD operations.
2219 */
2220 mutex_lock_nested(&ep->mtx, 0);
2221 if (op == EPOLL_CTL_ADD) {
2222 if (!list_empty(&f.file->f_ep_links) ||
2223 ep->gen == loop_check_gen ||
2224 is_file_epoll(tf.file)) {
2225 full_check = 1;
2226 mutex_unlock(&ep->mtx);
2227 mutex_lock(&epmutex);
2228 if (is_file_epoll(tf.file)) {
2229 error = -ELOOP;
2230 if (ep_loop_check(ep, tf.file) != 0)
2231 goto error_tgt_fput;
2232 } else {
2233 get_file(tf.file);
2234 list_add(&tf.file->f_tfile_llink,
2235 &tfile_check_list);
2236 }
2237 mutex_lock_nested(&ep->mtx, 0);
2238 if (is_file_epoll(tf.file)) {
2239 tep = tf.file->private_data;
2240 mutex_lock_nested(&tep->mtx, 1);
2241 }
2242 }
2243 }
2244
2245 /*
2246	 * Try to look up the file inside our RB tree. Since we grabbed "mtx"
2247	 * above, we can be sure the item looked up by ep_find() remains
2248	 * valid until we release the mutex.
2249 */
2250 epi = ep_find(ep, tf.file, fd);
2251
2252 error = -EINVAL;
2253 switch (op) {
2254 case EPOLL_CTL_ADD:
2255 if (!epi) {
2256 epds.events |= EPOLLERR | EPOLLHUP;
2257 error = ep_insert(ep, &epds, tf.file, fd, full_check);
2258 } else
2259 error = -EEXIST;
2260 break;
2261 case EPOLL_CTL_DEL:
2262 if (epi)
2263 error = ep_remove(ep, epi);
2264 else
2265 error = -ENOENT;
2266 break;
2267 case EPOLL_CTL_MOD:
2268 if (epi) {
2269 if (!(epi->event.events & EPOLLEXCLUSIVE)) {
2270 epds.events |= EPOLLERR | EPOLLHUP;
2271 error = ep_modify(ep, epi, &epds);
2272 }
2273 } else
2274 error = -ENOENT;
2275 break;
2276 }
2277 if (tep != NULL)
2278 mutex_unlock(&tep->mtx);
2279 mutex_unlock(&ep->mtx);
2280
2281error_tgt_fput:
2282 if (full_check) {
2283 clear_tfile_check_list();
2284 loop_check_gen++;
2285 mutex_unlock(&epmutex);
2286 }
2287
2288 fdput(tf);
2289error_fput:
2290 fdput(f);
2291error_return:
2292
2293 return error;
2294}
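/*
 * Illustrative userspace sketch (not part of this file) of the three
 * operations dispatched by the switch above; "epfd" and "fd" are assumed
 * to be valid descriptors:
 *
 *	struct epoll_event ev = {
 *		.events  = EPOLLIN | EPOLLONESHOT,
 *		.data.fd = fd,
 *	};
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);	// insert into interest set
 *	ev.events = EPOLLIN | EPOLLOUT;
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);	// change the event mask
 *	epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);	// remove from interest set
 */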
2295
2296/*
2297 * Implement the event wait interface for the eventpoll file. It is the kernel
2298 * part of the user space epoll_wait(2).
2299 */
2300static int do_epoll_wait(int epfd, struct epoll_event __user *events,
2301 int maxevents, int timeout)
2302{
2303 int error;
2304 struct fd f;
2305 struct eventpoll *ep;
2306
2307	/* The maximum number of events must be greater than zero */
2308 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
2309 return -EINVAL;
2310
2311 /* Verify that the area passed by the user is writeable */
2312 if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
2313 return -EFAULT;
2314
2315 /* Get the "struct file *" for the eventpoll file */
2316 f = fdget(epfd);
2317 if (!f.file)
2318 return -EBADF;
2319
2320 /*
2321 * We have to check that the file structure underneath the fd
2322 * the user passed to us _is_ an eventpoll file.
2323 */
2324 error = -EINVAL;
2325 if (!is_file_epoll(f.file))
2326 goto error_fput;
2327
2328 /*
2329 * At this point it is safe to assume that the "private_data" contains
2330 * our own data structure.
2331 */
2332 ep = f.file->private_data;
2333
2334 /* Time to fish for events ... */
2335 error = ep_poll(ep, events, maxevents, timeout);
2336
2337error_fput:
2338 fdput(f);
2339 return error;
2340}
2341
2342SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
2343 int, maxevents, int, timeout)
2344{
2345 return do_epoll_wait(epfd, events, maxevents, timeout);
2346}
2347
2348/*
2349 * Implement the event wait interface for the eventpoll file. It is the kernel
2350 * part of the user space epoll_pwait(2).
2351 */
2352SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
2353 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
2354 size_t, sigsetsize)
2355{
2356 int error;
2357
2358 /*
2359 * If the caller wants a certain signal mask to be set during the wait,
2360 * we apply it here.
2361 */
2362 error = set_user_sigmask(sigmask, sigsetsize);
2363 if (error)
2364 return error;
2365
2366 error = do_epoll_wait(epfd, events, maxevents, timeout);
2367 restore_saved_sigmask_unless(error == -EINTR);
2368
2369 return error;
2370}
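/*
 * Userspace sketch (not part of this file) of the sigmask handling above:
 * the mask passed to epoll_pwait() is installed for the duration of the
 * wait and restored afterwards, so a signal such as SIGINT can be blocked
 * atomically around the wait.
 *
 *	sigset_t mask;
 *	sigemptyset(&mask);
 *	sigaddset(&mask, SIGINT);
 *	epoll_pwait(epfd, evs, 8, -1, &mask);	// SIGINT blocked while waiting
 */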
2371
2372#ifdef CONFIG_COMPAT
2373COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
2374 struct epoll_event __user *, events,
2375 int, maxevents, int, timeout,
2376 const compat_sigset_t __user *, sigmask,
2377 compat_size_t, sigsetsize)
2378{
2379 long err;
2380
2381 /*
2382 * If the caller wants a certain signal mask to be set during the wait,
2383 * we apply it here.
2384 */
2385 err = set_compat_user_sigmask(sigmask, sigsetsize);
2386 if (err)
2387 return err;
2388
2389 err = do_epoll_wait(epfd, events, maxevents, timeout);
2390 restore_saved_sigmask_unless(err == -EINTR);
2391
2392 return err;
2393}
2394#endif
2395
2396static int __init eventpoll_init(void)
2397{
2398 struct sysinfo si;
2399
2400 si_meminfo(&si);
2401 /*
2402	 * Allow the top 4% of lowmem to be allocated for epoll watches (per user).
2403 */
2404 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
2405 EP_ITEM_COST;
2406 BUG_ON(max_user_watches < 0);
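	/*
	 * Worked example with illustrative numbers: with 1 GiB of lowmem the
	 * per-user budget above is 1 GiB / 25, i.e. roughly 41 MiB worth of
	 * watch bookkeeping; the resulting count depends on EP_ITEM_COST.
	 */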
2407
2408 /*
2409 * Initialize the structure used to perform epoll file descriptor
2410 * inclusion loops checks.
2411 */
2412 ep_nested_calls_init(&poll_loop_ncalls);
2413
2414#ifdef CONFIG_DEBUG_LOCK_ALLOC
2415 /* Initialize the structure used to perform safe poll wait head wake ups */
2416 ep_nested_calls_init(&poll_safewake_ncalls);
2417#endif
2418
2419 /*
2420 * We can have many thousands of epitems, so prevent this from
2421 * using an extra cache line on 64-bit (and smaller) CPUs
2422 */
2423 BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
2424
2425 /* Allocates slab cache used to allocate "struct epitem" items */
2426 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
2427 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
2428
2429 /* Allocates slab cache used to allocate "struct eppoll_entry" */
2430 pwq_cache = kmem_cache_create("eventpoll_pwq",
2431 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
2432
2433 return 0;
2434}
2435fs_initcall(eventpoll_init);