Blame - src/kernel/linux/v4.14/fs/userfaultfd.c - T103

blob: 6499056a80645cb4de034128e220f97874984b04 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* fs/userfaultfd.c
				3	*
				4	* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
				5	* Copyright (C) 2008-2009 Red Hat, Inc.
				6	* Copyright (C) 2015 Red Hat, Inc.
				7	*
				8	* This work is licensed under the terms of the GNU GPL, version 2. See
				9	* the COPYING file in the top-level directory.
				10	*
				11	* Some part derived from fs/eventfd.c (anon inode setup) and
				12	* mm/ksm.c (mm hashing).
				13	*/
				14
				15	#include <linux/list.h>
				16	#include <linux/hashtable.h>
				17	#include <linux/sched/signal.h>
				18	#include <linux/sched/mm.h>
				19	#include <linux/mm.h>
				20	#include <linux/poll.h>
				21	#include <linux/slab.h>
				22	#include <linux/seq_file.h>
				23	#include <linux/file.h>
				24	#include <linux/bug.h>
				25	#include <linux/anon_inodes.h>
				26	#include <linux/syscalls.h>
				27	#include <linux/userfaultfd_k.h>
				28	#include <linux/mempolicy.h>
				29	#include <linux/ioctl.h>
				30	#include <linux/security.h>
				31	#include <linux/hugetlb.h>
				32
				33	static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
				34
				35	enum userfaultfd_state {	/* value kept in userfaultfd_ctx.state */
				36	UFFD_STATE_WAIT_API,	/* userfaultfd_poll() reports POLLERR in this state */
				37	UFFD_STATE_RUNNING,	/* state given to contexts created by dup_userfaultfd() */
				38	};
				39
				40	/*
				41	* Start with fault_pending_wqh and fault_wqh so they're more likely
				42	* to be in the same cacheline.
				43	*/
				44	struct userfaultfd_ctx {
				45	/* waitqueue head for the pending (i.e. not read) userfaults */
				46	wait_queue_head_t fault_pending_wqh;
				47	/* waitqueue head for the userfaults */
				48	wait_queue_head_t fault_wqh;
				49	/* waitqueue head for the pseudo fd to wakeup poll/read */
				50	wait_queue_head_t fd_wqh;
				51	/* waitqueue head for events */
				52	wait_queue_head_t event_wqh;
				53	/* a refile sequence protected by fault_pending_wqh lock */
				54	struct seqcount refile_seq;
				55	/* pseudo fd refcounting, see userfaultfd_ctx_get()/userfaultfd_ctx_put() */
				56	atomic_t refcount;
				57	/* userfaultfd syscall flags */
				58	unsigned int flags;
				59	/* features requested from the userspace (UFFD_FEATURE_* bits) */
				60	unsigned int features;
				61	/* state machine, see enum userfaultfd_state */
				62	enum userfaultfd_state state;
				63	/* released: set to true by userfaultfd_release() */
				64	bool released;
				65	/* mm with one or more vmas attached to this userfaultfd_ctx */
				66	struct mm_struct *mm;
				67	};
				68
				69	struct userfaultfd_fork_ctx {	/* one entry per context duplicated by dup_userfaultfd() */
				70	struct userfaultfd_ctx *orig;	/* context found on the vma being duplicated */
				71	struct userfaultfd_ctx *new;	/* context allocated for vma->vm_mm by dup_userfaultfd() */
				72	struct list_head list;	/* link in the "fcs" list walked by dup_userfaultfd_complete() */
				73	};
				74
				75	struct userfaultfd_unmap_ctx {	/* queued by userfaultfd_unmap_prep() */
				76	struct userfaultfd_ctx *ctx;	/* referenced context that will receive UFFD_EVENT_UNMAP */
				77	unsigned long start;	/* range start as passed to userfaultfd_unmap_prep() */
				78	unsigned long end;	/* range end as passed to userfaultfd_unmap_prep() */
				79	struct list_head list;	/* link in the list consumed by userfaultfd_unmap_complete() */
				80	};
				81
				82	struct userfaultfd_wait_queue {	/* lives on the waiter's kernel stack */
				83	struct uffd_msg msg;	/* page fault or event message describing this waiter */
				84	wait_queue_entry_t wq;	/* entry queued on one of the ctx wait-queue heads */
				85	struct userfaultfd_ctx *ctx;	/* context the waiter is queued on */
				86	bool waken;	/* set by userfaultfd_wake_function() before waking the task */
				87	};
				88
				89	struct userfaultfd_wake_range {	/* wake key handed to userfaultfd_wake_function() */
				90	unsigned long start;	/* first address of the range to wake */
				91	unsigned long len;	/* length of the range; len == 0 means wake all */
				92	};
				93
				94	static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				95	int wake_flags, void *key)
				96	{
				97	struct userfaultfd_wake_range *range = key;
				98	int ret;
				99	struct userfaultfd_wait_queue *uwq;
				100	unsigned long start, len;
				101
				102	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				103	ret = 0;
				104	/* len == 0 means wake all */
				105	start = range->start;
				106	len = range->len;
				107	if (len && (start > uwq->msg.arg.pagefault.address \|\|
				108	start + len <= uwq->msg.arg.pagefault.address))
				109	goto out;
				110	WRITE_ONCE(uwq->waken, true);
				111	/*
				112	* The Program-Order guarantees provided by the scheduler
				113	* ensure uwq->waken is visible before the task is woken.
				114	*/
				115	ret = wake_up_state(wq->private, mode);
				116	if (ret) {
				117	/*
				118	* Wake only once, autoremove behavior.
				119	*
				120	* After the effect of list_del_init is visible to the other
				121	* CPUs, the waitqueue may disappear from under us, see the
				122	* !list_empty_careful() in handle_userfault().
				123	*
				124	* try_to_wake_up() has an implicit smp_mb(), and the
				125	* wq->private is read before calling the extern function
				126	* "wake_up_state" (which in turns calls try_to_wake_up).
				127	*/
				128	list_del_init(&wq->entry);
				129	}
				130	out:
				131	return ret;
				132	}
				133
				134	/**
				135	* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
				136	* context.
				137	* @ctx: [in] Pointer to the userfaultfd context.
				138	*/
				139	static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
				140	{
				141	if (!atomic_inc_not_zero(&ctx->refcount))
				142	BUG();
				143	}
				144
				145	/**
				146	* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
				147	* context.
				148	* @ctx: [in] Pointer to userfaultfd context.
				149	*
				150	* The userfaultfd context reference must have been previously acquired either
				151	* with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
				152	*/
				153	static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
				154	{
				155	if (atomic_dec_and_test(&ctx->refcount)) {
				156	VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
				157	VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
				158	VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
				159	VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
				160	VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
				161	VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
				162	VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
				163	VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
				164	mmdrop(ctx->mm);
				165	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				166	}
				167	}
				168
				169	static inline void msg_init(struct uffd_msg *msg)
				170	{
				171	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
				172	/*
				173	* Must use memset to zero out the paddings or kernel data is
				174	* leaked to userland.
				175	*/
				176	memset(msg, 0, sizeof(struct uffd_msg));
				177	}
				178
				179	static inline struct uffd_msg userfault_msg(unsigned long address,
				180	unsigned int flags,
				181	unsigned long reason,
				182	unsigned int features)
				183	{
				184	struct uffd_msg msg;
				185	msg_init(&msg);
				186	msg.event = UFFD_EVENT_PAGEFAULT;
				187	msg.arg.pagefault.address = address;
				188	if (flags & FAULT_FLAG_WRITE)
				189	/*
				190	* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
				191	* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
				192	* was not set in a UFFD_EVENT_PAGEFAULT, it means it
				193	* was a read fault, otherwise if set it means it's
				194	* a write fault.
				195	*/
				196	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WRITE;
				197	if (reason & VM_UFFD_WP)
				198	/*
				199	* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
				200	* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
				201	* not set in a UFFD_EVENT_PAGEFAULT, it means it was
				202	* a missing fault, otherwise if set it means it's a
				203	* write protect fault.
				204	*/
				205	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WP;
				206	if (features & UFFD_FEATURE_THREAD_ID)
				207	msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
				208	return msg;
				209	}
				210
				211	#ifdef CONFIG_HUGETLB_PAGE
				212	/*
				213	* Same functionality as userfaultfd_must_wait below with modifications for
				214	* hugepmd ranges.
				215	*/
				216	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				217	struct vm_area_struct *vma,
				218	unsigned long address,
				219	unsigned long flags,
				220	unsigned long reason)
				221	{
				222	struct mm_struct *mm = ctx->mm;
				223	pte_t *ptep, pte;
				224	bool ret = true;
				225
				226	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				227
				228	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
				229
				230	if (!ptep)
				231	goto out;
				232
				233	ret = false;
				234	pte = huge_ptep_get(ptep);
				235
				236	/*
				237	* Lockless access: we're in a wait_event so it's ok if it
				238	* changes under us.
				239	*/
				240	if (huge_pte_none(pte))
				241	ret = true;
				242	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
				243	ret = true;
				244	out:
				245	return ret;
				246	}
				247	#else
				248	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				249	struct vm_area_struct *vma,
				250	unsigned long address,
				251	unsigned long flags,
				252	unsigned long reason)
				253	{
				254	return false; /* should never get here */
				255	}
				256	#endif /* CONFIG_HUGETLB_PAGE */
				257
				258	/*
				259	* Verify the pagetables are still not ok after having reigstered into
				260	* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
				261	* userfault that has already been resolved, if userfaultfd_read and
				262	* UFFDIO_COPY\|ZEROPAGE are being run simultaneously on two different
				263	* threads.
				264	*/
				265	static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
				266	unsigned long address,
				267	unsigned long flags,
				268	unsigned long reason)
				269	{
				270	struct mm_struct *mm = ctx->mm;
				271	pgd_t *pgd;
				272	p4d_t *p4d;
				273	pud_t *pud;
				274	pmd_t *pmd, _pmd;
				275	pte_t *pte;
				276	bool ret = true;
				277
				278	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				279
				280	pgd = pgd_offset(mm, address);
				281	if (!pgd_present(*pgd))
				282	goto out;
				283	p4d = p4d_offset(pgd, address);
				284	if (!p4d_present(*p4d))
				285	goto out;
				286	pud = pud_offset(p4d, address);
				287	if (!pud_present(*pud))
				288	goto out;
				289	pmd = pmd_offset(pud, address);
				290	/*
				291	* READ_ONCE must function as a barrier with narrower scope
				292	* and it must be equivalent to:
				293	* _pmd = *pmd; barrier();
				294	*
				295	* This is to deal with the instability (as in
				296	* pmd_trans_unstable) of the pmd.
				297	*/
				298	_pmd = READ_ONCE(*pmd);
				299	if (!pmd_present(_pmd))
				300	goto out;
				301
				302	ret = false;
				303	if (pmd_trans_huge(_pmd))
				304	goto out;
				305
				306	/*
				307	* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
				308	* and use the standard pte_offset_map() instead of parsing _pmd.
				309	*/
				310	pte = pte_offset_map(pmd, address);
				311	/*
				312	* Lockless access: we're in a wait_event so it's ok if it
				313	* changes under us.
				314	*/
				315	if (pte_none(*pte))
				316	ret = true;
				317	pte_unmap(pte);
				318
				319	out:
				320	return ret;
				321	}
				322
				323	/*
				324	* The locking rules involved in returning VM_FAULT_RETRY depending on
				325	* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
				326	* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
				327	* recommendation in __lock_page_or_retry is not an understatement.
				328	*
				329	* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
				330	* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
				331	* not set.
				332	*
				333	* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
				334	* set, VM_FAULT_RETRY can still be returned if and only if there are
				335	* fatal_signal_pending()s, and the mmap_sem must be released before
				336	* returning it.
				337	*/
				338	int handle_userfault(struct vm_fault *vmf, unsigned long reason)
				339	{
				340	struct mm_struct *mm = vmf->vma->vm_mm;
				341	struct userfaultfd_ctx *ctx;
				342	struct userfaultfd_wait_queue uwq;
				343	int ret;
				344	bool must_wait, return_to_userland;
				345	long blocking_state;
				346
				347	ret = VM_FAULT_SIGBUS;
				348
				349	/*
				350	* We don't do userfault handling for the final child pid update.
				351	*
				352	* We also don't do userfault handling during
				353	* coredumping. hugetlbfs has the special
				354	* follow_hugetlb_page() to skip missing pages in the
				355	* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
				356	* the no_page_table() helper in follow_page_mask(), but the
				357	* shmem_vm_ops->fault method is invoked even during
				358	* coredumping without mmap_sem and it ends up here.
				359	*/
				360	if (current->flags & (PF_EXITING\|PF_DUMPCORE))
				361	goto out;
				362
				363	/*
				364	* Coredumping runs without mmap_sem so we can only check that
				365	* the mmap_sem is held, if PF_DUMPCORE was not set.
				366	*/
				367	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
				368
				369	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
				370	if (!ctx)
				371	goto out;
				372
				373	BUG_ON(ctx->mm != mm);
				374
				375	VM_BUG_ON(reason & ~(VM_UFFD_MISSING\|VM_UFFD_WP));
				376	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
				377
				378	if (ctx->features & UFFD_FEATURE_SIGBUS)
				379	goto out;
				380
				381	/*
				382	* If it's already released don't get it. This avoids to loop
				383	* in __get_user_pages if userfaultfd_release waits on the
				384	* caller of handle_userfault to release the mmap_sem.
				385	*/
				386	if (unlikely(ACCESS_ONCE(ctx->released))) {
				387	/*
				388	* Don't return VM_FAULT_SIGBUS in this case, so a non
				389	* cooperative manager can close the uffd after the
				390	* last UFFDIO_COPY, without risking to trigger an
				391	* involuntary SIGBUS if the process was starting the
				392	* userfaultfd while the userfaultfd was still armed
				393	* (but after the last UFFDIO_COPY). If the uffd
				394	* wasn't already closed when the userfault reached
				395	* this point, that would normally be solved by
				396	* userfaultfd_must_wait returning 'false'.
				397	*
				398	* If we were to return VM_FAULT_SIGBUS here, the non
				399	* cooperative manager would be instead forced to
				400	* always call UFFDIO_UNREGISTER before it can safely
				401	* close the uffd.
				402	*/
				403	ret = VM_FAULT_NOPAGE;
				404	goto out;
				405	}
				406
				407	/*
				408	* Check that we can return VM_FAULT_RETRY.
				409	*
				410	* NOTE: it should become possible to return VM_FAULT_RETRY
				411	* even if FAULT_FLAG_TRIED is set without leading to gup()
				412	* -EBUSY failures, if the userfaultfd is to be extended for
				413	* VM_UFFD_WP tracking and we intend to arm the userfault
				414	* without first stopping userland access to the memory. For
				415	* VM_UFFD_MISSING userfaults this is enough for now.
				416	*/
				417	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
				418	/*
				419	* Validate the invariant that nowait must allow retry
				420	* to be sure not to return SIGBUS erroneously on
				421	* nowait invocations.
				422	*/
				423	BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
				424	#ifdef CONFIG_DEBUG_VM
				425	if (printk_ratelimit()) {
				426	printk(KERN_WARNING
				427	"FAULT_FLAG_ALLOW_RETRY missing %x\n",
				428	vmf->flags);
				429	dump_stack();
				430	}
				431	#endif
				432	goto out;
				433	}
				434
				435	/*
				436	* Handle nowait, not much to do other than tell it to retry
				437	* and wait.
				438	*/
				439	ret = VM_FAULT_RETRY;
				440	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
				441	goto out;
				442
				443	/* take the reference before dropping the mmap_sem */
				444	userfaultfd_ctx_get(ctx);
				445
				446	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
				447	uwq.wq.private = current;
				448	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
				449	ctx->features);
				450	uwq.ctx = ctx;
				451	uwq.waken = false;
				452
				453	return_to_userland =
				454	(vmf->flags & (FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE)) ==
				455	(FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE);
				456	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
				457	TASK_KILLABLE;
				458
				459	spin_lock(&ctx->fault_pending_wqh.lock);
				460	/*
				461	* After the __add_wait_queue the uwq is visible to userland
				462	* through poll/read().
				463	*/
				464	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
				465	/*
				466	* The smp_mb() after __set_current_state prevents the reads
				467	* following the spin_unlock to happen before the list_add in
				468	* __add_wait_queue.
				469	*/
				470	set_current_state(blocking_state);
				471	spin_unlock(&ctx->fault_pending_wqh.lock);
				472
				473	if (!is_vm_hugetlb_page(vmf->vma))
				474	must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
				475	reason);
				476	else
				477	must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
				478	vmf->address,
				479	vmf->flags, reason);
				480	up_read(&mm->mmap_sem);
				481
				482	if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
				483	(return_to_userland ? !signal_pending(current) :
				484	!fatal_signal_pending(current)))) {
				485	wake_up_poll(&ctx->fd_wqh, POLLIN);
				486	schedule();
				487	ret \|= VM_FAULT_MAJOR;
				488
				489	/*
				490	* False wakeups can orginate even from rwsem before
				491	* up_read() however userfaults will wait either for a
				492	* targeted wakeup on the specific uwq waitqueue from
				493	* wake_userfault() or for signals or for uffd
				494	* release.
				495	*/
				496	while (!READ_ONCE(uwq.waken)) {
				497	/*
				498	* This needs the full smp_store_mb()
				499	* guarantee as the state write must be
				500	* visible to other CPUs before reading
				501	* uwq.waken from other CPUs.
				502	*/
				503	set_current_state(blocking_state);
				504	if (READ_ONCE(uwq.waken) \|\|
				505	READ_ONCE(ctx->released) \|\|
				506	(return_to_userland ? signal_pending(current) :
				507	fatal_signal_pending(current)))
				508	break;
				509	schedule();
				510	}
				511	}
				512
				513	__set_current_state(TASK_RUNNING);
				514
				515	if (return_to_userland) {
				516	if (signal_pending(current) &&
				517	!fatal_signal_pending(current)) {
				518	/*
				519	* If we got a SIGSTOP or SIGCONT and this is
				520	* a normal userland page fault, just let
				521	* userland return so the signal will be
				522	* handled and gdb debugging works. The page
				523	* fault code immediately after we return from
				524	* this function is going to release the
				525	* mmap_sem and it's not depending on it
				526	* (unlike gup would if we were not to return
				527	* VM_FAULT_RETRY).
				528	*
				529	* If a fatal signal is pending we still take
				530	* the streamlined VM_FAULT_RETRY failure path
				531	* and there's no need to retake the mmap_sem
				532	* in such case.
				533	*/
				534	down_read(&mm->mmap_sem);
				535	ret = VM_FAULT_NOPAGE;
				536	}
				537	}
				538
				539	/*
				540	* Here we race with the list_del; list_add in
				541	* userfaultfd_ctx_read(), however because we don't ever run
				542	* list_del_init() to refile across the two lists, the prev
				543	* and next pointers will never point to self. list_add also
				544	* would never let any of the two pointers to point to
				545	* self. So list_empty_careful won't risk to see both pointers
				546	* pointing to self at any time during the list refile. The
				547	* only case where list_del_init() is called is the full
				548	* removal in the wake function and there we don't re-list_add
				549	* and it's fine not to block on the spinlock. The uwq on this
				550	* kernel stack can be released after the list_del_init.
				551	*/
				552	if (!list_empty_careful(&uwq.wq.entry)) {
				553	spin_lock(&ctx->fault_pending_wqh.lock);
				554	/*
				555	* No need of list_del_init(), the uwq on the stack
				556	* will be freed shortly anyway.
				557	*/
				558	list_del(&uwq.wq.entry);
				559	spin_unlock(&ctx->fault_pending_wqh.lock);
				560	}
				561
				562	/*
				563	* ctx may go away after this if the userfault pseudo fd is
				564	* already released.
				565	*/
				566	userfaultfd_ctx_put(ctx);
				567
				568	out:
				569	return ret;
				570	}
				571
				572	static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
				573	struct userfaultfd_wait_queue *ewq)
				574	{
				575	struct userfaultfd_ctx *release_new_ctx;
				576
				577	if (WARN_ON_ONCE(current->flags & PF_EXITING))
				578	goto out;
				579
				580	ewq->ctx = ctx;
				581	init_waitqueue_entry(&ewq->wq, current);
				582	release_new_ctx = NULL;
				583
				584	spin_lock(&ctx->event_wqh.lock);
				585	/*
				586	* After the __add_wait_queue the uwq is visible to userland
				587	* through poll/read().
				588	*/
				589	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
				590	for (;;) {
				591	set_current_state(TASK_KILLABLE);
				592	if (ewq->msg.event == 0)
				593	break;
				594	if (ACCESS_ONCE(ctx->released) \|\|
				595	fatal_signal_pending(current)) {
				596	/*
				597	* &ewq->wq may be queued in fork_event, but
				598	* __remove_wait_queue ignores the head
				599	* parameter. It would be a problem if it
				600	* didn't.
				601	*/
				602	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				603	if (ewq->msg.event == UFFD_EVENT_FORK) {
				604	struct userfaultfd_ctx *new;
				605
				606	new = (struct userfaultfd_ctx *)
				607	(unsigned long)
				608	ewq->msg.arg.reserved.reserved1;
				609	release_new_ctx = new;
				610	}
				611	break;
				612	}
				613
				614	spin_unlock(&ctx->event_wqh.lock);
				615
				616	wake_up_poll(&ctx->fd_wqh, POLLIN);
				617	schedule();
				618
				619	spin_lock(&ctx->event_wqh.lock);
				620	}
				621	__set_current_state(TASK_RUNNING);
				622	spin_unlock(&ctx->event_wqh.lock);
				623
				624	if (release_new_ctx) {
				625	struct vm_area_struct *vma;
				626	struct mm_struct *mm = release_new_ctx->mm;
				627
				628	/* the various vma->vm_userfaultfd_ctx still points to it */
				629	down_write(&mm->mmap_sem);
				630	/* no task can run (and in turn coredump) yet */
				631	VM_WARN_ON(!mmget_still_valid(mm));
				632	for (vma = mm->mmap; vma; vma = vma->vm_next)
				633	if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				634	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				635	vma->vm_flags &= ~(VM_UFFD_WP \| VM_UFFD_MISSING);
				636	}
				637	up_write(&mm->mmap_sem);
				638
				639	userfaultfd_ctx_put(release_new_ctx);
				640	}
				641
				642	/*
				643	* ctx may go away after this if the userfault pseudo fd is
				644	* already released.
				645	*/
				646	out:
				647	userfaultfd_ctx_put(ctx);
				648	}
				649
				650	static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				651	struct userfaultfd_wait_queue *ewq)
				652	{
				653	ewq->msg.event = 0;
				654	wake_up_locked(&ctx->event_wqh);
				655	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				656	}
				657
				658	int dup_userfaultfd(struct vm_area_struct vma, struct list_head fcs)
				659	{
				660	struct userfaultfd_ctx ctx = NULL, octx;
				661	struct userfaultfd_fork_ctx *fctx;
				662
				663	octx = vma->vm_userfaultfd_ctx.ctx;
				664	if (!octx \|\| !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
				665	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				666	vma->vm_flags &= ~(VM_UFFD_WP \| VM_UFFD_MISSING);
				667	return 0;
				668	}
				669
				670	list_for_each_entry(fctx, fcs, list)
				671	if (fctx->orig == octx) {
				672	ctx = fctx->new;
				673	break;
				674	}
				675
				676	if (!ctx) {
				677	fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
				678	if (!fctx)
				679	return -ENOMEM;
				680
				681	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				682	if (!ctx) {
				683	kfree(fctx);
				684	return -ENOMEM;
				685	}
				686
				687	atomic_set(&ctx->refcount, 1);
				688	ctx->flags = octx->flags;
				689	ctx->state = UFFD_STATE_RUNNING;
				690	ctx->features = octx->features;
				691	ctx->released = false;
				692	ctx->mm = vma->vm_mm;
				693	atomic_inc(&ctx->mm->mm_count);
				694
				695	userfaultfd_ctx_get(octx);
				696	fctx->orig = octx;
				697	fctx->new = ctx;
				698	list_add_tail(&fctx->list, fcs);
				699	}
				700
				701	vma->vm_userfaultfd_ctx.ctx = ctx;
				702	return 0;
				703	}
				704
				705	static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
				706	{
				707	struct userfaultfd_ctx *ctx = fctx->orig;
				708	struct userfaultfd_wait_queue ewq;
				709
				710	msg_init(&ewq.msg);
				711
				712	ewq.msg.event = UFFD_EVENT_FORK;
				713	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
				714
				715	userfaultfd_event_wait_completion(ctx, &ewq);
				716	}
				717
				718	void dup_userfaultfd_complete(struct list_head *fcs)
				719	{
				720	struct userfaultfd_fork_ctx fctx, n;
				721
				722	list_for_each_entry_safe(fctx, n, fcs, list) {
				723	dup_fctx(fctx);
				724	list_del(&fctx->list);
				725	kfree(fctx);
				726	}
				727	}
				728
				729	void mremap_userfaultfd_prep(struct vm_area_struct *vma,
				730	struct vm_userfaultfd_ctx *vm_ctx)
				731	{
				732	struct userfaultfd_ctx *ctx;
				733
				734	ctx = vma->vm_userfaultfd_ctx.ctx;
				735	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
				736	vm_ctx->ctx = ctx;
				737	userfaultfd_ctx_get(ctx);
				738	}
				739	}
				740
				741	void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				742	unsigned long from, unsigned long to,
				743	unsigned long len)
				744	{
				745	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
				746	struct userfaultfd_wait_queue ewq;
				747
				748	if (!ctx)
				749	return;
				750
				751	if (to & ~PAGE_MASK) {
				752	userfaultfd_ctx_put(ctx);
				753	return;
				754	}
				755
				756	msg_init(&ewq.msg);
				757
				758	ewq.msg.event = UFFD_EVENT_REMAP;
				759	ewq.msg.arg.remap.from = from;
				760	ewq.msg.arg.remap.to = to;
				761	ewq.msg.arg.remap.len = len;
				762
				763	userfaultfd_event_wait_completion(ctx, &ewq);
				764	}
				765
				766	bool userfaultfd_remove(struct vm_area_struct *vma,
				767	unsigned long start, unsigned long end)
				768	{
				769	struct mm_struct *mm = vma->vm_mm;
				770	struct userfaultfd_ctx *ctx;
				771	struct userfaultfd_wait_queue ewq;
				772
				773	ctx = vma->vm_userfaultfd_ctx.ctx;
				774	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
				775	return true;
				776
				777	userfaultfd_ctx_get(ctx);
				778	up_read(&mm->mmap_sem);
				779
				780	msg_init(&ewq.msg);
				781
				782	ewq.msg.event = UFFD_EVENT_REMOVE;
				783	ewq.msg.arg.remove.start = start;
				784	ewq.msg.arg.remove.end = end;
				785
				786	userfaultfd_event_wait_completion(ctx, &ewq);
				787
				788	return false;
				789	}
				790
				791	static bool has_unmap_ctx(struct userfaultfd_ctx ctx, struct list_head unmaps,
				792	unsigned long start, unsigned long end)
				793	{
				794	struct userfaultfd_unmap_ctx *unmap_ctx;
				795
				796	list_for_each_entry(unmap_ctx, unmaps, list)
				797	if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
				798	unmap_ctx->end == end)
				799	return true;
				800
				801	return false;
				802	}
				803
				804	int userfaultfd_unmap_prep(struct vm_area_struct *vma,
				805	unsigned long start, unsigned long end,
				806	struct list_head *unmaps)
				807	{
				808	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
				809	struct userfaultfd_unmap_ctx *unmap_ctx;
				810	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
				811
				812	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) \|\|
				813	has_unmap_ctx(ctx, unmaps, start, end))
				814	continue;
				815
				816	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
				817	if (!unmap_ctx)
				818	return -ENOMEM;
				819
				820	userfaultfd_ctx_get(ctx);
				821	unmap_ctx->ctx = ctx;
				822	unmap_ctx->start = start;
				823	unmap_ctx->end = end;
				824	list_add_tail(&unmap_ctx->list, unmaps);
				825	}
				826
				827	return 0;
				828	}
				829
				830	void userfaultfd_unmap_complete(struct mm_struct mm, struct list_head uf)
				831	{
				832	struct userfaultfd_unmap_ctx ctx, n;
				833	struct userfaultfd_wait_queue ewq;
				834
				835	list_for_each_entry_safe(ctx, n, uf, list) {
				836	msg_init(&ewq.msg);
				837
				838	ewq.msg.event = UFFD_EVENT_UNMAP;
				839	ewq.msg.arg.remove.start = ctx->start;
				840	ewq.msg.arg.remove.end = ctx->end;
				841
				842	userfaultfd_event_wait_completion(ctx->ctx, &ewq);
				843
				844	list_del(&ctx->list);
				845	kfree(ctx);
				846	}
				847	}
				848
				849	static int userfaultfd_release(struct inode inode, struct file file)
				850	{
				851	struct userfaultfd_ctx *ctx = file->private_data;
				852	struct mm_struct *mm = ctx->mm;
				853	struct vm_area_struct vma, prev;
				854	/* len == 0 means wake all */
				855	struct userfaultfd_wake_range range = { .len = 0, };
				856	unsigned long new_flags;
				857	bool still_valid;
				858
				859	ACCESS_ONCE(ctx->released) = true;
				860
				861	if (!mmget_not_zero(mm))
				862	goto wakeup;
				863
				864	/*
				865	* Flush page faults out of all CPUs. NOTE: all page faults
				866	* must be retried without returning VM_FAULT_SIGBUS if
				867	* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
				868	* changes while handle_userfault released the mmap_sem. So
				869	* it's critical that released is set to true (above), before
				870	* taking the mmap_sem for writing.
				871	*/
				872	down_write(&mm->mmap_sem);
				873	still_valid = mmget_still_valid(mm);
				874	prev = NULL;
				875	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				876	cond_resched();
				877	BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
				878	!!(vma->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				879	if (vma->vm_userfaultfd_ctx.ctx != ctx) {
				880	prev = vma;
				881	continue;
				882	}
				883	new_flags = vma->vm_flags & ~(VM_UFFD_MISSING \| VM_UFFD_WP);
				884	if (still_valid) {
				885	prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				886	new_flags, vma->anon_vma,
				887	vma->vm_file, vma->vm_pgoff,
				888	vma_policy(vma),
				889	NULL_VM_UFFD_CTX,
				890	vma_get_anon_name(vma));
				891	if (prev)
				892	vma = prev;
				893	else
				894	prev = vma;
				895	}
				896	vma->vm_flags = new_flags;
				897	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				898	}
				899	up_write(&mm->mmap_sem);
				900	mmput(mm);
				901	wakeup:
				902	/*
				903	* After no new page faults can wait on this fault_*wqh, flush
				904	* the last page faults that may have been already waiting on
				905	* the fault_*wqh.
				906	*/
				907	spin_lock(&ctx->fault_pending_wqh.lock);
				908	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
				909	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
				910	spin_unlock(&ctx->fault_pending_wqh.lock);
				911
				912	/* Flush pending events that may still wait on event_wqh */
				913	wake_up_all(&ctx->event_wqh);
				914
				915	wake_up_poll(&ctx->fd_wqh, POLLHUP);
				916	userfaultfd_ctx_put(ctx);
				917	return 0;
				918	}
				919
				920	/* fault_pending_wqh.lock must be hold by the caller */
				921	static inline struct userfaultfd_wait_queue *find_userfault_in(
				922	wait_queue_head_t *wqh)
				923	{
				924	wait_queue_entry_t *wq;
				925	struct userfaultfd_wait_queue *uwq;
				926
				927	VM_BUG_ON(!spin_is_locked(&wqh->lock));
				928
				929	uwq = NULL;
				930	if (!waitqueue_active(wqh))
				931	goto out;
				932	/* walk in reverse to provide FIFO behavior to read userfaults */
				933	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
				934	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				935	out:
				936	return uwq;
				937	}
				938
				939	static inline struct userfaultfd_wait_queue *find_userfault(
				940	struct userfaultfd_ctx *ctx)
				941	{
				942	return find_userfault_in(&ctx->fault_pending_wqh);
				943	}
				944
				945	static inline struct userfaultfd_wait_queue *find_userfault_evt(
				946	struct userfaultfd_ctx *ctx)
				947	{
				948	return find_userfault_in(&ctx->event_wqh);
				949	}
				950
				951	static unsigned int userfaultfd_poll(struct file file, poll_table wait)
				952	{
				953	struct userfaultfd_ctx *ctx = file->private_data;
				954	unsigned int ret;
				955
				956	poll_wait(file, &ctx->fd_wqh, wait);
				957
				958	switch (ctx->state) {
				959	case UFFD_STATE_WAIT_API:
				960	return POLLERR;
				961	case UFFD_STATE_RUNNING:
				962	/*
				963	* poll() never guarantees that read won't block.
				964	* userfaults can be waken before they're read().
				965	*/
				966	if (unlikely(!(file->f_flags & O_NONBLOCK)))
				967	return POLLERR;
				968	/*
				969	* lockless access to see if there are pending faults
				970	* __pollwait last action is the add_wait_queue but
				971	* the spin_unlock would allow the waitqueue_active to
				972	* pass above the actual list_add inside
				973	* add_wait_queue critical section. So use a full
				974	* memory barrier to serialize the list_add write of
				975	* add_wait_queue() with the waitqueue_active read
				976	* below.
				977	*/
				978	ret = 0;
				979	smp_mb();
				980	if (waitqueue_active(&ctx->fault_pending_wqh))
				981	ret = POLLIN;
				982	else if (waitqueue_active(&ctx->event_wqh))
				983	ret = POLLIN;
				984
				985	return ret;
				986	default:
				987	WARN_ON_ONCE(1);
				988	return POLLERR;
				989	}
				990	}
				991
				992	static const struct file_operations userfaultfd_fops;
				993
				994	static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				995	struct userfaultfd_ctx *new,
				996	struct uffd_msg *msg)
				997	{
				998	int fd;
				999	struct file *file;
				1000	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
				1001
				1002	fd = get_unused_fd_flags(flags);
				1003	if (fd < 0)
				1004	return fd;
				1005
				1006	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
				1007	O_RDWR \| flags);
				1008	if (IS_ERR(file)) {
				1009	put_unused_fd(fd);
				1010	return PTR_ERR(file);
				1011	}
				1012
				1013	fd_install(fd, file);
				1014	msg->arg.reserved.reserved1 = 0;
				1015	msg->arg.fork.ufd = fd;
				1016
				1017	return 0;
				1018	}
				1019
				1020	static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				1021	struct uffd_msg *msg)
				1022	{
				1023	ssize_t ret;
				1024	DECLARE_WAITQUEUE(wait, current);
				1025	struct userfaultfd_wait_queue *uwq;
				1026	/*
				1027	* Handling fork event requires sleeping operations, so
				1028	* we drop the event_wqh lock, then do these ops, then
				1029	* lock it back and wake up the waiter. While the lock is
				1030	* dropped the ewq may go away so we keep track of it
				1031	* carefully.
				1032	*/
				1033	LIST_HEAD(fork_event);
				1034	struct userfaultfd_ctx *fork_nctx = NULL;
				1035
				1036	/* always take the fd_wqh lock before the fault_pending_wqh lock */
				1037	spin_lock(&ctx->fd_wqh.lock);
				1038	__add_wait_queue(&ctx->fd_wqh, &wait);
				1039	for (;;) {
				1040	set_current_state(TASK_INTERRUPTIBLE);
				1041	spin_lock(&ctx->fault_pending_wqh.lock);
				1042	uwq = find_userfault(ctx);
				1043	if (uwq) {
				1044	/*
				1045	* Use a seqcount to repeat the lockless check
				1046	* in wake_userfault() to avoid missing
				1047	* wakeups because during the refile both
				1048	* waitqueue could become empty if this is the
				1049	* only userfault.
				1050	*/
				1051	write_seqcount_begin(&ctx->refile_seq);
				1052
				1053	/*
				1054	* The fault_pending_wqh.lock prevents the uwq
				1055	* to disappear from under us.
				1056	*
				1057	* Refile this userfault from
				1058	* fault_pending_wqh to fault_wqh, it's not
				1059	* pending anymore after we read it.
				1060	*
				1061	* Use list_del() by hand (as
				1062	* userfaultfd_wake_function also uses
				1063	* list_del_init() by hand) to be sure nobody
				1064	* changes __remove_wait_queue() to use
				1065	* list_del_init() in turn breaking the
				1066	* !list_empty_careful() check in
				1067	* handle_userfault(). The uwq->wq.head list
				1068	* must never be empty at any time during the
				1069	* refile, or the waitqueue could disappear
				1070	* from under us. The "wait_queue_head_t"
				1071	* parameter of __remove_wait_queue() is unused
				1072	* anyway.
				1073	*/
				1074	list_del(&uwq->wq.entry);
				1075	__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
				1076
				1077	write_seqcount_end(&ctx->refile_seq);
				1078
				1079	/* careful to always initialize msg if ret == 0 */
				1080	*msg = uwq->msg;
				1081	spin_unlock(&ctx->fault_pending_wqh.lock);
				1082	ret = 0;
				1083	break;
				1084	}
				1085	spin_unlock(&ctx->fault_pending_wqh.lock);
				1086
				1087	spin_lock(&ctx->event_wqh.lock);
				1088	uwq = find_userfault_evt(ctx);
				1089	if (uwq) {
				1090	*msg = uwq->msg;
				1091
				1092	if (uwq->msg.event == UFFD_EVENT_FORK) {
				1093	fork_nctx = (struct userfaultfd_ctx *)
				1094	(unsigned long)
				1095	uwq->msg.arg.reserved.reserved1;
				1096	list_move(&uwq->wq.entry, &fork_event);
				1097	/*
				1098	* fork_nctx can be freed as soon as
				1099	* we drop the lock, unless we take a
				1100	* reference on it.
				1101	*/
				1102	userfaultfd_ctx_get(fork_nctx);
				1103	spin_unlock(&ctx->event_wqh.lock);
				1104	ret = 0;
				1105	break;
				1106	}
				1107
				1108	userfaultfd_event_complete(ctx, uwq);
				1109	spin_unlock(&ctx->event_wqh.lock);
				1110	ret = 0;
				1111	break;
				1112	}
				1113	spin_unlock(&ctx->event_wqh.lock);
				1114
				1115	if (signal_pending(current)) {
				1116	ret = -ERESTARTSYS;
				1117	break;
				1118	}
				1119	if (no_wait) {
				1120	ret = -EAGAIN;
				1121	break;
				1122	}
				1123	spin_unlock(&ctx->fd_wqh.lock);
				1124	schedule();
				1125	spin_lock(&ctx->fd_wqh.lock);
				1126	}
				1127	__remove_wait_queue(&ctx->fd_wqh, &wait);
				1128	__set_current_state(TASK_RUNNING);
				1129	spin_unlock(&ctx->fd_wqh.lock);
				1130
				1131	if (!ret && msg->event == UFFD_EVENT_FORK) {
				1132	ret = resolve_userfault_fork(ctx, fork_nctx, msg);
				1133	spin_lock(&ctx->event_wqh.lock);
				1134	if (!list_empty(&fork_event)) {
				1135	/*
				1136	* The fork thread didn't abort, so we can
				1137	* drop the temporary refcount.
				1138	*/
				1139	userfaultfd_ctx_put(fork_nctx);
				1140
				1141	uwq = list_first_entry(&fork_event,
				1142	typeof(*uwq),
				1143	wq.entry);
				1144	/*
				1145	* If fork_event list wasn't empty and in turn
				1146	* the event wasn't already released by fork
				1147	* (the event is allocated on fork kernel
				1148	* stack), put the event back to its place in
				1149	* the event_wq. fork_event head will be freed
				1150	* as soon as we return so the event cannot
				1151	* stay queued there no matter the current
				1152	* "ret" value.
				1153	*/
				1154	list_del(&uwq->wq.entry);
				1155	__add_wait_queue(&ctx->event_wqh, &uwq->wq);
				1156
				1157	/*
				1158	* Leave the event in the waitqueue and report
				1159	* error to userland if we failed to resolve
				1160	* the userfault fork.
				1161	*/
				1162	if (likely(!ret))
				1163	userfaultfd_event_complete(ctx, uwq);
				1164	} else {
				1165	/*
				1166	* Here the fork thread aborted and the
				1167	* refcount from the fork thread on fork_nctx
				1168	* has already been released. We still hold
				1169	* the reference we took before releasing the
				1170	* lock above. If resolve_userfault_fork
				1171	* failed we've to drop it because the
				1172	* fork_nctx has to be freed in such case. If
				1173	* it succeeded we'll hold it because the new
				1174	* uffd references it.
				1175	*/
				1176	if (ret)
				1177	userfaultfd_ctx_put(fork_nctx);
				1178	}
				1179	spin_unlock(&ctx->event_wqh.lock);
				1180	}
				1181
				1182	return ret;
				1183	}
				1184
				1185	static ssize_t userfaultfd_read(struct file file, char __user buf,
				1186	size_t count, loff_t *ppos)
				1187	{
				1188	struct userfaultfd_ctx *ctx = file->private_data;
				1189	ssize_t _ret, ret = 0;
				1190	struct uffd_msg msg;
				1191	int no_wait = file->f_flags & O_NONBLOCK;
				1192
				1193	if (ctx->state == UFFD_STATE_WAIT_API)
				1194	return -EINVAL;
				1195
				1196	for (;;) {
				1197	if (count < sizeof(msg))
				1198	return ret ? ret : -EINVAL;
				1199	_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
				1200	if (_ret < 0)
				1201	return ret ? ret : _ret;
				1202	if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
				1203	return ret ? ret : -EFAULT;
				1204	ret += sizeof(msg);
				1205	buf += sizeof(msg);
				1206	count -= sizeof(msg);
				1207	/*
				1208	* Allow to read more than one fault at time but only
				1209	* block if waiting for the very first one.
				1210	*/
				1211	no_wait = O_NONBLOCK;
				1212	}
				1213	}
				1214
				1215	static void __wake_userfault(struct userfaultfd_ctx *ctx,
				1216	struct userfaultfd_wake_range *range)
				1217	{
				1218	spin_lock(&ctx->fault_pending_wqh.lock);
				1219	/* wake all in the range and autoremove */
				1220	if (waitqueue_active(&ctx->fault_pending_wqh))
				1221	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				1222	range);
				1223	if (waitqueue_active(&ctx->fault_wqh))
				1224	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
				1225	spin_unlock(&ctx->fault_pending_wqh.lock);
				1226	}
				1227
				1228	static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
				1229	struct userfaultfd_wake_range *range)
				1230	{
				1231	unsigned seq;
				1232	bool need_wakeup;
				1233
				1234	/*
				1235	* To be sure waitqueue_active() is not reordered by the CPU
				1236	* before the pagetable update, use an explicit SMP memory
				1237	* barrier here. PT lock release or up_read(mmap_sem) still
				1238	* have release semantics that can allow the
				1239	* waitqueue_active() to be reordered before the pte update.
				1240	*/
				1241	smp_mb();
				1242
				1243	/*
				1244	* Use waitqueue_active because it's very frequent to
				1245	* change the address space atomically even if there are no
				1246	* userfaults yet. So we take the spinlock only when we're
				1247	* sure we've userfaults to wake.
				1248	*/
				1249	do {
				1250	seq = read_seqcount_begin(&ctx->refile_seq);
				1251	need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) \|\|
				1252	waitqueue_active(&ctx->fault_wqh);
				1253	cond_resched();
				1254	} while (read_seqcount_retry(&ctx->refile_seq, seq));
				1255	if (need_wakeup)
				1256	__wake_userfault(ctx, range);
				1257	}
				1258
				1259	static __always_inline int validate_range(struct mm_struct *mm,
				1260	__u64 start, __u64 len)
				1261	{
				1262	__u64 task_size = mm->task_size;
				1263
				1264	if (start & ~PAGE_MASK)
				1265	return -EINVAL;
				1266	if (len & ~PAGE_MASK)
				1267	return -EINVAL;
				1268	if (!len)
				1269	return -EINVAL;
				1270	if (start < mmap_min_addr)
				1271	return -EINVAL;
				1272	if (start >= task_size)
				1273	return -EINVAL;
				1274	if (len > task_size - start)
				1275	return -EINVAL;
				1276	return 0;
				1277	}
				1278
				1279	static inline bool vma_can_userfault(struct vm_area_struct *vma)
				1280	{
				1281	return vma_is_anonymous(vma) \|\| is_vm_hugetlb_page(vma) \|\|
				1282	vma_is_shmem(vma);
				1283	}
				1284
				1285	static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				1286	unsigned long arg)
				1287	{
				1288	struct mm_struct *mm = ctx->mm;
				1289	struct vm_area_struct vma, prev, *cur;
				1290	int ret;
				1291	struct uffdio_register uffdio_register;
				1292	struct uffdio_register __user *user_uffdio_register;
				1293	unsigned long vm_flags, new_flags;
				1294	bool found;
				1295	bool basic_ioctls;
				1296	unsigned long start, end, vma_end;
				1297
				1298	user_uffdio_register = (struct uffdio_register __user *) arg;
				1299
				1300	ret = -EFAULT;
				1301	if (copy_from_user(&uffdio_register, user_uffdio_register,
				1302	sizeof(uffdio_register)-sizeof(__u64)))
				1303	goto out;
				1304
				1305	ret = -EINVAL;
				1306	if (!uffdio_register.mode)
				1307	goto out;
				1308	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING\|
				1309	UFFDIO_REGISTER_MODE_WP))
				1310	goto out;
				1311	vm_flags = 0;
				1312	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
				1313	vm_flags \|= VM_UFFD_MISSING;
				1314	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
				1315	vm_flags \|= VM_UFFD_WP;
				1316	/*
				1317	* FIXME: remove the below error constraint by
				1318	* implementing the wprotect tracking mode.
				1319	*/
				1320	ret = -EINVAL;
				1321	goto out;
				1322	}
				1323
				1324	ret = validate_range(mm, uffdio_register.range.start,
				1325	uffdio_register.range.len);
				1326	if (ret)
				1327	goto out;
				1328
				1329	start = uffdio_register.range.start;
				1330	end = start + uffdio_register.range.len;
				1331
				1332	ret = -ENOMEM;
				1333	if (!mmget_not_zero(mm))
				1334	goto out;
				1335
				1336	down_write(&mm->mmap_sem);
				1337	if (!mmget_still_valid(mm))
				1338	goto out_unlock;
				1339	vma = find_vma_prev(mm, start, &prev);
				1340	if (!vma)
				1341	goto out_unlock;
				1342
				1343	/* check that there's at least one vma in the range */
				1344	ret = -EINVAL;
				1345	if (vma->vm_start >= end)
				1346	goto out_unlock;
				1347
				1348	/*
				1349	* If the first vma contains huge pages, make sure start address
				1350	* is aligned to huge page size.
				1351	*/
				1352	if (is_vm_hugetlb_page(vma)) {
				1353	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1354
				1355	if (start & (vma_hpagesize - 1))
				1356	goto out_unlock;
				1357	}
				1358
				1359	/*
				1360	* Search for not compatible vmas.
				1361	*/
				1362	found = false;
				1363	basic_ioctls = false;
				1364	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1365	cond_resched();
				1366
				1367	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1368	!!(cur->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				1369
				1370	/* check not compatible vmas */
				1371	ret = -EINVAL;
				1372	if (!vma_can_userfault(cur))
				1373	goto out_unlock;
				1374
				1375	/*
				1376	* UFFDIO_COPY will fill file holes even without
				1377	* PROT_WRITE. This check enforces that if this is a
				1378	* MAP_SHARED, the process has write permission to the backing
				1379	* file. If VM_MAYWRITE is set it also enforces that on a
				1380	* MAP_SHARED vma: there is no F_WRITE_SEAL and no further
				1381	* F_WRITE_SEAL can be taken until the vma is destroyed.
				1382	*/
				1383	ret = -EPERM;
				1384	if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
				1385	goto out_unlock;
				1386
				1387	/*
				1388	* If this vma contains ending address, and huge pages
				1389	* check alignment.
				1390	*/
				1391	if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
				1392	end > cur->vm_start) {
				1393	unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
				1394
				1395	ret = -EINVAL;
				1396
				1397	if (end & (vma_hpagesize - 1))
				1398	goto out_unlock;
				1399	}
				1400
				1401	/*
				1402	* Check that this vma isn't already owned by a
				1403	* different userfaultfd. We can't allow more than one
				1404	* userfaultfd to own a single vma simultaneously or we
				1405	* wouldn't know which one to deliver the userfaults to.
				1406	*/
				1407	ret = -EBUSY;
				1408	if (cur->vm_userfaultfd_ctx.ctx &&
				1409	cur->vm_userfaultfd_ctx.ctx != ctx)
				1410	goto out_unlock;
				1411
				1412	/*
				1413	* Note vmas containing huge pages
				1414	*/
				1415	if (is_vm_hugetlb_page(cur))
				1416	basic_ioctls = true;
				1417
				1418	found = true;
				1419	}
				1420	BUG_ON(!found);
				1421
				1422	if (vma->vm_start < start)
				1423	prev = vma;
				1424
				1425	ret = 0;
				1426	do {
				1427	cond_resched();
				1428
				1429	BUG_ON(!vma_can_userfault(vma));
				1430	BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
				1431	vma->vm_userfaultfd_ctx.ctx != ctx);
				1432	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1433
				1434	/*
				1435	* Nothing to do: this vma is already registered into this
				1436	* userfaultfd and with the right tracking mode too.
				1437	*/
				1438	if (vma->vm_userfaultfd_ctx.ctx == ctx &&
				1439	(vma->vm_flags & vm_flags) == vm_flags)
				1440	goto skip;
				1441
				1442	if (vma->vm_start > start)
				1443	start = vma->vm_start;
				1444	vma_end = min(end, vma->vm_end);
				1445
				1446	new_flags = (vma->vm_flags & ~vm_flags) \| vm_flags;
				1447	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1448	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1449	vma_policy(vma),
				1450	((struct vm_userfaultfd_ctx){ ctx }),
				1451	vma_get_anon_name(vma));
				1452	if (prev) {
				1453	vma = prev;
				1454	goto next;
				1455	}
				1456	if (vma->vm_start < start) {
				1457	ret = split_vma(mm, vma, start, 1);
				1458	if (ret)
				1459	break;
				1460	}
				1461	if (vma->vm_end > end) {
				1462	ret = split_vma(mm, vma, end, 0);
				1463	if (ret)
				1464	break;
				1465	}
				1466	next:
				1467	/*
				1468	* In the vma_merge() successful mprotect-like case 8:
				1469	* the next vma was merged into the current one and
				1470	* the current one has not been updated yet.
				1471	*/
				1472	vma->vm_flags = new_flags;
				1473	vma->vm_userfaultfd_ctx.ctx = ctx;
				1474
				1475	skip:
				1476	prev = vma;
				1477	start = vma->vm_end;
				1478	vma = vma->vm_next;
				1479	} while (vma && vma->vm_start < end);
				1480	out_unlock:
				1481	up_write(&mm->mmap_sem);
				1482	mmput(mm);
				1483	if (!ret) {
				1484	/*
				1485	* Now that we scanned all vmas we can already tell
				1486	* userland which ioctls methods are guaranteed to
				1487	* succeed on this range.
				1488	*/
				1489	if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
				1490	UFFD_API_RANGE_IOCTLS,
				1491	&user_uffdio_register->ioctls))
				1492	ret = -EFAULT;
				1493	}
				1494	out:
				1495	return ret;
				1496	}
				1497
				1498	static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				1499	unsigned long arg)
				1500	{
				1501	struct mm_struct *mm = ctx->mm;
				1502	struct vm_area_struct vma, prev, *cur;
				1503	int ret;
				1504	struct uffdio_range uffdio_unregister;
				1505	unsigned long new_flags;
				1506	bool found;
				1507	unsigned long start, end, vma_end;
				1508	const void __user buf = (void __user )arg;
				1509
				1510	ret = -EFAULT;
				1511	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
				1512	goto out;
				1513
				1514	ret = validate_range(mm, uffdio_unregister.start,
				1515	uffdio_unregister.len);
				1516	if (ret)
				1517	goto out;
				1518
				1519	start = uffdio_unregister.start;
				1520	end = start + uffdio_unregister.len;
				1521
				1522	ret = -ENOMEM;
				1523	if (!mmget_not_zero(mm))
				1524	goto out;
				1525
				1526	down_write(&mm->mmap_sem);
				1527	if (!mmget_still_valid(mm))
				1528	goto out_unlock;
				1529	vma = find_vma_prev(mm, start, &prev);
				1530	if (!vma)
				1531	goto out_unlock;
				1532
				1533	/* check that there's at least one vma in the range */
				1534	ret = -EINVAL;
				1535	if (vma->vm_start >= end)
				1536	goto out_unlock;
				1537
				1538	/*
				1539	* If the first vma contains huge pages, make sure start address
				1540	* is aligned to huge page size.
				1541	*/
				1542	if (is_vm_hugetlb_page(vma)) {
				1543	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1544
				1545	if (start & (vma_hpagesize - 1))
				1546	goto out_unlock;
				1547	}
				1548
				1549	/*
				1550	* Search for not compatible vmas.
				1551	*/
				1552	found = false;
				1553	ret = -EINVAL;
				1554	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1555	cond_resched();
				1556
				1557	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1558	!!(cur->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				1559
				1560	/*
				1561	* Check not compatible vmas, not strictly required
				1562	* here as not compatible vmas cannot have an
				1563	* userfaultfd_ctx registered on them, but this
				1564	* provides for more strict behavior to notice
				1565	* unregistration errors.
				1566	*/
				1567	if (!vma_can_userfault(cur))
				1568	goto out_unlock;
				1569
				1570	found = true;
				1571	}
				1572	BUG_ON(!found);
				1573
				1574	if (vma->vm_start < start)
				1575	prev = vma;
				1576
				1577	ret = 0;
				1578	do {
				1579	cond_resched();
				1580
				1581	BUG_ON(!vma_can_userfault(vma));
				1582
				1583	/*
				1584	* Nothing to do: this vma is already registered into this
				1585	* userfaultfd and with the right tracking mode too.
				1586	*/
				1587	if (!vma->vm_userfaultfd_ctx.ctx)
				1588	goto skip;
				1589
				1590	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1591
				1592	if (vma->vm_start > start)
				1593	start = vma->vm_start;
				1594	vma_end = min(end, vma->vm_end);
				1595
				1596	if (userfaultfd_missing(vma)) {
				1597	/*
				1598	* Wake any concurrent pending userfault while
				1599	* we unregister, so they will not hang
				1600	* permanently and it avoids userland to call
				1601	* UFFDIO_WAKE explicitly.
				1602	*/
				1603	struct userfaultfd_wake_range range;
				1604	range.start = start;
				1605	range.len = vma_end - start;
				1606	wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
				1607	}
				1608
				1609	new_flags = vma->vm_flags & ~(VM_UFFD_MISSING \| VM_UFFD_WP);
				1610	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1611	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1612	vma_policy(vma),
				1613	NULL_VM_UFFD_CTX,
				1614	vma_get_anon_name(vma));
				1615	if (prev) {
				1616	vma = prev;
				1617	goto next;
				1618	}
				1619	if (vma->vm_start < start) {
				1620	ret = split_vma(mm, vma, start, 1);
				1621	if (ret)
				1622	break;
				1623	}
				1624	if (vma->vm_end > end) {
				1625	ret = split_vma(mm, vma, end, 0);
				1626	if (ret)
				1627	break;
				1628	}
				1629	next:
				1630	/*
				1631	* In the vma_merge() successful mprotect-like case 8:
				1632	* the next vma was merged into the current one and
				1633	* the current one has not been updated yet.
				1634	*/
				1635	vma->vm_flags = new_flags;
				1636	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				1637
				1638	skip:
				1639	prev = vma;
				1640	start = vma->vm_end;
				1641	vma = vma->vm_next;
				1642	} while (vma && vma->vm_start < end);
				1643	out_unlock:
				1644	up_write(&mm->mmap_sem);
				1645	mmput(mm);
				1646	out:
				1647	return ret;
				1648	}
				1649
				1650	/*
				1651	* userfaultfd_wake may be used in combination with the
				1652	* UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
				1653	*/
				1654	static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
				1655	unsigned long arg)
				1656	{
				1657	int ret;
				1658	struct uffdio_range uffdio_wake;
				1659	struct userfaultfd_wake_range range;
				1660	const void __user buf = (void __user )arg;
				1661
				1662	ret = -EFAULT;
				1663	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
				1664	goto out;
				1665
				1666	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
				1667	if (ret)
				1668	goto out;
				1669
				1670	range.start = uffdio_wake.start;
				1671	range.len = uffdio_wake.len;
				1672
				1673	/*
				1674	* len == 0 means wake all and we don't want to wake all here,
				1675	* so check it again to be sure.
				1676	*/
				1677	VM_BUG_ON(!range.len);
				1678
				1679	wake_userfault(ctx, &range);
				1680	ret = 0;
				1681
				1682	out:
				1683	return ret;
				1684	}
				1685
				1686	static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
				1687	unsigned long arg)
				1688	{
				1689	__s64 ret;
				1690	struct uffdio_copy uffdio_copy;
				1691	struct uffdio_copy __user *user_uffdio_copy;
				1692	struct userfaultfd_wake_range range;
				1693
				1694	user_uffdio_copy = (struct uffdio_copy __user *) arg;
				1695
				1696	ret = -EFAULT;
				1697	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
				1698	/* don't copy "copy" last field */
				1699	sizeof(uffdio_copy)-sizeof(__s64)))
				1700	goto out;
				1701
				1702	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
				1703	if (ret)
				1704	goto out;
				1705	/*
				1706	* double check for wraparound just in case. copy_from_user()
				1707	* will later check uffdio_copy.src + uffdio_copy.len to fit
				1708	* in the userland range.
				1709	*/
				1710	ret = -EINVAL;
				1711	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
				1712	goto out;
				1713	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
				1714	goto out;
				1715	if (mmget_not_zero(ctx->mm)) {
				1716	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				1717	uffdio_copy.len);
				1718	mmput(ctx->mm);
				1719	} else {
				1720	return -ESRCH;
				1721	}
				1722	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
				1723	return -EFAULT;
				1724	if (ret < 0)
				1725	goto out;
				1726	BUG_ON(!ret);
				1727	/* len == 0 would wake all */
				1728	range.len = ret;
				1729	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
				1730	range.start = uffdio_copy.dst;
				1731	wake_userfault(ctx, &range);
				1732	}
				1733	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
				1734	out:
				1735	return ret;
				1736	}
				1737
				1738	static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				1739	unsigned long arg)
				1740	{
				1741	__s64 ret;
				1742	struct uffdio_zeropage uffdio_zeropage;
				1743	struct uffdio_zeropage __user *user_uffdio_zeropage;
				1744	struct userfaultfd_wake_range range;
				1745
				1746	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
				1747
				1748	ret = -EFAULT;
				1749	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
				1750	/* don't copy "zeropage" last field */
				1751	sizeof(uffdio_zeropage)-sizeof(__s64)))
				1752	goto out;
				1753
				1754	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
				1755	uffdio_zeropage.range.len);
				1756	if (ret)
				1757	goto out;
				1758	ret = -EINVAL;
				1759	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
				1760	goto out;
				1761
				1762	if (mmget_not_zero(ctx->mm)) {
				1763	ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				1764	uffdio_zeropage.range.len);
				1765	mmput(ctx->mm);
				1766	} else {
				1767	return -ESRCH;
				1768	}
				1769	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
				1770	return -EFAULT;
				1771	if (ret < 0)
				1772	goto out;
				1773	/* len == 0 would wake all */
				1774	BUG_ON(!ret);
				1775	range.len = ret;
				1776	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
				1777	range.start = uffdio_zeropage.range.start;
				1778	wake_userfault(ctx, &range);
				1779	}
				1780	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
				1781	out:
				1782	return ret;
				1783	}
				1784
				1785	static inline unsigned int uffd_ctx_features(__u64 user_features)
				1786	{
				1787	/*
				1788	* For the current set of features the bits just coincide
				1789	*/
				1790	return (unsigned int)user_features;
				1791	}
				1792
				1793	/*
				1794	* userland asks for a certain API version and we return which bits
				1795	* and ioctl commands are implemented in this kernel for such API
				1796	* version or -EINVAL if unknown.
				1797	*/
				1798	static int userfaultfd_api(struct userfaultfd_ctx *ctx,
				1799	unsigned long arg)
				1800	{
				1801	struct uffdio_api uffdio_api;
				1802	void __user buf = (void __user )arg;
				1803	int ret;
				1804	__u64 features;
				1805
				1806	ret = -EINVAL;
				1807	if (ctx->state != UFFD_STATE_WAIT_API)
				1808	goto out;
				1809	ret = -EFAULT;
				1810	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
				1811	goto out;
				1812	features = uffdio_api.features;
				1813	ret = -EINVAL;
				1814	if (uffdio_api.api != UFFD_API \|\| (features & ~UFFD_API_FEATURES))
				1815	goto err_out;
				1816	ret = -EPERM;
				1817	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
				1818	goto err_out;
				1819	/* report all available features and ioctls to userland */
				1820	uffdio_api.features = UFFD_API_FEATURES;
				1821	uffdio_api.ioctls = UFFD_API_IOCTLS;
				1822	ret = -EFAULT;
				1823	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1824	goto out;
				1825	ctx->state = UFFD_STATE_RUNNING;
				1826	/* only enable the requested features for this uffd context */
				1827	ctx->features = uffd_ctx_features(features);
				1828	ret = 0;
				1829	out:
				1830	return ret;
				1831	err_out:
				1832	memset(&uffdio_api, 0, sizeof(uffdio_api));
				1833	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1834	ret = -EFAULT;
				1835	goto out;
				1836	}
				1837
				1838	static long userfaultfd_ioctl(struct file *file, unsigned cmd,
				1839	unsigned long arg)
				1840	{
				1841	int ret = -EINVAL;
				1842	struct userfaultfd_ctx *ctx = file->private_data;
				1843
				1844	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
				1845	return -EINVAL;
				1846
				1847	switch(cmd) {
				1848	case UFFDIO_API:
				1849	ret = userfaultfd_api(ctx, arg);
				1850	break;
				1851	case UFFDIO_REGISTER:
				1852	ret = userfaultfd_register(ctx, arg);
				1853	break;
				1854	case UFFDIO_UNREGISTER:
				1855	ret = userfaultfd_unregister(ctx, arg);
				1856	break;
				1857	case UFFDIO_WAKE:
				1858	ret = userfaultfd_wake(ctx, arg);
				1859	break;
				1860	case UFFDIO_COPY:
				1861	ret = userfaultfd_copy(ctx, arg);
				1862	break;
				1863	case UFFDIO_ZEROPAGE:
				1864	ret = userfaultfd_zeropage(ctx, arg);
				1865	break;
				1866	}
				1867	return ret;
				1868	}
				1869
				1870	#ifdef CONFIG_PROC_FS
				1871	static void userfaultfd_show_fdinfo(struct seq_file m, struct file f)
				1872	{
				1873	struct userfaultfd_ctx *ctx = f->private_data;
				1874	wait_queue_entry_t *wq;
				1875	struct userfaultfd_wait_queue *uwq;
				1876	unsigned long pending = 0, total = 0;
				1877
				1878	spin_lock(&ctx->fault_pending_wqh.lock);
				1879	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
				1880	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				1881	pending++;
				1882	total++;
				1883	}
				1884	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
				1885	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				1886	total++;
				1887	}
				1888	spin_unlock(&ctx->fault_pending_wqh.lock);
				1889
				1890	/*
				1891	* If more protocols will be added, there will be all shown
				1892	* separated by a space. Like this:
				1893	* protocols: aa:... bb:...
				1894	*/
				1895	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
				1896	pending, total, UFFD_API, ctx->features,
				1897	UFFD_API_IOCTLS\|UFFD_API_RANGE_IOCTLS);
				1898	}
				1899	#endif
				1900
				1901	static const struct file_operations userfaultfd_fops = {
				1902	#ifdef CONFIG_PROC_FS
				1903	.show_fdinfo = userfaultfd_show_fdinfo,
				1904	#endif
				1905	.release = userfaultfd_release,
				1906	.poll = userfaultfd_poll,
				1907	.read = userfaultfd_read,
				1908	.unlocked_ioctl = userfaultfd_ioctl,
				1909	.compat_ioctl = userfaultfd_ioctl,
				1910	.llseek = noop_llseek,
				1911	};
				1912
				1913	static void init_once_userfaultfd_ctx(void *mem)
				1914	{
				1915	struct userfaultfd_ctx ctx = (struct userfaultfd_ctx ) mem;
				1916
				1917	init_waitqueue_head(&ctx->fault_pending_wqh);
				1918	init_waitqueue_head(&ctx->fault_wqh);
				1919	init_waitqueue_head(&ctx->event_wqh);
				1920	init_waitqueue_head(&ctx->fd_wqh);
				1921	seqcount_init(&ctx->refile_seq);
				1922	}
				1923
				1924	/**
				1925	* userfaultfd_file_create - Creates a userfaultfd file pointer.
				1926	* @flags: Flags for the userfaultfd file.
				1927	*
				1928	* This function creates a userfaultfd file pointer, w/out installing
				1929	* it into the fd table. This is useful when the userfaultfd file is
				1930	* used during the initialization of data structures that require
				1931	* extra setup after the userfaultfd creation. So the userfaultfd
				1932	* creation is split into the file pointer creation phase, and the
				1933	* file descriptor installation phase. In this way races with
				1934	* userspace closing the newly installed file descriptor can be
				1935	* avoided. Returns a userfaultfd file pointer, or a proper error
				1936	* pointer.
				1937	*/
				1938	static struct file *userfaultfd_file_create(int flags)
				1939	{
				1940	struct file *file;
				1941	struct userfaultfd_ctx *ctx;
				1942
				1943	BUG_ON(!current->mm);
				1944
				1945	/* Check the UFFD_* constants for consistency. */
				1946	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
				1947	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
				1948
				1949	file = ERR_PTR(-EINVAL);
				1950	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
				1951	goto out;
				1952
				1953	file = ERR_PTR(-ENOMEM);
				1954	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				1955	if (!ctx)
				1956	goto out;
				1957
				1958	atomic_set(&ctx->refcount, 1);
				1959	ctx->flags = flags;
				1960	ctx->features = 0;
				1961	ctx->state = UFFD_STATE_WAIT_API;
				1962	ctx->released = false;
				1963	ctx->mm = current->mm;
				1964	/* prevent the mm struct to be freed */
				1965	mmgrab(ctx->mm);
				1966
				1967	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
				1968	O_RDWR \| (flags & UFFD_SHARED_FCNTL_FLAGS));
				1969	if (IS_ERR(file)) {
				1970	mmdrop(ctx->mm);
				1971	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				1972	}
				1973	out:
				1974	return file;
				1975	}
				1976
				1977	SYSCALL_DEFINE1(userfaultfd, int, flags)
				1978	{
				1979	int fd, error;
				1980	struct file *file;
				1981
				1982	error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
				1983	if (error < 0)
				1984	return error;
				1985	fd = error;
				1986
				1987	file = userfaultfd_file_create(flags);
				1988	if (IS_ERR(file)) {
				1989	error = PTR_ERR(file);
				1990	goto err_put_unused_fd;
				1991	}
				1992	fd_install(fd, file);
				1993
				1994	return fd;
				1995
				1996	err_put_unused_fd:
				1997	put_unused_fd(fd);
				1998
				1999	return error;
				2000	}
				2001
				2002	static int __init userfaultfd_init(void)
				2003	{
				2004	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
				2005	sizeof(struct userfaultfd_ctx),
				2006	0,
				2007	SLAB_HWCACHE_ALIGN\|SLAB_PANIC,
				2008	init_once_userfaultfd_ctx);
				2009	return 0;
				2010	}
				2011	__initcall(userfaultfd_init);