Blame - src/kernel/linux/v4.19/fs/userfaultfd.c - T800

blob: 31cb2657d34e33fa62685fecfd6490353f788018 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* fs/userfaultfd.c
				3	*
				4	* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
				5	* Copyright (C) 2008-2009 Red Hat, Inc.
				6	* Copyright (C) 2015 Red Hat, Inc.
				7	*
				8	* This work is licensed under the terms of the GNU GPL, version 2. See
				9	* the COPYING file in the top-level directory.
				10	*
				11	* Some part derived from fs/eventfd.c (anon inode setup) and
				12	* mm/ksm.c (mm hashing).
				13	*/
				14
				15	#include <linux/list.h>
				16	#include <linux/hashtable.h>
				17	#include <linux/sched/signal.h>
				18	#include <linux/sched/mm.h>
				19	#include <linux/mm.h>
				20	#include <linux/poll.h>
				21	#include <linux/slab.h>
				22	#include <linux/seq_file.h>
				23	#include <linux/file.h>
				24	#include <linux/bug.h>
				25	#include <linux/anon_inodes.h>
				26	#include <linux/syscalls.h>
				27	#include <linux/userfaultfd_k.h>
				28	#include <linux/mempolicy.h>
				29	#include <linux/ioctl.h>
				30	#include <linux/security.h>
				31	#include <linux/hugetlb.h>
				32
				33	static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
				34
				35	enum userfaultfd_state {
				36	UFFD_STATE_WAIT_API,
				37	UFFD_STATE_RUNNING,
				38	};
				39
				40	/*
				41	* Start with fault_pending_wqh and fault_wqh so they're more likely
				42	* to be in the same cacheline.
				43	*
				44	* Locking order:
				45	* fd_wqh.lock
				46	* fault_pending_wqh.lock
				47	* fault_wqh.lock
				48	* event_wqh.lock
				49	*
				50	* To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
				51	* since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
				52	* also taken in IRQ context.
				53	*/
				54	struct userfaultfd_ctx {
				55	/* waitqueue head for the pending (i.e. not read) userfaults */
				56	wait_queue_head_t fault_pending_wqh;
				57	/* waitqueue head for the userfaults */
				58	wait_queue_head_t fault_wqh;
				59	/* waitqueue head for the pseudo fd to wakeup poll/read */
				60	wait_queue_head_t fd_wqh;
				61	/* waitqueue head for events */
				62	wait_queue_head_t event_wqh;
				63	/* a refile sequence protected by fault_pending_wqh lock */
				64	struct seqcount refile_seq;
				65	/* pseudo fd refcounting */
				66	atomic_t refcount;
				67	/* userfaultfd syscall flags */
				68	unsigned int flags;
				69	/* features requested from the userspace */
				70	unsigned int features;
				71	/* state machine */
				72	enum userfaultfd_state state;
				73	/* released */
				74	bool released;
				75	/* memory mappings are changing because of non-cooperative event */
				76	bool mmap_changing;
				77	/* mm with one ore more vmas attached to this userfaultfd_ctx */
				78	struct mm_struct *mm;
				79	};
				80
				81	struct userfaultfd_fork_ctx {
				82	struct userfaultfd_ctx *orig;
				83	struct userfaultfd_ctx *new;
				84	struct list_head list;
				85	};
				86
				87	struct userfaultfd_unmap_ctx {
				88	struct userfaultfd_ctx *ctx;
				89	unsigned long start;
				90	unsigned long end;
				91	struct list_head list;
				92	};
				93
				94	struct userfaultfd_wait_queue {
				95	struct uffd_msg msg;
				96	wait_queue_entry_t wq;
				97	struct userfaultfd_ctx *ctx;
				98	bool waken;
				99	};
				100
				101	struct userfaultfd_wake_range {
				102	unsigned long start;
				103	unsigned long len;
				104	};
				105
				106	static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				107	int wake_flags, void *key)
				108	{
				109	struct userfaultfd_wake_range *range = key;
				110	int ret;
				111	struct userfaultfd_wait_queue *uwq;
				112	unsigned long start, len;
				113
				114	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				115	ret = 0;
				116	/* len == 0 means wake all */
				117	start = range->start;
				118	len = range->len;
				119	if (len && (start > uwq->msg.arg.pagefault.address \|\|
				120	start + len <= uwq->msg.arg.pagefault.address))
				121	goto out;
				122	WRITE_ONCE(uwq->waken, true);
				123	/*
				124	* The Program-Order guarantees provided by the scheduler
				125	* ensure uwq->waken is visible before the task is woken.
				126	*/
				127	ret = wake_up_state(wq->private, mode);
				128	if (ret) {
				129	/*
				130	* Wake only once, autoremove behavior.
				131	*
				132	* After the effect of list_del_init is visible to the other
				133	* CPUs, the waitqueue may disappear from under us, see the
				134	* !list_empty_careful() in handle_userfault().
				135	*
				136	* try_to_wake_up() has an implicit smp_mb(), and the
				137	* wq->private is read before calling the extern function
				138	* "wake_up_state" (which in turns calls try_to_wake_up).
				139	*/
				140	list_del_init(&wq->entry);
				141	}
				142	out:
				143	return ret;
				144	}
				145
				146	/**
				147	* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
				148	* context.
				149	* @ctx: [in] Pointer to the userfaultfd context.
				150	*/
				151	static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
				152	{
				153	if (!atomic_inc_not_zero(&ctx->refcount))
				154	BUG();
				155	}
				156
				157	/**
				158	* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
				159	* context.
				160	* @ctx: [in] Pointer to userfaultfd context.
				161	*
				162	* The userfaultfd context reference must have been previously acquired either
				163	* with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
				164	*/
				165	static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
				166	{
				167	if (atomic_dec_and_test(&ctx->refcount)) {
				168	VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
				169	VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
				170	VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
				171	VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
				172	VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
				173	VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
				174	VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
				175	VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
				176	mmdrop(ctx->mm);
				177	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				178	}
				179	}
				180
				181	static inline void msg_init(struct uffd_msg *msg)
				182	{
				183	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
				184	/*
				185	* Must use memset to zero out the paddings or kernel data is
				186	* leaked to userland.
				187	*/
				188	memset(msg, 0, sizeof(struct uffd_msg));
				189	}
				190
				191	static inline struct uffd_msg userfault_msg(unsigned long address,
				192	unsigned int flags,
				193	unsigned long reason,
				194	unsigned int features)
				195	{
				196	struct uffd_msg msg;
				197	msg_init(&msg);
				198	msg.event = UFFD_EVENT_PAGEFAULT;
				199	msg.arg.pagefault.address = address;
				200	if (flags & FAULT_FLAG_WRITE)
				201	/*
				202	* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
				203	* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
				204	* was not set in a UFFD_EVENT_PAGEFAULT, it means it
				205	* was a read fault, otherwise if set it means it's
				206	* a write fault.
				207	*/
				208	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WRITE;
				209	if (reason & VM_UFFD_WP)
				210	/*
				211	* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
				212	* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
				213	* not set in a UFFD_EVENT_PAGEFAULT, it means it was
				214	* a missing fault, otherwise if set it means it's a
				215	* write protect fault.
				216	*/
				217	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WP;
				218	if (features & UFFD_FEATURE_THREAD_ID)
				219	msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
				220	return msg;
				221	}
				222
				223	#ifdef CONFIG_HUGETLB_PAGE
				224	/*
				225	* Same functionality as userfaultfd_must_wait below with modifications for
				226	* hugepmd ranges.
				227	*/
				228	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				229	struct vm_area_struct *vma,
				230	unsigned long address,
				231	unsigned long flags,
				232	unsigned long reason)
				233	{
				234	struct mm_struct *mm = ctx->mm;
				235	pte_t *ptep, pte;
				236	bool ret = true;
				237
				238	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				239
				240	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
				241
				242	if (!ptep)
				243	goto out;
				244
				245	ret = false;
				246	pte = huge_ptep_get(ptep);
				247
				248	/*
				249	* Lockless access: we're in a wait_event so it's ok if it
				250	* changes under us.
				251	*/
				252	if (huge_pte_none(pte))
				253	ret = true;
				254	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
				255	ret = true;
				256	out:
				257	return ret;
				258	}
				259	#else
				260	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				261	struct vm_area_struct *vma,
				262	unsigned long address,
				263	unsigned long flags,
				264	unsigned long reason)
				265	{
				266	return false; /* should never get here */
				267	}
				268	#endif /* CONFIG_HUGETLB_PAGE */
				269
				270	/*
				271	* Verify the pagetables are still not ok after having reigstered into
				272	* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
				273	* userfault that has already been resolved, if userfaultfd_read and
				274	* UFFDIO_COPY\|ZEROPAGE are being run simultaneously on two different
				275	* threads.
				276	*/
				277	static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
				278	unsigned long address,
				279	unsigned long flags,
				280	unsigned long reason)
				281	{
				282	struct mm_struct *mm = ctx->mm;
				283	pgd_t *pgd;
				284	p4d_t *p4d;
				285	pud_t *pud;
				286	pmd_t *pmd, _pmd;
				287	pte_t *pte;
				288	bool ret = true;
				289
				290	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				291
				292	pgd = pgd_offset(mm, address);
				293	if (!pgd_present(*pgd))
				294	goto out;
				295	p4d = p4d_offset(pgd, address);
				296	if (!p4d_present(*p4d))
				297	goto out;
				298	pud = pud_offset(p4d, address);
				299	if (!pud_present(*pud))
				300	goto out;
				301	pmd = pmd_offset(pud, address);
				302	/*
				303	* READ_ONCE must function as a barrier with narrower scope
				304	* and it must be equivalent to:
				305	* _pmd = *pmd; barrier();
				306	*
				307	* This is to deal with the instability (as in
				308	* pmd_trans_unstable) of the pmd.
				309	*/
				310	_pmd = READ_ONCE(*pmd);
				311	if (pmd_none(_pmd))
				312	goto out;
				313
				314	ret = false;
				315	if (!pmd_present(_pmd))
				316	goto out;
				317
				318	if (pmd_trans_huge(_pmd))
				319	goto out;
				320
				321	/*
				322	* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
				323	* and use the standard pte_offset_map() instead of parsing _pmd.
				324	*/
				325	pte = pte_offset_map(pmd, address);
				326	/*
				327	* Lockless access: we're in a wait_event so it's ok if it
				328	* changes under us.
				329	*/
				330	if (pte_none(*pte))
				331	ret = true;
				332	pte_unmap(pte);
				333
				334	out:
				335	return ret;
				336	}
				337
				338	/*
				339	* The locking rules involved in returning VM_FAULT_RETRY depending on
				340	* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
				341	* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
				342	* recommendation in __lock_page_or_retry is not an understatement.
				343	*
				344	* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
				345	* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
				346	* not set.
				347	*
				348	* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
				349	* set, VM_FAULT_RETRY can still be returned if and only if there are
				350	* fatal_signal_pending()s, and the mmap_sem must be released before
				351	* returning it.
				352	*/
				353	vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
				354	{
				355	struct mm_struct *mm = vmf->vma->vm_mm;
				356	struct userfaultfd_ctx *ctx;
				357	struct userfaultfd_wait_queue uwq;
				358	vm_fault_t ret = VM_FAULT_SIGBUS;
				359	bool must_wait, return_to_userland;
				360	long blocking_state;
				361
				362	/*
				363	* We don't do userfault handling for the final child pid update.
				364	*
				365	* We also don't do userfault handling during
				366	* coredumping. hugetlbfs has the special
				367	* follow_hugetlb_page() to skip missing pages in the
				368	* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
				369	* the no_page_table() helper in follow_page_mask(), but the
				370	* shmem_vm_ops->fault method is invoked even during
				371	* coredumping without mmap_sem and it ends up here.
				372	*/
				373	if (current->flags & (PF_EXITING\|PF_DUMPCORE))
				374	goto out;
				375
				376	/*
				377	* Coredumping runs without mmap_sem so we can only check that
				378	* the mmap_sem is held, if PF_DUMPCORE was not set.
				379	*/
				380	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
				381
				382	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
				383	if (!ctx)
				384	goto out;
				385
				386	BUG_ON(ctx->mm != mm);
				387
				388	VM_BUG_ON(reason & ~(VM_UFFD_MISSING\|VM_UFFD_WP));
				389	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
				390
				391	if (ctx->features & UFFD_FEATURE_SIGBUS)
				392	goto out;
				393
				394	/*
				395	* If it's already released don't get it. This avoids to loop
				396	* in __get_user_pages if userfaultfd_release waits on the
				397	* caller of handle_userfault to release the mmap_sem.
				398	*/
				399	if (unlikely(READ_ONCE(ctx->released))) {
				400	/*
				401	* Don't return VM_FAULT_SIGBUS in this case, so a non
				402	* cooperative manager can close the uffd after the
				403	* last UFFDIO_COPY, without risking to trigger an
				404	* involuntary SIGBUS if the process was starting the
				405	* userfaultfd while the userfaultfd was still armed
				406	* (but after the last UFFDIO_COPY). If the uffd
				407	* wasn't already closed when the userfault reached
				408	* this point, that would normally be solved by
				409	* userfaultfd_must_wait returning 'false'.
				410	*
				411	* If we were to return VM_FAULT_SIGBUS here, the non
				412	* cooperative manager would be instead forced to
				413	* always call UFFDIO_UNREGISTER before it can safely
				414	* close the uffd.
				415	*/
				416	ret = VM_FAULT_NOPAGE;
				417	goto out;
				418	}
				419
				420	/*
				421	* Check that we can return VM_FAULT_RETRY.
				422	*
				423	* NOTE: it should become possible to return VM_FAULT_RETRY
				424	* even if FAULT_FLAG_TRIED is set without leading to gup()
				425	* -EBUSY failures, if the userfaultfd is to be extended for
				426	* VM_UFFD_WP tracking and we intend to arm the userfault
				427	* without first stopping userland access to the memory. For
				428	* VM_UFFD_MISSING userfaults this is enough for now.
				429	*/
				430	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
				431	/*
				432	* Validate the invariant that nowait must allow retry
				433	* to be sure not to return SIGBUS erroneously on
				434	* nowait invocations.
				435	*/
				436	BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
				437	#ifdef CONFIG_DEBUG_VM
				438	if (printk_ratelimit()) {
				439	printk(KERN_WARNING
				440	"FAULT_FLAG_ALLOW_RETRY missing %x\n",
				441	vmf->flags);
				442	dump_stack();
				443	}
				444	#endif
				445	goto out;
				446	}
				447
				448	/*
				449	* Handle nowait, not much to do other than tell it to retry
				450	* and wait.
				451	*/
				452	ret = VM_FAULT_RETRY;
				453	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
				454	goto out;
				455
				456	/* take the reference before dropping the mmap_sem */
				457	userfaultfd_ctx_get(ctx);
				458
				459	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
				460	uwq.wq.private = current;
				461	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
				462	ctx->features);
				463	uwq.ctx = ctx;
				464	uwq.waken = false;
				465
				466	return_to_userland =
				467	(vmf->flags & (FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE)) ==
				468	(FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE);
				469	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
				470	TASK_KILLABLE;
				471
				472	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				473	/*
				474	* After the __add_wait_queue the uwq is visible to userland
				475	* through poll/read().
				476	*/
				477	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
				478	/*
				479	* The smp_mb() after __set_current_state prevents the reads
				480	* following the spin_unlock to happen before the list_add in
				481	* __add_wait_queue.
				482	*/
				483	set_current_state(blocking_state);
				484	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				485
				486	if (!is_vm_hugetlb_page(vmf->vma))
				487	must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
				488	reason);
				489	else
				490	must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
				491	vmf->address,
				492	vmf->flags, reason);
				493	up_read(&mm->mmap_sem);
				494
				495	if (likely(must_wait && !READ_ONCE(ctx->released) &&
				496	(return_to_userland ? !signal_pending(current) :
				497	!fatal_signal_pending(current)))) {
				498	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
				499	schedule();
				500	ret \|= VM_FAULT_MAJOR;
				501
				502	/*
				503	* False wakeups can orginate even from rwsem before
				504	* up_read() however userfaults will wait either for a
				505	* targeted wakeup on the specific uwq waitqueue from
				506	* wake_userfault() or for signals or for uffd
				507	* release.
				508	*/
				509	while (!READ_ONCE(uwq.waken)) {
				510	/*
				511	* This needs the full smp_store_mb()
				512	* guarantee as the state write must be
				513	* visible to other CPUs before reading
				514	* uwq.waken from other CPUs.
				515	*/
				516	set_current_state(blocking_state);
				517	if (READ_ONCE(uwq.waken) \|\|
				518	READ_ONCE(ctx->released) \|\|
				519	(return_to_userland ? signal_pending(current) :
				520	fatal_signal_pending(current)))
				521	break;
				522	schedule();
				523	}
				524	}
				525
				526	__set_current_state(TASK_RUNNING);
				527
				528	if (return_to_userland) {
				529	if (signal_pending(current) &&
				530	!fatal_signal_pending(current)) {
				531	/*
				532	* If we got a SIGSTOP or SIGCONT and this is
				533	* a normal userland page fault, just let
				534	* userland return so the signal will be
				535	* handled and gdb debugging works. The page
				536	* fault code immediately after we return from
				537	* this function is going to release the
				538	* mmap_sem and it's not depending on it
				539	* (unlike gup would if we were not to return
				540	* VM_FAULT_RETRY).
				541	*
				542	* If a fatal signal is pending we still take
				543	* the streamlined VM_FAULT_RETRY failure path
				544	* and there's no need to retake the mmap_sem
				545	* in such case.
				546	*/
				547	down_read(&mm->mmap_sem);
				548	ret = VM_FAULT_NOPAGE;
				549	}
				550	}
				551
				552	/*
				553	* Here we race with the list_del; list_add in
				554	* userfaultfd_ctx_read(), however because we don't ever run
				555	* list_del_init() to refile across the two lists, the prev
				556	* and next pointers will never point to self. list_add also
				557	* would never let any of the two pointers to point to
				558	* self. So list_empty_careful won't risk to see both pointers
				559	* pointing to self at any time during the list refile. The
				560	* only case where list_del_init() is called is the full
				561	* removal in the wake function and there we don't re-list_add
				562	* and it's fine not to block on the spinlock. The uwq on this
				563	* kernel stack can be released after the list_del_init.
				564	*/
				565	if (!list_empty_careful(&uwq.wq.entry)) {
				566	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				567	/*
				568	* No need of list_del_init(), the uwq on the stack
				569	* will be freed shortly anyway.
				570	*/
				571	list_del(&uwq.wq.entry);
				572	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				573	}
				574
				575	/*
				576	* ctx may go away after this if the userfault pseudo fd is
				577	* already released.
				578	*/
				579	userfaultfd_ctx_put(ctx);
				580
				581	out:
				582	return ret;
				583	}
				584
				585	static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
				586	struct userfaultfd_wait_queue *ewq)
				587	{
				588	struct userfaultfd_ctx *release_new_ctx;
				589
				590	if (WARN_ON_ONCE(current->flags & PF_EXITING))
				591	goto out;
				592
				593	ewq->ctx = ctx;
				594	init_waitqueue_entry(&ewq->wq, current);
				595	release_new_ctx = NULL;
				596
				597	spin_lock_irq(&ctx->event_wqh.lock);
				598	/*
				599	* After the __add_wait_queue the uwq is visible to userland
				600	* through poll/read().
				601	*/
				602	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
				603	for (;;) {
				604	set_current_state(TASK_KILLABLE);
				605	if (ewq->msg.event == 0)
				606	break;
				607	if (READ_ONCE(ctx->released) \|\|
				608	fatal_signal_pending(current)) {
				609	/*
				610	* &ewq->wq may be queued in fork_event, but
				611	* __remove_wait_queue ignores the head
				612	* parameter. It would be a problem if it
				613	* didn't.
				614	*/
				615	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				616	if (ewq->msg.event == UFFD_EVENT_FORK) {
				617	struct userfaultfd_ctx *new;
				618
				619	new = (struct userfaultfd_ctx *)
				620	(unsigned long)
				621	ewq->msg.arg.reserved.reserved1;
				622	release_new_ctx = new;
				623	}
				624	break;
				625	}
				626
				627	spin_unlock_irq(&ctx->event_wqh.lock);
				628
				629	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
				630	schedule();
				631
				632	spin_lock_irq(&ctx->event_wqh.lock);
				633	}
				634	__set_current_state(TASK_RUNNING);
				635	spin_unlock_irq(&ctx->event_wqh.lock);
				636
				637	if (release_new_ctx) {
				638	struct vm_area_struct *vma;
				639	struct mm_struct *mm = release_new_ctx->mm;
				640
				641	/* the various vma->vm_userfaultfd_ctx still points to it */
				642	down_write(&mm->mmap_sem);
				643	/* no task can run (and in turn coredump) yet */
				644	VM_WARN_ON(!mmget_still_valid(mm));
				645	for (vma = mm->mmap; vma; vma = vma->vm_next)
				646	if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				647	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				648	vma->vm_flags &= ~(VM_UFFD_WP \| VM_UFFD_MISSING);
				649	}
				650	up_write(&mm->mmap_sem);
				651
				652	userfaultfd_ctx_put(release_new_ctx);
				653	}
				654
				655	/*
				656	* ctx may go away after this if the userfault pseudo fd is
				657	* already released.
				658	*/
				659	out:
				660	WRITE_ONCE(ctx->mmap_changing, false);
				661	userfaultfd_ctx_put(ctx);
				662	}
				663
				664	static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				665	struct userfaultfd_wait_queue *ewq)
				666	{
				667	ewq->msg.event = 0;
				668	wake_up_locked(&ctx->event_wqh);
				669	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				670	}
				671
				672	int dup_userfaultfd(struct vm_area_struct vma, struct list_head fcs)
				673	{
				674	struct userfaultfd_ctx ctx = NULL, octx;
				675	struct userfaultfd_fork_ctx *fctx;
				676
				677	octx = vma->vm_userfaultfd_ctx.ctx;
				678	if (!octx \|\| !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
				679	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				680	vma->vm_flags &= ~(VM_UFFD_WP \| VM_UFFD_MISSING);
				681	return 0;
				682	}
				683
				684	list_for_each_entry(fctx, fcs, list)
				685	if (fctx->orig == octx) {
				686	ctx = fctx->new;
				687	break;
				688	}
				689
				690	if (!ctx) {
				691	fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
				692	if (!fctx)
				693	return -ENOMEM;
				694
				695	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				696	if (!ctx) {
				697	kfree(fctx);
				698	return -ENOMEM;
				699	}
				700
				701	atomic_set(&ctx->refcount, 1);
				702	ctx->flags = octx->flags;
				703	ctx->state = UFFD_STATE_RUNNING;
				704	ctx->features = octx->features;
				705	ctx->released = false;
				706	ctx->mmap_changing = false;
				707	ctx->mm = vma->vm_mm;
				708	mmgrab(ctx->mm);
				709
				710	userfaultfd_ctx_get(octx);
				711	WRITE_ONCE(octx->mmap_changing, true);
				712	fctx->orig = octx;
				713	fctx->new = ctx;
				714	list_add_tail(&fctx->list, fcs);
				715	}
				716
				717	vma->vm_userfaultfd_ctx.ctx = ctx;
				718	return 0;
				719	}
				720
				721	static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
				722	{
				723	struct userfaultfd_ctx *ctx = fctx->orig;
				724	struct userfaultfd_wait_queue ewq;
				725
				726	msg_init(&ewq.msg);
				727
				728	ewq.msg.event = UFFD_EVENT_FORK;
				729	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
				730
				731	userfaultfd_event_wait_completion(ctx, &ewq);
				732	}
				733
				734	void dup_userfaultfd_complete(struct list_head *fcs)
				735	{
				736	struct userfaultfd_fork_ctx fctx, n;
				737
				738	list_for_each_entry_safe(fctx, n, fcs, list) {
				739	dup_fctx(fctx);
				740	list_del(&fctx->list);
				741	kfree(fctx);
				742	}
				743	}
				744
				745	void mremap_userfaultfd_prep(struct vm_area_struct *vma,
				746	struct vm_userfaultfd_ctx *vm_ctx)
				747	{
				748	struct userfaultfd_ctx *ctx;
				749
				750	ctx = vma->vm_userfaultfd_ctx.ctx;
				751
				752	if (!ctx)
				753	return;
				754
				755	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
				756	vm_ctx->ctx = ctx;
				757	userfaultfd_ctx_get(ctx);
				758	WRITE_ONCE(ctx->mmap_changing, true);
				759	} else {
				760	/* Drop uffd context if remap feature not enabled */
				761	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				762	vma->vm_flags &= ~(VM_UFFD_WP \| VM_UFFD_MISSING);
				763	}
				764	}
				765
				766	void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				767	unsigned long from, unsigned long to,
				768	unsigned long len)
				769	{
				770	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
				771	struct userfaultfd_wait_queue ewq;
				772
				773	if (!ctx)
				774	return;
				775
				776	if (to & ~PAGE_MASK) {
				777	userfaultfd_ctx_put(ctx);
				778	return;
				779	}
				780
				781	msg_init(&ewq.msg);
				782
				783	ewq.msg.event = UFFD_EVENT_REMAP;
				784	ewq.msg.arg.remap.from = from;
				785	ewq.msg.arg.remap.to = to;
				786	ewq.msg.arg.remap.len = len;
				787
				788	userfaultfd_event_wait_completion(ctx, &ewq);
				789	}
				790
				791	bool userfaultfd_remove(struct vm_area_struct *vma,
				792	unsigned long start, unsigned long end)
				793	{
				794	struct mm_struct *mm = vma->vm_mm;
				795	struct userfaultfd_ctx *ctx;
				796	struct userfaultfd_wait_queue ewq;
				797
				798	ctx = vma->vm_userfaultfd_ctx.ctx;
				799	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
				800	return true;
				801
				802	userfaultfd_ctx_get(ctx);
				803	WRITE_ONCE(ctx->mmap_changing, true);
				804	up_read(&mm->mmap_sem);
				805
				806	msg_init(&ewq.msg);
				807
				808	ewq.msg.event = UFFD_EVENT_REMOVE;
				809	ewq.msg.arg.remove.start = start;
				810	ewq.msg.arg.remove.end = end;
				811
				812	userfaultfd_event_wait_completion(ctx, &ewq);
				813
				814	return false;
				815	}
				816
				817	static bool has_unmap_ctx(struct userfaultfd_ctx ctx, struct list_head unmaps,
				818	unsigned long start, unsigned long end)
				819	{
				820	struct userfaultfd_unmap_ctx *unmap_ctx;
				821
				822	list_for_each_entry(unmap_ctx, unmaps, list)
				823	if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
				824	unmap_ctx->end == end)
				825	return true;
				826
				827	return false;
				828	}
				829
				830	int userfaultfd_unmap_prep(struct vm_area_struct *vma,
				831	unsigned long start, unsigned long end,
				832	struct list_head *unmaps)
				833	{
				834	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
				835	struct userfaultfd_unmap_ctx *unmap_ctx;
				836	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
				837
				838	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) \|\|
				839	has_unmap_ctx(ctx, unmaps, start, end))
				840	continue;
				841
				842	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
				843	if (!unmap_ctx)
				844	return -ENOMEM;
				845
				846	userfaultfd_ctx_get(ctx);
				847	WRITE_ONCE(ctx->mmap_changing, true);
				848	unmap_ctx->ctx = ctx;
				849	unmap_ctx->start = start;
				850	unmap_ctx->end = end;
				851	list_add_tail(&unmap_ctx->list, unmaps);
				852	}
				853
				854	return 0;
				855	}
				856
				857	void userfaultfd_unmap_complete(struct mm_struct mm, struct list_head uf)
				858	{
				859	struct userfaultfd_unmap_ctx ctx, n;
				860	struct userfaultfd_wait_queue ewq;
				861
				862	list_for_each_entry_safe(ctx, n, uf, list) {
				863	msg_init(&ewq.msg);
				864
				865	ewq.msg.event = UFFD_EVENT_UNMAP;
				866	ewq.msg.arg.remove.start = ctx->start;
				867	ewq.msg.arg.remove.end = ctx->end;
				868
				869	userfaultfd_event_wait_completion(ctx->ctx, &ewq);
				870
				871	list_del(&ctx->list);
				872	kfree(ctx);
				873	}
				874	}
				875
				876	static int userfaultfd_release(struct inode inode, struct file file)
				877	{
				878	struct userfaultfd_ctx *ctx = file->private_data;
				879	struct mm_struct *mm = ctx->mm;
				880	struct vm_area_struct vma, prev;
				881	/* len == 0 means wake all */
				882	struct userfaultfd_wake_range range = { .len = 0, };
				883	unsigned long new_flags;
				884	bool still_valid;
				885
				886	WRITE_ONCE(ctx->released, true);
				887
				888	if (!mmget_not_zero(mm))
				889	goto wakeup;
				890
				891	/*
				892	* Flush page faults out of all CPUs. NOTE: all page faults
				893	* must be retried without returning VM_FAULT_SIGBUS if
				894	* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
				895	* changes while handle_userfault released the mmap_sem. So
				896	* it's critical that released is set to true (above), before
				897	* taking the mmap_sem for writing.
				898	*/
				899	down_write(&mm->mmap_sem);
				900	still_valid = mmget_still_valid(mm);
				901	prev = NULL;
				902	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				903	cond_resched();
				904	BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
				905	!!(vma->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				906	if (vma->vm_userfaultfd_ctx.ctx != ctx) {
				907	prev = vma;
				908	continue;
				909	}
				910	new_flags = vma->vm_flags & ~(VM_UFFD_MISSING \| VM_UFFD_WP);
				911	if (still_valid) {
				912	prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				913	new_flags, vma->anon_vma,
				914	vma->vm_file, vma->vm_pgoff,
				915	vma_policy(vma),
				916	NULL_VM_UFFD_CTX,
				917	vma_get_anon_name(vma));
				918	if (prev)
				919	vma = prev;
				920	else
				921	prev = vma;
				922	}
				923	vma->vm_flags = new_flags;
				924	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				925	}
				926	up_write(&mm->mmap_sem);
				927	mmput(mm);
				928	wakeup:
				929	/*
				930	* After no new page faults can wait on this fault_*wqh, flush
				931	* the last page faults that may have been already waiting on
				932	* the fault_*wqh.
				933	*/
				934	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				935	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
				936	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
				937	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				938
				939	/* Flush pending events that may still wait on event_wqh */
				940	wake_up_all(&ctx->event_wqh);
				941
				942	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
				943	userfaultfd_ctx_put(ctx);
				944	return 0;
				945	}
				946
				947	/* fault_pending_wqh.lock must be hold by the caller */
				948	static inline struct userfaultfd_wait_queue *find_userfault_in(
				949	wait_queue_head_t *wqh)
				950	{
				951	wait_queue_entry_t *wq;
				952	struct userfaultfd_wait_queue *uwq;
				953
				954	VM_BUG_ON(!spin_is_locked(&wqh->lock));
				955
				956	uwq = NULL;
				957	if (!waitqueue_active(wqh))
				958	goto out;
				959	/* walk in reverse to provide FIFO behavior to read userfaults */
				960	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
				961	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				962	out:
				963	return uwq;
				964	}
				965
				966	static inline struct userfaultfd_wait_queue *find_userfault(
				967	struct userfaultfd_ctx *ctx)
				968	{
				969	return find_userfault_in(&ctx->fault_pending_wqh);
				970	}
				971
				972	static inline struct userfaultfd_wait_queue *find_userfault_evt(
				973	struct userfaultfd_ctx *ctx)
				974	{
				975	return find_userfault_in(&ctx->event_wqh);
				976	}
				977
				978	static __poll_t userfaultfd_poll(struct file file, poll_table wait)
				979	{
				980	struct userfaultfd_ctx *ctx = file->private_data;
				981	__poll_t ret;
				982
				983	poll_wait(file, &ctx->fd_wqh, wait);
				984
				985	switch (ctx->state) {
				986	case UFFD_STATE_WAIT_API:
				987	return EPOLLERR;
				988	case UFFD_STATE_RUNNING:
				989	/*
				990	* poll() never guarantees that read won't block.
				991	* userfaults can be waken before they're read().
				992	*/
				993	if (unlikely(!(file->f_flags & O_NONBLOCK)))
				994	return EPOLLERR;
				995	/*
				996	* lockless access to see if there are pending faults
				997	* __pollwait last action is the add_wait_queue but
				998	* the spin_unlock would allow the waitqueue_active to
				999	* pass above the actual list_add inside
				1000	* add_wait_queue critical section. So use a full
				1001	* memory barrier to serialize the list_add write of
				1002	* add_wait_queue() with the waitqueue_active read
				1003	* below.
				1004	*/
				1005	ret = 0;
				1006	smp_mb();
				1007	if (waitqueue_active(&ctx->fault_pending_wqh))
				1008	ret = EPOLLIN;
				1009	else if (waitqueue_active(&ctx->event_wqh))
				1010	ret = EPOLLIN;
				1011
				1012	return ret;
				1013	default:
				1014	WARN_ON_ONCE(1);
				1015	return EPOLLERR;
				1016	}
				1017	}
				1018
				1019	static const struct file_operations userfaultfd_fops;
				1020
				1021	static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				1022	struct userfaultfd_ctx *new,
				1023	struct uffd_msg *msg)
				1024	{
				1025	int fd;
				1026
				1027	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
				1028	O_RDWR \| (new->flags & UFFD_SHARED_FCNTL_FLAGS));
				1029	if (fd < 0)
				1030	return fd;
				1031
				1032	msg->arg.reserved.reserved1 = 0;
				1033	msg->arg.fork.ufd = fd;
				1034	return 0;
				1035	}
				1036
					/*
					 * Dequeue one message from @ctx into @msg on behalf of a reader.
					 *
					 * Pending userfaults are looked up first, then queued events. When
					 * neither is available the caller sleeps interruptibly on fd_wqh,
					 * unless a signal is pending or @no_wait is non-zero.
					 *
					 * Returns 0 with *msg filled in, -ERESTARTSYS if a signal is pending,
					 * -EAGAIN if nothing is queued and @no_wait is set, or, for a fork
					 * event, the error returned by resolve_userfault_fork().
					 */
				1037	static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				1038	struct uffd_msg *msg)
				1039	{
				1040	ssize_t ret;
				1041	DECLARE_WAITQUEUE(wait, current);
				1042	struct userfaultfd_wait_queue *uwq;
				1043	/*
				1044	* Handling fork event requires sleeping operations, so
				1045	* we drop the event_wqh lock, then do these ops, then
				1046	* lock it back and wake up the waiter. While the lock is
				1047	* dropped the ewq may go away so we keep track of it
				1048	* carefully.
				1049	*/
				1050	LIST_HEAD(fork_event);
				1051	struct userfaultfd_ctx *fork_nctx = NULL;
				1052
				1053	/* always take the fd_wqh lock before the fault_pending_wqh lock */
				1054	spin_lock_irq(&ctx->fd_wqh.lock);
				1055	__add_wait_queue(&ctx->fd_wqh, &wait);
				1056	for (;;) {
				1057	set_current_state(TASK_INTERRUPTIBLE);
				1058	spin_lock(&ctx->fault_pending_wqh.lock);
				1059	uwq = find_userfault(ctx);
				1060	if (uwq) {
				1061	/*
				1062	* Use a seqcount to repeat the lockless check
				1063	* in wake_userfault() to avoid missing
				1064	* wakeups because during the refile both
				1065	* waitqueue could become empty if this is the
				1066	* only userfault.
				1067	*/
				1068	write_seqcount_begin(&ctx->refile_seq);
				1069
				1070	/*
				1071	* The fault_pending_wqh.lock prevents the uwq
				1072	* to disappear from under us.
				1073	*
				1074	* Refile this userfault from
				1075	* fault_pending_wqh to fault_wqh, it's not
				1076	* pending anymore after we read it.
				1077	*
				1078	* Use list_del() by hand (as
				1079	* userfaultfd_wake_function also uses
				1080	* list_del_init() by hand) to be sure nobody
				1081	* changes __remove_wait_queue() to use
				1082	* list_del_init() in turn breaking the
				1083	* !list_empty_careful() check in
				1084	* handle_userfault(). The uwq->wq.head list
				1085	* must never be empty at any time during the
				1086	* refile, or the waitqueue could disappear
				1087	* from under us. The "wait_queue_head_t"
				1088	* parameter of __remove_wait_queue() is unused
				1089	* anyway.
				1090	*/
				1091	list_del(&uwq->wq.entry);
				1092	add_wait_queue(&ctx->fault_wqh, &uwq->wq);
				1093
				1094	write_seqcount_end(&ctx->refile_seq);
				1095
				1096	/* careful to always initialize msg if ret == 0 */
				1097	*msg = uwq->msg;
				1098	spin_unlock(&ctx->fault_pending_wqh.lock);
				1099	ret = 0;
				1100	break;
				1101	}
				1102	spin_unlock(&ctx->fault_pending_wqh.lock);
				1103
					/* no pending userfault: look for a queued event instead */
				1104	spin_lock(&ctx->event_wqh.lock);
				1105	uwq = find_userfault_evt(ctx);
				1106	if (uwq) {
				1107	*msg = uwq->msg;
				1108
				1109	if (uwq->msg.event == UFFD_EVENT_FORK) {
					/*
					 * For fork events reserved1 carries the pointer to
					 * the new context; the event is parked on the local
					 * fork_event list until the fd has been created
					 * after the loop.
					 */
				1110	fork_nctx = (struct userfaultfd_ctx *)
				1111	(unsigned long)
				1112	uwq->msg.arg.reserved.reserved1;
				1113	list_move(&uwq->wq.entry, &fork_event);
				1114	/*
				1115	* fork_nctx can be freed as soon as
				1116	* we drop the lock, unless we take a
				1117	* reference on it.
				1118	*/
				1119	userfaultfd_ctx_get(fork_nctx);
				1120	spin_unlock(&ctx->event_wqh.lock);
				1121	ret = 0;
				1122	break;
				1123	}
				1124
				1125	userfaultfd_event_complete(ctx, uwq);
				1126	spin_unlock(&ctx->event_wqh.lock);
				1127	ret = 0;
				1128	break;
				1129	}
				1130	spin_unlock(&ctx->event_wqh.lock);
				1131
					/*
					 * Nothing to read: give up on a pending signal or in
					 * no-wait mode, otherwise drop fd_wqh.lock and sleep.
					 */
				1132	if (signal_pending(current)) {
				1133	ret = -ERESTARTSYS;
				1134	break;
				1135	}
				1136	if (no_wait) {
				1137	ret = -EAGAIN;
				1138	break;
				1139	}
				1140	spin_unlock_irq(&ctx->fd_wqh.lock);
				1141	schedule();
				1142	spin_lock_irq(&ctx->fd_wqh.lock);
				1143	}
					/* every exit from the loop still holds fd_wqh.lock */
				1144	__remove_wait_queue(&ctx->fd_wqh, &wait);
				1145	__set_current_state(TASK_RUNNING);
				1146	spin_unlock_irq(&ctx->fd_wqh.lock);
				1147
					/* fork event: create the new fd now that no spinlock is held */
				1148	if (!ret && msg->event == UFFD_EVENT_FORK) {
				1149	ret = resolve_userfault_fork(ctx, fork_nctx, msg);
				1150	spin_lock_irq(&ctx->event_wqh.lock);
				1151	if (!list_empty(&fork_event)) {
				1152	/*
				1153	* The fork thread didn't abort, so we can
				1154	* drop the temporary refcount.
				1155	*/
				1156	userfaultfd_ctx_put(fork_nctx);
				1157
				1158	uwq = list_first_entry(&fork_event,
				1159	typeof(*uwq),
				1160	wq.entry);
				1161	/*
				1162	* If fork_event list wasn't empty and in turn
				1163	* the event wasn't already released by fork
				1164	* (the event is allocated on fork kernel
				1165	* stack), put the event back to its place in
				1166	* the event_wq. fork_event head will be freed
				1167	* as soon as we return so the event cannot
				1168	* stay queued there no matter the current
				1169	* "ret" value.
				1170	*/
				1171	list_del(&uwq->wq.entry);
				1172	__add_wait_queue(&ctx->event_wqh, &uwq->wq);
				1173
				1174	/*
				1175	* Leave the event in the waitqueue and report
				1176	* error to userland if we failed to resolve
				1177	* the userfault fork.
				1178	*/
				1179	if (likely(!ret))
				1180	userfaultfd_event_complete(ctx, uwq);
				1181	} else {
				1182	/*
				1183	* Here the fork thread aborted and the
				1184	* refcount from the fork thread on fork_nctx
				1185	* has already been released. We still hold
				1186	* the reference we took before releasing the
				1187	* lock above. If resolve_userfault_fork
				1188	* failed we've to drop it because the
				1189	* fork_nctx has to be freed in such case. If
				1190	* it succeeded we'll hold it because the new
				1191	* uffd references it.
				1192	*/
				1193	if (ret)
				1194	userfaultfd_ctx_put(fork_nctx);
				1195	}
				1196	spin_unlock_irq(&ctx->event_wqh.lock);
				1197	}
				1198
				1199	return ret;
				1200	}
				1201
				1202	static ssize_t userfaultfd_read(struct file file, char __user buf,
				1203	size_t count, loff_t *ppos)
				1204	{
				1205	struct userfaultfd_ctx *ctx = file->private_data;
				1206	ssize_t _ret, ret = 0;
				1207	struct uffd_msg msg;
				1208	int no_wait = file->f_flags & O_NONBLOCK;
				1209
				1210	if (ctx->state == UFFD_STATE_WAIT_API)
				1211	return -EINVAL;
				1212
				1213	for (;;) {
				1214	if (count < sizeof(msg))
				1215	return ret ? ret : -EINVAL;
				1216	_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
				1217	if (_ret < 0)
				1218	return ret ? ret : _ret;
				1219	if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
				1220	return ret ? ret : -EFAULT;
				1221	ret += sizeof(msg);
				1222	buf += sizeof(msg);
				1223	count -= sizeof(msg);
				1224	/*
				1225	* Allow to read more than one fault at time but only
				1226	* block if waiting for the very first one.
				1227	*/
				1228	no_wait = O_NONBLOCK;
				1229	}
				1230	}
				1231
				1232	static void __wake_userfault(struct userfaultfd_ctx *ctx,
				1233	struct userfaultfd_wake_range *range)
				1234	{
				1235	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				1236	/* wake all in the range and autoremove */
				1237	if (waitqueue_active(&ctx->fault_pending_wqh))
				1238	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				1239	range);
				1240	if (waitqueue_active(&ctx->fault_wqh))
				1241	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
				1242	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				1243	}
				1244
				1245	static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
				1246	struct userfaultfd_wake_range *range)
				1247	{
				1248	unsigned seq;
				1249	bool need_wakeup;
				1250
				1251	/*
				1252	* To be sure waitqueue_active() is not reordered by the CPU
				1253	* before the pagetable update, use an explicit SMP memory
				1254	* barrier here. PT lock release or up_read(mmap_sem) still
				1255	* have release semantics that can allow the
				1256	* waitqueue_active() to be reordered before the pte update.
				1257	*/
				1258	smp_mb();
				1259
				1260	/*
				1261	* Use waitqueue_active because it's very frequent to
				1262	* change the address space atomically even if there are no
				1263	* userfaults yet. So we take the spinlock only when we're
				1264	* sure we've userfaults to wake.
				1265	*/
				1266	do {
				1267	seq = read_seqcount_begin(&ctx->refile_seq);
				1268	need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) \|\|
				1269	waitqueue_active(&ctx->fault_wqh);
				1270	cond_resched();
				1271	} while (read_seqcount_retry(&ctx->refile_seq, seq));
				1272	if (need_wakeup)
				1273	__wake_userfault(ctx, range);
				1274	}
				1275
				1276	static __always_inline int validate_range(struct mm_struct *mm,
				1277	__u64 *start, __u64 len)
				1278	{
				1279	__u64 task_size = mm->task_size;
				1280
				1281	start = untagged_addr(start);
				1282
				1283	if (*start & ~PAGE_MASK)
				1284	return -EINVAL;
				1285	if (len & ~PAGE_MASK)
				1286	return -EINVAL;
				1287	if (!len)
				1288	return -EINVAL;
				1289	if (*start < mmap_min_addr)
				1290	return -EINVAL;
				1291	if (*start >= task_size)
				1292	return -EINVAL;
				1293	if (len > task_size - *start)
				1294	return -EINVAL;
				1295	return 0;
				1296	}
				1297
				1298	static inline bool vma_can_userfault(struct vm_area_struct *vma)
				1299	{
				1300	return vma_is_anonymous(vma) \|\| is_vm_hugetlb_page(vma) \|\|
				1301	vma_is_shmem(vma);
				1302	}
				1303
				1304	static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				1305	unsigned long arg)
				1306	{
				1307	struct mm_struct *mm = ctx->mm;
				1308	struct vm_area_struct vma, prev, *cur;
				1309	int ret;
				1310	struct uffdio_register uffdio_register;
				1311	struct uffdio_register __user *user_uffdio_register;
				1312	unsigned long vm_flags, new_flags;
				1313	bool found;
				1314	bool basic_ioctls;
				1315	unsigned long start, end, vma_end;
				1316
				1317	user_uffdio_register = (struct uffdio_register __user *) arg;
				1318
				1319	ret = -EFAULT;
				1320	if (copy_from_user(&uffdio_register, user_uffdio_register,
				1321	sizeof(uffdio_register)-sizeof(__u64)))
				1322	goto out;
				1323
				1324	ret = -EINVAL;
				1325	if (!uffdio_register.mode)
				1326	goto out;
				1327	if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING\|
				1328	UFFDIO_REGISTER_MODE_WP))
				1329	goto out;
				1330	vm_flags = 0;
				1331	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
				1332	vm_flags \|= VM_UFFD_MISSING;
				1333	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
				1334	vm_flags \|= VM_UFFD_WP;
				1335	/*
				1336	* FIXME: remove the below error constraint by
				1337	* implementing the wprotect tracking mode.
				1338	*/
				1339	ret = -EINVAL;
				1340	goto out;
				1341	}
				1342
				1343	ret = validate_range(mm, &uffdio_register.range.start,
				1344	uffdio_register.range.len);
				1345	if (ret)
				1346	goto out;
				1347
				1348	start = uffdio_register.range.start;
				1349	end = start + uffdio_register.range.len;
				1350
				1351	ret = -ENOMEM;
				1352	if (!mmget_not_zero(mm))
				1353	goto out;
				1354
				1355	down_write(&mm->mmap_sem);
				1356	if (!mmget_still_valid(mm))
				1357	goto out_unlock;
				1358	vma = find_vma_prev(mm, start, &prev);
				1359	if (!vma)
				1360	goto out_unlock;
				1361
				1362	/* check that there's at least one vma in the range */
				1363	ret = -EINVAL;
				1364	if (vma->vm_start >= end)
				1365	goto out_unlock;
				1366
				1367	/*
				1368	* If the first vma contains huge pages, make sure start address
				1369	* is aligned to huge page size.
				1370	*/
				1371	if (is_vm_hugetlb_page(vma)) {
				1372	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1373
				1374	if (start & (vma_hpagesize - 1))
				1375	goto out_unlock;
				1376	}
				1377
				1378	/*
				1379	* Search for not compatible vmas.
				1380	*/
				1381	found = false;
				1382	basic_ioctls = false;
				1383	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1384	cond_resched();
				1385
				1386	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1387	!!(cur->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				1388
				1389	/* check not compatible vmas */
				1390	ret = -EINVAL;
				1391	if (!vma_can_userfault(cur))
				1392	goto out_unlock;
				1393
				1394	/*
				1395	* UFFDIO_COPY will fill file holes even without
				1396	* PROT_WRITE. This check enforces that if this is a
				1397	* MAP_SHARED, the process has write permission to the backing
				1398	* file. If VM_MAYWRITE is set it also enforces that on a
				1399	* MAP_SHARED vma: there is no F_WRITE_SEAL and no further
				1400	* F_WRITE_SEAL can be taken until the vma is destroyed.
				1401	*/
				1402	ret = -EPERM;
				1403	if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
				1404	goto out_unlock;
				1405
				1406	/*
				1407	* If this vma contains ending address, and huge pages
				1408	* check alignment.
				1409	*/
				1410	if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
				1411	end > cur->vm_start) {
				1412	unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
				1413
				1414	ret = -EINVAL;
				1415
				1416	if (end & (vma_hpagesize - 1))
				1417	goto out_unlock;
				1418	}
				1419
				1420	/*
				1421	* Check that this vma isn't already owned by a
				1422	* different userfaultfd. We can't allow more than one
				1423	* userfaultfd to own a single vma simultaneously or we
				1424	* wouldn't know which one to deliver the userfaults to.
				1425	*/
				1426	ret = -EBUSY;
				1427	if (cur->vm_userfaultfd_ctx.ctx &&
				1428	cur->vm_userfaultfd_ctx.ctx != ctx)
				1429	goto out_unlock;
				1430
				1431	/*
				1432	* Note vmas containing huge pages
				1433	*/
				1434	if (is_vm_hugetlb_page(cur))
				1435	basic_ioctls = true;
				1436
				1437	found = true;
				1438	}
				1439	BUG_ON(!found);
				1440
				1441	if (vma->vm_start < start)
				1442	prev = vma;
				1443
				1444	ret = 0;
				1445	do {
				1446	cond_resched();
				1447
				1448	BUG_ON(!vma_can_userfault(vma));
				1449	BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
				1450	vma->vm_userfaultfd_ctx.ctx != ctx);
				1451	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1452
				1453	/*
				1454	* Nothing to do: this vma is already registered into this
				1455	* userfaultfd and with the right tracking mode too.
				1456	*/
				1457	if (vma->vm_userfaultfd_ctx.ctx == ctx &&
				1458	(vma->vm_flags & vm_flags) == vm_flags)
				1459	goto skip;
				1460
				1461	if (vma->vm_start > start)
				1462	start = vma->vm_start;
				1463	vma_end = min(end, vma->vm_end);
				1464
				1465	new_flags = (vma->vm_flags & ~vm_flags) \| vm_flags;
				1466	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1467	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1468	vma_policy(vma),
				1469	((struct vm_userfaultfd_ctx){ ctx }),
				1470	vma_get_anon_name(vma));
				1471	if (prev) {
				1472	vma = prev;
				1473	goto next;
				1474	}
				1475	if (vma->vm_start < start) {
				1476	ret = split_vma(mm, vma, start, 1);
				1477	if (ret)
				1478	break;
				1479	}
				1480	if (vma->vm_end > end) {
				1481	ret = split_vma(mm, vma, end, 0);
				1482	if (ret)
				1483	break;
				1484	}
				1485	next:
				1486	/*
				1487	* In the vma_merge() successful mprotect-like case 8:
				1488	* the next vma was merged into the current one and
				1489	* the current one has not been updated yet.
				1490	*/
				1491	vma->vm_flags = new_flags;
				1492	vma->vm_userfaultfd_ctx.ctx = ctx;
				1493
				1494	skip:
				1495	prev = vma;
				1496	start = vma->vm_end;
				1497	vma = vma->vm_next;
				1498	} while (vma && vma->vm_start < end);
				1499	out_unlock:
				1500	up_write(&mm->mmap_sem);
				1501	mmput(mm);
				1502	if (!ret) {
				1503	/*
				1504	* Now that we scanned all vmas we can already tell
				1505	* userland which ioctls methods are guaranteed to
				1506	* succeed on this range.
				1507	*/
				1508	if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
				1509	UFFD_API_RANGE_IOCTLS,
				1510	&user_uffdio_register->ioctls))
				1511	ret = -EFAULT;
				1512	}
				1513	out:
				1514	return ret;
				1515	}
				1516
				1517	static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				1518	unsigned long arg)
				1519	{
				1520	struct mm_struct *mm = ctx->mm;
				1521	struct vm_area_struct vma, prev, *cur;
				1522	int ret;
				1523	struct uffdio_range uffdio_unregister;
				1524	unsigned long new_flags;
				1525	bool found;
				1526	unsigned long start, end, vma_end;
				1527	const void __user buf = (void __user )arg;
				1528
				1529	ret = -EFAULT;
				1530	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
				1531	goto out;
				1532
				1533	ret = validate_range(mm, &uffdio_unregister.start,
				1534	uffdio_unregister.len);
				1535	if (ret)
				1536	goto out;
				1537
				1538	start = uffdio_unregister.start;
				1539	end = start + uffdio_unregister.len;
				1540
				1541	ret = -ENOMEM;
				1542	if (!mmget_not_zero(mm))
				1543	goto out;
				1544
				1545	down_write(&mm->mmap_sem);
				1546	if (!mmget_still_valid(mm))
				1547	goto out_unlock;
				1548	vma = find_vma_prev(mm, start, &prev);
				1549	if (!vma)
				1550	goto out_unlock;
				1551
				1552	/* check that there's at least one vma in the range */
				1553	ret = -EINVAL;
				1554	if (vma->vm_start >= end)
				1555	goto out_unlock;
				1556
				1557	/*
				1558	* If the first vma contains huge pages, make sure start address
				1559	* is aligned to huge page size.
				1560	*/
				1561	if (is_vm_hugetlb_page(vma)) {
				1562	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1563
				1564	if (start & (vma_hpagesize - 1))
				1565	goto out_unlock;
				1566	}
				1567
				1568	/*
				1569	* Search for not compatible vmas.
				1570	*/
				1571	found = false;
				1572	ret = -EINVAL;
				1573	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1574	cond_resched();
				1575
				1576	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1577	!!(cur->vm_flags & (VM_UFFD_MISSING \| VM_UFFD_WP)));
				1578
				1579	/*
				1580	* Check not compatible vmas, not strictly required
				1581	* here as not compatible vmas cannot have an
				1582	* userfaultfd_ctx registered on them, but this
				1583	* provides for more strict behavior to notice
				1584	* unregistration errors.
				1585	*/
				1586	if (!vma_can_userfault(cur))
				1587	goto out_unlock;
				1588
				1589	found = true;
				1590	}
				1591	BUG_ON(!found);
				1592
				1593	if (vma->vm_start < start)
				1594	prev = vma;
				1595
				1596	ret = 0;
				1597	do {
				1598	cond_resched();
				1599
				1600	BUG_ON(!vma_can_userfault(vma));
				1601
				1602	/*
				1603	* Nothing to do: this vma is already registered into this
				1604	* userfaultfd and with the right tracking mode too.
				1605	*/
				1606	if (!vma->vm_userfaultfd_ctx.ctx)
				1607	goto skip;
				1608
				1609	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1610
				1611	if (vma->vm_start > start)
				1612	start = vma->vm_start;
				1613	vma_end = min(end, vma->vm_end);
				1614
				1615	if (userfaultfd_missing(vma)) {
				1616	/*
				1617	* Wake any concurrent pending userfault while
				1618	* we unregister, so they will not hang
				1619	* permanently and it avoids userland to call
				1620	* UFFDIO_WAKE explicitly.
				1621	*/
				1622	struct userfaultfd_wake_range range;
				1623	range.start = start;
				1624	range.len = vma_end - start;
				1625	wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
				1626	}
				1627
				1628	new_flags = vma->vm_flags & ~(VM_UFFD_MISSING \| VM_UFFD_WP);
				1629	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1630	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1631	vma_policy(vma),
				1632	NULL_VM_UFFD_CTX,
				1633	vma_get_anon_name(vma));
				1634	if (prev) {
				1635	vma = prev;
				1636	goto next;
				1637	}
				1638	if (vma->vm_start < start) {
				1639	ret = split_vma(mm, vma, start, 1);
				1640	if (ret)
				1641	break;
				1642	}
				1643	if (vma->vm_end > end) {
				1644	ret = split_vma(mm, vma, end, 0);
				1645	if (ret)
				1646	break;
				1647	}
				1648	next:
				1649	/*
				1650	* In the vma_merge() successful mprotect-like case 8:
				1651	* the next vma was merged into the current one and
				1652	* the current one has not been updated yet.
				1653	*/
				1654	vma->vm_flags = new_flags;
				1655	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				1656
				1657	skip:
				1658	prev = vma;
				1659	start = vma->vm_end;
				1660	vma = vma->vm_next;
				1661	} while (vma && vma->vm_start < end);
				1662	out_unlock:
				1663	up_write(&mm->mmap_sem);
				1664	mmput(mm);
				1665	out:
				1666	return ret;
				1667	}
				1668
				1669	/*
				1670	* userfaultfd_wake may be used in combination with the
				1671	* UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
				1672	*/
				1673	static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
				1674	unsigned long arg)
				1675	{
				1676	int ret;
				1677	struct uffdio_range uffdio_wake;
				1678	struct userfaultfd_wake_range range;
				1679	const void __user buf = (void __user )arg;
				1680
				1681	ret = -EFAULT;
				1682	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
				1683	goto out;
				1684
				1685	ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
				1686	if (ret)
				1687	goto out;
				1688
				1689	range.start = uffdio_wake.start;
				1690	range.len = uffdio_wake.len;
				1691
				1692	/*
				1693	* len == 0 means wake all and we don't want to wake all here,
				1694	* so check it again to be sure.
				1695	*/
				1696	VM_BUG_ON(!range.len);
				1697
				1698	wake_userfault(ctx, &range);
				1699	ret = 0;
				1700
				1701	out:
				1702	return ret;
				1703	}
				1704
				1705	static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
				1706	unsigned long arg)
				1707	{
				1708	__s64 ret;
				1709	struct uffdio_copy uffdio_copy;
				1710	struct uffdio_copy __user *user_uffdio_copy;
				1711	struct userfaultfd_wake_range range;
				1712
				1713	user_uffdio_copy = (struct uffdio_copy __user *) arg;
				1714
				1715	ret = -EAGAIN;
				1716	if (READ_ONCE(ctx->mmap_changing))
				1717	goto out;
				1718
				1719	ret = -EFAULT;
				1720	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
				1721	/* don't copy "copy" last field */
				1722	sizeof(uffdio_copy)-sizeof(__s64)))
				1723	goto out;
				1724
				1725	ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
				1726	if (ret)
				1727	goto out;
				1728	/*
				1729	* double check for wraparound just in case. copy_from_user()
				1730	* will later check uffdio_copy.src + uffdio_copy.len to fit
				1731	* in the userland range.
				1732	*/
				1733	ret = -EINVAL;
				1734	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
				1735	goto out;
				1736	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
				1737	goto out;
				1738	if (mmget_not_zero(ctx->mm)) {
				1739	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				1740	uffdio_copy.len, &ctx->mmap_changing);
				1741	mmput(ctx->mm);
				1742	} else {
				1743	return -ESRCH;
				1744	}
				1745	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
				1746	return -EFAULT;
				1747	if (ret < 0)
				1748	goto out;
				1749	BUG_ON(!ret);
				1750	/* len == 0 would wake all */
				1751	range.len = ret;
				1752	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
				1753	range.start = uffdio_copy.dst;
				1754	wake_userfault(ctx, &range);
				1755	}
				1756	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
				1757	out:
				1758	return ret;
				1759	}
				1760
				1761	static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				1762	unsigned long arg)
				1763	{
				1764	__s64 ret;
				1765	struct uffdio_zeropage uffdio_zeropage;
				1766	struct uffdio_zeropage __user *user_uffdio_zeropage;
				1767	struct userfaultfd_wake_range range;
				1768
				1769	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
				1770
				1771	ret = -EAGAIN;
				1772	if (READ_ONCE(ctx->mmap_changing))
				1773	goto out;
				1774
				1775	ret = -EFAULT;
				1776	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
				1777	/* don't copy "zeropage" last field */
				1778	sizeof(uffdio_zeropage)-sizeof(__s64)))
				1779	goto out;
				1780
				1781	ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
				1782	uffdio_zeropage.range.len);
				1783	if (ret)
				1784	goto out;
				1785	ret = -EINVAL;
				1786	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
				1787	goto out;
				1788
				1789	if (mmget_not_zero(ctx->mm)) {
				1790	ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				1791	uffdio_zeropage.range.len,
				1792	&ctx->mmap_changing);
				1793	mmput(ctx->mm);
				1794	} else {
				1795	return -ESRCH;
				1796	}
				1797	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
				1798	return -EFAULT;
				1799	if (ret < 0)
				1800	goto out;
				1801	/* len == 0 would wake all */
				1802	BUG_ON(!ret);
				1803	range.len = ret;
				1804	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
				1805	range.start = uffdio_zeropage.range.start;
				1806	wake_userfault(ctx, &range);
				1807	}
				1808	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
				1809	out:
				1810	return ret;
				1811	}
				1812
				1813	static inline unsigned int uffd_ctx_features(__u64 user_features)
				1814	{
				1815	/*
				1816	* For the current set of features the bits just coincide
				1817	*/
				1818	return (unsigned int)user_features;
				1819	}
				1820
				1821	/*
				1822	* userland asks for a certain API version and we return which bits
				1823	* and ioctl commands are implemented in this kernel for such API
				1824	* version or -EINVAL if unknown.
				1825	*/
				1826	static int userfaultfd_api(struct userfaultfd_ctx *ctx,
				1827	unsigned long arg)
				1828	{
				1829	struct uffdio_api uffdio_api;
				1830	void __user buf = (void __user )arg;
				1831	int ret;
				1832	__u64 features;
				1833
				1834	ret = -EINVAL;
				1835	if (ctx->state != UFFD_STATE_WAIT_API)
				1836	goto out;
				1837	ret = -EFAULT;
				1838	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
				1839	goto out;
				1840	features = uffdio_api.features;
				1841	ret = -EINVAL;
				1842	if (uffdio_api.api != UFFD_API \|\| (features & ~UFFD_API_FEATURES))
				1843	goto err_out;
				1844	ret = -EPERM;
				1845	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
				1846	goto err_out;
				1847	/* report all available features and ioctls to userland */
				1848	uffdio_api.features = UFFD_API_FEATURES;
				1849	uffdio_api.ioctls = UFFD_API_IOCTLS;
				1850	ret = -EFAULT;
				1851	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1852	goto out;
				1853	ctx->state = UFFD_STATE_RUNNING;
				1854	/* only enable the requested features for this uffd context */
				1855	ctx->features = uffd_ctx_features(features);
				1856	ret = 0;
				1857	out:
				1858	return ret;
				1859	err_out:
				1860	memset(&uffdio_api, 0, sizeof(uffdio_api));
				1861	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1862	ret = -EFAULT;
				1863	goto out;
				1864	}
				1865
				1866	static long userfaultfd_ioctl(struct file *file, unsigned cmd,
				1867	unsigned long arg)
				1868	{
				1869	int ret = -EINVAL;
				1870	struct userfaultfd_ctx *ctx = file->private_data;
				1871
				1872	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
				1873	return -EINVAL;
				1874
				1875	switch(cmd) {
				1876	case UFFDIO_API:
				1877	ret = userfaultfd_api(ctx, arg);
				1878	break;
				1879	case UFFDIO_REGISTER:
				1880	ret = userfaultfd_register(ctx, arg);
				1881	break;
				1882	case UFFDIO_UNREGISTER:
				1883	ret = userfaultfd_unregister(ctx, arg);
				1884	break;
				1885	case UFFDIO_WAKE:
				1886	ret = userfaultfd_wake(ctx, arg);
				1887	break;
				1888	case UFFDIO_COPY:
				1889	ret = userfaultfd_copy(ctx, arg);
				1890	break;
				1891	case UFFDIO_ZEROPAGE:
				1892	ret = userfaultfd_zeropage(ctx, arg);
				1893	break;
				1894	}
				1895	return ret;
				1896	}
				1897
				1898	#ifdef CONFIG_PROC_FS
				1899	static void userfaultfd_show_fdinfo(struct seq_file m, struct file f)
				1900	{
				1901	struct userfaultfd_ctx *ctx = f->private_data;
				1902	wait_queue_entry_t *wq;
				1903	unsigned long pending = 0, total = 0;
				1904
				1905	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				1906	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
				1907	pending++;
				1908	total++;
				1909	}
				1910	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
				1911	total++;
				1912	}
				1913	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				1914
				1915	/*
				1916	* If more protocols will be added, there will be all shown
				1917	* separated by a space. Like this:
				1918	* protocols: aa:... bb:...
				1919	*/
				1920	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
				1921	pending, total, UFFD_API, ctx->features,
				1922	UFFD_API_IOCTLS\|UFFD_API_RANGE_IOCTLS);
				1923	}
				1924	#endif
				1925
				1926	static const struct file_operations userfaultfd_fops = {
				1927	#ifdef CONFIG_PROC_FS
				1928	.show_fdinfo = userfaultfd_show_fdinfo,
				1929	#endif
				1930	.release = userfaultfd_release,
				1931	.poll = userfaultfd_poll,
				1932	.read = userfaultfd_read,
				1933	.unlocked_ioctl = userfaultfd_ioctl,
				1934	.compat_ioctl = userfaultfd_ioctl,
				1935	.llseek = noop_llseek,
				1936	};
				1937
				1938	static void init_once_userfaultfd_ctx(void *mem)
				1939	{
				1940	struct userfaultfd_ctx ctx = (struct userfaultfd_ctx ) mem;
				1941
				1942	init_waitqueue_head(&ctx->fault_pending_wqh);
				1943	init_waitqueue_head(&ctx->fault_wqh);
				1944	init_waitqueue_head(&ctx->event_wqh);
				1945	init_waitqueue_head(&ctx->fd_wqh);
				1946	seqcount_init(&ctx->refile_seq);
				1947	}
				1948
				1949	SYSCALL_DEFINE1(userfaultfd, int, flags)
				1950	{
				1951	struct userfaultfd_ctx *ctx;
				1952	int fd;
				1953
				1954	BUG_ON(!current->mm);
				1955
				1956	/* Check the UFFD_* constants for consistency. */
				1957	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
				1958	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
				1959
				1960	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
				1961	return -EINVAL;
				1962
				1963	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				1964	if (!ctx)
				1965	return -ENOMEM;
				1966
				1967	atomic_set(&ctx->refcount, 1);
				1968	ctx->flags = flags;
				1969	ctx->features = 0;
				1970	ctx->state = UFFD_STATE_WAIT_API;
				1971	ctx->released = false;
				1972	ctx->mmap_changing = false;
				1973	ctx->mm = current->mm;
				1974	/* prevent the mm struct to be freed */
				1975	mmgrab(ctx->mm);
				1976
				1977	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
				1978	O_RDWR \| (flags & UFFD_SHARED_FCNTL_FLAGS));
				1979	if (fd < 0) {
				1980	mmdrop(ctx->mm);
				1981	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				1982	}
				1983	return fd;
				1984	}
				1985
				1986	static int __init userfaultfd_init(void)
				1987	{
				1988	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
				1989	sizeof(struct userfaultfd_ctx),
				1990	0,
				1991	SLAB_HWCACHE_ALIGN\|SLAB_PANIC,
				1992	init_once_userfaultfd_ctx);
				1993	return 0;
				1994	}
				1995	__initcall(userfaultfd_init);