Blame - marvell/linux/fs/userfaultfd.c - T108

blob: 318135c02f12577750ed9a3c50327a373c436f90 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* fs/userfaultfd.c
				4	*
				5	* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
				6	* Copyright (C) 2008-2009 Red Hat, Inc.
				7	* Copyright (C) 2015 Red Hat, Inc.
				8	*
				9	* Some part derived from fs/eventfd.c (anon inode setup) and
				10	* mm/ksm.c (mm hashing).
				11	*/
				12
				13	#include <linux/list.h>
				14	#include <linux/hashtable.h>
				15	#include <linux/sched/signal.h>
				16	#include <linux/sched/mm.h>
				17	#include <linux/mm.h>
				18	#include <linux/mmu_notifier.h>
				19	#include <linux/poll.h>
				20	#include <linux/slab.h>
				21	#include <linux/seq_file.h>
				22	#include <linux/file.h>
				23	#include <linux/bug.h>
				24	#include <linux/anon_inodes.h>
				25	#include <linux/syscalls.h>
				26	#include <linux/userfaultfd_k.h>
				27	#include <linux/mempolicy.h>
				28	#include <linux/ioctl.h>
				29	#include <linux/security.h>
				30	#include <linux/hugetlb.h>
				31
				32	int sysctl_unprivileged_userfaultfd __read_mostly;
				33
				34	static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
				35
				36	/*
				37	* Start with fault_pending_wqh and fault_wqh so they're more likely
				38	* to be in the same cacheline.
				39	*
				40	* Locking order:
				41	* fd_wqh.lock
				42	* fault_pending_wqh.lock
				43	* fault_wqh.lock
				44	* event_wqh.lock
				45	*
				46	* To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
				47	* since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
				48	* also taken in IRQ context.
				49	*/
				50	struct userfaultfd_ctx {
				51	/* waitqueue head for the pending (i.e. not read) userfaults */
				52	wait_queue_head_t fault_pending_wqh;
				53	/* waitqueue head for the userfaults */
				54	wait_queue_head_t fault_wqh;
				55	/* waitqueue head for the pseudo fd to wakeup poll/read */
				56	wait_queue_head_t fd_wqh;
				57	/* waitqueue head for events */
				58	wait_queue_head_t event_wqh;
				59	/* a refile sequence protected by fault_pending_wqh lock */
				60	struct seqcount refile_seq;
				61	/* pseudo fd refcounting */
				62	refcount_t refcount;
				63	/* userfaultfd syscall flags */
				64	unsigned int flags;
				65	/* features requested from the userspace */
				66	unsigned int features;
				67	/* released */
				68	bool released;
				69	/* memory mappings are changing because of non-cooperative event */
				70	bool mmap_changing;
				71	/* mm with one ore more vmas attached to this userfaultfd_ctx */
				72	struct mm_struct *mm;
				73	};
				74
				75	struct userfaultfd_fork_ctx {
				76	struct userfaultfd_ctx *orig;
				77	struct userfaultfd_ctx *new;
				78	struct list_head list;
				79	};
				80
				81	struct userfaultfd_unmap_ctx {
				82	struct userfaultfd_ctx *ctx;
				83	unsigned long start;
				84	unsigned long end;
				85	struct list_head list;
				86	};
				87
				88	struct userfaultfd_wait_queue {
				89	struct uffd_msg msg;
				90	wait_queue_entry_t wq;
				91	struct userfaultfd_ctx *ctx;
				92	bool waken;
				93	};
				94
				95	struct userfaultfd_wake_range {
				96	unsigned long start;
				97	unsigned long len;
				98	};
				99
				100	/* internal indication that UFFD_API ioctl was successfully executed */
				101	#define UFFD_FEATURE_INITIALIZED (1u << 31)
				102
				103	static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
				104	{
				105	return ctx->features & UFFD_FEATURE_INITIALIZED;
				106	}
				107
				108	static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				109	int wake_flags, void *key)
				110	{
				111	struct userfaultfd_wake_range *range = key;
				112	int ret;
				113	struct userfaultfd_wait_queue *uwq;
				114	unsigned long start, len;
				115
				116	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				117	ret = 0;
				118	/* len == 0 means wake all */
				119	start = range->start;
				120	len = range->len;
				121	if (len && (start > uwq->msg.arg.pagefault.address \|\|
				122	start + len <= uwq->msg.arg.pagefault.address))
				123	goto out;
				124	WRITE_ONCE(uwq->waken, true);
				125	/*
				126	* The Program-Order guarantees provided by the scheduler
				127	* ensure uwq->waken is visible before the task is woken.
				128	*/
				129	ret = wake_up_state(wq->private, mode);
				130	if (ret) {
				131	/*
				132	* Wake only once, autoremove behavior.
				133	*
				134	* After the effect of list_del_init is visible to the other
				135	* CPUs, the waitqueue may disappear from under us, see the
				136	* !list_empty_careful() in handle_userfault().
				137	*
				138	* try_to_wake_up() has an implicit smp_mb(), and the
				139	* wq->private is read before calling the extern function
				140	* "wake_up_state" (which in turns calls try_to_wake_up).
				141	*/
				142	list_del_init(&wq->entry);
				143	}
				144	out:
				145	return ret;
				146	}
				147
				148	/**
				149	* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
				150	* context.
				151	* @ctx: [in] Pointer to the userfaultfd context.
				152	*/
				153	static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
				154	{
				155	refcount_inc(&ctx->refcount);
				156	}
				157
				158	/**
				159	* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
				160	* context.
				161	* @ctx: [in] Pointer to userfaultfd context.
				162	*
				163	* The userfaultfd context reference must have been previously acquired either
				164	* with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
				165	*/
				166	static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
				167	{
				168	if (refcount_dec_and_test(&ctx->refcount)) {
				169	VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
				170	VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
				171	VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
				172	VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
				173	VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
				174	VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
				175	VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
				176	VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
				177	mmdrop(ctx->mm);
				178	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				179	}
				180	}
				181
				182	static inline void msg_init(struct uffd_msg *msg)
				183	{
				184	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
				185	/*
				186	* Must use memset to zero out the paddings or kernel data is
				187	* leaked to userland.
				188	*/
				189	memset(msg, 0, sizeof(struct uffd_msg));
				190	}
				191
				192	static inline struct uffd_msg userfault_msg(unsigned long address,
				193	unsigned int flags,
				194	unsigned long reason,
				195	unsigned int features)
				196	{
				197	struct uffd_msg msg;
				198	msg_init(&msg);
				199	msg.event = UFFD_EVENT_PAGEFAULT;
				200	msg.arg.pagefault.address = address;
				201	/*
				202	* These flags indicate why the userfault occurred:
				203	* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
				204	* - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
				205	* - Neither of these flags being set indicates a MISSING fault.
				206	*
				207	* Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
				208	* fault. Otherwise, it was a read fault.
				209	*/
				210	if (flags & FAULT_FLAG_WRITE)
				211	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WRITE;
				212	if (reason & VM_UFFD_WP)
				213	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_WP;
				214	if (reason & VM_UFFD_MINOR)
				215	msg.arg.pagefault.flags \|= UFFD_PAGEFAULT_FLAG_MINOR;
				216	if (features & UFFD_FEATURE_THREAD_ID)
				217	msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
				218	return msg;
				219	}
				220
				221	#ifdef CONFIG_HUGETLB_PAGE
				222	/*
				223	* Same functionality as userfaultfd_must_wait below with modifications for
				224	* hugepmd ranges.
				225	*/
				226	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				227	struct vm_area_struct *vma,
				228	unsigned long address,
				229	unsigned long flags,
				230	unsigned long reason)
				231	{
				232	struct mm_struct *mm = ctx->mm;
				233	pte_t *ptep, pte;
				234	bool ret = true;
				235
				236	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				237
				238	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
				239
				240	if (!ptep)
				241	goto out;
				242
				243	ret = false;
				244	pte = huge_ptep_get(ptep);
				245
				246	/*
				247	* Lockless access: we're in a wait_event so it's ok if it
				248	* changes under us.
				249	*/
				250	if (huge_pte_none(pte))
				251	ret = true;
				252	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
				253	ret = true;
				254	out:
				255	return ret;
				256	}
				257	#else
				258	static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
				259	struct vm_area_struct *vma,
				260	unsigned long address,
				261	unsigned long flags,
				262	unsigned long reason)
				263	{
				264	return false; /* should never get here */
				265	}
				266	#endif /* CONFIG_HUGETLB_PAGE */
				267
				268	/*
				269	* Verify the pagetables are still not ok after having reigstered into
				270	* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
				271	* userfault that has already been resolved, if userfaultfd_read and
				272	* UFFDIO_COPY\|ZEROPAGE are being run simultaneously on two different
				273	* threads.
				274	*/
				275	static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
				276	unsigned long address,
				277	unsigned long flags,
				278	unsigned long reason)
				279	{
				280	struct mm_struct *mm = ctx->mm;
				281	pgd_t *pgd;
				282	p4d_t *p4d;
				283	pud_t *pud;
				284	pmd_t *pmd, _pmd;
				285	pte_t *pte;
				286	bool ret = true;
				287
				288	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
				289
				290	pgd = pgd_offset(mm, address);
				291	if (!pgd_present(*pgd))
				292	goto out;
				293	p4d = p4d_offset(pgd, address);
				294	if (!p4d_present(*p4d))
				295	goto out;
				296	pud = pud_offset(p4d, address);
				297	if (!pud_present(*pud))
				298	goto out;
				299	pmd = pmd_offset(pud, address);
				300	/*
				301	* READ_ONCE must function as a barrier with narrower scope
				302	* and it must be equivalent to:
				303	* _pmd = *pmd; barrier();
				304	*
				305	* This is to deal with the instability (as in
				306	* pmd_trans_unstable) of the pmd.
				307	*/
				308	_pmd = READ_ONCE(*pmd);
				309	if (pmd_none(_pmd))
				310	goto out;
				311
				312	ret = false;
				313	if (!pmd_present(_pmd))
				314	goto out;
				315
				316	if (pmd_trans_huge(_pmd))
				317	goto out;
				318
				319	/*
				320	* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
				321	* and use the standard pte_offset_map() instead of parsing _pmd.
				322	*/
				323	pte = pte_offset_map(pmd, address);
				324	/*
				325	* Lockless access: we're in a wait_event so it's ok if it
				326	* changes under us.
				327	*/
				328	if (pte_none(*pte))
				329	ret = true;
				330	pte_unmap(pte);
				331
				332	out:
				333	return ret;
				334	}
				335
				336	/* Should pair with userfaultfd_signal_pending() */
				337	static inline long userfaultfd_get_blocking_state(unsigned int flags)
				338	{
				339	if (flags & FAULT_FLAG_INTERRUPTIBLE)
				340	return TASK_INTERRUPTIBLE;
				341
				342	if (flags & FAULT_FLAG_KILLABLE)
				343	return TASK_KILLABLE;
				344
				345	return TASK_UNINTERRUPTIBLE;
				346	}
				347
				348	/* Should pair with userfaultfd_get_blocking_state() */
				349	static inline bool userfaultfd_signal_pending(unsigned int flags)
				350	{
				351	if (flags & FAULT_FLAG_INTERRUPTIBLE)
				352	return signal_pending(current);
				353
				354	if (flags & FAULT_FLAG_KILLABLE)
				355	return fatal_signal_pending(current);
				356
				357	return false;
				358	}
				359
				360	/*
				361	* The locking rules involved in returning VM_FAULT_RETRY depending on
				362	* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
				363	* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
				364	* recommendation in __lock_page_or_retry is not an understatement.
				365	*
				366	* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
				367	* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
				368	* not set.
				369	*
				370	* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
				371	* set, VM_FAULT_RETRY can still be returned if and only if there are
				372	* fatal_signal_pending()s, and the mmap_sem must be released before
				373	* returning it.
				374	*/
				375	vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
				376	{
				377	struct mm_struct *mm = vmf->vma->vm_mm;
				378	struct userfaultfd_ctx *ctx;
				379	struct userfaultfd_wait_queue uwq;
				380	vm_fault_t ret = VM_FAULT_SIGBUS;
				381	bool must_wait;
				382	long blocking_state;
				383
				384	/*
				385	* We don't do userfault handling for the final child pid update.
				386	*
				387	* We also don't do userfault handling during
				388	* coredumping. hugetlbfs has the special
				389	* follow_hugetlb_page() to skip missing pages in the
				390	* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
				391	* the no_page_table() helper in follow_page_mask(), but the
				392	* shmem_vm_ops->fault method is invoked even during
				393	* coredumping without mmap_sem and it ends up here.
				394	*/
				395	if (current->flags & (PF_EXITING\|PF_DUMPCORE))
				396	goto out;
				397
				398	/*
				399	* Coredumping runs without mmap_sem so we can only check that
				400	* the mmap_sem is held, if PF_DUMPCORE was not set.
				401	*/
				402	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
				403
				404	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
				405	if (!ctx)
				406	goto out;
				407
				408	BUG_ON(ctx->mm != mm);
				409
				410	/* Any unrecognized flag is a bug. */
				411	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
				412	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
				413	VM_BUG_ON(!reason \|\| (reason & (reason - 1)));
				414
				415	if (ctx->features & UFFD_FEATURE_SIGBUS)
				416	goto out;
				417	if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
				418	ctx->flags & UFFD_USER_MODE_ONLY) {
				419	printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
				420	"sysctl knob to 1 if kernel faults must be handled "
				421	"without obtaining CAP_SYS_PTRACE capability\n");
				422	goto out;
				423	}
				424
				425	/*
				426	* If it's already released don't get it. This avoids to loop
				427	* in __get_user_pages if userfaultfd_release waits on the
				428	* caller of handle_userfault to release the mmap_sem.
				429	*/
				430	if (unlikely(READ_ONCE(ctx->released))) {
				431	/*
				432	* Don't return VM_FAULT_SIGBUS in this case, so a non
				433	* cooperative manager can close the uffd after the
				434	* last UFFDIO_COPY, without risking to trigger an
				435	* involuntary SIGBUS if the process was starting the
				436	* userfaultfd while the userfaultfd was still armed
				437	* (but after the last UFFDIO_COPY). If the uffd
				438	* wasn't already closed when the userfault reached
				439	* this point, that would normally be solved by
				440	* userfaultfd_must_wait returning 'false'.
				441	*
				442	* If we were to return VM_FAULT_SIGBUS here, the non
				443	* cooperative manager would be instead forced to
				444	* always call UFFDIO_UNREGISTER before it can safely
				445	* close the uffd.
				446	*/
				447	ret = VM_FAULT_NOPAGE;
				448	goto out;
				449	}
				450
				451	/*
				452	* Check that we can return VM_FAULT_RETRY.
				453	*
				454	* NOTE: it should become possible to return VM_FAULT_RETRY
				455	* even if FAULT_FLAG_TRIED is set without leading to gup()
				456	* -EBUSY failures, if the userfaultfd is to be extended for
				457	* VM_UFFD_WP tracking and we intend to arm the userfault
				458	* without first stopping userland access to the memory. For
				459	* VM_UFFD_MISSING userfaults this is enough for now.
				460	*/
				461	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
				462	/*
				463	* Validate the invariant that nowait must allow retry
				464	* to be sure not to return SIGBUS erroneously on
				465	* nowait invocations.
				466	*/
				467	BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
				468	#ifdef CONFIG_DEBUG_VM
				469	if (printk_ratelimit()) {
				470	printk(KERN_WARNING
				471	"FAULT_FLAG_ALLOW_RETRY missing %x\n",
				472	vmf->flags);
				473	dump_stack();
				474	}
				475	#endif
				476	goto out;
				477	}
				478
				479	/*
				480	* Handle nowait, not much to do other than tell it to retry
				481	* and wait.
				482	*/
				483	ret = VM_FAULT_RETRY;
				484	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
				485	goto out;
				486
				487	/* take the reference before dropping the mmap_sem */
				488	userfaultfd_ctx_get(ctx);
				489
				490	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
				491	uwq.wq.private = current;
				492	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
				493	ctx->features);
				494	uwq.ctx = ctx;
				495	uwq.waken = false;
				496
				497	blocking_state = userfaultfd_get_blocking_state(vmf->flags);
				498
				499	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				500	/*
				501	* After the __add_wait_queue the uwq is visible to userland
				502	* through poll/read().
				503	*/
				504	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
				505	/*
				506	* The smp_mb() after __set_current_state prevents the reads
				507	* following the spin_unlock to happen before the list_add in
				508	* __add_wait_queue.
				509	*/
				510	set_current_state(blocking_state);
				511	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				512
				513	if (!is_vm_hugetlb_page(vmf->vma))
				514	must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
				515	reason);
				516	else
				517	must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
				518	vmf->address,
				519	vmf->flags, reason);
				520	up_read(&mm->mmap_sem);
				521
				522	if (likely(must_wait && !READ_ONCE(ctx->released) &&
				523	!userfaultfd_signal_pending(vmf->flags))) {
				524	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
				525	schedule();
				526	ret \|= VM_FAULT_MAJOR;
				527
				528	/*
				529	* False wakeups can orginate even from rwsem before
				530	* up_read() however userfaults will wait either for a
				531	* targeted wakeup on the specific uwq waitqueue from
				532	* wake_userfault() or for signals or for uffd
				533	* release.
				534	*/
				535	while (!READ_ONCE(uwq.waken)) {
				536	/*
				537	* This needs the full smp_store_mb()
				538	* guarantee as the state write must be
				539	* visible to other CPUs before reading
				540	* uwq.waken from other CPUs.
				541	*/
				542	set_current_state(blocking_state);
				543	if (READ_ONCE(uwq.waken) \|\|
				544	READ_ONCE(ctx->released) \|\|
				545	userfaultfd_signal_pending(vmf->flags))
				546	break;
				547	schedule();
				548	}
				549	}
				550
				551	__set_current_state(TASK_RUNNING);
				552
				553	/*
				554	* Here we race with the list_del; list_add in
				555	* userfaultfd_ctx_read(), however because we don't ever run
				556	* list_del_init() to refile across the two lists, the prev
				557	* and next pointers will never point to self. list_add also
				558	* would never let any of the two pointers to point to
				559	* self. So list_empty_careful won't risk to see both pointers
				560	* pointing to self at any time during the list refile. The
				561	* only case where list_del_init() is called is the full
				562	* removal in the wake function and there we don't re-list_add
				563	* and it's fine not to block on the spinlock. The uwq on this
				564	* kernel stack can be released after the list_del_init.
				565	*/
				566	if (!list_empty_careful(&uwq.wq.entry)) {
				567	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				568	/*
				569	* No need of list_del_init(), the uwq on the stack
				570	* will be freed shortly anyway.
				571	*/
				572	list_del(&uwq.wq.entry);
				573	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				574	}
				575
				576	/*
				577	* ctx may go away after this if the userfault pseudo fd is
				578	* already released.
				579	*/
				580	userfaultfd_ctx_put(ctx);
				581
				582	out:
				583	return ret;
				584	}
				585
				586	static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
				587	struct userfaultfd_wait_queue *ewq)
				588	{
				589	struct userfaultfd_ctx *release_new_ctx;
				590
				591	if (WARN_ON_ONCE(current->flags & PF_EXITING))
				592	goto out;
				593
				594	ewq->ctx = ctx;
				595	init_waitqueue_entry(&ewq->wq, current);
				596	release_new_ctx = NULL;
				597
				598	spin_lock_irq(&ctx->event_wqh.lock);
				599	/*
				600	* After the __add_wait_queue the uwq is visible to userland
				601	* through poll/read().
				602	*/
				603	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
				604	for (;;) {
				605	set_current_state(TASK_KILLABLE);
				606	if (ewq->msg.event == 0)
				607	break;
				608	if (READ_ONCE(ctx->released) \|\|
				609	fatal_signal_pending(current)) {
				610	/*
				611	* &ewq->wq may be queued in fork_event, but
				612	* __remove_wait_queue ignores the head
				613	* parameter. It would be a problem if it
				614	* didn't.
				615	*/
				616	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				617	if (ewq->msg.event == UFFD_EVENT_FORK) {
				618	struct userfaultfd_ctx *new;
				619
				620	new = (struct userfaultfd_ctx *)
				621	(unsigned long)
				622	ewq->msg.arg.reserved.reserved1;
				623	release_new_ctx = new;
				624	}
				625	break;
				626	}
				627
				628	spin_unlock_irq(&ctx->event_wqh.lock);
				629
				630	wake_up_poll(&ctx->fd_wqh, EPOLLIN);
				631	schedule();
				632
				633	spin_lock_irq(&ctx->event_wqh.lock);
				634	}
				635	__set_current_state(TASK_RUNNING);
				636	spin_unlock_irq(&ctx->event_wqh.lock);
				637
				638	if (release_new_ctx) {
				639	struct vm_area_struct *vma;
				640	struct mm_struct *mm = release_new_ctx->mm;
				641
				642	/* the various vma->vm_userfaultfd_ctx still points to it */
				643	down_write(&mm->mmap_sem);
				644	/* no task can run (and in turn coredump) yet */
				645	VM_WARN_ON(!mmget_still_valid(mm));
				646	for (vma = mm->mmap; vma; vma = vma->vm_next)
				647	if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				648	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				649	vma->vm_flags &= ~__VM_UFFD_FLAGS;
				650	}
				651	up_write(&mm->mmap_sem);
				652
				653	userfaultfd_ctx_put(release_new_ctx);
				654	}
				655
				656	/*
				657	* ctx may go away after this if the userfault pseudo fd is
				658	* already released.
				659	*/
				660	out:
				661	WRITE_ONCE(ctx->mmap_changing, false);
				662	userfaultfd_ctx_put(ctx);
				663	}
				664
				665	static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				666	struct userfaultfd_wait_queue *ewq)
				667	{
				668	ewq->msg.event = 0;
				669	wake_up_locked(&ctx->event_wqh);
				670	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
				671	}
				672
				673	int dup_userfaultfd(struct vm_area_struct vma, struct list_head fcs)
				674	{
				675	struct userfaultfd_ctx ctx = NULL, octx;
				676	struct userfaultfd_fork_ctx *fctx;
				677
				678	octx = vma->vm_userfaultfd_ctx.ctx;
				679	if (!octx \|\| !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
				680	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				681	vma->vm_flags &= ~__VM_UFFD_FLAGS;
				682	return 0;
				683	}
				684
				685	list_for_each_entry(fctx, fcs, list)
				686	if (fctx->orig == octx) {
				687	ctx = fctx->new;
				688	break;
				689	}
				690
				691	if (!ctx) {
				692	fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
				693	if (!fctx)
				694	return -ENOMEM;
				695
				696	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				697	if (!ctx) {
				698	kfree(fctx);
				699	return -ENOMEM;
				700	}
				701
				702	refcount_set(&ctx->refcount, 1);
				703	ctx->flags = octx->flags;
				704	ctx->features = octx->features;
				705	ctx->released = false;
				706	ctx->mmap_changing = false;
				707	ctx->mm = vma->vm_mm;
				708	mmgrab(ctx->mm);
				709
				710	userfaultfd_ctx_get(octx);
				711	WRITE_ONCE(octx->mmap_changing, true);
				712	fctx->orig = octx;
				713	fctx->new = ctx;
				714	list_add_tail(&fctx->list, fcs);
				715	}
				716
				717	vma->vm_userfaultfd_ctx.ctx = ctx;
				718	return 0;
				719	}
				720
				721	static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
				722	{
				723	struct userfaultfd_ctx *ctx = fctx->orig;
				724	struct userfaultfd_wait_queue ewq;
				725
				726	msg_init(&ewq.msg);
				727
				728	ewq.msg.event = UFFD_EVENT_FORK;
				729	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
				730
				731	userfaultfd_event_wait_completion(ctx, &ewq);
				732	}
				733
				734	void dup_userfaultfd_complete(struct list_head *fcs)
				735	{
				736	struct userfaultfd_fork_ctx fctx, n;
				737
				738	list_for_each_entry_safe(fctx, n, fcs, list) {
				739	dup_fctx(fctx);
				740	list_del(&fctx->list);
				741	kfree(fctx);
				742	}
				743	}
				744
				745	void mremap_userfaultfd_prep(struct vm_area_struct *vma,
				746	struct vm_userfaultfd_ctx *vm_ctx)
				747	{
				748	struct userfaultfd_ctx *ctx;
				749
				750	ctx = vma->vm_userfaultfd_ctx.ctx;
				751
				752	if (!ctx)
				753	return;
				754
				755	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
				756	vm_ctx->ctx = ctx;
				757	userfaultfd_ctx_get(ctx);
				758	WRITE_ONCE(ctx->mmap_changing, true);
				759	} else {
				760	/* Drop uffd context if remap feature not enabled */
				761	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				762	vma->vm_flags &= ~__VM_UFFD_FLAGS;
				763	}
				764	}
				765
				766	void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				767	unsigned long from, unsigned long to,
				768	unsigned long len)
				769	{
				770	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
				771	struct userfaultfd_wait_queue ewq;
				772
				773	if (!ctx)
				774	return;
				775
				776	if (to & ~PAGE_MASK) {
				777	userfaultfd_ctx_put(ctx);
				778	return;
				779	}
				780
				781	msg_init(&ewq.msg);
				782
				783	ewq.msg.event = UFFD_EVENT_REMAP;
				784	ewq.msg.arg.remap.from = from;
				785	ewq.msg.arg.remap.to = to;
				786	ewq.msg.arg.remap.len = len;
				787
				788	userfaultfd_event_wait_completion(ctx, &ewq);
				789	}
				790
				791	bool userfaultfd_remove(struct vm_area_struct *vma,
				792	unsigned long start, unsigned long end)
				793	{
				794	struct mm_struct *mm = vma->vm_mm;
				795	struct userfaultfd_ctx *ctx;
				796	struct userfaultfd_wait_queue ewq;
				797
				798	ctx = vma->vm_userfaultfd_ctx.ctx;
				799	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
				800	return true;
				801
				802	userfaultfd_ctx_get(ctx);
				803	WRITE_ONCE(ctx->mmap_changing, true);
				804	up_read(&mm->mmap_sem);
				805
				806	msg_init(&ewq.msg);
				807
				808	ewq.msg.event = UFFD_EVENT_REMOVE;
				809	ewq.msg.arg.remove.start = start;
				810	ewq.msg.arg.remove.end = end;
				811
				812	userfaultfd_event_wait_completion(ctx, &ewq);
				813
				814	return false;
				815	}
				816
				817	static bool has_unmap_ctx(struct userfaultfd_ctx ctx, struct list_head unmaps,
				818	unsigned long start, unsigned long end)
				819	{
				820	struct userfaultfd_unmap_ctx *unmap_ctx;
				821
				822	list_for_each_entry(unmap_ctx, unmaps, list)
				823	if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
				824	unmap_ctx->end == end)
				825	return true;
				826
				827	return false;
				828	}
				829
				830	int userfaultfd_unmap_prep(struct vm_area_struct *vma,
				831	unsigned long start, unsigned long end,
				832	struct list_head *unmaps)
				833	{
				834	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
				835	struct userfaultfd_unmap_ctx *unmap_ctx;
				836	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
				837
				838	if (!ctx \|\| !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) \|\|
				839	has_unmap_ctx(ctx, unmaps, start, end))
				840	continue;
				841
				842	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
				843	if (!unmap_ctx)
				844	return -ENOMEM;
				845
				846	userfaultfd_ctx_get(ctx);
				847	WRITE_ONCE(ctx->mmap_changing, true);
				848	unmap_ctx->ctx = ctx;
				849	unmap_ctx->start = start;
				850	unmap_ctx->end = end;
				851	list_add_tail(&unmap_ctx->list, unmaps);
				852	}
				853
				854	return 0;
				855	}
				856
				857	void userfaultfd_unmap_complete(struct mm_struct mm, struct list_head uf)
				858	{
				859	struct userfaultfd_unmap_ctx ctx, n;
				860	struct userfaultfd_wait_queue ewq;
				861
				862	list_for_each_entry_safe(ctx, n, uf, list) {
				863	msg_init(&ewq.msg);
				864
				865	ewq.msg.event = UFFD_EVENT_UNMAP;
				866	ewq.msg.arg.remove.start = ctx->start;
				867	ewq.msg.arg.remove.end = ctx->end;
				868
				869	userfaultfd_event_wait_completion(ctx->ctx, &ewq);
				870
				871	list_del(&ctx->list);
				872	kfree(ctx);
				873	}
				874	}
				875
				876	static int userfaultfd_release(struct inode inode, struct file file)
				877	{
				878	struct userfaultfd_ctx *ctx = file->private_data;
				879	struct mm_struct *mm = ctx->mm;
				880	struct vm_area_struct vma, prev;
				881	/* len == 0 means wake all */
				882	struct userfaultfd_wake_range range = { .len = 0, };
				883	unsigned long new_flags;
				884	bool still_valid;
				885
				886	WRITE_ONCE(ctx->released, true);
				887
				888	if (!mmget_not_zero(mm))
				889	goto wakeup;
				890
				891	/*
				892	* Flush page faults out of all CPUs. NOTE: all page faults
				893	* must be retried without returning VM_FAULT_SIGBUS if
				894	* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
				895	* changes while handle_userfault released the mmap_sem. So
				896	* it's critical that released is set to true (above), before
				897	* taking the mmap_sem for writing.
				898	*/
				899	down_write(&mm->mmap_sem);
				900	still_valid = mmget_still_valid(mm);
				901	prev = NULL;
				902	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				903	cond_resched();
				904	BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
				905	!!(vma->vm_flags & __VM_UFFD_FLAGS));
				906	if (vma->vm_userfaultfd_ctx.ctx != ctx) {
				907	prev = vma;
				908	continue;
				909	}
				910	new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
				911	if (still_valid) {
				912	prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				913	new_flags, vma->anon_vma,
				914	vma->vm_file, vma->vm_pgoff,
				915	vma_policy(vma),
				916	NULL_VM_UFFD_CTX,
				917	vma_get_anon_name(vma));
				918	if (prev)
				919	vma = prev;
				920	else
				921	prev = vma;
				922	}
				923	vma->vm_flags = new_flags;
				924	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				925	}
				926	up_write(&mm->mmap_sem);
				927	mmput(mm);
				928	wakeup:
				929	/*
				930	* After no new page faults can wait on this fault_*wqh, flush
				931	* the last page faults that may have been already waiting on
				932	* the fault_*wqh.
				933	*/
				934	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				935	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
				936	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
				937	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				938
				939	/* Flush pending events that may still wait on event_wqh */
				940	wake_up_all(&ctx->event_wqh);
				941
				942	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
				943	userfaultfd_ctx_put(ctx);
				944	return 0;
				945	}
				946
				947	/* fault_pending_wqh.lock must be hold by the caller */
				948	static inline struct userfaultfd_wait_queue *find_userfault_in(
				949	wait_queue_head_t *wqh)
				950	{
				951	wait_queue_entry_t *wq;
				952	struct userfaultfd_wait_queue *uwq;
				953
				954	lockdep_assert_held(&wqh->lock);
				955
				956	uwq = NULL;
				957	if (!waitqueue_active(wqh))
				958	goto out;
				959	/* walk in reverse to provide FIFO behavior to read userfaults */
				960	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
				961	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
				962	out:
				963	return uwq;
				964	}
				965
				966	static inline struct userfaultfd_wait_queue *find_userfault(
				967	struct userfaultfd_ctx *ctx)
				968	{
				969	return find_userfault_in(&ctx->fault_pending_wqh);
				970	}
				971
				972	static inline struct userfaultfd_wait_queue *find_userfault_evt(
				973	struct userfaultfd_ctx *ctx)
				974	{
				975	return find_userfault_in(&ctx->event_wqh);
				976	}
				977
				978	static __poll_t userfaultfd_poll(struct file file, poll_table wait)
				979	{
				980	struct userfaultfd_ctx *ctx = file->private_data;
				981	__poll_t ret;
				982
				983	poll_wait(file, &ctx->fd_wqh, wait);
				984
				985	if (!userfaultfd_is_initialized(ctx))
				986	return EPOLLERR;
				987
				988	/*
				989	* poll() never guarantees that read won't block.
				990	* userfaults can be waken before they're read().
				991	*/
				992	if (unlikely(!(file->f_flags & O_NONBLOCK)))
				993	return EPOLLERR;
				994	/*
				995	* lockless access to see if there are pending faults
				996	* __pollwait last action is the add_wait_queue but
				997	* the spin_unlock would allow the waitqueue_active to
				998	* pass above the actual list_add inside
				999	* add_wait_queue critical section. So use a full
				1000	* memory barrier to serialize the list_add write of
				1001	* add_wait_queue() with the waitqueue_active read
				1002	* below.
				1003	*/
				1004	ret = 0;
				1005	smp_mb();
				1006	if (waitqueue_active(&ctx->fault_pending_wqh))
				1007	ret = EPOLLIN;
				1008	else if (waitqueue_active(&ctx->event_wqh))
				1009	ret = EPOLLIN;
				1010
				1011	return ret;
				1012	}
				1013
				1014	static const struct file_operations userfaultfd_fops;
				1015
				1016	static int resolve_userfault_fork(struct userfaultfd_ctx *new,
				1017	struct inode *inode,
				1018	struct uffd_msg *msg)
				1019	{
				1020	int fd;
				1021
				1022	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
				1023	O_RDONLY \| (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
				1024	if (fd < 0)
				1025	return fd;
				1026
				1027	msg->arg.reserved.reserved1 = 0;
				1028	msg->arg.fork.ufd = fd;
				1029	return 0;
				1030	}
				1031
				1032	static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				1033	struct uffd_msg msg, struct inode inode)
				1034	{
				1035	ssize_t ret;
				1036	DECLARE_WAITQUEUE(wait, current);
				1037	struct userfaultfd_wait_queue *uwq;
				1038	/*
				1039	* Handling fork event requires sleeping operations, so
				1040	* we drop the event_wqh lock, then do these ops, then
				1041	* lock it back and wake up the waiter. While the lock is
				1042	* dropped the ewq may go away so we keep track of it
				1043	* carefully.
				1044	*/
				1045	LIST_HEAD(fork_event);
				1046	struct userfaultfd_ctx *fork_nctx = NULL;
				1047
				1048	/* always take the fd_wqh lock before the fault_pending_wqh lock */
				1049	spin_lock_irq(&ctx->fd_wqh.lock);
				1050	__add_wait_queue(&ctx->fd_wqh, &wait);
				1051	for (;;) {
				1052	set_current_state(TASK_INTERRUPTIBLE);
				1053	spin_lock(&ctx->fault_pending_wqh.lock);
				1054	uwq = find_userfault(ctx);
				1055	if (uwq) {
				1056	/*
				1057	* Use a seqcount to repeat the lockless check
				1058	* in wake_userfault() to avoid missing
				1059	* wakeups because during the refile both
				1060	* waitqueue could become empty if this is the
				1061	* only userfault.
				1062	*/
				1063	write_seqcount_begin(&ctx->refile_seq);
				1064
				1065	/*
				1066	* The fault_pending_wqh.lock prevents the uwq
				1067	* to disappear from under us.
				1068	*
				1069	* Refile this userfault from
				1070	* fault_pending_wqh to fault_wqh, it's not
				1071	* pending anymore after we read it.
				1072	*
				1073	* Use list_del() by hand (as
				1074	* userfaultfd_wake_function also uses
				1075	* list_del_init() by hand) to be sure nobody
				1076	* changes __remove_wait_queue() to use
				1077	* list_del_init() in turn breaking the
				1078	* !list_empty_careful() check in
				1079	* handle_userfault(). The uwq->wq.head list
				1080	* must never be empty at any time during the
				1081	* refile, or the waitqueue could disappear
				1082	* from under us. The "wait_queue_head_t"
				1083	* parameter of __remove_wait_queue() is unused
				1084	* anyway.
				1085	*/
				1086	list_del(&uwq->wq.entry);
				1087	add_wait_queue(&ctx->fault_wqh, &uwq->wq);
				1088
				1089	write_seqcount_end(&ctx->refile_seq);
				1090
				1091	/* careful to always initialize msg if ret == 0 */
				1092	*msg = uwq->msg;
				1093	spin_unlock(&ctx->fault_pending_wqh.lock);
				1094	ret = 0;
				1095	break;
				1096	}
				1097	spin_unlock(&ctx->fault_pending_wqh.lock);
				1098
				1099	spin_lock(&ctx->event_wqh.lock);
				1100	uwq = find_userfault_evt(ctx);
				1101	if (uwq) {
				1102	*msg = uwq->msg;
				1103
				1104	if (uwq->msg.event == UFFD_EVENT_FORK) {
				1105	fork_nctx = (struct userfaultfd_ctx *)
				1106	(unsigned long)
				1107	uwq->msg.arg.reserved.reserved1;
				1108	list_move(&uwq->wq.entry, &fork_event);
				1109	/*
				1110	* fork_nctx can be freed as soon as
				1111	* we drop the lock, unless we take a
				1112	* reference on it.
				1113	*/
				1114	userfaultfd_ctx_get(fork_nctx);
				1115	spin_unlock(&ctx->event_wqh.lock);
				1116	ret = 0;
				1117	break;
				1118	}
				1119
				1120	userfaultfd_event_complete(ctx, uwq);
				1121	spin_unlock(&ctx->event_wqh.lock);
				1122	ret = 0;
				1123	break;
				1124	}
				1125	spin_unlock(&ctx->event_wqh.lock);
				1126
				1127	if (signal_pending(current)) {
				1128	ret = -ERESTARTSYS;
				1129	break;
				1130	}
				1131	if (no_wait) {
				1132	ret = -EAGAIN;
				1133	break;
				1134	}
				1135	spin_unlock_irq(&ctx->fd_wqh.lock);
				1136	schedule();
				1137	spin_lock_irq(&ctx->fd_wqh.lock);
				1138	}
				1139	__remove_wait_queue(&ctx->fd_wqh, &wait);
				1140	__set_current_state(TASK_RUNNING);
				1141	spin_unlock_irq(&ctx->fd_wqh.lock);
				1142
				1143	if (!ret && msg->event == UFFD_EVENT_FORK) {
				1144	ret = resolve_userfault_fork(fork_nctx, inode, msg);
				1145	spin_lock_irq(&ctx->event_wqh.lock);
				1146	if (!list_empty(&fork_event)) {
				1147	/*
				1148	* The fork thread didn't abort, so we can
				1149	* drop the temporary refcount.
				1150	*/
				1151	userfaultfd_ctx_put(fork_nctx);
				1152
				1153	uwq = list_first_entry(&fork_event,
				1154	typeof(*uwq),
				1155	wq.entry);
				1156	/*
				1157	* If fork_event list wasn't empty and in turn
				1158	* the event wasn't already released by fork
				1159	* (the event is allocated on fork kernel
				1160	* stack), put the event back to its place in
				1161	* the event_wq. fork_event head will be freed
				1162	* as soon as we return so the event cannot
				1163	* stay queued there no matter the current
				1164	* "ret" value.
				1165	*/
				1166	list_del(&uwq->wq.entry);
				1167	__add_wait_queue(&ctx->event_wqh, &uwq->wq);
				1168
				1169	/*
				1170	* Leave the event in the waitqueue and report
				1171	* error to userland if we failed to resolve
				1172	* the userfault fork.
				1173	*/
				1174	if (likely(!ret))
				1175	userfaultfd_event_complete(ctx, uwq);
				1176	} else {
				1177	/*
				1178	* Here the fork thread aborted and the
				1179	* refcount from the fork thread on fork_nctx
				1180	* has already been released. We still hold
				1181	* the reference we took before releasing the
				1182	* lock above. If resolve_userfault_fork
				1183	* failed we've to drop it because the
				1184	* fork_nctx has to be freed in such case. If
				1185	* it succeeded we'll hold it because the new
				1186	* uffd references it.
				1187	*/
				1188	if (ret)
				1189	userfaultfd_ctx_put(fork_nctx);
				1190	}
				1191	spin_unlock_irq(&ctx->event_wqh.lock);
				1192	}
				1193
				1194	return ret;
				1195	}
				1196
				1197	static ssize_t userfaultfd_read(struct file file, char __user buf,
				1198	size_t count, loff_t *ppos)
				1199	{
				1200	struct userfaultfd_ctx *ctx = file->private_data;
				1201	ssize_t _ret, ret = 0;
				1202	struct uffd_msg msg;
				1203	int no_wait = file->f_flags & O_NONBLOCK;
				1204	struct inode *inode = file_inode(file);
				1205
				1206	if (!userfaultfd_is_initialized(ctx))
				1207	return -EINVAL;
				1208
				1209	for (;;) {
				1210	if (count < sizeof(msg))
				1211	return ret ? ret : -EINVAL;
				1212	_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
				1213	if (_ret < 0)
				1214	return ret ? ret : _ret;
				1215	if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
				1216	return ret ? ret : -EFAULT;
				1217	ret += sizeof(msg);
				1218	buf += sizeof(msg);
				1219	count -= sizeof(msg);
				1220	/*
				1221	* Allow to read more than one fault at time but only
				1222	* block if waiting for the very first one.
				1223	*/
				1224	no_wait = O_NONBLOCK;
				1225	}
				1226	}
				1227
				1228	static void __wake_userfault(struct userfaultfd_ctx *ctx,
				1229	struct userfaultfd_wake_range *range)
				1230	{
				1231	spin_lock_irq(&ctx->fault_pending_wqh.lock);
				1232	/* wake all in the range and autoremove */
				1233	if (waitqueue_active(&ctx->fault_pending_wqh))
				1234	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				1235	range);
				1236	if (waitqueue_active(&ctx->fault_wqh))
				1237	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
				1238	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
				1239	}
				1240
				1241	static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
				1242	struct userfaultfd_wake_range *range)
				1243	{
				1244	unsigned seq;
				1245	bool need_wakeup;
				1246
				1247	/*
				1248	* To be sure waitqueue_active() is not reordered by the CPU
				1249	* before the pagetable update, use an explicit SMP memory
				1250	* barrier here. PT lock release or up_read(mmap_sem) still
				1251	* have release semantics that can allow the
				1252	* waitqueue_active() to be reordered before the pte update.
				1253	*/
				1254	smp_mb();
				1255
				1256	/*
				1257	* Use waitqueue_active because it's very frequent to
				1258	* change the address space atomically even if there are no
				1259	* userfaults yet. So we take the spinlock only when we're
				1260	* sure we've userfaults to wake.
				1261	*/
				1262	do {
				1263	seq = read_seqcount_begin(&ctx->refile_seq);
				1264	need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) \|\|
				1265	waitqueue_active(&ctx->fault_wqh);
				1266	cond_resched();
				1267	} while (read_seqcount_retry(&ctx->refile_seq, seq));
				1268	if (need_wakeup)
				1269	__wake_userfault(ctx, range);
				1270	}
				1271
				1272	static __always_inline int validate_range(struct mm_struct *mm,
				1273	__u64 start, __u64 len)
				1274	{
				1275	__u64 task_size = mm->task_size;
				1276
				1277	if (start & ~PAGE_MASK)
				1278	return -EINVAL;
				1279	if (len & ~PAGE_MASK)
				1280	return -EINVAL;
				1281	if (!len)
				1282	return -EINVAL;
				1283	if (start < mmap_min_addr)
				1284	return -EINVAL;
				1285	if (start >= task_size)
				1286	return -EINVAL;
				1287	if (len > task_size - start)
				1288	return -EINVAL;
				1289	return 0;
				1290	}
				1291
				1292	static inline bool vma_can_userfault(struct vm_area_struct *vma,
				1293	unsigned long vm_flags)
				1294	{
				1295	if (vm_flags & VM_UFFD_MINOR) {
				1296	if (!(is_vm_hugetlb_page(vma) \|\| vma_is_shmem(vma)))
				1297	return false;
				1298	}
				1299
				1300	return vma_is_anonymous(vma) \|\| is_vm_hugetlb_page(vma) \|\|
				1301	vma_is_shmem(vma);
				1302	}
				1303
				1304	static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				1305	unsigned long arg)
				1306	{
				1307	struct mm_struct *mm = ctx->mm;
				1308	struct vm_area_struct vma, prev, *cur;
				1309	int ret;
				1310	struct uffdio_register uffdio_register;
				1311	struct uffdio_register __user *user_uffdio_register;
				1312	unsigned long vm_flags, new_flags;
				1313	bool found;
				1314	bool basic_ioctls;
				1315	unsigned long start, end, vma_end;
				1316
				1317	user_uffdio_register = (struct uffdio_register __user *) arg;
				1318
				1319	ret = -EFAULT;
				1320	if (copy_from_user(&uffdio_register, user_uffdio_register,
				1321	sizeof(uffdio_register)-sizeof(__u64)))
				1322	goto out;
				1323
				1324	ret = -EINVAL;
				1325	if (!uffdio_register.mode)
				1326	goto out;
				1327	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
				1328	goto out;
				1329	vm_flags = 0;
				1330	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
				1331	vm_flags \|= VM_UFFD_MISSING;
				1332	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
				1333	vm_flags \|= VM_UFFD_WP;
				1334	/*
				1335	* FIXME: remove the below error constraint by
				1336	* implementing the wprotect tracking mode.
				1337	*/
				1338	ret = -EINVAL;
				1339	goto out;
				1340	}
				1341	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
				1342	#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
				1343	goto out;
				1344	#endif
				1345	vm_flags \|= VM_UFFD_MINOR;
				1346	}
				1347
				1348	ret = validate_range(mm, uffdio_register.range.start,
				1349	uffdio_register.range.len);
				1350	if (ret)
				1351	goto out;
				1352
				1353	start = uffdio_register.range.start;
				1354	end = start + uffdio_register.range.len;
				1355
				1356	ret = -ENOMEM;
				1357	if (!mmget_not_zero(mm))
				1358	goto out;
				1359
				1360	down_write(&mm->mmap_sem);
				1361	if (!mmget_still_valid(mm))
				1362	goto out_unlock;
				1363	vma = find_vma_prev(mm, start, &prev);
				1364	if (!vma)
				1365	goto out_unlock;
				1366
				1367	/* check that there's at least one vma in the range */
				1368	ret = -EINVAL;
				1369	if (vma->vm_start >= end)
				1370	goto out_unlock;
				1371
				1372	/*
				1373	* If the first vma contains huge pages, make sure start address
				1374	* is aligned to huge page size.
				1375	*/
				1376	if (is_vm_hugetlb_page(vma)) {
				1377	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1378
				1379	if (start & (vma_hpagesize - 1))
				1380	goto out_unlock;
				1381	}
				1382
				1383	/*
				1384	* Search for not compatible vmas.
				1385	*/
				1386	found = false;
				1387	basic_ioctls = false;
				1388	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1389	cond_resched();
				1390
				1391	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1392	!!(cur->vm_flags & __VM_UFFD_FLAGS));
				1393
				1394	/* check not compatible vmas */
				1395	ret = -EINVAL;
				1396	if (!vma_can_userfault(cur, vm_flags))
				1397	goto out_unlock;
				1398
				1399	/*
				1400	* UFFDIO_COPY will fill file holes even without
				1401	* PROT_WRITE. This check enforces that if this is a
				1402	* MAP_SHARED, the process has write permission to the backing
				1403	* file. If VM_MAYWRITE is set it also enforces that on a
				1404	* MAP_SHARED vma: there is no F_WRITE_SEAL and no further
				1405	* F_WRITE_SEAL can be taken until the vma is destroyed.
				1406	*/
				1407	ret = -EPERM;
				1408	if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
				1409	goto out_unlock;
				1410
				1411	/*
				1412	* If this vma contains ending address, and huge pages
				1413	* check alignment.
				1414	*/
				1415	if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
				1416	end > cur->vm_start) {
				1417	unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
				1418
				1419	ret = -EINVAL;
				1420
				1421	if (end & (vma_hpagesize - 1))
				1422	goto out_unlock;
				1423	}
				1424
				1425	/*
				1426	* Check that this vma isn't already owned by a
				1427	* different userfaultfd. We can't allow more than one
				1428	* userfaultfd to own a single vma simultaneously or we
				1429	* wouldn't know which one to deliver the userfaults to.
				1430	*/
				1431	ret = -EBUSY;
				1432	if (cur->vm_userfaultfd_ctx.ctx &&
				1433	cur->vm_userfaultfd_ctx.ctx != ctx)
				1434	goto out_unlock;
				1435
				1436	/*
				1437	* Note vmas containing huge pages
				1438	*/
				1439	if (is_vm_hugetlb_page(cur))
				1440	basic_ioctls = true;
				1441
				1442	found = true;
				1443	}
				1444	BUG_ON(!found);
				1445
				1446	if (vma->vm_start < start)
				1447	prev = vma;
				1448
				1449	ret = 0;
				1450	do {
				1451	cond_resched();
				1452
				1453	BUG_ON(!vma_can_userfault(vma, vm_flags));
				1454	BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
				1455	vma->vm_userfaultfd_ctx.ctx != ctx);
				1456	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1457
				1458	/*
				1459	* Nothing to do: this vma is already registered into this
				1460	* userfaultfd and with the right tracking mode too.
				1461	*/
				1462	if (vma->vm_userfaultfd_ctx.ctx == ctx &&
				1463	(vma->vm_flags & vm_flags) == vm_flags)
				1464	goto skip;
				1465
				1466	if (vma->vm_start > start)
				1467	start = vma->vm_start;
				1468	vma_end = min(end, vma->vm_end);
				1469
				1470	new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) \| vm_flags;
				1471	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1472	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1473	vma_policy(vma),
				1474	((struct vm_userfaultfd_ctx){ ctx }),
				1475	vma_get_anon_name(vma));
				1476	if (prev) {
				1477	vma = prev;
				1478	goto next;
				1479	}
				1480	if (vma->vm_start < start) {
				1481	ret = split_vma(mm, vma, start, 1);
				1482	if (ret)
				1483	break;
				1484	}
				1485	if (vma->vm_end > end) {
				1486	ret = split_vma(mm, vma, end, 0);
				1487	if (ret)
				1488	break;
				1489	}
				1490	next:
				1491	/*
				1492	* In the vma_merge() successful mprotect-like case 8:
				1493	* the next vma was merged into the current one and
				1494	* the current one has not been updated yet.
				1495	*/
				1496	vma->vm_flags = new_flags;
				1497	vma->vm_userfaultfd_ctx.ctx = ctx;
				1498
				1499	if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
				1500	hugetlb_unshare_all_pmds(vma);
				1501
				1502	skip:
				1503	prev = vma;
				1504	start = vma->vm_end;
				1505	vma = vma->vm_next;
				1506	} while (vma && vma->vm_start < end);
				1507	out_unlock:
				1508	up_write(&mm->mmap_sem);
				1509	mmput(mm);
				1510	if (!ret) {
				1511	__u64 ioctls_out;
				1512
				1513	ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
				1514	UFFD_API_RANGE_IOCTLS;
				1515
				1516	/* CONTINUE ioctl is only supported for MINOR ranges. */
				1517	if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
				1518	ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
				1519
				1520	/*
				1521	* Now that we scanned all vmas we can already tell
				1522	* userland which ioctls methods are guaranteed to
				1523	* succeed on this range.
				1524	*/
				1525	if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
				1526	UFFD_API_RANGE_IOCTLS,
				1527	&user_uffdio_register->ioctls))
				1528	ret = -EFAULT;
				1529	}
				1530	out:
				1531	return ret;
				1532	}
				1533
				1534	static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				1535	unsigned long arg)
				1536	{
				1537	struct mm_struct *mm = ctx->mm;
				1538	struct vm_area_struct vma, prev, *cur;
				1539	int ret;
				1540	struct uffdio_range uffdio_unregister;
				1541	unsigned long new_flags;
				1542	bool found;
				1543	unsigned long start, end, vma_end;
				1544	const void __user buf = (void __user )arg;
				1545
				1546	ret = -EFAULT;
				1547	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
				1548	goto out;
				1549
				1550	ret = validate_range(mm, uffdio_unregister.start,
				1551	uffdio_unregister.len);
				1552	if (ret)
				1553	goto out;
				1554
				1555	start = uffdio_unregister.start;
				1556	end = start + uffdio_unregister.len;
				1557
				1558	ret = -ENOMEM;
				1559	if (!mmget_not_zero(mm))
				1560	goto out;
				1561
				1562	down_write(&mm->mmap_sem);
				1563	if (!mmget_still_valid(mm))
				1564	goto out_unlock;
				1565	vma = find_vma_prev(mm, start, &prev);
				1566	if (!vma)
				1567	goto out_unlock;
				1568
				1569	/* check that there's at least one vma in the range */
				1570	ret = -EINVAL;
				1571	if (vma->vm_start >= end)
				1572	goto out_unlock;
				1573
				1574	/*
				1575	* If the first vma contains huge pages, make sure start address
				1576	* is aligned to huge page size.
				1577	*/
				1578	if (is_vm_hugetlb_page(vma)) {
				1579	unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
				1580
				1581	if (start & (vma_hpagesize - 1))
				1582	goto out_unlock;
				1583	}
				1584
				1585	/*
				1586	* Search for not compatible vmas.
				1587	*/
				1588	found = false;
				1589	ret = -EINVAL;
				1590	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
				1591	cond_resched();
				1592
				1593	BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
				1594	!!(cur->vm_flags & __VM_UFFD_FLAGS));
				1595
				1596	/*
				1597	* Check not compatible vmas, not strictly required
				1598	* here as not compatible vmas cannot have an
				1599	* userfaultfd_ctx registered on them, but this
				1600	* provides for more strict behavior to notice
				1601	* unregistration errors.
				1602	*/
				1603	if (!vma_can_userfault(cur, cur->vm_flags))
				1604	goto out_unlock;
				1605
				1606	found = true;
				1607	}
				1608	BUG_ON(!found);
				1609
				1610	if (vma->vm_start < start)
				1611	prev = vma;
				1612
				1613	ret = 0;
				1614	do {
				1615	cond_resched();
				1616
				1617	BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
				1618
				1619	/*
				1620	* Nothing to do: this vma is already registered into this
				1621	* userfaultfd and with the right tracking mode too.
				1622	*/
				1623	if (!vma->vm_userfaultfd_ctx.ctx)
				1624	goto skip;
				1625
				1626	WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
				1627
				1628	if (vma->vm_start > start)
				1629	start = vma->vm_start;
				1630	vma_end = min(end, vma->vm_end);
				1631
				1632	if (userfaultfd_missing(vma)) {
				1633	/*
				1634	* Wake any concurrent pending userfault while
				1635	* we unregister, so they will not hang
				1636	* permanently and it avoids userland to call
				1637	* UFFDIO_WAKE explicitly.
				1638	*/
				1639	struct userfaultfd_wake_range range;
				1640	range.start = start;
				1641	range.len = vma_end - start;
				1642	wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
				1643	}
				1644
				1645	new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
				1646	prev = vma_merge(mm, prev, start, vma_end, new_flags,
				1647	vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				1648	vma_policy(vma),
				1649	NULL_VM_UFFD_CTX,
				1650	vma_get_anon_name(vma));
				1651	if (prev) {
				1652	vma = prev;
				1653	goto next;
				1654	}
				1655	if (vma->vm_start < start) {
				1656	ret = split_vma(mm, vma, start, 1);
				1657	if (ret)
				1658	break;
				1659	}
				1660	if (vma->vm_end > end) {
				1661	ret = split_vma(mm, vma, end, 0);
				1662	if (ret)
				1663	break;
				1664	}
				1665	next:
				1666	/*
				1667	* In the vma_merge() successful mprotect-like case 8:
				1668	* the next vma was merged into the current one and
				1669	* the current one has not been updated yet.
				1670	*/
				1671	vma->vm_flags = new_flags;
				1672	vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				1673
				1674	skip:
				1675	prev = vma;
				1676	start = vma->vm_end;
				1677	vma = vma->vm_next;
				1678	} while (vma && vma->vm_start < end);
				1679	out_unlock:
				1680	up_write(&mm->mmap_sem);
				1681	mmput(mm);
				1682	out:
				1683	return ret;
				1684	}
				1685
				1686	/*
				1687	* userfaultfd_wake may be used in combination with the
				1688	* UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
				1689	*/
				1690	static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
				1691	unsigned long arg)
				1692	{
				1693	int ret;
				1694	struct uffdio_range uffdio_wake;
				1695	struct userfaultfd_wake_range range;
				1696	const void __user buf = (void __user )arg;
				1697
				1698	ret = -EFAULT;
				1699	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
				1700	goto out;
				1701
				1702	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
				1703	if (ret)
				1704	goto out;
				1705
				1706	range.start = uffdio_wake.start;
				1707	range.len = uffdio_wake.len;
				1708
				1709	/*
				1710	* len == 0 means wake all and we don't want to wake all here,
				1711	* so check it again to be sure.
				1712	*/
				1713	VM_BUG_ON(!range.len);
				1714
				1715	wake_userfault(ctx, &range);
				1716	ret = 0;
				1717
				1718	out:
				1719	return ret;
				1720	}
				1721
				1722	static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
				1723	unsigned long arg)
				1724	{
				1725	__s64 ret;
				1726	struct uffdio_copy uffdio_copy;
				1727	struct uffdio_copy __user *user_uffdio_copy;
				1728	struct userfaultfd_wake_range range;
				1729
				1730	user_uffdio_copy = (struct uffdio_copy __user *) arg;
				1731
				1732	ret = -EAGAIN;
				1733	if (READ_ONCE(ctx->mmap_changing))
				1734	goto out;
				1735
				1736	ret = -EFAULT;
				1737	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
				1738	/* don't copy "copy" last field */
				1739	sizeof(uffdio_copy)-sizeof(__s64)))
				1740	goto out;
				1741
				1742	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
				1743	if (ret)
				1744	goto out;
				1745	/*
				1746	* double check for wraparound just in case. copy_from_user()
				1747	* will later check uffdio_copy.src + uffdio_copy.len to fit
				1748	* in the userland range.
				1749	*/
				1750	ret = -EINVAL;
				1751	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
				1752	goto out;
				1753	if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
				1754	goto out;
				1755	if (mmget_not_zero(ctx->mm)) {
				1756	ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				1757	uffdio_copy.len, &ctx->mmap_changing);
				1758	mmput(ctx->mm);
				1759	} else {
				1760	return -ESRCH;
				1761	}
				1762	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
				1763	return -EFAULT;
				1764	if (ret < 0)
				1765	goto out;
				1766	BUG_ON(!ret);
				1767	/* len == 0 would wake all */
				1768	range.len = ret;
				1769	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
				1770	range.start = uffdio_copy.dst;
				1771	wake_userfault(ctx, &range);
				1772	}
				1773	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
				1774	out:
				1775	return ret;
				1776	}
				1777
				1778	static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				1779	unsigned long arg)
				1780	{
				1781	__s64 ret;
				1782	struct uffdio_zeropage uffdio_zeropage;
				1783	struct uffdio_zeropage __user *user_uffdio_zeropage;
				1784	struct userfaultfd_wake_range range;
				1785
				1786	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
				1787
				1788	ret = -EAGAIN;
				1789	if (READ_ONCE(ctx->mmap_changing))
				1790	goto out;
				1791
				1792	ret = -EFAULT;
				1793	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
				1794	/* don't copy "zeropage" last field */
				1795	sizeof(uffdio_zeropage)-sizeof(__s64)))
				1796	goto out;
				1797
				1798	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
				1799	uffdio_zeropage.range.len);
				1800	if (ret)
				1801	goto out;
				1802	ret = -EINVAL;
				1803	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
				1804	goto out;
				1805
				1806	if (mmget_not_zero(ctx->mm)) {
				1807	ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				1808	uffdio_zeropage.range.len,
				1809	&ctx->mmap_changing);
				1810	mmput(ctx->mm);
				1811	} else {
				1812	return -ESRCH;
				1813	}
				1814	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
				1815	return -EFAULT;
				1816	if (ret < 0)
				1817	goto out;
				1818	/* len == 0 would wake all */
				1819	BUG_ON(!ret);
				1820	range.len = ret;
				1821	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
				1822	range.start = uffdio_zeropage.range.start;
				1823	wake_userfault(ctx, &range);
				1824	}
				1825	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
				1826	out:
				1827	return ret;
				1828	}
				1829
				1830	static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
				1831	{
				1832	__s64 ret;
				1833	struct uffdio_continue uffdio_continue;
				1834	struct uffdio_continue __user *user_uffdio_continue;
				1835	struct userfaultfd_wake_range range;
				1836
				1837	user_uffdio_continue = (struct uffdio_continue __user *)arg;
				1838
				1839	ret = -EAGAIN;
				1840	if (READ_ONCE(ctx->mmap_changing))
				1841	goto out;
				1842
				1843	ret = -EFAULT;
				1844	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
				1845	/* don't copy the output fields */
				1846	sizeof(uffdio_continue) - (sizeof(__s64))))
				1847	goto out;
				1848
				1849	ret = validate_range(ctx->mm, uffdio_continue.range.start,
				1850	uffdio_continue.range.len);
				1851	if (ret)
				1852	goto out;
				1853
				1854	ret = -EINVAL;
				1855	/* double check for wraparound just in case. */
				1856	if (uffdio_continue.range.start + uffdio_continue.range.len <=
				1857	uffdio_continue.range.start) {
				1858	goto out;
				1859	}
				1860	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
				1861	goto out;
				1862
				1863	if (mmget_not_zero(ctx->mm)) {
				1864	ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
				1865	uffdio_continue.range.len,
				1866	&ctx->mmap_changing);
				1867	mmput(ctx->mm);
				1868	} else {
				1869	return -ESRCH;
				1870	}
				1871
				1872	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
				1873	return -EFAULT;
				1874	if (ret < 0)
				1875	goto out;
				1876
				1877	/* len == 0 would wake all */
				1878	BUG_ON(!ret);
				1879	range.len = ret;
				1880	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
				1881	range.start = uffdio_continue.range.start;
				1882	wake_userfault(ctx, &range);
				1883	}
				1884	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
				1885
				1886	out:
				1887	return ret;
				1888	}
				1889
				1890	static inline unsigned int uffd_ctx_features(__u64 user_features)
				1891	{
				1892	/*
				1893	* For the current set of features the bits just coincide. Set
				1894	* UFFD_FEATURE_INITIALIZED to mark the features as enabled.
				1895	*/
				1896	return (unsigned int)user_features \| UFFD_FEATURE_INITIALIZED;
				1897	}
				1898
				1899	/*
				1900	* userland asks for a certain API version and we return which bits
				1901	* and ioctl commands are implemented in this kernel for such API
				1902	* version or -EINVAL if unknown.
				1903	*/
				1904	static int userfaultfd_api(struct userfaultfd_ctx *ctx,
				1905	unsigned long arg)
				1906	{
				1907	struct uffdio_api uffdio_api;
				1908	void __user buf = (void __user )arg;
				1909	unsigned int ctx_features;
				1910	int ret;
				1911	__u64 features;
				1912
				1913	ret = -EFAULT;
				1914	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
				1915	goto out;
				1916	features = uffdio_api.features;
				1917	ret = -EINVAL;
				1918	if (uffdio_api.api != UFFD_API \|\| (features & ~UFFD_API_FEATURES))
				1919	goto err_out;
				1920	ret = -EPERM;
				1921	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
				1922	goto err_out;
				1923	/* report all available features and ioctls to userland */
				1924	uffdio_api.features = UFFD_API_FEATURES;
				1925	#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
				1926	uffdio_api.features &=
				1927	~(UFFD_FEATURE_MINOR_HUGETLBFS \| UFFD_FEATURE_MINOR_SHMEM);
				1928	#endif
				1929	uffdio_api.ioctls = UFFD_API_IOCTLS;
				1930	ret = -EFAULT;
				1931	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1932	goto out;
				1933
				1934	/* only enable the requested features for this uffd context */
				1935	ctx_features = uffd_ctx_features(features);
				1936	ret = -EINVAL;
				1937	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
				1938	goto err_out;
				1939
				1940	ret = 0;
				1941	out:
				1942	return ret;
				1943	err_out:
				1944	memset(&uffdio_api, 0, sizeof(uffdio_api));
				1945	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
				1946	ret = -EFAULT;
				1947	goto out;
				1948	}
				1949
				1950	static long userfaultfd_ioctl(struct file *file, unsigned cmd,
				1951	unsigned long arg)
				1952	{
				1953	int ret = -EINVAL;
				1954	struct userfaultfd_ctx *ctx = file->private_data;
				1955
				1956	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
				1957	return -EINVAL;
				1958
				1959	switch(cmd) {
				1960	case UFFDIO_API:
				1961	ret = userfaultfd_api(ctx, arg);
				1962	break;
				1963	case UFFDIO_REGISTER:
				1964	ret = userfaultfd_register(ctx, arg);
				1965	break;
				1966	case UFFDIO_UNREGISTER:
				1967	ret = userfaultfd_unregister(ctx, arg);
				1968	break;
				1969	case UFFDIO_WAKE:
				1970	ret = userfaultfd_wake(ctx, arg);
				1971	break;
				1972	case UFFDIO_COPY:
				1973	ret = userfaultfd_copy(ctx, arg);
				1974	break;
				1975	case UFFDIO_ZEROPAGE:
				1976	ret = userfaultfd_zeropage(ctx, arg);
				1977	break;
				1978	case UFFDIO_CONTINUE:
				1979	ret = userfaultfd_continue(ctx, arg);
				1980	break;
				1981	}
				1982	return ret;
				1983	}
				1984
				#ifdef CONFIG_PROC_FS
				/*
				 * fdinfo hook: print the number of queued userfaults and the API
				 * description of the context behind @f into @m.
				 *
				 * "pending" counts the entries on fault_pending_wqh, "total" counts the
				 * entries on fault_pending_wqh plus those on fault_wqh.
				 */
				static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
				{
					struct userfaultfd_ctx *ctx = f->private_data;
					wait_queue_entry_t *wq;
					unsigned long pending = 0, total = 0;

					/*
					 * Both lists are walked under fault_pending_wqh.lock only
					 * (presumably that lock also covers fault_wqh -- verify against
					 * the other users of fault_wqh).
					 */
					spin_lock_irq(&ctx->fault_pending_wqh.lock);
					list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
						pending++;
						total++;
					}
					list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
						total++;
					}
					spin_unlock_irq(&ctx->fault_pending_wqh.lock);

					/*
					 * If more protocols are added, they will all be shown
					 * separated by a space. Like this:
					 *	protocols: aa:... bb:...
					 */
					seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
						   pending, total, UFFD_API, ctx->features,
						   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
				}
				#endif
				2012
				2013	static const struct file_operations userfaultfd_fops = {
				2014	#ifdef CONFIG_PROC_FS
				2015	.show_fdinfo = userfaultfd_show_fdinfo,
				2016	#endif
				2017	.release = userfaultfd_release,
				2018	.poll = userfaultfd_poll,
				2019	.read = userfaultfd_read,
				2020	.unlocked_ioctl = userfaultfd_ioctl,
				2021	.compat_ioctl = userfaultfd_ioctl,
				2022	.llseek = noop_llseek,
				2023	};
				2024
				2025	static void init_once_userfaultfd_ctx(void *mem)
				2026	{
				2027	struct userfaultfd_ctx ctx = (struct userfaultfd_ctx ) mem;
				2028
				2029	init_waitqueue_head(&ctx->fault_pending_wqh);
				2030	init_waitqueue_head(&ctx->fault_wqh);
				2031	init_waitqueue_head(&ctx->event_wqh);
				2032	init_waitqueue_head(&ctx->fd_wqh);
				2033	seqcount_init(&ctx->refile_seq);
				2034	}
				2035
				2036	SYSCALL_DEFINE1(userfaultfd, int, flags)
				2037	{
				2038	struct userfaultfd_ctx *ctx;
				2039	int fd;
				2040
				2041	if (!sysctl_unprivileged_userfaultfd &&
				2042	(flags & UFFD_USER_MODE_ONLY) == 0 &&
				2043	!capable(CAP_SYS_PTRACE)) {
				2044	printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
				2045	"sysctl knob to 1 if kernel faults must be handled "
				2046	"without obtaining CAP_SYS_PTRACE capability\n");
				2047	return -EPERM;
				2048	}
				2049
				2050	BUG_ON(!current->mm);
				2051
				2052	/* Check the UFFD_* constants for consistency. */
				2053	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
				2054	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
				2055	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
				2056
				2057	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS \| UFFD_USER_MODE_ONLY))
				2058	return -EINVAL;
				2059
				2060	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
				2061	if (!ctx)
				2062	return -ENOMEM;
				2063
				2064	refcount_set(&ctx->refcount, 1);
				2065	ctx->flags = flags;
				2066	ctx->features = 0;
				2067	ctx->released = false;
				2068	ctx->mmap_changing = false;
				2069	ctx->mm = current->mm;
				2070	/* prevent the mm struct to be freed */
				2071	mmgrab(ctx->mm);
				2072
				2073	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
				2074	O_RDONLY \| (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
				2075	if (fd < 0) {
				2076	mmdrop(ctx->mm);
				2077	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
				2078	}
				2079	return fd;
				2080	}
				2081
				2082	static int __init userfaultfd_init(void)
				2083	{
				2084	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
				2085	sizeof(struct userfaultfd_ctx),
				2086	0,
				2087	SLAB_HWCACHE_ALIGN\|SLAB_PANIC,
				2088	init_once_userfaultfd_ctx);
				2089	return 0;
				2090	}
				2091	__initcall(userfaultfd_init);