Blame - marvell/linux/mm/memory.c - T108

blob: 1f2ed6cb6385922414bb9261201442b396a03db4 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* linux/mm/memory.c
				4	*
				5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				6	*/
				7
				8	/*
				9	* demand-loading started 01.12.91 - seems it is high on the list of
				10	* things wanted, and it should be easy to implement. - Linus
				11	*/
				12
				13	/*
				14	* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
				15	* pages started 02.12.91, seems to work. - Linus.
				16	*
				17	* Tested sharing by executing about 30 /bin/sh: under the old kernel it
				18	* would have taken more than the 6M I have free, but it worked well as
				19	* far as I could see.
				20	*
				21	* Also corrected some "invalidate()"s - I wasn't doing enough of them.
				22	*/
				23
				24	/*
				25	* Real VM (paging to/from disk) started 18.12.91. Much more work and
				26	* thought has to go into this. Oh, well..
				27	* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
				28	* Found it. Everything seems to work now.
				29	* 20.12.91 - Ok, making the swap-device changeable like the root.
				30	*/
				31
				32	/*
				33	* 05.04.94 - Multi-page memory management added for v1.1.
				34	* Idea by Alex Bligh (alex@cconcepts.co.uk)
				35	*
				36	* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
				37	* (Gerhard.Wichert@pdb.siemens.de)
				38	*
				39	* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
				40	*/
				41
				42	#include <linux/kernel_stat.h>
				43	#include <linux/mm.h>
				44	#include <linux/sched/mm.h>
				45	#include <linux/sched/coredump.h>
				46	#include <linux/sched/numa_balancing.h>
				47	#include <linux/sched/task.h>
				48	#include <linux/hugetlb.h>
				49	#include <linux/mman.h>
				50	#include <linux/swap.h>
				51	#include <linux/highmem.h>
				52	#include <linux/pagemap.h>
				53	#include <linux/memremap.h>
				54	#include <linux/ksm.h>
				55	#include <linux/rmap.h>
				56	#include <linux/export.h>
				57	#include <linux/delayacct.h>
				58	#include <linux/init.h>
				59	#include <linux/pfn_t.h>
				60	#include <linux/writeback.h>
				61	#include <linux/memcontrol.h>
				62	#include <linux/mmu_notifier.h>
				63	#include <linux/swapops.h>
				64	#include <linux/elf.h>
				65	#include <linux/gfp.h>
				66	#include <linux/migrate.h>
				67	#include <linux/string.h>
				68	#include <linux/dma-debug.h>
				69	#include <linux/debugfs.h>
				70	#include <linux/userfaultfd_k.h>
				71	#include <linux/dax.h>
				72	#include <linux/oom.h>
				73	#include <linux/numa.h>
				74
				75	#include <trace/events/kmem.h>
				76
				77	#include <asm/io.h>
				78	#include <asm/mmu_context.h>
				79	#include <asm/pgalloc.h>
				80	#include <linux/uaccess.h>
				81	#include <asm/tlb.h>
				82	#include <asm/tlbflush.h>
				83	#include <asm/pgtable.h>
				84
				85	#include "internal.h"
				86
				87	#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
				88	#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
				89	#endif
				90
				91	#ifndef CONFIG_NEED_MULTIPLE_NODES
				92	/* use the per-pgdat data instead for discontigmem - mbligh */
				93	unsigned long max_mapnr;
				94	EXPORT_SYMBOL(max_mapnr);
				95
				96	struct page *mem_map;
				97	EXPORT_SYMBOL(mem_map);
				98	#endif
				99
				100	/*
				101	* A number of key systems in x86 including ioremap() rely on the assumption
				102	* that high_memory defines the upper bound on direct map memory, then end
				103	* of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
				104	* highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
				105	* and ZONE_HIGHMEM.
				106	*/
				107	void *high_memory;
				108	EXPORT_SYMBOL(high_memory);
				109
				110	/*
				111	* Randomize the address space (stacks, mmaps, brk, etc.).
				112	*
				113	* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
				114	* as ancient (libc5 based) binaries can segfault. )
				115	*/
				116	int randomize_va_space __read_mostly =
				117	#ifdef CONFIG_COMPAT_BRK
				118	1;
				119	#else
				120	2;
				121	#endif
				122
				123	#ifndef arch_faults_on_old_pte
				124	static inline bool arch_faults_on_old_pte(void)
				125	{
				126	/*
				127	* Those arches which don't have hw access flag feature need to
				128	* implement their own helper. By default, "true" means pagefault
				129	* will be hit on old pte.
				130	*/
				131	return true;
				132	}
				133	#endif
				134
				135	static int __init disable_randmaps(char *s)
				136	{
				137	randomize_va_space = 0;
				138	return 1;
				139	}
				140	__setup("norandmaps", disable_randmaps);
				141
				142	unsigned long zero_pfn __read_mostly;
				143	EXPORT_SYMBOL(zero_pfn);
				144
				145	unsigned long highest_memmap_pfn __read_mostly;
				146
				147	/*
				148	* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
				149	*/
				150	static int __init init_zero_pfn(void)
				151	{
				152	zero_pfn = page_to_pfn(ZERO_PAGE(0));
				153	return 0;
				154	}
				155	early_initcall(init_zero_pfn);
				156
				157	/*
				158	* Only trace rss_stat when there is a 512kb cross over.
				159	* Smaller changes may be lost unless every small change is
				160	* crossing into or returning to a 512kb boundary.
				161	*/
				162	#define TRACE_MM_COUNTER_THRESHOLD 128
				163
				164	void mm_trace_rss_stat(struct mm_struct *mm, int member, long count,
				165	long value)
				166	{
				167	long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
				168
				169	/* Threshold roll-over, trace it */
				170	if ((count & thresh_mask) != ((count - value) & thresh_mask))
				171	trace_rss_stat(mm, member, count);
				172	}
				173	EXPORT_SYMBOL_GPL(mm_trace_rss_stat);
				174
				175	#if defined(SPLIT_RSS_COUNTING)
				176
				177	void sync_mm_rss(struct mm_struct *mm)
				178	{
				179	int i;
				180
				181	for (i = 0; i < NR_MM_COUNTERS; i++) {
				182	if (current->rss_stat.count[i]) {
				183	add_mm_counter(mm, i, current->rss_stat.count[i]);
				184	current->rss_stat.count[i] = 0;
				185	}
				186	}
				187	current->rss_stat.events = 0;
				188	}
				189
				190	static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
				191	{
				192	struct task_struct *task = current;
				193
				194	if (likely(task->mm == mm))
				195	task->rss_stat.count[member] += val;
				196	else
				197	add_mm_counter(mm, member, val);
				198	}
				199	#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
				200	#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
				201
				202	/* sync counter once per 64 page faults */
				203	#define TASK_RSS_EVENTS_THRESH (64)
				204	static void check_sync_rss_stat(struct task_struct *task)
				205	{
				206	if (unlikely(task != current))
				207	return;
				208	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
				209	sync_mm_rss(task->mm);
				210	}
				211	#else /* SPLIT_RSS_COUNTING */
				212
				213	#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
				214	#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
				215
				216	static void check_sync_rss_stat(struct task_struct *task)
				217	{
				218	}
				219
				220	#endif /* SPLIT_RSS_COUNTING */
				221
				222	/*
				223	* Note: this doesn't free the actual pages themselves. That
				224	* has been handled earlier when unmapping all the memory regions.
				225	*/
				226	static void free_pte_range(struct mmu_gather tlb, pmd_t pmd,
				227	unsigned long addr)
				228	{
				229	pgtable_t token = pmd_pgtable(*pmd);
				230	pmd_clear(pmd);
				231	pte_free_tlb(tlb, token, addr);
				232	mm_dec_nr_ptes(tlb->mm);
				233	}
				234
				235	static inline void free_pmd_range(struct mmu_gather tlb, pud_t pud,
				236	unsigned long addr, unsigned long end,
				237	unsigned long floor, unsigned long ceiling)
				238	{
				239	pmd_t *pmd;
				240	unsigned long next;
				241	unsigned long start;
				242
				243	start = addr;
				244	pmd = pmd_offset(pud, addr);
				245	do {
				246	next = pmd_addr_end(addr, end);
				247	if (pmd_none_or_clear_bad(pmd))
				248	continue;
				249	free_pte_range(tlb, pmd, addr);
				250	} while (pmd++, addr = next, addr != end);
				251
				252	start &= PUD_MASK;
				253	if (start < floor)
				254	return;
				255	if (ceiling) {
				256	ceiling &= PUD_MASK;
				257	if (!ceiling)
				258	return;
				259	}
				260	if (end - 1 > ceiling - 1)
				261	return;
				262
				263	pmd = pmd_offset(pud, start);
				264	pud_clear(pud);
				265	pmd_free_tlb(tlb, pmd, start);
				266	mm_dec_nr_pmds(tlb->mm);
				267	}
				268
				269	static inline void free_pud_range(struct mmu_gather tlb, p4d_t p4d,
				270	unsigned long addr, unsigned long end,
				271	unsigned long floor, unsigned long ceiling)
				272	{
				273	pud_t *pud;
				274	unsigned long next;
				275	unsigned long start;
				276
				277	start = addr;
				278	pud = pud_offset(p4d, addr);
				279	do {
				280	next = pud_addr_end(addr, end);
				281	if (pud_none_or_clear_bad(pud))
				282	continue;
				283	free_pmd_range(tlb, pud, addr, next, floor, ceiling);
				284	} while (pud++, addr = next, addr != end);
				285
				286	start &= P4D_MASK;
				287	if (start < floor)
				288	return;
				289	if (ceiling) {
				290	ceiling &= P4D_MASK;
				291	if (!ceiling)
				292	return;
				293	}
				294	if (end - 1 > ceiling - 1)
				295	return;
				296
				297	pud = pud_offset(p4d, start);
				298	p4d_clear(p4d);
				299	pud_free_tlb(tlb, pud, start);
				300	mm_dec_nr_puds(tlb->mm);
				301	}
				302
				303	static inline void free_p4d_range(struct mmu_gather tlb, pgd_t pgd,
				304	unsigned long addr, unsigned long end,
				305	unsigned long floor, unsigned long ceiling)
				306	{
				307	p4d_t *p4d;
				308	unsigned long next;
				309	unsigned long start;
				310
				311	start = addr;
				312	p4d = p4d_offset(pgd, addr);
				313	do {
				314	next = p4d_addr_end(addr, end);
				315	if (p4d_none_or_clear_bad(p4d))
				316	continue;
				317	free_pud_range(tlb, p4d, addr, next, floor, ceiling);
				318	} while (p4d++, addr = next, addr != end);
				319
				320	start &= PGDIR_MASK;
				321	if (start < floor)
				322	return;
				323	if (ceiling) {
				324	ceiling &= PGDIR_MASK;
				325	if (!ceiling)
				326	return;
				327	}
				328	if (end - 1 > ceiling - 1)
				329	return;
				330
				331	p4d = p4d_offset(pgd, start);
				332	pgd_clear(pgd);
				333	p4d_free_tlb(tlb, p4d, start);
				334	}
				335
				336	/*
				337	* This function frees user-level page tables of a process.
				338	*/
				339	void free_pgd_range(struct mmu_gather *tlb,
				340	unsigned long addr, unsigned long end,
				341	unsigned long floor, unsigned long ceiling)
				342	{
				343	pgd_t *pgd;
				344	unsigned long next;
				345
				346	/*
				347	* The next few lines have given us lots of grief...
				348	*
				349	* Why are we testing PMD* at this top level? Because often
				350	* there will be no work to do at all, and we'd prefer not to
				351	* go all the way down to the bottom just to discover that.
				352	*
				353	* Why all these "- 1"s? Because 0 represents both the bottom
				354	* of the address space and the top of it (using -1 for the
				355	* top wouldn't help much: the masks would do the wrong thing).
				356	* The rule is that addr 0 and floor 0 refer to the bottom of
				357	* the address space, but end 0 and ceiling 0 refer to the top
				358	* Comparisons need to use "end - 1" and "ceiling - 1" (though
				359	* that end 0 case should be mythical).
				360	*
				361	* Wherever addr is brought up or ceiling brought down, we must
				362	* be careful to reject "the opposite 0" before it confuses the
				363	* subsequent tests. But what about where end is brought down
				364	* by PMD_SIZE below? no, end can't go down to 0 there.
				365	*
				366	* Whereas we round start (addr) and ceiling down, by different
				367	* masks at different levels, in order to test whether a table
				368	* now has no other vmas using it, so can be freed, we don't
				369	* bother to round floor or end up - the tests don't need that.
				370	*/
				371
				372	addr &= PMD_MASK;
				373	if (addr < floor) {
				374	addr += PMD_SIZE;
				375	if (!addr)
				376	return;
				377	}
				378	if (ceiling) {
				379	ceiling &= PMD_MASK;
				380	if (!ceiling)
				381	return;
				382	}
				383	if (end - 1 > ceiling - 1)
				384	end -= PMD_SIZE;
				385	if (addr > end - 1)
				386	return;
				387	/*
				388	* We add page table cache pages with PAGE_SIZE,
				389	* (see pte_free_tlb()), flush the tlb if we need
				390	*/
				391	tlb_change_page_size(tlb, PAGE_SIZE);
				392	pgd = pgd_offset(tlb->mm, addr);
				393	do {
				394	next = pgd_addr_end(addr, end);
				395	if (pgd_none_or_clear_bad(pgd))
				396	continue;
				397	free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
				398	} while (pgd++, addr = next, addr != end);
				399	}
				400
				401	void free_pgtables(struct mmu_gather tlb, struct vm_area_struct vma,
				402	unsigned long floor, unsigned long ceiling)
				403	{
				404	while (vma) {
				405	struct vm_area_struct *next = vma->vm_next;
				406	unsigned long addr = vma->vm_start;
				407
				408	/*
				409	* Hide vma from rmap and truncate_pagecache before freeing
				410	* pgtables
				411	*/
				412	unlink_anon_vmas(vma);
				413	unlink_file_vma(vma);
				414
				415	if (is_vm_hugetlb_page(vma)) {
				416	hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				417	floor, next ? next->vm_start : ceiling);
				418	} else {
				419	/*
				420	* Optimization: gather nearby vmas into one call down
				421	*/
				422	while (next && next->vm_start <= vma->vm_end + PMD_SIZE
				423	&& !is_vm_hugetlb_page(next)) {
				424	vma = next;
				425	next = vma->vm_next;
				426	unlink_anon_vmas(vma);
				427	unlink_file_vma(vma);
				428	}
				429	free_pgd_range(tlb, addr, vma->vm_end,
				430	floor, next ? next->vm_start : ceiling);
				431	}
				432	vma = next;
				433	}
				434	}
				435
				436	int __pte_alloc(struct mm_struct mm, pmd_t pmd)
				437	{
				438	spinlock_t *ptl;
				439	pgtable_t new = pte_alloc_one(mm);
				440	if (!new)
				441	return -ENOMEM;
				442
				443	/*
				444	* Ensure all pte setup (eg. pte page lock and page clearing) are
				445	* visible before the pte is made visible to other CPUs by being
				446	* put into page tables.
				447	*
				448	* The other side of the story is the pointer chasing in the page
				449	* table walking code (when walking the page table without locking;
				450	* ie. most of the time). Fortunately, these data accesses consist
				451	* of a chain of data-dependent loads, meaning most CPUs (alpha
				452	* being the notable exception) will already guarantee loads are
				453	* seen in-order. See the alpha page table accessors for the
				454	* smp_read_barrier_depends() barriers in page table walking code.
				455	*/
				456	smp_wmb(); /* Could be smp_wmb__xxx(before\|after)_spin_lock */
				457
				458	ptl = pmd_lock(mm, pmd);
				459	if (likely(pmd_none(pmd))) { / Has another populated it ? */
				460	mm_inc_nr_ptes(mm);
				461	pmd_populate(mm, pmd, new);
				462	new = NULL;
				463	}
				464	spin_unlock(ptl);
				465	if (new)
				466	pte_free(mm, new);
				467	return 0;
				468	}
				469
				470	int __pte_alloc_kernel(pmd_t *pmd)
				471	{
				472	pte_t *new = pte_alloc_one_kernel(&init_mm);
				473	if (!new)
				474	return -ENOMEM;
				475
				476	smp_wmb(); /* See comment in __pte_alloc */
				477
				478	spin_lock(&init_mm.page_table_lock);
				479	if (likely(pmd_none(pmd))) { / Has another populated it ? */
				480	pmd_populate_kernel(&init_mm, pmd, new);
				481	new = NULL;
				482	}
				483	spin_unlock(&init_mm.page_table_lock);
				484	if (new)
				485	pte_free_kernel(&init_mm, new);
				486	return 0;
				487	}
				488
				489	static inline void init_rss_vec(int *rss)
				490	{
				491	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
				492	}
				493
				494	static inline void add_mm_rss_vec(struct mm_struct mm, int rss)
				495	{
				496	int i;
				497
				498	if (current->mm == mm)
				499	sync_mm_rss(mm);
				500	for (i = 0; i < NR_MM_COUNTERS; i++)
				501	if (rss[i])
				502	add_mm_counter(mm, i, rss[i]);
				503	}
				504
				505	/*
				506	* This function is called to print an error when a bad pte
				507	* is found. For example, we might have a PFN-mapped pte in
				508	* a region that doesn't allow it.
				509	*
				510	* The calling function must still handle the error.
				511	*/
				512	static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
				513	pte_t pte, struct page *page)
				514	{
				515	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
				516	p4d_t *p4d = p4d_offset(pgd, addr);
				517	pud_t *pud = pud_offset(p4d, addr);
				518	pmd_t *pmd = pmd_offset(pud, addr);
				519	struct address_space *mapping;
				520	pgoff_t index;
				521	static unsigned long resume;
				522	static unsigned long nr_shown;
				523	static unsigned long nr_unshown;
				524
				525	/*
				526	* Allow a burst of 60 reports, then keep quiet for that minute;
				527	* or allow a steady drip of one report per second.
				528	*/
				529	if (nr_shown == 60) {
				530	if (time_before(jiffies, resume)) {
				531	nr_unshown++;
				532	return;
				533	}
				534	if (nr_unshown) {
				535	pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				536	nr_unshown);
				537	nr_unshown = 0;
				538	}
				539	nr_shown = 0;
				540	}
				541	if (nr_shown++ == 0)
				542	resume = jiffies + 60 * HZ;
				543
				544	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
				545	index = linear_page_index(vma, addr);
				546
				547	pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
				548	current->comm,
				549	(long long)pte_val(pte), (long long)pmd_val(*pmd));
				550	if (page)
				551	dump_page(page, "bad pte");
				552	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
				553	(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
				554	pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
				555	vma->vm_file,
				556	vma->vm_ops ? vma->vm_ops->fault : NULL,
				557	vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
				558	mapping ? mapping->a_ops->readpage : NULL);
				559	dump_stack();
				560	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
				561	}
				562
				563	/*
				564	* vm_normal_page -- This function gets the "struct page" associated with a pte.
				565	*
				566	* "Special" mappings do not wish to be associated with a "struct page" (either
				567	* it doesn't exist, or it exists but they don't want to touch it). In this
				568	* case, NULL is returned here. "Normal" mappings do have a struct page.
				569	*
				570	* There are 2 broad cases. Firstly, an architecture may define a pte_special()
				571	* pte bit, in which case this function is trivial. Secondly, an architecture
				572	* may not have a spare pte bit, which requires a more complicated scheme,
				573	* described below.
				574	*
				575	* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
				576	* special mapping (even if there are underlying and valid "struct pages").
				577	* COWed pages of a VM_PFNMAP are always normal.
				578	*
				579	* The way we recognize COWed pages within VM_PFNMAP mappings is through the
				580	* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
				581	* set, and the vm_pgoff will point to the first PFN mapped: thus every special
				582	* mapping will always honor the rule
				583	*
				584	* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
				585	*
				586	* And for normal mappings this is false.
				587	*
				588	* This restricts such mappings to be a linear translation from virtual address
				589	* to pfn. To get around this restriction, we allow arbitrary mappings so long
				590	* as the vma is not a COW mapping; in that case, we know that all ptes are
				591	* special (because none can have been COWed).
				592	*
				593	*
				594	* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
				595	*
				596	* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
				597	* page" backing, however the difference is that _all_ pages with a struct
				598	* page (that is, those where pfn_valid is true) are refcounted and considered
				599	* normal pages by the VM. The disadvantage is that pages are refcounted
				600	* (which can be slower and simply not an option for some PFNMAP users). The
				601	* advantage is that we don't have to follow the strict linearity rule of
				602	* PFNMAP mappings in order to support COWable mappings.
				603	*
				604	*/
				605	struct page vm_normal_page(struct vm_area_struct vma, unsigned long addr,
				606	pte_t pte)
				607	{
				608	unsigned long pfn = pte_pfn(pte);
				609
				610	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
				611	if (likely(!pte_special(pte)))
				612	goto check_pfn;
				613	if (vma->vm_ops && vma->vm_ops->find_special_page)
				614	return vma->vm_ops->find_special_page(vma, addr);
				615	if (vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))
				616	return NULL;
				617	if (is_zero_pfn(pfn))
				618	return NULL;
				619	if (pte_devmap(pte))
				620	return NULL;
				621
				622	print_bad_pte(vma, addr, pte, NULL);
				623	return NULL;
				624	}
				625
				626	/* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
				627
				628	if (unlikely(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP))) {
				629	if (vma->vm_flags & VM_MIXEDMAP) {
				630	if (!pfn_valid(pfn))
				631	return NULL;
				632	goto out;
				633	} else {
				634	unsigned long off;
				635	off = (addr - vma->vm_start) >> PAGE_SHIFT;
				636	if (pfn == vma->vm_pgoff + off)
				637	return NULL;
				638	if (!is_cow_mapping(vma->vm_flags))
				639	return NULL;
				640	}
				641	}
				642
				643	if (is_zero_pfn(pfn))
				644	return NULL;
				645
				646	check_pfn:
				647	if (unlikely(pfn > highest_memmap_pfn)) {
				648	print_bad_pte(vma, addr, pte, NULL);
				649	return NULL;
				650	}
				651
				652	/*
				653	* NOTE! We still have PageReserved() pages in the page tables.
				654	* eg. VDSO mappings can cause them to exist.
				655	*/
				656	out:
				657	return pfn_to_page(pfn);
				658	}
				659
				660	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				661	struct page vm_normal_page_pmd(struct vm_area_struct vma, unsigned long addr,
				662	pmd_t pmd)
				663	{
				664	unsigned long pfn = pmd_pfn(pmd);
				665
				666	/*
				667	* There is no pmd_special() but there may be special pmds, e.g.
				668	* in a direct-access (dax) mapping, so let's just replicate the
				669	* !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
				670	*/
				671	if (unlikely(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP))) {
				672	if (vma->vm_flags & VM_MIXEDMAP) {
				673	if (!pfn_valid(pfn))
				674	return NULL;
				675	goto out;
				676	} else {
				677	unsigned long off;
				678	off = (addr - vma->vm_start) >> PAGE_SHIFT;
				679	if (pfn == vma->vm_pgoff + off)
				680	return NULL;
				681	if (!is_cow_mapping(vma->vm_flags))
				682	return NULL;
				683	}
				684	}
				685
				686	if (pmd_devmap(pmd))
				687	return NULL;
				688	if (is_zero_pfn(pfn))
				689	return NULL;
				690	if (unlikely(pfn > highest_memmap_pfn))
				691	return NULL;
				692
				693	/*
				694	* NOTE! We still have PageReserved() pages in the page tables.
				695	* eg. VDSO mappings can cause them to exist.
				696	*/
				697	out:
				698	return pfn_to_page(pfn);
				699	}
				700	#endif
				701
				702	/*
				703	* copy one vm_area from one task to the other. Assumes the page tables
				704	* already present in the new task to be cleared in the whole range
				705	* covered by this vma.
				706	*/
				707
				708	static inline unsigned long
				709	copy_one_pte(struct mm_struct dst_mm, struct mm_struct src_mm,
				710	pte_t dst_pte, pte_t src_pte, struct vm_area_struct *vma,
				711	unsigned long addr, int *rss)
				712	{
				713	unsigned long vm_flags = vma->vm_flags;
				714	pte_t pte = *src_pte;
				715	struct page *page;
				716
				717	/* pte contains position in swap or file, so copy. */
				718	if (unlikely(!pte_present(pte))) {
				719	swp_entry_t entry = pte_to_swp_entry(pte);
				720
				721	if (likely(!non_swap_entry(entry))) {
				722	if (swap_duplicate(entry) < 0)
				723	return entry.val;
				724
				725	/* make sure dst_mm is on swapoff's mmlist. */
				726	if (unlikely(list_empty(&dst_mm->mmlist))) {
				727	spin_lock(&mmlist_lock);
				728	if (list_empty(&dst_mm->mmlist))
				729	list_add(&dst_mm->mmlist,
				730	&src_mm->mmlist);
				731	spin_unlock(&mmlist_lock);
				732	}
				733	rss[MM_SWAPENTS]++;
				734	} else if (is_migration_entry(entry)) {
				735	page = migration_entry_to_page(entry);
				736
				737	rss[mm_counter(page)]++;
				738
				739	if (is_write_migration_entry(entry) &&
				740	is_cow_mapping(vm_flags)) {
				741	/*
				742	* COW mappings require pages in both
				743	* parent and child to be set to read.
				744	*/
				745	make_migration_entry_read(&entry);
				746	pte = swp_entry_to_pte(entry);
				747	if (pte_swp_soft_dirty(*src_pte))
				748	pte = pte_swp_mksoft_dirty(pte);
				749	set_pte_at(src_mm, addr, src_pte, pte);
				750	}
				751	} else if (is_device_private_entry(entry)) {
				752	page = device_private_entry_to_page(entry);
				753
				754	/*
				755	* Update rss count even for unaddressable pages, as
				756	* they should treated just like normal pages in this
				757	* respect.
				758	*
				759	* We will likely want to have some new rss counters
				760	* for unaddressable pages, at some point. But for now
				761	* keep things as they are.
				762	*/
				763	get_page(page);
				764	rss[mm_counter(page)]++;
				765	page_dup_rmap(page, false);
				766
				767	/*
				768	* We do not preserve soft-dirty information, because so
				769	* far, checkpoint/restore is the only feature that
				770	* requires that. And checkpoint/restore does not work
				771	* when a device driver is involved (you cannot easily
				772	* save and restore device driver state).
				773	*/
				774	if (is_write_device_private_entry(entry) &&
				775	is_cow_mapping(vm_flags)) {
				776	make_device_private_entry_read(&entry);
				777	pte = swp_entry_to_pte(entry);
				778	set_pte_at(src_mm, addr, src_pte, pte);
				779	}
				780	}
				781	goto out_set_pte;
				782	}
				783
				784	/*
				785	* If it's a COW mapping, write protect it both
				786	* in the parent and the child
				787	*/
				788	if (is_cow_mapping(vm_flags) && pte_write(pte)) {
				789	ptep_set_wrprotect(src_mm, addr, src_pte);
				790	pte = pte_wrprotect(pte);
				791	}
				792
				793	/*
				794	* If it's a shared mapping, mark it clean in
				795	* the child
				796	*/
				797	if (vm_flags & VM_SHARED)
				798	pte = pte_mkclean(pte);
				799	pte = pte_mkold(pte);
				800
				801	page = vm_normal_page(vma, addr, pte);
				802	if (page) {
				803	get_page(page);
				804	page_dup_rmap(page, false);
				805	rss[mm_counter(page)]++;
				806	} else if (pte_devmap(pte)) {
				807	page = pte_page(pte);
				808	}
				809
				810	out_set_pte:
				811	set_pte_at(dst_mm, addr, dst_pte, pte);
				812	return 0;
				813	}
				814
				815	static int copy_pte_range(struct mm_struct dst_mm, struct mm_struct src_mm,
				816	pmd_t dst_pmd, pmd_t src_pmd, struct vm_area_struct *vma,
				817	unsigned long addr, unsigned long end)
				818	{
				819	pte_t orig_src_pte, orig_dst_pte;
				820	pte_t src_pte, dst_pte;
				821	spinlock_t src_ptl, dst_ptl;
				822	int progress = 0;
				823	int rss[NR_MM_COUNTERS];
				824	swp_entry_t entry = (swp_entry_t){0};
				825
				826	again:
				827	init_rss_vec(rss);
				828
				829	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
				830	if (!dst_pte)
				831	return -ENOMEM;
				832	src_pte = pte_offset_map(src_pmd, addr);
				833	src_ptl = pte_lockptr(src_mm, src_pmd);
				834	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
				835	orig_src_pte = src_pte;
				836	orig_dst_pte = dst_pte;
				837	arch_enter_lazy_mmu_mode();
				838
				839	do {
				840	/*
				841	* We are holding two locks at this point - either of them
				842	* could generate latencies in another task on another CPU.
				843	*/
				844	if (progress >= 32) {
				845	progress = 0;
				846	if (need_resched() \|\|
				847	spin_needbreak(src_ptl) \|\| spin_needbreak(dst_ptl))
				848	break;
				849	}
				850	if (pte_none(*src_pte)) {
				851	progress++;
				852	continue;
				853	}
				854	entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
				855	vma, addr, rss);
				856	if (entry.val)
				857	break;
				858	progress += 8;
				859	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
				860
				861	arch_leave_lazy_mmu_mode();
				862	spin_unlock(src_ptl);
				863	pte_unmap(orig_src_pte);
				864	add_mm_rss_vec(dst_mm, rss);
				865	pte_unmap_unlock(orig_dst_pte, dst_ptl);
				866	cond_resched();
				867
				868	if (entry.val) {
				869	if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
				870	return -ENOMEM;
				871	progress = 0;
				872	}
				873	if (addr != end)
				874	goto again;
				875	return 0;
				876	}
				877
				878	static inline int copy_pmd_range(struct mm_struct dst_mm, struct mm_struct src_mm,
				879	pud_t dst_pud, pud_t src_pud, struct vm_area_struct *vma,
				880	unsigned long addr, unsigned long end)
				881	{
				882	pmd_t src_pmd, dst_pmd;
				883	unsigned long next;
				884
				885	dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
				886	if (!dst_pmd)
				887	return -ENOMEM;
				888	src_pmd = pmd_offset(src_pud, addr);
				889	do {
				890	next = pmd_addr_end(addr, end);
				891	if (is_swap_pmd(src_pmd) \|\| pmd_trans_huge(src_pmd)
				892	\|\| pmd_devmap(*src_pmd)) {
				893	int err;
				894	VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
				895	err = copy_huge_pmd(dst_mm, src_mm,
				896	dst_pmd, src_pmd, addr, vma);
				897	if (err == -ENOMEM)
				898	return -ENOMEM;
				899	if (!err)
				900	continue;
				901	/* fall through */
				902	}
				903	if (pmd_none_or_clear_bad(src_pmd))
				904	continue;
				905	if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
				906	vma, addr, next))
				907	return -ENOMEM;
				908	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
				909	return 0;
				910	}
				911
				912	static inline int copy_pud_range(struct mm_struct dst_mm, struct mm_struct src_mm,
				913	p4d_t dst_p4d, p4d_t src_p4d, struct vm_area_struct *vma,
				914	unsigned long addr, unsigned long end)
				915	{
				916	pud_t src_pud, dst_pud;
				917	unsigned long next;
				918
				919	dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
				920	if (!dst_pud)
				921	return -ENOMEM;
				922	src_pud = pud_offset(src_p4d, addr);
				923	do {
				924	next = pud_addr_end(addr, end);
				925	if (pud_trans_huge(src_pud) \|\| pud_devmap(src_pud)) {
				926	int err;
				927
				928	VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
				929	err = copy_huge_pud(dst_mm, src_mm,
				930	dst_pud, src_pud, addr, vma);
				931	if (err == -ENOMEM)
				932	return -ENOMEM;
				933	if (!err)
				934	continue;
				935	/* fall through */
				936	}
				937	if (pud_none_or_clear_bad(src_pud))
				938	continue;
				939	if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
				940	vma, addr, next))
				941	return -ENOMEM;
				942	} while (dst_pud++, src_pud++, addr = next, addr != end);
				943	return 0;
				944	}
				945
				946	static inline int copy_p4d_range(struct mm_struct dst_mm, struct mm_struct src_mm,
				947	pgd_t dst_pgd, pgd_t src_pgd, struct vm_area_struct *vma,
				948	unsigned long addr, unsigned long end)
				949	{
				950	p4d_t src_p4d, dst_p4d;
				951	unsigned long next;
				952
				953	dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr);
				954	if (!dst_p4d)
				955	return -ENOMEM;
				956	src_p4d = p4d_offset(src_pgd, addr);
				957	do {
				958	next = p4d_addr_end(addr, end);
				959	if (p4d_none_or_clear_bad(src_p4d))
				960	continue;
				961	if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
				962	vma, addr, next))
				963	return -ENOMEM;
				964	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
				965	return 0;
				966	}
				967
				968	int copy_page_range(struct mm_struct dst_mm, struct mm_struct src_mm,
				969	struct vm_area_struct *vma)
				970	{
				971	pgd_t src_pgd, dst_pgd;
				972	unsigned long next;
				973	unsigned long addr = vma->vm_start;
				974	unsigned long end = vma->vm_end;
				975	struct mmu_notifier_range range;
				976	bool is_cow;
				977	int ret;
				978
				979	/*
				980	* Don't copy ptes where a page fault will fill them correctly.
				981	* Fork becomes much lighter when there are big shared or private
				982	* readonly mappings. The tradeoff is that copy_page_range is more
				983	* efficient than faulting.
				984	*/
				985	if (!(vma->vm_flags & (VM_HUGETLB \| VM_PFNMAP \| VM_MIXEDMAP)) &&
				986	!vma->anon_vma)
				987	return 0;
				988
				989	if (is_vm_hugetlb_page(vma))
				990	return copy_hugetlb_page_range(dst_mm, src_mm, vma);
				991
				992	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
				993	/*
				994	* We do not free on error cases below as remove_vma
				995	* gets called on error from higher level routine
				996	*/
				997	ret = track_pfn_copy(vma);
				998	if (ret)
				999	return ret;
				1000	}
				1001
				1002	/*
				1003	* We need to invalidate the secondary MMU mappings only when
				1004	* there could be a permission downgrade on the ptes of the
				1005	* parent mm. And a permission downgrade will only happen if
				1006	* is_cow_mapping() returns true.
				1007	*/
				1008	is_cow = is_cow_mapping(vma->vm_flags);
				1009
				1010	if (is_cow) {
				1011	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
				1012	0, vma, src_mm, addr, end);
				1013	mmu_notifier_invalidate_range_start(&range);
				1014	}
				1015
				1016	ret = 0;
				1017	dst_pgd = pgd_offset(dst_mm, addr);
				1018	src_pgd = pgd_offset(src_mm, addr);
				1019	do {
				1020	next = pgd_addr_end(addr, end);
				1021	if (pgd_none_or_clear_bad(src_pgd))
				1022	continue;
				1023	if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
				1024	vma, addr, next))) {
				1025	ret = -ENOMEM;
				1026	break;
				1027	}
				1028	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
				1029
				1030	if (is_cow)
				1031	mmu_notifier_invalidate_range_end(&range);
				1032	return ret;
				1033	}
				1034
				1035	/* Whether we should zap all COWed (private) pages too */
				1036	static inline bool should_zap_cows(struct zap_details *details)
				1037	{
				1038	/* By default, zap all pages */
				1039	if (!details)
				1040	return true;
				1041
				1042	/* Or, we zap COWed pages only if the caller wants to */
				1043	return !details->check_mapping;
				1044	}
				1045
				1046	static unsigned long zap_pte_range(struct mmu_gather *tlb,
				1047	struct vm_area_struct vma, pmd_t pmd,
				1048	unsigned long addr, unsigned long end,
				1049	struct zap_details *details)
				1050	{
				1051	struct mm_struct *mm = tlb->mm;
				1052	int force_flush = 0;
				1053	int rss[NR_MM_COUNTERS];
				1054	spinlock_t *ptl;
				1055	pte_t *start_pte;
				1056	pte_t *pte;
				1057	swp_entry_t entry;
				1058
				1059	tlb_change_page_size(tlb, PAGE_SIZE);
				1060	again:
				1061	init_rss_vec(rss);
				1062	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				1063	pte = start_pte;
				1064	flush_tlb_batched_pending(mm);
				1065	arch_enter_lazy_mmu_mode();
				1066	do {
				1067	pte_t ptent = *pte;
				1068	if (pte_none(ptent))
				1069	continue;
				1070
				1071	if (need_resched())
				1072	break;
				1073
				1074	if (pte_present(ptent)) {
				1075	struct page *page;
				1076
				1077	page = vm_normal_page(vma, addr, ptent);
				1078	if (unlikely(details) && page) {
				1079	/*
				1080	* unmap_shared_mapping_pages() wants to
				1081	* invalidate cache without truncating:
				1082	* unmap shared but keep private pages.
				1083	*/
				1084	if (details->check_mapping &&
				1085	details->check_mapping != page_rmapping(page))
				1086	continue;
				1087	}
				1088	ptent = ptep_get_and_clear_full(mm, addr, pte,
				1089	tlb->fullmm);
				1090	tlb_remove_tlb_entry(tlb, pte, addr);
				1091	if (unlikely(!page))
				1092	continue;
				1093
				1094	if (!PageAnon(page)) {
				1095	if (pte_dirty(ptent)) {
				1096	force_flush = 1;
				1097	set_page_dirty(page);
				1098	}
				1099	if (pte_young(ptent) &&
				1100	likely(!(vma->vm_flags & VM_SEQ_READ)))
				1101	mark_page_accessed(page);
				1102	}
				1103	rss[mm_counter(page)]--;
				1104	page_remove_rmap(page, false);
				1105	if (unlikely(page_mapcount(page) < 0))
				1106	print_bad_pte(vma, addr, ptent, page);
				1107	if (unlikely(__tlb_remove_page(tlb, page))) {
				1108	force_flush = 1;
				1109	addr += PAGE_SIZE;
				1110	break;
				1111	}
				1112	continue;
				1113	}
				1114
				1115	entry = pte_to_swp_entry(ptent);
				1116	if (non_swap_entry(entry) && is_device_private_entry(entry)) {
				1117	struct page *page = device_private_entry_to_page(entry);
				1118
				1119	if (unlikely(details && details->check_mapping)) {
				1120	/*
				1121	* unmap_shared_mapping_pages() wants to
				1122	* invalidate cache without truncating:
				1123	* unmap shared but keep private pages.
				1124	*/
				1125	if (details->check_mapping !=
				1126	page_rmapping(page))
				1127	continue;
				1128	}
				1129
				1130	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
				1131	rss[mm_counter(page)]--;
				1132	page_remove_rmap(page, false);
				1133	put_page(page);
				1134	continue;
				1135	}
				1136
				1137	if (!non_swap_entry(entry)) {
				1138	/* Genuine swap entry, hence a private anon page */
				1139	if (!should_zap_cows(details))
				1140	continue;
				1141	rss[MM_SWAPENTS]--;
				1142	} else if (is_migration_entry(entry)) {
				1143	struct page *page;
				1144
				1145	page = migration_entry_to_page(entry);
				1146	if (details && details->check_mapping &&
				1147	details->check_mapping != page_rmapping(page))
				1148	continue;
				1149	rss[mm_counter(page)]--;
				1150	}
				1151	if (unlikely(!free_swap_and_cache(entry)))
				1152	print_bad_pte(vma, addr, ptent, NULL);
				1153	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
				1154	} while (pte++, addr += PAGE_SIZE, addr != end);
				1155
				1156	add_mm_rss_vec(mm, rss);
				1157	arch_leave_lazy_mmu_mode();
				1158
				1159	/* Do the actual TLB flush before dropping ptl */
				1160	if (force_flush)
				1161	tlb_flush_mmu_tlbonly(tlb);
				1162	pte_unmap_unlock(start_pte, ptl);
				1163
				1164	/*
				1165	* If we forced a TLB flush (either due to running out of
				1166	* batch buffers or because we needed to flush dirty TLB
				1167	* entries before releasing the ptl), free the batched
				1168	* memory too. Restart if we didn't do everything.
				1169	*/
				1170	if (force_flush) {
				1171	force_flush = 0;
				1172	tlb_flush_mmu(tlb);
				1173	}
				1174
				1175	if (addr != end) {
				1176	cond_resched();
				1177	goto again;
				1178	}
				1179
				1180	return addr;
				1181	}
				1182
				1183	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				1184	struct vm_area_struct vma, pud_t pud,
				1185	unsigned long addr, unsigned long end,
				1186	struct zap_details *details)
				1187	{
				1188	pmd_t *pmd;
				1189	unsigned long next;
				1190
				1191	pmd = pmd_offset(pud, addr);
				1192	do {
				1193	next = pmd_addr_end(addr, end);
				1194	if (is_swap_pmd(pmd) \|\| pmd_trans_huge(pmd) \|\| pmd_devmap(*pmd)) {
				1195	if (next - addr != HPAGE_PMD_SIZE)
				1196	__split_huge_pmd(vma, pmd, addr, false, NULL);
				1197	else if (zap_huge_pmd(tlb, vma, pmd, addr))
				1198	goto next;
				1199	/* fall through */
				1200	} else if (details && details->single_page &&
				1201	PageTransCompound(details->single_page) &&
				1202	next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
				1203	spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
				1204	/*
				1205	* Take and drop THP pmd lock so that we cannot return
				1206	* prematurely, while zap_huge_pmd() has cleared *pmd,
				1207	* but not yet decremented compound_mapcount().
				1208	*/
				1209	spin_unlock(ptl);
				1210	}
				1211
				1212	/*
				1213	* Here there can be other concurrent MADV_DONTNEED or
				1214	* trans huge page faults running, and if the pmd is
				1215	* none or trans huge it can change under us. This is
				1216	* because MADV_DONTNEED holds the mmap_sem in read
				1217	* mode.
				1218	*/
				1219	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				1220	goto next;
				1221	next = zap_pte_range(tlb, vma, pmd, addr, next, details);
				1222	next:
				1223	cond_resched();
				1224	} while (pmd++, addr = next, addr != end);
				1225
				1226	return addr;
				1227	}
				1228
				1229	static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				1230	struct vm_area_struct vma, p4d_t p4d,
				1231	unsigned long addr, unsigned long end,
				1232	struct zap_details *details)
				1233	{
				1234	pud_t *pud;
				1235	unsigned long next;
				1236
				1237	pud = pud_offset(p4d, addr);
				1238	do {
				1239	next = pud_addr_end(addr, end);
				1240	if (pud_trans_huge(pud) \|\| pud_devmap(pud)) {
				1241	if (next - addr != HPAGE_PUD_SIZE) {
				1242	VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
				1243	split_huge_pud(vma, pud, addr);
				1244	} else if (zap_huge_pud(tlb, vma, pud, addr))
				1245	goto next;
				1246	/* fall through */
				1247	}
				1248	if (pud_none_or_clear_bad(pud))
				1249	continue;
				1250	next = zap_pmd_range(tlb, vma, pud, addr, next, details);
				1251	next:
				1252	cond_resched();
				1253	} while (pud++, addr = next, addr != end);
				1254
				1255	return addr;
				1256	}
				1257
				1258	static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
				1259	struct vm_area_struct vma, pgd_t pgd,
				1260	unsigned long addr, unsigned long end,
				1261	struct zap_details *details)
				1262	{
				1263	p4d_t *p4d;
				1264	unsigned long next;
				1265
				1266	p4d = p4d_offset(pgd, addr);
				1267	do {
				1268	next = p4d_addr_end(addr, end);
				1269	if (p4d_none_or_clear_bad(p4d))
				1270	continue;
				1271	next = zap_pud_range(tlb, vma, p4d, addr, next, details);
				1272	} while (p4d++, addr = next, addr != end);
				1273
				1274	return addr;
				1275	}
				1276
				1277	void unmap_page_range(struct mmu_gather *tlb,
				1278	struct vm_area_struct *vma,
				1279	unsigned long addr, unsigned long end,
				1280	struct zap_details *details)
				1281	{
				1282	pgd_t *pgd;
				1283	unsigned long next;
				1284
				1285	BUG_ON(addr >= end);
				1286	tlb_start_vma(tlb, vma);
				1287	pgd = pgd_offset(vma->vm_mm, addr);
				1288	do {
				1289	next = pgd_addr_end(addr, end);
				1290	if (pgd_none_or_clear_bad(pgd))
				1291	continue;
				1292	next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
				1293	} while (pgd++, addr = next, addr != end);
				1294	tlb_end_vma(tlb, vma);
				1295	}
				1296
				1297
				1298	static void unmap_single_vma(struct mmu_gather *tlb,
				1299	struct vm_area_struct *vma, unsigned long start_addr,
				1300	unsigned long end_addr,
				1301	struct zap_details *details)
				1302	{
				1303	unsigned long start = max(vma->vm_start, start_addr);
				1304	unsigned long end;
				1305
				1306	if (start >= vma->vm_end)
				1307	return;
				1308	end = min(vma->vm_end, end_addr);
				1309	if (end <= vma->vm_start)
				1310	return;
				1311
				1312	if (vma->vm_file)
				1313	uprobe_munmap(vma, start, end);
				1314
				1315	if (unlikely(vma->vm_flags & VM_PFNMAP))
				1316	untrack_pfn(vma, 0, 0);
				1317
				1318	if (start != end) {
				1319	if (unlikely(is_vm_hugetlb_page(vma))) {
				1320	/*
				1321	* It is undesirable to test vma->vm_file as it
				1322	* should be non-null for valid hugetlb area.
				1323	* However, vm_file will be NULL in the error
				1324	* cleanup path of mmap_region. When
				1325	* hugetlbfs ->mmap method fails,
				1326	* mmap_region() nullifies vma->vm_file
				1327	* before calling this function to clean up.
				1328	* Since no pte has actually been setup, it is
				1329	* safe to do nothing in this case.
				1330	*/
				1331	if (vma->vm_file) {
				1332	i_mmap_lock_write(vma->vm_file->f_mapping);
				1333	__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
				1334	i_mmap_unlock_write(vma->vm_file->f_mapping);
				1335	}
				1336	} else
				1337	unmap_page_range(tlb, vma, start, end, details);
				1338	}
				1339	}
				1340
				1341	/**
				1342	* unmap_vmas - unmap a range of memory covered by a list of vma's
				1343	* @tlb: address of the caller's struct mmu_gather
				1344	* @vma: the starting vma
				1345	* @start_addr: virtual address at which to start unmapping
				1346	* @end_addr: virtual address at which to end unmapping
				1347	*
				1348	* Unmap all pages in the vma list.
				1349	*
				1350	* Only addresses between `start' and `end' will be unmapped.
				1351	*
				1352	* The VMA list must be sorted in ascending virtual address order.
				1353	*
				1354	* unmap_vmas() assumes that the caller will flush the whole unmapped address
				1355	* range after unmap_vmas() returns. So the only responsibility here is to
				1356	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
				1357	* drops the lock and schedules.
				1358	*/
				1359	void unmap_vmas(struct mmu_gather *tlb,
				1360	struct vm_area_struct *vma, unsigned long start_addr,
				1361	unsigned long end_addr)
				1362	{
				1363	struct mmu_notifier_range range;
				1364
				1365	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				1366	start_addr, end_addr);
				1367	mmu_notifier_invalidate_range_start(&range);
				1368	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
				1369	unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
				1370	mmu_notifier_invalidate_range_end(&range);
				1371	}
				1372
				1373	/**
				1374	* zap_page_range - remove user pages in a given range
				1375	* @vma: vm_area_struct holding the applicable pages
				1376	* @start: starting address of pages to zap
				1377	* @size: number of bytes to zap
				1378	*
				1379	* Caller must protect the VMA list
				1380	*/
				1381	void zap_page_range(struct vm_area_struct *vma, unsigned long start,
				1382	unsigned long size)
				1383	{
				1384	struct mmu_notifier_range range;
				1385	struct mmu_gather tlb;
				1386
				1387	lru_add_drain();
				1388	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				1389	start, start + size);
				1390	tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
				1391	update_hiwater_rss(vma->vm_mm);
				1392	mmu_notifier_invalidate_range_start(&range);
				1393	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
				1394	unmap_single_vma(&tlb, vma, start, range.end, NULL);
				1395	mmu_notifier_invalidate_range_end(&range);
				1396	tlb_finish_mmu(&tlb, start, range.end);
				1397	}
				1398
				1399	/**
				1400	* zap_page_range_single - remove user pages in a given range
				1401	* @vma: vm_area_struct holding the applicable pages
				1402	* @address: starting address of pages to zap
				1403	* @size: number of bytes to zap
				1404	* @details: details of shared cache invalidation
				1405	*
				1406	* The range must fit into one VMA.
				1407	*/
				1408	static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
				1409	unsigned long size, struct zap_details *details)
				1410	{
				1411	struct mmu_notifier_range range;
				1412	struct mmu_gather tlb;
				1413
				1414	lru_add_drain();
				1415	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				1416	address, address + size);
				1417	tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
				1418	update_hiwater_rss(vma->vm_mm);
				1419	mmu_notifier_invalidate_range_start(&range);
				1420	unmap_single_vma(&tlb, vma, address, range.end, details);
				1421	mmu_notifier_invalidate_range_end(&range);
				1422	tlb_finish_mmu(&tlb, address, range.end);
				1423	}
				1424
				1425	/**
				1426	* zap_vma_ptes - remove ptes mapping the vma
				1427	* @vma: vm_area_struct holding ptes to be zapped
				1428	* @address: starting address of pages to zap
				1429	* @size: number of bytes to zap
				1430	*
				1431	* This function only unmaps ptes assigned to VM_PFNMAP vmas.
				1432	*
				1433	* The entire address range must be fully contained within the vma.
				1434	*
				1435	*/
				1436	void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
				1437	unsigned long size)
				1438	{
				1439	if (address < vma->vm_start \|\| address + size > vma->vm_end \|\|
				1440	!(vma->vm_flags & VM_PFNMAP))
				1441	return;
				1442
				1443	zap_page_range_single(vma, address, size, NULL);
				1444	}
				1445	EXPORT_SYMBOL_GPL(zap_vma_ptes);
				1446
				1447	pte_t __get_locked_pte(struct mm_struct mm, unsigned long addr,
				1448	spinlock_t **ptl)
				1449	{
				1450	pgd_t *pgd;
				1451	p4d_t *p4d;
				1452	pud_t *pud;
				1453	pmd_t *pmd;
				1454
				1455	pgd = pgd_offset(mm, addr);
				1456	p4d = p4d_alloc(mm, pgd, addr);
				1457	if (!p4d)
				1458	return NULL;
				1459	pud = pud_alloc(mm, p4d, addr);
				1460	if (!pud)
				1461	return NULL;
				1462	pmd = pmd_alloc(mm, pud, addr);
				1463	if (!pmd)
				1464	return NULL;
				1465
				1466	VM_BUG_ON(pmd_trans_huge(*pmd));
				1467	return pte_alloc_map_lock(mm, pmd, addr, ptl);
				1468	}
				1469
				1470	/*
				1471	* This is the old fallback for page remapping.
				1472	*
				1473	* For historical reasons, it only allows reserved pages. Only
				1474	* old drivers should use this, and they needed to mark their
				1475	* pages reserved for the old functions anyway.
				1476	*/
				1477	static int insert_page(struct vm_area_struct *vma, unsigned long addr,
				1478	struct page *page, pgprot_t prot)
				1479	{
				1480	struct mm_struct *mm = vma->vm_mm;
				1481	int retval;
				1482	pte_t *pte;
				1483	spinlock_t *ptl;
				1484
				1485	retval = -EINVAL;
				1486	if (PageAnon(page) \|\| PageSlab(page) \|\| page_has_type(page))
				1487	goto out;
				1488	retval = -ENOMEM;
				1489	flush_dcache_page(page);
				1490	pte = get_locked_pte(mm, addr, &ptl);
				1491	if (!pte)
				1492	goto out;
				1493	retval = -EBUSY;
				1494	if (!pte_none(*pte))
				1495	goto out_unlock;
				1496
				1497	/* Ok, finally just insert the thing.. */
				1498	get_page(page);
				1499	inc_mm_counter_fast(mm, mm_counter_file(page));
				1500	page_add_file_rmap(page, false);
				1501	set_pte_at(mm, addr, pte, mk_pte(page, prot));
				1502
				1503	retval = 0;
				1504	out_unlock:
				1505	pte_unmap_unlock(pte, ptl);
				1506	out:
				1507	return retval;
				1508	}
				1509
				1510	/**
				1511	* vm_insert_page - insert single page into user vma
				1512	* @vma: user vma to map to
				1513	* @addr: target user address of this page
				1514	* @page: source kernel page
				1515	*
				1516	* This allows drivers to insert individual pages they've allocated
				1517	* into a user vma.
				1518	*
				1519	* The page has to be a nice clean _individual_ kernel allocation.
				1520	* If you allocate a compound page, you need to have marked it as
				1521	* such (__GFP_COMP), or manually just split the page up yourself
				1522	* (see split_page()).
				1523	*
				1524	* NOTE! Traditionally this was done with "remap_pfn_range()" which
				1525	* took an arbitrary page protection parameter. This doesn't allow
				1526	* that. Your vma protection will have to be set up correctly, which
				1527	* means that if you want a shared writable mapping, you'd better
				1528	* ask for a shared writable mapping!
				1529	*
				1530	* The page does not need to be reserved.
				1531	*
				1532	* Usually this function is called from f_op->mmap() handler
				1533	* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
				1534	* Caller must set VM_MIXEDMAP on vma if it wants to call this
				1535	* function from other places, for example from page-fault handler.
				1536	*
				1537	* Return: %0 on success, negative error code otherwise.
				1538	*/
				1539	int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
				1540	struct page *page)
				1541	{
				1542	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
				1543	return -EFAULT;
				1544	if (!page_count(page))
				1545	return -EINVAL;
				1546	if (!(vma->vm_flags & VM_MIXEDMAP)) {
				1547	BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
				1548	BUG_ON(vma->vm_flags & VM_PFNMAP);
				1549	vma->vm_flags \|= VM_MIXEDMAP;
				1550	}
				1551	return insert_page(vma, addr, page, vma->vm_page_prot);
				1552	}
				1553	EXPORT_SYMBOL(vm_insert_page);
				1554
				1555	/*
				1556	* __vm_map_pages - maps range of kernel pages into user vma
				1557	* @vma: user vma to map to
				1558	* @pages: pointer to array of source kernel pages
				1559	* @num: number of pages in page array
				1560	* @offset: user's requested vm_pgoff
				1561	*
				1562	* This allows drivers to map range of kernel pages into a user vma.
				1563	*
				1564	* Return: 0 on success and error code otherwise.
				1565	*/
				1566	static int __vm_map_pages(struct vm_area_struct vma, struct page *pages,
				1567	unsigned long num, unsigned long offset)
				1568	{
				1569	unsigned long count = vma_pages(vma);
				1570	unsigned long uaddr = vma->vm_start;
				1571	int ret, i;
				1572
				1573	/* Fail if the user requested offset is beyond the end of the object */
				1574	if (offset >= num)
				1575	return -ENXIO;
				1576
				1577	/* Fail if the user requested size exceeds available object size */
				1578	if (count > num - offset)
				1579	return -ENXIO;
				1580
				1581	for (i = 0; i < count; i++) {
				1582	ret = vm_insert_page(vma, uaddr, pages[offset + i]);
				1583	if (ret < 0)
				1584	return ret;
				1585	uaddr += PAGE_SIZE;
				1586	}
				1587
				1588	return 0;
				1589	}
				1590
				1591	/**
				1592	* vm_map_pages - maps range of kernel pages starts with non zero offset
				1593	* @vma: user vma to map to
				1594	* @pages: pointer to array of source kernel pages
				1595	* @num: number of pages in page array
				1596	*
				1597	* Maps an object consisting of @num pages, catering for the user's
				1598	* requested vm_pgoff
				1599	*
				1600	* If we fail to insert any page into the vma, the function will return
				1601	* immediately leaving any previously inserted pages present. Callers
				1602	* from the mmap handler may immediately return the error as their caller
				1603	* will destroy the vma, removing any successfully inserted pages. Other
				1604	* callers should make their own arrangements for calling unmap_region().
				1605	*
				1606	* Context: Process context. Called by mmap handlers.
				1607	* Return: 0 on success and error code otherwise.
				1608	*/
				1609	int vm_map_pages(struct vm_area_struct vma, struct page *pages,
				1610	unsigned long num)
				1611	{
				1612	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
				1613	}
				1614	EXPORT_SYMBOL(vm_map_pages);
				1615
				1616	/**
				1617	* vm_map_pages_zero - map range of kernel pages starts with zero offset
				1618	* @vma: user vma to map to
				1619	* @pages: pointer to array of source kernel pages
				1620	* @num: number of pages in page array
				1621	*
				1622	* Similar to vm_map_pages(), except that it explicitly sets the offset
				1623	* to 0. This function is intended for the drivers that did not consider
				1624	* vm_pgoff.
				1625	*
				1626	* Context: Process context. Called by mmap handlers.
				1627	* Return: 0 on success and error code otherwise.
				1628	*/
				1629	int vm_map_pages_zero(struct vm_area_struct vma, struct page *pages,
				1630	unsigned long num)
				1631	{
				1632	return __vm_map_pages(vma, pages, num, 0);
				1633	}
				1634	EXPORT_SYMBOL(vm_map_pages_zero);
				1635
				1636	static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
				1637	pfn_t pfn, pgprot_t prot, bool mkwrite)
				1638	{
				1639	struct mm_struct *mm = vma->vm_mm;
				1640	pte_t *pte, entry;
				1641	spinlock_t *ptl;
				1642
				1643	pte = get_locked_pte(mm, addr, &ptl);
				1644	if (!pte)
				1645	return VM_FAULT_OOM;
				1646	if (!pte_none(*pte)) {
				1647	if (mkwrite) {
				1648	/*
				1649	* For read faults on private mappings the PFN passed
				1650	* in may not match the PFN we have mapped if the
				1651	* mapped PFN is a writeable COW page. In the mkwrite
				1652	* case we are creating a writable PTE for a shared
				1653	* mapping and we expect the PFNs to match. If they
				1654	* don't match, we are likely racing with block
				1655	* allocation and mapping invalidation so just skip the
				1656	* update.
				1657	*/
				1658	if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
				1659	WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
				1660	goto out_unlock;
				1661	}
				1662	entry = pte_mkyoung(*pte);
				1663	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
				1664	if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				1665	update_mmu_cache(vma, addr, pte);
				1666	}
				1667	goto out_unlock;
				1668	}
				1669
				1670	/* Ok, finally just insert the thing.. */
				1671	if (pfn_t_devmap(pfn))
				1672	entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
				1673	else
				1674	entry = pte_mkspecial(pfn_t_pte(pfn, prot));
				1675
				1676	if (mkwrite) {
				1677	entry = pte_mkyoung(entry);
				1678	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
				1679	}
				1680
				1681	set_pte_at(mm, addr, pte, entry);
				1682	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
				1683
				1684	out_unlock:
				1685	pte_unmap_unlock(pte, ptl);
				1686	return VM_FAULT_NOPAGE;
				1687	}
				1688
				1689	/**
				1690	* vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
				1691	* @vma: user vma to map to
				1692	* @addr: target user address of this page
				1693	* @pfn: source kernel pfn
				1694	* @pgprot: pgprot flags for the inserted page
				1695	*
				1696	* This is exactly like vmf_insert_pfn(), except that it allows drivers to
				1697	* to override pgprot on a per-page basis.
				1698	*
				1699	* This only makes sense for IO mappings, and it makes no sense for
				1700	* COW mappings. In general, using multiple vmas is preferable;
				1701	* vmf_insert_pfn_prot should only be used if using multiple VMAs is
				1702	* impractical.
				1703	*
				1704	* Context: Process context. May allocate using %GFP_KERNEL.
				1705	* Return: vm_fault_t value.
				1706	*/
				1707	vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
				1708	unsigned long pfn, pgprot_t pgprot)
				1709	{
				1710	/*
				1711	* Technically, architectures with pte_special can avoid all these
				1712	* restrictions (same for remap_pfn_range). However we would like
				1713	* consistency in testing and feature parity among all, so we should
				1714	* try to keep these invariants in place for everybody.
				1715	*/
				1716	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)));
				1717	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
				1718	(VM_PFNMAP\|VM_MIXEDMAP));
				1719	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
				1720	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
				1721
				1722	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
				1723	return VM_FAULT_SIGBUS;
				1724
				1725	if (!pfn_modify_allowed(pfn, pgprot))
				1726	return VM_FAULT_SIGBUS;
				1727
				1728	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
				1729
				1730	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
				1731	false);
				1732	}
				1733	EXPORT_SYMBOL(vmf_insert_pfn_prot);
				1734
				1735	/**
				1736	* vmf_insert_pfn - insert single pfn into user vma
				1737	* @vma: user vma to map to
				1738	* @addr: target user address of this page
				1739	* @pfn: source kernel pfn
				1740	*
				1741	* Similar to vm_insert_page, this allows drivers to insert individual pages
				1742	* they've allocated into a user vma. Same comments apply.
				1743	*
				1744	* This function should only be called from a vm_ops->fault handler, and
				1745	* in that case the handler should return the result of this function.
				1746	*
				1747	* vma cannot be a COW mapping.
				1748	*
				1749	* As this is called only for pages that do not currently exist, we
				1750	* do not need to flush old virtual caches or the TLB.
				1751	*
				1752	* Context: Process context. May allocate using %GFP_KERNEL.
				1753	* Return: vm_fault_t value.
				1754	*/
				1755	vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
				1756	unsigned long pfn)
				1757	{
				1758	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
				1759	}
				1760	EXPORT_SYMBOL(vmf_insert_pfn);
				1761
				1762	static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
				1763	{
				1764	/* these checks mirror the abort conditions in vm_normal_page */
				1765	if (vma->vm_flags & VM_MIXEDMAP)
				1766	return true;
				1767	if (pfn_t_devmap(pfn))
				1768	return true;
				1769	if (pfn_t_special(pfn))
				1770	return true;
				1771	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
				1772	return true;
				1773	return false;
				1774	}
				1775
				1776	static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
				1777	unsigned long addr, pfn_t pfn, bool mkwrite)
				1778	{
				1779	pgprot_t pgprot = vma->vm_page_prot;
				1780	int err;
				1781
				1782	BUG_ON(!vm_mixed_ok(vma, pfn));
				1783
				1784	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
				1785	return VM_FAULT_SIGBUS;
				1786
				1787	track_pfn_insert(vma, &pgprot, pfn);
				1788
				1789	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
				1790	return VM_FAULT_SIGBUS;
				1791
				1792	/*
				1793	* If we don't have pte special, then we have to use the pfn_valid()
				1794	* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we must
				1795	* refcount the page if pfn_valid is true (hence insert_page rather
				1796	* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
				1797	* without pte special, it would there be refcounted as a normal page.
				1798	*/
				1799	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
				1800	!pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
				1801	struct page *page;
				1802
				1803	/*
				1804	* At this point we are committed to insert_page()
				1805	* regardless of whether the caller specified flags that
				1806	* result in pfn_t_has_page() == false.
				1807	*/
				1808	page = pfn_to_page(pfn_t_to_pfn(pfn));
				1809	err = insert_page(vma, addr, page, pgprot);
				1810	} else {
				1811	return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
				1812	}
				1813
				1814	if (err == -ENOMEM)
				1815	return VM_FAULT_OOM;
				1816	if (err < 0 && err != -EBUSY)
				1817	return VM_FAULT_SIGBUS;
				1818
				1819	return VM_FAULT_NOPAGE;
				1820	}
				1821
				1822	vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
				1823	pfn_t pfn)
				1824	{
				1825	return __vm_insert_mixed(vma, addr, pfn, false);
				1826	}
				1827	EXPORT_SYMBOL(vmf_insert_mixed);
				1828
				1829	/*
				1830	* If the insertion of PTE failed because someone else already added a
				1831	* different entry in the mean time, we treat that as success as we assume
				1832	* the same entry was actually inserted.
				1833	*/
				1834	vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
				1835	unsigned long addr, pfn_t pfn)
				1836	{
				1837	return __vm_insert_mixed(vma, addr, pfn, true);
				1838	}
				1839	EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
				1840
				1841	/*
				1842	* maps a range of physical memory into the requested pages. the old
				1843	* mappings are removed. any references to nonexistent pages results
				1844	* in null mappings (currently treated as "copy-on-access")
				1845	*/
				1846	static int remap_pte_range(struct mm_struct mm, pmd_t pmd,
				1847	unsigned long addr, unsigned long end,
				1848	unsigned long pfn, pgprot_t prot)
				1849	{
				1850	pte_t pte, mapped_pte;
				1851	spinlock_t *ptl;
				1852	int err = 0;
				1853
				1854	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
				1855	if (!pte)
				1856	return -ENOMEM;
				1857	arch_enter_lazy_mmu_mode();
				1858	do {
				1859	BUG_ON(!pte_none(*pte));
				1860	if (!pfn_modify_allowed(pfn, prot)) {
				1861	err = -EACCES;
				1862	break;
				1863	}
				1864	set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
				1865	pfn++;
				1866	} while (pte++, addr += PAGE_SIZE, addr != end);
				1867	arch_leave_lazy_mmu_mode();
				1868	pte_unmap_unlock(mapped_pte, ptl);
				1869	return err;
				1870	}
				1871
				1872	static inline int remap_pmd_range(struct mm_struct mm, pud_t pud,
				1873	unsigned long addr, unsigned long end,
				1874	unsigned long pfn, pgprot_t prot)
				1875	{
				1876	pmd_t *pmd;
				1877	unsigned long next;
				1878	int err;
				1879
				1880	pfn -= addr >> PAGE_SHIFT;
				1881	pmd = pmd_alloc(mm, pud, addr);
				1882	if (!pmd)
				1883	return -ENOMEM;
				1884	VM_BUG_ON(pmd_trans_huge(*pmd));
				1885	do {
				1886	next = pmd_addr_end(addr, end);
				1887	err = remap_pte_range(mm, pmd, addr, next,
				1888	pfn + (addr >> PAGE_SHIFT), prot);
				1889	if (err)
				1890	return err;
				1891	} while (pmd++, addr = next, addr != end);
				1892	return 0;
				1893	}
				1894
				1895	static inline int remap_pud_range(struct mm_struct mm, p4d_t p4d,
				1896	unsigned long addr, unsigned long end,
				1897	unsigned long pfn, pgprot_t prot)
				1898	{
				1899	pud_t *pud;
				1900	unsigned long next;
				1901	int err;
				1902
				1903	pfn -= addr >> PAGE_SHIFT;
				1904	pud = pud_alloc(mm, p4d, addr);
				1905	if (!pud)
				1906	return -ENOMEM;
				1907	do {
				1908	next = pud_addr_end(addr, end);
				1909	err = remap_pmd_range(mm, pud, addr, next,
				1910	pfn + (addr >> PAGE_SHIFT), prot);
				1911	if (err)
				1912	return err;
				1913	} while (pud++, addr = next, addr != end);
				1914	return 0;
				1915	}
				1916
				1917	static inline int remap_p4d_range(struct mm_struct mm, pgd_t pgd,
				1918	unsigned long addr, unsigned long end,
				1919	unsigned long pfn, pgprot_t prot)
				1920	{
				1921	p4d_t *p4d;
				1922	unsigned long next;
				1923	int err;
				1924
				1925	pfn -= addr >> PAGE_SHIFT;
				1926	p4d = p4d_alloc(mm, pgd, addr);
				1927	if (!p4d)
				1928	return -ENOMEM;
				1929	do {
				1930	next = p4d_addr_end(addr, end);
				1931	err = remap_pud_range(mm, p4d, addr, next,
				1932	pfn + (addr >> PAGE_SHIFT), prot);
				1933	if (err)
				1934	return err;
				1935	} while (p4d++, addr = next, addr != end);
				1936	return 0;
				1937	}
				1938
				1939	static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
				1940	unsigned long pfn, unsigned long size, pgprot_t prot)
				1941	{
				1942	pgd_t *pgd;
				1943	unsigned long next;
				1944	unsigned long end = addr + PAGE_ALIGN(size);
				1945	struct mm_struct *mm = vma->vm_mm;
				1946	int err;
				1947
				1948	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
				1949	return -EINVAL;
				1950
				1951	/*
				1952	* Physically remapped pages are special. Tell the
				1953	* rest of the world about it:
				1954	* VM_IO tells people not to look at these pages
				1955	* (accesses can have side effects).
				1956	* VM_PFNMAP tells the core MM that the base pages are just
				1957	* raw PFN mappings, and do not have a "struct page" associated
				1958	* with them.
				1959	* VM_DONTEXPAND
				1960	* Disable vma merging and expanding with mremap().
				1961	* VM_DONTDUMP
				1962	* Omit vma from core dump, even when VM_IO turned off.
				1963	*
				1964	* There's a horrible special case to handle copy-on-write
				1965	* behaviour that some programs depend on. We mark the "original"
				1966	* un-COW'ed pages by matching them up with "vma->vm_pgoff".
				1967	* See vm_normal_page() for details.
				1968	*/
				1969	if (is_cow_mapping(vma->vm_flags)) {
				1970	if (addr != vma->vm_start \|\| end != vma->vm_end)
				1971	return -EINVAL;
				1972	vma->vm_pgoff = pfn;
				1973	}
				1974
				1975	vma->vm_flags \|= VM_IO \| VM_PFNMAP \| VM_DONTEXPAND \| VM_DONTDUMP;
				1976
				1977	BUG_ON(addr >= end);
				1978	pfn -= addr >> PAGE_SHIFT;
				1979	pgd = pgd_offset(mm, addr);
				1980	flush_cache_range(vma, addr, end);
				1981	do {
				1982	next = pgd_addr_end(addr, end);
				1983	err = remap_p4d_range(mm, pgd, addr, next,
				1984	pfn + (addr >> PAGE_SHIFT), prot);
				1985	if (err)
				1986	return err;
				1987	} while (pgd++, addr = next, addr != end);
				1988
				1989	return 0;
				1990	}
				1991
				1992	/*
				1993	* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
				1994	* must have pre-validated the caching bits of the pgprot_t.
				1995	*/
				1996	int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
				1997	unsigned long pfn, unsigned long size, pgprot_t prot)
				1998	{
				1999	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
				2000
				2001	if (!error)
				2002	return 0;
				2003
				2004	/*
				2005	* A partial pfn range mapping is dangerous: it does not
				2006	* maintain page reference counts, and callers may free
				2007	* pages due to the error. So zap it early.
				2008	*/
				2009	zap_page_range_single(vma, addr, size, NULL);
				2010	return error;
				2011	}
				2012
				2013	/**
				2014	* remap_pfn_range - remap kernel memory to userspace
				2015	* @vma: user vma to map to
				2016	* @addr: target page aligned user address to start at
				2017	* @pfn: page frame number of kernel physical memory address
				2018	* @size: size of mapping area
				2019	* @prot: page protection flags for this mapping
				2020	*
				2021	* Note: this is only safe if the mm semaphore is held when called.
				2022	*
				2023	* Return: %0 on success, negative error code otherwise.
				2024	*/
				2025	int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
				2026	unsigned long pfn, unsigned long size, pgprot_t prot)
				2027	{
				2028	int err;
				2029
				2030	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
				2031	if (err)
				2032	return -EINVAL;
				2033
				2034	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
				2035	if (err)
				2036	untrack_pfn(vma, pfn, PAGE_ALIGN(size));
				2037	return err;
				2038	}
				2039	EXPORT_SYMBOL(remap_pfn_range);
				2040
				2041	/**
				2042	* vm_iomap_memory - remap memory to userspace
				2043	* @vma: user vma to map to
				2044	* @start: start of area
				2045	* @len: size of area
				2046	*
				2047	* This is a simplified io_remap_pfn_range() for common driver use. The
				2048	* driver just needs to give us the physical memory range to be mapped,
				2049	* we'll figure out the rest from the vma information.
				2050	*
				2051	* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
				2052	* whatever write-combining details or similar.
				2053	*
				2054	* Return: %0 on success, negative error code otherwise.
				2055	*/
				2056	int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
				2057	{
				2058	unsigned long vm_len, pfn, pages;
				2059
				2060	/* Check that the physical memory area passed in looks valid */
				2061	if (start + len < start)
				2062	return -EINVAL;
				2063	/*
				2064	* You really shouldn't map things that aren't page-aligned,
				2065	* but we've historically allowed it because IO memory might
				2066	* just have smaller alignment.
				2067	*/
				2068	len += start & ~PAGE_MASK;
				2069	pfn = start >> PAGE_SHIFT;
				2070	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
				2071	if (pfn + pages < pfn)
				2072	return -EINVAL;
				2073
				2074	/* We start the mapping 'vm_pgoff' pages into the area */
				2075	if (vma->vm_pgoff > pages)
				2076	return -EINVAL;
				2077	pfn += vma->vm_pgoff;
				2078	pages -= vma->vm_pgoff;
				2079
				2080	/* Can we fit all of the mapping? */
				2081	vm_len = vma->vm_end - vma->vm_start;
				2082	if (vm_len >> PAGE_SHIFT > pages)
				2083	return -EINVAL;
				2084
				2085	/* Ok, let it rip */
				2086	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
				2087	}
				2088	EXPORT_SYMBOL(vm_iomap_memory);
				2089
				2090	static int apply_to_pte_range(struct mm_struct mm, pmd_t pmd,
				2091	unsigned long addr, unsigned long end,
				2092	pte_fn_t fn, void *data)
				2093	{
				2094	pte_t *pte;
				2095	int err;
				2096	spinlock_t *uninitialized_var(ptl);
				2097
				2098	pte = (mm == &init_mm) ?
				2099	pte_alloc_kernel(pmd, addr) :
				2100	pte_alloc_map_lock(mm, pmd, addr, &ptl);
				2101	if (!pte)
				2102	return -ENOMEM;
				2103
				2104	BUG_ON(pmd_huge(*pmd));
				2105
				2106	arch_enter_lazy_mmu_mode();
				2107
				2108	do {
				2109	err = fn(pte++, addr, data);
				2110	if (err)
				2111	break;
				2112	} while (addr += PAGE_SIZE, addr != end);
				2113
				2114	arch_leave_lazy_mmu_mode();
				2115
				2116	if (mm != &init_mm)
				2117	pte_unmap_unlock(pte-1, ptl);
				2118	return err;
				2119	}
				2120
				2121	static int apply_to_pmd_range(struct mm_struct mm, pud_t pud,
				2122	unsigned long addr, unsigned long end,
				2123	pte_fn_t fn, void *data)
				2124	{
				2125	pmd_t *pmd;
				2126	unsigned long next;
				2127	int err;
				2128
				2129	BUG_ON(pud_huge(*pud));
				2130
				2131	pmd = pmd_alloc(mm, pud, addr);
				2132	if (!pmd)
				2133	return -ENOMEM;
				2134	do {
				2135	next = pmd_addr_end(addr, end);
				2136	err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
				2137	if (err)
				2138	break;
				2139	} while (pmd++, addr = next, addr != end);
				2140	return err;
				2141	}
				2142
				2143	static int apply_to_pud_range(struct mm_struct mm, p4d_t p4d,
				2144	unsigned long addr, unsigned long end,
				2145	pte_fn_t fn, void *data)
				2146	{
				2147	pud_t *pud;
				2148	unsigned long next;
				2149	int err;
				2150
				2151	pud = pud_alloc(mm, p4d, addr);
				2152	if (!pud)
				2153	return -ENOMEM;
				2154	do {
				2155	next = pud_addr_end(addr, end);
				2156	err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
				2157	if (err)
				2158	break;
				2159	} while (pud++, addr = next, addr != end);
				2160	return err;
				2161	}
				2162
				2163	static int apply_to_p4d_range(struct mm_struct mm, pgd_t pgd,
				2164	unsigned long addr, unsigned long end,
				2165	pte_fn_t fn, void *data)
				2166	{
				2167	p4d_t *p4d;
				2168	unsigned long next;
				2169	int err;
				2170
				2171	p4d = p4d_alloc(mm, pgd, addr);
				2172	if (!p4d)
				2173	return -ENOMEM;
				2174	do {
				2175	next = p4d_addr_end(addr, end);
				2176	err = apply_to_pud_range(mm, p4d, addr, next, fn, data);
				2177	if (err)
				2178	break;
				2179	} while (p4d++, addr = next, addr != end);
				2180	return err;
				2181	}
				2182
				2183	/*
				2184	* Scan a region of virtual memory, filling in page tables as necessary
				2185	* and calling a provided function on each leaf page table.
				2186	*/
				2187	int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				2188	unsigned long size, pte_fn_t fn, void *data)
				2189	{
				2190	pgd_t *pgd;
				2191	unsigned long next;
				2192	unsigned long end = addr + size;
				2193	int err;
				2194
				2195	if (WARN_ON(addr >= end))
				2196	return -EINVAL;
				2197
				2198	pgd = pgd_offset(mm, addr);
				2199	do {
				2200	next = pgd_addr_end(addr, end);
				2201	err = apply_to_p4d_range(mm, pgd, addr, next, fn, data);
				2202	if (err)
				2203	break;
				2204	} while (pgd++, addr = next, addr != end);
				2205
				2206	return err;
				2207	}
				2208	EXPORT_SYMBOL_GPL(apply_to_page_range);
				2209
				2210	/*
				2211	* handle_pte_fault chooses page fault handler according to an entry which was
				2212	* read non-atomically. Before making any commitment, on those architectures
				2213	* or configurations (e.g. i386 with PAE) which might give a mix of unmatched
				2214	* parts, do_swap_page must check under lock before unmapping the pte and
				2215	* proceeding (but do_wp_page is only called after already making such a check;
				2216	* and do_anonymous_page can safely check later on).
				2217	*/
				2218	static inline int pte_unmap_same(struct mm_struct mm, pmd_t pmd,
				2219	pte_t *page_table, pte_t orig_pte)
				2220	{
				2221	int same = 1;
				2222	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPT)
				2223	if (sizeof(pte_t) > sizeof(unsigned long)) {
				2224	spinlock_t *ptl = pte_lockptr(mm, pmd);
				2225	spin_lock(ptl);
				2226	same = pte_same(*page_table, orig_pte);
				2227	spin_unlock(ptl);
				2228	}
				2229	#endif
				2230	pte_unmap(page_table);
				2231	return same;
				2232	}
				2233
				2234	static inline bool cow_user_page(struct page dst, struct page src,
				2235	struct vm_fault *vmf)
				2236	{
				2237	bool ret;
				2238	void *kaddr;
				2239	void __user *uaddr;
				2240	bool locked = false;
				2241	struct vm_area_struct *vma = vmf->vma;
				2242	struct mm_struct *mm = vma->vm_mm;
				2243	unsigned long addr = vmf->address;
				2244
				2245	debug_dma_assert_idle(src);
				2246
				2247	if (likely(src)) {
				2248	copy_user_highpage(dst, src, addr, vma);
				2249	return true;
				2250	}
				2251
				2252	/*
				2253	* If the source page was a PFN mapping, we don't have
				2254	* a "struct page" for it. We do a best-effort copy by
				2255	* just copying from the original user address. If that
				2256	* fails, we just zero-fill it. Live with it.
				2257	*/
				2258	kaddr = kmap_atomic(dst);
				2259	uaddr = (void __user *)(addr & PAGE_MASK);
				2260
				2261	/*
				2262	* On architectures with software "accessed" bits, we would
				2263	* take a double page fault, so mark it accessed here.
				2264	*/
				2265	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
				2266	pte_t entry;
				2267
				2268	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
				2269	locked = true;
				2270	if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
				2271	/*
				2272	* Other thread has already handled the fault
				2273	* and we don't need to do anything. If it's
				2274	* not the case, the fault will be triggered
				2275	* again on the same address.
				2276	*/
				2277	ret = false;
				2278	goto pte_unlock;
				2279	}
				2280
				2281	entry = pte_mkyoung(vmf->orig_pte);
				2282	if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
				2283	update_mmu_cache(vma, addr, vmf->pte);
				2284	}
				2285
				2286	/*
				2287	* This really shouldn't fail, because the page is there
				2288	* in the page tables. But it might just be unreadable,
				2289	* in which case we just give up and fill the result with
				2290	* zeroes.
				2291	*/
				2292	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
				2293	if (locked)
				2294	goto warn;
				2295
				2296	/* Re-validate under PTL if the page is still mapped */
				2297	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
				2298	locked = true;
				2299	if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
				2300	/* The PTE changed under us. Retry page fault. */
				2301	ret = false;
				2302	goto pte_unlock;
				2303	}
				2304
				2305	/*
				2306	* The same page can be mapped back since last copy attampt.
				2307	* Try to copy again under PTL.
				2308	*/
				2309	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
				2310	/*
				2311	* Give a warn in case there can be some obscure
				2312	* use-case
				2313	*/
				2314	warn:
				2315	WARN_ON_ONCE(1);
				2316	clear_page(kaddr);
				2317	}
				2318	}
				2319
				2320	ret = true;
				2321
				2322	pte_unlock:
				2323	if (locked)
				2324	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2325	kunmap_atomic(kaddr);
				2326	flush_dcache_page(dst);
				2327
				2328	return ret;
				2329	}
				2330
				2331	static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
				2332	{
				2333	struct file *vm_file = vma->vm_file;
				2334
				2335	if (vm_file)
				2336	return mapping_gfp_mask(vm_file->f_mapping) \| __GFP_FS \| __GFP_IO;
				2337
				2338	/*
				2339	* Special mappings (e.g. VDSO) do not have any file so fake
				2340	* a default GFP_KERNEL for them.
				2341	*/
				2342	return GFP_KERNEL;
				2343	}
				2344
				2345	/*
				2346	* Notify the address space that the page is about to become writable so that
				2347	* it can prohibit this or wait for the page to get into an appropriate state.
				2348	*
				2349	* We do this without the lock held, so that it can sleep if it needs to.
				2350	*/
				2351	static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
				2352	{
				2353	vm_fault_t ret;
				2354	struct page *page = vmf->page;
				2355	unsigned int old_flags = vmf->flags;
				2356
				2357	vmf->flags = FAULT_FLAG_WRITE\|FAULT_FLAG_MKWRITE;
				2358
				2359	if (vmf->vma->vm_file &&
				2360	IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
				2361	return VM_FAULT_SIGBUS;
				2362
				2363	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
				2364	/* Restore original flags so that caller is not surprised */
				2365	vmf->flags = old_flags;
				2366	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))
				2367	return ret;
				2368	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
				2369	lock_page(page);
				2370	if (!page->mapping) {
				2371	unlock_page(page);
				2372	return 0; /* retry */
				2373	}
				2374	ret \|= VM_FAULT_LOCKED;
				2375	} else
				2376	VM_BUG_ON_PAGE(!PageLocked(page), page);
				2377	return ret;
				2378	}
				2379
				2380	/*
				2381	* Handle dirtying of a page in shared file mapping on a write fault.
				2382	*
				2383	* The function expects the page to be locked and unlocks it.
				2384	*/
				2385	static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
				2386	{
				2387	struct vm_area_struct *vma = vmf->vma;
				2388	struct address_space *mapping;
				2389	struct page *page = vmf->page;
				2390	bool dirtied;
				2391	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
				2392
				2393	dirtied = set_page_dirty(page);
				2394	VM_BUG_ON_PAGE(PageAnon(page), page);
				2395	/*
				2396	* Take a local copy of the address_space - page.mapping may be zeroed
				2397	* by truncate after unlock_page(). The address_space itself remains
				2398	* pinned by vma->vm_file's reference. We rely on unlock_page()'s
				2399	* release semantics to prevent the compiler from undoing this copying.
				2400	*/
				2401	mapping = page_rmapping(page);
				2402	unlock_page(page);
				2403
				2404	if (!page_mkwrite)
				2405	file_update_time(vma->vm_file);
				2406
				2407	/*
				2408	* Throttle page dirtying rate down to writeback speed.
				2409	*
				2410	* mapping may be NULL here because some device drivers do not
				2411	* set page.mapping but still dirty their pages
				2412	*
				2413	* Drop the mmap_sem before waiting on IO, if we can. The file
				2414	* is pinning the mapping, as per above.
				2415	*/
				2416	if ((dirtied \|\| page_mkwrite) && mapping) {
				2417	struct file *fpin;
				2418
				2419	fpin = maybe_unlock_mmap_for_io(vmf, NULL);
				2420	balance_dirty_pages_ratelimited(mapping);
				2421	if (fpin) {
				2422	fput(fpin);
				2423	return VM_FAULT_RETRY;
				2424	}
				2425	}
				2426
				2427	return 0;
				2428	}
				2429
				2430	/*
				2431	* Handle write page faults for pages that can be reused in the current vma
				2432	*
				2433	* This can happen either due to the mapping being with the VM_SHARED flag,
				2434	* or due to us being the last reference standing to the page. In either
				2435	* case, all we need to do here is to mark the page as writable and update
				2436	* any related book-keeping.
				2437	*/
				2438	static inline void wp_page_reuse(struct vm_fault *vmf)
				2439	__releases(vmf->ptl)
				2440	{
				2441	struct vm_area_struct *vma = vmf->vma;
				2442	struct page *page = vmf->page;
				2443	pte_t entry;
				2444	/*
				2445	* Clear the pages cpupid information as the existing
				2446	* information potentially belongs to a now completely
				2447	* unrelated process.
				2448	*/
				2449	if (page)
				2450	page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
				2451
				2452	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
				2453	entry = pte_mkyoung(vmf->orig_pte);
				2454	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
				2455	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
				2456	update_mmu_cache(vma, vmf->address, vmf->pte);
				2457	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2458	}
				2459
				2460	/*
				2461	* Handle the case of a page which we actually need to copy to a new page.
				2462	*
				2463	* Called with mmap_sem locked and the old page referenced, but
				2464	* without the ptl held.
				2465	*
				2466	* High level logic flow:
				2467	*
				2468	* - Allocate a page, copy the content of the old page to the new one.
				2469	* - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
				2470	* - Take the PTL. If the pte changed, bail out and release the allocated page
				2471	* - If the pte is still the way we remember it, update the page table and all
				2472	* relevant references. This includes dropping the reference the page-table
				2473	* held to the old page, as well as updating the rmap.
				2474	* - In any case, unlock the PTL and drop the reference we took to the old page.
					*
					* Return: VM_FAULT_WRITE when the new page was installed; 0 when the pte
					* changed under us or cow_user_page() reported failure (the fault is
					* simply retried); VM_FAULT_OOM when anon_vma preparation, page allocation
					* or the memcg charge fails.
				2475	*/
				2476	static vm_fault_t wp_page_copy(struct vm_fault *vmf)
				2477	{
				2478	struct vm_area_struct *vma = vmf->vma;
				2479	struct mm_struct *mm = vma->vm_mm;
				2480	struct page *old_page = vmf->page;
				2481	struct page *new_page = NULL;
				2482	pte_t entry;
				2483	int page_copied = 0;
				2484	struct mem_cgroup *memcg;
				2485	struct mmu_notifier_range range;
				2486	
				2487	if (unlikely(anon_vma_prepare(vma)))
				2488	goto oom;
				2489	
					/* The zero page needs no copy: a freshly zeroed page is equivalent. */
				2490	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
				2491	new_page = alloc_zeroed_user_highpage_movable(vma,
				2492	vmf->address);
				2493	if (!new_page)
				2494	goto oom;
				2495	} else {
				2496	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				2497	vmf->address);
				2498	if (!new_page)
				2499	goto oom;
				2500	
				2501	if (!cow_user_page(new_page, old_page, vmf)) {
				2502	/*
				2503	* COW failed, if the fault was solved by other,
				2504	* it's fine. If not, userspace would re-fault on
				2505	* the same address and we will handle the fault
				2506	* from the second attempt.
				2507	*/
				2508	put_page(new_page);
				2509	if (old_page)
				2510	put_page(old_page);
				2511	return 0;
				2512	}
				2513	}
				2514	
				2515	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
				2516	goto oom_free_new;
				2517	
				2518	__SetPageUptodate(new_page);
				2519	
				2520	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				2521	vmf->address & PAGE_MASK,
				2522	(vmf->address & PAGE_MASK) + PAGE_SIZE);
				2523	mmu_notifier_invalidate_range_start(&range);
				2524	
				2525	/*
				2526	* Re-check the pte - we dropped the lock
				2527	*/
				2528	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
				2529	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
					/*
					 * The new page is accounted as anonymous: move the count over from
					 * the file counter for a non-anon old page, or add one when there
					 * was no old page at all.
					 */
				2530	if (old_page) {
				2531	if (!PageAnon(old_page)) {
				2532	dec_mm_counter_fast(mm,
				2533	mm_counter_file(old_page));
				2534	inc_mm_counter_fast(mm, MM_ANONPAGES);
				2535	}
				2536	} else {
				2537	inc_mm_counter_fast(mm, MM_ANONPAGES);
				2538	}
				2539	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
				2540	entry = mk_pte(new_page, vma->vm_page_prot);
				2541	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
				2542	/*
				2543	* Clear the pte entry and flush it first, before updating the
				2544	* pte with the new entry. This will avoid a race condition
				2545	* seen in the presence of one thread doing SMC and another
				2546	* thread doing COW.
				2547	*/
				2548	ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
				2549	page_add_new_anon_rmap(new_page, vma, vmf->address, false);
				2550	mem_cgroup_commit_charge(new_page, memcg, false, false);
				2551	lru_cache_add_active_or_unevictable(new_page, vma);
				2552	/*
				2553	* We call the notify macro here because, when using secondary
				2554	* mmu page tables (such as kvm shadow page tables), we want the
				2555	* new page to be mapped directly into the secondary page table.
				2556	*/
				2557	set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
				2558	update_mmu_cache(vma, vmf->address, vmf->pte);
				2559	if (old_page) {
				2560	/*
				2561	* Only after switching the pte to the new page may
				2562	* we remove the mapcount here. Otherwise another
				2563	* process may come and find the rmap count decremented
				2564	* before the pte is switched to the new page, and
				2565	* "reuse" the old page writing into it while our pte
				2566	* here still points into it and can be read by other
				2567	* threads.
				2568	*
				2569	* The critical issue is to order this
				2570	* page_remove_rmap with the ptp_clear_flush above.
				2571	* Those stores are ordered by (if nothing else,)
				2572	* the barrier present in the atomic_add_negative
				2573	* in page_remove_rmap.
				2574	*
				2575	* Then the TLB flush in ptep_clear_flush ensures that
				2576	* no process can access the old page before the
				2577	* decremented mapcount is visible. And the old page
				2578	* cannot be reused until after the decremented
				2579	* mapcount is visible. So transitively, TLBs to
				2580	* old page will be flushed before it can be reused.
				2581	*/
				2582	page_remove_rmap(old_page, false);
				2583	}
				2584	
				2585	/* Free the old page.. */
					/*
					 * From here on new_page names the page whose reference is dropped by
					 * the shared put_page() below: the old page on this (success) path,
					 * the unused new page on the else path. It may be NULL here.
					 */
				2586	new_page = old_page;
				2587	page_copied = 1;
				2588	} else {
				2589	mem_cgroup_cancel_charge(new_page, memcg, false);
				2590	}
				2591	
				2592	if (new_page)
				2593	put_page(new_page);
				2594	
				2595	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2596	/*
				2597	* No need to double call mmu_notifier->invalidate_range() callback as
				2598	* the above ptep_clear_flush_notify() did already call it.
				2599	*/
				2600	mmu_notifier_invalidate_range_only_end(&range);
				2601	if (old_page) {
				2602	/*
				2603	* Don't let another task, with possibly unlocked vma,
				2604	* keep the mlocked page.
				2605	*/
				2606	if (page_copied && (vma->vm_flags & VM_LOCKED)) {
				2607	lock_page(old_page); /* LRU manipulation */
				2608	if (PageMlocked(old_page))
				2609	munlock_vma_page(old_page);
				2610	unlock_page(old_page);
				2611	}
					/* Drops the reference mentioned in the header ("old page referenced"). */
				2612	put_page(old_page);
				2613	}
				2614	return page_copied ? VM_FAULT_WRITE : 0;
				2615	oom_free_new:
				2616	put_page(new_page);
				2617	oom:
				2618	if (old_page)
				2619	put_page(old_page);
				2620	return VM_FAULT_OOM;
				2621	}
				2622
				2623	/**
				2624	* finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
				2625	* writeable once the page is prepared
				2626	*
				2627	* @vmf: structure describing the fault
				2628	*
				2629	* This function handles all that is needed to finish a write page fault in a
				2630	* shared mapping due to PTE being read-only once the mapped page is prepared.
				2631	* It handles locking of PTE and modifying it.
				2632	*
				2633	* The function expects the page to be locked or other protection against
				2634	* concurrent faults / writeback (such as DAX radix tree locks).
				2635	*
				2636	* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
				2637	* we acquired PTE lock.
				2638	*/
				2639	vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
				2640	{
				2641	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
				2642	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				2643	&vmf->ptl);
				2644	/*
				2645	* We might have raced with another page fault while we released the
				2646	* pte_offset_map_lock.
				2647	*/
				2648	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
				2649	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2650	return VM_FAULT_NOPAGE;
				2651	}
					/* wp_page_reuse() makes the PTE writable and drops the PTE lock. */
				2652	wp_page_reuse(vmf);
				2653	return 0;
				2654	}
				2655
				2656	/*
				2657	* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
				2658	* mapping
				2659	*/
				2660	static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
				2661	{
				2662	struct vm_area_struct *vma = vmf->vma;
				2663
				2664	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
				2665	vm_fault_t ret;
				2666
				2667	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2668	vmf->flags \|= FAULT_FLAG_MKWRITE;
				2669	ret = vma->vm_ops->pfn_mkwrite(vmf);
				2670	if (ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))
				2671	return ret;
				2672	return finish_mkwrite_fault(vmf);
				2673	}
				2674	wp_page_reuse(vmf);
				2675	return VM_FAULT_WRITE;
				2676	}
				2677
				2678	static vm_fault_t wp_page_shared(struct vm_fault *vmf)
				2679	__releases(vmf->ptl)
				2680	{
				2681	struct vm_area_struct *vma = vmf->vma;
				2682	vm_fault_t ret = VM_FAULT_WRITE;
				2683
				2684	get_page(vmf->page);
				2685
				2686	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
				2687	vm_fault_t tmp;
				2688
				2689	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2690	tmp = do_page_mkwrite(vmf);
				2691	if (unlikely(!tmp \|\| (tmp &
				2692	(VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
				2693	put_page(vmf->page);
				2694	return tmp;
				2695	}
				2696	tmp = finish_mkwrite_fault(vmf);
				2697	if (unlikely(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))) {
				2698	unlock_page(vmf->page);
				2699	put_page(vmf->page);
				2700	return tmp;
				2701	}
				2702	} else {
				2703	wp_page_reuse(vmf);
				2704	lock_page(vmf->page);
				2705	}
				2706	ret \|= fault_dirty_shared_page(vmf);
				2707	put_page(vmf->page);
				2708
				2709	return ret;
				2710	}
				2711
				2712	/*
				2713	* This routine handles present pages, when users try to write
				2714	* to a shared page. It is done by copying the page to a new address
				2715	* and decrementing the shared-page counter for the old page.
				2716	*
				2717	* Note that this routine assumes that the protection checks have been
				2718	* done by the caller (the low-level page fault routine in most cases).
				2719	* Thus we can safely just mark it writable once we've done any necessary
				2720	* COW.
				2721	*
				2722	* We also mark the page dirty at this point even though the page will
				2723	* change only once the write actually happens. This avoids a few races,
				2724	* and potentially makes it more efficient.
				2725	*
				2726	* We enter with non-exclusive mmap_sem (to exclude vma changes,
				2727	* but allow concurrent faults), with pte both mapped and locked.
				2728	* We return with mmap_sem still held, but pte unmapped and unlocked.
				2729	*/
				2730	static vm_fault_t do_wp_page(struct vm_fault *vmf)
				2731	__releases(vmf->ptl)
				2732	{
				2733	struct vm_area_struct *vma = vmf->vma;
				2734
				2735	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
				2736	if (!vmf->page) {
				2737	/*
				2738	* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
				2739	* VM_PFNMAP VMA.
				2740	*
				2741	* We should not cow pages in a shared writeable mapping.
				2742	* Just mark the pages writable and/or call ops->pfn_mkwrite.
				2743	*/
				2744	if ((vma->vm_flags & (VM_WRITE\|VM_SHARED)) ==
				2745	(VM_WRITE\|VM_SHARED))
				2746	return wp_pfn_shared(vmf);
				2747
				2748	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2749	return wp_page_copy(vmf);
				2750	}
				2751
				2752	/*
				2753	* Take out anonymous pages first, anonymous shared vmas are
				2754	* not dirty accountable.
				2755	*/
				2756	if (PageAnon(vmf->page)) {
				2757	struct page *page = vmf->page;
				2758
				2759	/* PageKsm() doesn't necessarily raise the page refcount */
				2760	if (PageKsm(page) \|\| page_count(page) != 1)
				2761	goto copy;
				2762	if (!trylock_page(page))
				2763	goto copy;
				2764	if (PageKsm(page) \|\| page_mapcount(page) != 1 \|\| page_count(page) != 1) {
				2765	unlock_page(page);
				2766	goto copy;
				2767	}
				2768	/*
				2769	* Ok, we've got the only map reference, and the only
				2770	* page count reference, and the page is locked,
				2771	* it's dark out, and we're wearing sunglasses. Hit it.
				2772	*/
				2773	unlock_page(page);
				2774	wp_page_reuse(vmf);
				2775	return VM_FAULT_WRITE;
				2776	} else if (unlikely((vma->vm_flags & (VM_WRITE\|VM_SHARED)) ==
				2777	(VM_WRITE\|VM_SHARED))) {
				2778	return wp_page_shared(vmf);
				2779	}
				2780	copy:
				2781	/*
				2782	* Ok, we need to copy. Oh, well..
				2783	*/
				2784	get_page(vmf->page);
				2785
				2786	pte_unmap_unlock(vmf->pte, vmf->ptl);
				2787	return wp_page_copy(vmf);
				2788	}
				2789
				/*
				 * Zap one vma: convert the (start_addr, end_addr) pair into the
				 * (start, length) form that zap_page_range_single() takes.
				 */
				static void unmap_mapping_range_vma(struct vm_area_struct *vma,
						unsigned long start_addr, unsigned long end_addr,
						struct zap_details *details)
				{
					unsigned long length = end_addr - start_addr;

					zap_page_range_single(vma, start_addr, length, details);
				}
				2796
				2797	static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
				2798	struct zap_details *details)
				2799	{
				2800	struct vm_area_struct *vma;
				2801	pgoff_t vba, vea, zba, zea;
				2802
				2803	vma_interval_tree_foreach(vma, root,
				2804	details->first_index, details->last_index) {
				2805
				2806	vba = vma->vm_pgoff;
				2807	vea = vba + vma_pages(vma) - 1;
				2808	zba = details->first_index;
				2809	if (zba < vba)
				2810	zba = vba;
				2811	zea = details->last_index;
				2812	if (zea > vea)
				2813	zea = vea;
				2814
				2815	unmap_mapping_range_vma(vma,
				2816	((zba - vba) << PAGE_SHIFT) + vma->vm_start,
				2817	((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				2818	details);
				2819	}
				2820	}
				2821
				2822	/**
				2823	* unmap_mapping_page() - Unmap single page from processes.
				2824	* @page: The locked page to be unmapped.
				2825	*
				2826	* Unmap this page from any userspace process which still has it mmaped.
				2827	* Typically, for efficiency, the range of nearby pages has already been
				2828	* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
				2829	* truncation or invalidation holds the lock on a page, it may find that
				2830	* the page has been remapped again: and then uses unmap_mapping_page()
				2831	* to unmap it finally.
				2832	*/
				2833	void unmap_mapping_page(struct page *page)
				2834	{
				2835	struct address_space *mapping = page->mapping;
				2836	struct zap_details details = { };
				2837
				2838	VM_BUG_ON(!PageLocked(page));
				2839	VM_BUG_ON(PageTail(page));
				2840
				2841	details.check_mapping = mapping;
				2842	details.first_index = page->index;
				2843	details.last_index = page->index + hpage_nr_pages(page) - 1;
				2844	details.single_page = page;
				2845
				2846	i_mmap_lock_write(mapping);
				2847	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
				2848	unmap_mapping_range_tree(&mapping->i_mmap, &details);
				2849	i_mmap_unlock_write(mapping);
				2850	}
				2851
				2852	/**
				2853	* unmap_mapping_pages() - Unmap pages from processes.
				2854	* @mapping: The address space containing pages to be unmapped.
				2855	* @start: Index of first page to be unmapped.
				2856	* @nr: Number of pages to be unmapped. 0 to unmap to end of file.
				2857	* @even_cows: Whether to unmap even private COWed pages.
				2858	*
				2859	* Unmap the pages in this address space from any userspace process which
				2860	* has them mmaped. Generally, you want to remove COWed pages as well when
				2861	* a file is being truncated, but not when invalidating pages from the page
				2862	* cache.
				2863	*/
				2864	void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
				2865	pgoff_t nr, bool even_cows)
				2866	{
				2867	struct zap_details details = { };
				2868
				2869	details.check_mapping = even_cows ? NULL : mapping;
				2870	details.first_index = start;
				2871	details.last_index = start + nr - 1;
				2872	if (details.last_index < details.first_index)
				2873	details.last_index = ULONG_MAX;
				2874
				2875	i_mmap_lock_write(mapping);
				2876	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
				2877	unmap_mapping_range_tree(&mapping->i_mmap, &details);
				2878	i_mmap_unlock_write(mapping);
				2879	}
				2880
				2881	/**
				2882	* unmap_mapping_range - unmap the portion of all mmaps in the specified
				2883	* address_space corresponding to the specified byte range in the underlying
				2884	* file.
				2885	*
				2886	* @mapping: the address space containing mmaps to be unmapped.
				2887	* @holebegin: byte in first page to unmap, relative to the start of
				2888	* the underlying file. This will be rounded down to a PAGE_SIZE
				2889	* boundary. Note that this is different from truncate_pagecache(), which
				2890	* must keep the partial page. In contrast, we must get rid of
				2891	* partial pages.
				2892	* @holelen: size of prospective hole in bytes. This will be rounded
				2893	* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
				2894	* end of the file.
				2895	* @even_cows: 1 when truncating a file, unmap even private COWed pages;
				2896	* but 0 when invalidating pagecache, don't throw away private data.
				2897	*/
				2898	void unmap_mapping_range(struct address_space *mapping,
				2899	loff_t const holebegin, loff_t const holelen, int even_cows)
				2900	{
				2901	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
				2902	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2903
				2904	/* Check for overflow. */
				2905	if (sizeof(holelen) > sizeof(hlen)) {
				2906	long long holeend =
				2907	(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2908	if (holeend & ~(long long)ULONG_MAX)
				2909	hlen = ULONG_MAX - hba + 1;
				2910	}
				2911
				2912	unmap_mapping_pages(mapping, hba, hlen, even_cows);
				2913	}
				2914	EXPORT_SYMBOL(unmap_mapping_range);
				2915
				2916	/*
				2917	* We enter with non-exclusive mmap_sem (to exclude vma changes,
				2918	* but allow concurrent faults), and pte mapped but not yet locked.
				2919	* We return with pte unmapped and unlocked.
				2920	*
				2921	* We return with the mmap_sem locked or unlocked in the same cases
				2922	* as does filemap_fault().
				2923	*/
				2924	vm_fault_t do_swap_page(struct vm_fault *vmf)
				2925	{
				2926	struct vm_area_struct *vma = vmf->vma;
				2927	struct page page = NULL, swapcache;
				2928	struct mem_cgroup *memcg;
				2929	swp_entry_t entry;
				2930	pte_t pte;
				2931	int locked;
				2932	int exclusive = 0;
				2933	vm_fault_t ret = 0;
				2934
				2935	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
				2936	goto out;
				2937
				2938	entry = pte_to_swp_entry(vmf->orig_pte);
				2939	if (unlikely(non_swap_entry(entry))) {
				2940	if (is_migration_entry(entry)) {
				2941	migration_entry_wait(vma->vm_mm, vmf->pmd,
				2942	vmf->address);
				2943	} else if (is_device_private_entry(entry)) {
				2944	vmf->page = device_private_entry_to_page(entry);
				2945	ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
				2946	} else if (is_hwpoison_entry(entry)) {
				2947	ret = VM_FAULT_HWPOISON;
				2948	} else {
				2949	print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
				2950	ret = VM_FAULT_SIGBUS;
				2951	}
				2952	goto out;
				2953	}
				2954
				2955
				2956	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
				2957	page = lookup_swap_cache(entry, vma, vmf->address);
				2958	swapcache = page;
				2959
				2960	if (!page) {
				2961	struct swap_info_struct *si = swp_swap_info(entry);
				2962
				2963	if (si->flags & SWP_SYNCHRONOUS_IO &&
				2964	__swap_count(entry) == 1) {
				2965	/* skip swapcache */
				2966	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				2967	vmf->address);
				2968	if (page) {
				2969	__SetPageLocked(page);
				2970	__SetPageSwapBacked(page);
				2971	set_page_private(page, entry.val);
				2972	lru_cache_add_anon(page);
				2973	swap_readpage(page, true);
				2974	}
				2975	} else {
				2976	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
				2977	vmf);
				2978	swapcache = page;
				2979	}
				2980
				2981	if (!page) {
				2982	/*
				2983	* Back out if somebody else faulted in this pte
				2984	* while we released the pte lock.
				2985	*/
				2986	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				2987	vmf->address, &vmf->ptl);
				2988	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				2989	ret = VM_FAULT_OOM;
				2990	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
				2991	goto unlock;
				2992	}
				2993
				2994	/* Had to read the page from swap area: Major fault */
				2995	ret = VM_FAULT_MAJOR;
				2996	count_vm_event(PGMAJFAULT);
				2997	count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
				2998	} else if (PageHWPoison(page)) {
				2999	/*
				3000	* hwpoisoned dirty swapcache pages are kept for killing
				3001	* owner processes (which may be unknown at hwpoison time)
				3002	*/
				3003	ret = VM_FAULT_HWPOISON;
				3004	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
				3005	goto out_release;
				3006	}
				3007
				3008	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
				3009
				3010	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
				3011	if (!locked) {
				3012	ret \|= VM_FAULT_RETRY;
				3013	goto out_release;
				3014	}
				3015
				3016	/*
				3017	* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
				3018	* release the swapcache from under us. The page pin, and pte_same
				3019	* test below, are not enough to exclude that. Even if it is still
				3020	* swapcache, we need to check that the page's swap has not changed.
				3021	*/
				3022	if (unlikely((!PageSwapCache(page) \|\|
				3023	page_private(page) != entry.val)) && swapcache)
				3024	goto out_page;
				3025
				3026	page = ksm_might_need_to_copy(page, vma, vmf->address);
				3027	if (unlikely(!page)) {
				3028	ret = VM_FAULT_OOM;
				3029	page = swapcache;
				3030	goto out_page;
				3031	}
				3032
				3033	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
				3034	&memcg, false)) {
				3035	ret = VM_FAULT_OOM;
				3036	goto out_page;
				3037	}
				3038
				3039	/*
				3040	* Back out if somebody else already faulted in this pte.
				3041	*/
				3042	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				3043	&vmf->ptl);
				3044	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
				3045	goto out_nomap;
				3046
				3047	if (unlikely(!PageUptodate(page))) {
				3048	ret = VM_FAULT_SIGBUS;
				3049	goto out_nomap;
				3050	}
				3051
				3052	/*
				3053	* The page isn't present yet, go ahead with the fault.
				3054	*
				3055	* Be careful about the sequence of operations here.
				3056	* To get its accounting right, reuse_swap_page() must be called
				3057	* while the page is counted on swap but not yet in mapcount i.e.
				3058	* before page_add_anon_rmap() and swap_free(); try_to_free_swap()
				3059	* must be called after the swap_free(), or it will never succeed.
				3060	*/
				3061
				3062	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
				3063	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
				3064	pte = mk_pte(page, vma->vm_page_prot);
				3065	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
				3066	pte = maybe_mkwrite(pte_mkdirty(pte), vma);
				3067	vmf->flags &= ~FAULT_FLAG_WRITE;
				3068	ret \|= VM_FAULT_WRITE;
				3069	exclusive = RMAP_EXCLUSIVE;
				3070	}
				3071	flush_icache_page(vma, page);
				3072	if (pte_swp_soft_dirty(vmf->orig_pte))
				3073	pte = pte_mksoft_dirty(pte);
				3074	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
				3075	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
				3076	vmf->orig_pte = pte;
				3077
				3078	/* ksm created a completely new copy */
				3079	if (unlikely(page != swapcache && swapcache)) {
				3080	page_add_new_anon_rmap(page, vma, vmf->address, false);
				3081	mem_cgroup_commit_charge(page, memcg, false, false);
				3082	lru_cache_add_active_or_unevictable(page, vma);
				3083	} else {
				3084	do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
				3085	mem_cgroup_commit_charge(page, memcg, true, false);
				3086	activate_page(page);
				3087	}
				3088
				3089	swap_free(entry);
				3090	if (mem_cgroup_swap_full(page) \|\|
				3091	(vma->vm_flags & VM_LOCKED) \|\| PageMlocked(page))
				3092	try_to_free_swap(page);
				3093	unlock_page(page);
				3094	if (page != swapcache && swapcache) {
				3095	/*
				3096	* Hold the lock to avoid the swap entry to be reused
				3097	* until we take the PT lock for the pte_same() check
				3098	* (to avoid false positives from pte_same). For
				3099	* further safety release the lock after the swap_free
				3100	* so that the swap count won't change under a
				3101	* parallel locked swapcache.
				3102	*/
				3103	unlock_page(swapcache);
				3104	put_page(swapcache);
				3105	}
				3106
				3107	if (vmf->flags & FAULT_FLAG_WRITE) {
				3108	ret \|= do_wp_page(vmf);
				3109	if (ret & VM_FAULT_ERROR)
				3110	ret &= VM_FAULT_ERROR;
				3111	goto out;
				3112	}
				3113
				3114	/* No need to invalidate - it was non-present before */
				3115	update_mmu_cache(vma, vmf->address, vmf->pte);
				3116	unlock:
				3117	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3118	out:
				3119	return ret;
				3120	out_nomap:
				3121	mem_cgroup_cancel_charge(page, memcg, false);
				3122	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3123	out_page:
				3124	unlock_page(page);
				3125	out_release:
				3126	put_page(page);
				3127	if (page != swapcache && swapcache) {
				3128	unlock_page(swapcache);
				3129	put_page(swapcache);
				3130	}
				3131	return ret;
				3132	}
				3133
				3134	/*
				3135	* We enter with non-exclusive mmap_sem (to exclude vma changes,
				3136	* but allow concurrent faults), and pte mapped but not yet locked.
				3137	* We return with mmap_sem still held, but pte unmapped and unlocked.
				3138	*/
				3139	static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
				3140	{
				3141	struct vm_area_struct *vma = vmf->vma;
				3142	struct mem_cgroup *memcg;
				3143	struct page *page;
				3144	vm_fault_t ret = 0;
				3145	pte_t entry;
				3146
				3147	/* File mapping without ->vm_ops ? */
				3148	if (vma->vm_flags & VM_SHARED)
				3149	return VM_FAULT_SIGBUS;
				3150
				3151	/*
				3152	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				3153	* pte_offset_map() on pmds where a huge pmd might be created
				3154	* from a different thread.
				3155	*
				3156	* pte_alloc_map() is safe to use under down_write(mmap_sem) or when
				3157	* parallel threads are excluded by other means.
				3158	*
				3159	* Here we only have down_read(mmap_sem).
				3160	*/
				3161	if (pte_alloc(vma->vm_mm, vmf->pmd))
				3162	return VM_FAULT_OOM;
				3163
				3164	/* See the comment in pte_alloc_one_map() */
				3165	if (unlikely(pmd_trans_unstable(vmf->pmd)))
				3166	return 0;
				3167
				3168	/* Use the zero-page for reads */
				3169	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
				3170	!mm_forbids_zeropage(vma->vm_mm)) {
				3171	entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
				3172	vma->vm_page_prot));
				3173	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
				3174	vmf->address, &vmf->ptl);
				3175	if (!pte_none(*vmf->pte))
				3176	goto unlock;
				3177	ret = check_stable_address_space(vma->vm_mm);
				3178	if (ret)
				3179	goto unlock;
				3180	/* Deliver the page fault to userland, check inside PT lock */
				3181	if (userfaultfd_missing(vma)) {
				3182	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3183	return handle_userfault(vmf, VM_UFFD_MISSING);
				3184	}
				3185	goto setpte;
				3186	}
				3187
				3188	/* Allocate our own private page. */
				3189	if (unlikely(anon_vma_prepare(vma)))
				3190	goto oom;
				3191	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
				3192	if (!page)
				3193	goto oom;
				3194
				3195	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
				3196	false))
				3197	goto oom_free_page;
				3198
				3199	/*
				3200	* The memory barrier inside __SetPageUptodate makes sure that
				3201	* preceeding stores to the page contents become visible before
				3202	* the set_pte_at() write.
				3203	*/
				3204	__SetPageUptodate(page);
				3205
				3206	entry = mk_pte(page, vma->vm_page_prot);
				3207	if (vma->vm_flags & VM_WRITE)
				3208	entry = pte_mkwrite(pte_mkdirty(entry));
				3209
				3210	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				3211	&vmf->ptl);
				3212	if (!pte_none(*vmf->pte))
				3213	goto release;
				3214
				3215	ret = check_stable_address_space(vma->vm_mm);
				3216	if (ret)
				3217	goto release;
				3218
				3219	/* Deliver the page fault to userland, check inside PT lock */
				3220	if (userfaultfd_missing(vma)) {
				3221	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3222	mem_cgroup_cancel_charge(page, memcg, false);
				3223	put_page(page);
				3224	return handle_userfault(vmf, VM_UFFD_MISSING);
				3225	}
				3226
				3227	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
				3228	page_add_new_anon_rmap(page, vma, vmf->address, false);
				3229	mem_cgroup_commit_charge(page, memcg, false, false);
				3230	lru_cache_add_active_or_unevictable(page, vma);
				3231	setpte:
				3232	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
				3233
				3234	/* No need to invalidate - it was non-present before */
				3235	update_mmu_cache(vma, vmf->address, vmf->pte);
				3236	unlock:
				3237	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3238	return ret;
				3239	release:
				3240	mem_cgroup_cancel_charge(page, memcg, false);
				3241	put_page(page);
				3242	goto unlock;
				3243	oom_free_page:
				3244	put_page(page);
				3245	oom:
				3246	return VM_FAULT_OOM;
				3247	}
				3248
				3249	/*
				3250	* The mmap_sem must have been held on entry, and may have been
				3251	* released depending on flags and vma->vm_ops->fault() return value.
				3252	* See filemap_fault() and __lock_page_retry().
				3253	*/
				3254	static vm_fault_t __do_fault(struct vm_fault *vmf)
				3255	{
				3256	struct vm_area_struct *vma = vmf->vma;
				3257	vm_fault_t ret;
				3258
				3259	/*
				3260	* Preallocate pte before we take page_lock because this might lead to
				3261	* deadlocks for memcg reclaim which waits for pages under writeback:
				3262	* lock_page(A)
				3263	* SetPageWriteback(A)
				3264	* unlock_page(A)
				3265	* lock_page(B)
				3266	* lock_page(B)
				3267	* pte_alloc_pne
				3268	* shrink_page_list
				3269	* wait_on_page_writeback(A)
				3270	* SetPageWriteback(B)
				3271	* unlock_page(B)
				3272	* # flush A, B to clear the writeback
				3273	*/
				3274	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
				3275	vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
				3276	if (!vmf->prealloc_pte)
				3277	return VM_FAULT_OOM;
				3278	smp_wmb(); /* See comment in __pte_alloc() */
				3279	}
				3280
				3281	ret = vma->vm_ops->fault(vmf);
				3282	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY \|
				3283	VM_FAULT_DONE_COW)))
				3284	return ret;
				3285
				3286	if (unlikely(PageHWPoison(vmf->page))) {
				3287	struct page *page = vmf->page;
				3288	vm_fault_t poisonret = VM_FAULT_HWPOISON;
				3289	if (ret & VM_FAULT_LOCKED) {
				3290	if (page_mapped(page))
				3291	unmap_mapping_pages(page_mapping(page),
				3292	page->index, 1, false);
				3293	/* Retry if a clean page was removed from the cache. */
				3294	if (invalidate_inode_page(page))
				3295	poisonret = VM_FAULT_NOPAGE;
				3296	unlock_page(page);
				3297	}
				3298	put_page(page);
				3299	vmf->page = NULL;
				3300	return poisonret;
				3301	}
				3302
				3303	if (unlikely(!(ret & VM_FAULT_LOCKED)))
				3304	lock_page(vmf->page);
				3305	else
				3306	VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
				3307
				3308	return ret;
				3309	}
				3310
				3311	/*
				3312	* The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
				3313	* If we check pmd_trans_unstable() first we will trip the bad_pmd() check
				3314	* inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
				3315	* returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
				3316	*/
				3317	static int pmd_devmap_trans_unstable(pmd_t *pmd)
				3318	{
				3319	return pmd_devmap(*pmd) \|\| pmd_trans_unstable(pmd);
				3320	}
				3321
				3322	static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
				3323	{
				3324	struct vm_area_struct *vma = vmf->vma;
				3325
				3326	if (!pmd_none(*vmf->pmd))
				3327	goto map_pte;
				3328	if (vmf->prealloc_pte) {
				3329	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
				3330	if (unlikely(!pmd_none(*vmf->pmd))) {
				3331	spin_unlock(vmf->ptl);
				3332	goto map_pte;
				3333	}
				3334
				3335	mm_inc_nr_ptes(vma->vm_mm);
				3336	pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
				3337	spin_unlock(vmf->ptl);
				3338	vmf->prealloc_pte = NULL;
				3339	} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
				3340	return VM_FAULT_OOM;
				3341	}
				3342	map_pte:
				3343	/*
				3344	* If a huge pmd materialized under us just retry later. Use
				3345	* pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
				3346	* pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
				3347	* under us and then back to pmd_none, as a result of MADV_DONTNEED
				3348	* running immediately after a huge pmd fault in a different thread of
				3349	* this mm, in turn leading to a misleading pmd_trans_huge() retval.
				3350	* All we have to ensure is that it is a regular pmd that we can walk
				3351	* with pte_offset_map() and we can do that through an atomic read in
				3352	* C, which is what pmd_trans_unstable() provides.
				3353	*/
				3354	if (pmd_devmap_trans_unstable(vmf->pmd))
				3355	return VM_FAULT_NOPAGE;
				3356
				3357	/*
				3358	* At this point we know that our vmf->pmd points to a page of ptes
				3359	* and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
				3360	* for the duration of the fault. If a racing MADV_DONTNEED runs and
				3361	* we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
				3362	* be valid and we will re-check to make sure the vmf->pte isn't
				3363	* pte_none() under vmf->ptl protection when we return to
				3364	* alloc_set_pte().
				3365	*/
				3366	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				3367	&vmf->ptl);
				3368	return 0;
				3369	}
				3370
				3371	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3372	static void deposit_prealloc_pte(struct vm_fault *vmf)
				3373	{
				3374	struct vm_area_struct *vma = vmf->vma;
				3375
				3376	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
				3377	/*
				3378	* We are going to consume the prealloc table,
				3379	* count that as nr_ptes.
				3380	*/
				3381	mm_inc_nr_ptes(vma->vm_mm);
				3382	vmf->prealloc_pte = NULL;
				3383	}
				3384
				3385	static vm_fault_t do_set_pmd(struct vm_fault vmf, struct page page)
				3386	{
				3387	struct vm_area_struct *vma = vmf->vma;
				3388	bool write = vmf->flags & FAULT_FLAG_WRITE;
				3389	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
				3390	pmd_t entry;
				3391	int i;
				3392	vm_fault_t ret = VM_FAULT_FALLBACK;
				3393
				3394	if (!transhuge_vma_suitable(vma, haddr))
				3395	return ret;
				3396
				3397	page = compound_head(page);
				3398	if (compound_order(page) != HPAGE_PMD_ORDER)
				3399	return ret;
				3400
				3401	/*
				3402	* Archs like ppc64 need additonal space to store information
				3403	* related to pte entry. Use the preallocated table for that.
				3404	*/
				3405	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
				3406	vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
				3407	if (!vmf->prealloc_pte)
				3408	return VM_FAULT_OOM;
				3409	smp_wmb(); /* See comment in __pte_alloc() */
				3410	}
				3411
				3412	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
				3413	if (unlikely(!pmd_none(*vmf->pmd)))
				3414	goto out;
				3415
				3416	for (i = 0; i < HPAGE_PMD_NR; i++)
				3417	flush_icache_page(vma, page + i);
				3418
				3419	entry = mk_huge_pmd(page, vma->vm_page_prot);
				3420	if (write)
				3421	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
				3422
				3423	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
				3424	page_add_file_rmap(page, true);
				3425	/*
				3426	* deposit and withdraw with pmd lock held
				3427	*/
				3428	if (arch_needs_pgtable_deposit())
				3429	deposit_prealloc_pte(vmf);
				3430
				3431	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
				3432
				3433	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
				3434
				3435	/* fault is handled */
				3436	ret = 0;
				3437	count_vm_event(THP_FILE_MAPPED);
				3438	out:
				3439	spin_unlock(vmf->ptl);
				3440	return ret;
				3441	}
				3442	#else
				3443	static vm_fault_t do_set_pmd(struct vm_fault vmf, struct page page)
				3444	{
				3445	BUILD_BUG();
				3446	return 0;
				3447	}
				3448	#endif
				3449
				3450	/**
				3451	* alloc_set_pte - setup new PTE entry for given page and add reverse page
				3452	* mapping. If needed, the fucntion allocates page table or use pre-allocated.
				3453	*
				3454	* @vmf: fault environment
				3455	* @memcg: memcg to charge page (only for private mappings)
				3456	* @page: page to map
				3457	*
				3458	* Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
				3459	* return.
				3460	*
				3461	* Target users are page handler itself and implementations of
				3462	* vm_ops->map_pages.
				3463	*
				3464	* Return: %0 on success, %VM_FAULT_ code in case of error.
				3465	*/
				3466	vm_fault_t alloc_set_pte(struct vm_fault vmf, struct mem_cgroup memcg,
				3467	struct page *page)
				3468	{
				3469	struct vm_area_struct *vma = vmf->vma;
				3470	bool write = vmf->flags & FAULT_FLAG_WRITE;
				3471	pte_t entry;
				3472	vm_fault_t ret;
				3473
				3474	if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
				3475	IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				3476	/* THP on COW? */
				3477	VM_BUG_ON_PAGE(memcg, page);
				3478
				3479	ret = do_set_pmd(vmf, page);
				3480	if (ret != VM_FAULT_FALLBACK)
				3481	return ret;
				3482	}
				3483
				3484	if (!vmf->pte) {
				3485	ret = pte_alloc_one_map(vmf);
				3486	if (ret)
				3487	return ret;
				3488	}
				3489
				3490	/* Re-check under ptl */
				3491	if (unlikely(!pte_none(*vmf->pte)))
				3492	return VM_FAULT_NOPAGE;
				3493
				3494	flush_icache_page(vma, page);
				3495	entry = mk_pte(page, vma->vm_page_prot);
				3496	if (write)
				3497	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
				3498	/* copy-on-write page */
				3499	if (write && !(vma->vm_flags & VM_SHARED)) {
				3500	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
				3501	page_add_new_anon_rmap(page, vma, vmf->address, false);
				3502	mem_cgroup_commit_charge(page, memcg, false, false);
				3503	lru_cache_add_active_or_unevictable(page, vma);
				3504	} else {
				3505	inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
				3506	page_add_file_rmap(page, false);
				3507	}
				3508	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
				3509
				3510	/* no need to invalidate: a not-present page won't be cached */
				3511	update_mmu_cache(vma, vmf->address, vmf->pte);
				3512
				3513	return 0;
				3514	}
				3515
				3516
				3517	/**
				3518	* finish_fault - finish page fault once we have prepared the page to fault
				3519	*
				3520	* @vmf: structure describing the fault
				3521	*
				3522	* This function handles all that is needed to finish a page fault once the
				3523	* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
				3524	* given page, adds reverse page mapping, handles memcg charges and LRU
				3525	* addition.
				3526	*
				3527	* The function expects the page to be locked and on success it consumes a
				3528	* reference of a page being mapped (for the PTE which maps it).
				3529	*
				3530	* Return: %0 on success, %VM_FAULT_ code in case of error.
				3531	*/
				3532	vm_fault_t finish_fault(struct vm_fault *vmf)
				3533	{
				3534	struct page *page;
				3535	vm_fault_t ret = 0;
				3536
				3537	/* Did we COW the page? */
				3538	if ((vmf->flags & FAULT_FLAG_WRITE) &&
				3539	!(vmf->vma->vm_flags & VM_SHARED))
				3540	page = vmf->cow_page;
				3541	else
				3542	page = vmf->page;
				3543
				3544	/*
				3545	* check even for read faults because we might have lost our CoWed
				3546	* page
				3547	*/
				3548	if (!(vmf->vma->vm_flags & VM_SHARED))
				3549	ret = check_stable_address_space(vmf->vma->vm_mm);
				3550	if (!ret)
				3551	ret = alloc_set_pte(vmf, vmf->memcg, page);
				3552	if (vmf->pte)
				3553	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3554	return ret;
				3555	}
				3556
				3557	static unsigned long fault_around_bytes __read_mostly =
				3558	rounddown_pow_of_two(65536);
				3559
				3560	#ifdef CONFIG_DEBUG_FS
				3561	static int fault_around_bytes_get(void data, u64 val)
				3562	{
				3563	*val = fault_around_bytes;
				3564	return 0;
				3565	}
				3566
				3567	/*
				3568	* fault_around_bytes must be rounded down to the nearest page order as it's
				3569	* what do_fault_around() expects to see.
				3570	*/
				3571	static int fault_around_bytes_set(void *data, u64 val)
				3572	{
				3573	if (val / PAGE_SIZE > PTRS_PER_PTE)
				3574	return -EINVAL;
				3575	if (val > PAGE_SIZE)
				3576	fault_around_bytes = rounddown_pow_of_two(val);
				3577	else
				3578	fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
				3579	return 0;
				3580	}
				3581	DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
				3582	fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
				3583
				3584	static int __init fault_around_debugfs(void)
				3585	{
				3586	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
				3587	&fault_around_bytes_fops);
				3588	return 0;
				3589	}
				3590	late_initcall(fault_around_debugfs);
				3591	#endif
				3592
				3593	/*
				3594	* do_fault_around() tries to map few pages around the fault address. The hope
				3595	* is that the pages will be needed soon and this will lower the number of
				3596	* faults to handle.
				3597	*
				3598	* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
				3599	* not ready to be mapped: not up-to-date, locked, etc.
				3600	*
				3601	* This function is called with the page table lock taken. In the split ptlock
				3602	* case the page table lock only protects only those entries which belong to
				3603	* the page table corresponding to the fault address.
				3604	*
				3605	* This function doesn't cross the VMA boundaries, in order to call map_pages()
				3606	* only once.
				3607	*
				3608	* fault_around_bytes defines how many bytes we'll try to map.
				3609	* do_fault_around() expects it to be set to a power of two less than or equal
				3610	* to PTRS_PER_PTE.
				3611	*
				3612	* The virtual address of the area that we map is naturally aligned to
				3613	* fault_around_bytes rounded down to the machine page size
				3614	* (and therefore to page order). This way it's easier to guarantee
				3615	* that we don't cross page table boundaries.
				3616	*/
				3617	static vm_fault_t do_fault_around(struct vm_fault *vmf)
				3618	{
				3619	unsigned long address = vmf->address, nr_pages, mask;
				3620	pgoff_t start_pgoff = vmf->pgoff;
				3621	pgoff_t end_pgoff;
				3622	int off;
				3623	vm_fault_t ret = 0;
				3624
				3625	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
				3626	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
				3627
				3628	vmf->address = max(address & mask, vmf->vma->vm_start);
				3629	off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
				3630	start_pgoff -= off;
				3631
				3632	/*
				3633	* end_pgoff is either the end of the page table, the end of
				3634	* the vma or nr_pages from start_pgoff, depending what is nearest.
				3635	*/
				3636	end_pgoff = start_pgoff -
				3637	((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
				3638	PTRS_PER_PTE - 1;
				3639	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
				3640	start_pgoff + nr_pages - 1);
				3641
				3642	if (pmd_none(*vmf->pmd)) {
				3643	vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
				3644	if (!vmf->prealloc_pte)
				3645	goto out;
				3646	smp_wmb(); /* See comment in __pte_alloc() */
				3647	}
				3648
				3649	vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
				3650
				3651	/* Huge page is mapped? Page fault is solved */
				3652	if (pmd_trans_huge(*vmf->pmd)) {
				3653	ret = VM_FAULT_NOPAGE;
				3654	goto out;
				3655	}
				3656
				3657	/* ->map_pages() haven't done anything useful. Cold page cache? */
				3658	if (!vmf->pte)
				3659	goto out;
				3660
				3661	/* check if the page fault is solved */
				3662	vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
				3663	if (!pte_none(*vmf->pte))
				3664	ret = VM_FAULT_NOPAGE;
				3665	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3666	out:
				3667	vmf->address = address;
				3668	vmf->pte = NULL;
				3669	return ret;
				3670	}
				3671
				3672	static vm_fault_t do_read_fault(struct vm_fault *vmf)
				3673	{
				3674	struct vm_area_struct *vma = vmf->vma;
				3675	vm_fault_t ret = 0;
				3676
				3677	/*
				3678	* Let's call ->map_pages() first and use ->fault() as fallback
				3679	* if page by the offset is not ready to be mapped (cold cache or
				3680	* something).
				3681	*/
				3682	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
				3683	if (likely(!userfaultfd_minor(vmf->vma))) {
				3684	ret = do_fault_around(vmf);
				3685	if (ret)
				3686	return ret;
				3687	}
				3688	}
				3689
				3690	ret = __do_fault(vmf);
				3691	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
				3692	return ret;
				3693
				3694	ret \|= finish_fault(vmf);
				3695	unlock_page(vmf->page);
				3696	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
				3697	put_page(vmf->page);
				3698	return ret;
				3699	}
				3700
				3701	static vm_fault_t do_cow_fault(struct vm_fault *vmf)
				3702	{
				3703	struct vm_area_struct *vma = vmf->vma;
				3704	vm_fault_t ret;
				3705
				3706	if (unlikely(anon_vma_prepare(vma)))
				3707	return VM_FAULT_OOM;
				3708
				3709	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
				3710	if (!vmf->cow_page)
				3711	return VM_FAULT_OOM;
				3712
				3713	if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
				3714	&vmf->memcg, false)) {
				3715	put_page(vmf->cow_page);
				3716	return VM_FAULT_OOM;
				3717	}
				3718
				3719	ret = __do_fault(vmf);
				3720	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
				3721	goto uncharge_out;
				3722	if (ret & VM_FAULT_DONE_COW)
				3723	return ret;
				3724
				3725	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
				3726	__SetPageUptodate(vmf->cow_page);
				3727
				3728	ret \|= finish_fault(vmf);
				3729	unlock_page(vmf->page);
				3730	put_page(vmf->page);
				3731	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
				3732	goto uncharge_out;
				3733	return ret;
				3734	uncharge_out:
				3735	mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
				3736	put_page(vmf->cow_page);
				3737	return ret;
				3738	}
				3739
				3740	static vm_fault_t do_shared_fault(struct vm_fault *vmf)
				3741	{
				3742	struct vm_area_struct *vma = vmf->vma;
				3743	vm_fault_t ret, tmp;
				3744
				3745	ret = __do_fault(vmf);
				3746	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
				3747	return ret;
				3748
				3749	/*
				3750	* Check if the backing address space wants to know that the page is
				3751	* about to become writable
				3752	*/
				3753	if (vma->vm_ops->page_mkwrite) {
				3754	unlock_page(vmf->page);
				3755	tmp = do_page_mkwrite(vmf);
				3756	if (unlikely(!tmp \|\|
				3757	(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
				3758	put_page(vmf->page);
				3759	return tmp;
				3760	}
				3761	}
				3762
				3763	ret \|= finish_fault(vmf);
				3764	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \|
				3765	VM_FAULT_RETRY))) {
				3766	unlock_page(vmf->page);
				3767	put_page(vmf->page);
				3768	return ret;
				3769	}
				3770
				3771	ret \|= fault_dirty_shared_page(vmf);
				3772	return ret;
				3773	}
				3774
				3775	/*
				3776	* We enter with non-exclusive mmap_sem (to exclude vma changes,
				3777	* but allow concurrent faults).
				3778	* The mmap_sem may have been released depending on flags and our
				3779	* return value. See filemap_fault() and __lock_page_or_retry().
				3780	* If mmap_sem is released, vma may become invalid (for example
				3781	* by other thread calling munmap()).
				3782	*/
				3783	static vm_fault_t do_fault(struct vm_fault *vmf)
				3784	{
				3785	struct vm_area_struct *vma = vmf->vma;
				3786	struct mm_struct *vm_mm = vma->vm_mm;
				3787	vm_fault_t ret;
				3788
				3789	/*
				3790	* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
				3791	*/
				3792	if (!vma->vm_ops->fault) {
				3793	/*
				3794	* If we find a migration pmd entry or a none pmd entry, which
				3795	* should never happen, return SIGBUS
				3796	*/
				3797	if (unlikely(!pmd_present(*vmf->pmd)))
				3798	ret = VM_FAULT_SIGBUS;
				3799	else {
				3800	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
				3801	vmf->pmd,
				3802	vmf->address,
				3803	&vmf->ptl);
				3804	/*
				3805	* Make sure this is not a temporary clearing of pte
				3806	* by holding ptl and checking again. A R/M/W update
				3807	* of pte involves: take ptl, clearing the pte so that
				3808	* we don't have concurrent modification by hardware
				3809	* followed by an update.
				3810	*/
				3811	if (unlikely(pte_none(*vmf->pte)))
				3812	ret = VM_FAULT_SIGBUS;
				3813	else
				3814	ret = VM_FAULT_NOPAGE;
				3815
				3816	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3817	}
				3818	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
				3819	ret = do_read_fault(vmf);
				3820	else if (!(vma->vm_flags & VM_SHARED))
				3821	ret = do_cow_fault(vmf);
				3822	else
				3823	ret = do_shared_fault(vmf);
				3824
				3825	/* preallocated pagetable is unused: free it */
				3826	if (vmf->prealloc_pte) {
				3827	pte_free(vm_mm, vmf->prealloc_pte);
				3828	vmf->prealloc_pte = NULL;
				3829	}
				3830	return ret;
				3831	}
				3832
				3833	static int numa_migrate_prep(struct page page, struct vm_area_struct vma,
				3834	unsigned long addr, int page_nid,
				3835	int *flags)
				3836	{
				3837	get_page(page);
				3838
				3839	count_vm_numa_event(NUMA_HINT_FAULTS);
				3840	if (page_nid == numa_node_id()) {
				3841	count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
				3842	*flags \|= TNF_FAULT_LOCAL;
				3843	}
				3844
				3845	return mpol_misplaced(page, vma, addr);
				3846	}
				3847
				3848	static vm_fault_t do_numa_page(struct vm_fault *vmf)
				3849	{
				3850	struct vm_area_struct *vma = vmf->vma;
				3851	struct page *page = NULL;
				3852	int page_nid = NUMA_NO_NODE;
				3853	int last_cpupid;
				3854	int target_nid;
				3855	bool migrated = false;
				3856	pte_t pte, old_pte;
				3857	bool was_writable = pte_savedwrite(vmf->orig_pte);
				3858	int flags = 0;
				3859
				3860	/*
				3861	* The "pte" at this point cannot be used safely without
				3862	* validation through pte_unmap_same(). It's of NUMA type but
				3863	* the pfn may be screwed if the read is non atomic.
				3864	*/
				3865	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
				3866	spin_lock(vmf->ptl);
				3867	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
				3868	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3869	goto out;
				3870	}
				3871
				3872	/*
				3873	* Make it present again, Depending on how arch implementes non
				3874	* accessible ptes, some can allow access by kernel mode.
				3875	*/
				3876	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
				3877	pte = pte_modify(old_pte, vma->vm_page_prot);
				3878	pte = pte_mkyoung(pte);
				3879	if (was_writable)
				3880	pte = pte_mkwrite(pte);
				3881	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
				3882	update_mmu_cache(vma, vmf->address, vmf->pte);
				3883
				3884	page = vm_normal_page(vma, vmf->address, pte);
				3885	if (!page) {
				3886	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3887	return 0;
				3888	}
				3889
				3890	/* TODO: handle PTE-mapped THP */
				3891	if (PageCompound(page)) {
				3892	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3893	return 0;
				3894	}
				3895
				3896	/*
				3897	* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
				3898	* much anyway since they can be in shared cache state. This misses
				3899	* the case where a mapping is writable but the process never writes
				3900	* to it but pte_write gets cleared during protection updates and
				3901	* pte_dirty has unpredictable behaviour between PTE scan updates,
				3902	* background writeback, dirty balancing and application behaviour.
				3903	*/
				3904	if (!pte_write(pte))
				3905	flags \|= TNF_NO_GROUP;
				3906
				3907	/*
				3908	* Flag if the page is shared between multiple address spaces. This
				3909	* is later used when determining whether to group tasks together
				3910	*/
				3911	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
				3912	flags \|= TNF_SHARED;
				3913
				3914	last_cpupid = page_cpupid_last(page);
				3915	page_nid = page_to_nid(page);
				3916	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
				3917	&flags);
				3918	pte_unmap_unlock(vmf->pte, vmf->ptl);
				3919	if (target_nid == NUMA_NO_NODE) {
				3920	put_page(page);
				3921	goto out;
				3922	}
				3923
				3924	/* Migrate to the requested node */
				3925	migrated = migrate_misplaced_page(page, vma, target_nid);
				3926	if (migrated) {
				3927	page_nid = target_nid;
				3928	flags \|= TNF_MIGRATED;
				3929	} else
				3930	flags \|= TNF_MIGRATE_FAIL;
				3931
				3932	out:
				3933	if (page_nid != NUMA_NO_NODE)
				3934	task_numa_fault(last_cpupid, page_nid, 1, flags);
				3935	return 0;
				3936	}
				3937
				3938	static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
				3939	{
				3940	if (vma_is_anonymous(vmf->vma))
				3941	return do_huge_pmd_anonymous_page(vmf);
				3942	if (vmf->vma->vm_ops->huge_fault)
				3943	return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
				3944	return VM_FAULT_FALLBACK;
				3945	}
				3946
				3947	/* `inline' is required to avoid gcc 4.1.2 build error */
				3948	static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
				3949	{
				3950	if (vma_is_anonymous(vmf->vma))
				3951	return do_huge_pmd_wp_page(vmf, orig_pmd);
				3952	if (vmf->vma->vm_ops->huge_fault)
				3953	return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
				3954
				3955	/* COW handled on pte level: split pmd */
				3956	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
				3957	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
				3958
				3959	return VM_FAULT_FALLBACK;
				3960	}
				3961
				3962	static inline bool vma_is_accessible(struct vm_area_struct *vma)
				3963	{
				3964	return vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE);
				3965	}
				3966
				3967	static vm_fault_t create_huge_pud(struct vm_fault *vmf)
				3968	{
				3969	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				3970	/* No support for anonymous transparent PUD pages yet */
				3971	if (vma_is_anonymous(vmf->vma))
				3972	return VM_FAULT_FALLBACK;
				3973	if (vmf->vma->vm_ops->huge_fault)
				3974	return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
				3975	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
				3976	return VM_FAULT_FALLBACK;
				3977	}
				3978
				3979	static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
				3980	{
				3981	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				3982	/* No support for anonymous transparent PUD pages yet */
				3983	if (vma_is_anonymous(vmf->vma))
				3984	return VM_FAULT_FALLBACK;
				3985	if (vmf->vma->vm_ops->huge_fault)
				3986	return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
				3987	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
				3988	return VM_FAULT_FALLBACK;
				3989	}
				3990
				3991	/*
				3992	* These routines also need to handle stuff like marking pages dirty
				3993	* and/or accessed for architectures that don't do it in hardware (most
				3994	* RISC architectures). The early dirtying is also good on the i386.
				3995	*
				3996	* There is also a hook called "update_mmu_cache()" that architectures
				3997	* with external mmu caches can use to update those (ie the Sparc or
				3998	* PowerPC hashed page tables that act as extended TLBs).
				3999	*
				4000	* We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
				4001	* concurrent faults).
				4002	*
				4003	* The mmap_sem may have been released depending on flags and our return value.
				4004	* See filemap_fault() and __lock_page_or_retry().
				4005	*/
				4006	static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
				4007	{
				4008	pte_t entry;
				4009
				4010	if (unlikely(pmd_none(*vmf->pmd))) {
				4011	/*
				4012	* Leave __pte_alloc() until later: because vm_ops->fault may
				4013	* want to allocate huge page, and if we expose page table
				4014	* for an instant, it will be difficult to retract from
				4015	* concurrent faults and from rmap lookups.
				4016	*/
				4017	vmf->pte = NULL;
				4018	} else {
				4019	/* See comment in pte_alloc_one_map() */
				4020	if (pmd_devmap_trans_unstable(vmf->pmd))
				4021	return 0;
				4022	/*
				4023	* A regular pmd is established and it can't morph into a huge
				4024	* pmd from under us anymore at this point because we hold the
				4025	* mmap_sem read mode and khugepaged takes it in write mode.
				4026	* So now it's safe to run pte_offset_map().
				4027	*/
				4028	vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
				4029	vmf->orig_pte = *vmf->pte;
				4030
				4031	/*
				4032	* some architectures can have larger ptes than wordsize,
				4033	* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
				4034	* CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
				4035	* accesses. The code below just needs a consistent view
				4036	* for the ifs and we later double check anyway with the
				4037	* ptl lock held. So here a barrier will do.
				4038	*/
				4039	barrier();
				4040	if (pte_none(vmf->orig_pte)) {
				4041	pte_unmap(vmf->pte);
				4042	vmf->pte = NULL;
				4043	}
				4044	}
				4045
				4046	if (!vmf->pte) {
				4047	if (vma_is_anonymous(vmf->vma))
				4048	return do_anonymous_page(vmf);
				4049	else
				4050	return do_fault(vmf);
				4051	}
				4052
				4053	if (!pte_present(vmf->orig_pte))
				4054	return do_swap_page(vmf);
				4055
				4056	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
				4057	return do_numa_page(vmf);
				4058
				4059	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
				4060	spin_lock(vmf->ptl);
				4061	entry = vmf->orig_pte;
				4062	if (unlikely(!pte_same(*vmf->pte, entry)))
				4063	goto unlock;
				4064	if (vmf->flags & FAULT_FLAG_WRITE) {
				4065	if (!pte_write(entry))
				4066	return do_wp_page(vmf);
				4067	entry = pte_mkdirty(entry);
				4068	}
				4069	entry = pte_mkyoung(entry);
				4070	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				4071	vmf->flags & FAULT_FLAG_WRITE)) {
				4072	update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
				4073	} else {
				4074	/*
				4075	* This is needed only for protection faults but the arch code
				4076	* is not yet telling us if this is a protection fault or not.
				4077	* This still avoids useless tlb flushes for .text page faults
				4078	* with threads.
				4079	*/
				4080	if (vmf->flags & FAULT_FLAG_WRITE)
				4081	flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
				4082	}
				4083	unlock:
				4084	pte_unmap_unlock(vmf->pte, vmf->ptl);
				4085	return 0;
				4086	}
				4087
				4088	/*
				4089	* By the time we get here, we already hold the mm semaphore
				4090	*
				4091	* The mmap_sem may have been released depending on flags and our
				4092	* return value. See filemap_fault() and __lock_page_or_retry().
				4093	*/
				4094	static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
				4095	unsigned long address, unsigned int flags)
				4096	{
				4097	struct vm_fault vmf = {
				4098	.vma = vma,
				4099	.address = address & PAGE_MASK,
				4100	.flags = flags,
				4101	.pgoff = linear_page_index(vma, address),
				4102	.gfp_mask = __get_fault_gfp_mask(vma),
				4103	};
				4104	unsigned int dirty = flags & FAULT_FLAG_WRITE;
				4105	struct mm_struct *mm = vma->vm_mm;
				4106	pgd_t *pgd;
				4107	p4d_t *p4d;
				4108	vm_fault_t ret;
				4109
				4110	pgd = pgd_offset(mm, address);
				4111	p4d = p4d_alloc(mm, pgd, address);
				4112	if (!p4d)
				4113	return VM_FAULT_OOM;
				4114
				4115	vmf.pud = pud_alloc(mm, p4d, address);
				4116	if (!vmf.pud)
				4117	return VM_FAULT_OOM;
				4118	if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
				4119	ret = create_huge_pud(&vmf);
				4120	if (!(ret & VM_FAULT_FALLBACK))
				4121	return ret;
				4122	} else {
				4123	pud_t orig_pud = *vmf.pud;
				4124
				4125	barrier();
				4126	if (pud_trans_huge(orig_pud) \|\| pud_devmap(orig_pud)) {
				4127
				4128	/* NUMA case for anonymous PUDs would go here */
				4129
				4130	if (dirty && !pud_write(orig_pud)) {
				4131	ret = wp_huge_pud(&vmf, orig_pud);
				4132	if (!(ret & VM_FAULT_FALLBACK))
				4133	return ret;
				4134	} else {
				4135	huge_pud_set_accessed(&vmf, orig_pud);
				4136	return 0;
				4137	}
				4138	}
				4139	}
				4140
				4141	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
				4142	if (!vmf.pmd)
				4143	return VM_FAULT_OOM;
				4144	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
				4145	ret = create_huge_pmd(&vmf);
				4146	if (!(ret & VM_FAULT_FALLBACK))
				4147	return ret;
				4148	} else {
				4149	pmd_t orig_pmd = *vmf.pmd;
				4150
				4151	barrier();
				4152	if (unlikely(is_swap_pmd(orig_pmd))) {
				4153	VM_BUG_ON(thp_migration_supported() &&
				4154	!is_pmd_migration_entry(orig_pmd));
				4155	if (is_pmd_migration_entry(orig_pmd))
				4156	pmd_migration_entry_wait(mm, vmf.pmd);
				4157	return 0;
				4158	}
				4159	if (pmd_trans_huge(orig_pmd) \|\| pmd_devmap(orig_pmd)) {
				4160	if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
				4161	return do_huge_pmd_numa_page(&vmf, orig_pmd);
				4162
				4163	if (dirty && !pmd_write(orig_pmd)) {
				4164	ret = wp_huge_pmd(&vmf, orig_pmd);
				4165	if (!(ret & VM_FAULT_FALLBACK))
				4166	return ret;
				4167	} else {
				4168	huge_pmd_set_accessed(&vmf, orig_pmd);
				4169	return 0;
				4170	}
				4171	}
				4172	}
				4173
				4174	return handle_pte_fault(&vmf);
				4175	}
				4176
				4177	/*
				4178	* By the time we get here, we already hold the mm semaphore
				4179	*
				4180	* The mmap_sem may have been released depending on flags and our
				4181	* return value. See filemap_fault() and __lock_page_or_retry().
				4182	*/
				4183	vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
				4184	unsigned int flags)
				4185	{
				4186	vm_fault_t ret;
				4187
				4188	__set_current_state(TASK_RUNNING);
				4189
				4190	count_vm_event(PGFAULT);
				4191	count_memcg_event_mm(vma->vm_mm, PGFAULT);
				4192
				4193	/* do counter updates before entering really critical section. */
				4194	check_sync_rss_stat(current);
				4195
				4196	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
				4197	flags & FAULT_FLAG_INSTRUCTION,
				4198	flags & FAULT_FLAG_REMOTE))
				4199	return VM_FAULT_SIGSEGV;
				4200
				4201	/*
				4202	* Enable the memcg OOM handling for faults triggered in user
				4203	* space. Kernel faults are handled more gracefully.
				4204	*/
				4205	if (flags & FAULT_FLAG_USER)
				4206	mem_cgroup_enter_user_fault();
				4207
				4208	if (unlikely(is_vm_hugetlb_page(vma)))
				4209	ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
				4210	else
				4211	ret = __handle_mm_fault(vma, address, flags);
				4212
				4213	if (flags & FAULT_FLAG_USER) {
				4214	mem_cgroup_exit_user_fault();
				4215	/*
				4216	* The task may have entered a memcg OOM situation but
				4217	* if the allocation error was handled gracefully (no
				4218	* VM_FAULT_OOM), there is no need to kill anything.
				4219	* Just clean up the OOM state peacefully.
				4220	*/
				4221	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
				4222	mem_cgroup_oom_synchronize(false);
				4223	}
				4224
				4225	return ret;
				4226	}
				4227	EXPORT_SYMBOL_GPL(handle_mm_fault);
				4228
				4229	#ifndef __PAGETABLE_P4D_FOLDED
				4230	/*
				4231	* Allocate p4d page table.
				4232	* We've already handled the fast-path in-line.
				4233	*/
				4234	int __p4d_alloc(struct mm_struct mm, pgd_t pgd, unsigned long address)
				4235	{
				4236	p4d_t *new = p4d_alloc_one(mm, address);
				4237	if (!new)
				4238	return -ENOMEM;
				4239
				4240	smp_wmb(); /* See comment in __pte_alloc */
				4241
				4242	spin_lock(&mm->page_table_lock);
				4243	if (pgd_present(pgd)) / Another has populated it */
				4244	p4d_free(mm, new);
				4245	else
				4246	pgd_populate(mm, pgd, new);
				4247	spin_unlock(&mm->page_table_lock);
				4248	return 0;
				4249	}
				4250	#endif /* __PAGETABLE_P4D_FOLDED */
				4251
				4252	#ifndef __PAGETABLE_PUD_FOLDED
				4253	/*
				4254	* Allocate page upper directory.
				4255	* We've already handled the fast-path in-line.
				4256	*/
				4257	int __pud_alloc(struct mm_struct mm, p4d_t p4d, unsigned long address)
				4258	{
				4259	pud_t *new = pud_alloc_one(mm, address);
				4260	if (!new)
				4261	return -ENOMEM;
				4262
				4263	smp_wmb(); /* See comment in __pte_alloc */
				4264
				4265	spin_lock(&mm->page_table_lock);
				4266	#ifndef __ARCH_HAS_5LEVEL_HACK
				4267	if (!p4d_present(*p4d)) {
				4268	mm_inc_nr_puds(mm);
				4269	p4d_populate(mm, p4d, new);
				4270	} else /* Another has populated it */
				4271	pud_free(mm, new);
				4272	#else
				4273	if (!pgd_present(*p4d)) {
				4274	mm_inc_nr_puds(mm);
				4275	pgd_populate(mm, p4d, new);
				4276	} else /* Another has populated it */
				4277	pud_free(mm, new);
				4278	#endif /* __ARCH_HAS_5LEVEL_HACK */
				4279	spin_unlock(&mm->page_table_lock);
				4280	return 0;
				4281	}
				4282	#endif /* __PAGETABLE_PUD_FOLDED */
				4283
				4284	#ifndef __PAGETABLE_PMD_FOLDED
				4285	/*
				4286	* Allocate page middle directory.
				4287	* We've already handled the fast-path in-line.
				4288	*/
				4289	int __pmd_alloc(struct mm_struct mm, pud_t pud, unsigned long address)
				4290	{
				4291	spinlock_t *ptl;
				4292	pmd_t *new = pmd_alloc_one(mm, address);
				4293	if (!new)
				4294	return -ENOMEM;
				4295
				4296	smp_wmb(); /* See comment in __pte_alloc */
				4297
				4298	ptl = pud_lock(mm, pud);
				4299	#ifndef __ARCH_HAS_4LEVEL_HACK
				4300	if (!pud_present(*pud)) {
				4301	mm_inc_nr_pmds(mm);
				4302	pud_populate(mm, pud, new);
				4303	} else /* Another has populated it */
				4304	pmd_free(mm, new);
				4305	#else
				4306	if (!pgd_present(*pud)) {
				4307	mm_inc_nr_pmds(mm);
				4308	pgd_populate(mm, pud, new);
				4309	} else /* Another has populated it */
				4310	pmd_free(mm, new);
				4311	#endif /* __ARCH_HAS_4LEVEL_HACK */
				4312	spin_unlock(ptl);
				4313	return 0;
				4314	}
				4315	#endif /* __PAGETABLE_PMD_FOLDED */
				4316
				4317	int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
				4318	struct mmu_notifier_range range, pte_t *ptepp,
				4319	pmd_t pmdpp, spinlock_t ptlp)
				4320	{
				4321	pgd_t *pgd;
				4322	p4d_t *p4d;
				4323	pud_t *pud;
				4324	pmd_t *pmd;
				4325	pte_t *ptep;
				4326
				4327	pgd = pgd_offset(mm, address);
				4328	if (pgd_none(pgd) \|\| unlikely(pgd_bad(pgd)))
				4329	goto out;
				4330
				4331	p4d = p4d_offset(pgd, address);
				4332	if (p4d_none(p4d) \|\| unlikely(p4d_bad(p4d)))
				4333	goto out;
				4334
				4335	pud = pud_offset(p4d, address);
				4336	if (pud_none(pud) \|\| unlikely(pud_bad(pud)))
				4337	goto out;
				4338
				4339	pmd = pmd_offset(pud, address);
				4340	VM_BUG_ON(pmd_trans_huge(*pmd));
				4341
				4342	if (pmd_huge(*pmd)) {
				4343	if (!pmdpp)
				4344	goto out;
				4345
				4346	if (range) {
				4347	mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
				4348	NULL, mm, address & PMD_MASK,
				4349	(address & PMD_MASK) + PMD_SIZE);
				4350	mmu_notifier_invalidate_range_start(range);
				4351	}
				4352	*ptlp = pmd_lock(mm, pmd);
				4353	if (pmd_huge(*pmd)) {
				4354	*pmdpp = pmd;
				4355	return 0;
				4356	}
				4357	spin_unlock(*ptlp);
				4358	if (range)
				4359	mmu_notifier_invalidate_range_end(range);
				4360	}
				4361
				4362	if (pmd_none(pmd) \|\| unlikely(pmd_bad(pmd)))
				4363	goto out;
				4364
				4365	if (range) {
				4366	mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
				4367	address & PAGE_MASK,
				4368	(address & PAGE_MASK) + PAGE_SIZE);
				4369	mmu_notifier_invalidate_range_start(range);
				4370	}
				4371	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
				4372	if (!pte_present(*ptep))
				4373	goto unlock;
				4374	*ptepp = ptep;
				4375	return 0;
				4376	unlock:
				4377	pte_unmap_unlock(ptep, *ptlp);
				4378	if (range)
				4379	mmu_notifier_invalidate_range_end(range);
				4380	out:
				4381	return -EINVAL;
				4382	}
				4383
				4384	/**
				4385	* follow_pte - look up PTE at a user virtual address
				4386	* @mm: the mm_struct of the target address space
				4387	* @address: user virtual address
				4388	* @ptepp: location to store found PTE
				4389	* @ptlp: location to store the lock for the PTE
				4390	*
				4391	* On a successful return, the pointer to the PTE is stored in @ptepp;
				4392	* the corresponding lock is taken and its location is stored in @ptlp.
				4393	* The contents of the PTE are only stable until @ptlp is released;
				4394	* any further use, if any, must be protected against invalidation
				4395	* with MMU notifiers.
				4396	*
				4397	* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
				4398	* should be taken for read.
				4399	*
				4400	* KVM uses this function. While it is arguably less bad than ``follow_pfn``,
				4401	* it is not a good general-purpose API.
				4402	*
				4403	* Return: zero on success, -ve otherwise.
				4404	*/
				4405	int follow_pte(struct mm_struct *mm, unsigned long address,
				4406	pte_t ptepp, spinlock_t ptlp)
				4407	{
				4408	return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
				4409	}
				4410	EXPORT_SYMBOL_GPL(follow_pte);
				4411
				4412	/**
				4413	* follow_pfn - look up PFN at a user virtual address
				4414	* @vma: memory mapping
				4415	* @address: user virtual address
				4416	* @pfn: location to store found PFN
				4417	*
				4418	* Only IO mappings and raw PFN mappings are allowed.
				4419	*
				4420	* This function does not allow the caller to read the permissions
				4421	* of the PTE. Do not use it.
				4422	*
				4423	* Return: zero and the pfn at @pfn on success, -ve otherwise.
				4424	*/
				4425	int follow_pfn(struct vm_area_struct *vma, unsigned long address,
				4426	unsigned long *pfn)
				4427	{
				4428	int ret = -EINVAL;
				4429	spinlock_t *ptl;
				4430	pte_t *ptep;
				4431
				4432	if (!(vma->vm_flags & (VM_IO \| VM_PFNMAP)))
				4433	return ret;
				4434
				4435	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
				4436	if (ret)
				4437	return ret;
				4438	pfn = pte_pfn(ptep);
				4439	pte_unmap_unlock(ptep, ptl);
				4440	return 0;
				4441	}
				4442	EXPORT_SYMBOL(follow_pfn);
				4443
				4444	#ifdef CONFIG_HAVE_IOREMAP_PROT
				4445	int follow_phys(struct vm_area_struct *vma,
				4446	unsigned long address, unsigned int flags,
				4447	unsigned long prot, resource_size_t phys)
				4448	{
				4449	int ret = -EINVAL;
				4450	pte_t *ptep, pte;
				4451	spinlock_t *ptl;
				4452
				4453	if (!(vma->vm_flags & (VM_IO \| VM_PFNMAP)))
				4454	goto out;
				4455
				4456	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
				4457	goto out;
				4458	pte = *ptep;
				4459
				4460	/* Never return PFNs of anon folios in COW mappings. */
				4461	if (vm_normal_page(vma, address, pte))
				4462	goto unlock;
				4463
				4464	if ((flags & FOLL_WRITE) && !pte_write(pte))
				4465	goto unlock;
				4466
				4467	*prot = pgprot_val(pte_pgprot(pte));
				4468	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
				4469
				4470	ret = 0;
				4471	unlock:
				4472	pte_unmap_unlock(ptep, ptl);
				4473	out:
				4474	return ret;
				4475	}
				4476
				4477	int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
				4478	void *buf, int len, int write)
				4479	{
				4480	resource_size_t phys_addr;
				4481	unsigned long prot = 0;
				4482	void __iomem *maddr;
				4483	int offset = addr & (PAGE_SIZE-1);
				4484
				4485	if (follow_phys(vma, addr, write, &prot, &phys_addr))
				4486	return -EINVAL;
				4487
				4488	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
				4489	if (!maddr)
				4490	return -ENOMEM;
				4491
				4492	if (write)
				4493	memcpy_toio(maddr + offset, buf, len);
				4494	else
				4495	memcpy_fromio(buf, maddr + offset, len);
				4496	iounmap(maddr);
				4497
				4498	return len;
				4499	}
				4500	EXPORT_SYMBOL_GPL(generic_access_phys);
				4501	#endif
				4502
				4503	/*
				4504	* Access another process' address space as given in mm. If non-NULL, use the
				4505	* given task for page fault accounting.
				4506	*/
				4507	int __access_remote_vm(struct task_struct tsk, struct mm_struct mm,
				4508	unsigned long addr, void *buf, int len, unsigned int gup_flags)
				4509	{
				4510	struct vm_area_struct *vma;
				4511	void *old_buf = buf;
				4512	int write = gup_flags & FOLL_WRITE;
				4513
				4514	if (down_read_killable(&mm->mmap_sem))
				4515	return 0;
				4516
				4517	/* ignore errors, just check how much was successfully transferred */
				4518	while (len) {
				4519	int bytes, ret, offset;
				4520	void *maddr;
				4521	struct page *page = NULL;
				4522
				4523	ret = get_user_pages_remote(tsk, mm, addr, 1,
				4524	gup_flags, &page, &vma, NULL);
				4525	if (ret <= 0) {
				4526	#ifndef CONFIG_HAVE_IOREMAP_PROT
				4527	break;
				4528	#else
				4529	/*
				4530	* Check if this is a VM_IO \| VM_PFNMAP VMA, which
				4531	* we can access using slightly different code.
				4532	*/
				4533	vma = find_vma(mm, addr);
				4534	if (!vma \|\| vma->vm_start > addr)
				4535	break;
				4536	if (vma->vm_ops && vma->vm_ops->access)
				4537	ret = vma->vm_ops->access(vma, addr, buf,
				4538	len, write);
				4539	if (ret <= 0)
				4540	break;
				4541	bytes = ret;
				4542	#endif
				4543	} else {
				4544	bytes = len;
				4545	offset = addr & (PAGE_SIZE-1);
				4546	if (bytes > PAGE_SIZE-offset)
				4547	bytes = PAGE_SIZE-offset;
				4548
				4549	maddr = kmap(page);
				4550	if (write) {
				4551	copy_to_user_page(vma, page, addr,
				4552	maddr + offset, buf, bytes);
				4553	set_page_dirty_lock(page);
				4554	} else {
				4555	copy_from_user_page(vma, page, addr,
				4556	buf, maddr + offset, bytes);
				4557	}
				4558	kunmap(page);
				4559	put_page(page);
				4560	}
				4561	len -= bytes;
				4562	buf += bytes;
				4563	addr += bytes;
				4564	}
				4565	up_read(&mm->mmap_sem);
				4566
				4567	return buf - old_buf;
				4568	}
				4569
				4570	/**
				4571	* access_remote_vm - access another process' address space
				4572	* @mm: the mm_struct of the target address space
				4573	* @addr: start address to access
				4574	* @buf: source or destination buffer
				4575	* @len: number of bytes to transfer
				4576	* @gup_flags: flags modifying lookup behaviour
				4577	*
				4578	* The caller must hold a reference on @mm.
				4579	*
				4580	* Return: number of bytes copied from source to destination.
				4581	*/
				4582	int access_remote_vm(struct mm_struct *mm, unsigned long addr,
				4583	void *buf, int len, unsigned int gup_flags)
				4584	{
				4585	return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
				4586	}
				4587
				4588	/*
				4589	* Access another process' address space.
				4590	* Source/target buffer must be kernel space,
				4591	* Do not walk the page table directly, use get_user_pages
				4592	*/
				4593	int access_process_vm(struct task_struct *tsk, unsigned long addr,
				4594	void *buf, int len, unsigned int gup_flags)
				4595	{
				4596	struct mm_struct *mm;
				4597	int ret;
				4598
				4599	mm = get_task_mm(tsk);
				4600	if (!mm)
				4601	return 0;
				4602
				4603	ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
				4604
				4605	mmput(mm);
				4606
				4607	return ret;
				4608	}
				4609	EXPORT_SYMBOL_GPL(access_process_vm);
				4610
				4611	/*
				4612	* Print the name of a VMA.
				4613	*/
				4614	void print_vma_addr(char *prefix, unsigned long ip)
				4615	{
				4616	struct mm_struct *mm = current->mm;
				4617	struct vm_area_struct *vma;
				4618
				4619	/*
				4620	* we might be running from an atomic context so we cannot sleep
				4621	*/
				4622	if (!down_read_trylock(&mm->mmap_sem))
				4623	return;
				4624
				4625	vma = find_vma(mm, ip);
				4626	if (vma && vma->vm_file) {
				4627	struct file *f = vma->vm_file;
				4628	char buf = (char )__get_free_page(GFP_NOWAIT);
				4629	if (buf) {
				4630	char *p;
				4631
				4632	p = file_path(f, buf, PAGE_SIZE);
				4633	if (IS_ERR(p))
				4634	p = "?";
				4635	printk("%s%s[%lx+%lx]", prefix, kbasename(p),
				4636	vma->vm_start,
				4637	vma->vm_end - vma->vm_start);
				4638	free_page((unsigned long)buf);
				4639	}
				4640	}
				4641	up_read(&mm->mmap_sem);
				4642	}
				4643
				4644	#if defined(CONFIG_PROVE_LOCKING) \|\| defined(CONFIG_DEBUG_ATOMIC_SLEEP)
				4645	void __might_fault(const char *file, int line)
				4646	{
				4647	/*
				4648	* Some code (nfs/sunrpc) uses socket ops on kernel memory while
				4649	* holding the mmap_sem, this is safe because kernel memory doesn't
				4650	* get paged out, therefore we'll never actually fault, and the
				4651	* below annotations will generate false positives.
				4652	*/
				4653	if (uaccess_kernel())
				4654	return;
				4655	if (pagefault_disabled())
				4656	return;
				4657	__might_sleep(file, line, 0);
				4658	#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
				4659	if (current->mm)
				4660	might_lock_read(&current->mm->mmap_sem);
				4661	#endif
				4662	}
				4663	EXPORT_SYMBOL(__might_fault);
				4664	#endif
				4665
				4666	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) \|\| defined(CONFIG_HUGETLBFS)
				4667	/*
				4668	* Process all subpages of the specified huge page with the specified
				4669	* operation. The target subpage will be processed last to keep its
				4670	* cache lines hot.
				4671	*/
				4672	static inline void process_huge_page(
				4673	unsigned long addr_hint, unsigned int pages_per_huge_page,
				4674	void (process_subpage)(unsigned long addr, int idx, void arg),
				4675	void *arg)
				4676	{
				4677	int i, n, base, l;
				4678	unsigned long addr = addr_hint &
				4679	~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
				4680
				4681	/* Process target subpage last to keep its cache lines hot */
				4682	might_sleep();
				4683	n = (addr_hint - addr) / PAGE_SIZE;
				4684	if (2 * n <= pages_per_huge_page) {
				4685	/* If target subpage in first half of huge page */
				4686	base = 0;
				4687	l = n;
				4688	/* Process subpages at the end of huge page */
				4689	for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
				4690	cond_resched();
				4691	process_subpage(addr + i * PAGE_SIZE, i, arg);
				4692	}
				4693	} else {
				4694	/* If target subpage in second half of huge page */
				4695	base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
				4696	l = pages_per_huge_page - n;
				4697	/* Process subpages at the begin of huge page */
				4698	for (i = 0; i < base; i++) {
				4699	cond_resched();
				4700	process_subpage(addr + i * PAGE_SIZE, i, arg);
				4701	}
				4702	}
				4703	/*
				4704	* Process remaining subpages in left-right-left-right pattern
				4705	* towards the target subpage
				4706	*/
				4707	for (i = 0; i < l; i++) {
				4708	int left_idx = base + i;
				4709	int right_idx = base + 2 * l - 1 - i;
				4710
				4711	cond_resched();
				4712	process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
				4713	cond_resched();
				4714	process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
				4715	}
				4716	}
				4717
				4718	static void clear_gigantic_page(struct page *page,
				4719	unsigned long addr,
				4720	unsigned int pages_per_huge_page)
				4721	{
				4722	int i;
				4723	struct page *p = page;
				4724
				4725	might_sleep();
				4726	for (i = 0; i < pages_per_huge_page;
				4727	i++, p = mem_map_next(p, page, i)) {
				4728	cond_resched();
				4729	clear_user_highpage(p, addr + i * PAGE_SIZE);
				4730	}
				4731	}
				4732
				4733	static void clear_subpage(unsigned long addr, int idx, void *arg)
				4734	{
				4735	struct page *page = arg;
				4736
				4737	clear_user_highpage(page + idx, addr);
				4738	}
				4739
				4740	void clear_huge_page(struct page *page,
				4741	unsigned long addr_hint, unsigned int pages_per_huge_page)
				4742	{
				4743	unsigned long addr = addr_hint &
				4744	~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
				4745
				4746	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
				4747	clear_gigantic_page(page, addr, pages_per_huge_page);
				4748	return;
				4749	}
				4750
				4751	process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page);
				4752	}
				4753
				4754	static void copy_user_gigantic_page(struct page dst, struct page src,
				4755	unsigned long addr,
				4756	struct vm_area_struct *vma,
				4757	unsigned int pages_per_huge_page)
				4758	{
				4759	int i;
				4760	struct page *dst_base = dst;
				4761	struct page *src_base = src;
				4762
				4763	for (i = 0; i < pages_per_huge_page; ) {
				4764	cond_resched();
				4765	copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
				4766
				4767	i++;
				4768	dst = mem_map_next(dst, dst_base, i);
				4769	src = mem_map_next(src, src_base, i);
				4770	}
				4771	}
				4772
				4773	struct copy_subpage_arg {
				4774	struct page *dst;
				4775	struct page *src;
				4776	struct vm_area_struct *vma;
				4777	};
				4778
				4779	static void copy_subpage(unsigned long addr, int idx, void *arg)
				4780	{
				4781	struct copy_subpage_arg *copy_arg = arg;
				4782
				4783	copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
				4784	addr, copy_arg->vma);
				4785	}
				4786
				4787	void copy_user_huge_page(struct page dst, struct page src,
				4788	unsigned long addr_hint, struct vm_area_struct *vma,
				4789	unsigned int pages_per_huge_page)
				4790	{
				4791	unsigned long addr = addr_hint &
				4792	~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
				4793	struct copy_subpage_arg arg = {
				4794	.dst = dst,
				4795	.src = src,
				4796	.vma = vma,
				4797	};
				4798
				4799	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
				4800	copy_user_gigantic_page(dst, src, addr, vma,
				4801	pages_per_huge_page);
				4802	return;
				4803	}
				4804
				4805	process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg);
				4806	}
				4807
				4808	long copy_huge_page_from_user(struct page *dst_page,
				4809	const void __user *usr_src,
				4810	unsigned int pages_per_huge_page,
				4811	bool allow_pagefault)
				4812	{
				4813	void src = (void )usr_src;
				4814	void *page_kaddr;
				4815	unsigned long i, rc = 0;
				4816	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
				4817	struct page *subpage = dst_page;
				4818
				4819	for (i = 0; i < pages_per_huge_page;
				4820	i++, subpage = mem_map_next(subpage, dst_page, i)) {
				4821	if (allow_pagefault)
				4822	page_kaddr = kmap(subpage);
				4823	else
				4824	page_kaddr = kmap_atomic(subpage);
				4825	rc = copy_from_user(page_kaddr,
				4826	(const void __user )(src + i PAGE_SIZE),
				4827	PAGE_SIZE);
				4828	if (allow_pagefault)
				4829	kunmap(subpage);
				4830	else
				4831	kunmap_atomic(page_kaddr);
				4832
				4833	ret_val -= (PAGE_SIZE - rc);
				4834	if (rc)
				4835	break;
				4836
				4837	flush_dcache_page(subpage);
				4838
				4839	cond_resched();
				4840	}
				4841	return ret_val;
				4842	}
				4843	#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
				4844
				4845	#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
				4846
				4847	static struct kmem_cache *page_ptl_cachep;
				4848
				4849	void __init ptlock_cache_init(void)
				4850	{
				4851	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
				4852	SLAB_PANIC, NULL);
				4853	}
				4854
				4855	bool ptlock_alloc(struct page *page)
				4856	{
				4857	spinlock_t *ptl;
				4858
				4859	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
				4860	if (!ptl)
				4861	return false;
				4862	page->ptl = ptl;
				4863	return true;
				4864	}
				4865
				4866	void ptlock_free(struct page *page)
				4867	{
				4868	kmem_cache_free(page_ptl_cachep, page->ptl);
				4869	}
				4870	#endif