Blame - marvell/linux/mm/rmap.c - T108

blob: c64da910bb7311d75d78183ea64c9f1db6c5cebf [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* mm/rmap.c - physical to virtual reverse mappings
				3	*
				4	* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
				5	* Released under the General Public License (GPL).
				6	*
				7	* Simple, low overhead reverse mapping scheme.
				8	* Please try to keep this thing as modular as possible.
				9	*
				10	* Provides methods for unmapping each kind of mapped page:
				11	* the anon methods track anonymous pages, and
				12	* the file methods track pages belonging to an inode.
				13	*
				14	* Original design by Rik van Riel <riel@conectiva.com.br> 2001
				15	* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
				16	* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
				17	* Contributions by Hugh Dickins 2003, 2004
				18	*/
				19
				20	/*
				21	* Lock ordering in mm:
				22	*
				23	* inode->i_mutex (while writing or truncating, not reading or faulting)
				24	* mm->mmap_sem
				25	* page->flags PG_locked (lock_page)
				26	* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
				27	* mapping->i_mmap_rwsem
				28	* anon_vma->rwsem
				29	* mm->page_table_lock or pte_lock
				30	* pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
				31	* swap_lock (in swap_duplicate, swap_info_get)
				32	* mmlist_lock (in mmput, drain_mmlist and others)
				33	* mapping->private_lock (in __set_page_dirty_buffers)
				34	* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
				35	* i_pages lock (widely used)
				36	* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
				37	* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
				38	* sb_lock (within inode_lock in fs/fs-writeback.c)
				39	* i_pages lock (widely used, in set_page_dirty,
				40	* in arch-dependent flush_dcache_mmap_lock,
				41	* within bdi.wb->list_lock in __sync_single_inode)
				42	*
				43	* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
				44	* ->tasklist_lock
				45	* pte map lock
				46	*/
				47
				48	#include <linux/mm.h>
				49	#include <linux/sched/mm.h>
				50	#include <linux/sched/task.h>
				51	#include <linux/pagemap.h>
				52	#include <linux/swap.h>
				53	#include <linux/swapops.h>
				54	#include <linux/slab.h>
				55	#include <linux/init.h>
				56	#include <linux/ksm.h>
				57	#include <linux/rmap.h>
				58	#include <linux/rcupdate.h>
				59	#include <linux/export.h>
				60	#include <linux/memcontrol.h>
				61	#include <linux/mmu_notifier.h>
				62	#include <linux/migrate.h>
				63	#include <linux/hugetlb.h>
				64	#include <linux/huge_mm.h>
				65	#include <linux/backing-dev.h>
				66	#include <linux/page_idle.h>
				67	#include <linux/memremap.h>
				68	#include <linux/userfaultfd_k.h>
				69
				70	#include <asm/tlbflush.h>
				71
				72	#include <trace/events/tlb.h>
				73
				74	#include "internal.h"
				75
				76	static struct kmem_cache *anon_vma_cachep;
				77	static struct kmem_cache *anon_vma_chain_cachep;
				78
				79	static inline struct anon_vma *anon_vma_alloc(void)
				80	{
				81	struct anon_vma *anon_vma;
				82
				83	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
				84	if (anon_vma) {
				85	atomic_set(&anon_vma->refcount, 1);
				86	anon_vma->num_children = 0;
				87	anon_vma->num_active_vmas = 0;
				88	anon_vma->parent = anon_vma;
				89	/*
				90	* Initialise the anon_vma root to point to itself. If called
				91	* from fork, the root will be reset to the parents anon_vma.
				92	*/
				93	anon_vma->root = anon_vma;
				94	}
				95
				96	return anon_vma;
				97	}
				98
				99	static inline void anon_vma_free(struct anon_vma *anon_vma)
				100	{
				101	VM_BUG_ON(atomic_read(&anon_vma->refcount));
				102
				103	/*
				104	* Synchronize against page_lock_anon_vma_read() such that
				105	* we can safely hold the lock without the anon_vma getting
				106	* freed.
				107	*
				108	* Relies on the full mb implied by the atomic_dec_and_test() from
				109	* put_anon_vma() against the acquire barrier implied by
				110	* down_read_trylock() from page_lock_anon_vma_read(). This orders:
				111	*
				112	* page_lock_anon_vma_read() VS put_anon_vma()
				113	* down_read_trylock() atomic_dec_and_test()
				114	* LOCK MB
				115	* atomic_read() rwsem_is_locked()
				116	*
				117	* LOCK should suffice since the actual taking of the lock must
				118	* happen _before_ what follows.
				119	*/
				120	might_sleep();
				121	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
				122	anon_vma_lock_write(anon_vma);
				123	anon_vma_unlock_write(anon_vma);
				124	}
				125
				126	kmem_cache_free(anon_vma_cachep, anon_vma);
				127	}
				128
				129	static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
				130	{
				131	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
				132	}
				133
				134	static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
				135	{
				136	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
				137	}
				138
				139	static void anon_vma_chain_link(struct vm_area_struct *vma,
				140	struct anon_vma_chain *avc,
				141	struct anon_vma *anon_vma)
				142	{
				143	avc->vma = vma;
				144	avc->anon_vma = anon_vma;
				145	list_add(&avc->same_vma, &vma->anon_vma_chain);
				146	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
				147	}
				148
				149	/**
				150	* __anon_vma_prepare - attach an anon_vma to a memory region
				151	* @vma: the memory region in question
				152	*
				153	* This makes sure the memory mapping described by 'vma' has
				154	* an 'anon_vma' attached to it, so that we can associate the
				155	* anonymous pages mapped into it with that anon_vma.
				156	*
				157	* The common case will be that we already have one, which
				158	* is handled inline by anon_vma_prepare(). But if
				159	* not we either need to find an adjacent mapping that we
				160	* can re-use the anon_vma from (very common when the only
				161	* reason for splitting a vma has been mprotect()), or we
				162	* allocate a new one.
				163	*
				164	* Anon-vma allocations are very subtle, because we may have
				165	* optimistically looked up an anon_vma in page_lock_anon_vma_read()
				166	* and that may actually touch the spinlock even in the newly
				167	* allocated vma (it depends on RCU to make sure that the
				168	* anon_vma isn't actually destroyed).
				169	*
				170	* As a result, we need to do proper anon_vma locking even
				171	* for the new allocation. At the same time, we do not want
				172	* to do any locking for the common case of already having
				173	* an anon_vma.
				174	*
				175	* This must be called with the mmap_sem held for reading.
				176	*/
				177	int __anon_vma_prepare(struct vm_area_struct *vma)
				178	{
				179	struct mm_struct *mm = vma->vm_mm;
				180	struct anon_vma anon_vma, allocated;
				181	struct anon_vma_chain *avc;
				182
				183	might_sleep();
				184
				185	avc = anon_vma_chain_alloc(GFP_KERNEL);
				186	if (!avc)
				187	goto out_enomem;
				188
				189	anon_vma = find_mergeable_anon_vma(vma);
				190	allocated = NULL;
				191	if (!anon_vma) {
				192	anon_vma = anon_vma_alloc();
				193	if (unlikely(!anon_vma))
				194	goto out_enomem_free_avc;
				195	anon_vma->num_children++; /* self-parent link for new root */
				196	allocated = anon_vma;
				197	}
				198
				199	anon_vma_lock_write(anon_vma);
				200	/* page_table_lock to protect against threads */
				201	spin_lock(&mm->page_table_lock);
				202	if (likely(!vma->anon_vma)) {
				203	vma->anon_vma = anon_vma;
				204	anon_vma_chain_link(vma, avc, anon_vma);
				205	anon_vma->num_active_vmas++;
				206	allocated = NULL;
				207	avc = NULL;
				208	}
				209	spin_unlock(&mm->page_table_lock);
				210	anon_vma_unlock_write(anon_vma);
				211
				212	if (unlikely(allocated))
				213	put_anon_vma(allocated);
				214	if (unlikely(avc))
				215	anon_vma_chain_free(avc);
				216
				217	return 0;
				218
				219	out_enomem_free_avc:
				220	anon_vma_chain_free(avc);
				221	out_enomem:
				222	return -ENOMEM;
				223	}
				224
				225	/*
				226	* This is a useful helper function for locking the anon_vma root as
				227	* we traverse the vma->anon_vma_chain, looping over anon_vma's that
				228	* have the same vma.
				229	*
				230	* Such anon_vma's should have the same root, so you'd expect to see
				231	* just a single mutex_lock for the whole traversal.
				232	*/
				233	static inline struct anon_vma lock_anon_vma_root(struct anon_vma root, struct anon_vma *anon_vma)
				234	{
				235	struct anon_vma *new_root = anon_vma->root;
				236	if (new_root != root) {
				237	if (WARN_ON_ONCE(root))
				238	up_write(&root->rwsem);
				239	root = new_root;
				240	down_write(&root->rwsem);
				241	}
				242	return root;
				243	}
				244
				245	static inline void unlock_anon_vma_root(struct anon_vma *root)
				246	{
				247	if (root)
				248	up_write(&root->rwsem);
				249	}
				250
				251	/*
				252	* Attach the anon_vmas from src to dst.
				253	* Returns 0 on success, -ENOMEM on failure.
				254	*
				255	* If dst->anon_vma is NULL this function tries to find and reuse existing
				256	* anon_vma which has no vmas and only one child anon_vma. This prevents
				257	* degradation of anon_vma hierarchy to endless linear chain in case of
				258	* constantly forking task. On the other hand, an anon_vma with more than one
				259	* child isn't reused even if there was no alive vma, thus rmap walker has a
				260	* good chance of avoiding scanning the whole hierarchy when it searches where
				261	* page is mapped.
				262	*/
				263	int anon_vma_clone(struct vm_area_struct dst, struct vm_area_struct src)
				264	{
				265	struct anon_vma_chain avc, pavc;
				266	struct anon_vma *root = NULL;
				267
				268	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
				269	struct anon_vma *anon_vma;
				270
				271	avc = anon_vma_chain_alloc(GFP_NOWAIT \| __GFP_NOWARN);
				272	if (unlikely(!avc)) {
				273	unlock_anon_vma_root(root);
				274	root = NULL;
				275	avc = anon_vma_chain_alloc(GFP_KERNEL);
				276	if (!avc)
				277	goto enomem_failure;
				278	}
				279	anon_vma = pavc->anon_vma;
				280	root = lock_anon_vma_root(root, anon_vma);
				281	anon_vma_chain_link(dst, avc, anon_vma);
				282
				283	/*
				284	* Reuse existing anon_vma if it has no vma and only one
				285	* anon_vma child.
				286	*
				287	* Root anon_vma is never reused:
				288	* it has self-parent reference and at least one child.
				289	*/
				290	if (!dst->anon_vma &&
				291	anon_vma->num_children < 2 &&
				292	anon_vma->num_active_vmas == 0)
				293	dst->anon_vma = anon_vma;
				294	}
				295	if (dst->anon_vma)
				296	dst->anon_vma->num_active_vmas++;
				297	unlock_anon_vma_root(root);
				298	return 0;
				299
				300	enomem_failure:
				301	/*
				302	* dst->anon_vma is dropped here otherwise its degree can be incorrectly
				303	* decremented in unlink_anon_vmas().
				304	* We can safely do this because callers of anon_vma_clone() don't care
				305	* about dst->anon_vma if anon_vma_clone() failed.
				306	*/
				307	dst->anon_vma = NULL;
				308	unlink_anon_vmas(dst);
				309	return -ENOMEM;
				310	}
				311
				312	/*
				313	* Attach vma to its own anon_vma, as well as to the anon_vmas that
				314	* the corresponding VMA in the parent process is attached to.
				315	* Returns 0 on success, non-zero on failure.
				316	*/
				317	int anon_vma_fork(struct vm_area_struct vma, struct vm_area_struct pvma)
				318	{
				319	struct anon_vma_chain *avc;
				320	struct anon_vma *anon_vma;
				321	int error;
				322
				323	/* Don't bother if the parent process has no anon_vma here. */
				324	if (!pvma->anon_vma)
				325	return 0;
				326
				327	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
				328	vma->anon_vma = NULL;
				329
				330	/*
				331	* First, attach the new VMA to the parent VMA's anon_vmas,
				332	* so rmap can find non-COWed pages in child processes.
				333	*/
				334	error = anon_vma_clone(vma, pvma);
				335	if (error)
				336	return error;
				337
				338	/* An existing anon_vma has been reused, all done then. */
				339	if (vma->anon_vma)
				340	return 0;
				341
				342	/* Then add our own anon_vma. */
				343	anon_vma = anon_vma_alloc();
				344	if (!anon_vma)
				345	goto out_error;
				346	anon_vma->num_active_vmas++;
				347	avc = anon_vma_chain_alloc(GFP_KERNEL);
				348	if (!avc)
				349	goto out_error_free_anon_vma;
				350
				351	/*
				352	* The root anon_vma's spinlock is the lock actually used when we
				353	* lock any of the anon_vmas in this anon_vma tree.
				354	*/
				355	anon_vma->root = pvma->anon_vma->root;
				356	anon_vma->parent = pvma->anon_vma;
				357	/*
				358	* With refcounts, an anon_vma can stay around longer than the
				359	* process it belongs to. The root anon_vma needs to be pinned until
				360	* this anon_vma is freed, because the lock lives in the root.
				361	*/
				362	get_anon_vma(anon_vma->root);
				363	/* Mark this anon_vma as the one where our new (COWed) pages go. */
				364	vma->anon_vma = anon_vma;
				365	anon_vma_lock_write(anon_vma);
				366	anon_vma_chain_link(vma, avc, anon_vma);
				367	anon_vma->parent->num_children++;
				368	anon_vma_unlock_write(anon_vma);
				369
				370	return 0;
				371
				372	out_error_free_anon_vma:
				373	put_anon_vma(anon_vma);
				374	out_error:
				375	unlink_anon_vmas(vma);
				376	return -ENOMEM;
				377	}
				378
				379	void unlink_anon_vmas(struct vm_area_struct *vma)
				380	{
				381	struct anon_vma_chain avc, next;
				382	struct anon_vma *root = NULL;
				383
				384	/*
				385	* Unlink each anon_vma chained to the VMA. This list is ordered
				386	* from newest to oldest, ensuring the root anon_vma gets freed last.
				387	*/
				388	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
				389	struct anon_vma *anon_vma = avc->anon_vma;
				390
				391	root = lock_anon_vma_root(root, anon_vma);
				392	anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
				393
				394	/*
				395	* Leave empty anon_vmas on the list - we'll need
				396	* to free them outside the lock.
				397	*/
				398	if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
				399	anon_vma->parent->num_children--;
				400	continue;
				401	}
				402
				403	list_del(&avc->same_vma);
				404	anon_vma_chain_free(avc);
				405	}
				406	if (vma->anon_vma)
				407	vma->anon_vma->num_active_vmas--;
				408	unlock_anon_vma_root(root);
				409
				410	/*
				411	* Iterate the list once more, it now only contains empty and unlinked
				412	* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
				413	* needing to write-acquire the anon_vma->root->rwsem.
				414	*/
				415	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
				416	struct anon_vma *anon_vma = avc->anon_vma;
				417
				418	VM_WARN_ON(anon_vma->num_children);
				419	VM_WARN_ON(anon_vma->num_active_vmas);
				420	put_anon_vma(anon_vma);
				421
				422	list_del(&avc->same_vma);
				423	anon_vma_chain_free(avc);
				424	}
				425	}
				426
				427	static void anon_vma_ctor(void *data)
				428	{
				429	struct anon_vma *anon_vma = data;
				430
				431	init_rwsem(&anon_vma->rwsem);
				432	atomic_set(&anon_vma->refcount, 0);
				433	anon_vma->rb_root = RB_ROOT_CACHED;
				434	}
				435
				436	void __init anon_vma_init(void)
				437	{
				438	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
				439	0, SLAB_TYPESAFE_BY_RCU\|SLAB_PANIC\|SLAB_ACCOUNT,
				440	anon_vma_ctor);
				441	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
				442	SLAB_PANIC\|SLAB_ACCOUNT);
				443	}
				444
				445	/*
				446	* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
				447	*
				448	* Since there is no serialization what so ever against page_remove_rmap()
				449	* the best this function can do is return a locked anon_vma that might
				450	* have been relevant to this page.
				451	*
				452	* The page might have been remapped to a different anon_vma or the anon_vma
				453	* returned may already be freed (and even reused).
				454	*
				455	* In case it was remapped to a different anon_vma, the new anon_vma will be a
				456	* child of the old anon_vma, and the anon_vma lifetime rules will therefore
				457	* ensure that any anon_vma obtained from the page will still be valid for as
				458	* long as we observe page_mapped() [ hence all those page_mapped() tests ].
				459	*
				460	* All users of this function must be very careful when walking the anon_vma
				461	* chain and verify that the page in question is indeed mapped in it
				462	* [ something equivalent to page_mapped_in_vma() ].
				463	*
				464	* Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
				465	* that the anon_vma pointer from page->mapping is valid if there is a
				466	* mapcount, we can dereference the anon_vma after observing those.
				467	*/
				468	struct anon_vma page_get_anon_vma(struct page page)
				469	{
				470	struct anon_vma *anon_vma = NULL;
				471	unsigned long anon_mapping;
				472
				473	rcu_read_lock();
				474	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
				475	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
				476	goto out;
				477	if (!page_mapped(page))
				478	goto out;
				479
				480	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
				481	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
				482	anon_vma = NULL;
				483	goto out;
				484	}
				485
				486	/*
				487	* If this page is still mapped, then its anon_vma cannot have been
				488	* freed. But if it has been unmapped, we have no security against the
				489	* anon_vma structure being freed and reused (for another anon_vma:
				490	* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
				491	* above cannot corrupt).
				492	*/
				493	if (!page_mapped(page)) {
				494	rcu_read_unlock();
				495	put_anon_vma(anon_vma);
				496	return NULL;
				497	}
				498	out:
				499	rcu_read_unlock();
				500
				501	return anon_vma;
				502	}
				503
				504	/*
				505	* Similar to page_get_anon_vma() except it locks the anon_vma.
				506	*
				507	* Its a little more complex as it tries to keep the fast path to a single
				508	* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
				509	* reference like with page_get_anon_vma() and then block on the mutex.
				510	*/
				511	struct anon_vma page_lock_anon_vma_read(struct page page)
				512	{
				513	struct anon_vma *anon_vma = NULL;
				514	struct anon_vma *root_anon_vma;
				515	unsigned long anon_mapping;
				516
				517	rcu_read_lock();
				518	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
				519	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
				520	goto out;
				521	if (!page_mapped(page))
				522	goto out;
				523
				524	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
				525	root_anon_vma = READ_ONCE(anon_vma->root);
				526	if (down_read_trylock(&root_anon_vma->rwsem)) {
				527	/*
				528	* If the page is still mapped, then this anon_vma is still
				529	* its anon_vma, and holding the mutex ensures that it will
				530	* not go away, see anon_vma_free().
				531	*/
				532	if (!page_mapped(page)) {
				533	up_read(&root_anon_vma->rwsem);
				534	anon_vma = NULL;
				535	}
				536	goto out;
				537	}
				538
				539	/* trylock failed, we got to sleep */
				540	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
				541	anon_vma = NULL;
				542	goto out;
				543	}
				544
				545	if (!page_mapped(page)) {
				546	rcu_read_unlock();
				547	put_anon_vma(anon_vma);
				548	return NULL;
				549	}
				550
				551	/* we pinned the anon_vma, its safe to sleep */
				552	rcu_read_unlock();
				553	anon_vma_lock_read(anon_vma);
				554
				555	if (atomic_dec_and_test(&anon_vma->refcount)) {
				556	/*
				557	* Oops, we held the last refcount, release the lock
				558	* and bail -- can't simply use put_anon_vma() because
				559	* we'll deadlock on the anon_vma_lock_write() recursion.
				560	*/
				561	anon_vma_unlock_read(anon_vma);
				562	__put_anon_vma(anon_vma);
				563	anon_vma = NULL;
				564	}
				565
				566	return anon_vma;
				567
				568	out:
				569	rcu_read_unlock();
				570	return anon_vma;
				571	}
				572
				573	void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
				574	{
				575	anon_vma_unlock_read(anon_vma);
				576	}
				577
				578	#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
				579	/*
				580	* Flush TLB entries for recently unmapped pages from remote CPUs. It is
				581	* important if a PTE was dirty when it was unmapped that it's flushed
				582	* before any IO is initiated on the page to prevent lost writes. Similarly,
				583	* it must be flushed before freeing to prevent data leakage.
				584	*/
				585	void try_to_unmap_flush(void)
				586	{
				587	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
				588
				589	if (!tlb_ubc->flush_required)
				590	return;
				591
				592	arch_tlbbatch_flush(&tlb_ubc->arch);
				593	tlb_ubc->flush_required = false;
				594	tlb_ubc->writable = false;
				595	}
				596
				597	/* Flush iff there are potentially writable TLB entries that can race with IO */
				598	void try_to_unmap_flush_dirty(void)
				599	{
				600	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
				601
				602	if (tlb_ubc->writable)
				603	try_to_unmap_flush();
				604	}
				605
				606	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
				607	{
				608	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
				609
				610	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
				611	tlb_ubc->flush_required = true;
				612
				613	/*
				614	* Ensure compiler does not re-order the setting of tlb_flush_batched
				615	* before the PTE is cleared.
				616	*/
				617	barrier();
				618	mm->tlb_flush_batched = true;
				619
				620	/*
				621	* If the PTE was dirty then it's best to assume it's writable. The
				622	* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
				623	* before the page is queued for IO.
				624	*/
				625	if (writable)
				626	tlb_ubc->writable = true;
				627	}
				628
				629	/*
				630	* Returns true if the TLB flush should be deferred to the end of a batch of
				631	* unmap operations to reduce IPIs.
				632	*/
				633	static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
				634	{
				635	bool should_defer = false;
				636
				637	if (!(flags & TTU_BATCH_FLUSH))
				638	return false;
				639
				640	/* If remote CPUs need to be flushed then defer batch the flush */
				641	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
				642	should_defer = true;
				643	put_cpu();
				644
				645	return should_defer;
				646	}
				647
				648	/*
				649	* Reclaim unmaps pages under the PTL but do not flush the TLB prior to
				650	* releasing the PTL if TLB flushes are batched. It's possible for a parallel
				651	* operation such as mprotect or munmap to race between reclaim unmapping
				652	* the page and flushing the page. If this race occurs, it potentially allows
				653	* access to data via a stale TLB entry. Tracking all mm's that have TLB
				654	* batching in flight would be expensive during reclaim so instead track
				655	* whether TLB batching occurred in the past and if so then do a flush here
				656	* if required. This will cost one additional flush per reclaim cycle paid
				657	* by the first operation at risk such as mprotect and mumap.
				658	*
				659	* This must be called under the PTL so that an access to tlb_flush_batched
				660	* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
				661	* via the PTL.
				662	*/
				663	void flush_tlb_batched_pending(struct mm_struct *mm)
				664	{
				665	if (mm->tlb_flush_batched) {
				666	flush_tlb_mm(mm);
				667
				668	/*
				669	* Do not allow the compiler to re-order the clearing of
				670	* tlb_flush_batched before the tlb is flushed.
				671	*/
				672	barrier();
				673	mm->tlb_flush_batched = false;
				674	}
				675	}
				676	#else
				677	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
				678	{
				679	}
				680
				681	static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
				682	{
				683	return false;
				684	}
				685	#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
				686
				687	/*
				688	* At what user virtual address is page expected in vma?
				689	* Caller should check the page is actually part of the vma.
				690	*/
				691	unsigned long page_address_in_vma(struct page page, struct vm_area_struct vma)
				692	{
				693	if (PageAnon(page)) {
				694	struct anon_vma *page__anon_vma = page_anon_vma(page);
				695	/*
				696	* Note: swapoff's unuse_vma() is more efficient with this
				697	* check, and needs it to match anon_vma when KSM is active.
				698	*/
				699	if (!vma->anon_vma \|\| !page__anon_vma \|\|
				700	vma->anon_vma->root != page__anon_vma->root)
				701	return -EFAULT;
				702	} else if (!vma->vm_file) {
				703	return -EFAULT;
				704	} else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
				705	return -EFAULT;
				706	}
				707
				708	return vma_address(page, vma);
				709	}
				710
				711	pmd_t mm_find_pmd(struct mm_struct mm, unsigned long address)
				712	{
				713	pgd_t *pgd;
				714	p4d_t *p4d;
				715	pud_t *pud;
				716	pmd_t *pmd = NULL;
				717	pmd_t pmde;
				718
				719	pgd = pgd_offset(mm, address);
				720	if (!pgd_present(*pgd))
				721	goto out;
				722
				723	p4d = p4d_offset(pgd, address);
				724	if (!p4d_present(*p4d))
				725	goto out;
				726
				727	pud = pud_offset(p4d, address);
				728	if (!pud_present(*pud))
				729	goto out;
				730
				731	pmd = pmd_offset(pud, address);
				732	/*
				733	* Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
				734	* without holding anon_vma lock for write. So when looking for a
				735	* genuine pmde (in which to find pte), test present and !THP together.
				736	*/
				737	pmde = *pmd;
				738	barrier();
				739	if (!pmd_present(pmde) \|\| pmd_trans_huge(pmde))
				740	pmd = NULL;
				741	out:
				742	return pmd;
				743	}
				744
				745	struct page_referenced_arg {	/* state shared between page_referenced() and its rmap callbacks */
				746	int mapcount;	/* mappings still to visit; seeded from total_mapcount(), decremented per mapping found */
				747	int referenced;	/* incremented once per vma in which a reference was detected */
				748	unsigned long vm_flags;	/* accumulated vm_flags of referencing vmas (VM_LOCKED is also recorded) */
				749	struct mem_cgroup *memcg;	/* target memcg used to filter vmas; not filtered when NULL */
				750	};
				751	/*
				752	* arg: page_referenced_arg will be passed
				753	*/
				754	static bool page_referenced_one(struct page page, struct vm_area_struct vma,
				755	unsigned long address, void *arg)
				756	{
				757	struct page_referenced_arg *pra = arg;
				758	struct page_vma_mapped_walk pvmw = {
				759	.page = page,
				760	.vma = vma,
				761	.address = address,
				762	};
				763	int referenced = 0;
				764
				765	while (page_vma_mapped_walk(&pvmw)) {
				766	address = pvmw.address;
				767
				768	if (vma->vm_flags & VM_LOCKED) {
				769	page_vma_mapped_walk_done(&pvmw);
				770	pra->vm_flags \|= VM_LOCKED;
				771	return false; /* To break the loop */
				772	}
				773
				774	if (pvmw.pte) {
				775	if (ptep_clear_flush_young_notify(vma, address,
				776	pvmw.pte)) {
				777	/*
				778	* Don't treat a reference through
				779	* a sequentially read mapping as such.
				780	* If the page has been used in another mapping,
				781	* we will catch it; if this other mapping is
				782	* already gone, the unmap path will have set
				783	* PG_referenced or activated the page.
				784	*/
				785	if (likely(!(vma->vm_flags & VM_SEQ_READ)))
				786	referenced++;
				787	}
				788	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
				789	if (pmdp_clear_flush_young_notify(vma, address,
				790	pvmw.pmd))
				791	referenced++;
				792	} else {
				793	/* unexpected pmd-mapped page? */
				794	WARN_ON_ONCE(1);
				795	}
				796
				797	pra->mapcount--;
				798	}
				799
				800	if (referenced)
				801	clear_page_idle(page);
				802	if (test_and_clear_page_young(page))
				803	referenced++;
				804
				805	if (referenced) {
				806	pra->referenced++;
				807	pra->vm_flags \|= vma->vm_flags;
				808	}
				809
				810	if (!pra->mapcount)
				811	return false; /* To break the loop */
				812
				813	return true;
				814	}
				815
				816	static bool invalid_page_referenced_vma(struct vm_area_struct vma, void arg)
				817	{
				818	struct page_referenced_arg *pra = arg;
				819	struct mem_cgroup *memcg = pra->memcg;
				820
				821	if (!mm_match_cgroup(vma->vm_mm, memcg))
				822	return true;
				823
				824	return false;
				825	}
				826
				827	/**
				828	* page_referenced - test if the page was referenced
				829	* @page: the page to test
				830	* @is_locked: caller holds lock on the page
				831	* @memcg: target memory cgroup
				832	* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
				833	*
				834	* Quick test_and_clear_referenced for all mappings to a page,
				835	* returns the number of ptes which referenced the page.
				836	*/
				837	int page_referenced(struct page *page,
				838	int is_locked,
				839	struct mem_cgroup *memcg,
				840	unsigned long *vm_flags)
				841	{
				842	int we_locked = 0;
				843	struct page_referenced_arg pra = {
				844	.mapcount = total_mapcount(page),
				845	.memcg = memcg,
				846	};
				847	struct rmap_walk_control rwc = {
				848	.rmap_one = page_referenced_one,
				849	.arg = (void *)&pra,
				850	.anon_lock = page_lock_anon_vma_read,
				851	};
				852
				853	*vm_flags = 0;
				854	if (!pra.mapcount)
				855	return 0;
				856
				857	if (!page_rmapping(page))
				858	return 0;
				859
				860	if (!is_locked && (!PageAnon(page) \|\| PageKsm(page))) {
				861	we_locked = trylock_page(page);
				862	if (!we_locked)
				863	return 1;
				864	}
				865
				866	/*
				867	* If we are reclaiming on behalf of a cgroup, skip
				868	* counting on behalf of references from different
				869	* cgroups
				870	*/
				871	if (memcg) {
				872	rwc.invalid_vma = invalid_page_referenced_vma;
				873	}
				874
				875	rmap_walk(page, &rwc);
				876	*vm_flags = pra.vm_flags;
				877
				878	if (we_locked)
				879	unlock_page(page);
				880
				881	return pra.referenced;
				882	}
				883
				884	static bool page_mkclean_one(struct page page, struct vm_area_struct vma,
				885	unsigned long address, void *arg)
				886	{
				887	struct page_vma_mapped_walk pvmw = {
				888	.page = page,
				889	.vma = vma,
				890	.address = address,
				891	.flags = PVMW_SYNC,
				892	};
				893	struct mmu_notifier_range range;
				894	int *cleaned = arg;
				895
				896	/*
				897	* We have to assume the worse case ie pmd for invalidation. Note that
				898	* the page can not be free from this function.
				899	*/
				900	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
				901	0, vma, vma->vm_mm, address,
				902	vma_address_end(page, vma));
				903	mmu_notifier_invalidate_range_start(&range);
				904
				905	while (page_vma_mapped_walk(&pvmw)) {
				906	int ret = 0;
				907
				908	address = pvmw.address;
				909	if (pvmw.pte) {
				910	pte_t entry;
				911	pte_t *pte = pvmw.pte;
				912
				913	if (!pte_dirty(pte) && !pte_write(pte))
				914	continue;
				915
				916	flush_cache_page(vma, address, pte_pfn(*pte));
				917	entry = ptep_clear_flush(vma, address, pte);
				918	entry = pte_wrprotect(entry);
				919	entry = pte_mkclean(entry);
				920	set_pte_at(vma->vm_mm, address, pte, entry);
				921	ret = 1;
				922	} else {
				923	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				924	pmd_t *pmd = pvmw.pmd;
				925	pmd_t entry;
				926
				927	if (!pmd_dirty(pmd) && !pmd_write(pmd))
				928	continue;
				929
				930	flush_cache_page(vma, address, page_to_pfn(page));
				931	entry = pmdp_invalidate(vma, address, pmd);
				932	entry = pmd_wrprotect(entry);
				933	entry = pmd_mkclean(entry);
				934	set_pmd_at(vma->vm_mm, address, pmd, entry);
				935	ret = 1;
				936	#else
				937	/* unexpected pmd-mapped page? */
				938	WARN_ON_ONCE(1);
				939	#endif
				940	}
				941
				942	/*
				943	* No need to call mmu_notifier_invalidate_range() as we are
				944	* downgrading page table protection not changing it to point
				945	* to a new page.
				946	*
				947	* See Documentation/vm/mmu_notifier.rst
				948	*/
				949	if (ret)
				950	(*cleaned)++;
				951	}
				952
				953	mmu_notifier_invalidate_range_end(&range);
				954
				955	return true;
				956	}
				957
				958	static bool invalid_mkclean_vma(struct vm_area_struct vma, void arg)
				959	{
				960	if (vma->vm_flags & VM_SHARED)
				961	return false;
				962
				963	return true;
				964	}
				965
				966	int page_mkclean(struct page *page)
				967	{
				968	int cleaned = 0;
				969	struct address_space *mapping;
				970	struct rmap_walk_control rwc = {
				971	.arg = (void *)&cleaned,
				972	.rmap_one = page_mkclean_one,
				973	.invalid_vma = invalid_mkclean_vma,
				974	};
				975
				976	BUG_ON(!PageLocked(page));
				977
				978	if (!page_mapped(page))
				979	return 0;
				980
				981	mapping = page_mapping(page);
				982	if (!mapping)
				983	return 0;
				984
				985	rmap_walk(page, &rwc);
				986
				987	return cleaned;
				988	}
				989	EXPORT_SYMBOL_GPL(page_mkclean);
				990
				991	/**
				992	* page_move_anon_rmap - move a page to our anon_vma
				993	* @page: the page to move to our anon_vma
				994	* @vma: the vma the page belongs to
				995	*
				996	* When a page belongs exclusively to one process after a COW event,
				997	* that page can be moved into the anon_vma that belongs to just that
				998	* process, so the rmap code will not search the parent or sibling
				999	* processes.
				1000	*/
				1001	void page_move_anon_rmap(struct page page, struct vm_area_struct vma)
				1002	{
				1003	struct anon_vma *anon_vma = vma->anon_vma;
				1004
				1005	page = compound_head(page);
				1006
				1007	VM_BUG_ON_PAGE(!PageLocked(page), page);
				1008	VM_BUG_ON_VMA(!anon_vma, vma);
				1009
				1010	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
				1011	/*
				1012	* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
				1013	* simultaneously, so a concurrent reader (eg page_referenced()'s
				1014	* PageAnon()) will not see one without the other.
				1015	*/
				1016	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
				1017	}
				1018
				1019	/**
				1020	* __page_set_anon_rmap - set up new anonymous rmap
				1021	* @page: Page or Hugepage to add to rmap
				1022	* @vma: VM area to add page to.
				1023	* @address: User virtual address of the mapping
				1024	* @exclusive: the page is exclusively owned by the current process
				1025	*/
				1026	static void __page_set_anon_rmap(struct page *page,
				1027	struct vm_area_struct *vma, unsigned long address, int exclusive)
				1028	{
				1029	struct anon_vma *anon_vma = vma->anon_vma;
				1030
				1031	BUG_ON(!anon_vma);
				1032
				1033	if (PageAnon(page))
				1034	return;
				1035
				1036	/*
				1037	* If the page isn't exclusively mapped into this vma,
				1038	* we must use the _oldest_ possible anon_vma for the
				1039	* page mapping!
				1040	*/
				1041	if (!exclusive)
				1042	anon_vma = anon_vma->root;
				1043
				1044	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
				1045	page->mapping = (struct address_space *) anon_vma;
				1046	page->index = linear_page_index(vma, address);
				1047	}
				1048
				/**
				 * __page_check_anon_rmap - sanity check anonymous rmap addition
				 * @page:	the page to add the mapping to
				 * @vma:	the vm area in which the mapping is added
				 * @address:	the user virtual address mapped
				 *
				 * Compiles to nothing unless CONFIG_DEBUG_VM is set.
				 */
				static void __page_check_anon_rmap(struct page *page,
					struct vm_area_struct *vma, unsigned long address)
				{
				#ifdef CONFIG_DEBUG_VM
					struct anon_vma *page_root = page_anon_vma(page)->root;

					/*
					 * By now the page's anon-rmap details (mapping and index) are
					 * guaranteed to be set up correctly.
					 *
					 * page_add_anon_rmap is excluded because its caller always holds
					 * the page locked, except when called from page_dup_rmap, where
					 * the page is already known to be set up.
					 *
					 * page_add_new_anon_rmap is excluded because such pages are
					 * initially only visible via the pagetables, and the pte is locked
					 * over the call to page_add_new_anon_rmap.
					 */
					BUG_ON(page_root != vma->anon_vma->root);
					BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
				#endif
				}
				1075
				1076	/**
				1077	* page_add_anon_rmap - add pte mapping to an anonymous page
				1078	* @page: the page to add the mapping to
				1079	* @vma: the vm area in which the mapping is added
				1080	* @address: the user virtual address mapped
				1081	* @compound: charge the page as compound or small page
				1082	*
				1083	* The caller needs to hold the pte lock, and the page must be locked in
				1084	* the anon_vma case: to serialize mapping,index checking after setting,
				1085	* and to ensure that PageAnon is not being upgraded racily to PageKsm
				1086	* (but PageKsm is never downgraded to PageAnon).
				1087	*/
				1088	void page_add_anon_rmap(struct page *page,
				1089	struct vm_area_struct *vma, unsigned long address, bool compound)
				1090	{
				1091	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
				1092	}
				1093
				1094	/*
				1095	* Special version of the above for do_swap_page, which often runs
				1096	* into pages that are exclusively owned by the current process.
				1097	* Everybody else should continue to use page_add_anon_rmap above.
				1098	*/
				1099	void do_page_add_anon_rmap(struct page *page,
				1100	struct vm_area_struct *vma, unsigned long address, int flags)
				1101	{
				1102	bool compound = flags & RMAP_COMPOUND;
				1103	bool first;
				1104
				1105	if (compound) {
				1106	atomic_t *mapcount;
				1107	VM_BUG_ON_PAGE(!PageLocked(page), page);
				1108	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
				1109	mapcount = compound_mapcount_ptr(page);
				1110	first = atomic_inc_and_test(mapcount);
				1111	} else {
				1112	first = atomic_inc_and_test(&page->_mapcount);
				1113	}
				1114
				1115	if (first) {
				1116	int nr = compound ? hpage_nr_pages(page) : 1;
				1117	/*
				1118	* We use the irq-unsafe __{inc\|mod}_zone_page_stat because
				1119	* these counters are not modified in interrupt context, and
				1120	* pte lock(a spinlock) is held, which implies preemption
				1121	* disabled.
				1122	*/
				1123	if (compound)
				1124	__inc_node_page_state(page, NR_ANON_THPS);
				1125	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
				1126	}
				1127	if (unlikely(PageKsm(page)))
				1128	return;
				1129
				1130	VM_BUG_ON_PAGE(!PageLocked(page), page);
				1131
				1132	/* address might be in next vma when migration races vma_adjust */
				1133	if (first)
				1134	__page_set_anon_rmap(page, vma, address,
				1135	flags & RMAP_EXCLUSIVE);
				1136	else
				1137	__page_check_anon_rmap(page, vma, address);
				1138	}
				1139
				1140	/**
				1141	* page_add_new_anon_rmap - add pte mapping to a new anonymous page
				1142	* @page: the page to add the mapping to
				1143	* @vma: the vm area in which the mapping is added
				1144	* @address: the user virtual address mapped
				1145	* @compound: charge the page as compound or small page
				1146	*
				1147	* Same as page_add_anon_rmap but must only be called on new pages.
				1148	* This means the inc-and-test can be bypassed.
				1149	* Page does not have to be locked.
				1150	*/
				1151	void page_add_new_anon_rmap(struct page *page,
				1152	struct vm_area_struct *vma, unsigned long address, bool compound)
				1153	{
				1154	int nr = compound ? hpage_nr_pages(page) : 1;
				1155
				1156	VM_BUG_ON_VMA(address < vma->vm_start \|\| address >= vma->vm_end, vma);
				1157	__SetPageSwapBacked(page);
				1158	if (compound) {
				1159	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
				1160	/* increment count (starts at -1) */
				1161	atomic_set(compound_mapcount_ptr(page), 0);
				1162	__inc_node_page_state(page, NR_ANON_THPS);
				1163	} else {
				1164	/* Anon THP always mapped first with PMD */
				1165	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				1166	/* increment count (starts at -1) */
				1167	atomic_set(&page->_mapcount, 0);
				1168	}
				1169	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
				1170	__page_set_anon_rmap(page, vma, address, 1);
				1171	}
				1172
				1173	/**
				1174	* page_add_file_rmap - add pte mapping to a file page
				1175	* @page: the page to add the mapping to
				1176	* @compound: charge the page as compound or small page
				1177	*
				1178	* The caller needs to hold the pte lock.
				1179	*/
				1180	void page_add_file_rmap(struct page *page, bool compound)
				1181	{
				1182	int i, nr = 1;
				1183
				1184	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
				1185	lock_page_memcg(page);
				1186	if (compound && PageTransHuge(page)) {
				1187	for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
				1188	if (atomic_inc_and_test(&page[i]._mapcount))
				1189	nr++;
				1190	}
				1191	if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
				1192	goto out;
				1193	if (PageSwapBacked(page))
				1194	__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
				1195	else
				1196	__inc_node_page_state(page, NR_FILE_PMDMAPPED);
				1197	} else {
				1198	if (PageTransCompound(page) && page_mapping(page)) {
				1199	VM_WARN_ON_ONCE(!PageLocked(page));
				1200
				1201	SetPageDoubleMap(compound_head(page));
				1202	if (PageMlocked(page))
				1203	clear_page_mlock(compound_head(page));
				1204	}
				1205	if (!atomic_inc_and_test(&page->_mapcount))
				1206	goto out;
				1207	}
				1208	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
				1209	out:
				1210	unlock_page_memcg(page);
				1211	}
				1212
				1213	static void page_remove_file_rmap(struct page *page, bool compound)
				1214	{
				1215	int i, nr = 1;
				1216
				1217	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
				1218	lock_page_memcg(page);
				1219
				1220	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
				1221	if (unlikely(PageHuge(page))) {
				1222	/* hugetlb pages are always mapped with pmds */
				1223	atomic_dec(compound_mapcount_ptr(page));
				1224	goto out;
				1225	}
				1226
				1227	/* page still mapped by someone else? */
				1228	if (compound && PageTransHuge(page)) {
				1229	for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
				1230	if (atomic_add_negative(-1, &page[i]._mapcount))
				1231	nr++;
				1232	}
				1233	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
				1234	goto out;
				1235	if (PageSwapBacked(page))
				1236	__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
				1237	else
				1238	__dec_node_page_state(page, NR_FILE_PMDMAPPED);
				1239	} else {
				1240	if (!atomic_add_negative(-1, &page->_mapcount))
				1241	goto out;
				1242	}
				1243
				1244	/*
				1245	* We use the irq-unsafe __{inc\|mod}_lruvec_page_state because
				1246	* these counters are not modified in interrupt context, and
				1247	* pte lock(a spinlock) is held, which implies preemption disabled.
				1248	*/
				1249	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
				1250
				1251	if (unlikely(PageMlocked(page)))
				1252	clear_page_mlock(page);
				1253	out:
				1254	unlock_page_memcg(page);
				1255	}
				1256
				1257	static void page_remove_anon_compound_rmap(struct page *page)
				1258	{
				1259	int i, nr;
				1260
				1261	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
				1262	return;
				1263
				1264	/* Hugepages are not counted in NR_ANON_PAGES for now. */
				1265	if (unlikely(PageHuge(page)))
				1266	return;
				1267
				1268	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
				1269	return;
				1270
				1271	__dec_node_page_state(page, NR_ANON_THPS);
				1272
				1273	if (TestClearPageDoubleMap(page)) {
				1274	/*
				1275	* Subpages can be mapped with PTEs too. Check how many of
				1276	* themi are still mapped.
				1277	*/
				1278	for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
				1279	if (atomic_add_negative(-1, &page[i]._mapcount))
				1280	nr++;
				1281	}
				1282	} else {
				1283	nr = HPAGE_PMD_NR;
				1284	}
				1285
				1286	if (unlikely(PageMlocked(page)))
				1287	clear_page_mlock(page);
				1288
				1289	if (nr) {
				1290	__mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
				1291	deferred_split_huge_page(page);
				1292	}
				1293	}
				1294
				1295	/**
				1296	* page_remove_rmap - take down pte mapping from a page
				1297	* @page: page to remove mapping from
				1298	* @compound: uncharge the page as compound or small page
				1299	*
				1300	* The caller needs to hold the pte lock.
				1301	*/
				1302	void page_remove_rmap(struct page *page, bool compound)
				1303	{
				1304	if (!PageAnon(page))
				1305	return page_remove_file_rmap(page, compound);
				1306
				1307	if (compound)
				1308	return page_remove_anon_compound_rmap(page);
				1309
				1310	/* page still mapped by someone else? */
				1311	if (!atomic_add_negative(-1, &page->_mapcount))
				1312	return;
				1313
				1314	/*
				1315	* We use the irq-unsafe __{inc\|mod}_zone_page_stat because
				1316	* these counters are not modified in interrupt context, and
				1317	* pte lock(a spinlock) is held, which implies preemption disabled.
				1318	*/
				1319	__dec_node_page_state(page, NR_ANON_MAPPED);
				1320
				1321	if (unlikely(PageMlocked(page)))
				1322	clear_page_mlock(page);
				1323
				1324	if (PageTransCompound(page))
				1325	deferred_split_huge_page(compound_head(page));
				1326
				1327	/*
				1328	* It would be tidy to reset the PageAnon mapping here,
				1329	* but that might overwrite a racing page_add_anon_rmap
				1330	* which increments mapcount after us but sets mapping
				1331	* before us: so leave the reset to free_unref_page,
				1332	* and remember that it's only reliable while mapped.
				1333	* Leaving it set also helps swapoff to reinstate ptes
				1334	* faster for those pages still in swapcache.
				1335	*/
				1336	}
				1337
				1338	/*
				1339	* @arg: enum ttu_flags will be passed to this argument
				1340	*/
				1341	static bool try_to_unmap_one(struct page page, struct vm_area_struct vma,
				1342	unsigned long address, void *arg)
				1343	{
				1344	struct mm_struct *mm = vma->vm_mm;
				1345	struct page_vma_mapped_walk pvmw = {
				1346	.page = page,
				1347	.vma = vma,
				1348	.address = address,
				1349	};
				1350	pte_t pteval;
				1351	struct page *subpage;
				1352	bool ret = true;
				1353	struct mmu_notifier_range range;
				1354	enum ttu_flags flags = (enum ttu_flags)arg;
				1355
				1356	/*
				1357	* When racing against e.g. zap_pte_range() on another cpu,
				1358	* in between its ptep_get_and_clear_full() and page_remove_rmap(),
				1359	* try_to_unmap() may return false when it is about to become true,
				1360	* if page table locking is skipped: use TTU_SYNC to wait for that.
				1361	*/
				1362	if (flags & TTU_SYNC)
				1363	pvmw.flags = PVMW_SYNC;
				1364
				1365	/* munlock has nothing to gain from examining un-locked vmas */
				1366	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
				1367	return true;
				1368
				1369	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
				1370	is_zone_device_page(page) && !is_device_private_page(page))
				1371	return true;
				1372
				1373	if (flags & TTU_SPLIT_HUGE_PMD) {
				1374	split_huge_pmd_address(vma, address,
				1375	flags & TTU_SPLIT_FREEZE, page);
				1376	}
				1377
				1378	/*
				1379	* For THP, we have to assume the worse case ie pmd for invalidation.
				1380	* For hugetlb, it could be much worse if we need to do pud
				1381	* invalidation in the case of pmd sharing.
				1382	*
				1383	* Note that the page can not be free in this function as call of
				1384	* try_to_unmap() must hold a reference on the page.
				1385	*/
				1386	range.end = PageKsm(page) ?
				1387	address + PAGE_SIZE : vma_address_end(page, vma);
				1388	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
				1389	address, range.end);
				1390	if (PageHuge(page)) {
				1391	/*
				1392	* If sharing is possible, start and end will be adjusted
				1393	* accordingly.
				1394	*/
				1395	adjust_range_if_pmd_sharing_possible(vma, &range.start,
				1396	&range.end);
				1397	}
				1398	mmu_notifier_invalidate_range_start(&range);
				1399
				1400	while (page_vma_mapped_walk(&pvmw)) {
				1401	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				1402	/* PMD-mapped THP migration entry */
				1403	if (!pvmw.pte && (flags & TTU_MIGRATION)) {
				1404	VM_BUG_ON_PAGE(PageHuge(page) \|\| !PageTransCompound(page), page);
				1405
				1406	set_pmd_migration_entry(&pvmw, page);
				1407	continue;
				1408	}
				1409	#endif
				1410
				1411	/*
				1412	* If the page is mlock()d, we cannot swap it out.
				1413	* If it's recently referenced (perhaps page_referenced
				1414	* skipped over this mm) then we should reactivate it.
				1415	*/
				1416	if (!(flags & TTU_IGNORE_MLOCK)) {
				1417	if (vma->vm_flags & VM_LOCKED) {
				1418	/* PTE-mapped THP are never mlocked */
				1419	if (!PageTransCompound(page)) {
				1420	/*
				1421	* Holding pte lock, we do not need
				1422	* mmap_sem here
				1423	*/
				1424	mlock_vma_page(page);
				1425	}
				1426	ret = false;
				1427	page_vma_mapped_walk_done(&pvmw);
				1428	break;
				1429	}
				1430	if (flags & TTU_MUNLOCK)
				1431	continue;
				1432	}
				1433
				1434	/* Unexpected PMD-mapped THP? */
				1435	VM_BUG_ON_PAGE(!pvmw.pte, page);
				1436
				1437	subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
				1438	address = pvmw.address;
				1439
				1440	if (PageHuge(page)) {
				1441	if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
				1442	/*
				1443	* huge_pmd_unshare unmapped an entire PMD
				1444	* page. There is no way of knowing exactly
				1445	* which PMDs may be cached for this mm, so
				1446	* we must flush them all. start/end were
				1447	* already adjusted above to cover this range.
				1448	*/
				1449	flush_cache_range(vma, range.start, range.end);
				1450	flush_tlb_range(vma, range.start, range.end);
				1451	mmu_notifier_invalidate_range(mm, range.start,
				1452	range.end);
				1453
				1454	/*
				1455	* The ref count of the PMD page was dropped
				1456	* which is part of the way map counting
				1457	* is done for shared PMDs. Return 'true'
				1458	* here. When there is no other sharing,
				1459	* huge_pmd_unshare returns false and we will
				1460	* unmap the actual page and drop map count
				1461	* to zero.
				1462	*/
				1463	page_vma_mapped_walk_done(&pvmw);
				1464	break;
				1465	}
				1466	}
				1467
				1468	if (IS_ENABLED(CONFIG_MIGRATION) &&
				1469	(flags & TTU_MIGRATION) &&
				1470	is_zone_device_page(page)) {
				1471	swp_entry_t entry;
				1472	pte_t swp_pte;
				1473
				1474	pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
				1475
				1476	/*
				1477	* Store the pfn of the page in a special migration
				1478	* pte. do_swap_page() will wait until the migration
				1479	* pte is removed and then restart fault handling.
				1480	*/
				1481	entry = make_migration_entry(page, 0);
				1482	swp_pte = swp_entry_to_pte(entry);
				1483	if (pte_soft_dirty(pteval))
				1484	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				1485	set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
				1486	/*
				1487	* No need to invalidate here it will synchronize on
				1488	* against the special swap migration pte.
				1489	*
				1490	* The assignment to subpage above was computed from a
				1491	* swap PTE which results in an invalid pointer.
				1492	* Since only PAGE_SIZE pages can currently be
				1493	* migrated, just set it to page. This will need to be
				1494	* changed when hugepage migrations to device private
				1495	* memory are supported.
				1496	*/
				1497	subpage = page;
				1498	goto discard;
				1499	}
				1500
				1501	if (!(flags & TTU_IGNORE_ACCESS)) {
				1502	if (ptep_clear_flush_young_notify(vma, address,
				1503	pvmw.pte)) {
				1504	ret = false;
				1505	page_vma_mapped_walk_done(&pvmw);
				1506	break;
				1507	}
				1508	}
				1509
				1510	/* Nuke the page table entry. */
				1511	flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
				1512	if (should_defer_flush(mm, flags)) {
				1513	/*
				1514	* We clear the PTE but do not flush so potentially
				1515	* a remote CPU could still be writing to the page.
				1516	* If the entry was previously clean then the
				1517	* architecture must guarantee that a clear->dirty
				1518	* transition on a cached TLB entry is written through
				1519	* and traps if the PTE is unmapped.
				1520	*/
				1521	pteval = ptep_get_and_clear(mm, address, pvmw.pte);
				1522
				1523	set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
				1524	} else {
				1525	pteval = ptep_clear_flush(vma, address, pvmw.pte);
				1526	}
				1527
				1528	/* Move the dirty bit to the page. Now the pte is gone. */
				1529	if (pte_dirty(pteval))
				1530	set_page_dirty(page);
				1531
				1532	/* Update high watermark before we lower rss */
				1533	update_hiwater_rss(mm);
				1534
				1535	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
				1536	pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
				1537	if (PageHuge(page)) {
				1538	hugetlb_count_sub(compound_nr(page), mm);
				1539	set_huge_swap_pte_at(mm, address,
				1540	pvmw.pte, pteval,
				1541	vma_mmu_pagesize(vma));
				1542	} else {
				1543	dec_mm_counter(mm, mm_counter(page));
				1544	set_pte_at(mm, address, pvmw.pte, pteval);
				1545	}
				1546
				1547	} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
				1548	/*
				1549	* The guest indicated that the page content is of no
				1550	* interest anymore. Simply discard the pte, vmscan
				1551	* will take care of the rest.
				1552	* A future reference will then fault in a new zero
				1553	* page. When userfaultfd is active, we must not drop
				1554	* this page though, as its main user (postcopy
				1555	* migration) will not expect userfaults on already
				1556	* copied pages.
				1557	*/
				1558	dec_mm_counter(mm, mm_counter(page));
				1559	/* We have to invalidate as we cleared the pte */
				1560	mmu_notifier_invalidate_range(mm, address,
				1561	address + PAGE_SIZE);
				1562	} else if (IS_ENABLED(CONFIG_MIGRATION) &&
				1563	(flags & (TTU_MIGRATION\|TTU_SPLIT_FREEZE))) {
				1564	swp_entry_t entry;
				1565	pte_t swp_pte;
				1566
				1567	if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				1568	set_pte_at(mm, address, pvmw.pte, pteval);
				1569	ret = false;
				1570	page_vma_mapped_walk_done(&pvmw);
				1571	break;
				1572	}
				1573
				1574	/*
				1575	* Store the pfn of the page in a special migration
				1576	* pte. do_swap_page() will wait until the migration
				1577	* pte is removed and then restart fault handling.
				1578	*/
				1579	entry = make_migration_entry(subpage,
				1580	pte_write(pteval));
				1581	swp_pte = swp_entry_to_pte(entry);
				1582	if (pte_soft_dirty(pteval))
				1583	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				1584	set_pte_at(mm, address, pvmw.pte, swp_pte);
				1585	/*
				1586	* No need to invalidate here it will synchronize on
				1587	* against the special swap migration pte.
				1588	*/
				1589	} else if (PageAnon(page)) {
				1590	swp_entry_t entry = { .val = page_private(subpage) };
				1591	pte_t swp_pte;
				1592	/*
				1593	* Store the swap location in the pte.
				1594	* See handle_pte_fault() ...
				1595	*/
				1596	if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
				1597	WARN_ON_ONCE(1);
				1598	ret = false;
				1599	/* We have to invalidate as we cleared the pte */
				1600	mmu_notifier_invalidate_range(mm, address,
				1601	address + PAGE_SIZE);
				1602	page_vma_mapped_walk_done(&pvmw);
				1603	break;
				1604	}
				1605
				1606	/* MADV_FREE page check */
				1607	if (!PageSwapBacked(page)) {
				1608	int ref_count, map_count;
				1609
				1610	/*
				1611	* Synchronize with gup_pte_range():
				1612	* - clear PTE; barrier; read refcount
				1613	* - inc refcount; barrier; read PTE
				1614	*/
				1615	smp_mb();
				1616
				1617	ref_count = page_ref_count(page);
				1618	map_count = page_mapcount(page);
				1619
				1620	/*
				1621	* Order reads for page refcount and dirty flag
				1622	* (see comments in __remove_mapping()).
				1623	*/
				1624	smp_rmb();
				1625
				1626	/*
				1627	* The only page refs must be one from isolation
				1628	* plus the rmap(s) (dropped by discard:).
				1629	*/
				1630	if (ref_count == 1 + map_count &&
				1631	!PageDirty(page)) {
				1632	/* Invalidate as we cleared the pte */
				1633	mmu_notifier_invalidate_range(mm,
				1634	address, address + PAGE_SIZE);
				1635	dec_mm_counter(mm, MM_ANONPAGES);
				1636	goto discard;
				1637	}
				1638
				1639	/*
				1640	* If the page was redirtied, it cannot be
				1641	* discarded. Remap the page to page table.
				1642	*/
				1643	set_pte_at(mm, address, pvmw.pte, pteval);
				1644	SetPageSwapBacked(page);
				1645	ret = false;
				1646	page_vma_mapped_walk_done(&pvmw);
				1647	break;
				1648	}
				1649
				1650	if (swap_duplicate(entry) < 0) {
				1651	set_pte_at(mm, address, pvmw.pte, pteval);
				1652	ret = false;
				1653	page_vma_mapped_walk_done(&pvmw);
				1654	break;
				1655	}
				1656	if (arch_unmap_one(mm, vma, address, pteval) < 0) {
				1657	set_pte_at(mm, address, pvmw.pte, pteval);
				1658	ret = false;
				1659	page_vma_mapped_walk_done(&pvmw);
				1660	break;
				1661	}
				1662	if (list_empty(&mm->mmlist)) {
				1663	spin_lock(&mmlist_lock);
				1664	if (list_empty(&mm->mmlist))
				1665	list_add(&mm->mmlist, &init_mm.mmlist);
				1666	spin_unlock(&mmlist_lock);
				1667	}
				1668	dec_mm_counter(mm, MM_ANONPAGES);
				1669	inc_mm_counter(mm, MM_SWAPENTS);
				1670	swp_pte = swp_entry_to_pte(entry);
				1671	if (pte_soft_dirty(pteval))
				1672	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				1673	set_pte_at(mm, address, pvmw.pte, swp_pte);
				1674	/* Invalidate as we cleared the pte */
				1675	mmu_notifier_invalidate_range(mm, address,
				1676	address + PAGE_SIZE);
				1677	} else {
				1678	/*
				1679	* This is a locked file-backed page, thus it cannot
				1680	* be removed from the page cache and replaced by a new
				1681	* page before mmu_notifier_invalidate_range_end, so no
				1682	* concurrent thread might update its page table to
				1683	* point at new page while a device still is using this
				1684	* page.
				1685	*
				1686	* See Documentation/vm/mmu_notifier.rst
				1687	*/
				1688	dec_mm_counter(mm, mm_counter_file(page));
				1689	}
				1690	discard:
				1691	/*
				1692	* No need to call mmu_notifier_invalidate_range() it has be
				1693	* done above for all cases requiring it to happen under page
				1694	* table lock before mmu_notifier_invalidate_range_end()
				1695	*
				1696	* See Documentation/vm/mmu_notifier.rst
				1697	*/
				1698	page_remove_rmap(subpage, PageHuge(page));
				1699	put_page(page);
				1700	}
				1701
				1702	mmu_notifier_invalidate_range_end(&range);
				1703
				1704	return ret;
				1705	}
				1706
				1707	bool is_vma_temporary_stack(struct vm_area_struct *vma)
				1708	{
				1709	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN \| VM_GROWSUP);
				1710
				1711	if (!maybe_stack)
				1712	return false;
				1713
				1714	if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
				1715	VM_STACK_INCOMPLETE_SETUP)
				1716	return true;
				1717
				1718	return false;
				1719	}
				1720
				1721	static bool invalid_migration_vma(struct vm_area_struct vma, void arg)
				1722	{
				1723	return is_vma_temporary_stack(vma);
				1724	}
				1725
				/*
				 * page_not_mapped - "done" predicate for rmap walks
				 * @page: the page being walked
				 *
				 * Returns 1 once page_mapped() reports the page as unmapped, 0 otherwise.
				 */
				static int page_not_mapped(struct page *page)
				{
					if (page_mapped(page))
						return 0;

					return 1;
				}
				1730
				1731	/**
				1732	* try_to_unmap - try to remove all page table mappings to a page
				1733	* @page: the page to get unmapped
				1734	* @flags: action and flags
				1735	*
				1736	* Tries to remove all the page table entries which are mapping this
				1737	* page, used in the pageout path. Caller must hold the page lock.
				1738	*
				1739	* If unmap is successful, return true. Otherwise, false.
				1740	*/
				1741	bool try_to_unmap(struct page *page, enum ttu_flags flags)
				1742	{
				1743	struct rmap_walk_control rwc = {
				1744	.rmap_one = try_to_unmap_one,
				1745	.arg = (void *)flags,
				1746	.done = page_not_mapped,
				1747	.anon_lock = page_lock_anon_vma_read,
				1748	};
				1749
				1750	/*
				1751	* During exec, a temporary VMA is setup and later moved.
				1752	* The VMA is moved under the anon_vma lock but not the
				1753	* page tables leading to a race where migration cannot
				1754	* find the migration ptes. Rather than increasing the
				1755	* locking requirements of exec(), migration skips
				1756	* temporary VMAs until after exec() completes.
				1757	*/
				1758	if ((flags & (TTU_MIGRATION\|TTU_SPLIT_FREEZE))
				1759	&& !PageKsm(page) && PageAnon(page))
				1760	rwc.invalid_vma = invalid_migration_vma;
				1761
				1762	if (flags & TTU_RMAP_LOCKED)
				1763	rmap_walk_locked(page, &rwc);
				1764	else
				1765	rmap_walk(page, &rwc);
				1766
				1767	/*
				1768	* When racing against e.g. zap_pte_range() on another cpu,
				1769	* in between its ptep_get_and_clear_full() and page_remove_rmap(),
				1770	* try_to_unmap() may return false when it is about to become true,
				1771	* if page table locking is skipped: use TTU_SYNC to wait for that.
				1772	*/
				1773	return !page_mapcount(page);
				1774	}
				1775
				1776	/**
				1777	* try_to_munlock - try to munlock a page
				1778	* @page: the page to be munlocked
				1779	*
				1780	* Called from munlock code. Checks all of the VMAs mapping the page
				1781	* to make sure nobody else has this page mlocked. The page will be
				1782	* returned with PG_mlocked cleared if no other vmas have it mlocked.
				1783	*/
				1784
				1785	void try_to_munlock(struct page *page)
				1786	{
				1787	struct rmap_walk_control rwc = {
				1788	.rmap_one = try_to_unmap_one,
				1789	.arg = (void *)TTU_MUNLOCK,
				1790	.done = page_not_mapped,
				1791	.anon_lock = page_lock_anon_vma_read,
				1792
				1793	};
				1794
				1795	VM_BUG_ON_PAGE(!PageLocked(page) \|\| PageLRU(page), page);
				1796	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
				1797
				1798	rmap_walk(page, &rwc);
				1799	}
				1800
				1801	void __put_anon_vma(struct anon_vma *anon_vma)
				1802	{
				1803	struct anon_vma *root = anon_vma->root;
				1804
				1805	anon_vma_free(anon_vma);
				1806	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
				1807	anon_vma_free(root);
				1808	}
				1809
				1810	static struct anon_vma rmap_walk_anon_lock(struct page page,
				1811	struct rmap_walk_control *rwc)
				1812	{
				1813	struct anon_vma *anon_vma;
				1814
				1815	if (rwc->anon_lock)
				1816	return rwc->anon_lock(page);
				1817
				1818	/*
				1819	* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
				1820	* because that depends on page_mapped(); but not all its usages
				1821	* are holding mmap_sem. Users without mmap_sem are required to
				1822	* take a reference count to prevent the anon_vma disappearing
				1823	*/
				1824	anon_vma = page_anon_vma(page);
				1825	if (!anon_vma)
				1826	return NULL;
				1827
				1828	anon_vma_lock_read(anon_vma);
				1829	return anon_vma;
				1830	}
				1831
				1832	/*
				1833	 * rmap_walk_anon - do something to anonymous page using the object-based
				1834	 * rmap method
				1835	 * @page: the page to be handled
				1836	 * @rwc: control variable according to each walk type
					 * @locked: true if the caller already holds the anon_vma lock; the lock is
					 *          then neither taken nor released here
				1837	 *
				1838	 * Find all the mappings of a page using the mapping pointer and the vma chains
				1839	 * contained in the anon_vma struct it points to.
				1840	 *
				1841	 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
				1842	 * where the page was found will be held for write. So, we won't recheck
				1843	 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
				1844	 * LOCKED.
				1845	 */
				1846	static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
				1847			bool locked)
				1848	{
				1849		struct anon_vma *anon_vma;
				1850		pgoff_t pgoff_start, pgoff_end;
				1851		struct anon_vma_chain *avc;
				1852
				1853		if (locked) {
				1854			anon_vma = page_anon_vma(page);
				1855			/* anon_vma disappear under us? */
				1856			VM_BUG_ON_PAGE(!anon_vma, page);
				1857		} else {
				1858			anon_vma = rmap_walk_anon_lock(page, rwc);
				1859		}
				1860		if (!anon_vma)
				1861			return;
				1862
						/* Inclusive page-offset range covered by this (possibly huge) page. */
				1863		pgoff_start = page_to_pgoff(page);
				1864		pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
				1865		anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
				1866				pgoff_start, pgoff_end) {
				1867			struct vm_area_struct *vma = avc->vma;
				1868			unsigned long address = vma_address(page, vma);
				1869
				1870			VM_BUG_ON_VMA(address == -EFAULT, vma);
				1871			cond_resched();
				1872
							/* Skip VMAs the optional ->invalid_vma filter rejects. */
				1873			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				1874				continue;
				1875
							/* Stop when ->rmap_one returns false or ->done reports true. */
				1876			if (!rwc->rmap_one(page, vma, address, rwc->arg))
				1877				break;
				1878			if (rwc->done && rwc->done(page))
				1879				break;
				1880		}
				1881
						/* Drop the lock only if it was taken in this function. */
				1882		if (!locked)
				1883			anon_vma_unlock_read(anon_vma);
				1884	}
				1885
				1886	/*
				1887	 * rmap_walk_file - do something to file page using the object-based rmap method
				1888	 * @page: the page to be handled
				1889	 * @rwc: control variable according to each walk type
					 * @locked: true if the caller already holds the i_mmap lock of the page's
					 *          mapping; i_mmap_lock_read()/i_mmap_unlock_read() are then skipped
				1890	 *
				1891	 * Find all the mappings of a page using the mapping pointer and the vma chains
				1892	 * contained in the address_space struct it points to.
				1893	 *
				1894	 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
				1895	 * where the page was found will be held for write. So, we won't recheck
				1896	 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
				1897	 * LOCKED.
				1898	 */
				1899	static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
				1900			bool locked)
				1901	{
				1902		struct address_space *mapping = page_mapping(page);
				1903		pgoff_t pgoff_start, pgoff_end;
				1904		struct vm_area_struct *vma;
				1905
				1906		/*
				1907		 * The page lock not only makes sure that page->mapping cannot
				1908		 * suddenly be NULLified by truncation, it makes sure that the
				1909		 * structure at mapping cannot be freed and reused yet,
				1910		 * so we can safely take mapping->i_mmap_rwsem.
				1911		 */
				1912		VM_BUG_ON_PAGE(!PageLocked(page), page);
				1913
				1914		if (!mapping)
				1915			return;
				1916
						/* Inclusive page-offset range covered by this (possibly huge) page. */
				1917		pgoff_start = page_to_pgoff(page);
				1918		pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
				1919		if (!locked)
				1920			i_mmap_lock_read(mapping);
				1921		vma_interval_tree_foreach(vma, &mapping->i_mmap,
				1922				pgoff_start, pgoff_end) {
				1923			unsigned long address = vma_address(page, vma);
				1924
				1925			VM_BUG_ON_VMA(address == -EFAULT, vma);
				1926			cond_resched();
				1927
							/* Skip VMAs the optional ->invalid_vma filter rejects. */
				1928			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				1929				continue;
				1930
							/* Stop when ->rmap_one returns false or ->done reports true. */
				1931			if (!rwc->rmap_one(page, vma, address, rwc->arg))
				1932				goto done;
				1933			if (rwc->done && rwc->done(page))
				1934				goto done;
				1935		}
				1936
				1937	done:
						/* Drop the lock only if it was taken in this function. */
				1938		if (!locked)
				1939			i_mmap_unlock_read(mapping);
				1940	}
				1941
					/*
					 * rmap_walk - apply @rwc to the reverse mappings of @page
					 * @page: the page to be handled
					 * @rwc: control variable according to each walk type
					 *
					 * Dispatches on the page type: KSM pages go to rmap_walk_ksm(), anonymous
					 * pages to rmap_walk_anon() and everything else to rmap_walk_file(). The
					 * anon and file walkers are called with locked == false, so they take and
					 * release the rmap lock themselves.
					 */
				1942	void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
				1943	{
				1944		if (unlikely(PageKsm(page)))
				1945			rmap_walk_ksm(page, rwc);
				1946		else if (PageAnon(page))
				1947			rmap_walk_anon(page, rwc, false);
				1948		else
				1949			rmap_walk_file(page, rwc, false);
				1950	}
				1951
				1952	/* Like rmap_walk, but caller holds relevant rmap lock */
					/*
					 * @page: the page to be handled; must not be a KSM page
					 * @rwc: control variable according to each walk type
					 *
					 * The anon and file walkers are called with locked == true, so they do not
					 * take or release the rmap lock.
					 */
				1953	void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
				1954	{
				1955		/* no ksm support for now */
				1956		VM_BUG_ON_PAGE(PageKsm(page), page);
				1957		if (PageAnon(page))
				1958			rmap_walk_anon(page, rwc, true);
				1959		else
				1960			rmap_walk_file(page, rwc, true);
				1961	}
				1962
				1963	#ifdef CONFIG_HUGETLB_PAGE
				1964	/*
				1965	* The following two functions are for anonymous (private mapped) hugepages.
				1966	* Unlike common anonymous pages, anonymous hugepages have no accounting code
				1967	* and no lru code, because we handle hugepages differently from common pages.
				1968	*/
					/*
					* hugepage_add_anon_rmap - increment the compound mapcount of @page and,
					* when atomic_inc_and_test() reports that the increment produced zero, call
					* __page_set_anon_rmap() for @vma and @address with a final argument of 0.
					*/
				1969	void hugepage_add_anon_rmap(struct page *page,
				1970	struct vm_area_struct *vma, unsigned long address)
				1971	{
				1972	struct anon_vma *anon_vma = vma->anon_vma;
				1973	int first;
				1974
					/* The page must be locked and @vma must already have an anon_vma. */
				1975	BUG_ON(!PageLocked(page));
				1976	BUG_ON(!anon_vma);
				1977	/* address might be in next vma when migration races vma_adjust */
				1978	first = atomic_inc_and_test(compound_mapcount_ptr(page));
					/* NOTE(review): "first" presumably means first mapping (count starting at -1) - confirm */
				1979	if (first)
				1980	__page_set_anon_rmap(page, vma, address, 0);
				1981	}
				1982
					/*
					 * hugepage_add_new_anon_rmap - set up the anon rmap of a hugepage
					 * @page: the hugepage
					 * @vma: the vma the page is mapped into
					 * @address: the mapping address; must lie in [vma->vm_start, vma->vm_end)
					 *
					 * The compound mapcount is set to 0 outright rather than incremented, and
					 * __page_set_anon_rmap() is called unconditionally with a final argument
					 * of 1 (hugepage_add_anon_rmap() passes 0).
					 */
				1983	void hugepage_add_new_anon_rmap(struct page *page,
				1984				struct vm_area_struct *vma, unsigned long address)
				1985	{
				1986		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
				1987		atomic_set(compound_mapcount_ptr(page), 0);
				1988		__page_set_anon_rmap(page, vma, address, 1);
				1989	}
				1990	#endif /* CONFIG_HUGETLB_PAGE */