Blame - src/kernel/linux/v4.19/mm/migrate.c - T800

blob: 724a6276797570117d87ee83f37d2d7377952717 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Memory Migration functionality - linux/mm/migrate.c
				4	*
				5	* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
				6	*
				7	* Page migration was first developed in the context of the memory hotplug
				8	* project. The main authors of the migration code are:
				9	*
				10	* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
				11	* Hirokazu Takahashi <taka@valinux.co.jp>
				12	* Dave Hansen <haveblue@us.ibm.com>
				13	* Christoph Lameter
				14	*/
				15
				16	#include <linux/migrate.h>
				17	#include <linux/export.h>
				18	#include <linux/swap.h>
				19	#include <linux/swapops.h>
				20	#include <linux/pagemap.h>
				21	#include <linux/buffer_head.h>
				22	#include <linux/mm_inline.h>
				23	#include <linux/nsproxy.h>
				24	#include <linux/pagevec.h>
				25	#include <linux/ksm.h>
				26	#include <linux/rmap.h>
				27	#include <linux/topology.h>
				28	#include <linux/cpu.h>
				29	#include <linux/cpuset.h>
				30	#include <linux/writeback.h>
				31	#include <linux/mempolicy.h>
				32	#include <linux/vmalloc.h>
				33	#include <linux/security.h>
				34	#include <linux/backing-dev.h>
				35	#include <linux/compaction.h>
				36	#include <linux/syscalls.h>
				37	#include <linux/compat.h>
				38	#include <linux/hugetlb.h>
				39	#include <linux/hugetlb_cgroup.h>
				40	#include <linux/gfp.h>
				41	#include <linux/pfn_t.h>
				42	#include <linux/memremap.h>
				43	#include <linux/userfaultfd_k.h>
				44	#include <linux/balloon_compaction.h>
				45	#include <linux/mmu_notifier.h>
				46	#include <linux/page_idle.h>
				47	#include <linux/page_owner.h>
				48	#include <linux/sched/mm.h>
				49	#include <linux/ptrace.h>
				50
				51	#include <asm/tlbflush.h>
				52
				53	#define CREATE_TRACE_POINTS
				54	#include <trace/events/migrate.h>
				55
				56	#include "internal.h"
				57
				58	/*
				59	* migrate_prep() needs to be called before we start compiling a list of pages
				60	* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
				61	* undesirable, use migrate_prep_local()
				62	*/
				63	int migrate_prep(void)
				64	{
				65	/*
				66	* Clear the LRU lists so pages can be isolated.
				67	* Note that pages may be moved off the LRU after we have
				68	* drained them. Those pages will fail to migrate like other
				69	* pages that may be busy.
				70	*/
				71	lru_add_drain_all();
				72
				73	return 0;
				74	}
				75
				76	/* Do the necessary work of migrate_prep but not if it involves other CPUs */
				77	int migrate_prep_local(void)
				78	{
				79	lru_add_drain();
				80
				81	return 0;
				82	}
				83
				84	int isolate_movable_page(struct page *page, isolate_mode_t mode)
				85	{
				86	struct address_space *mapping;
				87
				88	/*
				89	* Avoid burning cycles with pages that are yet under __free_pages(),
				90	* or just got freed under us.
				91	*
				92	* In case we 'win' a race for a movable page being freed under us and
				93	* raise its refcount preventing __free_pages() from doing its job
				94	* the put_page() at the end of this block will take care of
				95	* release this page, thus avoiding a nasty leakage.
				96	*/
				97	if (unlikely(!get_page_unless_zero(page)))
				98	goto out;
				99
				100	/*
				101	* Check PageMovable before holding a PG_lock because page's owner
				102	* assumes anybody doesn't touch PG_lock of newly allocated page
				103	* so unconditionally grapping the lock ruins page's owner side.
				104	*/
				105	if (unlikely(!__PageMovable(page)))
				106	goto out_putpage;
				107	/*
				108	* As movable pages are not isolated from LRU lists, concurrent
				109	* compaction threads can race against page migration functions
				110	* as well as race against the releasing a page.
				111	*
				112	* In order to avoid having an already isolated movable page
				113	* being (wrongly) re-isolated while it is under migration,
				114	* or to avoid attempting to isolate pages being released,
				115	* lets be sure we have the page lock
				116	* before proceeding with the movable page isolation steps.
				117	*/
				118	if (unlikely(!trylock_page(page)))
				119	goto out_putpage;
				120
				121	if (!PageMovable(page) \|\| PageIsolated(page))
				122	goto out_no_isolated;
				123
				124	mapping = page_mapping(page);
				125	VM_BUG_ON_PAGE(!mapping, page);
				126
				127	if (!mapping->a_ops->isolate_page(page, mode))
				128	goto out_no_isolated;
				129
				130	/* Driver shouldn't use PG_isolated bit of page->flags */
				131	WARN_ON_ONCE(PageIsolated(page));
				132	__SetPageIsolated(page);
				133	unlock_page(page);
				134
				135	return 0;
				136
				137	out_no_isolated:
				138	unlock_page(page);
				139	out_putpage:
				140	put_page(page);
				141	out:
				142	return -EBUSY;
				143	}
				144
				145	/* It should be called on page which is PG_movable */
				146	void putback_movable_page(struct page *page)
				147	{
				148	struct address_space *mapping;
				149
				150	VM_BUG_ON_PAGE(!PageLocked(page), page);
				151	VM_BUG_ON_PAGE(!PageMovable(page), page);
				152	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				153
				154	mapping = page_mapping(page);
				155	mapping->a_ops->putback_page(page);
				156	__ClearPageIsolated(page);
				157	}
				158
				159	/*
				160	* Put previously isolated pages back onto the appropriate lists
				161	* from where they were once taken off for compaction/migration.
				162	*
				163	* This function shall be used whenever the isolated pageset has been
				164	* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
				165	* and isolate_huge_page().
				166	*/
				167	void putback_movable_pages(struct list_head *l)
				168	{
				169	struct page *page;
				170	struct page *page2;
				171
				172	list_for_each_entry_safe(page, page2, l, lru) {
				173	if (unlikely(PageHuge(page))) {
				174	putback_active_hugepage(page);
				175	continue;
				176	}
				177	list_del(&page->lru);
				178	/*
				179	* We isolated non-lru movable page so here we can use
				180	* __PageMovable because LRU page's mapping cannot have
				181	* PAGE_MAPPING_MOVABLE.
				182	*/
				183	if (unlikely(__PageMovable(page))) {
				184	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				185	lock_page(page);
				186	if (PageMovable(page))
				187	putback_movable_page(page);
				188	else
				189	__ClearPageIsolated(page);
				190	unlock_page(page);
				191	put_page(page);
				192	} else {
				193	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				194	page_is_file_cache(page), -hpage_nr_pages(page));
				195	putback_lru_page(page);
				196	}
				197	}
				198	}
				199
				200	/*
				201	* Restore a potential migration pte to a working pte entry
				202	*/
				203	static bool remove_migration_pte(struct page page, struct vm_area_struct vma,
				204	unsigned long addr, void *old)
				205	{
				206	struct page_vma_mapped_walk pvmw = {
				207	.page = old,
				208	.vma = vma,
				209	.address = addr,
				210	.flags = PVMW_SYNC \| PVMW_MIGRATION,
				211	};
				212	struct page *new;
				213	pte_t pte;
				214	swp_entry_t entry;
				215
				216	VM_BUG_ON_PAGE(PageTail(page), page);
				217	while (page_vma_mapped_walk(&pvmw)) {
				218	if (PageKsm(page))
				219	new = page;
				220	else
				221	new = page - pvmw.page->index +
				222	linear_page_index(vma, pvmw.address);
				223
				224	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				225	/* PMD-mapped THP migration entry */
				226	if (!pvmw.pte) {
				227	VM_BUG_ON_PAGE(PageHuge(page) \|\| !PageTransCompound(page), page);
				228	remove_migration_pmd(&pvmw, new);
				229	continue;
				230	}
				231	#endif
				232
				233	get_page(new);
				234	pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
				235	if (pte_swp_soft_dirty(*pvmw.pte))
				236	pte = pte_mksoft_dirty(pte);
				237
				238	/*
				239	* Recheck VMA as permissions can change since migration started
				240	*/
				241	entry = pte_to_swp_entry(*pvmw.pte);
				242	if (is_write_migration_entry(entry))
				243	pte = maybe_mkwrite(pte, vma);
				244
				245	if (unlikely(is_zone_device_page(new))) {
				246	if (is_device_private_page(new)) {
				247	entry = make_device_private_entry(new, pte_write(pte));
				248	pte = swp_entry_to_pte(entry);
				249	} else if (is_device_public_page(new)) {
				250	pte = pte_mkdevmap(pte);
				251	}
				252	}
				253
				254	#ifdef CONFIG_HUGETLB_PAGE
				255	if (PageHuge(new)) {
				256	pte = pte_mkhuge(pte);
				257	pte = arch_make_huge_pte(pte, vma, new, 0);
				258	set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
				259	if (PageAnon(new))
				260	hugepage_add_anon_rmap(new, vma, pvmw.address);
				261	else
				262	page_dup_rmap(new, true);
				263	} else
				264	#endif
				265	{
				266	set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
				267
				268	if (PageAnon(new))
				269	page_add_anon_rmap(new, vma, pvmw.address, false);
				270	else
				271	page_add_file_rmap(new, false);
				272	}
				273	if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
				274	mlock_vma_page(new);
				275
				276	if (PageTransHuge(page) && PageMlocked(page))
				277	clear_page_mlock(page);
				278
				279	/* No need to invalidate - it was non-present before */
				280	update_mmu_cache(vma, pvmw.address, pvmw.pte);
				281	}
				282
				283	return true;
				284	}
				285
				286	/*
				287	* Get rid of all migration entries and replace them by
				288	* references to the indicated page.
				289	*/
				290	void remove_migration_ptes(struct page old, struct page new, bool locked)
				291	{
				292	struct rmap_walk_control rwc = {
				293	.rmap_one = remove_migration_pte,
				294	.arg = old,
				295	};
				296
				297	if (locked)
				298	rmap_walk_locked(new, &rwc);
				299	else
				300	rmap_walk(new, &rwc);
				301	}
				302
				303	/*
				304	* Something used the pte of a page under migration. We need to
				305	* get to the page and wait until migration is finished.
				306	* When we return from this function the fault will be retried.
				307	*/
				308	void __migration_entry_wait(struct mm_struct mm, pte_t ptep,
				309	spinlock_t *ptl)
				310	{
				311	pte_t pte;
				312	swp_entry_t entry;
				313	struct page *page;
				314
				315	spin_lock(ptl);
				316	pte = *ptep;
				317	if (!is_swap_pte(pte))
				318	goto out;
				319
				320	entry = pte_to_swp_entry(pte);
				321	if (!is_migration_entry(entry))
				322	goto out;
				323
				324	page = migration_entry_to_page(entry);
				325
				326	/*
				327	* Once radix-tree replacement of page migration started, page_count
				328	* must be zero. And, we don't want to call wait_on_page_locked()
				329	* against a page without get_page().
				330	* So, we use get_page_unless_zero(), here. Even failed, page fault
				331	* will occur again.
				332	*/
				333	if (!get_page_unless_zero(page))
				334	goto out;
				335	pte_unmap_unlock(ptep, ptl);
				336	wait_on_page_locked(page);
				337	put_page(page);
				338	return;
				339	out:
				340	pte_unmap_unlock(ptep, ptl);
				341	}
				342
				343	void migration_entry_wait(struct mm_struct mm, pmd_t pmd,
				344	unsigned long address)
				345	{
				346	spinlock_t *ptl = pte_lockptr(mm, pmd);
				347	pte_t *ptep = pte_offset_map(pmd, address);
				348	__migration_entry_wait(mm, ptep, ptl);
				349	}
				350
				351	void migration_entry_wait_huge(struct vm_area_struct *vma,
				352	struct mm_struct mm, pte_t pte)
				353	{
				354	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
				355	__migration_entry_wait(mm, pte, ptl);
				356	}
				357
				358	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				359	void pmd_migration_entry_wait(struct mm_struct mm, pmd_t pmd)
				360	{
				361	spinlock_t *ptl;
				362	struct page *page;
				363
				364	ptl = pmd_lock(mm, pmd);
				365	if (!is_pmd_migration_entry(*pmd))
				366	goto unlock;
				367	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
				368	if (!get_page_unless_zero(page))
				369	goto unlock;
				370	spin_unlock(ptl);
				371	wait_on_page_locked(page);
				372	put_page(page);
				373	return;
				374	unlock:
				375	spin_unlock(ptl);
				376	}
				377	#endif
				378
				379	#ifdef CONFIG_BLOCK
				380	/* Returns true if all buffers are successfully locked */
				381	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
				382	enum migrate_mode mode)
				383	{
				384	struct buffer_head *bh = head;
				385
				386	/* Simple case, sync compaction */
				387	if (mode != MIGRATE_ASYNC) {
				388	do {
				389	get_bh(bh);
				390	lock_buffer(bh);
				391	bh = bh->b_this_page;
				392
				393	} while (bh != head);
				394
				395	return true;
				396	}
				397
				398	/* async case, we cannot block on lock_buffer so use trylock_buffer */
				399	do {
				400	get_bh(bh);
				401	if (!trylock_buffer(bh)) {
				402	/*
				403	* We failed to lock the buffer and cannot stall in
				404	* async migration. Release the taken locks
				405	*/
				406	struct buffer_head *failed_bh = bh;
				407	put_bh(failed_bh);
				408	bh = head;
				409	while (bh != failed_bh) {
				410	unlock_buffer(bh);
				411	put_bh(bh);
				412	bh = bh->b_this_page;
				413	}
				414	return false;
				415	}
				416
				417	bh = bh->b_this_page;
				418	} while (bh != head);
				419	return true;
				420	}
				421	#else
				422	static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
				423	enum migrate_mode mode)
				424	{
				425	return true;
				426	}
				427	#endif /* CONFIG_BLOCK */
				428
				429	/*
				430	* Replace the page in the mapping.
				431	*
				432	* The number of remaining references must be:
				433	* 1 for anonymous pages without a mapping
				434	* 2 for pages with a mapping
				435	* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
				436	*/
				437	int migrate_page_move_mapping(struct address_space *mapping,
				438	struct page newpage, struct page page,
				439	struct buffer_head *head, enum migrate_mode mode,
				440	int extra_count)
				441	{
				442	struct zone oldzone, newzone;
				443	int dirty;
				444	int expected_count = 1 + extra_count;
				445	void **pslot;
				446
				447	/*
				448	* Device public or private pages have an extra refcount as they are
				449	* ZONE_DEVICE pages.
				450	*/
				451	expected_count += is_device_private_page(page);
				452	expected_count += is_device_public_page(page);
				453
				454	if (!mapping) {
				455	/* Anonymous page without mapping */
				456	if (page_count(page) != expected_count)
				457	return -EAGAIN;
				458
				459	/* No turning back from here */
				460	newpage->index = page->index;
				461	newpage->mapping = page->mapping;
				462	if (PageSwapBacked(page))
				463	__SetPageSwapBacked(newpage);
				464
				465	return MIGRATEPAGE_SUCCESS;
				466	}
				467
				468	oldzone = page_zone(page);
				469	newzone = page_zone(newpage);
				470
				471	xa_lock_irq(&mapping->i_pages);
				472
				473	pslot = radix_tree_lookup_slot(&mapping->i_pages,
				474	page_index(page));
				475
				476	expected_count += hpage_nr_pages(page) + page_has_private(page);
				477	if (page_count(page) != expected_count \|\|
				478	radix_tree_deref_slot_protected(pslot,
				479	&mapping->i_pages.xa_lock) != page) {
				480	xa_unlock_irq(&mapping->i_pages);
				481	return -EAGAIN;
				482	}
				483
				484	if (!page_ref_freeze(page, expected_count)) {
				485	xa_unlock_irq(&mapping->i_pages);
				486	return -EAGAIN;
				487	}
				488
				489	/*
				490	* In the async migration case of moving a page with buffers, lock the
				491	* buffers using trylock before the mapping is moved. If the mapping
				492	* was moved, we later failed to lock the buffers and could not move
				493	* the mapping back due to an elevated page count, we would have to
				494	* block waiting on other references to be dropped.
				495	*/
				496	if (mode == MIGRATE_ASYNC && head &&
				497	!buffer_migrate_lock_buffers(head, mode)) {
				498	page_ref_unfreeze(page, expected_count);
				499	xa_unlock_irq(&mapping->i_pages);
				500	return -EAGAIN;
				501	}
				502
				503	/*
				504	* Now we know that no one else is looking at the page:
				505	* no turning back from here.
				506	*/
				507	newpage->index = page->index;
				508	newpage->mapping = page->mapping;
				509	page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
				510	if (PageSwapBacked(page)) {
				511	__SetPageSwapBacked(newpage);
				512	if (PageSwapCache(page)) {
				513	SetPageSwapCache(newpage);
				514	set_page_private(newpage, page_private(page));
				515	}
				516	} else {
				517	VM_BUG_ON_PAGE(PageSwapCache(page), page);
				518	}
				519
				520	/* Move dirty while page refs frozen and newpage not yet exposed */
				521	dirty = PageDirty(page);
				522	if (dirty) {
				523	ClearPageDirty(page);
				524	SetPageDirty(newpage);
				525	}
				526
				527	radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
				528	if (PageTransHuge(page)) {
				529	int i;
				530	int index = page_index(page);
				531
				532	for (i = 1; i < HPAGE_PMD_NR; i++) {
				533	pslot = radix_tree_lookup_slot(&mapping->i_pages,
				534	index + i);
				535	radix_tree_replace_slot(&mapping->i_pages, pslot,
				536	newpage + i);
				537	}
				538	}
				539
				540	/*
				541	* Drop cache reference from old page by unfreezing
				542	* to one less reference.
				543	* We know this isn't the last reference.
				544	*/
				545	page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
				546
				547	xa_unlock(&mapping->i_pages);
				548	/* Leave irq disabled to prevent preemption while updating stats */
				549
				550	/*
				551	* If moved to a different zone then also account
				552	* the page for that zone. Other VM counters will be
				553	* taken care of when we establish references to the
				554	* new page and drop references to the old page.
				555	*
				556	* Note that anonymous pages are accounted for
				557	* via NR_FILE_PAGES and NR_ANON_MAPPED if they
				558	* are mapped to swap space.
				559	*/
				560	if (newzone != oldzone) {
				561	__dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
				562	__inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
				563	if (PageSwapBacked(page) && !PageSwapCache(page)) {
				564	__dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
				565	__inc_node_state(newzone->zone_pgdat, NR_SHMEM);
				566	}
				567	if (dirty && mapping_cap_account_dirty(mapping)) {
				568	__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
				569	__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
				570	__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
				571	__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
				572	}
				573	}
				574	local_irq_enable();
				575
				576	return MIGRATEPAGE_SUCCESS;
				577	}
				578	EXPORT_SYMBOL(migrate_page_move_mapping);
				579
				580	/*
				581	* The expected number of remaining references is the same as that
				582	* of migrate_page_move_mapping().
				583	*/
				584	int migrate_huge_page_move_mapping(struct address_space *mapping,
				585	struct page newpage, struct page page)
				586	{
				587	int expected_count;
				588	void **pslot;
				589
				590	xa_lock_irq(&mapping->i_pages);
				591
				592	pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
				593
				594	expected_count = 2 + page_has_private(page);
				595	if (page_count(page) != expected_count \|\|
				596	radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
				597	xa_unlock_irq(&mapping->i_pages);
				598	return -EAGAIN;
				599	}
				600
				601	if (!page_ref_freeze(page, expected_count)) {
				602	xa_unlock_irq(&mapping->i_pages);
				603	return -EAGAIN;
				604	}
				605
				606	newpage->index = page->index;
				607	newpage->mapping = page->mapping;
				608
				609	get_page(newpage);
				610
				611	radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
				612
				613	page_ref_unfreeze(page, expected_count - 1);
				614
				615	xa_unlock_irq(&mapping->i_pages);
				616
				617	return MIGRATEPAGE_SUCCESS;
				618	}
				619
				620	/*
				621	* Gigantic pages are so large that we do not guarantee that page++ pointer
				622	* arithmetic will work across the entire page. We need something more
				623	* specialized.
				624	*/
				625	static void __copy_gigantic_page(struct page dst, struct page src,
				626	int nr_pages)
				627	{
				628	int i;
				629	struct page *dst_base = dst;
				630	struct page *src_base = src;
				631
				632	for (i = 0; i < nr_pages; ) {
				633	cond_resched();
				634	copy_highpage(dst, src);
				635
				636	i++;
				637	dst = mem_map_next(dst, dst_base, i);
				638	src = mem_map_next(src, src_base, i);
				639	}
				640	}
				641
				642	static void copy_huge_page(struct page dst, struct page src)
				643	{
				644	int i;
				645	int nr_pages;
				646
				647	if (PageHuge(src)) {
				648	/* hugetlbfs page */
				649	struct hstate *h = page_hstate(src);
				650	nr_pages = pages_per_huge_page(h);
				651
				652	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
				653	__copy_gigantic_page(dst, src, nr_pages);
				654	return;
				655	}
				656	} else {
				657	/* thp page */
				658	BUG_ON(!PageTransHuge(src));
				659	nr_pages = hpage_nr_pages(src);
				660	}
				661
				662	for (i = 0; i < nr_pages; i++) {
				663	cond_resched();
				664	copy_highpage(dst + i, src + i);
				665	}
				666	}
				667
				668	/*
				669	* Copy the page to its new location
				670	*/
				671	void migrate_page_states(struct page newpage, struct page page)
				672	{
				673	int cpupid;
				674
				675	if (PageError(page))
				676	SetPageError(newpage);
				677	if (PageReferenced(page))
				678	SetPageReferenced(newpage);
				679	if (PageUptodate(page))
				680	SetPageUptodate(newpage);
				681	if (TestClearPageActive(page)) {
				682	VM_BUG_ON_PAGE(PageUnevictable(page), page);
				683	SetPageActive(newpage);
				684	} else if (TestClearPageUnevictable(page))
				685	SetPageUnevictable(newpage);
				686	if (PageWorkingset(page))
				687	SetPageWorkingset(newpage);
				688	if (PageChecked(page))
				689	SetPageChecked(newpage);
				690	if (PageMappedToDisk(page))
				691	SetPageMappedToDisk(newpage);
				692
				693	/* Move dirty on pages not done by migrate_page_move_mapping() */
				694	if (PageDirty(page))
				695	SetPageDirty(newpage);
				696
				697	if (page_is_young(page))
				698	set_page_young(newpage);
				699	if (page_is_idle(page))
				700	set_page_idle(newpage);
				701
				702	/*
				703	* Copy NUMA information to the new page, to prevent over-eager
				704	* future migrations of this same page.
				705	*/
				706	cpupid = page_cpupid_xchg_last(page, -1);
				707	page_cpupid_xchg_last(newpage, cpupid);
				708
				709	ksm_migrate_page(newpage, page);
				710	/*
				711	* Please do not reorder this without considering how mm/ksm.c's
				712	* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
				713	*/
				714	if (PageSwapCache(page))
				715	ClearPageSwapCache(page);
				716	ClearPagePrivate(page);
				717	set_page_private(page, 0);
				718
				719	/*
				720	* If any waiters have accumulated on the new page then
				721	* wake them up.
				722	*/
				723	if (PageWriteback(newpage))
				724	end_page_writeback(newpage);
				725
				726	copy_page_owner(page, newpage);
				727
				728	mem_cgroup_migrate(page, newpage);
				729	}
				730	EXPORT_SYMBOL(migrate_page_states);
				731
				732	void migrate_page_copy(struct page newpage, struct page page)
				733	{
				734	if (PageHuge(page) \|\| PageTransHuge(page))
				735	copy_huge_page(newpage, page);
				736	else
				737	copy_highpage(newpage, page);
				738
				739	migrate_page_states(newpage, page);
				740	}
				741	EXPORT_SYMBOL(migrate_page_copy);
				742
				743	/************************************************************
				744	* Migration functions
				745	***********************************************************/
				746
				747	/*
				748	* Common logic to directly migrate a single LRU page suitable for
				749	* pages that do not use PagePrivate/PagePrivate2.
				750	*
				751	* Pages are locked upon entry and exit.
				752	*/
				753	int migrate_page(struct address_space *mapping,
				754	struct page newpage, struct page page,
				755	enum migrate_mode mode)
				756	{
				757	int rc;
				758
				759	BUG_ON(PageWriteback(page)); /* Writeback must be complete */
				760
				761	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
				762
				763	if (rc != MIGRATEPAGE_SUCCESS)
				764	return rc;
				765
				766	if (mode != MIGRATE_SYNC_NO_COPY)
				767	migrate_page_copy(newpage, page);
				768	else
				769	migrate_page_states(newpage, page);
				770	return MIGRATEPAGE_SUCCESS;
				771	}
				772	EXPORT_SYMBOL(migrate_page);
				773
				774	#ifdef CONFIG_BLOCK
				775	/*
				776	* Migration function for pages with buffers. This function can only be used
				777	* if the underlying filesystem guarantees that no other references to "page"
				778	* exist.
				779	*/
				780	int buffer_migrate_page(struct address_space *mapping,
				781	struct page newpage, struct page page, enum migrate_mode mode)
				782	{
				783	struct buffer_head bh, head;
				784	int rc;
				785
				786	if (!page_has_buffers(page))
				787	return migrate_page(mapping, newpage, page, mode);
				788
				789	head = page_buffers(page);
				790
				791	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
				792
				793	if (rc != MIGRATEPAGE_SUCCESS)
				794	return rc;
				795
				796	/*
				797	* In the async case, migrate_page_move_mapping locked the buffers
				798	* with an IRQ-safe spinlock held. In the sync case, the buffers
				799	* need to be locked now
				800	*/
				801	if (mode != MIGRATE_ASYNC)
				802	BUG_ON(!buffer_migrate_lock_buffers(head, mode));
				803
				804	ClearPagePrivate(page);
				805	set_page_private(newpage, page_private(page));
				806	set_page_private(page, 0);
				807	put_page(page);
				808	get_page(newpage);
				809
				810	bh = head;
				811	do {
				812	set_bh_page(bh, newpage, bh_offset(bh));
				813	bh = bh->b_this_page;
				814
				815	} while (bh != head);
				816
				817	SetPagePrivate(newpage);
				818
				819	if (mode != MIGRATE_SYNC_NO_COPY)
				820	migrate_page_copy(newpage, page);
				821	else
				822	migrate_page_states(newpage, page);
				823
				824	bh = head;
				825	do {
				826	unlock_buffer(bh);
				827	put_bh(bh);
				828	bh = bh->b_this_page;
				829
				830	} while (bh != head);
				831
				832	return MIGRATEPAGE_SUCCESS;
				833	}
				834	EXPORT_SYMBOL(buffer_migrate_page);
				835	#endif
				836
				837	/*
				838	* Writeback a page to clean the dirty state
				839	*/
				840	static int writeout(struct address_space mapping, struct page page)
				841	{
				842	struct writeback_control wbc = {
				843	.sync_mode = WB_SYNC_NONE,
				844	.nr_to_write = 1,
				845	.range_start = 0,
				846	.range_end = LLONG_MAX,
				847	.for_reclaim = 1
				848	};
				849	int rc;
				850
				851	if (!mapping->a_ops->writepage)
				852	/* No write method for the address space */
				853	return -EINVAL;
				854
				855	if (!clear_page_dirty_for_io(page))
				856	/* Someone else already triggered a write */
				857	return -EAGAIN;
				858
				859	/*
				860	* A dirty page may imply that the underlying filesystem has
				861	* the page on some queue. So the page must be clean for
				862	* migration. Writeout may mean we loose the lock and the
				863	* page state is no longer what we checked for earlier.
				864	* At this point we know that the migration attempt cannot
				865	* be successful.
				866	*/
				867	remove_migration_ptes(page, page, false);
				868
				869	rc = mapping->a_ops->writepage(page, &wbc);
				870
				871	if (rc != AOP_WRITEPAGE_ACTIVATE)
				872	/* unlocked. Relock */
				873	lock_page(page);
				874
				875	return (rc < 0) ? -EIO : -EAGAIN;
				876	}
				877
				878	/*
				879	* Default handling if a filesystem does not provide a migration function.
				880	*/
				881	static int fallback_migrate_page(struct address_space *mapping,
				882	struct page newpage, struct page page, enum migrate_mode mode)
				883	{
				884	if (PageDirty(page)) {
				885	/* Only writeback pages in full synchronous migration */
				886	switch (mode) {
				887	case MIGRATE_SYNC:
				888	case MIGRATE_SYNC_NO_COPY:
				889	break;
				890	default:
				891	return -EBUSY;
				892	}
				893	return writeout(mapping, page);
				894	}
				895
				896	/*
				897	* Buffers may be managed in a filesystem specific way.
				898	* We must have no buffers or drop them.
				899	*/
				900	if (page_has_private(page) &&
				901	!try_to_release_page(page, GFP_KERNEL))
				902	return -EAGAIN;
				903
				904	return migrate_page(mapping, newpage, page, mode);
				905	}
				906
				907	/*
				908	* Move a page to a newly allocated page
				909	* The page is locked and all ptes have been successfully removed.
				910	*
				911	* The new page will have replaced the old page if this function
				912	* is successful.
				913	*
				914	* Return value:
				915	* < 0 - error code
				916	* MIGRATEPAGE_SUCCESS - success
				917	*/
				918	static int move_to_new_page(struct page newpage, struct page page,
				919	enum migrate_mode mode)
				920	{
				921	struct address_space *mapping;
				922	int rc = -EAGAIN;
				923	bool is_lru = !__PageMovable(page);
				924
				925	VM_BUG_ON_PAGE(!PageLocked(page), page);
				926	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
				927
				928	mapping = page_mapping(page);
				929
				930	if (likely(is_lru)) {
				931	if (!mapping)
				932	rc = migrate_page(mapping, newpage, page, mode);
				933	else if (mapping->a_ops->migratepage)
				934	/*
				935	* Most pages have a mapping and most filesystems
				936	* provide a migratepage callback. Anonymous pages
				937	* are part of swap space which also has its own
				938	* migratepage callback. This is the most common path
				939	* for page migration.
				940	*/
				941	rc = mapping->a_ops->migratepage(mapping, newpage,
				942	page, mode);
				943	else
				944	rc = fallback_migrate_page(mapping, newpage,
				945	page, mode);
				946	} else {
				947	/*
				948	* In case of non-lru page, it could be released after
				949	* isolation step. In that case, we shouldn't try migration.
				950	*/
				951	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				952	if (!PageMovable(page)) {
				953	rc = MIGRATEPAGE_SUCCESS;
				954	__ClearPageIsolated(page);
				955	goto out;
				956	}
				957
				958	rc = mapping->a_ops->migratepage(mapping, newpage,
				959	page, mode);
				960	WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				961	!PageIsolated(page));
				962	}
				963
				964	/*
				965	* When successful, old pagecache page->mapping must be cleared before
				966	* page is freed; but stats require that PageAnon be left as PageAnon.
				967	*/
				968	if (rc == MIGRATEPAGE_SUCCESS) {
				969	if (__PageMovable(page)) {
				970	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				971
				972	/*
				973	* We clear PG_movable under page_lock so any compactor
				974	* cannot try to migrate this page.
				975	*/
				976	__ClearPageIsolated(page);
				977	}
				978
				979	/*
				980	* Anonymous and movable page->mapping will be cleard by
				981	* free_pages_prepare so don't reset it here for keeping
				982	* the type to work PageAnon, for example.
				983	*/
				984	if (!PageMappingFlags(page))
				985	page->mapping = NULL;
				986
				987	if (unlikely(is_zone_device_page(newpage))) {
				988	if (is_device_public_page(newpage))
				989	flush_dcache_page(newpage);
				990	} else
				991	flush_dcache_page(newpage);
				992
				993	}
				994	out:
				995	return rc;
				996	}
				997
				/*
				 * Lock @page and @newpage, replace the mappings of @page with migration
				 * entries and hand the page over to move_to_new_page().
				 *
				 * @force: when zero, never sleep on the page lock or on writeback.
				 * @mode:  MIGRATE_ASYNC never sleeps on the page lock; only MIGRATE_SYNC
				 *         and MIGRATE_SYNC_NO_COPY wait for writeback to finish.
				 *
				 * Returns -EBUSY when the page is under writeback in a mode that does not
				 * wait for it, the result of move_to_new_page() when that is reached, and
				 * -EAGAIN (the initial value of rc) on every other path.
				 */
				998	static int __unmap_and_move(struct page *page, struct page *newpage,
				999	int force, enum migrate_mode mode)
				1000	{
				1001	int rc = -EAGAIN;
				1002	int page_was_mapped = 0;
				1003	struct anon_vma *anon_vma = NULL;
				1004	bool is_lru = !__PageMovable(page);
				1005
				1006	if (!trylock_page(page)) {
				1007	if (!force || mode == MIGRATE_ASYNC)
				1008	goto out;
				1009
				1010	/*
				1011	* It's not safe for direct compaction to call lock_page.
				1012	* For example, during page readahead pages are added locked
				1013	* to the LRU. Later, when the IO completes the pages are
				1014	* marked uptodate and unlocked. However, the queueing
				1015	* could be merging multiple pages for one bio (e.g.
				1016	* mpage_readpages). If an allocation happens for the
				1017	* second or third page, the process can end up locking
				1018	* the same page twice and deadlocking. Rather than
				1019	* trying to be clever about what pages can be locked,
				1020	* avoid the use of lock_page for direct compaction
				1021	* altogether.
				1022	*/
				1023	if (current->flags & PF_MEMALLOC)
				1024	goto out;
				1025
				1026	lock_page(page);
				1027	}
				1028
				1029	if (PageWriteback(page)) {
				1030	/*
				1031	* Only in the case of a full synchronous migration is it
				1032	* necessary to wait for PageWriteback. In the async case,
				1033	* the retry loop is too short and in the sync-light case,
				1034	* the overhead of stalling is too much
				1035	*/
				1036	switch (mode) {
				1037	case MIGRATE_SYNC:
				1038	case MIGRATE_SYNC_NO_COPY:
				1039	break;
				1040	default:
				1041	rc = -EBUSY;
				1042	goto out_unlock;
				1043	}
				1044	if (!force)
				1045	goto out_unlock;
				1046	wait_on_page_writeback(page);
				1047	}
				1048
				1049	/*
				1050	* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
				1051	* we cannot notice that anon_vma is freed while we migrate a page.
				1052	* This get_anon_vma() delays freeing anon_vma pointer until the end
				1053	* of migration. File cache pages are no problem because of page_lock()
				1054	* File Caches may use write_page() or lock_page() in migration, then,
				1055	* just care Anon page here.
				1056	*
				1057	* Only page_get_anon_vma() understands the subtleties of
				1058	* getting a hold on an anon_vma from outside one of its mms.
				1059	* But if we cannot get anon_vma, then we won't need it anyway,
				1060	* because that implies that the anon page is no longer mapped
				1061	* (and cannot be remapped so long as we hold the page lock).
				1062	*/
				1063	if (PageAnon(page) && !PageKsm(page))
				1064	anon_vma = page_get_anon_vma(page);
				1065
				1066	/*
				1067	* Block others from accessing the new page when we get around to
				1068	* establishing additional references. We are usually the only one
				1069	* holding a reference to newpage at this point. We used to have a BUG
				1070	* here if trylock_page(newpage) fails, but would like to allow for
				1071	* cases where there might be a race with the previous use of newpage.
				1072	* This is much like races on refcount of oldpage: just don't BUG().
				1073	*/
				1074	if (unlikely(!trylock_page(newpage)))
				1075	goto out_unlock;
				1076
				1077	if (unlikely(!is_lru)) {
				1078	rc = move_to_new_page(newpage, page, mode);
				1079	goto out_unlock_both;
				1080	}
				1081
				1082	/*
				1083	* Corner case handling:
				1084	* 1. When a new swap-cache page is read into, it is added to the LRU
				1085	* and treated as swapcache but it has no rmap yet.
				1086	* Calling try_to_unmap() against a page->mapping==NULL page will
				1087	* trigger a BUG. So handle it here.
				1088	* 2. An orphaned page (see truncate_complete_page) might have
				1089	* fs-private metadata. The page can be picked up due to memory
				1090	* offlining. Everywhere else except page reclaim, the page is
				1091	* invisible to the vm, so the page can not be migrated. So try to
				1092	* free the metadata, so the page can be freed.
				1093	*/
				1094	if (!page->mapping) {
				1095	VM_BUG_ON_PAGE(PageAnon(page), page);
				1096	if (page_has_private(page)) {
				1097	try_to_free_buffers(page);
				1098	goto out_unlock_both;
				1099	}
				1100	} else if (page_mapped(page)) {
				1101	/* Establish migration ptes */
				1102	VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				1103	page);
				1104	try_to_unmap(page,
				1105	TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
				1106	page_was_mapped = 1;
				1107	}
				1108
				1109	if (!page_mapped(page))
				1110	rc = move_to_new_page(newpage, page, mode);
				1111
				1112	if (page_was_mapped)
				1113	remove_migration_ptes(page,
				1114	rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
				1115
				1116	out_unlock_both:
				1117	unlock_page(newpage);
				1118	out_unlock:
				1119	/* Drop an anon_vma reference if we took one */
				1120	if (anon_vma)
				1121	put_anon_vma(anon_vma);
				1122	unlock_page(page);
				1123	out:
				1124	/*
				1125	* If migration is successful, decrease refcount of the newpage
				1126	* which will not free the page because new page owner increased
				1127	* refcounter. As well, if it is LRU page, add the page to LRU
				1128	* list in here. Use the old state of the isolated source page to
				1129	* determine if we migrated a LRU page. newpage was already unlocked
				1130	* and possibly modified by its owner - don't rely on the page
				1131	* state.
				1132	*/
				1133	if (rc == MIGRATEPAGE_SUCCESS) {
				1134	if (unlikely(!is_lru))
				1135	put_page(newpage);
				1136	else
				1137	putback_lru_page(newpage);
				1138	}
				1139
				1140	return rc;
				1141	}
				1142
				1143	/*
				1144	* gcc 4.7 and 4.8 on arm hit an ICE (internal compiler error) when inlining
				1145	* unmap_and_move(). Work around it by marking the function noinline there.
				1146	*/
				1147	#if defined(CONFIG_ARM) && \
				1148	defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
				1149	#define ICE_noinline noinline
				1150	#else
				1151	#define ICE_noinline
				1152	#endif
				1153
				1154	/*
				1155	* Obtain the lock on page, remove all ptes and migrate the page
				1156	* to the newly allocated page in newpage.
					*
					* Returns -ENOMEM when THP migration is unsupported for a THP or when
					* get_new_page() yields no page, MIGRATEPAGE_SUCCESS when the page turned
					* out to be freed already (page_count() == 1), and otherwise the result
					* of __unmap_and_move(). Unless the result is -EAGAIN, the page is taken
					* off the migration list before returning.
				1157	*/
				1158	static ICE_noinline int unmap_and_move(new_page_t get_new_page,
				1159	free_page_t put_new_page,
				1160	unsigned long private, struct page *page,
				1161	int force, enum migrate_mode mode,
				1162	enum migrate_reason reason)
				1163	{
				1164	int rc = MIGRATEPAGE_SUCCESS;
				1165	struct page *newpage;
				1166
				1167	if (!thp_migration_supported() && PageTransHuge(page))
				1168	return -ENOMEM;
				1169
				1170	newpage = get_new_page(page, private);
				1171	if (!newpage)
				1172	return -ENOMEM;
				1173
				1174	if (page_count(page) == 1) {
				1175	/* page was freed from under us. So we are done. */
				1176	ClearPageActive(page);
				1177	ClearPageUnevictable(page);
				1178	if (unlikely(__PageMovable(page))) {
				1179	lock_page(page);
				1180	if (!PageMovable(page))
				1181	__ClearPageIsolated(page);
				1182	unlock_page(page);
				1183	}
					/* The target page is not needed: give it back to its provider. */
				1184	if (put_new_page)
				1185	put_new_page(newpage, private);
				1186	else
				1187	put_page(newpage);
				1188	goto out;
				1189	}
				1190
				1191	rc = __unmap_and_move(page, newpage, force, mode);
				1192	if (rc == MIGRATEPAGE_SUCCESS)
				1193	set_page_owner_migrate_reason(newpage, reason);
				1194
				1195	out:
				1196	if (rc != -EAGAIN) {
				1197	/*
				1198	* A page that has been migrated has all references
				1199	* removed and will be freed. A page that has not been
				1200	* migrated will have kept its references and be
				1201	* restored.
				1202	*/
				1203	list_del(&page->lru);
				1204
				1205	/*
				1206	* Compaction can migrate also non-LRU pages which are
				1207	* not accounted to NR_ISOLATED_*. They can be recognized
				1208	* as __PageMovable
				1209	*/
				1210	if (likely(!__PageMovable(page)))
				1211	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				1212	page_is_file_cache(page), -hpage_nr_pages(page));
				1213	}
				1214
				1215	/*
				1216	* If migration is successful, releases reference grabbed during
				1217	* isolation. Otherwise, restore the page to right list unless
				1218	* we want to retry.
				1219	*/
				1220	if (rc == MIGRATEPAGE_SUCCESS) {
				1221	put_page(page);
				1222	if (reason == MR_MEMORY_FAILURE) {
				1223	/*
				1224	* Set PG_HWPoison on just freed page
				1225	* intentionally. Although it's rather weird,
				1226	* it's how HWPoison flag works at the moment.
				1227	*/
				1228	if (set_hwpoison_free_buddy_page(page))
				1229	num_poisoned_pages_inc();
				1230	}
				1231	} else {
				1232	if (rc != -EAGAIN) {
				1233	if (likely(!__PageMovable(page))) {
				1234	putback_lru_page(page);
				1235	goto put_new;
				1236	}
				1237
				1238	lock_page(page);
				1239	if (PageMovable(page))
				1240	putback_movable_page(page);
				1241	else
				1242	__ClearPageIsolated(page);
				1243	unlock_page(page);
				1244	put_page(page);
				1245	}
				1246	put_new:
					/* Migration did not succeed, so the unused target page is released. */
				1247	if (put_new_page)
				1248	put_new_page(newpage, private);
				1249	else
				1250	put_page(newpage);
				1251	}
				1252
				1253	return rc;
				1254	}
				1255
				1256	/*
				1257	* Counterpart of unmap_and_move() for hugepage migration.
				1258	*
				1259	* This function doesn't wait for the completion of hugepage I/O
				1260	* because there is no race between I/O and migration for hugepage.
				1261	* Note that currently hugepage I/O occurs only in direct I/O
				1262	* where no lock is held and PG_writeback is irrelevant,
				1263	* and writeback status of all subpages are counted in the reference
				1264	* count of the head page (i.e. if all subpages of a 2MB hugepage are
				1265	* under direct I/O, the reference of the head page is 512 and a bit more.)
				1266	* This means that when we try to migrate hugepage whose subpages are
				1267	* doing direct I/O, some references remain after try_to_unmap() and
				1268	* hugepage migration fails without data corruption.
				1269	*
				1270	* There is also no race when direct I/O is issued on the page under migration,
				1271	* because then pte is replaced with migration swap entry and direct I/O code
				1272	* will wait in the page fault for migration to complete.
					*
					* Returns -ENOSYS when migration is unsupported for this hstate, -ENOMEM
					* when get_new_page() yields no page, -EBUSY for a page that is being
					* freed, the result of move_to_new_page() when that is reached, and
					* -EAGAIN (the initial value of rc) otherwise.
				1273	*/
				1274	static int unmap_and_move_huge_page(new_page_t get_new_page,
				1275	free_page_t put_new_page, unsigned long private,
				1276	struct page *hpage, int force,
				1277	enum migrate_mode mode, int reason)
				1278	{
				1279	int rc = -EAGAIN;
				1280	int page_was_mapped = 0;
				1281	struct page *new_hpage;
				1282	struct anon_vma *anon_vma = NULL;
				1283
				1284	/*
				1285	* Movability of hugepages depends on architectures and hugepage size.
				1286	* This check is necessary because some callers of hugepage migration
				1287	* like soft offline and memory hotremove don't walk through page
				1288	* tables or check whether the hugepage is pmd-based or not before
				1289	* kicking migration.
				1290	*/
				1291	if (!hugepage_migration_supported(page_hstate(hpage))) {
				1292	putback_active_hugepage(hpage);
				1293	return -ENOSYS;
				1294	}
				1295
				1296	new_hpage = get_new_page(hpage, private);
				1297	if (!new_hpage)
				1298	return -ENOMEM;
				1299
				1300	if (!trylock_page(hpage)) {
				1301	if (!force)
				1302	goto out;
					/* Only the synchronous modes may sleep on the page lock. */
				1303	switch (mode) {
				1304	case MIGRATE_SYNC:
				1305	case MIGRATE_SYNC_NO_COPY:
				1306	break;
				1307	default:
				1308	goto out;
				1309	}
				1310	lock_page(hpage);
				1311	}
				1312
				1313	/*
				1314	* Check for pages which are in the process of being freed. Without
				1315	* page_mapping() set, hugetlbfs specific move page routine will not
				1316	* be called and we could leak usage counts for subpools.
				1317	*/
				1318	if (page_private(hpage) && !page_mapping(hpage)) {
				1319	rc = -EBUSY;
				1320	goto out_unlock;
				1321	}
				1322
				1323	if (PageAnon(hpage))
				1324	anon_vma = page_get_anon_vma(hpage);
				1325
				1326	if (unlikely(!trylock_page(new_hpage)))
				1327	goto put_anon;
				1328
				1329	if (page_mapped(hpage)) {
				1330	try_to_unmap(hpage,
				1331	TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
				1332	page_was_mapped = 1;
				1333	}
				1334
				1335	if (!page_mapped(hpage))
				1336	rc = move_to_new_page(new_hpage, hpage, mode);
				1337
				1338	if (page_was_mapped)
				1339	remove_migration_ptes(hpage,
				1340	rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
				1341
				1342	unlock_page(new_hpage);
				1343
				1344	put_anon:
				1345	if (anon_vma)
				1346	put_anon_vma(anon_vma);
				1347
				1348	if (rc == MIGRATEPAGE_SUCCESS) {
				1349	move_hugetlb_state(hpage, new_hpage, reason);
					/* On success the new page must not go back through put_new_page(). */
				1350	put_new_page = NULL;
				1351	}
				1352
				1353	out_unlock:
				1354	unlock_page(hpage);
				1355	out:
				1356	if (rc != -EAGAIN)
				1357	putback_active_hugepage(hpage);
				1358
				1359	/*
				1360	* If migration was not successful and there's a freeing callback, use
				1361	* it. Otherwise, put_page() will drop the reference grabbed during
				1362	* isolation.
				1363	*/
				1364	if (put_new_page)
				1365	put_new_page(new_hpage, private);
				1366	else
				1367	putback_active_hugepage(new_hpage);
				1368
				1369	return rc;
				1370	}
				1371
				1372	/*
				1373	* migrate_pages - migrate the pages specified in a list, to the free pages
				1374	* supplied as the target for the page migration
				1375	*
				1376	* @from: The list of pages to be migrated.
				1377	* @get_new_page: The function used to allocate free pages to be used
				1378	* as the target of the page migration.
				1379	* @put_new_page: The function used to free target pages if migration
				1380	* fails, or NULL if no special handling is necessary.
				1381	* @private: Private data to be passed on to get_new_page()
				1382	* @mode: The migration mode that specifies the constraints for
				1383	* page migration, if any.
				1384	* @reason: The reason for page migration.
				1385	*
				1386	* The function returns after 10 attempts or if no pages are movable any more
				1387	* because the list has become empty or no retryable pages exist any more.
				1388	* The caller should call putback_movable_pages() to return pages to the LRU
				1389	* or free list only if ret != 0.
				1390	*
					* The per-page helpers are called with force set only from the fourth
					* pass on (pass > 2). PF_SWAPWRITE is set on the current task for the
					* duration of the call and cleared again if it was not set on entry.
					*
				1391	* Returns the number of pages that were not migrated, or an error code.
				1392	*/
				1393	int migrate_pages(struct list_head *from, new_page_t get_new_page,
				1394	free_page_t put_new_page, unsigned long private,
				1395	enum migrate_mode mode, int reason)
				1396	{
				1397	int retry = 1;
				1398	int nr_failed = 0;
				1399	int nr_succeeded = 0;
				1400	int pass = 0;
				1401	struct page *page;
				1402	struct page *page2;
				1403	int swapwrite = current->flags & PF_SWAPWRITE;
				1404	int rc;
				1405
				1406	if (!swapwrite)
				1407	current->flags |= PF_SWAPWRITE;
				1408
				1409	for(pass = 0; pass < 10 && retry; pass++) {
				1410	retry = 0;
				1411
				1412	list_for_each_entry_safe(page, page2, from, lru) {
				1413	retry:
				1414	cond_resched();
				1415
				1416	if (PageHuge(page))
				1417	rc = unmap_and_move_huge_page(get_new_page,
				1418	put_new_page, private, page,
				1419	pass > 2, mode, reason);
				1420	else
				1421	rc = unmap_and_move(get_new_page, put_new_page,
				1422	private, page, pass > 2, mode,
				1423	reason);
				1424
				1425	switch(rc) {
				1426	case -ENOMEM:
				1427	/*
				1428	* THP migration might be unsupported or the
				1429	* allocation could've failed so we should
				1430	* retry on the same page with the THP split
				1431	* to base pages.
				1432	*
				1433	* Head page is retried immediately and tail
				1434	* pages are added to the tail of the list so
				1435	* we encounter them after the rest of the list
				1436	* is processed.
				1437	*/
				1438	if (PageTransHuge(page) && !PageHuge(page)) {
				1439	lock_page(page);
				1440	rc = split_huge_page_to_list(page, from);
				1441	unlock_page(page);
				1442	if (!rc) {
				1443	list_safe_reset_next(page, page2, lru);
				1444	goto retry;
				1445	}
				1446	}
					/* Give up on the whole list; rc is returned as the error code. */
				1447	nr_failed++;
				1448	goto out;
				1449	case -EAGAIN:
				1450	retry++;
				1451	break;
				1452	case MIGRATEPAGE_SUCCESS:
				1453	nr_succeeded++;
				1454	break;
				1455	default:
				1456	/*
				1457	* Permanent failure (-EBUSY, -ENOSYS, etc.):
				1458	* unlike -EAGAIN case, the failed page is
				1459	* removed from migration page list and not
				1460	* retried in the next outer loop.
				1461	*/
				1462	nr_failed++;
				1463	break;
				1464	}
				1465	}
				1466	}
					/* Pages still asking for a retry after the last pass count as failed. */
				1467	nr_failed += retry;
				1468	rc = nr_failed;
				1469	out:
				1470	if (nr_succeeded)
				1471	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
				1472	if (nr_failed)
				1473	count_vm_events(PGMIGRATE_FAIL, nr_failed);
				1474	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
				1475
				1476	if (!swapwrite)
				1477	current->flags &= ~PF_SWAPWRITE;
				1478
				1479	return rc;
				1480	}
				1481
				1482	#ifdef CONFIG_NUMA
				1483
				/*
				 * Store @value into @nr consecutive entries of the user array @status,
				 * starting at index @start. Returns 0, or -EFAULT as soon as one
				 * put_user() fails. Nothing is written when @nr is zero or negative.
				 */
				1484	static int store_status(int __user *status, int start, int value, int nr)
				1485	{
				1486	while (nr-- > 0) {
				1487	if (put_user(value, status + start))
				1488	return -EFAULT;
				1489	start++;
				1490	}
				1491
				1492	return 0;
				1493	}
				1494
				/*
				 * Migrate every page queued on @pagelist to @node using migrate_pages()
				 * in MIGRATE_SYNC mode with reason MR_SYSCALL. When migrate_pages()
				 * returns non-zero, the pages left on the list are handed to
				 * putback_movable_pages(). Returns 0 for an empty list, otherwise the
				 * migrate_pages() result. @mm is not used by this function.
				 */
				1495	static int do_move_pages_to_node(struct mm_struct *mm,
				1496	struct list_head *pagelist, int node)
				1497	{
				1498	int err;
				1499
				1500	if (list_empty(pagelist))
				1501	return 0;
				1502
				1503	err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
				1504	MIGRATE_SYNC, MR_SYSCALL);
				1505	if (err)
				1506	putback_movable_pages(pagelist);
				1507	return err;
				1508	}
				1509
				1510	/*
				1511	* Resolves the given address to a struct page, isolates it from the LRU and
				1512	* puts it to the given pagelist.
					* A page mapped more than once is only queued when @migrate_all is set;
					* otherwise -EACCES is returned for it.
				1513	* Returns:
				1514	* errno - if the page cannot be found/isolated
				1515	* 0 - when it doesn't have to be migrated because it is already on the
				1516	* target node
				1517	* 1 - when it has been queued
				1518	*/
				1519	static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
				1520	int node, struct list_head *pagelist, bool migrate_all)
				1521	{
				1522	struct vm_area_struct *vma;
				1523	struct page *page;
				1524	unsigned int follflags;
				1525	int err;
				1526
				1527	down_read(&mm->mmap_sem);
				1528	err = -EFAULT;
				1529	vma = find_vma(mm, addr);
				1530	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
				1531	goto out;
				1532
				1533	/* FOLL_DUMP to ignore special (like zero) pages */
				1534	follflags = FOLL_GET | FOLL_DUMP;
				1535	page = follow_page(vma, addr, follflags);
				1536
				1537	err = PTR_ERR(page);
				1538	if (IS_ERR(page))
				1539	goto out;
				1540
				1541	err = -ENOENT;
				1542	if (!page)
				1543	goto out;
				1544
				1545	err = 0;
				1546	if (page_to_nid(page) == node)
				1547	goto out_putpage;
				1548
				1549	err = -EACCES;
				1550	if (page_mapcount(page) > 1 && !migrate_all)
				1551	goto out_putpage;
				1552
				1553	if (PageHuge(page)) {
					/* A hugetlb tail page is not queued; err stays -EACCES. */
				1554	if (PageHead(page)) {
				1555	isolate_huge_page(page, pagelist);
				1556	err = 1;
				1557	}
				1558	} else {
				1559	struct page *head;
				1560
				1561	head = compound_head(page);
				1562	err = isolate_lru_page(head);
				1563	if (err)
				1564	goto out_putpage;
				1565
				1566	err = 1;
				1567	list_add_tail(&head->lru, pagelist);
				1568	mod_node_page_state(page_pgdat(head),
				1569	NR_ISOLATED_ANON + page_is_file_cache(head),
				1570	hpage_nr_pages(head));
				1571	}
				1572	out_putpage:
				1573	/*
				1574	* Either remove the duplicate refcount from
				1575	* isolate_lru_page() or drop the page ref if it was
				1576	* not isolated.
				1577	*/
				1578	put_page(page);
				1579	out:
				1580	up_read(&mm->mmap_sem);
				1581	return err;
				1582	}
				1583
				1584	/*
				1585	* Migrate an array of page address onto an array of nodes and fill
				1586	* the corresponding array of status.
					*
					* Consecutive entries that target the same node are collected on one
					* list and migrated together; the list is flushed whenever the target
					* node changes, a page lookup fails, or the loop ends. [start, i) is the
					* range of entries belonging to the batch currently being collected.
				1587	*/
				1588	static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
				1589	unsigned long nr_pages,
				1590	const void __user * __user *pages,
				1591	const int __user *nodes,
				1592	int __user *status, int flags)
				1593	{
				1594	int current_node = NUMA_NO_NODE;
				1595	LIST_HEAD(pagelist);
				1596	int start, i;
				1597	int err = 0, err1;
				1598
				1599	migrate_prep();
				1600
				1601	for (i = start = 0; i < nr_pages; i++) {
				1602	const void __user *p;
				1603	unsigned long addr;
				1604	int node;
				1605
				1606	err = -EFAULT;
				1607	if (get_user(p, pages + i))
				1608	goto out_flush;
				1609	if (get_user(node, nodes + i))
				1610	goto out_flush;
				1611	addr = (unsigned long)untagged_addr(p);
				1612
				1613	err = -ENODEV;
				1614	if (node < 0 || node >= MAX_NUMNODES)
				1615	goto out_flush;
				1616	if (!node_state(node, N_MEMORY))
				1617	goto out_flush;
				1618
				1619	err = -EACCES;
				1620	if (!node_isset(node, task_nodes))
				1621	goto out_flush;
				1622
				1623	if (current_node == NUMA_NO_NODE) {
				1624	current_node = node;
				1625	start = i;
				1626	} else if (node != current_node) {
					/* Target node changed: migrate the batch collected so far. */
				1627	err = do_move_pages_to_node(mm, &pagelist, current_node);
				1628	if (err)
				1629	goto out;
				1630	err = store_status(status, start, current_node, i - start);
				1631	if (err)
				1632	goto out;
				1633	start = i;
				1634	current_node = node;
				1635	}
				1636
				1637	/*
				1638	* Errors in the page lookup or isolation are not fatal and we simply
				1639	* report them via status
				1640	*/
				1641	err = add_page_for_migration(mm, addr, current_node,
				1642	&pagelist, flags & MPOL_MF_MOVE_ALL);
				1643
				1644	if (!err) {
				1645	/* The page is already on the target node */
				1646	err = store_status(status, i, current_node, 1);
				1647	if (err)
				1648	goto out_flush;
				1649	continue;
				1650	} else if (err > 0) {
				1651	/* The page is successfully queued for migration */
				1652	continue;
				1653	}
				1654
					/* Lookup/isolation failed: report the errno for this entry. */
				1655	err = store_status(status, i, err, 1);
				1656	if (err)
				1657	goto out_flush;
				1658
				1659	err = do_move_pages_to_node(mm, &pagelist, current_node);
				1660	if (err)
				1661	goto out;
				1662	if (i > start) {
				1663	err = store_status(status, start, current_node, i - start);
				1664	if (err)
				1665	goto out;
				1666	}
				1667	current_node = NUMA_NO_NODE;
				1668	}
				1669	out_flush:
				1670	if (list_empty(&pagelist))
				1671	return err;
				1672
				1673	/* Make sure we do not overwrite the existing error */
				1674	err1 = do_move_pages_to_node(mm, &pagelist, current_node);
				1675	if (!err1)
				1676	err1 = store_status(status, start, current_node, i - start);
				1677	if (!err)
				1678	err = err1;
				1679	out:
				1680	return err;
				1681	}
				1682
				1683	/*
				1684	* Determine the nodes of an array of pages and store it in an array of status.
					*
					* Both arrays hold @nr_pages entries. Each status entry receives the
					* node id of the page, -EFAULT when no vma covers the address, the
					* error encoded in the follow_page() result, or -ENOENT when
					* follow_page() returns no page.
				1685	*/
				1686	static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				1687	const void __user **pages, int *status)
				1688	{
				1689	unsigned long i;
				1690
				1691	down_read(&mm->mmap_sem);
				1692
				1693	for (i = 0; i < nr_pages; i++) {
				1694	unsigned long addr = (unsigned long)(*pages);
				1695	struct vm_area_struct *vma;
				1696	struct page *page;
				1697	int err = -EFAULT;
				1698
				1699	vma = find_vma(mm, addr);
				1700	if (!vma || addr < vma->vm_start)
				1701	goto set_status;
				1702
				1703	/* FOLL_DUMP to ignore special (like zero) pages */
				1704	page = follow_page(vma, addr, FOLL_DUMP);
				1705
				1706	err = PTR_ERR(page);
				1707	if (IS_ERR(page))
				1708	goto set_status;
				1709
				1710	err = page ? page_to_nid(page) : -ENOENT;
				1711	set_status:
				1712	*status = err;
				1713
				1714	pages++;
				1715	status++;
				1716	}
				1717
				1718	up_read(&mm->mmap_sem);
				1719	}
				1720
				1721	/*
				1722	* Determine the nodes of a user array of pages and store it in
				1723	* a user array of status.
					*
					* The arrays are processed in chunks of at most DO_PAGES_STAT_CHUNK_NR
					* entries, staged through on-stack buffers. Returns 0 when every entry
					* was processed, or -EFAULT when a copy from or to user space failed.
				1724	*/
				1725	static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
				1726	const void __user * __user *pages,
				1727	int __user *status)
				1728	{
				1729	#define DO_PAGES_STAT_CHUNK_NR 16
				1730	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
				1731	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
				1732
				1733	while (nr_pages) {
				1734	unsigned long chunk_nr;
				1735
				1736	chunk_nr = nr_pages;
				1737	if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
				1738	chunk_nr = DO_PAGES_STAT_CHUNK_NR;
				1739
				1740	if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
				1741	break;
				1742
				1743	do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
				1744
				1745	if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
				1746	break;
				1747
				1748	pages += chunk_nr;
				1749	status += chunk_nr;
				1750	nr_pages -= chunk_nr;
				1751	}
					/* A non-zero remainder means the loop stopped on a failed copy. */
				1752	return nr_pages ? -EFAULT : 0;
				1753	}
				1754
				1755	/*
				1756	* Move a list of pages in the address space of the task identified by
				1757	* @pid (the currently executing process when @pid is 0).
					*
					* When @nodes is NULL nothing is moved; the node of each page is only
					* reported through @status.
					*
					* Returns -EINVAL for unknown flags or a task without an mm, -EPERM when
					* MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE or the ptrace
					* access check fails, -ESRCH when no task matches @pid, the error from
					* security_task_movememory(), or the result of do_pages_move() /
					* do_pages_stat().
				1758	*/
				1759	static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
				1760	const void __user * __user *pages,
				1761	const int __user *nodes,
				1762	int __user *status, int flags)
				1763	{
				1764	struct task_struct *task;
				1765	struct mm_struct *mm;
				1766	int err;
				1767	nodemask_t task_nodes;
				1768
				1769	/* Check flags */
				1770	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
				1771	return -EINVAL;
				1772
				1773	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1774	return -EPERM;
				1775
				1776	/* Find the mm_struct */
				1777	rcu_read_lock();
				1778	task = pid ? find_task_by_vpid(pid) : current;
				1779	if (!task) {
				1780	rcu_read_unlock();
				1781	return -ESRCH;
				1782	}
				1783	get_task_struct(task);
				1784
				1785	/*
				1786	* Check if this process has the right to modify the specified
				1787	* process. Use the regular "ptrace_may_access()" checks.
				1788	*/
				1789	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
				1790	rcu_read_unlock();
				1791	err = -EPERM;
				1792	goto out;
				1793	}
				1794	rcu_read_unlock();
				1795
				1796	err = security_task_movememory(task);
				1797	if (err)
				1798	goto out;
				1799
				1800	task_nodes = cpuset_mems_allowed(task);
				1801	mm = get_task_mm(task);
					/* The task reference is dropped here; only the mm is used below. */
				1802	put_task_struct(task);
				1803
				1804	if (!mm)
				1805	return -EINVAL;
				1806
				1807	if (nodes)
				1808	err = do_pages_move(mm, task_nodes, nr_pages, pages,
				1809	nodes, status, flags);
				1810	else
				1811	err = do_pages_stat(mm, nr_pages, pages, status);
				1812
				1813	mmput(mm);
				1814	return err;
				1815
				1816	out:
				1817	put_task_struct(task);
				1818	return err;
				1819	}
				1820
				/*
				 * move_pages() system call entry point: passes all six arguments
				 * unchanged to kernel_move_pages() and returns its result.
				 */
				1821	SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
				1822	const void __user * __user *, pages,
				1823	const int __user *, nodes,
				1824	int __user *, status, int, flags)
				1825	{
				1826	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				1827	}
				1828
				1829	#ifdef CONFIG_COMPAT
				/*
				 * Compat entry point for move_pages(). Each compat_uptr_t in @pages32 is
				 * converted with compat_ptr() and stored into an array obtained from
				 * compat_alloc_user_space(); that array is then passed to
				 * kernel_move_pages(). Returns -EFAULT when reading a source entry or
				 * writing a converted entry fails.
				 */
				1830	COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
				1831	compat_uptr_t __user *, pages32,
				1832	const int __user *, nodes,
				1833	int __user *, status,
				1834	int, flags)
				1835	{
				1836	const void __user * __user *pages;
				1837	int i;
				1838
				1839	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
				1840	for (i = 0; i < nr_pages; i++) {
				1841	compat_uptr_t p;
				1842
				1843	if (get_user(p, pages32 + i) ||
				1844	put_user(compat_ptr(p), pages + i))
				1845	return -EFAULT;
				1846	}
				1847	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				1848	}
				1849	#endif /* CONFIG_COMPAT */
				1850
				1851	#ifdef CONFIG_NUMA_BALANCING
				1852	/*
				1853	* Returns true if this is a safe migration target node for misplaced NUMA
				1854	* pages. Currently it only checks the watermarks, which is crude: the node
					* qualifies as soon as one populated zone, scanned from the highest zone
					* index down, stays above its high watermark plus @nr_migrate_pages.
				1855	*/
				1856	static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
				1857	unsigned long nr_migrate_pages)
				1858	{
				1859	int z;
				1860
				1861	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
				1862	struct zone *zone = pgdat->node_zones + z;
				1863
				1864	if (!populated_zone(zone))
				1865	continue;
				1866
				1867	/* Avoid waking kswapd by allocating nr_migrate_pages pages. */
				1868	if (!zone_watermark_ok(zone, 0,
				1869	high_wmark_pages(zone) +
				1870	nr_migrate_pages,
				1871	0, 0))
				1872	continue;
				1873	return true;
				1874	}
				1875	return false;
				1876	}
				1877
				/*
				 * Allocate the order-0 target page for a misplaced-page migration on the
				 * node number carried in @data. The mask is GFP_HIGHUSER_MOVABLE plus
				 * __GFP_THISNODE, __GFP_NOMEMALLOC, __GFP_NORETRY and __GFP_NOWARN, with
				 * the __GFP_RECLAIM bits cleared. @page is not used. Returns the result
				 * of __alloc_pages_node() unchanged.
				 */
				1878	static struct page *alloc_misplaced_dst_page(struct page *page,
				1879	unsigned long data)
				1880	{
				1881	int nid = (int) data;
				1882	struct page *newpage;
				1883
				1884	newpage = __alloc_pages_node(nid,
				1885	(GFP_HIGHUSER_MOVABLE |
				1886	__GFP_THISNODE | __GFP_NOMEMALLOC |
				1887	__GFP_NORETRY | __GFP_NOWARN) &
				1888	~__GFP_RECLAIM, 0);
				1889
				1890	return newpage;
				1891	}
				1892
				/*
				 * Isolate @page from the LRU so it can be migrated to node @pgdat.
				 *
				 * Returns 1 when the page was isolated; the caller's reference has then
				 * been dropped and NR_ISOLATED_* has been raised by the page's size.
				 * Returns 0 when the target node fails migrate_balanced_pgdat(), the
				 * page cannot be isolated, or a THP has an unexpected reference count;
				 * in those cases the caller's reference is left untouched.
				 */
				1893	static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
				1894	{
				1895	int page_lru;
				1896
				1897	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
				1898
				1899	/* Avoid migrating to a node that is nearly full */
				1900	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
				1901	return 0;
				1902
				1903	if (isolate_lru_page(page))
				1904	return 0;
				1905
				1906	/*
				1907	* migrate_misplaced_transhuge_page() skips page migration's usual
				1908	* check on page_count(), so we must do it here, now that the page
				1909	* has been isolated: a GUP pin, or any other pin, prevents migration.
				1910	* The expected page count is 3: 1 for page's mapcount and 1 for the
				1911	* caller's pin and 1 for the reference taken by isolate_lru_page().
				1912	*/
				1913	if (PageTransHuge(page) && page_count(page) != 3) {
				1914	putback_lru_page(page);
				1915	return 0;
				1916	}
				1917
				1918	page_lru = page_is_file_cache(page);
				1919	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
				1920	hpage_nr_pages(page));
				1921
				1922	/*
				1923	* Isolating the page has taken another reference, so the
				1924	* caller's reference can be safely dropped without the page
				1925	* disappearing underneath us during migration.
				1926	*/
				1927	put_page(page);
				1928	return 1;
				1929	}
				1930
				/*
				 * Report whether the page that @pmd points to is currently locked.
				 * NOTE(review): presumably used as a hint that a THP migration is in
				 * flight - confirm against the callers.
				 */
				1931	bool pmd_trans_migrating(pmd_t pmd)
				1932	{
				1933	struct page *page = pmd_page(pmd);
				1934	return PageLocked(page);
				1935	}
				1936
				1937	/*
				1938	* Attempt to migrate a misplaced page to the specified destination
				1939	* node. Caller is expected to have an elevated reference count on
				1940	* the page that will be dropped by this function before returning.
					*
					* Returns the non-zero result of numamigrate_isolate_page() when the
					* page was isolated and migrate_pages() left nothing behind, and 0 in
					* every other case.
				1941	*/
				1942	int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
				1943	int node)
				1944	{
				1945	pg_data_t *pgdat = NODE_DATA(node);
				1946	int isolated;
				1947	int nr_remaining;
				1948	LIST_HEAD(migratepages);
				1949
				1950	/*
				1951	* Don't migrate file pages that are mapped in multiple processes
				1952	* with execute permissions as they are probably shared libraries.
				1953	*/
				1954	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
				1955	(vma->vm_flags & VM_EXEC))
				1956	goto out;
				1957
				1958	/*
				1959	* Also do not migrate dirty pages as not all filesystems can move
				1960	* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
				1961	*/
				1962	if (page_is_file_cache(page) && PageDirty(page))
				1963	goto out;
				1964
				1965	isolated = numamigrate_isolate_page(pgdat, page);
				1966	if (!isolated)
				1967	goto out;
				1968
				1969	list_add(&page->lru, &migratepages);
				1970	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
				1971	NULL, node, MIGRATE_ASYNC,
				1972	MR_NUMA_MISPLACED);
				1973	if (nr_remaining) {
					/* The page is still on our list: undo the isolation accounting. */
				1974	if (!list_empty(&migratepages)) {
				1975	list_del(&page->lru);
				1976	dec_node_page_state(page, NR_ISOLATED_ANON +
				1977	page_is_file_cache(page));
				1978	putback_lru_page(page);
				1979	}
				1980	isolated = 0;
				1981	} else
				1982	count_vm_numa_event(NUMA_PAGE_MIGRATE);
				1983	BUG_ON(!list_empty(&migratepages));
				1984	return isolated;
				1985
				1986	out:
				1987	put_page(page);
				1988	return 0;
				1989	}
				1990	#endif /* CONFIG_NUMA_BALANCING */
				1991
				1992	#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
				1993	/*
				1994	* Migrates a THP to a given target node. page must be locked and is unlocked
				1995	* before returning.
				1996	*/
				1997	int migrate_misplaced_transhuge_page(struct mm_struct *mm,
				1998	struct vm_area_struct *vma,
				1999	pmd_t *pmd, pmd_t entry,
				2000	unsigned long address,
				2001	struct page *page, int node)
				2002	{
				2003	spinlock_t *ptl;
				2004	pg_data_t *pgdat = NODE_DATA(node);
				2005	int isolated = 0;
				2006	struct page *new_page = NULL;
				2007	int page_lru = page_is_file_cache(page);
				2008	unsigned long mmun_start = address & HPAGE_PMD_MASK;
				2009	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
				2010
				2011	new_page = alloc_pages_node(node,
				2012	(GFP_TRANSHUGE_LIGHT \| __GFP_THISNODE),
				2013	HPAGE_PMD_ORDER);
				2014	if (!new_page)
				2015	goto out_fail;
				2016	prep_transhuge_page(new_page);
				2017
				2018	isolated = numamigrate_isolate_page(pgdat, page);
				2019	if (!isolated) {
				2020	put_page(new_page);
				2021	goto out_fail;
				2022	}
				2023
				2024	/* Prepare a page as a migration target */
				2025	__SetPageLocked(new_page);
				2026	if (PageSwapBacked(page))
				2027	__SetPageSwapBacked(new_page);
				2028
				2029	/* anon mapping, we can simply copy page->mapping to the new page: */
				2030	new_page->mapping = page->mapping;
				2031	new_page->index = page->index;
				2032	migrate_page_copy(new_page, page);
				2033	WARN_ON(PageLRU(new_page));
				2034
				2035	/* Recheck the target PMD */
				2036	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
				2037	ptl = pmd_lock(mm, pmd);
				2038	if (unlikely(!pmd_same(*pmd, entry) \|\| !page_ref_freeze(page, 2))) {
				2039	spin_unlock(ptl);
				2040	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
				2041
				2042	/* Reverse changes made by migrate_page_copy() */
				2043	if (TestClearPageActive(new_page))
				2044	SetPageActive(page);
				2045	if (TestClearPageUnevictable(new_page))
				2046	SetPageUnevictable(page);
				2047
				2048	unlock_page(new_page);
				2049	put_page(new_page); /* Free it */
				2050
				2051	/* Retake the callers reference and putback on LRU */
				2052	get_page(page);
				2053	putback_lru_page(page);
				2054	mod_node_page_state(page_pgdat(page),
				2055	NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
				2056
				2057	goto out_unlock;
				2058	}
				2059
				2060	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
				2061	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
				2062
				2063	/*
				2064	* Overwrite the old entry under pagetable lock and establish
				2065	* the new PTE. Any parallel GUP will either observe the old
				2066	* page blocking on the page lock, block on the page table
				2067	* lock or observe the new page. The SetPageUptodate on the
				2068	* new page and page_add_new_anon_rmap guarantee the copy is
				2069	* visible before the pagetable update.
				2070	*/
				2071	flush_cache_range(vma, mmun_start, mmun_end);
				2072	page_add_anon_rmap(new_page, vma, mmun_start, true);
				2073	/*
				2074	* At this point the pmd is numa/protnone (i.e. non present) and the TLB
				2075	* has already been flushed globally. So no TLB can be currently
				2076	* caching this non present pmd mapping. There's no need to clear the
				2077	* pmd before doing set_pmd_at(), nor to flush the TLB after
				2078	* set_pmd_at(). Clearing the pmd here would introduce a race
				2079	* condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
				2080	* mmap_sem for reading. If the pmd is set to NULL at any given time,
				2081	* MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
				2082	* pmd.
				2083	*/
				2084	set_pmd_at(mm, mmun_start, pmd, entry);
				2085	update_mmu_cache_pmd(vma, address, &entry);
				2086
				2087	page_ref_unfreeze(page, 2);
				2088	mlock_migrate_page(new_page, page);
				2089	page_remove_rmap(page, true);
				2090	set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
				2091
				2092	spin_unlock(ptl);
				2093	/*
				2094	* No need to double call mmu_notifier->invalidate_range() callback as
				2095	* the above pmdp_huge_clear_flush_notify() did already call it.
				2096	*/
				2097	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
				2098
				2099	/* Take an "isolate" reference and put new page on the LRU. */
				2100	get_page(new_page);
				2101	putback_lru_page(new_page);
				2102
				2103	unlock_page(new_page);
				2104	unlock_page(page);
				2105	put_page(page); /* Drop the rmap reference */
				2106	put_page(page); /* Drop the LRU isolation reference */
				2107
				2108	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
				2109	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
				2110
				2111	mod_node_page_state(page_pgdat(page),
				2112	NR_ISOLATED_ANON + page_lru,
				2113	-HPAGE_PMD_NR);
				2114	return isolated;
				2115
				2116	out_fail:
				2117	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
				2118	ptl = pmd_lock(mm, pmd);
				2119	if (pmd_same(*pmd, entry)) {
				2120	entry = pmd_modify(entry, vma->vm_page_prot);
				2121	set_pmd_at(mm, mmun_start, pmd, entry);
				2122	update_mmu_cache_pmd(vma, address, &entry);
				2123	}
				2124	spin_unlock(ptl);
				2125
				2126	out_unlock:
				2127	unlock_page(page);
				2128	put_page(page);
				2129	return 0;
				2130	}
				2131	#endif /* CONFIG_NUMA_BALANCING */
				2132
				2133	#endif /* CONFIG_NUMA */
				2134
				2135	#if defined(CONFIG_MIGRATE_VMA_HELPER)
				/*
				 * struct migrate_vma - bookkeeping shared by the migrate_vma_*() helpers
				 * @vma:    virtual memory area whose range is being migrated
				 * @dst:    per-page destination entries, zeroed while the range is collected
				 * @src:    per-page source entries (pfn plus MIGRATE_PFN_* flags)
				 * @cpages: entries still considered migratable; bumped while collecting
				 *          and decremented whenever a page is given up
				 * @npages: number of src/dst slots filled in so far
				 * @start:  first virtual address of the range
				 * @end:    end virtual address of the range
				 */
				struct migrate_vma {
					struct vm_area_struct	*vma;
					unsigned long		*dst;
					unsigned long		*src;
					unsigned long		cpages;
					unsigned long		npages;
					unsigned long		start;
					unsigned long		end;
				};
				2145
				2146	static int migrate_vma_collect_hole(unsigned long start,
				2147	unsigned long end,
				2148	struct mm_walk *walk)
				2149	{
				2150	struct migrate_vma *migrate = walk->private;
				2151	unsigned long addr;
				2152
				2153	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
				2154	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
				2155	migrate->dst[migrate->npages] = 0;
				2156	migrate->npages++;
				2157	migrate->cpages++;
				2158	}
				2159
				2160	return 0;
				2161	}
				2162
				2163	static int migrate_vma_collect_skip(unsigned long start,
				2164	unsigned long end,
				2165	struct mm_walk *walk)
				2166	{
				2167	struct migrate_vma *migrate = walk->private;
				2168	unsigned long addr;
				2169
				2170	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
				2171	migrate->dst[migrate->npages] = 0;
				2172	migrate->src[migrate->npages++] = 0;
				2173	}
				2174
				2175	return 0;
				2176	}
				2177
				2178	static int migrate_vma_collect_pmd(pmd_t *pmdp,
				2179	unsigned long start,
				2180	unsigned long end,
				2181	struct mm_walk *walk)
				2182	{
				2183	struct migrate_vma *migrate = walk->private;
				2184	struct vm_area_struct *vma = walk->vma;
				2185	struct mm_struct *mm = vma->vm_mm;
				2186	unsigned long addr = start, unmapped = 0;
				2187	spinlock_t *ptl;
				2188	pte_t *ptep;
				2189
				2190	again:
				2191	if (pmd_none(*pmdp))
				2192	return migrate_vma_collect_hole(start, end, walk);
				2193
				2194	if (pmd_trans_huge(*pmdp)) {
				2195	struct page *page;
				2196
				2197	ptl = pmd_lock(mm, pmdp);
				2198	if (unlikely(!pmd_trans_huge(*pmdp))) {
				2199	spin_unlock(ptl);
				2200	goto again;
				2201	}
				2202
				2203	page = pmd_page(*pmdp);
				2204	if (is_huge_zero_page(page)) {
				2205	spin_unlock(ptl);
				2206	split_huge_pmd(vma, pmdp, addr);
				2207	if (pmd_trans_unstable(pmdp))
				2208	return migrate_vma_collect_skip(start, end,
				2209	walk);
				2210	} else {
				2211	int ret;
				2212
				2213	get_page(page);
				2214	spin_unlock(ptl);
				2215	if (unlikely(!trylock_page(page)))
				2216	return migrate_vma_collect_skip(start, end,
				2217	walk);
				2218	ret = split_huge_page(page);
				2219	unlock_page(page);
				2220	put_page(page);
				2221	if (ret)
				2222	return migrate_vma_collect_skip(start, end,
				2223	walk);
				2224	if (pmd_none(*pmdp))
				2225	return migrate_vma_collect_hole(start, end,
				2226	walk);
				2227	}
				2228	}
				2229
				2230	if (unlikely(pmd_bad(*pmdp)))
				2231	return migrate_vma_collect_skip(start, end, walk);
				2232
				2233	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				2234	arch_enter_lazy_mmu_mode();
				2235
				2236	for (; addr < end; addr += PAGE_SIZE, ptep++) {
				2237	unsigned long mpfn, pfn;
				2238	struct page *page;
				2239	swp_entry_t entry;
				2240	pte_t pte;
				2241
				2242	pte = *ptep;
				2243	pfn = pte_pfn(pte);
				2244
				2245	if (pte_none(pte)) {
				2246	mpfn = MIGRATE_PFN_MIGRATE;
				2247	migrate->cpages++;
				2248	pfn = 0;
				2249	goto next;
				2250	}
				2251
				2252	if (!pte_present(pte)) {
				2253	mpfn = pfn = 0;
				2254
				2255	/*
				2256	* Only care about unaddressable device page special
				2257	* page table entry. Other special swap entries are not
				2258	* migratable, and we ignore regular swapped page.
				2259	*/
				2260	entry = pte_to_swp_entry(pte);
				2261	if (!is_device_private_entry(entry))
				2262	goto next;
				2263
				2264	page = device_private_entry_to_page(entry);
				2265	mpfn = migrate_pfn(page_to_pfn(page))\|
				2266	MIGRATE_PFN_DEVICE \| MIGRATE_PFN_MIGRATE;
				2267	if (is_write_device_private_entry(entry))
				2268	mpfn \|= MIGRATE_PFN_WRITE;
				2269	} else {
				2270	if (is_zero_pfn(pfn)) {
				2271	mpfn = MIGRATE_PFN_MIGRATE;
				2272	migrate->cpages++;
				2273	pfn = 0;
				2274	goto next;
				2275	}
				2276	page = _vm_normal_page(migrate->vma, addr, pte, true);
				2277	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
				2278	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
				2279	}
				2280
				2281	/* FIXME support THP */
				2282	if (!page \|\| !page->mapping \|\| PageTransCompound(page)) {
				2283	mpfn = pfn = 0;
				2284	goto next;
				2285	}
				2286	pfn = page_to_pfn(page);
				2287
				2288	/*
				2289	* By getting a reference on the page we pin it and that blocks
				2290	* any kind of migration. Side effect is that it "freezes" the
				2291	* pte.
				2292	*
				2293	* We drop this reference after isolating the page from the lru
				2294	* for non device page (device page are not on the lru and thus
				2295	* can't be dropped from it).
				2296	*/
				2297	get_page(page);
				2298	migrate->cpages++;
				2299
				2300	/*
				2301	* Optimize for the common case where page is only mapped once
				2302	* in one process. If we can lock the page, then we can safely
				2303	* set up a special migration page table entry now.
				2304	*/
				2305	if (trylock_page(page)) {
				2306	pte_t swp_pte;
				2307
				2308	mpfn \|= MIGRATE_PFN_LOCKED;
				2309	ptep_get_and_clear(mm, addr, ptep);
				2310
				2311	/* Setup special migration page table entry */
				2312	entry = make_migration_entry(page, mpfn &
				2313	MIGRATE_PFN_WRITE);
				2314	swp_pte = swp_entry_to_pte(entry);
				2315	if (pte_soft_dirty(pte))
				2316	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2317	set_pte_at(mm, addr, ptep, swp_pte);
				2318
				2319	/*
				2320	* This is like regular unmap: we remove the rmap and
				2321	* drop page refcount. Page won't be freed, as we took
				2322	* a reference just above.
				2323	*/
				2324	page_remove_rmap(page, false);
				2325	put_page(page);
				2326
				2327	if (pte_present(pte))
				2328	unmapped++;
				2329	}
				2330
				2331	next:
				2332	migrate->dst[migrate->npages] = 0;
				2333	migrate->src[migrate->npages++] = mpfn;
				2334	}
				2335	arch_leave_lazy_mmu_mode();
				2336	pte_unmap_unlock(ptep - 1, ptl);
				2337
				2338	/* Only flush the TLB if we actually modified any entries */
				2339	if (unmapped)
				2340	flush_tlb_range(walk->vma, start, end);
				2341
				2342	return 0;
				2343	}
				2344
				2345	/*
				2346	* migrate_vma_collect() - collect pages over a range of virtual addresses
				2347	* @migrate: migrate struct containing all migration information
				2348	*
				2349	* This will walk the CPU page table. For each virtual address backed by a
				2350	* valid page, it updates the src array and takes a reference on the page, in
				2351	* order to pin the page until we lock it and unmap it.
				2352	*/
				2353	static void migrate_vma_collect(struct migrate_vma *migrate)
				2354	{
				2355	struct mm_walk mm_walk = {
				2356	.pmd_entry = migrate_vma_collect_pmd,
				2357	.pte_hole = migrate_vma_collect_hole,
				2358	.vma = migrate->vma,
				2359	.mm = migrate->vma->vm_mm,
				2360	.private = migrate,
				2361	};
				2362
				2363	mmu_notifier_invalidate_range_start(mm_walk.mm,
				2364	migrate->start,
				2365	migrate->end);
				2366	walk_page_range(migrate->start, migrate->end, &mm_walk);
				2367	mmu_notifier_invalidate_range_end(mm_walk.mm,
				2368	migrate->start,
				2369	migrate->end);
				2370
				2371	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
				2372	}
				2373
				2374	/*
				2375	* migrate_vma_check_page() - check if page is pinned or not
				2376	* @page: struct page to check
				2377	*
				2378	* Pinned pages cannot be migrated. This is the same test as in
				2379	* migrate_page_move_mapping(), except that here we allow migration of a
				2380	* ZONE_DEVICE page.
				2381	*/
				2382	static bool migrate_vma_check_page(struct page *page)
				2383	{
				2384	/*
				2385	* One extra ref because caller holds an extra reference, either from
				2386	* isolate_lru_page() for a regular page, or migrate_vma_collect() for
				2387	* a device page.
				2388	*/
				2389	int extra = 1;
				2390
				2391	/*
				2392	* FIXME support THP (transparent huge page), it is bit more complex to
				2393	* check them than regular pages, because they can be mapped with a pmd
				2394	* or with a pte (split pte mapping).
				2395	*/
				2396	if (PageCompound(page))
				2397	return false;
				2398
				2399	/* Page from ZONE_DEVICE have one extra reference */
				2400	if (is_zone_device_page(page)) {
				2401	/*
				2402	* Private page can never be pin as they have no valid pte and
				2403	* GUP will fail for those. Yet if there is a pending migration
				2404	* a thread might try to wait on the pte migration entry and
				2405	* will bump the page reference count. Sadly there is no way to
				2406	* differentiate a regular pin from migration wait. Hence to
				2407	* avoid 2 racing thread trying to migrate back to CPU to enter
				2408	* infinite loop (one stoping migration because the other is
				2409	* waiting on pte migration entry). We always return true here.
				2410	*
				2411	* FIXME proper solution is to rework migration_entry_wait() so
				2412	* it does not need to take a reference on page.
				2413	*/
				2414	if (is_device_private_page(page))
				2415	return true;
				2416
				2417	/*
				2418	* Only allow device public page to be migrated and account for
				2419	* the extra reference count imply by ZONE_DEVICE pages.
				2420	*/
				2421	if (!is_device_public_page(page))
				2422	return false;
				2423	extra++;
				2424	}
				2425
				2426	/* For file back page */
				2427	if (page_mapping(page))
				2428	extra += 1 + page_has_private(page);
				2429
				2430	if ((page_count(page) - extra) > page_mapcount(page))
				2431	return false;
				2432
				2433	return true;
				2434	}
				2435
				2436	/*
				2437	* migrate_vma_prepare() - lock pages and isolate them from the lru
				2438	* @migrate: migrate struct containing all migration information
				2439	*
				2440	* This locks pages that have been collected by migrate_vma_collect(). Once each
				2441	* page is locked it is isolated from the lru (for non-device pages). Finally,
				2442	* the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
				2443	* migrated by concurrent kernel threads.
				2444	*/
				2445	static void migrate_vma_prepare(struct migrate_vma *migrate)
				2446	{
				2447	const unsigned long npages = migrate->npages;
				2448	const unsigned long start = migrate->start;
				2449	unsigned long addr, i, restore = 0;
				2450	bool allow_drain = true;
				2451
				2452	lru_add_drain();
				2453
				2454	for (i = 0; (i < npages) && migrate->cpages; i++) {
				2455	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2456	bool remap = true;
				2457
				2458	if (!page)
				2459	continue;
				2460
				2461	if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
				2462	/*
				2463	* Because we are migrating several pages there can be
				2464	* a deadlock between 2 concurrent migration where each
				2465	* are waiting on each other page lock.
				2466	*
				2467	* Make migrate_vma() a best effort thing and backoff
				2468	* for any page we can not lock right away.
				2469	*/
				2470	if (!trylock_page(page)) {
				2471	migrate->src[i] = 0;
				2472	migrate->cpages--;
				2473	put_page(page);
				2474	continue;
				2475	}
				2476	remap = false;
				2477	migrate->src[i] \|= MIGRATE_PFN_LOCKED;
				2478	}
				2479
				2480	/* ZONE_DEVICE pages are not on LRU */
				2481	if (!is_zone_device_page(page)) {
				2482	if (!PageLRU(page) && allow_drain) {
				2483	/* Drain CPU's pagevec */
				2484	lru_add_drain_all();
				2485	allow_drain = false;
				2486	}
				2487
				2488	if (isolate_lru_page(page)) {
				2489	if (remap) {
				2490	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2491	migrate->cpages--;
				2492	restore++;
				2493	} else {
				2494	migrate->src[i] = 0;
				2495	unlock_page(page);
				2496	migrate->cpages--;
				2497	put_page(page);
				2498	}
				2499	continue;
				2500	}
				2501
				2502	/* Drop the reference we took in collect */
				2503	put_page(page);
				2504	}
				2505
				2506	if (!migrate_vma_check_page(page)) {
				2507	if (remap) {
				2508	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2509	migrate->cpages--;
				2510	restore++;
				2511
				2512	if (!is_zone_device_page(page)) {
				2513	get_page(page);
				2514	putback_lru_page(page);
				2515	}
				2516	} else {
				2517	migrate->src[i] = 0;
				2518	unlock_page(page);
				2519	migrate->cpages--;
				2520
				2521	if (!is_zone_device_page(page))
				2522	putback_lru_page(page);
				2523	else
				2524	put_page(page);
				2525	}
				2526	}
				2527	}
				2528
				2529	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
				2530	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2531
				2532	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2533	continue;
				2534
				2535	remove_migration_pte(page, migrate->vma, addr, page);
				2536
				2537	migrate->src[i] = 0;
				2538	unlock_page(page);
				2539	put_page(page);
				2540	restore--;
				2541	}
				2542	}
				2543
				2544	/*
				2545	* migrate_vma_unmap() - replace page mapping with special migration pte entry
				2546	* @migrate: migrate struct containing all migration information
				2547	*
				2548	* Replace page mapping (CPU page table pte) with a special migration pte entry
				2549	* and check again if it has been pinned. Pinned pages are restored because we
				2550	* cannot migrate them.
				2551	*
				2552	* This is the last step before we call the device driver callback to allocate
				2553	* destination memory and copy contents of original page over to new page.
				2554	*/
				2555	static void migrate_vma_unmap(struct migrate_vma *migrate)
				2556	{
				2557	int flags = TTU_MIGRATION \| TTU_IGNORE_MLOCK \| TTU_IGNORE_ACCESS;
				2558	const unsigned long npages = migrate->npages;
				2559	const unsigned long start = migrate->start;
				2560	unsigned long addr, i, restore = 0;
				2561
				2562	for (i = 0; i < npages; i++) {
				2563	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2564
				2565	if (!page \|\| !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2566	continue;
				2567
				2568	if (page_mapped(page)) {
				2569	try_to_unmap(page, flags);
				2570	if (page_mapped(page))
				2571	goto restore;
				2572	}
				2573
				2574	if (migrate_vma_check_page(page))
				2575	continue;
				2576
				2577	restore:
				2578	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2579	migrate->cpages--;
				2580	restore++;
				2581	}
				2582
				2583	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
				2584	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2585
				2586	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2587	continue;
				2588
				2589	remove_migration_ptes(page, page, false);
				2590
				2591	migrate->src[i] = 0;
				2592	unlock_page(page);
				2593	restore--;
				2594
				2595	if (is_zone_device_page(page))
				2596	put_page(page);
				2597	else
				2598	putback_lru_page(page);
				2599	}
				2600	}
				2601
				2602	static void migrate_vma_insert_page(struct migrate_vma *migrate,
				2603	unsigned long addr,
				2604	struct page *page,
				2605	unsigned long *src,
				2606	unsigned long *dst)
				2607	{
				2608	struct vm_area_struct *vma = migrate->vma;
				2609	struct mm_struct *mm = vma->vm_mm;
				2610	struct mem_cgroup *memcg;
				2611	bool flush = false;
				2612	spinlock_t *ptl;
				2613	pte_t entry;
				2614	pgd_t *pgdp;
				2615	p4d_t *p4dp;
				2616	pud_t *pudp;
				2617	pmd_t *pmdp;
				2618	pte_t *ptep;
				2619
				2620	/* Only allow populating anonymous memory */
				2621	if (!vma_is_anonymous(vma))
				2622	goto abort;
				2623
				2624	pgdp = pgd_offset(mm, addr);
				2625	p4dp = p4d_alloc(mm, pgdp, addr);
				2626	if (!p4dp)
				2627	goto abort;
				2628	pudp = pud_alloc(mm, p4dp, addr);
				2629	if (!pudp)
				2630	goto abort;
				2631	pmdp = pmd_alloc(mm, pudp, addr);
				2632	if (!pmdp)
				2633	goto abort;
				2634
				2635	if (pmd_trans_huge(pmdp) \|\| pmd_devmap(pmdp))
				2636	goto abort;
				2637
				2638	/*
				2639	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				2640	* pte_offset_map() on pmds where a huge pmd might be created
				2641	* from a different thread.
				2642	*
				2643	* pte_alloc_map() is safe to use under down_write(mmap_sem) or when
				2644	* parallel threads are excluded by other means.
				2645	*
				2646	* Here we only have down_read(mmap_sem).
				2647	*/
				2648	if (pte_alloc(mm, pmdp, addr))
				2649	goto abort;
				2650
				2651	/* See the comment in pte_alloc_one_map() */
				2652	if (unlikely(pmd_trans_unstable(pmdp)))
				2653	goto abort;
				2654
				2655	if (unlikely(anon_vma_prepare(vma)))
				2656	goto abort;
				2657	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
				2658	goto abort;
				2659
				2660	/*
				2661	* The memory barrier inside __SetPageUptodate makes sure that
				2662	* preceding stores to the page contents become visible before
				2663	* the set_pte_at() write.
				2664	*/
				2665	__SetPageUptodate(page);
				2666
				2667	if (is_zone_device_page(page)) {
				2668	if (is_device_private_page(page)) {
				2669	swp_entry_t swp_entry;
				2670
				2671	swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
				2672	entry = swp_entry_to_pte(swp_entry);
				2673	} else if (is_device_public_page(page)) {
				2674	entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
				2675	if (vma->vm_flags & VM_WRITE)
				2676	entry = pte_mkwrite(pte_mkdirty(entry));
				2677	entry = pte_mkdevmap(entry);
				2678	}
				2679	} else {
				2680	entry = mk_pte(page, vma->vm_page_prot);
				2681	if (vma->vm_flags & VM_WRITE)
				2682	entry = pte_mkwrite(pte_mkdirty(entry));
				2683	}
				2684
				2685	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				2686
				2687	if (pte_present(*ptep)) {
				2688	unsigned long pfn = pte_pfn(*ptep);
				2689
				2690	if (!is_zero_pfn(pfn)) {
				2691	pte_unmap_unlock(ptep, ptl);
				2692	mem_cgroup_cancel_charge(page, memcg, false);
				2693	goto abort;
				2694	}
				2695	flush = true;
				2696	} else if (!pte_none(*ptep)) {
				2697	pte_unmap_unlock(ptep, ptl);
				2698	mem_cgroup_cancel_charge(page, memcg, false);
				2699	goto abort;
				2700	}
				2701
				2702	/*
				2703	* Check for usefaultfd but do not deliver the fault. Instead,
				2704	* just back off.
				2705	*/
				2706	if (userfaultfd_missing(vma)) {
				2707	pte_unmap_unlock(ptep, ptl);
				2708	mem_cgroup_cancel_charge(page, memcg, false);
				2709	goto abort;
				2710	}
				2711
				2712	inc_mm_counter(mm, MM_ANONPAGES);
				2713	page_add_new_anon_rmap(page, vma, addr, false);
				2714	mem_cgroup_commit_charge(page, memcg, false, false);
				2715	if (!is_zone_device_page(page))
				2716	lru_cache_add_active_or_unevictable(page, vma);
				2717	get_page(page);
				2718
				2719	if (flush) {
				2720	flush_cache_page(vma, addr, pte_pfn(*ptep));
				2721	ptep_clear_flush_notify(vma, addr, ptep);
				2722	set_pte_at_notify(mm, addr, ptep, entry);
				2723	update_mmu_cache(vma, addr, ptep);
				2724	} else {
				2725	/* No need to invalidate - it was non-present before */
				2726	set_pte_at(mm, addr, ptep, entry);
				2727	update_mmu_cache(vma, addr, ptep);
				2728	}
				2729
				2730	pte_unmap_unlock(ptep, ptl);
				2731	*src = MIGRATE_PFN_MIGRATE;
				2732	return;
				2733
				2734	abort:
				2735	*src &= ~MIGRATE_PFN_MIGRATE;
				2736	}
				2737
				2738	/*
				2739	* migrate_vma_pages() - migrate meta-data from src page to dst page
				2740	* @migrate: migrate struct containing all migration information
				2741	*
				2742	* This migrates struct page meta-data from source struct page to destination
				2743	* struct page. This effectively finishes the migration from source page to the
				2744	* destination page.
				2745	*/
				2746	static void migrate_vma_pages(struct migrate_vma *migrate)
				2747	{
				2748	const unsigned long npages = migrate->npages;
				2749	const unsigned long start = migrate->start;
				2750	struct vm_area_struct *vma = migrate->vma;
				2751	struct mm_struct *mm = vma->vm_mm;
				2752	unsigned long addr, i, mmu_start;
				2753	bool notified = false;
				2754
				2755	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
				2756	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2757	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2758	struct address_space *mapping;
				2759	int r;
				2760
				2761	if (!newpage) {
				2762	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2763	continue;
				2764	}
				2765
				2766	if (!page) {
				2767	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
				2768	continue;
				2769	}
				2770	if (!notified) {
				2771	mmu_start = addr;
				2772	notified = true;
				2773	mmu_notifier_invalidate_range_start(mm,
				2774	mmu_start,
				2775	migrate->end);
				2776	}
				2777	migrate_vma_insert_page(migrate, addr, newpage,
				2778	&migrate->src[i],
				2779	&migrate->dst[i]);
				2780	continue;
				2781	}
				2782
				2783	mapping = page_mapping(page);
				2784
				2785	if (is_zone_device_page(newpage)) {
				2786	if (is_device_private_page(newpage)) {
				2787	/*
				2788	* For now only support private anonymous when
				2789	* migrating to un-addressable device memory.
				2790	*/
				2791	if (mapping) {
				2792	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2793	continue;
				2794	}
				2795	} else if (!is_device_public_page(newpage)) {
				2796	/*
				2797	* Other types of ZONE_DEVICE page are not
				2798	* supported.
				2799	*/
				2800	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2801	continue;
				2802	}
				2803	}
				2804
				2805	r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
				2806	if (r != MIGRATEPAGE_SUCCESS)
				2807	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2808	}
				2809
				2810	/*
				2811	* No need to double call mmu_notifier->invalidate_range() callback as
				2812	* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
				2813	* did already call it.
				2814	*/
				2815	if (notified)
				2816	mmu_notifier_invalidate_range_only_end(mm, mmu_start,
				2817	migrate->end);
				2818	}
				2819
				2820	/*
				2821	* migrate_vma_finalize() - restore CPU page table entry
				2822	* @migrate: migrate struct containing all migration information
				2823	*
				2824	* This replaces the special migration pte entry with either a mapping to the
				2825	* new page if migration was successful for that page, or to the original page
				2826	* otherwise.
				2827	*
				2828	* This also unlocks the pages and puts them back on the lru, or drops the extra
				2829	* refcount, for device pages.
				2830	*/
				2831	static void migrate_vma_finalize(struct migrate_vma *migrate)
				2832	{
				2833	const unsigned long npages = migrate->npages;
				2834	unsigned long i;
				2835
				2836	for (i = 0; i < npages; i++) {
				2837	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2838	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2839
				2840	if (!page) {
				2841	if (newpage) {
				2842	unlock_page(newpage);
				2843	put_page(newpage);
				2844	}
				2845	continue;
				2846	}
				2847
				2848	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) \|\| !newpage) {
				2849	if (newpage) {
				2850	unlock_page(newpage);
				2851	put_page(newpage);
				2852	}
				2853	newpage = page;
				2854	}
				2855
				2856	remove_migration_ptes(page, newpage, false);
				2857	unlock_page(page);
				2858	migrate->cpages--;
				2859
				2860	if (is_zone_device_page(page))
				2861	put_page(page);
				2862	else
				2863	putback_lru_page(page);
				2864
				2865	if (newpage != page) {
				2866	unlock_page(newpage);
				2867	if (is_zone_device_page(newpage))
				2868	put_page(newpage);
				2869	else
				2870	putback_lru_page(newpage);
				2871	}
				2872	}
				2873	}
				2874
				2875	/*
				2876	* migrate_vma() - migrate a range of memory inside vma
				2877	*
				2878	* @ops: migration callback for allocating destination memory and copying
				2879	* @vma: virtual memory area containing the range to be migrated
				2880	* @start: start address of the range to migrate (inclusive)
				2881	* @end: end address of the range to migrate (exclusive)
				2882	* @src: array of hmm_pfn_t containing source pfns
				2883	* @dst: array of hmm_pfn_t containing destination pfns
				2884	* @private: pointer passed back to each of the callback
				2885	* Returns: 0 on success, error code otherwise
				2886	*
				2887	* This function tries to migrate a range of memory virtual address range, using
				2888	* callbacks to allocate and copy memory from source to destination. First it
				2889	* collects all the pages backing each virtual address in the range, saving this
				2890	* inside the src array. Then it locks those pages and unmaps them. Once the pages
				2891	* are locked and unmapped, it checks whether each page is pinned or not. Pages
				2892	* that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
				2893	* in the corresponding src array entry. It then restores any pages that are
				2894	* pinned, by remapping and unlocking those pages.
				2895	*
				2896	* At this point it calls the alloc_and_copy() callback. For documentation on
				2897	* what is expected from that callback, see struct migrate_vma_ops comments in
				2898	* include/linux/migrate.h
				2899	*
				2900	* After the alloc_and_copy() callback, this function goes over each entry in
				2901	* the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
				2902	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
				2903	* then the function tries to migrate struct page information from the source
				2904	* struct page to the destination struct page. If it fails to migrate the struct
				2905	* page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
				2906	* array.
				2907	*
				2908	* At this point all successfully migrated pages have an entry in the src
				2909	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
				2910	* array entry with MIGRATE_PFN_VALID flag set.
				2911	*
				2912	* It then calls the finalize_and_map() callback. See comments for "struct
				2913	* migrate_vma_ops", in include/linux/migrate.h for details about
				2914	* finalize_and_map() behavior.
				2915	*
				2916	* After the finalize_and_map() callback, for successfully migrated pages, this
				2917	* function updates the CPU page table to point to new pages, otherwise it
				2918	* restores the CPU page table to point to the original source pages.
				2919	*
				2920	* Function returns 0 after the above steps, even if no pages were migrated
				2921	* (The function only returns an error if any of the arguments are invalid.)
				2922	*
				2923	* Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
				2924	* unsigned long entries.
				2925	*/
				2926	int migrate_vma(const struct migrate_vma_ops *ops,
				2927	struct vm_area_struct *vma,
				2928	unsigned long start,
				2929	unsigned long end,
				2930	unsigned long *src,
				2931	unsigned long *dst,
				2932	void *private)
				2933	{
				2934	struct migrate_vma migrate;
				2935
				2936	/* Sanity check the arguments */
				2937	start &= PAGE_MASK;
				2938	end &= PAGE_MASK;
				2939	if (!vma \|\| is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_SPECIAL) \|\|
				2940	vma_is_dax(vma))
				2941	return -EINVAL;
				2942	if (start < vma->vm_start \|\| start >= vma->vm_end)
				2943	return -EINVAL;
				2944	if (end <= vma->vm_start \|\| end > vma->vm_end)
				2945	return -EINVAL;
				2946	if (!ops \|\| !src \|\| !dst \|\| start >= end)
				2947	return -EINVAL;
				2948
				2949	memset(src, 0, sizeof(src) ((end - start) >> PAGE_SHIFT));
				2950	migrate.src = src;
				2951	migrate.dst = dst;
				2952	migrate.start = start;
				2953	migrate.npages = 0;
				2954	migrate.cpages = 0;
				2955	migrate.end = end;
				2956	migrate.vma = vma;
				2957
				2958	/* Collect, and try to unmap source pages */
				2959	migrate_vma_collect(&migrate);
				2960	if (!migrate.cpages)
				2961	return 0;
				2962
				2963	/* Lock and isolate page */
				2964	migrate_vma_prepare(&migrate);
				2965	if (!migrate.cpages)
				2966	return 0;
				2967
				2968	/* Unmap pages */
				2969	migrate_vma_unmap(&migrate);
				2970	if (!migrate.cpages)
				2971	return 0;
				2972
				2973	/*
				2974	* At this point pages are locked and unmapped, and thus they have
				2975	* stable content and can safely be copied to destination memory that
				2976	* is allocated by the callback.
				2977	*
				2978	* Note that migration can fail in migrate_vma_struct_page() for each
				2979	* individual page.
				2980	*/
				2981	ops->alloc_and_copy(vma, src, dst, start, end, private);
				2982
				2983	/* This does the real migration of struct page */
				2984	migrate_vma_pages(&migrate);
				2985
				2986	ops->finalize_and_map(vma, src, dst, start, end, private);
				2987
				2988	/* Unlock and remap pages */
				2989	migrate_vma_finalize(&migrate);
				2990
				2991	return 0;
				2992	}
				2993	EXPORT_SYMBOL(migrate_vma);
				2994	#endif /* defined(MIGRATE_VMA_HELPER) */