Blame - ap/os/linux/linux-3.4.x/mm/migrate.c - T106_DC

blob: 07b67610978ce7df3aaabd74561f17758ff84ed6 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* Memory Migration functionality - linux/mm/migration.c
				3	*
				4	* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
				5	*
				6	* Page migration was first developed in the context of the memory hotplug
				7	* project. The main authors of the migration code are:
				8	*
				9	* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
				10	* Hirokazu Takahashi <taka@valinux.co.jp>
				11	* Dave Hansen <haveblue@us.ibm.com>
				12	* Christoph Lameter
				13	*/
				14
				15	#include <linux/migrate.h>
				16	#include <linux/export.h>
				17	#include <linux/swap.h>
				18	#include <linux/swapops.h>
				19	#include <linux/pagemap.h>
				20	#include <linux/buffer_head.h>
				21	#include <linux/mm_inline.h>
				22	#include <linux/nsproxy.h>
				23	#include <linux/pagevec.h>
				24	#include <linux/ksm.h>
				25	#include <linux/rmap.h>
				26	#include <linux/topology.h>
				27	#include <linux/cpu.h>
				28	#include <linux/cpuset.h>
				29	#include <linux/writeback.h>
				30	#include <linux/mempolicy.h>
				31	#include <linux/vmalloc.h>
				32	#include <linux/security.h>
				33	#include <linux/memcontrol.h>
				34	#include <linux/syscalls.h>
				35	#include <linux/hugetlb.h>
				36	#include <linux/gfp.h>
				37
				38	#include <asm/tlbflush.h>
				39
				40	#include "internal.h"
				41
				42	/*
				43	* migrate_prep() needs to be called before we start compiling a list of pages
				44	* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
				45	* undesirable, use migrate_prep_local()
				46	*/
				47	int migrate_prep(void)
				48	{
				49	/*
				50	* Clear the LRU lists so pages can be isolated.
				51	* Note that pages may be moved off the LRU after we have
				52	* drained them. Those pages will fail to migrate like other
				53	* pages that may be busy.
				54	*/
				55	lru_add_drain_all();
				56
				57	return 0;
				58	}
				59
				60	/* Do the necessary work of migrate_prep but not if it involves other CPUs */
				61	int migrate_prep_local(void)
				62	{
				63	lru_add_drain();
				64
				65	return 0;
				66	}
				67
				68	/*
				69	* Add isolated pages on the list back to the LRU under page lock
				70	* to avoid leaking evictable pages back onto unevictable list.
				71	*/
				72	void putback_lru_pages(struct list_head *l)
				73	{
				74	struct page *page;
				75	struct page *page2;
				76
				77	list_for_each_entry_safe(page, page2, l, lru) {
				78	list_del(&page->lru);
				79	dec_zone_page_state(page, NR_ISOLATED_ANON +
				80	page_is_file_cache(page));
				81	putback_lru_page(page);
				82	}
				83	}
				84
				85	/*
				86	* Restore a potential migration pte to a working pte entry
				87	*/
				88	static int remove_migration_pte(struct page new, struct vm_area_struct vma,
				89	unsigned long addr, void *old)
				90	{
				91	struct mm_struct *mm = vma->vm_mm;
				92	swp_entry_t entry;
				93	pgd_t *pgd;
				94	pud_t *pud;
				95	pmd_t *pmd;
				96	pte_t *ptep, pte;
				97	spinlock_t *ptl;
				98
				99	if (unlikely(PageHuge(new))) {
				100	ptep = huge_pte_offset(mm, addr);
				101	if (!ptep)
				102	goto out;
				103	ptl = &mm->page_table_lock;
				104	} else {
				105	pgd = pgd_offset(mm, addr);
				106	if (!pgd_present(*pgd))
				107	goto out;
				108
				109	pud = pud_offset(pgd, addr);
				110	if (!pud_present(*pud))
				111	goto out;
				112
				113	pmd = pmd_offset(pud, addr);
				114	if (pmd_trans_huge(*pmd))
				115	goto out;
				116	if (!pmd_present(*pmd))
				117	goto out;
				118
				119	ptep = pte_offset_map(pmd, addr);
				120
				121	/*
				122	* Peek to check is_swap_pte() before taking ptlock? No, we
				123	* can race mremap's move_ptes(), which skips anon_vma lock.
				124	*/
				125
				126	ptl = pte_lockptr(mm, pmd);
				127	}
				128
				129	spin_lock(ptl);
				130	pte = *ptep;
				131	if (!is_swap_pte(pte))
				132	goto unlock;
				133
				134	entry = pte_to_swp_entry(pte);
				135
				136	if (!is_migration_entry(entry) \|\|
				137	migration_entry_to_page(entry) != old)
				138	goto unlock;
				139
				140	get_page(new);
				141	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
				142
				143	/* Recheck VMA as permissions can change since migration started */
				144	if (is_write_migration_entry(entry))
				145	pte = maybe_mkwrite(pte, vma);
				146
				147	#ifdef CONFIG_HUGETLB_PAGE
				148	if (PageHuge(new))
				149	pte = pte_mkhuge(pte);
				150	#endif
				151	flush_dcache_page(new);
				152	set_pte_at(mm, addr, ptep, pte);
				153
				154	if (PageHuge(new)) {
				155	if (PageAnon(new))
				156	hugepage_add_anon_rmap(new, vma, addr);
				157	else
				158	page_dup_rmap(new);
				159	} else if (PageAnon(new))
				160	page_add_anon_rmap(new, vma, addr);
				161	else
				162	page_add_file_rmap(new);
				163
				164	/* No need to invalidate - it was non-present before */
				165	update_mmu_cache(vma, addr, ptep);
				166	unlock:
				167	pte_unmap_unlock(ptep, ptl);
				168	out:
				169	return SWAP_AGAIN;
				170	}
				171
				172	/*
				173	* Get rid of all migration entries and replace them by
				174	* references to the indicated page.
				175	*/
				176	static void remove_migration_ptes(struct page old, struct page new)
				177	{
				178	rmap_walk(new, remove_migration_pte, old);
				179	}
				180
				181	/*
				182	* Something used the pte of a page under migration. We need to
				183	* get to the page and wait until migration is finished.
				184	* When we return from this function the fault will be retried.
				185	*/
				186	static void __migration_entry_wait(struct mm_struct mm, pte_t ptep,
				187	spinlock_t *ptl)
				188	{
				189	pte_t pte;
				190	swp_entry_t entry;
				191	struct page *page;
				192
				193	spin_lock(ptl);
				194	pte = *ptep;
				195	if (!is_swap_pte(pte))
				196	goto out;
				197
				198	entry = pte_to_swp_entry(pte);
				199	if (!is_migration_entry(entry))
				200	goto out;
				201
				202	page = migration_entry_to_page(entry);
				203
				204	/*
				205	* Once radix-tree replacement of page migration started, page_count
				206	* must be zero. And, we don't want to call wait_on_page_locked()
				207	* against a page without get_page().
				208	* So, we use get_page_unless_zero(), here. Even failed, page fault
				209	* will occur again.
				210	*/
				211	if (!get_page_unless_zero(page))
				212	goto out;
				213	pte_unmap_unlock(ptep, ptl);
				214	wait_on_page_locked(page);
				215	put_page(page);
				216	return;
				217	out:
				218	pte_unmap_unlock(ptep, ptl);
				219	}
				220
				221	void migration_entry_wait(struct mm_struct mm, pmd_t pmd,
				222	unsigned long address)
				223	{
				224	spinlock_t *ptl = pte_lockptr(mm, pmd);
				225	pte_t *ptep = pte_offset_map(pmd, address);
				226	__migration_entry_wait(mm, ptep, ptl);
				227	}
				228
				229	void migration_entry_wait_huge(struct mm_struct mm, pte_t pte)
				230	{
				231	spinlock_t *ptl = &(mm)->page_table_lock;
				232	__migration_entry_wait(mm, pte, ptl);
				233	}
				234
				235	#ifdef CONFIG_BLOCK
				236	/* Returns true if all buffers are successfully locked */
				237	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
				238	enum migrate_mode mode)
				239	{
				240	struct buffer_head *bh = head;
				241
				242	/* Simple case, sync compaction */
				243	if (mode != MIGRATE_ASYNC) {
				244	do {
				245	get_bh(bh);
				246	lock_buffer(bh);
				247	bh = bh->b_this_page;
				248
				249	} while (bh != head);
				250
				251	return true;
				252	}
				253
				254	/* async case, we cannot block on lock_buffer so use trylock_buffer */
				255	do {
				256	get_bh(bh);
				257	if (!trylock_buffer(bh)) {
				258	/*
				259	* We failed to lock the buffer and cannot stall in
				260	* async migration. Release the taken locks
				261	*/
				262	struct buffer_head *failed_bh = bh;
				263	put_bh(failed_bh);
				264	bh = head;
				265	while (bh != failed_bh) {
				266	unlock_buffer(bh);
				267	put_bh(bh);
				268	bh = bh->b_this_page;
				269	}
				270	return false;
				271	}
				272
				273	bh = bh->b_this_page;
				274	} while (bh != head);
				275	return true;
				276	}
				277	#else
				278	static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
				279	enum migrate_mode mode)
				280	{
				281	return true;
				282	}
				283	#endif /* CONFIG_BLOCK */
				284
				285	/*
				286	* Replace the page in the mapping.
				287	*
				288	* The number of remaining references must be:
				289	* 1 for anonymous pages without a mapping
				290	* 2 for pages with a mapping
				291	* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
				292	*/
				293	static int migrate_page_move_mapping(struct address_space *mapping,
				294	struct page newpage, struct page page,
				295	struct buffer_head *head, enum migrate_mode mode)
				296	{
				297	int expected_count;
				298	void **pslot;
				299
				300	if (!mapping) {
				301	/* Anonymous page without mapping */
				302	if (page_count(page) != 1)
				303	return -EAGAIN;
				304	return 0;
				305	}
				306
				307	spin_lock_irq(&mapping->tree_lock);
				308
				309	pslot = radix_tree_lookup_slot(&mapping->page_tree,
				310	page_index(page));
				311
				312	expected_count = 2 + page_has_private(page);
				313	if (page_count(page) != expected_count \|\|
				314	radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
				315	spin_unlock_irq(&mapping->tree_lock);
				316	return -EAGAIN;
				317	}
				318
				319	if (!page_freeze_refs(page, expected_count)) {
				320	spin_unlock_irq(&mapping->tree_lock);
				321	return -EAGAIN;
				322	}
				323
				324	/*
				325	* In the async migration case of moving a page with buffers, lock the
				326	* buffers using trylock before the mapping is moved. If the mapping
				327	* was moved, we later failed to lock the buffers and could not move
				328	* the mapping back due to an elevated page count, we would have to
				329	* block waiting on other references to be dropped.
				330	*/
				331	if (mode == MIGRATE_ASYNC && head &&
				332	!buffer_migrate_lock_buffers(head, mode)) {
				333	page_unfreeze_refs(page, expected_count);
				334	spin_unlock_irq(&mapping->tree_lock);
				335	return -EAGAIN;
				336	}
				337
				338	/*
				339	* Now we know that no one else is looking at the page.
				340	*/
				341	get_page(newpage); /* add cache reference */
				342	if (PageSwapCache(page)) {
				343	SetPageSwapCache(newpage);
				344	set_page_private(newpage, page_private(page));
				345	}
				346
				347	radix_tree_replace_slot(pslot, newpage);
				348
				349	/*
				350	* Drop cache reference from old page by unfreezing
				351	* to one less reference.
				352	* We know this isn't the last reference.
				353	*/
				354	page_unfreeze_refs(page, expected_count - 1);
				355
				356	/*
				357	* If moved to a different zone then also account
				358	* the page for that zone. Other VM counters will be
				359	* taken care of when we establish references to the
				360	* new page and drop references to the old page.
				361	*
				362	* Note that anonymous pages are accounted for
				363	* via NR_FILE_PAGES and NR_ANON_PAGES if they
				364	* are mapped to swap space.
				365	*/
				366	__dec_zone_page_state(page, NR_FILE_PAGES);
				367	__inc_zone_page_state(newpage, NR_FILE_PAGES);
				368	if (!PageSwapCache(page) && PageSwapBacked(page)) {
				369	__dec_zone_page_state(page, NR_SHMEM);
				370	__inc_zone_page_state(newpage, NR_SHMEM);
				371	}
				372	#ifdef CONFIG_LIMIT_PAGE_CACHE
				373	if(mapping_gfp_mask(mapping) & __GFP_PAGERAMFS) {
				374	__dec_zone_page_state(page, NR_RAMFS_PAGES);
				375	__inc_zone_page_state(newpage, NR_RAMFS_PAGES);
				376	}
				377	else if(mapping_gfp_mask(mapping) & __GFP_PAGETMPFS) {
				378	__dec_zone_page_state(page, NR_TMPFS_PAGES);
				379	__inc_zone_page_state(newpage, NR_TMPFS_PAGES);
				380	}
				381
				382	#endif
				383	spin_unlock_irq(&mapping->tree_lock);
				384
				385	return 0;
				386	}
				387
				388	/*
				389	* The expected number of remaining references is the same as that
				390	* of migrate_page_move_mapping().
				391	*/
				392	int migrate_huge_page_move_mapping(struct address_space *mapping,
				393	struct page newpage, struct page page)
				394	{
				395	int expected_count;
				396	void **pslot;
				397
				398	if (!mapping) {
				399	if (page_count(page) != 1)
				400	return -EAGAIN;
				401	return 0;
				402	}
				403
				404	spin_lock_irq(&mapping->tree_lock);
				405
				406	pslot = radix_tree_lookup_slot(&mapping->page_tree,
				407	page_index(page));
				408
				409	expected_count = 2 + page_has_private(page);
				410	if (page_count(page) != expected_count \|\|
				411	radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
				412	spin_unlock_irq(&mapping->tree_lock);
				413	return -EAGAIN;
				414	}
				415
				416	if (!page_freeze_refs(page, expected_count)) {
				417	spin_unlock_irq(&mapping->tree_lock);
				418	return -EAGAIN;
				419	}
				420
				421	get_page(newpage);
				422
				423	radix_tree_replace_slot(pslot, newpage);
				424
				425	page_unfreeze_refs(page, expected_count - 1);
				426
				427	spin_unlock_irq(&mapping->tree_lock);
				428	return 0;
				429	}
				430
				431	/*
				432	* Copy the page to its new location
				433	*/
				434	void migrate_page_copy(struct page newpage, struct page page)
				435	{
				436	if (PageHuge(page))
				437	copy_huge_page(newpage, page);
				438	else
				439	copy_highpage(newpage, page);
				440
				441	if (PageError(page))
				442	SetPageError(newpage);
				443	if (PageReferenced(page))
				444	SetPageReferenced(newpage);
				445	if (PageUptodate(page))
				446	SetPageUptodate(newpage);
				447	if (TestClearPageActive(page)) {
				448	VM_BUG_ON(PageUnevictable(page));
				449	SetPageActive(newpage);
				450	} else if (TestClearPageUnevictable(page))
				451	SetPageUnevictable(newpage);
				452	if (PageChecked(page))
				453	SetPageChecked(newpage);
				454	if (PageMappedToDisk(page))
				455	SetPageMappedToDisk(newpage);
				456
				457	if (PageDirty(page)) {
				458	clear_page_dirty_for_io(page);
				459	/*
				460	* Want to mark the page and the radix tree as dirty, and
				461	* redo the accounting that clear_page_dirty_for_io undid,
				462	* but we can't use set_page_dirty because that function
				463	* is actually a signal that all of the page has become dirty.
				464	* Whereas only part of our page may be dirty.
				465	*/
				466	__set_page_dirty_nobuffers(newpage);
				467	}
				468
				469	mlock_migrate_page(newpage, page);
				470	ksm_migrate_page(newpage, page);
				471
				472	ClearPageSwapCache(page);
				473	ClearPagePrivate(page);
				474	set_page_private(page, 0);
				475
				476	/*
				477	* If any waiters have accumulated on the new page then
				478	* wake them up.
				479	*/
				480	if (PageWriteback(newpage))
				481	end_page_writeback(newpage);
				482	}
				483
				484	/************************************************************
				485	* Migration functions
				486	***********************************************************/
				487
				488	/* Always fail migration. Used for mappings that are not movable */
				489	int fail_migrate_page(struct address_space *mapping,
				490	struct page newpage, struct page page)
				491	{
				492	return -EIO;
				493	}
				494	EXPORT_SYMBOL(fail_migrate_page);
				495
				496	/*
				497	* Common logic to directly migrate a single page suitable for
				498	* pages that do not use PagePrivate/PagePrivate2.
				499	*
				500	* Pages are locked upon entry and exit.
				501	*/
				502	int migrate_page(struct address_space *mapping,
				503	struct page newpage, struct page page,
				504	enum migrate_mode mode)
				505	{
				506	int rc;
				507
				508	BUG_ON(PageWriteback(page)); /* Writeback must be complete */
				509
				510	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
				511
				512	if (rc)
				513	return rc;
				514
				515	migrate_page_copy(newpage, page);
				516	return 0;
				517	}
				518	EXPORT_SYMBOL(migrate_page);
				519
				520	#ifdef CONFIG_BLOCK
				521	/*
				522	* Migration function for pages with buffers. This function can only be used
				523	* if the underlying filesystem guarantees that no other references to "page"
				524	* exist.
				525	*/
				526	int buffer_migrate_page(struct address_space *mapping,
				527	struct page newpage, struct page page, enum migrate_mode mode)
				528	{
				529	struct buffer_head bh, head;
				530	int rc;
				531
				532	if (!page_has_buffers(page))
				533	return migrate_page(mapping, newpage, page, mode);
				534
				535	head = page_buffers(page);
				536
				537	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
				538
				539	if (rc)
				540	return rc;
				541
				542	/*
				543	* In the async case, migrate_page_move_mapping locked the buffers
				544	* with an IRQ-safe spinlock held. In the sync case, the buffers
				545	* need to be locked now
				546	*/
				547	if (mode != MIGRATE_ASYNC)
				548	BUG_ON(!buffer_migrate_lock_buffers(head, mode));
				549
				550	ClearPagePrivate(page);
				551	set_page_private(newpage, page_private(page));
				552	set_page_private(page, 0);
				553	put_page(page);
				554	get_page(newpage);
				555
				556	bh = head;
				557	do {
				558	set_bh_page(bh, newpage, bh_offset(bh));
				559	bh = bh->b_this_page;
				560
				561	} while (bh != head);
				562
				563	SetPagePrivate(newpage);
				564
				565	migrate_page_copy(newpage, page);
				566
				567	bh = head;
				568	do {
				569	unlock_buffer(bh);
				570	put_bh(bh);
				571	bh = bh->b_this_page;
				572
				573	} while (bh != head);
				574
				575	return 0;
				576	}
				577	EXPORT_SYMBOL(buffer_migrate_page);
				578	#endif
				579
				580	/*
				581	* Writeback a page to clean the dirty state
				582	*/
				583	static int writeout(struct address_space mapping, struct page page)
				584	{
				585	struct writeback_control wbc = {
				586	.sync_mode = WB_SYNC_NONE,
				587	.nr_to_write = 1,
				588	.range_start = 0,
				589	.range_end = LLONG_MAX,
				590	.for_reclaim = 1
				591	};
				592	int rc;
				593
				594	if (!mapping->a_ops->writepage)
				595	/* No write method for the address space */
				596	return -EINVAL;
				597
				598	if (!clear_page_dirty_for_io(page))
				599	/* Someone else already triggered a write */
				600	return -EAGAIN;
				601
				602	/*
				603	* A dirty page may imply that the underlying filesystem has
				604	* the page on some queue. So the page must be clean for
				605	* migration. Writeout may mean we loose the lock and the
				606	* page state is no longer what we checked for earlier.
				607	* At this point we know that the migration attempt cannot
				608	* be successful.
				609	*/
				610	remove_migration_ptes(page, page);
				611
				612	rc = mapping->a_ops->writepage(page, &wbc);
				613
				614	if (rc != AOP_WRITEPAGE_ACTIVATE)
				615	/* unlocked. Relock */
				616	lock_page(page);
				617
				618	return (rc < 0) ? -EIO : -EAGAIN;
				619	}
				620
				621	/*
				622	* Default handling if a filesystem does not provide a migration function.
				623	*/
				624	static int fallback_migrate_page(struct address_space *mapping,
				625	struct page newpage, struct page page, enum migrate_mode mode)
				626	{
				627	if (PageDirty(page)) {
				628	/* Only writeback pages in full synchronous migration */
				629	if (mode != MIGRATE_SYNC)
				630	return -EBUSY;
				631	return writeout(mapping, page);
				632	}
				633
				634	/*
				635	* Buffers may be managed in a filesystem specific way.
				636	* We must have no buffers or drop them.
				637	*/
				638	if (page_has_private(page) &&
				639	!try_to_release_page(page, GFP_KERNEL))
				640	return -EAGAIN;
				641
				642	return migrate_page(mapping, newpage, page, mode);
				643	}
				644
				645	/*
				646	* Move a page to a newly allocated page
				647	* The page is locked and all ptes have been successfully removed.
				648	*
				649	* The new page will have replaced the old page if this function
				650	* is successful.
				651	*
				652	* Return value:
				653	* < 0 - error code
				654	* == 0 - success
				655	*/
				656	static int move_to_new_page(struct page newpage, struct page page,
				657	int remap_swapcache, enum migrate_mode mode)
				658	{
				659	struct address_space *mapping;
				660	int rc;
				661
				662	/*
				663	* Block others from accessing the page when we get around to
				664	* establishing additional references. We are the only one
				665	* holding a reference to the new page at this point.
				666	*/
				667	if (!trylock_page(newpage))
				668	BUG();
				669
				670	/* Prepare mapping for the new page.*/
				671	newpage->index = page->index;
				672	newpage->mapping = page->mapping;
				673	if (PageSwapBacked(page))
				674	SetPageSwapBacked(newpage);
				675
				676	mapping = page_mapping(page);
				677	if (!mapping)
				678	rc = migrate_page(mapping, newpage, page, mode);
				679	else if (mapping->a_ops->migratepage)
				680	/*
				681	* Most pages have a mapping and most filesystems provide a
				682	* migratepage callback. Anonymous pages are part of swap
				683	* space which also has its own migratepage callback. This
				684	* is the most common path for page migration.
				685	*/
				686	rc = mapping->a_ops->migratepage(mapping,
				687	newpage, page, mode);
				688	else
				689	rc = fallback_migrate_page(mapping, newpage, page, mode);
				690
				691	if (rc) {
				692	newpage->mapping = NULL;
				693	} else {
				694	if (remap_swapcache)
				695	remove_migration_ptes(page, newpage);
				696	page->mapping = NULL;
				697	}
				698
				699	unlock_page(newpage);
				700
				701	return rc;
				702	}
				703
				704	static int __unmap_and_move(struct page page, struct page newpage,
				705	int force, bool offlining, enum migrate_mode mode)
				706	{
				707	int rc = -EAGAIN;
				708	int remap_swapcache = 1;
				709	int charge = 0;
				710	struct mem_cgroup *mem;
				711	struct anon_vma *anon_vma = NULL;
				712
				713	if (!trylock_page(page)) {
				714	if (!force \|\| mode == MIGRATE_ASYNC)
				715	goto out;
				716
				717	/*
				718	* It's not safe for direct compaction to call lock_page.
				719	* For example, during page readahead pages are added locked
				720	* to the LRU. Later, when the IO completes the pages are
				721	* marked uptodate and unlocked. However, the queueing
				722	* could be merging multiple pages for one bio (e.g.
				723	* mpage_readpages). If an allocation happens for the
				724	* second or third page, the process can end up locking
				725	* the same page twice and deadlocking. Rather than
				726	* trying to be clever about what pages can be locked,
				727	* avoid the use of lock_page for direct compaction
				728	* altogether.
				729	*/
				730	if (current->flags & PF_MEMALLOC)
				731	goto out;
				732
				733	lock_page(page);
				734	}
				735
				736	/*
				737	* Only memory hotplug's offline_pages() caller has locked out KSM,
				738	* and can safely migrate a KSM page. The other cases have skipped
				739	* PageKsm along with PageReserved - but it is only now when we have
				740	* the page lock that we can be certain it will not go KSM beneath us
				741	* (KSM will not upgrade a page from PageAnon to PageKsm when it sees
				742	* its pagecount raised, but only here do we take the page lock which
				743	* serializes that).
				744	*/
				745	if (PageKsm(page) && !offlining) {
				746	rc = -EBUSY;
				747	goto unlock;
				748	}
				749
				750	/* charge against new page */
				751	charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
				752	if (charge == -ENOMEM) {
				753	rc = -ENOMEM;
				754	goto unlock;
				755	}
				756	BUG_ON(charge);
				757
				758	if (PageWriteback(page)) {
				759	/*
				760	* Only in the case of a full syncronous migration is it
				761	* necessary to wait for PageWriteback. In the async case,
				762	* the retry loop is too short and in the sync-light case,
				763	* the overhead of stalling is too much
				764	*/
				765	if (mode != MIGRATE_SYNC) {
				766	rc = -EBUSY;
				767	goto uncharge;
				768	}
				769	if (!force)
				770	goto uncharge;
				771	wait_on_page_writeback(page);
				772	}
				773	/*
				774	* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
				775	* we cannot notice that anon_vma is freed while we migrates a page.
				776	* This get_anon_vma() delays freeing anon_vma pointer until the end
				777	* of migration. File cache pages are no problem because of page_lock()
				778	* File Caches may use write_page() or lock_page() in migration, then,
				779	* just care Anon page here.
				780	*/
				781	if (PageAnon(page)) {
				782	/*
				783	* Only page_lock_anon_vma() understands the subtleties of
				784	* getting a hold on an anon_vma from outside one of its mms.
				785	*/
				786	anon_vma = page_get_anon_vma(page);
				787	if (anon_vma) {
				788	/*
				789	* Anon page
				790	*/
				791	} else if (PageSwapCache(page)) {
				792	/*
				793	* We cannot be sure that the anon_vma of an unmapped
				794	* swapcache page is safe to use because we don't
				795	* know in advance if the VMA that this page belonged
				796	* to still exists. If the VMA and others sharing the
				797	* data have been freed, then the anon_vma could
				798	* already be invalid.
				799	*
				800	* To avoid this possibility, swapcache pages get
				801	* migrated but are not remapped when migration
				802	* completes
				803	*/
				804	remap_swapcache = 0;
				805	} else {
				806	goto uncharge;
				807	}
				808	}
				809
				810	/*
				811	* Corner case handling:
				812	* 1. When a new swap-cache page is read into, it is added to the LRU
				813	* and treated as swapcache but it has no rmap yet.
				814	* Calling try_to_unmap() against a page->mapping==NULL page will
				815	* trigger a BUG. So handle it here.
				816	* 2. An orphaned page (see truncate_complete_page) might have
				817	* fs-private metadata. The page can be picked up due to memory
				818	* offlining. Everywhere else except page reclaim, the page is
				819	* invisible to the vm, so the page can not be migrated. So try to
				820	* free the metadata, so the page can be freed.
				821	*/
				822	if (!page->mapping) {
				823	VM_BUG_ON(PageAnon(page));
				824	if (page_has_private(page)) {
				825	try_to_free_buffers(page);
				826	goto uncharge;
				827	}
				828	goto skip_unmap;
				829	}
				830
				831	/* Establish migration ptes or remove ptes */
				832	try_to_unmap(page, TTU_MIGRATION\|TTU_IGNORE_MLOCK\|TTU_IGNORE_ACCESS);
				833
				834	skip_unmap:
				835	if (!page_mapped(page))
				836	rc = move_to_new_page(newpage, page, remap_swapcache, mode);
				837
				838	if (rc && remap_swapcache)
				839	remove_migration_ptes(page, page);
				840
				841	/* Drop an anon_vma reference if we took one */
				842	if (anon_vma)
				843	put_anon_vma(anon_vma);
				844
				845	uncharge:
				846	if (!charge)
				847	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
				848	unlock:
				849	unlock_page(page);
				850	out:
				851	return rc;
				852	}
				853
				854	/*
				855	* Obtain the lock on page, remove all ptes and migrate the page
				856	* to the newly allocated page in newpage.
				857	*/
				858	static int unmap_and_move(new_page_t get_new_page, unsigned long private,
				859	struct page *page, int force, bool offlining,
				860	enum migrate_mode mode)
				861	{
				862	int rc = 0;
				863	int *result = NULL;
				864	struct page *newpage = get_new_page(page, private, &result);
				865
				866	if (!newpage)
				867	return -ENOMEM;
				868
				869	if (page_count(page) == 1) {
				870	/* page was freed from under us. So we are done. */
				871	goto out;
				872	}
				873
				874	if (unlikely(PageTransHuge(page)))
				875	if (unlikely(split_huge_page(page)))
				876	goto out;
				877
				878	rc = __unmap_and_move(page, newpage, force, offlining, mode);
				879	out:
				880	if (rc != -EAGAIN) {
				881	/*
				882	* A page that has been migrated has all references
				883	* removed and will be freed. A page that has not been
				884	* migrated will have kepts its references and be
				885	* restored.
				886	*/
				887	list_del(&page->lru);
				888	dec_zone_page_state(page, NR_ISOLATED_ANON +
				889	page_is_file_cache(page));
				890	putback_lru_page(page);
				891	}
				892	/*
				893	* Move the new page to the LRU. If migration was not successful
				894	* then this will free the page.
				895	*/
				896	putback_lru_page(newpage);
				897	if (result) {
				898	if (rc)
				899	*result = rc;
				900	else
				901	*result = page_to_nid(newpage);
				902	}
				903	return rc;
				904	}
				905
				906	/*
				907	* Counterpart of unmap_and_move_page() for hugepage migration.
				908	*
				909	* This function doesn't wait the completion of hugepage I/O
				910	* because there is no race between I/O and migration for hugepage.
				911	* Note that currently hugepage I/O occurs only in direct I/O
				912	* where no lock is held and PG_writeback is irrelevant,
				913	* and writeback status of all subpages are counted in the reference
				914	* count of the head page (i.e. if all subpages of a 2MB hugepage are
				915	* under direct I/O, the reference of the head page is 512 and a bit more.)
				916	* This means that when we try to migrate hugepage whose subpages are
				917	* doing direct I/O, some references remain after try_to_unmap() and
				918	* hugepage migration fails without data corruption.
				919	*
				920	* There is also no race when direct I/O is issued on the page under migration,
				921	* because then pte is replaced with migration swap entry and direct I/O code
				922	* will wait in the page fault for migration to complete.
				923	*/
				924	static int unmap_and_move_huge_page(new_page_t get_new_page,
				925	unsigned long private, struct page *hpage,
				926	int force, bool offlining,
				927	enum migrate_mode mode)
				928	{
				929	int rc = 0;
				930	int *result = NULL;
				931	struct page *new_hpage = get_new_page(hpage, private, &result);
				932	struct anon_vma *anon_vma = NULL;
				933
				934	if (!new_hpage)
				935	return -ENOMEM;
				936
				937	rc = -EAGAIN;
				938
				939	if (!trylock_page(hpage)) {
				940	if (!force \|\| mode != MIGRATE_SYNC)
				941	goto out;
				942	lock_page(hpage);
				943	}
				944
				945	if (PageAnon(hpage))
				946	anon_vma = page_get_anon_vma(hpage);
				947
				948	try_to_unmap(hpage, TTU_MIGRATION\|TTU_IGNORE_MLOCK\|TTU_IGNORE_ACCESS);
				949
				950	if (!page_mapped(hpage))
				951	rc = move_to_new_page(new_hpage, hpage, 1, mode);
				952
				953	if (rc)
				954	remove_migration_ptes(hpage, hpage);
				955
				956	if (anon_vma)
				957	put_anon_vma(anon_vma);
				958	unlock_page(hpage);
				959
				960	out:
				961	if (rc != -EAGAIN) {
				962	list_del(&hpage->lru);
				963	put_page(hpage);
				964	}
				965
				966	put_page(new_hpage);
				967
				968	if (result) {
				969	if (rc)
				970	*result = rc;
				971	else
				972	*result = page_to_nid(new_hpage);
				973	}
				974	return rc;
				975	}
				976
				977	/*
				978	* migrate_pages
				979	*
				980	* The function takes one list of pages to migrate and a function
				981	* that determines from the page to be migrated and the private data
				982	* the target of the move and allocates the page.
				983	*
				984	* The function returns after 10 attempts or if no pages
				985	* are movable anymore because to has become empty
				986	* or no retryable pages exist anymore.
				987	* Caller should call putback_lru_pages to return pages to the LRU
				988	* or free list only if ret != 0.
				989	*
				990	* Return: Number of pages not migrated or error code.
				991	*/
				992	int migrate_pages(struct list_head *from,
				993	new_page_t get_new_page, unsigned long private, bool offlining,
				994	enum migrate_mode mode)
				995	{
				996	int retry = 1;
				997	int nr_failed = 0;
				998	int pass = 0;
				999	struct page *page;
				1000	struct page *page2;
				1001	int swapwrite = current->flags & PF_SWAPWRITE;
				1002	int rc;
				1003
				1004	if (!swapwrite)
				1005	current->flags \|= PF_SWAPWRITE;
				1006
				1007	for(pass = 0; pass < 10 && retry; pass++) {
				1008	retry = 0;
				1009
				1010	list_for_each_entry_safe(page, page2, from, lru) {
				1011	cond_resched();
				1012
				1013	rc = unmap_and_move(get_new_page, private,
				1014	page, pass > 2, offlining,
				1015	mode);
				1016
				1017	switch(rc) {
				1018	case -ENOMEM:
				1019	goto out;
				1020	case -EAGAIN:
				1021	retry++;
				1022	break;
				1023	case 0:
				1024	break;
				1025	default:
				1026	/* Permanent failure */
				1027	nr_failed++;
				1028	break;
				1029	}
				1030	}
				1031	}
				1032	rc = 0;
				1033	out:
				1034	if (!swapwrite)
				1035	current->flags &= ~PF_SWAPWRITE;
				1036
				1037	if (rc)
				1038	return rc;
				1039
				1040	return nr_failed + retry;
				1041	}
				1042
				1043	int migrate_huge_pages(struct list_head *from,
				1044	new_page_t get_new_page, unsigned long private, bool offlining,
				1045	enum migrate_mode mode)
				1046	{
				1047	int retry = 1;
				1048	int nr_failed = 0;
				1049	int pass = 0;
				1050	struct page *page;
				1051	struct page *page2;
				1052	int rc;
				1053
				1054	for (pass = 0; pass < 10 && retry; pass++) {
				1055	retry = 0;
				1056
				1057	list_for_each_entry_safe(page, page2, from, lru) {
				1058	cond_resched();
				1059
				1060	rc = unmap_and_move_huge_page(get_new_page,
				1061	private, page, pass > 2, offlining,
				1062	mode);
				1063
				1064	switch(rc) {
				1065	case -ENOMEM:
				1066	goto out;
				1067	case -EAGAIN:
				1068	retry++;
				1069	break;
				1070	case 0:
				1071	break;
				1072	default:
				1073	/* Permanent failure */
				1074	nr_failed++;
				1075	break;
				1076	}
				1077	}
				1078	}
				1079	rc = 0;
				1080	out:
				1081	if (rc)
				1082	return rc;
				1083
				1084	return nr_failed + retry;
				1085	}
				1086
				1087	#ifdef CONFIG_NUMA
				1088	/*
				1089	* Move a list of individual pages
				1090	*/
				1091	struct page_to_node {	/* one entry of the move_pages() work array; an entry with node == MAX_NUMNODES terminates the array */
				1092	unsigned long addr;	/* virtual address of the page to move, as read from the caller's pages[] array */
				1093	struct page *page;	/* page found at addr; filled in by do_move_page_to_node_array() and matched by new_page_node() */
				1094	int node;	/* requested target node, or MAX_NUMNODES for the end marker */
				1095	int status;	/* per-entry result: a node id or a negative errno, copied back to the caller's status[] array */
				1096	};
				1097
				1098	static struct page new_page_node(struct page p, unsigned long private,
				1099	int **result)
				1100	{
				1101	struct page_to_node pm = (struct page_to_node )private;
				1102
				1103	while (pm->node != MAX_NUMNODES && pm->page != p)
				1104	pm++;
				1105
				1106	if (pm->node == MAX_NUMNODES)
				1107	return NULL;
				1108
				1109	*result = &pm->status;
				1110
				1111	return alloc_pages_exact_node(pm->node,
				1112	GFP_HIGHUSER_MOVABLE \| GFP_THISNODE, 0);
				1113	}
				1114
				1115	/*
				1116	* Move a set of pages as indicated in the pm array. The addr
				1117	* field must be set to the virtual address of the page to be moved
				1118	* and the node number must contain a valid target node.
				1119	* The pm array ends with node = MAX_NUMNODES.
				1120	*/
				1121	static int do_move_page_to_node_array(struct mm_struct *mm,
				1122	struct page_to_node *pm,
				1123	int migrate_all)
				1124	{
				1125	int err;
				1126	struct page_to_node *pp;
				1127	LIST_HEAD(pagelist);
				1128
				1129	down_read(&mm->mmap_sem);
				1130
				1131	/*
				1132	* Build a list of pages to migrate
				1133	*/
				1134	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
				1135	struct vm_area_struct *vma;
				1136	struct page *page;
				1137
				1138	err = -EFAULT;
				1139	vma = find_vma(mm, pp->addr);
				1140	if (!vma \|\| pp->addr < vma->vm_start \|\| !vma_migratable(vma))
				1141	goto set_status;
				1142
				1143	page = follow_page(vma, pp->addr, FOLL_GET\|FOLL_SPLIT);
				1144
				1145	err = PTR_ERR(page);
				1146	if (IS_ERR(page))
				1147	goto set_status;
				1148
				1149	err = -ENOENT;
				1150	if (!page)
				1151	goto set_status;
				1152
				1153	/* Use PageReserved to check for zero page */
				1154	if (PageReserved(page) \|\| PageKsm(page))
				1155	goto put_and_set;
				1156
				1157	pp->page = page;
				1158	err = page_to_nid(page);
				1159
				1160	if (err == pp->node)
				1161	/*
				1162	* Node already in the right place
				1163	*/
				1164	goto put_and_set;
				1165
				1166	err = -EACCES;
				1167	if (page_mapcount(page) > 1 &&
				1168	!migrate_all)
				1169	goto put_and_set;
				1170
				1171	err = isolate_lru_page(page);
				1172	if (!err) {
				1173	list_add_tail(&page->lru, &pagelist);
				1174	inc_zone_page_state(page, NR_ISOLATED_ANON +
				1175	page_is_file_cache(page));
				1176	}
				1177	put_and_set:
				1178	/*
				1179	* Either remove the duplicate refcount from
				1180	* isolate_lru_page() or drop the page ref if it was
				1181	* not isolated.
				1182	*/
				1183	put_page(page);
				1184	set_status:
				1185	pp->status = err;
				1186	}
				1187
				1188	err = 0;
				1189	if (!list_empty(&pagelist)) {
				1190	err = migrate_pages(&pagelist, new_page_node,
				1191	(unsigned long)pm, 0, MIGRATE_SYNC);
				1192	if (err)
				1193	putback_lru_pages(&pagelist);
				1194	}
				1195
				1196	up_read(&mm->mmap_sem);
				1197	return err;
				1198	}
				1199
				1200	/*
				1201	* Migrate an array of page address onto an array of nodes and fill
				1202	* the corresponding array of status.
				1203	*/
				1204	static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
				1205	unsigned long nr_pages,
				1206	const void __user * __user *pages,
				1207	const int __user *nodes,
				1208	int __user *status, int flags)
				1209	{
				1210	struct page_to_node *pm;
				1211	unsigned long chunk_nr_pages;
				1212	unsigned long chunk_start;
				1213	int err;
				1214
				1215	err = -ENOMEM;
				1216	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
				1217	if (!pm)
				1218	goto out;
				1219
				1220	migrate_prep();
				1221
				1222	/*
				1223	* Store a chunk of page_to_node array in a page,
				1224	* but keep the last one as a marker
				1225	*/
				1226	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
				1227
				1228	for (chunk_start = 0;
				1229	chunk_start < nr_pages;
				1230	chunk_start += chunk_nr_pages) {
				1231	int j;
				1232
				1233	if (chunk_start + chunk_nr_pages > nr_pages)
				1234	chunk_nr_pages = nr_pages - chunk_start;
				1235
				1236	/* fill the chunk pm with addrs and nodes from user-space */
				1237	for (j = 0; j < chunk_nr_pages; j++) {
				1238	const void __user *p;
				1239	int node;
				1240
				1241	err = -EFAULT;
				1242	if (get_user(p, pages + j + chunk_start))
				1243	goto out_pm;
				1244	pm[j].addr = (unsigned long) p;
				1245
				1246	if (get_user(node, nodes + j + chunk_start))
				1247	goto out_pm;
				1248
				1249	err = -ENODEV;
				1250	if (node < 0 \|\| node >= MAX_NUMNODES)
				1251	goto out_pm;
				1252
				1253	if (!node_state(node, N_HIGH_MEMORY))
				1254	goto out_pm;
				1255
				1256	err = -EACCES;
				1257	if (!node_isset(node, task_nodes))
				1258	goto out_pm;
				1259
				1260	pm[j].node = node;
				1261	}
				1262
				1263	/* End marker for this chunk */
				1264	pm[chunk_nr_pages].node = MAX_NUMNODES;
				1265
				1266	/* Migrate this chunk */
				1267	err = do_move_page_to_node_array(mm, pm,
				1268	flags & MPOL_MF_MOVE_ALL);
				1269	if (err < 0)
				1270	goto out_pm;
				1271
				1272	/* Return status information */
				1273	for (j = 0; j < chunk_nr_pages; j++)
				1274	if (put_user(pm[j].status, status + j + chunk_start)) {
				1275	err = -EFAULT;
				1276	goto out_pm;
				1277	}
				1278	}
				1279	err = 0;
				1280
				1281	out_pm:
				1282	free_page((unsigned long)pm);
				1283	out:
				1284	return err;
				1285	}
				1286
				1287	/*
				1288	* Determine the nodes of an array of pages and store it in an array of status.
				1289	*/
				1290	static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				1291	const void __user *pages, int status)
				1292	{
				1293	unsigned long i;
				1294
				1295	down_read(&mm->mmap_sem);
				1296
				1297	for (i = 0; i < nr_pages; i++) {
				1298	unsigned long addr = (unsigned long)(*pages);
				1299	struct vm_area_struct *vma;
				1300	struct page *page;
				1301	int err = -EFAULT;
				1302
				1303	vma = find_vma(mm, addr);
				1304	if (!vma \|\| addr < vma->vm_start)
				1305	goto set_status;
				1306
				1307	page = follow_page(vma, addr, 0);
				1308
				1309	err = PTR_ERR(page);
				1310	if (IS_ERR(page))
				1311	goto set_status;
				1312
				1313	err = -ENOENT;
				1314	/* Use PageReserved to check for zero page */
				1315	if (!page \|\| PageReserved(page) \|\| PageKsm(page))
				1316	goto set_status;
				1317
				1318	err = page_to_nid(page);
				1319	set_status:
				1320	*status = err;
				1321
				1322	pages++;
				1323	status++;
				1324	}
				1325
				1326	up_read(&mm->mmap_sem);
				1327	}
				1328
				1329	/*
				1330	* Determine the nodes of a user array of pages and store it in
				1331	* a user array of status.
				1332	*/
				1333	static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
				1334	const void __user * __user *pages,
				1335	int __user *status)
				1336	{
				1337	#define DO_PAGES_STAT_CHUNK_NR 16
				1338	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
				1339	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
				1340
				1341	while (nr_pages) {
				1342	unsigned long chunk_nr;
				1343
				1344	chunk_nr = nr_pages;
				1345	if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
				1346	chunk_nr = DO_PAGES_STAT_CHUNK_NR;
				1347
				1348	if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
				1349	break;
				1350
				1351	do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
				1352
				1353	if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
				1354	break;
				1355
				1356	pages += chunk_nr;
				1357	status += chunk_nr;
				1358	nr_pages -= chunk_nr;
				1359	}
				1360	return nr_pages ? -EFAULT : 0;
				1361	}
				1362
				1363	/*
				1364	* Move a list of pages in the address space of the currently executing
				1365	* process.
				1366	*/
				1367	SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
				1368	const void __user * __user *, pages,
				1369	const int __user *, nodes,
				1370	int __user *, status, int, flags)
				1371	{
				1372	const struct cred cred = current_cred(), tcred;
				1373	struct task_struct *task;
				1374	struct mm_struct *mm;
				1375	int err;
				1376	nodemask_t task_nodes;
				1377
				1378	/* Check flags */
				1379	if (flags & ~(MPOL_MF_MOVE\|MPOL_MF_MOVE_ALL))
				1380	return -EINVAL;
				1381
				1382	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1383	return -EPERM;
				1384
				1385	/* Find the mm_struct */
				1386	rcu_read_lock();
				1387	task = pid ? find_task_by_vpid(pid) : current;
				1388	if (!task) {
				1389	rcu_read_unlock();
				1390	return -ESRCH;
				1391	}
				1392	get_task_struct(task);
				1393
				1394	/*
				1395	* Check if this process has the right to modify the specified
				1396	* process. The right exists if the process has administrative
				1397	* capabilities, superuser privileges or the same
				1398	* userid as the target process.
				1399	*/
				1400	tcred = __task_cred(task);
				1401	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
				1402	cred->uid != tcred->suid && cred->uid != tcred->uid &&
				1403	!capable(CAP_SYS_NICE)) {
				1404	rcu_read_unlock();
				1405	err = -EPERM;
				1406	goto out;
				1407	}
				1408	rcu_read_unlock();
				1409
				1410	err = security_task_movememory(task);
				1411	if (err)
				1412	goto out;
				1413
				1414	task_nodes = cpuset_mems_allowed(task);
				1415	mm = get_task_mm(task);
				1416	put_task_struct(task);
				1417
				1418	if (!mm)
				1419	return -EINVAL;
				1420
				1421	if (nodes)
				1422	err = do_pages_move(mm, task_nodes, nr_pages, pages,
				1423	nodes, status, flags);
				1424	else
				1425	err = do_pages_stat(mm, nr_pages, pages, status);
				1426
				1427	mmput(mm);
				1428	return err;
				1429
				1430	out:
				1431	put_task_struct(task);
				1432	return err;
				1433	}
				1434
				1435	/*
				1436	* Call migration functions in the vma_ops that may prepare
				1437	* memory in a vm for migration. migration functions may perform
				1438	* the migration for vmas that do not have an underlying page struct.
				1439	*/
				1440	int migrate_vmas(struct mm_struct mm, const nodemask_t to,
				1441	const nodemask_t *from, unsigned long flags)
				1442	{
				1443	struct vm_area_struct *vma;
				1444	int err = 0;
				1445
				1446	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
				1447	if (vma->vm_ops && vma->vm_ops->migrate) {
				1448	err = vma->vm_ops->migrate(vma, to, from, flags);
				1449	if (err)
				1450	break;
				1451	}
				1452	}
				1453	return err;
				1454	}
				1455	#endif