Blame - ap/os/linux/linux-3.4.x/mm/swap_state.c - R306

blob: 6c5cf854f0303763163ce8b5c78c2185b0fb4ef8 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* linux/mm/swap_state.c
				3	*
				4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				5	* Swap reorganised 29.12.95, Stephen Tweedie
				6	*
				7	* Rewritten to use page cache, (C) 1998 Stephen Tweedie
				8	*/
				9	#include <linux/mm.h>
				10	#include <linux/gfp.h>
				11	#include <linux/kernel_stat.h>
				12	#include <linux/swap.h>
				13	#include <linux/swapops.h>
				14	#include <linux/init.h>
				15	#include <linux/pagemap.h>
				16	#include <linux/backing-dev.h>
				17	#include <linux/pagevec.h>
				18	#include <linux/migrate.h>
				19	#include <linux/page_cgroup.h>
				20
				21	#include <asm/pgtable.h>
				22
				23	/*
				24	* swapper_space is a fiction, retained to simplify the path through
				25	* vmscan's shrink_page_list.
				26	*/
				27	static const struct address_space_operations swap_aops = {
				28	.writepage = swap_writepage,
				29	.set_page_dirty = __set_page_dirty_no_writeback,
				30	.migratepage = migrate_page,
				31	};
				32
				33	static struct backing_dev_info swap_backing_dev_info = {
				34	.name = "swap",
				35	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK \| BDI_CAP_SWAP_BACKED,
				36	};
				37
				38	struct address_space swapper_space = {
				39	.page_tree = RADIX_TREE_INIT(GFP_ATOMIC\|__GFP_NOWARN),
				40	.tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
				41	.a_ops = &swap_aops,
				42	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
				43	.backing_dev_info = &swap_backing_dev_info,
				44	};
				45
				46	#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
				47
				48	static struct {
				49	unsigned long add_total;
				50	unsigned long del_total;
				51	unsigned long find_success;
				52	unsigned long find_total;
				53	} swap_cache_info;
				54
				55	void show_swap_cache_info(void)
				56	{
				57	printk("%lu pages in swap cache\n", total_swapcache_pages);
				58	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
				59	swap_cache_info.add_total, swap_cache_info.del_total,
				60	swap_cache_info.find_success, swap_cache_info.find_total);
				61	printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
				62	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
				63	}
				64
				65	/*
				66	* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
				67	* but sets SwapCache flag and private instead of mapping and index.
				68	*/
				69	static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
				70	{
				71	int error;
				72
				73	VM_BUG_ON(!PageLocked(page));
				74	VM_BUG_ON(PageSwapCache(page));
				75	VM_BUG_ON(!PageSwapBacked(page));
				76
				77	page_cache_get(page);
				78	SetPageSwapCache(page);
				79	set_page_private(page, entry.val);
				80
				81	spin_lock_irq(&swapper_space.tree_lock);
				82	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
				83	if (likely(!error)) {
				84	total_swapcache_pages++;
				85	#ifndef CONFIG_LIMIT_PAGE_CACHE
				86	__inc_zone_page_state(page, NR_FILE_PAGES);
				87	#endif
				88	INC_CACHE_INFO(add_total);
				89	}
				90	spin_unlock_irq(&swapper_space.tree_lock);
				91
				92	if (unlikely(error)) {
				93	/*
				94	* Only the context which have set SWAP_HAS_CACHE flag
				95	* would call add_to_swap_cache().
				96	* So add_to_swap_cache() doesn't returns -EEXIST.
				97	*/
				98	VM_BUG_ON(error == -EEXIST);
				99	set_page_private(page, 0UL);
				100	ClearPageSwapCache(page);
				101	page_cache_release(page);
				102	}
				103
				104	return error;
				105	}
				106
				107
				108	int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
				109	{
				110	int error;
				111
				112	error = radix_tree_preload(gfp_mask);
				113	if (!error) {
				114	error = __add_to_swap_cache(page, entry);
				115	radix_tree_preload_end();
				116	}
				117	return error;
				118	}
				119
				120	/*
				121	* This must be called only on pages that have
				122	* been verified to be in the swap cache.
				123	*/
				124	void __delete_from_swap_cache(struct page *page)
				125	{
				126	VM_BUG_ON(!PageLocked(page));
				127	VM_BUG_ON(!PageSwapCache(page));
				128	VM_BUG_ON(PageWriteback(page));
				129
				130	radix_tree_delete(&swapper_space.page_tree, page_private(page));
				131	set_page_private(page, 0);
				132	ClearPageSwapCache(page);
				133	total_swapcache_pages--;
				134	#ifndef CONFIG_LIMIT_PAGE_CACHE
				135	__dec_zone_page_state(page, NR_FILE_PAGES);
				136	#endif
				137	INC_CACHE_INFO(del_total);
				138	}
				139
				140	/**
				141	* add_to_swap - allocate swap space for a page
				142	* @page: page we want to move to swap
				143	*
				144	* Allocate swap space for the page and add the page to the
				145	* swap cache. Caller needs to hold the page lock.
				146	*/
				147	int add_to_swap(struct page *page)
				148	{
				149	swp_entry_t entry;
				150	int err;
				151
				152	VM_BUG_ON(!PageLocked(page));
				153	VM_BUG_ON(!PageUptodate(page));
				154
				155	entry = get_swap_page();
				156	if (!entry.val)
				157	return 0;
				158
				159	if (unlikely(PageTransHuge(page)))
				160	if (unlikely(split_huge_page(page))) {
				161	swapcache_free(entry, NULL);
				162	return 0;
				163	}
				164
				165	/*
				166	* Radix-tree node allocations from PF_MEMALLOC contexts could
				167	* completely exhaust the page allocator. __GFP_NOMEMALLOC
				168	* stops emergency reserves from being allocated.
				169	*
				170	* TODO: this could cause a theoretical memory reclaim
				171	* deadlock in the swap out path.
				172	*/
				173	/*
				174	* Add it to the swap cache and mark it dirty
				175	*/
				176	err = add_to_swap_cache(page, entry,
				177	__GFP_HIGH\|__GFP_NOMEMALLOC\|__GFP_NOWARN);
				178
				179	if (!err) { /* Success */
				180	SetPageDirty(page);
				181	return 1;
				182	} else { /* -ENOMEM radix-tree allocation failure */
				183	/*
				184	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				185	* clear SWAP_HAS_CACHE flag.
				186	*/
				187	swapcache_free(entry, NULL);
				188	return 0;
				189	}
				190	}
				191
				192	/*
				193	* This must be called only on pages that have
				194	* been verified to be in the swap cache and locked.
				195	* It will never put the page into the free list,
				196	* the caller has a reference on the page.
				197	*/
				198	void delete_from_swap_cache(struct page *page)
				199	{
				200	swp_entry_t entry;
				201
				202	entry.val = page_private(page);
				203
				204	spin_lock_irq(&swapper_space.tree_lock);
				205	__delete_from_swap_cache(page);
				206	spin_unlock_irq(&swapper_space.tree_lock);
				207
				208	swapcache_free(entry, page);
				209	page_cache_release(page);
				210	}
				211
				212	/*
				213	* If we are the only user, then try to free up the swap cache.
				214	*
				215	* Its ok to check for PageSwapCache without the page lock
				216	* here because we are going to recheck again inside
				217	* try_to_free_swap() _with_ the lock.
				218	* - Marcelo
				219	*/
				220	static inline void free_swap_cache(struct page *page)
				221	{
				222	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
				223	try_to_free_swap(page);
				224	unlock_page(page);
				225	}
				226	}
				227
				228	/*
				229	* Perform a free_page(), also freeing any swap cache associated with
				230	* this page if it is the last user of the page.
				231	*/
				232	void free_page_and_swap_cache(struct page *page)
				233	{
				234	free_swap_cache(page);
				235	page_cache_release(page);
				236	}
				237
				238	/*
				239	* Passed an array of pages, drop them all from swapcache and then release
				240	* them. They are removed from the LRU and freed if this is their last use.
				241	*/
				242	void free_pages_and_swap_cache(struct page **pages, int nr)
				243	{
				244	struct page **pagep = pages;
				245
				246	lru_add_drain();
				247	while (nr) {
				248	int todo = min(nr, PAGEVEC_SIZE);
				249	int i;
				250
				251	for (i = 0; i < todo; i++)
				252	free_swap_cache(pagep[i]);
				253	release_pages(pagep, todo, 0);
				254	pagep += todo;
				255	nr -= todo;
				256	}
				257	}
				258
				259	/*
				260	* Lookup a swap entry in the swap cache. A found page will be returned
				261	* unlocked and with its refcount incremented - we rely on the kernel
				262	* lock getting page table operations atomic even if we drop the page
				263	* lock before returning.
				264	*/
				265	struct page * lookup_swap_cache(swp_entry_t entry)
				266	{
				267	struct page *page;
				268
				269	page = find_get_page(&swapper_space, entry.val);
				270
				271	if (page)
				272	INC_CACHE_INFO(find_success);
				273
				274	INC_CACHE_INFO(find_total);
				275	return page;
				276	}
				277
				278	/*
				279	* Locate a page of swap in physical memory, reserving swap cache space
				280	* and reading the disk if it is not already cached.
				281	* A failure return means that either the page allocation failed or that
				282	* the swap entry is no longer in use.
				283	*/
				284	struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				285	struct vm_area_struct *vma, unsigned long addr)
				286	{
				287	struct page found_page, new_page = NULL;
				288	int err;
				289
				290	do {
				291	/*
				292	* First check the swap cache. Since this is normally
				293	* called after lookup_swap_cache() failed, re-calling
				294	* that would confuse statistics.
				295	*/
				296	found_page = find_get_page(&swapper_space, entry.val);
				297	if (found_page)
				298	break;
				299
				300	/*
				301	* Get a new page to read into from swap.
				302	*/
				303	if (!new_page) {
				304	new_page = alloc_page_vma(gfp_mask, vma, addr);
				305	if (!new_page)
				306	break; /* Out of memory */
				307	}
				308
				309	/*
				310	* call radix_tree_preload() while we can wait.
				311	*/
				312	err = radix_tree_preload(gfp_mask & GFP_KERNEL);
				313	if (err)
				314	break;
				315
				316	/*
				317	* Swap entry may have been freed since our caller observed it.
				318	*/
				319	err = swapcache_prepare(entry);
				320	if (err == -EEXIST) {
				321	radix_tree_preload_end();
				322	/*
				323	* We might race against get_swap_page() and stumble
				324	* across a SWAP_HAS_CACHE swap_map entry whose page
				325	* has not been brought into the swapcache yet, while
				326	* the other end is scheduled away waiting on discard
				327	* I/O completion at scan_swap_map().
				328	*
				329	* In order to avoid turning this transitory state
				330	* into a permanent loop around this -EEXIST case
				331	* if !CONFIG_PREEMPT and the I/O completion happens
				332	* to be waiting on the CPU waitqueue where we are now
				333	* busy looping, we just conditionally invoke the
				334	* scheduler here, if there are some more important
				335	* tasks to run.
				336	*/
				337	cond_resched();
				338	continue;
				339	}
				340	if (err) { /* swp entry is obsolete ? */
				341	radix_tree_preload_end();
				342	break;
				343	}
				344
				345	/* May fail (-ENOMEM) if radix-tree node allocation failed. */
				346	__set_page_locked(new_page);
				347	SetPageSwapBacked(new_page);
				348	err = __add_to_swap_cache(new_page, entry);
				349	if (likely(!err)) {
				350	radix_tree_preload_end();
				351	/*
				352	* Initiate read into locked page and return.
				353	*/
				354	lru_cache_add_anon(new_page);
				355	swap_readpage(new_page);
				356	return new_page;
				357	}
				358	radix_tree_preload_end();
				359	ClearPageSwapBacked(new_page);
				360	__clear_page_locked(new_page);
				361	/*
				362	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				363	* clear SWAP_HAS_CACHE flag.
				364	*/
				365	swapcache_free(entry, NULL);
				366	} while (err != -ENOMEM);
				367
				368	if (new_page)
				369	page_cache_release(new_page);
				370	return found_page;
				371	}
				372
				373	/**
				374	* swapin_readahead - swap in pages in hope we need them soon
				375	* @entry: swap entry of this memory
				376	* @gfp_mask: memory allocation flags
				377	* @vma: user vma this address belongs to
				378	* @addr: target address for mempolicy
				379	*
				380	* Returns the struct page for entry and addr, after queueing swapin.
				381	*
				382	* Primitive swap readahead code. We simply read an aligned block of
				383	* (1 << page_cluster) entries in the swap area. This method is chosen
				384	* because it doesn't cost us any seek time. We also make sure to queue
				385	* the 'original' request together with the readahead ones...
				386	*
				387	* This has been extended to use the NUMA policies from the mm triggering
				388	* the readahead.
				389	*
				390	* Caller must hold down_read on the vma->vm_mm if vma is not NULL.
				391	*/
				392	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				393	struct vm_area_struct *vma, unsigned long addr)
				394	{
				395	struct page *page;
				396	unsigned long offset = swp_offset(entry);
				397	unsigned long start_offset, end_offset;
				398	unsigned long mask = (1UL << page_cluster) - 1;
				399
				400	/* Read a page_cluster sized and aligned cluster around offset. */
				401	start_offset = offset & ~mask;
				402	end_offset = offset \| mask;
				403	if (!start_offset) /* First page is swap header. */
				404	start_offset++;
				405
				406	for (offset = start_offset; offset <= end_offset ; offset++) {
				407	/* Ok, do the async read-ahead now */
				408	page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
				409	gfp_mask, vma, addr);
				410	if (!page)
				411	continue;
				412	page_cache_release(page);
				413	}
				414	lru_add_drain(); /* Push any new pages onto the LRU now */
				415	return read_swap_cache_async(entry, gfp_mask, vma, addr);
				416	}