Blame - src/kernel/linux/v4.19/mm/swap_state.c - T800

blob: 0d6a7f268d2e6c1abe6fb550a9e14206b26188ff [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/swap_state.c
				4	*
				5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				6	* Swap reorganised 29.12.95, Stephen Tweedie
				7	*
				8	* Rewritten to use page cache, (C) 1998 Stephen Tweedie
				9	*/
				10	#include <linux/mm.h>
				11	#include <linux/gfp.h>
				12	#include <linux/kernel_stat.h>
				13	#include <linux/swap.h>
				14	#include <linux/swapops.h>
				15	#include <linux/init.h>
				16	#include <linux/pagemap.h>
				17	#include <linux/backing-dev.h>
				18	#include <linux/blkdev.h>
				19	#include <linux/pagevec.h>
				20	#include <linux/migrate.h>
				21	#include <linux/vmalloc.h>
				22	#include <linux/swap_slots.h>
				23	#include <linux/huge_mm.h>
				24
				25	#include <asm/pgtable.h>
				26
				27	/*
				28	* swapper_space is a fiction, retained to simplify the path through
				29	* vmscan's shrink_page_list.
				30	*/
				31	static const struct address_space_operations swap_aops = {
				32	.writepage = swap_writepage,
				33	.set_page_dirty = swap_set_page_dirty,
				34	#ifdef CONFIG_MIGRATION
				35	.migratepage = migrate_page,
				36	#endif
				37	};
				38
				39	struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
				40	static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
				41	static bool enable_vma_readahead __read_mostly = true;
				42
				/*
				 * Layout of the per-VMA swap readahead word (vma->swap_readahead_info):
				 *
				 *   bits [0, SWAP_RA_WIN_SHIFT)           readahead hit count
				 *   bits [SWAP_RA_WIN_SHIFT, PAGE_SHIFT)  readahead window size
				 *   remaining high bits                   page-aligned address
				 */
				#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
				#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
				#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
				#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

				/* Field extractors for a packed readahead word. */
				#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
				#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
				#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

				/*
				 * Pack an address, a window size and a hit count into one word.
				 * Each field is masked, so out-of-range bits cannot spill into a
				 * neighbouring field.
				 */
				#define SWAP_RA_VAL(addr, win, hits)				\
					(((addr) & PAGE_MASK) |					\
					 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
					 ((hits) & SWAP_RA_HITS_MASK))
				56
				/* Swap cache counters reported by show_swap_cache_info(). */
				static struct {
					unsigned long add_total;
					unsigned long del_total;
					unsigned long find_success;
					unsigned long find_total;
				} swap_cache_info;

				/* Bump one counter by one, or by @nr. */
				#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
				#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

				/*
				 * Read a VMA's readahead word.  A zero word is replaced by 4, i.e. an
				 * initial hit count of 4, so that readahead starts with a small window.
				 */
				#define GET_SWAP_RA_VAL(vma)					\
					(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
				70
				71	unsigned long total_swapcache_pages(void)
				72	{
				73	unsigned int i, j, nr;
				74	unsigned long ret = 0;
				75	struct address_space *spaces;
				76
				77	rcu_read_lock();
				78	for (i = 0; i < MAX_SWAPFILES; i++) {
				79	/*
				80	* The corresponding entries in nr_swapper_spaces and
				81	* swapper_spaces will be reused only after at least
				82	* one grace period. So it is impossible for them
				83	* belongs to different usage.
				84	*/
				85	nr = nr_swapper_spaces[i];
				86	spaces = rcu_dereference(swapper_spaces[i]);
				87	if (!nr \|\| !spaces)
				88	continue;
				89	for (j = 0; j < nr; j++)
				90	ret += spaces[j].nrpages;
				91	}
				92	rcu_read_unlock();
				93	return ret;
				94	}
				95
				96	static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
				97
				98	void show_swap_cache_info(void)
				99	{
				100	printk("%lu pages in swap cache\n", total_swapcache_pages());
				101	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
				102	swap_cache_info.add_total, swap_cache_info.del_total,
				103	swap_cache_info.find_success, swap_cache_info.find_total);
				104	printk("Free swap = %ldkB\n",
				105	get_nr_swap_pages() << (PAGE_SHIFT - 10));
				106	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
				107	}
				108
				109	/*
				110	* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
				111	* but sets SwapCache flag and private instead of mapping and index.
				112	*/
				113	int __add_to_swap_cache(struct page *page, swp_entry_t entry)
				114	{
				115	int error, i, nr = hpage_nr_pages(page);
				116	struct address_space *address_space;
				117	pgoff_t idx = swp_offset(entry);
				118
				119	VM_BUG_ON_PAGE(!PageLocked(page), page);
				120	VM_BUG_ON_PAGE(PageSwapCache(page), page);
				121	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
				122
				123	page_ref_add(page, nr);
				124	SetPageSwapCache(page);
				125
				126	address_space = swap_address_space(entry);
				127	xa_lock_irq(&address_space->i_pages);
				128	for (i = 0; i < nr; i++) {
				129	set_page_private(page + i, entry.val + i);
				130	error = radix_tree_insert(&address_space->i_pages,
				131	idx + i, page + i);
				132	if (unlikely(error))
				133	break;
				134	}
				135	if (likely(!error)) {
				136	address_space->nrpages += nr;
				137	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
				138	ADD_CACHE_INFO(add_total, nr);
				139	} else {
				140	/*
				141	* Only the context which have set SWAP_HAS_CACHE flag
				142	* would call add_to_swap_cache().
				143	* So add_to_swap_cache() doesn't returns -EEXIST.
				144	*/
				145	VM_BUG_ON(error == -EEXIST);
				146	set_page_private(page + i, 0UL);
				147	while (i--) {
				148	radix_tree_delete(&address_space->i_pages, idx + i);
				149	set_page_private(page + i, 0UL);
				150	}
				151	ClearPageSwapCache(page);
				152	page_ref_sub(page, nr);
				153	}
				154	xa_unlock_irq(&address_space->i_pages);
				155
				156	return error;
				157	}
				158
				159
				160	int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
				161	{
				162	int error;
				163
				164	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
				165	if (!error) {
				166	error = __add_to_swap_cache(page, entry);
				167	radix_tree_preload_end();
				168	}
				169	return error;
				170	}
				171
				172	/*
				173	* This must be called only on pages that have
				174	* been verified to be in the swap cache.
				175	*/
				176	void __delete_from_swap_cache(struct page *page)
				177	{
				178	struct address_space *address_space;
				179	int i, nr = hpage_nr_pages(page);
				180	swp_entry_t entry;
				181	pgoff_t idx;
				182
				183	VM_BUG_ON_PAGE(!PageLocked(page), page);
				184	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
				185	VM_BUG_ON_PAGE(PageWriteback(page), page);
				186
				187	entry.val = page_private(page);
				188	address_space = swap_address_space(entry);
				189	idx = swp_offset(entry);
				190	for (i = 0; i < nr; i++) {
				191	radix_tree_delete(&address_space->i_pages, idx + i);
				192	set_page_private(page + i, 0);
				193	}
				194	ClearPageSwapCache(page);
				195	address_space->nrpages -= nr;
				196	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
				197	ADD_CACHE_INFO(del_total, nr);
				198	}
				199
				200	/**
				201	* add_to_swap - allocate swap space for a page
				202	* @page: page we want to move to swap
				203	*
				204	* Allocate swap space for the page and add the page to the
				205	* swap cache. Caller needs to hold the page lock.
				206	*/
				207	int add_to_swap(struct page *page)
				208	{
				209	swp_entry_t entry;
				210	int err;
				211
				212	VM_BUG_ON_PAGE(!PageLocked(page), page);
				213	VM_BUG_ON_PAGE(!PageUptodate(page), page);
				214
				215	entry = get_swap_page(page);
				216	if (!entry.val)
				217	return 0;
				218
				219	/*
				220	* Radix-tree node allocations from PF_MEMALLOC contexts could
				221	* completely exhaust the page allocator. __GFP_NOMEMALLOC
				222	* stops emergency reserves from being allocated.
				223	*
				224	* TODO: this could cause a theoretical memory reclaim
				225	* deadlock in the swap out path.
				226	*/
				227	/*
				228	* Add it to the swap cache.
				229	*/
				230	err = add_to_swap_cache(page, entry,
				231	__GFP_HIGH\|__GFP_NOMEMALLOC\|__GFP_NOWARN);
				232	/* -ENOMEM radix-tree allocation failure */
				233	if (err)
				234	/*
				235	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				236	* clear SWAP_HAS_CACHE flag.
				237	*/
				238	goto fail;
				239	/*
				240	* Normally the page will be dirtied in unmap because its pte should be
				241	* dirty. A special case is MADV_FREE page. The page'e pte could have
				242	* dirty bit cleared but the page's SwapBacked bit is still set because
				243	* clearing the dirty bit and SwapBacked bit has no lock protected. For
				244	* such page, unmap will not set dirty bit for it, so page reclaim will
				245	* not write the page out. This can cause data corruption when the page
				246	* is swap in later. Always setting the dirty bit for the page solves
				247	* the problem.
				248	*/
				249	set_page_dirty(page);
				250
				251	return 1;
				252
				253	fail:
				254	put_swap_page(page, entry);
				255	return 0;
				256	}
				257
				258	/*
				259	* This must be called only on pages that have
				260	* been verified to be in the swap cache and locked.
				261	* It will never put the page into the free list,
				262	* the caller has a reference on the page.
				263	*/
				264	void delete_from_swap_cache(struct page *page)
				265	{
				266	swp_entry_t entry;
				267	struct address_space *address_space;
				268
				269	entry.val = page_private(page);
				270
				271	address_space = swap_address_space(entry);
				272	xa_lock_irq(&address_space->i_pages);
				273	__delete_from_swap_cache(page);
				274	xa_unlock_irq(&address_space->i_pages);
				275
				276	put_swap_page(page, entry);
				277	page_ref_sub(page, hpage_nr_pages(page));
				278	}
				279
				/*
				 * If we are the only user, then try to free up the swap cache.
				 *
				 * It is fine to test PageSwapCache without the page lock here, because
				 * try_to_free_swap() rechecks it _with_ the lock held.
				 *					- Marcelo
				 */
				static inline void free_swap_cache(struct page *page)
				{
					if (!PageSwapCache(page) || page_mapped(page))
						return;
					/* Best effort only: give up if someone else holds the page lock. */
					if (!trylock_page(page))
						return;

					try_to_free_swap(page);
					unlock_page(page);
				}
				295
				/*
				 * Perform a free_page(), also freeing any swap cache associated with
				 * this page if it is the last user of the page.  No reference is dropped
				 * for the huge zero page.
				 */
				void free_page_and_swap_cache(struct page *page)
				{
					free_swap_cache(page);
					if (is_huge_zero_page(page))
						return;
					put_page(page);
				}
				306
				/*
				 * Passed an array of @nr pages, drop them all from the swap cache and
				 * then release them.  They are removed from the LRU and freed if this is
				 * their last use.
				 */
				void free_pages_and_swap_cache(struct page **pages, int nr)
				{
					int idx;

					lru_add_drain();
					for (idx = 0; idx < nr; idx++)
						free_swap_cache(pages[idx]);
					release_pages(pages, nr);
				}
				321
				322	static inline bool swap_use_vma_readahead(void)
				323	{
				324	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
				325	}
				326
				327	/*
				328	* Lookup a swap entry in the swap cache. A found page will be returned
				329	* unlocked and with its refcount incremented - we rely on the kernel
				330	* lock getting page table operations atomic even if we drop the page
				331	* lock before returning.
				332	*/
				333	struct page lookup_swap_cache(swp_entry_t entry, struct vm_area_struct vma,
				334	unsigned long addr)
				335	{
				336	struct page *page;
				337
				338	page = find_get_page(swap_address_space(entry), swp_offset(entry));
				339
				340	INC_CACHE_INFO(find_total);
				341	if (page) {
				342	bool vma_ra = swap_use_vma_readahead();
				343	bool readahead;
				344
				345	INC_CACHE_INFO(find_success);
				346	/*
				347	* At the moment, we don't support PG_readahead for anon THP
				348	* so let's bail out rather than confusing the readahead stat.
				349	*/
				350	if (unlikely(PageTransCompound(page)))
				351	return page;
				352
				353	readahead = TestClearPageReadahead(page);
				354	if (vma && vma_ra) {
				355	unsigned long ra_val;
				356	int win, hits;
				357
				358	ra_val = GET_SWAP_RA_VAL(vma);
				359	win = SWAP_RA_WIN(ra_val);
				360	hits = SWAP_RA_HITS(ra_val);
				361	if (readahead)
				362	hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
				363	atomic_long_set(&vma->swap_readahead_info,
				364	SWAP_RA_VAL(addr, win, hits));
				365	}
				366
				367	if (readahead) {
				368	count_vm_event(SWAP_RA_HIT);
				369	if (!vma \|\| !vma_ra)
				370	atomic_inc(&swapin_readahead_hits);
				371	}
				372	}
				373
				374	return page;
				375	}
				376
				377	struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				378	struct vm_area_struct *vma, unsigned long addr,
				379	bool *new_page_allocated)
				380	{
				381	struct page found_page, new_page = NULL;
				382	struct address_space *swapper_space = swap_address_space(entry);
				383	int err;
				384	*new_page_allocated = false;
				385
				386	do {
				387	/*
				388	* First check the swap cache. Since this is normally
				389	* called after lookup_swap_cache() failed, re-calling
				390	* that would confuse statistics.
				391	*/
				392	found_page = find_get_page(swapper_space, swp_offset(entry));
				393	if (found_page)
				394	break;
				395
				396	/*
				397	* Just skip read ahead for unused swap slot.
				398	* During swap_off when swap_slot_cache is disabled,
				399	* we have to handle the race between putting
				400	* swap entry in swap cache and marking swap slot
				401	* as SWAP_HAS_CACHE. That's done in later part of code or
				402	* else swap_off will be aborted if we return NULL.
				403	*/
				404	if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
				405	break;
				406
				407	/*
				408	* Get a new page to read into from swap.
				409	*/
				410	if (!new_page) {
				411	new_page = alloc_page_vma(gfp_mask, vma, addr);
				412	if (!new_page)
				413	break; /* Out of memory */
				414	}
				415
				416	/*
				417	* call radix_tree_preload() while we can wait.
				418	*/
				419	err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
				420	if (err)
				421	break;
				422
				423	/*
				424	* Swap entry may have been freed since our caller observed it.
				425	*/
				426	err = swapcache_prepare(entry);
				427	if (err == -EEXIST) {
				428	radix_tree_preload_end();
				429	/*
				430	* We might race against get_swap_page() and stumble
				431	* across a SWAP_HAS_CACHE swap_map entry whose page
				432	* has not been brought into the swapcache yet.
				433	*/
				434	cond_resched();
				435	continue;
				436	}
				437	if (err) { /* swp entry is obsolete ? */
				438	radix_tree_preload_end();
				439	break;
				440	}
				441
				442	/* May fail (-ENOMEM) if radix-tree node allocation failed. */
				443	__SetPageLocked(new_page);
				444	__SetPageSwapBacked(new_page);
				445	err = __add_to_swap_cache(new_page, entry);
				446	if (likely(!err)) {
				447	radix_tree_preload_end();
				448	/*
				449	* Initiate read into locked page and return.
				450	*/
				451	SetPageWorkingset(new_page);
				452	lru_cache_add_anon(new_page);
				453	*new_page_allocated = true;
				454	return new_page;
				455	}
				456	radix_tree_preload_end();
				457	__ClearPageLocked(new_page);
				458	/*
				459	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				460	* clear SWAP_HAS_CACHE flag.
				461	*/
				462	put_swap_page(new_page, entry);
				463	} while (err != -ENOMEM);
				464
				465	if (new_page)
				466	put_page(new_page);
				467	return found_page;
				468	}
				469
				470	/*
				471	* Locate a page of swap in physical memory, reserving swap cache space
				472	* and reading the disk if it is not already cached.
				473	* A failure return means that either the page allocation failed or that
				474	* the swap entry is no longer in use.
				475	*/
				476	struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
				477	struct vm_area_struct *vma, unsigned long addr, bool do_poll)
				478	{
				479	bool page_was_allocated;
				480	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
				481	vma, addr, &page_was_allocated);
				482
				483	if (page_was_allocated)
				484	swap_readpage(retpage, do_poll);
				485
				486	return retpage;
				487	}
				488
				/*
				 * Pick the readahead window size, in pages.
				 *
				 * @prev_offset: offset of the previous fault
				 * @offset:      offset of the current fault
				 * @hits:        readahead hits seen since the previous window
				 * @max_pages:   upper bound on the window
				 * @prev_win:    size of the previous window
				 *
				 * The window is never allowed to drop below half of the previous one.
				 */
				static unsigned int __swapin_nr_pages(unsigned long prev_offset,
								      unsigned long offset,
								      int hits,
								      int max_pages,
								      int prev_win)
				{
					/*
					 * This heuristic has been found to work well on both sequential and
					 * random loads, swapping to hard disk or to SSD: please don't ask
					 * what the "+ 2" means, it just happens to work well, that's all.
					 */
					unsigned int want = hits + 2;
					unsigned int min_win;

					if (want != 2) {
						/* Round up to a power of two, but never below 4. */
						unsigned int pow2;

						for (pow2 = 4; pow2 < want; pow2 <<= 1)
							;
						want = pow2;
					} else if (offset != prev_offset + 1 && offset != prev_offset - 1) {
						/*
						 * We can have no readahead hits to judge by: but must not
						 * get stuck here forever, so check for an adjacent offset
						 * instead (and don't even bother to check whether swap
						 * type is same).
						 */
						want = 1;
					}

					if (want > max_pages)
						want = max_pages;

					/* Don't shrink readahead too fast */
					min_win = prev_win / 2;
					return want < min_win ? min_win : want;
				}
				528
				529	static unsigned long swapin_nr_pages(unsigned long offset)
				530	{
				531	static unsigned long prev_offset;
				532	unsigned int hits, pages, max_pages;
				533	static atomic_t last_readahead_pages;
				534
				535	max_pages = 1 << READ_ONCE(page_cluster);
				536	if (max_pages <= 1)
				537	return 1;
				538
				539	hits = atomic_xchg(&swapin_readahead_hits, 0);
				540	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				541	atomic_read(&last_readahead_pages));
				542	if (!hits)
				543	prev_offset = offset;
				544	atomic_set(&last_readahead_pages, pages);
				545
				546	return pages;
				547	}
				548
				549	/**
				550	* swap_cluster_readahead - swap in pages in hope we need them soon
				551	* @entry: swap entry of this memory
				552	* @gfp_mask: memory allocation flags
				553	* @vmf: fault information
				554	*
				555	* Returns the struct page for entry and addr, after queueing swapin.
				556	*
				557	* Primitive swap readahead code. We simply read an aligned block of
				558	* (1 << page_cluster) entries in the swap area. This method is chosen
				559	* because it doesn't cost us any seek time. We also make sure to queue
				560	* the 'original' request together with the readahead ones...
				561	*
				562	* This has been extended to use the NUMA policies from the mm triggering
				563	* the readahead.
				564	*
				565	* Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
				566	*/
				567	struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				568	struct vm_fault *vmf)
				569	{
				570	struct page *page;
				571	unsigned long entry_offset = swp_offset(entry);
				572	unsigned long offset = entry_offset;
				573	unsigned long start_offset, end_offset;
				574	unsigned long mask;
				575	struct swap_info_struct *si = swp_swap_info(entry);
				576	struct blk_plug plug;
				577	bool do_poll = true, page_allocated;
				578	struct vm_area_struct *vma = vmf->vma;
				579	unsigned long addr = vmf->address;
				580
				581	mask = swapin_nr_pages(offset) - 1;
				582	if (!mask)
				583	goto skip;
				584
				585	do_poll = false;
				586	/* Read a page_cluster sized and aligned cluster around offset. */
				587	start_offset = offset & ~mask;
				588	end_offset = offset \| mask;
				589	if (!start_offset) /* First page is swap header. */
				590	start_offset++;
				591	if (end_offset >= si->max)
				592	end_offset = si->max - 1;
				593
				594	blk_start_plug(&plug);
				595	for (offset = start_offset; offset <= end_offset ; offset++) {
				596	/* Ok, do the async read-ahead now */
				597	page = __read_swap_cache_async(
				598	swp_entry(swp_type(entry), offset),
				599	gfp_mask, vma, addr, &page_allocated);
				600	if (!page)
				601	continue;
				602	if (page_allocated) {
				603	swap_readpage(page, false);
				604	if (offset != entry_offset) {
				605	SetPageReadahead(page);
				606	count_vm_event(SWAP_RA);
				607	}
				608	}
				609	put_page(page);
				610	}
				611	blk_finish_plug(&plug);
				612
				613	lru_add_drain(); /* Push any new pages onto the LRU now */
				614	skip:
				615	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
				616	}
				617
				618	int init_swap_address_space(unsigned int type, unsigned long nr_pages)
				619	{
				620	struct address_space spaces, space;
				621	unsigned int i, nr;
				622
				623	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
				624	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
				625	if (!spaces)
				626	return -ENOMEM;
				627	for (i = 0; i < nr; i++) {
				628	space = spaces + i;
				629	INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC\|__GFP_NOWARN);
				630	atomic_set(&space->i_mmap_writable, 0);
				631	space->a_ops = &swap_aops;
				632	/* swap cache doesn't use writeback related tags */
				633	mapping_set_no_writeback_tags(space);
				634	}
				635	nr_swapper_spaces[type] = nr;
				636	rcu_assign_pointer(swapper_spaces[type], spaces);
				637
				638	return 0;
				639	}
				640
				641	void exit_swap_address_space(unsigned int type)
				642	{
				643	struct address_space *spaces;
				644
				645	spaces = swapper_spaces[type];
				646	nr_swapper_spaces[type] = 0;
				647	rcu_assign_pointer(swapper_spaces[type], NULL);
				648	synchronize_rcu();
				649	kvfree(spaces);
				650	}
				651
				652	static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				653	unsigned long faddr,
				654	unsigned long lpfn,
				655	unsigned long rpfn,
				656	unsigned long *start,
				657	unsigned long *end)
				658	{
				659	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
				660	PFN_DOWN(faddr & PMD_MASK));
				661	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
				662	PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
				663	}
				664
				665	static void swap_ra_info(struct vm_fault *vmf,
				666	struct vma_swap_readahead *ra_info)
				667	{
				668	struct vm_area_struct *vma = vmf->vma;
				669	unsigned long ra_val;
				670	swp_entry_t entry;
				671	unsigned long faddr, pfn, fpfn;
				672	unsigned long start, end;
				673	pte_t pte, orig_pte;
				674	unsigned int max_win, hits, prev_win, win, left;
				675	#ifndef CONFIG_64BIT
				676	pte_t *tpte;
				677	#endif
				678
				679	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
				680	SWAP_RA_ORDER_CEILING);
				681	if (max_win == 1) {
				682	ra_info->win = 1;
				683	return;
				684	}
				685
				686	faddr = vmf->address;
				687	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
				688	entry = pte_to_swp_entry(*pte);
				689	if ((unlikely(non_swap_entry(entry)))) {
				690	pte_unmap(orig_pte);
				691	return;
				692	}
				693
				694	fpfn = PFN_DOWN(faddr);
				695	ra_val = GET_SWAP_RA_VAL(vma);
				696	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
				697	prev_win = SWAP_RA_WIN(ra_val);
				698	hits = SWAP_RA_HITS(ra_val);
				699	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
				700	max_win, prev_win);
				701	atomic_long_set(&vma->swap_readahead_info,
				702	SWAP_RA_VAL(faddr, win, 0));
				703
				704	if (win == 1) {
				705	pte_unmap(orig_pte);
				706	return;
				707	}
				708
				709	/* Copy the PTEs because the page table may be unmapped */
				710	if (fpfn == pfn + 1)
				711	swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
				712	else if (pfn == fpfn + 1)
				713	swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				714	&start, &end);
				715	else {
				716	left = (win - 1) / 2;
				717	swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				718	&start, &end);
				719	}
				720	ra_info->nr_pte = end - start;
				721	ra_info->offset = fpfn - start;
				722	pte -= ra_info->offset;
				723	#ifdef CONFIG_64BIT
				724	ra_info->ptes = pte;
				725	#else
				726	tpte = ra_info->ptes;
				727	for (pfn = start; pfn != end; pfn++)
				728	tpte++ = pte++;
				729	#endif
				730	pte_unmap(orig_pte);
				731	}
				732
				733	static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				734	struct vm_fault *vmf)
				735	{
				736	struct blk_plug plug;
				737	struct vm_area_struct *vma = vmf->vma;
				738	struct page *page;
				739	pte_t *pte, pentry;
				740	swp_entry_t entry;
				741	unsigned int i;
				742	bool page_allocated;
				743	struct vma_swap_readahead ra_info = {0,};
				744
				745	swap_ra_info(vmf, &ra_info);
				746	if (ra_info.win == 1)
				747	goto skip;
				748
				749	blk_start_plug(&plug);
				750	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
				751	i++, pte++) {
				752	pentry = *pte;
				753	if (pte_none(pentry))
				754	continue;
				755	if (pte_present(pentry))
				756	continue;
				757	entry = pte_to_swp_entry(pentry);
				758	if (unlikely(non_swap_entry(entry)))
				759	continue;
				760	page = __read_swap_cache_async(entry, gfp_mask, vma,
				761	vmf->address, &page_allocated);
				762	if (!page)
				763	continue;
				764	if (page_allocated) {
				765	swap_readpage(page, false);
				766	if (i != ra_info.offset) {
				767	SetPageReadahead(page);
				768	count_vm_event(SWAP_RA);
				769	}
				770	}
				771	put_page(page);
				772	}
				773	blk_finish_plug(&plug);
				774	lru_add_drain();
				775	skip:
				776	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				777	ra_info.win == 1);
				778	}
				779
				780	/**
				781	* swapin_readahead - swap in pages in hope we need them soon
				782	* @entry: swap entry of this memory
				783	* @gfp_mask: memory allocation flags
				784	* @vmf: fault information
				785	*
				786	* Returns the struct page for entry and addr, after queueing swapin.
				787	*
				788	* It's a main entry function for swap readahead. By the configuration,
				789	* it will read ahead blocks by cluster-based(ie, physical disk based)
				790	* or vma-based(ie, virtual address based on faulty address) readahead.
				791	*/
				792	struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				793	struct vm_fault *vmf)
				794	{
				795	return swap_use_vma_readahead() ?
				796	swap_vma_readahead(entry, gfp_mask, vmf) :
				797	swap_cluster_readahead(entry, gfp_mask, vmf);
				798	}
				799
				800	#ifdef CONFIG_SYSFS
				801	static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				802	struct kobj_attribute attr, char buf)
				803	{
				804	return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
				805	}
				806	static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				807	struct kobj_attribute *attr,
				808	const char *buf, size_t count)
				809	{
				810	if (!strncmp(buf, "true", 4) \|\| !strncmp(buf, "1", 1))
				811	enable_vma_readahead = true;
				812	else if (!strncmp(buf, "false", 5) \|\| !strncmp(buf, "0", 1))
				813	enable_vma_readahead = false;
				814	else
				815	return -EINVAL;
				816
				817	return count;
				818	}
				819	static struct kobj_attribute vma_ra_enabled_attr =
				820	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
				821	vma_ra_enabled_store);
				822
				823	static struct attribute *swap_attrs[] = {
				824	&vma_ra_enabled_attr.attr,
				825	NULL,
				826	};
				827
				828	static struct attribute_group swap_attr_group = {
				829	.attrs = swap_attrs,
				830	};
				831
				832	static int __init swap_init_sysfs(void)
				833	{
				834	int err;
				835	struct kobject *swap_kobj;
				836
				837	swap_kobj = kobject_create_and_add("swap", mm_kobj);
				838	if (!swap_kobj) {
				839	pr_err("failed to create swap kobject\n");
				840	return -ENOMEM;
				841	}
				842	err = sysfs_create_group(swap_kobj, &swap_attr_group);
				843	if (err) {
				844	pr_err("failed to register swap group\n");
				845	goto delete_obj;
				846	}
				847	return 0;
				848
				849	delete_obj:
				850	kobject_put(swap_kobj);
				851	return err;
				852	}
				853	subsys_initcall(swap_init_sysfs);
				854	#endif