Blame - src/kernel/linux/v4.19/mm/swapfile.c - T800

blob: 02ad17aca54d95e6a5572df13787bfc1144b66ce [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* linux/mm/swapfile.c
				3	*
				4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				5	* Swap reorganised 29.12.95, Stephen Tweedie
				6	*/
				7
				8	#include <linux/mm.h>
				9	#include <linux/sched/mm.h>
				10	#include <linux/sched/task.h>
				11	#include <linux/hugetlb.h>
				12	#include <linux/mman.h>
				13	#include <linux/slab.h>
				14	#include <linux/kernel_stat.h>
				15	#include <linux/swap.h>
				16	#include <linux/vmalloc.h>
				17	#include <linux/pagemap.h>
				18	#include <linux/namei.h>
				19	#include <linux/shmem_fs.h>
				20	#include <linux/blkdev.h>
				21	#include <linux/random.h>
				22	#include <linux/writeback.h>
				23	#include <linux/proc_fs.h>
				24	#include <linux/seq_file.h>
				25	#include <linux/init.h>
				26	#include <linux/ksm.h>
				27	#include <linux/rmap.h>
				28	#include <linux/security.h>
				29	#include <linux/backing-dev.h>
				30	#include <linux/mutex.h>
				31	#include <linux/capability.h>
				32	#include <linux/syscalls.h>
				33	#include <linux/memcontrol.h>
				34	#include <linux/poll.h>
				35	#include <linux/oom.h>
				36	#include <linux/frontswap.h>
				37	#include <linux/swapfile.h>
				38	#include <linux/export.h>
				39	#include <linux/swap_slots.h>
				40	#include <linux/sort.h>
				41
				42	#include <asm/pgtable.h>
				43	#include <asm/tlbflush.h>
				44	#include <linux/swapops.h>
				45	#include <linux/swap_cgroup.h>
				46
				/* Forward declarations; only the prototypes are visible in this part of the file. */
				47	static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				48	unsigned char);
				49	static void free_swap_count_continuations(struct swap_info_struct *);
				50	static sector_t map_swap_entry(swp_entry_t, struct block_device**);
				51
				52	DEFINE_SPINLOCK(swap_lock);
				/* Bound that swap_type_to_swap_info() checks before indexing swap_info[]. */
				53	static unsigned int nr_swapfiles;
				/*
				 * Global count of available swap pages: get_swap_pages() subtracts the
				 * slots it reserves and swap_range_free() adds freed slots back.
				 */
				54	atomic_long_t nr_swap_pages;
				55	/*
				56	* Some modules use swappable objects and may try to swap them out under
				57	* memory pressure (via the shrinker). Before doing so, they may wish to
				58	* check to see if any swap space is available.
				59	*/
				60	EXPORT_SYMBOL_GPL(nr_swap_pages);
				61	/* Protected by swap_lock; the read in vm_swap_full() doesn't need the lock. */
				62	long total_swap_pages;
				63	static int least_priority = -1;
				64
				/* Message prefixes; their users are outside this part of the file. */
				65	static const char Bad_file[] = "Bad swap file entry ";
				66	static const char Unused_file[] = "Unused swap file entry ";
				67	static const char Bad_offset[] = "Bad swap offset entry ";
				68	static const char Unused_offset[] = "Unused swap offset entry ";
				69
				70	/*
				71	* All active swap_info_structs,
				72	* protected by swap_lock and ordered by priority.
				73	*/
				74	PLIST_HEAD(swap_active_head);
				75
				76	/*
				77	* All available (active, not full) swap_info_structs,
				78	* protected by swap_avail_lock and ordered by priority.
				79	* This is used by get_swap_page() instead of swap_active_head
				80	* because swap_active_head includes all swap_info_structs,
				81	* but get_swap_page() doesn't need to look at full ones.
				82	* This uses its own lock instead of swap_lock because when a
				83	* swap_info_struct changes between not-full/full, it needs to
				84	* add/remove itself to/from this list, but the swap_info_struct->lock
				85	* is held and the locking order requires swap_lock to be taken
				86	* before any swap_info_struct->lock.
				87	* The array is indexed by NUMA node id (one plist head per node).
				*/
				88	static struct plist_head *swap_avail_heads;
				89	static DEFINE_SPINLOCK(swap_avail_lock);
				90
				/* Table of swap devices, indexed by swap type (see swap_type_to_swap_info()). */
				91	struct swap_info_struct *swap_info[MAX_SWAPFILES];
				92
				93	static DEFINE_MUTEX(swapon_mutex);
				94
				95	static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
				96	/* Activity counter to indicate that a swapon or swapoff has occurred */
				97	static atomic_t proc_poll_event = ATOMIC_INIT(0);
				98
				99	atomic_t nr_rotate_swap = ATOMIC_INIT(0);
				100
				101	static struct swap_info_struct *swap_type_to_swap_info(int type)
				102	{
				103	if (type >= READ_ONCE(nr_swapfiles))
				104	return NULL;
				105
				106	smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
				107	return READ_ONCE(swap_info[type]);
				108	}
				109
				110	static inline unsigned char swap_count(unsigned char ent)
				111	{
				112	return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
				113	}
				114
				115	/* returns 1 if swap entry is freed */
				116	static int
				117	__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
				118	{
				119	swp_entry_t entry = swp_entry(si->type, offset);
				120	struct page *page;
				121	int ret = 0;
				122
				123	page = find_get_page(swap_address_space(entry), swp_offset(entry));
				124	if (!page)
				125	return 0;
				126	/*
				127	* This function is called from scan_swap_map() and it's called
				128	* by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
				129	* We have to use trylock for avoiding deadlock. This is a special
				130	* case and you should use try_to_free_swap() with explicit lock_page()
				131	* in usual operations.
				132	*/
				133	if (trylock_page(page)) {
				134	ret = try_to_free_swap(page);
				135	unlock_page(page);
				136	}
				137	put_page(page);
				138	return ret;
				139	}
				140
				141	/*
				142	* swapon tell device that all the old swap contents can be discarded,
				143	* to allow the swap device to optimize its wear-levelling.
				144	*/
				145	static int discard_swap(struct swap_info_struct *si)
				146	{
				147	struct swap_extent *se;
				148	sector_t start_block;
				149	sector_t nr_blocks;
				150	int err = 0;
				151
				152	/* Do not discard the swap header page! */
				153	se = &si->first_swap_extent;
				154	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
				155	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
				156	if (nr_blocks) {
				157	err = blkdev_issue_discard(si->bdev, start_block,
				158	nr_blocks, GFP_KERNEL, 0);
				159	if (err)
				160	return err;
				161	cond_resched();
				162	}
				163
				164	list_for_each_entry(se, &si->first_swap_extent.list, list) {
				165	start_block = se->start_block << (PAGE_SHIFT - 9);
				166	nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
				167
				168	err = blkdev_issue_discard(si->bdev, start_block,
				169	nr_blocks, GFP_KERNEL, 0);
				170	if (err)
				171	break;
				172
				173	cond_resched();
				174	}
				175	return err; /* That will often be -EOPNOTSUPP */
				176	}
				177
				178	/*
				179	* swap allocation tell device that a cluster of swap can now be discarded,
				180	* to allow the swap device to optimize its wear-levelling.
				181	*/
				182	static void discard_swap_cluster(struct swap_info_struct *si,
				183	pgoff_t start_page, pgoff_t nr_pages)
				184	{
				185	struct swap_extent *se = si->curr_swap_extent;
				186	int found_extent = 0;
				187
				188	while (nr_pages) {
				189	if (se->start_page <= start_page &&
				190	start_page < se->start_page + se->nr_pages) {
				191	pgoff_t offset = start_page - se->start_page;
				192	sector_t start_block = se->start_block + offset;
				193	sector_t nr_blocks = se->nr_pages - offset;
				194
				195	if (nr_blocks > nr_pages)
				196	nr_blocks = nr_pages;
				197	start_page += nr_blocks;
				198	nr_pages -= nr_blocks;
				199
				200	if (!found_extent++)
				201	si->curr_swap_extent = se;
				202
				203	start_block <<= PAGE_SHIFT - 9;
				204	nr_blocks <<= PAGE_SHIFT - 9;
				205	if (blkdev_issue_discard(si->bdev, start_block,
				206	nr_blocks, GFP_NOIO, 0))
				207	break;
				208	}
				209
				210	se = list_next_entry(se, list);
				211	}
				212	}
				213
				/*
				 * SWAPFILE_CLUSTER is the number of swap slots per cluster: the helpers
				 * below map a slot offset to its cluster with offset / SWAPFILE_CLUSTER.
				 * With CONFIG_THP_SWAP it is HPAGE_PMD_NR, otherwise 256.
				 */
				214	#ifdef CONFIG_THP_SWAP
				215	#define SWAPFILE_CLUSTER HPAGE_PMD_NR
				216
				217	#define swap_entry_size(size) (size)
				218	#else
				219	#define SWAPFILE_CLUSTER 256
				220
				221	/*
				222	* Define swap_entry_size() as a constant so the compiler can optimize
				223	* out some code if !CONFIG_THP_SWAP
				224	*/
				225	#define swap_entry_size(size) 1
				226	#endif
				/*
				 * Number of scan iterations between cond_resched() calls in
				 * scan_swap_map_slots() (initial and reset value of latency_ration).
				 */
				227	#define LATENCY_LIMIT 256
				228
				229	static inline void cluster_set_flag(struct swap_cluster_info *info,
				230	unsigned int flag)
				231	{
				232	info->flags = flag;
				233	}
				234
				235	static inline unsigned int cluster_count(struct swap_cluster_info *info)
				236	{
				237	return info->data;
				238	}
				239
				240	static inline void cluster_set_count(struct swap_cluster_info *info,
				241	unsigned int c)
				242	{
				243	info->data = c;
				244	}
				245
				/* Set both the flags (@f) and the usage count (@c) of @info. */
				static inline void cluster_set_count_flag(struct swap_cluster_info *info,
									  unsigned int c, unsigned int f)
				{
					cluster_set_flag(info, f);
					cluster_set_count(info, c);
				}
				252
				253	static inline unsigned int cluster_next(struct swap_cluster_info *info)
				254	{
				255	return info->data;
				256	}
				257
				258	static inline void cluster_set_next(struct swap_cluster_info *info,
				259	unsigned int n)
				260	{
				261	info->data = n;
				262	}
				263
				/* Set both the flags (@f) and the next-cluster index (@n) of @info. */
				static inline void cluster_set_next_flag(struct swap_cluster_info *info,
									 unsigned int n, unsigned int f)
				{
					cluster_set_flag(info, f);
					cluster_set_next(info, n);
				}
				270
				271	static inline bool cluster_is_free(struct swap_cluster_info *info)
				272	{
				273	return info->flags & CLUSTER_FLAG_FREE;
				274	}
				275
				276	static inline bool cluster_is_null(struct swap_cluster_info *info)
				277	{
				278	return info->flags & CLUSTER_FLAG_NEXT_NULL;
				279	}
				280
				281	static inline void cluster_set_null(struct swap_cluster_info *info)
				282	{
				283	info->flags = CLUSTER_FLAG_NEXT_NULL;
				284	info->data = 0;
				285	}
				286
				287	static inline bool cluster_is_huge(struct swap_cluster_info *info)
				288	{
				289	if (IS_ENABLED(CONFIG_THP_SWAP))
				290	return info->flags & CLUSTER_FLAG_HUGE;
				291	return false;
				292	}
				293
				294	static inline void cluster_clear_huge(struct swap_cluster_info *info)
				295	{
				296	info->flags &= ~CLUSTER_FLAG_HUGE;
				297	}
				298
				299	static inline struct swap_cluster_info lock_cluster(struct swap_info_struct si,
				300	unsigned long offset)
				301	{
				302	struct swap_cluster_info *ci;
				303
				304	ci = si->cluster_info;
				305	if (ci) {
				306	ci += offset / SWAPFILE_CLUSTER;
				307	spin_lock(&ci->lock);
				308	}
				309	return ci;
				310	}
				311
				312	static inline void unlock_cluster(struct swap_cluster_info *ci)
				313	{
				314	if (ci)
				315	spin_unlock(&ci->lock);
				316	}
				317
				318	/*
				319	* Determine the locking method in use for this device. Return
				320	* swap_cluster_info if SSD-style cluster-based locking is in place.
				321	*/
				322	static inline struct swap_cluster_info *lock_cluster_or_swap_info(
				323	struct swap_info_struct *si, unsigned long offset)
				324	{
				325	struct swap_cluster_info *ci;
				326
				327	/* Try to use fine-grained SSD-style locking if available: */
				328	ci = lock_cluster(si, offset);
				329	/* Otherwise, fall back to traditional, coarse locking: */
				330	if (!ci)
				331	spin_lock(&si->lock);
				332
				333	return ci;
				334	}
				335
				336	static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
				337	struct swap_cluster_info *ci)
				338	{
				339	if (ci)
				340	unlock_cluster(ci);
				341	else
				342	spin_unlock(&si->lock);
				343	}
				344
				345	static inline bool cluster_list_empty(struct swap_cluster_list *list)
				346	{
				347	return cluster_is_null(&list->head);
				348	}
				349
				350	static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
				351	{
				352	return cluster_next(&list->head);
				353	}
				354
				355	static void cluster_list_init(struct swap_cluster_list *list)
				356	{
				357	cluster_set_null(&list->head);
				358	cluster_set_null(&list->tail);
				359	}
				360
				361	static void cluster_list_add_tail(struct swap_cluster_list *list,
				362	struct swap_cluster_info *ci,
				363	unsigned int idx)
				364	{
				365	if (cluster_list_empty(list)) {
				366	cluster_set_next_flag(&list->head, idx, 0);
				367	cluster_set_next_flag(&list->tail, idx, 0);
				368	} else {
				369	struct swap_cluster_info *ci_tail;
				370	unsigned int tail = cluster_next(&list->tail);
				371
				372	/*
				373	* Nested cluster lock, but both cluster locks are
				374	* only acquired when we held swap_info_struct->lock
				375	*/
				376	ci_tail = ci + tail;
				377	spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
				378	cluster_set_next(ci_tail, idx);
				379	spin_unlock(&ci_tail->lock);
				380	cluster_set_next_flag(&list->tail, idx, 0);
				381	}
				382	}
				383
				384	static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
				385	struct swap_cluster_info *ci)
				386	{
				387	unsigned int idx;
				388
				389	idx = cluster_next(&list->head);
				390	if (cluster_next(&list->tail) == idx) {
				391	cluster_set_null(&list->head);
				392	cluster_set_null(&list->tail);
				393	} else
				394	cluster_set_next_flag(&list->head,
				395	cluster_next(&ci[idx]), 0);
				396
				397	return idx;
				398	}
				399
				400	/* Add a cluster to discard list and schedule it to do discard */
				401	static void swap_cluster_schedule_discard(struct swap_info_struct *si,
				402	unsigned int idx)
				403	{
				404	/*
				405	* If scan_swap_map() can't find a free cluster, it will check
				406	* si->swap_map directly. To make sure the discarding cluster isn't
				407	* taken by scan_swap_map(), mark the swap entries bad (occupied). It
				408	* will be cleared after discard
				409	*/
				410	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				411	SWAP_MAP_BAD, SWAPFILE_CLUSTER);
				412
				413	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
				414
				415	schedule_work(&si->discard_work);
				416	}
				417
				418	static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
				419	{
				420	struct swap_cluster_info *ci = si->cluster_info;
				421
				422	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
				423	cluster_list_add_tail(&si->free_clusters, ci, idx);
				424	}
				425
				426	/*
				427	* Doing discard actually. After a cluster discard is finished, the cluster
				428	* will be added to free cluster list. caller should hold si->lock.
				429	*/
				430	static void swap_do_scheduled_discard(struct swap_info_struct *si)
				431	{
				432	struct swap_cluster_info info, ci;
				433	unsigned int idx;
				434
				435	info = si->cluster_info;
				436
				437	while (!cluster_list_empty(&si->discard_clusters)) {
				438	idx = cluster_list_del_first(&si->discard_clusters, info);
				439	spin_unlock(&si->lock);
				440
				441	discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				442	SWAPFILE_CLUSTER);
				443
				444	spin_lock(&si->lock);
				445	ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
				446	__free_cluster(si, idx);
				447	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				448	0, SWAPFILE_CLUSTER);
				449	unlock_cluster(ci);
				450	}
				451	}
				452
				453	static void swap_discard_work(struct work_struct *work)
				454	{
				455	struct swap_info_struct *si;
				456
				457	si = container_of(work, struct swap_info_struct, discard_work);
				458
				459	spin_lock(&si->lock);
				460	swap_do_scheduled_discard(si);
				461	spin_unlock(&si->lock);
				462	}
				463
				464	static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
				465	{
				466	struct swap_cluster_info *ci = si->cluster_info;
				467
				468	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
				469	cluster_list_del_first(&si->free_clusters, ci);
				470	cluster_set_count_flag(ci + idx, 0, 0);
				471	}
				472
				473	static void free_cluster(struct swap_info_struct *si, unsigned long idx)
				474	{
				475	struct swap_cluster_info *ci = si->cluster_info + idx;
				476
				477	VM_BUG_ON(cluster_count(ci) != 0);
				478	/*
				479	* If the swap is discardable, prepare discard the cluster
				480	* instead of free it immediately. The cluster will be freed
				481	* after discard.
				482	*/
				483	if ((si->flags & (SWP_WRITEOK \| SWP_PAGE_DISCARD)) ==
				484	(SWP_WRITEOK \| SWP_PAGE_DISCARD)) {
				485	swap_cluster_schedule_discard(si, idx);
				486	return;
				487	}
				488
				489	__free_cluster(si, idx);
				490	}
				491
				492	/*
				493	* The cluster corresponding to page_nr will be used. The cluster will be
				494	* removed from free cluster list and its usage counter will be increased.
				495	*/
				496	static void inc_cluster_info_page(struct swap_info_struct *p,
				497	struct swap_cluster_info *cluster_info, unsigned long page_nr)
				498	{
				499	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
				500
				501	if (!cluster_info)
				502	return;
				503	if (cluster_is_free(&cluster_info[idx]))
				504	alloc_cluster(p, idx);
				505
				506	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
				507	cluster_set_count(&cluster_info[idx],
				508	cluster_count(&cluster_info[idx]) + 1);
				509	}
				510
				511	/*
				512	* The cluster corresponding to page_nr decreases one usage. If the usage
				513	* counter becomes 0, which means no page in the cluster is in using, we can
				514	* optionally discard the cluster and add it to free cluster list.
				515	*/
				516	static void dec_cluster_info_page(struct swap_info_struct *p,
				517	struct swap_cluster_info *cluster_info, unsigned long page_nr)
				518	{
				519	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
				520
				521	if (!cluster_info)
				522	return;
				523
				524	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
				525	cluster_set_count(&cluster_info[idx],
				526	cluster_count(&cluster_info[idx]) - 1);
				527
				528	if (cluster_count(&cluster_info[idx]) == 0)
				529	free_cluster(p, idx);
				530	}
				531
				532	/*
				533	* It's possible scan_swap_map() uses a free cluster in the middle of free
				534	* cluster list. Avoiding such abuse to avoid list corruption.
				535	*/
				536	static bool
				537	scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
				538	unsigned long offset)
				539	{
				540	struct percpu_cluster *percpu_cluster;
				541	bool conflict;
				542
				543	offset /= SWAPFILE_CLUSTER;
				544	conflict = !cluster_list_empty(&si->free_clusters) &&
				545	offset != cluster_list_first(&si->free_clusters) &&
				546	cluster_is_free(&si->cluster_info[offset]);
				547
				548	if (!conflict)
				549	return false;
				550
				551	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
				552	cluster_set_null(&percpu_cluster->index);
				553	return true;
				554	}
				555
				556	/*
				557	* Try to get a swap entry from current cpu's swap entry pool (a cluster). This
				558	* might involve allocating a new cluster for current CPU too.
				559	*/
				560	static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
				561	unsigned long offset, unsigned long scan_base)
				562	{
				563	struct percpu_cluster *cluster;
				564	struct swap_cluster_info *ci;
				565	bool found_free;
				566	unsigned long tmp, max;
				567
				568	new_cluster:
				569	cluster = this_cpu_ptr(si->percpu_cluster);
				570	if (cluster_is_null(&cluster->index)) {
				571	if (!cluster_list_empty(&si->free_clusters)) {
				572	cluster->index = si->free_clusters.head;
				573	cluster->next = cluster_next(&cluster->index) *
				574	SWAPFILE_CLUSTER;
				575	} else if (!cluster_list_empty(&si->discard_clusters)) {
				576	/*
				577	* we don't have free cluster but have some clusters in
				578	* discarding, do discard now and reclaim them
				579	*/
				580	swap_do_scheduled_discard(si);
				581	scan_base = offset = si->cluster_next;
				582	goto new_cluster;
				583	} else
				584	return false;
				585	}
				586
				587	found_free = false;
				588
				589	/*
				590	* Other CPUs can use our cluster if they can't find a free cluster,
				591	* check if there is still free entry in the cluster
				592	*/
				593	tmp = cluster->next;
				594	max = min_t(unsigned long, si->max,
				595	(cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
				596	if (tmp >= max) {
				597	cluster_set_null(&cluster->index);
				598	goto new_cluster;
				599	}
				600	ci = lock_cluster(si, tmp);
				601	while (tmp < max) {
				602	if (!si->swap_map[tmp]) {
				603	found_free = true;
				604	break;
				605	}
				606	tmp++;
				607	}
				608	unlock_cluster(ci);
				609	if (!found_free) {
				610	cluster_set_null(&cluster->index);
				611	goto new_cluster;
				612	}
				613	cluster->next = tmp + 1;
				614	*offset = tmp;
				615	*scan_base = tmp;
				616	return found_free;
				617	}
				618
				619	static void __del_from_avail_list(struct swap_info_struct *p)
				620	{
				621	int nid;
				622
				623	for_each_node(nid)
				624	plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
				625	}
				626
				627	static void del_from_avail_list(struct swap_info_struct *p)
				628	{
				629	spin_lock(&swap_avail_lock);
				630	__del_from_avail_list(p);
				631	spin_unlock(&swap_avail_lock);
				632	}
				633
				634	static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
				635	unsigned int nr_entries)
				636	{
				637	unsigned int end = offset + nr_entries - 1;
				638
				639	if (offset == si->lowest_bit)
				640	si->lowest_bit += nr_entries;
				641	if (end == si->highest_bit)
				642	si->highest_bit -= nr_entries;
				643	si->inuse_pages += nr_entries;
				644	if (si->inuse_pages == si->pages) {
				645	si->lowest_bit = si->max;
				646	si->highest_bit = 0;
				647	del_from_avail_list(si);
				648	}
				649	}
				650
				651	static void add_to_avail_list(struct swap_info_struct *p)
				652	{
				653	int nid;
				654
				655	spin_lock(&swap_avail_lock);
				656	for_each_node(nid) {
				657	WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
				658	plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
				659	}
				660	spin_unlock(&swap_avail_lock);
				661	}
				662
				663	static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
				664	unsigned int nr_entries)
				665	{
				666	unsigned long end = offset + nr_entries - 1;
				667	void (swap_slot_free_notify)(struct block_device , unsigned long);
				668
				669	if (offset < si->lowest_bit)
				670	si->lowest_bit = offset;
				671	if (end > si->highest_bit) {
				672	bool was_full = !si->highest_bit;
				673
				674	si->highest_bit = end;
				675	if (was_full && (si->flags & SWP_WRITEOK))
				676	add_to_avail_list(si);
				677	}
				678	atomic_long_add(nr_entries, &nr_swap_pages);
				679	si->inuse_pages -= nr_entries;
				680	if (si->flags & SWP_BLKDEV)
				681	swap_slot_free_notify =
				682	si->bdev->bd_disk->fops->swap_slot_free_notify;
				683	else
				684	swap_slot_free_notify = NULL;
				685	while (offset <= end) {
				686	frontswap_invalidate_page(si->type, offset);
				687	if (swap_slot_free_notify)
				688	swap_slot_free_notify(si->bdev, offset);
				689	offset++;
				690	}
				691	}
				692
				693	static int scan_swap_map_slots(struct swap_info_struct *si,
				694	unsigned char usage, int nr,
				695	swp_entry_t slots[])
				696	{
				697	struct swap_cluster_info *ci;
				698	unsigned long offset;
				699	unsigned long scan_base;
				700	unsigned long last_in_cluster = 0;
				701	int latency_ration = LATENCY_LIMIT;
				702	int n_ret = 0;
				703
				704	if (nr > SWAP_BATCH)
				705	nr = SWAP_BATCH;
				706
				707	/*
				708	* We try to cluster swap pages by allocating them sequentially
				709	* in swap. Once we've allocated SWAPFILE_CLUSTER pages this
				710	* way, however, we resort to first-free allocation, starting
				711	* a new cluster. This prevents us from scattering swap pages
				712	* all over the entire swap partition, so that we reduce
				713	* overall disk seek times between swap pages. -- sct
				714	* But we do now try to find an empty cluster. -Andrea
				715	* And we let swap pages go all over an SSD partition. Hugh
				716	*/
				717
				718	si->flags += SWP_SCANNING;
				719	scan_base = offset = si->cluster_next;
				720
				721	/* SSD algorithm */
				722	if (si->cluster_info) {
				723	if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
				724	goto checks;
				725	else
				726	goto scan;
				727	}
				728
				729	if (unlikely(!si->cluster_nr--)) {
				730	if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
				731	si->cluster_nr = SWAPFILE_CLUSTER - 1;
				732	goto checks;
				733	}
				734
				735	spin_unlock(&si->lock);
				736
				737	/*
				738	* If seek is expensive, start searching for new cluster from
				739	* start of partition, to minimize the span of allocated swap.
				740	* If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
				741	* case, just handled by scan_swap_map_try_ssd_cluster() above.
				742	*/
				743	scan_base = offset = si->lowest_bit;
				744	last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
				745
				746	/* Locate the first empty (unaligned) cluster */
				747	for (; last_in_cluster <= si->highest_bit; offset++) {
				748	if (si->swap_map[offset])
				749	last_in_cluster = offset + SWAPFILE_CLUSTER;
				750	else if (offset == last_in_cluster) {
				751	spin_lock(&si->lock);
				752	offset -= SWAPFILE_CLUSTER - 1;
				753	si->cluster_next = offset;
				754	si->cluster_nr = SWAPFILE_CLUSTER - 1;
				755	goto checks;
				756	}
				757	if (unlikely(--latency_ration < 0)) {
				758	cond_resched();
				759	latency_ration = LATENCY_LIMIT;
				760	}
				761	}
				762
				763	offset = scan_base;
				764	spin_lock(&si->lock);
				765	si->cluster_nr = SWAPFILE_CLUSTER - 1;
				766	}
				767
				768	checks:
				769	if (si->cluster_info) {
				770	while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
				771	/* take a break if we already got some slots */
				772	if (n_ret)
				773	goto done;
				774	if (!scan_swap_map_try_ssd_cluster(si, &offset,
				775	&scan_base))
				776	goto scan;
				777	}
				778	}
				779	if (!(si->flags & SWP_WRITEOK))
				780	goto no_page;
				781	if (!si->highest_bit)
				782	goto no_page;
				783	if (offset > si->highest_bit)
				784	scan_base = offset = si->lowest_bit;
				785
				786	ci = lock_cluster(si, offset);
				787	/* reuse swap entry of cache-only swap if not busy. */
				788	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
				789	int swap_was_freed;
				790	unlock_cluster(ci);
				791	spin_unlock(&si->lock);
				792	swap_was_freed = __try_to_reclaim_swap(si, offset);
				793	spin_lock(&si->lock);
				794	/* entry was freed successfully, try to use this again */
				795	if (swap_was_freed)
				796	goto checks;
				797	goto scan; /* check next one */
				798	}
				799
				800	if (si->swap_map[offset]) {
				801	unlock_cluster(ci);
				802	if (!n_ret)
				803	goto scan;
				804	else
				805	goto done;
				806	}
				807	si->swap_map[offset] = usage;
				808	inc_cluster_info_page(si, si->cluster_info, offset);
				809	unlock_cluster(ci);
				810
				811	swap_range_alloc(si, offset, 1);
				812	si->cluster_next = offset + 1;
				813	slots[n_ret++] = swp_entry(si->type, offset);
				814
				815	/* got enough slots or reach max slots? */
				816	if ((n_ret == nr) \|\| (offset >= si->highest_bit))
				817	goto done;
				818
				819	/* search for next available slot */
				820
				821	/* time to take a break? */
				822	if (unlikely(--latency_ration < 0)) {
				823	if (n_ret)
				824	goto done;
				825	spin_unlock(&si->lock);
				826	cond_resched();
				827	spin_lock(&si->lock);
				828	latency_ration = LATENCY_LIMIT;
				829	}
				830
				831	/* try to get more slots in cluster */
				832	if (si->cluster_info) {
				833	if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
				834	goto checks;
				835	else
				836	goto done;
				837	}
				838	/* non-ssd case */
				839	++offset;
				840
				841	/* non-ssd case, still more slots in cluster? */
				842	if (si->cluster_nr && !si->swap_map[offset]) {
				843	--si->cluster_nr;
				844	goto checks;
				845	}
				846
				847	done:
				848	si->flags -= SWP_SCANNING;
				849	return n_ret;
				850
				851	scan:
				852	spin_unlock(&si->lock);
				853	while (++offset <= si->highest_bit) {
				854	if (!si->swap_map[offset]) {
				855	spin_lock(&si->lock);
				856	goto checks;
				857	}
				858	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
				859	spin_lock(&si->lock);
				860	goto checks;
				861	}
				862	if (unlikely(--latency_ration < 0)) {
				863	cond_resched();
				864	latency_ration = LATENCY_LIMIT;
				865	}
				866	}
				867	offset = si->lowest_bit;
				868	while (offset < scan_base) {
				869	if (!si->swap_map[offset]) {
				870	spin_lock(&si->lock);
				871	goto checks;
				872	}
				873	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
				874	spin_lock(&si->lock);
				875	goto checks;
				876	}
				877	if (unlikely(--latency_ration < 0)) {
				878	cond_resched();
				879	latency_ration = LATENCY_LIMIT;
				880	}
				881	offset++;
				882	}
				883	spin_lock(&si->lock);
				884
				885	no_page:
				886	si->flags -= SWP_SCANNING;
				887	return n_ret;
				888	}
				889
				890	static int swap_alloc_cluster(struct swap_info_struct si, swp_entry_t slot)
				891	{
				892	unsigned long idx;
				893	struct swap_cluster_info *ci;
				894	unsigned long offset, i;
				895	unsigned char *map;
				896
				897	/*
				898	* Should not even be attempting cluster allocations when huge
				899	* page swap is disabled. Warn and fail the allocation.
				900	*/
				901	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
				902	VM_WARN_ON_ONCE(1);
				903	return 0;
				904	}
				905
				906	if (cluster_list_empty(&si->free_clusters))
				907	return 0;
				908
				909	idx = cluster_list_first(&si->free_clusters);
				910	offset = idx * SWAPFILE_CLUSTER;
				911	ci = lock_cluster(si, offset);
				912	alloc_cluster(si, idx);
				913	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
				914
				915	map = si->swap_map + offset;
				916	for (i = 0; i < SWAPFILE_CLUSTER; i++)
				917	map[i] = SWAP_HAS_CACHE;
				918	unlock_cluster(ci);
				919	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
				920	*slot = swp_entry(si->type, offset);
				921
				922	return 1;
				923	}
				924
				925	static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
				926	{
				927	unsigned long offset = idx * SWAPFILE_CLUSTER;
				928	struct swap_cluster_info *ci;
				929
				930	ci = lock_cluster(si, offset);
				931	cluster_set_count_flag(ci, 0, 0);
				932	free_cluster(si, idx);
				933	unlock_cluster(ci);
				934	swap_range_free(si, offset, SWAPFILE_CLUSTER);
				935	}
				936
				937	static unsigned long scan_swap_map(struct swap_info_struct *si,
				938	unsigned char usage)
				939	{
				940	swp_entry_t entry;
				941	int n_ret;
				942
				943	n_ret = scan_swap_map_slots(si, usage, 1, &entry);
				944
				945	if (n_ret)
				946	return swp_offset(entry);
				947	else
				948	return 0;
				949
				950	}
				951
				952	int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
				953	{
				954	unsigned long size = swap_entry_size(entry_size);
				955	struct swap_info_struct si, next;
				956	long avail_pgs;
				957	int n_ret = 0;
				958	int node;
				959
				960	/* Only single cluster request supported */
				961	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
				962
				963	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
				964	if (avail_pgs <= 0)
				965	goto noswap;
				966
				967	if (n_goal > SWAP_BATCH)
				968	n_goal = SWAP_BATCH;
				969
				970	if (n_goal > avail_pgs)
				971	n_goal = avail_pgs;
				972
				973	atomic_long_sub(n_goal * size, &nr_swap_pages);
				974
				975	spin_lock(&swap_avail_lock);
				976
				977	start_over:
				978	node = numa_node_id();
				979	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
				980	/* requeue si to after same-priority siblings */
				981	plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
				982	spin_unlock(&swap_avail_lock);
				983	spin_lock(&si->lock);
				984	if (!si->highest_bit \|\| !(si->flags & SWP_WRITEOK)) {
				985	spin_lock(&swap_avail_lock);
				986	if (plist_node_empty(&si->avail_lists[node])) {
				987	spin_unlock(&si->lock);
				988	goto nextsi;
				989	}
				990	WARN(!si->highest_bit,
				991	"swap_info %d in list but !highest_bit\n",
				992	si->type);
				993	WARN(!(si->flags & SWP_WRITEOK),
				994	"swap_info %d in list but !SWP_WRITEOK\n",
				995	si->type);
				996	__del_from_avail_list(si);
				997	spin_unlock(&si->lock);
				998	goto nextsi;
				999	}
				1000	if (size == SWAPFILE_CLUSTER) {
				1001	if (!(si->flags & SWP_FILE))
				1002	n_ret = swap_alloc_cluster(si, swp_entries);
				1003	} else
				1004	n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
				1005	n_goal, swp_entries);
				1006	spin_unlock(&si->lock);
				1007	if (n_ret \|\| size == SWAPFILE_CLUSTER)
				1008	goto check_out;
				1009	pr_debug("scan_swap_map of si %d failed to find offset\n",
				1010	si->type);
				1011
				1012	spin_lock(&swap_avail_lock);
				1013	nextsi:
				1014	/*
				1015	* if we got here, it's likely that si was almost full before,
				1016	* and since scan_swap_map() can drop the si->lock, multiple
				1017	* callers probably all tried to get a page from the same si
				1018	* and it filled up before we could get one; or, the si filled
				1019	* up between us dropping swap_avail_lock and taking si->lock.
				1020	* Since we dropped the swap_avail_lock, the swap_avail_head
				1021	* list may have been modified; so if next is still in the
				1022	* swap_avail_head list then try it, otherwise start over
				1023	* if we have not gotten any slots.
				1024	*/
				1025	if (plist_node_empty(&next->avail_lists[node]))
				1026	goto start_over;
				1027	}
				1028
				1029	spin_unlock(&swap_avail_lock);
				1030
				1031	check_out:
				1032	if (n_ret < n_goal)
				1033	atomic_long_add((long)(n_goal - n_ret) * size,
				1034	&nr_swap_pages);
				1035	noswap:
				1036	return n_ret;
				1037	}
				1038
				1039	/* The only caller of this function is now suspend routine */
				1040	swp_entry_t get_swap_page_of_type(int type)
				1041	{
				1042	struct swap_info_struct *si = swap_type_to_swap_info(type);
				1043	pgoff_t offset;
				1044
				1045	if (!si)
				1046	goto fail;
				1047
				1048	spin_lock(&si->lock);
				1049	if (si->flags & SWP_WRITEOK) {
				1050	atomic_long_dec(&nr_swap_pages);
				1051	/* This is called for allocating swap entry, not cache */
				1052	offset = scan_swap_map(si, 1);
				1053	if (offset) {
				1054	spin_unlock(&si->lock);
				1055	return swp_entry(type, offset);
				1056	}
				1057	atomic_long_inc(&nr_swap_pages);
				1058	}
				1059	spin_unlock(&si->lock);
				1060	fail:
				1061	return (swp_entry_t) {0};
				1062	}
				1063
				1064	static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
				1065	{
				1066	struct swap_info_struct *p;
				1067	unsigned long offset, type;
				1068
				1069	if (!entry.val)
				1070	goto out;
				1071	type = swp_type(entry);
				1072	p = swap_type_to_swap_info(type);
				1073	if (!p)
				1074	goto bad_nofile;
				1075	if (!(p->flags & SWP_USED))
				1076	goto bad_device;
				1077	offset = swp_offset(entry);
				1078	if (offset >= p->max)
				1079	goto bad_offset;
				1080	return p;
				1081
				1082	bad_offset:
				1083	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
				1084	goto out;
				1085	bad_device:
				1086	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
				1087	goto out;
				1088	bad_nofile:
				1089	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
				1090	out:
				1091	return NULL;
				1092	}
				1093
				1094	static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
				1095	{
				1096	struct swap_info_struct *p;
				1097
				1098	p = __swap_info_get(entry);
				1099	if (!p)
				1100	goto out;
				1101	if (!p->swap_map[swp_offset(entry)])
				1102	goto bad_free;
				1103	return p;
				1104
				1105	bad_free:
				1106	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
				1107	goto out;
				1108	out:
				1109	return NULL;
				1110	}
				1111
				1112	static struct swap_info_struct *swap_info_get(swp_entry_t entry)
				1113	{
				1114	struct swap_info_struct *p;
				1115
				1116	p = _swap_info_get(entry);
				1117	if (p)
				1118	spin_lock(&p->lock);
				1119	return p;
				1120	}
				1121
				1122	static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
				1123	struct swap_info_struct *q)
				1124	{
				1125	struct swap_info_struct *p;
				1126
				1127	p = _swap_info_get(entry);
				1128
				1129	if (p != q) {
				1130	if (q != NULL)
				1131	spin_unlock(&q->lock);
				1132	if (p != NULL)
				1133	spin_lock(&p->lock);
				1134	}
				1135	return p;
				1136	}
				1137
				1138	static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
				1139	unsigned long offset,
				1140	unsigned char usage)
				1141	{
				1142	unsigned char count;
				1143	unsigned char has_cache;
				1144
				1145	count = p->swap_map[offset];
				1146
				1147	has_cache = count & SWAP_HAS_CACHE;
				1148	count &= ~SWAP_HAS_CACHE;
				1149
				1150	if (usage == SWAP_HAS_CACHE) {
				1151	VM_BUG_ON(!has_cache);
				1152	has_cache = 0;
				1153	} else if (count == SWAP_MAP_SHMEM) {
				1154	/*
				1155	* Or we could insist on shmem.c using a special
				1156	* swap_shmem_free() and free_shmem_swap_and_cache()...
				1157	*/
				1158	count = 0;
				1159	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
				1160	if (count == COUNT_CONTINUED) {
				1161	if (swap_count_continued(p, offset, count))
				1162	count = SWAP_MAP_MAX \| COUNT_CONTINUED;
				1163	else
				1164	count = SWAP_MAP_MAX;
				1165	} else
				1166	count--;
				1167	}
				1168
				1169	usage = count \| has_cache;
				1170	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
				1171
				1172	return usage;
				1173	}
				1174
				1175	static unsigned char __swap_entry_free(struct swap_info_struct *p,
				1176	swp_entry_t entry, unsigned char usage)
				1177	{
				1178	struct swap_cluster_info *ci;
				1179	unsigned long offset = swp_offset(entry);
				1180
				1181	ci = lock_cluster_or_swap_info(p, offset);
				1182	usage = __swap_entry_free_locked(p, offset, usage);
				1183	unlock_cluster_or_swap_info(p, ci);
				1184
				1185	return usage;
				1186	}
				1187
				1188	static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
				1189	{
				1190	struct swap_cluster_info *ci;
				1191	unsigned long offset = swp_offset(entry);
				1192	unsigned char count;
				1193
				1194	ci = lock_cluster(p, offset);
				1195	count = p->swap_map[offset];
				1196	VM_BUG_ON(count != SWAP_HAS_CACHE);
				1197	p->swap_map[offset] = 0;
				1198	dec_cluster_info_page(p, p->cluster_info, offset);
				1199	unlock_cluster(ci);
				1200
				1201	mem_cgroup_uncharge_swap(entry, 1);
				1202	swap_range_free(p, offset, 1);
				1203	}
				1204
				1205	/*
				1206	* Caller has made sure that the swap device corresponding to entry
				1207	* is still around or has not been recycled.
				1208	*/
				1209	void swap_free(swp_entry_t entry)
				1210	{
				1211	struct swap_info_struct *p;
				1212
				1213	p = _swap_info_get(entry);
				1214	if (p) {
				1215	if (!__swap_entry_free(p, entry, 1))
				1216	free_swap_slot(entry);
				1217	}
				1218	}
				1219
				1220	/*
				1221	* Called after dropping swapcache to decrease refcnt to swap entries.
				1222	*/
				1223	void put_swap_page(struct page *page, swp_entry_t entry)
				1224	{
				1225	unsigned long offset = swp_offset(entry);
				1226	unsigned long idx = offset / SWAPFILE_CLUSTER;
				1227	struct swap_cluster_info *ci;
				1228	struct swap_info_struct *si;
				1229	unsigned char *map;
				1230	unsigned int i, free_entries = 0;
				1231	unsigned char val;
				1232	int size = swap_entry_size(hpage_nr_pages(page));
				1233
				1234	si = _swap_info_get(entry);
				1235	if (!si)
				1236	return;
				1237
				1238	ci = lock_cluster_or_swap_info(si, offset);
				1239	if (size == SWAPFILE_CLUSTER) {
				1240	VM_BUG_ON(!cluster_is_huge(ci));
				1241	map = si->swap_map + offset;
				1242	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
				1243	val = map[i];
				1244	VM_BUG_ON(!(val & SWAP_HAS_CACHE));
				1245	if (val == SWAP_HAS_CACHE)
				1246	free_entries++;
				1247	}
				1248	cluster_clear_huge(ci);
				1249	if (free_entries == SWAPFILE_CLUSTER) {
				1250	unlock_cluster_or_swap_info(si, ci);
				1251	spin_lock(&si->lock);
				1252	ci = lock_cluster(si, offset);
				1253	memset(map, 0, SWAPFILE_CLUSTER);
				1254	unlock_cluster(ci);
				1255	mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
				1256	swap_free_cluster(si, idx);
				1257	spin_unlock(&si->lock);
				1258	return;
				1259	}
				1260	}
				1261	for (i = 0; i < size; i++, entry.val++) {
				1262	if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
				1263	unlock_cluster_or_swap_info(si, ci);
				1264	free_swap_slot(entry);
				1265	if (i == size - 1)
				1266	return;
				1267	lock_cluster_or_swap_info(si, offset);
				1268	}
				1269	}
				1270	unlock_cluster_or_swap_info(si, ci);
				1271	}
				1272
				#ifdef CONFIG_THP_SWAP
				/*
				 * Clear the huge flag of the cluster containing @entry.
				 *
				 * Returns 0 on success, or -EBUSY when @entry does not resolve to a swap
				 * device.
				 */
				int split_swap_cluster(swp_entry_t entry)
				{
					struct swap_info_struct *si = _swap_info_get(entry);
					struct swap_cluster_info *cluster;

					if (!si)
						return -EBUSY;

					cluster = lock_cluster(si, swp_offset(entry));
					cluster_clear_huge(cluster);
					unlock_cluster(cluster);

					return 0;
				}
				#endif
				1289
				1290	static int swp_entry_cmp(const void ent1, const void ent2)
				1291	{
				1292	const swp_entry_t e1 = ent1, e2 = ent2;
				1293
				1294	return (int)swp_type(e1) - (int)swp_type(e2);
				1295	}
				1296
				1297	void swapcache_free_entries(swp_entry_t *entries, int n)
				1298	{
				1299	struct swap_info_struct p, prev;
				1300	int i;
				1301
				1302	if (n <= 0)
				1303	return;
				1304
				1305	prev = NULL;
				1306	p = NULL;
				1307
				1308	/*
				1309	* Sort swap entries by swap device, so each lock is only taken once.
				1310	* nr_swapfiles isn't absolutely correct, but the overhead of sort() is
				1311	* so low that it isn't necessary to optimize further.
				1312	*/
				1313	if (nr_swapfiles > 1)
				1314	sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
				1315	for (i = 0; i < n; ++i) {
				1316	p = swap_info_get_cont(entries[i], prev);
				1317	if (p)
				1318	swap_entry_free(p, entries[i]);
				1319	prev = p;
				1320	}
				1321	if (p)
				1322	spin_unlock(&p->lock);
				1323	}
				1324
				1325	/*
				1326	* How many references to page are currently swapped out?
				1327	* This does not give an exact answer when swap count is continued,
				1328	* but does include the high COUNT_CONTINUED flag to allow for that.
				1329	*/
				1330	int page_swapcount(struct page *page)
				1331	{
				1332	int count = 0;
				1333	struct swap_info_struct *p;
				1334	struct swap_cluster_info *ci;
				1335	swp_entry_t entry;
				1336	unsigned long offset;
				1337
				1338	entry.val = page_private(page);
				1339	p = _swap_info_get(entry);
				1340	if (p) {
				1341	offset = swp_offset(entry);
				1342	ci = lock_cluster_or_swap_info(p, offset);
				1343	count = swap_count(p->swap_map[offset]);
				1344	unlock_cluster_or_swap_info(p, ci);
				1345	}
				1346	return count;
				1347	}
				1348
				1349	int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
				1350	{
				1351	pgoff_t offset = swp_offset(entry);
				1352
				1353	return swap_count(si->swap_map[offset]);
				1354	}
				1355
				1356	static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
				1357	{
				1358	int count = 0;
				1359	pgoff_t offset = swp_offset(entry);
				1360	struct swap_cluster_info *ci;
				1361
				1362	ci = lock_cluster_or_swap_info(si, offset);
				1363	count = swap_count(si->swap_map[offset]);
				1364	unlock_cluster_or_swap_info(si, ci);
				1365	return count;
				1366	}
				1367
				1368	/*
				1369	* How many references to @entry are currently swapped out?
				1370	* This does not give an exact answer when swap count is continued,
				1371	* but does include the high COUNT_CONTINUED flag to allow for that.
				1372	*/
				1373	int __swp_swapcount(swp_entry_t entry)
				1374	{
				1375	int count = 0;
				1376	struct swap_info_struct *si;
				1377
				1378	si = __swap_info_get(entry);
				1379	if (si)
				1380	count = swap_swapcount(si, entry);
				1381	return count;
				1382	}
				1383
				1384	/*
				1385	* How many references to @entry are currently swapped out?
				1386	* This considers COUNT_CONTINUED so it returns exact answer.
				1387	*/
				1388	int swp_swapcount(swp_entry_t entry)
				1389	{
				1390	int count, tmp_count, n;
				1391	struct swap_info_struct *p;
				1392	struct swap_cluster_info *ci;
				1393	struct page *page;
				1394	pgoff_t offset;
				1395	unsigned char *map;
				1396
				1397	p = _swap_info_get(entry);
				1398	if (!p)
				1399	return 0;
				1400
				1401	offset = swp_offset(entry);
				1402
				1403	ci = lock_cluster_or_swap_info(p, offset);
				1404
				1405	count = swap_count(p->swap_map[offset]);
				1406	if (!(count & COUNT_CONTINUED))
				1407	goto out;
				1408
				1409	count &= ~COUNT_CONTINUED;
				1410	n = SWAP_MAP_MAX + 1;
				1411
				1412	page = vmalloc_to_page(p->swap_map + offset);
				1413	offset &= ~PAGE_MASK;
				1414	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
				1415
				1416	do {
				1417	page = list_next_entry(page, lru);
				1418	map = kmap_atomic(page);
				1419	tmp_count = map[offset];
				1420	kunmap_atomic(map);
				1421
				1422	count += (tmp_count & ~COUNT_CONTINUED) * n;
				1423	n *= (SWAP_CONT_MAX + 1);
				1424	} while (tmp_count & COUNT_CONTINUED);
				1425	out:
				1426	unlock_cluster_or_swap_info(p, ci);
				1427	return count;
				1428	}
				1429
				1430	static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
				1431	swp_entry_t entry)
				1432	{
				1433	struct swap_cluster_info *ci;
				1434	unsigned char *map = si->swap_map;
				1435	unsigned long roffset = swp_offset(entry);
				1436	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
				1437	int i;
				1438	bool ret = false;
				1439
				1440	ci = lock_cluster_or_swap_info(si, offset);
				1441	if (!ci \|\| !cluster_is_huge(ci)) {
				1442	if (swap_count(map[roffset]))
				1443	ret = true;
				1444	goto unlock_out;
				1445	}
				1446	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
				1447	if (swap_count(map[offset + i])) {
				1448	ret = true;
				1449	break;
				1450	}
				1451	}
				1452	unlock_out:
				1453	unlock_cluster_or_swap_info(si, ci);
				1454	return ret;
				1455	}
				1456
				1457	static bool page_swapped(struct page *page)
				1458	{
				1459	swp_entry_t entry;
				1460	struct swap_info_struct *si;
				1461
				1462	if (!IS_ENABLED(CONFIG_THP_SWAP) \|\| likely(!PageTransCompound(page)))
				1463	return page_swapcount(page) != 0;
				1464
				1465	page = compound_head(page);
				1466	entry.val = page_private(page);
				1467	si = _swap_info_get(entry);
				1468	if (si)
				1469	return swap_page_trans_huge_swapped(si, entry);
				1470	return false;
				1471	}
				1472
				1473	static int page_trans_huge_map_swapcount(struct page page, int total_mapcount,
				1474	int *total_swapcount)
				1475	{
				1476	int i, map_swapcount, _total_mapcount, _total_swapcount;
				1477	unsigned long offset = 0;
				1478	struct swap_info_struct *si;
				1479	struct swap_cluster_info *ci = NULL;
				1480	unsigned char *map = NULL;
				1481	int mapcount, swapcount = 0;
				1482
				1483	/* hugetlbfs shouldn't call it */
				1484	VM_BUG_ON_PAGE(PageHuge(page), page);
				1485
				1486	if (!IS_ENABLED(CONFIG_THP_SWAP) \|\| likely(!PageTransCompound(page))) {
				1487	mapcount = page_trans_huge_mapcount(page, total_mapcount);
				1488	if (PageSwapCache(page))
				1489	swapcount = page_swapcount(page);
				1490	if (total_swapcount)
				1491	*total_swapcount = swapcount;
				1492	return mapcount + swapcount;
				1493	}
				1494
				1495	page = compound_head(page);
				1496
				1497	_total_mapcount = _total_swapcount = map_swapcount = 0;
				1498	if (PageSwapCache(page)) {
				1499	swp_entry_t entry;
				1500
				1501	entry.val = page_private(page);
				1502	si = _swap_info_get(entry);
				1503	if (si) {
				1504	map = si->swap_map;
				1505	offset = swp_offset(entry);
				1506	}
				1507	}
				1508	if (map)
				1509	ci = lock_cluster(si, offset);
				1510	for (i = 0; i < HPAGE_PMD_NR; i++) {
				1511	mapcount = atomic_read(&page[i]._mapcount) + 1;
				1512	_total_mapcount += mapcount;
				1513	if (map) {
				1514	swapcount = swap_count(map[offset + i]);
				1515	_total_swapcount += swapcount;
				1516	}
				1517	map_swapcount = max(map_swapcount, mapcount + swapcount);
				1518	}
				1519	unlock_cluster(ci);
				1520	if (PageDoubleMap(page)) {
				1521	map_swapcount -= 1;
				1522	_total_mapcount -= HPAGE_PMD_NR;
				1523	}
				1524	mapcount = compound_mapcount(page);
				1525	map_swapcount += mapcount;
				1526	_total_mapcount += mapcount;
				1527	if (total_mapcount)
				1528	*total_mapcount = _total_mapcount;
				1529	if (total_swapcount)
				1530	*total_swapcount = _total_swapcount;
				1531
				1532	return map_swapcount;
				1533	}
				1534
				1535	/*
				1536	* We can write to an anon page without COW if there are no other references
				1537	* to it. And as a side-effect, free up its swap: because the old content
				1538	* on disk will never be read, and seeking back there to write new content
				1539	* later would only waste time away from clustering.
				1540	*
				1541	* NOTE: total_map_swapcount should not be relied upon by the caller if
				1542	* reuse_swap_page() returns false, but it may be always overwritten
				1543	* (see the other implementation for CONFIG_SWAP=n).
				1544	*/
				1545	bool reuse_swap_page(struct page page, int total_map_swapcount)
				1546	{
				1547	int count, total_mapcount, total_swapcount;
				1548
				1549	VM_BUG_ON_PAGE(!PageLocked(page), page);
				1550	if (unlikely(PageKsm(page)))
				1551	return false;
				1552	count = page_trans_huge_map_swapcount(page, &total_mapcount,
				1553	&total_swapcount);
				1554	if (total_map_swapcount)
				1555	*total_map_swapcount = total_mapcount + total_swapcount;
				1556	if (count == 1 && PageSwapCache(page) &&
				1557	(likely(!PageTransCompound(page)) \|\|
				1558	/* The remaining swap count will be freed soon */
				1559	total_swapcount == page_swapcount(page))) {
				1560	if (!PageWriteback(page)) {
				1561	page = compound_head(page);
				1562	delete_from_swap_cache(page);
				1563	SetPageDirty(page);
				1564	} else {
				1565	swp_entry_t entry;
				1566	struct swap_info_struct *p;
				1567
				1568	entry.val = page_private(page);
				1569	p = swap_info_get(entry);
				1570	if (p->flags & SWP_STABLE_WRITES) {
				1571	spin_unlock(&p->lock);
				1572	return false;
				1573	}
				1574	spin_unlock(&p->lock);
				1575	}
				1576	}
				1577
				1578	return count <= 1;
				1579	}
				1580
				/*
				 * If swap is getting full, or if there are no more mappings of this page,
				 * then try_to_free_swap is called to free its swap space.
				 *
				 * @page must be locked. Returns 1 when the page was removed from the
				 * swap cache (and marked dirty), 0 when it was left alone.
				 */
				int try_to_free_swap(struct page *page)
				{
					VM_BUG_ON_PAGE(!PageLocked(page), page);

					/* Not in swap cache, still being written, or still referenced. */
					if (!PageSwapCache(page) || PageWriteback(page) || page_swapped(page))
						return 0;

					/*
					 * Once hibernation has begun to create its image of memory,
					 * there's a danger that one of the calls to try_to_free_swap()
					 * - most probably a call from __try_to_reclaim_swap() while
					 * hibernation is allocating its own swap pages for the image,
					 * but conceivably even a call from memory reclaim - will free
					 * the swap from a page which has already been recorded in the
					 * image as a clean swapcache page, and then reuse its swap for
					 * another page of the image. On waking from hibernation, the
					 * original page might be freed under memory pressure, then
					 * later read back in from swap, now with the wrong data.
					 *
					 * Hibernation suspends storage while it is writing the image
					 * to disk so check that here.
					 */
					if (pm_suspended_storage())
						return 0;

					page = compound_head(page);
					delete_from_swap_cache(page);
					SetPageDirty(page);

					return 1;
				}
				1619
				1620	/*
				1621	* Free the swap entry like above, but also try to
				1622	* free the page cache entry if it is the last user.
				1623	*/
				1624	int free_swap_and_cache(swp_entry_t entry)
				1625	{
				1626	struct swap_info_struct *p;
				1627	struct page *page = NULL;
				1628	unsigned char count;
				1629
				1630	if (non_swap_entry(entry))
				1631	return 1;
				1632
				1633	p = _swap_info_get(entry);
				1634	if (p) {
				1635	count = __swap_entry_free(p, entry, 1);
				1636	if (count == SWAP_HAS_CACHE &&
				1637	!swap_page_trans_huge_swapped(p, entry)) {
				1638	page = find_get_page(swap_address_space(entry),
				1639	swp_offset(entry));
				1640	if (page && !trylock_page(page)) {
				1641	put_page(page);
				1642	page = NULL;
				1643	}
				1644	} else if (!count)
				1645	free_swap_slot(entry);
				1646	}
				1647	if (page) {
				1648	/*
				1649	* Not mapped elsewhere, or swap space full? Free it!
				1650	* Also recheck PageSwapCache now page is locked (above).
				1651	*/
				1652	if (PageSwapCache(page) && !PageWriteback(page) &&
				1653	(!page_mapped(page) \|\| mem_cgroup_swap_full(page)) &&
				1654	!swap_page_trans_huge_swapped(p, entry)) {
				1655	page = compound_head(page);
				1656	delete_from_swap_cache(page);
				1657	SetPageDirty(page);
				1658	}
				1659	unlock_page(page);
				1660	put_page(page);
				1661	}
				1662	return p != NULL;
				1663	}
				1664
				1665	#ifdef CONFIG_HIBERNATION
				1666	/*
				1667	* Find the swap type that corresponds to given device (if any).
				1668	*
				1669	* @offset - number of the PAGE_SIZE-sized block of the device, starting
				1670	* from 0, in which the swap header is expected to be located.
				1671	*
				1672	* This is needed for the suspend to disk (aka swsusp).
				1673	*/
				1674	int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
				1675	{
				1676	struct block_device *bdev = NULL;
				1677	int type;
				1678
				1679	if (device)
				1680	bdev = bdget(device);
				1681
				1682	spin_lock(&swap_lock);
				1683	for (type = 0; type < nr_swapfiles; type++) {
				1684	struct swap_info_struct *sis = swap_info[type];
				1685
				1686	if (!(sis->flags & SWP_WRITEOK))
				1687	continue;
				1688
				1689	if (!bdev) {
				1690	if (bdev_p)
				1691	*bdev_p = bdgrab(sis->bdev);
				1692
				1693	spin_unlock(&swap_lock);
				1694	return type;
				1695	}
				1696	if (bdev == sis->bdev) {
				1697	struct swap_extent *se = &sis->first_swap_extent;
				1698
				1699	if (se->start_block == offset) {
				1700	if (bdev_p)
				1701	*bdev_p = bdgrab(sis->bdev);
				1702
				1703	spin_unlock(&swap_lock);
				1704	bdput(bdev);
				1705	return type;
				1706	}
				1707	}
				1708	}
				1709	spin_unlock(&swap_lock);
				1710	if (bdev)
				1711	bdput(bdev);
				1712
				1713	return -ENODEV;
				1714	}
				1715
				1716	/*
				1717	* Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
				1718	* corresponding to given index in swap_info (swap type).
				1719	*/
				1720	sector_t swapdev_block(int type, pgoff_t offset)
				1721	{
				1722	struct block_device *bdev;
				1723	struct swap_info_struct *si = swap_type_to_swap_info(type);
				1724
				1725	if (!si \|\| !(si->flags & SWP_WRITEOK))
				1726	return 0;
				1727	return map_swap_entry(swp_entry(type, offset), &bdev);
				1728	}
				1729
				1730	/*
				1731	* Return either the total number of swap pages of given type, or the number
				1732	* of free pages of that type (depending on @free)
				1733	*
				1734	* This is needed for software suspend
				1735	*/
				1736	unsigned int count_swap_pages(int type, int free)
				1737	{
				1738	unsigned int n = 0;
				1739
				1740	spin_lock(&swap_lock);
				1741	if ((unsigned int)type < nr_swapfiles) {
				1742	struct swap_info_struct *sis = swap_info[type];
				1743
				1744	spin_lock(&sis->lock);
				1745	if (sis->flags & SWP_WRITEOK) {
				1746	n = sis->pages;
				1747	if (free)
				1748	n -= sis->inuse_pages;
				1749	}
				1750	spin_unlock(&sis->lock);
				1751	}
				1752	spin_unlock(&swap_lock);
				1753	return n;
				1754	}
				1755	#endif /* CONFIG_HIBERNATION */
				1756
				1757	static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
				1758	{
				1759	return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
				1760	}
				1761
				1762	/*
				1763	* No need to decide whether this PTE shares the swap entry with others,
				1764	* just let do_wp_page work it out if a write is requested later - to
				1765	* force COW, vm_page_prot omits write permission from any private vma.
				1766	*/
				1767	static int unuse_pte(struct vm_area_struct vma, pmd_t pmd,
				1768	unsigned long addr, swp_entry_t entry, struct page *page)
				1769	{
				1770	struct page *swapcache;
				1771	struct mem_cgroup *memcg;
				1772	spinlock_t *ptl;
				1773	pte_t *pte;
				1774	int ret = 1;
				1775
				1776	swapcache = page;
				1777	page = ksm_might_need_to_copy(page, vma, addr);
				1778	if (unlikely(!page))
				1779	return -ENOMEM;
				1780
				1781	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
				1782	&memcg, false)) {
				1783	ret = -ENOMEM;
				1784	goto out_nolock;
				1785	}
				1786
				1787	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
				1788	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
				1789	mem_cgroup_cancel_charge(page, memcg, false);
				1790	ret = 0;
				1791	goto out;
				1792	}
				1793
				1794	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
				1795	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
				1796	get_page(page);
				1797	set_pte_at(vma->vm_mm, addr, pte,
				1798	pte_mkold(mk_pte(page, vma->vm_page_prot)));
				1799	if (page == swapcache) {
				1800	page_add_anon_rmap(page, vma, addr, false);
				1801	mem_cgroup_commit_charge(page, memcg, true, false);
				1802	} else { /* ksm created a completely new copy */
				1803	page_add_new_anon_rmap(page, vma, addr, false);
				1804	mem_cgroup_commit_charge(page, memcg, false, false);
				1805	lru_cache_add_active_or_unevictable(page, vma);
				1806	}
				1807	swap_free(entry);
				1808	/*
				1809	* Move the page to the active list so it is not
				1810	* immediately swapped out again after swapon.
				1811	*/
				1812	activate_page(page);
				1813	out:
				1814	pte_unmap_unlock(pte, ptl);
				1815	out_nolock:
				1816	if (page != swapcache) {
				1817	unlock_page(page);
				1818	put_page(page);
				1819	}
				1820	return ret;
				1821	}
				1822
				1823	static int unuse_pte_range(struct vm_area_struct vma, pmd_t pmd,
				1824	unsigned long addr, unsigned long end,
				1825	swp_entry_t entry, struct page *page)
				1826	{
				1827	pte_t swp_pte = swp_entry_to_pte(entry);
				1828	pte_t *pte;
				1829	int ret = 0;
				1830
				1831	/*
				1832	* We don't actually need pte lock while scanning for swp_pte: since
				1833	* we hold page lock and mmap_sem, swp_pte cannot be inserted into the
				1834	* page table while we're scanning; though it could get zapped, and on
				1835	* some architectures (e.g. x86_32 with PAE) we might catch a glimpse
				1836	* of unmatched parts which look like swp_pte, so unuse_pte must
				1837	* recheck under pte lock. Scanning without pte lock lets it be
				1838	* preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
				1839	*/
				1840	pte = pte_offset_map(pmd, addr);
				1841	do {
				1842	/*
				1843	* swapoff spends a _lot_ of time in this loop!
				1844	* Test inline before going to call unuse_pte.
				1845	*/
				1846	if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
				1847	pte_unmap(pte);
				1848	ret = unuse_pte(vma, pmd, addr, entry, page);
				1849	if (ret)
				1850	goto out;
				1851	pte = pte_offset_map(pmd, addr);
				1852	}
				1853	} while (pte++, addr += PAGE_SIZE, addr != end);
				1854	pte_unmap(pte - 1);
				1855	out:
				1856	return ret;
				1857	}
				1858
				1859	static inline int unuse_pmd_range(struct vm_area_struct vma, pud_t pud,
				1860	unsigned long addr, unsigned long end,
				1861	swp_entry_t entry, struct page *page)
				1862	{
				1863	pmd_t *pmd;
				1864	unsigned long next;
				1865	int ret;
				1866
				1867	pmd = pmd_offset(pud, addr);
				1868	do {
				1869	cond_resched();
				1870	next = pmd_addr_end(addr, end);
				1871	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				1872	continue;
				1873	ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
				1874	if (ret)
				1875	return ret;
				1876	} while (pmd++, addr = next, addr != end);
				1877	return 0;
				1878	}
				1879
				1880	static inline int unuse_pud_range(struct vm_area_struct vma, p4d_t p4d,
				1881	unsigned long addr, unsigned long end,
				1882	swp_entry_t entry, struct page *page)
				1883	{
				1884	pud_t *pud;
				1885	unsigned long next;
				1886	int ret;
				1887
				1888	pud = pud_offset(p4d, addr);
				1889	do {
				1890	next = pud_addr_end(addr, end);
				1891	if (pud_none_or_clear_bad(pud))
				1892	continue;
				1893	ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
				1894	if (ret)
				1895	return ret;
				1896	} while (pud++, addr = next, addr != end);
				1897	return 0;
				1898	}
				1899
				1900	static inline int unuse_p4d_range(struct vm_area_struct vma, pgd_t pgd,
				1901	unsigned long addr, unsigned long end,
				1902	swp_entry_t entry, struct page *page)
				1903	{
				1904	p4d_t *p4d;
				1905	unsigned long next;
				1906	int ret;
				1907
				1908	p4d = p4d_offset(pgd, addr);
				1909	do {
				1910	next = p4d_addr_end(addr, end);
				1911	if (p4d_none_or_clear_bad(p4d))
				1912	continue;
				1913	ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
				1914	if (ret)
				1915	return ret;
				1916	} while (p4d++, addr = next, addr != end);
				1917	return 0;
				1918	}
				1919
				1920	static int unuse_vma(struct vm_area_struct *vma,
				1921	swp_entry_t entry, struct page *page)
				1922	{
				1923	pgd_t *pgd;
				1924	unsigned long addr, end, next;
				1925	int ret;
				1926
				1927	if (page_anon_vma(page)) {
				1928	addr = page_address_in_vma(page, vma);
				1929	if (addr == -EFAULT)
				1930	return 0;
				1931	else
				1932	end = addr + PAGE_SIZE;
				1933	} else {
				1934	addr = vma->vm_start;
				1935	end = vma->vm_end;
				1936	}
				1937
				1938	pgd = pgd_offset(vma->vm_mm, addr);
				1939	do {
				1940	next = pgd_addr_end(addr, end);
				1941	if (pgd_none_or_clear_bad(pgd))
				1942	continue;
				1943	ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
				1944	if (ret)
				1945	return ret;
				1946	} while (pgd++, addr = next, addr != end);
				1947	return 0;
				1948	}
				1949
				1950	static int unuse_mm(struct mm_struct *mm,
				1951	swp_entry_t entry, struct page *page)
				1952	{
				1953	struct vm_area_struct *vma;
				1954	int ret = 0;
				1955
				1956	if (!down_read_trylock(&mm->mmap_sem)) {
				1957	/*
				1958	* Activate page so shrink_inactive_list is unlikely to unmap
				1959	* its ptes while lock is dropped, so swapoff can make progress.
				1960	*/
				1961	activate_page(page);
				1962	unlock_page(page);
				1963	down_read(&mm->mmap_sem);
				1964	lock_page(page);
				1965	}
				1966	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				1967	if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
				1968	break;
				1969	cond_resched();
				1970	}
				1971	up_read(&mm->mmap_sem);
				1972	return (ret < 0)? ret: 0;
				1973	}
				1974
				1975	/*
				1976	* Scan swap_map (or frontswap_map if frontswap parameter is true)
				1977	* from current position to next entry still in use.
				1978	* Recycle to start on reaching the end, returning 0 when empty.
				1979	*/
				1980	static unsigned int find_next_to_unuse(struct swap_info_struct *si,
				1981	unsigned int prev, bool frontswap)
				1982	{
				1983	unsigned int max = si->max;
				1984	unsigned int i = prev;
				1985	unsigned char count;
				1986
				1987	/*
				1988	* No need for swap_lock here: we're just looking
				1989	* for whether an entry is in use, not modifying it; false
				1990	* hits are okay, and sys_swapoff() has already prevented new
				1991	* allocations from this area (while holding swap_lock).
				1992	*/
				1993	for (;;) {
				1994	if (++i >= max) {
				1995	if (!prev) {
				1996	i = 0;
				1997	break;
				1998	}
				1999	/*
				2000	* No entries in use at top of swap_map,
				2001	* loop back to start and recheck there.
				2002	*/
				2003	max = prev + 1;
				2004	prev = 0;
				2005	i = 1;
				2006	}
				2007	count = READ_ONCE(si->swap_map[i]);
				2008	if (count && swap_count(count) != SWAP_MAP_BAD)
				2009	if (!frontswap \|\| frontswap_test(si, i))
				2010	break;
				2011	if ((i % LATENCY_LIMIT) == 0)
				2012	cond_resched();
				2013	}
				2014	return i;
				2015	}
				2016
				2017	/*
				2018	* We completely avoid races by reading each swap page in advance,
				2019	* and then search for the process using it. All the necessary
				2020	* page table adjustments can then be made atomically.
				2021	*
				2022	* if the boolean frontswap is true, only unuse pages_to_unuse pages;
				2023	* pages_to_unuse==0 means all pages; ignored if frontswap is false
				2024	*/
				2025	int try_to_unuse(unsigned int type, bool frontswap,
				2026	unsigned long pages_to_unuse)
				2027	{
				2028	struct swap_info_struct *si = swap_info[type];
				2029	struct mm_struct *start_mm;
				2030	volatile unsigned char swap_map; / swap_map is accessed without
				2031	* locking. Mark it as volatile
				2032	* to prevent compiler doing
				2033	* something odd.
				2034	*/
				2035	unsigned char swcount;
				2036	struct page *page;
				2037	swp_entry_t entry;
				2038	unsigned int i = 0;
				2039	int retval = 0;
				2040
				2041	/*
				2042	* When searching mms for an entry, a good strategy is to
				2043	* start at the first mm we freed the previous entry from
				2044	* (though actually we don't notice whether we or coincidence
				2045	* freed the entry). Initialize this start_mm with a hold.
				2046	*
				2047	* A simpler strategy would be to start at the last mm we
				2048	* freed the previous entry from; but that would take less
				2049	* advantage of mmlist ordering, which clusters forked mms
				2050	* together, child after parent. If we race with dup_mmap(), we
				2051	* prefer to resolve parent before child, lest we miss entries
				2052	* duplicated after we scanned child: using last mm would invert
				2053	* that.
				2054	*/
				2055	start_mm = &init_mm;
				2056	mmget(&init_mm);
				2057
				2058	/*
				2059	* Keep on scanning until all entries have gone. Usually,
				2060	* one pass through swap_map is enough, but not necessarily:
				2061	* there are races when an instance of an entry might be missed.
				2062	*/
				2063	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
				2064	if (signal_pending(current)) {
				2065	retval = -EINTR;
				2066	break;
				2067	}
				2068
				2069	/*
				2070	* Get a page for the entry, using the existing swap
				2071	* cache page if there is one. Otherwise, get a clean
				2072	* page and read the swap into it.
				2073	*/
				2074	swap_map = &si->swap_map[i];
				2075	entry = swp_entry(type, i);
				2076	page = read_swap_cache_async(entry,
				2077	GFP_HIGHUSER_MOVABLE, NULL, 0, false);
				2078	if (!page) {
				2079	/*
				2080	* Either swap_duplicate() failed because entry
				2081	* has been freed independently, and will not be
				2082	* reused since sys_swapoff() already disabled
				2083	* allocation from here, or alloc_page() failed.
				2084	*/
				2085	swcount = *swap_map;
				2086	/*
				2087	* We don't hold lock here, so the swap entry could be
				2088	* SWAP_MAP_BAD (when the cluster is discarding).
				2089	* Instead of fail out, We can just skip the swap
				2090	* entry because swapoff will wait for discarding
				2091	* finish anyway.
				2092	*/
				2093	if (!swcount \|\| swcount == SWAP_MAP_BAD)
				2094	continue;
				2095	retval = -ENOMEM;
				2096	break;
				2097	}
				2098
				2099	/*
				2100	* Don't hold on to start_mm if it looks like exiting.
				2101	*/
				2102	if (atomic_read(&start_mm->mm_users) == 1) {
				2103	mmput(start_mm);
				2104	start_mm = &init_mm;
				2105	mmget(&init_mm);
				2106	}
				2107
				2108	/*
				2109	* Wait for and lock page. When do_swap_page races with
				2110	* try_to_unuse, do_swap_page can handle the fault much
				2111	* faster than try_to_unuse can locate the entry. This
				2112	* apparently redundant "wait_on_page_locked" lets try_to_unuse
				2113	* defer to do_swap_page in such a case - in some tests,
				2114	* do_swap_page and try_to_unuse repeatedly compete.
				2115	*/
				2116	wait_on_page_locked(page);
				2117	wait_on_page_writeback(page);
				2118	lock_page(page);
				2119	wait_on_page_writeback(page);
				2120
				2121	/*
				2122	* Remove all references to entry.
				2123	*/
				2124	swcount = *swap_map;
				2125	if (swap_count(swcount) == SWAP_MAP_SHMEM) {
				2126	retval = shmem_unuse(entry, page);
				2127	/* page has already been unlocked and released */
				2128	if (retval < 0)
				2129	break;
				2130	continue;
				2131	}
				2132	if (swap_count(swcount) && start_mm != &init_mm)
				2133	retval = unuse_mm(start_mm, entry, page);
				2134
				2135	if (swap_count(*swap_map)) {
				2136	int set_start_mm = (*swap_map >= swcount);
				2137	struct list_head *p = &start_mm->mmlist;
				2138	struct mm_struct *new_start_mm = start_mm;
				2139	struct mm_struct *prev_mm = start_mm;
				2140	struct mm_struct *mm;
				2141
				2142	mmget(new_start_mm);
				2143	mmget(prev_mm);
				2144	spin_lock(&mmlist_lock);
				2145	while (swap_count(*swap_map) && !retval &&
				2146	(p = p->next) != &start_mm->mmlist) {
				2147	mm = list_entry(p, struct mm_struct, mmlist);
				2148	if (!mmget_not_zero(mm))
				2149	continue;
				2150	spin_unlock(&mmlist_lock);
				2151	mmput(prev_mm);
				2152	prev_mm = mm;
				2153
				2154	cond_resched();
				2155
				2156	swcount = *swap_map;
				2157	if (!swap_count(swcount)) /* any usage ? */
				2158	;
				2159	else if (mm == &init_mm)
				2160	set_start_mm = 1;
				2161	else
				2162	retval = unuse_mm(mm, entry, page);
				2163
				2164	if (set_start_mm && *swap_map < swcount) {
				2165	mmput(new_start_mm);
				2166	mmget(mm);
				2167	new_start_mm = mm;
				2168	set_start_mm = 0;
				2169	}
				2170	spin_lock(&mmlist_lock);
				2171	}
				2172	spin_unlock(&mmlist_lock);
				2173	mmput(prev_mm);
				2174	mmput(start_mm);
				2175	start_mm = new_start_mm;
				2176	}
				2177	if (retval) {
				2178	unlock_page(page);
				2179	put_page(page);
				2180	break;
				2181	}
				2182
				2183	/*
				2184	* If a reference remains (rare), we would like to leave
				2185	* the page in the swap cache; but try_to_unmap could
				2186	* then re-duplicate the entry once we drop page lock,
				2187	* so we might loop indefinitely; also, that page could
				2188	* not be swapped out to other storage meanwhile. So:
				2189	* delete from cache even if there's another reference,
				2190	* after ensuring that the data has been saved to disk -
				2191	* since if the reference remains (rarer), it will be
				2192	* read from disk into another page. Splitting into two
				2193	* pages would be incorrect if swap supported "shared
				2194	* private" pages, but they are handled by tmpfs files.
				2195	*
				2196	* Given how unuse_vma() targets one particular offset
				2197	* in an anon_vma, once the anon_vma has been determined,
				2198	* this splitting happens to be just what is needed to
				2199	* handle where KSM pages have been swapped out: re-reading
				2200	* is unnecessarily slow, but we can fix that later on.
				2201	*/
				2202	if (swap_count(*swap_map) &&
				2203	PageDirty(page) && PageSwapCache(page)) {
				2204	struct writeback_control wbc = {
				2205	.sync_mode = WB_SYNC_NONE,
				2206	};
				2207
				2208	swap_writepage(compound_head(page), &wbc);
				2209	lock_page(page);
				2210	wait_on_page_writeback(page);
				2211	}
				2212
				2213	/*
				2214	* It is conceivable that a racing task removed this page from
				2215	* swap cache just before we acquired the page lock at the top,
				2216	* or while we dropped it in unuse_mm(). The page might even
				2217	* be back in swap cache on another swap area: that we must not
				2218	* delete, since it may not have been written out to swap yet.
				2219	*/
				2220	if (PageSwapCache(page) &&
				2221	likely(page_private(page) == entry.val) &&
				2222	(!PageTransCompound(page) \|\|
				2223	!swap_page_trans_huge_swapped(si, entry)))
				2224	delete_from_swap_cache(compound_head(page));
				2225
				2226	/*
				2227	* So we could skip searching mms once swap count went
				2228	* to 1, we did not mark any present ptes as dirty: must
				2229	* mark page dirty so shrink_page_list will preserve it.
				2230	*/
				2231	SetPageDirty(page);
				2232	unlock_page(page);
				2233	put_page(page);
				2234
				2235	/*
				2236	* Make sure that we aren't completely killing
				2237	* interactive performance.
				2238	*/
				2239	cond_resched();
				2240	if (frontswap && pages_to_unuse > 0) {
				2241	if (!--pages_to_unuse)
				2242	break;
				2243	}
				2244	}
				2245
				2246	mmput(start_mm);
				2247	return retval;
				2248	}
				2249
				2250	/*
				2251	* After a successful try_to_unuse, if no swap is now in use, we know
				2252	* we can empty the mmlist. swap_lock must be held on entry and exit.
				2253	* Note that mmlist_lock nests inside swap_lock, and an mm must be
				2254	* added to the mmlist just after page_duplicate - before would be racy.
				2255	*/
				2256	static void drain_mmlist(void)
				2257	{
				2258	struct list_head p, next;
				2259	unsigned int type;
				2260
				2261	for (type = 0; type < nr_swapfiles; type++)
				2262	if (swap_info[type]->inuse_pages)
				2263	return;
				2264	spin_lock(&mmlist_lock);
				2265	list_for_each_safe(p, next, &init_mm.mmlist)
				2266	list_del_init(p);
				2267	spin_unlock(&mmlist_lock);
				2268	}
				2269
				2270	/*
				2271	* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
				2272	* corresponds to page offset for the specified swap entry.
				2273	* Note that the type of this function is sector_t, but it returns page offset
				2274	* into the bdev, not sector offset.
				2275	*/
				2276	static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
				2277	{
				2278	struct swap_info_struct *sis;
				2279	struct swap_extent *start_se;
				2280	struct swap_extent *se;
				2281	pgoff_t offset;
				2282
				2283	sis = swp_swap_info(entry);
				2284	*bdev = sis->bdev;
				2285
				2286	offset = swp_offset(entry);
				2287	start_se = sis->curr_swap_extent;
				2288	se = start_se;
				2289
				2290	for ( ; ; ) {
				2291	if (se->start_page <= offset &&
				2292	offset < (se->start_page + se->nr_pages)) {
				2293	return se->start_block + (offset - se->start_page);
				2294	}
				2295	se = list_next_entry(se, list);
				2296	sis->curr_swap_extent = se;
				2297	BUG_ON(se == start_se); /* It must be present */
				2298	}
				2299	}
				2300
				2301	/*
				2302	* Returns the page offset into bdev for the specified page's swap entry.
				2303	*/
				2304	sector_t map_swap_page(struct page page, struct block_device *bdev)
				2305	{
				2306	swp_entry_t entry;
				2307	entry.val = page_private(page);
				2308	return map_swap_entry(entry, bdev);
				2309	}
				2310
				2311	/*
				2312	* Free all of a swapdev's extent information
				2313	*/
				2314	static void destroy_swap_extents(struct swap_info_struct *sis)
				2315	{
				2316	while (!list_empty(&sis->first_swap_extent.list)) {
				2317	struct swap_extent *se;
				2318
				2319	se = list_first_entry(&sis->first_swap_extent.list,
				2320	struct swap_extent, list);
				2321	list_del(&se->list);
				2322	kfree(se);
				2323	}
				2324
				2325	if (sis->flags & SWP_FILE) {
				2326	struct file *swap_file = sis->swap_file;
				2327	struct address_space *mapping = swap_file->f_mapping;
				2328
				2329	sis->flags &= ~SWP_FILE;
				2330	mapping->a_ops->swap_deactivate(swap_file);
				2331	}
				2332	}
				2333
				2334	/*
				2335	* Add a block range (and the corresponding page range) into this swapdev's
				2336	* extent list. The extent list is kept sorted in page order.
				2337	*
				2338	* This function rather assumes that it is called in ascending page order.
				2339	*/
				2340	int
				2341	add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
				2342	unsigned long nr_pages, sector_t start_block)
				2343	{
				2344	struct swap_extent *se;
				2345	struct swap_extent *new_se;
				2346	struct list_head *lh;
				2347
				2348	if (start_page == 0) {
				2349	se = &sis->first_swap_extent;
				2350	sis->curr_swap_extent = se;
				2351	se->start_page = 0;
				2352	se->nr_pages = nr_pages;
				2353	se->start_block = start_block;
				2354	return 1;
				2355	} else {
				2356	lh = sis->first_swap_extent.list.prev; /* Highest extent */
				2357	se = list_entry(lh, struct swap_extent, list);
				2358	BUG_ON(se->start_page + se->nr_pages != start_page);
				2359	if (se->start_block + se->nr_pages == start_block) {
				2360	/* Merge it */
				2361	se->nr_pages += nr_pages;
				2362	return 0;
				2363	}
				2364	}
				2365
				2366	/*
				2367	* No merge. Insert a new extent, preserving ordering.
				2368	*/
				2369	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
				2370	if (new_se == NULL)
				2371	return -ENOMEM;
				2372	new_se->start_page = start_page;
				2373	new_se->nr_pages = nr_pages;
				2374	new_se->start_block = start_block;
				2375
				2376	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
				2377	return 1;
				2378	}
				2379
				2380	/*
				2381	* A `swap extent' is a simple thing which maps a contiguous range of pages
				2382	* onto a contiguous range of disk blocks. An ordered list of swap extents
				2383	* is built at swapon time and is then used at swap_writepage/swap_readpage
				2384	* time for locating where on disk a page belongs.
				2385	*
				2386	* If the swapfile is an S_ISBLK block device, a single extent is installed.
				2387	* This is done so that the main operating code can treat S_ISBLK and S_ISREG
				2388	* swap files identically.
				2389	*
				2390	* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
				2391	* extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
				2392	* swapfiles are handled identically after swapon time.
				2393	*
				2394	* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
				2395	* and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
				2396	* some stray blocks are found which do not fall within the PAGE_SIZE alignment
				2397	* requirements, they are simply tossed out - we will never use those blocks
				2398	* for swapping.
				2399	*
				2400	* For all swap devices we set S_SWAPFILE across the life of the swapon. This
				2401	* prevents users from writing to the swap device, which will corrupt memory.
				2402	*
				2403	* The amount of disk space which a single swap extent represents varies.
				2404	* Typically it is in the 1-4 megabyte range. So we can have hundreds of
				2405	* extents in the list. To avoid much list walking, we cache the previous
				2406	* search location in `curr_swap_extent', and start new searches from there.
				2407	* This is extremely effective. The average number of iterations in
				2408	* map_swap_page() has been measured at about 0.3 per page. - akpm.
				2409	*/
				2410	static int setup_swap_extents(struct swap_info_struct sis, sector_t span)
				2411	{
				2412	struct file *swap_file = sis->swap_file;
				2413	struct address_space *mapping = swap_file->f_mapping;
				2414	struct inode *inode = mapping->host;
				2415	int ret;
				2416
				2417	if (S_ISBLK(inode->i_mode)) {
				2418	ret = add_swap_extent(sis, 0, sis->max, 0);
				2419	*span = sis->pages;
				2420	return ret;
				2421	}
				2422
				2423	if (mapping->a_ops->swap_activate) {
				2424	ret = mapping->a_ops->swap_activate(sis, swap_file, span);
				2425	if (!ret) {
				2426	sis->flags \|= SWP_FILE;
				2427	ret = add_swap_extent(sis, 0, sis->max, 0);
				2428	*span = sis->pages;
				2429	}
				2430	return ret;
				2431	}
				2432
				2433	return generic_swapfile_activate(sis, swap_file, span);
				2434	}
				2435
				2436	static int swap_node(struct swap_info_struct *p)
				2437	{
				2438	struct block_device *bdev;
				2439
				2440	if (p->bdev)
				2441	bdev = p->bdev;
				2442	else
				2443	bdev = p->swap_file->f_inode->i_sb->s_bdev;
				2444
				2445	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
				2446	}
				2447
				2448	static void _enable_swap_info(struct swap_info_struct *p, int prio,
				2449	unsigned char *swap_map,
				2450	struct swap_cluster_info *cluster_info)
				2451	{
				2452	int i;
				2453
				2454	if (prio >= 0)
				2455	p->prio = prio;
				2456	else
				2457	p->prio = --least_priority;
				2458	/*
				2459	* the plist prio is negated because plist ordering is
				2460	* low-to-high, while swap ordering is high-to-low
				2461	*/
				2462	p->list.prio = -p->prio;
				2463	for_each_node(i) {
				2464	if (p->prio >= 0)
				2465	p->avail_lists[i].prio = -p->prio;
				2466	else {
				2467	if (swap_node(p) == i)
				2468	p->avail_lists[i].prio = 1;
				2469	else
				2470	p->avail_lists[i].prio = -p->prio;
				2471	}
				2472	}
				2473	p->swap_map = swap_map;
				2474	p->cluster_info = cluster_info;
				2475	p->flags \|= SWP_WRITEOK;
				2476	atomic_long_add(p->pages, &nr_swap_pages);
				2477	total_swap_pages += p->pages;
				2478
				2479	assert_spin_locked(&swap_lock);
				2480	/*
				2481	* both lists are plists, and thus priority ordered.
				2482	* swap_active_head needs to be priority ordered for swapoff(),
				2483	* which on removal of any swap_info_struct with an auto-assigned
				2484	* (i.e. negative) priority increments the auto-assigned priority
				2485	* of any lower-priority swap_info_structs.
				2486	* swap_avail_head needs to be priority ordered for get_swap_page(),
				2487	* which allocates swap pages from the highest available priority
				2488	* swap_info_struct.
				2489	*/
				2490	plist_add(&p->list, &swap_active_head);
				2491	add_to_avail_list(p);
				2492	}
				2493
				2494	static void enable_swap_info(struct swap_info_struct *p, int prio,
				2495	unsigned char *swap_map,
				2496	struct swap_cluster_info *cluster_info,
				2497	unsigned long *frontswap_map)
				2498	{
				2499	frontswap_init(p->type, frontswap_map);
				2500	spin_lock(&swap_lock);
				2501	spin_lock(&p->lock);
				2502	_enable_swap_info(p, prio, swap_map, cluster_info);
				2503	spin_unlock(&p->lock);
				2504	spin_unlock(&swap_lock);
				2505	}
				2506
				2507	static void reinsert_swap_info(struct swap_info_struct *p)
				2508	{
				2509	spin_lock(&swap_lock);
				2510	spin_lock(&p->lock);
				2511	_enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
				2512	spin_unlock(&p->lock);
				2513	spin_unlock(&swap_lock);
				2514	}
				2515
				2516	bool has_usable_swap(void)
				2517	{
				2518	bool ret = true;
				2519
				2520	spin_lock(&swap_lock);
				2521	if (plist_head_empty(&swap_active_head))
				2522	ret = false;
				2523	spin_unlock(&swap_lock);
				2524	return ret;
				2525	}
				2526
				2527	SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
				2528	{
				2529	struct swap_info_struct *p = NULL;
				2530	unsigned char *swap_map;
				2531	struct swap_cluster_info *cluster_info;
				2532	unsigned long *frontswap_map;
				2533	struct file swap_file, victim;
				2534	struct address_space *mapping;
				2535	struct inode *inode;
				2536	struct filename *pathname;
				2537	int err, found = 0;
				2538	unsigned int old_block_size;
				2539
				2540	if (!capable(CAP_SYS_ADMIN))
				2541	return -EPERM;
				2542
				2543	BUG_ON(!current->mm);
				2544
				2545	pathname = getname(specialfile);
				2546	if (IS_ERR(pathname))
				2547	return PTR_ERR(pathname);
				2548
				2549	victim = file_open_name(pathname, O_RDWR\|O_LARGEFILE, 0);
				2550	err = PTR_ERR(victim);
				2551	if (IS_ERR(victim))
				2552	goto out;
				2553
				2554	mapping = victim->f_mapping;
				2555	spin_lock(&swap_lock);
				2556	plist_for_each_entry(p, &swap_active_head, list) {
				2557	if (p->flags & SWP_WRITEOK) {
				2558	if (p->swap_file->f_mapping == mapping) {
				2559	found = 1;
				2560	break;
				2561	}
				2562	}
				2563	}
				2564	if (!found) {
				2565	err = -EINVAL;
				2566	spin_unlock(&swap_lock);
				2567	goto out_dput;
				2568	}
				2569	if (!security_vm_enough_memory_mm(current->mm, p->pages))
				2570	vm_unacct_memory(p->pages);
				2571	else {
				2572	err = -ENOMEM;
				2573	spin_unlock(&swap_lock);
				2574	goto out_dput;
				2575	}
				2576	del_from_avail_list(p);
				2577	spin_lock(&p->lock);
				2578	if (p->prio < 0) {
				2579	struct swap_info_struct *si = p;
				2580	int nid;
				2581
				2582	plist_for_each_entry_continue(si, &swap_active_head, list) {
				2583	si->prio++;
				2584	si->list.prio--;
				2585	for_each_node(nid) {
				2586	if (si->avail_lists[nid].prio != 1)
				2587	si->avail_lists[nid].prio--;
				2588	}
				2589	}
				2590	least_priority++;
				2591	}
				2592	plist_del(&p->list, &swap_active_head);
				2593	atomic_long_sub(p->pages, &nr_swap_pages);
				2594	total_swap_pages -= p->pages;
				2595	p->flags &= ~SWP_WRITEOK;
				2596	spin_unlock(&p->lock);
				2597	spin_unlock(&swap_lock);
				2598
				2599	disable_swap_slots_cache_lock();
				2600
				2601	set_current_oom_origin();
				2602	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
				2603	clear_current_oom_origin();
				2604
				2605	if (err) {
				2606	/* re-insert swap space back into swap_list */
				2607	reinsert_swap_info(p);
				2608	reenable_swap_slots_cache_unlock();
				2609	goto out_dput;
				2610	}
				2611
				2612	reenable_swap_slots_cache_unlock();
				2613
				2614	flush_work(&p->discard_work);
				2615
				2616	destroy_swap_extents(p);
				2617	if (p->flags & SWP_CONTINUED)
				2618	free_swap_count_continuations(p);
				2619
				2620	if (!p->bdev \|\| !blk_queue_nonrot(bdev_get_queue(p->bdev)))
				2621	atomic_dec(&nr_rotate_swap);
				2622
				2623	mutex_lock(&swapon_mutex);
				2624	spin_lock(&swap_lock);
				2625	spin_lock(&p->lock);
				2626	drain_mmlist();
				2627
				2628	/* wait for anyone still in scan_swap_map */
				2629	p->highest_bit = 0; /* cuts scans short */
				2630	while (p->flags >= SWP_SCANNING) {
				2631	spin_unlock(&p->lock);
				2632	spin_unlock(&swap_lock);
				2633	schedule_timeout_uninterruptible(1);
				2634	spin_lock(&swap_lock);
				2635	spin_lock(&p->lock);
				2636	}
				2637
				2638	swap_file = p->swap_file;
				2639	old_block_size = p->old_block_size;
				2640	p->swap_file = NULL;
				2641	p->max = 0;
				2642	swap_map = p->swap_map;
				2643	p->swap_map = NULL;
				2644	cluster_info = p->cluster_info;
				2645	p->cluster_info = NULL;
				2646	frontswap_map = frontswap_map_get(p);
				2647	spin_unlock(&p->lock);
				2648	spin_unlock(&swap_lock);
				2649	frontswap_invalidate_area(p->type);
				2650	frontswap_map_set(p, NULL);
				2651	mutex_unlock(&swapon_mutex);
				2652	free_percpu(p->percpu_cluster);
				2653	p->percpu_cluster = NULL;
				2654	vfree(swap_map);
				2655	kvfree(cluster_info);
				2656	kvfree(frontswap_map);
				2657	/* Destroy swap account information */
				2658	swap_cgroup_swapoff(p->type);
				2659	exit_swap_address_space(p->type);
				2660
				2661	inode = mapping->host;
				2662	if (S_ISBLK(inode->i_mode)) {
				2663	struct block_device *bdev = I_BDEV(inode);
				2664
				2665	set_blocksize(bdev, old_block_size);
				2666	blkdev_put(bdev, FMODE_READ \| FMODE_WRITE \| FMODE_EXCL);
				2667	}
				2668
				2669	inode_lock(inode);
				2670	inode->i_flags &= ~S_SWAPFILE;
				2671	inode_unlock(inode);
				2672	filp_close(swap_file, NULL);
				2673
				2674	/*
				2675	* Clear the SWP_USED flag after all resources are freed so that swapon
				2676	* can reuse this swap_info in alloc_swap_info() safely. It is ok to
				2677	* not hold p->lock after we cleared its SWP_WRITEOK.
				2678	*/
				2679	spin_lock(&swap_lock);
				2680	p->flags = 0;
				2681	spin_unlock(&swap_lock);
				2682
				2683	err = 0;
				2684	atomic_inc(&proc_poll_event);
				2685	wake_up_interruptible(&proc_poll_wait);
				2686
				2687	out_dput:
				2688	filp_close(victim, NULL);
				2689	out:
				2690	putname(pathname);
				2691	return err;
				2692	}
				2693
				2694	#ifdef CONFIG_PROC_FS
				2695	static __poll_t swaps_poll(struct file file, poll_table wait)
				2696	{
				2697	struct seq_file *seq = file->private_data;
				2698
				2699	poll_wait(file, &proc_poll_wait, wait);
				2700
				2701	if (seq->poll_event != atomic_read(&proc_poll_event)) {
				2702	seq->poll_event = atomic_read(&proc_poll_event);
				2703	return EPOLLIN \| EPOLLRDNORM \| EPOLLERR \| EPOLLPRI;
				2704	}
				2705
				2706	return EPOLLIN \| EPOLLRDNORM;
				2707	}
				2708
				2709	/* iterator */
				2710	static void swap_start(struct seq_file swap, loff_t *pos)
				2711	{
				2712	struct swap_info_struct *si;
				2713	int type;
				2714	loff_t l = *pos;
				2715
				2716	mutex_lock(&swapon_mutex);
				2717
				2718	if (!l)
				2719	return SEQ_START_TOKEN;
				2720
				2721	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
				2722	if (!(si->flags & SWP_USED) \|\| !si->swap_map)
				2723	continue;
				2724	if (!--l)
				2725	return si;
				2726	}
				2727
				2728	return NULL;
				2729	}
				2730
				2731	static void swap_next(struct seq_file swap, void v, loff_t pos)
				2732	{
				2733	struct swap_info_struct *si = v;
				2734	int type;
				2735
				2736	if (v == SEQ_START_TOKEN)
				2737	type = 0;
				2738	else
				2739	type = si->type + 1;
				2740
				2741	for (; (si = swap_type_to_swap_info(type)); type++) {
				2742	if (!(si->flags & SWP_USED) \|\| !si->swap_map)
				2743	continue;
				2744	++*pos;
				2745	return si;
				2746	}
				2747
				2748	return NULL;
				2749	}
				2750
				2751	static void swap_stop(struct seq_file swap, void v)
				2752	{
				2753	mutex_unlock(&swapon_mutex);
				2754	}
				2755
				2756	static int swap_show(struct seq_file swap, void v)
				2757	{
				2758	struct swap_info_struct *si = v;
				2759	struct file *file;
				2760	int len;
				2761
				2762	if (si == SEQ_START_TOKEN) {
				2763	seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
				2764	return 0;
				2765	}
				2766
				2767	file = si->swap_file;
				2768	len = seq_file_path(swap, file, " \t\n\\");
				2769	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
				2770	len < 40 ? 40 - len : 1, " ",
				2771	S_ISBLK(file_inode(file)->i_mode) ?
				2772	"partition" : "file\t",
				2773	si->pages << (PAGE_SHIFT - 10),
				2774	si->inuse_pages << (PAGE_SHIFT - 10),
				2775	si->prio);
				2776	return 0;
				2777	}
				2778
				2779	static const struct seq_operations swaps_op = {
				2780	.start = swap_start,
				2781	.next = swap_next,
				2782	.stop = swap_stop,
				2783	.show = swap_show
				2784	};
				2785
				2786	static int swaps_open(struct inode inode, struct file file)
				2787	{
				2788	struct seq_file *seq;
				2789	int ret;
				2790
				2791	ret = seq_open(file, &swaps_op);
				2792	if (ret)
				2793	return ret;
				2794
				2795	seq = file->private_data;
				2796	seq->poll_event = atomic_read(&proc_poll_event);
				2797	return 0;
				2798	}
				2799
				2800	static const struct file_operations proc_swaps_operations = {
				2801	.open = swaps_open,
				2802	.read = seq_read,
				2803	.llseek = seq_lseek,
				2804	.release = seq_release,
				2805	.poll = swaps_poll,
				2806	};
				2807
				2808	static int __init procswaps_init(void)
				2809	{
				2810	proc_create("swaps", 0, NULL, &proc_swaps_operations);
				2811	return 0;
				2812	}
				2813	__initcall(procswaps_init);
				2814	#endif /* CONFIG_PROC_FS */
				2815
				2816	#ifdef MAX_SWAPFILES_CHECK
				2817	static int __init max_swapfiles_check(void)
				2818	{
				2819	MAX_SWAPFILES_CHECK();
				2820	return 0;
				2821	}
				2822	late_initcall(max_swapfiles_check);
				2823	#endif
				2824
				2825	static struct swap_info_struct *alloc_swap_info(void)
				2826	{
				2827	struct swap_info_struct *p;
				2828	unsigned int type;
				2829	int i;
				2830	int size = sizeof(p) + nr_node_ids sizeof(struct plist_node);
				2831
				2832	p = kvzalloc(size, GFP_KERNEL);
				2833	if (!p)
				2834	return ERR_PTR(-ENOMEM);
				2835
				2836	spin_lock(&swap_lock);
				2837	for (type = 0; type < nr_swapfiles; type++) {
				2838	if (!(swap_info[type]->flags & SWP_USED))
				2839	break;
				2840	}
				2841	if (type >= MAX_SWAPFILES) {
				2842	spin_unlock(&swap_lock);
				2843	kvfree(p);
				2844	return ERR_PTR(-EPERM);
				2845	}
				2846	if (type >= nr_swapfiles) {
				2847	p->type = type;
				2848	WRITE_ONCE(swap_info[type], p);
				2849	/*
				2850	* Write swap_info[type] before nr_swapfiles, in case a
				2851	* racing procfs swap_start() or swap_next() is reading them.
				2852	* (We never shrink nr_swapfiles, we never free this entry.)
				2853	*/
				2854	smp_wmb();
				2855	WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
				2856	} else {
				2857	kvfree(p);
				2858	p = swap_info[type];
				2859	/*
				2860	* Do not memset this entry: a racing procfs swap_next()
				2861	* would be relying on p->type to remain valid.
				2862	*/
				2863	}
				2864	INIT_LIST_HEAD(&p->first_swap_extent.list);
				2865	plist_node_init(&p->list, 0);
				2866	for_each_node(i)
				2867	plist_node_init(&p->avail_lists[i], 0);
				2868	p->flags = SWP_USED;
				2869	spin_unlock(&swap_lock);
				2870	spin_lock_init(&p->lock);
				2871	spin_lock_init(&p->cont_lock);
				2872
				2873	return p;
				2874	}
				2875
				2876	static int claim_swapfile(struct swap_info_struct p, struct inode inode)
				2877	{
				2878	int error;
				2879
				2880	if (S_ISBLK(inode->i_mode)) {
				2881	p->bdev = bdgrab(I_BDEV(inode));
				2882	error = blkdev_get(p->bdev,
				2883	FMODE_READ \| FMODE_WRITE \| FMODE_EXCL, p);
				2884	if (error < 0) {
				2885	p->bdev = NULL;
				2886	return error;
				2887	}
				2888	p->old_block_size = block_size(p->bdev);
				2889	error = set_blocksize(p->bdev, PAGE_SIZE);
				2890	if (error < 0)
				2891	return error;
				2892	p->flags \|= SWP_BLKDEV;
				2893	} else if (S_ISREG(inode->i_mode)) {
				2894	p->bdev = inode->i_sb->s_bdev;
				2895	}
				2896
				2897	inode_lock(inode);
				2898	if (IS_SWAPFILE(inode))
				2899	return -EBUSY;
				2900
				2901	return 0;
				2902	}
				2903
				2904
				2905	/*
				2906	* Find out how many pages are allowed for a single swap device. There
				2907	* are two limiting factors:
				2908	* 1) the number of bits for the swap offset in the swp_entry_t type, and
				2909	* 2) the number of bits in the swap pte, as defined by the different
				2910	* architectures.
				2911	*
				2912	* In order to find the largest possible bit mask, a swap entry with
				2913	* swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
				2914	* decoded to a swp_entry_t again, and finally the swap offset is
				2915	* extracted.
				2916	*
				2917	* This will mask all the bits from the initial ~0UL mask that can't
				2918	* be encoded in either the swp_entry_t or the architecture definition
				2919	* of a swap pte.
				2920	*/
				2921	unsigned long generic_max_swapfile_size(void)
				2922	{
				2923	return swp_offset(pte_to_swp_entry(
				2924	swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
				2925	}
				2926
				2927	/* Can be overridden by an architecture for additional checks. */
				2928	__weak unsigned long max_swapfile_size(void)
				2929	{
				2930	return generic_max_swapfile_size();
				2931	}
				2932
				2933	static unsigned long read_swap_header(struct swap_info_struct *p,
				2934	union swap_header *swap_header,
				2935	struct inode *inode)
				2936	{
				2937	int i;
				2938	unsigned long maxpages;
				2939	unsigned long swapfilepages;
				2940	unsigned long last_page;
				2941
				2942	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
				2943	pr_err("Unable to find swap-space signature\n");
				2944	return 0;
				2945	}
				2946
				2947	/* swap partition endianess hack... */
				2948	if (swab32(swap_header->info.version) == 1) {
				2949	swab32s(&swap_header->info.version);
				2950	swab32s(&swap_header->info.last_page);
				2951	swab32s(&swap_header->info.nr_badpages);
				2952	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
				2953	return 0;
				2954	for (i = 0; i < swap_header->info.nr_badpages; i++)
				2955	swab32s(&swap_header->info.badpages[i]);
				2956	}
				2957	/* Check the swap header's sub-version */
				2958	if (swap_header->info.version != 1) {
				2959	pr_warn("Unable to handle swap header version %d\n",
				2960	swap_header->info.version);
				2961	return 0;
				2962	}
				2963
				2964	p->lowest_bit = 1;
				2965	p->cluster_next = 1;
				2966	p->cluster_nr = 0;
				2967
				2968	maxpages = max_swapfile_size();
				2969	last_page = swap_header->info.last_page;
				2970	if (!last_page) {
				2971	pr_warn("Empty swap-file\n");
				2972	return 0;
				2973	}
				2974	if (last_page > maxpages) {
				2975	pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
				2976	maxpages << (PAGE_SHIFT - 10),
				2977	last_page << (PAGE_SHIFT - 10));
				2978	}
				2979	if (maxpages > last_page) {
				2980	maxpages = last_page + 1;
				2981	/* p->max is an unsigned int: don't overflow it */
				2982	if ((unsigned int)maxpages == 0)
				2983	maxpages = UINT_MAX;
				2984	}
				2985	p->highest_bit = maxpages - 1;
				2986
				2987	if (!maxpages)
				2988	return 0;
				2989	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
				2990	if (swapfilepages && maxpages > swapfilepages) {
				2991	pr_warn("Swap area shorter than signature indicates\n");
				2992	return 0;
				2993	}
				2994	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
				2995	return 0;
				2996	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
				2997	return 0;
				2998
				2999	return maxpages;
				3000	}
				3001
				3002	#define SWAP_CLUSTER_INFO_COLS \
				3003	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
				3004	#define SWAP_CLUSTER_SPACE_COLS \
				3005	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
				3006	#define SWAP_CLUSTER_COLS \
				3007	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
				3008
				3009	static int setup_swap_map_and_extents(struct swap_info_struct *p,
				3010	union swap_header *swap_header,
				3011	unsigned char *swap_map,
				3012	struct swap_cluster_info *cluster_info,
				3013	unsigned long maxpages,
				3014	sector_t *span)
				3015	{
				3016	unsigned int j, k;
				3017	unsigned int nr_good_pages;
				3018	int nr_extents;
				3019	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
				3020	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
				3021	unsigned long i, idx;
				3022
				3023	nr_good_pages = maxpages - 1; /* omit header page */
				3024
				3025	cluster_list_init(&p->free_clusters);
				3026	cluster_list_init(&p->discard_clusters);
				3027
				3028	for (i = 0; i < swap_header->info.nr_badpages; i++) {
				3029	unsigned int page_nr = swap_header->info.badpages[i];
				3030	if (page_nr == 0 \|\| page_nr > swap_header->info.last_page)
				3031	return -EINVAL;
				3032	if (page_nr < maxpages) {
				3033	swap_map[page_nr] = SWAP_MAP_BAD;
				3034	nr_good_pages--;
				3035	/*
				3036	* Haven't marked the cluster free yet, no list
				3037	* operation involved
				3038	*/
				3039	inc_cluster_info_page(p, cluster_info, page_nr);
				3040	}
				3041	}
				3042
				3043	/* Haven't marked the cluster free yet, no list operation involved */
				3044	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
				3045	inc_cluster_info_page(p, cluster_info, i);
				3046
				3047	if (nr_good_pages) {
				3048	swap_map[0] = SWAP_MAP_BAD;
				3049	/*
				3050	* Not mark the cluster free yet, no list
				3051	* operation involved
				3052	*/
				3053	inc_cluster_info_page(p, cluster_info, 0);
				3054	p->max = maxpages;
				3055	p->pages = nr_good_pages;
				3056	nr_extents = setup_swap_extents(p, span);
				3057	if (nr_extents < 0)
				3058	return nr_extents;
				3059	nr_good_pages = p->pages;
				3060	}
				3061	if (!nr_good_pages) {
				3062	pr_warn("Empty swap-file\n");
				3063	return -EINVAL;
				3064	}
				3065
				3066	if (!cluster_info)
				3067	return nr_extents;
				3068
				3069
				3070	/*
				3071	* Reduce false cache line sharing between cluster_info and
				3072	* sharing same address space.
				3073	*/
				3074	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
				3075	j = (k + col) % SWAP_CLUSTER_COLS;
				3076	for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
				3077	idx = i * SWAP_CLUSTER_COLS + j;
				3078	if (idx >= nr_clusters)
				3079	continue;
				3080	if (cluster_count(&cluster_info[idx]))
				3081	continue;
				3082	cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
				3083	cluster_list_add_tail(&p->free_clusters, cluster_info,
				3084	idx);
				3085	}
				3086	}
				3087	return nr_extents;
				3088	}
				3089
				3090	/*
				3091	* Helper to sys_swapon determining if a given swap
				3092	* backing device queue supports DISCARD operations.
				3093	*/
				3094	static bool swap_discardable(struct swap_info_struct *si)
				3095	{
				3096	struct request_queue *q = bdev_get_queue(si->bdev);
				3097
				3098	if (!q \|\| !blk_queue_discard(q))
				3099	return false;
				3100
				3101	return true;
				3102	}
				3103
				3104	SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
				3105	{
				3106	struct swap_info_struct *p;
				3107	struct filename *name;
				3108	struct file *swap_file = NULL;
				3109	struct address_space *mapping;
				3110	int prio;
				3111	int error;
				3112	union swap_header *swap_header;
				3113	int nr_extents;
				3114	sector_t span;
				3115	unsigned long maxpages;
				3116	unsigned char *swap_map = NULL;
				3117	struct swap_cluster_info *cluster_info = NULL;
				3118	unsigned long *frontswap_map = NULL;
				3119	struct page *page = NULL;
				3120	struct inode *inode = NULL;
				3121	bool inced_nr_rotate_swap = false;
				3122
				3123	if (swap_flags & ~SWAP_FLAGS_VALID)
				3124	return -EINVAL;
				3125
				3126	if (!capable(CAP_SYS_ADMIN))
				3127	return -EPERM;
				3128
				3129	if (!swap_avail_heads)
				3130	return -ENOMEM;
				3131
				3132	p = alloc_swap_info();
				3133	if (IS_ERR(p))
				3134	return PTR_ERR(p);
				3135
				3136	INIT_WORK(&p->discard_work, swap_discard_work);
				3137
				3138	name = getname(specialfile);
				3139	if (IS_ERR(name)) {
				3140	error = PTR_ERR(name);
				3141	name = NULL;
				3142	goto bad_swap;
				3143	}
				3144	swap_file = file_open_name(name, O_RDWR\|O_LARGEFILE, 0);
				3145	if (IS_ERR(swap_file)) {
				3146	error = PTR_ERR(swap_file);
				3147	swap_file = NULL;
				3148	goto bad_swap;
				3149	}
				3150
				3151	p->swap_file = swap_file;
				3152	mapping = swap_file->f_mapping;
				3153	inode = mapping->host;
				3154
				3155	/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
				3156	error = claim_swapfile(p, inode);
				3157	if (unlikely(error))
				3158	goto bad_swap;
				3159
				3160	/*
				3161	* Read the swap header.
				3162	*/
				3163	if (!mapping->a_ops->readpage) {
				3164	error = -EINVAL;
				3165	goto bad_swap;
				3166	}
				3167	page = read_mapping_page(mapping, 0, swap_file);
				3168	if (IS_ERR(page)) {
				3169	error = PTR_ERR(page);
				3170	goto bad_swap;
				3171	}
				3172	swap_header = kmap(page);
				3173
				3174	maxpages = read_swap_header(p, swap_header, inode);
				3175	if (unlikely(!maxpages)) {
				3176	error = -EINVAL;
				3177	goto bad_swap;
				3178	}
				3179
				3180	/* OK, set up the swap map and apply the bad block list */
				3181	swap_map = vzalloc(maxpages);
				3182	if (!swap_map) {
				3183	error = -ENOMEM;
				3184	goto bad_swap;
				3185	}
				3186
				3187	if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
				3188	p->flags \|= SWP_STABLE_WRITES;
				3189
				3190	if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
				3191	p->flags \|= SWP_SYNCHRONOUS_IO;
				3192
				3193	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
				3194	int cpu;
				3195	unsigned long ci, nr_cluster;
				3196
				3197	p->flags \|= SWP_SOLIDSTATE;
				3198	/*
				3199	* select a random position to start with to help wear leveling
				3200	* SSD
				3201	*/
				3202	p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
				3203	nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
				3204
				3205	cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
				3206	GFP_KERNEL);
				3207	if (!cluster_info) {
				3208	error = -ENOMEM;
				3209	goto bad_swap;
				3210	}
				3211
				3212	for (ci = 0; ci < nr_cluster; ci++)
				3213	spin_lock_init(&((cluster_info + ci)->lock));
				3214
				3215	p->percpu_cluster = alloc_percpu(struct percpu_cluster);
				3216	if (!p->percpu_cluster) {
				3217	error = -ENOMEM;
				3218	goto bad_swap;
				3219	}
				3220	for_each_possible_cpu(cpu) {
				3221	struct percpu_cluster *cluster;
				3222	cluster = per_cpu_ptr(p->percpu_cluster, cpu);
				3223	cluster_set_null(&cluster->index);
				3224	}
				3225	} else {
				3226	atomic_inc(&nr_rotate_swap);
				3227	inced_nr_rotate_swap = true;
				3228	}
				3229
				3230	error = swap_cgroup_swapon(p->type, maxpages);
				3231	if (error)
				3232	goto bad_swap;
				3233
				3234	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
				3235	cluster_info, maxpages, &span);
				3236	if (unlikely(nr_extents < 0)) {
				3237	error = nr_extents;
				3238	goto bad_swap;
				3239	}
				3240	/* frontswap enabled? set up bit-per-page map for frontswap */
				3241	if (IS_ENABLED(CONFIG_FRONTSWAP))
				3242	frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
				3243	sizeof(long),
				3244	GFP_KERNEL);
				3245
				3246	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
				3247	/*
				3248	* When discard is enabled for swap with no particular
				3249	* policy flagged, we set all swap discard flags here in
				3250	* order to sustain backward compatibility with older
				3251	* swapon(8) releases.
				3252	*/
				3253	p->flags \|= (SWP_DISCARDABLE \| SWP_AREA_DISCARD \|
				3254	SWP_PAGE_DISCARD);
				3255
				3256	/*
				3257	* By flagging sys_swapon, a sysadmin can tell us to
				3258	* either do single-time area discards only, or to just
				3259	* perform discards for released swap page-clusters.
				3260	* Now it's time to adjust the p->flags accordingly.
				3261	*/
				3262	if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
				3263	p->flags &= ~SWP_PAGE_DISCARD;
				3264	else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
				3265	p->flags &= ~SWP_AREA_DISCARD;
				3266
				3267	/* issue a swapon-time discard if it's still required */
				3268	if (p->flags & SWP_AREA_DISCARD) {
				3269	int err = discard_swap(p);
				3270	if (unlikely(err))
				3271	pr_err("swapon: discard_swap(%p): %d\n",
				3272	p, err);
				3273	}
				3274	}
				3275
				3276	error = init_swap_address_space(p->type, maxpages);
				3277	if (error)
				3278	goto bad_swap;
				3279
				3280	/*
				3281	* Flush any pending IO and dirty mappings before we start using this
				3282	* swap device.
				3283	*/
				3284	inode->i_flags \|= S_SWAPFILE;
				3285	error = inode_drain_writes(inode);
				3286	if (error) {
				3287	inode->i_flags &= ~S_SWAPFILE;
				3288	goto bad_swap;
				3289	}
				3290
				3291	mutex_lock(&swapon_mutex);
				3292	prio = -1;
				3293	if (swap_flags & SWAP_FLAG_PREFER)
				3294	prio =
				3295	(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
				3296	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
				3297
				3298	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
				3299	p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
				3300	nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
				3301	(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
				3302	(p->flags & SWP_DISCARDABLE) ? "D" : "",
				3303	(p->flags & SWP_AREA_DISCARD) ? "s" : "",
				3304	(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
				3305	(frontswap_map) ? "FS" : "");
				3306
				3307	mutex_unlock(&swapon_mutex);
				3308	atomic_inc(&proc_poll_event);
				3309	wake_up_interruptible(&proc_poll_wait);
				3310
				3311	error = 0;
				3312	goto out;
				3313	bad_swap:
				3314	free_percpu(p->percpu_cluster);
				3315	p->percpu_cluster = NULL;
				3316	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
				3317	set_blocksize(p->bdev, p->old_block_size);
				3318	blkdev_put(p->bdev, FMODE_READ \| FMODE_WRITE \| FMODE_EXCL);
				3319	}
				3320	destroy_swap_extents(p);
				3321	swap_cgroup_swapoff(p->type);
				3322	spin_lock(&swap_lock);
				3323	p->swap_file = NULL;
				3324	p->flags = 0;
				3325	spin_unlock(&swap_lock);
				3326	vfree(swap_map);
				3327	kvfree(cluster_info);
				3328	kvfree(frontswap_map);
				3329	if (inced_nr_rotate_swap)
				3330	atomic_dec(&nr_rotate_swap);
				3331	if (swap_file) {
				3332	if (inode) {
				3333	inode_unlock(inode);
				3334	inode = NULL;
				3335	}
				3336	filp_close(swap_file, NULL);
				3337	}
				3338	out:
				3339	if (page && !IS_ERR(page)) {
				3340	kunmap(page);
				3341	put_page(page);
				3342	}
				3343	if (name)
				3344	putname(name);
				3345	if (inode)
				3346	inode_unlock(inode);
				3347	if (!error)
				3348	enable_swap_slots_cache();
				3349	return error;
				3350	}
				3351
				3352	void si_swapinfo(struct sysinfo *val)
				3353	{
				3354	unsigned int type;
				3355	unsigned long nr_to_be_unused = 0;
				3356
				3357	spin_lock(&swap_lock);
				3358	for (type = 0; type < nr_swapfiles; type++) {
				3359	struct swap_info_struct *si = swap_info[type];
				3360
				3361	if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
				3362	nr_to_be_unused += si->inuse_pages;
				3363	}
				3364	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
				3365	val->totalswap = total_swap_pages + nr_to_be_unused;
				3366	spin_unlock(&swap_lock);
				3367	}
				3368
				3369	/*
				3370	* Verify that a swap entry is valid and increment its swap map count.
				3371	*
				3372	* Returns error code in following case.
				3373	* - success -> 0
				3374	* - swp_entry is invalid -> EINVAL
				3375	* - swp_entry is migration entry -> EINVAL
				3376	* - swap-cache reference is requested but there is already one. -> EEXIST
				3377	* - swap-cache reference is requested but the entry is not used. -> ENOENT
				3378	* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
				3379	*/
				3380	static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
				3381	{
				3382	struct swap_info_struct *p;
				3383	struct swap_cluster_info *ci;
				3384	unsigned long offset;
				3385	unsigned char count;
				3386	unsigned char has_cache;
				3387	int err = -EINVAL;
				3388
				3389	if (non_swap_entry(entry))
				3390	goto out;
				3391
				3392	p = swp_swap_info(entry);
				3393	if (!p)
				3394	goto bad_file;
				3395
				3396	offset = swp_offset(entry);
				3397	if (unlikely(offset >= p->max))
				3398	goto out;
				3399
				3400	ci = lock_cluster_or_swap_info(p, offset);
				3401
				3402	count = p->swap_map[offset];
				3403
				3404	/*
				3405	* swapin_readahead() doesn't check if a swap entry is valid, so the
				3406	* swap entry could be SWAP_MAP_BAD. Check here with lock held.
				3407	*/
				3408	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
				3409	err = -ENOENT;
				3410	goto unlock_out;
				3411	}
				3412
				3413	has_cache = count & SWAP_HAS_CACHE;
				3414	count &= ~SWAP_HAS_CACHE;
				3415	err = 0;
				3416
				3417	if (usage == SWAP_HAS_CACHE) {
				3418
				3419	/* set SWAP_HAS_CACHE if there is no cache and entry is used */
				3420	if (!has_cache && count)
				3421	has_cache = SWAP_HAS_CACHE;
				3422	else if (has_cache) /* someone else added cache */
				3423	err = -EEXIST;
				3424	else /* no users remaining */
				3425	err = -ENOENT;
				3426
				3427	} else if (count \|\| has_cache) {
				3428
				3429	if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
				3430	count += usage;
				3431	else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
				3432	err = -EINVAL;
				3433	else if (swap_count_continued(p, offset, count))
				3434	count = COUNT_CONTINUED;
				3435	else
				3436	err = -ENOMEM;
				3437	} else
				3438	err = -ENOENT; /* unused swap entry */
				3439
				3440	p->swap_map[offset] = count \| has_cache;
				3441
				3442	unlock_out:
				3443	unlock_cluster_or_swap_info(p, ci);
				3444	out:
				3445	return err;
				3446
				3447	bad_file:
				3448	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
				3449	goto out;
				3450	}
				3451
				3452	/*
				3453	* Help swapoff by noting that swap entry belongs to shmem/tmpfs
				3454	* (in which case its reference count is never incremented).
				3455	*/
				3456	void swap_shmem_alloc(swp_entry_t entry)
				3457	{
				3458	__swap_duplicate(entry, SWAP_MAP_SHMEM);
				3459	}
				3460
				3461	/*
				3462	* Increase reference count of swap entry by 1.
				3463	* Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
				3464	* but could not be atomically allocated. Returns 0, just as if it succeeded,
				3465	* if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
				3466	* might occur if a page table entry has got corrupted.
				3467	*/
				3468	int swap_duplicate(swp_entry_t entry)
				3469	{
				3470	int err = 0;
				3471
				3472	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
				3473	err = add_swap_count_continuation(entry, GFP_ATOMIC);
				3474	return err;
				3475	}
				3476
				3477	/*
				3478	* @entry: swap entry for which we allocate swap cache.
				3479	*
				3480	* Called when allocating swap cache for existing swap entry,
				3481	* This can return error codes. Returns 0 at success.
				3482	* -EBUSY means there is a swap cache.
				3483	* Note: return code is different from swap_duplicate().
				3484	*/
				3485	int swapcache_prepare(swp_entry_t entry)
				3486	{
				3487	return __swap_duplicate(entry, SWAP_HAS_CACHE);
				3488	}
				3489
				3490	struct swap_info_struct *swp_swap_info(swp_entry_t entry)
				3491	{
				3492	return swap_type_to_swap_info(swp_type(entry));
				3493	}
				3494
				3495	struct swap_info_struct page_swap_info(struct page page)
				3496	{
				3497	swp_entry_t entry = { .val = page_private(page) };
				3498	return swp_swap_info(entry);
				3499	}
				3500
				3501	/*
				3502	* out-of-line __page_file_ methods to avoid include hell.
				3503	*/
				3504	struct address_space __page_file_mapping(struct page page)
				3505	{
				3506	return page_swap_info(page)->swap_file->f_mapping;
				3507	}
				3508	EXPORT_SYMBOL_GPL(__page_file_mapping);
				3509
				3510	pgoff_t __page_file_index(struct page *page)
				3511	{
				3512	swp_entry_t swap = { .val = page_private(page) };
				3513	return swp_offset(swap);
				3514	}
				3515	EXPORT_SYMBOL_GPL(__page_file_index);
				3516
				3517	/*
				3518	* add_swap_count_continuation - called when a swap count is duplicated
				3519	* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
				3520	* page of the original vmalloc'ed swap_map, to hold the continuation count
				3521	* (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
				3522	* again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
				3523	*
				3524	* These continuation pages are seldom referenced: the common paths all work
				3525	* on the original swap_map, only referring to a continuation page when the
				3526	* low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
				3527	*
				3528	* add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
				3529	* page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
				3530	* can be called after dropping locks.
				3531	*/
				3532	int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
				3533	{
				3534	struct swap_info_struct *si;
				3535	struct swap_cluster_info *ci;
				3536	struct page *head;
				3537	struct page *page;
				3538	struct page *list_page;
				3539	pgoff_t offset;
				3540	unsigned char count;
				3541
				3542	/*
				3543	* When debugging, it's easier to use __GFP_ZERO here; but it's better
				3544	* for latency not to zero a page while GFP_ATOMIC and holding locks.
				3545	*/
				3546	page = alloc_page(gfp_mask \| __GFP_HIGHMEM);
				3547
				3548	si = swap_info_get(entry);
				3549	if (!si) {
				3550	/*
				3551	* An acceptable race has occurred since the failing
				3552	* __swap_duplicate(): the swap entry has been freed,
				3553	* perhaps even the whole swap_map cleared for swapoff.
				3554	*/
				3555	goto outer;
				3556	}
				3557
				3558	offset = swp_offset(entry);
				3559
				3560	ci = lock_cluster(si, offset);
				3561
				3562	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
				3563
				3564	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
				3565	/*
				3566	* The higher the swap count, the more likely it is that tasks
				3567	* will race to add swap count continuation: we need to avoid
				3568	* over-provisioning.
				3569	*/
				3570	goto out;
				3571	}
				3572
				3573	if (!page) {
				3574	unlock_cluster(ci);
				3575	spin_unlock(&si->lock);
				3576	return -ENOMEM;
				3577	}
				3578
				3579	/*
				3580	* We are fortunate that although vmalloc_to_page uses pte_offset_map,
				3581	* no architecture is using highmem pages for kernel page tables: so it
				3582	* will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
				3583	*/
				3584	head = vmalloc_to_page(si->swap_map + offset);
				3585	offset &= ~PAGE_MASK;
				3586
				3587	spin_lock(&si->cont_lock);
				3588	/*
				3589	* Page allocation does not initialize the page's lru field,
				3590	* but it does always reset its private field.
				3591	*/
				3592	if (!page_private(head)) {
				3593	BUG_ON(count & COUNT_CONTINUED);
				3594	INIT_LIST_HEAD(&head->lru);
				3595	set_page_private(head, SWP_CONTINUED);
				3596	si->flags \|= SWP_CONTINUED;
				3597	}
				3598
				3599	list_for_each_entry(list_page, &head->lru, lru) {
				3600	unsigned char *map;
				3601
				3602	/*
				3603	* If the previous map said no continuation, but we've found
				3604	* a continuation page, free our allocation and use this one.
				3605	*/
				3606	if (!(count & COUNT_CONTINUED))
				3607	goto out_unlock_cont;
				3608
				3609	map = kmap_atomic(list_page) + offset;
				3610	count = *map;
				3611	kunmap_atomic(map);
				3612
				3613	/*
				3614	* If this continuation count now has some space in it,
				3615	* free our allocation and use this one.
				3616	*/
				3617	if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
				3618	goto out_unlock_cont;
				3619	}
				3620
				3621	list_add_tail(&page->lru, &head->lru);
				3622	page = NULL; /* now it's attached, don't free it */
				3623	out_unlock_cont:
				3624	spin_unlock(&si->cont_lock);
				3625	out:
				3626	unlock_cluster(ci);
				3627	spin_unlock(&si->lock);
				3628	outer:
				3629	if (page)
				3630	__free_page(page);
				3631	return 0;
				3632	}
				3633
				3634	/*
				3635	* swap_count_continued - when the original swap_map count is incremented
				3636	* from SWAP_MAP_MAX, check if there is already a continuation page to carry
				3637	* into, carry if so, or else fail until a new continuation page is allocated;
				3638	* when the original swap_map count is decremented from 0 with continuation,
				3639	* borrow from the continuation and report whether it still holds more.
				3640	* Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
				3641	* lock.
				3642	*/
				3643	static bool swap_count_continued(struct swap_info_struct *si,
				3644	pgoff_t offset, unsigned char count)
				3645	{
				3646	struct page *head;
				3647	struct page *page;
				3648	unsigned char *map;
				3649	bool ret;
				3650
				3651	head = vmalloc_to_page(si->swap_map + offset);
				3652	if (page_private(head) != SWP_CONTINUED) {
				3653	BUG_ON(count & COUNT_CONTINUED);
				3654	return false; /* need to add count continuation */
				3655	}
				3656
				3657	spin_lock(&si->cont_lock);
				3658	offset &= ~PAGE_MASK;
				3659	page = list_entry(head->lru.next, struct page, lru);
				3660	map = kmap_atomic(page) + offset;
				3661
				3662	if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
				3663	goto init_map; /* jump over SWAP_CONT_MAX checks */
				3664
				3665	if (count == (SWAP_MAP_MAX \| COUNT_CONTINUED)) { /* incrementing */
				3666	/*
				3667	* Think of how you add 1 to 999
				3668	*/
				3669	while (*map == (SWAP_CONT_MAX \| COUNT_CONTINUED)) {
				3670	kunmap_atomic(map);
				3671	page = list_entry(page->lru.next, struct page, lru);
				3672	BUG_ON(page == head);
				3673	map = kmap_atomic(page) + offset;
				3674	}
				3675	if (*map == SWAP_CONT_MAX) {
				3676	kunmap_atomic(map);
				3677	page = list_entry(page->lru.next, struct page, lru);
				3678	if (page == head) {
				3679	ret = false; /* add count continuation */
				3680	goto out;
				3681	}
				3682	map = kmap_atomic(page) + offset;
				3683	init_map: map = 0; / we didn't zero the page */
				3684	}
				3685	*map += 1;
				3686	kunmap_atomic(map);
				3687	page = list_entry(page->lru.prev, struct page, lru);
				3688	while (page != head) {
				3689	map = kmap_atomic(page) + offset;
				3690	*map = COUNT_CONTINUED;
				3691	kunmap_atomic(map);
				3692	page = list_entry(page->lru.prev, struct page, lru);
				3693	}
				3694	ret = true; /* incremented */
				3695
				3696	} else { /* decrementing */
				3697	/*
				3698	* Think of how you subtract 1 from 1000
				3699	*/
				3700	BUG_ON(count != COUNT_CONTINUED);
				3701	while (*map == COUNT_CONTINUED) {
				3702	kunmap_atomic(map);
				3703	page = list_entry(page->lru.next, struct page, lru);
				3704	BUG_ON(page == head);
				3705	map = kmap_atomic(page) + offset;
				3706	}
				3707	BUG_ON(*map == 0);
				3708	*map -= 1;
				3709	if (*map == 0)
				3710	count = 0;
				3711	kunmap_atomic(map);
				3712	page = list_entry(page->lru.prev, struct page, lru);
				3713	while (page != head) {
				3714	map = kmap_atomic(page) + offset;
				3715	*map = SWAP_CONT_MAX \| count;
				3716	count = COUNT_CONTINUED;
				3717	kunmap_atomic(map);
				3718	page = list_entry(page->lru.prev, struct page, lru);
				3719	}
				3720	ret = count == COUNT_CONTINUED;
				3721	}
				3722	out:
				3723	spin_unlock(&si->cont_lock);
				3724	return ret;
				3725	}
				3726
				3727	/*
				3728	* free_swap_count_continuations - swapoff free all the continuation pages
				3729	* appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
				3730	*/
				3731	static void free_swap_count_continuations(struct swap_info_struct *si)
				3732	{
				3733	pgoff_t offset;
				3734
				3735	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
				3736	struct page *head;
				3737	head = vmalloc_to_page(si->swap_map + offset);
				3738	if (page_private(head)) {
				3739	struct page page, next;
				3740
				3741	list_for_each_entry_safe(page, next, &head->lru, lru) {
				3742	list_del(&page->lru);
				3743	__free_page(page);
				3744	}
				3745	}
				3746	}
				3747	}
				3748
				#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
				/*
				 * mem_cgroup_throttle_swaprate - schedule a blkcg throttle against a
				 * swap device when the current blk cgroup is congested.
				 * @memcg:    memory cgroup of the allocation; nothing is done if NULL
				 * @node:     NUMA node whose list of available swap areas is searched
				 * @gfp_mask: allocation flags; nothing is done without __GFP_IO
				 *
				 * The throttle is scheduled on the queue of the first area on the node's
				 * available list that has a block device.
				 */
				void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
								  gfp_t gfp_mask)
				{
					struct swap_info_struct *si, *next;

					if (!(gfp_mask & __GFP_IO) || !memcg)
						return;

					if (!blk_cgroup_congested())
						return;

					/*
					 * We've already scheduled a throttle, avoid taking the global swap
					 * lock.
					 */
					if (current->throttle_queue)
						return;

					spin_lock(&swap_avail_lock);
					plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
								  avail_lists[node]) {
						if (si->bdev) {
							blkcg_schedule_throttle(bdev_get_queue(si->bdev),
										true);
							break;
						}
					}
					spin_unlock(&swap_avail_lock);
				}
				#endif
				3779
				3780	static int __init swapfile_init(void)
				3781	{
				3782	int nid;
				3783
				3784	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
				3785	GFP_KERNEL);
				3786	if (!swap_avail_heads) {
				3787	pr_emerg("Not enough memory for swap heads, swap is disabled\n");
				3788	return -ENOMEM;
				3789	}
				3790
				3791	for_each_node(nid)
				3792	plist_head_init(&swap_avail_heads[nid]);
				3793
				3794	return 0;
				3795	}
				3796	subsys_initcall(swapfile_init);