Blame - src/kernel/linux/v4.19/mm/ksm.c - T800

blob: b3ea0f0316eb670acbecc5be52c51f7711327369 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
				7	* Copyright (C) 2008-2009 Red Hat, Inc.
				8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
				12	* Hugh Dickins
				13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2.
				15	*/
				16
				17	#include <linux/errno.h>
				18	#include <linux/mm.h>
				19	#include <linux/fs.h>
				20	#include <linux/mman.h>
				21	#include <linux/sched.h>
				22	#include <linux/sched/mm.h>
				23	#include <linux/sched/coredump.h>
				24	#include <linux/rwsem.h>
				25	#include <linux/pagemap.h>
				26	#include <linux/rmap.h>
				27	#include <linux/spinlock.h>
				28	#include <linux/jhash.h>
				29	#include <linux/delay.h>
				30	#include <linux/kthread.h>
				31	#include <linux/wait.h>
				32	#include <linux/slab.h>
				33	#include <linux/rbtree.h>
				34	#include <linux/memory.h>
				35	#include <linux/mmu_notifier.h>
				36	#include <linux/swap.h>
				37	#include <linux/ksm.h>
				38	#include <linux/hashtable.h>
				39	#include <linux/freezer.h>
				40	#include <linux/oom.h>
				41	#include <linux/numa.h>
				42
				43	#include <asm/tlbflush.h>
				44	#include "internal.h"
				45
				46	#ifdef CONFIG_NUMA
				47	#define NUMA(x) (x)
				48	#define DO_NUMA(x) do { (x); } while (0)
				49	#else
				50	#define NUMA(x) (0)
				51	#define DO_NUMA(x) do { } while (0)
				52	#endif
				53
				54	/**
				55	* DOC: Overview
				56	*
				57	* A few notes about the KSM scanning process,
				58	* to make it easier to understand the data structures below:
				59	*
				60	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				61	* contents into a data structure that holds pointers to the pages' locations.
				62	*
				63	* Since the contents of the pages may change at any moment, KSM cannot just
				64	* insert the pages into a normal sorted tree and expect it to find anything.
				65	* Therefore KSM uses two data structures - the stable and the unstable tree.
				66	*
				67	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				68	* by their contents. Because each such page is write-protected, searching on
				69	* this tree is fully assured to be working (except when pages are unmapped),
				70	* and therefore this tree is called the stable tree.
				71	*
				72	* The stable tree node includes information required for reverse
				73	* mapping from a KSM page to virtual addresses that map this page.
				74	*
				75	* In order to avoid large latencies of the rmap walks on KSM pages,
				76	* KSM maintains two types of nodes in the stable tree:
				77	*
				78	* * the regular nodes that keep the reverse mapping structures in a
				79	* linked list
				80	* * the "chains" that link nodes ("dups") that represent the same
				81	* write protected memory content, but each "dup" corresponds to a
				82	* different KSM page copy of that content
				83	*
				84	* Internally, the regular nodes, "dups" and "chains" are represented
				85	* using the same :c:type:`struct stable_node` structure.
				86	*
				87	* In addition to the stable tree, KSM uses a second data structure called the
				88	* unstable tree: this tree holds pointers to pages which have been found to
				89	* be "unchanged for a period of time". The unstable tree sorts these pages
				90	* by their contents, but since they are not write-protected, KSM cannot rely
				91	* upon the unstable tree to work correctly - the unstable tree is liable to
				92	* be corrupted as its contents are modified, and so it is called unstable.
				93	*
				94	* KSM solves this problem by several techniques:
				95	*
				96	* 1) The unstable tree is flushed every time KSM completes scanning all
				97	* memory areas, and then the tree is rebuilt again from the beginning.
				98	* 2) KSM will only insert into the unstable tree, pages whose hash value
				99	* has not changed since the previous scan of all memory areas.
				100	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				101	* colors of the nodes and not on their contents, assuring that even when
				102	* the tree gets "corrupted" it won't get out of balance, so scanning time
				103	* remains the same (also, searching and inserting nodes in an rbtree uses
				104	* the same algorithm, so we have no overhead when we flush and rebuild).
				105	* 4) KSM never flushes the stable tree, which means that even if it were to
				106	* take 10 attempts to find a page in the unstable tree, once it is found,
				107	* it is secured in the stable tree. (When we scan a new page, we first
				108	* compare it against the stable tree, and then against the unstable tree.)
				109	*
				110	* If the merge_across_nodes tunable is unset, then KSM maintains multiple
				111	* stable trees and multiple unstable trees: one of each for each NUMA node.
				112	*/
				113
				114	/**
				115	* struct mm_slot - ksm information per mm that is being scanned
				116	* @link: link to the mm_slots hash list
				117	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
				118	* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
				119	* @mm: the mm that this information is valid for
				120	*/
				121	struct mm_slot {
				122	struct hlist_node link;
				123	struct list_head mm_list;
				124	struct rmap_item *rmap_list;
				125	struct mm_struct *mm;
				126	};
				127
				128	/**
				129	* struct ksm_scan - cursor for scanning
				130	* @mm_slot: the current mm_slot we are scanning
				131	* @address: the next address inside that to be scanned
				132	* @rmap_list: link to the next rmap to be scanned in the rmap_list
				133	* @seqnr: count of completed full scans (needed when removing unstable node)
				134	*
				135	* There is only the one ksm_scan instance of this cursor structure.
				136	*/
				137	struct ksm_scan {
				138	struct mm_slot *mm_slot;
				139	unsigned long address;
				140	struct rmap_item **rmap_list;
				141	unsigned long seqnr;
				142	};
				143
				144	/**
				145	* struct stable_node - node of the stable rbtree
				146	* @node: rb node of this ksm page in the stable tree
				147	* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
				148	* @hlist_dup: linked into the stable_node->hlist with a stable_node chain
				149	* @list: linked into migrate_nodes, pending placement in the proper node tree
				150	* @hlist: hlist head of rmap_items using this ksm page
				151	* @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
				152	* @chain_prune_time: time of the last full garbage collection
				153	* @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
				154	* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
				155	*/
				156	struct stable_node {
				157	union {
				158	struct rb_node node; /* when node of stable tree */
				159	struct { /* when listed for migration */
				160	struct list_head *head;
				161	struct {
				162	struct hlist_node hlist_dup;
				163	struct list_head list;
				164	};
				165	};
				166	};
				167	struct hlist_head hlist;
				168	union {
				169	unsigned long kpfn;
				170	unsigned long chain_prune_time;
				171	};
				172	/*
				173	* STABLE_NODE_CHAIN can be any negative number in
				174	* rmap_hlist_len negative range, but better not -1 to be able
				175	* to reliably detect underflows.
				176	*/
				177	#define STABLE_NODE_CHAIN -1024
				178	int rmap_hlist_len;
				179	#ifdef CONFIG_NUMA
				180	int nid;
				181	#endif
				182	};
				183
				184	/**
				185	* struct rmap_item - reverse mapping item for virtual addresses
				186	* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
				187	* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
				188	* @nid: NUMA node id of unstable tree in which linked (may not match page)
				189	* @mm: the memory structure this rmap_item is pointing into
				190	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				191	* @oldchecksum: previous checksum of the page at that virtual address
				192	* @node: rb node of this rmap_item in the unstable tree
				193	* @head: pointer to stable_node heading this list in the stable tree
				194	* @hlist: link into hlist of rmap_items hanging off that stable_node
				195	*/
				196	struct rmap_item {
				197	struct rmap_item *rmap_list;
				198	union {
				199	struct anon_vma anon_vma; / when stable */
				200	#ifdef CONFIG_NUMA
				201	int nid; /* when node of unstable tree */
				202	#endif
				203	};
				204	struct mm_struct *mm;
				205	unsigned long address; /* + low bits used for flags below */
				206	unsigned int oldchecksum; /* when unstable */
				207	union {
				208	struct rb_node node; /* when node of unstable tree */
				209	struct { /* when listed from stable tree */
				210	struct stable_node *head;
				211	struct hlist_node hlist;
				212	};
				213	};
				214	};
				215
				216	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
				217	#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
				218	#define STABLE_FLAG 0x200 /* is listed from the stable tree */
				219	#define KSM_FLAG_MASK (SEQNR_MASK\|UNSTABLE_FLAG\|STABLE_FLAG)
				220	/* to mask all the flags */
				221
				222	/* The stable and unstable tree heads */
				223	static struct rb_root one_stable_tree[1] = { RB_ROOT };
				224	static struct rb_root one_unstable_tree[1] = { RB_ROOT };
				225	static struct rb_root *root_stable_tree = one_stable_tree;
				226	static struct rb_root *root_unstable_tree = one_unstable_tree;
				227
				228	/* Recently migrated nodes of stable tree, pending proper placement */
				229	static LIST_HEAD(migrate_nodes);
				230	#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
				231
				232	#define MM_SLOTS_HASH_BITS 10
				233	static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
				234
				235	static struct mm_slot ksm_mm_head = {
				236	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				237	};
				238	static struct ksm_scan ksm_scan = {
				239	.mm_slot = &ksm_mm_head,
				240	};
				241
				242	static struct kmem_cache *rmap_item_cache;
				243	static struct kmem_cache *stable_node_cache;
				244	static struct kmem_cache *mm_slot_cache;
				245
				246	/* The number of nodes in the stable tree */
				247	static unsigned long ksm_pages_shared;
				248
				249	/* The number of page slots additionally sharing those nodes */
				250	static unsigned long ksm_pages_sharing;
				251
				252	/* The number of nodes in the unstable tree */
				253	static unsigned long ksm_pages_unshared;
				254
				255	/* The number of rmap_items in use: to calculate pages_volatile */
				256	static unsigned long ksm_rmap_items;
				257
				258	/* The number of stable_node chains */
				259	static unsigned long ksm_stable_node_chains;
				260
				261	/* The number of stable_node dups linked to the stable_node chains */
				262	static unsigned long ksm_stable_node_dups;
				263
				264	/* Delay in pruning stale stable_node_dups in the stable_node_chains */
				265	static int ksm_stable_node_chains_prune_millisecs = 2000;
				266
				267	/* Maximum number of page slots sharing a stable node */
				268	static int ksm_max_page_sharing = 256;
				269
				270	/* Number of pages ksmd should scan in one batch */
				271	static unsigned int ksm_thread_pages_to_scan = 100;
				272
				273	/* Milliseconds ksmd should sleep between batches */
				274	static unsigned int ksm_thread_sleep_millisecs = 20;
				275
				276	/* Checksum of an empty (zeroed) page */
				277	static unsigned int zero_checksum __read_mostly;
				278
				279	/* Whether to merge empty (zeroed) pages with actual zero pages */
				280	static bool ksm_use_zero_pages __read_mostly;
				281
				282	#ifdef CONFIG_NUMA
				283	/* Zeroed when merging across nodes is not allowed */
				284	static unsigned int ksm_merge_across_nodes = 1;
				285	static int ksm_nr_node_ids = 1;
				286	#else
				287	#define ksm_merge_across_nodes 1U
				288	#define ksm_nr_node_ids 1
				289	#endif
				290
				291	#define KSM_RUN_STOP 0
				292	#define KSM_RUN_MERGE 1
				293	#define KSM_RUN_UNMERGE 2
				294	#define KSM_RUN_OFFLINE 4
				295	static unsigned long ksm_run = KSM_RUN_STOP;
				296	static void wait_while_offlining(void);
				297
				298	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				299	static DEFINE_MUTEX(ksm_thread_mutex);
				300	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				301
				302	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				303	sizeof(struct __struct), __alignof__(struct __struct),\
				304	(__flags), NULL)
				305
				306	static int __init ksm_slab_init(void)
				307	{
				308	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				309	if (!rmap_item_cache)
				310	goto out;
				311
				312	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
				313	if (!stable_node_cache)
				314	goto out_free1;
				315
				316	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				317	if (!mm_slot_cache)
				318	goto out_free2;
				319
				320	return 0;
				321
				322	out_free2:
				323	kmem_cache_destroy(stable_node_cache);
				324	out_free1:
				325	kmem_cache_destroy(rmap_item_cache);
				326	out:
				327	return -ENOMEM;
				328	}
				329
				330	static void __init ksm_slab_free(void)
				331	{
				332	kmem_cache_destroy(mm_slot_cache);
				333	kmem_cache_destroy(stable_node_cache);
				334	kmem_cache_destroy(rmap_item_cache);
				335	mm_slot_cache = NULL;
				336	}
				337
				338	static __always_inline bool is_stable_node_chain(struct stable_node *chain)
				339	{
				340	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
				341	}
				342
				343	static __always_inline bool is_stable_node_dup(struct stable_node *dup)
				344	{
				345	return dup->head == STABLE_NODE_DUP_HEAD;
				346	}
				347
				348	static inline void stable_node_chain_add_dup(struct stable_node *dup,
				349	struct stable_node *chain)
				350	{
				351	VM_BUG_ON(is_stable_node_dup(dup));
				352	dup->head = STABLE_NODE_DUP_HEAD;
				353	VM_BUG_ON(!is_stable_node_chain(chain));
				354	hlist_add_head(&dup->hlist_dup, &chain->hlist);
				355	ksm_stable_node_dups++;
				356	}
				357
				358	static inline void __stable_node_dup_del(struct stable_node *dup)
				359	{
				360	VM_BUG_ON(!is_stable_node_dup(dup));
				361	hlist_del(&dup->hlist_dup);
				362	ksm_stable_node_dups--;
				363	}
				364
				365	static inline void stable_node_dup_del(struct stable_node *dup)
				366	{
				367	VM_BUG_ON(is_stable_node_chain(dup));
				368	if (is_stable_node_dup(dup))
				369	__stable_node_dup_del(dup);
				370	else
				371	rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
				372	#ifdef CONFIG_DEBUG_VM
				373	dup->head = NULL;
				374	#endif
				375	}
				376
				377	static inline struct rmap_item *alloc_rmap_item(void)
				378	{
				379	struct rmap_item *rmap_item;
				380
				381	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL \|
				382	__GFP_NORETRY \| __GFP_NOWARN);
				383	if (rmap_item)
				384	ksm_rmap_items++;
				385	return rmap_item;
				386	}
				387
				388	static inline void free_rmap_item(struct rmap_item *rmap_item)
				389	{
				390	ksm_rmap_items--;
				391	rmap_item->mm = NULL; /* debug safety */
				392	kmem_cache_free(rmap_item_cache, rmap_item);
				393	}
				394
				395	static inline struct stable_node *alloc_stable_node(void)
				396	{
				397	/*
				398	* The allocation can take too long with GFP_KERNEL when memory is under
				399	* pressure, which may lead to hung task warnings. Adding __GFP_HIGH
				400	* grants access to memory reserves, helping to avoid this problem.
				401	*/
				402	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL \| __GFP_HIGH);
				403	}
				404
				405	static inline void free_stable_node(struct stable_node *stable_node)
				406	{
				407	VM_BUG_ON(stable_node->rmap_hlist_len &&
				408	!is_stable_node_chain(stable_node));
				409	kmem_cache_free(stable_node_cache, stable_node);
				410	}
				411
				412	static inline struct mm_slot *alloc_mm_slot(void)
				413	{
				414	if (!mm_slot_cache) /* initialization failed */
				415	return NULL;
				416	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				417	}
				418
				419	static inline void free_mm_slot(struct mm_slot *mm_slot)
				420	{
				421	kmem_cache_free(mm_slot_cache, mm_slot);
				422	}
				423
				424	static struct mm_slot get_mm_slot(struct mm_struct mm)
				425	{
				426	struct mm_slot *slot;
				427
				428	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
				429	if (slot->mm == mm)
				430	return slot;
				431
				432	return NULL;
				433	}
				434
				435	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				436	struct mm_slot *mm_slot)
				437	{
				438	mm_slot->mm = mm;
				439	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
				440	}
				441
				442	/*
				443	* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
				444	* page tables after it has passed through ksm_exit() - which, if necessary,
				445	* takes mmap_sem briefly to serialize against them. ksm_exit() does not set
				446	* a special flag: they can just back out as soon as mm_users goes to zero.
				447	* ksm_test_exit() is used throughout to make this test for exit: in some
				448	* places for correctness, in some places just to avoid unnecessary work.
				449	*/
				450	static inline bool ksm_test_exit(struct mm_struct *mm)
				451	{
				452	return atomic_read(&mm->mm_users) == 0;
				453	}
				454
				455	/*
				456	* We use break_ksm to break COW on a ksm page: it's a stripped down
				457	*
				458	* if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
				459	* put_page(page);
				460	*
				461	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				462	* in case the application has unmapped and remapped mm,addr meanwhile.
				463	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				464	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
				465	*
				466	* FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
				467	* of the process that owns 'vma'. We also do not want to enforce
				468	* protection keys here anyway.
				469	*/
				470	static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
				471	{
				472	struct page *page;
				473	vm_fault_t ret = 0;
				474
				475	do {
				476	cond_resched();
				477	page = follow_page(vma, addr,
				478	FOLL_GET \| FOLL_MIGRATION \| FOLL_REMOTE);
				479	if (IS_ERR_OR_NULL(page))
				480	break;
				481	if (PageKsm(page))
				482	ret = handle_mm_fault(vma, addr,
				483	FAULT_FLAG_WRITE \| FAULT_FLAG_REMOTE);
				484	else
				485	ret = VM_FAULT_WRITE;
				486	put_page(page);
				487	} while (!(ret & (VM_FAULT_WRITE \| VM_FAULT_SIGBUS \| VM_FAULT_SIGSEGV \| VM_FAULT_OOM)));
				488	/*
				489	* We must loop because handle_mm_fault() may back out if there's
				490	* any difficulty e.g. if pte accessed bit gets updated concurrently.
				491	*
				492	* VM_FAULT_WRITE is what we have been hoping for: it indicates that
				493	* COW has been broken, even if the vma does not permit VM_WRITE;
				494	* but note that a concurrent fault might break PageKsm for us.
				495	*
				496	* VM_FAULT_SIGBUS could occur if we race with truncation of the
				497	* backing file, which also invalidates anonymous pages: that's
				498	* okay, that truncation will have unmapped the PageKsm for us.
				499	*
				500	* VM_FAULT_OOM: at the time of writing (late July 2009), setting
				501	* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
				502	* current task has TIF_MEMDIE set, and will be OOM killed on return
				503	* to user; and ksmd, having no mm, would never be chosen for that.
				504	*
				505	* But if the mm is in a limited mem_cgroup, then the fault may fail
				506	* with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
				507	* even ksmd can fail in this way - though it's usually breaking ksm
				508	* just to undo a merge it made a moment before, so unlikely to oom.
				509	*
				510	* That's a pity: we might therefore have more kernel pages allocated
				511	* than we're counting as nodes in the stable tree; but ksm_do_scan
				512	* will retry to break_cow on each pass, so should recover the page
				513	* in due course. The important thing is to not let VM_MERGEABLE
				514	* be cleared while any such pages might remain in the area.
				515	*/
				516	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
				517	}
				518
				519	static struct vm_area_struct find_mergeable_vma(struct mm_struct mm,
				520	unsigned long addr)
				521	{
				522	struct vm_area_struct *vma;
				523	if (ksm_test_exit(mm))
				524	return NULL;
				525	vma = find_vma(mm, addr);
				526	if (!vma \|\| vma->vm_start > addr)
				527	return NULL;
				528	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				529	return NULL;
				530	return vma;
				531	}
				532
				533	static void break_cow(struct rmap_item *rmap_item)
				534	{
				535	struct mm_struct *mm = rmap_item->mm;
				536	unsigned long addr = rmap_item->address;
				537	struct vm_area_struct *vma;
				538
				539	/*
				540	* It is not an accident that whenever we want to break COW
				541	* to undo, we also need to drop a reference to the anon_vma.
				542	*/
				543	put_anon_vma(rmap_item->anon_vma);
				544
				545	down_read(&mm->mmap_sem);
				546	vma = find_mergeable_vma(mm, addr);
				547	if (vma)
				548	break_ksm(vma, addr);
				549	up_read(&mm->mmap_sem);
				550	}
				551
				552	static struct page get_mergeable_page(struct rmap_item rmap_item)
				553	{
				554	struct mm_struct *mm = rmap_item->mm;
				555	unsigned long addr = rmap_item->address;
				556	struct vm_area_struct *vma;
				557	struct page *page;
				558
				559	down_read(&mm->mmap_sem);
				560	vma = find_mergeable_vma(mm, addr);
				561	if (!vma)
				562	goto out;
				563
				564	page = follow_page(vma, addr, FOLL_GET);
				565	if (IS_ERR_OR_NULL(page))
				566	goto out;
				567	if (PageAnon(page)) {
				568	flush_anon_page(vma, page, addr);
				569	flush_dcache_page(page);
				570	} else {
				571	put_page(page);
				572	out:
				573	page = NULL;
				574	}
				575	up_read(&mm->mmap_sem);
				576	return page;
				577	}
				578
				579	/*
				580	* This helper is used for getting right index into array of tree roots.
				581	* When merge_across_nodes knob is set to 1, there are only two rb-trees for
				582	* stable and unstable pages from all nodes with roots in index 0. Otherwise,
				583	* every node has its own stable and unstable tree.
				584	*/
				585	static inline int get_kpfn_nid(unsigned long kpfn)
				586	{
				587	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
				588	}
				589
				590	static struct stable_node alloc_stable_node_chain(struct stable_node dup,
				591	struct rb_root *root)
				592	{
				593	struct stable_node *chain = alloc_stable_node();
				594	VM_BUG_ON(is_stable_node_chain(dup));
				595	if (likely(chain)) {
				596	INIT_HLIST_HEAD(&chain->hlist);
				597	chain->chain_prune_time = jiffies;
				598	chain->rmap_hlist_len = STABLE_NODE_CHAIN;
				599	#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
				600	chain->nid = -1; /* debug */
				601	#endif
				602	ksm_stable_node_chains++;
				603
				604	/*
				605	* Put the stable node chain in the first dimension of
				606	* the stable tree and at the same time remove the old
				607	* stable node.
				608	*/
				609	rb_replace_node(&dup->node, &chain->node, root);
				610
				611	/*
				612	* Move the old stable node to the second dimension
				613	* queued in the hlist_dup. The invariant is that all
				614	* dup stable_nodes in the chain->hlist point to pages
				615	* that are wrprotected and have the exact same
				616	* content.
				617	*/
				618	stable_node_chain_add_dup(dup, chain);
				619	}
				620	return chain;
				621	}
				622
				623	static inline void free_stable_node_chain(struct stable_node *chain,
				624	struct rb_root *root)
				625	{
				626	rb_erase(&chain->node, root);
				627	free_stable_node(chain);
				628	ksm_stable_node_chains--;
				629	}
				630
				631	static void remove_node_from_stable_tree(struct stable_node *stable_node)
				632	{
				633	struct rmap_item *rmap_item;
				634
				635	/* check it's not STABLE_NODE_CHAIN or negative */
				636	BUG_ON(stable_node->rmap_hlist_len < 0);
				637
				638	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
				639	if (rmap_item->hlist.next)
				640	ksm_pages_sharing--;
				641	else
				642	ksm_pages_shared--;
				643	VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
				644	stable_node->rmap_hlist_len--;
				645	put_anon_vma(rmap_item->anon_vma);
				646	rmap_item->address &= PAGE_MASK;
				647	cond_resched();
				648	}
				649
				650	/*
				651	* We need the second aligned pointer of the migrate_nodes
				652	* list_head to stay clear from the rb_parent_color union
				653	* (aligned and different than any node) and also different
				654	* from &migrate_nodes. This will verify that future list.h changes
				655	* don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
				656	*/
				657	#if defined(GCC_VERSION) && GCC_VERSION >= 40903
				658	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
				659	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
				660	#endif
				661
				662	if (stable_node->head == &migrate_nodes)
				663	list_del(&stable_node->list);
				664	else
				665	stable_node_dup_del(stable_node);
				666	free_stable_node(stable_node);
				667	}
				668
				669	/*
				670	* get_ksm_page: checks if the page indicated by the stable node
				671	* is still its ksm page, despite having held no reference to it.
				672	* In which case we can trust the content of the page, and it
				673	* returns the gotten page; but if the page has now been zapped,
				674	* remove the stale node from the stable tree and return NULL.
				675	* But beware, the stable node's page might be being migrated.
				676	*
				677	* You would expect the stable_node to hold a reference to the ksm page.
				678	* But if it increments the page's count, swapping out has to wait for
				679	* ksmd to come around again before it can free the page, which may take
				680	* seconds or even minutes: much too unresponsive. So instead we use a
				681	* "keyhole reference": access to the ksm page from the stable node peeps
				682	* out through its keyhole to see if that page still holds the right key,
				683	* pointing back to this stable node. This relies on freeing a PageAnon
				684	* page to reset its page->mapping to NULL, and relies on no other use of
				685	* a page to put something that might look like our key in page->mapping.
				686	* is on its way to being freed; but it is an anomaly to bear in mind.
				687	*/
				688	static struct page get_ksm_page(struct stable_node stable_node, bool lock_it)
				689	{
				690	struct page *page;
				691	void *expected_mapping;
				692	unsigned long kpfn;
				693
				694	expected_mapping = (void *)((unsigned long)stable_node \|
				695	PAGE_MAPPING_KSM);
				696	again:
				697	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
				698	page = pfn_to_page(kpfn);
				699	if (READ_ONCE(page->mapping) != expected_mapping)
				700	goto stale;
				701
				702	/*
				703	* We cannot do anything with the page while its refcount is 0.
				704	* Usually 0 means free, or tail of a higher-order page: in which
				705	* case this node is no longer referenced, and should be freed;
				706	* however, it might mean that the page is under page_ref_freeze().
				707	* The __remove_mapping() case is easy, again the node is now stale;
				708	* but if page is swapcache in migrate_page_move_mapping(), it might
				709	* still be our page, in which case it's essential to keep the node.
				710	*/
				711	while (!get_page_unless_zero(page)) {
				712	/*
				713	* Another check for page->mapping != expected_mapping would
				714	* work here too. We have chosen the !PageSwapCache test to
				715	* optimize the common case, when the page is or is about to
				716	* be freed: PageSwapCache is cleared (under spin_lock_irq)
				717	* in the ref_freeze section of __remove_mapping(); but Anon
				718	* page->mapping reset to NULL later, in free_pages_prepare().
				719	*/
				720	if (!PageSwapCache(page))
				721	goto stale;
				722	cpu_relax();
				723	}
				724
				725	if (READ_ONCE(page->mapping) != expected_mapping) {
				726	put_page(page);
				727	goto stale;
				728	}
				729
				730	if (lock_it) {
				731	lock_page(page);
				732	if (READ_ONCE(page->mapping) != expected_mapping) {
				733	unlock_page(page);
				734	put_page(page);
				735	goto stale;
				736	}
				737	}
				738	return page;
				739
				740	stale:
				741	/*
				742	* We come here from above when page->mapping or !PageSwapCache
				743	* suggests that the node is stale; but it might be under migration.
				744	* We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
				745	* before checking whether node->kpfn has been changed.
				746	*/
				747	smp_rmb();
				748	if (READ_ONCE(stable_node->kpfn) != kpfn)
				749	goto again;
				750	remove_node_from_stable_tree(stable_node);
				751	return NULL;
				752	}
				753
				754	/*
				755	* Removing rmap_item from stable or unstable tree.
				756	* This function will clean the information from the stable/unstable tree.
				757	*/
				758	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				759	{
				760	if (rmap_item->address & STABLE_FLAG) {
				761	struct stable_node *stable_node;
				762	struct page *page;
				763
				764	stable_node = rmap_item->head;
				765	page = get_ksm_page(stable_node, true);
				766	if (!page)
				767	goto out;
				768
				769	hlist_del(&rmap_item->hlist);
				770	unlock_page(page);
				771	put_page(page);
				772
				773	if (!hlist_empty(&stable_node->hlist))
				774	ksm_pages_sharing--;
				775	else
				776	ksm_pages_shared--;
				777	VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
				778	stable_node->rmap_hlist_len--;
				779
				780	put_anon_vma(rmap_item->anon_vma);
				781	rmap_item->address &= PAGE_MASK;
				782
				783	} else if (rmap_item->address & UNSTABLE_FLAG) {
				784	unsigned char age;
				785	/*
				786	* Usually ksmd can and must skip the rb_erase, because
				787	* root_unstable_tree was already reset to RB_ROOT.
				788	* But be careful when an mm is exiting: do the rb_erase
				789	* if this rmap_item was inserted by this scan, rather
				790	* than left over from before.
				791	*/
				792	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
				793	BUG_ON(age > 1);
				794	if (!age)
				795	rb_erase(&rmap_item->node,
				796	root_unstable_tree + NUMA(rmap_item->nid));
				797	ksm_pages_unshared--;
				798	rmap_item->address &= PAGE_MASK;
				799	}
				800	out:
				801	cond_resched(); /* we're called from many long loops */
				802	}
				803
				804	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				805	struct rmap_item **rmap_list)
				806	{
				807	while (*rmap_list) {
				808	struct rmap_item rmap_item = rmap_list;
				809	*rmap_list = rmap_item->rmap_list;
				810	remove_rmap_item_from_tree(rmap_item);
				811	free_rmap_item(rmap_item);
				812	}
				813	}
				814
				815	/*
				816	* Though it's very tempting to unmerge rmap_items from stable tree rather
				817	* than check every pte of a given vma, the locking doesn't quite work for
				818	* that - an rmap_item is assigned to the stable tree after inserting ksm
				819	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				820	* rmap_items from parent to child at fork time (so as not to waste time
				821	* if exit comes before the next scan reaches it).
				822	*
				823	* Similarly, although we'd like to remove rmap_items (so updating counts
				824	* and freeing memory) when unmerging an area, it's easier to leave that
				825	* to the next pass of ksmd - consider, for example, how ksmd might be
				826	* in cmp_and_merge_page on one of the rmap_items we would be removing.
				827	*/
				828	static int unmerge_ksm_pages(struct vm_area_struct *vma,
				829	unsigned long start, unsigned long end)
				830	{
				831	unsigned long addr;
				832	int err = 0;
				833
				834	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
				835	if (ksm_test_exit(vma->vm_mm))
				836	break;
				837	if (signal_pending(current))
				838	err = -ERESTARTSYS;
				839	else
				840	err = break_ksm(vma, addr);
				841	}
				842	return err;
				843	}
				844
				845	static inline struct stable_node page_stable_node(struct page page)
				846	{
				847	return PageKsm(page) ? page_rmapping(page) : NULL;
				848	}
				849
				850	static inline void set_page_stable_node(struct page *page,
				851	struct stable_node *stable_node)
				852	{
				853	page->mapping = (void *)((unsigned long)stable_node \| PAGE_MAPPING_KSM);
				854	}
				855
				856	#ifdef CONFIG_SYSFS
				857	/*
				858	* Only called through the sysfs control interface:
				859	*/
				860	static int remove_stable_node(struct stable_node *stable_node)
				861	{
				862	struct page *page;
				863	int err;
				864
				865	page = get_ksm_page(stable_node, true);
				866	if (!page) {
				867	/*
				868	* get_ksm_page did remove_node_from_stable_tree itself.
				869	*/
				870	return 0;
				871	}
				872
				873	/*
				874	* Page could be still mapped if this races with __mmput() running in
				875	* between ksm_exit() and exit_mmap(). Just refuse to let
				876	* merge_across_nodes/max_page_sharing be switched.
				877	*/
				878	err = -EBUSY;
				879	if (!page_mapped(page)) {
				880	/*
				881	* The stable node did not yet appear stale to get_ksm_page(),
				882	* since that allows for an unmapped ksm page to be recognized
				883	* right up until it is freed; but the node is safe to remove.
				884	* This page might be in a pagevec waiting to be freed,
				885	* or it might be PageSwapCache (perhaps under writeback),
				886	* or it might have been removed from swapcache a moment ago.
				887	*/
				888	set_page_stable_node(page, NULL);
				889	remove_node_from_stable_tree(stable_node);
				890	err = 0;
				891	}
				892
				893	unlock_page(page);
				894	put_page(page);
				895	return err;
				896	}
				897
				898	static int remove_stable_node_chain(struct stable_node *stable_node,
				899	struct rb_root *root)
				900	{
				901	struct stable_node *dup;
				902	struct hlist_node *hlist_safe;
				903
				904	if (!is_stable_node_chain(stable_node)) {
				905	VM_BUG_ON(is_stable_node_dup(stable_node));
				906	if (remove_stable_node(stable_node))
				907	return true;
				908	else
				909	return false;
				910	}
				911
				912	hlist_for_each_entry_safe(dup, hlist_safe,
				913	&stable_node->hlist, hlist_dup) {
				914	VM_BUG_ON(!is_stable_node_dup(dup));
				915	if (remove_stable_node(dup))
				916	return true;
				917	}
				918	BUG_ON(!hlist_empty(&stable_node->hlist));
				919	free_stable_node_chain(stable_node, root);
				920	return false;
				921	}
				922
				923	static int remove_all_stable_nodes(void)
				924	{
				925	struct stable_node stable_node, next;
				926	int nid;
				927	int err = 0;
				928
				929	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
				930	while (root_stable_tree[nid].rb_node) {
				931	stable_node = rb_entry(root_stable_tree[nid].rb_node,
				932	struct stable_node, node);
				933	if (remove_stable_node_chain(stable_node,
				934	root_stable_tree + nid)) {
				935	err = -EBUSY;
				936	break; /* proceed to next nid */
				937	}
				938	cond_resched();
				939	}
				940	}
				941	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
				942	if (remove_stable_node(stable_node))
				943	err = -EBUSY;
				944	cond_resched();
				945	}
				946	return err;
				947	}
				948
				949	static int unmerge_and_remove_all_rmap_items(void)
				950	{
				951	struct mm_slot *mm_slot;
				952	struct mm_struct *mm;
				953	struct vm_area_struct *vma;
				954	int err = 0;
				955
				956	spin_lock(&ksm_mmlist_lock);
				957	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
				958	struct mm_slot, mm_list);
				959	spin_unlock(&ksm_mmlist_lock);
				960
				961	for (mm_slot = ksm_scan.mm_slot;
				962	mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
				963	mm = mm_slot->mm;
				964	down_read(&mm->mmap_sem);
				965	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				966	if (ksm_test_exit(mm))
				967	break;
				968	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				969	continue;
				970	err = unmerge_ksm_pages(vma,
				971	vma->vm_start, vma->vm_end);
				972	if (err)
				973	goto error;
				974	}
				975
				976	remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
				977	up_read(&mm->mmap_sem);
				978
				979	spin_lock(&ksm_mmlist_lock);
				980	ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
				981	struct mm_slot, mm_list);
				982	if (ksm_test_exit(mm)) {
				983	hash_del(&mm_slot->link);
				984	list_del(&mm_slot->mm_list);
				985	spin_unlock(&ksm_mmlist_lock);
				986
				987	free_mm_slot(mm_slot);
				988	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				989	mmdrop(mm);
				990	} else
				991	spin_unlock(&ksm_mmlist_lock);
				992	}
				993
				994	/* Clean up stable nodes, but don't worry if some are still busy */
				995	remove_all_stable_nodes();
				996	ksm_scan.seqnr = 0;
				997	return 0;
				998
				999	error:
				1000	up_read(&mm->mmap_sem);
				1001	spin_lock(&ksm_mmlist_lock);
				1002	ksm_scan.mm_slot = &ksm_mm_head;
				1003	spin_unlock(&ksm_mmlist_lock);
				1004	return err;
				1005	}
				1006	#endif /* CONFIG_SYSFS */
				1007
				1008	static u32 calc_checksum(struct page *page)
				1009	{
				1010	u32 checksum;
				1011	void *addr = kmap_atomic(page);
				1012	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
				1013	kunmap_atomic(addr);
				1014	return checksum;
				1015	}
				1016
				1017	static int memcmp_pages(struct page page1, struct page page2)
				1018	{
				1019	char addr1, addr2;
				1020	int ret;
				1021
				1022	addr1 = kmap_atomic(page1);
				1023	addr2 = kmap_atomic(page2);
				1024	ret = memcmp(addr1, addr2, PAGE_SIZE);
				1025	kunmap_atomic(addr2);
				1026	kunmap_atomic(addr1);
				1027	return ret;
				1028	}
				1029
				1030	static inline int pages_identical(struct page page1, struct page page2)
				1031	{
				1032	return !memcmp_pages(page1, page2);
				1033	}
				1034
				1035	static int write_protect_page(struct vm_area_struct vma, struct page page,
				1036	pte_t *orig_pte)
				1037	{
				1038	struct mm_struct *mm = vma->vm_mm;
				1039	struct page_vma_mapped_walk pvmw = {
				1040	.page = page,
				1041	.vma = vma,
				1042	};
				1043	int swapped;
				1044	int err = -EFAULT;
				1045	unsigned long mmun_start; /* For mmu_notifiers */
				1046	unsigned long mmun_end; /* For mmu_notifiers */
				1047
				1048	pvmw.address = page_address_in_vma(page, vma);
				1049	if (pvmw.address == -EFAULT)
				1050	goto out;
				1051
				1052	BUG_ON(PageTransCompound(page));
				1053
				1054	mmun_start = pvmw.address;
				1055	mmun_end = pvmw.address + PAGE_SIZE;
				1056	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
				1057
				1058	if (!page_vma_mapped_walk(&pvmw))
				1059	goto out_mn;
				1060	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
				1061	goto out_unlock;
				1062
				1063	if (pte_write(pvmw.pte) \|\| pte_dirty(pvmw.pte) \|\|
				1064	(pte_protnone(pvmw.pte) && pte_savedwrite(pvmw.pte)) \|\|
				1065	mm_tlb_flush_pending(mm)) {
				1066	pte_t entry;
				1067
				1068	swapped = PageSwapCache(page);
				1069	flush_cache_page(vma, pvmw.address, page_to_pfn(page));
				1070	/*
				1071	* Ok this is tricky, when get_user_pages_fast() run it doesn't
				1072	* take any lock, therefore the check that we are going to make
				1073	* with the pagecount against the mapcount is racey and
				1074	* O_DIRECT can happen right after the check.
				1075	* So we clear the pte and flush the tlb before the check
				1076	* this assure us that no O_DIRECT can happen after the check
				1077	* or in the middle of the check.
				1078	*
				1079	* No need to notify as we are downgrading page table to read
				1080	* only not changing it to point to a new page.
				1081	*
				1082	* See Documentation/vm/mmu_notifier.rst
				1083	*/
				1084	entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
				1085	/*
				1086	* Check that no O_DIRECT or similar I/O is in progress on the
				1087	* page
				1088	*/
				1089	if (page_mapcount(page) + 1 + swapped != page_count(page)) {
				1090	set_pte_at(mm, pvmw.address, pvmw.pte, entry);
				1091	goto out_unlock;
				1092	}
				1093	if (pte_dirty(entry))
				1094	set_page_dirty(page);
				1095
				1096	if (pte_protnone(entry))
				1097	entry = pte_mkclean(pte_clear_savedwrite(entry));
				1098	else
				1099	entry = pte_mkclean(pte_wrprotect(entry));
				1100	set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
				1101	}
				1102	orig_pte = pvmw.pte;
				1103	err = 0;
				1104
				1105	out_unlock:
				1106	page_vma_mapped_walk_done(&pvmw);
				1107	out_mn:
				1108	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
				1109	out:
				1110	return err;
				1111	}
				1112
				1113	/**
				1114	* replace_page - replace page in vma by new ksm page
				1115	* @vma: vma that holds the pte pointing to page
				1116	* @page: the page we are replacing by kpage
				1117	* @kpage: the ksm page we replace page by
				1118	* @orig_pte: the original value of the pte
				1119	*
				1120	* Returns 0 on success, -EFAULT on failure.
				1121	*/
				1122	static int replace_page(struct vm_area_struct vma, struct page page,
				1123	struct page *kpage, pte_t orig_pte)
				1124	{
				1125	struct mm_struct *mm = vma->vm_mm;
				1126	pmd_t *pmd;
				1127	pte_t *ptep;
				1128	pte_t newpte;
				1129	spinlock_t *ptl;
				1130	unsigned long addr;
				1131	int err = -EFAULT;
				1132	unsigned long mmun_start; /* For mmu_notifiers */
				1133	unsigned long mmun_end; /* For mmu_notifiers */
				1134
				1135	addr = page_address_in_vma(page, vma);
				1136	if (addr == -EFAULT)
				1137	goto out;
				1138
				1139	pmd = mm_find_pmd(mm, addr);
				1140	if (!pmd)
				1141	goto out;
				1142
				1143	mmun_start = addr;
				1144	mmun_end = addr + PAGE_SIZE;
				1145	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
				1146
				1147	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				1148	if (!pte_same(*ptep, orig_pte)) {
				1149	pte_unmap_unlock(ptep, ptl);
				1150	goto out_mn;
				1151	}
				1152
				1153	/*
				1154	* No need to check ksm_use_zero_pages here: we can only have a
				1155	* zero_page here if ksm_use_zero_pages was enabled alreaady.
				1156	*/
				1157	if (!is_zero_pfn(page_to_pfn(kpage))) {
				1158	get_page(kpage);
				1159	page_add_anon_rmap(kpage, vma, addr, false);
				1160	newpte = mk_pte(kpage, vma->vm_page_prot);
				1161	} else {
				1162	newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
				1163	vma->vm_page_prot));
				1164	/*
				1165	* We're replacing an anonymous page with a zero page, which is
				1166	* not anonymous. We need to do proper accounting otherwise we
				1167	* will get wrong values in /proc, and a BUG message in dmesg
				1168	* when tearing down the mm.
				1169	*/
				1170	dec_mm_counter(mm, MM_ANONPAGES);
				1171	}
				1172
				1173	flush_cache_page(vma, addr, pte_pfn(*ptep));
				1174	/*
				1175	* No need to notify as we are replacing a read only page with another
				1176	* read only page with the same content.
				1177	*
				1178	* See Documentation/vm/mmu_notifier.rst
				1179	*/
				1180	ptep_clear_flush(vma, addr, ptep);
				1181	set_pte_at_notify(mm, addr, ptep, newpte);
				1182
				1183	page_remove_rmap(page, false);
				1184	if (!page_mapped(page))
				1185	try_to_free_swap(page);
				1186	put_page(page);
				1187
				1188	pte_unmap_unlock(ptep, ptl);
				1189	err = 0;
				1190	out_mn:
				1191	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
				1192	out:
				1193	return err;
				1194	}
				1195
				1196	/*
				1197	* try_to_merge_one_page - take two pages and merge them into one
				1198	* @vma: the vma that holds the pte pointing to page
				1199	* @page: the PageAnon page that we want to replace with kpage
				1200	* @kpage: the PageKsm page that we want to map instead of page,
				1201	* or NULL the first time when we want to use page as kpage.
				1202	*
				1203	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				1204	*/
				1205	static int try_to_merge_one_page(struct vm_area_struct *vma,
				1206	struct page page, struct page kpage)
				1207	{
				1208	pte_t orig_pte = __pte(0);
				1209	int err = -EFAULT;
				1210
				1211	if (page == kpage) /* ksm page forked */
				1212	return 0;
				1213
				1214	if (!PageAnon(page))
				1215	goto out;
				1216
				1217	/*
				1218	* We need the page lock to read a stable PageSwapCache in
				1219	* write_protect_page(). We use trylock_page() instead of
				1220	* lock_page() because we don't want to wait here - we
				1221	* prefer to continue scanning and merging different pages,
				1222	* then come back to this page when it is unlocked.
				1223	*/
				1224	if (!trylock_page(page))
				1225	goto out;
				1226
				1227	if (PageTransCompound(page)) {
				1228	if (split_huge_page(page))
				1229	goto out_unlock;
				1230	}
				1231
				1232	/*
				1233	* If this anonymous page is mapped only here, its pte may need
				1234	* to be write-protected. If it's mapped elsewhere, all of its
				1235	* ptes are necessarily already write-protected. But in either
				1236	* case, we need to lock and check page_count is not raised.
				1237	*/
				1238	if (write_protect_page(vma, page, &orig_pte) == 0) {
				1239	if (!kpage) {
				1240	/*
				1241	* While we hold page lock, upgrade page from
				1242	* PageAnon+anon_vma to PageKsm+NULL stable_node:
				1243	* stable_tree_insert() will update stable_node.
				1244	*/
				1245	set_page_stable_node(page, NULL);
				1246	mark_page_accessed(page);
				1247	/*
				1248	* Page reclaim just frees a clean page with no dirty
				1249	* ptes: make sure that the ksm page would be swapped.
				1250	*/
				1251	if (!PageDirty(page))
				1252	SetPageDirty(page);
				1253	err = 0;
				1254	} else if (pages_identical(page, kpage))
				1255	err = replace_page(vma, page, kpage, orig_pte);
				1256	}
				1257
				1258	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
				1259	munlock_vma_page(page);
				1260	if (!PageMlocked(kpage)) {
				1261	unlock_page(page);
				1262	lock_page(kpage);
				1263	mlock_vma_page(kpage);
				1264	page = kpage; /* for final unlock */
				1265	}
				1266	}
				1267
				1268	out_unlock:
				1269	unlock_page(page);
				1270	out:
				1271	return err;
				1272	}
				1273
				1274	/*
				1275	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				1276	* but no new kernel page is allocated: kpage must already be a ksm page.
				1277	*
				1278	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				1279	*/
				1280	static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				1281	struct page page, struct page kpage)
				1282	{
				1283	struct mm_struct *mm = rmap_item->mm;
				1284	struct vm_area_struct *vma;
				1285	int err = -EFAULT;
				1286
				1287	down_read(&mm->mmap_sem);
				1288	vma = find_mergeable_vma(mm, rmap_item->address);
				1289	if (!vma)
				1290	goto out;
				1291
				1292	err = try_to_merge_one_page(vma, page, kpage);
				1293	if (err)
				1294	goto out;
				1295
				1296	/* Unstable nid is in union with stable anon_vma: remove first */
				1297	remove_rmap_item_from_tree(rmap_item);
				1298
				1299	/* Must get reference to anon_vma while still holding mmap_sem */
				1300	rmap_item->anon_vma = vma->anon_vma;
				1301	get_anon_vma(vma->anon_vma);
				1302	out:
				1303	up_read(&mm->mmap_sem);
				1304	return err;
				1305	}
				1306
				1307	/*
				1308	* try_to_merge_two_pages - take two identical pages and prepare them
				1309	* to be merged into one page.
				1310	*
				1311	* This function returns the kpage if we successfully merged two identical
				1312	* pages into one ksm page, NULL otherwise.
				1313	*
				1314	* Note that this function upgrades page to ksm page: if one of the pages
				1315	* is already a ksm page, try_to_merge_with_ksm_page should be used.
				1316	*/
				1317	static struct page try_to_merge_two_pages(struct rmap_item rmap_item,
				1318	struct page *page,
				1319	struct rmap_item *tree_rmap_item,
				1320	struct page *tree_page)
				1321	{
				1322	int err;
				1323
				1324	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
				1325	if (!err) {
				1326	err = try_to_merge_with_ksm_page(tree_rmap_item,
				1327	tree_page, page);
				1328	/*
				1329	* If that fails, we have a ksm page with only one pte
				1330	* pointing to it: so break it.
				1331	*/
				1332	if (err)
				1333	break_cow(rmap_item);
				1334	}
				1335	return err ? NULL : page;
				1336	}
				1337
				1338	static __always_inline
				1339	bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
				1340	{
				1341	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
				1342	/*
				1343	* Check that at least one mapping still exists, otherwise
				1344	* there's no much point to merge and share with this
				1345	* stable_node, as the underlying tree_page of the other
				1346	* sharer is going to be freed soon.
				1347	*/
				1348	return stable_node->rmap_hlist_len &&
				1349	stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
				1350	}
				1351
				1352	static __always_inline
				1353	bool is_page_sharing_candidate(struct stable_node *stable_node)
				1354	{
				1355	return __is_page_sharing_candidate(stable_node, 0);
				1356	}
				1357
				1358	static struct page stable_node_dup(struct stable_node *_stable_node_dup,
				1359	struct stable_node **_stable_node,
				1360	struct rb_root *root,
				1361	bool prune_stale_stable_nodes)
				1362	{
				1363	struct stable_node dup, found = NULL, stable_node = _stable_node;
				1364	struct hlist_node *hlist_safe;
				1365	struct page _tree_page, tree_page = NULL;
				1366	int nr = 0;
				1367	int found_rmap_hlist_len;
				1368
				1369	if (!prune_stale_stable_nodes \|\|
				1370	time_before(jiffies, stable_node->chain_prune_time +
				1371	msecs_to_jiffies(
				1372	ksm_stable_node_chains_prune_millisecs)))
				1373	prune_stale_stable_nodes = false;
				1374	else
				1375	stable_node->chain_prune_time = jiffies;
				1376
				1377	hlist_for_each_entry_safe(dup, hlist_safe,
				1378	&stable_node->hlist, hlist_dup) {
				1379	cond_resched();
				1380	/*
				1381	* We must walk all stable_node_dup to prune the stale
				1382	* stable nodes during lookup.
				1383	*
				1384	* get_ksm_page can drop the nodes from the
				1385	* stable_node->hlist if they point to freed pages
				1386	* (that's why we do a _safe walk). The "dup"
				1387	* stable_node parameter itself will be freed from
				1388	* under us if it returns NULL.
				1389	*/
				1390	_tree_page = get_ksm_page(dup, false);
				1391	if (!_tree_page)
				1392	continue;
				1393	nr += 1;
				1394	if (is_page_sharing_candidate(dup)) {
				1395	if (!found \|\|
				1396	dup->rmap_hlist_len > found_rmap_hlist_len) {
				1397	if (found)
				1398	put_page(tree_page);
				1399	found = dup;
				1400	found_rmap_hlist_len = found->rmap_hlist_len;
				1401	tree_page = _tree_page;
				1402
				1403	/* skip put_page for found dup */
				1404	if (!prune_stale_stable_nodes)
				1405	break;
				1406	continue;
				1407	}
				1408	}
				1409	put_page(_tree_page);
				1410	}
				1411
				1412	if (found) {
				1413	/*
				1414	* nr is counting all dups in the chain only if
				1415	* prune_stale_stable_nodes is true, otherwise we may
				1416	* break the loop at nr == 1 even if there are
				1417	* multiple entries.
				1418	*/
				1419	if (prune_stale_stable_nodes && nr == 1) {
				1420	/*
				1421	* If there's not just one entry it would
				1422	* corrupt memory, better BUG_ON. In KSM
				1423	* context with no lock held it's not even
				1424	* fatal.
				1425	*/
				1426	BUG_ON(stable_node->hlist.first->next);
				1427
				1428	/*
				1429	* There's just one entry and it is below the
				1430	* deduplication limit so drop the chain.
				1431	*/
				1432	rb_replace_node(&stable_node->node, &found->node,
				1433	root);
				1434	free_stable_node(stable_node);
				1435	ksm_stable_node_chains--;
				1436	ksm_stable_node_dups--;
				1437	/*
				1438	* NOTE: the caller depends on the stable_node
				1439	* to be equal to stable_node_dup if the chain
				1440	* was collapsed.
				1441	*/
				1442	*_stable_node = found;
				1443	/*
				1444	* Just for robustneess as stable_node is
				1445	* otherwise left as a stable pointer, the
				1446	* compiler shall optimize it away at build
				1447	* time.
				1448	*/
				1449	stable_node = NULL;
				1450	} else if (stable_node->hlist.first != &found->hlist_dup &&
				1451	__is_page_sharing_candidate(found, 1)) {
				1452	/*
				1453	* If the found stable_node dup can accept one
				1454	* more future merge (in addition to the one
				1455	* that is underway) and is not at the head of
				1456	* the chain, put it there so next search will
				1457	* be quicker in the !prune_stale_stable_nodes
				1458	* case.
				1459	*
				1460	* NOTE: it would be inaccurate to use nr > 1
				1461	* instead of checking the hlist.first pointer
				1462	* directly, because in the
				1463	* prune_stale_stable_nodes case "nr" isn't
				1464	* the position of the found dup in the chain,
				1465	* but the total number of dups in the chain.
				1466	*/
				1467	hlist_del(&found->hlist_dup);
				1468	hlist_add_head(&found->hlist_dup,
				1469	&stable_node->hlist);
				1470	}
				1471	}
				1472
				1473	*_stable_node_dup = found;
				1474	return tree_page;
				1475	}
				1476
				1477	static struct stable_node stable_node_dup_any(struct stable_node stable_node,
				1478	struct rb_root *root)
				1479	{
				1480	if (!is_stable_node_chain(stable_node))
				1481	return stable_node;
				1482	if (hlist_empty(&stable_node->hlist)) {
				1483	free_stable_node_chain(stable_node, root);
				1484	return NULL;
				1485	}
				1486	return hlist_entry(stable_node->hlist.first,
				1487	typeof(*stable_node), hlist_dup);
				1488	}
				1489
				1490	/*
				1491	* Like for get_ksm_page, this function can free the *_stable_node and
				1492	* *_stable_node_dup if the returned tree_page is NULL.
				1493	*
				1494	* It can also free and overwrite *_stable_node with the found
				1495	* stable_node_dup if the chain is collapsed (in which case
				1496	* _stable_node will be equal to _stable_node_dup like if the chain
				1497	* never existed). It's up to the caller to verify tree_page is not
				1498	* NULL before dereferencing _stable_node or _stable_node_dup.
				1499	*
				1500	* *_stable_node_dup is really a second output parameter of this
				1501	* function and will be overwritten in all cases, the caller doesn't
				1502	* need to initialize it.
				1503	*/
				1504	static struct page __stable_node_chain(struct stable_node *_stable_node_dup,
				1505	struct stable_node **_stable_node,
				1506	struct rb_root *root,
				1507	bool prune_stale_stable_nodes)
				1508	{
				1509	struct stable_node stable_node = _stable_node;
				1510	if (!is_stable_node_chain(stable_node)) {
				1511	if (is_page_sharing_candidate(stable_node)) {
				1512	*_stable_node_dup = stable_node;
				1513	return get_ksm_page(stable_node, false);
				1514	}
				1515	/*
				1516	* _stable_node_dup set to NULL means the stable_node
				1517	* reached the ksm_max_page_sharing limit.
				1518	*/
				1519	*_stable_node_dup = NULL;
				1520	return NULL;
				1521	}
				1522	return stable_node_dup(_stable_node_dup, _stable_node, root,
				1523	prune_stale_stable_nodes);
				1524	}
				1525
				1526	static __always_inline struct page chain_prune(struct stable_node *s_n_d,
				1527	struct stable_node **s_n,
				1528	struct rb_root *root)
				1529	{
				1530	return __stable_node_chain(s_n_d, s_n, root, true);
				1531	}
				1532
				1533	static __always_inline struct page chain(struct stable_node *s_n_d,
				1534	struct stable_node *s_n,
				1535	struct rb_root *root)
				1536	{
				1537	struct stable_node *old_stable_node = s_n;
				1538	struct page *tree_page;
				1539
				1540	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
				1541	/* not pruning dups so s_n cannot have changed */
				1542	VM_BUG_ON(s_n != old_stable_node);
				1543	return tree_page;
				1544	}
				1545
				1546	/*
				1547	* stable_tree_search - search for page inside the stable tree
				1548	*
				1549	* This function checks if there is a page inside the stable tree
				1550	* with identical content to the page that we are scanning right now.
				1551	*
				1552	* This function returns the stable tree node of identical content if found,
				1553	* NULL otherwise.
				1554	*/
				1555	static struct page stable_tree_search(struct page page)
				1556	{
				1557	int nid;
				1558	struct rb_root *root;
				1559	struct rb_node **new;
				1560	struct rb_node *parent;
				1561	struct stable_node stable_node, stable_node_dup, *stable_node_any;
				1562	struct stable_node *page_node;
				1563
				1564	page_node = page_stable_node(page);
				1565	if (page_node && page_node->head != &migrate_nodes) {
				1566	/* ksm page forked */
				1567	get_page(page);
				1568	return page;
				1569	}
				1570
				1571	nid = get_kpfn_nid(page_to_pfn(page));
				1572	root = root_stable_tree + nid;
				1573	again:
				1574	new = &root->rb_node;
				1575	parent = NULL;
				1576
				1577	while (*new) {
				1578	struct page *tree_page;
				1579	int ret;
				1580
				1581	cond_resched();
				1582	stable_node = rb_entry(*new, struct stable_node, node);
				1583	stable_node_any = NULL;
				1584	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
				1585	/*
				1586	* NOTE: stable_node may have been freed by
				1587	* chain_prune() if the returned stable_node_dup is
				1588	* not NULL. stable_node_dup may have been inserted in
				1589	* the rbtree instead as a regular stable_node (in
				1590	* order to collapse the stable_node chain if a single
				1591	* stable_node dup was found in it). In such case the
				1592	* stable_node is overwritten by the calleee to point
				1593	* to the stable_node_dup that was collapsed in the
				1594	* stable rbtree and stable_node will be equal to
				1595	* stable_node_dup like if the chain never existed.
				1596	*/
				1597	if (!stable_node_dup) {
				1598	/*
				1599	* Either all stable_node dups were full in
				1600	* this stable_node chain, or this chain was
				1601	* empty and should be rb_erased.
				1602	*/
				1603	stable_node_any = stable_node_dup_any(stable_node,
				1604	root);
				1605	if (!stable_node_any) {
				1606	/* rb_erase just run */
				1607	goto again;
				1608	}
				1609	/*
				1610	* Take any of the stable_node dups page of
				1611	* this stable_node chain to let the tree walk
				1612	* continue. All KSM pages belonging to the
				1613	* stable_node dups in a stable_node chain
				1614	* have the same content and they're
				1615	* wrprotected at all times. Any will work
				1616	* fine to continue the walk.
				1617	*/
				1618	tree_page = get_ksm_page(stable_node_any, false);
				1619	}
				1620	VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
				1621	if (!tree_page) {
				1622	/*
				1623	* If we walked over a stale stable_node,
				1624	* get_ksm_page() will call rb_erase() and it
				1625	* may rebalance the tree from under us. So
				1626	* restart the search from scratch. Returning
				1627	* NULL would be safe too, but we'd generate
				1628	* false negative insertions just because some
				1629	* stable_node was stale.
				1630	*/
				1631	goto again;
				1632	}
				1633
				1634	ret = memcmp_pages(page, tree_page);
				1635	put_page(tree_page);
				1636
				1637	parent = *new;
				1638	if (ret < 0)
				1639	new = &parent->rb_left;
				1640	else if (ret > 0)
				1641	new = &parent->rb_right;
				1642	else {
				1643	if (page_node) {
				1644	VM_BUG_ON(page_node->head != &migrate_nodes);
				1645	/*
				1646	* Test if the migrated page should be merged
				1647	* into a stable node dup. If the mapcount is
				1648	* 1 we can migrate it with another KSM page
				1649	* without adding it to the chain.
				1650	*/
				1651	if (page_mapcount(page) > 1)
				1652	goto chain_append;
				1653	}
				1654
				1655	if (!stable_node_dup) {
				1656	/*
				1657	* If the stable_node is a chain and
				1658	* we got a payload match in memcmp
				1659	* but we cannot merge the scanned
				1660	* page in any of the existing
				1661	* stable_node dups because they're
				1662	* all full, we need to wait the
				1663	* scanned page to find itself a match
				1664	* in the unstable tree to create a
				1665	* brand new KSM page to add later to
				1666	* the dups of this stable_node.
				1667	*/
				1668	return NULL;
				1669	}
				1670
				1671	/*
				1672	* Lock and unlock the stable_node's page (which
				1673	* might already have been migrated) so that page
				1674	* migration is sure to notice its raised count.
				1675	* It would be more elegant to return stable_node
				1676	* than kpage, but that involves more changes.
				1677	*/
				1678	tree_page = get_ksm_page(stable_node_dup, true);
				1679	if (unlikely(!tree_page))
				1680	/*
				1681	* The tree may have been rebalanced,
				1682	* so re-evaluate parent and new.
				1683	*/
				1684	goto again;
				1685	unlock_page(tree_page);
				1686
				1687	if (get_kpfn_nid(stable_node_dup->kpfn) !=
				1688	NUMA(stable_node_dup->nid)) {
				1689	put_page(tree_page);
				1690	goto replace;
				1691	}
				1692	return tree_page;
				1693	}
				1694	}
				1695
				1696	if (!page_node)
				1697	return NULL;
				1698
				1699	list_del(&page_node->list);
				1700	DO_NUMA(page_node->nid = nid);
				1701	rb_link_node(&page_node->node, parent, new);
				1702	rb_insert_color(&page_node->node, root);
				1703	out:
				1704	if (is_page_sharing_candidate(page_node)) {
				1705	get_page(page);
				1706	return page;
				1707	} else
				1708	return NULL;
				1709
				1710	replace:
				1711	/*
				1712	* If stable_node was a chain and chain_prune collapsed it,
				1713	* stable_node has been updated to be the new regular
				1714	* stable_node. A collapse of the chain is indistinguishable
				1715	* from the case there was no chain in the stable
				1716	* rbtree. Otherwise stable_node is the chain and
				1717	* stable_node_dup is the dup to replace.
				1718	*/
				1719	if (stable_node_dup == stable_node) {
				1720	VM_BUG_ON(is_stable_node_chain(stable_node_dup));
				1721	VM_BUG_ON(is_stable_node_dup(stable_node_dup));
				1722	/* there is no chain */
				1723	if (page_node) {
				1724	VM_BUG_ON(page_node->head != &migrate_nodes);
				1725	list_del(&page_node->list);
				1726	DO_NUMA(page_node->nid = nid);
				1727	rb_replace_node(&stable_node_dup->node,
				1728	&page_node->node,
				1729	root);
				1730	if (is_page_sharing_candidate(page_node))
				1731	get_page(page);
				1732	else
				1733	page = NULL;
				1734	} else {
				1735	rb_erase(&stable_node_dup->node, root);
				1736	page = NULL;
				1737	}
				1738	} else {
				1739	VM_BUG_ON(!is_stable_node_chain(stable_node));
				1740	__stable_node_dup_del(stable_node_dup);
				1741	if (page_node) {
				1742	VM_BUG_ON(page_node->head != &migrate_nodes);
				1743	list_del(&page_node->list);
				1744	DO_NUMA(page_node->nid = nid);
				1745	stable_node_chain_add_dup(page_node, stable_node);
				1746	if (is_page_sharing_candidate(page_node))
				1747	get_page(page);
				1748	else
				1749	page = NULL;
				1750	} else {
				1751	page = NULL;
				1752	}
				1753	}
				1754	stable_node_dup->head = &migrate_nodes;
				1755	list_add(&stable_node_dup->list, stable_node_dup->head);
				1756	return page;
				1757
				1758	chain_append:
				1759	/* stable_node_dup could be null if it reached the limit */
				1760	if (!stable_node_dup)
				1761	stable_node_dup = stable_node_any;
				1762	/*
				1763	* If stable_node was a chain and chain_prune collapsed it,
				1764	* stable_node has been updated to be the new regular
				1765	* stable_node. A collapse of the chain is indistinguishable
				1766	* from the case there was no chain in the stable
				1767	* rbtree. Otherwise stable_node is the chain and
				1768	* stable_node_dup is the dup to replace.
				1769	*/
				1770	if (stable_node_dup == stable_node) {
				1771	VM_BUG_ON(is_stable_node_chain(stable_node_dup));
				1772	VM_BUG_ON(is_stable_node_dup(stable_node_dup));
				1773	/* chain is missing so create it */
				1774	stable_node = alloc_stable_node_chain(stable_node_dup,
				1775	root);
				1776	if (!stable_node)
				1777	return NULL;
				1778	}
				1779	/*
				1780	* Add this stable_node dup that was
				1781	* migrated to the stable_node chain
				1782	* of the current nid for this page
				1783	* content.
				1784	*/
				1785	VM_BUG_ON(!is_stable_node_chain(stable_node));
				1786	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
				1787	VM_BUG_ON(page_node->head != &migrate_nodes);
				1788	list_del(&page_node->list);
				1789	DO_NUMA(page_node->nid = nid);
				1790	stable_node_chain_add_dup(page_node, stable_node);
				1791	goto out;
				1792	}
				1793
				1794	/*
				1795	* stable_tree_insert - insert stable tree node pointing to new ksm page
				1796	* into the stable tree.
				1797	*
				1798	* This function returns the stable tree node just allocated on success,
				1799	* NULL otherwise.
				1800	*/
				1801	static struct stable_node stable_tree_insert(struct page kpage)
				1802	{
				1803	int nid;
				1804	unsigned long kpfn;
				1805	struct rb_root *root;
				1806	struct rb_node **new;
				1807	struct rb_node *parent;
				1808	struct stable_node stable_node, stable_node_dup, *stable_node_any;
				1809	bool need_chain = false;
				1810
				1811	kpfn = page_to_pfn(kpage);
				1812	nid = get_kpfn_nid(kpfn);
				1813	root = root_stable_tree + nid;
				1814	again:
				1815	parent = NULL;
				1816	new = &root->rb_node;
				1817
				1818	while (*new) {
				1819	struct page *tree_page;
				1820	int ret;
				1821
				1822	cond_resched();
				1823	stable_node = rb_entry(*new, struct stable_node, node);
				1824	stable_node_any = NULL;
				1825	tree_page = chain(&stable_node_dup, stable_node, root);
				1826	if (!stable_node_dup) {
				1827	/*
				1828	* Either all stable_node dups were full in
				1829	* this stable_node chain, or this chain was
				1830	* empty and should be rb_erased.
				1831	*/
				1832	stable_node_any = stable_node_dup_any(stable_node,
				1833	root);
				1834	if (!stable_node_any) {
				1835	/* rb_erase just run */
				1836	goto again;
				1837	}
				1838	/*
				1839	* Take any of the stable_node dups page of
				1840	* this stable_node chain to let the tree walk
				1841	* continue. All KSM pages belonging to the
				1842	* stable_node dups in a stable_node chain
				1843	* have the same content and they're
				1844	* wrprotected at all times. Any will work
				1845	* fine to continue the walk.
				1846	*/
				1847	tree_page = get_ksm_page(stable_node_any, false);
				1848	}
				1849	VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
				1850	if (!tree_page) {
				1851	/*
				1852	* If we walked over a stale stable_node,
				1853	* get_ksm_page() will call rb_erase() and it
				1854	* may rebalance the tree from under us. So
				1855	* restart the search from scratch. Returning
				1856	* NULL would be safe too, but we'd generate
				1857	* false negative insertions just because some
				1858	* stable_node was stale.
				1859	*/
				1860	goto again;
				1861	}
				1862
				1863	ret = memcmp_pages(kpage, tree_page);
				1864	put_page(tree_page);
				1865
				1866	parent = *new;
				1867	if (ret < 0)
				1868	new = &parent->rb_left;
				1869	else if (ret > 0)
				1870	new = &parent->rb_right;
				1871	else {
				1872	need_chain = true;
				1873	break;
				1874	}
				1875	}
				1876
				1877	stable_node_dup = alloc_stable_node();
				1878	if (!stable_node_dup)
				1879	return NULL;
				1880
				1881	INIT_HLIST_HEAD(&stable_node_dup->hlist);
				1882	stable_node_dup->kpfn = kpfn;
				1883	set_page_stable_node(kpage, stable_node_dup);
				1884	stable_node_dup->rmap_hlist_len = 0;
				1885	DO_NUMA(stable_node_dup->nid = nid);
				1886	if (!need_chain) {
				1887	rb_link_node(&stable_node_dup->node, parent, new);
				1888	rb_insert_color(&stable_node_dup->node, root);
				1889	} else {
				1890	if (!is_stable_node_chain(stable_node)) {
				1891	struct stable_node *orig = stable_node;
				1892	/* chain is missing so create it */
				1893	stable_node = alloc_stable_node_chain(orig, root);
				1894	if (!stable_node) {
				1895	free_stable_node(stable_node_dup);
				1896	return NULL;
				1897	}
				1898	}
				1899	stable_node_chain_add_dup(stable_node_dup, stable_node);
				1900	}
				1901
				1902	return stable_node_dup;
				1903	}
				1904
				1905	/*
				1906	* unstable_tree_search_insert - search for identical page,
				1907	* else insert rmap_item into the unstable tree.
				1908	*
				1909	* This function searches for a page in the unstable tree identical to the
				1910	* page currently being scanned; and if no identical page is found in the
				1911	* tree, we insert rmap_item as a new object into the unstable tree.
				1912	*
				1913	* This function returns pointer to rmap_item found to be identical
				1914	* to the currently scanned page, NULL otherwise.
				1915	*
				1916	* This function does both searching and inserting, because they share
				1917	* the same walking algorithm in an rbtree.
				1918	*/
				1919	static
				1920	struct rmap_item unstable_tree_search_insert(struct rmap_item rmap_item,
				1921	struct page *page,
				1922	struct page **tree_pagep)
				1923	{
				1924	struct rb_node **new;
				1925	struct rb_root *root;
				1926	struct rb_node *parent = NULL;
				1927	int nid;
				1928
				1929	nid = get_kpfn_nid(page_to_pfn(page));
				1930	root = root_unstable_tree + nid;
				1931	new = &root->rb_node;
				1932
				1933	while (*new) {
				1934	struct rmap_item *tree_rmap_item;
				1935	struct page *tree_page;
				1936	int ret;
				1937
				1938	cond_resched();
				1939	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				1940	tree_page = get_mergeable_page(tree_rmap_item);
				1941	if (!tree_page)
				1942	return NULL;
				1943
				1944	/*
				1945	* Don't substitute a ksm page for a forked page.
				1946	*/
				1947	if (page == tree_page) {
				1948	put_page(tree_page);
				1949	return NULL;
				1950	}
				1951
				1952	ret = memcmp_pages(page, tree_page);
				1953
				1954	parent = *new;
				1955	if (ret < 0) {
				1956	put_page(tree_page);
				1957	new = &parent->rb_left;
				1958	} else if (ret > 0) {
				1959	put_page(tree_page);
				1960	new = &parent->rb_right;
				1961	} else if (!ksm_merge_across_nodes &&
				1962	page_to_nid(tree_page) != nid) {
				1963	/*
				1964	* If tree_page has been migrated to another NUMA node,
				1965	* it will be flushed out and put in the right unstable
				1966	* tree next time: only merge with it when across_nodes.
				1967	*/
				1968	put_page(tree_page);
				1969	return NULL;
				1970	} else {
				1971	*tree_pagep = tree_page;
				1972	return tree_rmap_item;
				1973	}
				1974	}
				1975
				1976	rmap_item->address \|= UNSTABLE_FLAG;
				1977	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
				1978	DO_NUMA(rmap_item->nid = nid);
				1979	rb_link_node(&rmap_item->node, parent, new);
				1980	rb_insert_color(&rmap_item->node, root);
				1981
				1982	ksm_pages_unshared++;
				1983	return NULL;
				1984	}
				1985
				1986	/*
				1987	* stable_tree_append - add another rmap_item to the linked list of
				1988	* rmap_items hanging off a given node of the stable tree, all sharing
				1989	* the same ksm page.
				1990	*/
				1991	static void stable_tree_append(struct rmap_item *rmap_item,
				1992	struct stable_node *stable_node,
				1993	bool max_page_sharing_bypass)
				1994	{
				1995	/*
				1996	* rmap won't find this mapping if we don't insert the
				1997	* rmap_item in the right stable_node
				1998	* duplicate. page_migration could break later if rmap breaks,
				1999	* so we can as well crash here. We really need to check for
				2000	* rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
				2001	* for other negative values as an undeflow if detected here
				2002	* for the first time (and not when decreasing rmap_hlist_len)
				2003	* would be sign of memory corruption in the stable_node.
				2004	*/
				2005	BUG_ON(stable_node->rmap_hlist_len < 0);
				2006
				2007	stable_node->rmap_hlist_len++;
				2008	if (!max_page_sharing_bypass)
				2009	/* possibly non fatal but unexpected overflow, only warn */
				2010	WARN_ON_ONCE(stable_node->rmap_hlist_len >
				2011	ksm_max_page_sharing);
				2012
				2013	rmap_item->head = stable_node;
				2014	rmap_item->address \|= STABLE_FLAG;
				2015	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
				2016
				2017	if (rmap_item->hlist.next)
				2018	ksm_pages_sharing++;
				2019	else
				2020	ksm_pages_shared++;
				2021	}
				2022
				2023	/*
				2024	* cmp_and_merge_page - first see if page can be merged into the stable tree;
				2025	* if not, compare checksum to previous and if it's the same, see if page can
				2026	* be inserted into the unstable tree, or merged with a page already there and
				2027	* both transferred to the stable tree.
				2028	*
				2029	* @page: the page that we are searching identical page to.
				2030	* @rmap_item: the reverse mapping into the virtual address of this page
				2031	*/
				2032	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				2033	{
				2034	struct mm_struct *mm = rmap_item->mm;
				2035	struct rmap_item *tree_rmap_item;
				2036	struct page *tree_page = NULL;
				2037	struct stable_node *stable_node;
				2038	struct page *kpage;
				2039	unsigned int checksum;
				2040	int err;
				2041	bool max_page_sharing_bypass = false;
				2042
				2043	stable_node = page_stable_node(page);
				2044	if (stable_node) {
				2045	if (stable_node->head != &migrate_nodes &&
				2046	get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
				2047	NUMA(stable_node->nid)) {
				2048	stable_node_dup_del(stable_node);
				2049	stable_node->head = &migrate_nodes;
				2050	list_add(&stable_node->list, stable_node->head);
				2051	}
				2052	if (stable_node->head != &migrate_nodes &&
				2053	rmap_item->head == stable_node)
				2054	return;
				2055	/*
				2056	* If it's a KSM fork, allow it to go over the sharing limit
				2057	* without warnings.
				2058	*/
				2059	if (!is_page_sharing_candidate(stable_node))
				2060	max_page_sharing_bypass = true;
				2061	}
				2062
				2063	/* We first start with searching the page inside the stable tree */
				2064	kpage = stable_tree_search(page);
				2065	if (kpage == page && rmap_item->head == stable_node) {
				2066	put_page(kpage);
				2067	return;
				2068	}
				2069
				2070	remove_rmap_item_from_tree(rmap_item);
				2071
				2072	if (kpage) {
				2073	err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
				2074	if (!err) {
				2075	/*
				2076	* The page was successfully merged:
				2077	* add its rmap_item to the stable tree.
				2078	*/
				2079	lock_page(kpage);
				2080	stable_tree_append(rmap_item, page_stable_node(kpage),
				2081	max_page_sharing_bypass);
				2082	unlock_page(kpage);
				2083	}
				2084	put_page(kpage);
				2085	return;
				2086	}
				2087
				2088	/*
				2089	* If the hash value of the page has changed from the last time
				2090	* we calculated it, this page is changing frequently: therefore we
				2091	* don't want to insert it in the unstable tree, and we don't want
				2092	* to waste our time searching for something identical to it there.
				2093	*/
				2094	checksum = calc_checksum(page);
				2095	if (rmap_item->oldchecksum != checksum) {
				2096	rmap_item->oldchecksum = checksum;
				2097	return;
				2098	}
				2099
				2100	/*
				2101	* Same checksum as an empty page. We attempt to merge it with the
				2102	* appropriate zero page if the user enabled this via sysfs.
				2103	*/
				2104	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
				2105	struct vm_area_struct *vma;
				2106
				2107	down_read(&mm->mmap_sem);
				2108	vma = find_mergeable_vma(mm, rmap_item->address);
				2109	err = try_to_merge_one_page(vma, page,
				2110	ZERO_PAGE(rmap_item->address));
				2111	up_read(&mm->mmap_sem);
				2112	/*
				2113	* In case of failure, the page was not really empty, so we
				2114	* need to continue. Otherwise we're done.
				2115	*/
				2116	if (!err)
				2117	return;
				2118	}
				2119	tree_rmap_item =
				2120	unstable_tree_search_insert(rmap_item, page, &tree_page);
				2121	if (tree_rmap_item) {
				2122	bool split;
				2123
				2124	kpage = try_to_merge_two_pages(rmap_item, page,
				2125	tree_rmap_item, tree_page);
				2126	/*
				2127	* If both pages we tried to merge belong to the same compound
				2128	* page, then we actually ended up increasing the reference
				2129	* count of the same compound page twice, and split_huge_page
				2130	* failed.
				2131	* Here we set a flag if that happened, and we use it later to
				2132	* try split_huge_page again. Since we call put_page right
				2133	* afterwards, the reference count will be correct and
				2134	* split_huge_page should succeed.
				2135	*/
				2136	split = PageTransCompound(page)
				2137	&& compound_head(page) == compound_head(tree_page);
				2138	put_page(tree_page);
				2139	if (kpage) {
				2140	/*
				2141	* The pages were successfully merged: insert new
				2142	* node in the stable tree and add both rmap_items.
				2143	*/
				2144	lock_page(kpage);
				2145	stable_node = stable_tree_insert(kpage);
				2146	if (stable_node) {
				2147	stable_tree_append(tree_rmap_item, stable_node,
				2148	false);
				2149	stable_tree_append(rmap_item, stable_node,
				2150	false);
				2151	}
				2152	unlock_page(kpage);
				2153
				2154	/*
				2155	* If we fail to insert the page into the stable tree,
				2156	* we will have 2 virtual addresses that are pointing
				2157	* to a ksm page left outside the stable tree,
				2158	* in which case we need to break_cow on both.
				2159	*/
				2160	if (!stable_node) {
				2161	break_cow(tree_rmap_item);
				2162	break_cow(rmap_item);
				2163	}
				2164	} else if (split) {
				2165	/*
				2166	* We are here if we tried to merge two pages and
				2167	* failed because they both belonged to the same
				2168	* compound page. We will split the page now, but no
				2169	* merging will take place.
				2170	* We do not want to add the cost of a full lock; if
				2171	* the page is locked, it is better to skip it and
				2172	* perhaps try again later.
				2173	*/
				2174	if (!trylock_page(page))
				2175	return;
				2176	split_huge_page(page);
				2177	unlock_page(page);
				2178	}
				2179	}
				2180	}
				2181
				2182	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
				2183	struct rmap_item **rmap_list,
				2184	unsigned long addr)
				2185	{
				2186	struct rmap_item *rmap_item;
				2187
				2188	while (*rmap_list) {
				2189	rmap_item = *rmap_list;
				2190	if ((rmap_item->address & PAGE_MASK) == addr)
				2191	return rmap_item;
				2192	if (rmap_item->address > addr)
				2193	break;
				2194	*rmap_list = rmap_item->rmap_list;
				2195	remove_rmap_item_from_tree(rmap_item);
				2196	free_rmap_item(rmap_item);
				2197	}
				2198
				2199	rmap_item = alloc_rmap_item();
				2200	if (rmap_item) {
				2201	/* It has already been zeroed */
				2202	rmap_item->mm = mm_slot->mm;
				2203	rmap_item->address = addr;
				2204	rmap_item->rmap_list = *rmap_list;
				2205	*rmap_list = rmap_item;
				2206	}
				2207	return rmap_item;
				2208	}
				2209
				2210	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				2211	{
				2212	struct mm_struct *mm;
				2213	struct mm_slot *slot;
				2214	struct vm_area_struct *vma;
				2215	struct rmap_item *rmap_item;
				2216	int nid;
				2217
				2218	if (list_empty(&ksm_mm_head.mm_list))
				2219	return NULL;
				2220
				2221	slot = ksm_scan.mm_slot;
				2222	if (slot == &ksm_mm_head) {
				2223	/*
				2224	* A number of pages can hang around indefinitely on per-cpu
				2225	* pagevecs, raised page count preventing write_protect_page
				2226	* from merging them. Though it doesn't really matter much,
				2227	* it is puzzling to see some stuck in pages_volatile until
				2228	* other activity jostles them out, and they also prevented
				2229	* LTP's KSM test from succeeding deterministically; so drain
				2230	* them here (here rather than on entry to ksm_do_scan(),
				2231	* so we don't IPI too often when pages_to_scan is set low).
				2232	*/
				2233	lru_add_drain_all();
				2234
				2235	/*
				2236	* Whereas stale stable_nodes on the stable_tree itself
				2237	* get pruned in the regular course of stable_tree_search(),
				2238	* those moved out to the migrate_nodes list can accumulate:
				2239	* so prune them once before each full scan.
				2240	*/
				2241	if (!ksm_merge_across_nodes) {
				2242	struct stable_node stable_node, next;
				2243	struct page *page;
				2244
				2245	list_for_each_entry_safe(stable_node, next,
				2246	&migrate_nodes, list) {
				2247	page = get_ksm_page(stable_node, false);
				2248	if (page)
				2249	put_page(page);
				2250	cond_resched();
				2251	}
				2252	}
				2253
				2254	for (nid = 0; nid < ksm_nr_node_ids; nid++)
				2255	root_unstable_tree[nid] = RB_ROOT;
				2256
				2257	spin_lock(&ksm_mmlist_lock);
				2258	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				2259	ksm_scan.mm_slot = slot;
				2260	spin_unlock(&ksm_mmlist_lock);
				2261	/*
				2262	* Although we tested list_empty() above, a racing __ksm_exit
				2263	* of the last mm on the list may have removed it since then.
				2264	*/
				2265	if (slot == &ksm_mm_head)
				2266	return NULL;
				2267	next_mm:
				2268	ksm_scan.address = 0;
				2269	ksm_scan.rmap_list = &slot->rmap_list;
				2270	}
				2271
				2272	mm = slot->mm;
				2273	down_read(&mm->mmap_sem);
				2274	if (ksm_test_exit(mm))
				2275	vma = NULL;
				2276	else
				2277	vma = find_vma(mm, ksm_scan.address);
				2278
				2279	for (; vma; vma = vma->vm_next) {
				2280	if (!(vma->vm_flags & VM_MERGEABLE))
				2281	continue;
				2282	if (ksm_scan.address < vma->vm_start)
				2283	ksm_scan.address = vma->vm_start;
				2284	if (!vma->anon_vma)
				2285	ksm_scan.address = vma->vm_end;
				2286
				2287	while (ksm_scan.address < vma->vm_end) {
				2288	if (ksm_test_exit(mm))
				2289	break;
				2290	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
				2291	if (IS_ERR_OR_NULL(*page)) {
				2292	ksm_scan.address += PAGE_SIZE;
				2293	cond_resched();
				2294	continue;
				2295	}
				2296	if (PageAnon(*page)) {
				2297	flush_anon_page(vma, *page, ksm_scan.address);
				2298	flush_dcache_page(*page);
				2299	rmap_item = get_next_rmap_item(slot,
				2300	ksm_scan.rmap_list, ksm_scan.address);
				2301	if (rmap_item) {
				2302	ksm_scan.rmap_list =
				2303	&rmap_item->rmap_list;
				2304	ksm_scan.address += PAGE_SIZE;
				2305	} else
				2306	put_page(*page);
				2307	up_read(&mm->mmap_sem);
				2308	return rmap_item;
				2309	}
				2310	put_page(*page);
				2311	ksm_scan.address += PAGE_SIZE;
				2312	cond_resched();
				2313	}
				2314	}
				2315
				2316	if (ksm_test_exit(mm)) {
				2317	ksm_scan.address = 0;
				2318	ksm_scan.rmap_list = &slot->rmap_list;
				2319	}
				2320	/*
				2321	* Nuke all the rmap_items that are above this current rmap:
				2322	* because there were no VM_MERGEABLE vmas with such addresses.
				2323	*/
				2324	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
				2325
				2326	spin_lock(&ksm_mmlist_lock);
				2327	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
				2328	struct mm_slot, mm_list);
				2329	if (ksm_scan.address == 0) {
				2330	/*
				2331	* We've completed a full scan of all vmas, holding mmap_sem
				2332	* throughout, and found no VM_MERGEABLE: so do the same as
				2333	* __ksm_exit does to remove this mm from all our lists now.
				2334	* This applies either when cleaning up after __ksm_exit
				2335	* (but beware: we can reach here even before __ksm_exit),
				2336	* or when all VM_MERGEABLE areas have been unmapped (and
				2337	* mmap_sem then protects against race with MADV_MERGEABLE).
				2338	*/
				2339	hash_del(&slot->link);
				2340	list_del(&slot->mm_list);
				2341	spin_unlock(&ksm_mmlist_lock);
				2342
				2343	free_mm_slot(slot);
				2344	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				2345	up_read(&mm->mmap_sem);
				2346	mmdrop(mm);
				2347	} else {
				2348	up_read(&mm->mmap_sem);
				2349	/*
				2350	* up_read(&mm->mmap_sem) first because after
				2351	* spin_unlock(&ksm_mmlist_lock) run, the "mm" may
				2352	* already have been freed under us by __ksm_exit()
				2353	* because the "mm_slot" is still hashed and
				2354	* ksm_scan.mm_slot doesn't point to it anymore.
				2355	*/
				2356	spin_unlock(&ksm_mmlist_lock);
				2357	}
				2358
				2359	/* Repeat until we've completed scanning the whole list */
				2360	slot = ksm_scan.mm_slot;
				2361	if (slot != &ksm_mm_head)
				2362	goto next_mm;
				2363
				2364	ksm_scan.seqnr++;
				2365	return NULL;
				2366	}
				2367
				2368	/**
				2369	* ksm_do_scan - the ksm scanner main worker function.
				2370	* @scan_npages: number of pages we want to scan before we return.
				2371	*/
				2372	static void ksm_do_scan(unsigned int scan_npages)
				2373	{
				2374	struct rmap_item *rmap_item;
				2375	struct page *uninitialized_var(page);
				2376
				2377	while (scan_npages-- && likely(!freezing(current))) {
				2378	cond_resched();
				2379	rmap_item = scan_get_next_rmap_item(&page);
				2380	if (!rmap_item)
				2381	return;
				2382	cmp_and_merge_page(page, rmap_item);
				2383	put_page(page);
				2384	}
				2385	}
				2386
				2387	static int ksmd_should_run(void)
				2388	{
				2389	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
				2390	}
				2391
				2392	static int ksm_scan_thread(void *nothing)
				2393	{
				2394	set_freezable();
				2395	set_user_nice(current, 5);
				2396
				2397	while (!kthread_should_stop()) {
				2398	mutex_lock(&ksm_thread_mutex);
				2399	wait_while_offlining();
				2400	if (ksmd_should_run())
				2401	ksm_do_scan(ksm_thread_pages_to_scan);
				2402	mutex_unlock(&ksm_thread_mutex);
				2403
				2404	try_to_freeze();
				2405
				2406	if (ksmd_should_run()) {
				2407	schedule_timeout_interruptible(
				2408	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				2409	} else {
				2410	wait_event_freezable(ksm_thread_wait,
				2411	ksmd_should_run() \|\| kthread_should_stop());
				2412	}
				2413	}
				2414	return 0;
				2415	}
				2416
				2417	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				2418	unsigned long end, int advice, unsigned long *vm_flags)
				2419	{
				2420	struct mm_struct *mm = vma->vm_mm;
				2421	int err;
				2422
				2423	switch (advice) {
				2424	case MADV_MERGEABLE:
				2425	/*
				2426	* Be somewhat over-protective for now!
				2427	*/
				2428	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				2429	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
				2430	VM_HUGETLB \| VM_MIXEDMAP))
				2431	return 0; /* just ignore the advice */
				2432
				2433	if (vma_is_dax(vma))
				2434	return 0;
				2435
				2436	#ifdef VM_SAO
				2437	if (*vm_flags & VM_SAO)
				2438	return 0;
				2439	#endif
				2440	#ifdef VM_SPARC_ADI
				2441	if (*vm_flags & VM_SPARC_ADI)
				2442	return 0;
				2443	#endif
				2444
				2445	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
				2446	err = __ksm_enter(mm);
				2447	if (err)
				2448	return err;
				2449	}
				2450
				2451	*vm_flags \|= VM_MERGEABLE;
				2452	break;
				2453
				2454	case MADV_UNMERGEABLE:
				2455	if (!(*vm_flags & VM_MERGEABLE))
				2456	return 0; /* just ignore the advice */
				2457
				2458	if (vma->anon_vma) {
				2459	err = unmerge_ksm_pages(vma, start, end);
				2460	if (err)
				2461	return err;
				2462	}
				2463
				2464	*vm_flags &= ~VM_MERGEABLE;
				2465	break;
				2466	}
				2467
				2468	return 0;
				2469	}
				2470
				2471	int __ksm_enter(struct mm_struct *mm)
				2472	{
				2473	struct mm_slot *mm_slot;
				2474	int needs_wakeup;
				2475
				2476	mm_slot = alloc_mm_slot();
				2477	if (!mm_slot)
				2478	return -ENOMEM;
				2479
				2480	/* Check ksm_run too? Would need tighter locking */
				2481	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
				2482
				2483	spin_lock(&ksm_mmlist_lock);
				2484	insert_to_mm_slots_hash(mm, mm_slot);
				2485	/*
				2486	* When KSM_RUN_MERGE (or KSM_RUN_STOP),
				2487	* insert just behind the scanning cursor, to let the area settle
				2488	* down a little; when fork is followed by immediate exec, we don't
				2489	* want ksmd to waste time setting up and tearing down an rmap_list.
				2490	*
				2491	* But when KSM_RUN_UNMERGE, it's important to insert ahead of its
				2492	* scanning cursor, otherwise KSM pages in newly forked mms will be
				2493	* missed: then we might as well insert at the end of the list.
				2494	*/
				2495	if (ksm_run & KSM_RUN_UNMERGE)
				2496	list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
				2497	else
				2498	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
				2499	spin_unlock(&ksm_mmlist_lock);
				2500
				2501	set_bit(MMF_VM_MERGEABLE, &mm->flags);
				2502	mmgrab(mm);
				2503
				2504	if (needs_wakeup)
				2505	wake_up_interruptible(&ksm_thread_wait);
				2506
				2507	return 0;
				2508	}
				2509
				2510	void __ksm_exit(struct mm_struct *mm)
				2511	{
				2512	struct mm_slot *mm_slot;
				2513	int easy_to_free = 0;
				2514
				2515	/*
				2516	* This process is exiting: if it's straightforward (as is the
				2517	* case when ksmd was never running), free mm_slot immediately.
				2518	* But if it's at the cursor or has rmap_items linked to it, use
				2519	* mmap_sem to synchronize with any break_cows before pagetables
				2520	* are freed, and leave the mm_slot on the list for ksmd to free.
				2521	* Beware: ksm may already have noticed it exiting and freed the slot.
				2522	*/
				2523
				2524	spin_lock(&ksm_mmlist_lock);
				2525	mm_slot = get_mm_slot(mm);
				2526	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
				2527	if (!mm_slot->rmap_list) {
				2528	hash_del(&mm_slot->link);
				2529	list_del(&mm_slot->mm_list);
				2530	easy_to_free = 1;
				2531	} else {
				2532	list_move(&mm_slot->mm_list,
				2533	&ksm_scan.mm_slot->mm_list);
				2534	}
				2535	}
				2536	spin_unlock(&ksm_mmlist_lock);
				2537
				2538	if (easy_to_free) {
				2539	free_mm_slot(mm_slot);
				2540	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				2541	mmdrop(mm);
				2542	} else if (mm_slot) {
				2543	down_write(&mm->mmap_sem);
				2544	up_write(&mm->mmap_sem);
				2545	}
				2546	}
				2547
				2548	struct page ksm_might_need_to_copy(struct page page,
				2549	struct vm_area_struct *vma, unsigned long address)
				2550	{
				2551	struct anon_vma *anon_vma = page_anon_vma(page);
				2552	struct page *new_page;
				2553
				2554	if (PageKsm(page)) {
				2555	if (page_stable_node(page) &&
				2556	!(ksm_run & KSM_RUN_UNMERGE))
				2557	return page; /* no need to copy it */
				2558	} else if (!anon_vma) {
				2559	return page; /* no need to copy it */
				2560	} else if (anon_vma->root == vma->anon_vma->root &&
				2561	page->index == linear_page_index(vma, address)) {
				2562	return page; /* still no need to copy it */
				2563	}
				2564	if (!PageUptodate(page))
				2565	return page; /* let do_swap_page report the error */
				2566
				2567	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
				2568	if (new_page) {
				2569	copy_user_highpage(new_page, page, address, vma);
				2570
				2571	SetPageDirty(new_page);
				2572	__SetPageUptodate(new_page);
				2573	__SetPageLocked(new_page);
				2574	}
				2575
				2576	return new_page;
				2577	}
				2578
				2579	void rmap_walk_ksm(struct page page, struct rmap_walk_control rwc)
				2580	{
				2581	struct stable_node *stable_node;
				2582	struct rmap_item *rmap_item;
				2583	int search_new_forks = 0;
				2584
				2585	VM_BUG_ON_PAGE(!PageKsm(page), page);
				2586
				2587	/*
				2588	* Rely on the page lock to protect against concurrent modifications
				2589	* to that page's node of the stable tree.
				2590	*/
				2591	VM_BUG_ON_PAGE(!PageLocked(page), page);
				2592
				2593	stable_node = page_stable_node(page);
				2594	if (!stable_node)
				2595	return;
				2596	again:
				2597	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
				2598	struct anon_vma *anon_vma = rmap_item->anon_vma;
				2599	struct anon_vma_chain *vmac;
				2600	struct vm_area_struct *vma;
				2601
				2602	cond_resched();
				2603	anon_vma_lock_read(anon_vma);
				2604	anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
				2605	0, ULONG_MAX) {
				2606	unsigned long addr;
				2607
				2608	cond_resched();
				2609	vma = vmac->vma;
				2610
				2611	/* Ignore the stable/unstable/sqnr flags */
				2612	addr = rmap_item->address & ~KSM_FLAG_MASK;
				2613
				2614	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
				2615	continue;
				2616	/*
				2617	* Initially we examine only the vma which covers this
				2618	* rmap_item; but later, if there is still work to do,
				2619	* we examine covering vmas in other mms: in case they
				2620	* were forked from the original since ksmd passed.
				2621	*/
				2622	if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				2623	continue;
				2624
				2625	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				2626	continue;
				2627
				2628	if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
				2629	anon_vma_unlock_read(anon_vma);
				2630	return;
				2631	}
				2632	if (rwc->done && rwc->done(page)) {
				2633	anon_vma_unlock_read(anon_vma);
				2634	return;
				2635	}
				2636	}
				2637	anon_vma_unlock_read(anon_vma);
				2638	}
				2639	if (!search_new_forks++)
				2640	goto again;
				2641	}
				2642
				#ifdef CONFIG_MIGRATION
				/*
				 * ksm_migrate_page - retarget a stable_node after its page was migrated.
				 * @newpage: the page the ksm page has been migrated to; must be locked.
				 * @oldpage: the page it was migrated from; must be locked.
				 *
				 * If @newpage has a stable_node, the node's kpfn is updated to the pfn of
				 * @newpage and the stable_node of @oldpage is cleared.
				 */
				void ksm_migrate_page(struct page *newpage, struct page *oldpage)
				{
					struct stable_node *stable_node;

					VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
					VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
					VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

					stable_node = page_stable_node(newpage);
					if (stable_node) {
						VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
						stable_node->kpfn = page_to_pfn(newpage);
						/*
						 * newpage->mapping was set in advance; now we need smp_wmb()
						 * to make sure that the new stable_node->kpfn is visible
						 * to get_ksm_page() before it can see that oldpage->mapping
						 * has gone stale (or that PageSwapCache has been cleared).
						 */
						smp_wmb();
						set_page_stable_node(oldpage, NULL);
					}
				}
				#endif /* CONFIG_MIGRATION */
				2667
				2668	#ifdef CONFIG_MEMORY_HOTREMOVE
				2669	static void wait_while_offlining(void)
				2670	{
				2671	while (ksm_run & KSM_RUN_OFFLINE) {
				2672	mutex_unlock(&ksm_thread_mutex);
				2673	wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
				2674	TASK_UNINTERRUPTIBLE);
				2675	mutex_lock(&ksm_thread_mutex);
				2676	}
				2677	}
				2678
				2679	static bool stable_node_dup_remove_range(struct stable_node *stable_node,
				2680	unsigned long start_pfn,
				2681	unsigned long end_pfn)
				2682	{
				2683	if (stable_node->kpfn >= start_pfn &&
				2684	stable_node->kpfn < end_pfn) {
				2685	/*
				2686	* Don't get_ksm_page, page has already gone:
				2687	* which is why we keep kpfn instead of page*
				2688	*/
				2689	remove_node_from_stable_tree(stable_node);
				2690	return true;
				2691	}
				2692	return false;
				2693	}
				2694
				2695	static bool stable_node_chain_remove_range(struct stable_node *stable_node,
				2696	unsigned long start_pfn,
				2697	unsigned long end_pfn,
				2698	struct rb_root *root)
				2699	{
				2700	struct stable_node *dup;
				2701	struct hlist_node *hlist_safe;
				2702
				2703	if (!is_stable_node_chain(stable_node)) {
				2704	VM_BUG_ON(is_stable_node_dup(stable_node));
				2705	return stable_node_dup_remove_range(stable_node, start_pfn,
				2706	end_pfn);
				2707	}
				2708
				2709	hlist_for_each_entry_safe(dup, hlist_safe,
				2710	&stable_node->hlist, hlist_dup) {
				2711	VM_BUG_ON(!is_stable_node_dup(dup));
				2712	stable_node_dup_remove_range(dup, start_pfn, end_pfn);
				2713	}
				2714	if (hlist_empty(&stable_node->hlist)) {
				2715	free_stable_node_chain(stable_node, root);
				2716	return true; /* notify caller that tree was rebalanced */
				2717	} else
				2718	return false;
				2719	}
				2720
				2721	static void ksm_check_stable_tree(unsigned long start_pfn,
				2722	unsigned long end_pfn)
				2723	{
				2724	struct stable_node stable_node, next;
				2725	struct rb_node *node;
				2726	int nid;
				2727
				2728	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
				2729	node = rb_first(root_stable_tree + nid);
				2730	while (node) {
				2731	stable_node = rb_entry(node, struct stable_node, node);
				2732	if (stable_node_chain_remove_range(stable_node,
				2733	start_pfn, end_pfn,
				2734	root_stable_tree +
				2735	nid))
				2736	node = rb_first(root_stable_tree + nid);
				2737	else
				2738	node = rb_next(node);
				2739	cond_resched();
				2740	}
				2741	}
				2742	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
				2743	if (stable_node->kpfn >= start_pfn &&
				2744	stable_node->kpfn < end_pfn)
				2745	remove_node_from_stable_tree(stable_node);
				2746	cond_resched();
				2747	}
				2748	}
				2749
				2750	static int ksm_memory_callback(struct notifier_block *self,
				2751	unsigned long action, void *arg)
				2752	{
				2753	struct memory_notify *mn = arg;
				2754
				2755	switch (action) {
				2756	case MEM_GOING_OFFLINE:
				2757	/*
				2758	* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
				2759	* and remove_all_stable_nodes() while memory is going offline:
				2760	* it is unsafe for them to touch the stable tree at this time.
				2761	* But unmerge_ksm_pages(), rmap lookups and other entry points
				2762	* which do not need the ksm_thread_mutex are all safe.
				2763	*/
				2764	mutex_lock(&ksm_thread_mutex);
				2765	ksm_run \|= KSM_RUN_OFFLINE;
				2766	mutex_unlock(&ksm_thread_mutex);
				2767	break;
				2768
				2769	case MEM_OFFLINE:
				2770	/*
				2771	* Most of the work is done by page migration; but there might
				2772	* be a few stable_nodes left over, still pointing to struct
				2773	* pages which have been offlined: prune those from the tree,
				2774	* otherwise get_ksm_page() might later try to access a
				2775	* non-existent struct page.
				2776	*/
				2777	ksm_check_stable_tree(mn->start_pfn,
				2778	mn->start_pfn + mn->nr_pages);
				2779	/* fallthrough */
				2780
				2781	case MEM_CANCEL_OFFLINE:
				2782	mutex_lock(&ksm_thread_mutex);
				2783	ksm_run &= ~KSM_RUN_OFFLINE;
				2784	mutex_unlock(&ksm_thread_mutex);
				2785
				2786	smp_mb(); /* wake_up_bit advises this */
				2787	wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
				2788	break;
				2789	}
				2790	return NOTIFY_OK;
				2791	}
				2792	#else
				/* Without CONFIG_MEMORY_HOTREMOVE there is nothing to wait for. */
				static void wait_while_offlining(void)
				{
				}
				2796	#endif /* CONFIG_MEMORY_HOTREMOVE */
				2797
				2798	#ifdef CONFIG_SYSFS
				2799	/*
				2800	* This all compiles without CONFIG_SYSFS, but is a waste of space.
				2801	*/
				2802
				/*
				 * KSM_ATTR_RO(name): define name##_attr as a read-only kobj_attribute
				 * (via __ATTR_RO, which expects a name##_show function).
				 * KSM_ATTR(name): define name##_attr with mode 0644, wired to
				 * name##_show and name##_store.
				 */
				#define KSM_ATTR_RO(_name) \
					static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
				#define KSM_ATTR(_name) \
					static struct kobj_attribute _name##_attr = \
						__ATTR(_name, 0644, _name##_show, _name##_store)
				2808
				2809	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				2810	struct kobj_attribute attr, char buf)
				2811	{
				2812	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				2813	}
				2814
				2815	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				2816	struct kobj_attribute *attr,
				2817	const char *buf, size_t count)
				2818	{
				2819	unsigned long msecs;
				2820	int err;
				2821
				2822	err = kstrtoul(buf, 10, &msecs);
				2823	if (err \|\| msecs > UINT_MAX)
				2824	return -EINVAL;
				2825
				2826	ksm_thread_sleep_millisecs = msecs;
				2827
				2828	return count;
				2829	}
				2830	KSM_ATTR(sleep_millisecs);
				2831
				2832	static ssize_t pages_to_scan_show(struct kobject *kobj,
				2833	struct kobj_attribute attr, char buf)
				2834	{
				2835	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				2836	}
				2837
				2838	static ssize_t pages_to_scan_store(struct kobject *kobj,
				2839	struct kobj_attribute *attr,
				2840	const char *buf, size_t count)
				2841	{
				2842	int err;
				2843	unsigned long nr_pages;
				2844
				2845	err = kstrtoul(buf, 10, &nr_pages);
				2846	if (err \|\| nr_pages > UINT_MAX)
				2847	return -EINVAL;
				2848
				2849	ksm_thread_pages_to_scan = nr_pages;
				2850
				2851	return count;
				2852	}
				2853	KSM_ATTR(pages_to_scan);
				2854
				2855	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				2856	char *buf)
				2857	{
				2858	return sprintf(buf, "%lu\n", ksm_run);
				2859	}
				2860
				2861	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				2862	const char *buf, size_t count)
				2863	{
				2864	int err;
				2865	unsigned long flags;
				2866
				2867	err = kstrtoul(buf, 10, &flags);
				2868	if (err \|\| flags > UINT_MAX)
				2869	return -EINVAL;
				2870	if (flags > KSM_RUN_UNMERGE)
				2871	return -EINVAL;
				2872
				2873	/*
				2874	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				2875	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
				2876	* breaking COW to free the pages_shared (but leaves mm_slots
				2877	* on the list for when ksmd may be set running again).
				2878	*/
				2879
				2880	mutex_lock(&ksm_thread_mutex);
				2881	wait_while_offlining();
				2882	if (ksm_run != flags) {
				2883	ksm_run = flags;
				2884	if (flags & KSM_RUN_UNMERGE) {
				2885	set_current_oom_origin();
				2886	err = unmerge_and_remove_all_rmap_items();
				2887	clear_current_oom_origin();
				2888	if (err) {
				2889	ksm_run = KSM_RUN_STOP;
				2890	count = err;
				2891	}
				2892	}
				2893	}
				2894	mutex_unlock(&ksm_thread_mutex);
				2895
				2896	if (flags & KSM_RUN_MERGE)
				2897	wake_up_interruptible(&ksm_thread_wait);
				2898
				2899	return count;
				2900	}
				2901	KSM_ATTR(run);
				2902
				2903	#ifdef CONFIG_NUMA
				2904	static ssize_t merge_across_nodes_show(struct kobject *kobj,
				2905	struct kobj_attribute attr, char buf)
				2906	{
				2907	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
				2908	}
				2909
				2910	static ssize_t merge_across_nodes_store(struct kobject *kobj,
				2911	struct kobj_attribute *attr,
				2912	const char *buf, size_t count)
				2913	{
				2914	int err;
				2915	unsigned long knob;
				2916
				2917	err = kstrtoul(buf, 10, &knob);
				2918	if (err)
				2919	return err;
				2920	if (knob > 1)
				2921	return -EINVAL;
				2922
				2923	mutex_lock(&ksm_thread_mutex);
				2924	wait_while_offlining();
				2925	if (ksm_merge_across_nodes != knob) {
				2926	if (ksm_pages_shared \|\| remove_all_stable_nodes())
				2927	err = -EBUSY;
				2928	else if (root_stable_tree == one_stable_tree) {
				2929	struct rb_root *buf;
				2930	/*
				2931	* This is the first time that we switch away from the
				2932	* default of merging across nodes: must now allocate
				2933	* a buffer to hold as many roots as may be needed.
				2934	* Allocate stable and unstable together:
				2935	* MAXSMP NODES_SHIFT 10 will use 16kB.
				2936	*/
				2937	buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				2938	GFP_KERNEL);
				2939	/* Let us assume that RB_ROOT is NULL is zero */
				2940	if (!buf)
				2941	err = -ENOMEM;
				2942	else {
				2943	root_stable_tree = buf;
				2944	root_unstable_tree = buf + nr_node_ids;
				2945	/* Stable tree is empty but not the unstable */
				2946	root_unstable_tree[0] = one_unstable_tree[0];
				2947	}
				2948	}
				2949	if (!err) {
				2950	ksm_merge_across_nodes = knob;
				2951	ksm_nr_node_ids = knob ? 1 : nr_node_ids;
				2952	}
				2953	}
				2954	mutex_unlock(&ksm_thread_mutex);
				2955
				2956	return err ? err : count;
				2957	}
				2958	KSM_ATTR(merge_across_nodes);
				2959	#endif
				2960
				2961	static ssize_t use_zero_pages_show(struct kobject *kobj,
				2962	struct kobj_attribute attr, char buf)
				2963	{
				2964	return sprintf(buf, "%u\n", ksm_use_zero_pages);
				2965	}
				2966	static ssize_t use_zero_pages_store(struct kobject *kobj,
				2967	struct kobj_attribute *attr,
				2968	const char *buf, size_t count)
				2969	{
				2970	int err;
				2971	bool value;
				2972
				2973	err = kstrtobool(buf, &value);
				2974	if (err)
				2975	return -EINVAL;
				2976
				2977	ksm_use_zero_pages = value;
				2978
				2979	return count;
				2980	}
				2981	KSM_ATTR(use_zero_pages);
				2982
				2983	static ssize_t max_page_sharing_show(struct kobject *kobj,
				2984	struct kobj_attribute attr, char buf)
				2985	{
				2986	return sprintf(buf, "%u\n", ksm_max_page_sharing);
				2987	}
				2988
				2989	static ssize_t max_page_sharing_store(struct kobject *kobj,
				2990	struct kobj_attribute *attr,
				2991	const char *buf, size_t count)
				2992	{
				2993	int err;
				2994	int knob;
				2995
				2996	err = kstrtoint(buf, 10, &knob);
				2997	if (err)
				2998	return err;
				2999	/*
				3000	* When a KSM page is created it is shared by 2 mappings. This
				3001	* being a signed comparison, it implicitly verifies it's not
				3002	* negative.
				3003	*/
				3004	if (knob < 2)
				3005	return -EINVAL;
				3006
				3007	if (READ_ONCE(ksm_max_page_sharing) == knob)
				3008	return count;
				3009
				3010	mutex_lock(&ksm_thread_mutex);
				3011	wait_while_offlining();
				3012	if (ksm_max_page_sharing != knob) {
				3013	if (ksm_pages_shared \|\| remove_all_stable_nodes())
				3014	err = -EBUSY;
				3015	else
				3016	ksm_max_page_sharing = knob;
				3017	}
				3018	mutex_unlock(&ksm_thread_mutex);
				3019
				3020	return err ? err : count;
				3021	}
				3022	KSM_ATTR(max_page_sharing);
				3023
				3024	static ssize_t pages_shared_show(struct kobject *kobj,
				3025	struct kobj_attribute attr, char buf)
				3026	{
				3027	return sprintf(buf, "%lu\n", ksm_pages_shared);
				3028	}
				3029	KSM_ATTR_RO(pages_shared);
				3030
				3031	static ssize_t pages_sharing_show(struct kobject *kobj,
				3032	struct kobj_attribute attr, char buf)
				3033	{
				3034	return sprintf(buf, "%lu\n", ksm_pages_sharing);
				3035	}
				3036	KSM_ATTR_RO(pages_sharing);
				3037
				3038	static ssize_t pages_unshared_show(struct kobject *kobj,
				3039	struct kobj_attribute attr, char buf)
				3040	{
				3041	return sprintf(buf, "%lu\n", ksm_pages_unshared);
				3042	}
				3043	KSM_ATTR_RO(pages_unshared);
				3044
				3045	static ssize_t pages_volatile_show(struct kobject *kobj,
				3046	struct kobj_attribute attr, char buf)
				3047	{
				3048	long ksm_pages_volatile;
				3049
				3050	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				3051	- ksm_pages_sharing - ksm_pages_unshared;
				3052	/*
				3053	* It was not worth any locking to calculate that statistic,
				3054	* but it might therefore sometimes be negative: conceal that.
				3055	*/
				3056	if (ksm_pages_volatile < 0)
				3057	ksm_pages_volatile = 0;
				3058	return sprintf(buf, "%ld\n", ksm_pages_volatile);
				3059	}
				3060	KSM_ATTR_RO(pages_volatile);
				3061
				3062	static ssize_t stable_node_dups_show(struct kobject *kobj,
				3063	struct kobj_attribute attr, char buf)
				3064	{
				3065	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
				3066	}
				3067	KSM_ATTR_RO(stable_node_dups);
				3068
				3069	static ssize_t stable_node_chains_show(struct kobject *kobj,
				3070	struct kobj_attribute attr, char buf)
				3071	{
				3072	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
				3073	}
				3074	KSM_ATTR_RO(stable_node_chains);
				3075
				3076	static ssize_t
				3077	stable_node_chains_prune_millisecs_show(struct kobject *kobj,
				3078	struct kobj_attribute *attr,
				3079	char *buf)
				3080	{
				3081	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
				3082	}
				3083
				3084	static ssize_t
				3085	stable_node_chains_prune_millisecs_store(struct kobject *kobj,
				3086	struct kobj_attribute *attr,
				3087	const char *buf, size_t count)
				3088	{
				3089	unsigned long msecs;
				3090	int err;
				3091
				3092	err = kstrtoul(buf, 10, &msecs);
				3093	if (err \|\| msecs > UINT_MAX)
				3094	return -EINVAL;
				3095
				3096	ksm_stable_node_chains_prune_millisecs = msecs;
				3097
				3098	return count;
				3099	}
				3100	KSM_ATTR(stable_node_chains_prune_millisecs);
				3101
				3102	static ssize_t full_scans_show(struct kobject *kobj,
				3103	struct kobj_attribute attr, char buf)
				3104	{
				3105	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
				3106	}
				3107	KSM_ATTR_RO(full_scans);
				3108
				3109	static struct attribute *ksm_attrs[] = {
				3110	&sleep_millisecs_attr.attr,
				3111	&pages_to_scan_attr.attr,
				3112	&run_attr.attr,
				3113	&pages_shared_attr.attr,
				3114	&pages_sharing_attr.attr,
				3115	&pages_unshared_attr.attr,
				3116	&pages_volatile_attr.attr,
				3117	&full_scans_attr.attr,
				3118	#ifdef CONFIG_NUMA
				3119	&merge_across_nodes_attr.attr,
				3120	#endif
				3121	&max_page_sharing_attr.attr,
				3122	&stable_node_chains_attr.attr,
				3123	&stable_node_dups_attr.attr,
				3124	&stable_node_chains_prune_millisecs_attr.attr,
				3125	&use_zero_pages_attr.attr,
				3126	NULL,
				3127	};
				3128
				3129	static const struct attribute_group ksm_attr_group = {
				3130	.attrs = ksm_attrs,
				3131	.name = "ksm",
				3132	};
				3133	#endif /* CONFIG_SYSFS */
				3134
				3135	static int __init ksm_init(void)
				3136	{
				3137	struct task_struct *ksm_thread;
				3138	int err;
				3139
				3140	/* The correct value depends on page size and endianness */
				3141	zero_checksum = calc_checksum(ZERO_PAGE(0));
				3142	/* Default to false for backwards compatibility */
				3143	ksm_use_zero_pages = false;
				3144
				3145	err = ksm_slab_init();
				3146	if (err)
				3147	goto out;
				3148
				3149	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				3150	if (IS_ERR(ksm_thread)) {
				3151	pr_err("ksm: creating kthread failed\n");
				3152	err = PTR_ERR(ksm_thread);
				3153	goto out_free;
				3154	}
				3155
				3156	#ifdef CONFIG_SYSFS
				3157	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				3158	if (err) {
				3159	pr_err("ksm: register sysfs failed\n");
				3160	kthread_stop(ksm_thread);
				3161	goto out_free;
				3162	}
				3163	#else
				3164	ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
				3165
				3166	#endif /* CONFIG_SYSFS */
				3167
				3168	#ifdef CONFIG_MEMORY_HOTREMOVE
				3169	/* There is no significance to this priority 100 */
				3170	hotplug_memory_notifier(ksm_memory_callback, 100);
				3171	#endif
				3172	return 0;
				3173
				3174	out_free:
				3175	ksm_slab_free();
				3176	out:
				3177	return err;
				3178	}
				3179	subsys_initcall(ksm_init);