Blame - marvell/linux/mm/hugetlb.c - T108

blob: e9d9a8c113a4ce3c8433c321f0d11c2a8c0844bc [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* Generic hugetlb support.
				4	* (C) Nadia Yvette Chambers, April 2004
				5	*/
				6	#include <linux/list.h>
				7	#include <linux/init.h>
				8	#include <linux/mm.h>
				9	#include <linux/seq_file.h>
				10	#include <linux/sysctl.h>
				11	#include <linux/highmem.h>
				12	#include <linux/mmu_notifier.h>
				13	#include <linux/nodemask.h>
				14	#include <linux/pagemap.h>
				15	#include <linux/mempolicy.h>
				16	#include <linux/compiler.h>
				17	#include <linux/cpuset.h>
				18	#include <linux/mutex.h>
				19	#include <linux/memblock.h>
				20	#include <linux/sysfs.h>
				21	#include <linux/slab.h>
				22	#include <linux/mmdebug.h>
				23	#include <linux/sched/signal.h>
				24	#include <linux/rmap.h>
				25	#include <linux/string_helpers.h>
				26	#include <linux/swap.h>
				27	#include <linux/swapops.h>
				28	#include <linux/jhash.h>
				29	#include <linux/numa.h>
				30	#include <linux/llist.h>
				31
				32	#include <asm/page.h>
				33	#include <asm/pgtable.h>
				34	#include <asm/tlb.h>
				35
				36	#include <linux/io.h>
				37	#include <linux/hugetlb.h>
				38	#include <linux/hugetlb_cgroup.h>
				39	#include <linux/node.h>
				40	#include <linux/page_owner.h>
				41	#include "internal.h"
				42
/*
 * File-scope state for the hugetlb pool.  hstates[] holds one descriptor
 * per supported huge page size (at most HUGE_MAX_HSTATE of them).
 * NOTE(review): hugetlb_max_hstate and default_hstate_idx look like the
 * number of populated entries and the index of the default entry in
 * hstates[] -- confirm against their users.
 */
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

/* List head placed in init data (__initdata). */
__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
				72
				73	static inline bool PageHugeFreed(struct page *head)
				74	{
				75	return page_private(head + 4) == -1UL;
				76	}
				77
				78	static inline void SetPageHugeFreed(struct page *head)
				79	{
				80	set_page_private(head + 4, -1UL);
				81	}
				82
				83	static inline void ClearPageHugeFreed(struct page *head)
				84	{
				85	set_page_private(head + 4, 0);
				86	}
				87
				88	/* Forward declaration */
				89	static int hugetlb_acct_memory(struct hstate *h, long delta);
				90
				91	static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
				92	{
				93	bool free = (spool->count == 0) && (spool->used_hpages == 0);
				94
				95	spin_unlock(&spool->lock);
				96
				97	/* If no pages are used, and no other handles to the subpool
				98	* remain, give up any reservations mased on minimum size and
				99	* free the subpool */
				100	if (free) {
				101	if (spool->min_hpages != -1)
				102	hugetlb_acct_memory(spool->hstate,
				103	-spool->min_hpages);
				104	kfree(spool);
				105	}
				106	}
				107
				108	struct hugepage_subpool hugepage_new_subpool(struct hstate h, long max_hpages,
				109	long min_hpages)
				110	{
				111	struct hugepage_subpool *spool;
				112
				113	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
				114	if (!spool)
				115	return NULL;
				116
				117	spin_lock_init(&spool->lock);
				118	spool->count = 1;
				119	spool->max_hpages = max_hpages;
				120	spool->hstate = h;
				121	spool->min_hpages = min_hpages;
				122
				123	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
				124	kfree(spool);
				125	return NULL;
				126	}
				127	spool->rsv_hpages = min_hpages;
				128
				129	return spool;
				130	}
				131
				132	void hugepage_put_subpool(struct hugepage_subpool *spool)
				133	{
				134	spin_lock(&spool->lock);
				135	BUG_ON(!spool->count);
				136	spool->count--;
				137	unlock_or_release_subpool(spool);
				138	}
				139
				140	/*
				141	* Subpool accounting for allocating and reserving pages.
				142	* Return -ENOMEM if there are not enough resources to satisfy the
				143	* the request. Otherwise, return the number of pages by which the
				144	* global pools must be adjusted (upward). The returned value may
				145	* only be different than the passed value (delta) in the case where
				146	* a subpool minimum size must be manitained.
				147	*/
				148	static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				149	long delta)
				150	{
				151	long ret = delta;
				152
				153	if (!spool)
				154	return ret;
				155
				156	spin_lock(&spool->lock);
				157
				158	if (spool->max_hpages != -1) { /* maximum size accounting */
				159	if ((spool->used_hpages + delta) <= spool->max_hpages)
				160	spool->used_hpages += delta;
				161	else {
				162	ret = -ENOMEM;
				163	goto unlock_ret;
				164	}
				165	}
				166
				167	/* minimum size accounting */
				168	if (spool->min_hpages != -1 && spool->rsv_hpages) {
				169	if (delta > spool->rsv_hpages) {
				170	/*
				171	* Asking for more reserves than those already taken on
				172	* behalf of subpool. Return difference.
				173	*/
				174	ret = delta - spool->rsv_hpages;
				175	spool->rsv_hpages = 0;
				176	} else {
				177	ret = 0; /* reserves already accounted for */
				178	spool->rsv_hpages -= delta;
				179	}
				180	}
				181
				182	unlock_ret:
				183	spin_unlock(&spool->lock);
				184	return ret;
				185	}
				186
				187	/*
				188	* Subpool accounting for freeing and unreserving pages.
				189	* Return the number of global page reservations that must be dropped.
				190	* The return value may only be different than the passed value (delta)
				191	* in the case where a subpool minimum size must be maintained.
				192	*/
				193	static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				194	long delta)
				195	{
				196	long ret = delta;
				197
				198	if (!spool)
				199	return delta;
				200
				201	spin_lock(&spool->lock);
				202
				203	if (spool->max_hpages != -1) /* maximum size accounting */
				204	spool->used_hpages -= delta;
				205
				206	/* minimum size accounting */
				207	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
				208	if (spool->rsv_hpages + delta <= spool->min_hpages)
				209	ret = 0;
				210	else
				211	ret = spool->rsv_hpages + delta - spool->min_hpages;
				212
				213	spool->rsv_hpages += delta;
				214	if (spool->rsv_hpages > spool->min_hpages)
				215	spool->rsv_hpages = spool->min_hpages;
				216	}
				217
				218	/*
				219	* If hugetlbfs_put_super couldn't free spool due to an outstanding
				220	* quota reference, free it now.
				221	*/
				222	unlock_or_release_subpool(spool);
				223
				224	return ret;
				225	}
				226
				227	static inline struct hugepage_subpool subpool_inode(struct inode inode)
				228	{
				229	return HUGETLBFS_SB(inode->i_sb)->spool;
				230	}
				231
				232	static inline struct hugepage_subpool subpool_vma(struct vm_area_struct vma)
				233	{
				234	return subpool_inode(file_inode(vma->vm_file));
				235	}
				236
				237	/*
				238	* Region tracking -- allows tracking of reservations and instantiated pages
				239	* across the pages in a mapping.
				240	*
				241	* The region data structures are embedded into a resv_map and protected
				242	* by a resv_map's lock. The set of regions within the resv_map represent
				243	* reservations for huge pages, or huge pages that have already been
				244	* instantiated within the map. The from and to elements are huge page
				245	* indices into the associated mapping. from indicates the starting index
				246	* of the region. to represents the first index past the end of the region.
				247	*
				248	* For example, a file region structure with from == 0 and to == 4 represents
				249	* four huge pages in a mapping. It is important to note that the to element
				250	* represents the first element past the end of the region. This is used in
				251	* arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
				252	*
				253	* Interval notation of the form [from, to) will be used to indicate that
				254	* the endpoint from is inclusive and to is exclusive.
				255	*/
/* One half-open range [from, to) of huge page indices in a resv_map. */
struct file_region {
	struct list_head link;	/* entry in a resv_map's regions or region_cache list */
	long from;		/* first huge page index covered (inclusive) */
	long to;		/* first huge page index past the end (exclusive) */
};
				261
				262	/*
				263	* Add the huge page range represented by [f, t) to the reserve
				264	* map. In the normal case, existing regions will be expanded
				265	* to accommodate the specified range. Sufficient regions should
				266	* exist for expansion due to the previous call to region_chg
				267	* with the same range. However, it is possible that region_del
				268	* could have been called after region_chg and modifed the map
				269	* in such a way that no region exists to be expanded. In this
				270	* case, pull a region descriptor from the cache associated with
				271	* the map and use that for the new range.
				272	*
				273	* Return the number of new huge pages added to the map. This
				274	* number is greater than or equal to zero.
				275	*/
				276	static long region_add(struct resv_map *resv, long f, long t)
				277	{
				278	struct list_head *head = &resv->regions;
				279	struct file_region rg, nrg, *trg;
				280	long add = 0;
				281
				282	spin_lock(&resv->lock);
				283	/* Locate the region we are either in or before. */
				284	list_for_each_entry(rg, head, link)
				285	if (f <= rg->to)
				286	break;
				287
				288	/*
				289	* If no region exists which can be expanded to include the
				290	* specified range, the list must have been modified by an
				291	* interleving call to region_del(). Pull a region descriptor
				292	* from the cache and use it for this range.
				293	*/
				294	if (&rg->link == head \|\| t < rg->from) {
				295	VM_BUG_ON(resv->region_cache_count <= 0);
				296
				297	resv->region_cache_count--;
				298	nrg = list_first_entry(&resv->region_cache, struct file_region,
				299	link);
				300	list_del(&nrg->link);
				301
				302	nrg->from = f;
				303	nrg->to = t;
				304	list_add(&nrg->link, rg->link.prev);
				305
				306	add += t - f;
				307	goto out_locked;
				308	}
				309
				310	/* Round our left edge to the current segment if it encloses us. */
				311	if (f > rg->from)
				312	f = rg->from;
				313
				314	/* Check for and consume any regions we now overlap with. */
				315	nrg = rg;
				316	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
				317	if (&rg->link == head)
				318	break;
				319	if (rg->from > t)
				320	break;
				321
				322	/* If this area reaches higher then extend our area to
				323	* include it completely. If this is not the first area
				324	* which we intend to reuse, free it. */
				325	if (rg->to > t)
				326	t = rg->to;
				327	if (rg != nrg) {
				328	/* Decrement return value by the deleted range.
				329	* Another range will span this area so that by
				330	* end of routine add will be >= zero
				331	*/
				332	add -= (rg->to - rg->from);
				333	list_del(&rg->link);
				334	kfree(rg);
				335	}
				336	}
				337
				338	add += (nrg->from - f); /* Added to beginning of region */
				339	nrg->from = f;
				340	add += t - nrg->to; /* Added to end of region */
				341	nrg->to = t;
				342
				343	out_locked:
				344	resv->adds_in_progress--;
				345	spin_unlock(&resv->lock);
				346	VM_BUG_ON(add < 0);
				347	return add;
				348	}
				349
				350	/*
				351	* Examine the existing reserve map and determine how many
				352	* huge pages in the specified range [f, t) are NOT currently
				353	* represented. This routine is called before a subsequent
				354	* call to region_add that will actually modify the reserve
				355	* map to add the specified range [f, t). region_chg does
				356	* not change the number of huge pages represented by the
				357	* map. However, if the existing regions in the map can not
				358	* be expanded to represent the new range, a new file_region
				359	* structure is added to the map as a placeholder. This is
				360	* so that the subsequent region_add call will have all the
				361	* regions it needs and will not fail.
				362	*
				363	* Upon entry, region_chg will also examine the cache of region descriptors
				364	* associated with the map. If there are not enough descriptors cached, one
				365	* will be allocated for the in progress add operation.
				366	*
				367	* Returns the number of huge pages that need to be added to the existing
				368	* reservation map for the range [f, t). This number is greater or equal to
				369	* zero. -ENOMEM is returned if a new file_region structure or cache entry
				370	* is needed and can not be allocated.
				371	*/
				372	static long region_chg(struct resv_map *resv, long f, long t)
				373	{
				374	struct list_head *head = &resv->regions;
				375	struct file_region rg, nrg = NULL;
				376	long chg = 0;
				377
				378	retry:
				379	spin_lock(&resv->lock);
				380	retry_locked:
				381	resv->adds_in_progress++;
				382
				383	/*
				384	* Check for sufficient descriptors in the cache to accommodate
				385	* the number of in progress add operations.
				386	*/
				387	if (resv->adds_in_progress > resv->region_cache_count) {
				388	struct file_region *trg;
				389
				390	VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
				391	/* Must drop lock to allocate a new descriptor. */
				392	resv->adds_in_progress--;
				393	spin_unlock(&resv->lock);
				394
				395	trg = kmalloc(sizeof(*trg), GFP_KERNEL);
				396	if (!trg) {
				397	kfree(nrg);
				398	return -ENOMEM;
				399	}
				400
				401	spin_lock(&resv->lock);
				402	list_add(&trg->link, &resv->region_cache);
				403	resv->region_cache_count++;
				404	goto retry_locked;
				405	}
				406
				407	/* Locate the region we are before or in. */
				408	list_for_each_entry(rg, head, link)
				409	if (f <= rg->to)
				410	break;
				411
				412	/* If we are below the current region then a new region is required.
				413	* Subtle, allocate a new region at the position but make it zero
				414	* size such that we can guarantee to record the reservation. */
				415	if (&rg->link == head \|\| t < rg->from) {
				416	if (!nrg) {
				417	resv->adds_in_progress--;
				418	spin_unlock(&resv->lock);
				419	nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				420	if (!nrg)
				421	return -ENOMEM;
				422
				423	nrg->from = f;
				424	nrg->to = f;
				425	INIT_LIST_HEAD(&nrg->link);
				426	goto retry;
				427	}
				428
				429	list_add(&nrg->link, rg->link.prev);
				430	chg = t - f;
				431	goto out_nrg;
				432	}
				433
				434	/* Round our left edge to the current segment if it encloses us. */
				435	if (f > rg->from)
				436	f = rg->from;
				437	chg = t - f;
				438
				439	/* Check for and consume any regions we now overlap with. */
				440	list_for_each_entry(rg, rg->link.prev, link) {
				441	if (&rg->link == head)
				442	break;
				443	if (rg->from > t)
				444	goto out;
				445
				446	/* We overlap with this area, if it extends further than
				447	* us then we must extend ourselves. Account for its
				448	* existing reservation. */
				449	if (rg->to > t) {
				450	chg += rg->to - t;
				451	t = rg->to;
				452	}
				453	chg -= rg->to - rg->from;
				454	}
				455
				456	out:
				457	spin_unlock(&resv->lock);
				458	/* We already know we raced and no longer need the new region */
				459	kfree(nrg);
				460	return chg;
				461	out_nrg:
				462	spin_unlock(&resv->lock);
				463	return chg;
				464	}
				465
				466	/*
				467	* Abort the in progress add operation. The adds_in_progress field
				468	* of the resv_map keeps track of the operations in progress between
				469	* calls to region_chg and region_add. Operations are sometimes
				470	* aborted after the call to region_chg. In such cases, region_abort
				471	* is called to decrement the adds_in_progress counter.
				472	*
				473	* NOTE: The range arguments [f, t) are not needed or used in this
				474	* routine. They are kept to make reading the calling code easier as
				475	* arguments will match the associated region_chg call.
				476	*/
				477	static void region_abort(struct resv_map *resv, long f, long t)
				478	{
				479	spin_lock(&resv->lock);
				480	VM_BUG_ON(!resv->region_cache_count);
				481	resv->adds_in_progress--;
				482	spin_unlock(&resv->lock);
				483	}
				484
				485	/*
				486	* Delete the specified range [f, t) from the reserve map. If the
				487	* t parameter is LONG_MAX, this indicates that ALL regions after f
				488	* should be deleted. Locate the regions which intersect [f, t)
				489	* and either trim, delete or split the existing regions.
				490	*
				491	* Returns the number of huge pages deleted from the reserve map.
				492	* In the normal case, the return value is zero or more. In the
				493	* case where a region must be split, a new region descriptor must
				494	* be allocated. If the allocation fails, -ENOMEM will be returned.
				495	* NOTE: If the parameter t == LONG_MAX, then we will never split
				496	* a region and possibly return -ENOMEM. Callers specifying
				497	* t == LONG_MAX do not need to check for -ENOMEM error.
				498	*/
				499	static long region_del(struct resv_map *resv, long f, long t)
				500	{
				501	struct list_head *head = &resv->regions;
				502	struct file_region rg, trg;
				503	struct file_region *nrg = NULL;
				504	long del = 0;
				505
				506	retry:
				507	spin_lock(&resv->lock);
				508	list_for_each_entry_safe(rg, trg, head, link) {
				509	/*
				510	* Skip regions before the range to be deleted. file_region
				511	* ranges are normally of the form [from, to). However, there
				512	* may be a "placeholder" entry in the map which is of the form
				513	* (from, to) with from == to. Check for placeholder entries
				514	* at the beginning of the range to be deleted.
				515	*/
				516	if (rg->to <= f && (rg->to != rg->from \|\| rg->to != f))
				517	continue;
				518
				519	if (rg->from >= t)
				520	break;
				521
				522	if (f > rg->from && t < rg->to) { /* Must split region */
				523	/*
				524	* Check for an entry in the cache before dropping
				525	* lock and attempting allocation.
				526	*/
				527	if (!nrg &&
				528	resv->region_cache_count > resv->adds_in_progress) {
				529	nrg = list_first_entry(&resv->region_cache,
				530	struct file_region,
				531	link);
				532	list_del(&nrg->link);
				533	resv->region_cache_count--;
				534	}
				535
				536	if (!nrg) {
				537	spin_unlock(&resv->lock);
				538	nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				539	if (!nrg)
				540	return -ENOMEM;
				541	goto retry;
				542	}
				543
				544	del += t - f;
				545
				546	/* New entry for end of split region */
				547	nrg->from = t;
				548	nrg->to = rg->to;
				549	INIT_LIST_HEAD(&nrg->link);
				550
				551	/* Original entry is trimmed */
				552	rg->to = f;
				553
				554	list_add(&nrg->link, &rg->link);
				555	nrg = NULL;
				556	break;
				557	}
				558
				559	if (f <= rg->from && t >= rg->to) { /* Remove entire region */
				560	del += rg->to - rg->from;
				561	list_del(&rg->link);
				562	kfree(rg);
				563	continue;
				564	}
				565
				566	if (f <= rg->from) { /* Trim beginning of region */
				567	del += t - rg->from;
				568	rg->from = t;
				569	} else { /* Trim end of region */
				570	del += rg->to - f;
				571	rg->to = f;
				572	}
				573	}
				574
				575	spin_unlock(&resv->lock);
				576	kfree(nrg);
				577	return del;
				578	}
				579
				580	/*
				581	* A rare out of memory error was encountered which prevented removal of
				582	* the reserve map region for a page. The huge page itself was free'ed
				583	* and removed from the page cache. This routine will adjust the subpool
				584	* usage count, and the global reserve count if needed. By incrementing
				585	* these counts, the reserve map entry which could not be deleted will
				586	* appear as a "reserved" entry instead of simply dangling with incorrect
				587	* counts.
				588	*/
				589	void hugetlb_fix_reserve_counts(struct inode *inode)
				590	{
				591	struct hugepage_subpool *spool = subpool_inode(inode);
				592	long rsv_adjust;
				593	bool reserved = false;
				594
				595	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
				596	if (rsv_adjust > 0) {
				597	struct hstate *h = hstate_inode(inode);
				598
				599	if (!hugetlb_acct_memory(h, 1))
				600	reserved = true;
				601	} else if (!rsv_adjust) {
				602	reserved = true;
				603	}
				604
				605	if (!reserved)
				606	pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
				607	}
				608
				609	/*
				610	* Count and return the number of huge pages in the reserve map
				611	* that intersect with the range [f, t).
				612	*/
				613	static long region_count(struct resv_map *resv, long f, long t)
				614	{
				615	struct list_head *head = &resv->regions;
				616	struct file_region *rg;
				617	long chg = 0;
				618
				619	spin_lock(&resv->lock);
				620	/* Locate each segment we overlap with, and count that overlap. */
				621	list_for_each_entry(rg, head, link) {
				622	long seg_from;
				623	long seg_to;
				624
				625	if (rg->to <= f)
				626	continue;
				627	if (rg->from >= t)
				628	break;
				629
				630	seg_from = max(rg->from, f);
				631	seg_to = min(rg->to, t);
				632
				633	chg += seg_to - seg_from;
				634	}
				635	spin_unlock(&resv->lock);
				636
				637	return chg;
				638	}
				639
				640	/*
				641	* Convert the address within this vma to the page offset within
				642	* the mapping, in pagecache page units; huge pages here.
				643	*/
				644	static pgoff_t vma_hugecache_offset(struct hstate *h,
				645	struct vm_area_struct *vma, unsigned long address)
				646	{
				647	return ((address - vma->vm_start) >> huge_page_shift(h)) +
				648	(vma->vm_pgoff >> huge_page_order(h));
				649	}
				650
				651	pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				652	unsigned long address)
				653	{
				654	return vma_hugecache_offset(hstate_vma(vma), vma, address);
				655	}
				656	EXPORT_SYMBOL_GPL(linear_hugepage_index);
				657
				658	/*
				659	* Return the size of the pages allocated when backing a VMA. In the majority
				660	* cases this will be same size as used by the page table entries.
				661	*/
				662	unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
				663	{
				664	if (vma->vm_ops && vma->vm_ops->pagesize)
				665	return vma->vm_ops->pagesize(vma);
				666	return PAGE_SIZE;
				667	}
				668	EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
				669
				670	/*
				671	* Return the page size being used by the MMU to back a VMA. In the majority
				672	* of cases, the page size used by the kernel matches the MMU size. On
				673	* architectures where it differs, an architecture-specific 'strong'
				674	* version of this symbol is required.
				675	*/
				676	__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
				677	{
				678	return vma_kernel_pagesize(vma);
				679	}
				680
				681	/*
				682	* Flags for MAP_PRIVATE reservations. These are stored in the bottom
				683	* bits of the reservation map pointer, which are always clear due to
				684	* alignment.
				685	*/
				686	#define HPAGE_RESV_OWNER (1UL << 0)
				687	#define HPAGE_RESV_UNMAPPED (1UL << 1)
				688	#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER \| HPAGE_RESV_UNMAPPED)
				689
				690	/*
				691	* These helpers are used to track how many pages are reserved for
				692	* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
				693	* is guaranteed to have their future faults succeed.
				694	*
				695	* With the exception of reset_vma_resv_huge_pages() which is called at fork(),
				696	* the reserve counters are updated with the hugetlb_lock held. It is safe
				697	* to reset the VMA at fork() time as it is not in use yet and there is no
				698	* chance of the global counters getting corrupted as a result of the values.
				699	*
				700	* The private mapping reservation is represented in a subtly different
				701	* manner to a shared mapping. A shared mapping has a region map associated
				702	* with the underlying file, this region map represents the backing file
				703	* pages which have ever had a reservation assigned which this persists even
				704	* after the page is instantiated. A private mapping has a region map
				705	* associated with the original mmap which is attached to all VMAs which
				706	* reference it, this region map represents those offsets which have consumed
				707	* reservation ie. where pages have been instantiated.
				708	*/
				709	static unsigned long get_vma_private_data(struct vm_area_struct *vma)
				710	{
				711	return (unsigned long)vma->vm_private_data;
				712	}
				713
				714	static void set_vma_private_data(struct vm_area_struct *vma,
				715	unsigned long value)
				716	{
				717	vma->vm_private_data = (void *)value;
				718	}
				719
				720	struct resv_map *resv_map_alloc(void)
				721	{
				722	struct resv_map resv_map = kmalloc(sizeof(resv_map), GFP_KERNEL);
				723	struct file_region rg = kmalloc(sizeof(rg), GFP_KERNEL);
				724
				725	if (!resv_map \|\| !rg) {
				726	kfree(resv_map);
				727	kfree(rg);
				728	return NULL;
				729	}
				730
				731	kref_init(&resv_map->refs);
				732	spin_lock_init(&resv_map->lock);
				733	INIT_LIST_HEAD(&resv_map->regions);
				734
				735	resv_map->adds_in_progress = 0;
				736
				737	INIT_LIST_HEAD(&resv_map->region_cache);
				738	list_add(&rg->link, &resv_map->region_cache);
				739	resv_map->region_cache_count = 1;
				740
				741	return resv_map;
				742	}
				743
				744	void resv_map_release(struct kref *ref)
				745	{
				746	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
				747	struct list_head *head = &resv_map->region_cache;
				748	struct file_region rg, trg;
				749
				750	/* Clear out any active regions before we release the map. */
				751	region_del(resv_map, 0, LONG_MAX);
				752
				753	/* ... and any entries left in the cache */
				754	list_for_each_entry_safe(rg, trg, head, link) {
				755	list_del(&rg->link);
				756	kfree(rg);
				757	}
				758
				759	VM_BUG_ON(resv_map->adds_in_progress);
				760
				761	kfree(resv_map);
				762	}
				763
				764	static inline struct resv_map inode_resv_map(struct inode inode)
				765	{
				766	/*
				767	* At inode evict time, i_mapping may not point to the original
				768	* address space within the inode. This original address space
				769	* contains the pointer to the resv_map. So, always use the
				770	* address space embedded within the inode.
				771	* The VERY common case is inode->mapping == &inode->i_data but,
				772	* this may not be true for device special inodes.
				773	*/
				774	return (struct resv_map *)(&inode->i_data)->private_data;
				775	}
				776
				777	static struct resv_map vma_resv_map(struct vm_area_struct vma)
				778	{
				779	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
				780	if (vma->vm_flags & VM_MAYSHARE) {
				781	struct address_space *mapping = vma->vm_file->f_mapping;
				782	struct inode *inode = mapping->host;
				783
				784	return inode_resv_map(inode);
				785
				786	} else {
				787	return (struct resv_map *)(get_vma_private_data(vma) &
				788	~HPAGE_RESV_MASK);
				789	}
				790	}
				791
				792	static void set_vma_resv_map(struct vm_area_struct vma, struct resv_map map)
				793	{
				794	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
				795	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
				796
				797	set_vma_private_data(vma, (get_vma_private_data(vma) &
				798	HPAGE_RESV_MASK) \| (unsigned long)map);
				799	}
				800
				801	static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
				802	{
				803	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
				804	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
				805
				806	set_vma_private_data(vma, get_vma_private_data(vma) \| flags);
				807	}
				808
				809	static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
				810	{
				811	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
				812
				813	return (get_vma_private_data(vma) & flag) != 0;
				814	}
				815
				816	/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
				817	void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
				818	{
				819	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
				820	if (!(vma->vm_flags & VM_MAYSHARE))
				821	vma->vm_private_data = (void *)0;
				822	}
				823
				824	/* Returns true if the VMA has associated reserve pages */
				825	static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
				826	{
				827	if (vma->vm_flags & VM_NORESERVE) {
				828	/*
				829	* This address is already reserved by other process(chg == 0),
				830	* so, we should decrement reserved count. Without decrementing,
				831	* reserve count remains after releasing inode, because this
				832	* allocated page will go into page cache and is regarded as
				833	* coming from reserved pool in releasing step. Currently, we
				834	* don't have any other solution to deal with this situation
				835	* properly, so add work-around here.
				836	*/
				837	if (vma->vm_flags & VM_MAYSHARE && chg == 0)
				838	return true;
				839	else
				840	return false;
				841	}
				842
				843	/* Shared mappings always use reserves */
				844	if (vma->vm_flags & VM_MAYSHARE) {
				845	/*
				846	* We know VM_NORESERVE is not set. Therefore, there SHOULD
				847	* be a region map for all pages. The only situation where
				848	* there is no region map is if a hole was punched via
				849	* fallocate. In this case, there really are no reverves to
				850	* use. This situation is indicated if chg != 0.
				851	*/
				852	if (chg)
				853	return false;
				854	else
				855	return true;
				856	}
				857
				858	/*
				859	* Only the process that called mmap() has reserves for
				860	* private mappings.
				861	*/
				862	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
				863	/*
				864	* Like the shared case above, a hole punch or truncate
				865	* could have been performed on the private mapping.
				866	* Examine the value of chg to determine if reserves
				867	* actually exist or were previously consumed.
				868	* Very Subtle - The value of chg comes from a previous
				869	* call to vma_needs_reserves(). The reserve map for
				870	* private mappings has different (opposite) semantics
				871	* than that of shared mappings. vma_needs_reserves()
				872	* has already taken this difference in semantics into
				873	* account. Therefore, the meaning of chg is the same
				874	* as in the shared case above. Code could easily be
				875	* combined, but keeping it separate draws attention to
				876	* subtle differences.
				877	*/
				878	if (chg)
				879	return false;
				880	else
				881	return true;
				882	}
				883
				884	return false;
				885	}
				886
				887	static void enqueue_huge_page(struct hstate h, struct page page)
				888	{
				889	int nid = page_to_nid(page);
				890	list_move(&page->lru, &h->hugepage_freelists[nid]);
				891	h->free_huge_pages++;
				892	h->free_huge_pages_node[nid]++;
				893	SetPageHugeFreed(page);
				894	}
				895
				896	static struct page dequeue_huge_page_node_exact(struct hstate h, int nid)
				897	{
				898	struct page *page;
				899
				900	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
				901	if (!PageHWPoison(page))
				902	break;
				903	/*
				904	* if 'non-isolated free hugepage' not found on the list,
				905	* the allocation fails.
				906	*/
				907	if (&h->hugepage_freelists[nid] == &page->lru)
				908	return NULL;
				909	list_move(&page->lru, &h->hugepage_activelist);
				910	set_page_refcounted(page);
				911	ClearPageHugeFreed(page);
				912	h->free_huge_pages--;
				913	h->free_huge_pages_node[nid]--;
				914	return page;
				915	}
				916
				917	static struct page dequeue_huge_page_nodemask(struct hstate h, gfp_t gfp_mask, int nid,
				918	nodemask_t *nmask)
				919	{
				920	unsigned int cpuset_mems_cookie;
				921	struct zonelist *zonelist;
				922	struct zone *zone;
				923	struct zoneref *z;
				924	int node = NUMA_NO_NODE;
				925
				926	zonelist = node_zonelist(nid, gfp_mask);
				927
				928	retry_cpuset:
				929	cpuset_mems_cookie = read_mems_allowed_begin();
				930	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
				931	struct page *page;
				932
				933	if (!cpuset_zone_allowed(zone, gfp_mask))
				934	continue;
				935	/*
				936	* no need to ask again on the same node. Pool is node rather than
				937	* zone aware
				938	*/
				939	if (zone_to_nid(zone) == node)
				940	continue;
				941	node = zone_to_nid(zone);
				942
				943	page = dequeue_huge_page_node_exact(h, node);
				944	if (page)
				945	return page;
				946	}
				947	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
				948	goto retry_cpuset;
				949
				950	return NULL;
				951	}
				952
				953	/* Movability of hugepages depends on migration support. */
				954	static inline gfp_t htlb_alloc_mask(struct hstate *h)
				955	{
				956	if (hugepage_movable_supported(h))
				957	return GFP_HIGHUSER_MOVABLE;
				958	else
				959	return GFP_HIGHUSER;
				960	}
				961
				962	static struct page dequeue_huge_page_vma(struct hstate h,
				963	struct vm_area_struct *vma,
				964	unsigned long address, int avoid_reserve,
				965	long chg)
				966	{
				967	struct page *page;
				968	struct mempolicy *mpol;
				969	gfp_t gfp_mask;
				970	nodemask_t *nodemask;
				971	int nid;
				972
				973	/*
				974	* A child process with MAP_PRIVATE mappings created by their parent
				975	* have no page reserves. This check ensures that reservations are
				976	* not "stolen". The child may still get SIGKILLed
				977	*/
				978	if (!vma_has_reserves(vma, chg) &&
				979	h->free_huge_pages - h->resv_huge_pages == 0)
				980	goto err;
				981
				982	/* If reserves cannot be used, ensure enough pages are in the pool */
				983	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
				984	goto err;
				985
				986	gfp_mask = htlb_alloc_mask(h);
				987	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
				988	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
				989	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
				990	SetPagePrivate(page);
				991	h->resv_huge_pages--;
				992	}
				993
				994	mpol_cond_put(mpol);
				995	return page;
				996
				997	err:
				998	return NULL;
				999	}
				1000
				1001	/*
				1002	* common helper functions for hstate_next_node_to_{alloc|free}.
				1003	* We may have allocated or freed a huge page based on a different
				1004	* nodes_allowed previously, so h->next_node_to_{alloc|free} might
				1005	* be outside of *nodes_allowed. Ensure that we use an allowed
				1006	* node for alloc or free.
				1007	*/
				1008	static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
				1009	{
				1010	nid = next_node_in(nid, *nodes_allowed);
				1011	VM_BUG_ON(nid >= MAX_NUMNODES);
				1012
				1013	return nid;
				1014	}
				1015
				1016	static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
				1017	{
				1018	if (!node_isset(nid, *nodes_allowed))
				1019	nid = next_node_allowed(nid, nodes_allowed);
				1020	return nid;
				1021	}
				1022
				1023	/*
				1024	* returns the previously saved node ["this node"] from which to
				1025	* allocate a persistent huge page for the pool and advance the
				1026	* next node from which to allocate, handling wrap at end of node
				1027	* mask.
				1028	*/
				1029	static int hstate_next_node_to_alloc(struct hstate *h,
				1030	nodemask_t *nodes_allowed)
				1031	{
				1032	int nid;
				1033
				1034	VM_BUG_ON(!nodes_allowed);
				1035
				1036	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
				1037	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
				1038
				1039	return nid;
				1040	}
				1041
				1042	/*
				1043	* helper for free_pool_huge_page() - return the previously saved
				1044	* node ["this node"] from which to free a huge page. Advance the
				1045	* next node id whether or not we find a free huge page to free so
				1046	* that the next attempt to free addresses the next node.
				1047	*/
				1048	static int hstate_next_node_to_free(struct hstate h, nodemask_t nodes_allowed)
				1049	{
				1050	int nid;
				1051
				1052	VM_BUG_ON(!nodes_allowed);
				1053
				1054	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
				1055	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
				1056
				1057	return nid;
				1058	}
				1059
				/*
				 * Visit up to nodes_weight(*mask) nodes, taking each one from
				 * hstate_next_node_to_alloc(). The "|| 1" keeps the loop condition true
				 * when the returned node id is 0.
				 */
				#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
					for (nr_nodes = nodes_weight(*mask);				\
						nr_nodes > 0 &&						\
						((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
						nr_nodes--)
				1065
				/*
				 * Visit up to nodes_weight(*mask) nodes, taking each one from
				 * hstate_next_node_to_free(). The "|| 1" keeps the loop condition true
				 * when the returned node id is 0.
				 */
				#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
					for (nr_nodes = nodes_weight(*mask);				\
						nr_nodes > 0 &&						\
						((node = hstate_next_node_to_free(hs, mask)) || 1);	\
						nr_nodes--)
				1071
				1072	#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
				1073	static void destroy_compound_gigantic_page(struct page *page,
				1074	unsigned int order)
				1075	{
				1076	int i;
				1077	int nr_pages = 1 << order;
				1078	struct page *p = page + 1;
				1079
				1080	atomic_set(compound_mapcount_ptr(page), 0);
				1081	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
				1082	clear_compound_head(p);
				1083	set_page_refcounted(p);
				1084	}
				1085
				1086	set_compound_order(page, 0);
				1087	__ClearPageHead(page);
				1088	}
				1089
				/* Release the 1 << @order contiguous pages starting at @page. */
				static void free_gigantic_page(struct page *page, unsigned int order)
				{
					unsigned long first_pfn = page_to_pfn(page);

					free_contig_range(first_pfn, 1 << order);
				}
				1094
				1095	#ifdef CONFIG_CONTIG_ALLOC
				1096	static int __alloc_gigantic_page(unsigned long start_pfn,
				1097	unsigned long nr_pages, gfp_t gfp_mask)
				1098	{
				1099	unsigned long end_pfn = start_pfn + nr_pages;
				1100	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				1101	gfp_mask);
				1102	}
				1103
				1104	static bool pfn_range_valid_gigantic(struct zone *z,
				1105	unsigned long start_pfn, unsigned long nr_pages)
				1106	{
				1107	unsigned long i, end_pfn = start_pfn + nr_pages;
				1108	struct page *page;
				1109
				1110	for (i = start_pfn; i < end_pfn; i++) {
				1111	page = pfn_to_online_page(i);
				1112	if (!page)
				1113	return false;
				1114
				1115	if (page_zone(page) != z)
				1116	return false;
				1117
				1118	if (PageReserved(page))
				1119	return false;
				1120
				1121	if (page_count(page) > 0)
				1122	return false;
				1123
				1124	if (PageHuge(page))
				1125	return false;
				1126	}
				1127
				1128	return true;
				1129	}
				1130
				1131	static bool zone_spans_last_pfn(const struct zone *zone,
				1132	unsigned long start_pfn, unsigned long nr_pages)
				1133	{
				1134	unsigned long last_pfn = start_pfn + nr_pages - 1;
				1135	return zone_spans_pfn(zone, last_pfn);
				1136	}
				1137
				1138	static struct page alloc_gigantic_page(struct hstate h, gfp_t gfp_mask,
				1139	int nid, nodemask_t *nodemask)
				1140	{
				1141	unsigned int order = huge_page_order(h);
				1142	unsigned long nr_pages = 1 << order;
				1143	unsigned long ret, pfn, flags;
				1144	struct zonelist *zonelist;
				1145	struct zone *zone;
				1146	struct zoneref *z;
				1147
				1148	zonelist = node_zonelist(nid, gfp_mask);
				1149	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
				1150	spin_lock_irqsave(&zone->lock, flags);
				1151
				1152	pfn = ALIGN(zone->zone_start_pfn, nr_pages);
				1153	while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
				1154	if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
				1155	/*
				1156	* We release the zone lock here because
				1157	* alloc_contig_range() will also lock the zone
				1158	* at some point. If there's an allocation
				1159	* spinning on this lock, it may win the race
				1160	* and cause alloc_contig_range() to fail...
				1161	*/
				1162	spin_unlock_irqrestore(&zone->lock, flags);
				1163	ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
				1164	if (!ret)
				1165	return pfn_to_page(pfn);
				1166	spin_lock_irqsave(&zone->lock, flags);
				1167	}
				1168	pfn += nr_pages;
				1169	}
				1170
				1171	spin_unlock_irqrestore(&zone->lock, flags);
				1172	}
				1173
				1174	return NULL;
				1175	}
				1176
				/* Forward declarations; both functions are defined later in this file. */
				static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
				static void prep_compound_gigantic_page(struct page *page, unsigned int order);
				1179	#else /* !CONFIG_CONTIG_ALLOC */
				1180	static struct page alloc_gigantic_page(struct hstate h, gfp_t gfp_mask,
				1181	int nid, nodemask_t *nodemask)
				1182	{
				1183	return NULL;
				1184	}
				1185	#endif /* CONFIG_CONTIG_ALLOC */
				1186
				1187	#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
				1188	static struct page alloc_gigantic_page(struct hstate h, gfp_t gfp_mask,
				1189	int nid, nodemask_t *nodemask)
				1190	{
				1191	return NULL;
				1192	}
				/* No-op stub used when CONFIG_ARCH_HAS_GIGANTIC_PAGE is not set. */
				static inline void free_gigantic_page(struct page *page, unsigned int order)
				{
				}
				/* No-op stub used when CONFIG_ARCH_HAS_GIGANTIC_PAGE is not set. */
				static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned int order)
				{
				}
				1196	#endif
				1197
				1198	static void update_and_free_page(struct hstate h, struct page page)
				1199	{
				1200	int i;
				1201	struct page *subpage = page;
				1202
				1203	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
				1204	return;
				1205
				1206	h->nr_huge_pages--;
				1207	h->nr_huge_pages_node[page_to_nid(page)]--;
				1208	for (i = 0; i < pages_per_huge_page(h);
				1209	i++, subpage = mem_map_next(subpage, page, i)) {
				1210	subpage->flags &= ~(1 << PG_locked \| 1 << PG_error \|
				1211	1 << PG_referenced \| 1 << PG_dirty \|
				1212	1 << PG_active \| 1 << PG_private \|
				1213	1 << PG_writeback);
				1214	}
				1215	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
				1216	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
				1217	set_page_refcounted(page);
				1218	if (hstate_is_gigantic(h)) {
				1219	destroy_compound_gigantic_page(page, huge_page_order(h));
				1220	free_gigantic_page(page, huge_page_order(h));
				1221	} else {
				1222	__free_pages(page, huge_page_order(h));
				1223	}
				1224	}
				1225
				1226	struct hstate *size_to_hstate(unsigned long size)
				1227	{
				1228	struct hstate *h;
				1229
				1230	for_each_hstate(h) {
				1231	if (huge_page_size(h) == size)
				1232	return h;
				1233	}
				1234	return NULL;
				1235	}
				1236
				1237	/*
				1238	* Test to determine whether the hugepage is "active/in-use" (i.e. being linked
				1239	* to hstate->hugepage_activelist.)
				1240	*
				1241	* This function can be called for tail pages, but never returns true for them.
				1242	*/
				1243	bool page_huge_active(struct page *page)
				1244	{
				1245	return PageHeadHuge(page) && PagePrivate(&page[1]);
				1246	}
				1247
				1248	/* never called for tail page */
				1249	void set_page_huge_active(struct page *page)
				1250	{
				1251	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
				1252	SetPagePrivate(&page[1]);
				1253	}
				1254
				1255	static void clear_page_huge_active(struct page *page)
				1256	{
				1257	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
				1258	ClearPagePrivate(&page[1]);
				1259	}
				1260
				1261	/*
				1262	* Internal hugetlb specific page flag. Do not use outside of the hugetlb
				1263	* code
				1264	*/
				1265	static inline bool PageHugeTemporary(struct page *page)
				1266	{
				1267	if (!PageHuge(page))
				1268	return false;
				1269
				1270	return (unsigned long)page[2].mapping == -1U;
				1271	}
				1272
				1273	static inline void SetPageHugeTemporary(struct page *page)
				1274	{
				1275	page[2].mapping = (void *)-1U;
				1276	}
				1277
				1278	static inline void ClearPageHugeTemporary(struct page *page)
				1279	{
				1280	page[2].mapping = NULL;
				1281	}
				1282
				1283	static void __free_huge_page(struct page *page)
				1284	{
				1285	/*
				1286	* Can't pass hstate in here because it is called from the
				1287	* compound page destructor.
				1288	*/
				1289	struct hstate *h = page_hstate(page);
				1290	int nid = page_to_nid(page);
				1291	struct hugepage_subpool *spool =
				1292	(struct hugepage_subpool *)page_private(page);
				1293	bool restore_reserve;
				1294
				1295	VM_BUG_ON_PAGE(page_count(page), page);
				1296	VM_BUG_ON_PAGE(page_mapcount(page), page);
				1297
				1298	set_page_private(page, 0);
				1299	page->mapping = NULL;
				1300	restore_reserve = PagePrivate(page);
				1301	ClearPagePrivate(page);
				1302
				1303	/*
				1304	* If PagePrivate() was set on page, page allocation consumed a
				1305	* reservation. If the page was associated with a subpool, there
				1306	* would have been a page reserved in the subpool before allocation
				1307	* via hugepage_subpool_get_pages(). Since we are 'restoring' the
				1308	* reservtion, do not call hugepage_subpool_put_pages() as this will
				1309	* remove the reserved page from the subpool.
				1310	*/
				1311	if (!restore_reserve) {
				1312	/*
				1313	* A return code of zero implies that the subpool will be
				1314	* under its minimum size if the reservation is not restored
				1315	* after page is free. Therefore, force restore_reserve
				1316	* operation.
				1317	*/
				1318	if (hugepage_subpool_put_pages(spool, 1) == 0)
				1319	restore_reserve = true;
				1320	}
				1321
				1322	spin_lock(&hugetlb_lock);
				1323	clear_page_huge_active(page);
				1324	hugetlb_cgroup_uncharge_page(hstate_index(h),
				1325	pages_per_huge_page(h), page);
				1326	if (restore_reserve)
				1327	h->resv_huge_pages++;
				1328
				1329	if (PageHugeTemporary(page)) {
				1330	list_del(&page->lru);
				1331	ClearPageHugeTemporary(page);
				1332	update_and_free_page(h, page);
				1333	} else if (h->surplus_huge_pages_node[nid]) {
				1334	/* remove the page from active list */
				1335	list_del(&page->lru);
				1336	update_and_free_page(h, page);
				1337	h->surplus_huge_pages--;
				1338	h->surplus_huge_pages_node[nid]--;
				1339	} else {
				1340	arch_clear_hugepage_flags(page);
				1341	enqueue_huge_page(h, page);
				1342	}
				1343	spin_unlock(&hugetlb_lock);
				1344	}
				1345
				1346	/*
				1347	* As free_huge_page() can be called from a non-task context, we have
				1348	* to defer the actual freeing in a workqueue to prevent potential
				1349	* hugetlb_lock deadlock.
				1350	*
				1351	* free_hpage_workfn() locklessly retrieves the linked list of pages to
				1352	* be freed and frees them one-by-one. As the page->mapping pointer is
				1353	* going to be cleared in __free_huge_page() anyway, it is reused as the
				1354	* llist_node structure of a lockless linked list of huge pages to be freed.
				1355	*/
				1356	static LLIST_HEAD(hpage_freelist);
				1357
				1358	static void free_hpage_workfn(struct work_struct *work)
				1359	{
				1360	struct llist_node *node;
				1361	struct page *page;
				1362
				1363	node = llist_del_all(&hpage_freelist);
				1364
				1365	while (node) {
				1366	page = container_of((struct address_space **)node,
				1367	struct page, mapping);
				1368	node = node->next;
				1369	__free_huge_page(page);
				1370	}
				1371	}
				1372	static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
				1373
				1374	void free_huge_page(struct page *page)
				1375	{
				1376	/*
				1377	* Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
				1378	*/
				1379	if (!in_task()) {
				1380	/*
				1381	* Only call schedule_work() if hpage_freelist is previously
				1382	* empty. Otherwise, schedule_work() had been called but the
				1383	* workfn hasn't retrieved the list yet.
				1384	*/
				1385	if (llist_add((struct llist_node *)&page->mapping,
				1386	&hpage_freelist))
				1387	schedule_work(&free_hpage_work);
				1388	return;
				1389	}
				1390
				1391	__free_huge_page(page);
				1392	}
				1393
				1394	static void prep_new_huge_page(struct hstate h, struct page page, int nid)
				1395	{
				1396	INIT_LIST_HEAD(&page->lru);
				1397	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
				1398	spin_lock(&hugetlb_lock);
				1399	set_hugetlb_cgroup(page, NULL);
				1400	h->nr_huge_pages++;
				1401	h->nr_huge_pages_node[nid]++;
				1402	ClearPageHugeFreed(page);
				1403	spin_unlock(&hugetlb_lock);
				1404	}
				1405
				1406	static void prep_compound_gigantic_page(struct page *page, unsigned int order)
				1407	{
				1408	int i;
				1409	int nr_pages = 1 << order;
				1410	struct page *p = page + 1;
				1411
				1412	/* we rely on prep_new_huge_page to set the destructor */
				1413	set_compound_order(page, order);
				1414	__ClearPageReserved(page);
				1415	__SetPageHead(page);
				1416	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
				1417	/*
				1418	* For gigantic hugepages allocated through bootmem at
				1419	* boot, it's safer to be consistent with the not-gigantic
				1420	* hugepages and clear the PG_reserved bit from all tail pages
				1421	* too. Otherwse drivers using get_user_pages() to access tail
				1422	* pages may get the reference counting wrong if they see
				1423	* PG_reserved set on a tail page (despite the head page not
				1424	* having PG_reserved set). Enforcing this consistency between
				1425	* head and tail pages allows drivers to optimize away a check
				1426	* on the head page when they need know if put_page() is needed
				1427	* after get_user_pages().
				1428	*/
				1429	__ClearPageReserved(p);
				1430	set_page_count(p, 0);
				1431	set_compound_head(p, page);
				1432	}
				1433	atomic_set(compound_mapcount_ptr(page), -1);
				1434	}
				1435
				1436	/*
				1437	* PageHuge() only returns true for hugetlbfs pages, but not for normal or
				1438	* transparent huge pages. See the PageTransHuge() documentation for more
				1439	* details.
				1440	*/
				1441	int PageHuge(struct page *page)
				1442	{
				1443	if (!PageCompound(page))
				1444	return 0;
				1445
				1446	page = compound_head(page);
				1447	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
				1448	}
				1449	EXPORT_SYMBOL_GPL(PageHuge);
				1450
				1451	/*
				1452	* PageHeadHuge() only returns true for hugetlbfs head page, but not for
				1453	* normal or transparent huge pages.
				1454	*/
				1455	int PageHeadHuge(struct page *page_head)
				1456	{
				1457	if (!PageHead(page_head))
				1458	return 0;
				1459
				1460	return get_compound_page_dtor(page_head) == free_huge_page;
				1461	}
				1462
				1463	pgoff_t hugetlb_basepage_index(struct page *page)
				1464	{
				1465	struct page *page_head = compound_head(page);
				1466	pgoff_t index = page_index(page_head);
				1467	unsigned long compound_idx;
				1468
				1469	if (compound_order(page_head) >= MAX_ORDER)
				1470	compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
				1471	else
				1472	compound_idx = page - page_head;
				1473
				1474	return (index << compound_order(page_head)) + compound_idx;
				1475	}
				1476
				1477	static struct page alloc_buddy_huge_page(struct hstate h,
				1478	gfp_t gfp_mask, int nid, nodemask_t *nmask,
				1479	nodemask_t *node_alloc_noretry)
				1480	{
				1481	int order = huge_page_order(h);
				1482	struct page *page;
				1483	bool alloc_try_hard = true;
				1484
				1485	/*
				1486	* By default we always try hard to allocate the page with
				1487	* __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
				1488	* a loop (to adjust global huge page counts) and previous allocation
				1489	* failed, do not continue to try hard on the same node. Use the
				1490	* node_alloc_noretry bitmap to manage this state information.
				1491	*/
				1492	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
				1493	alloc_try_hard = false;
				1494	gfp_mask \|= __GFP_COMP\|__GFP_NOWARN;
				1495	if (alloc_try_hard)
				1496	gfp_mask \|= __GFP_RETRY_MAYFAIL;
				1497	if (nid == NUMA_NO_NODE)
				1498	nid = numa_mem_id();
				1499	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
				1500	if (page)
				1501	__count_vm_event(HTLB_BUDDY_PGALLOC);
				1502	else
				1503	__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
				1504
				1505	/*
				1506	* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
				1507	* indicates an overall state change. Clear bit so that we resume
				1508	* normal 'try hard' allocations.
				1509	*/
				1510	if (node_alloc_noretry && page && !alloc_try_hard)
				1511	node_clear(nid, *node_alloc_noretry);
				1512
				1513	/*
				1514	* If we tried hard to get a page but failed, set bit so that
				1515	* subsequent attempts will not try as hard until there is an
				1516	* overall state change.
				1517	*/
				1518	if (node_alloc_noretry && !page && alloc_try_hard)
				1519	node_set(nid, *node_alloc_noretry);
				1520
				1521	return page;
				1522	}
				1523
				1524	/*
				1525	* Common helper to allocate a fresh hugetlb page. All specific allocators
				1526	* should use this function to get new hugetlb pages
				1527	*/
				1528	static struct page alloc_fresh_huge_page(struct hstate h,
				1529	gfp_t gfp_mask, int nid, nodemask_t *nmask,
				1530	nodemask_t *node_alloc_noretry)
				1531	{
				1532	struct page *page;
				1533
				1534	if (hstate_is_gigantic(h))
				1535	page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
				1536	else
				1537	page = alloc_buddy_huge_page(h, gfp_mask,
				1538	nid, nmask, node_alloc_noretry);
				1539	if (!page)
				1540	return NULL;
				1541
				1542	if (hstate_is_gigantic(h))
				1543	prep_compound_gigantic_page(page, huge_page_order(h));
				1544	prep_new_huge_page(h, page, page_to_nid(page));
				1545
				1546	return page;
				1547	}
				1548
				1549	/*
				1550	* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
				1551	* manner.
				1552	*/
				1553	static int alloc_pool_huge_page(struct hstate h, nodemask_t nodes_allowed,
				1554	nodemask_t *node_alloc_noretry)
				1555	{
				1556	struct page *page;
				1557	int nr_nodes, node;
				1558	gfp_t gfp_mask = htlb_alloc_mask(h) \| __GFP_THISNODE;
				1559
				1560	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
				1561	page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
				1562	node_alloc_noretry);
				1563	if (page)
				1564	break;
				1565	}
				1566
				1567	if (!page)
				1568	return 0;
				1569
				1570	put_page(page); /* free it into the hugepage allocator */
				1571
				1572	return 1;
				1573	}
				1574
				1575	/*
				1576	* Free huge page from pool from next node to free.
				1577	* Attempt to keep persistent huge pages more or less
				1578	* balanced over allowed nodes.
				1579	* Called with hugetlb_lock locked.
				1580	*/
				1581	static int free_pool_huge_page(struct hstate h, nodemask_t nodes_allowed,
				1582	bool acct_surplus)
				1583	{
				1584	int nr_nodes, node;
				1585	int ret = 0;
				1586
				1587	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
				1588	/*
				1589	* If we're returning unused surplus pages, only examine
				1590	* nodes with surplus pages.
				1591	*/
				1592	if ((!acct_surplus \|\| h->surplus_huge_pages_node[node]) &&
				1593	!list_empty(&h->hugepage_freelists[node])) {
				1594	struct page *page =
				1595	list_entry(h->hugepage_freelists[node].next,
				1596	struct page, lru);
				1597	list_del(&page->lru);
				1598	h->free_huge_pages--;
				1599	h->free_huge_pages_node[node]--;
				1600	if (acct_surplus) {
				1601	h->surplus_huge_pages--;
				1602	h->surplus_huge_pages_node[node]--;
				1603	}
				1604	update_and_free_page(h, page);
				1605	ret = 1;
				1606	break;
				1607	}
				1608	}
				1609
				1610	return ret;
				1611	}
				1612
				1613	/*
				1614	* Dissolve a given free hugepage into free buddy pages. This function does
				1615	* nothing for in-use hugepages and non-hugepages.
				1616	* This function returns values like below:
				1617	*
				1618	* -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
				1619	* (allocated or reserved.)
				1620	* 0: successfully dissolved free hugepages or the page is not a
				1621	* hugepage (considered as already dissolved)
				1622	*/
				1623	int dissolve_free_huge_page(struct page *page)
				1624	{
				1625	int rc = -EBUSY;
				1626
				1627	retry:
				1628	/* Not to disrupt normal path by vainly holding hugetlb_lock */
				1629	if (!PageHuge(page))
				1630	return 0;
				1631
				1632	spin_lock(&hugetlb_lock);
				1633	if (!PageHuge(page)) {
				1634	rc = 0;
				1635	goto out;
				1636	}
				1637
				1638	if (!page_count(page)) {
				1639	struct page *head = compound_head(page);
				1640	struct hstate *h = page_hstate(head);
				1641	int nid = page_to_nid(head);
				1642	if (h->free_huge_pages - h->resv_huge_pages == 0)
				1643	goto out;
				1644
				1645	/*
				1646	* We should make sure that the page is already on the free list
				1647	* when it is dissolved.
				1648	*/
				1649	if (unlikely(!PageHugeFreed(head))) {
				1650	spin_unlock(&hugetlb_lock);
				1651	cond_resched();
				1652
				1653	/*
				1654	* Theoretically, we should return -EBUSY when we
				1655	* encounter this race. In fact, we have a chance
				1656	* to successfully dissolve the page if we do a
				1657	* retry. Because the race window is quite small.
				1658	* If we seize this opportunity, it is an optimization
				1659	* for increasing the success rate of dissolving page.
				1660	*/
				1661	goto retry;
				1662	}
				1663
				1664	/*
				1665	* Move PageHWPoison flag from head page to the raw error page,
				1666	* which makes any subpages rather than the error page reusable.
				1667	*/
				1668	if (PageHWPoison(head) && page != head) {
				1669	SetPageHWPoison(page);
				1670	ClearPageHWPoison(head);
				1671	}
				1672	list_del(&head->lru);
				1673	h->free_huge_pages--;
				1674	h->free_huge_pages_node[nid]--;
				1675	h->max_huge_pages--;
				1676	update_and_free_page(h, head);
				1677	rc = 0;
				1678	}
				1679	out:
				1680	spin_unlock(&hugetlb_lock);
				1681	return rc;
				1682	}
				1683
				1684	/*
				1685	* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
				1686	* make specified memory blocks removable from the system.
				1687	* Note that this will dissolve a free gigantic hugepage completely, if any
				1688	* part of it lies within the given range.
				1689	* Also note that if dissolve_free_huge_page() returns with an error, all
				1690	* free hugepages that were dissolved before that error are lost.
				1691	*/
				1692	int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
				1693	{
				1694	unsigned long pfn;
				1695	struct page *page;
				1696	int rc = 0;
				1697
				1698	if (!hugepages_supported())
				1699	return rc;
				1700
				1701	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
				1702	page = pfn_to_page(pfn);
				1703	rc = dissolve_free_huge_page(page);
				1704	if (rc)
				1705	break;
				1706	}
				1707
				1708	return rc;
				1709	}
				1710
				1711	/*
				1712	* Allocates a fresh surplus page from the page allocator.
				1713	*/
				1714	static struct page alloc_surplus_huge_page(struct hstate h, gfp_t gfp_mask,
				1715	int nid, nodemask_t *nmask)
				1716	{
				1717	struct page *page = NULL;
				1718
				1719	if (hstate_is_gigantic(h))
				1720	return NULL;
				1721
				1722	spin_lock(&hugetlb_lock);
				1723	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
				1724	goto out_unlock;
				1725	spin_unlock(&hugetlb_lock);
				1726
				1727	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
				1728	if (!page)
				1729	return NULL;
				1730
				1731	spin_lock(&hugetlb_lock);
				1732	/*
				1733	* We could have raced with the pool size change.
				1734	* Double check that and simply deallocate the new page
				1735	* if we would end up overcommiting the surpluses. Abuse
				1736	* temporary page to workaround the nasty free_huge_page
				1737	* codeflow
				1738	*/
				1739	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
				1740	SetPageHugeTemporary(page);
				1741	spin_unlock(&hugetlb_lock);
				1742	put_page(page);
				1743	return NULL;
				1744	} else {
				1745	h->surplus_huge_pages++;
				1746	h->surplus_huge_pages_node[page_to_nid(page)]++;
				1747	}
				1748
				1749	out_unlock:
				1750	spin_unlock(&hugetlb_lock);
				1751
				1752	return page;
				1753	}
				1754
				1755	struct page alloc_migrate_huge_page(struct hstate h, gfp_t gfp_mask,
				1756	int nid, nodemask_t *nmask)
				1757	{
				1758	struct page *page;
				1759
				1760	if (hstate_is_gigantic(h))
				1761	return NULL;
				1762
				1763	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
				1764	if (!page)
				1765	return NULL;
				1766
				1767	/*
				1768	* We do not account these pages as surplus because they are only
				1769	* temporary and will be released properly on the last reference
				1770	*/
				1771	SetPageHugeTemporary(page);
				1772
				1773	return page;
				1774	}
				1775
				1776	/*
				1777	* Use the VMA's mpolicy to allocate a huge page from the buddy.
				1778	*/
				1779	static
				1780	struct page alloc_buddy_huge_page_with_mpol(struct hstate h,
				1781	struct vm_area_struct *vma, unsigned long addr)
				1782	{
				1783	struct page *page;
				1784	struct mempolicy *mpol;
				1785	gfp_t gfp_mask = htlb_alloc_mask(h);
				1786	int nid;
				1787	nodemask_t *nodemask;
				1788
				1789	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
				1790	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
				1791	mpol_cond_put(mpol);
				1792
				1793	return page;
				1794	}
				1795
				1796	/* page migration callback function */
				1797	struct page alloc_huge_page_node(struct hstate h, int nid)
				1798	{
				1799	gfp_t gfp_mask = htlb_alloc_mask(h);
				1800	struct page *page = NULL;
				1801
				1802	if (nid != NUMA_NO_NODE)
				1803	gfp_mask \|= __GFP_THISNODE;
				1804
				1805	spin_lock(&hugetlb_lock);
				1806	if (h->free_huge_pages - h->resv_huge_pages > 0)
				1807	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
				1808	spin_unlock(&hugetlb_lock);
				1809
				1810	if (!page)
				1811	page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
				1812
				1813	return page;
				1814	}
				1815
				1816	/* page migration callback function */
				1817	struct page alloc_huge_page_nodemask(struct hstate h, int preferred_nid,
				1818	nodemask_t *nmask)
				1819	{
				1820	gfp_t gfp_mask = htlb_alloc_mask(h);
				1821
				1822	spin_lock(&hugetlb_lock);
				1823	if (h->free_huge_pages - h->resv_huge_pages > 0) {
				1824	struct page *page;
				1825
				1826	page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
				1827	if (page) {
				1828	spin_unlock(&hugetlb_lock);
				1829	return page;
				1830	}
				1831	}
				1832	spin_unlock(&hugetlb_lock);
				1833
				1834	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
				1835	}
				1836
				1837	/* mempolicy aware migration callback */
				1838	struct page alloc_huge_page_vma(struct hstate h, struct vm_area_struct *vma,
				1839	unsigned long address)
				1840	{
				1841	struct mempolicy *mpol;
				1842	nodemask_t *nodemask;
				1843	struct page *page;
				1844	gfp_t gfp_mask;
				1845	int node;
				1846
				1847	gfp_mask = htlb_alloc_mask(h);
				1848	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
				1849	page = alloc_huge_page_nodemask(h, node, nodemask);
				1850	mpol_cond_put(mpol);
				1851
				1852	return page;
				1853	}
				1854
				1855	/*
				1856	* Increase the hugetlb pool such that it can accommodate a reservation
				1857	* of size 'delta'.
				1858	*/
				1859	static int gather_surplus_pages(struct hstate *h, int delta)
				1860	{
				1861	struct list_head surplus_list;
				1862	struct page page, tmp;
				1863	int ret, i;
				1864	int needed, allocated;
				1865	bool alloc_ok = true;
				1866
				1867	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
				1868	if (needed <= 0) {
				1869	h->resv_huge_pages += delta;
				1870	return 0;
				1871	}
				1872
				1873	allocated = 0;
				1874	INIT_LIST_HEAD(&surplus_list);
				1875
				1876	ret = -ENOMEM;
				1877	retry:
				1878	spin_unlock(&hugetlb_lock);
				1879	for (i = 0; i < needed; i++) {
				1880	page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
				1881	NUMA_NO_NODE, NULL);
				1882	if (!page) {
				1883	alloc_ok = false;
				1884	break;
				1885	}
				1886	list_add(&page->lru, &surplus_list);
				1887	cond_resched();
				1888	}
				1889	allocated += i;
				1890
				1891	/*
				1892	* After retaking hugetlb_lock, we need to recalculate 'needed'
				1893	* because either resv_huge_pages or free_huge_pages may have changed.
				1894	*/
				1895	spin_lock(&hugetlb_lock);
				1896	needed = (h->resv_huge_pages + delta) -
				1897	(h->free_huge_pages + allocated);
				1898	if (needed > 0) {
				1899	if (alloc_ok)
				1900	goto retry;
				1901	/*
				1902	* We were not able to allocate enough pages to
				1903	* satisfy the entire reservation so we free what
				1904	* we've allocated so far.
				1905	*/
				1906	goto free;
				1907	}
				1908	/*
				1909	* The surplus_list now contains _at_least_ the number of extra pages
				1910	* needed to accommodate the reservation. Add the appropriate number
				1911	* of pages to the hugetlb pool and free the extras back to the buddy
				1912	* allocator. Commit the entire reservation here to prevent another
				1913	* process from stealing the pages as they are added to the pool but
				1914	* before they are reserved.
				1915	*/
				1916	needed += allocated;
				1917	h->resv_huge_pages += delta;
				1918	ret = 0;
				1919
				1920	/* Free the needed pages to the hugetlb pool */
				1921	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
				1922	if ((--needed) < 0)
				1923	break;
				1924	/*
				1925	* This page is now managed by the hugetlb allocator and has
				1926	* no users -- drop the buddy allocator's reference.
				1927	*/
				1928	put_page_testzero(page);
				1929	VM_BUG_ON_PAGE(page_count(page), page);
				1930	enqueue_huge_page(h, page);
				1931	}
				1932	free:
				1933	spin_unlock(&hugetlb_lock);
				1934
				1935	/* Free unnecessary surplus pages to the buddy allocator */
				1936	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
				1937	put_page(page);
				1938	spin_lock(&hugetlb_lock);
				1939
				1940	return ret;
				1941	}
				1942
				1943	/*
				1944	* This routine has two main purposes:
				1945	* 1) Decrement the reservation count (resv_huge_pages) by the value passed
				1946	* in unused_resv_pages. This corresponds to the prior adjustments made
				1947	* to the associated reservation map.
				1948	* 2) Free any unused surplus pages that may have been allocated to satisfy
				1949	* the reservation. As many as unused_resv_pages may be freed.
				1950	*
				1951	* Called with hugetlb_lock held. However, the lock could be dropped (and
				1952	* reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
				1953	* we must make sure nobody else can claim pages we are in the process of
				1954	* freeing. Do this by ensuring resv_huge_page always is greater than the
				1955	* number of huge pages we plan to free when dropping the lock.
				1956	*/
				1957	static void return_unused_surplus_pages(struct hstate *h,
				1958	unsigned long unused_resv_pages)
				1959	{
				1960	unsigned long nr_pages;
				1961
				1962	/* Cannot return gigantic pages currently */
				1963	if (hstate_is_gigantic(h))
				1964	goto out;
				1965
				1966	/*
				1967	* Part (or even all) of the reservation could have been backed
				1968	* by pre-allocated pages. Only free surplus pages.
				1969	*/
				1970	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
				1971
				1972	/*
				1973	* We want to release as many surplus pages as possible, spread
				1974	* evenly across all nodes with memory. Iterate across these nodes
				1975	* until we can no longer free unreserved surplus pages. This occurs
				1976	* when the nodes with surplus pages have no free pages.
				1977	* free_pool_huge_page() will balance the the freed pages across the
				1978	* on-line nodes with memory and will handle the hstate accounting.
				1979	*
				1980	* Note that we decrement resv_huge_pages as we free the pages. If
				1981	* we drop the lock, resv_huge_pages will still be sufficiently large
				1982	* to cover subsequent pages we may free.
				1983	*/
				1984	while (nr_pages--) {
				1985	h->resv_huge_pages--;
				1986	unused_resv_pages--;
				1987	if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
				1988	goto out;
				1989	cond_resched_lock(&hugetlb_lock);
				1990	}
				1991
				1992	out:
				1993	/* Fully uncommit the reservation */
				1994	h->resv_huge_pages -= unused_resv_pages;
				1995	}
				1996
				1997
				1998	/*
				1999	* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
				2000	* are used by the huge page allocation routines to manage reservations.
				2001	*
				2002	* vma_needs_reservation is called to determine if the huge page at addr
				2003	* within the vma has an associated reservation. If a reservation is
				2004	* needed, the value 1 is returned. The caller is then responsible for
				2005	* managing the global reservation and subpool usage counts. After
				2006	* the huge page has been allocated, vma_commit_reservation is called
				2007	* to add the page to the reservation map. If the page allocation fails,
				2008	* the reservation must be ended instead of committed. vma_end_reservation
				2009	* is called in such cases.
				2010	*
				2011	* In the normal case, vma_commit_reservation returns the same value
				2012	* as the preceding vma_needs_reservation call. The only time this
				2013	* is not the case is if a reserve map was changed between calls. It
				2014	* is the responsibility of the caller to notice the difference and
				2015	* take appropriate action.
				2016	*
				2017	* vma_add_reservation is used in error paths where a reservation must
				2018	* be restored when a newly allocated huge page must be freed. It is
				2019	* to be called after calling vma_needs_reservation to determine if a
				2020	* reservation exists.
				2021	*/
				2022	enum vma_resv_mode {
				2023	VMA_NEEDS_RESV,
				2024	VMA_COMMIT_RESV,
				2025	VMA_END_RESV,
				2026	VMA_ADD_RESV,
				2027	};
				2028	static long __vma_reservation_common(struct hstate *h,
				2029	struct vm_area_struct *vma, unsigned long addr,
				2030	enum vma_resv_mode mode)
				2031	{
				2032	struct resv_map *resv;
				2033	pgoff_t idx;
				2034	long ret;
				2035
				2036	resv = vma_resv_map(vma);
				2037	if (!resv)
				2038	return 1;
				2039
				2040	idx = vma_hugecache_offset(h, vma, addr);
				2041	switch (mode) {
				2042	case VMA_NEEDS_RESV:
				2043	ret = region_chg(resv, idx, idx + 1);
				2044	break;
				2045	case VMA_COMMIT_RESV:
				2046	ret = region_add(resv, idx, idx + 1);
				2047	break;
				2048	case VMA_END_RESV:
				2049	region_abort(resv, idx, idx + 1);
				2050	ret = 0;
				2051	break;
				2052	case VMA_ADD_RESV:
				2053	if (vma->vm_flags & VM_MAYSHARE)
				2054	ret = region_add(resv, idx, idx + 1);
				2055	else {
				2056	region_abort(resv, idx, idx + 1);
				2057	ret = region_del(resv, idx, idx + 1);
				2058	}
				2059	break;
				2060	default:
				2061	BUG();
				2062	}
				2063
				2064	if (vma->vm_flags & VM_MAYSHARE)
				2065	return ret;
				2066	else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
				2067	/*
				2068	* In most cases, reserves always exist for private mappings.
				2069	* However, a file associated with mapping could have been
				2070	* hole punched or truncated after reserves were consumed.
				2071	* As subsequent fault on such a range will not use reserves.
				2072	* Subtle - The reserve map for private mappings has the
				2073	* opposite meaning than that of shared mappings. If NO
				2074	* entry is in the reserve map, it means a reservation exists.
				2075	* If an entry exists in the reserve map, it means the
				2076	* reservation has already been consumed. As a result, the
				2077	* return value of this routine is the opposite of the
				2078	* value returned from reserve map manipulation routines above.
				2079	*/
				2080	if (ret)
				2081	return 0;
				2082	else
				2083	return 1;
				2084	}
				2085	else
				2086	return ret < 0 ? ret : 0;
				2087	}
				2088
				2089	static long vma_needs_reservation(struct hstate *h,
				2090	struct vm_area_struct *vma, unsigned long addr)
				2091	{
				2092	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
				2093	}
				2094
				2095	static long vma_commit_reservation(struct hstate *h,
				2096	struct vm_area_struct *vma, unsigned long addr)
				2097	{
				2098	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
				2099	}
				2100
				2101	static void vma_end_reservation(struct hstate *h,
				2102	struct vm_area_struct *vma, unsigned long addr)
				2103	{
				2104	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
				2105	}
				2106
				2107	static long vma_add_reservation(struct hstate *h,
				2108	struct vm_area_struct *vma, unsigned long addr)
				2109	{
				2110	return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
				2111	}
				2112
				/*
				 * Restore a reservation on error paths where a huge page obtained from
				 * alloc_huge_page is about to be freed again.  If a reservation for the
				 * page existed, alloc_huge_page consumed it and set PagePrivate on the
				 * page.  free_huge_page will increment the global reservation count when
				 * PagePrivate is set, but it can not adjust the reserve map.  Bring the
				 * reserve map in line with that upcoming global adjustment here.
				 */
				static void restore_reserve_on_error(struct hstate *h,
							struct vm_area_struct *vma, unsigned long address,
							struct page *page)
				{
					long rc;

					/* No reservation was consumed for this page: nothing to restore. */
					if (likely(!PagePrivate(page)))
						return;

					rc = vma_needs_reservation(h, vma, address);
					if (unlikely(rc < 0)) {
						/*
						 * Rare out of memory condition in reserve map manipulation.
						 * Clear PagePrivate so that the global reserve count will
						 * not be incremented by free_huge_page.  This makes it
						 * appear as though the reservation for this page was
						 * consumed, which may prevent the task from faulting in the
						 * page later.  That is better than inconsistent global huge
						 * page accounting of reserve counts.
						 */
						ClearPagePrivate(page);
						return;
					}

					if (!rc) {
						vma_end_reservation(h, vma, address);
						return;
					}

					rc = vma_add_reservation(h, vma, address);
					if (unlikely(rc < 0)) {
						/* Same rare out of memory condition as above. */
						ClearPagePrivate(page);
					}
				}
				2156
				2157	struct page alloc_huge_page(struct vm_area_struct vma,
				2158	unsigned long addr, int avoid_reserve)
				2159	{
				2160	struct hugepage_subpool *spool = subpool_vma(vma);
				2161	struct hstate *h = hstate_vma(vma);
				2162	struct page *page;
				2163	long map_chg, map_commit;
				2164	long gbl_chg;
				2165	int ret, idx;
				2166	struct hugetlb_cgroup *h_cg;
				2167
				2168	idx = hstate_index(h);
				2169	/*
				2170	* Examine the region/reserve map to determine if the process
				2171	* has a reservation for the page to be allocated. A return
				2172	* code of zero indicates a reservation exists (no change).
				2173	*/
				2174	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
				2175	if (map_chg < 0)
				2176	return ERR_PTR(-ENOMEM);
				2177
				2178	/*
				2179	* Processes that did not create the mapping will have no
				2180	* reserves as indicated by the region/reserve map. Check
				2181	* that the allocation will not exceed the subpool limit.
				2182	* Allocations for MAP_NORESERVE mappings also need to be
				2183	* checked against any subpool limit.
				2184	*/
				2185	if (map_chg \|\| avoid_reserve) {
				2186	gbl_chg = hugepage_subpool_get_pages(spool, 1);
				2187	if (gbl_chg < 0) {
				2188	vma_end_reservation(h, vma, addr);
				2189	return ERR_PTR(-ENOSPC);
				2190	}
				2191
				2192	/*
				2193	* Even though there was no reservation in the region/reserve
				2194	* map, there could be reservations associated with the
				2195	* subpool that can be used. This would be indicated if the
				2196	* return value of hugepage_subpool_get_pages() is zero.
				2197	* However, if avoid_reserve is specified we still avoid even
				2198	* the subpool reservations.
				2199	*/
				2200	if (avoid_reserve)
				2201	gbl_chg = 1;
				2202	}
				2203
				2204	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
				2205	if (ret)
				2206	goto out_subpool_put;
				2207
				2208	spin_lock(&hugetlb_lock);
				2209	/*
				2210	* glb_chg is passed to indicate whether or not a page must be taken
				2211	* from the global free pool (global change). gbl_chg == 0 indicates
				2212	* a reservation exists for the allocation.
				2213	*/
				2214	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
				2215	if (!page) {
				2216	spin_unlock(&hugetlb_lock);
				2217	page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
				2218	if (!page)
				2219	goto out_uncharge_cgroup;
				2220	spin_lock(&hugetlb_lock);
				2221	if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
				2222	SetPagePrivate(page);
				2223	h->resv_huge_pages--;
				2224	}
				2225	list_move(&page->lru, &h->hugepage_activelist);
				2226	/* Fall through */
				2227	}
				2228	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
				2229	spin_unlock(&hugetlb_lock);
				2230
				2231	set_page_private(page, (unsigned long)spool);
				2232
				2233	map_commit = vma_commit_reservation(h, vma, addr);
				2234	if (unlikely(map_chg > map_commit)) {
				2235	/*
				2236	* The page was added to the reservation map between
				2237	* vma_needs_reservation and vma_commit_reservation.
				2238	* This indicates a race with hugetlb_reserve_pages.
				2239	* Adjust for the subpool count incremented above AND
				2240	* in hugetlb_reserve_pages for the same page. Also,
				2241	* the reservation count added in hugetlb_reserve_pages
				2242	* no longer applies.
				2243	*/
				2244	long rsv_adjust;
				2245
				2246	rsv_adjust = hugepage_subpool_put_pages(spool, 1);
				2247	hugetlb_acct_memory(h, -rsv_adjust);
				2248	}
				2249	return page;
				2250
				2251	out_uncharge_cgroup:
				2252	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
				2253	out_subpool_put:
				2254	if (map_chg \|\| avoid_reserve)
				2255	hugepage_subpool_put_pages(spool, 1);
				2256	vma_end_reservation(h, vma, addr);
				2257	return ERR_PTR(-ENOSPC);
				2258	}
				2259
				2260	int alloc_bootmem_huge_page(struct hstate *h)
				2261	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
				2262	int __alloc_bootmem_huge_page(struct hstate *h)
				2263	{
				2264	struct huge_bootmem_page *m;
				2265	int nr_nodes, node;
				2266
				2267	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
				2268	void *addr;
				2269
				2270	addr = memblock_alloc_try_nid_raw(
				2271	huge_page_size(h), huge_page_size(h),
				2272	0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
				2273	if (addr) {
				2274	/*
				2275	* Use the beginning of the huge page to store the
				2276	* huge_bootmem_page struct (until gather_bootmem
				2277	* puts them into the mem_map).
				2278	*/
				2279	m = addr;
				2280	goto found;
				2281	}
				2282	}
				2283	return 0;
				2284
				2285	found:
				2286	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
				2287	/* Put them into a private list first because mem_map is not up yet */
				2288	INIT_LIST_HEAD(&m->list);
				2289	list_add(&m->list, &huge_boot_pages);
				2290	m->hstate = h;
				2291	return 1;
				2292	}
				2293
				2294	static void __init prep_compound_huge_page(struct page *page,
				2295	unsigned int order)
				2296	{
				2297	if (unlikely(order > (MAX_ORDER - 1)))
				2298	prep_compound_gigantic_page(page, order);
				2299	else
				2300	prep_compound_page(page, order);
				2301	}
				2302
				2303	/* Put bootmem huge pages into the standard lists after mem_map is up */
				2304	static void __init gather_bootmem_prealloc(void)
				2305	{
				2306	struct huge_bootmem_page *m;
				2307
				2308	list_for_each_entry(m, &huge_boot_pages, list) {
				2309	struct page *page = virt_to_page(m);
				2310	struct hstate *h = m->hstate;
				2311
				2312	WARN_ON(page_count(page) != 1);
				2313	prep_compound_huge_page(page, h->order);
				2314	WARN_ON(PageReserved(page));
				2315	prep_new_huge_page(h, page, page_to_nid(page));
				2316	put_page(page); /* free it into the hugepage allocator */
				2317
				2318	/*
				2319	* If we had gigantic hugepages allocated at boot time, we need
				2320	* to restore the 'stolen' pages to totalram_pages in order to
				2321	* fix confusing memory reports from free(1) and another
				2322	* side-effects, like CommitLimit going negative.
				2323	*/
				2324	if (hstate_is_gigantic(h))
				2325	adjust_managed_page_count(page, 1 << h->order);
				2326	cond_resched();
				2327	}
				2328	}
				2329
				2330	static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
				2331	{
				2332	unsigned long i;
				2333	nodemask_t *node_alloc_noretry;
				2334
				2335	if (!hstate_is_gigantic(h)) {
				2336	/*
				2337	* Bit mask controlling how hard we retry per-node allocations.
				2338	* Ignore errors as lower level routines can deal with
				2339	* node_alloc_noretry == NULL. If this kmalloc fails at boot
				2340	* time, we are likely in bigger trouble.
				2341	*/
				2342	node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
				2343	GFP_KERNEL);
				2344	} else {
				2345	/* allocations done at boot time */
				2346	node_alloc_noretry = NULL;
				2347	}
				2348
				2349	/* bit mask controlling how hard we retry per-node allocations */
				2350	if (node_alloc_noretry)
				2351	nodes_clear(*node_alloc_noretry);
				2352
				2353	for (i = 0; i < h->max_huge_pages; ++i) {
				2354	if (hstate_is_gigantic(h)) {
				2355	if (!alloc_bootmem_huge_page(h))
				2356	break;
				2357	} else if (!alloc_pool_huge_page(h,
				2358	&node_states[N_MEMORY],
				2359	node_alloc_noretry))
				2360	break;
				2361	cond_resched();
				2362	}
				2363	if (i < h->max_huge_pages) {
				2364	char buf[32];
				2365
				2366	string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
				2367	pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
				2368	h->max_huge_pages, buf, i);
				2369	h->max_huge_pages = i;
				2370	}
				2371
				2372	kfree(node_alloc_noretry);
				2373	}
				2374
				2375	static void __init hugetlb_init_hstates(void)
				2376	{
				2377	struct hstate *h;
				2378
				2379	for_each_hstate(h) {
				2380	if (minimum_order > huge_page_order(h))
				2381	minimum_order = huge_page_order(h);
				2382
				2383	/* oversize hugepages were init'ed in early boot */
				2384	if (!hstate_is_gigantic(h))
				2385	hugetlb_hstate_alloc_pages(h);
				2386	}
				2387	VM_BUG_ON(minimum_order == UINT_MAX);
				2388	}
				2389
				2390	static void __init report_hugepages(void)
				2391	{
				2392	struct hstate *h;
				2393
				2394	for_each_hstate(h) {
				2395	char buf[32];
				2396
				2397	string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
				2398	pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
				2399	buf, h->free_huge_pages);
				2400	}
				2401	}
				2402
				2403	#ifdef CONFIG_HIGHMEM
				2404	static void try_to_free_low(struct hstate *h, unsigned long count,
				2405	nodemask_t *nodes_allowed)
				2406	{
				2407	int i;
				2408
				2409	if (hstate_is_gigantic(h))
				2410	return;
				2411
				2412	for_each_node_mask(i, *nodes_allowed) {
				2413	struct page page, next;
				2414	struct list_head *freel = &h->hugepage_freelists[i];
				2415	list_for_each_entry_safe(page, next, freel, lru) {
				2416	if (count >= h->nr_huge_pages)
				2417	return;
				2418	if (PageHighMem(page))
				2419	continue;
				2420	list_del(&page->lru);
				2421	update_and_free_page(h, page);
				2422	h->free_huge_pages--;
				2423	h->free_huge_pages_node[page_to_nid(page)]--;
				2424	}
				2425	}
				2426	}
				2427	#else
				2428	static inline void try_to_free_low(struct hstate *h, unsigned long count,
				2429	nodemask_t *nodes_allowed)
				2430	{
				2431	}
				2432	#endif
				2433
				2434	/*
				2435	* Increment or decrement surplus_huge_pages. Keep node-specific counters
				2436	* balanced by operating on them in a round-robin fashion.
				2437	* Returns 1 if an adjustment was made.
				2438	*/
				2439	static int adjust_pool_surplus(struct hstate h, nodemask_t nodes_allowed,
				2440	int delta)
				2441	{
				2442	int nr_nodes, node;
				2443
				2444	VM_BUG_ON(delta != -1 && delta != 1);
				2445
				2446	if (delta < 0) {
				2447	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
				2448	if (h->surplus_huge_pages_node[node])
				2449	goto found;
				2450	}
				2451	} else {
				2452	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
				2453	if (h->surplus_huge_pages_node[node] <
				2454	h->nr_huge_pages_node[node])
				2455	goto found;
				2456	}
				2457	}
				2458	return 0;
				2459
				2460	found:
				2461	h->surplus_huge_pages += delta;
				2462	h->surplus_huge_pages_node[node] += delta;
				2463	return 1;
				2464	}
				2465
				/*
				 * Number of huge pages in the pool that are not surplus pages.  The
				 * argument is parenthesized so that any pointer expression can be passed.
				 */
				#define persistent_huge_pages(h) \
					((h)->nr_huge_pages - (h)->surplus_huge_pages)
				2467	static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
				2468	nodemask_t *nodes_allowed)
				2469	{
				2470	unsigned long min_count, ret;
				2471	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
				2472
				2473	/*
				2474	* Bit mask controlling how hard we retry per-node allocations.
				2475	* If we can not allocate the bit mask, do not attempt to allocate
				2476	* the requested huge pages.
				2477	*/
				2478	if (node_alloc_noretry)
				2479	nodes_clear(*node_alloc_noretry);
				2480	else
				2481	return -ENOMEM;
				2482
				2483	spin_lock(&hugetlb_lock);
				2484
				2485	/*
				2486	* Check for a node specific request.
				2487	* Changing node specific huge page count may require a corresponding
				2488	* change to the global count. In any case, the passed node mask
				2489	* (nodes_allowed) will restrict alloc/free to the specified node.
				2490	*/
				2491	if (nid != NUMA_NO_NODE) {
				2492	unsigned long old_count = count;
				2493
				2494	count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
				2495	/*
				2496	* User may have specified a large count value which caused the
				2497	* above calculation to overflow. In this case, they wanted
				2498	* to allocate as many huge pages as possible. Set count to
				2499	* largest possible value to align with their intention.
				2500	*/
				2501	if (count < old_count)
				2502	count = ULONG_MAX;
				2503	}
				2504
				2505	/*
				2506	* Gigantic pages runtime allocation depend on the capability for large
				2507	* page range allocation.
				2508	* If the system does not provide this feature, return an error when
				2509	* the user tries to allocate gigantic pages but let the user free the
				2510	* boottime allocated gigantic pages.
				2511	*/
				2512	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
				2513	if (count > persistent_huge_pages(h)) {
				2514	spin_unlock(&hugetlb_lock);
				2515	NODEMASK_FREE(node_alloc_noretry);
				2516	return -EINVAL;
				2517	}
				2518	/* Fall through to decrease pool */
				2519	}
				2520
				2521	/*
				2522	* Increase the pool size
				2523	* First take pages out of surplus state. Then make up the
				2524	* remaining difference by allocating fresh huge pages.
				2525	*
				2526	* We might race with alloc_surplus_huge_page() here and be unable
				2527	* to convert a surplus huge page to a normal huge page. That is
				2528	* not critical, though, it just means the overall size of the
				2529	* pool might be one hugepage larger than it needs to be, but
				2530	* within all the constraints specified by the sysctls.
				2531	*/
				2532	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
				2533	if (!adjust_pool_surplus(h, nodes_allowed, -1))
				2534	break;
				2535	}
				2536
				2537	while (count > persistent_huge_pages(h)) {
				2538	/*
				2539	* If this allocation races such that we no longer need the
				2540	* page, free_huge_page will handle it by freeing the page
				2541	* and reducing the surplus.
				2542	*/
				2543	spin_unlock(&hugetlb_lock);
				2544
				2545	/* yield cpu to avoid soft lockup */
				2546	cond_resched();
				2547
				2548	ret = alloc_pool_huge_page(h, nodes_allowed,
				2549	node_alloc_noretry);
				2550	spin_lock(&hugetlb_lock);
				2551	if (!ret)
				2552	goto out;
				2553
				2554	/* Bail for signals. Probably ctrl-c from user */
				2555	if (signal_pending(current))
				2556	goto out;
				2557	}
				2558
				2559	/*
				2560	* Decrease the pool size
				2561	* First return free pages to the buddy allocator (being careful
				2562	* to keep enough around to satisfy reservations). Then place
				2563	* pages into surplus state as needed so the pool will shrink
				2564	* to the desired size as pages become free.
				2565	*
				2566	* By placing pages into the surplus state independent of the
				2567	* overcommit value, we are allowing the surplus pool size to
				2568	* exceed overcommit. There are few sane options here. Since
				2569	* alloc_surplus_huge_page() is checking the global counter,
				2570	* though, we'll note that we're not allowed to exceed surplus
				2571	* and won't grow the pool anywhere else. Not until one of the
				2572	* sysctls are changed, or the surplus pages go out of use.
				2573	*/
				2574	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
				2575	min_count = max(count, min_count);
				2576	try_to_free_low(h, min_count, nodes_allowed);
				2577	while (min_count < persistent_huge_pages(h)) {
				2578	if (!free_pool_huge_page(h, nodes_allowed, 0))
				2579	break;
				2580	cond_resched_lock(&hugetlb_lock);
				2581	}
				2582	while (count < persistent_huge_pages(h)) {
				2583	if (!adjust_pool_surplus(h, nodes_allowed, 1))
				2584	break;
				2585	}
				2586	out:
				2587	h->max_huge_pages = persistent_huge_pages(h);
				2588	spin_unlock(&hugetlb_lock);
				2589
				2590	NODEMASK_FREE(node_alloc_noretry);
				2591
				2592	return 0;
				2593	}
				2594
				2595	#define HSTATE_ATTR_RO(_name) \
				2596	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
				2597
				2598	#define HSTATE_ATTR(_name) \
				2599	static struct kobj_attribute _name##_attr = \
				2600	__ATTR(_name, 0644, _name##_show, _name##_store)
				2601
				2602	static struct kobject *hugepages_kobj;
				2603	static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
				2604
				2605	static struct hstate kobj_to_node_hstate(struct kobject kobj, int *nidp);
				2606
				2607	static struct hstate kobj_to_hstate(struct kobject kobj, int *nidp)
				2608	{
				2609	int i;
				2610
				2611	for (i = 0; i < HUGE_MAX_HSTATE; i++)
				2612	if (hstate_kobjs[i] == kobj) {
				2613	if (nidp)
				2614	*nidp = NUMA_NO_NODE;
				2615	return &hstates[i];
				2616	}
				2617
				2618	return kobj_to_node_hstate(kobj, nidp);
				2619	}
				2620
				2621	static ssize_t nr_hugepages_show_common(struct kobject *kobj,
				2622	struct kobj_attribute attr, char buf)
				2623	{
				2624	struct hstate *h;
				2625	unsigned long nr_huge_pages;
				2626	int nid;
				2627
				2628	h = kobj_to_hstate(kobj, &nid);
				2629	if (nid == NUMA_NO_NODE)
				2630	nr_huge_pages = h->nr_huge_pages;
				2631	else
				2632	nr_huge_pages = h->nr_huge_pages_node[nid];
				2633
				2634	return sprintf(buf, "%lu\n", nr_huge_pages);
				2635	}
				2636
				2637	static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
				2638	struct hstate *h, int nid,
				2639	unsigned long count, size_t len)
				2640	{
				2641	int err;
				2642	nodemask_t nodes_allowed, *n_mask;
				2643
				2644	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
				2645	return -EINVAL;
				2646
				2647	if (nid == NUMA_NO_NODE) {
				2648	/*
				2649	* global hstate attribute
				2650	*/
				2651	if (!(obey_mempolicy &&
				2652	init_nodemask_of_mempolicy(&nodes_allowed)))
				2653	n_mask = &node_states[N_MEMORY];
				2654	else
				2655	n_mask = &nodes_allowed;
				2656	} else {
				2657	/*
				2658	* Node specific request. count adjustment happens in
				2659	* set_max_huge_pages() after acquiring hugetlb_lock.
				2660	*/
				2661	init_nodemask_of_node(&nodes_allowed, nid);
				2662	n_mask = &nodes_allowed;
				2663	}
				2664
				2665	err = set_max_huge_pages(h, count, nid, n_mask);
				2666
				2667	return err ? err : len;
				2668	}
				2669
				2670	static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
				2671	struct kobject kobj, const char buf,
				2672	size_t len)
				2673	{
				2674	struct hstate *h;
				2675	unsigned long count;
				2676	int nid;
				2677	int err;
				2678
				2679	err = kstrtoul(buf, 10, &count);
				2680	if (err)
				2681	return err;
				2682
				2683	h = kobj_to_hstate(kobj, &nid);
				2684	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
				2685	}
				2686
				2687	static ssize_t nr_hugepages_show(struct kobject *kobj,
				2688	struct kobj_attribute attr, char buf)
				2689	{
				2690	return nr_hugepages_show_common(kobj, attr, buf);
				2691	}
				2692
				2693	static ssize_t nr_hugepages_store(struct kobject *kobj,
				2694	struct kobj_attribute attr, const char buf, size_t len)
				2695	{
				2696	return nr_hugepages_store_common(false, kobj, buf, len);
				2697	}
				2698	HSTATE_ATTR(nr_hugepages);
				2699
				#ifdef CONFIG_NUMA

				/*
				 * hstate attribute for optionally mempolicy-based constraint on persistent
				 * huge page alloc/free.
				 */
				static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
								       struct kobj_attribute *attr, char *buf)
				{
					return nr_hugepages_show_common(kobj, attr, buf);
				}

				/* Store handler: same as nr_hugepages but with obey_mempolicy set. */
				static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
					       struct kobj_attribute *attr, const char *buf, size_t len)
				{
					return nr_hugepages_store_common(true, kobj, buf, len);
				}
				HSTATE_ATTR(nr_hugepages_mempolicy);
				#endif
				2719
				2720
				2721	static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
				2722	struct kobj_attribute attr, char buf)
				2723	{
				2724	struct hstate *h = kobj_to_hstate(kobj, NULL);
				2725	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
				2726	}
				2727
				2728	static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
				2729	struct kobj_attribute attr, const char buf, size_t count)
				2730	{
				2731	int err;
				2732	unsigned long input;
				2733	struct hstate *h = kobj_to_hstate(kobj, NULL);
				2734
				2735	if (hstate_is_gigantic(h))
				2736	return -EINVAL;
				2737
				2738	err = kstrtoul(buf, 10, &input);
				2739	if (err)
				2740	return err;
				2741
				2742	spin_lock(&hugetlb_lock);
				2743	h->nr_overcommit_huge_pages = input;
				2744	spin_unlock(&hugetlb_lock);
				2745
				2746	return count;
				2747	}
				2748	HSTATE_ATTR(nr_overcommit_hugepages);
				2749
				2750	static ssize_t free_hugepages_show(struct kobject *kobj,
				2751	struct kobj_attribute attr, char buf)
				2752	{
				2753	struct hstate *h;
				2754	unsigned long free_huge_pages;
				2755	int nid;
				2756
				2757	h = kobj_to_hstate(kobj, &nid);
				2758	if (nid == NUMA_NO_NODE)
				2759	free_huge_pages = h->free_huge_pages;
				2760	else
				2761	free_huge_pages = h->free_huge_pages_node[nid];
				2762
				2763	return sprintf(buf, "%lu\n", free_huge_pages);
				2764	}
				2765	HSTATE_ATTR_RO(free_hugepages);
				2766
				2767	static ssize_t resv_hugepages_show(struct kobject *kobj,
				2768	struct kobj_attribute attr, char buf)
				2769	{
				2770	struct hstate *h = kobj_to_hstate(kobj, NULL);
				2771	return sprintf(buf, "%lu\n", h->resv_huge_pages);
				2772	}
				2773	HSTATE_ATTR_RO(resv_hugepages);
				2774
				2775	static ssize_t surplus_hugepages_show(struct kobject *kobj,
				2776	struct kobj_attribute attr, char buf)
				2777	{
				2778	struct hstate *h;
				2779	unsigned long surplus_huge_pages;
				2780	int nid;
				2781
				2782	h = kobj_to_hstate(kobj, &nid);
				2783	if (nid == NUMA_NO_NODE)
				2784	surplus_huge_pages = h->surplus_huge_pages;
				2785	else
				2786	surplus_huge_pages = h->surplus_huge_pages_node[nid];
				2787
				2788	return sprintf(buf, "%lu\n", surplus_huge_pages);
				2789	}
				2790	HSTATE_ATTR_RO(surplus_hugepages);
				2791
				2792	static struct attribute *hstate_attrs[] = {
				2793	&nr_hugepages_attr.attr,
				2794	&nr_overcommit_hugepages_attr.attr,
				2795	&free_hugepages_attr.attr,
				2796	&resv_hugepages_attr.attr,
				2797	&surplus_hugepages_attr.attr,
				2798	#ifdef CONFIG_NUMA
				2799	&nr_hugepages_mempolicy_attr.attr,
				2800	#endif
				2801	NULL,
				2802	};
				2803
				2804	static const struct attribute_group hstate_attr_group = {
				2805	.attrs = hstate_attrs,
				2806	};
				2807
				2808	static int hugetlb_sysfs_add_hstate(struct hstate h, struct kobject parent,
				2809	struct kobject **hstate_kobjs,
				2810	const struct attribute_group *hstate_attr_group)
				2811	{
				2812	int retval;
				2813	int hi = hstate_index(h);
				2814
				2815	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
				2816	if (!hstate_kobjs[hi])
				2817	return -ENOMEM;
				2818
				2819	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
				2820	if (retval) {
				2821	kobject_put(hstate_kobjs[hi]);
				2822	hstate_kobjs[hi] = NULL;
				2823	}
				2824
				2825	return retval;
				2826	}
				2827
				2828	static void __init hugetlb_sysfs_init(void)
				2829	{
				2830	struct hstate *h;
				2831	int err;
				2832
				2833	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
				2834	if (!hugepages_kobj)
				2835	return;
				2836
				2837	for_each_hstate(h) {
				2838	err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
				2839	hstate_kobjs, &hstate_attr_group);
				2840	if (err)
				2841	pr_err("Hugetlb: Unable to add hstate %s", h->name);
				2842	}
				2843	}
				2844
				2845	#ifdef CONFIG_NUMA
				2846
				2847	/*
				2848	* node_hstate/s - associate per node hstate attributes, via their kobjects,
				2849	* with node devices in node_devices[] using a parallel array. The array
				2850	* index of a node device or _hstate == node id.
				2851	* This is here to avoid any static dependency of the node device driver, in
				2852	* the base kernel, on the hugetlb module.
				2853	*/
				2854	struct node_hstate {
				2855	struct kobject *hugepages_kobj;
				2856	struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
				2857	};
				2858	static struct node_hstate node_hstates[MAX_NUMNODES];
				2859
				2860	/*
				2861	* A subset of global hstate attributes for node devices
				2862	*/
				2863	static struct attribute *per_node_hstate_attrs[] = {
				2864	&nr_hugepages_attr.attr,
				2865	&free_hugepages_attr.attr,
				2866	&surplus_hugepages_attr.attr,
				2867	NULL,
				2868	};
				2869
				2870	static const struct attribute_group per_node_hstate_attr_group = {
				2871	.attrs = per_node_hstate_attrs,
				2872	};
				2873
				2874	/*
				2875	* kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
				2876	* Returns node id via non-NULL nidp.
				2877	*/
				2878	static struct hstate kobj_to_node_hstate(struct kobject kobj, int *nidp)
				2879	{
				2880	int nid;
				2881
				2882	for (nid = 0; nid < nr_node_ids; nid++) {
				2883	struct node_hstate *nhs = &node_hstates[nid];
				2884	int i;
				2885	for (i = 0; i < HUGE_MAX_HSTATE; i++)
				2886	if (nhs->hstate_kobjs[i] == kobj) {
				2887	if (nidp)
				2888	*nidp = nid;
				2889	return &hstates[i];
				2890	}
				2891	}
				2892
				2893	BUG();
				2894	return NULL;
				2895	}
				2896
				2897	/*
				2898	* Unregister hstate attributes from a single node device.
				2899	* No-op if no hstate attributes attached.
				2900	*/
				2901	static void hugetlb_unregister_node(struct node *node)
				2902	{
				2903	struct hstate *h;
				2904	struct node_hstate *nhs = &node_hstates[node->dev.id];
				2905
				2906	if (!nhs->hugepages_kobj)
				2907	return; /* no hstate attributes */
				2908
				2909	for_each_hstate(h) {
				2910	int idx = hstate_index(h);
				2911	if (nhs->hstate_kobjs[idx]) {
				2912	kobject_put(nhs->hstate_kobjs[idx]);
				2913	nhs->hstate_kobjs[idx] = NULL;
				2914	}
				2915	}
				2916
				2917	kobject_put(nhs->hugepages_kobj);
				2918	nhs->hugepages_kobj = NULL;
				2919	}
				2920
				2921
				2922	/*
				2923	* Register hstate attributes for a single node device.
				2924	* No-op if attributes already registered.
				2925	*/
				2926	static void hugetlb_register_node(struct node *node)
				2927	{
				2928	struct hstate *h;
				2929	struct node_hstate *nhs = &node_hstates[node->dev.id];
				2930	int err;
				2931
				2932	if (nhs->hugepages_kobj)
				2933	return; /* already allocated */
				2934
				2935	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
				2936	&node->dev.kobj);
				2937	if (!nhs->hugepages_kobj)
				2938	return;
				2939
				2940	for_each_hstate(h) {
				2941	err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
				2942	nhs->hstate_kobjs,
				2943	&per_node_hstate_attr_group);
				2944	if (err) {
				2945	pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
				2946	h->name, node->dev.id);
				2947	hugetlb_unregister_node(node);
				2948	break;
				2949	}
				2950	}
				2951	}
				2952
				2953	/*
				2954	* hugetlb init time: register hstate attributes for all registered node
				2955	* devices of nodes that have memory. All on-line nodes should have
				2956	* registered their associated device by this time.
				2957	*/
				2958	static void __init hugetlb_register_all_nodes(void)
				2959	{
				2960	int nid;
				2961
				2962	for_each_node_state(nid, N_MEMORY) {
				2963	struct node *node = node_devices[nid];
				2964	if (node->dev.id == nid)
				2965	hugetlb_register_node(node);
				2966	}
				2967
				2968	/*
				2969	* Let the node device driver know we're here so it can
				2970	* [un]register hstate attributes on node hotplug.
				2971	*/
				2972	register_hugetlbfs_with_node(hugetlb_register_node,
				2973	hugetlb_unregister_node);
				2974	}
				2975	#else /* !CONFIG_NUMA */
				2976
				2977	static struct hstate kobj_to_node_hstate(struct kobject kobj, int *nidp)
				2978	{
				2979	BUG();
				2980	if (nidp)
				2981	*nidp = -1;
				2982	return NULL;
				2983	}
				2984
				2985	static void hugetlb_register_all_nodes(void) { }
				2986
				2987	#endif
				2988
				2989	static int __init hugetlb_init(void)
				2990	{
				2991	int i;
				2992
				2993	if (!hugepages_supported())
				2994	return 0;
				2995
				2996	if (!size_to_hstate(default_hstate_size)) {
				2997	if (default_hstate_size != 0) {
				2998	pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
				2999	default_hstate_size, HPAGE_SIZE);
				3000	}
				3001
				3002	default_hstate_size = HPAGE_SIZE;
				3003	if (!size_to_hstate(default_hstate_size))
				3004	hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
				3005	}
				3006	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
				3007	if (default_hstate_max_huge_pages) {
				3008	if (!default_hstate.max_huge_pages)
				3009	default_hstate.max_huge_pages = default_hstate_max_huge_pages;
				3010	}
				3011
				3012	hugetlb_init_hstates();
				3013	gather_bootmem_prealloc();
				3014	report_hugepages();
				3015
				3016	hugetlb_sysfs_init();
				3017	hugetlb_register_all_nodes();
				3018	hugetlb_cgroup_file_init();
				3019
				3020	#ifdef CONFIG_SMP
				3021	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
				3022	#else
				3023	num_fault_mutexes = 1;
				3024	#endif
				3025	hugetlb_fault_mutex_table =
				3026	kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
				3027	GFP_KERNEL);
				3028	BUG_ON(!hugetlb_fault_mutex_table);
				3029
				3030	for (i = 0; i < num_fault_mutexes; i++)
				3031	mutex_init(&hugetlb_fault_mutex_table[i]);
				3032	return 0;
				3033	}
				3034	subsys_initcall(hugetlb_init);
				3035
				3036	/* Should be called on processing a hugepagesz=... option */
				3037	void __init hugetlb_bad_size(void)
				3038	{
				3039	parsed_valid_hugepagesz = false;
				3040	}
				3041
				3042	void __init hugetlb_add_hstate(unsigned int order)
				3043	{
				3044	struct hstate *h;
				3045	unsigned long i;
				3046
				3047	if (size_to_hstate(PAGE_SIZE << order)) {
				3048	pr_warn("hugepagesz= specified twice, ignoring\n");
				3049	return;
				3050	}
				3051	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
				3052	BUG_ON(order == 0);
				3053	h = &hstates[hugetlb_max_hstate++];
				3054	h->order = order;
				3055	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
				3056	h->nr_huge_pages = 0;
				3057	h->free_huge_pages = 0;
				3058	for (i = 0; i < MAX_NUMNODES; ++i)
				3059	INIT_LIST_HEAD(&h->hugepage_freelists[i]);
				3060	INIT_LIST_HEAD(&h->hugepage_activelist);
				3061	h->next_nid_to_alloc = first_memory_node;
				3062	h->next_nid_to_free = first_memory_node;
				3063	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
				3064	huge_page_size(h)/1024);
				3065
				3066	parsed_hstate = h;
				3067	}
				3068
				3069	static int __init hugetlb_nrpages_setup(char *s)
				3070	{
				3071	unsigned long *mhp;
				3072	static unsigned long *last_mhp;
				3073
				3074	if (!parsed_valid_hugepagesz) {
				3075	pr_warn("hugepages = %s preceded by "
				3076	"an unsupported hugepagesz, ignoring\n", s);
				3077	parsed_valid_hugepagesz = true;
				3078	return 1;
				3079	}
				3080	/*
				3081	* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
				3082	* so this hugepages= parameter goes to the "default hstate".
				3083	*/
				3084	else if (!hugetlb_max_hstate)
				3085	mhp = &default_hstate_max_huge_pages;
				3086	else
				3087	mhp = &parsed_hstate->max_huge_pages;
				3088
				3089	if (mhp == last_mhp) {
				3090	pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
				3091	return 1;
				3092	}
				3093
				3094	if (sscanf(s, "%lu", mhp) <= 0)
				3095	*mhp = 0;
				3096
				3097	/*
				3098	* Global state is always initialized later in hugetlb_init.
				3099	* But we need to allocate >= MAX_ORDER hstates here early to still
				3100	* use the bootmem allocator.
				3101	*/
				3102	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
				3103	hugetlb_hstate_alloc_pages(parsed_hstate);
				3104
				3105	last_mhp = mhp;
				3106
				3107	return 1;
				3108	}
				3109	__setup("hugepages=", hugetlb_nrpages_setup);
				3110
				3111	static int __init hugetlb_default_setup(char *s)
				3112	{
				3113	default_hstate_size = memparse(s, &s);
				3114	return 1;
				3115	}
				3116	__setup("default_hugepagesz=", hugetlb_default_setup);
				3117
				3118	static unsigned int cpuset_mems_nr(unsigned int *array)
				3119	{
				3120	int node;
				3121	unsigned int nr = 0;
				3122
				3123	for_each_node_mask(node, cpuset_current_mems_allowed)
				3124	nr += array[node];
				3125
				3126	return nr;
				3127	}
				3128
				3129	#ifdef CONFIG_SYSCTL
				3130	static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
				3131	void buffer, size_t length,
				3132	loff_t ppos, unsigned long out)
				3133	{
				3134	struct ctl_table dup_table;
				3135
				3136	/*
				3137	* In order to avoid races with __do_proc_doulongvec_minmax(), we
				3138	* can duplicate the @table and alter the duplicate of it.
				3139	*/
				3140	dup_table = *table;
				3141	dup_table.data = out;
				3142
				3143	return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
				3144	}
				3145
				3146	static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
				3147	struct ctl_table *table, int write,
				3148	void __user buffer, size_t length, loff_t *ppos)
				3149	{
				3150	struct hstate *h = &default_hstate;
				3151	unsigned long tmp = h->max_huge_pages;
				3152	int ret;
				3153
				3154	if (!hugepages_supported())
				3155	return -EOPNOTSUPP;
				3156
				3157	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
				3158	&tmp);
				3159	if (ret)
				3160	goto out;
				3161
				3162	if (write)
				3163	ret = __nr_hugepages_store_common(obey_mempolicy, h,
				3164	NUMA_NO_NODE, tmp, *length);
				3165	out:
				3166	return ret;
				3167	}
				3168
				3169	int hugetlb_sysctl_handler(struct ctl_table *table, int write,
				3170	void __user buffer, size_t length, loff_t *ppos)
				3171	{
				3172
				3173	return hugetlb_sysctl_handler_common(false, table, write,
				3174	buffer, length, ppos);
				3175	}
				3176
				3177	#ifdef CONFIG_NUMA
				3178	int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
				3179	void __user buffer, size_t length, loff_t *ppos)
				3180	{
				3181	return hugetlb_sysctl_handler_common(true, table, write,
				3182	buffer, length, ppos);
				3183	}
				3184	#endif /* CONFIG_NUMA */
				3185
				3186	int hugetlb_overcommit_handler(struct ctl_table *table, int write,
				3187	void __user *buffer,
				3188	size_t length, loff_t ppos)
				3189	{
				3190	struct hstate *h = &default_hstate;
				3191	unsigned long tmp;
				3192	int ret;
				3193
				3194	if (!hugepages_supported())
				3195	return -EOPNOTSUPP;
				3196
				3197	tmp = h->nr_overcommit_huge_pages;
				3198
				3199	if (write && hstate_is_gigantic(h))
				3200	return -EINVAL;
				3201
				3202	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
				3203	&tmp);
				3204	if (ret)
				3205	goto out;
				3206
				3207	if (write) {
				3208	spin_lock(&hugetlb_lock);
				3209	h->nr_overcommit_huge_pages = tmp;
				3210	spin_unlock(&hugetlb_lock);
				3211	}
				3212	out:
				3213	return ret;
				3214	}
				3215
				3216	#endif /* CONFIG_SYSCTL */
				3217
				3218	void hugetlb_report_meminfo(struct seq_file *m)
				3219	{
				3220	struct hstate *h;
				3221	unsigned long total = 0;
				3222
				3223	if (!hugepages_supported())
				3224	return;
				3225
				3226	for_each_hstate(h) {
				3227	unsigned long count = h->nr_huge_pages;
				3228
				3229	total += (PAGE_SIZE << huge_page_order(h)) * count;
				3230
				3231	if (h == &default_hstate)
				3232	seq_printf(m,
				3233	"HugePages_Total: %5lu\n"
				3234	"HugePages_Free: %5lu\n"
				3235	"HugePages_Rsvd: %5lu\n"
				3236	"HugePages_Surp: %5lu\n"
				3237	"Hugepagesize: %8lu kB\n",
				3238	count,
				3239	h->free_huge_pages,
				3240	h->resv_huge_pages,
				3241	h->surplus_huge_pages,
				3242	(PAGE_SIZE << huge_page_order(h)) / 1024);
				3243	}
				3244
				3245	seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
				3246	}
				3247
				3248	int hugetlb_report_node_meminfo(int nid, char *buf)
				3249	{
				3250	struct hstate *h = &default_hstate;
				3251	if (!hugepages_supported())
				3252	return 0;
				3253	return sprintf(buf,
				3254	"Node %d HugePages_Total: %5u\n"
				3255	"Node %d HugePages_Free: %5u\n"
				3256	"Node %d HugePages_Surp: %5u\n",
				3257	nid, h->nr_huge_pages_node[nid],
				3258	nid, h->free_huge_pages_node[nid],
				3259	nid, h->surplus_huge_pages_node[nid]);
				3260	}
				3261
				3262	void hugetlb_show_meminfo(void)
				3263	{
				3264	struct hstate *h;
				3265	int nid;
				3266
				3267	if (!hugepages_supported())
				3268	return;
				3269
				3270	for_each_node_state(nid, N_MEMORY)
				3271	for_each_hstate(h)
				3272	pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
				3273	nid,
				3274	h->nr_huge_pages_node[nid],
				3275	h->free_huge_pages_node[nid],
				3276	h->surplus_huge_pages_node[nid],
				3277	1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
				3278	}
				3279
				3280	void hugetlb_report_usage(struct seq_file m, struct mm_struct mm)
				3281	{
				3282	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
				3283	atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
				3284	}
				3285
				3286	/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
				3287	unsigned long hugetlb_total_pages(void)
				3288	{
				3289	struct hstate *h;
				3290	unsigned long nr_total_pages = 0;
				3291
				3292	for_each_hstate(h)
				3293	nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
				3294	return nr_total_pages;
				3295	}
				3296
				3297	static int hugetlb_acct_memory(struct hstate *h, long delta)
				3298	{
				3299	int ret = -ENOMEM;
				3300
				3301	spin_lock(&hugetlb_lock);
				3302	/*
				3303	* When cpuset is configured, it breaks the strict hugetlb page
				3304	* reservation as the accounting is done on a global variable. Such
				3305	* reservation is completely rubbish in the presence of cpuset because
				3306	* the reservation is not checked against page availability for the
				3307	* current cpuset. Application can still potentially OOM'ed by kernel
				3308	* with lack of free htlb page in cpuset that the task is in.
				3309	* Attempt to enforce strict accounting with cpuset is almost
				3310	* impossible (or too ugly) because cpuset is too fluid that
				3311	* task or memory node can be dynamically moved between cpusets.
				3312	*
				3313	* The change of semantics for shared hugetlb mapping with cpuset is
				3314	* undesirable. However, in order to preserve some of the semantics,
				3315	* we fall back to check against current free page availability as
				3316	* a best attempt and hopefully to minimize the impact of changing
				3317	* semantics that cpuset has.
				3318	*/
				3319	if (delta > 0) {
				3320	if (gather_surplus_pages(h, delta) < 0)
				3321	goto out;
				3322
				3323	if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
				3324	return_unused_surplus_pages(h, delta);
				3325	goto out;
				3326	}
				3327	}
				3328
				3329	ret = 0;
				3330	if (delta < 0)
				3331	return_unused_surplus_pages(h, (unsigned long) -delta);
				3332
				3333	out:
				3334	spin_unlock(&hugetlb_lock);
				3335	return ret;
				3336	}
				3337
				3338	static void hugetlb_vm_op_open(struct vm_area_struct *vma)
				3339	{
				3340	struct resv_map *resv = vma_resv_map(vma);
				3341
				3342	/*
				3343	* This new VMA should share its siblings reservation map if present.
				3344	* The VMA will only ever have a valid reservation map pointer where
				3345	* it is being copied for another still existing VMA. As that VMA
				3346	* has a reference to the reservation map it cannot disappear until
				3347	* after this open call completes. It is therefore safe to take a
				3348	* new reference here without additional locking.
				3349	*/
				3350	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
				3351	kref_get(&resv->refs);
				3352	}
				3353
				3354	static void hugetlb_vm_op_close(struct vm_area_struct *vma)
				3355	{
				3356	struct hstate *h = hstate_vma(vma);
				3357	struct resv_map *resv = vma_resv_map(vma);
				3358	struct hugepage_subpool *spool = subpool_vma(vma);
				3359	unsigned long reserve, start, end;
				3360	long gbl_reserve;
				3361
				3362	if (!resv \|\| !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
				3363	return;
				3364
				3365	start = vma_hugecache_offset(h, vma, vma->vm_start);
				3366	end = vma_hugecache_offset(h, vma, vma->vm_end);
				3367
				3368	reserve = (end - start) - region_count(resv, start, end);
				3369
				3370	kref_put(&resv->refs, resv_map_release);
				3371
				3372	if (reserve) {
				3373	/*
				3374	* Decrement reserve counts. The global reserve count may be
				3375	* adjusted if the subpool has a minimum size.
				3376	*/
				3377	gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
				3378	hugetlb_acct_memory(h, -gbl_reserve);
				3379	}
				3380	}
				3381
				3382	static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
				3383	{
				3384	if (addr & ~(huge_page_mask(hstate_vma(vma))))
				3385	return -EINVAL;
				3386	return 0;
				3387	}
				3388
				3389	static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
				3390	{
				3391	struct hstate *hstate = hstate_vma(vma);
				3392
				3393	return 1UL << huge_page_shift(hstate);
				3394	}
				3395
				3396	/*
				3397	* We cannot handle pagefaults against hugetlb pages at all. They cause
				3398	* handle_mm_fault() to try to instantiate regular-sized pages in the
				3399	* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
				3400	* this far.
				3401	*/
				3402	static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
				3403	{
				3404	BUG();
				3405	return 0;
				3406	}
				3407
				3408	/*
				3409	* When a new function is introduced to vm_operations_struct and added
				3410	* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
				3411	* This is because under System V memory model, mappings created via
				3412	* shmget/shmat with "huge page" specified are backed by hugetlbfs files,
				3413	* their original vm_ops are overwritten with shm_vm_ops.
				3414	*/
				3415	const struct vm_operations_struct hugetlb_vm_ops = {
				3416	.fault = hugetlb_vm_op_fault,
				3417	.open = hugetlb_vm_op_open,
				3418	.close = hugetlb_vm_op_close,
				3419	.split = hugetlb_vm_op_split,
				3420	.pagesize = hugetlb_vm_op_pagesize,
				3421	};
				3422
				3423	static pte_t make_huge_pte(struct vm_area_struct vma, struct page page,
				3424	int writable)
				3425	{
				3426	pte_t entry;
				3427
				3428	if (writable) {
				3429	entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
				3430	vma->vm_page_prot)));
				3431	} else {
				3432	entry = huge_pte_wrprotect(mk_huge_pte(page,
				3433	vma->vm_page_prot));
				3434	}
				3435	entry = pte_mkyoung(entry);
				3436	entry = pte_mkhuge(entry);
				3437	entry = arch_make_huge_pte(entry, vma, page, writable);
				3438
				3439	return entry;
				3440	}
				3441
				3442	static void set_huge_ptep_writable(struct vm_area_struct *vma,
				3443	unsigned long address, pte_t *ptep)
				3444	{
				3445	pte_t entry;
				3446
				3447	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
				3448	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
				3449	update_mmu_cache(vma, address, ptep);
				3450	}
				3451
				3452	bool is_hugetlb_entry_migration(pte_t pte)
				3453	{
				3454	swp_entry_t swp;
				3455
				3456	if (huge_pte_none(pte) \|\| pte_present(pte))
				3457	return false;
				3458	swp = pte_to_swp_entry(pte);
				3459	if (non_swap_entry(swp) && is_migration_entry(swp))
				3460	return true;
				3461	else
				3462	return false;
				3463	}
				3464
				3465	static int is_hugetlb_entry_hwpoisoned(pte_t pte)
				3466	{
				3467	swp_entry_t swp;
				3468
				3469	if (huge_pte_none(pte) \|\| pte_present(pte))
				3470	return 0;
				3471	swp = pte_to_swp_entry(pte);
				3472	if (non_swap_entry(swp) && is_hwpoison_entry(swp))
				3473	return 1;
				3474	else
				3475	return 0;
				3476	}
				3477
				3478	int copy_hugetlb_page_range(struct mm_struct dst, struct mm_struct src,
				3479	struct vm_area_struct *vma)
				3480	{
				3481	pte_t src_pte, dst_pte, entry, dst_entry;
				3482	struct page *ptepage;
				3483	unsigned long addr;
				3484	int cow;
				3485	struct hstate *h = hstate_vma(vma);
				3486	unsigned long sz = huge_page_size(h);
				3487	struct mmu_notifier_range range;
				3488	int ret = 0;
				3489
				3490	cow = (vma->vm_flags & (VM_SHARED \| VM_MAYWRITE)) == VM_MAYWRITE;
				3491
				3492	if (cow) {
				3493	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
				3494	vma->vm_start,
				3495	vma->vm_end);
				3496	mmu_notifier_invalidate_range_start(&range);
				3497	}
				3498
				3499	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
				3500	spinlock_t src_ptl, dst_ptl;
				3501	src_pte = huge_pte_offset(src, addr, sz);
				3502	if (!src_pte)
				3503	continue;
				3504	dst_pte = huge_pte_alloc(dst, vma, addr, sz);
				3505	if (!dst_pte) {
				3506	ret = -ENOMEM;
				3507	break;
				3508	}
				3509
				3510	/*
				3511	* If the pagetables are shared don't copy or take references.
				3512	* dst_pte == src_pte is the common case of src/dest sharing.
				3513	*
				3514	* However, src could have 'unshared' and dst shares with
				3515	* another vma. If dst_pte !none, this implies sharing.
				3516	* Check here before taking page table lock, and once again
				3517	* after taking the lock below.
				3518	*/
				3519	dst_entry = huge_ptep_get(dst_pte);
				3520	if ((dst_pte == src_pte) \|\| !huge_pte_none(dst_entry))
				3521	continue;
				3522
				3523	dst_ptl = huge_pte_lock(h, dst, dst_pte);
				3524	src_ptl = huge_pte_lockptr(h, src, src_pte);
				3525	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
				3526	entry = huge_ptep_get(src_pte);
				3527	dst_entry = huge_ptep_get(dst_pte);
				3528	if (huge_pte_none(entry) \|\| !huge_pte_none(dst_entry)) {
				3529	/*
				3530	* Skip if src entry none. Also, skip in the
				3531	* unlikely case dst entry !none as this implies
				3532	* sharing with another vma.
				3533	*/
				3534	;
				3535	} else if (unlikely(is_hugetlb_entry_migration(entry) \|\|
				3536	is_hugetlb_entry_hwpoisoned(entry))) {
				3537	swp_entry_t swp_entry = pte_to_swp_entry(entry);
				3538
				3539	if (is_write_migration_entry(swp_entry) && cow) {
				3540	/*
				3541	* COW mappings require pages in both
				3542	* parent and child to be set to read.
				3543	*/
				3544	make_migration_entry_read(&swp_entry);
				3545	entry = swp_entry_to_pte(swp_entry);
				3546	set_huge_swap_pte_at(src, addr, src_pte,
				3547	entry, sz);
				3548	}
				3549	set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
				3550	} else {
				3551	if (cow) {
				3552	/*
				3553	* No need to notify as we are downgrading page
				3554	* table protection not changing it to point
				3555	* to a new page.
				3556	*
				3557	* See Documentation/vm/mmu_notifier.rst
				3558	*/
				3559	huge_ptep_set_wrprotect(src, addr, src_pte);
				3560	}
				3561	entry = huge_ptep_get(src_pte);
				3562	ptepage = pte_page(entry);
				3563	get_page(ptepage);
				3564	page_dup_rmap(ptepage, true);
				3565	set_huge_pte_at(dst, addr, dst_pte, entry);
				3566	hugetlb_count_add(pages_per_huge_page(h), dst);
				3567	}
				3568	spin_unlock(src_ptl);
				3569	spin_unlock(dst_ptl);
				3570	}
				3571
				3572	if (cow)
				3573	mmu_notifier_invalidate_range_end(&range);
				3574
				3575	return ret;
				3576	}
				3577
				3578	void __unmap_hugepage_range(struct mmu_gather tlb, struct vm_area_struct vma,
				3579	unsigned long start, unsigned long end,
				3580	struct page *ref_page)
				3581	{
				3582	struct mm_struct *mm = vma->vm_mm;
				3583	unsigned long address;
				3584	pte_t *ptep;
				3585	pte_t pte;
				3586	spinlock_t *ptl;
				3587	struct page *page;
				3588	struct hstate *h = hstate_vma(vma);
				3589	unsigned long sz = huge_page_size(h);
				3590	struct mmu_notifier_range range;
				3591	bool force_flush = false;
				3592
				3593	WARN_ON(!is_vm_hugetlb_page(vma));
				3594	BUG_ON(start & ~huge_page_mask(h));
				3595	BUG_ON(end & ~huge_page_mask(h));
				3596
				3597	/*
				3598	* This is a hugetlb vma, all the pte entries should point
				3599	* to huge page.
				3600	*/
				3601	tlb_change_page_size(tlb, sz);
				3602	tlb_start_vma(tlb, vma);
				3603
				3604	/*
				3605	* If sharing possible, alert mmu notifiers of worst case.
				3606	*/
				3607	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
				3608	end);
				3609	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
				3610	mmu_notifier_invalidate_range_start(&range);
				3611	address = start;
				3612	for (; address < end; address += sz) {
				3613	ptep = huge_pte_offset(mm, address, sz);
				3614	if (!ptep)
				3615	continue;
				3616
				3617	ptl = huge_pte_lock(h, mm, ptep);
				3618	if (huge_pmd_unshare(mm, &address, ptep)) {
				3619	spin_unlock(ptl);
				3620	tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
				3621	force_flush = true;
				3622	continue;
				3623	}
				3624
				3625	pte = huge_ptep_get(ptep);
				3626	if (huge_pte_none(pte)) {
				3627	spin_unlock(ptl);
				3628	continue;
				3629	}
				3630
				3631	/*
				3632	* Migrating hugepage or HWPoisoned hugepage is already
				3633	* unmapped and its refcount is dropped, so just clear pte here.
				3634	*/
				3635	if (unlikely(!pte_present(pte))) {
				3636	huge_pte_clear(mm, address, ptep, sz);
				3637	spin_unlock(ptl);
				3638	continue;
				3639	}
				3640
				3641	page = pte_page(pte);
				3642	/*
				3643	* If a reference page is supplied, it is because a specific
				3644	* page is being unmapped, not a range. Ensure the page we
				3645	* are about to unmap is the actual page of interest.
				3646	*/
				3647	if (ref_page) {
				3648	if (page != ref_page) {
				3649	spin_unlock(ptl);
				3650	continue;
				3651	}
				3652	/*
				3653	* Mark the VMA as having unmapped its page so that
				3654	* future faults in this VMA will fail rather than
				3655	* looking like data was lost
				3656	*/
				3657	set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
				3658	}
				3659
				3660	pte = huge_ptep_get_and_clear(mm, address, ptep);
				3661	tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
				3662	if (huge_pte_dirty(pte))
				3663	set_page_dirty(page);
				3664
				3665	hugetlb_count_sub(pages_per_huge_page(h), mm);
				3666	page_remove_rmap(page, true);
				3667
				3668	spin_unlock(ptl);
				3669	tlb_remove_page_size(tlb, page, huge_page_size(h));
				3670	/*
				3671	* Bail out after unmapping reference page if supplied
				3672	*/
				3673	if (ref_page)
				3674	break;
				3675	}
				3676	mmu_notifier_invalidate_range_end(&range);
				3677	tlb_end_vma(tlb, vma);
				3678
				3679	/*
				3680	* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
				3681	* could defer the flush until now, since by holding i_mmap_rwsem we
				3682	* guaranteed that the last refernece would not be dropped. But we must
				3683	* do the flushing before we return, as otherwise i_mmap_rwsem will be
				3684	* dropped and the last reference to the shared PMDs page might be
				3685	* dropped as well.
				3686	*
				3687	* In theory we could defer the freeing of the PMD pages as well, but
				3688	* huge_pmd_unshare() relies on the exact page_count for the PMD page to
				3689	* detect sharing, so we cannot defer the release of the page either.
				3690	* Instead, do flush now.
				3691	*/
				3692	if (force_flush)
				3693	tlb_flush_mmu_tlbonly(tlb);
				3694	}
				3695
				3696	void __unmap_hugepage_range_final(struct mmu_gather *tlb,
				3697	struct vm_area_struct *vma, unsigned long start,
				3698	unsigned long end, struct page *ref_page)
				3699	{
				3700	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
				3701
				3702	/*
				3703	* Clear this flag so that x86's huge_pmd_share page_table_shareable
				3704	* test will fail on a vma being torn down, and not grab a page table
				3705	* on its way out. We're lucky that the flag has such an appropriate
				3706	* name, and can in fact be safely cleared here. We could clear it
				3707	* before the __unmap_hugepage_range above, but all that's necessary
				3708	* is to clear it before releasing the i_mmap_rwsem. This works
				3709	* because in the context this is called, the VMA is about to be
				3710	* destroyed and the i_mmap_rwsem is held.
				3711	*/
				3712	vma->vm_flags &= ~VM_MAYSHARE;
				3713	}
				3714
				3715	void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
				3716	unsigned long end, struct page *ref_page)
				3717	{
				3718	struct mm_struct *mm;
				3719	struct mmu_gather tlb;
				3720	unsigned long tlb_start = start;
				3721	unsigned long tlb_end = end;
				3722
				3723	/*
				3724	* If shared PMDs were possibly used within this vma range, adjust
				3725	* start/end for worst case tlb flushing.
				3726	* Note that we can not be sure if PMDs are shared until we try to
				3727	* unmap pages. However, we want to make sure TLB flushing covers
				3728	* the largest possible range.
				3729	*/
				3730	adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
				3731
				3732	mm = vma->vm_mm;
				3733
				3734	tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
				3735	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
				3736	tlb_finish_mmu(&tlb, tlb_start, tlb_end);
				3737	}
				3738
				3739	/*
				3740	* This is called when the original mapper is failing to COW a MAP_PRIVATE
				3741	* mappping it owns the reserve page for. The intention is to unmap the page
				3742	* from other VMAs and let the children be SIGKILLed if they are faulting the
				3743	* same region.
				3744	*/
				3745	static void unmap_ref_private(struct mm_struct mm, struct vm_area_struct vma,
				3746	struct page *page, unsigned long address)
				3747	{
				3748	struct hstate *h = hstate_vma(vma);
				3749	struct vm_area_struct *iter_vma;
				3750	struct address_space *mapping;
				3751	pgoff_t pgoff;
				3752
				3753	/*
				3754	* vm_pgoff is in PAGE_SIZE units, hence the different calculation
				3755	* from page cache lookup which is in HPAGE_SIZE units.
				3756	*/
				3757	address = address & huge_page_mask(h);
				3758	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
				3759	vma->vm_pgoff;
				3760	mapping = vma->vm_file->f_mapping;
				3761
				3762	/*
				3763	* Take the mapping lock for the duration of the table walk. As
				3764	* this mapping should be shared between all the VMAs,
				3765	* __unmap_hugepage_range() is called as the lock is already held
				3766	*/
				3767	i_mmap_lock_write(mapping);
				3768	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
				3769	/* Do not unmap the current VMA */
				3770	if (iter_vma == vma)
				3771	continue;
				3772
				3773	/*
				3774	* Shared VMAs have their own reserves and do not affect
				3775	* MAP_PRIVATE accounting but it is possible that a shared
				3776	* VMA is using the same page so check and skip such VMAs.
				3777	*/
				3778	if (iter_vma->vm_flags & VM_MAYSHARE)
				3779	continue;
				3780
				3781	/*
				3782	* Unmap the page from other VMAs without their own reserves.
				3783	* They get marked to be SIGKILLed if they fault in these
				3784	* areas. This is because a future no-page fault on this VMA
				3785	* could insert a zeroed page instead of the data existing
				3786	* from the time of fork. This would look like data corruption
				3787	*/
				3788	if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
				3789	unmap_hugepage_range(iter_vma, address,
				3790	address + huge_page_size(h), page);
				3791	}
				3792	i_mmap_unlock_write(mapping);
				3793	}
				3794
				3795	/*
				3796	* Hugetlb_cow() should be called with page lock of the original hugepage held.
				3797	* Called with hugetlb_instantiation_mutex held and pte_page locked so we
				3798	* cannot race with other handlers or page migration.
				3799	* Keep the pte_same checks anyway to make transition from the mutex easier.
				3800	*/
				3801	static vm_fault_t hugetlb_cow(struct mm_struct mm, struct vm_area_struct vma,
				3802	unsigned long address, pte_t *ptep,
				3803	struct page pagecache_page, spinlock_t ptl)
				3804	{
				3805	pte_t pte;
				3806	struct hstate *h = hstate_vma(vma);
				3807	struct page old_page, new_page;
				3808	int outside_reserve = 0;
				3809	vm_fault_t ret = 0;
				3810	unsigned long haddr = address & huge_page_mask(h);
				3811	struct mmu_notifier_range range;
				3812
				3813	pte = huge_ptep_get(ptep);
				3814	old_page = pte_page(pte);
				3815
				3816	retry_avoidcopy:
				3817	/* If no-one else is actually using this page, avoid the copy
				3818	* and just make the page writable */
				3819	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
				3820	page_move_anon_rmap(old_page, vma);
				3821	set_huge_ptep_writable(vma, haddr, ptep);
				3822	return 0;
				3823	}
				3824
				3825	/*
				3826	* If the process that created a MAP_PRIVATE mapping is about to
				3827	* perform a COW due to a shared page count, attempt to satisfy
				3828	* the allocation without using the existing reserves. The pagecache
				3829	* page is used to determine if the reserve at this address was
				3830	* consumed or not. If reserves were used, a partial faulted mapping
				3831	* at the time of fork() could consume its reserves on COW instead
				3832	* of the full address range.
				3833	*/
				3834	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
				3835	old_page != pagecache_page)
				3836	outside_reserve = 1;
				3837
				3838	get_page(old_page);
				3839
				3840	/*
				3841	* Drop page table lock as buddy allocator may be called. It will
				3842	* be acquired again before returning to the caller, as expected.
				3843	*/
				3844	spin_unlock(ptl);
				3845	new_page = alloc_huge_page(vma, haddr, outside_reserve);
				3846
				3847	if (IS_ERR(new_page)) {
				3848	/*
				3849	* If a process owning a MAP_PRIVATE mapping fails to COW,
				3850	* it is due to references held by a child and an insufficient
				3851	* huge page pool. To guarantee the original mappers
				3852	* reliability, unmap the page from child processes. The child
				3853	* may get SIGKILLed if it later faults.
				3854	*/
				3855	if (outside_reserve) {
				3856	put_page(old_page);
				3857	BUG_ON(huge_pte_none(pte));
				3858	unmap_ref_private(mm, vma, old_page, haddr);
				3859	BUG_ON(huge_pte_none(pte));
				3860	spin_lock(ptl);
				3861	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
				3862	if (likely(ptep &&
				3863	pte_same(huge_ptep_get(ptep), pte)))
				3864	goto retry_avoidcopy;
				3865	/*
				3866	* race occurs while re-acquiring page table
				3867	* lock, and our job is done.
				3868	*/
				3869	return 0;
				3870	}
				3871
				3872	ret = vmf_error(PTR_ERR(new_page));
				3873	goto out_release_old;
				3874	}
				3875
				3876	/*
				3877	* When the original hugepage is shared one, it does not have
				3878	* anon_vma prepared.
				3879	*/
				3880	if (unlikely(anon_vma_prepare(vma))) {
				3881	ret = VM_FAULT_OOM;
				3882	goto out_release_all;
				3883	}
				3884
				3885	copy_user_huge_page(new_page, old_page, address, vma,
				3886	pages_per_huge_page(h));
				3887	__SetPageUptodate(new_page);
				3888
				3889	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
				3890	haddr + huge_page_size(h));
				3891	mmu_notifier_invalidate_range_start(&range);
				3892
				3893	/*
				3894	* Retake the page table lock to check for racing updates
				3895	* before the page tables are altered
				3896	*/
				3897	spin_lock(ptl);
				3898	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
				3899	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
				3900	ClearPagePrivate(new_page);
				3901
				3902	/* Break COW */
				3903	huge_ptep_clear_flush(vma, haddr, ptep);
				3904	mmu_notifier_invalidate_range(mm, range.start, range.end);
				3905	set_huge_pte_at(mm, haddr, ptep,
				3906	make_huge_pte(vma, new_page, 1));
				3907	page_remove_rmap(old_page, true);
				3908	hugepage_add_new_anon_rmap(new_page, vma, haddr);
				3909	set_page_huge_active(new_page);
				3910	/* Make the old page be freed below */
				3911	new_page = old_page;
				3912	}
				3913	spin_unlock(ptl);
				3914	mmu_notifier_invalidate_range_end(&range);
				3915	out_release_all:
				3916	restore_reserve_on_error(h, vma, haddr, new_page);
				3917	put_page(new_page);
				3918	out_release_old:
				3919	put_page(old_page);
				3920
				3921	spin_lock(ptl); /* Caller expects lock to be held */
				3922	return ret;
				3923	}
				3924
				3925	/* Return the pagecache page at a given address within a VMA */
				3926	static struct page hugetlbfs_pagecache_page(struct hstate h,
				3927	struct vm_area_struct *vma, unsigned long address)
				3928	{
				3929	struct address_space *mapping;
				3930	pgoff_t idx;
				3931
				3932	mapping = vma->vm_file->f_mapping;
				3933	idx = vma_hugecache_offset(h, vma, address);
				3934
				3935	return find_lock_page(mapping, idx);
				3936	}
				3937
				3938	/*
				3939	* Return whether there is a pagecache page to back given address within VMA.
				3940	* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
				3941	*/
				3942	static bool hugetlbfs_pagecache_present(struct hstate *h,
				3943	struct vm_area_struct *vma, unsigned long address)
				3944	{
				3945	struct address_space *mapping;
				3946	pgoff_t idx;
				3947	struct page *page;
				3948
				3949	mapping = vma->vm_file->f_mapping;
				3950	idx = vma_hugecache_offset(h, vma, address);
				3951
				3952	page = find_get_page(mapping, idx);
				3953	if (page)
				3954	put_page(page);
				3955	return page != NULL;
				3956	}
				3957
				3958	int huge_add_to_page_cache(struct page page, struct address_space mapping,
				3959	pgoff_t idx)
				3960	{
				3961	struct inode *inode = mapping->host;
				3962	struct hstate *h = hstate_inode(inode);
				3963	int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
				3964
				3965	if (err)
				3966	return err;
				3967	ClearPagePrivate(page);
				3968
				3969	/*
				3970	* set page dirty so that it will not be removed from cache/file
				3971	* by non-hugetlbfs specific code paths.
				3972	*/
				3973	set_page_dirty(page);
				3974
				3975	spin_lock(&inode->i_lock);
				3976	inode->i_blocks += blocks_per_huge_page(h);
				3977	spin_unlock(&inode->i_lock);
				3978	return 0;
				3979	}
				3980
				3981	static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
				3982	struct address_space *mapping,
				3983	struct hstate *h,
				3984	pgoff_t idx,
				3985	unsigned int flags,
				3986	unsigned long haddr,
				3987	unsigned long reason)
				3988	{
				3989	vm_fault_t ret;
				3990	u32 hash;
				3991	struct vm_fault vmf = {
				3992	.vma = vma,
				3993	.address = haddr,
				3994	.flags = flags,
				3995
				3996	/*
				3997	* Hard to debug if it ends up being
				3998	* used by a callee that assumes
				3999	* something about the other
				4000	* uninitialized fields... same as in
				4001	* memory.c
				4002	*/
				4003	};
				4004
				4005	/*
				4006	* hugetlb_fault_mutex and i_mmap_rwsem must be
				4007	* dropped before handling userfault. Reacquire
				4008	* after handling fault to make calling code simpler.
				4009	*/
				4010	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
				4011	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				4012	ret = handle_userfault(&vmf, reason);
				4013	mutex_lock(&hugetlb_fault_mutex_table[hash]);
				4014
				4015	return ret;
				4016	}
				4017
				4018	static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
				4019	struct vm_area_struct *vma,
				4020	struct address_space *mapping, pgoff_t idx,
				4021	unsigned long address, pte_t *ptep, unsigned int flags)
				4022	{
				4023	struct hstate *h = hstate_vma(vma);
				4024	vm_fault_t ret = VM_FAULT_SIGBUS;
				4025	int anon_rmap = 0;
				4026	unsigned long size;
				4027	struct page *page;
				4028	pte_t new_pte;
				4029	spinlock_t *ptl;
				4030	unsigned long haddr = address & huge_page_mask(h);
				4031	bool new_page = false;
				4032
				4033	/*
				4034	* Currently, we are forced to kill the process in the event the
				4035	* original mapper has unmapped pages from the child due to a failed
				4036	* COW. Warn that such a situation has occurred as it may not be obvious
				4037	*/
				4038	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
				4039	pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
				4040	current->pid);
				4041	return ret;
				4042	}
				4043
				4044	/*
				4045	* Use page lock to guard against racing truncation
				4046	* before we get page_table_lock.
				4047	*/
				4048	retry:
				4049	page = find_lock_page(mapping, idx);
				4050	if (!page) {
				4051	size = i_size_read(mapping->host) >> huge_page_shift(h);
				4052	if (idx >= size)
				4053	goto out;
				4054
				4055	/* Check for page in userfault range */
				4056	if (userfaultfd_missing(vma)) {
				4057	ret = hugetlb_handle_userfault(vma, mapping, h,
				4058	idx, flags, haddr,
				4059	VM_UFFD_MISSING);
				4060	goto out;
				4061	}
				4062
				4063	page = alloc_huge_page(vma, haddr, 0);
				4064	if (IS_ERR(page)) {
				4065	/*
				4066	* Returning error will result in faulting task being
				4067	* sent SIGBUS. The hugetlb fault mutex prevents two
				4068	* tasks from racing to fault in the same page which
				4069	* could result in false unable to allocate errors.
				4070	* Page migration does not take the fault mutex, but
				4071	* does a clear then write of pte's under page table
				4072	* lock. Page fault code could race with migration,
				4073	* notice the clear pte and try to allocate a page
				4074	* here. Before returning error, get ptl and make
				4075	* sure there really is no pte entry.
				4076	*/
				4077	ptl = huge_pte_lock(h, mm, ptep);
				4078	if (!huge_pte_none(huge_ptep_get(ptep))) {
				4079	ret = 0;
				4080	spin_unlock(ptl);
				4081	goto out;
				4082	}
				4083	spin_unlock(ptl);
				4084	ret = vmf_error(PTR_ERR(page));
				4085	goto out;
				4086	}
				4087	clear_huge_page(page, address, pages_per_huge_page(h));
				4088	__SetPageUptodate(page);
				4089	new_page = true;
				4090
				4091	if (vma->vm_flags & VM_MAYSHARE) {
				4092	int err = huge_add_to_page_cache(page, mapping, idx);
				4093	if (err) {
				4094	put_page(page);
				4095	if (err == -EEXIST)
				4096	goto retry;
				4097	goto out;
				4098	}
				4099	} else {
				4100	lock_page(page);
				4101	if (unlikely(anon_vma_prepare(vma))) {
				4102	ret = VM_FAULT_OOM;
				4103	goto backout_unlocked;
				4104	}
				4105	anon_rmap = 1;
				4106	}
				4107	} else {
				4108	/*
				4109	* If memory error occurs between mmap() and fault, some process
				4110	* don't have hwpoisoned swap entry for errored virtual address.
				4111	* So we need to block hugepage fault by PG_hwpoison bit check.
				4112	*/
				4113	if (unlikely(PageHWPoison(page))) {
				4114	ret = VM_FAULT_HWPOISON_LARGE \|
				4115	VM_FAULT_SET_HINDEX(hstate_index(h));
				4116	goto backout_unlocked;
				4117	}
				4118
				4119	/* Check for page in userfault range. */
				4120	if (userfaultfd_minor(vma)) {
				4121	unlock_page(page);
				4122	put_page(page);
				4123	ret = hugetlb_handle_userfault(vma, mapping, h,
				4124	idx, flags, haddr,
				4125	VM_UFFD_MINOR);
				4126	goto out;
				4127	}
				4128	}
				4129
				4130	/*
				4131	* If we are going to COW a private mapping later, we examine the
				4132	* pending reservations for this page now. This will ensure that
				4133	* any allocations necessary to record that reservation occur outside
				4134	* the spinlock.
				4135	*/
				4136	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
				4137	if (vma_needs_reservation(h, vma, haddr) < 0) {
				4138	ret = VM_FAULT_OOM;
				4139	goto backout_unlocked;
				4140	}
				4141	/* Just decrements count, does not deallocate */
				4142	vma_end_reservation(h, vma, haddr);
				4143	}
				4144
				4145	ptl = huge_pte_lock(h, mm, ptep);
				4146	size = i_size_read(mapping->host) >> huge_page_shift(h);
				4147	if (idx >= size)
				4148	goto backout;
				4149
				4150	ret = 0;
				4151	if (!huge_pte_none(huge_ptep_get(ptep)))
				4152	goto backout;
				4153
				4154	if (anon_rmap) {
				4155	ClearPagePrivate(page);
				4156	hugepage_add_new_anon_rmap(page, vma, haddr);
				4157	} else
				4158	page_dup_rmap(page, true);
				4159	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				4160	&& (vma->vm_flags & VM_SHARED)));
				4161	set_huge_pte_at(mm, haddr, ptep, new_pte);
				4162
				4163	hugetlb_count_add(pages_per_huge_page(h), mm);
				4164	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
				4165	/* Optimization, do the COW without a second fault */
				4166	ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
				4167	}
				4168
				4169	spin_unlock(ptl);
				4170
				4171	/*
				4172	* Only make newly allocated pages active. Existing pages found
				4173	* in the pagecache could be !page_huge_active() if they have been
				4174	* isolated for migration.
				4175	*/
				4176	if (new_page)
				4177	set_page_huge_active(page);
				4178
				4179	unlock_page(page);
				4180	out:
				4181	return ret;
				4182
				4183	backout:
				4184	spin_unlock(ptl);
				4185	backout_unlocked:
				4186	unlock_page(page);
				4187	restore_reserve_on_error(h, vma, haddr, page);
				4188	put_page(page);
				4189	goto out;
				4190	}
				4191
				4192	#ifdef CONFIG_SMP
				4193	u32 hugetlb_fault_mutex_hash(struct hstate h, struct address_space mapping,
				4194	pgoff_t idx)
				4195	{
				4196	unsigned long key[2];
				4197	u32 hash;
				4198
				4199	key[0] = (unsigned long) mapping;
				4200	key[1] = idx;
				4201
				4202	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
				4203
				4204	return hash & (num_fault_mutexes - 1);
				4205	}
				4206	#else
				4207	/*
				4208	* For uniprocesor systems we always use a single mutex, so just
				4209	* return 0 and avoid the hashing overhead.
				4210	*/
				4211	u32 hugetlb_fault_mutex_hash(struct hstate h, struct address_space mapping,
				4212	pgoff_t idx)
				4213	{
				4214	return 0;
				4215	}
				4216	#endif
				4217
				4218	vm_fault_t hugetlb_fault(struct mm_struct mm, struct vm_area_struct vma,
				4219	unsigned long address, unsigned int flags)
				4220	{
				4221	pte_t *ptep, entry;
				4222	spinlock_t *ptl;
				4223	vm_fault_t ret;
				4224	u32 hash;
				4225	pgoff_t idx;
				4226	struct page *page = NULL;
				4227	struct page *pagecache_page = NULL;
				4228	struct hstate *h = hstate_vma(vma);
				4229	struct address_space *mapping;
				4230	int need_wait_lock = 0;
				4231	unsigned long haddr = address & huge_page_mask(h);
				4232
				4233	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
				4234	if (ptep) {
				4235	entry = huge_ptep_get(ptep);
				4236	if (unlikely(is_hugetlb_entry_migration(entry))) {
				4237	migration_entry_wait_huge(vma, mm, ptep);
				4238	return 0;
				4239	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
				4240	return VM_FAULT_HWPOISON_LARGE \|
				4241	VM_FAULT_SET_HINDEX(hstate_index(h));
				4242	} else {
				4243	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
				4244	if (!ptep)
				4245	return VM_FAULT_OOM;
				4246	}
				4247
				4248	mapping = vma->vm_file->f_mapping;
				4249	idx = vma_hugecache_offset(h, vma, haddr);
				4250
				4251	/*
				4252	* Serialize hugepage allocation and instantiation, so that we don't
				4253	* get spurious allocation failures if two CPUs race to instantiate
				4254	* the same page in the page cache.
				4255	*/
				4256	hash = hugetlb_fault_mutex_hash(h, mapping, idx);
				4257	mutex_lock(&hugetlb_fault_mutex_table[hash]);
				4258
				4259	entry = huge_ptep_get(ptep);
				4260	if (huge_pte_none(entry)) {
				4261	ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
				4262	goto out_mutex;
				4263	}
				4264
				4265	ret = 0;
				4266
				4267	/*
				4268	* entry could be a migration/hwpoison entry at this point, so this
				4269	* check prevents the kernel from going below assuming that we have
				4270	* a active hugepage in pagecache. This goto expects the 2nd page fault,
				4271	* and is_hugetlb_entry_(migration\|hwpoisoned) check will properly
				4272	* handle it.
				4273	*/
				4274	if (!pte_present(entry))
				4275	goto out_mutex;
				4276
				4277	/*
				4278	* If we are going to COW the mapping later, we examine the pending
				4279	* reservations for this page now. This will ensure that any
				4280	* allocations necessary to record that reservation occur outside the
				4281	* spinlock. For private mappings, we also lookup the pagecache
				4282	* page now as it is used to determine if a reservation has been
				4283	* consumed.
				4284	*/
				4285	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
				4286	if (vma_needs_reservation(h, vma, haddr) < 0) {
				4287	ret = VM_FAULT_OOM;
				4288	goto out_mutex;
				4289	}
				4290	/* Just decrements count, does not deallocate */
				4291	vma_end_reservation(h, vma, haddr);
				4292
				4293	if (!(vma->vm_flags & VM_MAYSHARE))
				4294	pagecache_page = hugetlbfs_pagecache_page(h,
				4295	vma, haddr);
				4296	}
				4297
				4298	ptl = huge_pte_lock(h, mm, ptep);
				4299
				4300	/* Check for a racing update before calling hugetlb_cow */
				4301	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
				4302	goto out_ptl;
				4303
				4304	/*
				4305	* hugetlb_cow() requires page locks of pte_page(entry) and
				4306	* pagecache_page, so here we need take the former one
				4307	* when page != pagecache_page or !pagecache_page.
				4308	*/
				4309	page = pte_page(entry);
				4310	if (page != pagecache_page)
				4311	if (!trylock_page(page)) {
				4312	need_wait_lock = 1;
				4313	goto out_ptl;
				4314	}
				4315
				4316	get_page(page);
				4317
				4318	if (flags & FAULT_FLAG_WRITE) {
				4319	if (!huge_pte_write(entry)) {
				4320	ret = hugetlb_cow(mm, vma, address, ptep,
				4321	pagecache_page, ptl);
				4322	goto out_put_page;
				4323	}
				4324	entry = huge_pte_mkdirty(entry);
				4325	}
				4326	entry = pte_mkyoung(entry);
				4327	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
				4328	flags & FAULT_FLAG_WRITE))
				4329	update_mmu_cache(vma, haddr, ptep);
				4330	out_put_page:
				4331	if (page != pagecache_page)
				4332	unlock_page(page);
				4333	put_page(page);
				4334	out_ptl:
				4335	spin_unlock(ptl);
				4336
				4337	if (pagecache_page) {
				4338	unlock_page(pagecache_page);
				4339	put_page(pagecache_page);
				4340	}
				4341	out_mutex:
				4342	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
				4343	/*
				4344	* Generally it's safe to hold refcount during waiting page lock. But
				4345	* here we just wait to defer the next page fault to avoid busy loop and
				4346	* the page is not used after unlocked before returning from the current
				4347	* page fault. So we are safe from accessing freed page, even if we wait
				4348	* here without taking refcount.
				4349	*/
				4350	if (need_wait_lock)
				4351	wait_on_page_locked(page);
				4352	return ret;
				4353	}
				4354
				4355	#ifdef CONFIG_USERFAULTFD
				4356	/*
				4357	* Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
				4358	* modifications for huge pages.
				4359	*/
				4360	int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
				4361	pte_t *dst_pte,
				4362	struct vm_area_struct *dst_vma,
				4363	unsigned long dst_addr,
				4364	unsigned long src_addr,
				4365	enum mcopy_atomic_mode mode,
				4366	struct page **pagep)
				4367	{
				4368	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
				4369	struct address_space *mapping;
				4370	pgoff_t idx;
				4371	unsigned long size;
				4372	int vm_shared = dst_vma->vm_flags & VM_SHARED;
				4373	struct hstate *h = hstate_vma(dst_vma);
				4374	pte_t _dst_pte;
				4375	spinlock_t *ptl;
				4376	int ret;
				4377	struct page *page;
				4378	int writable;
				4379
				4380	mapping = dst_vma->vm_file->f_mapping;
				4381	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
				4382
				4383	if (is_continue) {
				4384	ret = -EFAULT;
				4385	page = find_lock_page(mapping, idx);
				4386	if (!page)
				4387	goto out;
				4388	} else if (!*pagep) {
				4389	/* If a page already exists, then it's UFFDIO_COPY for
				4390	* a non-missing case. Return -EEXIST.
				4391	*/
				4392	if (vm_shared &&
				4393	hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
				4394	ret = -EEXIST;
				4395	goto out;
				4396	}
				4397
				4398	page = alloc_huge_page(dst_vma, dst_addr, 0);
				4399	if (IS_ERR(page)) {
				4400	ret = -ENOMEM;
				4401	goto out;
				4402	}
				4403
				4404	ret = copy_huge_page_from_user(page,
				4405	(const void __user *) src_addr,
				4406	pages_per_huge_page(h), false);
				4407
				4408	/* fallback to copy_from_user outside mmap_sem */
				4409	if (unlikely(ret)) {
				4410	ret = -ENOENT;
				4411	*pagep = page;
				4412	/* don't free the page */
				4413	goto out;
				4414	}
				4415	} else {
				4416	page = *pagep;
				4417	*pagep = NULL;
				4418	}
				4419
				4420	/*
				4421	* The memory barrier inside __SetPageUptodate makes sure that
				4422	* preceding stores to the page contents become visible before
				4423	* the set_pte_at() write.
				4424	*/
				4425	__SetPageUptodate(page);
				4426
				4427	/* Add shared, newly allocated pages to the page cache. */
				4428	if (vm_shared && !is_continue) {
				4429	size = i_size_read(mapping->host) >> huge_page_shift(h);
				4430	ret = -EFAULT;
				4431	if (idx >= size)
				4432	goto out_release_nounlock;
				4433
				4434	/*
				4435	* Serialization between remove_inode_hugepages() and
				4436	* huge_add_to_page_cache() below happens through the
				4437	* hugetlb_fault_mutex_table that here must be hold by
				4438	* the caller.
				4439	*/
				4440	ret = huge_add_to_page_cache(page, mapping, idx);
				4441	if (ret)
				4442	goto out_release_nounlock;
				4443	}
				4444
				4445	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
				4446	spin_lock(ptl);
				4447
				4448	/*
				4449	* Recheck the i_size after holding PT lock to make sure not
				4450	* to leave any page mapped (as page_mapped()) beyond the end
				4451	* of the i_size (remove_inode_hugepages() is strict about
				4452	* enforcing that). If we bail out here, we'll also leave a
				4453	* page in the radix tree in the vm_shared case beyond the end
				4454	* of the i_size, but remove_inode_hugepages() will take care
				4455	* of it as soon as we drop the hugetlb_fault_mutex_table.
				4456	*/
				4457	size = i_size_read(mapping->host) >> huge_page_shift(h);
				4458	ret = -EFAULT;
				4459	if (idx >= size)
				4460	goto out_release_unlock;
				4461
				4462	ret = -EEXIST;
				4463	if (!huge_pte_none(huge_ptep_get(dst_pte)))
				4464	goto out_release_unlock;
				4465
				4466	if (vm_shared) {
				4467	page_dup_rmap(page, true);
				4468	} else {
				4469	ClearPagePrivate(page);
				4470	hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
				4471	}
				4472
				4473	/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
				4474	if (is_continue && !vm_shared)
				4475	writable = 0;
				4476	else
				4477	writable = dst_vma->vm_flags & VM_WRITE;
				4478
				4479	_dst_pte = make_huge_pte(dst_vma, page, writable);
				4480	if (writable)
				4481	_dst_pte = huge_pte_mkdirty(_dst_pte);
				4482	_dst_pte = pte_mkyoung(_dst_pte);
				4483
				4484	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
				4485
				4486	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
				4487	dst_vma->vm_flags & VM_WRITE);
				4488	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
				4489
				4490	/* No need to invalidate - it was non-present before */
				4491	update_mmu_cache(dst_vma, dst_addr, dst_pte);
				4492
				4493	spin_unlock(ptl);
				4494	if (!is_continue)
				4495	set_page_huge_active(page);
				4496	if (vm_shared \|\| is_continue)
				4497	unlock_page(page);
				4498	ret = 0;
				4499	out:
				4500	return ret;
				4501	out_release_unlock:
				4502	spin_unlock(ptl);
				4503	if (vm_shared \|\| is_continue)
				4504	unlock_page(page);
				4505	out_release_nounlock:
				4506	put_page(page);
				4507	goto out;
				4508	}
				4509	#endif /* CONFIG_USERFAULTFD */
				4510
				4511	long follow_hugetlb_page(struct mm_struct mm, struct vm_area_struct vma,
				4512	struct page pages, struct vm_area_struct vmas,
				4513	unsigned long position, unsigned long nr_pages,
				4514	long i, unsigned int flags, int *locked)
				4515	{
				4516	unsigned long pfn_offset;
				4517	unsigned long vaddr = *position;
				4518	unsigned long remainder = *nr_pages;
				4519	struct hstate *h = hstate_vma(vma);
				4520	int err = -EFAULT;
				4521
				4522	while (vaddr < vma->vm_end && remainder) {
				4523	pte_t *pte;
				4524	spinlock_t *ptl = NULL;
				4525	int absent;
				4526	struct page *page;
				4527
				4528	/*
				4529	* If we have a pending SIGKILL, don't keep faulting pages and
				4530	* potentially allocating memory.
				4531	*/
				4532	if (fatal_signal_pending(current)) {
				4533	remainder = 0;
				4534	break;
				4535	}
				4536
				4537	/*
				4538	* Some archs (sparc64, sh*) have multiple pte_ts to
				4539	* each hugepage. We have to make sure we get the
				4540	* first, for the page indexing below to work.
				4541	*
				4542	* Note that page table lock is not held when pte is null.
				4543	*/
				4544	pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
				4545	huge_page_size(h));
				4546	if (pte)
				4547	ptl = huge_pte_lock(h, mm, pte);
				4548	absent = !pte \|\| huge_pte_none(huge_ptep_get(pte));
				4549
				4550	/*
				4551	* When coredumping, it suits get_dump_page if we just return
				4552	* an error where there's an empty slot with no huge pagecache
				4553	* to back it. This way, we avoid allocating a hugepage, and
				4554	* the sparse dumpfile avoids allocating disk blocks, but its
				4555	* huge holes still show up with zeroes where they need to be.
				4556	*/
				4557	if (absent && (flags & FOLL_DUMP) &&
				4558	!hugetlbfs_pagecache_present(h, vma, vaddr)) {
				4559	if (pte)
				4560	spin_unlock(ptl);
				4561	remainder = 0;
				4562	break;
				4563	}
				4564
				4565	/*
				4566	* We need to call hugetlb_fault for both hugepages under migration
				4567	* (in which case hugetlb_fault waits for the migration,) and
				4568	* hwpoisoned hugepages (in which case we need to prevent the
				4569	* caller from accessing to them.) In order to do this, we use
				4570	* here is_swap_pte instead of is_hugetlb_entry_migration and
				4571	* is_hugetlb_entry_hwpoisoned. This is because it simply covers
				4572	* both cases, and because we can't follow correct pages
				4573	* directly from any kind of swap entries.
				4574	*/
				4575	if (absent \|\| is_swap_pte(huge_ptep_get(pte)) \|\|
				4576	((flags & FOLL_WRITE) &&
				4577	!huge_pte_write(huge_ptep_get(pte)))) {
				4578	vm_fault_t ret;
				4579	unsigned int fault_flags = 0;
				4580
				4581	if (pte)
				4582	spin_unlock(ptl);
				4583	if (flags & FOLL_WRITE)
				4584	fault_flags \|= FAULT_FLAG_WRITE;
				4585	if (locked)
				4586	fault_flags \|= FAULT_FLAG_ALLOW_RETRY \|
				4587	FAULT_FLAG_KILLABLE;
				4588	if (flags & FOLL_NOWAIT)
				4589	fault_flags \|= FAULT_FLAG_ALLOW_RETRY \|
				4590	FAULT_FLAG_RETRY_NOWAIT;
				4591	if (flags & FOLL_TRIED) {
				4592	/*
				4593	* Note: FAULT_FLAG_ALLOW_RETRY and
				4594	* FAULT_FLAG_TRIED can co-exist
				4595	*/
				4596	fault_flags \|= FAULT_FLAG_TRIED;
				4597	}
				4598	ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
				4599	if (ret & VM_FAULT_ERROR) {
				4600	err = vm_fault_to_errno(ret, flags);
				4601	remainder = 0;
				4602	break;
				4603	}
				4604	if (ret & VM_FAULT_RETRY) {
				4605	if (locked &&
				4606	!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
				4607	*locked = 0;
				4608	*nr_pages = 0;
				4609	/*
				4610	* VM_FAULT_RETRY must not return an
				4611	* error, it will return zero
				4612	* instead.
				4613	*
				4614	* No need to update "position" as the
				4615	* caller will not check it after
				4616	* *nr_pages is set to 0.
				4617	*/
				4618	return i;
				4619	}
				4620	continue;
				4621	}
				4622
				4623	pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
				4624	page = pte_page(huge_ptep_get(pte));
				4625
				4626	/*
				4627	* Instead of doing 'try_get_page()' below in the same_page
				4628	* loop, just check the count once here.
				4629	*/
				4630	if (unlikely(page_count(page) <= 0)) {
				4631	if (pages) {
				4632	spin_unlock(ptl);
				4633	remainder = 0;
				4634	err = -ENOMEM;
				4635	break;
				4636	}
				4637	}
				4638	same_page:
				4639	if (pages) {
				4640	pages[i] = mem_map_offset(page, pfn_offset);
				4641	get_page(pages[i]);
				4642	}
				4643
				4644	if (vmas)
				4645	vmas[i] = vma;
				4646
				4647	vaddr += PAGE_SIZE;
				4648	++pfn_offset;
				4649	--remainder;
				4650	++i;
				4651	if (vaddr < vma->vm_end && remainder &&
				4652	pfn_offset < pages_per_huge_page(h)) {
				4653	/*
				4654	* We use pfn_offset to avoid touching the pageframes
				4655	* of this compound page.
				4656	*/
				4657	goto same_page;
				4658	}
				4659	spin_unlock(ptl);
				4660	}
				4661	*nr_pages = remainder;
				4662	/*
				4663	* setting position is actually required only if remainder is
				4664	* not zero but it's faster not to add a "if (remainder)"
				4665	* branch.
				4666	*/
				4667	*position = vaddr;
				4668
				4669	return i ? i : err;
				4670	}
				4671
				4672	unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
				4673	unsigned long address, unsigned long end, pgprot_t newprot)
				4674	{
				4675	struct mm_struct *mm = vma->vm_mm;
				4676	unsigned long start = address;
				4677	pte_t *ptep;
				4678	pte_t pte;
				4679	struct hstate *h = hstate_vma(vma);
				4680	unsigned long pages = 0;
				4681	bool shared_pmd = false;
				4682	struct mmu_notifier_range range;
				4683
				4684	/*
				4685	* In the case of shared PMDs, the area to flush could be beyond
				4686	* start/end. Set range.start/range.end to cover the maximum possible
				4687	* range if PMD sharing is possible.
				4688	*/
				4689	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
				4690	0, vma, mm, start, end);
				4691	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
				4692
				4693	BUG_ON(address >= end);
				4694	flush_cache_range(vma, range.start, range.end);
				4695
				4696	mmu_notifier_invalidate_range_start(&range);
				4697	i_mmap_lock_write(vma->vm_file->f_mapping);
				4698	for (; address < end; address += huge_page_size(h)) {
				4699	spinlock_t *ptl;
				4700	ptep = huge_pte_offset(mm, address, huge_page_size(h));
				4701	if (!ptep)
				4702	continue;
				4703	ptl = huge_pte_lock(h, mm, ptep);
				4704	if (huge_pmd_unshare(mm, &address, ptep)) {
				4705	pages++;
				4706	spin_unlock(ptl);
				4707	shared_pmd = true;
				4708	continue;
				4709	}
				4710	pte = huge_ptep_get(ptep);
				4711	if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
				4712	spin_unlock(ptl);
				4713	continue;
				4714	}
				4715	if (unlikely(is_hugetlb_entry_migration(pte))) {
				4716	swp_entry_t entry = pte_to_swp_entry(pte);
				4717
				4718	if (is_write_migration_entry(entry)) {
				4719	pte_t newpte;
				4720
				4721	make_migration_entry_read(&entry);
				4722	newpte = swp_entry_to_pte(entry);
				4723	set_huge_swap_pte_at(mm, address, ptep,
				4724	newpte, huge_page_size(h));
				4725	pages++;
				4726	}
				4727	spin_unlock(ptl);
				4728	continue;
				4729	}
				4730	if (!huge_pte_none(pte)) {
				4731	pte_t old_pte;
				4732
				4733	old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
				4734	pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
				4735	pte = arch_make_huge_pte(pte, vma, NULL, 0);
				4736	huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
				4737	pages++;
				4738	}
				4739	spin_unlock(ptl);
				4740	}
				4741	/*
				4742	* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
				4743	* may have cleared our pud entry and done put_page on the page table:
				4744	* once we release i_mmap_rwsem, another task can do the final put_page
				4745	* and that page table be reused and filled with junk. If we actually
				4746	* did unshare a page of pmds, flush the range corresponding to the pud.
				4747	*/
				4748	if (shared_pmd)
				4749	flush_hugetlb_tlb_range(vma, range.start, range.end);
				4750	else
				4751	flush_hugetlb_tlb_range(vma, start, end);
				4752	/*
				4753	* No need to call mmu_notifier_invalidate_range() we are downgrading
				4754	* page table protection not changing it to point to a new page.
				4755	*
				4756	* See Documentation/vm/mmu_notifier.rst
				4757	*/
				4758	i_mmap_unlock_write(vma->vm_file->f_mapping);
				4759	mmu_notifier_invalidate_range_end(&range);
				4760
				4761	return pages << h->order;
				4762	}
				4763
				4764	int hugetlb_reserve_pages(struct inode *inode,
				4765	long from, long to,
				4766	struct vm_area_struct *vma,
				4767	vm_flags_t vm_flags)
				4768	{
				4769	long ret, chg;
				4770	struct hstate *h = hstate_inode(inode);
				4771	struct hugepage_subpool *spool = subpool_inode(inode);
				4772	struct resv_map *resv_map;
				4773	long gbl_reserve;
				4774
				4775	/* This should never happen */
				4776	if (from > to) {
				4777	VM_WARN(1, "%s called with a negative range\n", __func__);
				4778	return -EINVAL;
				4779	}
				4780
				4781	/*
				4782	* Only apply hugepage reservation if asked. At fault time, an
				4783	* attempt will be made for VM_NORESERVE to allocate a page
				4784	* without using reserves
				4785	*/
				4786	if (vm_flags & VM_NORESERVE)
				4787	return 0;
				4788
				4789	/*
				4790	* Shared mappings base their reservation on the number of pages that
				4791	* are already allocated on behalf of the file. Private mappings need
				4792	* to reserve the full area even if read-only as mprotect() may be
				4793	* called to make the mapping read-write. Assume !vma is a shm mapping
				4794	*/
				4795	if (!vma \|\| vma->vm_flags & VM_MAYSHARE) {
				4796	/*
				4797	* resv_map can not be NULL as hugetlb_reserve_pages is only
				4798	* called for inodes for which resv_maps were created (see
				4799	* hugetlbfs_get_inode).
				4800	*/
				4801	resv_map = inode_resv_map(inode);
				4802
				4803	chg = region_chg(resv_map, from, to);
				4804
				4805	} else {
				4806	resv_map = resv_map_alloc();
				4807	if (!resv_map)
				4808	return -ENOMEM;
				4809
				4810	chg = to - from;
				4811
				4812	set_vma_resv_map(vma, resv_map);
				4813	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
				4814	}
				4815
				4816	if (chg < 0) {
				4817	ret = chg;
				4818	goto out_err;
				4819	}
				4820
				4821	/*
				4822	* There must be enough pages in the subpool for the mapping. If
				4823	* the subpool has a minimum size, there may be some global
				4824	* reservations already in place (gbl_reserve).
				4825	*/
				4826	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
				4827	if (gbl_reserve < 0) {
				4828	ret = -ENOSPC;
				4829	goto out_err;
				4830	}
				4831
				4832	/*
				4833	* Check enough hugepages are available for the reservation.
				4834	* Hand the pages back to the subpool if there are not
				4835	*/
				4836	ret = hugetlb_acct_memory(h, gbl_reserve);
				4837	if (ret < 0) {
				4838	/* put back original number of pages, chg */
				4839	(void)hugepage_subpool_put_pages(spool, chg);
				4840	goto out_err;
				4841	}
				4842
				4843	/*
				4844	* Account for the reservations made. Shared mappings record regions
				4845	* that have reservations as they are shared by multiple VMAs.
				4846	* When the last VMA disappears, the region map says how much
				4847	* the reservation was and the page cache tells how much of
				4848	* the reservation was consumed. Private mappings are per-VMA and
				4849	* only the consumed reservations are tracked. When the VMA
				4850	* disappears, the original reservation is the VMA size and the
				4851	* consumed reservations are stored in the map. Hence, nothing
				4852	* else has to be done for private mappings here
				4853	*/
				4854	if (!vma \|\| vma->vm_flags & VM_MAYSHARE) {
				4855	long add = region_add(resv_map, from, to);
				4856
				4857	if (unlikely(chg > add)) {
				4858	/*
				4859	* pages in this range were added to the reserve
				4860	* map between region_chg and region_add. This
				4861	* indicates a race with alloc_huge_page. Adjust
				4862	* the subpool and reserve counts modified above
				4863	* based on the difference.
				4864	*/
				4865	long rsv_adjust;
				4866
				4867	rsv_adjust = hugepage_subpool_put_pages(spool,
				4868	chg - add);
				4869	hugetlb_acct_memory(h, -rsv_adjust);
				4870	}
				4871	}
				4872	return 0;
				4873	out_err:
				4874	if (!vma \|\| vma->vm_flags & VM_MAYSHARE)
				4875	/* Don't call region_abort if region_chg failed */
				4876	if (chg >= 0)
				4877	region_abort(resv_map, from, to);
				4878	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
				4879	kref_put(&resv_map->refs, resv_map_release);
				4880	return ret;
				4881	}
				4882
				4883	long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
				4884	long freed)
				4885	{
				4886	struct hstate *h = hstate_inode(inode);
				4887	struct resv_map *resv_map = inode_resv_map(inode);
				4888	long chg = 0;
				4889	struct hugepage_subpool *spool = subpool_inode(inode);
				4890	long gbl_reserve;
				4891
				4892	/*
				4893	* Since this routine can be called in the evict inode path for all
				4894	* hugetlbfs inodes, resv_map could be NULL.
				4895	*/
				4896	if (resv_map) {
				4897	chg = region_del(resv_map, start, end);
				4898	/*
				4899	* region_del() can fail in the rare case where a region
				4900	* must be split and another region descriptor can not be
				4901	* allocated. If end == LONG_MAX, it will not fail.
				4902	*/
				4903	if (chg < 0)
				4904	return chg;
				4905	}
				4906
				4907	spin_lock(&inode->i_lock);
				4908	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
				4909	spin_unlock(&inode->i_lock);
				4910
				4911	/*
				4912	* If the subpool has a minimum size, the number of global
				4913	* reservations to be released may be adjusted.
				4914	*/
				4915	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
				4916	hugetlb_acct_memory(h, -gbl_reserve);
				4917
				4918	return 0;
				4919	}
				4920
				4921	#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
				4922	static unsigned long page_table_shareable(struct vm_area_struct *svma,
				4923	struct vm_area_struct *vma,
				4924	unsigned long addr, pgoff_t idx)
				4925	{
				4926	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
				4927	svma->vm_start;
				4928	unsigned long sbase = saddr & PUD_MASK;
				4929	unsigned long s_end = sbase + PUD_SIZE;
				4930
				4931	/* Allow segments to share if only one is marked locked */
				4932	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
				4933	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
				4934
				4935	/*
				4936	* match the virtual addresses, permission and the alignment of the
				4937	* page table page.
				4938	*/
				4939	if (pmd_index(addr) != pmd_index(saddr) \|\|
				4940	vm_flags != svm_flags \|\|
				4941	sbase < svma->vm_start \|\| svma->vm_end < s_end)
				4942	return 0;
				4943
				4944	return saddr;
				4945	}
				4946
				4947	static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
				4948	{
				4949	unsigned long base = addr & PUD_MASK;
				4950	unsigned long end = base + PUD_SIZE;
				4951
				4952	/*
				4953	* check on proper vm_flags and page table alignment
				4954	*/
				4955	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
				4956	return true;
				4957	return false;
				4958	}
				4959
				4960	bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
				4961	{
				4962	#ifdef CONFIG_USERFAULTFD
				4963	if (uffd_disable_huge_pmd_share(vma))
				4964	return false;
				4965	#endif
				4966	return vma_shareable(vma, addr);
				4967	}
				4968
				4969	/*
				4970	* Determine if start,end range within vma could be mapped by shared pmd.
				4971	* If yes, adjust start and end to cover range associated with possible
				4972	* shared pmd mappings.
				4973	*/
				4974	void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				4975	unsigned long start, unsigned long end)
				4976	{
				4977	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
				4978	v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
				4979
				4980	/*
				4981	* vma need span at least one aligned PUD size and the start,end range
				4982	* must at least partialy within it.
				4983	*/
				4984	if (!(vma->vm_flags & VM_MAYSHARE) \|\| !(v_end > v_start) \|\|
				4985	(end <= v_start) \|\| (start >= v_end))
				4986	return;
				4987
				4988	/* Extend the range to be PUD aligned for a worst case scenario */
				4989	if (*start > v_start)
				4990	start = ALIGN_DOWN(start, PUD_SIZE);
				4991
				4992	if (*end < v_end)
				4993	end = ALIGN(end, PUD_SIZE);
				4994	}
				4995
				4996	/*
				4997	* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
				4998	* and returns the corresponding pte. While this is not necessary for the
				4999	* !shared pmd case because we can allocate the pmd later as well, it makes the
				5000	* code much cleaner. pmd allocation is essential for the shared case because
				5001	* pud has to be populated inside the same i_mmap_rwsem section - otherwise
				5002	* racing tasks could either miss the sharing (see huge_pte_offset) or select a
				5003	* bad pmd for sharing.
				5004	*/
				5005	pte_t huge_pmd_share(struct mm_struct mm, struct vm_area_struct *vma,
				5006	unsigned long addr, pud_t *pud)
				5007	{
				5008	struct address_space *mapping = vma->vm_file->f_mapping;
				5009	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
				5010	vma->vm_pgoff;
				5011	struct vm_area_struct *svma;
				5012	unsigned long saddr;
				5013	pte_t *spte = NULL;
				5014	pte_t *pte;
				5015	spinlock_t *ptl;
				5016
				5017	i_mmap_lock_write(mapping);
				5018	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
				5019	if (svma == vma)
				5020	continue;
				5021
				5022	saddr = page_table_shareable(svma, vma, addr, idx);
				5023	if (saddr) {
				5024	spte = huge_pte_offset(svma->vm_mm, saddr,
				5025	vma_mmu_pagesize(svma));
				5026	if (spte) {
				5027	get_page(virt_to_page(spte));
				5028	break;
				5029	}
				5030	}
				5031	}
				5032
				5033	if (!spte)
				5034	goto out;
				5035
				5036	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
				5037	if (pud_none(*pud)) {
				5038	pud_populate(mm, pud,
				5039	(pmd_t *)((unsigned long)spte & PAGE_MASK));
				5040	mm_inc_nr_pmds(mm);
				5041	} else {
				5042	put_page(virt_to_page(spte));
				5043	}
				5044	spin_unlock(ptl);
				5045	out:
				5046	pte = (pte_t *)pmd_alloc(mm, pud, addr);
				5047	i_mmap_unlock_write(mapping);
				5048	return pte;
				5049	}
				5050
				5051	/*
				5052	* unmap huge page backed by shared pte.
				5053	*
				5054	* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
				5055	* indicated by page_count > 1, unmap is achieved by clearing pud and
				5056	* decrementing the ref count. If count == 1, the pte page is not shared.
				5057	*
				5058	* called with page table lock held.
				5059	*
				5060	* returns: 1 successfully unmapped a shared pte page
				5061	* 0 the underlying pte page is not shared, or it is the last user
				5062	*/
				5063	int huge_pmd_unshare(struct mm_struct mm, unsigned long addr, pte_t *ptep)
				5064	{
				5065	pgd_t pgd = pgd_offset(mm, addr);
				5066	p4d_t p4d = p4d_offset(pgd, addr);
				5067	pud_t pud = pud_offset(p4d, addr);
				5068
				5069	BUG_ON(page_count(virt_to_page(ptep)) == 0);
				5070	if (page_count(virt_to_page(ptep)) == 1)
				5071	return 0;
				5072
				5073	pud_clear(pud);
				5074	put_page(virt_to_page(ptep));
				5075	mm_dec_nr_pmds(mm);
				5076	/*
				5077	* This update of passed address optimizes loops sequentially
				5078	* processing addresses in increments of huge page size (PMD_SIZE
				5079	* in this case). By clearing the pud, a PUD_SIZE area is unmapped.
				5080	* Update address to the 'last page' in the cleared area so that
				5081	* calling loop can move to first page past this area.
				5082	*/
				5083	*addr \|= PUD_SIZE - PMD_SIZE;
				5084	return 1;
				5085	}
				5086
				5087	#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
				5088	pte_t huge_pmd_share(struct mm_struct mm, struct vm_area_struct *vma,
				5089	unsigned long addr, pud_t *pud)
				5090	{
				5091	return NULL;
				5092	}
				5093
				5094	int huge_pmd_unshare(struct mm_struct mm, unsigned long addr, pte_t *ptep)
				5095	{
				5096	return 0;
				5097	}
				5098
				5099	void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				5100	unsigned long start, unsigned long end)
				5101	{
				5102	}
				5103
				5104	bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
				5105	{
				5106	return false;
				5107	}
				5108	#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
				5109
				5110	#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
				5111	pte_t huge_pte_alloc(struct mm_struct mm, struct vm_area_struct *vma,
				5112	unsigned long addr, unsigned long sz)
				5113	{
				5114	pgd_t *pgd;
				5115	p4d_t *p4d;
				5116	pud_t *pud;
				5117	pte_t *pte = NULL;
				5118
				5119	pgd = pgd_offset(mm, addr);
				5120	p4d = p4d_alloc(mm, pgd, addr);
				5121	if (!p4d)
				5122	return NULL;
				5123	pud = pud_alloc(mm, p4d, addr);
				5124	if (pud) {
				5125	if (sz == PUD_SIZE) {
				5126	pte = (pte_t *)pud;
				5127	} else {
				5128	BUG_ON(sz != PMD_SIZE);
				5129	if (want_pmd_share(vma, addr) && pud_none(*pud))
				5130	pte = huge_pmd_share(mm, vma, addr, pud);
				5131	else
				5132	pte = (pte_t *)pmd_alloc(mm, pud, addr);
				5133	}
				5134	}
				5135	BUG_ON(pte && pte_present(pte) && !pte_huge(pte));
				5136
				5137	return pte;
				5138	}
				5139
				5140	/*
				5141	* huge_pte_offset() - Walk the page table to resolve the hugepage
				5142	* entry at address @addr
				5143	*
				5144	* Return: Pointer to page table or swap entry (PUD or PMD) for
				5145	* address @addr, or NULL if a p*d_none() entry is encountered and the
				5146	* size @sz doesn't match the hugepage size at this level of the page
				5147	* table.
				5148	*/
				5149	pte_t huge_pte_offset(struct mm_struct mm,
				5150	unsigned long addr, unsigned long sz)
				5151	{
				5152	pgd_t *pgd;
				5153	p4d_t *p4d;
				5154	pud_t *pud, pud_entry;
				5155	pmd_t *pmd, pmd_entry;
				5156
				5157	pgd = pgd_offset(mm, addr);
				5158	if (!pgd_present(*pgd))
				5159	return NULL;
				5160	p4d = p4d_offset(pgd, addr);
				5161	if (!p4d_present(*p4d))
				5162	return NULL;
				5163
				5164	pud = pud_offset(p4d, addr);
				5165	pud_entry = READ_ONCE(*pud);
				5166	if (sz != PUD_SIZE && pud_none(pud_entry))
				5167	return NULL;
				5168	/* hugepage or swap? */
				5169	if (pud_huge(pud_entry) \|\| !pud_present(pud_entry))
				5170	return (pte_t *)pud;
				5171
				5172	pmd = pmd_offset(pud, addr);
				5173	pmd_entry = READ_ONCE(*pmd);
				5174	if (sz != PMD_SIZE && pmd_none(pmd_entry))
				5175	return NULL;
				5176	/* hugepage or swap? */
				5177	if (pmd_huge(pmd_entry) \|\| !pmd_present(pmd_entry))
				5178	return (pte_t *)pmd;
				5179
				5180	return NULL;
				5181	}
				5182
				5183	#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
				5184
				5185	/*
				5186	* These functions are overwritable if your architecture needs its own
				5187	* behavior.
				5188	*/
				5189	struct page * __weak
				5190	follow_huge_addr(struct mm_struct *mm, unsigned long address,
				5191	int write)
				5192	{
				5193	return ERR_PTR(-EINVAL);
				5194	}
				5195
				5196	struct page * __weak
				5197	follow_huge_pd(struct vm_area_struct *vma,
				5198	unsigned long address, hugepd_t hpd, int flags, int pdshift)
				5199	{
				5200	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
				5201	return NULL;
				5202	}
				5203
				5204	struct page * __weak
				5205	follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
				5206	{
				5207	struct hstate *h = hstate_vma(vma);
				5208	struct mm_struct *mm = vma->vm_mm;
				5209	struct page *page = NULL;
				5210	spinlock_t *ptl;
				5211	pte_t *ptep, pte;
				5212
				5213	retry:
				5214	ptep = huge_pte_offset(mm, address, huge_page_size(h));
				5215	if (!ptep)
				5216	return NULL;
				5217
				5218	ptl = huge_pte_lock(h, mm, ptep);
				5219	pte = huge_ptep_get(ptep);
				5220	if (pte_present(pte)) {
				5221	page = pte_page(pte) +
				5222	((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
				5223	if (flags & FOLL_GET)
				5224	get_page(page);
				5225	} else {
				5226	if (is_hugetlb_entry_migration(pte)) {
				5227	spin_unlock(ptl);
				5228	__migration_entry_wait(mm, ptep, ptl);
				5229	goto retry;
				5230	}
				5231	/*
				5232	* hwpoisoned entry is treated as no_page_table in
				5233	* follow_page_mask().
				5234	*/
				5235	}
				5236
				5237	spin_unlock(ptl);
				5238	return page;
				5239	}
				5240
				5241	struct page * __weak
				5242	follow_huge_pud(struct mm_struct *mm, unsigned long address,
				5243	pud_t *pud, int flags)
				5244	{
				5245	if (flags & FOLL_GET)
				5246	return NULL;
				5247
				5248	return pte_page((pte_t )pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
				5249	}
				5250
				5251	struct page * __weak
				5252	follow_huge_pgd(struct mm_struct mm, unsigned long address, pgd_t pgd, int flags)
				5253	{
				5254	if (flags & FOLL_GET)
				5255	return NULL;
				5256
				5257	return pte_page((pte_t )pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
				5258	}
				5259
				5260	bool isolate_huge_page(struct page page, struct list_head list)
				5261	{
				5262	bool ret = true;
				5263
				5264	spin_lock(&hugetlb_lock);
				5265	if (!PageHeadHuge(page) \|\| !page_huge_active(page) \|\|
				5266	!get_page_unless_zero(page)) {
				5267	ret = false;
				5268	goto unlock;
				5269	}
				5270	clear_page_huge_active(page);
				5271	list_move_tail(&page->lru, list);
				5272	unlock:
				5273	spin_unlock(&hugetlb_lock);
				5274	return ret;
				5275	}
				5276
				5277	void putback_active_hugepage(struct page *page)
				5278	{
				5279	VM_BUG_ON_PAGE(!PageHead(page), page);
				5280	spin_lock(&hugetlb_lock);
				5281	set_page_huge_active(page);
				5282	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
				5283	spin_unlock(&hugetlb_lock);
				5284	put_page(page);
				5285	}
				5286
				5287	void move_hugetlb_state(struct page oldpage, struct page newpage, int reason)
				5288	{
				5289	struct hstate *h = page_hstate(oldpage);
				5290
				5291	hugetlb_cgroup_migrate(oldpage, newpage);
				5292	set_page_owner_migrate_reason(newpage, reason);
				5293
				5294	/*
				5295	* transfer temporary state of the new huge page. This is
				5296	* reverse to other transitions because the newpage is going to
				5297	* be final while the old one will be freed so it takes over
				5298	* the temporary status.
				5299	*
				5300	* Also note that we have to transfer the per-node surplus state
				5301	* here as well otherwise the global surplus count will not match
				5302	* the per-node's.
				5303	*/
				5304	if (PageHugeTemporary(newpage)) {
				5305	int old_nid = page_to_nid(oldpage);
				5306	int new_nid = page_to_nid(newpage);
				5307
				5308	SetPageHugeTemporary(oldpage);
				5309	ClearPageHugeTemporary(newpage);
				5310
				5311	spin_lock(&hugetlb_lock);
				5312	if (h->surplus_huge_pages_node[old_nid]) {
				5313	h->surplus_huge_pages_node[old_nid]--;
				5314	h->surplus_huge_pages_node[new_nid]++;
				5315	}
				5316	spin_unlock(&hugetlb_lock);
				5317	}
				5318	}
				5319
				5320	/*
				5321	* This function will unconditionally remove all the shared pmd pgtable entries
				5322	* within the specific vma for a hugetlbfs memory range.
				5323	*/
				5324	void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
				5325	{
				5326	struct hstate *h = hstate_vma(vma);
				5327	unsigned long sz = huge_page_size(h);
				5328	struct mm_struct *mm = vma->vm_mm;
				5329	struct mmu_notifier_range range;
				5330	unsigned long address, start, end;
				5331	spinlock_t *ptl;
				5332	pte_t *ptep;
				5333
				5334	if (!(vma->vm_flags & VM_MAYSHARE))
				5335	return;
				5336
				5337	start = ALIGN(vma->vm_start, PUD_SIZE);
				5338	end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
				5339
				5340	if (start >= end)
				5341	return;
				5342
				5343	/*
				5344	* No need to call adjust_range_if_pmd_sharing_possible(), because
				5345	* we have already done the PUD_SIZE alignment.
				5346	*/
				5347	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				5348	start, end);
				5349	mmu_notifier_invalidate_range_start(&range);
				5350	i_mmap_lock_write(vma->vm_file->f_mapping);
				5351	for (address = start; address < end; address += PUD_SIZE) {
				5352	unsigned long tmp = address;
				5353
				5354	ptep = huge_pte_offset(mm, address, sz);
				5355	if (!ptep)
				5356	continue;
				5357	ptl = huge_pte_lock(h, mm, ptep);
				5358	/* We don't want 'address' to be changed */
				5359	huge_pmd_unshare(mm, &tmp, ptep);
				5360	spin_unlock(ptl);
				5361	}
				5362	flush_hugetlb_tlb_range(vma, start, end);
				5363	i_mmap_unlock_write(vma->vm_file->f_mapping);
				5364	/*
				5365	* No need to call mmu_notifier_invalidate_range(), see
				5366	* Documentation/vm/mmu_notifier.rst.
				5367	*/
				5368	mmu_notifier_invalidate_range_end(&range);
				5369	}
				5370