Blame - src/kernel/linux/v4.14/arch/powerpc/mm/hugetlbpage.c - T103

blob: e2d929ddad7f7ed6691757515c0c86aa71813198 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* PPC Huge TLB Page Support for Kernel.
				3	*
				4	* Copyright (C) 2003 David Gibson, IBM Corporation.
				5	* Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
				6	*
				7	* Based on the IA-32 version:
				8	* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/io.h>
				13	#include <linux/slab.h>
				14	#include <linux/hugetlb.h>
				15	#include <linux/export.h>
				16	#include <linux/of_fdt.h>
				17	#include <linux/memblock.h>
				18	#include <linux/bootmem.h>
				19	#include <linux/moduleparam.h>
				20	#include <linux/swap.h>
				21	#include <linux/swapops.h>
				22	#include <linux/kmemleak.h>
				23	#include <asm/pgtable.h>
				24	#include <asm/pgalloc.h>
				25	#include <asm/tlb.h>
				26	#include <asm/setup.h>
				27	#include <asm/hugetlb.h>
				28	#include <asm/pte-walk.h>
				29
				30
				31	#ifdef CONFIG_HUGETLB_PAGE
				32
				33	#define PAGE_SHIFT_64K 16
				34	#define PAGE_SHIFT_512K 19
				35	#define PAGE_SHIFT_8M 23
				36	#define PAGE_SHIFT_16M 24
				37	#define PAGE_SHIFT_16G 34
				38
				39	unsigned int HPAGE_SHIFT;
				40	EXPORT_SYMBOL(HPAGE_SHIFT);
				41
				42	#define hugepd_none(hpd) (hpd_val(hpd) == 0)
				43
				44	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr, unsigned long sz)
				45	{
				46	/*
				47	* Only called for hugetlbfs pages, hence can ignore THP and the
				48	* irq disabled walk.
				49	*/
				50	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
				51	}
				52
				53	static int __hugepte_alloc(struct mm_struct mm, hugepd_t hpdp,
				54	unsigned long address, unsigned pdshift, unsigned pshift)
				55	{
				56	struct kmem_cache *cachep;
				57	pte_t *new;
				58	int i;
				59	int num_hugepd;
				60
				61	if (pshift >= pdshift) {
				62	cachep = hugepte_cache;
				63	num_hugepd = 1 << (pshift - pdshift);
				64	} else {
				65	cachep = PGT_CACHE(pdshift - pshift);
				66	num_hugepd = 1;
				67	}
				68
				69	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
				70
				71	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
				72	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
				73
				74	if (! new)
				75	return -ENOMEM;
				76
				77	/*
				78	* Make sure other cpus find the hugepd set only after a
				79	* properly initialized page table is visible to them.
				80	* For more details look for comment in __pte_alloc().
				81	*/
				82	smp_wmb();
				83
				84	spin_lock(&mm->page_table_lock);
				85
				86	/*
				87	* We have multiple higher-level entries that point to the same
				88	* actual pte location. Fill in each as we go and backtrack on error.
				89	* We need all of these so the DTLB pgtable walk code can find the
				90	* right higher-level entry without knowing if it's a hugepage or not.
				91	*/
				92	for (i = 0; i < num_hugepd; i++, hpdp++) {
				93	if (unlikely(!hugepd_none(*hpdp)))
				94	break;
				95	else {
				96	#ifdef CONFIG_PPC_BOOK3S_64
				97	*hpdp = __hugepd(__pa(new) \|
				98	(shift_to_mmu_psize(pshift) << 2));
				99	#elif defined(CONFIG_PPC_8xx)
				100	*hpdp = __hugepd(__pa(new) \|
				101	(pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
				102	_PMD_PAGE_512K) \| _PMD_PRESENT);
				103	#else
				104	/* We use the old format for PPC_FSL_BOOK3E */
				105	*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) \| pshift);
				106	#endif
				107	}
				108	}
				109	/* If we bailed from the for loop early, an error occurred, clean up */
				110	if (i < num_hugepd) {
				111	for (i = i - 1 ; i >= 0; i--, hpdp--)
				112	*hpdp = __hugepd(0);
				113	kmem_cache_free(cachep, new);
				114	} else {
				115	kmemleak_ignore(new);
				116	}
				117	spin_unlock(&mm->page_table_lock);
				118	return 0;
				119	}
				120
				121	/*
				122	* These macros define how to determine which level of the page table holds
				123	* the hpdp.
				124	*/
				125	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				126	#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
				127	#define HUGEPD_PUD_SHIFT PUD_SHIFT
				128	#else
				129	#define HUGEPD_PGD_SHIFT PUD_SHIFT
				130	#define HUGEPD_PUD_SHIFT PMD_SHIFT
				131	#endif
				132
				133	/*
				134	* At this point we do the placement change only for BOOK3S 64. This would
				135	* possibly work on other subarchs.
				136	*/
				137	pte_t huge_pte_alloc(struct mm_struct mm, unsigned long addr, unsigned long sz)
				138	{
				139	pgd_t *pg;
				140	pud_t *pu;
				141	pmd_t *pm;
				142	hugepd_t *hpdp = NULL;
				143	unsigned pshift = __ffs(sz);
				144	unsigned pdshift = PGDIR_SHIFT;
				145
				146	addr &= ~(sz-1);
				147	pg = pgd_offset(mm, addr);
				148
				149	#ifdef CONFIG_PPC_BOOK3S_64
				150	if (pshift == PGDIR_SHIFT)
				151	/* 16GB huge page */
				152	return (pte_t *) pg;
				153	else if (pshift > PUD_SHIFT)
				154	/*
				155	* We need to use hugepd table
				156	*/
				157	hpdp = (hugepd_t *)pg;
				158	else {
				159	pdshift = PUD_SHIFT;
				160	pu = pud_alloc(mm, pg, addr);
				161	if (pshift == PUD_SHIFT)
				162	return (pte_t *)pu;
				163	else if (pshift > PMD_SHIFT)
				164	hpdp = (hugepd_t *)pu;
				165	else {
				166	pdshift = PMD_SHIFT;
				167	pm = pmd_alloc(mm, pu, addr);
				168	if (pshift == PMD_SHIFT)
				169	/* 16MB hugepage */
				170	return (pte_t *)pm;
				171	else
				172	hpdp = (hugepd_t *)pm;
				173	}
				174	}
				175	#else
				176	if (pshift >= HUGEPD_PGD_SHIFT) {
				177	hpdp = (hugepd_t *)pg;
				178	} else {
				179	pdshift = PUD_SHIFT;
				180	pu = pud_alloc(mm, pg, addr);
				181	if (pshift >= HUGEPD_PUD_SHIFT) {
				182	hpdp = (hugepd_t *)pu;
				183	} else {
				184	pdshift = PMD_SHIFT;
				185	pm = pmd_alloc(mm, pu, addr);
				186	hpdp = (hugepd_t *)pm;
				187	}
				188	}
				189	#endif
				190	if (!hpdp)
				191	return NULL;
				192
				193	BUG_ON(!hugepd_none(hpdp) && !hugepd_ok(hpdp));
				194
				195	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
				196	return NULL;
				197
				198	return hugepte_offset(*hpdp, addr, pdshift);
				199	}
				200
				201	#ifdef CONFIG_PPC_BOOK3S_64
				202	/*
				203	* Tracks gpages after the device tree is scanned and before the
				204	* huge_boot_pages list is ready on pseries.
				205	*/
				206	#define MAX_NUMBER_GPAGES 1024
				207	__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
				208	__initdata static unsigned nr_gpages;
				209
				210	/*
				211	* Build list of addresses of gigantic pages. This function is used in early
				212	* boot before the buddy allocator is setup.
				213	*/
				214	void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
				215	{
				216	if (!addr)
				217	return;
				218	while (number_of_pages > 0) {
				219	gpage_freearray[nr_gpages] = addr;
				220	nr_gpages++;
				221	number_of_pages--;
				222	addr += page_size;
				223	}
				224	}
				225
				226	int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
				227	{
				228	struct huge_bootmem_page *m;
				229	if (nr_gpages == 0)
				230	return 0;
				231	m = phys_to_virt(gpage_freearray[--nr_gpages]);
				232	gpage_freearray[nr_gpages] = 0;
				233	list_add(&m->list, &huge_boot_pages);
				234	m->hstate = hstate;
				235	return 1;
				236	}
				237	#endif
				238
				239
				240	int __init alloc_bootmem_huge_page(struct hstate *h)
				241	{
				242
				243	#ifdef CONFIG_PPC_BOOK3S_64
				244	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
				245	return pseries_alloc_bootmem_huge_page(h);
				246	#endif
				247	return __alloc_bootmem_huge_page(h);
				248	}
				249
				250	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				251	#define HUGEPD_FREELIST_SIZE \
				252	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
				253
				254	struct hugepd_freelist {
				255	struct rcu_head rcu;
				256	unsigned int index;
				257	void *ptes[0];
				258	};
				259
				260	static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
				261
				262	static void hugepd_free_rcu_callback(struct rcu_head *head)
				263	{
				264	struct hugepd_freelist *batch =
				265	container_of(head, struct hugepd_freelist, rcu);
				266	unsigned int i;
				267
				268	for (i = 0; i < batch->index; i++)
				269	kmem_cache_free(hugepte_cache, batch->ptes[i]);
				270
				271	free_page((unsigned long)batch);
				272	}
				273
				274	static void hugepd_free(struct mmu_gather tlb, void hugepte)
				275	{
				276	struct hugepd_freelist **batchp;
				277
				278	batchp = &get_cpu_var(hugepd_freelist_cur);
				279
				280	if (atomic_read(&tlb->mm->mm_users) < 2 \|\|
				281	mm_is_thread_local(tlb->mm)) {
				282	kmem_cache_free(hugepte_cache, hugepte);
				283	put_cpu_var(hugepd_freelist_cur);
				284	return;
				285	}
				286
				287	if (*batchp == NULL) {
				288	batchp = (struct hugepd_freelist )__get_free_page(GFP_ATOMIC);
				289	(*batchp)->index = 0;
				290	}
				291
				292	(batchp)->ptes[(batchp)->index++] = hugepte;
				293	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
				294	call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
				295	*batchp = NULL;
				296	}
				297	put_cpu_var(hugepd_freelist_cur);
				298	}
				299	#else
				300	static inline void hugepd_free(struct mmu_gather tlb, void hugepte) {}
				301	#endif
				302
				303	static void free_hugepd_range(struct mmu_gather tlb, hugepd_t hpdp, int pdshift,
				304	unsigned long start, unsigned long end,
				305	unsigned long floor, unsigned long ceiling)
				306	{
				307	pte_t hugepte = hugepd_page(hpdp);
				308	int i;
				309
				310	unsigned long pdmask = ~((1UL << pdshift) - 1);
				311	unsigned int num_hugepd = 1;
				312	unsigned int shift = hugepd_shift(*hpdp);
				313
				314	/* Note: On fsl the hpdp may be the first of several */
				315	if (shift > pdshift)
				316	num_hugepd = 1 << (shift - pdshift);
				317
				318	start &= pdmask;
				319	if (start < floor)
				320	return;
				321	if (ceiling) {
				322	ceiling &= pdmask;
				323	if (! ceiling)
				324	return;
				325	}
				326	if (end - 1 > ceiling - 1)
				327	return;
				328
				329	for (i = 0; i < num_hugepd; i++, hpdp++)
				330	*hpdp = __hugepd(0);
				331
				332	if (shift >= pdshift)
				333	hugepd_free(tlb, hugepte);
				334	else
				335	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
				336	}
				337
				338	static void hugetlb_free_pmd_range(struct mmu_gather tlb, pud_t pud,
				339	unsigned long addr, unsigned long end,
				340	unsigned long floor, unsigned long ceiling)
				341	{
				342	pmd_t *pmd;
				343	unsigned long next;
				344	unsigned long start;
				345
				346	start = addr;
				347	do {
				348	unsigned long more;
				349
				350	pmd = pmd_offset(pud, addr);
				351	next = pmd_addr_end(addr, end);
				352	if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
				353	/*
				354	* if it is not hugepd pointer, we should already find
				355	* it cleared.
				356	*/
				357	WARN_ON(!pmd_none_or_clear_bad(pmd));
				358	continue;
				359	}
				360	/*
				361	* Increment next by the size of the huge mapping since
				362	* there may be more than one entry at this level for a
				363	* single hugepage, but all of them point to
				364	* the same kmem cache that holds the hugepte.
				365	*/
				366	more = addr + (1 << hugepd_shift((hugepd_t )pmd));
				367	if (more > next)
				368	next = more;
				369
				370	free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				371	addr, next, floor, ceiling);
				372	} while (addr = next, addr != end);
				373
				374	start &= PUD_MASK;
				375	if (start < floor)
				376	return;
				377	if (ceiling) {
				378	ceiling &= PUD_MASK;
				379	if (!ceiling)
				380	return;
				381	}
				382	if (end - 1 > ceiling - 1)
				383	return;
				384
				385	pmd = pmd_offset(pud, start);
				386	pud_clear(pud);
				387	pmd_free_tlb(tlb, pmd, start);
				388	mm_dec_nr_pmds(tlb->mm);
				389	}
				390
				391	static void hugetlb_free_pud_range(struct mmu_gather tlb, pgd_t pgd,
				392	unsigned long addr, unsigned long end,
				393	unsigned long floor, unsigned long ceiling)
				394	{
				395	pud_t *pud;
				396	unsigned long next;
				397	unsigned long start;
				398
				399	start = addr;
				400	do {
				401	pud = pud_offset(pgd, addr);
				402	next = pud_addr_end(addr, end);
				403	if (!is_hugepd(__hugepd(pud_val(*pud)))) {
				404	if (pud_none_or_clear_bad(pud))
				405	continue;
				406	hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
				407	ceiling);
				408	} else {
				409	unsigned long more;
				410	/*
				411	* Increment next by the size of the huge mapping since
				412	* there may be more than one entry at this level for a
				413	* single hugepage, but all of them point to
				414	* the same kmem cache that holds the hugepte.
				415	*/
				416	more = addr + (1 << hugepd_shift((hugepd_t )pud));
				417	if (more > next)
				418	next = more;
				419
				420	free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
				421	addr, next, floor, ceiling);
				422	}
				423	} while (addr = next, addr != end);
				424
				425	start &= PGDIR_MASK;
				426	if (start < floor)
				427	return;
				428	if (ceiling) {
				429	ceiling &= PGDIR_MASK;
				430	if (!ceiling)
				431	return;
				432	}
				433	if (end - 1 > ceiling - 1)
				434	return;
				435
				436	pud = pud_offset(pgd, start);
				437	pgd_clear(pgd);
				438	pud_free_tlb(tlb, pud, start);
				439	}
				440
				441	/*
				442	* This function frees user-level page tables of a process.
				443	*/
				444	void hugetlb_free_pgd_range(struct mmu_gather *tlb,
				445	unsigned long addr, unsigned long end,
				446	unsigned long floor, unsigned long ceiling)
				447	{
				448	pgd_t *pgd;
				449	unsigned long next;
				450
				451	/*
				452	* Because there are a number of different possible pagetable
				453	* layouts for hugepage ranges, we limit knowledge of how
				454	* things should be laid out to the allocation path
				455	* (huge_pte_alloc(), above). Everything else works out the
				456	* structure as it goes from information in the hugepd
				457	* pointers. That means that we can't here use the
				458	* optimization used in the normal page free_pgd_range(), of
				459	* checking whether we're actually covering a large enough
				460	* range to have to do anything at the top level of the walk
				461	* instead of at the bottom.
				462	*
				463	* To make sense of this, you should probably go read the big
				464	* block comment at the top of the normal free_pgd_range(),
				465	* too.
				466	*/
				467
				468	do {
				469	next = pgd_addr_end(addr, end);
				470	pgd = pgd_offset(tlb->mm, addr);
				471	if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
				472	if (pgd_none_or_clear_bad(pgd))
				473	continue;
				474	hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
				475	} else {
				476	unsigned long more;
				477	/*
				478	* Increment next by the size of the huge mapping since
				479	* there may be more than one entry at the pgd level
				480	* for a single hugepage, but all of them point to the
				481	* same kmem cache that holds the hugepte.
				482	*/
				483	more = addr + (1 << hugepd_shift((hugepd_t )pgd));
				484	if (more > next)
				485	next = more;
				486
				487	free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
				488	addr, next, floor, ceiling);
				489	}
				490	} while (addr = next, addr != end);
				491	}
				492
				493	struct page follow_huge_pd(struct vm_area_struct vma,
				494	unsigned long address, hugepd_t hpd,
				495	int flags, int pdshift)
				496	{
				497	pte_t *ptep;
				498	spinlock_t *ptl;
				499	struct page *page = NULL;
				500	unsigned long mask;
				501	int shift = hugepd_shift(hpd);
				502	struct mm_struct *mm = vma->vm_mm;
				503
				504	retry:
				505	ptl = &mm->page_table_lock;
				506	spin_lock(ptl);
				507
				508	ptep = hugepte_offset(hpd, address, pdshift);
				509	if (pte_present(*ptep)) {
				510	mask = (1UL << shift) - 1;
				511	page = pte_page(*ptep);
				512	page += ((address & mask) >> PAGE_SHIFT);
				513	if (flags & FOLL_GET)
				514	get_page(page);
				515	} else {
				516	if (is_hugetlb_entry_migration(*ptep)) {
				517	spin_unlock(ptl);
				518	__migration_entry_wait(mm, ptep, ptl);
				519	goto retry;
				520	}
				521	}
				522	spin_unlock(ptl);
				523	return page;
				524	}
				525
				526	static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				527	unsigned long sz)
				528	{
				529	unsigned long __boundary = (addr + sz) & ~(sz-1);
				530	return (__boundary - 1 < end - 1) ? __boundary : end;
				531	}
				532
				533	int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
				534	unsigned long end, int write, struct page *pages, int nr)
				535	{
				536	pte_t *ptep;
				537	unsigned long sz = 1UL << hugepd_shift(hugepd);
				538	unsigned long next;
				539
				540	ptep = hugepte_offset(hugepd, addr, pdshift);
				541	do {
				542	next = hugepte_addr_end(addr, end, sz);
				543	if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
				544	return 0;
				545	} while (ptep++, addr = next, addr != end);
				546
				547	return 1;
				548	}
				549
				550	#ifdef CONFIG_PPC_MM_SLICES
				551	unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
				552	unsigned long len, unsigned long pgoff,
				553	unsigned long flags)
				554	{
				555	struct hstate *hstate = hstate_file(file);
				556	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
				557
				558	#ifdef CONFIG_PPC_RADIX_MMU
				559	if (radix_enabled())
				560	return radix__hugetlb_get_unmapped_area(file, addr, len,
				561	pgoff, flags);
				562	#endif
				563	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
				564	}
				565	#endif
				566
				567	unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
				568	{
				569	#ifdef CONFIG_PPC_MM_SLICES
				570	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
				571	/* With radix we don't use slice, so derive it from vma*/
				572	if (!radix_enabled())
				573	return 1UL << mmu_psize_to_shift(psize);
				574	#endif
				575	if (!is_vm_hugetlb_page(vma))
				576	return PAGE_SIZE;
				577
				578	return huge_page_size(hstate_vma(vma));
				579	}
				580
				581	static inline bool is_power_of_4(unsigned long x)
				582	{
				583	if (is_power_of_2(x))
				584	return (__ilog2(x) % 2) ? false : true;
				585	return false;
				586	}
				587
				588	static int __init add_huge_page_size(unsigned long long size)
				589	{
				590	int shift = __ffs(size);
				591	int mmu_psize;
				592
				593	/* Check that it is a page size supported by the hardware and
				594	* that it fits within pagetable and slice limits. */
				595	if (size <= PAGE_SIZE)
				596	return -EINVAL;
				597	#if defined(CONFIG_PPC_FSL_BOOK3E)
				598	if (!is_power_of_4(size))
				599	return -EINVAL;
				600	#elif !defined(CONFIG_PPC_8xx)
				601	if (!is_power_of_2(size) \|\| (shift > SLICE_HIGH_SHIFT))
				602	return -EINVAL;
				603	#endif
				604
				605	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
				606	return -EINVAL;
				607
				608	#ifdef CONFIG_PPC_BOOK3S_64
				609	/*
				610	* We need to make sure that for different page sizes reported by
				611	* firmware we only add hugetlb support for page sizes that can be
				612	* supported by linux page table layout.
				613	* For now we have
				614	* Radix: 2M
				615	* Hash: 16M and 16G
				616	*/
				617	if (radix_enabled()) {
				618	if (mmu_psize != MMU_PAGE_2M) {
				619	if (cpu_has_feature(CPU_FTR_POWER9_DD1) \|\|
				620	(mmu_psize != MMU_PAGE_1G))
				621	return -EINVAL;
				622	}
				623	} else {
				624	if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
				625	return -EINVAL;
				626	}
				627	#endif
				628
				629	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
				630
				631	/* Return if huge page size has already been setup */
				632	if (size_to_hstate(size))
				633	return 0;
				634
				635	hugetlb_add_hstate(shift - PAGE_SHIFT);
				636
				637	return 0;
				638	}
				639
				640	static int __init hugepage_setup_sz(char *str)
				641	{
				642	unsigned long long size;
				643
				644	size = memparse(str, &str);
				645
				646	if (add_huge_page_size(size) != 0) {
				647	hugetlb_bad_size();
				648	pr_err("Invalid huge page size specified(%llu)\n", size);
				649	}
				650
				651	return 1;
				652	}
				653	__setup("hugepagesz=", hugepage_setup_sz);
				654
				655	struct kmem_cache *hugepte_cache;
				656	static int __init hugetlbpage_init(void)
				657	{
				658	int psize;
				659
				660	#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
				661	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
				662	return -ENODEV;
				663	#endif
				664	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
				665	unsigned shift;
				666	unsigned pdshift;
				667
				668	if (!mmu_psize_defs[psize].shift)
				669	continue;
				670
				671	shift = mmu_psize_to_shift(psize);
				672
				673	if (add_huge_page_size(1ULL << shift) < 0)
				674	continue;
				675
				676	if (shift < HUGEPD_PUD_SHIFT)
				677	pdshift = PMD_SHIFT;
				678	else if (shift < HUGEPD_PGD_SHIFT)
				679	pdshift = PUD_SHIFT;
				680	else
				681	pdshift = PGDIR_SHIFT;
				682	/*
				683	* if we have pdshift and shift value same, we don't
				684	* use pgt cache for hugepd.
				685	*/
				686	if (pdshift > shift)
				687	pgtable_cache_add(pdshift - shift, NULL);
				688	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				689	else if (!hugepte_cache) {
				690	/*
				691	* Create a kmem cache for hugeptes. The bottom bits in
				692	* the pte have size information encoded in them, so
				693	* align them to allow this
				694	*/
				695	hugepte_cache = kmem_cache_create("hugepte-cache",
				696	sizeof(pte_t),
				697	HUGEPD_SHIFT_MASK + 1,
				698	0, NULL);
				699	if (hugepte_cache == NULL)
				700	panic("%s: Unable to create kmem cache "
				701	"for hugeptes\n", __func__);
				702
				703	}
				704	#endif
				705	}
				706
				707	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				708	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
				709	if (mmu_psize_defs[MMU_PAGE_4M].shift)
				710	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
				711	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
				712	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
				713	#else
				714	/* Set default large page size. Currently, we pick 16M or 1M
				715	* depending on what is available
				716	*/
				717	if (mmu_psize_defs[MMU_PAGE_16M].shift)
				718	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
				719	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
				720	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
				721	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
				722	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
				723	#endif
				724	return 0;
				725	}
				726
				727	arch_initcall(hugetlbpage_init);
				728
				729	void flush_dcache_icache_hugepage(struct page *page)
				730	{
				731	int i;
				732	void *start;
				733
				734	BUG_ON(!PageCompound(page));
				735
				736	for (i = 0; i < (1UL << compound_order(page)); i++) {
				737	if (!PageHighMem(page)) {
				738	__flush_dcache_icache(page_address(page+i));
				739	} else {
				740	start = kmap_atomic(page+i);
				741	__flush_dcache_icache(start);
				742	kunmap_atomic(start);
				743	}
				744	}
				745	}
				746
				747	#endif /* CONFIG_HUGETLB_PAGE */
				748
				749	/*
				750	* We have 4 cases for pgds and pmds:
				751	* (1) invalid (all zeroes)
				752	* (2) pointer to next table, as normal; bottom 6 bits == 0
				753	* (3) leaf pte for huge page _PAGE_PTE set
				754	* (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
				755	*
				756	* So long as we atomically load page table pointers we are safe against teardown,
				757	* we can follow the address down to the the page and take a ref on it.
				758	* This function need to be called with interrupts disabled. We use this variant
				759	* when we have MSR[EE] = 0 but the paca->soft_enabled = 1
				760	*/
				761	pte_t __find_linux_pte(pgd_t pgdir, unsigned long ea,
				762	bool is_thp, unsigned hpage_shift)
				763	{
				764	pgd_t pgd, *pgdp;
				765	pud_t pud, *pudp;
				766	pmd_t pmd, *pmdp;
				767	pte_t *ret_pte;
				768	hugepd_t *hpdp = NULL;
				769	unsigned pdshift = PGDIR_SHIFT;
				770
				771	if (hpage_shift)
				772	*hpage_shift = 0;
				773
				774	if (is_thp)
				775	*is_thp = false;
				776
				777	pgdp = pgdir + pgd_index(ea);
				778	pgd = READ_ONCE(*pgdp);
				779	/*
				780	* Always operate on the local stack value. This make sure the
				781	* value don't get updated by a parallel THP split/collapse,
				782	* page fault or a page unmap. The return pte_t * is still not
				783	* stable. So should be checked there for above conditions.
				784	*/
				785	if (pgd_none(pgd))
				786	return NULL;
				787	else if (pgd_huge(pgd)) {
				788	ret_pte = (pte_t *) pgdp;
				789	goto out;
				790	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
				791	hpdp = (hugepd_t *)&pgd;
				792	else {
				793	/*
				794	* Even if we end up with an unmap, the pgtable will not
				795	* be freed, because we do an rcu free and here we are
				796	* irq disabled
				797	*/
				798	pdshift = PUD_SHIFT;
				799	pudp = pud_offset(&pgd, ea);
				800	pud = READ_ONCE(*pudp);
				801
				802	if (pud_none(pud))
				803	return NULL;
				804	else if (pud_huge(pud)) {
				805	ret_pte = (pte_t *) pudp;
				806	goto out;
				807	} else if (is_hugepd(__hugepd(pud_val(pud))))
				808	hpdp = (hugepd_t *)&pud;
				809	else {
				810	pdshift = PMD_SHIFT;
				811	pmdp = pmd_offset(&pud, ea);
				812	pmd = READ_ONCE(*pmdp);
				813	/*
				814	* A hugepage collapse is captured by pmd_none, because
				815	* it mark the pmd none and do a hpte invalidate.
				816	*/
				817	if (pmd_none(pmd))
				818	return NULL;
				819
				820	if (pmd_trans_huge(pmd) \|\| pmd_devmap(pmd)) {
				821	if (is_thp)
				822	*is_thp = true;
				823	ret_pte = (pte_t *) pmdp;
				824	goto out;
				825	}
				826
				827	if (pmd_huge(pmd)) {
				828	ret_pte = (pte_t *) pmdp;
				829	goto out;
				830	} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				831	hpdp = (hugepd_t *)&pmd;
				832	else
				833	return pte_offset_kernel(&pmd, ea);
				834	}
				835	}
				836	if (!hpdp)
				837	return NULL;
				838
				839	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
				840	pdshift = hugepd_shift(*hpdp);
				841	out:
				842	if (hpage_shift)
				843	*hpage_shift = pdshift;
				844	return ret_pte;
				845	}
				846	EXPORT_SYMBOL_GPL(__find_linux_pte);
				847
				848	int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
				849	unsigned long end, int write, struct page *pages, int nr)
				850	{
				851	unsigned long pte_end;
				852	struct page head, page;
				853	pte_t pte;
				854	int refs;
				855
				856	pte_end = (addr + sz) & ~(sz-1);
				857	if (pte_end < end)
				858	end = pte_end;
				859
				860	pte = READ_ONCE(*ptep);
				861
				862	if (!pte_present(pte) \|\| !pte_read(pte))
				863	return 0;
				864	if (write && !pte_write(pte))
				865	return 0;
				866
				867	/* hugepages are never "special" */
				868	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				869
				870	refs = 0;
				871	head = pte_page(pte);
				872
				873	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
				874	do {
				875	VM_BUG_ON(compound_head(page) != head);
				876	pages[*nr] = page;
				877	(*nr)++;
				878	page++;
				879	refs++;
				880	} while (addr += PAGE_SIZE, addr != end);
				881
				882	if (!page_cache_add_speculative(head, refs)) {
				883	*nr -= refs;
				884	return 0;
				885	}
				886
				887	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
				888	/* Could be optimized better */
				889	*nr -= refs;
				890	while (refs--)
				891	put_page(head);
				892	return 0;
				893	}
				894
				895	return 1;
				896	}