Blame - src/kernel/linux/v4.19/arch/powerpc/mm/hugetlbpage.c - T800

blob: cef0b7ee1024646cdc0edeb4d9c5cbbc0a4837fc [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	/*
				2	* PPC Huge TLB Page Support for Kernel.
				3	*
				4	* Copyright (C) 2003 David Gibson, IBM Corporation.
				5	* Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
				6	*
				7	* Based on the IA-32 version:
				8	* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
				9	*/
				10
				11	#include <linux/mm.h>
				12	#include <linux/io.h>
				13	#include <linux/slab.h>
				14	#include <linux/hugetlb.h>
				15	#include <linux/export.h>
				16	#include <linux/of_fdt.h>
				17	#include <linux/memblock.h>
				18	#include <linux/bootmem.h>
				19	#include <linux/moduleparam.h>
				20	#include <linux/swap.h>
				21	#include <linux/swapops.h>
				22	#include <linux/kmemleak.h>
				23	#include <asm/pgtable.h>
				24	#include <asm/pgalloc.h>
				25	#include <asm/tlb.h>
				26	#include <asm/setup.h>
				27	#include <asm/hugetlb.h>
				28	#include <asm/pte-walk.h>
				29
				30
				31	#ifdef CONFIG_HUGETLB_PAGE
				32
				33	#define PAGE_SHIFT_64K 16
				34	#define PAGE_SHIFT_512K 19
				35	#define PAGE_SHIFT_8M 23
				36	#define PAGE_SHIFT_16M 24
				37	#define PAGE_SHIFT_16G 34
				38
				39	bool hugetlb_disabled = false;
				40
				41	unsigned int HPAGE_SHIFT;
				42	EXPORT_SYMBOL(HPAGE_SHIFT);
				43
				44	#define hugepd_none(hpd) (hpd_val(hpd) == 0)
				45
				46	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr, unsigned long sz)
				47	{
				48	/*
				49	* Only called for hugetlbfs pages, hence can ignore THP and the
				50	* irq disabled walk.
				51	*/
				52	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
				53	}
				54
				55	static int __hugepte_alloc(struct mm_struct mm, hugepd_t hpdp,
				56	unsigned long address, unsigned int pdshift,
				57	unsigned int pshift, spinlock_t *ptl)
				58	{
				59	struct kmem_cache *cachep;
				60	pte_t *new;
				61	int i;
				62	int num_hugepd;
				63
				64	if (pshift >= pdshift) {
				65	cachep = hugepte_cache;
				66	num_hugepd = 1 << (pshift - pdshift);
				67	} else {
				68	cachep = PGT_CACHE(pdshift - pshift);
				69	num_hugepd = 1;
				70	}
				71
				72	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
				73
				74	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
				75	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
				76
				77	if (! new)
				78	return -ENOMEM;
				79
				80	/*
				81	* Make sure other cpus find the hugepd set only after a
				82	* properly initialized page table is visible to them.
				83	* For more details look for comment in __pte_alloc().
				84	*/
				85	smp_wmb();
				86
				87	spin_lock(ptl);
				88	/*
				89	* We have multiple higher-level entries that point to the same
				90	* actual pte location. Fill in each as we go and backtrack on error.
				91	* We need all of these so the DTLB pgtable walk code can find the
				92	* right higher-level entry without knowing if it's a hugepage or not.
				93	*/
				94	for (i = 0; i < num_hugepd; i++, hpdp++) {
				95	if (unlikely(!hugepd_none(*hpdp)))
				96	break;
				97	else {
				98	#ifdef CONFIG_PPC_BOOK3S_64
				99	*hpdp = __hugepd(__pa(new) \|
				100	(shift_to_mmu_psize(pshift) << 2));
				101	#elif defined(CONFIG_PPC_8xx)
				102	*hpdp = __hugepd(__pa(new) \| _PMD_USER \|
				103	(pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
				104	_PMD_PAGE_512K) \| _PMD_PRESENT);
				105	#else
				106	/* We use the old format for PPC_FSL_BOOK3E */
				107	*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) \| pshift);
				108	#endif
				109	}
				110	}
				111	/* If we bailed from the for loop early, an error occurred, clean up */
				112	if (i < num_hugepd) {
				113	for (i = i - 1 ; i >= 0; i--, hpdp--)
				114	*hpdp = __hugepd(0);
				115	kmem_cache_free(cachep, new);
				116	} else {
				117	kmemleak_ignore(new);
				118	}
				119	spin_unlock(ptl);
				120	return 0;
				121	}
				122
				123	/*
				124	* At this point we do the placement change only for BOOK3S 64. This would
				125	* possibly work on other subarchs.
				126	*/
				127	pte_t huge_pte_alloc(struct mm_struct mm, unsigned long addr, unsigned long sz)
				128	{
				129	pgd_t *pg;
				130	pud_t *pu;
				131	pmd_t *pm;
				132	hugepd_t *hpdp = NULL;
				133	unsigned pshift = __ffs(sz);
				134	unsigned pdshift = PGDIR_SHIFT;
				135	spinlock_t *ptl;
				136
				137	addr &= ~(sz-1);
				138	pg = pgd_offset(mm, addr);
				139
				140	#ifdef CONFIG_PPC_BOOK3S_64
				141	if (pshift == PGDIR_SHIFT)
				142	/* 16GB huge page */
				143	return (pte_t *) pg;
				144	else if (pshift > PUD_SHIFT) {
				145	/*
				146	* We need to use hugepd table
				147	*/
				148	ptl = &mm->page_table_lock;
				149	hpdp = (hugepd_t *)pg;
				150	} else {
				151	pdshift = PUD_SHIFT;
				152	pu = pud_alloc(mm, pg, addr);
				153	if (!pu)
				154	return NULL;
				155	if (pshift == PUD_SHIFT)
				156	return (pte_t *)pu;
				157	else if (pshift > PMD_SHIFT) {
				158	ptl = pud_lockptr(mm, pu);
				159	hpdp = (hugepd_t *)pu;
				160	} else {
				161	pdshift = PMD_SHIFT;
				162	pm = pmd_alloc(mm, pu, addr);
				163	if (!pm)
				164	return NULL;
				165	if (pshift == PMD_SHIFT)
				166	/* 16MB hugepage */
				167	return (pte_t *)pm;
				168	else {
				169	ptl = pmd_lockptr(mm, pm);
				170	hpdp = (hugepd_t *)pm;
				171	}
				172	}
				173	}
				174	#else
				175	if (pshift >= PGDIR_SHIFT) {
				176	ptl = &mm->page_table_lock;
				177	hpdp = (hugepd_t *)pg;
				178	} else {
				179	pdshift = PUD_SHIFT;
				180	pu = pud_alloc(mm, pg, addr);
				181	if (!pu)
				182	return NULL;
				183	if (pshift >= PUD_SHIFT) {
				184	ptl = pud_lockptr(mm, pu);
				185	hpdp = (hugepd_t *)pu;
				186	} else {
				187	pdshift = PMD_SHIFT;
				188	pm = pmd_alloc(mm, pu, addr);
				189	if (!pm)
				190	return NULL;
				191	ptl = pmd_lockptr(mm, pm);
				192	hpdp = (hugepd_t *)pm;
				193	}
				194	}
				195	#endif
				196	if (!hpdp)
				197	return NULL;
				198
				199	BUG_ON(!hugepd_none(hpdp) && !hugepd_ok(hpdp));
				200
				201	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
				202	pdshift, pshift, ptl))
				203	return NULL;
				204
				205	return hugepte_offset(*hpdp, addr, pdshift);
				206	}
				207
				208	#ifdef CONFIG_PPC_BOOK3S_64
				209	/*
				210	* Tracks gpages after the device tree is scanned and before the
				211	* huge_boot_pages list is ready on pseries.
				212	*/
				213	#define MAX_NUMBER_GPAGES 1024
				214	__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
				215	__initdata static unsigned nr_gpages;
				216
				217	/*
				218	* Build list of addresses of gigantic pages. This function is used in early
				219	* boot before the buddy allocator is setup.
				220	*/
				221	void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
				222	{
				223	if (!addr)
				224	return;
				225	while (number_of_pages > 0) {
				226	gpage_freearray[nr_gpages] = addr;
				227	nr_gpages++;
				228	number_of_pages--;
				229	addr += page_size;
				230	}
				231	}
				232
				233	int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
				234	{
				235	struct huge_bootmem_page *m;
				236	if (nr_gpages == 0)
				237	return 0;
				238	m = phys_to_virt(gpage_freearray[--nr_gpages]);
				239	gpage_freearray[nr_gpages] = 0;
				240	list_add(&m->list, &huge_boot_pages);
				241	m->hstate = hstate;
				242	return 1;
				243	}
				244	#endif
				245
				246
				247	int __init alloc_bootmem_huge_page(struct hstate *h)
				248	{
				249
				250	#ifdef CONFIG_PPC_BOOK3S_64
				251	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
				252	return pseries_alloc_bootmem_huge_page(h);
				253	#endif
				254	return __alloc_bootmem_huge_page(h);
				255	}
				256
				257	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				258	#define HUGEPD_FREELIST_SIZE \
				259	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
				260
				261	struct hugepd_freelist {
				262	struct rcu_head rcu;
				263	unsigned int index;
				264	void *ptes[0];
				265	};
				266
				267	static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
				268
				269	static void hugepd_free_rcu_callback(struct rcu_head *head)
				270	{
				271	struct hugepd_freelist *batch =
				272	container_of(head, struct hugepd_freelist, rcu);
				273	unsigned int i;
				274
				275	for (i = 0; i < batch->index; i++)
				276	kmem_cache_free(hugepte_cache, batch->ptes[i]);
				277
				278	free_page((unsigned long)batch);
				279	}
				280
				281	static void hugepd_free(struct mmu_gather tlb, void hugepte)
				282	{
				283	struct hugepd_freelist **batchp;
				284
				285	batchp = &get_cpu_var(hugepd_freelist_cur);
				286
				287	if (atomic_read(&tlb->mm->mm_users) < 2 \|\|
				288	mm_is_thread_local(tlb->mm)) {
				289	kmem_cache_free(hugepte_cache, hugepte);
				290	put_cpu_var(hugepd_freelist_cur);
				291	return;
				292	}
				293
				294	if (*batchp == NULL) {
				295	batchp = (struct hugepd_freelist )__get_free_page(GFP_ATOMIC);
				296	(*batchp)->index = 0;
				297	}
				298
				299	(batchp)->ptes[(batchp)->index++] = hugepte;
				300	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
				301	call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
				302	*batchp = NULL;
				303	}
				304	put_cpu_var(hugepd_freelist_cur);
				305	}
				306	#else
				307	static inline void hugepd_free(struct mmu_gather tlb, void hugepte) {}
				308	#endif
				309
				310	static void free_hugepd_range(struct mmu_gather tlb, hugepd_t hpdp, int pdshift,
				311	unsigned long start, unsigned long end,
				312	unsigned long floor, unsigned long ceiling)
				313	{
				314	pte_t hugepte = hugepd_page(hpdp);
				315	int i;
				316
				317	unsigned long pdmask = ~((1UL << pdshift) - 1);
				318	unsigned int num_hugepd = 1;
				319	unsigned int shift = hugepd_shift(*hpdp);
				320
				321	/* Note: On fsl the hpdp may be the first of several */
				322	if (shift > pdshift)
				323	num_hugepd = 1 << (shift - pdshift);
				324
				325	start &= pdmask;
				326	if (start < floor)
				327	return;
				328	if (ceiling) {
				329	ceiling &= pdmask;
				330	if (! ceiling)
				331	return;
				332	}
				333	if (end - 1 > ceiling - 1)
				334	return;
				335
				336	for (i = 0; i < num_hugepd; i++, hpdp++)
				337	*hpdp = __hugepd(0);
				338
				339	if (shift >= pdshift)
				340	hugepd_free(tlb, hugepte);
				341	else
				342	pgtable_free_tlb(tlb, hugepte,
				343	get_hugepd_cache_index(pdshift - shift));
				344	}
				345
				346	static void hugetlb_free_pmd_range(struct mmu_gather tlb, pud_t pud,
				347	unsigned long addr, unsigned long end,
				348	unsigned long floor, unsigned long ceiling)
				349	{
				350	pmd_t *pmd;
				351	unsigned long next;
				352	unsigned long start;
				353
				354	start = addr;
				355	do {
				356	unsigned long more;
				357
				358	pmd = pmd_offset(pud, addr);
				359	next = pmd_addr_end(addr, end);
				360	if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
				361	/*
				362	* if it is not hugepd pointer, we should already find
				363	* it cleared.
				364	*/
				365	WARN_ON(!pmd_none_or_clear_bad(pmd));
				366	continue;
				367	}
				368	/*
				369	* Increment next by the size of the huge mapping since
				370	* there may be more than one entry at this level for a
				371	* single hugepage, but all of them point to
				372	* the same kmem cache that holds the hugepte.
				373	*/
				374	more = addr + (1 << hugepd_shift((hugepd_t )pmd));
				375	if (more > next)
				376	next = more;
				377
				378	free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				379	addr, next, floor, ceiling);
				380	} while (addr = next, addr != end);
				381
				382	start &= PUD_MASK;
				383	if (start < floor)
				384	return;
				385	if (ceiling) {
				386	ceiling &= PUD_MASK;
				387	if (!ceiling)
				388	return;
				389	}
				390	if (end - 1 > ceiling - 1)
				391	return;
				392
				393	pmd = pmd_offset(pud, start);
				394	pud_clear(pud);
				395	pmd_free_tlb(tlb, pmd, start);
				396	mm_dec_nr_pmds(tlb->mm);
				397	}
				398
				399	static void hugetlb_free_pud_range(struct mmu_gather tlb, pgd_t pgd,
				400	unsigned long addr, unsigned long end,
				401	unsigned long floor, unsigned long ceiling)
				402	{
				403	pud_t *pud;
				404	unsigned long next;
				405	unsigned long start;
				406
				407	start = addr;
				408	do {
				409	pud = pud_offset(pgd, addr);
				410	next = pud_addr_end(addr, end);
				411	if (!is_hugepd(__hugepd(pud_val(*pud)))) {
				412	if (pud_none_or_clear_bad(pud))
				413	continue;
				414	hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
				415	ceiling);
				416	} else {
				417	unsigned long more;
				418	/*
				419	* Increment next by the size of the huge mapping since
				420	* there may be more than one entry at this level for a
				421	* single hugepage, but all of them point to
				422	* the same kmem cache that holds the hugepte.
				423	*/
				424	more = addr + (1 << hugepd_shift((hugepd_t )pud));
				425	if (more > next)
				426	next = more;
				427
				428	free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
				429	addr, next, floor, ceiling);
				430	}
				431	} while (addr = next, addr != end);
				432
				433	start &= PGDIR_MASK;
				434	if (start < floor)
				435	return;
				436	if (ceiling) {
				437	ceiling &= PGDIR_MASK;
				438	if (!ceiling)
				439	return;
				440	}
				441	if (end - 1 > ceiling - 1)
				442	return;
				443
				444	pud = pud_offset(pgd, start);
				445	pgd_clear(pgd);
				446	pud_free_tlb(tlb, pud, start);
				447	mm_dec_nr_puds(tlb->mm);
				448	}
				449
				450	/*
				451	* This function frees user-level page tables of a process.
				452	*/
				453	void hugetlb_free_pgd_range(struct mmu_gather *tlb,
				454	unsigned long addr, unsigned long end,
				455	unsigned long floor, unsigned long ceiling)
				456	{
				457	pgd_t *pgd;
				458	unsigned long next;
				459
				460	/*
				461	* Because there are a number of different possible pagetable
				462	* layouts for hugepage ranges, we limit knowledge of how
				463	* things should be laid out to the allocation path
				464	* (huge_pte_alloc(), above). Everything else works out the
				465	* structure as it goes from information in the hugepd
				466	* pointers. That means that we can't here use the
				467	* optimization used in the normal page free_pgd_range(), of
				468	* checking whether we're actually covering a large enough
				469	* range to have to do anything at the top level of the walk
				470	* instead of at the bottom.
				471	*
				472	* To make sense of this, you should probably go read the big
				473	* block comment at the top of the normal free_pgd_range(),
				474	* too.
				475	*/
				476
				477	do {
				478	next = pgd_addr_end(addr, end);
				479	pgd = pgd_offset(tlb->mm, addr);
				480	if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
				481	if (pgd_none_or_clear_bad(pgd))
				482	continue;
				483	hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
				484	} else {
				485	unsigned long more;
				486	/*
				487	* Increment next by the size of the huge mapping since
				488	* there may be more than one entry at the pgd level
				489	* for a single hugepage, but all of them point to the
				490	* same kmem cache that holds the hugepte.
				491	*/
				492	more = addr + (1 << hugepd_shift((hugepd_t )pgd));
				493	if (more > next)
				494	next = more;
				495
				496	free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
				497	addr, next, floor, ceiling);
				498	}
				499	} while (addr = next, addr != end);
				500	}
				501
				502	struct page follow_huge_pd(struct vm_area_struct vma,
				503	unsigned long address, hugepd_t hpd,
				504	int flags, int pdshift)
				505	{
				506	pte_t *ptep;
				507	spinlock_t *ptl;
				508	struct page *page = NULL;
				509	unsigned long mask;
				510	int shift = hugepd_shift(hpd);
				511	struct mm_struct *mm = vma->vm_mm;
				512
				513	retry:
				514	/*
				515	* hugepage directory entries are protected by mm->page_table_lock
				516	* Use this instead of huge_pte_lockptr
				517	*/
				518	ptl = &mm->page_table_lock;
				519	spin_lock(ptl);
				520
				521	ptep = hugepte_offset(hpd, address, pdshift);
				522	if (pte_present(*ptep)) {
				523	mask = (1UL << shift) - 1;
				524	page = pte_page(*ptep);
				525	page += ((address & mask) >> PAGE_SHIFT);
				526	if (flags & FOLL_GET)
				527	get_page(page);
				528	} else {
				529	if (is_hugetlb_entry_migration(*ptep)) {
				530	spin_unlock(ptl);
				531	__migration_entry_wait(mm, ptep, ptl);
				532	goto retry;
				533	}
				534	}
				535	spin_unlock(ptl);
				536	return page;
				537	}
				538
				539	static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				540	unsigned long sz)
				541	{
				542	unsigned long __boundary = (addr + sz) & ~(sz-1);
				543	return (__boundary - 1 < end - 1) ? __boundary : end;
				544	}
				545
				546	int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
				547	unsigned long end, int write, struct page *pages, int nr)
				548	{
				549	pte_t *ptep;
				550	unsigned long sz = 1UL << hugepd_shift(hugepd);
				551	unsigned long next;
				552
				553	ptep = hugepte_offset(hugepd, addr, pdshift);
				554	do {
				555	next = hugepte_addr_end(addr, end, sz);
				556	if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
				557	return 0;
				558	} while (ptep++, addr = next, addr != end);
				559
				560	return 1;
				561	}
				562
				563	#ifdef CONFIG_PPC_MM_SLICES
				564	unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
				565	unsigned long len, unsigned long pgoff,
				566	unsigned long flags)
				567	{
				568	struct hstate *hstate = hstate_file(file);
				569	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
				570
				571	#ifdef CONFIG_PPC_RADIX_MMU
				572	if (radix_enabled())
				573	return radix__hugetlb_get_unmapped_area(file, addr, len,
				574	pgoff, flags);
				575	#endif
				576	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
				577	}
				578	#endif
				579
				580	unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
				581	{
				582	#ifdef CONFIG_PPC_MM_SLICES
				583	/* With radix we don't use slice, so derive it from vma*/
				584	if (!radix_enabled()) {
				585	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
				586
				587	return 1UL << mmu_psize_to_shift(psize);
				588	}
				589	#endif
				590	return vma_kernel_pagesize(vma);
				591	}
				592
				593	static inline bool is_power_of_4(unsigned long x)
				594	{
				595	if (is_power_of_2(x))
				596	return (__ilog2(x) % 2) ? false : true;
				597	return false;
				598	}
				599
				600	static int __init add_huge_page_size(unsigned long long size)
				601	{
				602	int shift = __ffs(size);
				603	int mmu_psize;
				604
				605	/* Check that it is a page size supported by the hardware and
				606	* that it fits within pagetable and slice limits. */
				607	if (size <= PAGE_SIZE)
				608	return -EINVAL;
				609	#if defined(CONFIG_PPC_FSL_BOOK3E)
				610	if (!is_power_of_4(size))
				611	return -EINVAL;
				612	#elif !defined(CONFIG_PPC_8xx)
				613	if (!is_power_of_2(size) \|\| (shift > SLICE_HIGH_SHIFT))
				614	return -EINVAL;
				615	#endif
				616
				617	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
				618	return -EINVAL;
				619
				620	#ifdef CONFIG_PPC_BOOK3S_64
				621	/*
				622	* We need to make sure that for different page sizes reported by
				623	* firmware we only add hugetlb support for page sizes that can be
				624	* supported by linux page table layout.
				625	* For now we have
				626	* Radix: 2M and 1G
				627	* Hash: 16M and 16G
				628	*/
				629	if (radix_enabled()) {
				630	if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
				631	return -EINVAL;
				632	} else {
				633	if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
				634	return -EINVAL;
				635	}
				636	#endif
				637
				638	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
				639
				640	/* Return if huge page size has already been setup */
				641	if (size_to_hstate(size))
				642	return 0;
				643
				644	hugetlb_add_hstate(shift - PAGE_SHIFT);
				645
				646	return 0;
				647	}
				648
				649	static int __init hugepage_setup_sz(char *str)
				650	{
				651	unsigned long long size;
				652
				653	size = memparse(str, &str);
				654
				655	if (add_huge_page_size(size) != 0) {
				656	hugetlb_bad_size();
				657	pr_err("Invalid huge page size specified(%llu)\n", size);
				658	}
				659
				660	return 1;
				661	}
				662	__setup("hugepagesz=", hugepage_setup_sz);
				663
				664	struct kmem_cache *hugepte_cache;
				665	static int __init hugetlbpage_init(void)
				666	{
				667	int psize;
				668
				669	if (hugetlb_disabled) {
				670	pr_info("HugeTLB support is disabled!\n");
				671	return 0;
				672	}
				673
				674	#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
				675	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
				676	return -ENODEV;
				677	#endif
				678	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
				679	unsigned shift;
				680	unsigned pdshift;
				681
				682	if (!mmu_psize_defs[psize].shift)
				683	continue;
				684
				685	shift = mmu_psize_to_shift(psize);
				686
				687	#ifdef CONFIG_PPC_BOOK3S_64
				688	if (shift > PGDIR_SHIFT)
				689	continue;
				690	else if (shift > PUD_SHIFT)
				691	pdshift = PGDIR_SHIFT;
				692	else if (shift > PMD_SHIFT)
				693	pdshift = PUD_SHIFT;
				694	else
				695	pdshift = PMD_SHIFT;
				696	#else
				697	if (shift < PUD_SHIFT)
				698	pdshift = PMD_SHIFT;
				699	else if (shift < PGDIR_SHIFT)
				700	pdshift = PUD_SHIFT;
				701	else
				702	pdshift = PGDIR_SHIFT;
				703	#endif
				704
				705	if (add_huge_page_size(1ULL << shift) < 0)
				706	continue;
				707	/*
				708	* if we have pdshift and shift value same, we don't
				709	* use pgt cache for hugepd.
				710	*/
				711	if (pdshift > shift)
				712	pgtable_cache_add(pdshift - shift, NULL);
				713	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				714	else if (!hugepte_cache) {
				715	/*
				716	* Create a kmem cache for hugeptes. The bottom bits in
				717	* the pte have size information encoded in them, so
				718	* align them to allow this
				719	*/
				720	hugepte_cache = kmem_cache_create("hugepte-cache",
				721	sizeof(pte_t),
				722	HUGEPD_SHIFT_MASK + 1,
				723	0, NULL);
				724	if (hugepte_cache == NULL)
				725	panic("%s: Unable to create kmem cache "
				726	"for hugeptes\n", __func__);
				727
				728	}
				729	#endif
				730	}
				731
				732	#if defined(CONFIG_PPC_FSL_BOOK3E) \|\| defined(CONFIG_PPC_8xx)
				733	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
				734	if (mmu_psize_defs[MMU_PAGE_4M].shift)
				735	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
				736	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
				737	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
				738	#else
				739	/* Set default large page size. Currently, we pick 16M or 1M
				740	* depending on what is available
				741	*/
				742	if (mmu_psize_defs[MMU_PAGE_16M].shift)
				743	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
				744	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
				745	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
				746	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
				747	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
				748	#endif
				749	return 0;
				750	}
				751
				752	arch_initcall(hugetlbpage_init);
				753
				754	void flush_dcache_icache_hugepage(struct page *page)
				755	{
				756	int i;
				757	void *start;
				758
				759	BUG_ON(!PageCompound(page));
				760
				761	for (i = 0; i < (1UL << compound_order(page)); i++) {
				762	if (!PageHighMem(page)) {
				763	__flush_dcache_icache(page_address(page+i));
				764	} else {
				765	start = kmap_atomic(page+i);
				766	__flush_dcache_icache(start);
				767	kunmap_atomic(start);
				768	}
				769	}
				770	}
				771
				772	#endif /* CONFIG_HUGETLB_PAGE */
				773
				774	/*
				775	* We have 4 cases for pgds and pmds:
				776	* (1) invalid (all zeroes)
				777	* (2) pointer to next table, as normal; bottom 6 bits == 0
				778	* (3) leaf pte for huge page _PAGE_PTE set
				779	* (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
				780	*
				781	* So long as we atomically load page table pointers we are safe against teardown,
				782	* we can follow the address down to the the page and take a ref on it.
				783	* This function need to be called with interrupts disabled. We use this variant
				784	* when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
				785	*/
				786	pte_t __find_linux_pte(pgd_t pgdir, unsigned long ea,
				787	bool is_thp, unsigned hpage_shift)
				788	{
				789	pgd_t pgd, *pgdp;
				790	pud_t pud, *pudp;
				791	pmd_t pmd, *pmdp;
				792	pte_t *ret_pte;
				793	hugepd_t *hpdp = NULL;
				794	unsigned pdshift = PGDIR_SHIFT;
				795
				796	if (hpage_shift)
				797	*hpage_shift = 0;
				798
				799	if (is_thp)
				800	*is_thp = false;
				801
				802	pgdp = pgdir + pgd_index(ea);
				803	pgd = READ_ONCE(*pgdp);
				804	/*
				805	* Always operate on the local stack value. This make sure the
				806	* value don't get updated by a parallel THP split/collapse,
				807	* page fault or a page unmap. The return pte_t * is still not
				808	* stable. So should be checked there for above conditions.
				809	*/
				810	if (pgd_none(pgd))
				811	return NULL;
				812	else if (pgd_huge(pgd)) {
				813	ret_pte = (pte_t *) pgdp;
				814	goto out;
				815	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
				816	hpdp = (hugepd_t *)&pgd;
				817	else {
				818	/*
				819	* Even if we end up with an unmap, the pgtable will not
				820	* be freed, because we do an rcu free and here we are
				821	* irq disabled
				822	*/
				823	pdshift = PUD_SHIFT;
				824	pudp = pud_offset(&pgd, ea);
				825	pud = READ_ONCE(*pudp);
				826
				827	if (pud_none(pud))
				828	return NULL;
				829	else if (pud_huge(pud)) {
				830	ret_pte = (pte_t *) pudp;
				831	goto out;
				832	} else if (is_hugepd(__hugepd(pud_val(pud))))
				833	hpdp = (hugepd_t *)&pud;
				834	else {
				835	pdshift = PMD_SHIFT;
				836	pmdp = pmd_offset(&pud, ea);
				837	pmd = READ_ONCE(*pmdp);
				838	/*
				839	* A hugepage collapse is captured by pmd_none, because
				840	* it mark the pmd none and do a hpte invalidate.
				841	*/
				842	if (pmd_none(pmd))
				843	return NULL;
				844
				845	if (pmd_trans_huge(pmd) \|\| pmd_devmap(pmd)) {
				846	if (is_thp)
				847	*is_thp = true;
				848	ret_pte = (pte_t *) pmdp;
				849	goto out;
				850	}
				851
				852	if (pmd_huge(pmd)) {
				853	ret_pte = (pte_t *) pmdp;
				854	goto out;
				855	} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				856	hpdp = (hugepd_t *)&pmd;
				857	else
				858	return pte_offset_kernel(&pmd, ea);
				859	}
				860	}
				861	if (!hpdp)
				862	return NULL;
				863
				864	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
				865	pdshift = hugepd_shift(*hpdp);
				866	out:
				867	if (hpage_shift)
				868	*hpage_shift = pdshift;
				869	return ret_pte;
				870	}
				871	EXPORT_SYMBOL_GPL(__find_linux_pte);
				872
				873	int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
				874	unsigned long end, int write, struct page *pages, int nr)
				875	{
				876	unsigned long pte_end;
				877	struct page head, page;
				878	pte_t pte;
				879	int refs;
				880
				881	pte_end = (addr + sz) & ~(sz-1);
				882	if (pte_end < end)
				883	end = pte_end;
				884
				885	pte = READ_ONCE(*ptep);
				886
				887	if (!pte_access_permitted(pte, write))
				888	return 0;
				889
				890	/* hugepages are never "special" */
				891	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				892
				893	refs = 0;
				894	head = pte_page(pte);
				895
				896	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
				897	do {
				898	VM_BUG_ON(compound_head(page) != head);
				899	pages[*nr] = page;
				900	(*nr)++;
				901	page++;
				902	refs++;
				903	} while (addr += PAGE_SIZE, addr != end);
				904
				905	if (!page_cache_add_speculative(head, refs)) {
				906	*nr -= refs;
				907	return 0;
				908	}
				909
				910	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
				911	/* Could be optimized better */
				912	*nr -= refs;
				913	while (refs--)
				914	put_page(head);
				915	return 0;
				916	}
				917
				918	return 1;
				919	}