/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

bool hugetlb_disabled = false;

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

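/*
 * PTE_T_ORDER is log2(sizeof(pte_t) / sizeof(void *)): how many pointer-sized
 * slots a pte_t occupies, expressed as a pgtable-cache order.  Passed to
 * PGT_CACHE() it selects the cache used for the stand-alone huge PTEs
 * allocated in __hugepte_alloc() below.
 */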
#define PTE_T_ORDER	(__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

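/*
 * Allocate the backing page-table fragment for a hugepd and point every
 * covering directory entry at it.  Depending on how the huge page size
 * compares with the directory level, the fragment comes from the
 * single-PTE cache (pshift >= pdshift), from pte_alloc_one() on 8xx, or
 * from the PGT_CACHE sized for the remaining bits.
 */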
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = PGT_CACHE(PTE_T_ORDER);
		num_hugepd = 1 << (pshift - pdshift);
		new = NULL;
	} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
		cachep = NULL;
		num_hugepd = 1;
		new = pte_alloc_one(mm);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
		new = NULL;
	}

	if (!cachep && !new) {
		WARN_ONCE(1, "No page table cache created for hugetlb tables");
		return -ENOMEM;
	}

	if (cachep)
		new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		hugepd_populate(hpdp, new, pshift);
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		if (cachep)
			kmem_cache_free(cachep, new);
		else
			pte_free(mm, new);
	} else {
		kmemleak_ignore(new);
	}
	spin_unlock(ptl);
	return 0;
}

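/*
 * Find (or create) the page-table slot that maps @addr with huge pages of
 * size @sz: depending on where the page-size shift falls relative to
 * PGDIR/PUD/PMD_SHIFT, either return a regular directory entry cast to a
 * PTE pointer or allocate a hugepd via __hugepte_alloc().
 */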
/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use the hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (!pu)
			return NULL;
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= PGDIR_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (!pu)
			return NULL;
		if (pshift >= PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (!pm)
				return NULL;
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build a list of addresses of gigantic pages.  This function is used in
 * early boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;

	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif

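/*
 * On HPT pseries LPARs, gigantic pages are recorded from the device tree
 * scan early in boot (see pseries_add_gpage() above), so hand one of those
 * out; everywhere else fall back to the generic bootmem allocation.
 */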
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

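/*
 * On the non-Book3S-64 platforms, hugepte tables torn down during unmap may
 * still be seen by concurrent lockless page-table walkers, so frees are
 * batched per CPU and deferred with RCU; a table is freed immediately only
 * when the mm is effectively single threaded / thread local.
 */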
#ifndef CONFIG_PPC_BOOK3S_64
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head	rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

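/*
 * Clear every directory entry covering one hugepd and free the backing
 * hugepte table, but only when the [floor, ceiling) limits show that no
 * neighbouring mapping still needs it; the checks mirror those in the
 * generic free_pmd_range()/free_pud_range() helpers.
 */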
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else if (IS_ENABLED(CONFIG_PPC_8xx))
		pgtable_free_tlb(tlb, hugepte, 0);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}

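/*
 * Walk the PMD entries under one PUD, tearing down any hugepd found there,
 * then free the PMD table itself once the [floor, ceiling) window allows it.
 */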
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should have
			 * already found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}

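/*
 * follow_page() helper for hugepd-mapped pages: look up the PTE under
 * mm->page_table_lock, return the subpage for the requested address, and
 * wait and retry if the entry is a hugetlb migration entry.
 */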
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * Hugepage directory entries are protected by mm->page_table_lock.
	 * Use this instead of huge_pte_lockptr.
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

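/*
 * On slice-based MMUs a hugepage mapping must be placed in a slice of the
 * matching page size; radix has no slices and uses its own helper instead.
 */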
#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	/* With radix we don't use slices, so derive it from the vma */
	if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

		return 1UL << mmu_psize_to_shift(psize);
	}
	return vma_kernel_pagesize(vma);
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/*
	 * Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits.
	 */
	if (size <= PAGE_SIZE || !is_power_of_2(size))
		return -EINVAL;

	mmu_psize = check_and_get_huge_psize(shift);
	if (mmu_psize < 0)
		return -EINVAL;

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been set up */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified (%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

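/*
 * Register every hugepage size the MMU supports: add an hstate for it and
 * make sure the page-table cache that will back its hugepd entries exists.
 */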
static int __init hugetlbpage_init(void)
{
	bool configured = false;
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
	    !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PGDIR_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift are the same, we don't use the pgt
		 * cache for hugepd.
		 */
		if (pdshift > shift) {
			if (!IS_ENABLED(CONFIG_PPC_8xx))
				pgtable_cache_add(pdshift - shift);
		} else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ||
			   IS_ENABLED(CONFIG_PPC_8xx)) {
			pgtable_cache_add(PTE_T_ORDER);
		}

		configured = true;
	}

	if (configured) {
		if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
			hugetlbpage_init_default();
	} else
		pr_info("Failed to initialize. Disabling HugeTLB\n");

	return 0;
}

arch_initcall(hugetlbpage_init);

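/*
 * Flush the D-cache and invalidate the I-cache for every subpage of a
 * compound hugepage, temporarily mapping highmem subpages where needed.
 */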
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < compound_nr(page); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}