/*
 * linux/mm/vmalloc.c
 *
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 */
10
11#include <linux/vmalloc.h>
12#include <linux/mm.h>
13#include <linux/module.h>
14#include <linux/highmem.h>
15#include <linux/sched/signal.h>
16#include <linux/slab.h>
17#include <linux/spinlock.h>
18#include <linux/interrupt.h>
19#include <linux/proc_fs.h>
20#include <linux/seq_file.h>
21#include <linux/debugobjects.h>
22#include <linux/kallsyms.h>
23#include <linux/list.h>
24#include <linux/notifier.h>
25#include <linux/rbtree.h>
26#include <linux/radix-tree.h>
27#include <linux/rcupdate.h>
28#include <linux/pfn.h>
29#include <linux/kmemleak.h>
30#include <linux/atomic.h>
31#include <linux/compiler.h>
32#include <linux/llist.h>
33#include <linux/bitops.h>
34
35#include <linux/uaccess.h>
36#include <asm/tlbflush.h>
37#include <asm/shmparam.h>
38
39#include "internal.h"
40
41struct vfree_deferred {
42 struct llist_head list;
43 struct work_struct wq;
44};
45static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
46
47static void __vunmap(const void *, int);
48
49static void free_work(struct work_struct *w)
50{
51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
52 struct llist_node *t, *llnode;
53
54 llist_for_each_safe(llnode, t, llist_del_all(&p->list))
55 __vunmap((void *)llnode, 1);
56}
57
58/*** Page table manipulation functions ***/
59
60static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
61{
62 pte_t *pte;
63
64 pte = pte_offset_kernel(pmd, addr);
65 do {
66 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
67 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
68 } while (pte++, addr += PAGE_SIZE, addr != end);
69}
70
71static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
72{
73 pmd_t *pmd;
74 unsigned long next;
75
76 pmd = pmd_offset(pud, addr);
77 do {
78 next = pmd_addr_end(addr, end);
79 if (pmd_clear_huge(pmd))
80 continue;
81 if (pmd_none_or_clear_bad(pmd))
82 continue;
83 vunmap_pte_range(pmd, addr, next);
84 } while (pmd++, addr = next, addr != end);
85}
86
87static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
88{
89 pud_t *pud;
90 unsigned long next;
91
92 pud = pud_offset(p4d, addr);
93 do {
94 next = pud_addr_end(addr, end);
95 if (pud_clear_huge(pud))
96 continue;
97 if (pud_none_or_clear_bad(pud))
98 continue;
99 vunmap_pmd_range(pud, addr, next);
100 } while (pud++, addr = next, addr != end);
101}
102
103static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
104{
105 p4d_t *p4d;
106 unsigned long next;
107
108 p4d = p4d_offset(pgd, addr);
109 do {
110 next = p4d_addr_end(addr, end);
111 if (p4d_clear_huge(p4d))
112 continue;
113 if (p4d_none_or_clear_bad(p4d))
114 continue;
115 vunmap_pud_range(p4d, addr, next);
116 } while (p4d++, addr = next, addr != end);
117}
118
119static void vunmap_page_range(unsigned long addr, unsigned long end)
120{
121 pgd_t *pgd;
122 unsigned long next;
123
124 BUG_ON(addr >= end);
125 pgd = pgd_offset_k(addr);
126 do {
127 next = pgd_addr_end(addr, end);
128 if (pgd_none_or_clear_bad(pgd))
129 continue;
130 vunmap_p4d_range(pgd, addr, next);
131 } while (pgd++, addr = next, addr != end);
132}
133
134static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
135 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
136{
137 pte_t *pte;
138
139 /*
140 * nr is a running index into the array which helps higher level
141 * callers keep track of where we're up to.
142 */
143
144 pte = pte_alloc_kernel(pmd, addr);
145 if (!pte)
146 return -ENOMEM;
147 do {
148 struct page *page = pages[*nr];
149
150 if (WARN_ON(!pte_none(*pte)))
151 return -EBUSY;
152 if (WARN_ON(!page))
153 return -ENOMEM;
154 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
155 (*nr)++;
156 } while (pte++, addr += PAGE_SIZE, addr != end);
157 return 0;
158}
159
160static int vmap_pmd_range(pud_t *pud, unsigned long addr,
161 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
162{
163 pmd_t *pmd;
164 unsigned long next;
165
166 pmd = pmd_alloc(&init_mm, pud, addr);
167 if (!pmd)
168 return -ENOMEM;
169 do {
170 next = pmd_addr_end(addr, end);
171 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
172 return -ENOMEM;
173 } while (pmd++, addr = next, addr != end);
174 return 0;
175}
176
177static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
178 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
179{
180 pud_t *pud;
181 unsigned long next;
182
183 pud = pud_alloc(&init_mm, p4d, addr);
184 if (!pud)
185 return -ENOMEM;
186 do {
187 next = pud_addr_end(addr, end);
188 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
189 return -ENOMEM;
190 } while (pud++, addr = next, addr != end);
191 return 0;
192}
193
194static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
195 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
196{
197 p4d_t *p4d;
198 unsigned long next;
199
200 p4d = p4d_alloc(&init_mm, pgd, addr);
201 if (!p4d)
202 return -ENOMEM;
203 do {
204 next = p4d_addr_end(addr, end);
205 if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
206 return -ENOMEM;
207 } while (p4d++, addr = next, addr != end);
208 return 0;
209}
210
211/*
212 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
213 * will have pfns corresponding to the "pages" array.
214 *
215 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
216 */
217static int vmap_page_range_noflush(unsigned long start, unsigned long end,
218 pgprot_t prot, struct page **pages)
219{
220 pgd_t *pgd;
221 unsigned long next;
222 unsigned long addr = start;
223 int err = 0;
224 int nr = 0;
225
226 BUG_ON(addr >= end);
227 pgd = pgd_offset_k(addr);
228 do {
229 next = pgd_addr_end(addr, end);
230 err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
231 if (err)
232 return err;
233 } while (pgd++, addr = next, addr != end);
234
235 return nr;
236}
237
238static int vmap_page_range(unsigned long start, unsigned long end,
239 pgprot_t prot, struct page **pages)
240{
241 int ret;
242
243 ret = vmap_page_range_noflush(start, end, prot, pages);
244 flush_cache_vmap(start, end);
245 return ret;
246}
247
248int is_vmalloc_or_module_addr(const void *x)
249{
250 /*
251 * ARM, x86-64 and sparc64 put modules in a special place,
252 * and fall back on vmalloc() if that fails. Others
253 * just put it in the vmalloc space.
254 */
255#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
256 unsigned long addr = (unsigned long)x;
257 if (addr >= MODULES_VADDR && addr < MODULES_END)
258 return 1;
259#endif
260 return is_vmalloc_addr(x);
261}
262
263/*
264 * Walk a vmap address to the struct page it maps.
265 */
266struct page *vmalloc_to_page(const void *vmalloc_addr)
267{
268 unsigned long addr = (unsigned long) vmalloc_addr;
269 struct page *page = NULL;
270 pgd_t *pgd = pgd_offset_k(addr);
271 p4d_t *p4d;
272 pud_t *pud;
273 pmd_t *pmd;
274 pte_t *ptep, pte;
275
276 /*
277 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
278 * architectures that do not vmalloc module space
279 */
280 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
281
282 if (pgd_none(*pgd))
283 return NULL;
284 p4d = p4d_offset(pgd, addr);
285 if (p4d_none(*p4d))
286 return NULL;
287 pud = pud_offset(p4d, addr);
288
289 /*
290 * Don't dereference bad PUD or PMD (below) entries. This will also
291 * identify huge mappings, which we may encounter on architectures
292 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
293 * identified as vmalloc addresses by is_vmalloc_addr(), but are
294 * not [unambiguously] associated with a struct page, so there is
295 * no correct value to return for them.
296 */
297 WARN_ON_ONCE(pud_bad(*pud));
298 if (pud_none(*pud) || pud_bad(*pud))
299 return NULL;
300 pmd = pmd_offset(pud, addr);
301 WARN_ON_ONCE(pmd_bad(*pmd));
302 if (pmd_none(*pmd) || pmd_bad(*pmd))
303 return NULL;
304
305 ptep = pte_offset_map(pmd, addr);
306 pte = *ptep;
307 if (pte_present(pte))
308 page = pte_page(pte);
309 pte_unmap(ptep);
310 return page;
311}
312EXPORT_SYMBOL(vmalloc_to_page);
313
314/*
315 * Map a vmalloc()-space virtual address to the physical page frame number.
316 */
317unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
318{
319 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
320}
321EXPORT_SYMBOL(vmalloc_to_pfn);
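
/*
 * A minimal usage sketch for the two lookups above (the helper name
 * dump_first_page() is hypothetical, error handling elided):
 *
 *        static void dump_first_page(void)
 *        {
 *                void *buf = vmalloc(4 * PAGE_SIZE);
 *
 *                if (!buf)
 *                        return;
 *                pr_info("page %p pfn %lu\n",
 *                        vmalloc_to_page(buf), vmalloc_to_pfn(buf));
 *                vfree(buf);
 *        }
 */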
322
323
324/*** Global kva allocator ***/
325
326#define VM_LAZY_FREE 0x02
327#define VM_VM_AREA 0x04
328
329static DEFINE_SPINLOCK(vmap_area_lock);
330/* Export for kexec only */
331LIST_HEAD(vmap_area_list);
332static LLIST_HEAD(vmap_purge_list);
333static struct rb_root vmap_area_root = RB_ROOT;
334
335/* The vmap cache globals are protected by vmap_area_lock */
336static struct rb_node *free_vmap_cache;
337static unsigned long cached_hole_size;
338static unsigned long cached_vstart;
339static unsigned long cached_align;
340
341static unsigned long vmap_area_pcpu_hole;
342
343static atomic_long_t nr_vmalloc_pages;
344
345unsigned long vmalloc_nr_pages(void)
346{
347 return atomic_long_read(&nr_vmalloc_pages);
348}
349
350static struct vmap_area *__find_vmap_area(unsigned long addr)
351{
352 struct rb_node *n = vmap_area_root.rb_node;
353
354 while (n) {
355 struct vmap_area *va;
356
357 va = rb_entry(n, struct vmap_area, rb_node);
358 if (addr < va->va_start)
359 n = n->rb_left;
360 else if (addr >= va->va_end)
361 n = n->rb_right;
362 else
363 return va;
364 }
365
366 return NULL;
367}
368
369static void __insert_vmap_area(struct vmap_area *va)
370{
371 struct rb_node **p = &vmap_area_root.rb_node;
372 struct rb_node *parent = NULL;
373 struct rb_node *tmp;
374
375 while (*p) {
376 struct vmap_area *tmp_va;
377
378 parent = *p;
379 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
380 if (va->va_start < tmp_va->va_end)
381 p = &(*p)->rb_left;
382 else if (va->va_end > tmp_va->va_start)
383 p = &(*p)->rb_right;
384 else
385 BUG();
386 }
387
388 rb_link_node(&va->rb_node, parent, p);
389 rb_insert_color(&va->rb_node, &vmap_area_root);
390
391 /* address-sort this list */
392 tmp = rb_prev(&va->rb_node);
393 if (tmp) {
394 struct vmap_area *prev;
395 prev = rb_entry(tmp, struct vmap_area, rb_node);
396 list_add_rcu(&va->list, &prev->list);
397 } else
398 list_add_rcu(&va->list, &vmap_area_list);
399}
400
401static void purge_vmap_area_lazy(void);
402
403static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
404
/*
 * Allocate a region of KVA of the specified size and alignment, within
 * the range [vstart, vend).
 */
409static struct vmap_area *alloc_vmap_area(unsigned long size,
410 unsigned long align,
411 unsigned long vstart, unsigned long vend,
412 int node, gfp_t gfp_mask)
413{
414 struct vmap_area *va;
415 struct rb_node *n;
416 unsigned long addr;
417 int purged = 0;
418 struct vmap_area *first;
419
420 BUG_ON(!size);
421 BUG_ON(offset_in_page(size));
422 BUG_ON(!is_power_of_2(align));
423
424 might_sleep();
425
426 va = kmalloc_node(sizeof(struct vmap_area),
427 gfp_mask & GFP_RECLAIM_MASK, node);
428 if (unlikely(!va))
429 return ERR_PTR(-ENOMEM);
430
431 /*
432 * Only scan the relevant parts containing pointers to other objects
433 * to avoid false negatives.
434 */
435 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
436
437retry:
438 spin_lock(&vmap_area_lock);
439 /*
440 * Invalidate cache if we have more permissive parameters.
441 * cached_hole_size notes the largest hole noticed _below_
442 * the vmap_area cached in free_vmap_cache: if size fits
443 * into that hole, we want to scan from vstart to reuse
444 * the hole instead of allocating above free_vmap_cache.
445 * Note that __free_vmap_area may update free_vmap_cache
446 * without updating cached_hole_size or cached_align.
447 */
448 if (!free_vmap_cache ||
449 size < cached_hole_size ||
450 vstart < cached_vstart ||
451 align < cached_align) {
452nocache:
453 cached_hole_size = 0;
454 free_vmap_cache = NULL;
455 }
456 /* record if we encounter less permissive parameters */
457 cached_vstart = vstart;
458 cached_align = align;
459
460 /* find starting point for our search */
461 if (free_vmap_cache) {
462 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
463 addr = ALIGN(first->va_end, align);
464 if (addr < vstart)
465 goto nocache;
466 if (addr + size < addr)
467 goto overflow;
468
469 } else {
470 addr = ALIGN(vstart, align);
471 if (addr + size < addr)
472 goto overflow;
473
474 n = vmap_area_root.rb_node;
475 first = NULL;
476
477 while (n) {
478 struct vmap_area *tmp;
479 tmp = rb_entry(n, struct vmap_area, rb_node);
480 if (tmp->va_end >= addr) {
481 first = tmp;
482 if (tmp->va_start <= addr)
483 break;
484 n = n->rb_left;
485 } else
486 n = n->rb_right;
487 }
488
489 if (!first)
490 goto found;
491 }
492
493 /* from the starting point, walk areas until a suitable hole is found */
494 while (addr + size > first->va_start && addr + size <= vend) {
495 if (addr + cached_hole_size < first->va_start)
496 cached_hole_size = first->va_start - addr;
497 addr = ALIGN(first->va_end, align);
498 if (addr + size < addr)
499 goto overflow;
500
501 if (list_is_last(&first->list, &vmap_area_list))
502 goto found;
503
504 first = list_next_entry(first, list);
505 }
506
507found:
        /*
         * Also check the calculated address against vstart, because it
         * can wrap to 0 as a result of a large align request.
         */
512 if (addr + size > vend || addr < vstart)
513 goto overflow;
514
515 va->va_start = addr;
516 va->va_end = addr + size;
517 va->flags = 0;
518 __insert_vmap_area(va);
519 free_vmap_cache = &va->rb_node;
520 spin_unlock(&vmap_area_lock);
521
522 BUG_ON(!IS_ALIGNED(va->va_start, align));
523 BUG_ON(va->va_start < vstart);
524 BUG_ON(va->va_end > vend);
525
526 return va;
527
528overflow:
529 spin_unlock(&vmap_area_lock);
530 if (!purged) {
531 purge_vmap_area_lazy();
532 purged = 1;
533 goto retry;
534 }
535
536 if (gfpflags_allow_blocking(gfp_mask)) {
537 unsigned long freed = 0;
538 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
539 if (freed > 0) {
540 purged = 0;
541 goto retry;
542 }
543 }
544
545 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
546 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
547 size);
548 kfree(va);
549 return ERR_PTR(-EBUSY);
550}
551
552int register_vmap_purge_notifier(struct notifier_block *nb)
553{
554 return blocking_notifier_chain_register(&vmap_notify_list, nb);
555}
556EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
557
558int unregister_vmap_purge_notifier(struct notifier_block *nb)
559{
560 return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
561}
562EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
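
/*
 * A minimal sketch of how a purge notifier might be used (the names
 * my_purge_cb, my_cache_shrink and my_nb are hypothetical). The callback
 * is invoked when a vmap allocation fails and reports how much it freed
 * through the unsigned long pointed to by @ptr:
 *
 *        static int my_purge_cb(struct notifier_block *nb,
 *                               unsigned long action, void *ptr)
 *        {
 *                unsigned long *freed = ptr;
 *
 *                *freed += my_cache_shrink();
 *                return NOTIFY_OK;
 *        }
 *
 *        static struct notifier_block my_nb = {
 *                .notifier_call = my_purge_cb,
 *        };
 *
 *        register_vmap_purge_notifier(&my_nb);
 */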
563
564static void __free_vmap_area(struct vmap_area *va)
565{
566 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
567
568 if (free_vmap_cache) {
569 if (va->va_end < cached_vstart) {
570 free_vmap_cache = NULL;
571 } else {
572 struct vmap_area *cache;
573 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
574 if (va->va_start <= cache->va_start) {
575 free_vmap_cache = rb_prev(&va->rb_node);
576 /*
577 * We don't try to update cached_hole_size or
578 * cached_align, but it won't go very wrong.
579 */
580 }
581 }
582 }
583 rb_erase(&va->rb_node, &vmap_area_root);
584 RB_CLEAR_NODE(&va->rb_node);
585 list_del_rcu(&va->list);
586
        /*
         * Track the highest possible candidate for pcpu area
         * allocation. Areas outside of the vmalloc area can be returned
         * here too, so consider only end addresses which fall inside
         * the vmalloc area proper.
         */
593 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
594 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
595
596 kfree_rcu(va, rcu_head);
597}
598
599/*
600 * Free a region of KVA allocated by alloc_vmap_area
601 */
602static void free_vmap_area(struct vmap_area *va)
603{
604 spin_lock(&vmap_area_lock);
605 __free_vmap_area(va);
606 spin_unlock(&vmap_area_lock);
607}
608
609/*
610 * Clear the pagetable entries of a given vmap_area
611 */
612static void unmap_vmap_area(struct vmap_area *va)
613{
614 vunmap_page_range(va->va_start, va->va_end);
615}
616
617/*
618 * lazy_max_pages is the maximum amount of virtual address space we gather up
619 * before attempting to purge with a TLB flush.
620 *
621 * There is a tradeoff here: a larger number will cover more kernel page tables
622 * and take slightly longer to purge, but it will linearly reduce the number of
623 * global TLB flushes that must be performed. It would seem natural to scale
624 * this number up linearly with the number of CPUs (because vmapping activity
625 * could also scale linearly with the number of CPUs), however it is likely
626 * that in practice, workloads might be constrained in other ways that mean
627 * vmap activity will not scale linearly with CPUs. Also, I want to be
628 * conservative and not introduce a big latency on huge systems, so go with
629 * a less aggressive log scale. It will still be an improvement over the old
630 * code, and it will be simple to change the scale factor if we find that it
631 * becomes a problem on bigger systems.
632 */
633static unsigned long lazy_max_pages(void)
634{
635 unsigned int log;
636
637 log = fls(num_online_cpus());
638
639 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
640}
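
/*
 * Worked example, assuming 4K pages: with 16 online CPUs, fls(16) == 5,
 * so lazy_max_pages() == 5 * (32MB / 4KB) == 40960 pages, i.e. roughly
 * 160MB of lazily freed KVA is gathered before a purge is attempted.
 */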
641
642static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
643
/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make pcpu_get_vm_areas() more deterministic.
 */
649static DEFINE_MUTEX(vmap_purge_lock);
650
651/* for per-CPU blocks */
652static void purge_fragmented_blocks_allcpus(void);
653
/*
 * Called before a call to iounmap() if the caller wants the vm_area_struct
 * to be freed immediately.
 */
658void set_iounmap_nonlazy(void)
659{
660 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
661}
662
663/*
664 * Purges all lazily-freed vmap areas.
665 */
666static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
667{
668 struct llist_node *valist;
669 struct vmap_area *va;
670 struct vmap_area *n_va;
671 bool do_free = false;
672
673 lockdep_assert_held(&vmap_purge_lock);
674
675 valist = llist_del_all(&vmap_purge_list);
676 llist_for_each_entry(va, valist, purge_list) {
677 if (va->va_start < start)
678 start = va->va_start;
679 if (va->va_end > end)
680 end = va->va_end;
681 do_free = true;
682 }
683
684 if (!do_free)
685 return false;
686
687 flush_tlb_kernel_range(start, end);
688
689 spin_lock(&vmap_area_lock);
690 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
691 int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
692
693 __free_vmap_area(va);
694 atomic_sub(nr, &vmap_lazy_nr);
695 cond_resched_lock(&vmap_area_lock);
696 }
697 spin_unlock(&vmap_area_lock);
698 return true;
699}
700
701/*
702 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
703 * is already purging.
704 */
705static void try_purge_vmap_area_lazy(void)
706{
707 if (mutex_trylock(&vmap_purge_lock)) {
708 __purge_vmap_area_lazy(ULONG_MAX, 0);
709 mutex_unlock(&vmap_purge_lock);
710 }
711}
712
713/*
714 * Kick off a purge of the outstanding lazy areas.
715 */
716static void purge_vmap_area_lazy(void)
717{
718 mutex_lock(&vmap_purge_lock);
719 purge_fragmented_blocks_allcpus();
720 __purge_vmap_area_lazy(ULONG_MAX, 0);
721 mutex_unlock(&vmap_purge_lock);
722}
723
/*
 * Free a vmap area; the caller must ensure that the area has been unmapped
 * and that flush_cache_vunmap() has already been called for the correct
 * range.
 */
729static void free_vmap_area_noflush(struct vmap_area *va)
730{
731 int nr_lazy;
732
733 nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
734 &vmap_lazy_nr);
735
736 /* After this point, we may free va at any time */
737 llist_add(&va->purge_list, &vmap_purge_list);
738
739 if (unlikely(nr_lazy > lazy_max_pages()))
740 try_purge_vmap_area_lazy();
741}
742
743/*
744 * Free and unmap a vmap area
745 */
746static void free_unmap_vmap_area(struct vmap_area *va)
747{
748 flush_cache_vunmap(va->va_start, va->va_end);
749 unmap_vmap_area(va);
750 if (debug_pagealloc_enabled())
751 flush_tlb_kernel_range(va->va_start, va->va_end);
752
753 free_vmap_area_noflush(va);
754}
755
756static struct vmap_area *find_vmap_area(unsigned long addr)
757{
758 struct vmap_area *va;
759
760 spin_lock(&vmap_area_lock);
761 va = __find_vmap_area(addr);
762 spin_unlock(&vmap_area_lock);
763
764 return va;
765}
766
767/*** Per cpu kva allocator ***/
768
769/*
770 * vmap space is limited especially on 32 bit architectures. Ensure there is
771 * room for at least 16 percpu vmap blocks per CPU.
772 */
/*
 * If VMALLOC_START and VMALLOC_END were constants, we'd like to be able
 * to #define VMALLOC_SPACE as (VMALLOC_END - VMALLOC_START). Guess
 * instead (we just need a rough idea).
 */
778#if BITS_PER_LONG == 32
779#define VMALLOC_SPACE (128UL*1024*1024)
780#else
781#define VMALLOC_SPACE (128UL*1024*1024*1024)
782#endif
783
784#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
785#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
786#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
787#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
788#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
789#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
790#define VMAP_BBMAP_BITS \
791 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
792 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
793 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
794
795#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
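
/*
 * Worked example, assuming a 64-bit build with 4K pages and NR_CPUS == 64:
 * VMALLOC_PAGES == 128G / 4K == 32M, so VMALLOC_PAGES / 64 / 16 == 32768,
 * which is clamped down to VMAP_BBMAP_BITS_MAX == 1024. VMAP_BLOCK_SIZE is
 * then 1024 * 4K == 4MB per vmap block.
 */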
796
797static bool vmap_initialized __read_mostly = false;
798
799struct vmap_block_queue {
800 spinlock_t lock;
801 struct list_head free;
802};
803
804struct vmap_block {
805 spinlock_t lock;
806 struct vmap_area *va;
807 unsigned long free, dirty;
808 unsigned long dirty_min, dirty_max; /*< dirty range */
809 struct list_head free_list;
810 struct rcu_head rcu_head;
811 struct list_head purge;
812};
813
814/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
815static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
816
817/*
818 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
819 * in the free path. Could get rid of this if we change the API to return a
820 * "cookie" from alloc, to be passed to free. But no big deal yet.
821 */
822static DEFINE_SPINLOCK(vmap_block_tree_lock);
823static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
824
825/*
826 * We should probably have a fallback mechanism to allocate virtual memory
827 * out of partially filled vmap blocks. However vmap block sizing should be
828 * fairly reasonable according to the vmalloc size, so it shouldn't be a
829 * big problem.
830 */
831
832static unsigned long addr_to_vb_idx(unsigned long addr)
833{
834 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
835 addr /= VMAP_BLOCK_SIZE;
836 return addr;
837}
838
839static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
840{
841 unsigned long addr;
842
843 addr = va_start + (pages_off << PAGE_SHIFT);
844 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
845 return (void *)addr;
846}
847
/**
 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it.
 * The number of pages can of course not exceed VMAP_BBMAP_BITS.
 * @order: allocate 2^order pages in the newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
856static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
857{
858 struct vmap_block_queue *vbq;
859 struct vmap_block *vb;
860 struct vmap_area *va;
861 unsigned long vb_idx;
862 int node, err;
863 void *vaddr;
864
865 node = numa_node_id();
866
867 vb = kmalloc_node(sizeof(struct vmap_block),
868 gfp_mask & GFP_RECLAIM_MASK, node);
869 if (unlikely(!vb))
870 return ERR_PTR(-ENOMEM);
871
872 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
873 VMALLOC_START, VMALLOC_END,
874 node, gfp_mask);
875 if (IS_ERR(va)) {
876 kfree(vb);
877 return ERR_CAST(va);
878 }
879
880 err = radix_tree_preload(gfp_mask);
881 if (unlikely(err)) {
882 kfree(vb);
883 free_vmap_area(va);
884 return ERR_PTR(err);
885 }
886
887 vaddr = vmap_block_vaddr(va->va_start, 0);
888 spin_lock_init(&vb->lock);
889 vb->va = va;
890 /* At least something should be left free */
891 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
892 vb->free = VMAP_BBMAP_BITS - (1UL << order);
893 vb->dirty = 0;
894 vb->dirty_min = VMAP_BBMAP_BITS;
895 vb->dirty_max = 0;
896 INIT_LIST_HEAD(&vb->free_list);
897
898 vb_idx = addr_to_vb_idx(va->va_start);
899 spin_lock(&vmap_block_tree_lock);
900 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
901 spin_unlock(&vmap_block_tree_lock);
902 BUG_ON(err);
903 radix_tree_preload_end();
904
905 vbq = &get_cpu_var(vmap_block_queue);
906 spin_lock(&vbq->lock);
907 list_add_tail_rcu(&vb->free_list, &vbq->free);
908 spin_unlock(&vbq->lock);
909 put_cpu_var(vmap_block_queue);
910
911 return vaddr;
912}
913
914static void free_vmap_block(struct vmap_block *vb)
915{
916 struct vmap_block *tmp;
917 unsigned long vb_idx;
918
919 vb_idx = addr_to_vb_idx(vb->va->va_start);
920 spin_lock(&vmap_block_tree_lock);
921 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
922 spin_unlock(&vmap_block_tree_lock);
923 BUG_ON(tmp != vb);
924
925 free_vmap_area_noflush(vb->va);
926 kfree_rcu(vb, rcu_head);
927}
928
929static void purge_fragmented_blocks(int cpu)
930{
931 LIST_HEAD(purge);
932 struct vmap_block *vb;
933 struct vmap_block *n_vb;
934 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
935
936 rcu_read_lock();
937 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
938
939 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
940 continue;
941
942 spin_lock(&vb->lock);
943 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
944 vb->free = 0; /* prevent further allocs after releasing lock */
945 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
946 vb->dirty_min = 0;
947 vb->dirty_max = VMAP_BBMAP_BITS;
948 spin_lock(&vbq->lock);
949 list_del_rcu(&vb->free_list);
950 spin_unlock(&vbq->lock);
951 spin_unlock(&vb->lock);
952 list_add_tail(&vb->purge, &purge);
953 } else
954 spin_unlock(&vb->lock);
955 }
956 rcu_read_unlock();
957
958 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
959 list_del(&vb->purge);
960 free_vmap_block(vb);
961 }
962}
963
964static void purge_fragmented_blocks_allcpus(void)
965{
966 int cpu;
967
968 for_each_possible_cpu(cpu)
969 purge_fragmented_blocks(cpu);
970}
971
972static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
973{
974 struct vmap_block_queue *vbq;
975 struct vmap_block *vb;
976 void *vaddr = NULL;
977 unsigned int order;
978
979 BUG_ON(offset_in_page(size));
980 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
981 if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what the caller wants since
                 * get_order(0) returns a funny result. Just warn and
                 * terminate early.
                 */
987 return NULL;
988 }
989 order = get_order(size);
990
991 rcu_read_lock();
992 vbq = &get_cpu_var(vmap_block_queue);
993 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
994 unsigned long pages_off;
995
996 spin_lock(&vb->lock);
997 if (vb->free < (1UL << order)) {
998 spin_unlock(&vb->lock);
999 continue;
1000 }
1001
1002 pages_off = VMAP_BBMAP_BITS - vb->free;
1003 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
1004 vb->free -= 1UL << order;
1005 if (vb->free == 0) {
1006 spin_lock(&vbq->lock);
1007 list_del_rcu(&vb->free_list);
1008 spin_unlock(&vbq->lock);
1009 }
1010
1011 spin_unlock(&vb->lock);
1012 break;
1013 }
1014
1015 put_cpu_var(vmap_block_queue);
1016 rcu_read_unlock();
1017
1018 /* Allocate new block if nothing was found */
1019 if (!vaddr)
1020 vaddr = new_vmap_block(order, gfp_mask);
1021
1022 return vaddr;
1023}
1024
1025static void vb_free(const void *addr, unsigned long size)
1026{
1027 unsigned long offset;
1028 unsigned long vb_idx;
1029 unsigned int order;
1030 struct vmap_block *vb;
1031
1032 BUG_ON(offset_in_page(size));
1033 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1034
1035 flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
1036
1037 order = get_order(size);
1038
1039 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1040 offset >>= PAGE_SHIFT;
1041
1042 vb_idx = addr_to_vb_idx((unsigned long)addr);
1043 rcu_read_lock();
1044 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
1045 rcu_read_unlock();
1046 BUG_ON(!vb);
1047
1048 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
1049
1050 if (debug_pagealloc_enabled())
1051 flush_tlb_kernel_range((unsigned long)addr,
1052 (unsigned long)addr + size);
1053
1054 spin_lock(&vb->lock);
1055
1056 /* Expand dirty range */
1057 vb->dirty_min = min(vb->dirty_min, offset);
1058 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
1059
1060 vb->dirty += 1UL << order;
1061 if (vb->dirty == VMAP_BBMAP_BITS) {
1062 BUG_ON(vb->free);
1063 spin_unlock(&vb->lock);
1064 free_vmap_block(vb);
1065 } else
1066 spin_unlock(&vb->lock);
1067}
1068
1069/**
1070 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1071 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now may, in a former life, have been mapped into a kernel virtual
 * address by the vmap layer, and so there might be some CPUs with TLB entries
 * still referencing that page (in addition to the regular 1:1 kernel mapping).
1077 *
1078 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1079 * be sure that none of the pages we have control over will have any aliases
1080 * from the vmap layer.
1081 */
1082void vm_unmap_aliases(void)
1083{
1084 unsigned long start = ULONG_MAX, end = 0;
1085 int cpu;
1086 int flush = 0;
1087
1088 if (unlikely(!vmap_initialized))
1089 return;
1090
1091 might_sleep();
1092
1093 for_each_possible_cpu(cpu) {
1094 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1095 struct vmap_block *vb;
1096
1097 rcu_read_lock();
1098 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1099 spin_lock(&vb->lock);
1100 if (vb->dirty) {
1101 unsigned long va_start = vb->va->va_start;
1102 unsigned long s, e;
1103
1104 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1105 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1106
1107 start = min(s, start);
1108 end = max(e, end);
1109
1110 flush = 1;
1111 }
1112 spin_unlock(&vb->lock);
1113 }
1114 rcu_read_unlock();
1115 }
1116
1117 mutex_lock(&vmap_purge_lock);
1118 purge_fragmented_blocks_allcpus();
1119 if (!__purge_vmap_area_lazy(start, end) && flush)
1120 flush_tlb_kernel_range(start, end);
1121 mutex_unlock(&vmap_purge_lock);
1122}
1123EXPORT_SYMBOL_GPL(vm_unmap_aliases);
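
/*
 * A minimal sketch, assuming a caller that previously exposed its pages
 * through vm_map_ram() and now wants to be sure no lazy vmap aliases
 * remain before changing their caching attributes (the helper
 * my_change_page_attrs() is hypothetical):
 *
 *        vm_unmap_ram(vaddr, nr_pages);
 *        vm_unmap_aliases();
 *        my_change_page_attrs(pages, nr_pages);
 */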
1124
1125/**
1126 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1127 * @mem: the pointer returned by vm_map_ram
1128 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1129 */
1130void vm_unmap_ram(const void *mem, unsigned int count)
1131{
1132 unsigned long size = (unsigned long)count << PAGE_SHIFT;
1133 unsigned long addr = (unsigned long)mem;
1134 struct vmap_area *va;
1135
1136 might_sleep();
1137 BUG_ON(!addr);
1138 BUG_ON(addr < VMALLOC_START);
1139 BUG_ON(addr > VMALLOC_END);
1140 BUG_ON(!PAGE_ALIGNED(addr));
1141
1142 if (likely(count <= VMAP_MAX_ALLOC)) {
1143 debug_check_no_locks_freed(mem, size);
1144 vb_free(mem, size);
1145 return;
1146 }
1147
1148 va = find_vmap_area(addr);
1149 BUG_ON(!va);
1150 debug_check_no_locks_freed((void *)va->va_start,
1151 (va->va_end - va->va_start));
1152 free_unmap_vmap_area(va);
1153}
1154EXPORT_SYMBOL(vm_unmap_ram);
1155
1156/**
1157 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1158 * @pages: an array of pointers to the pages to be mapped
1159 * @count: number of pages
1160 * @node: prefer to allocate data structures on this node
1161 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
1162 *
 * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
 * faster than vmap(). But if you mix long-lived and short-lived objects with
 * vm_map_ram(), it can consume lots of address space through fragmentation
 * (especially on a 32bit machine) and you could eventually see allocation
 * failures. Please use this function only for short-lived objects.
1168 *
1169 * Returns: a pointer to the address that has been mapped, or %NULL on failure
1170 */
1171void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
1172{
1173 unsigned long size = (unsigned long)count << PAGE_SHIFT;
1174 unsigned long addr;
1175 void *mem;
1176
1177 if (likely(count <= VMAP_MAX_ALLOC)) {
1178 mem = vb_alloc(size, GFP_KERNEL);
1179 if (IS_ERR(mem))
1180 return NULL;
1181 addr = (unsigned long)mem;
1182 } else {
1183 struct vmap_area *va;
1184 va = alloc_vmap_area(size, PAGE_SIZE,
1185 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1186 if (IS_ERR(va))
1187 return NULL;
1188
1189 addr = va->va_start;
1190 mem = (void *)addr;
1191 }
1192 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
1193 vm_unmap_ram(mem, count);
1194 return NULL;
1195 }
1196 return mem;
1197}
1198EXPORT_SYMBOL(vm_map_ram);
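
/*
 * A minimal usage sketch for the pair above; the page array "pages" and
 * its length "nr_pages" are assumed to have been filled by the caller,
 * e.g. via alloc_page():
 *
 *        void *va = vm_map_ram(pages, nr_pages, NUMA_NO_NODE, PAGE_KERNEL);
 *
 *        if (va) {
 *                memset(va, 0, nr_pages * PAGE_SIZE);
 *                vm_unmap_ram(va, nr_pages);
 *        }
 */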
1199
1200static struct vm_struct *vmlist __initdata;
1201/**
1202 * vm_area_add_early - add vmap area early during boot
1203 * @vm: vm_struct to add
1204 *
 * This function is used to add a fixed kernel vm area to vmlist before
1206 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
1207 * should contain proper values and the other fields should be zero.
1208 *
1209 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1210 */
1211void __init vm_area_add_early(struct vm_struct *vm)
1212{
1213 struct vm_struct *tmp, **p;
1214
1215 BUG_ON(vmap_initialized);
1216 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1217 if (tmp->addr >= vm->addr) {
1218 BUG_ON(tmp->addr < vm->addr + vm->size);
1219 break;
1220 } else
1221 BUG_ON(tmp->addr + tmp->size > vm->addr);
1222 }
1223 vm->next = *p;
1224 *p = vm;
1225}
1226
1227/**
1228 * vm_area_register_early - register vmap area early during boot
1229 * @vm: vm_struct to register
1230 * @align: requested alignment
1231 *
 * This function is used to register a kernel vm area before
1233 * vmalloc_init() is called. @vm->size and @vm->flags should contain
1234 * proper values on entry and other fields should be zero. On return,
1235 * vm->addr contains the allocated address.
1236 *
1237 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1238 */
1239void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1240{
1241 static size_t vm_init_off __initdata;
1242 unsigned long addr;
1243
1244 addr = ALIGN(VMALLOC_START + vm_init_off, align);
1245 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1246
1247 vm->addr = (void *)addr;
1248
1249 vm_area_add_early(vm);
1250}
1251
1252void __init vmalloc_init(void)
1253{
1254 struct vmap_area *va;
1255 struct vm_struct *tmp;
1256 int i;
1257
1258 for_each_possible_cpu(i) {
1259 struct vmap_block_queue *vbq;
1260 struct vfree_deferred *p;
1261
1262 vbq = &per_cpu(vmap_block_queue, i);
1263 spin_lock_init(&vbq->lock);
1264 INIT_LIST_HEAD(&vbq->free);
1265 p = &per_cpu(vfree_deferred, i);
1266 init_llist_head(&p->list);
1267 INIT_WORK(&p->wq, free_work);
1268 }
1269
1270 /* Import existing vmlist entries. */
1271 for (tmp = vmlist; tmp; tmp = tmp->next) {
1272 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1273 va->flags = VM_VM_AREA;
1274 va->va_start = (unsigned long)tmp->addr;
1275 va->va_end = va->va_start + tmp->size;
1276 va->vm = tmp;
1277 __insert_vmap_area(va);
1278 }
1279
1280 vmap_area_pcpu_hole = VMALLOC_END;
1281
1282 vmap_initialized = true;
1283}
1284
1285/**
1286 * map_kernel_range_noflush - map kernel VM area with the specified pages
1287 * @addr: start of the VM area to map
1288 * @size: size of the VM area to map
1289 * @prot: page protection flags to use
1290 * @pages: pages to map
1291 *
1292 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1293 * specify should have been allocated using get_vm_area() and its
1294 * friends.
1295 *
1296 * NOTE:
1297 * This function does NOT do any cache flushing. The caller is
1298 * responsible for calling flush_cache_vmap() on to-be-mapped areas
1299 * before calling this function.
1300 *
1301 * RETURNS:
1302 * The number of pages mapped on success, -errno on failure.
1303 */
1304int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1305 pgprot_t prot, struct page **pages)
1306{
1307 return vmap_page_range_noflush(addr, addr + size, prot, pages);
1308}
1309
1310/**
1311 * unmap_kernel_range_noflush - unmap kernel VM area
1312 * @addr: start of the VM area to unmap
1313 * @size: size of the VM area to unmap
1314 *
1315 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1316 * specify should have been allocated using get_vm_area() and its
1317 * friends.
1318 *
1319 * NOTE:
1320 * This function does NOT do any cache flushing. The caller is
 * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
1322 * before calling this function and flush_tlb_kernel_range() after.
1323 */
1324void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1325{
1326 vunmap_page_range(addr, addr + size);
1327}
1328EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
1329
1330/**
1331 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1332 * @addr: start of the VM area to unmap
1333 * @size: size of the VM area to unmap
1334 *
 * Similar to unmap_kernel_range_noflush() but flushes the virtual cache
 * before the unmapping and the TLB after.
1337 */
1338void unmap_kernel_range(unsigned long addr, unsigned long size)
1339{
1340 unsigned long end = addr + size;
1341
1342 flush_cache_vunmap(addr, end);
1343 vunmap_page_range(addr, end);
1344 flush_tlb_kernel_range(addr, end);
1345}
1346EXPORT_SYMBOL_GPL(unmap_kernel_range);
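
/*
 * A minimal sketch of the noflush + explicit-flush pattern used with the
 * helpers above. The vm area "area" and the page array "pages" are assumed
 * to have been set up by the caller (e.g. via get_vm_area()); the final
 * unmap_kernel_range() call flushes the cache and TLB itself.
 *
 *        unsigned long addr = (unsigned long)area->addr;
 *        unsigned long size = get_vm_area_size(area);
 *
 *        if (map_kernel_range_noflush(addr, size, PAGE_KERNEL, pages) > 0)
 *                flush_cache_vmap(addr, addr + size);
 *        ...
 *        unmap_kernel_range(addr, size);
 */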
1347
1348int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
1349{
1350 unsigned long addr = (unsigned long)area->addr;
1351 unsigned long end = addr + get_vm_area_size(area);
1352 int err;
1353
1354 err = vmap_page_range(addr, end, prot, pages);
1355
1356 return err > 0 ? 0 : err;
1357}
1358EXPORT_SYMBOL_GPL(map_vm_area);
1359
1360static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1361 unsigned long flags, const void *caller)
1362{
1363 spin_lock(&vmap_area_lock);
1364 vm->flags = flags;
1365 vm->addr = (void *)va->va_start;
1366 vm->size = va->va_end - va->va_start;
1367 vm->caller = caller;
1368 va->vm = vm;
1369 va->flags |= VM_VM_AREA;
1370 spin_unlock(&vmap_area_lock);
1371}
1372
1373static void clear_vm_uninitialized_flag(struct vm_struct *vm)
1374{
1375 /*
1376 * Before removing VM_UNINITIALIZED,
1377 * we should make sure that vm has proper values.
1378 * Pair with smp_rmb() in show_numa_info().
1379 */
1380 smp_wmb();
1381 vm->flags &= ~VM_UNINITIALIZED;
1382}
1383
1384static struct vm_struct *__get_vm_area_node(unsigned long size,
1385 unsigned long align, unsigned long flags, unsigned long start,
1386 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1387{
1388 struct vmap_area *va;
1389 struct vm_struct *area;
1390
1391 BUG_ON(in_interrupt());
1392 size = PAGE_ALIGN(size);
1393 if (unlikely(!size))
1394 return NULL;
1395
1396 if (flags & VM_IOREMAP)
1397 align = 1ul << clamp_t(int, get_count_order_long(size),
1398 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1399
1400 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1401 if (unlikely(!area))
1402 return NULL;
1403
1404 if (!(flags & VM_NO_GUARD))
1405 size += PAGE_SIZE;
1406
1407 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1408 if (IS_ERR(va)) {
1409 kfree(area);
1410 return NULL;
1411 }
1412
1413 setup_vmalloc_vm(area, va, flags, caller);
1414
1415 return area;
1416}
1417
1418struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1419 unsigned long start, unsigned long end)
1420{
1421 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1422 GFP_KERNEL, __builtin_return_address(0));
1423}
1424EXPORT_SYMBOL_GPL(__get_vm_area);
1425
1426struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1427 unsigned long start, unsigned long end,
1428 const void *caller)
1429{
1430 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1431 GFP_KERNEL, caller);
1432}
1433
1434/**
1435 * get_vm_area - reserve a contiguous kernel virtual area
1436 * @size: size of the area
1437 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
1438 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes. Returns the area descriptor
 * on success or %NULL on failure.
1442 */
1443struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1444{
1445 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1446 NUMA_NO_NODE, GFP_KERNEL,
1447 __builtin_return_address(0));
1448}
1449
1450struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1451 const void *caller)
1452{
1453 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1454 NUMA_NO_NODE, GFP_KERNEL, caller);
1455}
1456
1457/**
 * find_vm_area - find a contiguous kernel virtual area
1459 * @addr: base address
1460 *
1461 * Search for the kernel VM area starting at @addr, and return it.
1462 * It is up to the caller to do all required locking to keep the returned
1463 * pointer valid.
1464 */
1465struct vm_struct *find_vm_area(const void *addr)
1466{
1467 struct vmap_area *va;
1468
1469 va = find_vmap_area((unsigned long)addr);
1470 if (va && va->flags & VM_VM_AREA)
1471 return va->vm;
1472
1473 return NULL;
1474}
1475
1476/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
1478 * @addr: base address
1479 *
1480 * Search for the kernel VM area starting at @addr, and remove it.
1481 * This function returns the found VM area, but using it is NOT safe
1482 * on SMP machines, except for its size or flags.
1483 */
1484struct vm_struct *remove_vm_area(const void *addr)
1485{
1486 struct vmap_area *va;
1487
1488 might_sleep();
1489
1490 va = find_vmap_area((unsigned long)addr);
1491 if (va && va->flags & VM_VM_AREA) {
1492 struct vm_struct *vm = va->vm;
1493
1494 spin_lock(&vmap_area_lock);
1495 va->vm = NULL;
1496 va->flags &= ~VM_VM_AREA;
1497 va->flags |= VM_LAZY_FREE;
1498 spin_unlock(&vmap_area_lock);
1499
1500 kasan_free_shadow(vm);
1501 free_unmap_vmap_area(va);
1502
1503 return vm;
1504 }
1505 return NULL;
1506}
1507
1508static void __vunmap(const void *addr, int deallocate_pages)
1509{
1510 struct vm_struct *area;
1511
1512 if (!addr)
1513 return;
1514
1515 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1516 addr))
1517 return;
1518
1519 area = find_vmap_area((unsigned long)addr)->vm;
1520 if (unlikely(!area)) {
1521 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1522 addr);
1523 return;
1524 }
1525
1526 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
1527 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
1528
1529 remove_vm_area(addr);
1530 if (deallocate_pages) {
1531 int i;
1532
1533 for (i = 0; i < area->nr_pages; i++) {
1534 struct page *page = area->pages[i];
1535
1536 BUG_ON(!page);
1537 __free_pages(page, 0);
1538 }
1539 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
1540
1541 kvfree(area->pages);
1542 }
1543
1544 kfree(area);
1545 return;
1546}
1547
1548static inline void __vfree_deferred(const void *addr)
1549{
        /*
         * Use raw_cpu_ptr() because this can be called from preemptible
         * context. Preemption is absolutely fine here, because the llist_add()
         * implementation is lockless, so it works even if we are adding to
         * another CPU's list. schedule_work() should be fine with this too.
         */
1556 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
1557
1558 if (llist_add((struct llist_node *)addr, &p->list))
1559 schedule_work(&p->wq);
1560}
1561
1562/**
1563 * vfree_atomic - release memory allocated by vmalloc()
1564 * @addr: memory base address
1565 *
1566 * This one is just like vfree() but can be called in any atomic context
1567 * except NMIs.
1568 */
1569void vfree_atomic(const void *addr)
1570{
1571 BUG_ON(in_nmi());
1572
1573 kmemleak_free(addr);
1574
1575 if (!addr)
1576 return;
1577 __vfree_deferred(addr);
1578}
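
/*
 * A minimal sketch of vfree_atomic() use, assuming a hypothetical driver
 * that must drop a vmalloc()'ed buffer from inside a spinlocked (atomic)
 * section; the fields dev->lock and dev->staging are made up for the
 * example:
 *
 *        spin_lock(&dev->lock);
 *        buf = dev->staging;
 *        dev->staging = NULL;
 *        vfree_atomic(buf);
 *        spin_unlock(&dev->lock);
 */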
1579
1580/**
1581 * vfree - release memory allocated by vmalloc()
1582 * @addr: memory base address
1583 *
 * Free the virtually contiguous memory area starting at @addr, as
 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 * NULL, no operation is performed.
 *
 * Must not be called in NMI context (strictly speaking, only if we don't
 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
1591 *
1592 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
1593 */
1594void vfree(const void *addr)
1595{
1596 BUG_ON(in_nmi());
1597
1598 kmemleak_free(addr);
1599
1600 if (!addr)
1601 return;
1602 if (unlikely(in_interrupt()))
1603 __vfree_deferred(addr);
1604 else
1605 __vunmap(addr, 1);
1606}
1607EXPORT_SYMBOL(vfree);
1608
1609/**
1610 * vunmap - release virtual mapping obtained by vmap()
1611 * @addr: memory base address
1612 *
1613 * Free the virtually contiguous memory area starting at @addr,
1614 * which was created from the page array passed to vmap().
1615 *
1616 * Must not be called in interrupt context.
1617 */
1618void vunmap(const void *addr)
1619{
1620 BUG_ON(in_interrupt());
1621 might_sleep();
1622 if (addr)
1623 __vunmap(addr, 0);
1624}
1625EXPORT_SYMBOL(vunmap);
1626
1627/**
1628 * vmap - map an array of pages into virtually contiguous space
1629 * @pages: array of page pointers
1630 * @count: number of pages to map
1631 * @flags: vm_area->flags
1632 * @prot: page protection for the mapping
1633 *
1634 * Maps @count pages from @pages into contiguous kernel virtual
1635 * space.
1636 */
1637void *vmap(struct page **pages, unsigned int count,
1638 unsigned long flags, pgprot_t prot)
1639{
1640 struct vm_struct *area;
1641 unsigned long size; /* In bytes */
1642
1643 might_sleep();
1644
1645 if (count > totalram_pages)
1646 return NULL;
1647
1648 size = (unsigned long)count << PAGE_SHIFT;
1649 area = get_vm_area_caller(size, flags, __builtin_return_address(0));
1650 if (!area)
1651 return NULL;
1652
1653 if (map_vm_area(area, prot, pages)) {
1654 vunmap(area->addr);
1655 return NULL;
1656 }
1657
1658 return area->addr;
1659}
1660EXPORT_SYMBOL(vmap);
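
/*
 * A minimal usage sketch for vmap()/vunmap(), assuming the caller owns a
 * small page array filled via alloc_page() (error handling elided):
 *
 *        struct page *pages[4];
 *        void *va;
 *        int i;
 *
 *        for (i = 0; i < 4; i++)
 *                pages[i] = alloc_page(GFP_KERNEL);
 *        va = vmap(pages, 4, VM_MAP, PAGE_KERNEL);
 *        if (va) {
 *                memset(va, 0, 4 * PAGE_SIZE);
 *                vunmap(va);
 *        }
 *        for (i = 0; i < 4; i++)
 *                __free_page(pages[i]);
 */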
1661
1662static void *__vmalloc_node(unsigned long size, unsigned long align,
1663 gfp_t gfp_mask, pgprot_t prot,
1664 int node, const void *caller);
1665static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1666 pgprot_t prot, int node)
1667{
1668 struct page **pages;
1669 unsigned int nr_pages, array_size, i;
1670 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1671 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1672 const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
1673 0 :
1674 __GFP_HIGHMEM;
1675
1676 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1677 array_size = (nr_pages * sizeof(struct page *));
1678
1679 area->nr_pages = nr_pages;
1680 /* Please note that the recursion is strictly bounded. */
1681 if (array_size > PAGE_SIZE) {
1682 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
1683 PAGE_KERNEL, node, area->caller);
1684 } else {
1685 pages = kmalloc_node(array_size, nested_gfp, node);
1686 }
1687 area->pages = pages;
1688 if (!area->pages) {
1689 remove_vm_area(area->addr);
1690 kfree(area);
1691 return NULL;
1692 }
1693
1694 for (i = 0; i < area->nr_pages; i++) {
1695 struct page *page;
1696
1697 if (node == NUMA_NO_NODE)
1698 page = alloc_page(alloc_mask|highmem_mask);
1699 else
1700 page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
1701
1702 if (unlikely(!page)) {
1703 /* Successfully allocated i pages, free them in __vunmap() */
1704 area->nr_pages = i;
1705 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1706 goto fail;
1707 }
1708 area->pages[i] = page;
1709 if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
1710 cond_resched();
1711 }
1712 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1713
1714 if (map_vm_area(area, prot, pages))
1715 goto fail;
1716 return area->addr;
1717
1718fail:
1719 warn_alloc(gfp_mask, NULL,
1720 "vmalloc: allocation failure, allocated %ld of %ld bytes",
1721 (area->nr_pages*PAGE_SIZE), area->size);
1722 vfree(area->addr);
1723 return NULL;
1724}
1725
1726/**
1727 * __vmalloc_node_range - allocate virtually contiguous memory
1728 * @size: allocation size
1729 * @align: desired alignment
1730 * @start: vm area range start
1731 * @end: vm area range end
1732 * @gfp_mask: flags for the page level allocator
1733 * @prot: protection mask for the allocated pages
1734 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1735 * @node: node to use for allocation or NUMA_NO_NODE
1736 * @caller: caller's return address
1737 *
1738 * Allocate enough pages to cover @size from the page level
1739 * allocator with @gfp_mask flags. Map them into contiguous
1740 * kernel virtual space, using a pagetable protection of @prot.
1741 */
1742void *__vmalloc_node_range(unsigned long size, unsigned long align,
1743 unsigned long start, unsigned long end, gfp_t gfp_mask,
1744 pgprot_t prot, unsigned long vm_flags, int node,
1745 const void *caller)
1746{
1747 struct vm_struct *area;
1748 void *addr;
1749 unsigned long real_size = size;
1750
1751 size = PAGE_ALIGN(size);
1752 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1753 goto fail;
1754
1755 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
1756 vm_flags, start, end, node, gfp_mask, caller);
1757 if (!area)
1758 goto fail;
1759
1760 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1761 if (!addr)
1762 return NULL;
1763
1764 /*
1765 * First make sure the mappings are removed from all page-tables
1766 * before they are freed.
1767 */
1768 vmalloc_sync_all();
1769
1770 /*
1771 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
1772 * flag. It means that vm_struct is not fully initialized.
1773 * Now, it is fully initialized, so remove this flag here.
1774 */
1775 clear_vm_uninitialized_flag(area);
1776
1777 kmemleak_vmalloc(area, size, gfp_mask);
1778
1779 return addr;
1780
1781fail:
1782 warn_alloc(gfp_mask, NULL,
1783 "vmalloc: allocation failure: %lu bytes", real_size);
1784 return NULL;
1785}
1786
1787/**
1788 * __vmalloc_node - allocate virtually contiguous memory
1789 * @size: allocation size
1790 * @align: desired alignment
1791 * @gfp_mask: flags for the page level allocator
1792 * @prot: protection mask for the allocated pages
1793 * @node: node to use for allocation or NUMA_NO_NODE
1794 * @caller: caller's return address
1795 *
1796 * Allocate enough pages to cover @size from the page level
1797 * allocator with @gfp_mask flags. Map them into contiguous
1798 * kernel virtual space, using a pagetable protection of @prot.
1799 *
1800 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
1801 * and __GFP_NOFAIL are not supported
1802 *
 * Any use of gfp flags outside of GFP_KERNEL should be discussed
 * with the mm people.
1805 *
1806 */
1807static void *__vmalloc_node(unsigned long size, unsigned long align,
1808 gfp_t gfp_mask, pgprot_t prot,
1809 int node, const void *caller)
1810{
1811 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1812 gfp_mask, prot, 0, node, caller);
1813}
1814
1815void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1816{
1817 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1818 __builtin_return_address(0));
1819}
1820EXPORT_SYMBOL(__vmalloc);
1821
1822static inline void *__vmalloc_node_flags(unsigned long size,
1823 int node, gfp_t flags)
1824{
1825 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1826 node, __builtin_return_address(0));
1827}
1828
1829
1830void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
1831 void *caller)
1832{
1833 return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
1834}
1835
1836/**
1837 * vmalloc - allocate virtually contiguous memory
1838 * @size: allocation size
1839 * Allocate enough pages to cover @size from the page level
1840 * allocator and map them into contiguous kernel virtual space.
1841 *
1842 * For tight control over page level allocator and protection flags
1843 * use __vmalloc() instead.
1844 */
1845void *vmalloc(unsigned long size)
1846{
1847 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1848 GFP_KERNEL);
1849}
1850EXPORT_SYMBOL(vmalloc);
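
/*
 * A minimal usage sketch (the struct name "item" and the count "nr_items"
 * are hypothetical): a large, physically non-contiguous table that is never
 * handed to hardware is a typical vmalloc() user.
 *
 *        struct item *tbl = vmalloc(nr_items * sizeof(*tbl));
 *
 *        if (!tbl)
 *                return -ENOMEM;
 *        ...
 *        vfree(tbl);
 */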
1851
1852/**
1853 * vzalloc - allocate virtually contiguous memory with zero fill
1854 * @size: allocation size
1855 * Allocate enough pages to cover @size from the page level
1856 * allocator and map them into contiguous kernel virtual space.
1857 * The memory allocated is set to zero.
1858 *
1859 * For tight control over page level allocator and protection flags
1860 * use __vmalloc() instead.
1861 */
1862void *vzalloc(unsigned long size)
1863{
1864 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1865 GFP_KERNEL | __GFP_ZERO);
1866}
1867EXPORT_SYMBOL(vzalloc);
1868
1869/**
1870 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1871 * @size: allocation size
1872 *
1873 * The resulting memory area is zeroed so it can be mapped to userspace
1874 * without leaking data.
1875 */
1876void *vmalloc_user(unsigned long size)
1877{
1878 struct vm_struct *area;
1879 void *ret;
1880
1881 ret = __vmalloc_node(size, SHMLBA,
1882 GFP_KERNEL | __GFP_ZERO,
1883 PAGE_KERNEL, NUMA_NO_NODE,
1884 __builtin_return_address(0));
1885 if (ret) {
1886 area = find_vm_area(ret);
1887 area->flags |= VM_USERMAP;
1888 }
1889 return ret;
1890}
1891EXPORT_SYMBOL(vmalloc_user);
1892
1893/**
1894 * vmalloc_node - allocate memory on a specific node
1895 * @size: allocation size
1896 * @node: numa node
1897 *
1898 * Allocate enough pages to cover @size from the page level
1899 * allocator and map them into contiguous kernel virtual space.
1900 *
1901 * For tight control over page level allocator and protection flags
1902 * use __vmalloc() instead.
1903 */
1904void *vmalloc_node(unsigned long size, int node)
1905{
1906 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
1907 node, __builtin_return_address(0));
1908}
1909EXPORT_SYMBOL(vmalloc_node);
1910
1911/**
1912 * vzalloc_node - allocate memory on a specific node with zero fill
1913 * @size: allocation size
1914 * @node: numa node
1915 *
1916 * Allocate enough pages to cover @size from the page level
1917 * allocator and map them into contiguous kernel virtual space.
1918 * The memory allocated is set to zero.
1919 *
1920 * For tight control over page level allocator and protection flags
1921 * use __vmalloc_node() instead.
1922 */
1923void *vzalloc_node(unsigned long size, int node)
1924{
1925 return __vmalloc_node_flags(size, node,
1926 GFP_KERNEL | __GFP_ZERO);
1927}
1928EXPORT_SYMBOL(vzalloc_node);
1929
1930/**
1931 * vmalloc_exec - allocate virtually contiguous, executable memory
1932 * @size: allocation size
1933 *
 * Kernel-internal function to allocate enough pages to cover @size
 * from the page level allocator and map them into contiguous and
1936 * executable kernel virtual space.
1937 *
1938 * For tight control over page level allocator and protection flags
1939 * use __vmalloc() instead.
1940 */
1941
1942void *vmalloc_exec(unsigned long size)
1943{
1944 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
1945 NUMA_NO_NODE, __builtin_return_address(0));
1946}
1947
1948#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
1949#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
1950#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
1951#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
1952#else
1953/*
1954 * 64-bit systems should always have either a DMA or a DMA32 zone. For others,
1955 * GFP_DMA32 should do the right thing and use the normal zone.
1956 */
1957#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
1958#endif
1959
1960/**
1961 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1962 * @size: allocation size
1963 *
1964 * Allocate enough 32bit PA addressable pages to cover @size from the
1965 * page level allocator and map them into contiguous kernel virtual space.
1966 */
1967void *vmalloc_32(unsigned long size)
1968{
1969 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1970 NUMA_NO_NODE, __builtin_return_address(0));
1971}
1972EXPORT_SYMBOL(vmalloc_32);
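/*
 * Example (an illustrative sketch; "frame" and "frame_size" are
 * hypothetical): a driver for hardware that can only address 32-bit
 * physical memory can use this for a large buffer that does not need to
 * be physically contiguous.
 *
 *	void *frame = vmalloc_32(frame_size);
 *	if (!frame)
 *		return -ENOMEM;
 */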
1973
1974/**
1975 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1976 * @size: allocation size
1977 *
1978 * The resulting memory area is 32bit addressable and zeroed so it can be
1979 * mapped to userspace without leaking data.
1980 */
1981void *vmalloc_32_user(unsigned long size)
1982{
1983 struct vm_struct *area;
1984 void *ret;
1985
1986 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1987 NUMA_NO_NODE, __builtin_return_address(0));
1988 if (ret) {
1989 area = find_vm_area(ret);
1990 area->flags |= VM_USERMAP;
1991 }
1992 return ret;
1993}
1994EXPORT_SYMBOL(vmalloc_32_user);
1995
1996/*
1997 * Small helper routine: copy contents from addr into buf.
1998 * If the page is not present, fill with zeroes.
1999 */
2000
2001static int aligned_vread(char *buf, char *addr, unsigned long count)
2002{
2003 struct page *p;
2004 int copied = 0;
2005
2006 while (count) {
2007 unsigned long offset, length;
2008
2009 offset = offset_in_page(addr);
2010 length = PAGE_SIZE - offset;
2011 if (length > count)
2012 length = count;
2013 p = vmalloc_to_page(addr);
2014 /*
2015 * To do safe access to this _mapped_ area, we need a
2016 * lock. But adding a lock here means adding the
2017 * overhead of vmalloc()/vfree() calls to this rarely used
2018 * _debug_ interface. Instead of that, we'll use
2019 * kmap_atomic() and accept a small overhead in this access function.
2020 */
2021 if (p) {
2022 /*
2023 * we can expect USER0 is not used (see vread/vwrite's
2024 * function description)
2025 */
2026 void *map = kmap_atomic(p);
2027 memcpy(buf, map + offset, length);
2028 kunmap_atomic(map);
2029 } else
2030 memset(buf, 0, length);
2031
2032 addr += length;
2033 buf += length;
2034 copied += length;
2035 count -= length;
2036 }
2037 return copied;
2038}
2039
2040static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2041{
2042 struct page *p;
2043 int copied = 0;
2044
2045 while (count) {
2046 unsigned long offset, length;
2047
2048 offset = offset_in_page(addr);
2049 length = PAGE_SIZE - offset;
2050 if (length > count)
2051 length = count;
2052 p = vmalloc_to_page(addr);
2053 /*
2054 * To do safe access to this _mapped_ area, we need a
2055 * lock. But adding a lock here means adding the
2056 * overhead of vmalloc()/vfree() calls to this rarely used
2057 * _debug_ interface. Instead of that, we'll use
2058 * kmap_atomic() and accept a small overhead in this access function.
2059 */
2060 if (p) {
2061 /*
2062 * we can expect USER0 is not used (see vread/vwrite's
2063 * function description)
2064 */
2065 void *map = kmap_atomic(p);
2066 memcpy(map + offset, buf, length);
2067 kunmap_atomic(map);
2068 }
2069 addr += length;
2070 buf += length;
2071 copied += length;
2072 count -= length;
2073 }
2074 return copied;
2075}
2076
2077/**
2078 * vread() - read vmalloc area in a safe way.
2079 * @buf: buffer for reading data
2080 * @addr: vm address.
2081 * @count: number of bytes to be read.
2082 *
2083 * Returns the number of bytes by which @addr and @buf should be
2084 * increased (the same number as @count). Returns 0 if [addr...addr+count)
2085 * does not intersect any live vmalloc area.
2086 *
2087 * This function checks that addr is a valid vmalloc'ed area, and
2088 * copies data from that area to a given buffer. If the given memory range
2089 * of [addr...addr+count) includes some valid address, data is copied to
2090 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
2091 * IOREMAP areas are treated as memory holes and no copy is done.
2092 *
2093 * If [addr...addr+count) does not intersect any live
2094 * vm_struct area, returns 0. @buf should be a kernel buffer.
2095 *
2096 * Note: In usual ops, vread() is never necessary because the caller
2097 * should know the vmalloc() area is valid and can use memcpy().
2098 * This is for routines which have to access the vmalloc area without
2099 * any information, such as /dev/kmem.
2100 *
2101 */
2103long vread(char *buf, char *addr, unsigned long count)
2104{
2105 struct vmap_area *va;
2106 struct vm_struct *vm;
2107 char *vaddr, *buf_start = buf;
2108 unsigned long buflen = count;
2109 unsigned long n;
2110
2111 /* Don't allow overflow */
2112 if ((unsigned long) addr + count < count)
2113 count = -(unsigned long) addr;
2114
2115 spin_lock(&vmap_area_lock);
2116 list_for_each_entry(va, &vmap_area_list, list) {
2117 if (!count)
2118 break;
2119
2120 if (!(va->flags & VM_VM_AREA))
2121 continue;
2122
2123 vm = va->vm;
2124 vaddr = (char *) vm->addr;
2125 if (addr >= vaddr + get_vm_area_size(vm))
2126 continue;
2127 while (addr < vaddr) {
2128 if (count == 0)
2129 goto finished;
2130 *buf = '\0';
2131 buf++;
2132 addr++;
2133 count--;
2134 }
2135 n = vaddr + get_vm_area_size(vm) - addr;
2136 if (n > count)
2137 n = count;
2138 if (!(vm->flags & VM_IOREMAP))
2139 aligned_vread(buf, addr, n);
2140 else /* IOREMAP area is treated as memory hole */
2141 memset(buf, 0, n);
2142 buf += n;
2143 addr += n;
2144 count -= n;
2145 }
2146finished:
2147 spin_unlock(&vmap_area_lock);
2148
2149 if (buf == buf_start)
2150 return 0;
2151 /* zero-fill memory holes */
2152 if (buf != buf_start + buflen)
2153 memset(buf, 0, buflen - (buf - buf_start));
2154
2155 return buflen;
2156}
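/*
 * Example (an illustrative sketch; "vaddr" and "n" are hypothetical): a
 * debug-style reader does not need to know whether every page in
 * [vaddr, vaddr + n) is populated; holes simply come back zero-filled,
 * and a return value of 0 means no live vmalloc area was touched.
 * vwrite() below is the mirror image for writes.
 *
 *	char *kbuf = kmalloc(n, GFP_KERNEL);
 *
 *	if (kbuf && !vread(kbuf, vaddr, n))
 *		pr_debug("no vmalloc area at %p\n", vaddr);
 */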
2157
2158/**
2159 * vwrite() - write vmalloc area in a safe way.
2160 * @buf: buffer for source data
2161 * @addr: vm address.
2162 * @count: number of bytes to be written.
2163 *
2164 * Returns the number of bytes by which @addr and @buf should be
2165 * increased (the same number as @count).
2166 * If [addr...addr+count) does not intersect any valid
2167 * vmalloc area, returns 0.
2168 *
2169 * This function checks that addr is a valid vmalloc'ed area, and
2170 * copies data from a buffer to the given addr. If the specified range of
2171 * [addr...addr+count) includes some valid address, data is copied from
2172 * the proper area of @buf. If there are memory holes, nothing is copied to them.
2173 * IOREMAP areas are treated as memory holes and no copy is done.
2174 *
2175 * If [addr...addr+count) does not intersect any live
2176 * vm_struct area, returns 0. @buf should be a kernel buffer.
2177 *
2178 * Note: In usual ops, vwrite() is never necessary because the caller
2179 * should know the vmalloc() area is valid and can use memcpy().
2180 * This is for routines which have to access the vmalloc area without
2181 * any information, such as /dev/kmem.
2182 */
2184long vwrite(char *buf, char *addr, unsigned long count)
2185{
2186 struct vmap_area *va;
2187 struct vm_struct *vm;
2188 char *vaddr;
2189 unsigned long n, buflen;
2190 int copied = 0;
2191
2192 /* Don't allow overflow */
2193 if ((unsigned long) addr + count < count)
2194 count = -(unsigned long) addr;
2195 buflen = count;
2196
2197 spin_lock(&vmap_area_lock);
2198 list_for_each_entry(va, &vmap_area_list, list) {
2199 if (!count)
2200 break;
2201
2202 if (!(va->flags & VM_VM_AREA))
2203 continue;
2204
2205 vm = va->vm;
2206 vaddr = (char *) vm->addr;
2207 if (addr >= vaddr + get_vm_area_size(vm))
2208 continue;
2209 while (addr < vaddr) {
2210 if (count == 0)
2211 goto finished;
2212 buf++;
2213 addr++;
2214 count--;
2215 }
2216 n = vaddr + get_vm_area_size(vm) - addr;
2217 if (n > count)
2218 n = count;
2219 if (!(vm->flags & VM_IOREMAP)) {
2220 aligned_vwrite(buf, addr, n);
2221 copied++;
2222 }
2223 buf += n;
2224 addr += n;
2225 count -= n;
2226 }
2227finished:
2228 spin_unlock(&vmap_area_lock);
2229 if (!copied)
2230 return 0;
2231 return buflen;
2232}
2233
2234/**
2235 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2236 * @vma: vma to cover
2237 * @uaddr: target user address to start at
2238 * @kaddr: virtual address of vmalloc kernel memory
2239 * @size: size of map area
2240 *
2241 * Returns: 0 for success, -Exxx on failure
2242 *
2243 * This function checks that @kaddr is a valid vmalloc'ed area,
2244 * and that it is big enough to cover the range starting at
2245 * @uaddr in @vma. Will return failure if those criteria aren't
2246 * met.
2247 *
2248 * Similar to remap_pfn_range() (see mm/memory.c)
2249 */
2250int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2251 void *kaddr, unsigned long size)
2252{
2253 struct vm_struct *area;
2254
2255 size = PAGE_ALIGN(size);
2256
2257 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2258 return -EINVAL;
2259
2260 area = find_vm_area(kaddr);
2261 if (!area)
2262 return -EINVAL;
2263
2264 if (!(area->flags & VM_USERMAP))
2265 return -EINVAL;
2266
2267 if (kaddr + size > area->addr + get_vm_area_size(area))
2268 return -EINVAL;
2269
2270 do {
2271 struct page *page = vmalloc_to_page(kaddr);
2272 int ret;
2273
2274 ret = vm_insert_page(vma, uaddr, page);
2275 if (ret)
2276 return ret;
2277
2278 uaddr += PAGE_SIZE;
2279 kaddr += PAGE_SIZE;
2280 size -= PAGE_SIZE;
2281 } while (size > 0);
2282
2283 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2284
2285 return 0;
2286}
2287EXPORT_SYMBOL(remap_vmalloc_range_partial);
2288
2289/**
2290 * remap_vmalloc_range - map vmalloc pages to userspace
2291 * @vma: vma to cover (map full range of vma)
2292 * @addr: vmalloc memory
2293 * @pgoff: number of pages into addr before first page to map
2294 *
2295 * Returns: 0 for success, -Exxx on failure
2296 *
2297 * This function checks that addr is a valid vmalloc'ed area, and
2298 * that it is big enough to cover the vma. Will return failure if
2299 * that criterion isn't met.
2300 *
2301 * Similar to remap_pfn_range() (see mm/memory.c)
2302 */
2303int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2304 unsigned long pgoff)
2305{
2306 return remap_vmalloc_range_partial(vma, vma->vm_start,
2307 addr + (pgoff << PAGE_SHIFT),
2308 vma->vm_end - vma->vm_start);
2309}
2310EXPORT_SYMBOL(remap_vmalloc_range);
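/*
 * Example (an illustrative sketch; "my_dev_mmap", "struct my_dev" and
 * "dev->buf" are hypothetical): a character device mmap handler exposing a
 * buffer that was allocated with vmalloc_user(), so VM_USERMAP is already
 * set on the area.
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *
 *		return remap_vmalloc_range(vma, dev->buf, vma->vm_pgoff);
 *	}
 */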
2311
2312/*
2313 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
2314 * have one.
2315 *
2316 * The purpose of this function is to make sure the vmalloc area
2317 * mappings are identical in all page-tables in the system.
2318 */
2319void __weak vmalloc_sync_all(void)
2320{
2321}
2322
2324static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2325{
2326 pte_t ***p = data;
2327
2328 if (p) {
2329 *(*p) = pte;
2330 (*p)++;
2331 }
2332 return 0;
2333}
2334
2335/**
2336 * alloc_vm_area - allocate a range of kernel address space
2337 * @size: size of the area
2338 * @ptes: returns the PTEs for the address space
2339 *
2340 * Returns: NULL on failure, vm_struct on success
2341 *
2342 * This function reserves a range of kernel address space, and
2343 * allocates pagetables to map that range. No actual mappings
2344 * are created.
2345 *
2346 * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2347 * allocated for the VM area are returned.
2348 */
2349struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2350{
2351 struct vm_struct *area;
2352
2353 area = get_vm_area_caller(size, VM_IOREMAP,
2354 __builtin_return_address(0));
2355 if (area == NULL)
2356 return NULL;
2357
2358 /*
2359 * This ensures that page tables are constructed for this region
2360 * of kernel virtual address space and mapped into init_mm.
2361 */
2362 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2363 size, f, ptes ? &ptes : NULL)) {
2364 free_vm_area(area);
2365 return NULL;
2366 }
2367
2368 return area;
2369}
2370EXPORT_SYMBOL_GPL(alloc_vm_area);
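/*
 * Example (an illustrative sketch; "nr_pages" is hypothetical): reserve
 * kernel address space with page tables but no mappings, and hand the PTE
 * pointers back to the caller to populate later (the Xen grant table code
 * uses it in this way).
 *
 *	pte_t **ptes = kcalloc(nr_pages, sizeof(*ptes), GFP_KERNEL);
 *	struct vm_struct *area;
 *
 *	area = alloc_vm_area(nr_pages * PAGE_SIZE, ptes);
 *	if (!area)
 *		return -ENOMEM;
 *	... fill ptes[0..nr_pages-1], tear down with free_vm_area(area) ...
 */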
2371
2372void free_vm_area(struct vm_struct *area)
2373{
2374 struct vm_struct *ret;
2375 ret = remove_vm_area(area->addr);
2376 BUG_ON(ret != area);
2377 kfree(area);
2378}
2379EXPORT_SYMBOL_GPL(free_vm_area);
2380
2381#ifdef CONFIG_SMP
2382static struct vmap_area *node_to_va(struct rb_node *n)
2383{
2384 return rb_entry_safe(n, struct vmap_area, rb_node);
2385}
2386
2387/**
2388 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2389 * @end: target address
2390 * @pnext: out arg for the next vmap_area
2391 * @pprev: out arg for the previous vmap_area
2392 *
2393 * Returns: %true if either or both of next and prev are found,
2394 * %false if no vmap_area exists
2395 *
2396 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2397 * NULL, (*pnext)->va_end > @end and (*pprev)->va_end <= @end.
2398 */
2399static bool pvm_find_next_prev(unsigned long end,
2400 struct vmap_area **pnext,
2401 struct vmap_area **pprev)
2402{
2403 struct rb_node *n = vmap_area_root.rb_node;
2404 struct vmap_area *va = NULL;
2405
2406 while (n) {
2407 va = rb_entry(n, struct vmap_area, rb_node);
2408 if (end < va->va_end)
2409 n = n->rb_left;
2410 else if (end > va->va_end)
2411 n = n->rb_right;
2412 else
2413 break;
2414 }
2415
2416 if (!va)
2417 return false;
2418
2419 if (va->va_end > end) {
2420 *pnext = va;
2421 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2422 } else {
2423 *pprev = va;
2424 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2425 }
2426 return true;
2427}
2428
2429/**
2430 * pvm_determine_end - find the highest aligned address between two vmap_areas
2431 * @pnext: in/out arg for the next vmap_area
2432 * @pprev: in/out arg for the previous vmap_area
2433 * @align: alignment
2434 *
2435 * Returns: determined end address
2436 *
2437 * Find the highest aligned address between *@pnext and *@pprev below
2438 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2439 * down address is between the end addresses of the two vmap_areas.
2440 *
2441 * Please note that the address returned by this function may fall
2442 * inside *@pnext vmap_area. The caller is responsible for checking
2443 * that.
2444 */
2445static unsigned long pvm_determine_end(struct vmap_area **pnext,
2446 struct vmap_area **pprev,
2447 unsigned long align)
2448{
2449 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2450 unsigned long addr;
2451
2452 if (*pnext)
2453 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2454 else
2455 addr = vmalloc_end;
2456
2457 while (*pprev && (*pprev)->va_end > addr) {
2458 *pnext = *pprev;
2459 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2460 }
2461
2462 return addr;
2463}
2464
2465/**
2466 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2467 * @offsets: array containing offset of each area
2468 * @sizes: array containing size of each area
2469 * @nr_vms: the number of areas to allocate
2470 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2471 *
2472 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2473 * vm_structs on success, %NULL on failure
2474 *
2475 * Percpu allocator wants to use congruent vm areas so that it can
2476 * maintain the offsets among percpu areas. This function allocates
2477 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2478 * be scattered pretty far, with the distance between two areas easily
2479 * going up to gigabytes. To avoid interacting with regular vmallocs,
2480 * these areas are allocated from the top.
2481 *
2482 * Despite its complicated look, this allocator is rather simple. It
2483 * does everything top-down and scans areas from the end looking for a
2484 * matching slot. While scanning, if any of the areas overlaps with an
2485 * existing vmap_area, the base address is pulled down to fit the
2486 * area. Scanning is repeated until all the areas fit and then all
2487 * necessary data structures are inserted and the result is returned.
2488 */
2489struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2490 const size_t *sizes, int nr_vms,
2491 size_t align)
2492{
2493 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2494 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2495 struct vmap_area **vas, *prev, *next;
2496 struct vm_struct **vms;
2497 int area, area2, last_area, term_area;
2498 unsigned long base, start, end, last_end;
2499 bool purged = false;
2500
2501 /* verify parameters and allocate data structures */
2502 BUG_ON(offset_in_page(align) || !is_power_of_2(align));
2503 for (last_area = 0, area = 0; area < nr_vms; area++) {
2504 start = offsets[area];
2505 end = start + sizes[area];
2506
2507 /* is everything aligned properly? */
2508 BUG_ON(!IS_ALIGNED(offsets[area], align));
2509 BUG_ON(!IS_ALIGNED(sizes[area], align));
2510
2511 /* detect the area with the highest address */
2512 if (start > offsets[last_area])
2513 last_area = area;
2514
2515 for (area2 = area + 1; area2 < nr_vms; area2++) {
2516 unsigned long start2 = offsets[area2];
2517 unsigned long end2 = start2 + sizes[area2];
2518
2519 BUG_ON(start2 < end && start < end2);
2520 }
2521 }
2522 last_end = offsets[last_area] + sizes[last_area];
2523
2524 if (vmalloc_end - vmalloc_start < last_end) {
2525 WARN_ON(true);
2526 return NULL;
2527 }
2528
2529 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2530 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2531 if (!vas || !vms)
2532 goto err_free2;
2533
2534 for (area = 0; area < nr_vms; area++) {
2535 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2536 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2537 if (!vas[area] || !vms[area])
2538 goto err_free;
2539 }
2540retry:
2541 spin_lock(&vmap_area_lock);
2542
2543 /* start scanning - we scan from the top, begin with the last area */
2544 area = term_area = last_area;
2545 start = offsets[area];
2546 end = start + sizes[area];
2547
2548 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2549 base = vmalloc_end - last_end;
2550 goto found;
2551 }
2552 base = pvm_determine_end(&next, &prev, align) - end;
2553
2554 while (true) {
2555 BUG_ON(next && next->va_end <= base + end);
2556 BUG_ON(prev && prev->va_end > base + end);
2557
2558 /*
2559 * base might have underflowed, add last_end before
2560 * comparing.
2561 */
2562 if (base + last_end < vmalloc_start + last_end) {
2563 spin_unlock(&vmap_area_lock);
2564 if (!purged) {
2565 purge_vmap_area_lazy();
2566 purged = true;
2567 goto retry;
2568 }
2569 goto err_free;
2570 }
2571
2572 /*
2573 * If next overlaps, move base downwards so that it's
2574 * right below next and then recheck.
2575 */
2576 if (next && next->va_start < base + end) {
2577 base = pvm_determine_end(&next, &prev, align) - end;
2578 term_area = area;
2579 continue;
2580 }
2581
2582 /*
2583 * If prev overlaps, shift down next and prev and move
2584 * base so that it's right below new next and then
2585 * recheck.
2586 */
2587 if (prev && prev->va_end > base + start) {
2588 next = prev;
2589 prev = node_to_va(rb_prev(&next->rb_node));
2590 base = pvm_determine_end(&next, &prev, align) - end;
2591 term_area = area;
2592 continue;
2593 }
2594
2595 /*
2596 * This area fits, move on to the previous one. If
2597 * the previous one is the terminal one, we're done.
2598 */
2599 area = (area + nr_vms - 1) % nr_vms;
2600 if (area == term_area)
2601 break;
2602 start = offsets[area];
2603 end = start + sizes[area];
2604 pvm_find_next_prev(base + end, &next, &prev);
2605 }
2606found:
2607 /* we've found a fitting base, insert all va's */
2608 for (area = 0; area < nr_vms; area++) {
2609 struct vmap_area *va = vas[area];
2610
2611 va->va_start = base + offsets[area];
2612 va->va_end = va->va_start + sizes[area];
2613 __insert_vmap_area(va);
2614 }
2615
2616 vmap_area_pcpu_hole = base + offsets[last_area];
2617
2618 spin_unlock(&vmap_area_lock);
2619
2620 /* insert all vm's */
2621 for (area = 0; area < nr_vms; area++)
2622 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2623 pcpu_get_vm_areas);
2624
2625 kfree(vas);
2626 return vms;
2627
2628err_free:
2629 for (area = 0; area < nr_vms; area++) {
2630 kfree(vas[area]);
2631 kfree(vms[area]);
2632 }
2633err_free2:
2634 kfree(vas);
2635 kfree(vms);
2636 return NULL;
2637}
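/*
 * Example (an illustrative sketch; the group layout is made up): the
 * percpu allocator requests one congruent area per allocation group,
 * e.g. two groups whose chunks sit 1MB apart in percpu offset space:
 *
 *	unsigned long offsets[] = { 0, 1UL << 20 };
 *	size_t sizes[] = { 64 << 10, 64 << 10 };
 *	struct vm_struct **vms;
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
 *	if (!vms)
 *		return -ENOMEM;
 *	... use vms[0]->addr and vms[1]->addr ...
 *	pcpu_free_vm_areas(vms, 2);
 */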
2638
2639/**
2640 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2641 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2642 * @nr_vms: the number of allocated areas
2643 *
2644 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2645 */
2646void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2647{
2648 int i;
2649
2650 for (i = 0; i < nr_vms; i++)
2651 free_vm_area(vms[i]);
2652 kfree(vms);
2653}
2654#endif /* CONFIG_SMP */
2655
2656#ifdef CONFIG_PROC_FS
2657static void *s_start(struct seq_file *m, loff_t *pos)
2658 __acquires(&vmap_area_lock)
2659{
2660 spin_lock(&vmap_area_lock);
2661 return seq_list_start(&vmap_area_list, *pos);
2662}
2663
2664static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2665{
2666 return seq_list_next(p, &vmap_area_list, pos);
2667}
2668
2669static void s_stop(struct seq_file *m, void *p)
2670 __releases(&vmap_area_lock)
2671{
2672 spin_unlock(&vmap_area_lock);
2673}
2674
2675static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2676{
2677 if (IS_ENABLED(CONFIG_NUMA)) {
2678 unsigned int nr, *counters = m->private;
2679
2680 if (!counters)
2681 return;
2682
2683 if (v->flags & VM_UNINITIALIZED)
2684 return;
2685 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2686 smp_rmb();
2687
2688 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2689
2690 for (nr = 0; nr < v->nr_pages; nr++)
2691 counters[page_to_nid(v->pages[nr])]++;
2692
2693 for_each_node_state(nr, N_HIGH_MEMORY)
2694 if (counters[nr])
2695 seq_printf(m, " N%u=%u", nr, counters[nr]);
2696 }
2697}
2698
2699static int s_show(struct seq_file *m, void *p)
2700{
2701 struct vmap_area *va;
2702 struct vm_struct *v;
2703
2704 va = list_entry(p, struct vmap_area, list);
2705
2706 /*
2707 * s_show can race with remove_vm_area(); !VM_VM_AREA means the vmap
2708 * area is being torn down, or that this is a vm_map_ram allocation.
2709 */
2710 if (!(va->flags & VM_VM_AREA)) {
2711 seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
2712 (void *)va->va_start, (void *)va->va_end,
2713 va->va_end - va->va_start,
2714 va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
2715
2716 return 0;
2717 }
2718
2719 v = va->vm;
2720
2721 seq_printf(m, "0x%pK-0x%pK %7ld",
2722 v->addr, v->addr + v->size, v->size);
2723
2724 if (v->caller)
2725 seq_printf(m, " %pS", v->caller);
2726
2727 if (v->nr_pages)
2728 seq_printf(m, " pages=%d", v->nr_pages);
2729
2730 if (v->phys_addr)
2731 seq_printf(m, " phys=%pa", &v->phys_addr);
2732
2733 if (v->flags & VM_IOREMAP)
2734 seq_puts(m, " ioremap");
2735
2736 if (v->flags & VM_ALLOC)
2737 seq_puts(m, " vmalloc");
2738
2739 if (v->flags & VM_MAP)
2740 seq_puts(m, " vmap");
2741
2742 if (v->flags & VM_USERMAP)
2743 seq_puts(m, " user");
2744
2745 if (is_vmalloc_addr(v->pages))
2746 seq_puts(m, " vpages");
2747
2748 show_numa_info(m, v);
2749 seq_putc(m, '\n');
2750 return 0;
2751}
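/*
 * Each /proc/vmallocinfo line emitted above therefore has the form
 * (illustrative, fields in brackets only appear when set):
 *
 *	<start>-<end> <size in bytes> [<caller>] [pages=<n>] [phys=<addr>]
 *	[ioremap] [vmalloc] [vmap] [user] [vpages] [N<node>=<pages> ...]
 *
 * while plain vm_map_ram or not-yet-purged ranges are reported as
 * "vm_map_ram" or "unpurged vm_area" in place of the caller and flags.
 */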
2752
2753static const struct seq_operations vmalloc_op = {
2754 .start = s_start,
2755 .next = s_next,
2756 .stop = s_stop,
2757 .show = s_show,
2758};
2759
2760static int __init proc_vmalloc_init(void)
2761{
2762 if (IS_ENABLED(CONFIG_PROC_STRIPPED))
2763 return 0;
2764 if (IS_ENABLED(CONFIG_NUMA))
2765 proc_create_seq_private("vmallocinfo", 0400, NULL,
2766 &vmalloc_op,
2767 nr_node_ids * sizeof(unsigned int), NULL);
2768 else
2769 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
2770 return 0;
2771}
2772module_init(proc_vmalloc_init);
2773
2774#endif
2775