Blame - src/kernel/linux/v4.14/mm/vmalloc.c - T103

blob: e8c64dddfecd24892a990de8880a390df3622851 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* linux/mm/vmalloc.c
				3	*
				4	* Copyright (C) 1993 Linus Torvalds
				5	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
				6	* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
				7	* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
				8	* Numa awareness, Christoph Lameter, SGI, June 2005
				9	*/
				10
				11	#include <linux/vmalloc.h>
				12	#include <linux/mm.h>
				13	#include <linux/module.h>
				14	#include <linux/highmem.h>
				15	#include <linux/sched/signal.h>
				16	#include <linux/slab.h>
				17	#include <linux/spinlock.h>
				18	#include <linux/interrupt.h>
				19	#include <linux/proc_fs.h>
				20	#include <linux/seq_file.h>
				21	#include <linux/debugobjects.h>
				22	#include <linux/kallsyms.h>
				23	#include <linux/list.h>
				24	#include <linux/notifier.h>
				25	#include <linux/rbtree.h>
				26	#include <linux/radix-tree.h>
				27	#include <linux/rcupdate.h>
				28	#include <linux/pfn.h>
				29	#include <linux/kmemleak.h>
				30	#include <linux/atomic.h>
				31	#include <linux/compiler.h>
				32	#include <linux/llist.h>
				33	#include <linux/bitops.h>
				34	#include <linux/overflow.h>
				35
				36	#include <linux/uaccess.h>
				37	#include <asm/tlbflush.h>
				38	#include <asm/shmparam.h>
				39
				40	#include "internal.h"
				41
				42	struct vfree_deferred {
				43	struct llist_head list;
				44	struct work_struct wq;
				45	};
				46	static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
				47
				48	static void __vunmap(const void *, int);
				49
				50	static void free_work(struct work_struct *w)
				51	{
				52	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
				53	struct llist_node t, llnode;
				54
				55	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
				56	__vunmap((void *)llnode, 1);
				57	}
				58
				59	/* Page table manipulation functions */
				60
				61	static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
				62	{
				63	pte_t *pte;
				64
				65	pte = pte_offset_kernel(pmd, addr);
				66	do {
				67	pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
				68	WARN_ON(!pte_none(ptent) && !pte_present(ptent));
				69	} while (pte++, addr += PAGE_SIZE, addr != end);
				70	}
				71
				72	static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
				73	{
				74	pmd_t *pmd;
				75	unsigned long next;
				76
				77	pmd = pmd_offset(pud, addr);
				78	do {
				79	next = pmd_addr_end(addr, end);
				80	if (pmd_clear_huge(pmd))
				81	continue;
				82	if (pmd_none_or_clear_bad(pmd))
				83	continue;
				84	vunmap_pte_range(pmd, addr, next);
				85	} while (pmd++, addr = next, addr != end);
				86	}
				87
				88	static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
				89	{
				90	pud_t *pud;
				91	unsigned long next;
				92
				93	pud = pud_offset(p4d, addr);
				94	do {
				95	next = pud_addr_end(addr, end);
				96	if (pud_clear_huge(pud))
				97	continue;
				98	if (pud_none_or_clear_bad(pud))
				99	continue;
				100	vunmap_pmd_range(pud, addr, next);
				101	} while (pud++, addr = next, addr != end);
				102	}
				103
				104	static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
				105	{
				106	p4d_t *p4d;
				107	unsigned long next;
				108
				109	p4d = p4d_offset(pgd, addr);
				110	do {
				111	next = p4d_addr_end(addr, end);
				112	if (p4d_clear_huge(p4d))
				113	continue;
				114	if (p4d_none_or_clear_bad(p4d))
				115	continue;
				116	vunmap_pud_range(p4d, addr, next);
				117	} while (p4d++, addr = next, addr != end);
				118	}
				119
				120	static void vunmap_page_range(unsigned long addr, unsigned long end)
				121	{
				122	pgd_t *pgd;
				123	unsigned long next;
				124
				125	BUG_ON(addr >= end);
				126	pgd = pgd_offset_k(addr);
				127	do {
				128	next = pgd_addr_end(addr, end);
				129	if (pgd_none_or_clear_bad(pgd))
				130	continue;
				131	vunmap_p4d_range(pgd, addr, next);
				132	} while (pgd++, addr = next, addr != end);
				133	}
				134
				135	static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
				136	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				137	{
				138	pte_t *pte;
				139
				140	/*
				141	* nr is a running index into the array which helps higher level
				142	* callers keep track of where we're up to.
				143	*/
				144
				145	pte = pte_alloc_kernel(pmd, addr);
				146	if (!pte)
				147	return -ENOMEM;
				148	do {
				149	struct page page = pages[nr];
				150
				151	if (WARN_ON(!pte_none(*pte)))
				152	return -EBUSY;
				153	if (WARN_ON(!page))
				154	return -ENOMEM;
				155	set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
				156	(*nr)++;
				157	} while (pte++, addr += PAGE_SIZE, addr != end);
				158	return 0;
				159	}
				160
				161	static int vmap_pmd_range(pud_t *pud, unsigned long addr,
				162	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				163	{
				164	pmd_t *pmd;
				165	unsigned long next;
				166
				167	pmd = pmd_alloc(&init_mm, pud, addr);
				168	if (!pmd)
				169	return -ENOMEM;
				170	do {
				171	next = pmd_addr_end(addr, end);
				172	if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
				173	return -ENOMEM;
				174	} while (pmd++, addr = next, addr != end);
				175	return 0;
				176	}
				177
				178	static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
				179	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				180	{
				181	pud_t *pud;
				182	unsigned long next;
				183
				184	pud = pud_alloc(&init_mm, p4d, addr);
				185	if (!pud)
				186	return -ENOMEM;
				187	do {
				188	next = pud_addr_end(addr, end);
				189	if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
				190	return -ENOMEM;
				191	} while (pud++, addr = next, addr != end);
				192	return 0;
				193	}
				194
				195	static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
				196	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				197	{
				198	p4d_t *p4d;
				199	unsigned long next;
				200
				201	p4d = p4d_alloc(&init_mm, pgd, addr);
				202	if (!p4d)
				203	return -ENOMEM;
				204	do {
				205	next = p4d_addr_end(addr, end);
				206	if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
				207	return -ENOMEM;
				208	} while (p4d++, addr = next, addr != end);
				209	return 0;
				210	}
				211
				212	/*
				213	* Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
				214	* will have pfns corresponding to the "pages" array.
				215	*
				216	* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
				217	*/
				218	static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				219	pgprot_t prot, struct page **pages)
				220	{
				221	pgd_t *pgd;
				222	unsigned long next;
				223	unsigned long addr = start;
				224	int err = 0;
				225	int nr = 0;
				226
				227	BUG_ON(addr >= end);
				228	pgd = pgd_offset_k(addr);
				229	do {
				230	next = pgd_addr_end(addr, end);
				231	err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
				232	if (err)
				233	return err;
				234	} while (pgd++, addr = next, addr != end);
				235
				236	return nr;
				237	}
				238
				239	static int vmap_page_range(unsigned long start, unsigned long end,
				240	pgprot_t prot, struct page **pages)
				241	{
				242	int ret;
				243
				244	ret = vmap_page_range_noflush(start, end, prot, pages);
				245	flush_cache_vmap(start, end);
				246	return ret;
				247	}
				248
				/*
				 * Returns non-zero when @x lies in the module area (only checked when
				 * both CONFIG_MODULES and MODULES_VADDR are defined) or when
				 * is_vmalloc_addr() accepts it.
				 *
				 * ARM, x86-64 and sparc64 put modules in a special place, and fall back
				 * on vmalloc() if that fails. Others just put it in the vmalloc space.
				 */
				int is_vmalloc_or_module_addr(const void *x)
				{
				#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
					unsigned long addr = (unsigned long)x;

					if (addr >= MODULES_VADDR && addr < MODULES_END)
						return 1;
				#endif
					return is_vmalloc_addr(x);
				}
				263
				264	/*
				265	* Walk a vmap address to the struct page it maps.
				266	*/
				267	struct page vmalloc_to_page(const void vmalloc_addr)
				268	{
				269	unsigned long addr = (unsigned long) vmalloc_addr;
				270	struct page *page = NULL;
				271	pgd_t *pgd = pgd_offset_k(addr);
				272	p4d_t *p4d;
				273	pud_t *pud;
				274	pmd_t *pmd;
				275	pte_t *ptep, pte;
				276
				277	/*
				278	* XXX we might need to change this if we add VIRTUAL_BUG_ON for
				279	* architectures that do not vmalloc module space
				280	*/
				281	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
				282
				283	if (pgd_none(*pgd))
				284	return NULL;
				285	p4d = p4d_offset(pgd, addr);
				286	if (p4d_none(*p4d))
				287	return NULL;
				288	pud = pud_offset(p4d, addr);
				289
				290	/*
				291	* Don't dereference bad PUD or PMD (below) entries. This will also
				292	* identify huge mappings, which we may encounter on architectures
				293	* that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
				294	* identified as vmalloc addresses by is_vmalloc_addr(), but are
				295	* not [unambiguously] associated with a struct page, so there is
				296	* no correct value to return for them.
				297	*/
				298	WARN_ON_ONCE(pud_bad(*pud));
				299	if (pud_none(pud) \|\| pud_bad(pud))
				300	return NULL;
				301	pmd = pmd_offset(pud, addr);
				302	WARN_ON_ONCE(pmd_bad(*pmd));
				303	if (pmd_none(pmd) \|\| pmd_bad(pmd))
				304	return NULL;
				305
				306	ptep = pte_offset_map(pmd, addr);
				307	pte = *ptep;
				308	if (pte_present(pte))
				309	page = pte_page(pte);
				310	pte_unmap(ptep);
				311	return page;
				312	}
				313	EXPORT_SYMBOL(vmalloc_to_page);
				314
				/*
				 * Map a vmalloc()-space virtual address to the physical page frame
				 * number. The result of vmalloc_to_page() is handed to page_to_pfn()
				 * without a NULL check.
				 */
				unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
				{
					struct page *page = vmalloc_to_page(vmalloc_addr);

					return page_to_pfn(page);
				}
				EXPORT_SYMBOL(vmalloc_to_pfn);
				323
				324
				325	/* Global kva allocator */
				326
				327	#define VM_LAZY_FREE 0x02
				328	#define VM_VM_AREA 0x04
				329
				330	static DEFINE_SPINLOCK(vmap_area_lock);
				331	/* Export for kexec only */
				332	LIST_HEAD(vmap_area_list);
				333	static LLIST_HEAD(vmap_purge_list);
				334	static struct rb_root vmap_area_root = RB_ROOT;
				335
				336	/* The vmap cache globals are protected by vmap_area_lock */
				337	static struct rb_node *free_vmap_cache;
				338	static unsigned long cached_hole_size;
				339	static unsigned long cached_vstart;
				340	static unsigned long cached_align;
				341
				342	static unsigned long vmap_area_pcpu_hole;
				343
				344	static struct vmap_area *__find_vmap_area(unsigned long addr)
				345	{
				346	struct rb_node *n = vmap_area_root.rb_node;
				347
				348	while (n) {
				349	struct vmap_area *va;
				350
				351	va = rb_entry(n, struct vmap_area, rb_node);
				352	if (addr < va->va_start)
				353	n = n->rb_left;
				354	else if (addr >= va->va_end)
				355	n = n->rb_right;
				356	else
				357	return va;
				358	}
				359
				360	return NULL;
				361	}
				362
				363	static void __insert_vmap_area(struct vmap_area *va)
				364	{
				365	struct rb_node **p = &vmap_area_root.rb_node;
				366	struct rb_node *parent = NULL;
				367	struct rb_node *tmp;
				368
				369	while (*p) {
				370	struct vmap_area *tmp_va;
				371
				372	parent = *p;
				373	tmp_va = rb_entry(parent, struct vmap_area, rb_node);
				374	if (va->va_start < tmp_va->va_end)
				375	p = &(*p)->rb_left;
				376	else if (va->va_end > tmp_va->va_start)
				377	p = &(*p)->rb_right;
				378	else
				379	BUG();
				380	}
				381
				382	rb_link_node(&va->rb_node, parent, p);
				383	rb_insert_color(&va->rb_node, &vmap_area_root);
				384
				385	/* address-sort this list */
				386	tmp = rb_prev(&va->rb_node);
				387	if (tmp) {
				388	struct vmap_area *prev;
				389	prev = rb_entry(tmp, struct vmap_area, rb_node);
				390	list_add_rcu(&va->list, &prev->list);
				391	} else
				392	list_add_rcu(&va->list, &vmap_area_list);
				393	}
				394
				395	static void purge_vmap_area_lazy(void);
				396
				397	static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
				398
				399	/*
				400	* Allocate a region of KVA of the specified size and alignment, within the
				401	* vstart and vend.
				402	*/
				403	static struct vmap_area *alloc_vmap_area(unsigned long size,
				404	unsigned long align,
				405	unsigned long vstart, unsigned long vend,
				406	int node, gfp_t gfp_mask)
				407	{
				408	struct vmap_area *va;
				409	struct rb_node *n;
				410	unsigned long addr;
				411	int purged = 0;
				412	struct vmap_area *first;
				413
				414	BUG_ON(!size);
				415	BUG_ON(offset_in_page(size));
				416	BUG_ON(!is_power_of_2(align));
				417
				418	might_sleep();
				419
				420	va = kmalloc_node(sizeof(struct vmap_area),
				421	gfp_mask & GFP_RECLAIM_MASK, node);
				422	if (unlikely(!va))
				423	return ERR_PTR(-ENOMEM);
				424
				425	/*
				426	* Only scan the relevant parts containing pointers to other objects
				427	* to avoid false negatives.
				428	*/
				429	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
				430
				431	retry:
				432	spin_lock(&vmap_area_lock);
				433	/*
				434	* Invalidate cache if we have more permissive parameters.
				435	* cached_hole_size notes the largest hole noticed _below_
				436	* the vmap_area cached in free_vmap_cache: if size fits
				437	* into that hole, we want to scan from vstart to reuse
				438	* the hole instead of allocating above free_vmap_cache.
				439	* Note that __free_vmap_area may update free_vmap_cache
				440	* without updating cached_hole_size or cached_align.
				441	*/
				442	if (!free_vmap_cache \|\|
				443	size < cached_hole_size \|\|
				444	vstart < cached_vstart \|\|
				445	align < cached_align) {
				446	nocache:
				447	cached_hole_size = 0;
				448	free_vmap_cache = NULL;
				449	}
				450	/* record if we encounter less permissive parameters */
				451	cached_vstart = vstart;
				452	cached_align = align;
				453
				454	/* find starting point for our search */
				455	if (free_vmap_cache) {
				456	first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
				457	addr = ALIGN(first->va_end, align);
				458	if (addr < vstart)
				459	goto nocache;
				460	if (addr + size < addr)
				461	goto overflow;
				462
				463	} else {
				464	addr = ALIGN(vstart, align);
				465	if (addr + size < addr)
				466	goto overflow;
				467
				468	n = vmap_area_root.rb_node;
				469	first = NULL;
				470
				471	while (n) {
				472	struct vmap_area *tmp;
				473	tmp = rb_entry(n, struct vmap_area, rb_node);
				474	if (tmp->va_end >= addr) {
				475	first = tmp;
				476	if (tmp->va_start <= addr)
				477	break;
				478	n = n->rb_left;
				479	} else
				480	n = n->rb_right;
				481	}
				482
				483	if (!first)
				484	goto found;
				485	}
				486
				487	/* from the starting point, walk areas until a suitable hole is found */
				488	while (addr + size > first->va_start && addr + size <= vend) {
				489	if (addr + cached_hole_size < first->va_start)
				490	cached_hole_size = first->va_start - addr;
				491	addr = ALIGN(first->va_end, align);
				492	if (addr + size < addr)
				493	goto overflow;
				494
				495	if (list_is_last(&first->list, &vmap_area_list))
				496	goto found;
				497
				498	first = list_next_entry(first, list);
				499	}
				500
				501	found:
				502	/*
				503	* Check also calculated address against the vstart,
				504	* because it can be 0 because of big align request.
				505	*/
				506	if (addr + size > vend \|\| addr < vstart)
				507	goto overflow;
				508
				509	va->va_start = addr;
				510	va->va_end = addr + size;
				511	va->flags = 0;
				512	__insert_vmap_area(va);
				513	free_vmap_cache = &va->rb_node;
				514	spin_unlock(&vmap_area_lock);
				515
				516	BUG_ON(!IS_ALIGNED(va->va_start, align));
				517	BUG_ON(va->va_start < vstart);
				518	BUG_ON(va->va_end > vend);
				519
				520	return va;
				521
				522	overflow:
				523	spin_unlock(&vmap_area_lock);
				524	if (!purged) {
				525	purge_vmap_area_lazy();
				526	purged = 1;
				527	goto retry;
				528	}
				529
				530	if (gfpflags_allow_blocking(gfp_mask)) {
				531	unsigned long freed = 0;
				532	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
				533	if (freed > 0) {
				534	purged = 0;
				535	goto retry;
				536	}
				537	}
				538
				539	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
				540	pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
				541	size);
				542	kfree(va);
				543	return ERR_PTR(-EBUSY);
				544	}
				545
				546	int register_vmap_purge_notifier(struct notifier_block *nb)
				547	{
				548	return blocking_notifier_chain_register(&vmap_notify_list, nb);
				549	}
				550	EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
				551
				552	int unregister_vmap_purge_notifier(struct notifier_block *nb)
				553	{
				554	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
				555	}
				556	EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
				557
				558	static void __free_vmap_area(struct vmap_area *va)
				559	{
				560	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
				561
				562	if (free_vmap_cache) {
				563	if (va->va_end < cached_vstart) {
				564	free_vmap_cache = NULL;
				565	} else {
				566	struct vmap_area *cache;
				567	cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
				568	if (va->va_start <= cache->va_start) {
				569	free_vmap_cache = rb_prev(&va->rb_node);
				570	/*
				571	* We don't try to update cached_hole_size or
				572	* cached_align, but it won't go very wrong.
				573	*/
				574	}
				575	}
				576	}
				577	rb_erase(&va->rb_node, &vmap_area_root);
				578	RB_CLEAR_NODE(&va->rb_node);
				579	list_del_rcu(&va->list);
				580
				581	/*
				582	* Track the highest possible candidate for pcpu area
				583	* allocation. Areas outside of vmalloc area can be returned
				584	* here too, consider only end addresses which fall inside
				585	* vmalloc area proper.
				586	*/
				587	if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
				588	vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
				589
				590	kfree_rcu(va, rcu_head);
				591	}
				592
				593	/*
				594	* Free a region of KVA allocated by alloc_vmap_area
				595	*/
				596	static void free_vmap_area(struct vmap_area *va)
				597	{
				598	spin_lock(&vmap_area_lock);
				599	__free_vmap_area(va);
				600	spin_unlock(&vmap_area_lock);
				601	}
				602
				603	/*
				604	* Clear the pagetable entries of a given vmap_area
				605	*/
				606	static void unmap_vmap_area(struct vmap_area *va)
				607	{
				608	vunmap_page_range(va->va_start, va->va_end);
				609	}
				610
				/*
				 * Unmap page tables and force a TLB flush immediately if pagealloc
				 * debugging is enabled; do nothing otherwise. This catches use after
				 * free bugs similarly to those in linear kernel virtual address space
				 * after a page has been freed.
				 *
				 * All the lazy freeing logic is still retained, in order to minimise
				 * intrusiveness of this debugging feature.
				 *
				 * This is going to be slow (linear kernel virtual address debugging
				 * doesn't do a broadcast TLB flush so it is a lot faster).
				 */
				static void vmap_debug_free_range(unsigned long start, unsigned long end)
				{
					if (!debug_pagealloc_enabled())
						return;

					vunmap_page_range(start, end);
					flush_tlb_kernel_range(start, end);
				}
				630
				631	/*
				632	* lazy_max_pages is the maximum amount of virtual address space we gather up
				633	* before attempting to purge with a TLB flush.
				634	*
				635	* There is a tradeoff here: a larger number will cover more kernel page tables
				636	* and take slightly longer to purge, but it will linearly reduce the number of
				637	* global TLB flushes that must be performed. It would seem natural to scale
				638	* this number up linearly with the number of CPUs (because vmapping activity
				639	* could also scale linearly with the number of CPUs), however it is likely
				640	* that in practice, workloads might be constrained in other ways that mean
				641	* vmap activity will not scale linearly with CPUs. Also, I want to be
				642	* conservative and not introduce a big latency on huge systems, so go with
				643	* a less aggressive log scale. It will still be an improvement over the old
				644	* code, and it will be simple to change the scale factor if we find that it
				645	* becomes a problem on bigger systems.
				646	*/
				647	static unsigned long lazy_max_pages(void)
				648	{
				649	unsigned int log;
				650
				651	log = fls(num_online_cpus());
				652
				653	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
				654	}
				655
				656	static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
				657
				658	/*
				659	* Serialize vmap purging. There is no actual criticial section protected
				660	* by this look, but we want to avoid concurrent calls for performance
				661	* reasons and to make the pcpu_get_vm_areas more deterministic.
				662	*/
				663	static DEFINE_MUTEX(vmap_purge_lock);
				664
				665	/* for per-CPU blocks */
				666	static void purge_fragmented_blocks_allcpus(void);
				667
				668	/*
				669	* called before a call to iounmap() if the caller wants vm_area_struct's
				670	* immediately freed.
				671	*/
				672	void set_iounmap_nonlazy(void)
				673	{
				674	atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
				675	}
				676
				677	/*
				678	* Purges all lazily-freed vmap areas.
				679	*/
				680	static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
				681	{
				682	struct llist_node *valist;
				683	struct vmap_area *va;
				684	struct vmap_area *n_va;
				685	bool do_free = false;
				686
				687	lockdep_assert_held(&vmap_purge_lock);
				688
				689	valist = llist_del_all(&vmap_purge_list);
				690	llist_for_each_entry(va, valist, purge_list) {
				691	if (va->va_start < start)
				692	start = va->va_start;
				693	if (va->va_end > end)
				694	end = va->va_end;
				695	do_free = true;
				696	}
				697
				698	if (!do_free)
				699	return false;
				700
				701	flush_tlb_kernel_range(start, end);
				702
				703	spin_lock(&vmap_area_lock);
				704	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
				705	int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
				706
				707	__free_vmap_area(va);
				708	atomic_sub(nr, &vmap_lazy_nr);
				709	cond_resched_lock(&vmap_area_lock);
				710	}
				711	spin_unlock(&vmap_area_lock);
				712	return true;
				713	}
				714
				715	/*
				716	* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
				717	* is already purging.
				718	*/
				719	static void try_purge_vmap_area_lazy(void)
				720	{
				721	if (mutex_trylock(&vmap_purge_lock)) {
				722	__purge_vmap_area_lazy(ULONG_MAX, 0);
				723	mutex_unlock(&vmap_purge_lock);
				724	}
				725	}
				726
				727	/*
				728	* Kick off a purge of the outstanding lazy areas.
				729	*/
				730	static void purge_vmap_area_lazy(void)
				731	{
				732	mutex_lock(&vmap_purge_lock);
				733	purge_fragmented_blocks_allcpus();
				734	__purge_vmap_area_lazy(ULONG_MAX, 0);
				735	mutex_unlock(&vmap_purge_lock);
				736	}
				737
				738	/*
				739	* Free a vmap area, caller ensuring that the area has been unmapped
				740	* and flush_cache_vunmap had been called for the correct range
				741	* previously.
				742	*/
				743	static void free_vmap_area_noflush(struct vmap_area *va)
				744	{
				745	int nr_lazy;
				746
				747	nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
				748	&vmap_lazy_nr);
				749
				750	/* After this point, we may free va at any time */
				751	llist_add(&va->purge_list, &vmap_purge_list);
				752
				753	if (unlikely(nr_lazy > lazy_max_pages()))
				754	try_purge_vmap_area_lazy();
				755	}
				756
				757	/*
				758	* Free and unmap a vmap area
				759	*/
				760	static void free_unmap_vmap_area(struct vmap_area *va)
				761	{
				762	flush_cache_vunmap(va->va_start, va->va_end);
				763	unmap_vmap_area(va);
				764	free_vmap_area_noflush(va);
				765	}
				766
				767	static struct vmap_area *find_vmap_area(unsigned long addr)
				768	{
				769	struct vmap_area *va;
				770
				771	spin_lock(&vmap_area_lock);
				772	va = __find_vmap_area(addr);
				773	spin_unlock(&vmap_area_lock);
				774
				775	return va;
				776	}
				777
				778	/* Per cpu kva allocator */
				779
				/*
				 * vmap space is limited especially on 32 bit architectures. Ensure there is
				 * room for at least 16 percpu vmap blocks per CPU.
				 */
				/*
				 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
				 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
				 * instead (we just need a rough idea)
				 */
				#if BITS_PER_LONG == 32
				#define VMALLOC_SPACE		(128UL*1024*1024)
				#else
				#define VMALLOC_SPACE		(128UL*1024*1024*1024)
				#endif

				#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
				#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
				#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
				#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
				#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
				#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
				/* Pages per vmap block: the per-CPU share of the space / 16, clamped to [MIN, MAX]. */
				#define VMAP_BBMAP_BITS		\
						VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
						VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
							VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

				#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
				807
				808	static bool vmap_initialized __read_mostly = false;
				809
				810	struct vmap_block_queue {
				811	spinlock_t lock;
				812	struct list_head free;
				813	};
				814
				815	struct vmap_block {
				816	spinlock_t lock;
				817	struct vmap_area *va;
				818	unsigned long free, dirty;
				819	unsigned long dirty_min, dirty_max; /< dirty range /
				820	struct list_head free_list;
				821	struct rcu_head rcu_head;
				822	struct list_head purge;
				823	};
				824
				825	/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
				826	static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
				827
				828	/*
				829	* Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
				830	* in the free path. Could get rid of this if we change the API to return a
				831	* "cookie" from alloc, to be passed to free. But no big deal yet.
				832	*/
				833	static DEFINE_SPINLOCK(vmap_block_tree_lock);
				834	static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
				835
				836	/*
				837	* We should probably have a fallback mechanism to allocate virtual memory
				838	* out of partially filled vmap blocks. However vmap block sizing should be
				839	* fairly reasonable according to the vmalloc size, so it shouldn't be a
				840	* big problem.
				841	*/
				842
				843	static unsigned long addr_to_vb_idx(unsigned long addr)
				844	{
				845	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
				846	addr /= VMAP_BLOCK_SIZE;
				847	return addr;
				848	}
				849
				850	static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
				851	{
				852	unsigned long addr;
				853
				854	addr = va_start + (pages_off << PAGE_SHIFT);
				855	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
				856	return (void *)addr;
				857	}
				858
				859	/**
				860	* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
				861	* block. Of course pages number can't exceed VMAP_BBMAP_BITS
				862	* @order: how many 2^order pages should be occupied in newly allocated block
				863	* @gfp_mask: flags for the page level allocator
				864	*
				865	* Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
				866	*/
				867	static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
				868	{
				869	struct vmap_block_queue *vbq;
				870	struct vmap_block *vb;
				871	struct vmap_area *va;
				872	unsigned long vb_idx;
				873	int node, err;
				874	void *vaddr;
				875
				876	node = numa_node_id();
				877
				878	vb = kmalloc_node(sizeof(struct vmap_block),
				879	gfp_mask & GFP_RECLAIM_MASK, node);
				880	if (unlikely(!vb))
				881	return ERR_PTR(-ENOMEM);
				882
				883	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
				884	VMALLOC_START, VMALLOC_END,
				885	node, gfp_mask);
				886	if (IS_ERR(va)) {
				887	kfree(vb);
				888	return ERR_CAST(va);
				889	}
				890
				891	err = radix_tree_preload(gfp_mask);
				892	if (unlikely(err)) {
				893	kfree(vb);
				894	free_vmap_area(va);
				895	return ERR_PTR(err);
				896	}
				897
				898	vaddr = vmap_block_vaddr(va->va_start, 0);
				899	spin_lock_init(&vb->lock);
				900	vb->va = va;
				901	/* At least something should be left free */
				902	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
				903	vb->free = VMAP_BBMAP_BITS - (1UL << order);
				904	vb->dirty = 0;
				905	vb->dirty_min = VMAP_BBMAP_BITS;
				906	vb->dirty_max = 0;
				907	INIT_LIST_HEAD(&vb->free_list);
				908
				909	vb_idx = addr_to_vb_idx(va->va_start);
				910	spin_lock(&vmap_block_tree_lock);
				911	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
				912	spin_unlock(&vmap_block_tree_lock);
				913	BUG_ON(err);
				914	radix_tree_preload_end();
				915
				916	vbq = &get_cpu_var(vmap_block_queue);
				917	spin_lock(&vbq->lock);
				918	list_add_tail_rcu(&vb->free_list, &vbq->free);
				919	spin_unlock(&vbq->lock);
				920	put_cpu_var(vmap_block_queue);
				921
				922	return vaddr;
				923	}
				924
				925	static void free_vmap_block(struct vmap_block *vb)
				926	{
				927	struct vmap_block *tmp;
				928	unsigned long vb_idx;
				929
				930	vb_idx = addr_to_vb_idx(vb->va->va_start);
				931	spin_lock(&vmap_block_tree_lock);
				932	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
				933	spin_unlock(&vmap_block_tree_lock);
				934	BUG_ON(tmp != vb);
				935
				936	free_vmap_area_noflush(vb->va);
				937	kfree_rcu(vb, rcu_head);
				938	}
				939
				940	static void purge_fragmented_blocks(int cpu)
				941	{
				942	LIST_HEAD(purge);
				943	struct vmap_block *vb;
				944	struct vmap_block *n_vb;
				945	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
				946
				947	rcu_read_lock();
				948	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				949
				950	if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
				951	continue;
				952
				953	spin_lock(&vb->lock);
				954	if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
				955	vb->free = 0; /* prevent further allocs after releasing lock */
				956	vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
				957	vb->dirty_min = 0;
				958	vb->dirty_max = VMAP_BBMAP_BITS;
				959	spin_lock(&vbq->lock);
				960	list_del_rcu(&vb->free_list);
				961	spin_unlock(&vbq->lock);
				962	spin_unlock(&vb->lock);
				963	list_add_tail(&vb->purge, &purge);
				964	} else
				965	spin_unlock(&vb->lock);
				966	}
				967	rcu_read_unlock();
				968
				969	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
				970	list_del(&vb->purge);
				971	free_vmap_block(vb);
				972	}
				973	}
				974
				975	static void purge_fragmented_blocks_allcpus(void)
				976	{
				977	int cpu;
				978
				979	for_each_possible_cpu(cpu)
				980	purge_fragmented_blocks(cpu);
				981	}
				982
				983	static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
				984	{
				985	struct vmap_block_queue *vbq;
				986	struct vmap_block *vb;
				987	void *vaddr = NULL;
				988	unsigned int order;
				989
				990	BUG_ON(offset_in_page(size));
				991	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
				992	if (WARN_ON(size == 0)) {
				993	/*
				994	* Allocating 0 bytes isn't what caller wants since
				995	* get_order(0) returns funny result. Just warn and terminate
				996	* early.
				997	*/
				998	return NULL;
				999	}
				1000	order = get_order(size);
				1001
				1002	rcu_read_lock();
				1003	vbq = &get_cpu_var(vmap_block_queue);
				1004	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				1005	unsigned long pages_off;
				1006
				1007	spin_lock(&vb->lock);
				1008	if (vb->free < (1UL << order)) {
				1009	spin_unlock(&vb->lock);
				1010	continue;
				1011	}
				1012
				1013	pages_off = VMAP_BBMAP_BITS - vb->free;
				1014	vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
				1015	vb->free -= 1UL << order;
				1016	if (vb->free == 0) {
				1017	spin_lock(&vbq->lock);
				1018	list_del_rcu(&vb->free_list);
				1019	spin_unlock(&vbq->lock);
				1020	}
				1021
				1022	spin_unlock(&vb->lock);
				1023	break;
				1024	}
				1025
				1026	put_cpu_var(vmap_block_queue);
				1027	rcu_read_unlock();
				1028
				1029	/* Allocate new block if nothing was found */
				1030	if (!vaddr)
				1031	vaddr = new_vmap_block(order, gfp_mask);
				1032
				1033	return vaddr;
				1034	}
				1035
				1036	static void vb_free(const void *addr, unsigned long size)
				1037	{
				1038	unsigned long offset;
				1039	unsigned long vb_idx;
				1040	unsigned int order;
				1041	struct vmap_block *vb;
				1042
				1043	BUG_ON(offset_in_page(size));
				1044	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
				1045
				1046	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
				1047
				1048	order = get_order(size);
				1049
				1050	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
				1051	offset >>= PAGE_SHIFT;
				1052
				1053	vb_idx = addr_to_vb_idx((unsigned long)addr);
				1054	rcu_read_lock();
				1055	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
				1056	rcu_read_unlock();
				1057	BUG_ON(!vb);
				1058
				1059	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
				1060
				1061	spin_lock(&vb->lock);
				1062
				1063	/* Expand dirty range */
				1064	vb->dirty_min = min(vb->dirty_min, offset);
				1065	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
				1066
				1067	vb->dirty += 1UL << order;
				1068	if (vb->dirty == VMAP_BBMAP_BITS) {
				1069	BUG_ON(vb->free);
				1070	spin_unlock(&vb->lock);
				1071	free_vmap_block(vb);
				1072	} else
				1073	spin_unlock(&vb->lock);
				1074	}
				1075
				1076	/**
				1077	* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
				1078	*
				1079	* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
				1080	* to amortize TLB flushing overheads. What this means is that any page you
				1081	* have now, may, in a former life, have been mapped into kernel virtual
				1082	* address by the vmap layer and so there might be some CPUs with TLB entries
				1083	* still referencing that page (additional to the regular 1:1 kernel mapping).
				1084	*
				1085	* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
				1086	* be sure that none of the pages we have control over will have any aliases
				1087	* from the vmap layer.
				1088	*/
				1089	void vm_unmap_aliases(void)
				1090	{
				1091	unsigned long start = ULONG_MAX, end = 0;
				1092	int cpu;
				1093	int flush = 0;
				1094
				1095	if (unlikely(!vmap_initialized))
				1096	return;
				1097
				1098	might_sleep();
				1099
				1100	for_each_possible_cpu(cpu) {
				1101	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
				1102	struct vmap_block *vb;
				1103
				1104	rcu_read_lock();
				1105	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				1106	spin_lock(&vb->lock);
				1107	if (vb->dirty) {
				1108	unsigned long va_start = vb->va->va_start;
				1109	unsigned long s, e;
				1110
				1111	s = va_start + (vb->dirty_min << PAGE_SHIFT);
				1112	e = va_start + (vb->dirty_max << PAGE_SHIFT);
				1113
				1114	start = min(s, start);
				1115	end = max(e, end);
				1116
				1117	flush = 1;
				1118	}
				1119	spin_unlock(&vb->lock);
				1120	}
				1121	rcu_read_unlock();
				1122	}
				1123
				1124	mutex_lock(&vmap_purge_lock);
				1125	purge_fragmented_blocks_allcpus();
				1126	if (!__purge_vmap_area_lazy(start, end) && flush)
				1127	flush_tlb_kernel_range(start, end);
				1128	mutex_unlock(&vmap_purge_lock);
				1129	}
				1130	EXPORT_SYMBOL_GPL(vm_unmap_aliases);
				1131
				1132	/**
				1133	* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
				1134	* @mem: the pointer returned by vm_map_ram
				1135	* @count: the count passed to that vm_map_ram call (cannot unmap partial)
				1136	*/
				1137	void vm_unmap_ram(const void *mem, unsigned int count)
				1138	{
				1139	unsigned long size = (unsigned long)count << PAGE_SHIFT;
				1140	unsigned long addr = (unsigned long)mem;
				1141	struct vmap_area *va;
				1142
				1143	might_sleep();
				1144	BUG_ON(!addr);
				1145	BUG_ON(addr < VMALLOC_START);
				1146	BUG_ON(addr > VMALLOC_END);
				1147	BUG_ON(!PAGE_ALIGNED(addr));
				1148
				1149	debug_check_no_locks_freed(mem, size);
				1150	vmap_debug_free_range(addr, addr+size);
				1151
				1152	if (likely(count <= VMAP_MAX_ALLOC)) {
				1153	vb_free(mem, size);
				1154	return;
				1155	}
				1156
				1157	va = find_vmap_area(addr);
				1158	BUG_ON(!va);
				1159	free_unmap_vmap_area(va);
				1160	}
				1161	EXPORT_SYMBOL(vm_unmap_ram);
				1162
				1163	/**
				1164	* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
				1165	* @pages: an array of pointers to the pages to be mapped
				1166	* @count: number of pages
				1167	* @node: prefer to allocate data structures on this node
				1168	* @prot: memory protection to use. PAGE_KERNEL for regular RAM
				1169	*
				1170	* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
				1171	* faster than vmap so it's good. But if you mix long-life and short-life
				1172	* objects with vm_map_ram(), it could consume lots of address space through
				1173	* fragmentation (especially on a 32bit machine). You could see failures in
				1174	* the end. Please use this function for short-lived objects.
				1175	*
				1176	* Returns: a pointer to the address that has been mapped, or %NULL on failure
				1177	*/
				1178	void vm_map_ram(struct page *pages, unsigned int count, int node, pgprot_t prot)
				1179	{
				1180	unsigned long size = (unsigned long)count << PAGE_SHIFT;
				1181	unsigned long addr;
				1182	void *mem;
				1183
				1184	if (likely(count <= VMAP_MAX_ALLOC)) {
				1185	mem = vb_alloc(size, GFP_KERNEL);
				1186	if (IS_ERR(mem))
				1187	return NULL;
				1188	addr = (unsigned long)mem;
				1189	} else {
				1190	struct vmap_area *va;
				1191	va = alloc_vmap_area(size, PAGE_SIZE,
				1192	VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
				1193	if (IS_ERR(va))
				1194	return NULL;
				1195
				1196	addr = va->va_start;
				1197	mem = (void *)addr;
				1198	}
				1199	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
				1200	vm_unmap_ram(mem, count);
				1201	return NULL;
				1202	}
				1203	return mem;
				1204	}
				1205	EXPORT_SYMBOL(vm_map_ram);
				1206
				1207	static struct vm_struct *vmlist __initdata;
				1208	/**
				1209	* vm_area_add_early - add vmap area early during boot
				1210	* @vm: vm_struct to add
				1211	*
				1212	* This function is used to add fixed kernel vm area to vmlist before
				1213	* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
				1214	* should contain proper values and the other fields should be zero.
				1215	*
				1216	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
				1217	*/
				1218	void __init vm_area_add_early(struct vm_struct *vm)
				1219	{
				1220	struct vm_struct tmp, *p;
				1221
				1222	BUG_ON(vmap_initialized);
				1223	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
				1224	if (tmp->addr >= vm->addr) {
				1225	BUG_ON(tmp->addr < vm->addr + vm->size);
				1226	break;
				1227	} else
				1228	BUG_ON(tmp->addr + tmp->size > vm->addr);
				1229	}
				1230	vm->next = *p;
				1231	*p = vm;
				1232	}
				1233
				1234	/**
				1235	* vm_area_register_early - register vmap area early during boot
				1236	* @vm: vm_struct to register
				1237	* @align: requested alignment
				1238	*
				1239	* This function is used to register kernel vm area before
				1240	* vmalloc_init() is called. @vm->size and @vm->flags should contain
				1241	* proper values on entry and other fields should be zero. On return,
				1242	* vm->addr contains the allocated address.
				1243	*
				1244	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
				1245	*/
				1246	void __init vm_area_register_early(struct vm_struct *vm, size_t align)
				1247	{
				1248	static size_t vm_init_off __initdata;
				1249	unsigned long addr;
				1250
				1251	addr = ALIGN(VMALLOC_START + vm_init_off, align);
				1252	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
				1253
				1254	vm->addr = (void *)addr;
				1255
				1256	vm_area_add_early(vm);
				1257	}
				1258
				1259	void __init vmalloc_init(void)
				1260	{
				1261	struct vmap_area *va;
				1262	struct vm_struct *tmp;
				1263	int i;
				1264
				1265	for_each_possible_cpu(i) {
				1266	struct vmap_block_queue *vbq;
				1267	struct vfree_deferred *p;
				1268
				1269	vbq = &per_cpu(vmap_block_queue, i);
				1270	spin_lock_init(&vbq->lock);
				1271	INIT_LIST_HEAD(&vbq->free);
				1272	p = &per_cpu(vfree_deferred, i);
				1273	init_llist_head(&p->list);
				1274	INIT_WORK(&p->wq, free_work);
				1275	}
				1276
				1277	/* Import existing vmlist entries. */
				1278	for (tmp = vmlist; tmp; tmp = tmp->next) {
				1279	va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
				1280	va->flags = VM_VM_AREA;
				1281	va->va_start = (unsigned long)tmp->addr;
				1282	va->va_end = va->va_start + tmp->size;
				1283	va->vm = tmp;
				1284	__insert_vmap_area(va);
				1285	}
				1286
				1287	vmap_area_pcpu_hole = VMALLOC_END;
				1288
				1289	vmap_initialized = true;
				1290	}
				1291
				1292	/**
				1293	* map_kernel_range_noflush - map kernel VM area with the specified pages
				1294	* @addr: start of the VM area to map
				1295	* @size: size of the VM area to map
				1296	* @prot: page protection flags to use
				1297	* @pages: pages to map
				1298	*
				1299	* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
				1300	* specify should have been allocated using get_vm_area() and its
				1301	* friends.
				1302	*
				1303	* NOTE:
				1304	* This function does NOT do any cache flushing. The caller is
				1305	* responsible for calling flush_cache_vmap() on to-be-mapped areas
				1306	* before calling this function.
				1307	*
				1308	* RETURNS:
				1309	* The number of pages mapped on success, -errno on failure.
				1310	*/
				1311	int map_kernel_range_noflush(unsigned long addr, unsigned long size,
				1312	pgprot_t prot, struct page **pages)
				1313	{
				1314	return vmap_page_range_noflush(addr, addr + size, prot, pages);
				1315	}
				1316
				1317	/**
				1318	* unmap_kernel_range_noflush - unmap kernel VM area
				1319	* @addr: start of the VM area to unmap
				1320	* @size: size of the VM area to unmap
				1321	*
				1322	* Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
				1323	* specify should have been allocated using get_vm_area() and its
				1324	* friends.
				1325	*
				1326	* NOTE:
				1327	* This function does NOT do any cache flushing. The caller is
				1328	* responsible for calling flush_cache_vunmap() on to-be-mapped areas
				1329	* before calling this function and flush_tlb_kernel_range() after.
				1330	*/
				1331	void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
				1332	{
				1333	vunmap_page_range(addr, addr + size);
				1334	}
				1335	EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
				1336
				1337	/**
				1338	* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
				1339	* @addr: start of the VM area to unmap
				1340	* @size: size of the VM area to unmap
				1341	*
				1342	* Similar to unmap_kernel_range_noflush() but flushes vcache before
				1343	* the unmapping and tlb after.
				1344	*/
				1345	void unmap_kernel_range(unsigned long addr, unsigned long size)
				1346	{
				1347	unsigned long end = addr + size;
				1348
				1349	flush_cache_vunmap(addr, end);
				1350	vunmap_page_range(addr, end);
				1351	flush_tlb_kernel_range(addr, end);
				1352	}
				1353	EXPORT_SYMBOL_GPL(unmap_kernel_range);
				1354
				1355	int map_vm_area(struct vm_struct area, pgprot_t prot, struct page *pages)
				1356	{
				1357	unsigned long addr = (unsigned long)area->addr;
				1358	unsigned long end = addr + get_vm_area_size(area);
				1359	int err;
				1360
				1361	err = vmap_page_range(addr, end, prot, pages);
				1362
				1363	return err > 0 ? 0 : err;
				1364	}
				1365	EXPORT_SYMBOL_GPL(map_vm_area);
				1366
				1367	static void setup_vmalloc_vm(struct vm_struct vm, struct vmap_area va,
				1368	unsigned long flags, const void *caller)
				1369	{
				1370	spin_lock(&vmap_area_lock);
				1371	vm->flags = flags;
				1372	vm->addr = (void *)va->va_start;
				1373	vm->size = va->va_end - va->va_start;
				1374	vm->caller = caller;
				1375	va->vm = vm;
				1376	va->flags \|= VM_VM_AREA;
				1377	spin_unlock(&vmap_area_lock);
				1378	}
				1379
				1380	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
				1381	{
				1382	/*
				1383	* Before removing VM_UNINITIALIZED,
				1384	* we should make sure that vm has proper values.
				1385	* Pair with smp_rmb() in show_numa_info().
				1386	*/
				1387	smp_wmb();
				1388	vm->flags &= ~VM_UNINITIALIZED;
				1389	}
				1390
				1391	static struct vm_struct *__get_vm_area_node(unsigned long size,
				1392	unsigned long align, unsigned long flags, unsigned long start,
				1393	unsigned long end, int node, gfp_t gfp_mask, const void *caller)
				1394	{
				1395	struct vmap_area *va;
				1396	struct vm_struct *area;
				1397
				1398	BUG_ON(in_interrupt());
				1399	size = PAGE_ALIGN(size);
				1400	if (unlikely(!size))
				1401	return NULL;
				1402
				1403	if (flags & VM_IOREMAP)
				1404	align = 1ul << clamp_t(int, get_count_order_long(size),
				1405	PAGE_SHIFT, IOREMAP_MAX_ORDER);
				1406
				1407	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
				1408	if (unlikely(!area))
				1409	return NULL;
				1410
				1411	if (!(flags & VM_NO_GUARD))
				1412	size += PAGE_SIZE;
				1413
				1414	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
				1415	if (IS_ERR(va)) {
				1416	kfree(area);
				1417	return NULL;
				1418	}
				1419
				1420	setup_vmalloc_vm(area, va, flags, caller);
				1421
				1422	return area;
				1423	}
				1424
				1425	struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
				1426	unsigned long start, unsigned long end)
				1427	{
				1428	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
				1429	GFP_KERNEL, __builtin_return_address(0));
				1430	}
				1431	EXPORT_SYMBOL_GPL(__get_vm_area);
				1432
				1433	struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
				1434	unsigned long start, unsigned long end,
				1435	const void *caller)
				1436	{
				1437	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
				1438	GFP_KERNEL, caller);
				1439	}
				1440
				1441	/**
				1442	* get_vm_area - reserve a contiguous kernel virtual area
				1443	* @size: size of the area
				1444	* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
				1445	*
				1446	* Search an area of @size in the kernel virtual mapping area,
				1447	* and reserved it for out purposes. Returns the area descriptor
				1448	* on success or %NULL on failure.
				1449	*/
				1450	struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
				1451	{
				1452	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				1453	NUMA_NO_NODE, GFP_KERNEL,
				1454	__builtin_return_address(0));
				1455	}
				1456
				1457	struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
				1458	const void *caller)
				1459	{
				1460	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				1461	NUMA_NO_NODE, GFP_KERNEL, caller);
				1462	}
				1463
				1464	/**
				1465	* find_vm_area - find a continuous kernel virtual area
				1466	* @addr: base address
				1467	*
				1468	* Search for the kernel VM area starting at @addr, and return it.
				1469	* It is up to the caller to do all required locking to keep the returned
				1470	* pointer valid.
				1471	*/
				1472	struct vm_struct find_vm_area(const void addr)
				1473	{
				1474	struct vmap_area *va;
				1475
				1476	va = find_vmap_area((unsigned long)addr);
				1477	if (va && va->flags & VM_VM_AREA)
				1478	return va->vm;
				1479
				1480	return NULL;
				1481	}
				1482
				1483	/**
				1484	* remove_vm_area - find and remove a continuous kernel virtual area
				1485	* @addr: base address
				1486	*
				1487	* Search for the kernel VM area starting at @addr, and remove it.
				1488	* This function returns the found VM area, but using it is NOT safe
				1489	* on SMP machines, except for its size or flags.
				1490	*/
				1491	struct vm_struct remove_vm_area(const void addr)
				1492	{
				1493	struct vmap_area *va;
				1494
				1495	might_sleep();
				1496
				1497	va = find_vmap_area((unsigned long)addr);
				1498	if (va && va->flags & VM_VM_AREA) {
				1499	struct vm_struct *vm = va->vm;
				1500
				1501	spin_lock(&vmap_area_lock);
				1502	va->vm = NULL;
				1503	va->flags &= ~VM_VM_AREA;
				1504	va->flags \|= VM_LAZY_FREE;
				1505	spin_unlock(&vmap_area_lock);
				1506
				1507	vmap_debug_free_range(va->va_start, va->va_end);
				1508	kasan_free_shadow(vm);
				1509	free_unmap_vmap_area(va);
				1510
				1511	return vm;
				1512	}
				1513	return NULL;
				1514	}
				1515
				1516	static void __vunmap(const void *addr, int deallocate_pages)
				1517	{
				1518	struct vm_struct *area;
				1519
				1520	if (!addr)
				1521	return;
				1522
				1523	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
				1524	addr))
				1525	return;
				1526
				1527	area = find_vm_area(addr);
				1528	if (unlikely(!area)) {
				1529	WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
				1530	addr);
				1531	return;
				1532	}
				1533
				1534	debug_check_no_locks_freed(addr, get_vm_area_size(area));
				1535	debug_check_no_obj_freed(addr, get_vm_area_size(area));
				1536
				1537	remove_vm_area(addr);
				1538	if (deallocate_pages) {
				1539	int i;
				1540
				1541	for (i = 0; i < area->nr_pages; i++) {
				1542	struct page *page = area->pages[i];
				1543
				1544	BUG_ON(!page);
				1545	__free_pages(page, 0);
				1546	}
				1547
				1548	kvfree(area->pages);
				1549	}
				1550
				1551	kfree(area);
				1552	return;
				1553	}
				1554
				1555	static inline void __vfree_deferred(const void *addr)
				1556	{
				1557	/*
				1558	* Use raw_cpu_ptr() because this can be called from preemptible
				1559	* context. Preemption is absolutely fine here, because the llist_add()
				1560	* implementation is lockless, so it works even if we are adding to
				1561	* nother cpu's list. schedule_work() should be fine with this too.
				1562	*/
				1563	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
				1564
				1565	if (llist_add((struct llist_node *)addr, &p->list))
				1566	schedule_work(&p->wq);
				1567	}
				1568
				1569	/**
				1570	* vfree_atomic - release memory allocated by vmalloc()
				1571	* @addr: memory base address
				1572	*
				1573	* This one is just like vfree() but can be called in any atomic context
				1574	* except NMIs.
				1575	*/
				1576	void vfree_atomic(const void *addr)
				1577	{
				1578	BUG_ON(in_nmi());
				1579
				1580	kmemleak_free(addr);
				1581
				1582	if (!addr)
				1583	return;
				1584	__vfree_deferred(addr);
				1585	}
				1586
				1587	/**
				1588	* vfree - release memory allocated by vmalloc()
				1589	* @addr: memory base address
				1590	*
				1591	* Free the virtually continuous memory area starting at @addr, as
				1592	* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
				1593	* NULL, no operation is performed.
				1594	*
				1595	* Must not be called in NMI context (strictly speaking, only if we don't
				1596	* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
				1597	* conventions for vfree() arch-depenedent would be a really bad idea)
				1598	*
				1599	* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
				1600	*/
				1601	void vfree(const void *addr)
				1602	{
				1603	BUG_ON(in_nmi());
				1604
				1605	kmemleak_free(addr);
				1606
				1607	if (!addr)
				1608	return;
				1609	if (unlikely(in_interrupt()))
				1610	__vfree_deferred(addr);
				1611	else
				1612	__vunmap(addr, 1);
				1613	}
				1614	EXPORT_SYMBOL(vfree);
				1615
				1616	/**
				1617	* vunmap - release virtual mapping obtained by vmap()
				1618	* @addr: memory base address
				1619	*
				1620	* Free the virtually contiguous memory area starting at @addr,
				1621	* which was created from the page array passed to vmap().
				1622	*
				1623	* Must not be called in interrupt context.
				1624	*/
				1625	void vunmap(const void *addr)
				1626	{
				1627	BUG_ON(in_interrupt());
				1628	might_sleep();
				1629	if (addr)
				1630	__vunmap(addr, 0);
				1631	}
				1632	EXPORT_SYMBOL(vunmap);
				1633
				1634	/**
				1635	* vmap - map an array of pages into virtually contiguous space
				1636	* @pages: array of page pointers
				1637	* @count: number of pages to map
				1638	* @flags: vm_area->flags
				1639	* @prot: page protection for the mapping
				1640	*
				1641	* Maps @count pages from @pages into contiguous kernel virtual
				1642	* space.
				1643	*/
				1644	void vmap(struct page *pages, unsigned int count,
				1645	unsigned long flags, pgprot_t prot)
				1646	{
				1647	struct vm_struct *area;
				1648	unsigned long size; /* In bytes */
				1649
				1650	might_sleep();
				1651
				1652	if (count > totalram_pages)
				1653	return NULL;
				1654
				1655	size = (unsigned long)count << PAGE_SHIFT;
				1656	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
				1657	if (!area)
				1658	return NULL;
				1659
				1660	if (map_vm_area(area, prot, pages)) {
				1661	vunmap(area->addr);
				1662	return NULL;
				1663	}
				1664
				1665	return area->addr;
				1666	}
				1667	EXPORT_SYMBOL(vmap);
				1668
				1669	static void *__vmalloc_node(unsigned long size, unsigned long align,
				1670	gfp_t gfp_mask, pgprot_t prot,
				1671	int node, const void *caller);
				1672	static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
				1673	pgprot_t prot, int node)
				1674	{
				1675	struct page **pages;
				1676	unsigned int nr_pages, array_size, i;
				1677	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
				1678	const gfp_t alloc_mask = gfp_mask \| __GFP_NOWARN;
				1679	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA \| GFP_DMA32)) ?
				1680	0 :
				1681	__GFP_HIGHMEM;
				1682
				1683	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
				1684	array_size = (nr_pages * sizeof(struct page *));
				1685
				1686	/* Please note that the recursion is strictly bounded. */
				1687	if (array_size > PAGE_SIZE) {
				1688	pages = __vmalloc_node(array_size, 1, nested_gfp\|highmem_mask,
				1689	PAGE_KERNEL, node, area->caller);
				1690	} else {
				1691	pages = kmalloc_node(array_size, nested_gfp, node);
				1692	}
				1693
				1694	if (!pages) {
				1695	remove_vm_area(area->addr);
				1696	kfree(area);
				1697	return NULL;
				1698	}
				1699
				1700	area->pages = pages;
				1701	area->nr_pages = nr_pages;
				1702
				1703	for (i = 0; i < area->nr_pages; i++) {
				1704	struct page *page;
				1705
				1706	if (node == NUMA_NO_NODE)
				1707	page = alloc_page(alloc_mask\|highmem_mask);
				1708	else
				1709	page = alloc_pages_node(node, alloc_mask\|highmem_mask, 0);
				1710
				1711	if (unlikely(!page)) {
				1712	/* Successfully allocated i pages, free them in __vunmap() */
				1713	area->nr_pages = i;
				1714	goto fail;
				1715	}
				1716	area->pages[i] = page;
				1717	if (gfpflags_allow_blocking(gfp_mask\|highmem_mask))
				1718	cond_resched();
				1719	}
				1720
				1721	if (map_vm_area(area, prot, pages))
				1722	goto fail;
				1723	return area->addr;
				1724
				1725	fail:
				1726	warn_alloc(gfp_mask, NULL,
				1727	"vmalloc: allocation failure, allocated %ld of %ld bytes",
				1728	(area->nr_pages*PAGE_SIZE), area->size);
				1729	vfree(area->addr);
				1730	return NULL;
				1731	}
				1732
				1733	/**
				1734	* __vmalloc_node_range - allocate virtually contiguous memory
				1735	* @size: allocation size
				1736	* @align: desired alignment
				1737	* @start: vm area range start
				1738	* @end: vm area range end
				1739	* @gfp_mask: flags for the page level allocator
				1740	* @prot: protection mask for the allocated pages
				1741	* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
				1742	* @node: node to use for allocation or NUMA_NO_NODE
				1743	* @caller: caller's return address
				1744	*
				1745	* Allocate enough pages to cover @size from the page level
				1746	* allocator with @gfp_mask flags. Map them into contiguous
				1747	* kernel virtual space, using a pagetable protection of @prot.
				1748	*/
				1749	void *__vmalloc_node_range(unsigned long size, unsigned long align,
				1750	unsigned long start, unsigned long end, gfp_t gfp_mask,
				1751	pgprot_t prot, unsigned long vm_flags, int node,
				1752	const void *caller)
				1753	{
				1754	struct vm_struct *area;
				1755	void *addr;
				1756	unsigned long real_size = size;
				1757
				1758	size = PAGE_ALIGN(size);
				1759	if (!size \|\| (size >> PAGE_SHIFT) > totalram_pages)
				1760	goto fail;
				1761
				1762	area = __get_vm_area_node(size, align, VM_ALLOC \| VM_UNINITIALIZED \|
				1763	vm_flags, start, end, node, gfp_mask, caller);
				1764	if (!area)
				1765	goto fail;
				1766
				1767	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
				1768	if (!addr)
				1769	return NULL;
				1770
				1771	/*
				1772	* First make sure the mappings are removed from all page-tables
				1773	* before they are freed.
				1774	*/
				1775	vmalloc_sync_unmappings();
				1776
				1777	/*
				1778	* In this function, newly allocated vm_struct has VM_UNINITIALIZED
				1779	* flag. It means that vm_struct is not fully initialized.
				1780	* Now, it is fully initialized, so remove this flag here.
				1781	*/
				1782	clear_vm_uninitialized_flag(area);
				1783
				1784	kmemleak_vmalloc(area, size, gfp_mask);
				1785
				1786	return addr;
				1787
				1788	fail:
				1789	warn_alloc(gfp_mask, NULL,
				1790	"vmalloc: allocation failure: %lu bytes", real_size);
				1791	return NULL;
				1792	}
				1793
				1794	/**
				1795	* __vmalloc_node - allocate virtually contiguous memory
				1796	* @size: allocation size
				1797	* @align: desired alignment
				1798	* @gfp_mask: flags for the page level allocator
				1799	* @prot: protection mask for the allocated pages
				1800	* @node: node to use for allocation or NUMA_NO_NODE
				1801	* @caller: caller's return address
				1802	*
				1803	* Allocate enough pages to cover @size from the page level
				1804	* allocator with @gfp_mask flags. Map them into contiguous
				1805	* kernel virtual space, using a pagetable protection of @prot.
				1806	*
				1807	* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
				1808	* and __GFP_NOFAIL are not supported
				1809	*
				1810	* Any use of gfp flags outside of GFP_KERNEL should be consulted
				1811	* with mm people.
				1812	*
				1813	*/
				1814	static void *__vmalloc_node(unsigned long size, unsigned long align,
				1815	gfp_t gfp_mask, pgprot_t prot,
				1816	int node, const void *caller)
				1817	{
				1818	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				1819	gfp_mask, prot, 0, node, caller);
				1820	}
				1821
				1822	void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
				1823	{
				1824	return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
				1825	__builtin_return_address(0));
				1826	}
				1827	EXPORT_SYMBOL(__vmalloc);
				1828
				1829	static inline void *__vmalloc_node_flags(unsigned long size,
				1830	int node, gfp_t flags)
				1831	{
				1832	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
				1833	node, __builtin_return_address(0));
				1834	}
				1835
				1836
				1837	void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
				1838	void *caller)
				1839	{
				1840	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
				1841	}
				1842
				1843	/**
				1844	* vmalloc - allocate virtually contiguous memory
				1845	* @size: allocation size
				1846	* Allocate enough pages to cover @size from the page level
				1847	* allocator and map them into contiguous kernel virtual space.
				1848	*
				1849	* For tight control over page level allocator and protection flags
				1850	* use __vmalloc() instead.
				1851	*/
				1852	void *vmalloc(unsigned long size)
				1853	{
				1854	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				1855	GFP_KERNEL);
				1856	}
				1857	EXPORT_SYMBOL(vmalloc);
				1858
				1859	/**
				1860	* vzalloc - allocate virtually contiguous memory with zero fill
				1861	* @size: allocation size
				1862	* Allocate enough pages to cover @size from the page level
				1863	* allocator and map them into contiguous kernel virtual space.
				1864	* The memory allocated is set to zero.
				1865	*
				1866	* For tight control over page level allocator and protection flags
				1867	* use __vmalloc() instead.
				1868	*/
				1869	void *vzalloc(unsigned long size)
				1870	{
				1871	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				1872	GFP_KERNEL \| __GFP_ZERO);
				1873	}
				1874	EXPORT_SYMBOL(vzalloc);
				1875
				1876	/**
				1877	* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
				1878	* @size: allocation size
				1879	*
				1880	* The resulting memory area is zeroed so it can be mapped to userspace
				1881	* without leaking data.
				1882	*/
				1883	void *vmalloc_user(unsigned long size)
				1884	{
				1885	struct vm_struct *area;
				1886	void *ret;
				1887
				1888	ret = __vmalloc_node(size, SHMLBA,
				1889	GFP_KERNEL \| __GFP_ZERO,
				1890	PAGE_KERNEL, NUMA_NO_NODE,
				1891	__builtin_return_address(0));
				1892	if (ret) {
				1893	area = find_vm_area(ret);
				1894	area->flags \|= VM_USERMAP;
				1895	}
				1896	return ret;
				1897	}
				1898	EXPORT_SYMBOL(vmalloc_user);
				1899
				1900	/**
				1901	* vmalloc_node - allocate memory on a specific node
				1902	* @size: allocation size
				1903	* @node: numa node
				1904	*
				1905	* Allocate enough pages to cover @size from the page level
				1906	* allocator and map them into contiguous kernel virtual space.
				1907	*
				1908	* For tight control over page level allocator and protection flags
				1909	* use __vmalloc() instead.
				1910	*/
				1911	void *vmalloc_node(unsigned long size, int node)
				1912	{
				1913	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
				1914	node, __builtin_return_address(0));
				1915	}
				1916	EXPORT_SYMBOL(vmalloc_node);
				1917
				1918	/**
				1919	* vzalloc_node - allocate memory on a specific node with zero fill
				1920	* @size: allocation size
				1921	* @node: numa node
				1922	*
				1923	* Allocate enough pages to cover @size from the page level
				1924	* allocator and map them into contiguous kernel virtual space.
				1925	* The memory allocated is set to zero.
				1926	*
				1927	* For tight control over page level allocator and protection flags
				1928	* use __vmalloc_node() instead.
				1929	*/
				1930	void *vzalloc_node(unsigned long size, int node)
				1931	{
				1932	return __vmalloc_node_flags(size, node,
				1933	GFP_KERNEL \| __GFP_ZERO);
				1934	}
				1935	EXPORT_SYMBOL(vzalloc_node);
				1936
				/* When no PAGE_KERNEL_EXEC is defined yet, alias it to PAGE_KERNEL. */
				#if !defined(PAGE_KERNEL_EXEC)
				#define PAGE_KERNEL_EXEC PAGE_KERNEL
				#endif
				1940
				1941	/**
				1942	* vmalloc_exec - allocate virtually contiguous, executable memory
				1943	* @size: allocation size
				1944	*
				1945	* Kernel-internal function to allocate enough pages to cover @size
				1946	* the page level allocator and map them into contiguous and
				1947	* executable kernel virtual space.
				1948	*
				1949	* For tight control over page level allocator and protection flags
				1950	* use __vmalloc() instead.
				1951	*/
				1952
				1953	void *vmalloc_exec(unsigned long size)
				1954	{
				1955	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
				1956	NUMA_NO_NODE, __builtin_return_address(0));
				1957	}
				1958
				/*
				 * GFP mask used by the vmalloc_32*() allocators.  Every expansion is
				 * parenthesized so the macro behaves as a single operand wherever it
				 * is used (e.g. "GFP_VMALLOC32 & mask").
				 */
				#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
				#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
				#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
				#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
				#else
				/*
				 * 64b systems should always have either DMA or DMA32 zones. For others
				 * GFP_DMA32 should do the right thing and use the normal zone.
				 */
				#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
				#endif
				1970
				1971	/**
				1972	* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
				1973	* @size: allocation size
				1974	*
				1975	* Allocate enough 32bit PA addressable pages to cover @size from the
				1976	* page level allocator and map them into contiguous kernel virtual space.
				1977	*/
				1978	void *vmalloc_32(unsigned long size)
				1979	{
				1980	return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
				1981	NUMA_NO_NODE, __builtin_return_address(0));
				1982	}
				1983	EXPORT_SYMBOL(vmalloc_32);
				1984
				1985	/**
				1986	* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
				1987	* @size: allocation size
				1988	*
				1989	* The resulting memory area is 32bit addressable and zeroed so it can be
				1990	* mapped to userspace without leaking data.
				1991	*/
				1992	void *vmalloc_32_user(unsigned long size)
				1993	{
				1994	struct vm_struct *area;
				1995	void *ret;
				1996
				1997	ret = __vmalloc_node(size, 1, GFP_VMALLOC32 \| __GFP_ZERO, PAGE_KERNEL,
				1998	NUMA_NO_NODE, __builtin_return_address(0));
				1999	if (ret) {
				2000	area = find_vm_area(ret);
				2001	area->flags \|= VM_USERMAP;
				2002	}
				2003	return ret;
				2004	}
				2005	EXPORT_SYMBOL(vmalloc_32_user);
				2006
				2007	/*
				2008	* small helper routine , copy contents to buf from addr.
				2009	* If the page is not present, fill zero.
				2010	*/
				2011
				2012	static int aligned_vread(char buf, char addr, unsigned long count)
				2013	{
				2014	struct page *p;
				2015	int copied = 0;
				2016
				2017	while (count) {
				2018	unsigned long offset, length;
				2019
				2020	offset = offset_in_page(addr);
				2021	length = PAGE_SIZE - offset;
				2022	if (length > count)
				2023	length = count;
				2024	p = vmalloc_to_page(addr);
				2025	/*
				2026	* To do safe access to this _mapped_ area, we need
				2027	* lock. But adding lock here means that we need to add
				2028	* overhead of vmalloc()/vfree() calles for this _debug_
				2029	* interface, rarely used. Instead of that, we'll use
				2030	* kmap() and get small overhead in this access function.
				2031	*/
				2032	if (p) {
				2033	/*
				2034	* we can expect USER0 is not used (see vread/vwrite's
				2035	* function description)
				2036	*/
				2037	void *map = kmap_atomic(p);
				2038	memcpy(buf, map + offset, length);
				2039	kunmap_atomic(map);
				2040	} else
				2041	memset(buf, 0, length);
				2042
				2043	addr += length;
				2044	buf += length;
				2045	copied += length;
				2046	count -= length;
				2047	}
				2048	return copied;
				2049	}
				2050
				2051	static int aligned_vwrite(char buf, char addr, unsigned long count)
				2052	{
				2053	struct page *p;
				2054	int copied = 0;
				2055
				2056	while (count) {
				2057	unsigned long offset, length;
				2058
				2059	offset = offset_in_page(addr);
				2060	length = PAGE_SIZE - offset;
				2061	if (length > count)
				2062	length = count;
				2063	p = vmalloc_to_page(addr);
				2064	/*
				2065	* To do safe access to this _mapped_ area, we need
				2066	* lock. But adding lock here means that we need to add
				2067	* overhead of vmalloc()/vfree() calles for this _debug_
				2068	* interface, rarely used. Instead of that, we'll use
				2069	* kmap() and get small overhead in this access function.
				2070	*/
				2071	if (p) {
				2072	/*
				2073	* we can expect USER0 is not used (see vread/vwrite's
				2074	* function description)
				2075	*/
				2076	void *map = kmap_atomic(p);
				2077	memcpy(map + offset, buf, length);
				2078	kunmap_atomic(map);
				2079	}
				2080	addr += length;
				2081	buf += length;
				2082	copied += length;
				2083	count -= length;
				2084	}
				2085	return copied;
				2086	}
				2087
				2088	/**
				2089	* vread() - read vmalloc area in a safe way.
				2090	* @buf: buffer for reading data
				2091	* @addr: vm address.
				2092	* @count: number of bytes to be read.
				2093	*
				2094	* Returns # of bytes which addr and buf should be increased.
				2095	* (same number to @count). Returns 0 if [addr...addr+count) doesn't
				2096	* includes any intersect with alive vmalloc area.
				2097	*
				2098	* This function checks that addr is a valid vmalloc'ed area, and
				2099	* copy data from that area to a given buffer. If the given memory range
				2100	* of [addr...addr+count) includes some valid address, data is copied to
				2101	* proper area of @buf. If there are memory holes, they'll be zero-filled.
				2102	* IOREMAP area is treated as memory hole and no copy is done.
				2103	*
				2104	* If [addr...addr+count) doesn't includes any intersects with alive
				2105	* vm_struct area, returns 0. @buf should be kernel's buffer.
				2106	*
				2107	* Note: In usual ops, vread() is never necessary because the caller
				2108	* should know vmalloc() area is valid and can use memcpy().
				2109	* This is for routines which have to access vmalloc area without
				2110	* any informaion, as /dev/kmem.
				2111	*
				2112	*/
				2113
				2114	long vread(char buf, char addr, unsigned long count)
				2115	{
				2116	struct vmap_area *va;
				2117	struct vm_struct *vm;
				2118	char vaddr, buf_start = buf;
				2119	unsigned long buflen = count;
				2120	unsigned long n;
				2121
				2122	/* Don't allow overflow */
				2123	if ((unsigned long) addr + count < count)
				2124	count = -(unsigned long) addr;
				2125
				2126	spin_lock(&vmap_area_lock);
				2127	list_for_each_entry(va, &vmap_area_list, list) {
				2128	if (!count)
				2129	break;
				2130
				2131	if (!(va->flags & VM_VM_AREA))
				2132	continue;
				2133
				2134	vm = va->vm;
				2135	vaddr = (char *) vm->addr;
				2136	if (addr >= vaddr + get_vm_area_size(vm))
				2137	continue;
				2138	while (addr < vaddr) {
				2139	if (count == 0)
				2140	goto finished;
				2141	*buf = '\0';
				2142	buf++;
				2143	addr++;
				2144	count--;
				2145	}
				2146	n = vaddr + get_vm_area_size(vm) - addr;
				2147	if (n > count)
				2148	n = count;
				2149	if (!(vm->flags & VM_IOREMAP))
				2150	aligned_vread(buf, addr, n);
				2151	else /* IOREMAP area is treated as memory hole */
				2152	memset(buf, 0, n);
				2153	buf += n;
				2154	addr += n;
				2155	count -= n;
				2156	}
				2157	finished:
				2158	spin_unlock(&vmap_area_lock);
				2159
				2160	if (buf == buf_start)
				2161	return 0;
				2162	/* zero-fill memory holes */
				2163	if (buf != buf_start + buflen)
				2164	memset(buf, 0, buflen - (buf - buf_start));
				2165
				2166	return buflen;
				2167	}
				2168
				2169	/**
				2170	* vwrite() - write vmalloc area in a safe way.
				2171	* @buf: buffer for source data
				2172	* @addr: vm address.
				2173	* @count: number of bytes to be read.
				2174	*
				2175	* Returns # of bytes which addr and buf should be incresed.
				2176	* (same number to @count).
				2177	* If [addr...addr+count) doesn't includes any intersect with valid
				2178	* vmalloc area, returns 0.
				2179	*
				2180	* This function checks that addr is a valid vmalloc'ed area, and
				2181	* copy data from a buffer to the given addr. If specified range of
				2182	* [addr...addr+count) includes some valid address, data is copied from
				2183	* proper area of @buf. If there are memory holes, no copy to hole.
				2184	* IOREMAP area is treated as memory hole and no copy is done.
				2185	*
				2186	* If [addr...addr+count) doesn't includes any intersects with alive
				2187	* vm_struct area, returns 0. @buf should be kernel's buffer.
				2188	*
				2189	* Note: In usual ops, vwrite() is never necessary because the caller
				2190	* should know vmalloc() area is valid and can use memcpy().
				2191	* This is for routines which have to access vmalloc area without
				2192	* any informaion, as /dev/kmem.
				2193	*/
				2194
				2195	long vwrite(char buf, char addr, unsigned long count)
				2196	{
				2197	struct vmap_area *va;
				2198	struct vm_struct *vm;
				2199	char *vaddr;
				2200	unsigned long n, buflen;
				2201	int copied = 0;
				2202
				2203	/* Don't allow overflow */
				2204	if ((unsigned long) addr + count < count)
				2205	count = -(unsigned long) addr;
				2206	buflen = count;
				2207
				2208	spin_lock(&vmap_area_lock);
				2209	list_for_each_entry(va, &vmap_area_list, list) {
				2210	if (!count)
				2211	break;
				2212
				2213	if (!(va->flags & VM_VM_AREA))
				2214	continue;
				2215
				2216	vm = va->vm;
				2217	vaddr = (char *) vm->addr;
				2218	if (addr >= vaddr + get_vm_area_size(vm))
				2219	continue;
				2220	while (addr < vaddr) {
				2221	if (count == 0)
				2222	goto finished;
				2223	buf++;
				2224	addr++;
				2225	count--;
				2226	}
				2227	n = vaddr + get_vm_area_size(vm) - addr;
				2228	if (n > count)
				2229	n = count;
				2230	if (!(vm->flags & VM_IOREMAP)) {
				2231	aligned_vwrite(buf, addr, n);
				2232	copied++;
				2233	}
				2234	buf += n;
				2235	addr += n;
				2236	count -= n;
				2237	}
				2238	finished:
				2239	spin_unlock(&vmap_area_lock);
				2240	if (!copied)
				2241	return 0;
				2242	return buflen;
				2243	}
				2244
				2245	/**
				2246	* remap_vmalloc_range_partial - map vmalloc pages to userspace
				2247	* @vma: vma to cover
				2248	* @uaddr: target user address to start at
				2249	* @kaddr: virtual address of vmalloc kernel memory
				2250	* @pgoff: offset from @kaddr to start at
				2251	* @size: size of map area
				2252	*
				2253	* Returns: 0 for success, -Exxx on failure
				2254	*
				2255	* This function checks that @kaddr is a valid vmalloc'ed area,
				2256	* and that it is big enough to cover the range starting at
				2257	* @uaddr in @vma. Will return failure if that criteria isn't
				2258	* met.
				2259	*
				2260	* Similar to remap_pfn_range() (see mm/memory.c)
				2261	*/
				2262	int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
				2263	void *kaddr, unsigned long pgoff,
				2264	unsigned long size)
				2265	{
				2266	struct vm_struct *area;
				2267	unsigned long off;
				2268	unsigned long end_index;
				2269
				2270	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
				2271	return -EINVAL;
				2272
				2273	size = PAGE_ALIGN(size);
				2274
				2275	if (!PAGE_ALIGNED(uaddr) \|\| !PAGE_ALIGNED(kaddr))
				2276	return -EINVAL;
				2277
				2278	area = find_vm_area(kaddr);
				2279	if (!area)
				2280	return -EINVAL;
				2281
				2282	if (!(area->flags & VM_USERMAP))
				2283	return -EINVAL;
				2284
				2285	if (check_add_overflow(size, off, &end_index) \|\|
				2286	end_index > get_vm_area_size(area))
				2287	return -EINVAL;
				2288	kaddr += off;
				2289
				2290	do {
				2291	struct page *page = vmalloc_to_page(kaddr);
				2292	int ret;
				2293
				2294	ret = vm_insert_page(vma, uaddr, page);
				2295	if (ret)
				2296	return ret;
				2297
				2298	uaddr += PAGE_SIZE;
				2299	kaddr += PAGE_SIZE;
				2300	size -= PAGE_SIZE;
				2301	} while (size > 0);
				2302
				2303	vma->vm_flags \|= VM_DONTEXPAND \| VM_DONTDUMP;
				2304
				2305	return 0;
				2306	}
				2307	EXPORT_SYMBOL(remap_vmalloc_range_partial);
				2308
				2309	/**
				2310	* remap_vmalloc_range - map vmalloc pages to userspace
				2311	* @vma: vma to cover (map full range of vma)
				2312	* @addr: vmalloc memory
				2313	* @pgoff: number of pages into addr before first page to map
				2314	*
				2315	* Returns: 0 for success, -Exxx on failure
				2316	*
				2317	* This function checks that addr is a valid vmalloc'ed area, and
				2318	* that it is big enough to cover the vma. Will return failure if
				2319	* that criteria isn't met.
				2320	*
				2321	* Similar to remap_pfn_range() (see mm/memory.c)
				2322	*/
				2323	int remap_vmalloc_range(struct vm_area_struct vma, void addr,
				2324	unsigned long pgoff)
				2325	{
				2326	return remap_vmalloc_range_partial(vma, vma->vm_start,
				2327	addr, pgoff,
				2328	vma->vm_end - vma->vm_start);
				2329	}
				2330	EXPORT_SYMBOL(remap_vmalloc_range);
				2331
				2332	/*
				2333	* Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
				2334	* not to have one.
				2335	*
				2336	* The purpose of this function is to make sure the vmalloc area
				2337	* mappings are identical in all page-tables in the system.
				2338	*/
				2339	void __weak vmalloc_sync_mappings(void)
				2340	{
				2341	}
				2342
				2343	void __weak vmalloc_sync_unmappings(void)
				2344	{
				2345	}
				2346
				2347	static int f(pte_t pte, pgtable_t table, unsigned long addr, void data)
				2348	{
				2349	pte_t ***p = data;
				2350
				2351	if (p) {
				2352	(p) = pte;
				2353	(*p)++;
				2354	}
				2355	return 0;
				2356	}
				2357
				2358	/**
				2359	* alloc_vm_area - allocate a range of kernel address space
				2360	* @size: size of the area
				2361	* @ptes: returns the PTEs for the address space
				2362	*
				2363	* Returns: NULL on failure, vm_struct on success
				2364	*
				2365	* This function reserves a range of kernel address space, and
				2366	* allocates pagetables to map that range. No actual mappings
				2367	* are created.
				2368	*
				2369	* If @ptes is non-NULL, pointers to the PTEs (in init_mm)
				2370	* allocated for the VM area are returned.
				2371	*/
				2372	struct vm_struct alloc_vm_area(size_t size, pte_t *ptes)
				2373	{
				2374	struct vm_struct *area;
				2375
				2376	area = get_vm_area_caller(size, VM_IOREMAP,
				2377	__builtin_return_address(0));
				2378	if (area == NULL)
				2379	return NULL;
				2380
				2381	/*
				2382	* This ensures that page tables are constructed for this region
				2383	* of kernel virtual address space and mapped into init_mm.
				2384	*/
				2385	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
				2386	size, f, ptes ? &ptes : NULL)) {
				2387	free_vm_area(area);
				2388	return NULL;
				2389	}
				2390
				2391	return area;
				2392	}
				2393	EXPORT_SYMBOL_GPL(alloc_vm_area);
				2394
				2395	void free_vm_area(struct vm_struct *area)
				2396	{
				2397	struct vm_struct *ret;
				2398	ret = remove_vm_area(area->addr);
				2399	BUG_ON(ret != area);
				2400	kfree(area);
				2401	}
				2402	EXPORT_SYMBOL_GPL(free_vm_area);
				2403
				2404	#ifdef CONFIG_SMP
				2405	static struct vmap_area node_to_va(struct rb_node n)
				2406	{
				2407	return rb_entry_safe(n, struct vmap_area, rb_node);
				2408	}
				2409
				2410	/**
				2411	* pvm_find_next_prev - find the next and prev vmap_area surrounding @end
				2412	* @end: target address
				2413	* @pnext: out arg for the next vmap_area
				2414	* @pprev: out arg for the previous vmap_area
				2415	*
				2416	* Returns: %true if either or both of next and prev are found,
				2417	* %false if no vmap_area exists
				2418	*
				2419	* Find vmap_areas end addresses of which enclose @end. ie. if not
				2420	* NULL, pnext->va_end > @end and pprev->va_end <= @end.
				2421	*/
				2422	static bool pvm_find_next_prev(unsigned long end,
				2423	struct vmap_area **pnext,
				2424	struct vmap_area **pprev)
				2425	{
				2426	struct rb_node *n = vmap_area_root.rb_node;
				2427	struct vmap_area *va = NULL;
				2428
				2429	while (n) {
				2430	va = rb_entry(n, struct vmap_area, rb_node);
				2431	if (end < va->va_end)
				2432	n = n->rb_left;
				2433	else if (end > va->va_end)
				2434	n = n->rb_right;
				2435	else
				2436	break;
				2437	}
				2438
				2439	if (!va)
				2440	return false;
				2441
				2442	if (va->va_end > end) {
				2443	*pnext = va;
				2444	pprev = node_to_va(rb_prev(&(pnext)->rb_node));
				2445	} else {
				2446	*pprev = va;
				2447	pnext = node_to_va(rb_next(&(pprev)->rb_node));
				2448	}
				2449	return true;
				2450	}
				2451
				2452	/**
				2453	* pvm_determine_end - find the highest aligned address between two vmap_areas
				2454	* @pnext: in/out arg for the next vmap_area
				2455	* @pprev: in/out arg for the previous vmap_area
				2456	* @align: alignment
				2457	*
				2458	* Returns: determined end address
				2459	*
				2460	* Find the highest aligned address between @pnext and @pprev below
				2461	* VMALLOC_END. @pnext and @pprev are adjusted so that the aligned
				2462	* down address is between the end addresses of the two vmap_areas.
				2463	*
				2464	* Please note that the address returned by this function may fall
				2465	* inside *@pnext vmap_area. The caller is responsible for checking
				2466	* that.
				2467	*/
				2468	static unsigned long pvm_determine_end(struct vmap_area **pnext,
				2469	struct vmap_area **pprev,
				2470	unsigned long align)
				2471	{
				2472	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
				2473	unsigned long addr;
				2474
				2475	if (*pnext)
				2476	addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
				2477	else
				2478	addr = vmalloc_end;
				2479
				2480	while (pprev && (pprev)->va_end > addr) {
				2481	pnext = pprev;
				2482	pprev = node_to_va(rb_prev(&(pnext)->rb_node));
				2483	}
				2484
				2485	return addr;
				2486	}
				2487
				2488	/**
				2489	* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
				2490	* @offsets: array containing offset of each area
				2491	* @sizes: array containing size of each area
				2492	* @nr_vms: the number of areas to allocate
				2493	* @align: alignment, all entries in @offsets and @sizes must be aligned to this
				2494	*
				2495	* Returns: kmalloc'd vm_struct pointer array pointing to allocated
				2496	* vm_structs on success, %NULL on failure
				2497	*
				2498	* Percpu allocator wants to use congruent vm areas so that it can
				2499	* maintain the offsets among percpu areas. This function allocates
				2500	* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
				2501	* be scattered pretty far, distance between two areas easily going up
				2502	* to gigabytes. To avoid interacting with regular vmallocs, these
				2503	* areas are allocated from top.
				2504	*
				2505	* Despite its complicated look, this allocator is rather simple. It
				2506	* does everything top-down and scans areas from the end looking for
				2507	* matching slot. While scanning, if any of the areas overlaps with
				2508	* existing vmap_area, the base address is pulled down to fit the
				2509	* area. Scanning is repeated till all the areas fit and then all
				2510	* necessary data structures are inserted and the result is returned.
				2511	*/
				2512	struct vm_struct *pcpu_get_vm_areas(const unsigned long offsets,
				2513	const size_t *sizes, int nr_vms,
				2514	size_t align)
				2515	{
				2516	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
				2517	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
				2518	struct vmap_area *vas, prev, *next;
				2519	struct vm_struct **vms;
				2520	int area, area2, last_area, term_area;
				2521	unsigned long base, start, end, last_end;
				2522	bool purged = false;
				2523
				2524	/* verify parameters and allocate data structures */
				2525	BUG_ON(offset_in_page(align) \|\| !is_power_of_2(align));
				2526	for (last_area = 0, area = 0; area < nr_vms; area++) {
				2527	start = offsets[area];
				2528	end = start + sizes[area];
				2529
				2530	/* is everything aligned properly? */
				2531	BUG_ON(!IS_ALIGNED(offsets[area], align));
				2532	BUG_ON(!IS_ALIGNED(sizes[area], align));
				2533
				2534	/* detect the area with the highest address */
				2535	if (start > offsets[last_area])
				2536	last_area = area;
				2537
				2538	for (area2 = area + 1; area2 < nr_vms; area2++) {
				2539	unsigned long start2 = offsets[area2];
				2540	unsigned long end2 = start2 + sizes[area2];
				2541
				2542	BUG_ON(start2 < end && start < end2);
				2543	}
				2544	}
				2545	last_end = offsets[last_area] + sizes[last_area];
				2546
				2547	if (vmalloc_end - vmalloc_start < last_end) {
				2548	WARN_ON(true);
				2549	return NULL;
				2550	}
				2551
				2552	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
				2553	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
				2554	if (!vas \|\| !vms)
				2555	goto err_free2;
				2556
				2557	for (area = 0; area < nr_vms; area++) {
				2558	vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
				2559	vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
				2560	if (!vas[area] \|\| !vms[area])
				2561	goto err_free;
				2562	}
				2563	retry:
				2564	spin_lock(&vmap_area_lock);
				2565
				2566	/* start scanning - we scan from the top, begin with the last area */
				2567	area = term_area = last_area;
				2568	start = offsets[area];
				2569	end = start + sizes[area];
				2570
				2571	if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
				2572	base = vmalloc_end - last_end;
				2573	goto found;
				2574	}
				2575	base = pvm_determine_end(&next, &prev, align) - end;
				2576
				2577	while (true) {
				2578	BUG_ON(next && next->va_end <= base + end);
				2579	BUG_ON(prev && prev->va_end > base + end);
				2580
				2581	/*
				2582	* base might have underflowed, add last_end before
				2583	* comparing.
				2584	*/
				2585	if (base + last_end < vmalloc_start + last_end) {
				2586	spin_unlock(&vmap_area_lock);
				2587	if (!purged) {
				2588	purge_vmap_area_lazy();
				2589	purged = true;
				2590	goto retry;
				2591	}
				2592	goto err_free;
				2593	}
				2594
				2595	/*
				2596	* If next overlaps, move base downwards so that it's
				2597	* right below next and then recheck.
				2598	*/
				2599	if (next && next->va_start < base + end) {
				2600	base = pvm_determine_end(&next, &prev, align) - end;
				2601	term_area = area;
				2602	continue;
				2603	}
				2604
				2605	/*
				2606	* If prev overlaps, shift down next and prev and move
				2607	* base so that it's right below new next and then
				2608	* recheck.
				2609	*/
				2610	if (prev && prev->va_end > base + start) {
				2611	next = prev;
				2612	prev = node_to_va(rb_prev(&next->rb_node));
				2613	base = pvm_determine_end(&next, &prev, align) - end;
				2614	term_area = area;
				2615	continue;
				2616	}
				2617
				2618	/*
				2619	* This area fits, move on to the previous one. If
				2620	* the previous one is the terminal one, we're done.
				2621	*/
				2622	area = (area + nr_vms - 1) % nr_vms;
				2623	if (area == term_area)
				2624	break;
				2625	start = offsets[area];
				2626	end = start + sizes[area];
				2627	pvm_find_next_prev(base + end, &next, &prev);
				2628	}
				2629	found:
				2630	/* we've found a fitting base, insert all va's */
				2631	for (area = 0; area < nr_vms; area++) {
				2632	struct vmap_area *va = vas[area];
				2633
				2634	va->va_start = base + offsets[area];
				2635	va->va_end = va->va_start + sizes[area];
				2636	__insert_vmap_area(va);
				2637	}
				2638
				2639	vmap_area_pcpu_hole = base + offsets[last_area];
				2640
				2641	spin_unlock(&vmap_area_lock);
				2642
				2643	/* insert all vm's */
				2644	for (area = 0; area < nr_vms; area++)
				2645	setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
				2646	pcpu_get_vm_areas);
				2647
				2648	kfree(vas);
				2649	return vms;
				2650
				2651	err_free:
				2652	for (area = 0; area < nr_vms; area++) {
				2653	kfree(vas[area]);
				2654	kfree(vms[area]);
				2655	}
				2656	err_free2:
				2657	kfree(vas);
				2658	kfree(vms);
				2659	return NULL;
				2660	}
				2661
				/**
				 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
				 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
				 * @nr_vms: the number of allocated areas
				 *
				 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
				 */
				void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
				{
					int idx = 0;

					/* Release every area first, then the pointer array itself. */
					while (idx < nr_vms) {
						free_vm_area(vms[idx]);
						idx++;
					}
					kfree(vms);
				}
				2677	#endif /* CONFIG_SMP */
				2678
				2679	#ifdef CONFIG_PROC_FS
				2680	static void s_start(struct seq_file m, loff_t *pos)
				2681	__acquires(&vmap_area_lock)
				2682	{
				2683	spin_lock(&vmap_area_lock);
				2684	return seq_list_start(&vmap_area_list, *pos);
				2685	}
				2686
				2687	static void s_next(struct seq_file m, void p, loff_t pos)
				2688	{
				2689	return seq_list_next(p, &vmap_area_list, pos);
				2690	}
				2691
				2692	static void s_stop(struct seq_file m, void p)
				2693	__releases(&vmap_area_lock)
				2694	{
				2695	spin_unlock(&vmap_area_lock);
				2696	}
				2697
				2698	static void show_numa_info(struct seq_file m, struct vm_struct v)
				2699	{
				2700	if (IS_ENABLED(CONFIG_NUMA)) {
				2701	unsigned int nr, *counters = m->private;
				2702
				2703	if (!counters)
				2704	return;
				2705
				2706	if (v->flags & VM_UNINITIALIZED)
				2707	return;
				2708	/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
				2709	smp_rmb();
				2710
				2711	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
				2712
				2713	for (nr = 0; nr < v->nr_pages; nr++)
				2714	counters[page_to_nid(v->pages[nr])]++;
				2715
				2716	for_each_node_state(nr, N_HIGH_MEMORY)
				2717	if (counters[nr])
				2718	seq_printf(m, " N%u=%u", nr, counters[nr]);
				2719	}
				2720	}
				2721
				2722	static int s_show(struct seq_file m, void p)
				2723	{
				2724	struct vmap_area *va;
				2725	struct vm_struct *v;
				2726
				2727	va = list_entry(p, struct vmap_area, list);
				2728
				2729	/*
				2730	* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
				2731	* behalf of vmap area is being tear down or vm_map_ram allocation.
				2732	*/
				2733	if (!(va->flags & VM_VM_AREA)) {
				2734	seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
				2735	(void )va->va_start, (void )va->va_end,
				2736	va->va_end - va->va_start,
				2737	va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
				2738
				2739	return 0;
				2740	}
				2741
				2742	v = va->vm;
				2743
				2744	seq_printf(m, "0x%pK-0x%pK %7ld",
				2745	v->addr, v->addr + v->size, v->size);
				2746
				2747	if (v->caller)
				2748	seq_printf(m, " %pS", v->caller);
				2749
				2750	if (v->nr_pages)
				2751	seq_printf(m, " pages=%d", v->nr_pages);
				2752
				2753	if (v->phys_addr)
				2754	seq_printf(m, " phys=%pa", &v->phys_addr);
				2755
				2756	if (v->flags & VM_IOREMAP)
				2757	seq_puts(m, " ioremap");
				2758
				2759	if (v->flags & VM_ALLOC)
				2760	seq_puts(m, " vmalloc");
				2761
				2762	if (v->flags & VM_MAP)
				2763	seq_puts(m, " vmap");
				2764
				2765	if (v->flags & VM_USERMAP)
				2766	seq_puts(m, " user");
				2767
				2768	if (is_vmalloc_addr(v->pages))
				2769	seq_puts(m, " vpages");
				2770
				2771	show_numa_info(m, v);
				2772	seq_putc(m, '\n');
				2773	return 0;
				2774	}
				2775
				2776	static const struct seq_operations vmalloc_op = {
				2777	.start = s_start,
				2778	.next = s_next,
				2779	.stop = s_stop,
				2780	.show = s_show,
				2781	};
				2782
				2783	static int vmalloc_open(struct inode inode, struct file file)
				2784	{
				2785	if (IS_ENABLED(CONFIG_NUMA))
				2786	return seq_open_private(file, &vmalloc_op,
				2787	nr_node_ids * sizeof(unsigned int));
				2788	else
				2789	return seq_open(file, &vmalloc_op);
				2790	}
				2791
				2792	static const struct file_operations proc_vmalloc_operations = {
				2793	.open = vmalloc_open,
				2794	.read = seq_read,
				2795	.llseek = seq_lseek,
				2796	.release = seq_release_private,
				2797	};
				2798
				2799	static int __init proc_vmalloc_init(void)
				2800	{
				2801	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
				2802	return 0;
				2803	}
				2804	module_init(proc_vmalloc_init);
				2805
				2806	#endif
				2807