/*
 * linux/mm/vmalloc.c
 *
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 */
10
11#include <linux/vmalloc.h>
12#include <linux/mm.h>
13#include <linux/module.h>
14#include <linux/highmem.h>
15#include <linux/sched/signal.h>
16#include <linux/slab.h>
17#include <linux/spinlock.h>
18#include <linux/interrupt.h>
19#include <linux/proc_fs.h>
20#include <linux/seq_file.h>
21#include <linux/debugobjects.h>
22#include <linux/kallsyms.h>
23#include <linux/list.h>
24#include <linux/notifier.h>
25#include <linux/rbtree.h>
26#include <linux/radix-tree.h>
27#include <linux/rcupdate.h>
28#include <linux/pfn.h>
29#include <linux/kmemleak.h>
30#include <linux/atomic.h>
31#include <linux/compiler.h>
32#include <linux/llist.h>
33#include <linux/bitops.h>
34
35#include <linux/uaccess.h>
36#include <asm/tlbflush.h>
37#include <asm/shmparam.h>
38
39#include "internal.h"
40
41struct vfree_deferred {
42 struct llist_head list;
43 struct work_struct wq;
44};
45static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
46
47static void __vunmap(const void *, int);
48
49static void free_work(struct work_struct *w)
50{
51 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
52 struct llist_node *t, *llnode;
53
54 llist_for_each_safe(llnode, t, llist_del_all(&p->list))
55 __vunmap((void *)llnode, 1);
56}
57
58/*** Page table manipulation functions ***/
59
60static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
61{
62 pte_t *pte;
63
64 pte = pte_offset_kernel(pmd, addr);
65 do {
66 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
67 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
68 } while (pte++, addr += PAGE_SIZE, addr != end);
69}
70
71static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
72{
73 pmd_t *pmd;
74 unsigned long next;
75
76 pmd = pmd_offset(pud, addr);
77 do {
78 next = pmd_addr_end(addr, end);
79 if (pmd_clear_huge(pmd))
80 continue;
81 if (pmd_none_or_clear_bad(pmd))
82 continue;
83 vunmap_pte_range(pmd, addr, next);
84 } while (pmd++, addr = next, addr != end);
85}
86
87static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
88{
89 pud_t *pud;
90 unsigned long next;
91
92 pud = pud_offset(p4d, addr);
93 do {
94 next = pud_addr_end(addr, end);
95 if (pud_clear_huge(pud))
96 continue;
97 if (pud_none_or_clear_bad(pud))
98 continue;
99 vunmap_pmd_range(pud, addr, next);
100 } while (pud++, addr = next, addr != end);
101}
102
103static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
104{
105 p4d_t *p4d;
106 unsigned long next;
107
108 p4d = p4d_offset(pgd, addr);
109 do {
110 next = p4d_addr_end(addr, end);
111 if (p4d_clear_huge(p4d))
112 continue;
113 if (p4d_none_or_clear_bad(p4d))
114 continue;
115 vunmap_pud_range(p4d, addr, next);
116 } while (p4d++, addr = next, addr != end);
117}
118
119static void vunmap_page_range(unsigned long addr, unsigned long end)
120{
121 pgd_t *pgd;
122 unsigned long next;
123
124 BUG_ON(addr >= end);
125 pgd = pgd_offset_k(addr);
126 do {
127 next = pgd_addr_end(addr, end);
128 if (pgd_none_or_clear_bad(pgd))
129 continue;
130 vunmap_p4d_range(pgd, addr, next);
131 } while (pgd++, addr = next, addr != end);
132}
133
134static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
135 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
136{
137 pte_t *pte;
138
139 /*
140 * nr is a running index into the array which helps higher level
141 * callers keep track of where we're up to.
142 */
143
144 pte = pte_alloc_kernel(pmd, addr);
145 if (!pte)
146 return -ENOMEM;
147 do {
148 struct page *page = pages[*nr];
149
150 if (WARN_ON(!pte_none(*pte)))
151 return -EBUSY;
152 if (WARN_ON(!page))
153 return -ENOMEM;
154 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
155 (*nr)++;
156 } while (pte++, addr += PAGE_SIZE, addr != end);
157 return 0;
158}
159
160static int vmap_pmd_range(pud_t *pud, unsigned long addr,
161 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
162{
163 pmd_t *pmd;
164 unsigned long next;
165
166 pmd = pmd_alloc(&init_mm, pud, addr);
167 if (!pmd)
168 return -ENOMEM;
169 do {
170 next = pmd_addr_end(addr, end);
171 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
172 return -ENOMEM;
173 } while (pmd++, addr = next, addr != end);
174 return 0;
175}
176
177static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
178 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
179{
180 pud_t *pud;
181 unsigned long next;
182
183 pud = pud_alloc(&init_mm, p4d, addr);
184 if (!pud)
185 return -ENOMEM;
186 do {
187 next = pud_addr_end(addr, end);
188 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
189 return -ENOMEM;
190 } while (pud++, addr = next, addr != end);
191 return 0;
192}
193
194static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
195 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
196{
197 p4d_t *p4d;
198 unsigned long next;
199
200 p4d = p4d_alloc(&init_mm, pgd, addr);
201 if (!p4d)
202 return -ENOMEM;
203 do {
204 next = p4d_addr_end(addr, end);
205 if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
206 return -ENOMEM;
207 } while (p4d++, addr = next, addr != end);
208 return 0;
209}
210
211/*
212 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
213 * will have pfns corresponding to the "pages" array.
214 *
215 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
216 */
217static int vmap_page_range_noflush(unsigned long start, unsigned long end,
218 pgprot_t prot, struct page **pages)
219{
220 pgd_t *pgd;
221 unsigned long next;
222 unsigned long addr = start;
223 int err = 0;
224 int nr = 0;
225
226 BUG_ON(addr >= end);
227 pgd = pgd_offset_k(addr);
228 do {
229 next = pgd_addr_end(addr, end);
230 err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
231 if (err)
232 return err;
233 } while (pgd++, addr = next, addr != end);
234
235 return nr;
236}
237
238static int vmap_page_range(unsigned long start, unsigned long end,
239 pgprot_t prot, struct page **pages)
240{
241 int ret;
242
243 ret = vmap_page_range_noflush(start, end, prot, pages);
244 flush_cache_vmap(start, end);
245 return ret;
246}
247
248int is_vmalloc_or_module_addr(const void *x)
249{
250 /*
251 * ARM, x86-64 and sparc64 put modules in a special place,
252 * and fall back on vmalloc() if that fails. Others
253 * just put it in the vmalloc space.
254 */
255#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
256 unsigned long addr = (unsigned long)x;
257 if (addr >= MODULES_VADDR && addr < MODULES_END)
258 return 1;
259#endif
260 return is_vmalloc_addr(x);
261}
262
263/*
264 * Walk a vmap address to the struct page it maps.
265 */
266struct page *vmalloc_to_page(const void *vmalloc_addr)
267{
268 unsigned long addr = (unsigned long) vmalloc_addr;
269 struct page *page = NULL;
270 pgd_t *pgd = pgd_offset_k(addr);
271 p4d_t *p4d;
272 pud_t *pud;
273 pmd_t *pmd;
274 pte_t *ptep, pte;
275
276 /*
277 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
278 * architectures that do not vmalloc module space
279 */
280 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
281
282 if (pgd_none(*pgd))
283 return NULL;
284 p4d = p4d_offset(pgd, addr);
285 if (p4d_none(*p4d))
286 return NULL;
287 pud = pud_offset(p4d, addr);
288
289 /*
290 * Don't dereference bad PUD or PMD (below) entries. This will also
291 * identify huge mappings, which we may encounter on architectures
292 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
293 * identified as vmalloc addresses by is_vmalloc_addr(), but are
294 * not [unambiguously] associated with a struct page, so there is
295 * no correct value to return for them.
296 */
297 WARN_ON_ONCE(pud_bad(*pud));
298 if (pud_none(*pud) || pud_bad(*pud))
299 return NULL;
300 pmd = pmd_offset(pud, addr);
301 WARN_ON_ONCE(pmd_bad(*pmd));
302 if (pmd_none(*pmd) || pmd_bad(*pmd))
303 return NULL;
304
305 ptep = pte_offset_map(pmd, addr);
306 pte = *ptep;
307 if (pte_present(pte))
308 page = pte_page(pte);
309 pte_unmap(ptep);
310 return page;
311}
312EXPORT_SYMBOL(vmalloc_to_page);
313
314/*
315 * Map a vmalloc()-space virtual address to the physical page frame number.
316 */
317unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
318{
319 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
320}
321EXPORT_SYMBOL(vmalloc_to_pfn);
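
/*
 * A minimal usage sketch for the two lookups above (the helper name
 * dump_first_page() is hypothetical, error handling elided):
 *
 *        static void dump_first_page(void)
 *        {
 *                void *buf = vmalloc(4 * PAGE_SIZE);
 *
 *                if (!buf)
 *                        return;
 *                pr_info("page %p pfn %lu\n",
 *                        vmalloc_to_page(buf), vmalloc_to_pfn(buf));
 *                vfree(buf);
 *        }
 */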
322
323
324/*** Global kva allocator ***/
325
326#define VM_LAZY_FREE 0x02
327#define VM_VM_AREA 0x04
328
329static DEFINE_SPINLOCK(vmap_area_lock);
330/* Export for kexec only */
331LIST_HEAD(vmap_area_list);
332static LLIST_HEAD(vmap_purge_list);
333static struct rb_root vmap_area_root = RB_ROOT;
334
335/* The vmap cache globals are protected by vmap_area_lock */
336static struct rb_node *free_vmap_cache;
337static unsigned long cached_hole_size;
338static unsigned long cached_vstart;
339static unsigned long cached_align;
340
341static unsigned long vmap_area_pcpu_hole;
342
343static atomic_long_t nr_vmalloc_pages;
344
345unsigned long vmalloc_nr_pages(void)
346{
347 return atomic_long_read(&nr_vmalloc_pages);
348}
349
350static struct vmap_area *__find_vmap_area(unsigned long addr)
351{
352 struct rb_node *n = vmap_area_root.rb_node;
353
354 while (n) {
355 struct vmap_area *va;
356
357 va = rb_entry(n, struct vmap_area, rb_node);
358 if (addr < va->va_start)
359 n = n->rb_left;
360 else if (addr >= va->va_end)
361 n = n->rb_right;
362 else
363 return va;
364 }
365
366 return NULL;
367}
368
369static void __insert_vmap_area(struct vmap_area *va)
370{
371 struct rb_node **p = &vmap_area_root.rb_node;
372 struct rb_node *parent = NULL;
373 struct rb_node *tmp;
374
375 while (*p) {
376 struct vmap_area *tmp_va;
377
378 parent = *p;
379 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
380 if (va->va_start < tmp_va->va_end)
381 p = &(*p)->rb_left;
382 else if (va->va_end > tmp_va->va_start)
383 p = &(*p)->rb_right;
384 else
385 BUG();
386 }
387
388 rb_link_node(&va->rb_node, parent, p);
389 rb_insert_color(&va->rb_node, &vmap_area_root);
390
391 /* address-sort this list */
392 tmp = rb_prev(&va->rb_node);
393 if (tmp) {
394 struct vmap_area *prev;
395 prev = rb_entry(tmp, struct vmap_area, rb_node);
396 list_add_rcu(&va->list, &prev->list);
397 } else
398 list_add_rcu(&va->list, &vmap_area_list);
399}
400
401static void purge_vmap_area_lazy(void);
402
403static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
404
/*
 * Allocate a region of KVA of the specified size and alignment, within
 * the range [vstart, vend).
 */
409static struct vmap_area *alloc_vmap_area(unsigned long size,
410 unsigned long align,
411 unsigned long vstart, unsigned long vend,
412 int node, gfp_t gfp_mask)
413{
414 struct vmap_area *va;
415 struct rb_node *n;
416 unsigned long addr;
417 int purged = 0;
418 struct vmap_area *first;
419
420 BUG_ON(!size);
421 BUG_ON(offset_in_page(size));
422 BUG_ON(!is_power_of_2(align));
423
424 might_sleep();
425
426 va = kmalloc_node(sizeof(struct vmap_area),
427 gfp_mask & GFP_RECLAIM_MASK, node);
428 if (unlikely(!va))
429 return ERR_PTR(-ENOMEM);
430
431 /*
432 * Only scan the relevant parts containing pointers to other objects
433 * to avoid false negatives.
434 */
435 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
436
437retry:
438 spin_lock(&vmap_area_lock);
439 /*
440 * Invalidate cache if we have more permissive parameters.
441 * cached_hole_size notes the largest hole noticed _below_
442 * the vmap_area cached in free_vmap_cache: if size fits
443 * into that hole, we want to scan from vstart to reuse
444 * the hole instead of allocating above free_vmap_cache.
445 * Note that __free_vmap_area may update free_vmap_cache
446 * without updating cached_hole_size or cached_align.
447 */
448 if (!free_vmap_cache ||
449 size < cached_hole_size ||
450 vstart < cached_vstart ||
451 align < cached_align) {
452nocache:
453 cached_hole_size = 0;
454 free_vmap_cache = NULL;
455 }
456 /* record if we encounter less permissive parameters */
457 cached_vstart = vstart;
458 cached_align = align;
459
460 /* find starting point for our search */
461 if (free_vmap_cache) {
462 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
463 addr = ALIGN(first->va_end, align);
464 if (addr < vstart)
465 goto nocache;
466 if (addr + size < addr)
467 goto overflow;
468
469 } else {
470 addr = ALIGN(vstart, align);
471 if (addr + size < addr)
472 goto overflow;
473
474 n = vmap_area_root.rb_node;
475 first = NULL;
476
477 while (n) {
478 struct vmap_area *tmp;
479 tmp = rb_entry(n, struct vmap_area, rb_node);
480 if (tmp->va_end >= addr) {
481 first = tmp;
482 if (tmp->va_start <= addr)
483 break;
484 n = n->rb_left;
485 } else
486 n = n->rb_right;
487 }
488
489 if (!first)
490 goto found;
491 }
492
493 /* from the starting point, walk areas until a suitable hole is found */
494 while (addr + size > first->va_start && addr + size <= vend) {
495 if (addr + cached_hole_size < first->va_start)
496 cached_hole_size = first->va_start - addr;
497 addr = ALIGN(first->va_end, align);
498 if (addr + size < addr)
499 goto overflow;
500
501 if (list_is_last(&first->list, &vmap_area_list))
502 goto found;
503
504 first = list_next_entry(first, list);
505 }
506
507found:
        /*
         * Also check the calculated address against vstart, because it
         * can wrap to 0 as a result of a large align request.
         */
512 if (addr + size > vend || addr < vstart)
513 goto overflow;
514
515 va->va_start = addr;
516 va->va_end = addr + size;
517 va->flags = 0;
518 __insert_vmap_area(va);
519 free_vmap_cache = &va->rb_node;
520 spin_unlock(&vmap_area_lock);
521
522 BUG_ON(!IS_ALIGNED(va->va_start, align));
523 BUG_ON(va->va_start < vstart);
524 BUG_ON(va->va_end > vend);
525
526 return va;
527
528overflow:
529 spin_unlock(&vmap_area_lock);
530 if (!purged) {
531 purge_vmap_area_lazy();
532 purged = 1;
533 goto retry;
534 }
535
536 if (gfpflags_allow_blocking(gfp_mask)) {
537 unsigned long freed = 0;
538 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
539 if (freed > 0) {
540 purged = 0;
541 goto retry;
542 }
543 }
544
545 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
546 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
547 size);
548 kfree(va);
549 return ERR_PTR(-EBUSY);
550}
551
552int register_vmap_purge_notifier(struct notifier_block *nb)
553{
554 return blocking_notifier_chain_register(&vmap_notify_list, nb);
555}
556EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
557
558int unregister_vmap_purge_notifier(struct notifier_block *nb)
559{
560 return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
561}
562EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
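
/*
 * A minimal sketch of how a purge notifier might be used (the names
 * my_purge_cb, my_cache_shrink and my_nb are hypothetical). The callback
 * is invoked when a vmap allocation fails and reports how much it freed
 * through the unsigned long pointed to by @ptr:
 *
 *        static int my_purge_cb(struct notifier_block *nb,
 *                               unsigned long action, void *ptr)
 *        {
 *                unsigned long *freed = ptr;
 *
 *                *freed += my_cache_shrink();
 *                return NOTIFY_OK;
 *        }
 *
 *        static struct notifier_block my_nb = {
 *                .notifier_call = my_purge_cb,
 *        };
 *
 *        register_vmap_purge_notifier(&my_nb);
 */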
563
564static void __free_vmap_area(struct vmap_area *va)
565{
566 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
567
568 if (free_vmap_cache) {
569 if (va->va_end < cached_vstart) {
570 free_vmap_cache = NULL;
571 } else {
572 struct vmap_area *cache;
573 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
574 if (va->va_start <= cache->va_start) {
575 free_vmap_cache = rb_prev(&va->rb_node);
576 /*
577 * We don't try to update cached_hole_size or
578 * cached_align, but it won't go very wrong.
579 */
580 }
581 }
582 }
583 rb_erase(&va->rb_node, &vmap_area_root);
584 RB_CLEAR_NODE(&va->rb_node);
585 list_del_rcu(&va->list);
586
        /*
         * Track the highest possible candidate for pcpu area
         * allocation. Areas outside of the vmalloc area can be returned
         * here too, so consider only end addresses which fall inside
         * the vmalloc area proper.
         */
593 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
594 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
595
596 kfree_rcu(va, rcu_head);
597}
598
599/*
600 * Free a region of KVA allocated by alloc_vmap_area
601 */
602static void free_vmap_area(struct vmap_area *va)
603{
604 spin_lock(&vmap_area_lock);
605 __free_vmap_area(va);
606 spin_unlock(&vmap_area_lock);
607}
608
609/*
610 * Clear the pagetable entries of a given vmap_area
611 */
612static void unmap_vmap_area(struct vmap_area *va)
613{
614 vunmap_page_range(va->va_start, va->va_end);
615}
616
617/*
618 * lazy_max_pages is the maximum amount of virtual address space we gather up
619 * before attempting to purge with a TLB flush.
620 *
621 * There is a tradeoff here: a larger number will cover more kernel page tables
622 * and take slightly longer to purge, but it will linearly reduce the number of
623 * global TLB flushes that must be performed. It would seem natural to scale
624 * this number up linearly with the number of CPUs (because vmapping activity
625 * could also scale linearly with the number of CPUs), however it is likely
626 * that in practice, workloads might be constrained in other ways that mean
627 * vmap activity will not scale linearly with CPUs. Also, I want to be
628 * conservative and not introduce a big latency on huge systems, so go with
629 * a less aggressive log scale. It will still be an improvement over the old
630 * code, and it will be simple to change the scale factor if we find that it
631 * becomes a problem on bigger systems.
632 */
633static unsigned long lazy_max_pages(void)
634{
635 unsigned int log;
636
637 log = fls(num_online_cpus());
638
639 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
640}
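
/*
 * Worked example, assuming 4K pages: with 16 online CPUs, fls(16) == 5,
 * so lazy_max_pages() == 5 * (32MB / 4KB) == 40960 pages, i.e. roughly
 * 160MB of lazily freed KVA is gathered before a purge is attempted.
 */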
641
642static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
643
/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make pcpu_get_vm_areas() more deterministic.
 */
649static DEFINE_MUTEX(vmap_purge_lock);
650
651/* for per-CPU blocks */
652static void purge_fragmented_blocks_allcpus(void);
653
/*
 * Called before a call to iounmap() if the caller wants the vm_area_struct
 * to be freed immediately.
 */
658void set_iounmap_nonlazy(void)
659{
660 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
661}
662
663/*
664 * Purges all lazily-freed vmap areas.
665 */
666static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
667{
668 struct llist_node *valist;
669 struct vmap_area *va;
670 struct vmap_area *n_va;
671 bool do_free = false;
672
673 lockdep_assert_held(&vmap_purge_lock);
674
675 valist = llist_del_all(&vmap_purge_list);
676 llist_for_each_entry(va, valist, purge_list) {
677 if (va->va_start < start)
678 start = va->va_start;
679 if (va->va_end > end)
680 end = va->va_end;
681 do_free = true;
682 }
683
684 if (!do_free)
685 return false;
686
687 flush_tlb_kernel_range(start, end);
688
689 spin_lock(&vmap_area_lock);
690 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
691 int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
692
693 __free_vmap_area(va);
694 atomic_sub(nr, &vmap_lazy_nr);
695 cond_resched_lock(&vmap_area_lock);
696 }
697 spin_unlock(&vmap_area_lock);
698 return true;
699}
700
701/*
702 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
703 * is already purging.
704 */
705static void try_purge_vmap_area_lazy(void)
706{
707 if (mutex_trylock(&vmap_purge_lock)) {
708 __purge_vmap_area_lazy(ULONG_MAX, 0);
709 mutex_unlock(&vmap_purge_lock);
710 }
711}
712
713/*
714 * Kick off a purge of the outstanding lazy areas.
715 */
716static void purge_vmap_area_lazy(void)
717{
718 mutex_lock(&vmap_purge_lock);
719 purge_fragmented_blocks_allcpus();
720 __purge_vmap_area_lazy(ULONG_MAX, 0);
721 mutex_unlock(&vmap_purge_lock);
722}
723
/*
 * Free a vmap area; the caller must ensure that the area has been unmapped
 * and that flush_cache_vunmap() has already been called for the correct
 * range.
 */
729static void free_vmap_area_noflush(struct vmap_area *va)
730{
731 int nr_lazy;
732
733 nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
734 &vmap_lazy_nr);
735
736 /* After this point, we may free va at any time */
737 llist_add(&va->purge_list, &vmap_purge_list);
738
739 if (unlikely(nr_lazy > lazy_max_pages()))
740 try_purge_vmap_area_lazy();
741}
742
743/*
744 * Free and unmap a vmap area
745 */
746static void free_unmap_vmap_area(struct vmap_area *va)
747{
748 flush_cache_vunmap(va->va_start, va->va_end);
749 unmap_vmap_area(va);
750 if (debug_pagealloc_enabled())
751 flush_tlb_kernel_range(va->va_start, va->va_end);
752
753 free_vmap_area_noflush(va);
754}
755
756static struct vmap_area *find_vmap_area(unsigned long addr)
757{
758 struct vmap_area *va;
759
760 spin_lock(&vmap_area_lock);
761 va = __find_vmap_area(addr);
762 spin_unlock(&vmap_area_lock);
763
764 return va;
765}
766
767/*** Per cpu kva allocator ***/
768
769/*
770 * vmap space is limited especially on 32 bit architectures. Ensure there is
771 * room for at least 16 percpu vmap blocks per CPU.
772 */
/*
 * If VMALLOC_START and VMALLOC_END were constants, we'd like to be able
 * to #define VMALLOC_SPACE as (VMALLOC_END - VMALLOC_START). Guess
 * instead (we just need a rough idea).
 */
778#if BITS_PER_LONG == 32
779#define VMALLOC_SPACE (128UL*1024*1024)
780#else
781#define VMALLOC_SPACE (128UL*1024*1024*1024)
782#endif
783
784#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
785#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
786#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
787#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
788#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
789#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
790#define VMAP_BBMAP_BITS \
791 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
792 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
793 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
794
795#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
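
/*
 * Worked example, assuming a 64-bit build with 4K pages and NR_CPUS == 64:
 * VMALLOC_PAGES == 128G / 4K == 32M, so VMALLOC_PAGES / 64 / 16 == 32768,
 * which is clamped down to VMAP_BBMAP_BITS_MAX == 1024. VMAP_BLOCK_SIZE is
 * then 1024 * 4K == 4MB per vmap block.
 */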
796
797static bool vmap_initialized __read_mostly = false;
798
799struct vmap_block_queue {
800 spinlock_t lock;
801 struct list_head free;
802};
803
804struct vmap_block {
805 spinlock_t lock;
806 struct vmap_area *va;
807 unsigned long free, dirty;
808 unsigned long dirty_min, dirty_max; /*< dirty range */
809 struct list_head free_list;
810 struct rcu_head rcu_head;
811 struct list_head purge;
812};
813
814/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
815static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
816
817/*
818 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
819 * in the free path. Could get rid of this if we change the API to return a
820 * "cookie" from alloc, to be passed to free. But no big deal yet.
821 */
822static DEFINE_SPINLOCK(vmap_block_tree_lock);
823static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
824
825/*
826 * We should probably have a fallback mechanism to allocate virtual memory
827 * out of partially filled vmap blocks. However vmap block sizing should be
828 * fairly reasonable according to the vmalloc size, so it shouldn't be a
829 * big problem.
830 */
831
832static unsigned long addr_to_vb_idx(unsigned long addr)
833{
834 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
835 addr /= VMAP_BLOCK_SIZE;
836 return addr;
837}
838
839static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
840{
841 unsigned long addr;
842
843 addr = va_start + (pages_off << PAGE_SHIFT);
844 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
845 return (void *)addr;
846}
847
/**
 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it.
 * The number of pages can of course not exceed VMAP_BBMAP_BITS.
 * @order: allocate 2^order pages in the newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
856static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
857{
858 struct vmap_block_queue *vbq;
859 struct vmap_block *vb;
860 struct vmap_area *va;
861 unsigned long vb_idx;
862 int node, err;
863 void *vaddr;
864
865 node = numa_node_id();
866
867 vb = kmalloc_node(sizeof(struct vmap_block),
868 gfp_mask & GFP_RECLAIM_MASK, node);
869 if (unlikely(!vb))
870 return ERR_PTR(-ENOMEM);
871
872 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
873 VMALLOC_START, VMALLOC_END,
874 node, gfp_mask);
875 if (IS_ERR(va)) {
876 kfree(vb);
877 return ERR_CAST(va);
878 }
879
880 err = radix_tree_preload(gfp_mask);
881 if (unlikely(err)) {
882 kfree(vb);
883 free_vmap_area(va);
884 return ERR_PTR(err);
885 }
886
887 vaddr = vmap_block_vaddr(va->va_start, 0);
888 spin_lock_init(&vb->lock);
889 vb->va = va;
890 /* At least something should be left free */
891 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
892 vb->free = VMAP_BBMAP_BITS - (1UL << order);
893 vb->dirty = 0;
894 vb->dirty_min = VMAP_BBMAP_BITS;
895 vb->dirty_max = 0;
896 INIT_LIST_HEAD(&vb->free_list);
897
898 vb_idx = addr_to_vb_idx(va->va_start);
899 spin_lock(&vmap_block_tree_lock);
900 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
901 spin_unlock(&vmap_block_tree_lock);
902 BUG_ON(err);
903 radix_tree_preload_end();
904
905 vbq = &get_cpu_var(vmap_block_queue);
906 spin_lock(&vbq->lock);
907 list_add_tail_rcu(&vb->free_list, &vbq->free);
908 spin_unlock(&vbq->lock);
909 put_cpu_var(vmap_block_queue);
910
911 return vaddr;
912}
913
914static void free_vmap_block(struct vmap_block *vb)
915{
916 struct vmap_block *tmp;
917 unsigned long vb_idx;
918
919 vb_idx = addr_to_vb_idx(vb->va->va_start);
920 spin_lock(&vmap_block_tree_lock);
921 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
922 spin_unlock(&vmap_block_tree_lock);
923 BUG_ON(tmp != vb);
924
925 free_vmap_area_noflush(vb->va);
926 kfree_rcu(vb, rcu_head);
927}
928
929static void purge_fragmented_blocks(int cpu)
930{
931 LIST_HEAD(purge);
932 struct vmap_block *vb;
933 struct vmap_block *n_vb;
934 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
935
936 rcu_read_lock();
937 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
938
939 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
940 continue;
941
942 spin_lock(&vb->lock);
943 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
944 vb->free = 0; /* prevent further allocs after releasing lock */
945 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
946 vb->dirty_min = 0;
947 vb->dirty_max = VMAP_BBMAP_BITS;
948 spin_lock(&vbq->lock);
949 list_del_rcu(&vb->free_list);
950 spin_unlock(&vbq->lock);
951 spin_unlock(&vb->lock);
952 list_add_tail(&vb->purge, &purge);
953 } else
954 spin_unlock(&vb->lock);
955 }
956 rcu_read_unlock();
957
958 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
959 list_del(&vb->purge);
960 free_vmap_block(vb);
961 }
962}
963
964static void purge_fragmented_blocks_allcpus(void)
965{
966 int cpu;
967
968 for_each_possible_cpu(cpu)
969 purge_fragmented_blocks(cpu);
970}
971
972static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
973{
974 struct vmap_block_queue *vbq;
975 struct vmap_block *vb;
976 void *vaddr = NULL;
977 unsigned int order;
978
979 BUG_ON(offset_in_page(size));
980 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
981 if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what the caller wants since
                 * get_order(0) returns a funny result. Just warn and
                 * terminate early.
                 */
987 return NULL;
988 }
989 order = get_order(size);
990
991 rcu_read_lock();
992 vbq = &get_cpu_var(vmap_block_queue);
993 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
994 unsigned long pages_off;
995
996 spin_lock(&vb->lock);
997 if (vb->free < (1UL << order)) {
998 spin_unlock(&vb->lock);
999 continue;
1000 }
1001
1002 pages_off = VMAP_BBMAP_BITS - vb->free;
1003 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
1004 vb->free -= 1UL << order;
1005 if (vb->free == 0) {
1006 spin_lock(&vbq->lock);
1007 list_del_rcu(&vb->free_list);
1008 spin_unlock(&vbq->lock);
1009 }
1010
1011 spin_unlock(&vb->lock);
1012 break;
1013 }
1014
1015 put_cpu_var(vmap_block_queue);
1016 rcu_read_unlock();
1017
1018 /* Allocate new block if nothing was found */
1019 if (!vaddr)
1020 vaddr = new_vmap_block(order, gfp_mask);
1021
1022 return vaddr;
1023}
1024
1025static void vb_free(const void *addr, unsigned long size)
1026{
1027 unsigned long offset;
1028 unsigned long vb_idx;
1029 unsigned int order;
1030 struct vmap_block *vb;
1031
1032 BUG_ON(offset_in_page(size));
1033 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
1034
1035 flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
1036
1037 order = get_order(size);
1038
1039 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1040 offset >>= PAGE_SHIFT;
1041
1042 vb_idx = addr_to_vb_idx((unsigned long)addr);
1043 rcu_read_lock();
1044 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
1045 rcu_read_unlock();
1046 BUG_ON(!vb);
1047
1048 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
1049
1050 if (debug_pagealloc_enabled())
1051 flush_tlb_kernel_range((unsigned long)addr,
1052 (unsigned long)addr + size);
1053
1054 spin_lock(&vb->lock);
1055
1056 /* Expand dirty range */
1057 vb->dirty_min = min(vb->dirty_min, offset);
1058 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
1059
1060 vb->dirty += 1UL << order;
1061 if (vb->dirty == VMAP_BBMAP_BITS) {
1062 BUG_ON(vb->free);
1063 spin_unlock(&vb->lock);
1064 free_vmap_block(vb);
1065 } else
1066 spin_unlock(&vb->lock);
1067}
1068
1069/**
1070 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1071 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now may, in a former life, have been mapped into a kernel virtual
 * address by the vmap layer, and so there might be some CPUs with TLB entries
 * still referencing that page (in addition to the regular 1:1 kernel mapping).
1077 *
1078 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1079 * be sure that none of the pages we have control over will have any aliases
1080 * from the vmap layer.
1081 */
1082void vm_unmap_aliases(void)
1083{
1084 unsigned long start = ULONG_MAX, end = 0;
1085 int cpu;
1086 int flush = 0;
1087
1088 if (unlikely(!vmap_initialized))
1089 return;
1090
1091 might_sleep();
1092
1093 for_each_possible_cpu(cpu) {
1094 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1095 struct vmap_block *vb;
1096
1097 rcu_read_lock();
1098 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1099 spin_lock(&vb->lock);
1100 if (vb->dirty) {
1101 unsigned long va_start = vb->va->va_start;
1102 unsigned long s, e;
1103
1104 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1105 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1106
1107 start = min(s, start);
1108 end = max(e, end);
1109
1110 flush = 1;
1111 }
1112 spin_unlock(&vb->lock);
1113 }
1114 rcu_read_unlock();
1115 }
1116
1117 mutex_lock(&vmap_purge_lock);
1118 purge_fragmented_blocks_allcpus();
1119 if (!__purge_vmap_area_lazy(start, end) && flush)
1120 flush_tlb_kernel_range(start, end);
1121 mutex_unlock(&vmap_purge_lock);
1122}
1123EXPORT_SYMBOL_GPL(vm_unmap_aliases);
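
/*
 * A minimal sketch, assuming a caller that previously exposed its pages
 * through vm_map_ram() and now wants to be sure no lazy vmap aliases
 * remain before changing their caching attributes (the helper
 * my_change_page_attrs() is hypothetical):
 *
 *        vm_unmap_ram(vaddr, nr_pages);
 *        vm_unmap_aliases();
 *        my_change_page_attrs(pages, nr_pages);
 */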
1124
1125/**
1126 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1127 * @mem: the pointer returned by vm_map_ram
1128 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1129 */
1130void vm_unmap_ram(const void *mem, unsigned int count)
1131{
1132 unsigned long size = (unsigned long)count << PAGE_SHIFT;
1133 unsigned long addr = (unsigned long)mem;
1134 struct vmap_area *va;
1135
1136 might_sleep();
1137 BUG_ON(!addr);
1138 BUG_ON(addr < VMALLOC_START);
1139 BUG_ON(addr > VMALLOC_END);
1140 BUG_ON(!PAGE_ALIGNED(addr));
1141
1142 if (likely(count <= VMAP_MAX_ALLOC)) {
1143 debug_check_no_locks_freed(mem, size);
1144 vb_free(mem, size);
1145 return;
1146 }
1147
1148 va = find_vmap_area(addr);
1149 BUG_ON(!va);
1150 debug_check_no_locks_freed((void *)va->va_start,
1151 (va->va_end - va->va_start));
1152 free_unmap_vmap_area(va);
1153}
1154EXPORT_SYMBOL(vm_unmap_ram);
1155
1156/**
1157 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1158 * @pages: an array of pointers to the pages to be mapped
1159 * @count: number of pages
1160 * @node: prefer to allocate data structures on this node
1161 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
1162 *
 * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
 * faster than vmap(). But if you mix long-lived and short-lived objects with
 * vm_map_ram(), it can consume lots of address space through fragmentation
 * (especially on a 32bit machine) and you could eventually see allocation
 * failures. Please use this function only for short-lived objects.
1168 *
1169 * Returns: a pointer to the address that has been mapped, or %NULL on failure
1170 */
1171void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
1172{
1173 unsigned long size = (unsigned long)count << PAGE_SHIFT;
1174 unsigned long addr;
1175 void *mem;
1176
1177 if (likely(count <= VMAP_MAX_ALLOC)) {
1178 mem = vb_alloc(size, GFP_KERNEL);
1179 if (IS_ERR(mem))
1180 return NULL;
1181 addr = (unsigned long)mem;
1182 } else {
1183 struct vmap_area *va;
1184 va = alloc_vmap_area(size, PAGE_SIZE,
1185 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1186 if (IS_ERR(va))
1187 return NULL;
1188
1189 addr = va->va_start;
1190 mem = (void *)addr;
1191 }
1192 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
1193 vm_unmap_ram(mem, count);
1194 return NULL;
1195 }
1196 return mem;
1197}
1198EXPORT_SYMBOL(vm_map_ram);
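
/*
 * A minimal usage sketch for the pair above; the page array "pages" and
 * its length "nr_pages" are assumed to have been filled by the caller,
 * e.g. via alloc_page():
 *
 *        void *va = vm_map_ram(pages, nr_pages, NUMA_NO_NODE, PAGE_KERNEL);
 *
 *        if (va) {
 *                memset(va, 0, nr_pages * PAGE_SIZE);
 *                vm_unmap_ram(va, nr_pages);
 *        }
 */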
1199
1200static struct vm_struct *vmlist __initdata;
1201/**
1202 * vm_area_add_early - add vmap area early during boot
1203 * @vm: vm_struct to add
1204 *
 * This function is used to add a fixed kernel vm area to vmlist before
1206 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
1207 * should contain proper values and the other fields should be zero.
1208 *
1209 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1210 */
1211void __init vm_area_add_early(struct vm_struct *vm)
1212{
1213 struct vm_struct *tmp, **p;
1214
1215 BUG_ON(vmap_initialized);
1216 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1217 if (tmp->addr >= vm->addr) {
1218 BUG_ON(tmp->addr < vm->addr + vm->size);
1219 break;
1220 } else
1221 BUG_ON(tmp->addr + tmp->size > vm->addr);
1222 }
1223 vm->next = *p;
1224 *p = vm;
1225}
1226
1227/**
1228 * vm_area_register_early - register vmap area early during boot
1229 * @vm: vm_struct to register
1230 * @align: requested alignment
1231 *
 * This function is used to register a kernel vm area before
1233 * vmalloc_init() is called. @vm->size and @vm->flags should contain
1234 * proper values on entry and other fields should be zero. On return,
1235 * vm->addr contains the allocated address.
1236 *
1237 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1238 */
1239void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1240{
1241 static size_t vm_init_off __initdata;
1242 unsigned long addr;
1243
1244 addr = ALIGN(VMALLOC_START + vm_init_off, align);
1245 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1246
1247 vm->addr = (void *)addr;
1248
1249 vm_area_add_early(vm);
1250}
1251
1252void __init vmalloc_init(void)
1253{
1254 struct vmap_area *va;
1255 struct vm_struct *tmp;
1256 int i;
1257
1258 for_each_possible_cpu(i) {
1259 struct vmap_block_queue *vbq;
1260 struct vfree_deferred *p;
1261
1262 vbq = &per_cpu(vmap_block_queue, i);
1263 spin_lock_init(&vbq->lock);
1264 INIT_LIST_HEAD(&vbq->free);
1265 p = &per_cpu(vfree_deferred, i);
1266 init_llist_head(&p->list);
1267 INIT_WORK(&p->wq, free_work);
1268 }
1269
1270 /* Import existing vmlist entries. */
1271 for (tmp = vmlist; tmp; tmp = tmp->next) {
1272 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1273 va->flags = VM_VM_AREA;
1274 va->va_start = (unsigned long)tmp->addr;
1275 va->va_end = va->va_start + tmp->size;
1276 va->vm = tmp;
1277 __insert_vmap_area(va);
1278 }
1279
1280 vmap_area_pcpu_hole = VMALLOC_END;
1281
1282 vmap_initialized = true;
1283}
1284
1285/**
1286 * map_kernel_range_noflush - map kernel VM area with the specified pages
1287 * @addr: start of the VM area to map
1288 * @size: size of the VM area to map
1289 * @prot: page protection flags to use
1290 * @pages: pages to map
1291 *
1292 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1293 * specify should have been allocated using get_vm_area() and its
1294 * friends.
1295 *
1296 * NOTE:
1297 * This function does NOT do any cache flushing. The caller is
1298 * responsible for calling flush_cache_vmap() on to-be-mapped areas
1299 * before calling this function.
1300 *
1301 * RETURNS:
1302 * The number of pages mapped on success, -errno on failure.
1303 */
1304int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1305 pgprot_t prot, struct page **pages)
1306{
1307 return vmap_page_range_noflush(addr, addr + size, prot, pages);
1308}
1309
1310/**
1311 * unmap_kernel_range_noflush - unmap kernel VM area
1312 * @addr: start of the VM area to unmap
1313 * @size: size of the VM area to unmap
1314 *
1315 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1316 * specify should have been allocated using get_vm_area() and its
1317 * friends.
1318 *
1319 * NOTE:
1320 * This function does NOT do any cache flushing. The caller is
 * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
1322 * before calling this function and flush_tlb_kernel_range() after.
1323 */
1324void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1325{
1326 vunmap_page_range(addr, addr + size);
1327}
1328EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
1329
1330/**
1331 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1332 * @addr: start of the VM area to unmap
1333 * @size: size of the VM area to unmap
1334 *
 * Similar to unmap_kernel_range_noflush() but flushes the virtual cache
 * before the unmapping and the TLB after.
1337 */
1338void unmap_kernel_range(unsigned long addr, unsigned long size)
1339{
1340 unsigned long end = addr + size;
1341
1342 flush_cache_vunmap(addr, end);
1343 vunmap_page_range(addr, end);
1344 flush_tlb_kernel_range(addr, end);
1345}
1346EXPORT_SYMBOL_GPL(unmap_kernel_range);
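
/*
 * A minimal sketch of the noflush + explicit-flush pattern used with the
 * helpers above. The vm area "area" and the page array "pages" are assumed
 * to have been set up by the caller (e.g. via get_vm_area()); the final
 * unmap_kernel_range() call flushes the cache and TLB itself.
 *
 *        unsigned long addr = (unsigned long)area->addr;
 *        unsigned long size = get_vm_area_size(area);
 *
 *        if (map_kernel_range_noflush(addr, size, PAGE_KERNEL, pages) > 0)
 *                flush_cache_vmap(addr, addr + size);
 *        ...
 *        unmap_kernel_range(addr, size);
 */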
1347
1348int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
1349{
1350 unsigned long addr = (unsigned long)area->addr;
1351 unsigned long end = addr + get_vm_area_size(area);
1352 int err;
1353
1354 err = vmap_page_range(addr, end, prot, pages);
1355
1356 return err > 0 ? 0 : err;
1357}
1358EXPORT_SYMBOL_GPL(map_vm_area);
1359
1360static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1361 unsigned long flags, const void *caller)
1362{
1363 spin_lock(&vmap_area_lock);
1364 vm->flags = flags;
1365 vm->addr = (void *)va->va_start;
1366 vm->size = va->va_end - va->va_start;
1367 vm->caller = caller;
1368 va->vm = vm;
1369 va->flags |= VM_VM_AREA;
1370 spin_unlock(&vmap_area_lock);
1371}
1372
1373static void clear_vm_uninitialized_flag(struct vm_struct *vm)
1374{
1375 /*
1376 * Before removing VM_UNINITIALIZED,
1377 * we should make sure that vm has proper values.
1378 * Pair with smp_rmb() in show_numa_info().
1379 */
1380 smp_wmb();
1381 vm->flags &= ~VM_UNINITIALIZED;
1382}
1383
1384static struct vm_struct *__get_vm_area_node(unsigned long size,
1385 unsigned long align, unsigned long flags, unsigned long start,
1386 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1387{
1388 struct vmap_area *va;
1389 struct vm_struct *area;
1390
1391 BUG_ON(in_interrupt());
1392 size = PAGE_ALIGN(size);
1393 if (unlikely(!size))
1394 return NULL;
1395
1396 if (flags & VM_IOREMAP)
1397 align = 1ul << clamp_t(int, get_count_order_long(size),
1398 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1399
1400 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1401 if (unlikely(!area))
1402 return NULL;
1403
1404 if (!(flags & VM_NO_GUARD))
1405 size += PAGE_SIZE;
1406
1407 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1408 if (IS_ERR(va)) {
1409 kfree(area);
1410 return NULL;
1411 }
1412
1413 setup_vmalloc_vm(area, va, flags, caller);
1414
1415 return area;
1416}
1417
1418struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1419 unsigned long start, unsigned long end)
1420{
1421 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1422 GFP_KERNEL, __builtin_return_address(0));
1423}
1424EXPORT_SYMBOL_GPL(__get_vm_area);
1425
1426struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1427 unsigned long start, unsigned long end,
1428 const void *caller)
1429{
1430 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1431 GFP_KERNEL, caller);
1432}
1433
1434/**
1435 * get_vm_area - reserve a contiguous kernel virtual area
1436 * @size: size of the area
1437 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
1438 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes. Returns the area descriptor
 * on success or %NULL on failure.
1442 */
1443struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1444{
1445 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1446 NUMA_NO_NODE, GFP_KERNEL,
1447 __builtin_return_address(0));
1448}
1449
1450struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1451 const void *caller)
1452{
1453 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1454 NUMA_NO_NODE, GFP_KERNEL, caller);
1455}
1456
1457/**
 * find_vm_area - find a contiguous kernel virtual area
1459 * @addr: base address
1460 *
1461 * Search for the kernel VM area starting at @addr, and return it.
1462 * It is up to the caller to do all required locking to keep the returned
1463 * pointer valid.
1464 */
1465struct vm_struct *find_vm_area(const void *addr)
1466{
1467 struct vmap_area *va;
1468
1469 va = find_vmap_area((unsigned long)addr);
1470 if (va && va->flags & VM_VM_AREA)
1471 return va->vm;
1472
1473 return NULL;
1474}
1475
1476/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
1478 * @addr: base address
1479 *
1480 * Search for the kernel VM area starting at @addr, and remove it.
1481 * This function returns the found VM area, but using it is NOT safe
1482 * on SMP machines, except for its size or flags.
1483 */
1484struct vm_struct *remove_vm_area(const void *addr)
1485{
1486 struct vmap_area *va;
1487
1488 might_sleep();
1489
1490 va = find_vmap_area((unsigned long)addr);
1491 if (va && va->flags & VM_VM_AREA) {
1492 struct vm_struct *vm = va->vm;
1493
1494 spin_lock(&vmap_area_lock);
1495 va->vm = NULL;
1496 va->flags &= ~VM_VM_AREA;
1497 va->flags |= VM_LAZY_FREE;
1498 spin_unlock(&vmap_area_lock);
1499
1500 kasan_free_shadow(vm);
1501 free_unmap_vmap_area(va);
1502
1503 return vm;
1504 }
1505 return NULL;
1506}
1507
1508static void __vunmap(const void *addr, int deallocate_pages)
1509{
1510 struct vm_struct *area;
1511
1512 if (!addr)
1513 return;
1514
1515 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1516 addr))
1517 return;
1518
1519 area = find_vmap_area((unsigned long)addr)->vm;
1520 if (unlikely(!area)) {
1521 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1522 addr);
1523 return;
1524 }
1525
1526 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
1527 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
1528
1529 remove_vm_area(addr);
1530 if (deallocate_pages) {
1531 int i;
1532
1533 for (i = 0; i < area->nr_pages; i++) {
1534 struct page *page = area->pages[i];
1535
1536 BUG_ON(!page);
1537 __free_pages(page, 0);
1538 }
1539 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
1540
1541 kvfree(area->pages);
1542 }
1543
1544 kfree(area);
1545 return;
1546}
1547
1548static inline void __vfree_deferred(const void *addr)
1549{
        /*
         * Use raw_cpu_ptr() because this can be called from preemptible
         * context. Preemption is absolutely fine here, because the llist_add()
         * implementation is lockless, so it works even if we are adding to
         * another CPU's list. schedule_work() should be fine with this too.
         */
1556 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
1557
1558 if (llist_add((struct llist_node *)addr, &p->list))
1559 schedule_work(&p->wq);
1560}
1561
1562/**
1563 * vfree_atomic - release memory allocated by vmalloc()
1564 * @addr: memory base address
1565 *
1566 * This one is just like vfree() but can be called in any atomic context
1567 * except NMIs.
1568 */
1569void vfree_atomic(const void *addr)
1570{
1571 BUG_ON(in_nmi());
1572
1573 kmemleak_free(addr);
1574
1575 if (!addr)
1576 return;
1577 __vfree_deferred(addr);
1578}
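
/*
 * A minimal sketch of vfree_atomic() use, assuming a hypothetical driver
 * that must drop a vmalloc()'ed buffer from inside a spinlocked (atomic)
 * section; the fields dev->lock and dev->staging are made up for the
 * example:
 *
 *        spin_lock(&dev->lock);
 *        buf = dev->staging;
 *        dev->staging = NULL;
 *        vfree_atomic(buf);
 *        spin_unlock(&dev->lock);
 */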
1579
1580/**
1581 * vfree - release memory allocated by vmalloc()
1582 * @addr: memory base address
1583 *
 * Free the virtually contiguous memory area starting at @addr, as
 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 * NULL, no operation is performed.
 *
 * Must not be called in NMI context (strictly speaking, only if we don't
 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
1591 *
1592 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
1593 */
1594void vfree(const void *addr)
1595{
1596 BUG_ON(in_nmi());
1597
1598 kmemleak_free(addr);
1599
1600 if (!addr)
1601 return;
1602 if (unlikely(in_interrupt()))
1603 __vfree_deferred(addr);
1604 else
1605 __vunmap(addr, 1);
1606}
1607EXPORT_SYMBOL(vfree);
1608
1609/**
1610 * vunmap - release virtual mapping obtained by vmap()
1611 * @addr: memory base address
1612 *
1613 * Free the virtually contiguous memory area starting at @addr,
1614 * which was created from the page array passed to vmap().
1615 *
1616 * Must not be called in interrupt context.
1617 */
1618void vunmap(const void *addr)
1619{
1620 BUG_ON(in_interrupt());
1621 might_sleep();
1622 if (addr)
1623 __vunmap(addr, 0);
1624}
1625EXPORT_SYMBOL(vunmap);
1626
1627/**
1628 * vmap - map an array of pages into virtually contiguous space
1629 * @pages: array of page pointers
1630 * @count: number of pages to map
1631 * @flags: vm_area->flags
1632 * @prot: page protection for the mapping
1633 *
1634 * Maps @count pages from @pages into contiguous kernel virtual
1635 * space.
1636 */
1637void *vmap(struct page **pages, unsigned int count,
1638 unsigned long flags, pgprot_t prot)
1639{
1640 struct vm_struct *area;
1641 unsigned long size; /* In bytes */
1642
1643 might_sleep();
1644
1645 if (count > totalram_pages)
1646 return NULL;
1647
1648 size = (unsigned long)count << PAGE_SHIFT;
1649 area = get_vm_area_caller(size, flags, __builtin_return_address(0));
1650 if (!area)
1651 return NULL;
1652
1653 if (map_vm_area(area, prot, pages)) {
1654 vunmap(area->addr);
1655 return NULL;
1656 }
1657
1658 return area->addr;
1659}
1660EXPORT_SYMBOL(vmap);
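
/*
 * A minimal usage sketch for vmap()/vunmap(), assuming the caller owns a
 * small page array filled via alloc_page() (error handling elided):
 *
 *        struct page *pages[4];
 *        void *va;
 *        int i;
 *
 *        for (i = 0; i < 4; i++)
 *                pages[i] = alloc_page(GFP_KERNEL);
 *        va = vmap(pages, 4, VM_MAP, PAGE_KERNEL);
 *        if (va) {
 *                memset(va, 0, 4 * PAGE_SIZE);
 *                vunmap(va);
 *        }
 *        for (i = 0; i < 4; i++)
 *                __free_page(pages[i]);
 */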
1661
1662static void *__vmalloc_node(unsigned long size, unsigned long align,
1663 gfp_t gfp_mask, pgprot_t prot,
1664 int node, const void *caller);
1665static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1666 pgprot_t prot, int node)
1667{
1668 struct page **pages;
1669 unsigned int nr_pages, array_size, i;
1670 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1671 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1672 const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
1673 0 :
1674 __GFP_HIGHMEM;
1675
1676 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1677 array_size = (nr_pages * sizeof(struct page *));
1678
1679 area->nr_pages = nr_pages;
1680 /* Please note that the recursion is strictly bounded. */
1681 if (array_size > PAGE_SIZE) {
1682 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
1683 PAGE_KERNEL, node, area->caller);
1684 } else {
1685 pages = kmalloc_node(array_size, nested_gfp, node);
1686 }
1687 area->pages = pages;
1688 if (!area->pages) {
1689 remove_vm_area(area->addr);
1690 kfree(area);
1691 return NULL;
1692 }
1693
1694 for (i = 0; i < area->nr_pages; i++) {
1695 struct page *page;
1696
1697 if (node == NUMA_NO_NODE)
1698 page = alloc_page(alloc_mask|highmem_mask);
1699 else
1700 page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
1701
1702 if (unlikely(!page)) {
1703 /* Successfully allocated i pages, free them in __vunmap() */
1704 area->nr_pages = i;
1705 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1706 goto fail;
1707 }
1708 area->pages[i] = page;
1709 if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
1710 cond_resched();
1711 }
1712 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1713
1714 if (map_vm_area(area, prot, pages))
1715 goto fail;
1716 return area->addr;
1717
1718fail:
1719 warn_alloc(gfp_mask, NULL,
1720 "vmalloc: allocation failure, allocated %ld of %ld bytes",
1721 (area->nr_pages*PAGE_SIZE), area->size);
1722 vfree(area->addr);
1723 return NULL;
1724}
1725
1726/**
1727 * __vmalloc_node_range - allocate virtually contiguous memory
1728 * @size: allocation size
1729 * @align: desired alignment
1730 * @start: vm area range start
1731 * @end: vm area range end
1732 * @gfp_mask: flags for the page level allocator
1733 * @prot: protection mask for the allocated pages
1734 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1735 * @node: node to use for allocation or NUMA_NO_NODE
1736 * @caller: caller's return address
1737 *
1738 * Allocate enough pages to cover @size from the page level
1739 * allocator with @gfp_mask flags. Map them into contiguous
1740 * kernel virtual space, using a pagetable protection of @prot.
1741 */
1742void *__vmalloc_node_range(unsigned long size, unsigned long align,
1743 unsigned long start, unsigned long end, gfp_t gfp_mask,
1744 pgprot_t prot, unsigned long vm_flags, int node,
1745 const void *caller)
1746{
1747 struct vm_struct *area;
1748 void *addr;
1749 unsigned long real_size = size;
1750
1751 size = PAGE_ALIGN(size);
1752 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1753 goto fail;
1754
1755 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
1756 vm_flags, start, end, node, gfp_mask, caller);
1757 if (!area)
1758 goto fail;
1759
1760 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1761 if (!addr)
1762 return NULL;
1763
1764 /*
1765 * First make sure the mappings are removed from all page-tables
1766 * before they are freed.
1767 */
1768 vmalloc_sync_all();
1769
1770 /*
1771 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
1772 * flag. It means that vm_struct is not fully initialized.
1773 * Now, it is fully initialized, so remove this flag here.
1774 */
1775 clear_vm_uninitialized_flag(area);
1776
1777 kmemleak_vmalloc(area, size, gfp_mask);
1778
1779 return addr;
1780
1781fail:
1782 warn_alloc(gfp_mask, NULL,
1783 "vmalloc: allocation failure: %lu bytes", real_size);
1784 return NULL;
1785}
1786
1787/**
1788 * __vmalloc_node - allocate virtually contiguous memory
1789 * @size: allocation size
1790 * @align: desired alignment
1791 * @gfp_mask: flags for the page level allocator
1792 * @prot: protection mask for the allocated pages
1793 * @node: node to use for allocation or NUMA_NO_NODE
1794 * @caller: caller's return address
1795 *
1796 * Allocate enough pages to cover @size from the page level
1797 * allocator with @gfp_mask flags. Map them into contiguous
1798 * kernel virtual space, using a pagetable protection of @prot.
1799 *
1800 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
1801 * and __GFP_NOFAIL are not supported
1802 *
 * Any use of gfp flags outside of GFP_KERNEL should be discussed
 * with the mm people.
1805 *
1806 */
1807static void *__vmalloc_node(unsigned long size, unsigned long align,
1808 gfp_t gfp_mask, pgprot_t prot,
1809 int node, const void *caller)
1810{
1811 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1812 gfp_mask, prot, 0, node, caller);
1813}
1814
1815void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1816{
1817 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1818 __builtin_return_address(0));
1819}
1820EXPORT_SYMBOL(__vmalloc);
1821
1822static inline void *__vmalloc_node_flags(unsigned long size,
1823 int node, gfp_t flags)
1824{
1825 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1826 node, __builtin_return_address(0));
1827}
1828
1829
1830void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
1831 void *caller)
1832{
1833 return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
1834}
1835
1836/**
1837 * vmalloc - allocate virtually contiguous memory
1838 * @size: allocation size
1839 * Allocate enough pages to cover @size from the page level
1840 * allocator and map them into contiguous kernel virtual space.
1841 *
1842 * For tight control over page level allocator and protection flags
1843 * use __vmalloc() instead.
1844 */
1845void *vmalloc(unsigned long size)
1846{
1847 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1848 GFP_KERNEL);
1849}
1850EXPORT_SYMBOL(vmalloc);
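
/*
 * A minimal usage sketch (the struct name "item" and the count "nr_items"
 * are hypothetical): a large, physically non-contiguous table that is never
 * handed to hardware is a typical vmalloc() user.
 *
 *        struct item *tbl = vmalloc(nr_items * sizeof(*tbl));
 *
 *        if (!tbl)
 *                return -ENOMEM;
 *        ...
 *        vfree(tbl);
 */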
1851
1852/**
1853 * vzalloc - allocate virtually contiguous memory with zero fill
1854 * @size: allocation size
1855 * Allocate enough pages to cover @size from the page level
1856 * allocator and map them into contiguous kernel virtual space.
1857 * The memory allocated is set to zero.
1858 *
1859 * For tight control over page level allocator and protection flags
1860 * use __vmalloc() instead.
1861 */
1862void *vzalloc(unsigned long size)
1863{
1864 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1865 GFP_KERNEL | __GFP_ZERO);
1866}
1867EXPORT_SYMBOL(vzalloc);
1868
1869/**
1870 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1871 * @size: allocation size
1872 *
1873 * The resulting memory area is zeroed so it can be mapped to userspace
1874 * without leaking data.
1875 */
1876void *vmalloc_user(unsigned long size)
1877{
1878 struct vm_struct *area;
1879 void *ret;
1880
1881 ret = __vmalloc_node(size, SHMLBA,
1882 GFP_KERNEL | __GFP_ZERO,
1883 PAGE_KERNEL, NUMA_NO_NODE,
1884 __builtin_return_address(0));
1885 if (ret) {
1886 area = find_vm_area(ret);
1887 area->flags |= VM_USERMAP;
1888 }
1889 return ret;
1890}
1891EXPORT_SYMBOL(vmalloc_user);
1892
1893/**
1894 * vmalloc_node - allocate memory on a specific node
1895 * @size: allocation size
1896 * @node: numa node
1897 *
1898 * Allocate enough pages to cover @size from the page level
1899 * allocator and map them into contiguous kernel virtual space.
1900 *
1901 * For tight control over page level allocator and protection flags
1902 * use __vmalloc() instead.
1903 */
1904void *vmalloc_node(unsigned long size, int node)
1905{
1906 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
1907 node, __builtin_return_address(0));
1908}
1909EXPORT_SYMBOL(vmalloc_node);
1910
1911/**
1912 * vzalloc_node - allocate memory on a specific node with zero fill
1913 * @size: allocation size
1914 * @node: numa node
1915 *
1916 * Allocate enough pages to cover @size from the page level
1917 * allocator and map them into contiguous kernel virtual space.
1918 * The memory allocated is set to zero.
1919 *
1920 * For tight control over page level allocator and protection flags
1921 * use __vmalloc_node() instead.
1922 */
1923void *vzalloc_node(unsigned long size, int node)
1924{
1925 return __vmalloc_node_flags(size, node,
1926 GFP_KERNEL | __GFP_ZERO);
1927}
1928EXPORT_SYMBOL(vzalloc_node);
1929
1930/**
1931 * vmalloc_exec - allocate virtually contiguous, executable memory
1932 * @size: allocation size
1933 *
 * Kernel-internal function to allocate enough pages to cover @size
 * from the page level allocator and map them into contiguous and
1936 * executable kernel virtual space.
1937 *
1938 * For tight control over page level allocator and protection flags
1939 * use __vmalloc() instead.
1940 */
1941
1942void *vmalloc_exec(unsigned long size)
1943{
1944 return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
1945 NUMA_NO_NODE, __builtin_return_address(0));
1946}
1947
1948#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
1949#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
1950#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
1951#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
1952#else
1953/*
1954 * 64-bit systems should always have either a DMA or a DMA32 zone. For others,
1955 * GFP_DMA32 should do the right thing and use the normal zone.
1956 */
1957#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
1958#endif
1959
1960/**
1961 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1962 * @size: allocation size
1963 *
1964 * Allocate enough 32bit PA addressable pages to cover @size from the
1965 * page level allocator and map them into contiguous kernel virtual space.
1966 */
1967void *vmalloc_32(unsigned long size)
1968{
1969 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1970 NUMA_NO_NODE, __builtin_return_address(0));
1971}
1972EXPORT_SYMBOL(vmalloc_32);
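/*
 * Example (an illustrative sketch; "frame" and "frame_size" are
 * hypothetical): a driver for hardware that can only address 32-bit
 * physical memory can use this for a large buffer that does not need to
 * be physically contiguous.
 *
 *	void *frame = vmalloc_32(frame_size);
 *	if (!frame)
 *		return -ENOMEM;
 */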
1973
1974/**
1975 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1976 * @size: allocation size
1977 *
1978 * The resulting memory area is 32bit addressable and zeroed so it can be
1979 * mapped to userspace without leaking data.
1980 */
1981void *vmalloc_32_user(unsigned long size)
1982{
1983 struct vm_struct *area;
1984 void *ret;
1985
1986 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1987 NUMA_NO_NODE, __builtin_return_address(0));
1988 if (ret) {
1989 area = find_vm_area(ret);
1990 area->flags |= VM_USERMAP;
1991 }
1992 return ret;
1993}
1994EXPORT_SYMBOL(vmalloc_32_user);
1995
1996/*
1997 * Small helper routine: copy contents from addr into buf.
1998 * If the page is not present, fill with zeroes.
1999 */
2000
2001static int aligned_vread(char *buf, char *addr, unsigned long count)
2002{
2003 struct page *p;
2004 int copied = 0;
2005
2006 while (count) {
2007 unsigned long offset, length;
2008
2009 offset = offset_in_page(addr);
2010 length = PAGE_SIZE - offset;
2011 if (length > count)
2012 length = count;
2013 p = vmalloc_to_page(addr);
2014 /*
2015 * To do safe access to this _mapped_ area, we need a
2016 * lock. But adding a lock here means adding the
2017 * overhead of vmalloc()/vfree() calls to this rarely used
2018 * _debug_ interface. Instead of that, we'll use
2019 * kmap_atomic() and accept a small overhead in this access function.
2020 */
2021 if (p) {
2022 /*
2023 * we can expect USER0 is not used (see vread/vwrite's
2024 * function description)
2025 */
2026 void *map = kmap_atomic(p);
2027 memcpy(buf, map + offset, length);
2028 kunmap_atomic(map);
2029 } else
2030 memset(buf, 0, length);
2031
2032 addr += length;
2033 buf += length;
2034 copied += length;
2035 count -= length;
2036 }
2037 return copied;
2038}
2039
2040static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2041{
2042 struct page *p;
2043 int copied = 0;
2044
2045 while (count) {
2046 unsigned long offset, length;
2047
2048 offset = offset_in_page(addr);
2049 length = PAGE_SIZE - offset;
2050 if (length > count)
2051 length = count;
2052 p = vmalloc_to_page(addr);
2053 /*
2054 * To do safe access to this _mapped_ area, we need a
2055 * lock. But adding a lock here means adding the
2056 * overhead of vmalloc()/vfree() calls to this rarely used
2057 * _debug_ interface. Instead of that, we'll use
2058 * kmap_atomic() and accept a small overhead in this access function.
2059 */
2060 if (p) {
2061 /*
2062 * we can expect USER0 is not used (see vread/vwrite's
2063 * function description)
2064 */
2065 void *map = kmap_atomic(p);
2066 memcpy(map + offset, buf, length);
2067 kunmap_atomic(map);
2068 }
2069 addr += length;
2070 buf += length;
2071 copied += length;
2072 count -= length;
2073 }
2074 return copied;
2075}
2076
2077/**
2078 * vread() - read vmalloc area in a safe way.
2079 * @buf: buffer for reading data
2080 * @addr: vm address.
2081 * @count: number of bytes to be read.
2082 *
2083 * Returns the number of bytes by which @addr and @buf should be
2084 * increased (the same number as @count). Returns 0 if [addr...addr+count)
2085 * does not intersect any live vmalloc area.
2086 *
2087 * This function checks that addr is a valid vmalloc'ed area, and
2088 * copies data from that area to a given buffer. If the given memory range
2089 * of [addr...addr+count) includes some valid address, data is copied to
2090 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
2091 * IOREMAP areas are treated as memory holes and no copy is done.
2092 *
2093 * If [addr...addr+count) does not intersect any live
2094 * vm_struct area, returns 0. @buf should be a kernel buffer.
2095 *
2096 * Note: In usual ops, vread() is never necessary because the caller
2097 * should know the vmalloc() area is valid and can use memcpy().
2098 * This is for routines which have to access the vmalloc area without
2099 * any information, such as /dev/kmem.
2100 *
2101 */
2103long vread(char *buf, char *addr, unsigned long count)
2104{
2105 struct vmap_area *va;
2106 struct vm_struct *vm;
2107 char *vaddr, *buf_start = buf;
2108 unsigned long buflen = count;
2109 unsigned long n;
2110
2111 /* Don't allow overflow */
2112 if ((unsigned long) addr + count < count)
2113 count = -(unsigned long) addr;
2114
2115 spin_lock(&vmap_area_lock);
2116 list_for_each_entry(va, &vmap_area_list, list) {
2117 if (!count)
2118 break;
2119
2120 if (!(va->flags & VM_VM_AREA))
2121 continue;
2122
2123 vm = va->vm;
2124 vaddr = (char *) vm->addr;
2125 if (addr >= vaddr + get_vm_area_size(vm))
2126 continue;
2127 while (addr < vaddr) {
2128 if (count == 0)
2129 goto finished;
2130 *buf = '\0';
2131 buf++;
2132 addr++;
2133 count--;
2134 }
2135 n = vaddr + get_vm_area_size(vm) - addr;
2136 if (n > count)
2137 n = count;
2138 if (!(vm->flags & VM_IOREMAP))
2139 aligned_vread(buf, addr, n);
2140 else /* IOREMAP area is treated as memory hole */
2141 memset(buf, 0, n);
2142 buf += n;
2143 addr += n;
2144 count -= n;
2145 }
2146finished:
2147 spin_unlock(&vmap_area_lock);
2148
2149 if (buf == buf_start)
2150 return 0;
2151 /* zero-fill memory holes */
2152 if (buf != buf_start + buflen)
2153 memset(buf, 0, buflen - (buf - buf_start));
2154
2155 return buflen;
2156}
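/*
 * Example (an illustrative sketch; "vaddr" and "n" are hypothetical): a
 * debug-style reader does not need to know whether every page in
 * [vaddr, vaddr + n) is populated; holes simply come back zero-filled,
 * and a return value of 0 means no live vmalloc area was touched.
 * vwrite() below is the mirror image for writes.
 *
 *	char *kbuf = kmalloc(n, GFP_KERNEL);
 *
 *	if (kbuf && !vread(kbuf, vaddr, n))
 *		pr_debug("no vmalloc area at %p\n", vaddr);
 */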
2157
2158/**
2159 * vwrite() - write vmalloc area in a safe way.
2160 * @buf: buffer for source data
2161 * @addr: vm address.
2162 * @count: number of bytes to be written.
2163 *
2164 * Returns the number of bytes by which @addr and @buf should be
2165 * increased (the same number as @count).
2166 * If [addr...addr+count) does not intersect any valid
2167 * vmalloc area, returns 0.
2168 *
2169 * This function checks that addr is a valid vmalloc'ed area, and
2170 * copies data from a buffer to the given addr. If the specified range of
2171 * [addr...addr+count) includes some valid address, data is copied from
2172 * the proper area of @buf. If there are memory holes, nothing is copied to them.
2173 * IOREMAP areas are treated as memory holes and no copy is done.
2174 *
2175 * If [addr...addr+count) does not intersect any live
2176 * vm_struct area, returns 0. @buf should be a kernel buffer.
2177 *
2178 * Note: In usual ops, vwrite() is never necessary because the caller
2179 * should know the vmalloc() area is valid and can use memcpy().
2180 * This is for routines which have to access the vmalloc area without
2181 * any information, such as /dev/kmem.
2182 */
2184long vwrite(char *buf, char *addr, unsigned long count)
2185{
2186 struct vmap_area *va;
2187 struct vm_struct *vm;
2188 char *vaddr;
2189 unsigned long n, buflen;
2190 int copied = 0;
2191
2192 /* Don't allow overflow */
2193 if ((unsigned long) addr + count < count)
2194 count = -(unsigned long) addr;
2195 buflen = count;
2196
2197 spin_lock(&vmap_area_lock);
2198 list_for_each_entry(va, &vmap_area_list, list) {
2199 if (!count)
2200 break;
2201
2202 if (!(va->flags & VM_VM_AREA))
2203 continue;
2204
2205 vm = va->vm;
2206 vaddr = (char *) vm->addr;
2207 if (addr >= vaddr + get_vm_area_size(vm))
2208 continue;
2209 while (addr < vaddr) {
2210 if (count == 0)
2211 goto finished;
2212 buf++;
2213 addr++;
2214 count--;
2215 }
2216 n = vaddr + get_vm_area_size(vm) - addr;
2217 if (n > count)
2218 n = count;
2219 if (!(vm->flags & VM_IOREMAP)) {
2220 aligned_vwrite(buf, addr, n);
2221 copied++;
2222 }
2223 buf += n;
2224 addr += n;
2225 count -= n;
2226 }
2227finished:
2228 spin_unlock(&vmap_area_lock);
2229 if (!copied)
2230 return 0;
2231 return buflen;
2232}
2233
2234/**
2235 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2236 * @vma: vma to cover
2237 * @uaddr: target user address to start at
2238 * @kaddr: virtual address of vmalloc kernel memory
2239 * @size: size of map area
2240 *
2241 * Returns: 0 for success, -Exxx on failure
2242 *
2243 * This function checks that @kaddr is a valid vmalloc'ed area,
2244 * and that it is big enough to cover the range starting at
2245 * @uaddr in @vma. Will return failure if those criteria aren't
2246 * met.
2247 *
2248 * Similar to remap_pfn_range() (see mm/memory.c)
2249 */
2250int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2251 void *kaddr, unsigned long size)
2252{
2253 struct vm_struct *area;
2254
2255 size = PAGE_ALIGN(size);
2256
2257 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2258 return -EINVAL;
2259
2260 area = find_vm_area(kaddr);
2261 if (!area)
2262 return -EINVAL;
2263
2264 if (!(area->flags & VM_USERMAP))
2265 return -EINVAL;
2266
2267 if (kaddr + size > area->addr + get_vm_area_size(area))
2268 return -EINVAL;
2269
2270 do {
2271 struct page *page = vmalloc_to_page(kaddr);
2272 int ret;
2273
2274 ret = vm_insert_page(vma, uaddr, page);
2275 if (ret)
2276 return ret;
2277
2278 uaddr += PAGE_SIZE;
2279 kaddr += PAGE_SIZE;
2280 size -= PAGE_SIZE;
2281 } while (size > 0);
2282
2283 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2284
2285 return 0;
2286}
2287EXPORT_SYMBOL(remap_vmalloc_range_partial);
2288
2289/**
2290 * remap_vmalloc_range - map vmalloc pages to userspace
2291 * @vma: vma to cover (map full range of vma)
2292 * @addr: vmalloc memory
2293 * @pgoff: number of pages into addr before first page to map
2294 *
2295 * Returns: 0 for success, -Exxx on failure
2296 *
2297 * This function checks that addr is a valid vmalloc'ed area, and
2298 * that it is big enough to cover the vma. Will return failure if
2299 * that criterion isn't met.
2300 *
2301 * Similar to remap_pfn_range() (see mm/memory.c)
2302 */
2303int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2304 unsigned long pgoff)
2305{
2306 return remap_vmalloc_range_partial(vma, vma->vm_start,
2307 addr + (pgoff << PAGE_SHIFT),
2308 vma->vm_end - vma->vm_start);
2309}
2310EXPORT_SYMBOL(remap_vmalloc_range);
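/*
 * Example (an illustrative sketch; "my_dev_mmap", "struct my_dev" and
 * "dev->buf" are hypothetical): a character device mmap handler exposing a
 * buffer that was allocated with vmalloc_user(), so VM_USERMAP is already
 * set on the area.
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *
 *		return remap_vmalloc_range(vma, dev->buf, vma->vm_pgoff);
 *	}
 */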
2311
2312/*
2313 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
2314 * have one.
2315 *
2316 * The purpose of this function is to make sure the vmalloc area
2317 * mappings are identical in all page-tables in the system.
2318 */
2319void __weak vmalloc_sync_all(void)
2320{
2321}
2322
2324static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2325{
2326 pte_t ***p = data;
2327
2328 if (p) {
2329 *(*p) = pte;
2330 (*p)++;
2331 }
2332 return 0;
2333}
2334
2335/**
2336 * alloc_vm_area - allocate a range of kernel address space
2337 * @size: size of the area
2338 * @ptes: returns the PTEs for the address space
2339 *
2340 * Returns: NULL on failure, vm_struct on success
2341 *
2342 * This function reserves a range of kernel address space, and
2343 * allocates pagetables to map that range. No actual mappings
2344 * are created.
2345 *
2346 * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
2347 * allocated for the VM area are returned.
2348 */
2349struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
2350{
2351 struct vm_struct *area;
2352
2353 area = get_vm_area_caller(size, VM_IOREMAP,
2354 __builtin_return_address(0));
2355 if (area == NULL)
2356 return NULL;
2357
2358 /*
2359 * This ensures that page tables are constructed for this region
2360 * of kernel virtual address space and mapped into init_mm.
2361 */
2362 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2363 size, f, ptes ? &ptes : NULL)) {
2364 free_vm_area(area);
2365 return NULL;
2366 }
2367
2368 return area;
2369}
2370EXPORT_SYMBOL_GPL(alloc_vm_area);
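/*
 * Example (an illustrative sketch; "nr_pages" is hypothetical): reserve
 * kernel address space with page tables but no mappings, and hand the PTE
 * pointers back to the caller to populate later (the Xen grant table code
 * uses it in this way).
 *
 *	pte_t **ptes = kcalloc(nr_pages, sizeof(*ptes), GFP_KERNEL);
 *	struct vm_struct *area;
 *
 *	area = alloc_vm_area(nr_pages * PAGE_SIZE, ptes);
 *	if (!area)
 *		return -ENOMEM;
 *	... fill ptes[0..nr_pages-1], tear down with free_vm_area(area) ...
 */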
2371
2372void free_vm_area(struct vm_struct *area)
2373{
2374 struct vm_struct *ret;
2375 ret = remove_vm_area(area->addr);
2376 BUG_ON(ret != area);
2377 kfree(area);
2378}
2379EXPORT_SYMBOL_GPL(free_vm_area);
2380
2381#ifdef CONFIG_SMP
2382static struct vmap_area *node_to_va(struct rb_node *n)
2383{
2384 return rb_entry_safe(n, struct vmap_area, rb_node);
2385}
2386
2387/**
2388 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2389 * @end: target address
2390 * @pnext: out arg for the next vmap_area
2391 * @pprev: out arg for the previous vmap_area
2392 *
2393 * Returns: %true if either or both of next and prev are found,
2394 * %false if no vmap_area exists
2395 *
2396 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2397 * NULL, (*pnext)->va_end > @end and (*pprev)->va_end <= @end.
2398 */
2399static bool pvm_find_next_prev(unsigned long end,
2400 struct vmap_area **pnext,
2401 struct vmap_area **pprev)
2402{
2403 struct rb_node *n = vmap_area_root.rb_node;
2404 struct vmap_area *va = NULL;
2405
2406 while (n) {
2407 va = rb_entry(n, struct vmap_area, rb_node);
2408 if (end < va->va_end)
2409 n = n->rb_left;
2410 else if (end > va->va_end)
2411 n = n->rb_right;
2412 else
2413 break;
2414 }
2415
2416 if (!va)
2417 return false;
2418
2419 if (va->va_end > end) {
2420 *pnext = va;
2421 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2422 } else {
2423 *pprev = va;
2424 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2425 }
2426 return true;
2427}
2428
2429/**
2430 * pvm_determine_end - find the highest aligned address between two vmap_areas
2431 * @pnext: in/out arg for the next vmap_area
2432 * @pprev: in/out arg for the previous vmap_area
2433 * @align: alignment
2434 *
2435 * Returns: determined end address
2436 *
2437 * Find the highest aligned address between *@pnext and *@pprev below
2438 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2439 * down address is between the end addresses of the two vmap_areas.
2440 *
2441 * Please note that the address returned by this function may fall
2442 * inside *@pnext vmap_area. The caller is responsible for checking
2443 * that.
2444 */
2445static unsigned long pvm_determine_end(struct vmap_area **pnext,
2446 struct vmap_area **pprev,
2447 unsigned long align)
2448{
2449 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2450 unsigned long addr;
2451
2452 if (*pnext)
2453 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2454 else
2455 addr = vmalloc_end;
2456
2457 while (*pprev && (*pprev)->va_end > addr) {
2458 *pnext = *pprev;
2459 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2460 }
2461
2462 return addr;
2463}
2464
2465/**
2466 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2467 * @offsets: array containing offset of each area
2468 * @sizes: array containing size of each area
2469 * @nr_vms: the number of areas to allocate
2470 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2471 *
2472 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2473 * vm_structs on success, %NULL on failure
2474 *
2475 * Percpu allocator wants to use congruent vm areas so that it can
2476 * maintain the offsets among percpu areas. This function allocates
2477 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2478 * be scattered pretty far, with the distance between two areas easily
2479 * going up to gigabytes. To avoid interacting with regular vmallocs,
2480 * these areas are allocated from the top.
2481 *
2482 * Despite its complicated look, this allocator is rather simple. It
2483 * does everything top-down and scans areas from the end looking for a
2484 * matching slot. While scanning, if any of the areas overlaps with an
2485 * existing vmap_area, the base address is pulled down to fit the
2486 * area. Scanning is repeated until all the areas fit and then all
2487 * necessary data structures are inserted and the result is returned.
2488 */
2489struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2490 const size_t *sizes, int nr_vms,
2491 size_t align)
2492{
2493 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2494 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2495 struct vmap_area **vas, *prev, *next;
2496 struct vm_struct **vms;
2497 int area, area2, last_area, term_area;
2498 unsigned long base, start, end, last_end;
2499 bool purged = false;
2500
2501 /* verify parameters and allocate data structures */
2502 BUG_ON(offset_in_page(align) || !is_power_of_2(align));
2503 for (last_area = 0, area = 0; area < nr_vms; area++) {
2504 start = offsets[area];
2505 end = start + sizes[area];
2506
2507 /* is everything aligned properly? */
2508 BUG_ON(!IS_ALIGNED(offsets[area], align));
2509 BUG_ON(!IS_ALIGNED(sizes[area], align));
2510
2511 /* detect the area with the highest address */
2512 if (start > offsets[last_area])
2513 last_area = area;
2514
2515 for (area2 = area + 1; area2 < nr_vms; area2++) {
2516 unsigned long start2 = offsets[area2];
2517 unsigned long end2 = start2 + sizes[area2];
2518
2519 BUG_ON(start2 < end && start < end2);
2520 }
2521 }
2522 last_end = offsets[last_area] + sizes[last_area];
2523
2524 if (vmalloc_end - vmalloc_start < last_end) {
2525 WARN_ON(true);
2526 return NULL;
2527 }
2528
2529 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2530 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2531 if (!vas || !vms)
2532 goto err_free2;
2533
2534 for (area = 0; area < nr_vms; area++) {
2535 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2536 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2537 if (!vas[area] || !vms[area])
2538 goto err_free;
2539 }
2540retry:
2541 spin_lock(&vmap_area_lock);
2542
2543 /* start scanning - we scan from the top, begin with the last area */
2544 area = term_area = last_area;
2545 start = offsets[area];
2546 end = start + sizes[area];
2547
2548 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2549 base = vmalloc_end - last_end;
2550 goto found;
2551 }
2552 base = pvm_determine_end(&next, &prev, align) - end;
2553
2554 while (true) {
2555 BUG_ON(next && next->va_end <= base + end);
2556 BUG_ON(prev && prev->va_end > base + end);
2557
2558 /*
2559 * base might have underflowed, add last_end before
2560 * comparing.
2561 */
2562 if (base + last_end < vmalloc_start + last_end) {
2563 spin_unlock(&vmap_area_lock);
2564 if (!purged) {
2565 purge_vmap_area_lazy();
2566 purged = true;
2567 goto retry;
2568 }
2569 goto err_free;
2570 }
2571
2572 /*
2573 * If next overlaps, move base downwards so that it's
2574 * right below next and then recheck.
2575 */
2576 if (next && next->va_start < base + end) {
2577 base = pvm_determine_end(&next, &prev, align) - end;
2578 term_area = area;
2579 continue;
2580 }
2581
2582 /*
2583 * If prev overlaps, shift down next and prev and move
2584 * base so that it's right below new next and then
2585 * recheck.
2586 */
2587 if (prev && prev->va_end > base + start) {
2588 next = prev;
2589 prev = node_to_va(rb_prev(&next->rb_node));
2590 base = pvm_determine_end(&next, &prev, align) - end;
2591 term_area = area;
2592 continue;
2593 }
2594
2595 /*
2596 * This area fits, move on to the previous one. If
2597 * the previous one is the terminal one, we're done.
2598 */
2599 area = (area + nr_vms - 1) % nr_vms;
2600 if (area == term_area)
2601 break;
2602 start = offsets[area];
2603 end = start + sizes[area];
2604 pvm_find_next_prev(base + end, &next, &prev);
2605 }
2606found:
2607 /* we've found a fitting base, insert all va's */
2608 for (area = 0; area < nr_vms; area++) {
2609 struct vmap_area *va = vas[area];
2610
2611 va->va_start = base + offsets[area];
2612 va->va_end = va->va_start + sizes[area];
2613 __insert_vmap_area(va);
2614 }
2615
2616 vmap_area_pcpu_hole = base + offsets[last_area];
2617
2618 spin_unlock(&vmap_area_lock);
2619
2620 /* insert all vm's */
2621 for (area = 0; area < nr_vms; area++)
2622 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2623 pcpu_get_vm_areas);
2624
2625 kfree(vas);
2626 return vms;
2627
2628err_free:
2629 for (area = 0; area < nr_vms; area++) {
2630 kfree(vas[area]);
2631 kfree(vms[area]);
2632 }
2633err_free2:
2634 kfree(vas);
2635 kfree(vms);
2636 return NULL;
2637}
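/*
 * Example (an illustrative sketch; the group layout is made up): the
 * percpu allocator requests one congruent area per allocation group,
 * e.g. two groups whose chunks sit 1MB apart in percpu offset space:
 *
 *	unsigned long offsets[] = { 0, 1UL << 20 };
 *	size_t sizes[] = { 64 << 10, 64 << 10 };
 *	struct vm_struct **vms;
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
 *	if (!vms)
 *		return -ENOMEM;
 *	... use vms[0]->addr and vms[1]->addr ...
 *	pcpu_free_vm_areas(vms, 2);
 */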
2638
2639/**
2640 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2641 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2642 * @nr_vms: the number of allocated areas
2643 *
2644 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2645 */
2646void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2647{
2648 int i;
2649
2650 for (i = 0; i < nr_vms; i++)
2651 free_vm_area(vms[i]);
2652 kfree(vms);
2653}
2654#endif /* CONFIG_SMP */
2655
2656#ifdef CONFIG_PROC_FS
2657static void *s_start(struct seq_file *m, loff_t *pos)
2658 __acquires(&vmap_area_lock)
2659{
2660 spin_lock(&vmap_area_lock);
2661 return seq_list_start(&vmap_area_list, *pos);
2662}
2663
2664static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2665{
2666 return seq_list_next(p, &vmap_area_list, pos);
2667}
2668
2669static void s_stop(struct seq_file *m, void *p)
2670 __releases(&vmap_area_lock)
2671{
2672 spin_unlock(&vmap_area_lock);
2673}
2674
2675static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2676{
2677 if (IS_ENABLED(CONFIG_NUMA)) {
2678 unsigned int nr, *counters = m->private;
2679
2680 if (!counters)
2681 return;
2682
2683 if (v->flags & VM_UNINITIALIZED)
2684 return;
2685 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2686 smp_rmb();
2687
2688 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2689
2690 for (nr = 0; nr < v->nr_pages; nr++)
2691 counters[page_to_nid(v->pages[nr])]++;
2692
2693 for_each_node_state(nr, N_HIGH_MEMORY)
2694 if (counters[nr])
2695 seq_printf(m, " N%u=%u", nr, counters[nr]);
2696 }
2697}
2698
2699static int s_show(struct seq_file *m, void *p)
2700{
2701 struct vmap_area *va;
2702 struct vm_struct *v;
2703
2704 va = list_entry(p, struct vmap_area, list);
2705
2706 /*
2707 * s_show can race with remove_vm_area(); !VM_VM_AREA means the vmap
2708 * area is being torn down, or that this is a vm_map_ram allocation.
2709 */
2710 if (!(va->flags & VM_VM_AREA)) {
2711 seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
2712 (void *)va->va_start, (void *)va->va_end,
2713 va->va_end - va->va_start,
2714 va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
2715
2716 return 0;
2717 }
2718
2719 v = va->vm;
2720
2721 seq_printf(m, "0x%pK-0x%pK %7ld",
2722 v->addr, v->addr + v->size, v->size);
2723
2724 if (v->caller)
2725 seq_printf(m, " %pS", v->caller);
2726
2727 if (v->nr_pages)
2728 seq_printf(m, " pages=%d", v->nr_pages);
2729
2730 if (v->phys_addr)
2731 seq_printf(m, " phys=%pa", &v->phys_addr);
2732
2733 if (v->flags & VM_IOREMAP)
2734 seq_puts(m, " ioremap");
2735
2736 if (v->flags & VM_ALLOC)
2737 seq_puts(m, " vmalloc");
2738
2739 if (v->flags & VM_MAP)
2740 seq_puts(m, " vmap");
2741
2742 if (v->flags & VM_USERMAP)
2743 seq_puts(m, " user");
2744
2745 if (is_vmalloc_addr(v->pages))
2746 seq_puts(m, " vpages");
2747
2748 show_numa_info(m, v);
2749 seq_putc(m, '\n');
2750 return 0;
2751}
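/*
 * Each /proc/vmallocinfo line emitted above therefore has the form
 * (illustrative, fields in brackets only appear when set):
 *
 *	<start>-<end> <size in bytes> [<caller>] [pages=<n>] [phys=<addr>]
 *	[ioremap] [vmalloc] [vmap] [user] [vpages] [N<node>=<pages> ...]
 *
 * while plain vm_map_ram or not-yet-purged ranges are reported as
 * "vm_map_ram" or "unpurged vm_area" in place of the caller and flags.
 */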
2752
2753static const struct seq_operations vmalloc_op = {
2754 .start = s_start,
2755 .next = s_next,
2756 .stop = s_stop,
2757 .show = s_show,
2758};
2759
2760static int __init proc_vmalloc_init(void)
2761{
2762 if (IS_ENABLED(CONFIG_PROC_STRIPPED))
2763 return 0;
2764 if (IS_ENABLED(CONFIG_NUMA))
2765 proc_create_seq_private("vmallocinfo", 0400, NULL,
2766 &vmalloc_op,
2767 nr_node_ids * sizeof(unsigned int), NULL);
2768 else
2769 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
2770 return 0;
2771}
2772module_init(proc_vmalloc_init);
2773
2774#endif
2775