Blame - marvell/linux/mm/vmalloc.c - T108

blob: e8e3f52aea4ea288a3b5fadf422576bd7ec3ba6e [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* linux/mm/vmalloc.c
				4	*
				5	* Copyright (C) 1993 Linus Torvalds
				6	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
				7	* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
				8	* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
				9	* Numa awareness, Christoph Lameter, SGI, June 2005
				10	*/
				11
				12	#include <linux/vmalloc.h>
				13	#include <linux/mm.h>
				14	#include <linux/module.h>
				15	#include <linux/highmem.h>
				16	#include <linux/sched/signal.h>
				17	#include <linux/slab.h>
				18	#include <linux/spinlock.h>
				19	#include <linux/interrupt.h>
				20	#include <linux/proc_fs.h>
				21	#include <linux/seq_file.h>
				22	#include <linux/set_memory.h>
				23	#include <linux/debugobjects.h>
				24	#include <linux/kallsyms.h>
				25	#include <linux/list.h>
				26	#include <linux/notifier.h>
				27	#include <linux/rbtree.h>
				28	#include <linux/radix-tree.h>
				29	#include <linux/rcupdate.h>
				30	#include <linux/pfn.h>
				31	#include <linux/kmemleak.h>
				32	#include <linux/atomic.h>
				33	#include <linux/compiler.h>
				34	#include <linux/llist.h>
				35	#include <linux/bitops.h>
				36	#include <linux/rbtree_augmented.h>
				37	#include <linux/overflow.h>
				38
				39	#include <linux/uaccess.h>
				40	#include <asm/tlbflush.h>
				41	#include <asm/shmparam.h>
				42
				43	#include "internal.h"
				44
				45	struct vfree_deferred {
				46	struct llist_head list;
				47	struct work_struct wq;
				48	};
				49	static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
				50
				51	static void __vunmap(const void *, int);
				52
				53	static void free_work(struct work_struct *w)
				54	{
				55	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
				56	struct llist_node t, llnode;
				57
				58	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
				59	__vunmap((void *)llnode, 1);
				60	}
				61
				62	/* Page table manipulation functions */
				63
				64	static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
				65	{
				66	pte_t *pte;
				67
				68	pte = pte_offset_kernel(pmd, addr);
				69	do {
				70	pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
				71	WARN_ON(!pte_none(ptent) && !pte_present(ptent));
				72	} while (pte++, addr += PAGE_SIZE, addr != end);
				73	}
				74
				75	static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
				76	{
				77	pmd_t *pmd;
				78	unsigned long next;
				79
				80	pmd = pmd_offset(pud, addr);
				81	do {
				82	next = pmd_addr_end(addr, end);
				83	if (pmd_clear_huge(pmd))
				84	continue;
				85	if (pmd_none_or_clear_bad(pmd))
				86	continue;
				87	vunmap_pte_range(pmd, addr, next);
				88
				89	cond_resched();
				90	} while (pmd++, addr = next, addr != end);
				91	}
				92
				93	static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
				94	{
				95	pud_t *pud;
				96	unsigned long next;
				97
				98	pud = pud_offset(p4d, addr);
				99	do {
				100	next = pud_addr_end(addr, end);
				101	if (pud_clear_huge(pud))
				102	continue;
				103	if (pud_none_or_clear_bad(pud))
				104	continue;
				105	vunmap_pmd_range(pud, addr, next);
				106	} while (pud++, addr = next, addr != end);
				107	}
				108
				109	static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
				110	{
				111	p4d_t *p4d;
				112	unsigned long next;
				113
				114	p4d = p4d_offset(pgd, addr);
				115	do {
				116	next = p4d_addr_end(addr, end);
				117	if (p4d_clear_huge(p4d))
				118	continue;
				119	if (p4d_none_or_clear_bad(p4d))
				120	continue;
				121	vunmap_pud_range(p4d, addr, next);
				122	} while (p4d++, addr = next, addr != end);
				123	}
				124
				125	static void vunmap_page_range(unsigned long addr, unsigned long end)
				126	{
				127	pgd_t *pgd;
				128	unsigned long next;
				129
				130	BUG_ON(addr >= end);
				131	pgd = pgd_offset_k(addr);
				132	do {
				133	next = pgd_addr_end(addr, end);
				134	if (pgd_none_or_clear_bad(pgd))
				135	continue;
				136	vunmap_p4d_range(pgd, addr, next);
				137	} while (pgd++, addr = next, addr != end);
				138	}
				139
				140	static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
				141	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				142	{
				143	pte_t *pte;
				144
				145	/*
				146	* nr is a running index into the array which helps higher level
				147	* callers keep track of where we're up to.
				148	*/
				149
				150	pte = pte_alloc_kernel(pmd, addr);
				151	if (!pte)
				152	return -ENOMEM;
				153	do {
				154	struct page page = pages[nr];
				155
				156	if (WARN_ON(!pte_none(*pte)))
				157	return -EBUSY;
				158	if (WARN_ON(!page))
				159	return -ENOMEM;
				160	set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
				161	(*nr)++;
				162	} while (pte++, addr += PAGE_SIZE, addr != end);
				163	return 0;
				164	}
				165
				166	static int vmap_pmd_range(pud_t *pud, unsigned long addr,
				167	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				168	{
				169	pmd_t *pmd;
				170	unsigned long next;
				171
				172	pmd = pmd_alloc(&init_mm, pud, addr);
				173	if (!pmd)
				174	return -ENOMEM;
				175	do {
				176	next = pmd_addr_end(addr, end);
				177	if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
				178	return -ENOMEM;
				179	} while (pmd++, addr = next, addr != end);
				180	return 0;
				181	}
				182
				183	static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
				184	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				185	{
				186	pud_t *pud;
				187	unsigned long next;
				188
				189	pud = pud_alloc(&init_mm, p4d, addr);
				190	if (!pud)
				191	return -ENOMEM;
				192	do {
				193	next = pud_addr_end(addr, end);
				194	if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
				195	return -ENOMEM;
				196	} while (pud++, addr = next, addr != end);
				197	return 0;
				198	}
				199
				200	static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
				201	unsigned long end, pgprot_t prot, struct page *pages, int nr)
				202	{
				203	p4d_t *p4d;
				204	unsigned long next;
				205
				206	p4d = p4d_alloc(&init_mm, pgd, addr);
				207	if (!p4d)
				208	return -ENOMEM;
				209	do {
				210	next = p4d_addr_end(addr, end);
				211	if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
				212	return -ENOMEM;
				213	} while (p4d++, addr = next, addr != end);
				214	return 0;
				215	}
				216
				217	/*
				218	* Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
				219	* will have pfns corresponding to the "pages" array.
				220	*
				221	* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
				222	*/
				223	static int vmap_page_range_noflush(unsigned long start, unsigned long end,
				224	pgprot_t prot, struct page **pages)
				225	{
				226	pgd_t *pgd;
				227	unsigned long next;
				228	unsigned long addr = start;
				229	int err = 0;
				230	int nr = 0;
				231
				232	BUG_ON(addr >= end);
				233	pgd = pgd_offset_k(addr);
				234	do {
				235	next = pgd_addr_end(addr, end);
				236	err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
				237	if (err)
				238	return err;
				239	} while (pgd++, addr = next, addr != end);
				240
				241	return nr;
				242	}
				243
				244	static int vmap_page_range(unsigned long start, unsigned long end,
				245	pgprot_t prot, struct page **pages)
				246	{
				247	int ret;
				248
				249	ret = vmap_page_range_noflush(start, end, prot, pages);
				250	flush_cache_vmap(start, end);
				251	return ret;
				252	}
				253
				254	int is_vmalloc_or_module_addr(const void *x)
				255	{
				256	/*
				257	* ARM, x86-64 and sparc64 put modules in a special place,
				258	* and fall back on vmalloc() if that fails. Others
				259	* just put it in the vmalloc space.
				260	*/
				261	#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
				262	unsigned long addr = (unsigned long)x;
				263	if (addr >= MODULES_VADDR && addr < MODULES_END)
				264	return 1;
				265	#endif
				266	return is_vmalloc_addr(x);
				267	}
				268
				269	/*
				270	* Walk a vmap address to the struct page it maps.
				271	*/
				272	struct page vmalloc_to_page(const void vmalloc_addr)
				273	{
				274	unsigned long addr = (unsigned long) vmalloc_addr;
				275	struct page *page = NULL;
				276	pgd_t *pgd = pgd_offset_k(addr);
				277	p4d_t *p4d;
				278	pud_t *pud;
				279	pmd_t *pmd;
				280	pte_t *ptep, pte;
				281
				282	/*
				283	* XXX we might need to change this if we add VIRTUAL_BUG_ON for
				284	* architectures that do not vmalloc module space
				285	*/
				286	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
				287
				288	if (pgd_none(*pgd))
				289	return NULL;
				290	p4d = p4d_offset(pgd, addr);
				291	if (p4d_none(*p4d))
				292	return NULL;
				293	pud = pud_offset(p4d, addr);
				294
				295	/*
				296	* Don't dereference bad PUD or PMD (below) entries. This will also
				297	* identify huge mappings, which we may encounter on architectures
				298	* that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
				299	* identified as vmalloc addresses by is_vmalloc_addr(), but are
				300	* not [unambiguously] associated with a struct page, so there is
				301	* no correct value to return for them.
				302	*/
				303	WARN_ON_ONCE(pud_bad(*pud));
				304	if (pud_none(pud) \|\| pud_bad(pud))
				305	return NULL;
				306	pmd = pmd_offset(pud, addr);
				307	WARN_ON_ONCE(pmd_bad(*pmd));
				308	if (pmd_none(pmd) \|\| pmd_bad(pmd))
				309	return NULL;
				310
				311	ptep = pte_offset_map(pmd, addr);
				312	pte = *ptep;
				313	if (pte_present(pte))
				314	page = pte_page(pte);
				315	pte_unmap(ptep);
				316	return page;
				317	}
				318	EXPORT_SYMBOL(vmalloc_to_page);
				319
				320	/*
				321	* Map a vmalloc()-space virtual address to the physical page frame number.
				322	*/
				323	unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
				324	{
				325	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
				326	}
				327	EXPORT_SYMBOL(vmalloc_to_pfn);
				328
				329
				330	/* Global kva allocator */
				331
				332	#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
				333	#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
				334
				335
				336	static DEFINE_SPINLOCK(vmap_area_lock);
				337	/* Export for kexec only */
				338	LIST_HEAD(vmap_area_list);
				339	static LLIST_HEAD(vmap_purge_list);
				340	static struct rb_root vmap_area_root = RB_ROOT;
				341	static bool vmap_initialized __read_mostly;
				342
				343	/*
				344	* This kmem_cache is used for vmap_area objects. Instead of
				345	* allocating from slab we reuse an object from this cache to
				346	* make things faster. Especially in "no edge" splitting of
				347	* free block.
				348	*/
				349	static struct kmem_cache *vmap_area_cachep;
				350
				351	/*
				352	* This linked list is used in pair with free_vmap_area_root.
				353	* It gives O(1) access to prev/next to perform fast coalescing.
				354	*/
				355	static LIST_HEAD(free_vmap_area_list);
				356
				357	/*
				358	* This augment red-black tree represents the free vmap space.
				359	* All vmap_area objects in this tree are sorted by va->va_start
				360	* address. It is used for allocation and merging when a vmap
				361	* object is released.
				362	*
				363	* Each vmap_area node contains a maximum available free block
				364	* of its sub-tree, right or left. Therefore it is possible to
				365	* find a lowest match of free area.
				366	*/
				367	static struct rb_root free_vmap_area_root = RB_ROOT;
				368
				369	/*
				370	* Preload a CPU with one object for "no edge" split case. The
				371	* aim is to get rid of allocations from the atomic context, thus
				372	* to use more permissive allocation masks.
				373	*/
				374	static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
				375
				376	static __always_inline unsigned long
				377	va_size(struct vmap_area *va)
				378	{
				379	return (va->va_end - va->va_start);
				380	}
				381
				382	static __always_inline unsigned long
				383	get_subtree_max_size(struct rb_node *node)
				384	{
				385	struct vmap_area *va;
				386
				387	va = rb_entry_safe(node, struct vmap_area, rb_node);
				388	return va ? va->subtree_max_size : 0;
				389	}
				390
				391	/*
				392	* Gets called when remove the node and rotate.
				393	*/
				394	static __always_inline unsigned long
				395	compute_subtree_max_size(struct vmap_area *va)
				396	{
				397	return max3(va_size(va),
				398	get_subtree_max_size(va->rb_node.rb_left),
				399	get_subtree_max_size(va->rb_node.rb_right));
				400	}
				401
				402	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
				403	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
				404
				405	static void purge_vmap_area_lazy(void);
				406	static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
				407	static unsigned long lazy_max_pages(void);
				408
				409	static atomic_long_t nr_vmalloc_pages;
				410
				411	unsigned long vmalloc_nr_pages(void)
				412	{
				413	return atomic_long_read(&nr_vmalloc_pages);
				414	}
				415
				416	static struct vmap_area *__find_vmap_area(unsigned long addr)
				417	{
				418	struct rb_node *n = vmap_area_root.rb_node;
				419
				420	while (n) {
				421	struct vmap_area *va;
				422
				423	va = rb_entry(n, struct vmap_area, rb_node);
				424	if (addr < va->va_start)
				425	n = n->rb_left;
				426	else if (addr >= va->va_end)
				427	n = n->rb_right;
				428	else
				429	return va;
				430	}
				431
				432	return NULL;
				433	}
				434
				435	/*
				436	* This function returns back addresses of parent node
				437	* and its left or right link for further processing.
				438	*/
				439	static __always_inline struct rb_node **
				440	find_va_links(struct vmap_area *va,
				441	struct rb_root root, struct rb_node from,
				442	struct rb_node **parent)
				443	{
				444	struct vmap_area *tmp_va;
				445	struct rb_node **link;
				446
				447	if (root) {
				448	link = &root->rb_node;
				449	if (unlikely(!*link)) {
				450	*parent = NULL;
				451	return link;
				452	}
				453	} else {
				454	link = &from;
				455	}
				456
				457	/*
				458	* Go to the bottom of the tree. When we hit the last point
				459	* we end up with parent rb_node and correct direction, i name
				460	* it link, where the new va->rb_node will be attached to.
				461	*/
				462	do {
				463	tmp_va = rb_entry(*link, struct vmap_area, rb_node);
				464
				465	/*
				466	* During the traversal we also do some sanity check.
				467	* Trigger the BUG() if there are sides(left/right)
				468	* or full overlaps.
				469	*/
				470	if (va->va_start < tmp_va->va_end &&
				471	va->va_end <= tmp_va->va_start)
				472	link = &(*link)->rb_left;
				473	else if (va->va_end > tmp_va->va_start &&
				474	va->va_start >= tmp_va->va_end)
				475	link = &(*link)->rb_right;
				476	else
				477	BUG();
				478	} while (*link);
				479
				480	*parent = &tmp_va->rb_node;
				481	return link;
				482	}
				483
				484	static __always_inline struct list_head *
				485	get_va_next_sibling(struct rb_node parent, struct rb_node *link)
				486	{
				487	struct list_head *list;
				488
				489	if (unlikely(!parent))
				490	/*
				491	* The red-black tree where we try to find VA neighbors
				492	* before merging or inserting is empty, i.e. it means
				493	* there is no free vmap space. Normally it does not
				494	* happen but we handle this case anyway.
				495	*/
				496	return NULL;
				497
				498	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
				499	return (&parent->rb_right == link ? list->next : list);
				500	}
				501
				502	static __always_inline void
				503	link_va(struct vmap_area va, struct rb_root root,
				504	struct rb_node parent, struct rb_node link, struct list_head head)
				505	{
				506	/*
				507	* VA is still not in the list, but we can
				508	* identify its future previous list_head node.
				509	*/
				510	if (likely(parent)) {
				511	head = &rb_entry(parent, struct vmap_area, rb_node)->list;
				512	if (&parent->rb_right != link)
				513	head = head->prev;
				514	}
				515
				516	/* Insert to the rb-tree */
				517	rb_link_node(&va->rb_node, parent, link);
				518	if (root == &free_vmap_area_root) {
				519	/*
				520	* Some explanation here. Just perform simple insertion
				521	* to the tree. We do not set va->subtree_max_size to
				522	* its current size before calling rb_insert_augmented().
				523	* It is because of we populate the tree from the bottom
				524	* to parent levels when the node _is_ in the tree.
				525	*
				526	* Therefore we set subtree_max_size to zero after insertion,
				527	* to let __augment_tree_propagate_from() puts everything to
				528	* the correct order later on.
				529	*/
				530	rb_insert_augmented(&va->rb_node,
				531	root, &free_vmap_area_rb_augment_cb);
				532	va->subtree_max_size = 0;
				533	} else {
				534	rb_insert_color(&va->rb_node, root);
				535	}
				536
				537	/* Address-sort this list */
				538	list_add(&va->list, head);
				539	}
				540
				541	static __always_inline void
				542	unlink_va(struct vmap_area va, struct rb_root root)
				543	{
				544	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
				545	return;
				546
				547	if (root == &free_vmap_area_root)
				548	rb_erase_augmented(&va->rb_node,
				549	root, &free_vmap_area_rb_augment_cb);
				550	else
				551	rb_erase(&va->rb_node, root);
				552
				553	list_del(&va->list);
				554	RB_CLEAR_NODE(&va->rb_node);
				555	}
				556
				557	#if DEBUG_AUGMENT_PROPAGATE_CHECK
				558	static void
				559	augment_tree_propagate_check(struct rb_node *n)
				560	{
				561	struct vmap_area *va;
				562	struct rb_node *node;
				563	unsigned long size;
				564	bool found = false;
				565
				566	if (n == NULL)
				567	return;
				568
				569	va = rb_entry(n, struct vmap_area, rb_node);
				570	size = va->subtree_max_size;
				571	node = n;
				572
				573	while (node) {
				574	va = rb_entry(node, struct vmap_area, rb_node);
				575
				576	if (get_subtree_max_size(node->rb_left) == size) {
				577	node = node->rb_left;
				578	} else {
				579	if (va_size(va) == size) {
				580	found = true;
				581	break;
				582	}
				583
				584	node = node->rb_right;
				585	}
				586	}
				587
				588	if (!found) {
				589	va = rb_entry(n, struct vmap_area, rb_node);
				590	pr_emerg("tree is corrupted: %lu, %lu\n",
				591	va_size(va), va->subtree_max_size);
				592	}
				593
				594	augment_tree_propagate_check(n->rb_left);
				595	augment_tree_propagate_check(n->rb_right);
				596	}
				597	#endif
				598
				599	/*
				600	* This function populates subtree_max_size from bottom to upper
				601	* levels starting from VA point. The propagation must be done
				602	* when VA size is modified by changing its va_start/va_end. Or
				603	* in case of newly inserting of VA to the tree.
				604	*
				605	* It means that __augment_tree_propagate_from() must be called:
				606	* - After VA has been inserted to the tree(free path);
				607	* - After VA has been shrunk(allocation path);
				608	* - After VA has been increased(merging path).
				609	*
				610	* Please note that, it does not mean that upper parent nodes
				611	* and their subtree_max_size are recalculated all the time up
				612	* to the root node.
				613	*
				614	* 4--8
				615	* /\
				616	* / \
				617	* / \
				618	* 2--2 8--8
				619	*
				620	* For example if we modify the node 4, shrinking it to 2, then
				621	* no any modification is required. If we shrink the node 2 to 1
				622	* its subtree_max_size is updated only, and set to 1. If we shrink
				623	* the node 8 to 6, then its subtree_max_size is set to 6 and parent
				624	* node becomes 4--6.
				625	*/
				626	static __always_inline void
				627	augment_tree_propagate_from(struct vmap_area *va)
				628	{
				629	struct rb_node *node = &va->rb_node;
				630	unsigned long new_va_sub_max_size;
				631
				632	while (node) {
				633	va = rb_entry(node, struct vmap_area, rb_node);
				634	new_va_sub_max_size = compute_subtree_max_size(va);
				635
				636	/*
				637	* If the newly calculated maximum available size of the
				638	* subtree is equal to the current one, then it means that
				639	* the tree is propagated correctly. So we have to stop at
				640	* this point to save cycles.
				641	*/
				642	if (va->subtree_max_size == new_va_sub_max_size)
				643	break;
				644
				645	va->subtree_max_size = new_va_sub_max_size;
				646	node = rb_parent(&va->rb_node);
				647	}
				648
				649	#if DEBUG_AUGMENT_PROPAGATE_CHECK
				650	augment_tree_propagate_check(free_vmap_area_root.rb_node);
				651	#endif
				652	}
				653
				654	static void
				655	insert_vmap_area(struct vmap_area *va,
				656	struct rb_root root, struct list_head head)
				657	{
				658	struct rb_node **link;
				659	struct rb_node *parent;
				660
				661	link = find_va_links(va, root, NULL, &parent);
				662	link_va(va, root, parent, link, head);
				663	}
				664
				665	static void
				666	insert_vmap_area_augment(struct vmap_area *va,
				667	struct rb_node from, struct rb_root root,
				668	struct list_head *head)
				669	{
				670	struct rb_node **link;
				671	struct rb_node *parent;
				672
				673	if (from)
				674	link = find_va_links(va, NULL, from, &parent);
				675	else
				676	link = find_va_links(va, root, NULL, &parent);
				677
				678	link_va(va, root, parent, link, head);
				679	augment_tree_propagate_from(va);
				680	}
				681
				682	/*
				683	* Merge de-allocated chunk of VA memory with previous
				684	* and next free blocks. If coalesce is not done a new
				685	* free area is inserted. If VA has been merged, it is
				686	* freed.
				687	*/
				688	static __always_inline void
				689	merge_or_add_vmap_area(struct vmap_area *va,
				690	struct rb_root root, struct list_head head)
				691	{
				692	struct vmap_area *sibling;
				693	struct list_head *next;
				694	struct rb_node **link;
				695	struct rb_node *parent;
				696	bool merged = false;
				697
				698	/*
				699	* Find a place in the tree where VA potentially will be
				700	* inserted, unless it is merged with its sibling/siblings.
				701	*/
				702	link = find_va_links(va, root, NULL, &parent);
				703
				704	/*
				705	* Get next node of VA to check if merging can be done.
				706	*/
				707	next = get_va_next_sibling(parent, link);
				708	if (unlikely(next == NULL))
				709	goto insert;
				710
				711	/*
				712	* start end
				713	* \| \|
				714	* \|<------VA------>\|<-----Next----->\|
				715	* \| \|
				716	* start end
				717	*/
				718	if (next != head) {
				719	sibling = list_entry(next, struct vmap_area, list);
				720	if (sibling->va_start == va->va_end) {
				721	sibling->va_start = va->va_start;
				722
				723	/* Check and update the tree if needed. */
				724	augment_tree_propagate_from(sibling);
				725
				726	/* Free vmap_area object. */
				727	kmem_cache_free(vmap_area_cachep, va);
				728
				729	/* Point to the new merged area. */
				730	va = sibling;
				731	merged = true;
				732	}
				733	}
				734
				735	/*
				736	* start end
				737	* \| \|
				738	* \|<-----Prev----->\|<------VA------>\|
				739	* \| \|
				740	* start end
				741	*/
				742	if (next->prev != head) {
				743	sibling = list_entry(next->prev, struct vmap_area, list);
				744	if (sibling->va_end == va->va_start) {
				745	sibling->va_end = va->va_end;
				746
				747	/* Check and update the tree if needed. */
				748	augment_tree_propagate_from(sibling);
				749
				750	if (merged)
				751	unlink_va(va, root);
				752
				753	/* Free vmap_area object. */
				754	kmem_cache_free(vmap_area_cachep, va);
				755	return;
				756	}
				757	}
				758
				759	insert:
				760	if (!merged) {
				761	link_va(va, root, parent, link, head);
				762	augment_tree_propagate_from(va);
				763	}
				764	}
				765
				766	static __always_inline bool
				767	is_within_this_va(struct vmap_area *va, unsigned long size,
				768	unsigned long align, unsigned long vstart)
				769	{
				770	unsigned long nva_start_addr;
				771
				772	if (va->va_start > vstart)
				773	nva_start_addr = ALIGN(va->va_start, align);
				774	else
				775	nva_start_addr = ALIGN(vstart, align);
				776
				777	/* Can be overflowed due to big size or alignment. */
				778	if (nva_start_addr + size < nva_start_addr \|\|
				779	nva_start_addr < vstart)
				780	return false;
				781
				782	return (nva_start_addr + size <= va->va_end);
				783	}
				784
				785	/*
				786	* Find the first free block(lowest start address) in the tree,
				787	* that will accomplish the request corresponding to passing
				788	* parameters.
				789	*/
				790	static __always_inline struct vmap_area *
				791	find_vmap_lowest_match(unsigned long size,
				792	unsigned long align, unsigned long vstart)
				793	{
				794	struct vmap_area *va;
				795	struct rb_node *node;
				796	unsigned long length;
				797
				798	/* Start from the root. */
				799	node = free_vmap_area_root.rb_node;
				800
				801	/* Adjust the search size for alignment overhead. */
				802	length = size + align - 1;
				803
				804	while (node) {
				805	va = rb_entry(node, struct vmap_area, rb_node);
				806
				807	if (get_subtree_max_size(node->rb_left) >= length &&
				808	vstart < va->va_start) {
				809	node = node->rb_left;
				810	} else {
				811	if (is_within_this_va(va, size, align, vstart))
				812	return va;
				813
				814	/*
				815	* Does not make sense to go deeper towards the right
				816	* sub-tree if it does not have a free block that is
				817	* equal or bigger to the requested search length.
				818	*/
				819	if (get_subtree_max_size(node->rb_right) >= length) {
				820	node = node->rb_right;
				821	continue;
				822	}
				823
				824	/*
				825	* OK. We roll back and find the first right sub-tree,
				826	* that will satisfy the search criteria. It can happen
				827	* only once due to "vstart" restriction.
				828	*/
				829	while ((node = rb_parent(node))) {
				830	va = rb_entry(node, struct vmap_area, rb_node);
				831	if (is_within_this_va(va, size, align, vstart))
				832	return va;
				833
				834	if (get_subtree_max_size(node->rb_right) >= length &&
				835	vstart <= va->va_start) {
				836	node = node->rb_right;
				837	break;
				838	}
				839	}
				840	}
				841	}
				842
				843	return NULL;
				844	}
				845
				846	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
				847	#include <linux/random.h>
				848
				849	static struct vmap_area *
				850	find_vmap_lowest_linear_match(unsigned long size,
				851	unsigned long align, unsigned long vstart)
				852	{
				853	struct vmap_area *va;
				854
				855	list_for_each_entry(va, &free_vmap_area_list, list) {
				856	if (!is_within_this_va(va, size, align, vstart))
				857	continue;
				858
				859	return va;
				860	}
				861
				862	return NULL;
				863	}
				864
				865	static void
				866	find_vmap_lowest_match_check(unsigned long size)
				867	{
				868	struct vmap_area va_1, va_2;
				869	unsigned long vstart;
				870	unsigned int rnd;
				871
				872	get_random_bytes(&rnd, sizeof(rnd));
				873	vstart = VMALLOC_START + rnd;
				874
				875	va_1 = find_vmap_lowest_match(size, 1, vstart);
				876	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
				877
				878	if (va_1 != va_2)
				879	pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
				880	va_1, va_2, vstart);
				881	}
				882	#endif
				883
				884	enum fit_type {
				885	NOTHING_FIT = 0,
				886	FL_FIT_TYPE = 1, /* full fit */
				887	LE_FIT_TYPE = 2, /* left edge fit */
				888	RE_FIT_TYPE = 3, /* right edge fit */
				889	NE_FIT_TYPE = 4 /* no edge fit */
				890	};
				891
				892	static __always_inline enum fit_type
				893	classify_va_fit_type(struct vmap_area *va,
				894	unsigned long nva_start_addr, unsigned long size)
				895	{
				896	enum fit_type type;
				897
				898	/* Check if it is within VA. */
				899	if (nva_start_addr < va->va_start \|\|
				900	nva_start_addr + size > va->va_end)
				901	return NOTHING_FIT;
				902
				903	/* Now classify. */
				904	if (va->va_start == nva_start_addr) {
				905	if (va->va_end == nva_start_addr + size)
				906	type = FL_FIT_TYPE;
				907	else
				908	type = LE_FIT_TYPE;
				909	} else if (va->va_end == nva_start_addr + size) {
				910	type = RE_FIT_TYPE;
				911	} else {
				912	type = NE_FIT_TYPE;
				913	}
				914
				915	return type;
				916	}
				917
				918	static __always_inline int
				919	adjust_va_to_fit_type(struct vmap_area *va,
				920	unsigned long nva_start_addr, unsigned long size,
				921	enum fit_type type)
				922	{
				923	struct vmap_area *lva = NULL;
				924
				925	if (type == FL_FIT_TYPE) {
				926	/*
				927	* No need to split VA, it fully fits.
				928	*
				929	* \| \|
				930	* V NVA V
				931	* \|---------------\|
				932	*/
				933	unlink_va(va, &free_vmap_area_root);
				934	kmem_cache_free(vmap_area_cachep, va);
				935	} else if (type == LE_FIT_TYPE) {
				936	/*
				937	* Split left edge of fit VA.
				938	*
				939	* \| \|
				940	* V NVA V R
				941	* \|-------\|-------\|
				942	*/
				943	va->va_start += size;
				944	} else if (type == RE_FIT_TYPE) {
				945	/*
				946	* Split right edge of fit VA.
				947	*
				948	* \| \|
				949	* L V NVA V
				950	* \|-------\|-------\|
				951	*/
				952	va->va_end = nva_start_addr;
				953	} else if (type == NE_FIT_TYPE) {
				954	/*
				955	* Split no edge of fit VA.
				956	*
				957	* \| \|
				958	* L V NVA V R
				959	* \|---\|-------\|---\|
				960	*/
				961	lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
				962	if (unlikely(!lva)) {
				963	/*
				964	* For percpu allocator we do not do any pre-allocation
				965	* and leave it as it is. The reason is it most likely
				966	* never ends up with NE_FIT_TYPE splitting. In case of
				967	* percpu allocations offsets and sizes are aligned to
				968	* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
				969	* are its main fitting cases.
				970	*
				971	* There are a few exceptions though, as an example it is
				972	* a first allocation (early boot up) when we have "one"
				973	* big free space that has to be split.
				974	*/
				975	lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
				976	if (!lva)
				977	return -1;
				978	}
				979
				980	/*
				981	* Build the remainder.
				982	*/
				983	lva->va_start = va->va_start;
				984	lva->va_end = nva_start_addr;
				985
				986	/*
				987	* Shrink this VA to remaining size.
				988	*/
				989	va->va_start = nva_start_addr + size;
				990	} else {
				991	return -1;
				992	}
				993
				994	if (type != FL_FIT_TYPE) {
				995	augment_tree_propagate_from(va);
				996
				997	if (lva) /* type == NE_FIT_TYPE */
				998	insert_vmap_area_augment(lva, &va->rb_node,
				999	&free_vmap_area_root, &free_vmap_area_list);
				1000	}
				1001
				1002	return 0;
				1003	}
				1004
				1005	/*
				1006	* Returns a start address of the newly allocated area, if success.
				1007	* Otherwise a vend is returned that indicates failure.
				1008	*/
				1009	static __always_inline unsigned long
				1010	__alloc_vmap_area(unsigned long size, unsigned long align,
				1011	unsigned long vstart, unsigned long vend)
				1012	{
				1013	unsigned long nva_start_addr;
				1014	struct vmap_area *va;
				1015	enum fit_type type;
				1016	int ret;
				1017
				1018	va = find_vmap_lowest_match(size, align, vstart);
				1019	if (unlikely(!va))
				1020	return vend;
				1021
				1022	if (va->va_start > vstart)
				1023	nva_start_addr = ALIGN(va->va_start, align);
				1024	else
				1025	nva_start_addr = ALIGN(vstart, align);
				1026
				1027	/* Check the "vend" restriction. */
				1028	if (nva_start_addr + size > vend)
				1029	return vend;
				1030
				1031	/* Classify what we have found. */
				1032	type = classify_va_fit_type(va, nva_start_addr, size);
				1033	if (WARN_ON_ONCE(type == NOTHING_FIT))
				1034	return vend;
				1035
				1036	/* Update the free vmap_area. */
				1037	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
				1038	if (ret)
				1039	return vend;
				1040
				1041	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
				1042	find_vmap_lowest_match_check(size);
				1043	#endif
				1044
				1045	return nva_start_addr;
				1046	}
				1047
				1048	/*
				1049	* Allocate a region of KVA of the specified size and alignment, within the
				1050	* vstart and vend.
				1051	*/
				1052	static struct vmap_area *alloc_vmap_area(unsigned long size,
				1053	unsigned long align,
				1054	unsigned long vstart, unsigned long vend,
				1055	int node, gfp_t gfp_mask)
				1056	{
				1057	struct vmap_area va, pva;
				1058	unsigned long addr;
				1059	int purged = 0;
				1060
				1061	BUG_ON(!size);
				1062	BUG_ON(offset_in_page(size));
				1063	BUG_ON(!is_power_of_2(align));
				1064
				1065	if (unlikely(!vmap_initialized))
				1066	return ERR_PTR(-EBUSY);
				1067
				1068	might_sleep();
				1069
				1070	va = kmem_cache_alloc_node(vmap_area_cachep,
				1071	gfp_mask & GFP_RECLAIM_MASK, node);
				1072	if (unlikely(!va))
				1073	return ERR_PTR(-ENOMEM);
				1074
				1075	/*
				1076	* Only scan the relevant parts containing pointers to other objects
				1077	* to avoid false negatives.
				1078	*/
				1079	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
				1080
				1081	retry:
				1082	/*
				1083	* Preload this CPU with one extra vmap_area object to ensure
				1084	* that we have it available when fit type of free area is
				1085	* NE_FIT_TYPE.
				1086	*
				1087	* The preload is done in non-atomic context, thus it allows us
				1088	* to use more permissive allocation masks to be more stable under
				1089	* low memory condition and high memory pressure.
				1090	*
				1091	* Even if it fails we do not really care about that. Just proceed
				1092	* as it is. "overflow" path will refill the cache we allocate from.
				1093	*/
				1094	preempt_disable();
				1095	if (!__this_cpu_read(ne_fit_preload_node)) {
				1096	preempt_enable();
				1097	pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
				1098	preempt_disable();
				1099
				1100	if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
				1101	if (pva)
				1102	kmem_cache_free(vmap_area_cachep, pva);
				1103	}
				1104	}
				1105
				1106	spin_lock(&vmap_area_lock);
				1107	preempt_enable();
				1108
				1109	/*
				1110	* If an allocation fails, the "vend" address is
				1111	* returned. Therefore trigger the overflow path.
				1112	*/
				1113	addr = __alloc_vmap_area(size, align, vstart, vend);
				1114	if (unlikely(addr == vend))
				1115	goto overflow;
				1116
				1117	va->va_start = addr;
				1118	va->va_end = addr + size;
				1119	va->vm = NULL;
				1120	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
				1121
				1122	spin_unlock(&vmap_area_lock);
				1123
				1124	BUG_ON(!IS_ALIGNED(va->va_start, align));
				1125	BUG_ON(va->va_start < vstart);
				1126	BUG_ON(va->va_end > vend);
				1127
				1128	return va;
				1129
				1130	overflow:
				1131	spin_unlock(&vmap_area_lock);
				1132	if (!purged) {
				1133	purge_vmap_area_lazy();
				1134	purged = 1;
				1135	goto retry;
				1136	}
				1137
				1138	if (gfpflags_allow_blocking(gfp_mask)) {
				1139	unsigned long freed = 0;
				1140	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
				1141	if (freed > 0) {
				1142	purged = 0;
				1143	goto retry;
				1144	}
				1145	}
				1146
				1147	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
				1148	pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
				1149	size);
				1150
				1151	kmem_cache_free(vmap_area_cachep, va);
				1152	return ERR_PTR(-EBUSY);
				1153	}
				1154
				1155	int register_vmap_purge_notifier(struct notifier_block *nb)
				1156	{
				1157	return blocking_notifier_chain_register(&vmap_notify_list, nb);
				1158	}
				1159	EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
				1160
				1161	int unregister_vmap_purge_notifier(struct notifier_block *nb)
				1162	{
				1163	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
				1164	}
				1165	EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
				1166
				1167	static void __free_vmap_area(struct vmap_area *va)
				1168	{
				1169	/*
				1170	* Remove from the busy tree/list.
				1171	*/
				1172	unlink_va(va, &vmap_area_root);
				1173
				1174	/*
				1175	* Merge VA with its neighbors, otherwise just add it.
				1176	*/
				1177	merge_or_add_vmap_area(va,
				1178	&free_vmap_area_root, &free_vmap_area_list);
				1179	}
				1180
				1181	/*
				1182	* Free a region of KVA allocated by alloc_vmap_area
				1183	*/
				1184	static void free_vmap_area(struct vmap_area *va)
				1185	{
				1186	spin_lock(&vmap_area_lock);
				1187	__free_vmap_area(va);
				1188	spin_unlock(&vmap_area_lock);
				1189	}
				1190
				1191	/*
				1192	* Clear the pagetable entries of a given vmap_area
				1193	*/
				1194	static void unmap_vmap_area(struct vmap_area *va)
				1195	{
				1196	vunmap_page_range(va->va_start, va->va_end);
				1197	}
				1198
				1199	/*
				1200	* lazy_max_pages is the maximum amount of virtual address space we gather up
				1201	* before attempting to purge with a TLB flush.
				1202	*
				1203	* There is a tradeoff here: a larger number will cover more kernel page tables
				1204	* and take slightly longer to purge, but it will linearly reduce the number of
				1205	* global TLB flushes that must be performed. It would seem natural to scale
				1206	* this number up linearly with the number of CPUs (because vmapping activity
				1207	* could also scale linearly with the number of CPUs), however it is likely
				1208	* that in practice, workloads might be constrained in other ways that mean
				1209	* vmap activity will not scale linearly with CPUs. Also, I want to be
				1210	* conservative and not introduce a big latency on huge systems, so go with
				1211	* a less aggressive log scale. It will still be an improvement over the old
				1212	* code, and it will be simple to change the scale factor if we find that it
				1213	* becomes a problem on bigger systems.
				1214	*/
				1215	static unsigned long lazy_max_pages(void)
				1216	{
				1217	unsigned int log;
				1218
				1219	log = fls(num_online_cpus());
				1220
				1221	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
				1222	}
				1223
				1224	static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
				1225
				1226	/*
				1227	* Serialize vmap purging. There is no actual critical section protected
				1228	* by this lock, but we want to avoid concurrent calls for performance
				1229	* reasons and to make the pcpu_get_vm_areas more deterministic.
				1230	*/
				1231	static DEFINE_MUTEX(vmap_purge_lock);
				1232
				1233	/* for per-CPU blocks */
				1234	static void purge_fragmented_blocks_allcpus(void);
				1235
				1236	/*
				1237	* called before a call to iounmap() if the caller wants vm_area_struct's
				1238	* immediately freed.
				1239	*/
				1240	void set_iounmap_nonlazy(void)
				1241	{
				1242	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
				1243	}
				1244
				1245	/*
				1246	* Purges all lazily-freed vmap areas.
				1247	*/
				1248	static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
				1249	{
				1250	unsigned long resched_threshold;
				1251	struct llist_node *valist;
				1252	struct vmap_area *va;
				1253	struct vmap_area *n_va;
				1254
				1255	lockdep_assert_held(&vmap_purge_lock);
				1256
				1257	valist = llist_del_all(&vmap_purge_list);
				1258	if (unlikely(valist == NULL))
				1259	return false;
				1260
				1261	/* assert on wrong valist */
				1262	if (unlikely((ulong)valist < PAGE_OFFSET)) {
				1263	pr_err("%s: valist %lx\n", __func__, (unsigned long)valist);
				1264	BUG();
				1265	}
				1266
				1267	/*
				1268	* First make sure the mappings are removed from all page-tables
				1269	* before they are freed.
				1270	*/
				1271	vmalloc_sync_unmappings();
				1272
				1273	/*
				1274	* TODO: to calculate a flush range without looping.
				1275	* The list can be up to lazy_max_pages() elements.
				1276	*/
				1277	llist_for_each_entry(va, valist, purge_list) {
				1278	if (va->va_start < start)
				1279	start = va->va_start;
				1280	if (va->va_end > end)
				1281	end = va->va_end;
				1282	}
				1283
				1284	flush_tlb_kernel_range(start, end);
				1285	resched_threshold = lazy_max_pages() << 1;
				1286
				1287	spin_lock(&vmap_area_lock);
				1288	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
				1289	unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
				1290
				1291	/*
				1292	* Finally insert or merge lazily-freed area. It is
				1293	* detached and there is no need to "unlink" it from
				1294	* anything.
				1295	*/
				1296	merge_or_add_vmap_area(va,
				1297	&free_vmap_area_root, &free_vmap_area_list);
				1298
				1299	atomic_long_sub(nr, &vmap_lazy_nr);
				1300
				1301	if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
				1302	cond_resched_lock(&vmap_area_lock);
				1303	}
				1304	spin_unlock(&vmap_area_lock);
				1305	return true;
				1306	}
				1307
				1308	/*
				1309	* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
				1310	* is already purging.
				1311	*/
				1312	static void try_purge_vmap_area_lazy(void)
				1313	{
				1314	if (mutex_trylock(&vmap_purge_lock)) {
				1315	__purge_vmap_area_lazy(ULONG_MAX, 0);
				1316	mutex_unlock(&vmap_purge_lock);
				1317	}
				1318	}
				1319
				1320	/*
				1321	* Kick off a purge of the outstanding lazy areas.
				1322	*/
				1323	static void purge_vmap_area_lazy(void)
				1324	{
				1325	mutex_lock(&vmap_purge_lock);
				1326	purge_fragmented_blocks_allcpus();
				1327	__purge_vmap_area_lazy(ULONG_MAX, 0);
				1328	mutex_unlock(&vmap_purge_lock);
				1329	}
				1330
				1331	/*
				1332	* Free a vmap area, caller ensuring that the area has been unmapped
				1333	* and flush_cache_vunmap had been called for the correct range
				1334	* previously.
				1335	*/
				1336	static void free_vmap_area_noflush(struct vmap_area *va)
				1337	{
				1338	unsigned long nr_lazy;
				1339
				1340	spin_lock(&vmap_area_lock);
				1341	unlink_va(va, &vmap_area_root);
				1342	spin_unlock(&vmap_area_lock);
				1343
				1344	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				1345	PAGE_SHIFT, &vmap_lazy_nr);
				1346
				1347	/* After this point, we may free va at any time */
				1348	llist_add(&va->purge_list, &vmap_purge_list);
				1349
				1350	if (unlikely(nr_lazy > lazy_max_pages()))
				1351	try_purge_vmap_area_lazy();
				1352	}
				1353
				1354	/*
				1355	* Free and unmap a vmap area
				1356	*/
				1357	static void free_unmap_vmap_area(struct vmap_area *va)
				1358	{
				1359	flush_cache_vunmap(va->va_start, va->va_end);
				1360	unmap_vmap_area(va);
				1361	if (debug_pagealloc_enabled_static())
				1362	flush_tlb_kernel_range(va->va_start, va->va_end);
				1363
				1364	free_vmap_area_noflush(va);
				1365	}
				1366
				1367	static struct vmap_area *find_vmap_area(unsigned long addr)
				1368	{
				1369	struct vmap_area *va;
				1370
				1371	spin_lock(&vmap_area_lock);
				1372	va = __find_vmap_area(addr);
				1373	spin_unlock(&vmap_area_lock);
				1374
				1375	return va;
				1376	}
				1377
				1378	/* Per cpu kva allocator */
				1379
				1380	/*
				1381	* vmap space is limited especially on 32 bit architectures. Ensure there is
				1382	* room for at least 16 percpu vmap blocks per CPU.
				1383	*/
				1384	/*
				1385	* If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
				1386	* to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
				1387	* instead (we just need a rough idea)
				1388	*/
				/* Rough size of the vmalloc address space: 128MB on 32-bit, 128GB otherwise. */
				#if BITS_PER_LONG == 32
				#define VMALLOC_SPACE		(128UL*1024*1024)
				#else
				#define VMALLOC_SPACE		(128UL*1024*1024*1024)
				#endif

				#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
				#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
				#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
				#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
				#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
				#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
				/*
				 * Pages per vmap block: 1/16 of the per-CPU share of the vmalloc space,
				 * clamped to [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
				 */
				#define VMAP_BBMAP_BITS		\
						VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
						VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
							VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

				/* Size of one vmap block in bytes. */
				#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
				1407
				1408	struct vmap_block_queue {
				1409	spinlock_t lock;
				1410	struct list_head free;
				1411	};
				1412
				1413	struct vmap_block {
				1414	spinlock_t lock;
				1415	struct vmap_area *va;
				1416	unsigned long free, dirty;
				1417	unsigned long dirty_min, dirty_max; /< dirty range /
				1418	struct list_head free_list;
				1419	struct rcu_head rcu_head;
				1420	struct list_head purge;
				1421	};
				1422
				1423	/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
				1424	static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
				1425
				1426	/*
				1427	* Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
				1428	* in the free path. Could get rid of this if we change the API to return a
				1429	* "cookie" from alloc, to be passed to free. But no big deal yet.
				1430	*/
				1431	static DEFINE_SPINLOCK(vmap_block_tree_lock);
				1432	static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
				1433
				1434	/*
				1435	* We should probably have a fallback mechanism to allocate virtual memory
				1436	* out of partially filled vmap blocks. However vmap block sizing should be
				1437	* fairly reasonable according to the vmalloc size, so it shouldn't be a
				1438	* big problem.
				1439	*/
				1440
				1441	static unsigned long addr_to_vb_idx(unsigned long addr)
				1442	{
				1443	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
				1444	addr /= VMAP_BLOCK_SIZE;
				1445	return addr;
				1446	}
				1447
				1448	static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
				1449	{
				1450	unsigned long addr;
				1451
				1452	addr = va_start + (pages_off << PAGE_SHIFT);
				1453	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
				1454	return (void *)addr;
				1455	}
				1456
				1457	/**
				1458	* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
				1459	* block. Of course pages number can't exceed VMAP_BBMAP_BITS
				1460	* @order: how many 2^order pages should be occupied in newly allocated block
				1461	* @gfp_mask: flags for the page level allocator
				1462	*
				1463	* Return: virtual address in a newly allocated block or ERR_PTR(-errno)
				1464	*/
				1465	static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
				1466	{
				1467	struct vmap_block_queue *vbq;
				1468	struct vmap_block *vb;
				1469	struct vmap_area *va;
				1470	unsigned long vb_idx;
				1471	int node, err;
				1472	void *vaddr;
				1473
				1474	node = numa_node_id();
				1475
				1476	vb = kmalloc_node(sizeof(struct vmap_block),
				1477	gfp_mask & GFP_RECLAIM_MASK, node);
				1478	if (unlikely(!vb))
				1479	return ERR_PTR(-ENOMEM);
				1480
				1481	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
				1482	VMALLOC_START, VMALLOC_END,
				1483	node, gfp_mask);
				1484	if (IS_ERR(va)) {
				1485	kfree(vb);
				1486	return ERR_CAST(va);
				1487	}
				1488
				1489	err = radix_tree_preload(gfp_mask);
				1490	if (unlikely(err)) {
				1491	kfree(vb);
				1492	free_vmap_area(va);
				1493	return ERR_PTR(err);
				1494	}
				1495
				1496	vaddr = vmap_block_vaddr(va->va_start, 0);
				1497	spin_lock_init(&vb->lock);
				1498	vb->va = va;
				1499	/* At least something should be left free */
				1500	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
				1501	vb->free = VMAP_BBMAP_BITS - (1UL << order);
				1502	vb->dirty = 0;
				1503	vb->dirty_min = VMAP_BBMAP_BITS;
				1504	vb->dirty_max = 0;
				1505	INIT_LIST_HEAD(&vb->free_list);
				1506
				1507	vb_idx = addr_to_vb_idx(va->va_start);
				1508	spin_lock(&vmap_block_tree_lock);
				1509	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
				1510	spin_unlock(&vmap_block_tree_lock);
				1511	BUG_ON(err);
				1512	radix_tree_preload_end();
				1513
				1514	vbq = &get_cpu_var(vmap_block_queue);
				1515	spin_lock(&vbq->lock);
				1516	list_add_tail_rcu(&vb->free_list, &vbq->free);
				1517	spin_unlock(&vbq->lock);
				1518	put_cpu_var(vmap_block_queue);
				1519
				1520	return vaddr;
				1521	}
				1522
				1523	static void free_vmap_block(struct vmap_block *vb)
				1524	{
				1525	struct vmap_block *tmp;
				1526	unsigned long vb_idx;
				1527
				1528	vb_idx = addr_to_vb_idx(vb->va->va_start);
				1529	spin_lock(&vmap_block_tree_lock);
				1530	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
				1531	spin_unlock(&vmap_block_tree_lock);
				1532	BUG_ON(tmp != vb);
				1533
				1534	free_vmap_area_noflush(vb->va);
				1535	kfree_rcu(vb, rcu_head);
				1536	}
				1537
				1538	static void purge_fragmented_blocks(int cpu)
				1539	{
				1540	LIST_HEAD(purge);
				1541	struct vmap_block *vb;
				1542	struct vmap_block *n_vb;
				1543	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
				1544
				1545	rcu_read_lock();
				1546	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				1547
				1548	if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
				1549	continue;
				1550
				1551	spin_lock(&vb->lock);
				1552	if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
				1553	vb->free = 0; /* prevent further allocs after releasing lock */
				1554	vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
				1555	vb->dirty_min = 0;
				1556	vb->dirty_max = VMAP_BBMAP_BITS;
				1557	spin_lock(&vbq->lock);
				1558	list_del_rcu(&vb->free_list);
				1559	spin_unlock(&vbq->lock);
				1560	spin_unlock(&vb->lock);
				1561	list_add_tail(&vb->purge, &purge);
				1562	} else
				1563	spin_unlock(&vb->lock);
				1564	}
				1565	rcu_read_unlock();
				1566
				1567	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
				1568	list_del(&vb->purge);
				1569	free_vmap_block(vb);
				1570	}
				1571	}
				1572
				1573	static void purge_fragmented_blocks_allcpus(void)
				1574	{
				1575	int cpu;
				1576
				1577	for_each_possible_cpu(cpu)
				1578	purge_fragmented_blocks(cpu);
				1579	}
				1580
				1581	static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
				1582	{
				1583	struct vmap_block_queue *vbq;
				1584	struct vmap_block *vb;
				1585	void *vaddr = NULL;
				1586	unsigned int order;
				1587
				1588	BUG_ON(offset_in_page(size));
				1589	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
				1590	if (WARN_ON(size == 0)) {
				1591	/*
				1592	* Allocating 0 bytes isn't what caller wants since
				1593	* get_order(0) returns funny result. Just warn and terminate
				1594	* early.
				1595	*/
				1596	return NULL;
				1597	}
				1598	order = get_order(size);
				1599
				1600	rcu_read_lock();
				1601	vbq = &get_cpu_var(vmap_block_queue);
				1602	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				1603	unsigned long pages_off;
				1604
				1605	spin_lock(&vb->lock);
				1606	if (vb->free < (1UL << order)) {
				1607	spin_unlock(&vb->lock);
				1608	continue;
				1609	}
				1610
				1611	pages_off = VMAP_BBMAP_BITS - vb->free;
				1612	vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
				1613	vb->free -= 1UL << order;
				1614	if (vb->free == 0) {
				1615	spin_lock(&vbq->lock);
				1616	list_del_rcu(&vb->free_list);
				1617	spin_unlock(&vbq->lock);
				1618	}
				1619
				1620	spin_unlock(&vb->lock);
				1621	break;
				1622	}
				1623
				1624	put_cpu_var(vmap_block_queue);
				1625	rcu_read_unlock();
				1626
				1627	/* Allocate new block if nothing was found */
				1628	if (!vaddr)
				1629	vaddr = new_vmap_block(order, gfp_mask);
				1630
				1631	return vaddr;
				1632	}
				1633
				1634	static void vb_free(const void *addr, unsigned long size)
				1635	{
				1636	unsigned long offset;
				1637	unsigned long vb_idx;
				1638	unsigned int order;
				1639	struct vmap_block *vb;
				1640
				1641	BUG_ON(offset_in_page(size));
				1642	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
				1643
				1644	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
				1645
				1646	order = get_order(size);
				1647
				1648	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
				1649	offset >>= PAGE_SHIFT;
				1650
				1651	vb_idx = addr_to_vb_idx((unsigned long)addr);
				1652	rcu_read_lock();
				1653	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
				1654	rcu_read_unlock();
				1655	BUG_ON(!vb);
				1656
				1657	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
				1658
				1659	if (debug_pagealloc_enabled_static())
				1660	flush_tlb_kernel_range((unsigned long)addr,
				1661	(unsigned long)addr + size);
				1662
				1663	spin_lock(&vb->lock);
				1664
				1665	/* Expand dirty range */
				1666	vb->dirty_min = min(vb->dirty_min, offset);
				1667	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
				1668
				1669	vb->dirty += 1UL << order;
				1670	if (vb->dirty == VMAP_BBMAP_BITS) {
				1671	BUG_ON(vb->free);
				1672	spin_unlock(&vb->lock);
				1673	free_vmap_block(vb);
				1674	} else
				1675	spin_unlock(&vb->lock);
				1676	}
				1677
				1678	static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
				1679	{
				1680	int cpu;
				1681
				1682	if (unlikely(!vmap_initialized))
				1683	return;
				1684
				1685	might_sleep();
				1686
				1687	for_each_possible_cpu(cpu) {
				1688	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
				1689	struct vmap_block *vb;
				1690
				1691	rcu_read_lock();
				1692	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
				1693	spin_lock(&vb->lock);
				1694	if (vb->dirty) {
				1695	unsigned long va_start = vb->va->va_start;
				1696	unsigned long s, e;
				1697
				1698	s = va_start + (vb->dirty_min << PAGE_SHIFT);
				1699	e = va_start + (vb->dirty_max << PAGE_SHIFT);
				1700
				1701	start = min(s, start);
				1702	end = max(e, end);
				1703
				1704	flush = 1;
				1705	}
				1706	spin_unlock(&vb->lock);
				1707	}
				1708	rcu_read_unlock();
				1709	}
				1710
				1711	mutex_lock(&vmap_purge_lock);
				1712	purge_fragmented_blocks_allcpus();
				1713	if (!__purge_vmap_area_lazy(start, end) && flush)
				1714	flush_tlb_kernel_range(start, end);
				1715	mutex_unlock(&vmap_purge_lock);
				1716	}
				1717
				1718	/**
				1719	* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
				1720	*
				1721	* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
				1722	* to amortize TLB flushing overheads. What this means is that any page you
				1723	* have now, may, in a former life, have been mapped into kernel virtual
				1724	* address by the vmap layer and so there might be some CPUs with TLB entries
				1725	* still referencing that page (additional to the regular 1:1 kernel mapping).
				1726	*
				1727	* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
				1728	* be sure that none of the pages we have control over will have any aliases
				1729	* from the vmap layer.
				1730	*/
				1731	void vm_unmap_aliases(void)
				1732	{
				1733	unsigned long start = ULONG_MAX, end = 0;
				1734	int flush = 0;
				1735
				1736	_vm_unmap_aliases(start, end, flush);
				1737	}
				1738	EXPORT_SYMBOL_GPL(vm_unmap_aliases);
				1739
				1740	/**
				1741	* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
				1742	* @mem: the pointer returned by vm_map_ram
				1743	* @count: the count passed to that vm_map_ram call (cannot unmap partial)
				1744	*/
				1745	void vm_unmap_ram(const void *mem, unsigned int count)
				1746	{
				1747	unsigned long size = (unsigned long)count << PAGE_SHIFT;
				1748	unsigned long addr = (unsigned long)mem;
				1749	struct vmap_area *va;
				1750
				1751	might_sleep();
				1752	BUG_ON(!addr);
				1753	BUG_ON(addr < VMALLOC_START);
				1754	BUG_ON(addr > VMALLOC_END);
				1755	BUG_ON(!PAGE_ALIGNED(addr));
				1756
				1757	if (likely(count <= VMAP_MAX_ALLOC)) {
				1758	debug_check_no_locks_freed(mem, size);
				1759	vb_free(mem, size);
				1760	return;
				1761	}
				1762
				1763	va = find_vmap_area(addr);
				1764	BUG_ON(!va);
				1765	debug_check_no_locks_freed((void *)va->va_start,
				1766	(va->va_end - va->va_start));
				1767	free_unmap_vmap_area(va);
				1768	}
				1769	EXPORT_SYMBOL(vm_unmap_ram);
				1770
				1771	/**
				1772	* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
				1773	* @pages: an array of pointers to the pages to be mapped
				1774	* @count: number of pages
				1775	* @node: prefer to allocate data structures on this node
				1776	* @prot: memory protection to use. PAGE_KERNEL for regular RAM
				1777	*
				1778	* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
				1779	* faster than vmap so it's good. But if you mix long-life and short-life
				1780	* objects with vm_map_ram(), it could consume lots of address space through
				1781	* fragmentation (especially on a 32bit machine). You could see failures in
				1782	* the end. Please use this function for short-lived objects.
				1783	*
				1784	* Returns: a pointer to the address that has been mapped, or %NULL on failure
				1785	*/
				1786	void vm_map_ram(struct page *pages, unsigned int count, int node, pgprot_t prot)
				1787	{
				1788	unsigned long size = (unsigned long)count << PAGE_SHIFT;
				1789	unsigned long addr;
				1790	void *mem;
				1791
				1792	if (likely(count <= VMAP_MAX_ALLOC)) {
				1793	mem = vb_alloc(size, GFP_KERNEL);
				1794	if (IS_ERR(mem))
				1795	return NULL;
				1796	addr = (unsigned long)mem;
				1797	} else {
				1798	struct vmap_area *va;
				1799	va = alloc_vmap_area(size, PAGE_SIZE,
				1800	VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
				1801	if (IS_ERR(va))
				1802	return NULL;
				1803
				1804	addr = va->va_start;
				1805	mem = (void *)addr;
				1806	}
				1807	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
				1808	vm_unmap_ram(mem, count);
				1809	return NULL;
				1810	}
				1811	return mem;
				1812	}
				1813	EXPORT_SYMBOL(vm_map_ram);
				1814
				1815	static struct vm_struct *vmlist __initdata;
				1816
				1817	/**
				1818	* vm_area_add_early - add vmap area early during boot
				1819	* @vm: vm_struct to add
				1820	*
				1821	* This function is used to add fixed kernel vm area to vmlist before
				1822	* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
				1823	* should contain proper values and the other fields should be zero.
				1824	*
				1825	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
				1826	*/
				1827	void __init vm_area_add_early(struct vm_struct *vm)
				1828	{
				1829	struct vm_struct tmp, *p;
				1830
				1831	BUG_ON(vmap_initialized);
				1832	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
				1833	if (tmp->addr >= vm->addr) {
				1834	BUG_ON(tmp->addr < vm->addr + vm->size);
				1835	break;
				1836	} else
				1837	BUG_ON(tmp->addr + tmp->size > vm->addr);
				1838	}
				1839	vm->next = *p;
				1840	*p = vm;
				1841	}
				1842
				1843	/**
				1844	* vm_area_register_early - register vmap area early during boot
				1845	* @vm: vm_struct to register
				1846	* @align: requested alignment
				1847	*
				1848	* This function is used to register kernel vm area before
				1849	* vmalloc_init() is called. @vm->size and @vm->flags should contain
				1850	* proper values on entry and other fields should be zero. On return,
				1851	* vm->addr contains the allocated address.
				1852	*
				1853	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
				1854	*/
				1855	void __init vm_area_register_early(struct vm_struct *vm, size_t align)
				1856	{
				1857	static size_t vm_init_off __initdata;
				1858	unsigned long addr;
				1859
				1860	addr = ALIGN(VMALLOC_START + vm_init_off, align);
				1861	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
				1862
				1863	vm->addr = (void *)addr;
				1864
				1865	vm_area_add_early(vm);
				1866	}
				1867
				1868	static void vmap_init_free_space(void)
				1869	{
				1870	unsigned long vmap_start = 1;
				1871	const unsigned long vmap_end = ULONG_MAX;
				1872	struct vmap_area busy, free;
				1873
				1874	/*
				1875	* B F B B B F
				1876	* -\|-----\|.....\|-----\|-----\|-----\|.....\|-
				1877	* \| The KVA space \|
				1878	* \|<--------------------------------->\|
				1879	*/
				1880	list_for_each_entry(busy, &vmap_area_list, list) {
				1881	if (busy->va_start - vmap_start > 0) {
				1882	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
				1883	if (!WARN_ON_ONCE(!free)) {
				1884	free->va_start = vmap_start;
				1885	free->va_end = busy->va_start;
				1886
				1887	insert_vmap_area_augment(free, NULL,
				1888	&free_vmap_area_root,
				1889	&free_vmap_area_list);
				1890	}
				1891	}
				1892
				1893	vmap_start = busy->va_end;
				1894	}
				1895
				1896	if (vmap_end - vmap_start > 0) {
				1897	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
				1898	if (!WARN_ON_ONCE(!free)) {
				1899	free->va_start = vmap_start;
				1900	free->va_end = vmap_end;
				1901
				1902	insert_vmap_area_augment(free, NULL,
				1903	&free_vmap_area_root,
				1904	&free_vmap_area_list);
				1905	}
				1906	}
				1907	}
				1908
				1909	void __init vmalloc_init(void)
				1910	{
				1911	struct vmap_area *va;
				1912	struct vm_struct *tmp;
				1913	int i;
				1914
				1915	/*
				1916	* Create the cache for vmap_area objects.
				1917	*/
				1918	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
				1919
				1920	for_each_possible_cpu(i) {
				1921	struct vmap_block_queue *vbq;
				1922	struct vfree_deferred *p;
				1923
				1924	vbq = &per_cpu(vmap_block_queue, i);
				1925	spin_lock_init(&vbq->lock);
				1926	INIT_LIST_HEAD(&vbq->free);
				1927	p = &per_cpu(vfree_deferred, i);
				1928	init_llist_head(&p->list);
				1929	INIT_WORK(&p->wq, free_work);
				1930	}
				1931
				1932	/* Import existing vmlist entries. */
				1933	for (tmp = vmlist; tmp; tmp = tmp->next) {
				1934	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
				1935	if (WARN_ON_ONCE(!va))
				1936	continue;
				1937
				1938	va->va_start = (unsigned long)tmp->addr;
				1939	va->va_end = va->va_start + tmp->size;
				1940	va->vm = tmp;
				1941	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
				1942	}
				1943
				1944	/*
				1945	* Now we can initialize a free vmap space.
				1946	*/
				1947	vmap_init_free_space();
				1948	vmap_initialized = true;
				1949	}
				1950
				1951	/**
				1952	* map_kernel_range_noflush - map kernel VM area with the specified pages
				1953	* @addr: start of the VM area to map
				1954	* @size: size of the VM area to map
				1955	* @prot: page protection flags to use
				1956	* @pages: pages to map
				1957	*
				1958	* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
				1959	* specify should have been allocated using get_vm_area() and its
				1960	* friends.
				1961	*
				1962	* NOTE:
				1963	* This function does NOT do any cache flushing. The caller is
				1964	* responsible for calling flush_cache_vmap() on to-be-mapped areas
				1965	* before calling this function.
				1966	*
				1967	* RETURNS:
				1968	* The number of pages mapped on success, -errno on failure.
				1969	*/
				1970	int map_kernel_range_noflush(unsigned long addr, unsigned long size,
				1971	pgprot_t prot, struct page **pages)
				1972	{
				1973	return vmap_page_range_noflush(addr, addr + size, prot, pages);
				1974	}
				1975
				/**
				 * unmap_kernel_range_noflush - tear down a kernel VM range, no flushing
				 * @addr: first virtual address of the range to unmap
				 * @size: length of the range in bytes
				 *
				 * Unmaps PFN_UP(@size) pages beginning at @addr.  The range given by
				 * @addr and @size should have been obtained from get_vm_area() or one
				 * of its friends.
				 *
				 * NOTE:
				 * No cache or TLB flushing happens here.  The caller must call
				 * flush_cache_vunmap() on the to-be-unmapped range before calling
				 * this function and flush_tlb_kernel_range() afterwards.
				 */
				void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
				{
					unsigned long end = addr + size;

					vunmap_page_range(addr, end);
				}
				EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
				1995
				/**
				 * unmap_kernel_range - unmap a kernel VM range with cache and TLB flushes
				 * @addr: first virtual address of the range to unmap
				 * @size: length of the range in bytes
				 *
				 * Same job as unmap_kernel_range_noflush(), except that the cache is
				 * flushed before the page tables are torn down and the TLB afterwards.
				 */
				void unmap_kernel_range(unsigned long addr, unsigned long size)
				{
					const unsigned long stop = addr + size;

					/* Order matters: cache flush, unmap, then TLB flush. */
					flush_cache_vunmap(addr, stop);
					vunmap_page_range(addr, stop);
					flush_tlb_kernel_range(addr, stop);
				}
				EXPORT_SYMBOL_GPL(unmap_kernel_range);
				2013
				2014	int map_vm_area(struct vm_struct area, pgprot_t prot, struct page *pages)
				2015	{
				2016	unsigned long addr = (unsigned long)area->addr;
				2017	unsigned long end = addr + get_vm_area_size(area);
				2018	int err;
				2019
				2020	err = vmap_page_range(addr, end, prot, pages);
				2021
				2022	return err > 0 ? 0 : err;
				2023	}
				2024	EXPORT_SYMBOL_GPL(map_vm_area);
				2025
				2026	static void setup_vmalloc_vm(struct vm_struct vm, struct vmap_area va,
				2027	unsigned long flags, const void *caller)
				2028	{
				2029	spin_lock(&vmap_area_lock);
				2030	vm->flags = flags;
				2031	vm->addr = (void *)va->va_start;
				2032	vm->size = va->va_end - va->va_start;
				2033	vm->caller = caller;
				2034	va->vm = vm;
				2035	spin_unlock(&vmap_area_lock);
				2036	}
				2037
				2038	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
				2039	{
				2040	/*
				2041	* Before removing VM_UNINITIALIZED,
				2042	* we should make sure that vm has proper values.
				2043	* Pair with smp_rmb() in show_numa_info().
				2044	*/
				2045	smp_wmb();
				2046	vm->flags &= ~VM_UNINITIALIZED;
				2047	}
				2048
				2049	static struct vm_struct *__get_vm_area_node(unsigned long size,
				2050	unsigned long align, unsigned long flags, unsigned long start,
				2051	unsigned long end, int node, gfp_t gfp_mask, const void *caller)
				2052	{
				2053	struct vmap_area *va;
				2054	struct vm_struct *area;
				2055
				2056	BUG_ON(in_interrupt());
				2057	size = PAGE_ALIGN(size);
				2058	if (unlikely(!size))
				2059	return NULL;
				2060
				2061	if (flags & VM_IOREMAP)
				2062	align = 1ul << clamp_t(int, get_count_order_long(size),
				2063	PAGE_SHIFT, IOREMAP_MAX_ORDER);
				2064
				2065	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
				2066	if (unlikely(!area))
				2067	return NULL;
				2068
				2069	if (!(flags & VM_NO_GUARD))
				2070	size += PAGE_SIZE;
				2071
				2072	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
				2073	if (IS_ERR(va)) {
				2074	kfree(area);
				2075	return NULL;
				2076	}
				2077
				2078	setup_vmalloc_vm(area, va, flags, caller);
				2079
				2080	return area;
				2081	}
				2082
				2083	struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
				2084	unsigned long start, unsigned long end)
				2085	{
				2086	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
				2087	GFP_KERNEL, __builtin_return_address(0));
				2088	}
				2089	EXPORT_SYMBOL_GPL(__get_vm_area);
				2090
				2091	struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
				2092	unsigned long start, unsigned long end,
				2093	const void *caller)
				2094	{
				2095	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
				2096	GFP_KERNEL, caller);
				2097	}
				2098
				2099	/**
				2100	* get_vm_area - reserve a contiguous kernel virtual area
				2101	* @size: size of the area
				2102	* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
				2103	*
				2104	* Search an area of @size in the kernel virtual mapping area,
				2105	* and reserved it for out purposes. Returns the area descriptor
				2106	* on success or %NULL on failure.
				2107	*
				2108	* Return: the area descriptor on success or %NULL on failure.
				2109	*/
				2110	struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
				2111	{
				2112	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				2113	NUMA_NO_NODE, GFP_KERNEL,
				2114	__builtin_return_address(0));
				2115	}
				2116
				2117	struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
				2118	const void *caller)
				2119	{
				2120	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
				2121	NUMA_NO_NODE, GFP_KERNEL, caller);
				2122	}
				2123
				2124	/**
				2125	* find_vm_area - find a continuous kernel virtual area
				2126	* @addr: base address
				2127	*
				2128	* Search for the kernel VM area starting at @addr, and return it.
				2129	* It is up to the caller to do all required locking to keep the returned
				2130	* pointer valid.
				2131	*
				2132	* Return: pointer to the found area or %NULL on faulure
				2133	*/
				2134	struct vm_struct find_vm_area(const void addr)
				2135	{
				2136	struct vmap_area *va;
				2137
				2138	va = find_vmap_area((unsigned long)addr);
				2139	if (!va)
				2140	return NULL;
				2141
				2142	return va->vm;
				2143	}
				2144
				2145	/**
				2146	* remove_vm_area - find and remove a continuous kernel virtual area
				2147	* @addr: base address
				2148	*
				2149	* Search for the kernel VM area starting at @addr, and remove it.
				2150	* This function returns the found VM area, but using it is NOT safe
				2151	* on SMP machines, except for its size or flags.
				2152	*
				2153	* Return: pointer to the found area or %NULL on faulure
				2154	*/
				2155	struct vm_struct remove_vm_area(const void addr)
				2156	{
				2157	struct vmap_area *va;
				2158
				2159	might_sleep();
				2160
				2161	spin_lock(&vmap_area_lock);
				2162	va = __find_vmap_area((unsigned long)addr);
				2163	if (va && va->vm) {
				2164	struct vm_struct *vm = va->vm;
				2165
				2166	va->vm = NULL;
				2167	spin_unlock(&vmap_area_lock);
				2168
				2169	kasan_free_shadow(vm);
				2170	free_unmap_vmap_area(va);
				2171
				2172	return vm;
				2173	}
				2174
				2175	spin_unlock(&vmap_area_lock);
				2176	return NULL;
				2177	}
				2178
				2179	static inline void set_area_direct_map(const struct vm_struct *area,
				2180	int (set_direct_map)(struct page page))
				2181	{
				2182	int i;
				2183
				2184	for (i = 0; i < area->nr_pages; i++)
				2185	if (page_address(area->pages[i]))
				2186	set_direct_map(area->pages[i]);
				2187	}
				2188
				2189	/* Handle removing and resetting vm mappings related to the vm_struct. */
				2190	static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
				2191	{
				2192	unsigned long start = ULONG_MAX, end = 0;
				2193	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
				2194	int flush_dmap = 0;
				2195	int i;
				2196
				2197	remove_vm_area(area->addr);
				2198
				2199	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
				2200	if (!flush_reset)
				2201	return;
				2202
				2203	/*
				2204	* If not deallocating pages, just do the flush of the VM area and
				2205	* return.
				2206	*/
				2207	if (!deallocate_pages) {
				2208	vm_unmap_aliases();
				2209	return;
				2210	}
				2211
				2212	/*
				2213	* If execution gets here, flush the vm mapping and reset the direct
				2214	* map. Find the start and end range of the direct mappings to make sure
				2215	* the vm_unmap_aliases() flush includes the direct map.
				2216	*/
				2217	for (i = 0; i < area->nr_pages; i++) {
				2218	unsigned long addr = (unsigned long)page_address(area->pages[i]);
				2219	if (addr) {
				2220	start = min(addr, start);
				2221	end = max(addr + PAGE_SIZE, end);
				2222	flush_dmap = 1;
				2223	}
				2224	}
				2225
				2226	/*
				2227	* Set direct map to something invalid so that it won't be cached if
				2228	* there are any accesses after the TLB flush, then flush the TLB and
				2229	* reset the direct map permissions to the default.
				2230	*/
				2231	set_area_direct_map(area, set_direct_map_invalid_noflush);
				2232	_vm_unmap_aliases(start, end, flush_dmap);
				2233	set_area_direct_map(area, set_direct_map_default_noflush);
				2234	}
				2235
				2236	static void __vunmap(const void *addr, int deallocate_pages)
				2237	{
				2238	struct vm_struct *area;
				2239
				2240	if (!addr)
				2241	return;
				2242
				2243	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
				2244	addr))
				2245	return;
				2246
				2247	area = find_vm_area(addr);
				2248	if (unlikely(!area)) {
				2249	WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
				2250	addr);
				2251	return;
				2252	}
				2253
				2254	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
				2255	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
				2256
				2257	vm_remove_mappings(area, deallocate_pages);
				2258
				2259	if (deallocate_pages) {
				2260	int i;
				2261
				2262	for (i = 0; i < area->nr_pages; i++) {
				2263	struct page *page = area->pages[i];
				2264
				2265	BUG_ON(!page);
				2266	__free_pages(page, 0);
				2267	}
				2268	atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
				2269
				2270	kvfree(area->pages);
				2271	}
				2272
				2273	kfree(area);
				2274	return;
				2275	}
				2276
				2277	static inline void __vfree_deferred(const void *addr)
				2278	{
				2279	/*
				2280	* Use raw_cpu_ptr() because this can be called from preemptible
				2281	* context. Preemption is absolutely fine here, because the llist_add()
				2282	* implementation is lockless, so it works even if we are adding to
				2283	* nother cpu's list. schedule_work() should be fine with this too.
				2284	*/
				2285	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
				2286
				2287	if (llist_add((struct llist_node *)addr, &p->list))
				2288	schedule_work(&p->wq);
				2289	}
				2290
				/**
				 * vfree_atomic - release memory allocated by vmalloc()
				 * @addr: memory base address
				 *
				 * Behaves like vfree() but may be called from any atomic context
				 * except NMIs: the free is always deferred to a work item.
				 */
				void vfree_atomic(const void *addr)
				{
					BUG_ON(in_nmi());

					kmemleak_free(addr);

					if (addr)
						__vfree_deferred(addr);
				}
				2308
				/*
				 * Free @addr right away when not in interrupt context; otherwise hand it
				 * to the deferred-free path.
				 */
				static void __vfree(const void *addr)
				{
					if (likely(!in_interrupt())) {
						__vunmap(addr, 1);
						return;
					}

					__vfree_deferred(addr);
				}
				2316
				/**
				 * vfree - release memory allocated by vmalloc()
				 * @addr: memory base address
				 *
				 * Frees the virtually continuous memory area starting at @addr, as
				 * obtained from vmalloc(), vmalloc_32() or __vmalloc().  Nothing is
				 * done when @addr is NULL.
				 *
				 * Must not be called in NMI context (strictly speaking, only if we don't
				 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
				 * conventions for vfree() arch-dependent would be a really bad idea).
				 *
				 * May sleep if called from outside interrupt context.
				 *
				 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
				 */
				void vfree(const void *addr)
				{
					BUG_ON(in_nmi());

					kmemleak_free(addr);

					might_sleep_if(!in_interrupt());

					if (addr)
						__vfree(addr);
				}
				EXPORT_SYMBOL(vfree);
				2347
				/**
				 * vunmap - release virtual mapping obtained by vmap()
				 * @addr: memory base address
				 *
				 * Frees the virtually contiguous memory area starting at @addr that
				 * was created from the page array passed to vmap().  The pages
				 * themselves are not freed.
				 *
				 * Must not be called in interrupt context.
				 */
				void vunmap(const void *addr)
				{
					BUG_ON(in_interrupt());
					might_sleep();

					if (!addr)
						return;

					__vunmap(addr, 0);
				}
				EXPORT_SYMBOL(vunmap);
				2365
				2366	/**
				2367	* vmap - map an array of pages into virtually contiguous space
				2368	* @pages: array of page pointers
				2369	* @count: number of pages to map
				2370	* @flags: vm_area->flags
				2371	* @prot: page protection for the mapping
				2372	*
				2373	* Maps @count pages from @pages into contiguous kernel virtual
				2374	* space.
				2375	*
				2376	* Return: the address of the area or %NULL on failure
				2377	*/
				2378	void vmap(struct page *pages, unsigned int count,
				2379	unsigned long flags, pgprot_t prot)
				2380	{
				2381	struct vm_struct *area;
				2382	unsigned long size; /* In bytes */
				2383
				2384	might_sleep();
				2385
				2386	if (count > totalram_pages())
				2387	return NULL;
				2388
				2389	size = (unsigned long)count << PAGE_SHIFT;
				2390	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
				2391	if (!area)
				2392	return NULL;
				2393
				2394	if (map_vm_area(area, prot, pages)) {
				2395	vunmap(area->addr);
				2396	return NULL;
				2397	}
				2398
				2399	return area->addr;
				2400	}
				2401	EXPORT_SYMBOL(vmap);
				2402
				2403	static void *__vmalloc_node(unsigned long size, unsigned long align,
				2404	gfp_t gfp_mask, pgprot_t prot,
				2405	int node, const void *caller);
				2406	static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
				2407	pgprot_t prot, int node)
				2408	{
				2409	struct page **pages;
				2410	unsigned int nr_pages, array_size, i;
				2411	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
				2412	const gfp_t alloc_mask = gfp_mask \| __GFP_NOWARN;
				2413	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA \| GFP_DMA32)) ?
				2414	0 :
				2415	__GFP_HIGHMEM;
				2416
				2417	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
				2418	array_size = (nr_pages * sizeof(struct page *));
				2419
				2420	/* Please note that the recursion is strictly bounded. */
				2421	if (array_size > PAGE_SIZE) {
				2422	pages = __vmalloc_node(array_size, 1, nested_gfp\|highmem_mask,
				2423	PAGE_KERNEL, node, area->caller);
				2424	} else {
				2425	pages = kmalloc_node(array_size, nested_gfp, node);
				2426	}
				2427
				2428	if (!pages) {
				2429	remove_vm_area(area->addr);
				2430	kfree(area);
				2431	return NULL;
				2432	}
				2433
				2434	area->pages = pages;
				2435	area->nr_pages = nr_pages;
				2436
				2437	for (i = 0; i < area->nr_pages; i++) {
				2438	struct page *page;
				2439
				2440	if (node == NUMA_NO_NODE)
				2441	page = alloc_page(alloc_mask\|highmem_mask);
				2442	else
				2443	page = alloc_pages_node(node, alloc_mask\|highmem_mask, 0);
				2444
				2445	if (unlikely(!page)) {
				2446	/* Successfully allocated i pages, free them in __vunmap() */
				2447	area->nr_pages = i;
				2448	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
				2449	goto fail;
				2450	}
				2451	area->pages[i] = page;
				2452	if (gfpflags_allow_blocking(gfp_mask\|highmem_mask))
				2453	cond_resched();
				2454	}
				2455	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
				2456
				2457	if (map_vm_area(area, prot, pages))
				2458	goto fail;
				2459	return area->addr;
				2460
				2461	fail:
				2462	warn_alloc(gfp_mask, NULL,
				2463	"vmalloc: allocation failure, allocated %ld of %ld bytes",
				2464	(area->nr_pages*PAGE_SIZE), area->size);
				2465	__vfree(area->addr);
				2466	return NULL;
				2467	}
				2468
				2469	/**
				2470	* __vmalloc_node_range - allocate virtually contiguous memory
				2471	* @size: allocation size
				2472	* @align: desired alignment
				2473	* @start: vm area range start
				2474	* @end: vm area range end
				2475	* @gfp_mask: flags for the page level allocator
				2476	* @prot: protection mask for the allocated pages
				2477	* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
				2478	* @node: node to use for allocation or NUMA_NO_NODE
				2479	* @caller: caller's return address
				2480	*
				2481	* Allocate enough pages to cover @size from the page level
				2482	* allocator with @gfp_mask flags. Map them into contiguous
				2483	* kernel virtual space, using a pagetable protection of @prot.
				2484	*
				2485	* Return: the address of the area or %NULL on failure
				2486	*/
				2487	void *__vmalloc_node_range(unsigned long size, unsigned long align,
				2488	unsigned long start, unsigned long end, gfp_t gfp_mask,
				2489	pgprot_t prot, unsigned long vm_flags, int node,
				2490	const void *caller)
				2491	{
				2492	struct vm_struct *area;
				2493	void *addr;
				2494	unsigned long real_size = size;
				2495
				2496	size = PAGE_ALIGN(size);
				2497	if (!size \|\| (size >> PAGE_SHIFT) > totalram_pages())
				2498	goto fail;
				2499
				2500	area = __get_vm_area_node(size, align, VM_ALLOC \| VM_UNINITIALIZED \|
				2501	vm_flags, start, end, node, gfp_mask, caller);
				2502	if (!area)
				2503	goto fail;
				2504
				2505	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
				2506	if (!addr)
				2507	return NULL;
				2508
				2509	/*
				2510	* In this function, newly allocated vm_struct has VM_UNINITIALIZED
				2511	* flag. It means that vm_struct is not fully initialized.
				2512	* Now, it is fully initialized, so remove this flag here.
				2513	*/
				2514	clear_vm_uninitialized_flag(area);
				2515
				2516	kmemleak_vmalloc(area, size, gfp_mask);
				2517
				2518	return addr;
				2519
				2520	fail:
				2521	warn_alloc(gfp_mask, NULL,
				2522	"vmalloc: allocation failure: %lu bytes", real_size);
				2523	return NULL;
				2524	}
				2525
				2526	/*
				2527	* This is only for performance analysis of vmalloc and stress purpose.
				2528	* It is required by vmalloc test module, therefore do not use it other
				2529	* than that.
				2530	*/
				2531	#ifdef CONFIG_TEST_VMALLOC_MODULE
				2532	EXPORT_SYMBOL_GPL(__vmalloc_node_range);
				2533	#endif
				2534
				2535	/**
				2536	* __vmalloc_node - allocate virtually contiguous memory
				2537	* @size: allocation size
				2538	* @align: desired alignment
				2539	* @gfp_mask: flags for the page level allocator
				2540	* @prot: protection mask for the allocated pages
				2541	* @node: node to use for allocation or NUMA_NO_NODE
				2542	* @caller: caller's return address
				2543	*
				2544	* Allocate enough pages to cover @size from the page level
				2545	* allocator with @gfp_mask flags. Map them into contiguous
				2546	* kernel virtual space, using a pagetable protection of @prot.
				2547	*
				2548	* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
				2549	* and __GFP_NOFAIL are not supported
				2550	*
				2551	* Any use of gfp flags outside of GFP_KERNEL should be consulted
				2552	* with mm people.
				2553	*
				2554	* Return: pointer to the allocated memory or %NULL on error
				2555	*/
				2556	static void *__vmalloc_node(unsigned long size, unsigned long align,
				2557	gfp_t gfp_mask, pgprot_t prot,
				2558	int node, const void *caller)
				2559	{
				2560	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				2561	gfp_mask, prot, 0, node, caller);
				2562	}
				2563
				2564	void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
				2565	{
				2566	return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
				2567	__builtin_return_address(0));
				2568	}
				2569	EXPORT_SYMBOL(__vmalloc);
				2570
				2571	static inline void *__vmalloc_node_flags(unsigned long size,
				2572	int node, gfp_t flags)
				2573	{
				2574	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
				2575	node, __builtin_return_address(0));
				2576	}
				2577
				2578
				2579	void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
				2580	void *caller)
				2581	{
				2582	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
				2583	}
				2584
				2585	/**
				2586	* vmalloc - allocate virtually contiguous memory
				2587	* @size: allocation size
				2588	*
				2589	* Allocate enough pages to cover @size from the page level
				2590	* allocator and map them into contiguous kernel virtual space.
				2591	*
				2592	* For tight control over page level allocator and protection flags
				2593	* use __vmalloc() instead.
				2594	*
				2595	* Return: pointer to the allocated memory or %NULL on error
				2596	*/
				2597	void *vmalloc(unsigned long size)
				2598	{
				2599	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				2600	GFP_KERNEL);
				2601	}
				2602	EXPORT_SYMBOL(vmalloc);
				2603
				2604	/**
				2605	* vzalloc - allocate virtually contiguous memory with zero fill
				2606	* @size: allocation size
				2607	*
				2608	* Allocate enough pages to cover @size from the page level
				2609	* allocator and map them into contiguous kernel virtual space.
				2610	* The memory allocated is set to zero.
				2611	*
				2612	* For tight control over page level allocator and protection flags
				2613	* use __vmalloc() instead.
				2614	*
				2615	* Return: pointer to the allocated memory or %NULL on error
				2616	*/
				2617	void *vzalloc(unsigned long size)
				2618	{
				2619	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				2620	GFP_KERNEL \| __GFP_ZERO);
				2621	}
				2622	EXPORT_SYMBOL(vzalloc);
				2623
				2624	/**
				2625	* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
				2626	* @size: allocation size
				2627	*
				2628	* The resulting memory area is zeroed so it can be mapped to userspace
				2629	* without leaking data.
				2630	*
				2631	* Return: pointer to the allocated memory or %NULL on error
				2632	*/
				2633	void *vmalloc_user(unsigned long size)
				2634	{
				2635	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
				2636	GFP_KERNEL \| __GFP_ZERO, PAGE_KERNEL,
				2637	VM_USERMAP, NUMA_NO_NODE,
				2638	__builtin_return_address(0));
				2639	}
				2640	EXPORT_SYMBOL(vmalloc_user);
				2641
				2642	/**
				2643	* vmalloc_node - allocate memory on a specific node
				2644	* @size: allocation size
				2645	* @node: numa node
				2646	*
				2647	* Allocate enough pages to cover @size from the page level
				2648	* allocator and map them into contiguous kernel virtual space.
				2649	*
				2650	* For tight control over page level allocator and protection flags
				2651	* use __vmalloc() instead.
				2652	*
				2653	* Return: pointer to the allocated memory or %NULL on error
				2654	*/
				2655	void *vmalloc_node(unsigned long size, int node)
				2656	{
				2657	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
				2658	node, __builtin_return_address(0));
				2659	}
				2660	EXPORT_SYMBOL(vmalloc_node);
				2661
				2662	/**
				2663	* vzalloc_node - allocate memory on a specific node with zero fill
				2664	* @size: allocation size
				2665	* @node: numa node
				2666	*
				2667	* Allocate enough pages to cover @size from the page level
				2668	* allocator and map them into contiguous kernel virtual space.
				2669	* The memory allocated is set to zero.
				2670	*
				2671	* For tight control over page level allocator and protection flags
				2672	* use __vmalloc_node() instead.
				2673	*
				2674	* Return: pointer to the allocated memory or %NULL on error
				2675	*/
				2676	void *vzalloc_node(unsigned long size, int node)
				2677	{
				2678	return __vmalloc_node_flags(size, node,
				2679	GFP_KERNEL \| __GFP_ZERO);
				2680	}
				2681	EXPORT_SYMBOL(vzalloc_node);
				2682
				2683	/**
				2684	* vmalloc_exec - allocate virtually contiguous, executable memory
				2685	* @size: allocation size
				2686	*
				2687	* Kernel-internal function to allocate enough pages to cover @size
				2688	* the page level allocator and map them into contiguous and
				2689	* executable kernel virtual space.
				2690	*
				2691	* For tight control over page level allocator and protection flags
				2692	* use __vmalloc() instead.
				2693	*
				2694	* Return: pointer to the allocated memory or %NULL on error
				2695	*/
				2696	void *vmalloc_exec(unsigned long size)
				2697	{
				2698	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				2699	GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
				2700	NUMA_NO_NODE, __builtin_return_address(0));
				2701	}
				2702
				/*
				 * GFP_VMALLOC32 - gfp flags used for the vmalloc_32*() allocators.
				 * Every definition is parenthesized so the macro expands to a single
				 * operand wherever it is used.
				 */
				#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
				#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
				#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
				#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
				#else
				/*
				 * 64b systems should always have either DMA or DMA32 zones. For others
				 * GFP_DMA32 should do the right thing and use the normal zone.
				 */
				#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
				#endif
				2714
				2715	/**
				2716	* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
				2717	* @size: allocation size
				2718	*
				2719	* Allocate enough 32bit PA addressable pages to cover @size from the
				2720	* page level allocator and map them into contiguous kernel virtual space.
				2721	*
				2722	* Return: pointer to the allocated memory or %NULL on error
				2723	*/
				2724	void *vmalloc_32(unsigned long size)
				2725	{
				2726	return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
				2727	NUMA_NO_NODE, __builtin_return_address(0));
				2728	}
				2729	EXPORT_SYMBOL(vmalloc_32);
				2730
				2731	/**
				2732	* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
				2733	* @size: allocation size
				2734	*
				2735	* The resulting memory area is 32bit addressable and zeroed so it can be
				2736	* mapped to userspace without leaking data.
				2737	*
				2738	* Return: pointer to the allocated memory or %NULL on error
				2739	*/
				2740	void *vmalloc_32_user(unsigned long size)
				2741	{
				2742	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
				2743	GFP_VMALLOC32 \| __GFP_ZERO, PAGE_KERNEL,
				2744	VM_USERMAP, NUMA_NO_NODE,
				2745	__builtin_return_address(0));
				2746	}
				2747	EXPORT_SYMBOL(vmalloc_32_user);
				2748
				2749	/*
				2750	* small helper routine , copy contents to buf from addr.
				2751	* If the page is not present, fill zero.
				2752	*/
				2753
				2754	static int aligned_vread(char buf, char addr, unsigned long count)
				2755	{
				2756	struct page *p;
				2757	int copied = 0;
				2758
				2759	while (count) {
				2760	unsigned long offset, length;
				2761
				2762	offset = offset_in_page(addr);
				2763	length = PAGE_SIZE - offset;
				2764	if (length > count)
				2765	length = count;
				2766	p = vmalloc_to_page(addr);
				2767	/*
				2768	* To do safe access to this _mapped_ area, we need
				2769	* lock. But adding lock here means that we need to add
				2770	* overhead of vmalloc()/vfree() calles for this _debug_
				2771	* interface, rarely used. Instead of that, we'll use
				2772	* kmap() and get small overhead in this access function.
				2773	*/
				2774	if (p) {
				2775	/*
				2776	* we can expect USER0 is not used (see vread/vwrite's
				2777	* function description)
				2778	*/
				2779	void *map = kmap_atomic(p);
				2780	memcpy(buf, map + offset, length);
				2781	kunmap_atomic(map);
				2782	} else
				2783	memset(buf, 0, length);
				2784
				2785	addr += length;
				2786	buf += length;
				2787	copied += length;
				2788	count -= length;
				2789	}
				2790	return copied;
				2791	}
				2792
				2793	static int aligned_vwrite(char buf, char addr, unsigned long count)
				2794	{
				2795	struct page *p;
				2796	int copied = 0;
				2797
				2798	while (count) {
				2799	unsigned long offset, length;
				2800
				2801	offset = offset_in_page(addr);
				2802	length = PAGE_SIZE - offset;
				2803	if (length > count)
				2804	length = count;
				2805	p = vmalloc_to_page(addr);
				2806	/*
				2807	* To do safe access to this _mapped_ area, we need
				2808	* lock. But adding lock here means that we need to add
				2809	* overhead of vmalloc()/vfree() calles for this _debug_
				2810	* interface, rarely used. Instead of that, we'll use
				2811	* kmap() and get small overhead in this access function.
				2812	*/
				2813	if (p) {
				2814	/*
				2815	* we can expect USER0 is not used (see vread/vwrite's
				2816	* function description)
				2817	*/
				2818	void *map = kmap_atomic(p);
				2819	memcpy(map + offset, buf, length);
				2820	kunmap_atomic(map);
				2821	}
				2822	addr += length;
				2823	buf += length;
				2824	copied += length;
				2825	count -= length;
				2826	}
				2827	return copied;
				2828	}
				2829
				2830	/**
				2831	* vread() - read vmalloc area in a safe way.
				2832	* @buf: buffer for reading data
				2833	* @addr: vm address.
				2834	* @count: number of bytes to be read.
				2835	*
				2836	* This function checks that addr is a valid vmalloc'ed area, and
				2837	* copy data from that area to a given buffer. If the given memory range
				2838	* of [addr...addr+count) includes some valid address, data is copied to
				2839	* proper area of @buf. If there are memory holes, they'll be zero-filled.
				2840	* IOREMAP area is treated as memory hole and no copy is done.
				2841	*
				2842	* If [addr...addr+count) doesn't includes any intersects with alive
				2843	* vm_struct area, returns 0. @buf should be kernel's buffer.
				2844	*
				2845	* Note: In usual ops, vread() is never necessary because the caller
				2846	* should know vmalloc() area is valid and can use memcpy().
				2847	* This is for routines which have to access vmalloc area without
				2848	* any information, as /dev/kmem.
				2849	*
				2850	* Return: number of bytes for which addr and buf should be increased
				2851	* (same number as @count) or %0 if [addr...addr+count) doesn't
				2852	* include any intersection with valid vmalloc area
				2853	*/
				2854	long vread(char buf, char addr, unsigned long count)
				2855	{
				2856	struct vmap_area *va;
				2857	struct vm_struct *vm;
				2858	char vaddr, buf_start = buf;
				2859	unsigned long buflen = count;
				2860	unsigned long n;
				2861
				2862	/* Don't allow overflow */
				2863	if ((unsigned long) addr + count < count)
				2864	count = -(unsigned long) addr;
				2865
				2866	spin_lock(&vmap_area_lock);
				2867	list_for_each_entry(va, &vmap_area_list, list) {
				2868	if (!count)
				2869	break;
				2870
				2871	if (!va->vm)
				2872	continue;
				2873
				2874	vm = va->vm;
				2875	vaddr = (char *) vm->addr;
				2876	if (addr >= vaddr + get_vm_area_size(vm))
				2877	continue;
				2878	while (addr < vaddr) {
				2879	if (count == 0)
				2880	goto finished;
				2881	*buf = '\0';
				2882	buf++;
				2883	addr++;
				2884	count--;
				2885	}
				2886	n = vaddr + get_vm_area_size(vm) - addr;
				2887	if (n > count)
				2888	n = count;
				2889	if (!(vm->flags & VM_IOREMAP))
				2890	aligned_vread(buf, addr, n);
				2891	else /* IOREMAP area is treated as memory hole */
				2892	memset(buf, 0, n);
				2893	buf += n;
				2894	addr += n;
				2895	count -= n;
				2896	}
				2897	finished:
				2898	spin_unlock(&vmap_area_lock);
				2899
				2900	if (buf == buf_start)
				2901	return 0;
				2902	/* zero-fill memory holes */
				2903	if (buf != buf_start + buflen)
				2904	memset(buf, 0, buflen - (buf - buf_start));
				2905
				2906	return buflen;
				2907	}
				2908
				2909	/**
				2910	* vwrite() - write vmalloc area in a safe way.
				2911	* @buf: buffer for source data
				2912	* @addr: vm address.
				2913	* @count: number of bytes to be read.
				2914	*
				2915	* This function checks that addr is a valid vmalloc'ed area, and
				2916	* copy data from a buffer to the given addr. If specified range of
				2917	* [addr...addr+count) includes some valid address, data is copied from
				2918	* proper area of @buf. If there are memory holes, no copy to hole.
				2919	* IOREMAP area is treated as memory hole and no copy is done.
				2920	*
				2921	* If [addr...addr+count) doesn't includes any intersects with alive
				2922	* vm_struct area, returns 0. @buf should be kernel's buffer.
				2923	*
				2924	* Note: In usual ops, vwrite() is never necessary because the caller
				2925	* should know vmalloc() area is valid and can use memcpy().
				2926	* This is for routines which have to access vmalloc area without
				2927	* any information, as /dev/kmem.
				2928	*
				2929	* Return: number of bytes for which addr and buf should be
				2930	* increased (same number as @count) or %0 if [addr...addr+count)
				2931	* doesn't include any intersection with valid vmalloc area
				2932	*/
				2933	long vwrite(char buf, char addr, unsigned long count)
				2934	{
				2935	struct vmap_area *va;
				2936	struct vm_struct *vm;
				2937	char *vaddr;
				2938	unsigned long n, buflen;
				2939	int copied = 0;
				2940
				2941	/* Don't allow overflow */
				2942	if ((unsigned long) addr + count < count)
				2943	count = -(unsigned long) addr;
				2944	buflen = count;
				2945
				2946	spin_lock(&vmap_area_lock);
				2947	list_for_each_entry(va, &vmap_area_list, list) {
				2948	if (!count)
				2949	break;
				2950
				2951	if (!va->vm)
				2952	continue;
				2953
				2954	vm = va->vm;
				2955	vaddr = (char *) vm->addr;
				2956	if (addr >= vaddr + get_vm_area_size(vm))
				2957	continue;
				2958	while (addr < vaddr) {
				2959	if (count == 0)
				2960	goto finished;
				2961	buf++;
				2962	addr++;
				2963	count--;
				2964	}
				2965	n = vaddr + get_vm_area_size(vm) - addr;
				2966	if (n > count)
				2967	n = count;
				2968	if (!(vm->flags & VM_IOREMAP)) {
				2969	aligned_vwrite(buf, addr, n);
				2970	copied++;
				2971	}
				2972	buf += n;
				2973	addr += n;
				2974	count -= n;
				2975	}
				2976	finished:
				2977	spin_unlock(&vmap_area_lock);
				2978	if (!copied)
				2979	return 0;
				2980	return buflen;
				2981	}
				2982
				2983	/**
				2984	* remap_vmalloc_range_partial - map vmalloc pages to userspace
				2985	* @vma: vma to cover
				2986	* @uaddr: target user address to start at
				2987	* @kaddr: virtual address of vmalloc kernel memory
				2988	* @pgoff: offset from @kaddr to start at
				2989	* @size: size of map area
				2990	*
				2991	* Returns: 0 for success, -Exxx on failure
				2992	*
				2993	* This function checks that @kaddr is a valid vmalloc'ed area,
				2994	* and that it is big enough to cover the range starting at
				2995	* @uaddr in @vma. Will return failure if that criteria isn't
				2996	* met.
				2997	*
				2998	* Similar to remap_pfn_range() (see mm/memory.c)
				2999	*/
				3000	int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
				3001	void *kaddr, unsigned long pgoff,
				3002	unsigned long size)
				3003	{
				3004	struct vm_struct *area;
				3005	unsigned long off;
				3006	unsigned long end_index;
				3007
				3008	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
				3009	return -EINVAL;
				3010
				3011	size = PAGE_ALIGN(size);
				3012
				3013	if (!PAGE_ALIGNED(uaddr) \|\| !PAGE_ALIGNED(kaddr))
				3014	return -EINVAL;
				3015
				3016	area = find_vm_area(kaddr);
				3017	if (!area)
				3018	return -EINVAL;
				3019
				3020	if (!(area->flags & (VM_USERMAP \| VM_DMA_COHERENT)))
				3021	return -EINVAL;
				3022
				3023	if (check_add_overflow(size, off, &end_index) \|\|
				3024	end_index > get_vm_area_size(area))
				3025	return -EINVAL;
				3026	kaddr += off;
				3027
				3028	do {
				3029	struct page *page = vmalloc_to_page(kaddr);
				3030	int ret;
				3031
				3032	ret = vm_insert_page(vma, uaddr, page);
				3033	if (ret)
				3034	return ret;
				3035
				3036	uaddr += PAGE_SIZE;
				3037	kaddr += PAGE_SIZE;
				3038	size -= PAGE_SIZE;
				3039	} while (size > 0);
				3040
				3041	vma->vm_flags \|= VM_DONTEXPAND \| VM_DONTDUMP;
				3042
				3043	return 0;
				3044	}
				3045	EXPORT_SYMBOL(remap_vmalloc_range_partial);
				3046
				3047	/**
				3048	* remap_vmalloc_range - map vmalloc pages to userspace
				3049	* @vma: vma to cover (map full range of vma)
				3050	* @addr: vmalloc memory
				3051	* @pgoff: number of pages into addr before first page to map
				3052	*
				3053	* Returns: 0 for success, -Exxx on failure
				3054	*
				3055	* This function checks that addr is a valid vmalloc'ed area, and
				3056	* that it is big enough to cover the vma. Will return failure if
				3057	* that criteria isn't met.
				3058	*
				3059	* Similar to remap_pfn_range() (see mm/memory.c)
				3060	*/
				3061	int remap_vmalloc_range(struct vm_area_struct vma, void addr,
				3062	unsigned long pgoff)
				3063	{
				3064	return remap_vmalloc_range_partial(vma, vma->vm_start,
				3065	addr, pgoff,
				3066	vma->vm_end - vma->vm_start);
				3067	}
				3068	EXPORT_SYMBOL(remap_vmalloc_range);
				3069
/*
 * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
 * not to have one.
 *
 * The purpose of this function is to make sure the vmalloc area
 * mappings are identical in all page-tables in the system.
 *
 * Both definitions below are empty and marked __weak, so they do nothing
 * unless another definition of the same symbol is linked in.
 */
void __weak vmalloc_sync_mappings(void)
{
}

void __weak vmalloc_sync_unmappings(void)
{
}
				3084
				3085	static int f(pte_t pte, unsigned long addr, void data)
				3086	{
				3087	pte_t ***p = data;
				3088
				3089	if (p) {
				3090	(p) = pte;
				3091	(*p)++;
				3092	}
				3093	return 0;
				3094	}
				3095
				3096	/**
				3097	* alloc_vm_area - allocate a range of kernel address space
				3098	* @size: size of the area
				3099	* @ptes: returns the PTEs for the address space
				3100	*
				3101	* Returns: NULL on failure, vm_struct on success
				3102	*
				3103	* This function reserves a range of kernel address space, and
				3104	* allocates pagetables to map that range. No actual mappings
				3105	* are created.
				3106	*
				3107	* If @ptes is non-NULL, pointers to the PTEs (in init_mm)
				3108	* allocated for the VM area are returned.
				3109	*/
				3110	struct vm_struct alloc_vm_area(size_t size, pte_t *ptes)
				3111	{
				3112	struct vm_struct *area;
				3113
				3114	area = get_vm_area_caller(size, VM_IOREMAP,
				3115	__builtin_return_address(0));
				3116	if (area == NULL)
				3117	return NULL;
				3118
				3119	/*
				3120	* This ensures that page tables are constructed for this region
				3121	* of kernel virtual address space and mapped into init_mm.
				3122	*/
				3123	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
				3124	size, f, ptes ? &ptes : NULL)) {
				3125	free_vm_area(area);
				3126	return NULL;
				3127	}
				3128
				3129	return area;
				3130	}
				3131	EXPORT_SYMBOL_GPL(alloc_vm_area);
				3132
				3133	void free_vm_area(struct vm_struct *area)
				3134	{
				3135	struct vm_struct *ret;
				3136	ret = remove_vm_area(area->addr);
				3137	BUG_ON(ret != area);
				3138	kfree(area);
				3139	}
				3140	EXPORT_SYMBOL_GPL(free_vm_area);
				3141
				3142	#ifdef CONFIG_SMP
				3143	static struct vmap_area node_to_va(struct rb_node n)
				3144	{
				3145	return rb_entry_safe(n, struct vmap_area, rb_node);
				3146	}
				3147
				3148	/**
				3149	* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
				3150	* @addr: target address
				3151	*
				3152	* Returns: vmap_area if it is found. If there is no such area
				3153	* the first highest(reverse order) vmap_area is returned
				3154	* i.e. va->va_start < addr && va->va_end < addr or NULL
				3155	* if there are no any areas before @addr.
				3156	*/
				3157	static struct vmap_area *
				3158	pvm_find_va_enclose_addr(unsigned long addr)
				3159	{
				3160	struct vmap_area va, tmp;
				3161	struct rb_node *n;
				3162
				3163	n = free_vmap_area_root.rb_node;
				3164	va = NULL;
				3165
				3166	while (n) {
				3167	tmp = rb_entry(n, struct vmap_area, rb_node);
				3168	if (tmp->va_start <= addr) {
				3169	va = tmp;
				3170	if (tmp->va_end >= addr)
				3171	break;
				3172
				3173	n = n->rb_right;
				3174	} else {
				3175	n = n->rb_left;
				3176	}
				3177	}
				3178
				3179	return va;
				3180	}
				3181
				3182	/**
				3183	* pvm_determine_end_from_reverse - find the highest aligned address
				3184	* of free block below VMALLOC_END
				3185	* @va:
				3186	* in - the VA we start the search(reverse order);
				3187	* out - the VA with the highest aligned end address.
				3188	*
				3189	* Returns: determined end address within vmap_area
				3190	*/
				3191	static unsigned long
				3192	pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
				3193	{
				3194	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
				3195	unsigned long addr;
				3196
				3197	if (likely(*va)) {
				3198	list_for_each_entry_from_reverse((*va),
				3199	&free_vmap_area_list, list) {
				3200	addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
				3201	if ((*va)->va_start < addr)
				3202	return addr;
				3203	}
				3204	}
				3205
				3206	return 0;
				3207	}
				3208
				3209	/**
				3210	* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
				3211	* @offsets: array containing offset of each area
				3212	* @sizes: array containing size of each area
				3213	* @nr_vms: the number of areas to allocate
				3214	* @align: alignment, all entries in @offsets and @sizes must be aligned to this
				3215	*
				3216	* Returns: kmalloc'd vm_struct pointer array pointing to allocated
				3217	* vm_structs on success, %NULL on failure
				3218	*
				3219	* Percpu allocator wants to use congruent vm areas so that it can
				3220	* maintain the offsets among percpu areas. This function allocates
				3221	* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
				3222	* be scattered pretty far, distance between two areas easily going up
				3223	* to gigabytes. To avoid interacting with regular vmallocs, these
				3224	* areas are allocated from top.
				3225	*
				3226	* Despite its complicated look, this allocator is rather simple. It
				3227	* does everything top-down and scans free blocks from the end looking
				3228	* for matching base. While scanning, if any of the areas do not fit the
				3229	* base address is pulled down to fit the area. Scanning is repeated till
				3230	* all the areas fit and then all necessary data structures are inserted
				3231	* and the result is returned.
				3232	*/
				3233	struct vm_struct *pcpu_get_vm_areas(const unsigned long offsets,
				3234	const size_t *sizes, int nr_vms,
				3235	size_t align)
				3236	{
				3237	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
				3238	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
				3239	struct vmap_area *vas, va;
				3240	struct vm_struct **vms;
				3241	int area, area2, last_area, term_area;
				3242	unsigned long base, start, size, end, last_end;
				3243	bool purged = false;
				3244	enum fit_type type;
				3245
				3246	/* verify parameters and allocate data structures */
				3247	BUG_ON(offset_in_page(align) \|\| !is_power_of_2(align));
				3248	for (last_area = 0, area = 0; area < nr_vms; area++) {
				3249	start = offsets[area];
				3250	end = start + sizes[area];
				3251
				3252	/* is everything aligned properly? */
				3253	BUG_ON(!IS_ALIGNED(offsets[area], align));
				3254	BUG_ON(!IS_ALIGNED(sizes[area], align));
				3255
				3256	/* detect the area with the highest address */
				3257	if (start > offsets[last_area])
				3258	last_area = area;
				3259
				3260	for (area2 = area + 1; area2 < nr_vms; area2++) {
				3261	unsigned long start2 = offsets[area2];
				3262	unsigned long end2 = start2 + sizes[area2];
				3263
				3264	BUG_ON(start2 < end && start < end2);
				3265	}
				3266	}
				3267	last_end = offsets[last_area] + sizes[last_area];
				3268
				3269	if (vmalloc_end - vmalloc_start < last_end) {
				3270	WARN_ON(true);
				3271	return NULL;
				3272	}
				3273
				3274	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
				3275	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
				3276	if (!vas \|\| !vms)
				3277	goto err_free2;
				3278
				3279	for (area = 0; area < nr_vms; area++) {
				3280	vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
				3281	vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
				3282	if (!vas[area] \|\| !vms[area])
				3283	goto err_free;
				3284	}
				3285	retry:
				3286	spin_lock(&vmap_area_lock);
				3287
				3288	/* start scanning - we scan from the top, begin with the last area */
				3289	area = term_area = last_area;
				3290	start = offsets[area];
				3291	end = start + sizes[area];
				3292
				3293	va = pvm_find_va_enclose_addr(vmalloc_end);
				3294	base = pvm_determine_end_from_reverse(&va, align) - end;
				3295
				3296	while (true) {
				3297	/*
				3298	* base might have underflowed, add last_end before
				3299	* comparing.
				3300	*/
				3301	if (base + last_end < vmalloc_start + last_end)
				3302	goto overflow;
				3303
				3304	/*
				3305	* Fitting base has not been found.
				3306	*/
				3307	if (va == NULL)
				3308	goto overflow;
				3309
				3310	/*
				3311	* If required width exeeds current VA block, move
				3312	* base downwards and then recheck.
				3313	*/
				3314	if (base + end > va->va_end) {
				3315	base = pvm_determine_end_from_reverse(&va, align) - end;
				3316	term_area = area;
				3317	continue;
				3318	}
				3319
				3320	/*
				3321	* If this VA does not fit, move base downwards and recheck.
				3322	*/
				3323	if (base + start < va->va_start) {
				3324	va = node_to_va(rb_prev(&va->rb_node));
				3325	base = pvm_determine_end_from_reverse(&va, align) - end;
				3326	term_area = area;
				3327	continue;
				3328	}
				3329
				3330	/*
				3331	* This area fits, move on to the previous one. If
				3332	* the previous one is the terminal one, we're done.
				3333	*/
				3334	area = (area + nr_vms - 1) % nr_vms;
				3335	if (area == term_area)
				3336	break;
				3337
				3338	start = offsets[area];
				3339	end = start + sizes[area];
				3340	va = pvm_find_va_enclose_addr(base + end);
				3341	}
				3342
				3343	/* we've found a fitting base, insert all va's */
				3344	for (area = 0; area < nr_vms; area++) {
				3345	int ret;
				3346
				3347	start = base + offsets[area];
				3348	size = sizes[area];
				3349
				3350	va = pvm_find_va_enclose_addr(start);
				3351	if (WARN_ON_ONCE(va == NULL))
				3352	/* It is a BUG(), but trigger recovery instead. */
				3353	goto recovery;
				3354
				3355	type = classify_va_fit_type(va, start, size);
				3356	if (WARN_ON_ONCE(type == NOTHING_FIT))
				3357	/* It is a BUG(), but trigger recovery instead. */
				3358	goto recovery;
				3359
				3360	ret = adjust_va_to_fit_type(va, start, size, type);
				3361	if (unlikely(ret))
				3362	goto recovery;
				3363
				3364	/* Allocated area. */
				3365	va = vas[area];
				3366	va->va_start = start;
				3367	va->va_end = start + size;
				3368
				3369	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
				3370	}
				3371
				3372	spin_unlock(&vmap_area_lock);
				3373
				3374	/* insert all vm's */
				3375	for (area = 0; area < nr_vms; area++)
				3376	setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
				3377	pcpu_get_vm_areas);
				3378
				3379	kfree(vas);
				3380	return vms;
				3381
				3382	recovery:
				3383	/* Remove previously inserted areas. */
				3384	while (area--) {
				3385	__free_vmap_area(vas[area]);
				3386	vas[area] = NULL;
				3387	}
				3388
				3389	overflow:
				3390	spin_unlock(&vmap_area_lock);
				3391	if (!purged) {
				3392	purge_vmap_area_lazy();
				3393	purged = true;
				3394
				3395	/* Before "retry", check if we recover. */
				3396	for (area = 0; area < nr_vms; area++) {
				3397	if (vas[area])
				3398	continue;
				3399
				3400	vas[area] = kmem_cache_zalloc(
				3401	vmap_area_cachep, GFP_KERNEL);
				3402	if (!vas[area])
				3403	goto err_free;
				3404	}
				3405
				3406	goto retry;
				3407	}
				3408
				3409	err_free:
				3410	for (area = 0; area < nr_vms; area++) {
				3411	if (vas[area])
				3412	kmem_cache_free(vmap_area_cachep, vas[area]);
				3413
				3414	kfree(vms[area]);
				3415	}
				3416	err_free2:
				3417	kfree(vas);
				3418	kfree(vms);
				3419	return NULL;
				3420	}
				3421
				3422	/**
				3423	* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
				3424	* @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
				3425	* @nr_vms: the number of allocated areas
				3426	*
				3427	* Free vm_structs and the array allocated by pcpu_get_vm_areas().
				3428	*/
				3429	void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
				3430	{
				3431	int i;
				3432
				3433	for (i = 0; i < nr_vms; i++)
				3434	free_vm_area(vms[i]);
				3435	kfree(vms);
				3436	}
				3437	#endif /* CONFIG_SMP */
				3438
				3439	#ifdef CONFIG_PROC_FS
				3440	static void s_start(struct seq_file m, loff_t *pos)
				3441	__acquires(&vmap_area_lock)
				3442	{
				3443	spin_lock(&vmap_area_lock);
				3444	return seq_list_start(&vmap_area_list, *pos);
				3445	}
				3446
				3447	static void s_next(struct seq_file m, void p, loff_t pos)
				3448	{
				3449	return seq_list_next(p, &vmap_area_list, pos);
				3450	}
				3451
				3452	static void s_stop(struct seq_file m, void p)
				3453	__releases(&vmap_area_lock)
				3454	{
				3455	spin_unlock(&vmap_area_lock);
				3456	}
				3457
				3458	static void show_numa_info(struct seq_file m, struct vm_struct v)
				3459	{
				3460	if (IS_ENABLED(CONFIG_NUMA)) {
				3461	unsigned int nr, *counters = m->private;
				3462
				3463	if (!counters)
				3464	return;
				3465
				3466	if (v->flags & VM_UNINITIALIZED)
				3467	return;
				3468	/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
				3469	smp_rmb();
				3470
				3471	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
				3472
				3473	for (nr = 0; nr < v->nr_pages; nr++)
				3474	counters[page_to_nid(v->pages[nr])]++;
				3475
				3476	for_each_node_state(nr, N_HIGH_MEMORY)
				3477	if (counters[nr])
				3478	seq_printf(m, " N%u=%u", nr, counters[nr]);
				3479	}
				3480	}
				3481
				3482	static void show_purge_info(struct seq_file *m)
				3483	{
				3484	struct llist_node *head;
				3485	struct vmap_area *va;
				3486
				3487	head = READ_ONCE(vmap_purge_list.first);
				3488	if (head == NULL)
				3489	return;
				3490
				3491	llist_for_each_entry(va, head, purge_list) {
				3492	seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
				3493	(void )va->va_start, (void )va->va_end,
				3494	va->va_end - va->va_start);
				3495	}
				3496	}
				3497
				3498	static int s_show(struct seq_file m, void p)
				3499	{
				3500	struct vmap_area *va;
				3501	struct vm_struct *v;
				3502
				3503	va = list_entry(p, struct vmap_area, list);
				3504
				3505	/*
				3506	* s_show can encounter race with remove_vm_area, !vm on behalf
				3507	* of vmap area is being tear down or vm_map_ram allocation.
				3508	*/
				3509	if (!va->vm) {
				3510	seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
				3511	(void )va->va_start, (void )va->va_end,
				3512	va->va_end - va->va_start);
				3513
				3514	return 0;
				3515	}
				3516
				3517	v = va->vm;
				3518
				3519	seq_printf(m, "0x%pK-0x%pK %7ld",
				3520	v->addr, v->addr + v->size, v->size);
				3521
				3522	if (v->caller)
				3523	seq_printf(m, " %pS", v->caller);
				3524
				3525	if (v->nr_pages)
				3526	seq_printf(m, " pages=%d", v->nr_pages);
				3527
				3528	if (v->phys_addr)
				3529	seq_printf(m, " phys=%pa", &v->phys_addr);
				3530
				3531	if (v->flags & VM_IOREMAP)
				3532	seq_puts(m, " ioremap");
				3533
				3534	if (v->flags & VM_ALLOC)
				3535	seq_puts(m, " vmalloc");
				3536
				3537	if (v->flags & VM_MAP)
				3538	seq_puts(m, " vmap");
				3539
				3540	if (v->flags & VM_USERMAP)
				3541	seq_puts(m, " user");
				3542
				3543	if (v->flags & VM_DMA_COHERENT)
				3544	seq_puts(m, " dma-coherent");
				3545
				3546	if (is_vmalloc_addr(v->pages))
				3547	seq_puts(m, " vpages");
				3548
				3549	show_numa_info(m, v);
				3550	seq_putc(m, '\n');
				3551
				3552	/*
				3553	* As a final step, dump "unpurged" areas. Note,
				3554	* that entire "/proc/vmallocinfo" output will not
				3555	* be address sorted, because the purge list is not
				3556	* sorted.
				3557	*/
				3558	if (list_is_last(&va->list, &vmap_area_list))
				3559	show_purge_info(m);
				3560
				3561	return 0;
				3562	}
				3563
/*
 * seq_file iterator callbacks behind the "vmallocinfo" proc entry created
 * in proc_vmalloc_init(): s_start() takes vmap_area_lock, s_stop() drops
 * it, and s_show() prints one vmap_area per call.
 */
static const struct seq_operations vmalloc_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};
				3570
				3571	static int __init proc_vmalloc_init(void)
				3572	{
				3573	if (IS_ENABLED(CONFIG_PROC_STRIPPED))
				3574	return 0;
				3575	if (IS_ENABLED(CONFIG_NUMA))
				3576	proc_create_seq_private("vmallocinfo", 0400, NULL,
				3577	&vmalloc_op,
				3578	nr_node_ids * sizeof(unsigned int), NULL);
				3579	else
				3580	proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
				3581	return 0;
				3582	}
				3583	module_init(proc_vmalloc_init);
				3584
				3585	#endif