Blame - src/kernel/linux/v4.19/mm/slab.c - T800

blob: fa53bb09495d65312df7cbab2d46e121a74d58cc [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/slab.c
				4	* Written by Mark Hemment, 1996/97.
				5	* (markhe@nextd.demon.co.uk)
				6	*
				7	* kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
				8	*
				9	* Major cleanup, different bufctl logic, per-cpu arrays
				10	* (c) 2000 Manfred Spraul
				11	*
				12	* Cleanup, make the head arrays unconditional, preparation for NUMA
				13	* (c) 2002 Manfred Spraul
				14	*
				15	* An implementation of the Slab Allocator as described in outline in;
				16	* UNIX Internals: The New Frontiers by Uresh Vahalia
				17	* Pub: Prentice Hall ISBN 0-13-101908-2
				18	* or with a little more detail in;
				19	* The Slab Allocator: An Object-Caching Kernel Memory Allocator
				20	* Jeff Bonwick (Sun Microsystems).
				21	* Presented at: USENIX Summer 1994 Technical Conference
				22	*
				23	* The memory is organized in caches, one cache for each object type.
				24	* (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
				25	* Each cache consists out of many slabs (they are small (usually one
				26	* page long) and always contiguous), and each slab contains multiple
				27	* initialized objects.
				28	*
				29	* This means, that your constructor is used only for newly allocated
				30	* slabs and you must pass objects with the same initializations to
				31	* kmem_cache_free.
				32	*
				33	* Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
				34	* normal). If you need a special memory type, then must create a new
				35	* cache for that memory type.
				36	*
				37	* In order to reduce fragmentation, the slabs are sorted in 3 groups:
				38	* full slabs with 0 free objects
				39	* partial slabs
				40	* empty slabs with no allocated objects
				41	*
				42	* If partial slabs exist, then new allocations come from these slabs,
				43	* otherwise from empty slabs or new slabs are allocated.
				44	*
				45	* kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
				46	* during kmem_cache_destroy(). The caller must prevent concurrent allocs.
				47	*
				48	* Each cache has a short per-cpu head array, most allocs
				49	* and frees go into that array, and if that array overflows, then 1/2
				50	* of the entries in the array are given back into the global cache.
				51	* The head array is strictly LIFO and should improve the cache hit rates.
				52	* On SMP, it additionally reduces the spinlock operations.
				53	*
				54	* The c_cpuarray may not be read with enabled local interrupts -
				55	* it's changed with a smp_call_function().
				56	*
				57	* SMP synchronization:
				58	* constructors and destructors are called without any locking.
				59	* Several members in struct kmem_cache and struct slab never change, they
				60	* are accessed without any locking.
				61	* The per-cpu arrays are never accessed from the wrong cpu, no locking,
				62	* and local interrupts are disabled so slab code is preempt-safe.
				63	* The non-constant members are protected with a per-cache irq spinlock.
				64	*
				65	* Many thanks to Mark Hemment, who wrote another per-cpu slab patch
				66	* in 2000 - many ideas in the current implementation are derived from
				67	* his patch.
				68	*
				69	* Further notes from the original documentation:
				70	*
				71	* 11 April '97. Started multi-threading - markhe
				72	* The global cache-chain is protected by the mutex 'slab_mutex'.
				73	* The sem is only needed when accessing/extending the cache-chain, which
				74	* can never happen inside an interrupt (kmem_cache_create(),
				75	* kmem_cache_shrink() and kmem_cache_reap()).
				76	*
				77	* At present, each engine can be growing a cache. This should be blocked.
				78	*
				79	* 15 March 2005. NUMA slab allocator.
				80	* Shai Fultheim <shai@scalex86.org>.
				81	* Shobhit Dayal <shobhit@calsoftinc.com>
				82	* Alok N Kataria <alokk@calsoftinc.com>
				83	* Christoph Lameter <christoph@lameter.com>
				84	*
				85	* Modified the slab allocator to be node aware on NUMA systems.
				86	* Each node has its own list of partial, free and full slabs.
				87	* All object allocations for a node occur from node specific slab lists.
				88	*/
				89
				90	#include <linux/slab.h>
				91	#include <linux/mm.h>
				92	#include <linux/poison.h>
				93	#include <linux/swap.h>
				94	#include <linux/cache.h>
				95	#include <linux/interrupt.h>
				96	#include <linux/init.h>
				97	#include <linux/compiler.h>
				98	#include <linux/cpuset.h>
				99	#include <linux/proc_fs.h>
				100	#include <linux/seq_file.h>
				101	#include <linux/notifier.h>
				102	#include <linux/kallsyms.h>
				103	#include <linux/cpu.h>
				104	#include <linux/sysctl.h>
				105	#include <linux/module.h>
				106	#include <linux/rcupdate.h>
				107	#include <linux/string.h>
				108	#include <linux/uaccess.h>
				109	#include <linux/nodemask.h>
				110	#include <linux/kmemleak.h>
				111	#include <linux/mempolicy.h>
				112	#include <linux/mutex.h>
				113	#include <linux/fault-inject.h>
				114	#include <linux/rtmutex.h>
				115	#include <linux/reciprocal_div.h>
				116	#include <linux/debugobjects.h>
				117	#include <linux/memory.h>
				118	#include <linux/prefetch.h>
				119	#include <linux/sched/task_stack.h>
				120
				121	#include <net/sock.h>
				122
				123	#include <asm/cacheflush.h>
				124	#include <asm/tlbflush.h>
				125	#include <asm/page.h>
				126
				127	#include <trace/events/kmem.h>
				128
				129	#include "internal.h"
				130
				131	#include "slab.h"
				132
				133	/*
				134	* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
				135	* 0 for faster, smaller code (especially in the critical paths).
				136	*
				137	* STATS - 1 to collect stats for /proc/slabinfo.
				138	* 0 for faster, smaller code (especially in the critical paths).
				139	*
				140	* FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
				141	*/
				142
					/* DEBUG, STATS and FORCED_DEBUG all follow CONFIG_DEBUG_SLAB: 1 when it is set, 0 otherwise. */
				143	#ifdef CONFIG_DEBUG_SLAB
				144	#define DEBUG 1
				145	#define STATS 1
				146	#define FORCED_DEBUG 1
				147	#else
				148	#define DEBUG 0
				149	#define STATS 0
				150	#define FORCED_DEBUG 0
				151	#endif
				152
				153	/* Shouldn't this be in a header file somewhere? */
				154	#define BYTES_PER_WORD sizeof(void *)
					/* Red-zone alignment: the larger of one pointer and the alignment of unsigned long long. */
				155	#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
				156
					/* Default kmalloc cache flags unless the architecture already defined its own. */
				157	#ifndef ARCH_KMALLOC_FLAGS
				158	#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
				159	#endif
				160
					/* 1 when (PAGE_SIZE >> BITS_PER_BYTE) <= SLAB_OBJ_MIN_SIZE; selects the width of freelist_idx_t below. */
				161	#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				162	<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
				163
					/* Index type stored in a slab's freelist: one byte if FREELIST_BYTE_INDEX, else two. */
				164	#if FREELIST_BYTE_INDEX
				165	typedef unsigned char freelist_idx_t;
				166	#else
				167	typedef unsigned short freelist_idx_t;
				168	#endif
				169
					/* Largest value a freelist_idx_t can hold (all bits set). */
				170	#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
				171
				172	/*
				173	* struct array_cache
				174	*
				175	* Purpose:
				176	* - LIFO ordering, to hand out cache-warm objects from _alloc
				177	* - reduce the number of linked list operations
				178	* - reduce spinlock operations
				179	*
				180	* The limit is stored in the per-cpu structure to reduce the data cache
				181	* footprint.
				182	*
				183	*/
				184	struct array_cache {
				185	unsigned int avail;
				186	unsigned int limit;
				187	unsigned int batchcount;
				188	unsigned int touched;
				189	void entry[]; /
				190	* Must have this definition in here for the proper
				191	* alignment of array_cache. Also simplifies accessing
				192	* the entries.
				193	*/
				194	};
				195
					/*
					 * An array_cache together with the spinlock taken around accesses to it
					 * (see __cache_free_alien() and drain_alien_cache()). ac ends in a
					 * flexible array, so it has to remain the last member.
					 */
				196	struct alien_cache {
				197	spinlock_t lock;
				198	struct array_cache ac;
				199	};
				200
				201	/*
				202	* Need this for bootstrapping a per node allocator.
				203	*/
					/* Two statically allocated kmem_cache_node slots per possible node, usable only during init (__initdata). */
				204	#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
				205	static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
					/* NOTE(review): presumably base indices into init_kmem_cache_node[] for the two boot caches - confirm against the init code. */
				206	#define CACHE_CACHE 0
				207	#define SIZE_NODE (MAX_NUMNODES)
				208
				209	static int drain_freelist(struct kmem_cache *cache,
				210	struct kmem_cache_node *n, int tofree);
				211	static void free_block(struct kmem_cache cachep, void *objpp, int len,
				212	int node, struct list_head *list);
				213	static void slabs_destroy(struct kmem_cache cachep, struct list_head list);
				214	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
				215	static void cache_reap(struct work_struct *unused);
				216
				217	static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
				218	void **list);
				219	static inline void fixup_slab_list(struct kmem_cache *cachep,
				220	struct kmem_cache_node n, struct page page,
				221	void **list);
				222	static int slab_early_init = 1;
				223
					/* kmalloc_index() of sizeof(struct kmem_cache_node); presumably identifies the kmalloc cache that backs node structures - confirm. */
				224	#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
				225
				226	static void kmem_cache_node_init(struct kmem_cache_node *parent)
				227	{
				228	INIT_LIST_HEAD(&parent->slabs_full);
				229	INIT_LIST_HEAD(&parent->slabs_partial);
				230	INIT_LIST_HEAD(&parent->slabs_free);
				231	parent->total_slabs = 0;
				232	parent->free_slabs = 0;
				233	parent->shared = NULL;
				234	parent->alien = NULL;
				235	parent->colour_next = 0;
				236	spin_lock_init(&parent->list_lock);
				237	parent->free_objects = 0;
				238	parent->free_touched = 0;
				239	}
				240
					/* Initialise @listp as an empty list, then splice the @slab list of @cachep's node @nodeid onto it. */
				241	#define MAKE_LIST(cachep, listp, slab, nodeid) \
				242	do { \
				243	INIT_LIST_HEAD(listp); \
				244	list_splice(&get_node(cachep, nodeid)->slab, listp); \
				245	} while (0)
				246
					/* Apply MAKE_LIST to all three slab lists (full, partial, free) of @ptr. */
				247	#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
				248	do { \
				249	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
				250	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
				251	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
				252	} while (0)
				253
					/* Internal cache flag bits, and predicates that test them on a cache's ->flags. */
				254	#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
				255	#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
				256	#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
				257	#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
				258
				259	#define BATCHREFILL_LIMIT 16
				260	/*
				261	* Optimization question: fewer reaps means less probability for unnecessary
				262	* cpucache drain/refill cycles.
				263	*
				264	* OTOH the cpuarrays can contain lots of objects,
				265	* which could lock up otherwise freeable slabs.
				266	*/
					/* Reap intervals in jiffies: 2 seconds and 4 seconds worth of HZ ticks. */
				267	#define REAPTIMEOUT_AC (2*HZ)
				268	#define REAPTIMEOUT_NODE (4*HZ)
				269
				270	#if STATS
				271	#define STATS_INC_ACTIVE(x) ((x)->num_active++)
				272	#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
				273	#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
				274	#define STATS_INC_GROWN(x) ((x)->grown++)
				275	#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
				276	#define STATS_SET_HIGH(x) \
				277	do { \
				278	if ((x)->num_active > (x)->high_mark) \
				279	(x)->high_mark = (x)->num_active; \
				280	} while (0)
				281	#define STATS_INC_ERR(x) ((x)->errors++)
				282	#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
				283	#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
				284	#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
				285	#define STATS_SET_FREEABLE(x, i) \
				286	do { \
				287	if ((x)->max_freeable < i) \
				288	(x)->max_freeable = i; \
				289	} while (0)
				290	#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
				291	#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
				292	#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
				293	#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
				294	#else
				295	#define STATS_INC_ACTIVE(x) do { } while (0)
				296	#define STATS_DEC_ACTIVE(x) do { } while (0)
				297	#define STATS_INC_ALLOCED(x) do { } while (0)
				298	#define STATS_INC_GROWN(x) do { } while (0)
				299	#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
				300	#define STATS_SET_HIGH(x) do { } while (0)
				301	#define STATS_INC_ERR(x) do { } while (0)
				302	#define STATS_INC_NODEALLOCS(x) do { } while (0)
				303	#define STATS_INC_NODEFREES(x) do { } while (0)
				304	#define STATS_INC_ACOVERFLOW(x) do { } while (0)
				305	#define STATS_SET_FREEABLE(x, i) do { } while (0)
				306	#define STATS_INC_ALLOCHIT(x) do { } while (0)
				307	#define STATS_INC_ALLOCMISS(x) do { } while (0)
				308	#define STATS_INC_FREEHIT(x) do { } while (0)
				309	#define STATS_INC_FREEMISS(x) do { } while (0)
				310	#endif
				311
				312	#if DEBUG
				313
				314	/*
				315	* memory layout of objects:
				316	* 0 : objp
				317	* 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
				318	* the end of an object is aligned with the end of the real
				319	* allocation. Catches writes behind the end of the allocation.
				320	* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
				321	* redzone word.
				322	* cachep->obj_offset: The real object.
				323	* cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
				324	* cachep->size - 1* BYTES_PER_WORD: last caller address
				325	* [BYTES_PER_WORD long]
				326	*/
				327	static int obj_offset(struct kmem_cache *cachep)
				328	{
				329	return cachep->obj_offset;
				330	}
				331
				332	static unsigned long long dbg_redzone1(struct kmem_cache cachep, void *objp)
				333	{
				334	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				335	return (unsigned long long*) (objp + obj_offset(cachep) -
				336	sizeof(unsigned long long));
				337	}
				338
				339	static unsigned long long dbg_redzone2(struct kmem_cache cachep, void *objp)
				340	{
				341	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				342	if (cachep->flags & SLAB_STORE_USER)
				343	return (unsigned long long *)(objp + cachep->size -
				344	sizeof(unsigned long long) -
				345	REDZONE_ALIGN);
				346	return (unsigned long long *) (objp + cachep->size -
				347	sizeof(unsigned long long));
				348	}
				349
				350	static void *dbg_userword(struct kmem_cache cachep, void *objp)
				351	{
				352	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
				353	return (void **)(objp + cachep->size - BYTES_PER_WORD);
				354	}
				355
				356	#else
				357
					/* Non-debug build: obj_offset() is the constant 0 and the debug accessors BUG() if they are ever evaluated. */
				358	#define obj_offset(x) 0
				359	#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				360	#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				361	#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
				362
				363	#endif
				364
				365	#ifdef CONFIG_DEBUG_SLAB_LEAK
				366
				367	static inline bool is_store_user_clean(struct kmem_cache *cachep)
				368	{
				369	return atomic_read(&cachep->store_user_clean) == 1;
				370	}
				371
				372	static inline void set_store_user_clean(struct kmem_cache *cachep)
				373	{
				374	atomic_set(&cachep->store_user_clean, 1);
				375	}
				376
				377	static inline void set_store_user_dirty(struct kmem_cache *cachep)
				378	{
				379	if (is_store_user_clean(cachep))
				380	atomic_set(&cachep->store_user_clean, 0);
				381	}
				382
				383	#else
				384	static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
				385
				386	#endif
				387
				388	/*
				389	* Do not go above this order unless 0 objects fit into the slab or
				390	* overridden on the command line.
				391	*/
				392	#define SLAB_MAX_ORDER_HI 1
				393	#define SLAB_MAX_ORDER_LO 0
					/* Current limit; starts at the LO value and may be replaced by the slab_max_order= boot option. */
				394	static int slab_max_order = SLAB_MAX_ORDER_LO;
					/* Set to true by slab_max_order_setup() once the boot option has been parsed. */
				395	static bool slab_max_order_set __initdata;
				396
				397	static inline struct kmem_cache virt_to_cache(const void obj)
				398	{
				399	struct page *page = virt_to_head_page(obj);
				400	return page->slab_cache;
				401	}
				402
				403	static inline void index_to_obj(struct kmem_cache cache, struct page *page,
				404	unsigned int idx)
				405	{
				406	return page->s_mem + cache->size * idx;
				407	}
				408
					/* Per-cpu array limit given to the statically defined boot cache below. */
				409	#define BOOT_CPUCACHE_ENTRIES 1
				410	/* internal cache of cache description objs */
					/* Its objects are struct kmem_cache instances (.size = sizeof(struct kmem_cache)). */
				411	static struct kmem_cache kmem_cache_boot = {
				412	.batchcount = 1,
				413	.limit = BOOT_CPUCACHE_ENTRIES,
				414	.shared = 1,
				415	.size = sizeof(struct kmem_cache),
				416	.name = "kmem_cache",
				417	};
				418
					/* Per-CPU delayed work item; start_cpu_timer() initialises it with cache_reap() as the handler and schedules it. */
				419	static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
				420
				421	static inline struct array_cache cpu_cache_get(struct kmem_cache cachep)
				422	{
				423	return this_cpu_ptr(cachep->cpu_cache);
				424	}
				425
				426	/*
				427	* Calculate the number of objects and left-over bytes for a given buffer size.
				428	*/
				429	static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
				430	slab_flags_t flags, size_t *left_over)
				431	{
				432	unsigned int num;
				433	size_t slab_size = PAGE_SIZE << gfporder;
				434
				435	/*
				436	* The slab management structure can be either off the slab or
				437	* on it. For the latter case, the memory allocated for a
				438	* slab is used for:
				439	*
				440	* - @buffer_size bytes for each object
				441	* - One freelist_idx_t for each object
				442	*
				443	* We don't need to consider alignment of freelist because
				444	* freelist will be at the end of slab page. The objects will be
				445	* at the correct alignment.
				446	*
				447	* If the slab management structure is off the slab, then the
				448	* alignment will already be calculated into the size. Because
				449	* the slabs are all pages aligned, the objects will be at the
				450	* correct alignment when allocated.
				451	*/
				452	if (flags & (CFLGS_OBJFREELIST_SLAB \| CFLGS_OFF_SLAB)) {
				453	num = slab_size / buffer_size;
				454	*left_over = slab_size % buffer_size;
				455	} else {
				456	num = slab_size / (buffer_size + sizeof(freelist_idx_t));
				457	*left_over = slab_size %
				458	(buffer_size + sizeof(freelist_idx_t));
				459	}
				460
				461	return num;
				462	}
				463
				464	#if DEBUG
				465	#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
				466
				467	static void __slab_error(const char function, struct kmem_cache cachep,
				468	char *msg)
				469	{
				470	pr_err("slab error in %s(): cache `%s': %s\n",
				471	function, cachep->name, msg);
				472	dump_stack();
				473	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
				474	}
				475	#endif
				476
				477	/*
				478	* By default on NUMA we use alien caches to stage the freeing of
				479	* objects allocated from other nodes. This causes massive memory
				480	* inefficiencies when using fake NUMA setup to split memory into a
				481	* large number of small nodes, so it can be disabled on the command
				482	* line
				483	*/
				484
				485	static int use_alien_caches __read_mostly = 1;
				486	static int __init noaliencache_setup(char *s)
				487	{
				488	use_alien_caches = 0;
				489	return 1;
				490	}
				491	__setup("noaliencache", noaliencache_setup);
				492
				493	static int __init slab_max_order_setup(char *str)
				494	{
				495	get_option(&str, &slab_max_order);
				496	slab_max_order = slab_max_order < 0 ? 0 :
				497	min(slab_max_order, MAX_ORDER - 1);
				498	slab_max_order_set = true;
				499
				500	return 1;
				501	}
				502	__setup("slab_max_order=", slab_max_order_setup);
				503
				504	#ifdef CONFIG_NUMA
				505	/*
				506	* Special reaping functions for NUMA systems called from cache_reap().
				507	* These take care of doing round robin flushing of alien caches (containing
				508	* objects freed on different nodes from which they were allocated) and the
				509	* flushing of remote pcps by calling drain_node_pages.
				510	*/
					/* Per-CPU node id used by reap_alien(); seeded by init_reap_node() and advanced by next_reap_node(). */
				511	static DEFINE_PER_CPU(unsigned long, slab_reap_node);
				512
				513	static void init_reap_node(int cpu)
				514	{
				515	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
				516	node_online_map);
				517	}
				518
				519	static void next_reap_node(void)
				520	{
				521	int node = __this_cpu_read(slab_reap_node);
				522
				523	node = next_node_in(node, node_online_map);
				524	__this_cpu_write(slab_reap_node, node);
				525	}
				526
				527	#else
					/* Non-NUMA build: the reap-node helpers expand to empty statements. */
				528	#define init_reap_node(cpu) do { } while (0)
				529	#define next_reap_node(void) do { } while (0)
				530	#endif
				531
				532	/*
				533	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
				534	* via the workqueue/eventd.
				535	* Add the CPU number into the expiration time to minimize the possibility of
				536	* the CPUs getting into lockstep and contending for the global cache chain
				537	* lock.
				538	*/
				539	static void start_cpu_timer(int cpu)
				540	{
				541	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
				542
				543	if (reap_work->work.func == NULL) {
				544	init_reap_node(cpu);
				545	INIT_DEFERRABLE_WORK(reap_work, cache_reap);
				546	schedule_delayed_work_on(cpu, reap_work,
				547	__round_jiffies_relative(HZ, cpu));
				548	}
				549	}
				550
				551	static void init_arraycache(struct array_cache *ac, int limit, int batch)
				552	{
				553	if (ac) {
				554	ac->avail = 0;
				555	ac->limit = limit;
				556	ac->batchcount = batch;
				557	ac->touched = 0;
				558	}
				559	}
				560
				561	static struct array_cache *alloc_arraycache(int node, int entries,
				562	int batchcount, gfp_t gfp)
				563	{
				564	size_t memsize = sizeof(void ) entries + sizeof(struct array_cache);
				565	struct array_cache *ac = NULL;
				566
				567	ac = kmalloc_node(memsize, gfp, node);
				568	/*
				569	* The array_cache structures contain pointers to free object.
				570	* However, when such objects are allocated or transferred to another
				571	* cache the pointers are not cleared and they could be counted as
				572	* valid references during a kmemleak scan. Therefore, kmemleak must
				573	* not scan such objects.
				574	*/
				575	kmemleak_no_scan(ac);
				576	init_arraycache(ac, entries, batchcount);
				577	return ac;
				578	}
				579
				580	static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
				581	struct page page, void objp)
				582	{
				583	struct kmem_cache_node *n;
				584	int page_node;
				585	LIST_HEAD(list);
				586
				587	page_node = page_to_nid(page);
				588	n = get_node(cachep, page_node);
				589
				590	spin_lock(&n->list_lock);
				591	free_block(cachep, &objp, 1, page_node, &list);
				592	spin_unlock(&n->list_lock);
				593
				594	slabs_destroy(cachep, &list);
				595	}
				596
				597	/*
				598	* Transfer objects in one arraycache to another.
				599	* Locking must be handled by the caller.
				600	*
				601	* Return the number of entries transferred.
				602	*/
				603	static int transfer_objects(struct array_cache *to,
				604	struct array_cache *from, unsigned int max)
				605	{
				606	/* Figure out how many entries to transfer */
				607	int nr = min3(from->avail, max, to->limit - to->avail);
				608
				609	if (!nr)
				610	return 0;
				611
				612	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
				613	sizeof(void ) nr);
				614
				615	from->avail -= nr;
				616	to->avail += nr;
				617	return nr;
				618	}
				619
				620	#ifndef CONFIG_NUMA
				621
					/* Non-NUMA build: draining and reaping alien caches expand to empty statements. */
				622	#define drain_alien_cache(cachep, alien) do { } while (0)
				623	#define reap_alien(cachep, n) do { } while (0)
				624
				625	static inline struct alien_cache **alloc_alien_cache(int node,
				626	int limit, gfp_t gfp)
				627	{
				628	return NULL;
				629	}
				630
				631	static inline void free_alien_cache(struct alien_cache **ac_ptr)
				632	{
				633	}
				634
				635	static inline int cache_free_alien(struct kmem_cache cachep, void objp)
				636	{
				637	return 0;
				638	}
				639
				640	static inline void alternate_node_alloc(struct kmem_cache cachep,
				641	gfp_t flags)
				642	{
				643	return NULL;
				644	}
				645
				646	static inline void ____cache_alloc_node(struct kmem_cache cachep,
				647	gfp_t flags, int nodeid)
				648	{
				649	return NULL;
				650	}
				651
				652	static inline gfp_t gfp_exact_node(gfp_t flags)
				653	{
				654	return flags & ~__GFP_NOFAIL;
				655	}
				656
				657	#else /* CONFIG_NUMA */
				658
				659	static void ____cache_alloc_node(struct kmem_cache , gfp_t, int);
				660	static void alternate_node_alloc(struct kmem_cache , gfp_t);
				661
				662	static struct alien_cache *__alloc_alien_cache(int node, int entries,
				663	int batch, gfp_t gfp)
				664	{
				665	size_t memsize = sizeof(void ) entries + sizeof(struct alien_cache);
				666	struct alien_cache *alc = NULL;
				667
				668	alc = kmalloc_node(memsize, gfp, node);
				669	if (alc) {
				670	kmemleak_no_scan(alc);
				671	init_arraycache(&alc->ac, entries, batch);
				672	spin_lock_init(&alc->lock);
				673	}
				674	return alc;
				675	}
				676
				677	static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
				678	{
				679	struct alien_cache **alc_ptr;
				680	size_t memsize = sizeof(void ) nr_node_ids;
				681	int i;
				682
				683	if (limit > 1)
				684	limit = 12;
				685	alc_ptr = kzalloc_node(memsize, gfp, node);
				686	if (!alc_ptr)
				687	return NULL;
				688
				689	for_each_node(i) {
				690	if (i == node \|\| !node_online(i))
				691	continue;
				692	alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
				693	if (!alc_ptr[i]) {
				694	for (i--; i >= 0; i--)
				695	kfree(alc_ptr[i]);
				696	kfree(alc_ptr);
				697	return NULL;
				698	}
				699	}
				700	return alc_ptr;
				701	}
				702
				703	static void free_alien_cache(struct alien_cache **alc_ptr)
				704	{
				705	int i;
				706
				707	if (!alc_ptr)
				708	return;
				709	for_each_node(i)
				710	kfree(alc_ptr[i]);
				711	kfree(alc_ptr);
				712	}
				713
				714	static void __drain_alien_cache(struct kmem_cache *cachep,
				715	struct array_cache *ac, int node,
				716	struct list_head *list)
				717	{
				718	struct kmem_cache_node *n = get_node(cachep, node);
				719
				720	if (ac->avail) {
				721	spin_lock(&n->list_lock);
				722	/*
				723	* Stuff objects into the remote nodes shared array first.
				724	* That way we could avoid the overhead of putting the objects
				725	* into the free lists and getting them back later.
				726	*/
				727	if (n->shared)
				728	transfer_objects(n->shared, ac, ac->limit);
				729
				730	free_block(cachep, ac->entry, ac->avail, node, list);
				731	ac->avail = 0;
				732	spin_unlock(&n->list_lock);
				733	}
				734	}
				735
				736	/*
				737	* Called from cache_reap() to regularly drain alien caches round robin.
				738	*/
				739	static void reap_alien(struct kmem_cache cachep, struct kmem_cache_node n)
				740	{
				741	int node = __this_cpu_read(slab_reap_node);
				742
				743	if (n->alien) {
				744	struct alien_cache *alc = n->alien[node];
				745	struct array_cache *ac;
				746
				747	if (alc) {
				748	ac = &alc->ac;
				749	if (ac->avail && spin_trylock_irq(&alc->lock)) {
				750	LIST_HEAD(list);
				751
				752	__drain_alien_cache(cachep, ac, node, &list);
				753	spin_unlock_irq(&alc->lock);
				754	slabs_destroy(cachep, &list);
				755	}
				756	}
				757	}
				758	}
				759
				760	static void drain_alien_cache(struct kmem_cache *cachep,
				761	struct alien_cache **alien)
				762	{
				763	int i = 0;
				764	struct alien_cache *alc;
				765	struct array_cache *ac;
				766	unsigned long flags;
				767
				768	for_each_online_node(i) {
				769	alc = alien[i];
				770	if (alc) {
				771	LIST_HEAD(list);
				772
				773	ac = &alc->ac;
				774	spin_lock_irqsave(&alc->lock, flags);
				775	__drain_alien_cache(cachep, ac, i, &list);
				776	spin_unlock_irqrestore(&alc->lock, flags);
				777	slabs_destroy(cachep, &list);
				778	}
				779	}
				780	}
				781
				782	static int __cache_free_alien(struct kmem_cache cachep, void objp,
				783	int node, int page_node)
				784	{
				785	struct kmem_cache_node *n;
				786	struct alien_cache *alien = NULL;
				787	struct array_cache *ac;
				788	LIST_HEAD(list);
				789
				790	n = get_node(cachep, node);
				791	STATS_INC_NODEFREES(cachep);
				792	if (n->alien && n->alien[page_node]) {
				793	alien = n->alien[page_node];
				794	ac = &alien->ac;
				795	spin_lock(&alien->lock);
				796	if (unlikely(ac->avail == ac->limit)) {
				797	STATS_INC_ACOVERFLOW(cachep);
				798	__drain_alien_cache(cachep, ac, page_node, &list);
				799	}
				800	ac->entry[ac->avail++] = objp;
				801	spin_unlock(&alien->lock);
				802	slabs_destroy(cachep, &list);
				803	} else {
				804	n = get_node(cachep, page_node);
				805	spin_lock(&n->list_lock);
				806	free_block(cachep, &objp, 1, page_node, &list);
				807	spin_unlock(&n->list_lock);
				808	slabs_destroy(cachep, &list);
				809	}
				810	return 1;
				811	}
				812
				813	static inline int cache_free_alien(struct kmem_cache cachep, void objp)
				814	{
				815	int page_node = page_to_nid(virt_to_page(objp));
				816	int node = numa_mem_id();
				817	/*
				818	* Make sure we are not freeing a object from another node to the array
				819	* cache on this cpu.
				820	*/
				821	if (likely(node == page_node))
				822	return 0;
				823
				824	return __cache_free_alien(cachep, objp, node, page_node);
				825	}
				826
				827	/*
				828	* Construct gfp mask to allocate from a specific node but do not reclaim or
				829	* warn about failures.
				830	*/
				831	static inline gfp_t gfp_exact_node(gfp_t flags)
				832	{
				833	return (flags \| __GFP_THISNODE \| __GFP_NOWARN) & ~(__GFP_RECLAIM\|__GFP_NOFAIL);
				834	}
				835	#endif
				836
				837	static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
				838	{
				839	struct kmem_cache_node *n;
				840
				841	/*
				842	* Set up the kmem_cache_node for cpu before we can
				843	* begin anything. Make sure some other cpu on this
				844	* node has not already allocated this
				845	*/
				846	n = get_node(cachep, node);
				847	if (n) {
				848	spin_lock_irq(&n->list_lock);
				849	n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
				850	cachep->num;
				851	spin_unlock_irq(&n->list_lock);
				852
				853	return 0;
				854	}
				855
				856	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
				857	if (!n)
				858	return -ENOMEM;
				859
				860	kmem_cache_node_init(n);
				861	n->next_reap = jiffies + REAPTIMEOUT_NODE +
				862	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				863
				864	n->free_limit =
				865	(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
				866
				867	/*
				868	* The kmem_cache_nodes don't come and go as CPUs
				869	* come and go. slab_mutex is sufficient
				870	* protection here.
				871	*/
				872	cachep->node[node] = n;
				873
				874	return 0;
				875	}
				876
				877	#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) \|\| defined(CONFIG_SMP)
				878	/*
				879	* Allocates and initializes node for a node on each slab cache, used for
				880	* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
				881	* will be allocated off-node since memory is not yet online for the new node.
				882	* When hotplugging memory or a cpu, existing node are not replaced if
				883	* already in use.
				884	*
				885	* Must hold slab_mutex.
				886	*/
				887	static int init_cache_node_node(int node)
				888	{
				889	int ret;
				890	struct kmem_cache *cachep;
				891
				892	list_for_each_entry(cachep, &slab_caches, list) {
				893	ret = init_cache_node(cachep, node, GFP_KERNEL);
				894	if (ret)
				895	return ret;
				896	}
				897
				898	return 0;
				899	}
				900	#endif
				901
				902	static int setup_kmem_cache_node(struct kmem_cache *cachep,
				903	int node, gfp_t gfp, bool force_change)
				904	{
				905	int ret = -ENOMEM;
				906	struct kmem_cache_node *n;
				907	struct array_cache *old_shared = NULL;
				908	struct array_cache *new_shared = NULL;
				909	struct alien_cache **new_alien = NULL;
				910	LIST_HEAD(list);
				911
				912	if (use_alien_caches) {
				913	new_alien = alloc_alien_cache(node, cachep->limit, gfp);
				914	if (!new_alien)
				915	goto fail;
				916	}
				917
				918	if (cachep->shared) {
				919	new_shared = alloc_arraycache(node,
				920	cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
				921	if (!new_shared)
				922	goto fail;
				923	}
				924
				925	ret = init_cache_node(cachep, node, gfp);
				926	if (ret)
				927	goto fail;
				928
				929	n = get_node(cachep, node);
				930	spin_lock_irq(&n->list_lock);
				931	if (n->shared && force_change) {
				932	free_block(cachep, n->shared->entry,
				933	n->shared->avail, node, &list);
				934	n->shared->avail = 0;
				935	}
				936
				937	if (!n->shared \|\| force_change) {
				938	old_shared = n->shared;
				939	n->shared = new_shared;
				940	new_shared = NULL;
				941	}
				942
				943	if (!n->alien) {
				944	n->alien = new_alien;
				945	new_alien = NULL;
				946	}
				947
				948	spin_unlock_irq(&n->list_lock);
				949	slabs_destroy(cachep, &list);
				950
				951	/*
				952	* To protect lockless access to n->shared during irq disabled context.
				953	* If n->shared isn't NULL in irq disabled context, accessing to it is
				954	* guaranteed to be valid until irq is re-enabled, because it will be
				955	* freed after synchronize_sched().
				956	*/
				957	if (old_shared && force_change)
				958	synchronize_sched();
				959
				960	fail:
				961	kfree(old_shared);
				962	kfree(new_shared);
				963	free_alien_cache(new_alien);
				964
				965	return ret;
				966	}
				967
				968	#ifdef CONFIG_SMP
				969
				970	static void cpuup_canceled(long cpu)
				971	{
				972	struct kmem_cache *cachep;
				973	struct kmem_cache_node *n = NULL;
				974	int node = cpu_to_mem(cpu);
				975	const struct cpumask *mask = cpumask_of_node(node);
				976
				977	list_for_each_entry(cachep, &slab_caches, list) {
				978	struct array_cache *nc;
				979	struct array_cache *shared;
				980	struct alien_cache **alien;
				981	LIST_HEAD(list);
				982
				983	n = get_node(cachep, node);
				984	if (!n)
				985	continue;
				986
				987	spin_lock_irq(&n->list_lock);
				988
				989	/* Free limit for this kmem_cache_node */
				990	n->free_limit -= cachep->batchcount;
				991
				992	/* cpu is dead; no one can alloc from it. */
				993	nc = per_cpu_ptr(cachep->cpu_cache, cpu);
				994	if (nc) {
				995	free_block(cachep, nc->entry, nc->avail, node, &list);
				996	nc->avail = 0;
				997	}
				998
				999	if (!cpumask_empty(mask)) {
				1000	spin_unlock_irq(&n->list_lock);
				1001	goto free_slab;
				1002	}
				1003
				1004	shared = n->shared;
				1005	if (shared) {
				1006	free_block(cachep, shared->entry,
				1007	shared->avail, node, &list);
				1008	n->shared = NULL;
				1009	}
				1010
				1011	alien = n->alien;
				1012	n->alien = NULL;
				1013
				1014	spin_unlock_irq(&n->list_lock);
				1015
				1016	kfree(shared);
				1017	if (alien) {
				1018	drain_alien_cache(cachep, alien);
				1019	free_alien_cache(alien);
				1020	}
				1021
				1022	free_slab:
				1023	slabs_destroy(cachep, &list);
				1024	}
				1025	/*
				1026	* In the previous loop, all the objects were freed to
				1027	* the respective cache's slabs, now we can go ahead and
				1028	* shrink each nodelist to its limit.
				1029	*/
				1030	list_for_each_entry(cachep, &slab_caches, list) {
				1031	n = get_node(cachep, node);
				1032	if (!n)
				1033	continue;
				1034	drain_freelist(cachep, n, INT_MAX);
				1035	}
				1036	}
				1037
				1038	static int cpuup_prepare(long cpu)
				1039	{
				1040	struct kmem_cache *cachep;
				1041	int node = cpu_to_mem(cpu);
				1042	int err;
				1043
				1044	/*
				1045	* We need to do this right in the beginning since
				1046	* alloc_arraycache's are going to use this list.
				1047	* kmalloc_node allows us to add the slab to the right
				1048	* kmem_cache_node and not this cpu's kmem_cache_node
				1049	*/
				1050	err = init_cache_node_node(node);
				1051	if (err < 0)
				1052	goto bad;
				1053
				1054	/*
				1055	* Now we can go ahead with allocating the shared arrays and
				1056	* array caches
				1057	*/
				1058	list_for_each_entry(cachep, &slab_caches, list) {
				1059	err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
				1060	if (err)
				1061	goto bad;
				1062	}
				1063
				1064	return 0;
				1065	bad:
				1066	cpuup_canceled(cpu);
				1067	return -ENOMEM;
				1068	}
				1069
				1070	int slab_prepare_cpu(unsigned int cpu)
				1071	{
				1072	int err;
				1073
				1074	mutex_lock(&slab_mutex);
				1075	err = cpuup_prepare(cpu);
				1076	mutex_unlock(&slab_mutex);
				1077	return err;
				1078	}
				1079
				1080	/*
				1081	* This is called for a failed online attempt and for a successful
				1082	* offline.
				1083	*
				1084	* Even if all the cpus of a node are down, we don't free the
				1085	* kmem_list3 of any cache. This to avoid a race between cpu_down, and
				1086	* a kmalloc allocation from another cpu for memory from the node of
				1087	* the cpu going down. The list3 structure is usually allocated from
				1088	* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
				1089	*/
				1090	int slab_dead_cpu(unsigned int cpu)
				1091	{
				1092	mutex_lock(&slab_mutex);
				1093	cpuup_canceled(cpu);
				1094	mutex_unlock(&slab_mutex);
				1095	return 0;
				1096	}
				1097	#endif
				1098
				1099	static int slab_online_cpu(unsigned int cpu)
				1100	{
				1101	start_cpu_timer(cpu);
				1102	return 0;
				1103	}
				1104
				1105	static int slab_offline_cpu(unsigned int cpu)
				1106	{
				1107	/*
				1108	* Shutdown cache reaper. Note that the slab_mutex is held so
				1109	* that if cache_reap() is invoked it cannot do anything
				1110	* expensive but will only modify reap_work and reschedule the
				1111	* timer.
				1112	*/
				1113	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
				1114	/* Now the cache_reaper is guaranteed to be not running. */
				1115	per_cpu(slab_reap_work, cpu).work.func = NULL;
				1116	return 0;
				1117	}
				1118
				1119	#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
				1120	/*
				1121	* Drains freelist for a node on each slab cache, used for memory hot-remove.
				1122	* Returns -EBUSY if all objects cannot be drained so that the node is not
				1123	* removed.
				1124	*
				1125	* Must hold slab_mutex.
				1126	*/
				1127	static int __meminit drain_cache_node_node(int node)
				1128	{
				1129	struct kmem_cache *cachep;
				1130	int ret = 0;
				1131
				1132	list_for_each_entry(cachep, &slab_caches, list) {
				1133	struct kmem_cache_node *n;
				1134
				1135	n = get_node(cachep, node);
				1136	if (!n)
				1137	continue;
				1138
				1139	drain_freelist(cachep, n, INT_MAX);
				1140
				1141	if (!list_empty(&n->slabs_full) \|\|
				1142	!list_empty(&n->slabs_partial)) {
				1143	ret = -EBUSY;
				1144	break;
				1145	}
				1146	}
				1147	return ret;
				1148	}
				1149
				1150	static int __meminit slab_memory_callback(struct notifier_block *self,
				1151	unsigned long action, void *arg)
				1152	{
				1153	struct memory_notify *mnb = arg;
				1154	int ret = 0;
				1155	int nid;
				1156
				1157	nid = mnb->status_change_nid;
				1158	if (nid < 0)
				1159	goto out;
				1160
				1161	switch (action) {
				1162	case MEM_GOING_ONLINE:
				1163	mutex_lock(&slab_mutex);
				1164	ret = init_cache_node_node(nid);
				1165	mutex_unlock(&slab_mutex);
				1166	break;
				1167	case MEM_GOING_OFFLINE:
				1168	mutex_lock(&slab_mutex);
				1169	ret = drain_cache_node_node(nid);
				1170	mutex_unlock(&slab_mutex);
				1171	break;
				1172	case MEM_ONLINE:
				1173	case MEM_OFFLINE:
				1174	case MEM_CANCEL_ONLINE:
				1175	case MEM_CANCEL_OFFLINE:
				1176	break;
				1177	}
				1178	out:
				1179	return notifier_from_errno(ret);
				1180	}
				1181	#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
				1182
				1183	/*
				1184	* swap the static kmem_cache_node with kmalloced memory
				1185	*/
				1186	static void __init init_list(struct kmem_cache cachep, struct kmem_cache_node list,
				1187	int nodeid)
				1188	{
				1189	struct kmem_cache_node *ptr;
				1190
				1191	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
				1192	BUG_ON(!ptr);
				1193
				1194	memcpy(ptr, list, sizeof(struct kmem_cache_node));
				1195	/*
				1196	* Do not assume that spinlocks can be initialized via memcpy:
				1197	*/
				1198	spin_lock_init(&ptr->list_lock);
				1199
				1200	MAKE_ALL_LISTS(cachep, ptr, nodeid);
				1201	cachep->node[nodeid] = ptr;
				1202	}
				1203
				1204	/*
				1205	* For setting up all the kmem_cache_node for cache whose buffer_size is same as
				1206	* size of kmem_cache_node.
				1207	*/
				1208	static void __init set_up_node(struct kmem_cache *cachep, int index)
				1209	{
				1210	int node;
				1211
				1212	for_each_online_node(node) {
				1213	cachep->node[node] = &init_kmem_cache_node[index + node];
				1214	cachep->node[node]->next_reap = jiffies +
				1215	REAPTIMEOUT_NODE +
				1216	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				1217	}
				1218	}
				1219
				1220	/*
				1221	* Initialisation. Called after the page allocator have been initialised and
				1222	* before smp_init().
				1223	*/
				1224	void __init kmem_cache_init(void)
				1225	{
				1226	int i;
				1227
				1228	kmem_cache = &kmem_cache_boot;
				1229
				1230	if (!IS_ENABLED(CONFIG_NUMA) \|\| num_possible_nodes() == 1)
				1231	use_alien_caches = 0;
				1232
				1233	for (i = 0; i < NUM_INIT_LISTS; i++)
				1234	kmem_cache_node_init(&init_kmem_cache_node[i]);
				1235
				1236	/*
				1237	* Fragmentation resistance on low memory - only use bigger
				1238	* page orders on machines with more than 32MB of memory if
				1239	* not overridden on the command line.
				1240	*/
				1241	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
				1242	slab_max_order = SLAB_MAX_ORDER_HI;
				1243
				1244	/* Bootstrap is tricky, because several objects are allocated
				1245	* from caches that do not exist yet:
				1246	* 1) initialize the kmem_cache cache: it contains the struct
				1247	* kmem_cache structures of all caches, except kmem_cache itself:
				1248	* kmem_cache is statically allocated.
				1249	* Initially an __init data area is used for the head array and the
				1250	* kmem_cache_node structures, it's replaced with a kmalloc allocated
				1251	* array at the end of the bootstrap.
				1252	* 2) Create the first kmalloc cache.
				1253	* The struct kmem_cache for the new cache is allocated normally.
				1254	* An __init data area is used for the head array.
				1255	* 3) Create the remaining kmalloc caches, with minimally sized
				1256	* head arrays.
				1257	* 4) Replace the __init data head arrays for kmem_cache and the first
				1258	* kmalloc cache with kmalloc allocated arrays.
				1259	* 5) Replace the __init data for kmem_cache_node for kmem_cache and
				1260	* the other cache's with kmalloc allocated memory.
				1261	* 6) Resize the head arrays of the kmalloc caches to their final sizes.
				1262	*/
				1263
				1264	/* 1) create the kmem_cache */
				1265
				1266	/*
				1267	* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
				1268	*/
				1269	create_boot_cache(kmem_cache, "kmem_cache",
				1270	offsetof(struct kmem_cache, node) +
				1271	nr_node_ids * sizeof(struct kmem_cache_node *),
				1272	SLAB_HWCACHE_ALIGN, 0, 0);
				1273	list_add(&kmem_cache->list, &slab_caches);
				1274	memcg_link_cache(kmem_cache);
				1275	slab_state = PARTIAL;
				1276
				1277	/*
				1278	* Initialize the caches that provide memory for the kmem_cache_node
				1279	* structures first. Without this, further allocations will bug.
				1280	*/
				1281	kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
				1282	kmalloc_info[INDEX_NODE].name,
				1283	kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
				1284	0, kmalloc_size(INDEX_NODE));
				1285	slab_state = PARTIAL_NODE;
				1286	setup_kmalloc_cache_index_table();
				1287
				1288	slab_early_init = 0;
				1289
				1290	/* 5) Replace the bootstrap kmem_cache_node */
				1291	{
				1292	int nid;
				1293
				1294	for_each_online_node(nid) {
				1295	init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
				1296
				1297	init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
				1298	&init_kmem_cache_node[SIZE_NODE + nid], nid);
				1299	}
				1300	}
				1301
				1302	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
				1303	}
				1304
				1305	void __init kmem_cache_init_late(void)
				1306	{
				1307	struct kmem_cache *cachep;
				1308
				1309	/* 6) resize the head arrays to their final sizes */
				1310	mutex_lock(&slab_mutex);
				1311	list_for_each_entry(cachep, &slab_caches, list)
				1312	if (enable_cpucache(cachep, GFP_NOWAIT))
				1313	BUG();
				1314	mutex_unlock(&slab_mutex);
				1315
				1316	/* Done! */
				1317	slab_state = FULL;
				1318
				1319	#ifdef CONFIG_NUMA
				1320	/*
				1321	* Register a memory hotplug callback that initializes and frees
				1322	* node.
				1323	*/
				1324	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
				1325	#endif
				1326
				1327	/*
				1328	* The reap timers are started later, with a module init call: That part
				1329	* of the kernel is not yet operational.
				1330	*/
				1331	}
				1332
				1333	static int __init cpucache_init(void)
				1334	{
				1335	int ret;
				1336
				1337	/*
				1338	* Register the timers that return unneeded pages to the page allocator
				1339	*/
				1340	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
				1341	slab_online_cpu, slab_offline_cpu);
				1342	WARN_ON(ret < 0);
				1343
				1344	return 0;
				1345	}
				1346	__initcall(cpucache_init);
				1347
				1348	static noinline void
				1349	slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
				1350	{
				1351	#if DEBUG
				1352	struct kmem_cache_node *n;
				1353	unsigned long flags;
				1354	int node;
				1355	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				1356	DEFAULT_RATELIMIT_BURST);
				1357
				1358	if ((gfpflags & __GFP_NOWARN) \|\| !__ratelimit(&slab_oom_rs))
				1359	return;
				1360
				1361	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
				1362	nodeid, gfpflags, &gfpflags);
				1363	pr_warn(" cache: %s, object size: %d, order: %d\n",
				1364	cachep->name, cachep->size, cachep->gfporder);
				1365
				1366	for_each_kmem_cache_node(cachep, node, n) {
				1367	unsigned long total_slabs, free_slabs, free_objs;
				1368
				1369	spin_lock_irqsave(&n->list_lock, flags);
				1370	total_slabs = n->total_slabs;
				1371	free_slabs = n->free_slabs;
				1372	free_objs = n->free_objects;
				1373	spin_unlock_irqrestore(&n->list_lock, flags);
				1374
				1375	pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
				1376	node, total_slabs - free_slabs, total_slabs,
				1377	(total_slabs * cachep->num) - free_objs,
				1378	total_slabs * cachep->num);
				1379	}
				1380	#endif
				1381	}
				1382
				1383	/*
				1384	* Interface to system's page allocator. No need to hold the
				1385	* kmem_cache_node ->list_lock.
				1386	*
				1387	* If we requested dmaable memory, we will get it. Even if we
				1388	* did not request dmaable memory, we might get it, but that
				1389	* would be relatively rare and ignorable.
				1390	*/
				1391	static struct page kmem_getpages(struct kmem_cache cachep, gfp_t flags,
				1392	int nodeid)
				1393	{
				1394	struct page *page;
				1395	int nr_pages;
				1396
				1397	flags \|= cachep->allocflags;
				1398
				1399	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
				1400	if (!page) {
				1401	slab_out_of_memory(cachep, flags, nodeid);
				1402	return NULL;
				1403	}
				1404
				1405	if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
				1406	__free_pages(page, cachep->gfporder);
				1407	return NULL;
				1408	}
				1409
				1410	nr_pages = (1 << cachep->gfporder);
				1411	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
				1412	mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
				1413	else
				1414	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
				1415
				1416	__SetPageSlab(page);
				1417	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
				1418	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
				1419	SetPageSlabPfmemalloc(page);
				1420
				1421	return page;
				1422	}
				1423
				1424	/*
				1425	* Interface to system's page release.
				1426	*/
				1427	static void kmem_freepages(struct kmem_cache cachep, struct page page)
				1428	{
				1429	int order = cachep->gfporder;
				1430	unsigned long nr_freed = (1 << order);
				1431
				1432	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
				1433	mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
				1434	else
				1435	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);
				1436
				1437	BUG_ON(!PageSlab(page));
				1438	__ClearPageSlabPfmemalloc(page);
				1439	__ClearPageSlab(page);
				1440	page_mapcount_reset(page);
				1441	page->mapping = NULL;
				1442
				1443	if (current->reclaim_state)
				1444	current->reclaim_state->reclaimed_slab += nr_freed;
				1445	memcg_uncharge_slab(page, order, cachep);
				1446	__free_pages(page, order);
				1447	}
				1448
				1449	static void kmem_rcu_free(struct rcu_head *head)
				1450	{
				1451	struct kmem_cache *cachep;
				1452	struct page *page;
				1453
				1454	page = container_of(head, struct page, rcu_head);
				1455	cachep = page->slab_cache;
				1456
				1457	kmem_freepages(cachep, page);
				1458	}
				1459
				1460	#if DEBUG
				1461	static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
				1462	{
				1463	if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
				1464	(cachep->size % PAGE_SIZE) == 0)
				1465	return true;
				1466
				1467	return false;
				1468	}
				1469
				1470	#ifdef CONFIG_DEBUG_PAGEALLOC
				1471	static void store_stackinfo(struct kmem_cache cachep, unsigned long addr,
				1472	unsigned long caller)
				1473	{
				1474	int size = cachep->object_size;
				1475
				1476	addr = (unsigned long )&((char )addr)[obj_offset(cachep)];
				1477
				1478	if (size < 5 * sizeof(unsigned long))
				1479	return;
				1480
				1481	*addr++ = 0x12345678;
				1482	*addr++ = caller;
				1483	*addr++ = smp_processor_id();
				1484	size -= 3 * sizeof(unsigned long);
				1485	{
				1486	unsigned long *sptr = &caller;
				1487	unsigned long svalue;
				1488
				1489	while (!kstack_end(sptr)) {
				1490	svalue = *sptr++;
				1491	if (kernel_text_address(svalue)) {
				1492	*addr++ = svalue;
				1493	size -= sizeof(unsigned long);
				1494	if (size <= sizeof(unsigned long))
				1495	break;
				1496	}
				1497	}
				1498
				1499	}
				1500	*addr++ = 0x87654321;
				1501	}
				1502
				1503	static void slab_kernel_map(struct kmem_cache cachep, void objp,
				1504	int map, unsigned long caller)
				1505	{
				1506	if (!is_debug_pagealloc_cache(cachep))
				1507	return;
				1508
				1509	if (caller)
				1510	store_stackinfo(cachep, objp, caller);
				1511
				1512	kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
				1513	}
				1514
				1515	#else
				1516	static inline void slab_kernel_map(struct kmem_cache cachep, void objp,
				1517	int map, unsigned long caller) {}
				1518
				1519	#endif
				1520
				1521	static void poison_obj(struct kmem_cache cachep, void addr, unsigned char val)
				1522	{
				1523	int size = cachep->object_size;
				1524	addr = &((char *)addr)[obj_offset(cachep)];
				1525
				1526	memset(addr, val, size);
				1527	(unsigned char )(addr + size - 1) = POISON_END;
				1528	}
				1529
				1530	static void dump_line(char *data, int offset, int limit)
				1531	{
				1532	int i;
				1533	unsigned char error = 0;
				1534	int bad_count = 0;
				1535
				1536	pr_err("%03x: ", offset);
				1537	for (i = 0; i < limit; i++) {
				1538	if (data[offset + i] != POISON_FREE) {
				1539	error = data[offset + i];
				1540	bad_count++;
				1541	}
				1542	}
				1543	print_hex_dump(KERN_CONT, "", 0, 16, 1,
				1544	&data[offset], limit, 1);
				1545
				1546	if (bad_count == 1) {
				1547	error ^= POISON_FREE;
				1548	if (!(error & (error - 1))) {
				1549	pr_err("Single bit error detected. Probably bad RAM.\n");
				1550	#ifdef CONFIG_X86
				1551	pr_err("Run memtest86+ or a similar memory test tool.\n");
				1552	#else
				1553	pr_err("Run a memory test tool.\n");
				1554	#endif
				1555	}
				1556	}
				1557	}
				1558	#endif
				1559
				1560	#if DEBUG
				1561
				1562	static void print_objinfo(struct kmem_cache cachep, void objp, int lines)
				1563	{
				1564	int i, size;
				1565	char *realobj;
				1566
				1567	if (cachep->flags & SLAB_RED_ZONE) {
				1568	pr_err("Redzone: 0x%llx/0x%llx\n",
				1569	*dbg_redzone1(cachep, objp),
				1570	*dbg_redzone2(cachep, objp));
				1571	}
				1572
				1573	if (cachep->flags & SLAB_STORE_USER)
				1574	pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
				1575	realobj = (char *)objp + obj_offset(cachep);
				1576	size = cachep->object_size;
				1577	for (i = 0; i < size && lines; i += 16, lines--) {
				1578	int limit;
				1579	limit = 16;
				1580	if (i + limit > size)
				1581	limit = size - i;
				1582	dump_line(realobj, i, limit);
				1583	}
				1584	}
				1585
				1586	static void check_poison_obj(struct kmem_cache cachep, void objp)
				1587	{
				1588	char *realobj;
				1589	int size, i;
				1590	int lines = 0;
				1591
				1592	if (is_debug_pagealloc_cache(cachep))
				1593	return;
				1594
				1595	realobj = (char *)objp + obj_offset(cachep);
				1596	size = cachep->object_size;
				1597
				1598	for (i = 0; i < size; i++) {
				1599	char exp = POISON_FREE;
				1600	if (i == size - 1)
				1601	exp = POISON_END;
				1602	if (realobj[i] != exp) {
				1603	int limit;
				1604	/* Mismatch ! */
				1605	/* Print header */
				1606	if (lines == 0) {
				1607	pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
				1608	print_tainted(), cachep->name,
				1609	realobj, size);
				1610	print_objinfo(cachep, objp, 0);
				1611	}
				1612	/* Hexdump the affected line */
				1613	i = (i / 16) * 16;
				1614	limit = 16;
				1615	if (i + limit > size)
				1616	limit = size - i;
				1617	dump_line(realobj, i, limit);
				1618	i += 16;
				1619	lines++;
				1620	/* Limit to 5 lines */
				1621	if (lines > 5)
				1622	break;
				1623	}
				1624	}
				1625	if (lines != 0) {
				1626	/* Print some data about the neighboring objects, if they
				1627	* exist:
				1628	*/
				1629	struct page *page = virt_to_head_page(objp);
				1630	unsigned int objnr;
				1631
				1632	objnr = obj_to_index(cachep, page, objp);
				1633	if (objnr) {
				1634	objp = index_to_obj(cachep, page, objnr - 1);
				1635	realobj = (char *)objp + obj_offset(cachep);
				1636	pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
				1637	print_objinfo(cachep, objp, 2);
				1638	}
				1639	if (objnr + 1 < cachep->num) {
				1640	objp = index_to_obj(cachep, page, objnr + 1);
				1641	realobj = (char *)objp + obj_offset(cachep);
				1642	pr_err("Next obj: start=%px, len=%d\n", realobj, size);
				1643	print_objinfo(cachep, objp, 2);
				1644	}
				1645	}
				1646	}
				1647	#endif
				1648
				1649	#if DEBUG
				1650	static void slab_destroy_debugcheck(struct kmem_cache *cachep,
				1651	struct page *page)
				1652	{
				1653	int i;
				1654
				1655	if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
				1656	poison_obj(cachep, page->freelist - obj_offset(cachep),
				1657	POISON_FREE);
				1658	}
				1659
				1660	for (i = 0; i < cachep->num; i++) {
				1661	void *objp = index_to_obj(cachep, page, i);
				1662
				1663	if (cachep->flags & SLAB_POISON) {
				1664	check_poison_obj(cachep, objp);
				1665	slab_kernel_map(cachep, objp, 1, 0);
				1666	}
				1667	if (cachep->flags & SLAB_RED_ZONE) {
				1668	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				1669	slab_error(cachep, "start of a freed object was overwritten");
				1670	if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				1671	slab_error(cachep, "end of a freed object was overwritten");
				1672	}
				1673	}
				1674	}
				1675	#else
				1676	static void slab_destroy_debugcheck(struct kmem_cache *cachep,
				1677	struct page *page)
				1678	{
				1679	}
				1680	#endif
				1681
				1682	/**
				1683	* slab_destroy - destroy and release all objects in a slab
				1684	* @cachep: cache pointer being destroyed
				1685	* @page: page pointer being destroyed
				1686	*
				1687	* Destroy all the objs in a slab page, and release the mem back to the system.
				1688	* Before calling the slab page must have been unlinked from the cache. The
				1689	* kmem_cache_node ->list_lock is not held/needed.
				1690	*/
				1691	static void slab_destroy(struct kmem_cache cachep, struct page page)
				1692	{
				1693	void *freelist;
				1694
				1695	freelist = page->freelist;
				1696	slab_destroy_debugcheck(cachep, page);
				1697	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
				1698	call_rcu(&page->rcu_head, kmem_rcu_free);
				1699	else
				1700	kmem_freepages(cachep, page);
				1701
				1702	/*
				1703	* From now on, we don't use freelist
				1704	* although actual page can be freed in rcu context
				1705	*/
				1706	if (OFF_SLAB(cachep))
				1707	kmem_cache_free(cachep->freelist_cache, freelist);
				1708	}
				1709
				1710	static void slabs_destroy(struct kmem_cache cachep, struct list_head list)
				1711	{
				1712	struct page page, n;
				1713
				1714	list_for_each_entry_safe(page, n, list, lru) {
				1715	list_del(&page->lru);
				1716	slab_destroy(cachep, page);
				1717	}
				1718	}
				1719
				1720	/**
				1721	* calculate_slab_order - calculate size (page order) of slabs
				1722	* @cachep: pointer to the cache that is being created
				1723	* @size: size of objects to be created in this cache.
				1724	* @flags: slab allocation flags
				1725	*
				1726	* Also calculates the number of objects per slab.
				1727	*
				1728	* This could be made much more intelligent. For now, try to avoid using
				1729	* high order pages for slabs. When the gfp() functions are more friendly
				1730	* towards high-order requests, this should be changed.
				1731	*/
				1732	static size_t calculate_slab_order(struct kmem_cache *cachep,
				1733	size_t size, slab_flags_t flags)
				1734	{
				1735	size_t left_over = 0;
				1736	int gfporder;
				1737
				1738	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
				1739	unsigned int num;
				1740	size_t remainder;
				1741
				1742	num = cache_estimate(gfporder, size, flags, &remainder);
				1743	if (!num)
				1744	continue;
				1745
				1746	/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
				1747	if (num > SLAB_OBJ_MAX_NUM)
				1748	break;
				1749
				1750	if (flags & CFLGS_OFF_SLAB) {
				1751	struct kmem_cache *freelist_cache;
				1752	size_t freelist_size;
				1753
				1754	freelist_size = num * sizeof(freelist_idx_t);
				1755	freelist_cache = kmalloc_slab(freelist_size, 0u);
				1756	if (!freelist_cache)
				1757	continue;
				1758
				1759	/*
				1760	* Needed to avoid possible looping condition
				1761	* in cache_grow_begin()
				1762	*/
				1763	if (OFF_SLAB(freelist_cache))
				1764	continue;
				1765
				1766	/* check if off slab has enough benefit */
				1767	if (freelist_cache->size > cachep->size / 2)
				1768	continue;
				1769	}
				1770
				1771	/* Found something acceptable - save it away */
				1772	cachep->num = num;
				1773	cachep->gfporder = gfporder;
				1774	left_over = remainder;
				1775
				1776	/*
				1777	* A VFS-reclaimable slab tends to have most allocations
				1778	* as GFP_NOFS and we really don't want to have to be allocating
				1779	* higher-order pages when we are unable to shrink dcache.
				1780	*/
				1781	if (flags & SLAB_RECLAIM_ACCOUNT)
				1782	break;
				1783
				1784	/*
				1785	* Large number of objects is good, but very large slabs are
				1786	* currently bad for the gfp()s.
				1787	*/
				1788	if (gfporder >= slab_max_order)
				1789	break;
				1790
				1791	/*
				1792	* Acceptable internal fragmentation?
				1793	*/
				1794	if (left_over * 8 <= (PAGE_SIZE << gfporder))
				1795	break;
				1796	}
				1797	return left_over;
				1798	}
				1799
				1800	static struct array_cache __percpu *alloc_kmem_cache_cpus(
				1801	struct kmem_cache *cachep, int entries, int batchcount)
				1802	{
				1803	int cpu;
				1804	size_t size;
				1805	struct array_cache __percpu *cpu_cache;
				1806
				1807	size = sizeof(void ) entries + sizeof(struct array_cache);
				1808	cpu_cache = __alloc_percpu(size, sizeof(void *));
				1809
				1810	if (!cpu_cache)
				1811	return NULL;
				1812
				1813	for_each_possible_cpu(cpu) {
				1814	init_arraycache(per_cpu_ptr(cpu_cache, cpu),
				1815	entries, batchcount);
				1816	}
				1817
				1818	return cpu_cache;
				1819	}
				1820
				1821	static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
				1822	{
				1823	if (slab_state >= FULL)
				1824	return enable_cpucache(cachep, gfp);
				1825
				1826	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
				1827	if (!cachep->cpu_cache)
				1828	return 1;
				1829
				1830	if (slab_state == DOWN) {
				1831	/* Creation of first cache (kmem_cache). */
				1832	set_up_node(kmem_cache, CACHE_CACHE);
				1833	} else if (slab_state == PARTIAL) {
				1834	/* For kmem_cache_node */
				1835	set_up_node(cachep, SIZE_NODE);
				1836	} else {
				1837	int node;
				1838
				1839	for_each_online_node(node) {
				1840	cachep->node[node] = kmalloc_node(
				1841	sizeof(struct kmem_cache_node), gfp, node);
				1842	BUG_ON(!cachep->node[node]);
				1843	kmem_cache_node_init(cachep->node[node]);
				1844	}
				1845	}
				1846
				1847	cachep->node[numa_mem_id()]->next_reap =
				1848	jiffies + REAPTIMEOUT_NODE +
				1849	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				1850
				1851	cpu_cache_get(cachep)->avail = 0;
				1852	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
				1853	cpu_cache_get(cachep)->batchcount = 1;
				1854	cpu_cache_get(cachep)->touched = 0;
				1855	cachep->batchcount = 1;
				1856	cachep->limit = BOOT_CPUCACHE_ENTRIES;
				1857	return 0;
				1858	}
				1859
				1860	slab_flags_t kmem_cache_flags(unsigned int object_size,
				1861	slab_flags_t flags, const char *name,
				1862	void (ctor)(void ))
				1863	{
				1864	return flags;
				1865	}
				1866
				1867	struct kmem_cache *
				1868	__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
				1869	slab_flags_t flags, void (ctor)(void ))
				1870	{
				1871	struct kmem_cache *cachep;
				1872
				1873	cachep = find_mergeable(size, align, flags, name, ctor);
				1874	if (cachep) {
				1875	cachep->refcount++;
				1876
				1877	/*
				1878	* Adjust the object sizes so that we clear
				1879	* the complete object on kzalloc.
				1880	*/
				1881	cachep->object_size = max_t(int, cachep->object_size, size);
				1882	}
				1883	return cachep;
				1884	}
				1885
				1886	static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
				1887	size_t size, slab_flags_t flags)
				1888	{
				1889	size_t left;
				1890
				1891	cachep->num = 0;
				1892
				1893	/*
				1894	* If slab auto-initialization on free is enabled, store the freelist
				1895	* off-slab, so that its contents don't end up in one of the allocated
				1896	* objects.
				1897	*/
				1898	if (unlikely(slab_want_init_on_free(cachep)))
				1899	return false;
				1900
				1901	if (cachep->ctor \|\| flags & SLAB_TYPESAFE_BY_RCU)
				1902	return false;
				1903
				1904	left = calculate_slab_order(cachep, size,
				1905	flags \| CFLGS_OBJFREELIST_SLAB);
				1906	if (!cachep->num)
				1907	return false;
				1908
				1909	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
				1910	return false;
				1911
				1912	cachep->colour = left / cachep->colour_off;
				1913
				1914	return true;
				1915	}
				1916
				1917	static bool set_off_slab_cache(struct kmem_cache *cachep,
				1918	size_t size, slab_flags_t flags)
				1919	{
				1920	size_t left;
				1921
				1922	cachep->num = 0;
				1923
				1924	/*
				1925	* Always use on-slab management when SLAB_NOLEAKTRACE
				1926	* to avoid recursive calls into kmemleak.
				1927	*/
				1928	if (flags & SLAB_NOLEAKTRACE)
				1929	return false;
				1930
				1931	/*
				1932	* Size is large, assume best to place the slab management obj
				1933	* off-slab (should allow better packing of objs).
				1934	*/
				1935	left = calculate_slab_order(cachep, size, flags \| CFLGS_OFF_SLAB);
				1936	if (!cachep->num)
				1937	return false;
				1938
				1939	/*
				1940	* If the slab has been placed off-slab, and we have enough space then
				1941	* move it on-slab. This is at the expense of any extra colouring.
				1942	*/
				1943	if (left >= cachep->num * sizeof(freelist_idx_t))
				1944	return false;
				1945
				1946	cachep->colour = left / cachep->colour_off;
				1947
				1948	return true;
				1949	}
				1950
				1951	static bool set_on_slab_cache(struct kmem_cache *cachep,
				1952	size_t size, slab_flags_t flags)
				1953	{
				1954	size_t left;
				1955
				1956	cachep->num = 0;
				1957
				1958	left = calculate_slab_order(cachep, size, flags);
				1959	if (!cachep->num)
				1960	return false;
				1961
				1962	cachep->colour = left / cachep->colour_off;
				1963
				1964	return true;
				1965	}
				1966
				1967	/**
				1968	* __kmem_cache_create - Create a cache.
				1969	* @cachep: cache management descriptor
				1970	* @flags: SLAB flags
				1971	*
				1972	* Returns a ptr to the cache on success, NULL on failure.
				1973	* Cannot be called within a int, but can be interrupted.
				1974	* The @ctor is run when new pages are allocated by the cache.
				1975	*
				1976	* The flags are
				1977	*
				1978	* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
				1979	* to catch references to uninitialised memory.
				1980	*
				1981	* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
				1982	* for buffer overruns.
				1983	*
				1984	* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
				1985	* cacheline. This can be beneficial if you're counting cycles as closely
				1986	* as davem.
				1987	*/
				1988	int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
				1989	{
				1990	size_t ralign = BYTES_PER_WORD;
				1991	gfp_t gfp;
				1992	int err;
				1993	unsigned int size = cachep->size;
				1994
				1995	#if DEBUG
				1996	#if FORCED_DEBUG
				1997	/*
				1998	* Enable redzoning and last user accounting, except for caches with
				1999	* large objects, if the increased size would increase the object size
				2000	* above the next power of two: caches with object sizes just above a
				2001	* power of two have a significant amount of internal fragmentation.
				2002	*/
				2003	if (size < 4096 \|\| fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
				2004	2 * sizeof(unsigned long long)))
				2005	flags \|= SLAB_RED_ZONE \| SLAB_STORE_USER;
				2006	if (!(flags & SLAB_TYPESAFE_BY_RCU))
				2007	flags \|= SLAB_POISON;
				2008	#endif
				2009	#endif
				2010
				2011	/*
				2012	* Check that size is in terms of words. This is needed to avoid
				2013	* unaligned accesses for some archs when redzoning is used, and makes
				2014	* sure any on-slab bufctl's are also correctly aligned.
				2015	*/
				2016	size = ALIGN(size, BYTES_PER_WORD);
				2017
				2018	if (flags & SLAB_RED_ZONE) {
				2019	ralign = REDZONE_ALIGN;
				2020	/* If redzoning, ensure that the second redzone is suitably
				2021	* aligned, by adjusting the object size accordingly. */
				2022	size = ALIGN(size, REDZONE_ALIGN);
				2023	}
				2024
				2025	/* 3) caller mandated alignment */
				2026	if (ralign < cachep->align) {
				2027	ralign = cachep->align;
				2028	}
				2029	/* disable debug if necessary */
				2030	if (ralign > __alignof__(unsigned long long))
				2031	flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				2032	/*
				2033	* 4) Store it.
				2034	*/
				2035	cachep->align = ralign;
				2036	cachep->colour_off = cache_line_size();
				2037	/* Offset must be a multiple of the alignment. */
				2038	if (cachep->colour_off < cachep->align)
				2039	cachep->colour_off = cachep->align;
				2040
				2041	if (slab_is_available())
				2042	gfp = GFP_KERNEL;
				2043	else
				2044	gfp = GFP_NOWAIT;
				2045
				2046	#if DEBUG
				2047
				2048	/*
				2049	* Both debugging options require word-alignment which is calculated
				2050	* into align above.
				2051	*/
				2052	if (flags & SLAB_RED_ZONE) {
				2053	/* add space for red zone words */
				2054	cachep->obj_offset += sizeof(unsigned long long);
				2055	size += 2 * sizeof(unsigned long long);
				2056	}
				2057	if (flags & SLAB_STORE_USER) {
				2058	/* user store requires one word storage behind the end of
				2059	* the real object. But if the second red zone needs to be
				2060	* aligned to 64 bits, we must allow that much space.
				2061	*/
				2062	if (flags & SLAB_RED_ZONE)
				2063	size += REDZONE_ALIGN;
				2064	else
				2065	size += BYTES_PER_WORD;
				2066	}
				2067	#endif
				2068
				2069	kasan_cache_create(cachep, &size, &flags);
				2070
				2071	size = ALIGN(size, cachep->align);
				2072	/*
				2073	* We should restrict the number of objects in a slab to implement
				2074	* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
				2075	*/
				2076	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
				2077	size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
				2078
				2079	#if DEBUG
				2080	/*
				2081	* To activate debug pagealloc, off-slab management is necessary
				2082	* requirement. In early phase of initialization, small sized slab
				2083	* doesn't get initialized so it would not be possible. So, we need
				2084	* to check size >= 256. It guarantees that all necessary small
				2085	* sized slab is initialized in current slab initialization sequence.
				2086	*/
				2087	if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
				2088	size >= 256 && cachep->object_size > cache_line_size()) {
				2089	if (size < PAGE_SIZE \|\| size % PAGE_SIZE == 0) {
				2090	size_t tmp_size = ALIGN(size, PAGE_SIZE);
				2091
				2092	if (set_off_slab_cache(cachep, tmp_size, flags)) {
				2093	flags \|= CFLGS_OFF_SLAB;
				2094	cachep->obj_offset += tmp_size - size;
				2095	size = tmp_size;
				2096	goto done;
				2097	}
				2098	}
				2099	}
				2100	#endif
				2101
				2102	if (set_objfreelist_slab_cache(cachep, size, flags)) {
				2103	flags \|= CFLGS_OBJFREELIST_SLAB;
				2104	goto done;
				2105	}
				2106
				2107	if (set_off_slab_cache(cachep, size, flags)) {
				2108	flags \|= CFLGS_OFF_SLAB;
				2109	goto done;
				2110	}
				2111
				2112	if (set_on_slab_cache(cachep, size, flags))
				2113	goto done;
				2114
				2115	return -E2BIG;
				2116
				2117	done:
				2118	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
				2119	cachep->flags = flags;
				2120	cachep->allocflags = __GFP_COMP;
				2121	if (flags & SLAB_CACHE_DMA)
				2122	cachep->allocflags \|= GFP_DMA;
				2123	if (flags & SLAB_CACHE_DMA32)
				2124	cachep->allocflags \|= GFP_DMA32;
				2125	if (flags & SLAB_RECLAIM_ACCOUNT)
				2126	cachep->allocflags \|= __GFP_RECLAIMABLE;
				2127	cachep->size = size;
				2128	cachep->reciprocal_buffer_size = reciprocal_value(size);
				2129
				2130	#if DEBUG
				2131	/*
				2132	* If we're going to use the generic kernel_map_pages()
				2133	* poisoning, then it's going to smash the contents of
				2134	* the redzone and userword anyhow, so switch them off.
				2135	*/
				2136	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
				2137	(cachep->flags & SLAB_POISON) &&
				2138	is_debug_pagealloc_cache(cachep))
				2139	cachep->flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				2140	#endif
				2141
				2142	if (OFF_SLAB(cachep)) {
				2143	cachep->freelist_cache =
				2144	kmalloc_slab(cachep->freelist_size, 0u);
				2145	}
				2146
				2147	err = setup_cpu_cache(cachep, gfp);
				2148	if (err) {
				2149	__kmem_cache_release(cachep);
				2150	return err;
				2151	}
				2152
				2153	return 0;
				2154	}
				2155
				2156	#if DEBUG
				2157	static void check_irq_off(void)
				2158	{
				2159	BUG_ON(!irqs_disabled());
				2160	}
				2161
				2162	static void check_irq_on(void)
				2163	{
				2164	BUG_ON(irqs_disabled());
				2165	}
				2166
				2167	static void check_mutex_acquired(void)
				2168	{
				2169	BUG_ON(!mutex_is_locked(&slab_mutex));
				2170	}
				2171
				2172	static void check_spinlock_acquired(struct kmem_cache *cachep)
				2173	{
				2174	#ifdef CONFIG_SMP
				2175	check_irq_off();
				2176	assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
				2177	#endif
				2178	}
				2179
				2180	static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
				2181	{
				2182	#ifdef CONFIG_SMP
				2183	check_irq_off();
				2184	assert_spin_locked(&get_node(cachep, node)->list_lock);
				2185	#endif
				2186	}
				2187
				2188	#else
				2189	#define check_irq_off() do { } while(0)
				2190	#define check_irq_on() do { } while(0)
				2191	#define check_mutex_acquired() do { } while(0)
				2192	#define check_spinlock_acquired(x) do { } while(0)
				2193	#define check_spinlock_acquired_node(x, y) do { } while(0)
				2194	#endif
				2195
				2196	static void drain_array_locked(struct kmem_cache cachep, struct array_cache ac,
				2197	int node, bool free_all, struct list_head *list)
				2198	{
				2199	int tofree;
				2200
				2201	if (!ac \|\| !ac->avail)
				2202	return;
				2203
				2204	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
				2205	if (tofree > ac->avail)
				2206	tofree = (ac->avail + 1) / 2;
				2207
				2208	free_block(cachep, ac->entry, tofree, node, list);
				2209	ac->avail -= tofree;
				2210	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void ) ac->avail);
				2211	}
				2212
				2213	static void do_drain(void *arg)
				2214	{
				2215	struct kmem_cache *cachep = arg;
				2216	struct array_cache *ac;
				2217	int node = numa_mem_id();
				2218	struct kmem_cache_node *n;
				2219	LIST_HEAD(list);
				2220
				2221	check_irq_off();
				2222	ac = cpu_cache_get(cachep);
				2223	n = get_node(cachep, node);
				2224	spin_lock(&n->list_lock);
				2225	free_block(cachep, ac->entry, ac->avail, node, &list);
				2226	spin_unlock(&n->list_lock);
				2227	slabs_destroy(cachep, &list);
				2228	ac->avail = 0;
				2229	}
				2230
				2231	static void drain_cpu_caches(struct kmem_cache *cachep)
				2232	{
				2233	struct kmem_cache_node *n;
				2234	int node;
				2235	LIST_HEAD(list);
				2236
				2237	on_each_cpu(do_drain, cachep, 1);
				2238	check_irq_on();
				2239	for_each_kmem_cache_node(cachep, node, n)
				2240	if (n->alien)
				2241	drain_alien_cache(cachep, n->alien);
				2242
				2243	for_each_kmem_cache_node(cachep, node, n) {
				2244	spin_lock_irq(&n->list_lock);
				2245	drain_array_locked(cachep, n->shared, node, true, &list);
				2246	spin_unlock_irq(&n->list_lock);
				2247
				2248	slabs_destroy(cachep, &list);
				2249	}
				2250	}
				2251
				2252	/*
				2253	* Remove slabs from the list of free slabs.
				2254	* Specify the number of slabs to drain in tofree.
				2255	*
				2256	* Returns the actual number of slabs released.
				2257	*/
				2258	static int drain_freelist(struct kmem_cache *cache,
				2259	struct kmem_cache_node *n, int tofree)
				2260	{
				2261	struct list_head *p;
				2262	int nr_freed;
				2263	struct page *page;
				2264
				2265	nr_freed = 0;
				2266	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
				2267
				2268	spin_lock_irq(&n->list_lock);
				2269	p = n->slabs_free.prev;
				2270	if (p == &n->slabs_free) {
				2271	spin_unlock_irq(&n->list_lock);
				2272	goto out;
				2273	}
				2274
				2275	page = list_entry(p, struct page, lru);
				2276	list_del(&page->lru);
				2277	n->free_slabs--;
				2278	n->total_slabs--;
				2279	/*
				2280	* Safe to drop the lock. The slab is no longer linked
				2281	* to the cache.
				2282	*/
				2283	n->free_objects -= cache->num;
				2284	spin_unlock_irq(&n->list_lock);
				2285	slab_destroy(cache, page);
				2286	nr_freed++;
				2287	}
				2288	out:
				2289	return nr_freed;
				2290	}
				2291
				2292	bool __kmem_cache_empty(struct kmem_cache *s)
				2293	{
				2294	int node;
				2295	struct kmem_cache_node *n;
				2296
				2297	for_each_kmem_cache_node(s, node, n)
				2298	if (!list_empty(&n->slabs_full) \|\|
				2299	!list_empty(&n->slabs_partial))
				2300	return false;
				2301	return true;
				2302	}
				2303
				2304	int __kmem_cache_shrink(struct kmem_cache *cachep)
				2305	{
				2306	int ret = 0;
				2307	int node;
				2308	struct kmem_cache_node *n;
				2309
				2310	drain_cpu_caches(cachep);
				2311
				2312	check_irq_on();
				2313	for_each_kmem_cache_node(cachep, node, n) {
				2314	drain_freelist(cachep, n, INT_MAX);
				2315
				2316	ret += !list_empty(&n->slabs_full) \|\|
				2317	!list_empty(&n->slabs_partial);
				2318	}
				2319	return (ret ? 1 : 0);
				2320	}
				2321
				2322	#ifdef CONFIG_MEMCG
				2323	void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
				2324	{
				2325	__kmem_cache_shrink(cachep);
				2326	}
				2327	#endif
				2328
				2329	int __kmem_cache_shutdown(struct kmem_cache *cachep)
				2330	{
				2331	return __kmem_cache_shrink(cachep);
				2332	}
				2333
				2334	void __kmem_cache_release(struct kmem_cache *cachep)
				2335	{
				2336	int i;
				2337	struct kmem_cache_node *n;
				2338
				2339	cache_random_seq_destroy(cachep);
				2340
				2341	free_percpu(cachep->cpu_cache);
				2342
				2343	/* NUMA: free the node structures */
				2344	for_each_kmem_cache_node(cachep, i, n) {
				2345	kfree(n->shared);
				2346	free_alien_cache(n->alien);
				2347	kfree(n);
				2348	cachep->node[i] = NULL;
				2349	}
				2350	}
				2351
				2352	/*
				2353	* Get the memory for a slab management obj.
				2354	*
				2355	* For a slab cache when the slab descriptor is off-slab, the
				2356	* slab descriptor can't come from the same cache which is being created,
				2357	* Because if it is the case, that means we defer the creation of
				2358	* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
				2359	* And we eventually call down to __kmem_cache_create(), which
				2360	* in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
				2361	* This is a "chicken-and-egg" problem.
				2362	*
				2363	* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
				2364	* which are all initialized during kmem_cache_init().
				2365	*/
				2366	static void alloc_slabmgmt(struct kmem_cache cachep,
				2367	struct page *page, int colour_off,
				2368	gfp_t local_flags, int nodeid)
				2369	{
				2370	void *freelist;
				2371	void *addr = page_address(page);
				2372
				2373	page->s_mem = addr + colour_off;
				2374	page->active = 0;
				2375
				2376	if (OBJFREELIST_SLAB(cachep))
				2377	freelist = NULL;
				2378	else if (OFF_SLAB(cachep)) {
				2379	/* Slab management obj is off-slab. */
				2380	freelist = kmem_cache_alloc_node(cachep->freelist_cache,
				2381	local_flags, nodeid);
				2382	freelist = kasan_reset_tag(freelist);
				2383	if (!freelist)
				2384	return NULL;
				2385	} else {
				2386	/* We will use last bytes at the slab for freelist */
				2387	freelist = addr + (PAGE_SIZE << cachep->gfporder) -
				2388	cachep->freelist_size;
				2389	}
				2390
				2391	return freelist;
				2392	}
				2393
				2394	static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
				2395	{
				2396	return ((freelist_idx_t *)page->freelist)[idx];
				2397	}
				2398
				2399	static inline void set_free_obj(struct page *page,
				2400	unsigned int idx, freelist_idx_t val)
				2401	{
				2402	((freelist_idx_t *)(page->freelist))[idx] = val;
				2403	}
				2404
				2405	static void cache_init_objs_debug(struct kmem_cache cachep, struct page page)
				2406	{
				2407	#if DEBUG
				2408	int i;
				2409
				2410	for (i = 0; i < cachep->num; i++) {
				2411	void *objp = index_to_obj(cachep, page, i);
				2412
				2413	if (cachep->flags & SLAB_STORE_USER)
				2414	*dbg_userword(cachep, objp) = NULL;
				2415
				2416	if (cachep->flags & SLAB_RED_ZONE) {
				2417	*dbg_redzone1(cachep, objp) = RED_INACTIVE;
				2418	*dbg_redzone2(cachep, objp) = RED_INACTIVE;
				2419	}
				2420	/*
				2421	* Constructors are not allowed to allocate memory from the same
				2422	* cache which they are a constructor for. Otherwise, deadlock.
				2423	* They must also be threaded.
				2424	*/
				2425	if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
				2426	kasan_unpoison_object_data(cachep,
				2427	objp + obj_offset(cachep));
				2428	cachep->ctor(objp + obj_offset(cachep));
				2429	kasan_poison_object_data(
				2430	cachep, objp + obj_offset(cachep));
				2431	}
				2432
				2433	if (cachep->flags & SLAB_RED_ZONE) {
				2434	if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				2435	slab_error(cachep, "constructor overwrote the end of an object");
				2436	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				2437	slab_error(cachep, "constructor overwrote the start of an object");
				2438	}
				2439	/* need to poison the objs? */
				2440	if (cachep->flags & SLAB_POISON) {
				2441	poison_obj(cachep, objp, POISON_FREE);
				2442	slab_kernel_map(cachep, objp, 0, 0);
				2443	}
				2444	}
				2445	#endif
				2446	}
				2447
				2448	#ifdef CONFIG_SLAB_FREELIST_RANDOM
				2449	/* Hold information during a freelist initialization */
				2450	union freelist_init_state {
				2451	struct {
				2452	unsigned int pos;
				2453	unsigned int *list;
				2454	unsigned int count;
				2455	};
				2456	struct rnd_state rnd_state;
				2457	};
				2458
				2459	/*
				2460	* Initialize the state based on the randomization methode available.
				2461	* return true if the pre-computed list is available, false otherwize.
				2462	*/
				2463	static bool freelist_state_initialize(union freelist_init_state *state,
				2464	struct kmem_cache *cachep,
				2465	unsigned int count)
				2466	{
				2467	bool ret;
				2468	unsigned int rand;
				2469
				2470	/* Use best entropy available to define a random shift */
				2471	rand = get_random_int();
				2472
				2473	/* Use a random state if the pre-computed list is not available */
				2474	if (!cachep->random_seq) {
				2475	prandom_seed_state(&state->rnd_state, rand);
				2476	ret = false;
				2477	} else {
				2478	state->list = cachep->random_seq;
				2479	state->count = count;
				2480	state->pos = rand % count;
				2481	ret = true;
				2482	}
				2483	return ret;
				2484	}
				2485
				2486	/* Get the next entry on the list and randomize it using a random shift */
				2487	static freelist_idx_t next_random_slot(union freelist_init_state *state)
				2488	{
				2489	if (state->pos >= state->count)
				2490	state->pos = 0;
				2491	return state->list[state->pos++];
				2492	}
				2493
				2494	/* Swap two freelist entries */
				2495	static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
				2496	{
				2497	swap(((freelist_idx_t *)page->freelist)[a],
				2498	((freelist_idx_t *)page->freelist)[b]);
				2499	}
				2500
				2501	/*
				2502	* Shuffle the freelist initialization state based on pre-computed lists.
				2503	* return true if the list was successfully shuffled, false otherwise.
				2504	*/
				2505	static bool shuffle_freelist(struct kmem_cache cachep, struct page page)
				2506	{
				2507	unsigned int objfreelist = 0, i, rand, count = cachep->num;
				2508	union freelist_init_state state;
				2509	bool precomputed;
				2510
				2511	if (count < 2)
				2512	return false;
				2513
				2514	precomputed = freelist_state_initialize(&state, cachep, count);
				2515
				2516	/* Take a random entry as the objfreelist */
				2517	if (OBJFREELIST_SLAB(cachep)) {
				2518	if (!precomputed)
				2519	objfreelist = count - 1;
				2520	else
				2521	objfreelist = next_random_slot(&state);
				2522	page->freelist = index_to_obj(cachep, page, objfreelist) +
				2523	obj_offset(cachep);
				2524	count--;
				2525	}
				2526
				2527	/*
				2528	* On early boot, generate the list dynamically.
				2529	* Later use a pre-computed list for speed.
				2530	*/
				2531	if (!precomputed) {
				2532	for (i = 0; i < count; i++)
				2533	set_free_obj(page, i, i);
				2534
				2535	/* Fisher-Yates shuffle */
				2536	for (i = count - 1; i > 0; i--) {
				2537	rand = prandom_u32_state(&state.rnd_state);
				2538	rand %= (i + 1);
				2539	swap_free_obj(page, i, rand);
				2540	}
				2541	} else {
				2542	for (i = 0; i < count; i++)
				2543	set_free_obj(page, i, next_random_slot(&state));
				2544	}
				2545
				2546	if (OBJFREELIST_SLAB(cachep))
				2547	set_free_obj(page, cachep->num - 1, objfreelist);
				2548
				2549	return true;
				2550	}
				2551	#else
				2552	static inline bool shuffle_freelist(struct kmem_cache *cachep,
				2553	struct page *page)
				2554	{
				2555	return false;
				2556	}
				2557	#endif /* CONFIG_SLAB_FREELIST_RANDOM */
				2558
				2559	static void cache_init_objs(struct kmem_cache *cachep,
				2560	struct page *page)
				2561	{
				2562	int i;
				2563	void *objp;
				2564	bool shuffled;
				2565
				2566	cache_init_objs_debug(cachep, page);
				2567
				2568	/* Try to randomize the freelist if enabled */
				2569	shuffled = shuffle_freelist(cachep, page);
				2570
				2571	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
				2572	page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
				2573	obj_offset(cachep);
				2574	}
				2575
				2576	for (i = 0; i < cachep->num; i++) {
				2577	objp = index_to_obj(cachep, page, i);
				2578	objp = kasan_init_slab_obj(cachep, objp);
				2579
				2580	/* constructor could break poison info */
				2581	if (DEBUG == 0 && cachep->ctor) {
				2582	kasan_unpoison_object_data(cachep, objp);
				2583	cachep->ctor(objp);
				2584	kasan_poison_object_data(cachep, objp);
				2585	}
				2586
				2587	if (!shuffled)
				2588	set_free_obj(page, i, i);
				2589	}
				2590	}
				2591
				2592	static void slab_get_obj(struct kmem_cache cachep, struct page *page)
				2593	{
				2594	void *objp;
				2595
				2596	objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
				2597	page->active++;
				2598
				2599	#if DEBUG
				2600	if (cachep->flags & SLAB_STORE_USER)
				2601	set_store_user_dirty(cachep);
				2602	#endif
				2603
				2604	return objp;
				2605	}
				2606
				2607	static void slab_put_obj(struct kmem_cache *cachep,
				2608	struct page page, void objp)
				2609	{
				2610	unsigned int objnr = obj_to_index(cachep, page, objp);
				2611	#if DEBUG
				2612	unsigned int i;
				2613
				2614	/* Verify double free bug */
				2615	for (i = page->active; i < cachep->num; i++) {
				2616	if (get_free_obj(page, i) == objnr) {
				2617	pr_err("slab: double free detected in cache '%s', objp %px\n",
				2618	cachep->name, objp);
				2619	BUG();
				2620	}
				2621	}
				2622	#endif
				2623	page->active--;
				2624	if (!page->freelist)
				2625	page->freelist = objp + obj_offset(cachep);
				2626
				2627	set_free_obj(page, page->active, objnr);
				2628	}
				2629
				2630	/*
				2631	* Map pages beginning at addr to the given cache and slab. This is required
				2632	* for the slab allocator to be able to lookup the cache and slab of a
				2633	* virtual address for kfree, ksize, and slab debugging.
				2634	*/
				2635	static void slab_map_pages(struct kmem_cache cache, struct page page,
				2636	void *freelist)
				2637	{
				2638	page->slab_cache = cache;
				2639	page->freelist = freelist;
				2640	}
				2641
				2642	/*
				2643	* Grow (by 1) the number of slabs within a cache. This is called by
				2644	* kmem_cache_alloc() when there are no active objs left in a cache.
				2645	*/
				2646	static struct page cache_grow_begin(struct kmem_cache cachep,
				2647	gfp_t flags, int nodeid)
				2648	{
				2649	void *freelist;
				2650	size_t offset;
				2651	gfp_t local_flags;
				2652	int page_node;
				2653	struct kmem_cache_node *n;
				2654	struct page *page;
				2655
				2656	/*
				2657	* Be lazy and only check for valid flags here, keeping it out of the
				2658	* critical path in kmem_cache_alloc().
				2659	*/
				2660	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
				2661	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
				2662	flags &= ~GFP_SLAB_BUG_MASK;
				2663	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
				2664	invalid_mask, &invalid_mask, flags, &flags);
				2665	dump_stack();
				2666	}
				2667	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
				2668	local_flags = flags & (GFP_CONSTRAINT_MASK\|GFP_RECLAIM_MASK);
				2669
				2670	check_irq_off();
				2671	if (gfpflags_allow_blocking(local_flags))
				2672	local_irq_enable();
				2673
				2674	/*
				2675	* Get mem for the objs. Attempt to allocate a physical page from
				2676	* 'nodeid'.
				2677	*/
				2678	page = kmem_getpages(cachep, local_flags, nodeid);
				2679	if (!page)
				2680	goto failed;
				2681
				2682	page_node = page_to_nid(page);
				2683	n = get_node(cachep, page_node);
				2684
				2685	/* Get colour for the slab, and cal the next value. */
				2686	n->colour_next++;
				2687	if (n->colour_next >= cachep->colour)
				2688	n->colour_next = 0;
				2689
				2690	offset = n->colour_next;
				2691	if (offset >= cachep->colour)
				2692	offset = 0;
				2693
				2694	offset *= cachep->colour_off;
				2695
				2696	/*
				2697	* Call kasan_poison_slab() before calling alloc_slabmgmt(), so
				2698	* page_address() in the latter returns a non-tagged pointer,
				2699	* as it should be for slab pages.
				2700	*/
				2701	kasan_poison_slab(page);
				2702
				2703	/* Get slab management. */
				2704	freelist = alloc_slabmgmt(cachep, page, offset,
				2705	local_flags & ~GFP_CONSTRAINT_MASK, page_node);
				2706	if (OFF_SLAB(cachep) && !freelist)
				2707	goto opps1;
				2708
				2709	slab_map_pages(cachep, page, freelist);
				2710
				2711	cache_init_objs(cachep, page);
				2712
				2713	if (gfpflags_allow_blocking(local_flags))
				2714	local_irq_disable();
				2715
				2716	return page;
				2717
				2718	opps1:
				2719	kmem_freepages(cachep, page);
				2720	failed:
				2721	if (gfpflags_allow_blocking(local_flags))
				2722	local_irq_disable();
				2723	return NULL;
				2724	}
				2725
				2726	static void cache_grow_end(struct kmem_cache cachep, struct page page)
				2727	{
				2728	struct kmem_cache_node *n;
				2729	void *list = NULL;
				2730
				2731	check_irq_off();
				2732
				2733	if (!page)
				2734	return;
				2735
				2736	INIT_LIST_HEAD(&page->lru);
				2737	n = get_node(cachep, page_to_nid(page));
				2738
				2739	spin_lock(&n->list_lock);
				2740	n->total_slabs++;
				2741	if (!page->active) {
				2742	list_add_tail(&page->lru, &(n->slabs_free));
				2743	n->free_slabs++;
				2744	} else
				2745	fixup_slab_list(cachep, n, page, &list);
				2746
				2747	STATS_INC_GROWN(cachep);
				2748	n->free_objects += cachep->num - page->active;
				2749	spin_unlock(&n->list_lock);
				2750
				2751	fixup_objfreelist_debug(cachep, &list);
				2752	}
				2753
				2754	#if DEBUG
				2755
				2756	/*
				2757	* Perform extra freeing checks:
				2758	* - detect bad pointers.
				2759	* - POISON/RED_ZONE checking
				2760	*/
				2761	static void kfree_debugcheck(const void *objp)
				2762	{
				2763	if (!virt_addr_valid(objp)) {
				2764	pr_err("kfree_debugcheck: out of range ptr %lxh\n",
				2765	(unsigned long)objp);
				2766	BUG();
				2767	}
				2768	}
				2769
				2770	static inline void verify_redzone_free(struct kmem_cache cache, void obj)
				2771	{
				2772	unsigned long long redzone1, redzone2;
				2773
				2774	redzone1 = *dbg_redzone1(cache, obj);
				2775	redzone2 = *dbg_redzone2(cache, obj);
				2776
				2777	/*
				2778	* Redzone is ok.
				2779	*/
				2780	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
				2781	return;
				2782
				2783	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
				2784	slab_error(cache, "double free detected");
				2785	else
				2786	slab_error(cache, "memory outside object was overwritten");
				2787
				2788	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
				2789	obj, redzone1, redzone2);
				2790	}
				2791
				2792	static void cache_free_debugcheck(struct kmem_cache cachep, void *objp,
				2793	unsigned long caller)
				2794	{
				2795	unsigned int objnr;
				2796	struct page *page;
				2797
				2798	BUG_ON(virt_to_cache(objp) != cachep);
				2799
				2800	objp -= obj_offset(cachep);
				2801	kfree_debugcheck(objp);
				2802	page = virt_to_head_page(objp);
				2803
				2804	if (cachep->flags & SLAB_RED_ZONE) {
				2805	verify_redzone_free(cachep, objp);
				2806	*dbg_redzone1(cachep, objp) = RED_INACTIVE;
				2807	*dbg_redzone2(cachep, objp) = RED_INACTIVE;
				2808	}
				2809	if (cachep->flags & SLAB_STORE_USER) {
				2810	set_store_user_dirty(cachep);
				2811	dbg_userword(cachep, objp) = (void )caller;
				2812	}
				2813
				2814	objnr = obj_to_index(cachep, page, objp);
				2815
				2816	BUG_ON(objnr >= cachep->num);
				2817	BUG_ON(objp != index_to_obj(cachep, page, objnr));
				2818
				2819	if (cachep->flags & SLAB_POISON) {
				2820	poison_obj(cachep, objp, POISON_FREE);
				2821	slab_kernel_map(cachep, objp, 0, caller);
				2822	}
				2823	return objp;
				2824	}
				2825
				2826	#else
				2827	#define kfree_debugcheck(x) do { } while(0)
				2828	#define cache_free_debugcheck(x,objp,z) (objp)
				2829	#endif
				2830
				2831	static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
				2832	void **list)
				2833	{
				2834	#if DEBUG
				2835	void next = list;
				2836	void *objp;
				2837
				2838	while (next) {
				2839	objp = next - obj_offset(cachep);
				2840	next = (void *)next;
				2841	poison_obj(cachep, objp, POISON_FREE);
				2842	}
				2843	#endif
				2844	}
				2845
				2846	static inline void fixup_slab_list(struct kmem_cache *cachep,
				2847	struct kmem_cache_node n, struct page page,
				2848	void **list)
				2849	{
				2850	/* move slabp to correct slabp list: */
				2851	list_del(&page->lru);
				2852	if (page->active == cachep->num) {
				2853	list_add(&page->lru, &n->slabs_full);
				2854	if (OBJFREELIST_SLAB(cachep)) {
				2855	#if DEBUG
				2856	/* Poisoning will be done without holding the lock */
				2857	if (cachep->flags & SLAB_POISON) {
				2858	void **objp = page->freelist;
				2859
				2860	objp = list;
				2861	*list = objp;
				2862	}
				2863	#endif
				2864	page->freelist = NULL;
				2865	}
				2866	} else
				2867	list_add(&page->lru, &n->slabs_partial);
				2868	}
				2869
				2870	/* Try to find non-pfmemalloc slab if needed */
				2871	static noinline struct page get_valid_first_slab(struct kmem_cache_node n,
				2872	struct page *page, bool pfmemalloc)
				2873	{
				2874	if (!page)
				2875	return NULL;
				2876
				2877	if (pfmemalloc)
				2878	return page;
				2879
				2880	if (!PageSlabPfmemalloc(page))
				2881	return page;
				2882
				2883	/* No need to keep pfmemalloc slab if we have enough free objects */
				2884	if (n->free_objects > n->free_limit) {
				2885	ClearPageSlabPfmemalloc(page);
				2886	return page;
				2887	}
				2888
				2889	/* Move pfmemalloc slab to the end of list to speed up next search */
				2890	list_del(&page->lru);
				2891	if (!page->active) {
				2892	list_add_tail(&page->lru, &n->slabs_free);
				2893	n->free_slabs++;
				2894	} else
				2895	list_add_tail(&page->lru, &n->slabs_partial);
				2896
				2897	list_for_each_entry(page, &n->slabs_partial, lru) {
				2898	if (!PageSlabPfmemalloc(page))
				2899	return page;
				2900	}
				2901
				2902	n->free_touched = 1;
				2903	list_for_each_entry(page, &n->slabs_free, lru) {
				2904	if (!PageSlabPfmemalloc(page)) {
				2905	n->free_slabs--;
				2906	return page;
				2907	}
				2908	}
				2909
				2910	return NULL;
				2911	}
				2912
				2913	static struct page get_first_slab(struct kmem_cache_node n, bool pfmemalloc)
				2914	{
				2915	struct page *page;
				2916
				2917	assert_spin_locked(&n->list_lock);
				2918	page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
				2919	if (!page) {
				2920	n->free_touched = 1;
				2921	page = list_first_entry_or_null(&n->slabs_free, struct page,
				2922	lru);
				2923	if (page)
				2924	n->free_slabs--;
				2925	}
				2926
				2927	if (sk_memalloc_socks())
				2928	page = get_valid_first_slab(n, page, pfmemalloc);
				2929
				2930	return page;
				2931	}
				2932
				2933	static noinline void cache_alloc_pfmemalloc(struct kmem_cache cachep,
				2934	struct kmem_cache_node *n, gfp_t flags)
				2935	{
				2936	struct page *page;
				2937	void *obj;
				2938	void *list = NULL;
				2939
				2940	if (!gfp_pfmemalloc_allowed(flags))
				2941	return NULL;
				2942
				2943	spin_lock(&n->list_lock);
				2944	page = get_first_slab(n, true);
				2945	if (!page) {
				2946	spin_unlock(&n->list_lock);
				2947	return NULL;
				2948	}
				2949
				2950	obj = slab_get_obj(cachep, page);
				2951	n->free_objects--;
				2952
				2953	fixup_slab_list(cachep, n, page, &list);
				2954
				2955	spin_unlock(&n->list_lock);
				2956	fixup_objfreelist_debug(cachep, &list);
				2957
				2958	return obj;
				2959	}
				2960
				2961	/*
				2962	* Slab list should be fixed up by fixup_slab_list() for existing slab
				2963	* or cache_grow_end() for new slab
				2964	*/
				2965	static __always_inline int alloc_block(struct kmem_cache *cachep,
				2966	struct array_cache ac, struct page page, int batchcount)
				2967	{
				2968	/*
				2969	* There must be at least one object available for
				2970	* allocation.
				2971	*/
				2972	BUG_ON(page->active >= cachep->num);
				2973
				2974	while (page->active < cachep->num && batchcount--) {
				2975	STATS_INC_ALLOCED(cachep);
				2976	STATS_INC_ACTIVE(cachep);
				2977	STATS_SET_HIGH(cachep);
				2978
				2979	ac->entry[ac->avail++] = slab_get_obj(cachep, page);
				2980	}
				2981
				2982	return batchcount;
				2983	}
				2984
				2985	static void cache_alloc_refill(struct kmem_cache cachep, gfp_t flags)
				2986	{
				2987	int batchcount;
				2988	struct kmem_cache_node *n;
				2989	struct array_cache ac, shared;
				2990	int node;
				2991	void *list = NULL;
				2992	struct page *page;
				2993
				2994	check_irq_off();
				2995	node = numa_mem_id();
				2996
				2997	ac = cpu_cache_get(cachep);
				2998	batchcount = ac->batchcount;
				2999	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
				3000	/*
				3001	* If there was little recent activity on this cache, then
				3002	* perform only a partial refill. Otherwise we could generate
				3003	* refill bouncing.
				3004	*/
				3005	batchcount = BATCHREFILL_LIMIT;
				3006	}
				3007	n = get_node(cachep, node);
				3008
				3009	BUG_ON(ac->avail > 0 \|\| !n);
				3010	shared = READ_ONCE(n->shared);
				3011	if (!n->free_objects && (!shared \|\| !shared->avail))
				3012	goto direct_grow;
				3013
				3014	spin_lock(&n->list_lock);
				3015	shared = READ_ONCE(n->shared);
				3016
				3017	/* See if we can refill from the shared array */
				3018	if (shared && transfer_objects(ac, shared, batchcount)) {
				3019	shared->touched = 1;
				3020	goto alloc_done;
				3021	}
				3022
				3023	while (batchcount > 0) {
				3024	/* Get slab alloc is to come from. */
				3025	page = get_first_slab(n, false);
				3026	if (!page)
				3027	goto must_grow;
				3028
				3029	check_spinlock_acquired(cachep);
				3030
				3031	batchcount = alloc_block(cachep, ac, page, batchcount);
				3032	fixup_slab_list(cachep, n, page, &list);
				3033	}
				3034
				3035	must_grow:
				3036	n->free_objects -= ac->avail;
				3037	alloc_done:
				3038	spin_unlock(&n->list_lock);
				3039	fixup_objfreelist_debug(cachep, &list);
				3040
				3041	direct_grow:
				3042	if (unlikely(!ac->avail)) {
				3043	/* Check if we can use obj in pfmemalloc slab */
				3044	if (sk_memalloc_socks()) {
				3045	void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
				3046
				3047	if (obj)
				3048	return obj;
				3049	}
				3050
				3051	page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
				3052
				3053	/*
				3054	* cache_grow_begin() can reenable interrupts,
				3055	* then ac could change.
				3056	*/
				3057	ac = cpu_cache_get(cachep);
				3058	if (!ac->avail && page)
				3059	alloc_block(cachep, ac, page, batchcount);
				3060	cache_grow_end(cachep, page);
				3061
				3062	if (!ac->avail)
				3063	return NULL;
				3064	}
				3065	ac->touched = 1;
				3066
				3067	return ac->entry[--ac->avail];
				3068	}
				3069
				3070	static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
				3071	gfp_t flags)
				3072	{
				3073	might_sleep_if(gfpflags_allow_blocking(flags));
				3074	}
				3075
				3076	#if DEBUG
				3077	static void cache_alloc_debugcheck_after(struct kmem_cache cachep,
				3078	gfp_t flags, void *objp, unsigned long caller)
				3079	{
				3080	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
				3081	if (!objp)
				3082	return objp;
				3083	if (cachep->flags & SLAB_POISON) {
				3084	check_poison_obj(cachep, objp);
				3085	slab_kernel_map(cachep, objp, 1, 0);
				3086	poison_obj(cachep, objp, POISON_INUSE);
				3087	}
				3088	if (cachep->flags & SLAB_STORE_USER)
				3089	dbg_userword(cachep, objp) = (void )caller;
				3090
				3091	if (cachep->flags & SLAB_RED_ZONE) {
				3092	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE \|\|
				3093	*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
				3094	slab_error(cachep, "double free, or memory outside object was overwritten");
				3095	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
				3096	objp, *dbg_redzone1(cachep, objp),
				3097	*dbg_redzone2(cachep, objp));
				3098	}
				3099	*dbg_redzone1(cachep, objp) = RED_ACTIVE;
				3100	*dbg_redzone2(cachep, objp) = RED_ACTIVE;
				3101	}
				3102
				3103	objp += obj_offset(cachep);
				3104	if (cachep->ctor && cachep->flags & SLAB_POISON)
				3105	cachep->ctor(objp);
				3106	if (ARCH_SLAB_MINALIGN &&
				3107	((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
				3108	pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
				3109	objp, (int)ARCH_SLAB_MINALIGN);
				3110	}
				3111	return objp;
				3112	}
				3113	#else
				3114	#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
				3115	#endif
				3116
				3117	static inline void ____cache_alloc(struct kmem_cache cachep, gfp_t flags)
				3118	{
				3119	void *objp;
				3120	struct array_cache *ac;
				3121
				3122	check_irq_off();
				3123
				3124	ac = cpu_cache_get(cachep);
				3125	if (likely(ac->avail)) {
				3126	ac->touched = 1;
				3127	objp = ac->entry[--ac->avail];
				3128
				3129	STATS_INC_ALLOCHIT(cachep);
				3130	goto out;
				3131	}
				3132
				3133	STATS_INC_ALLOCMISS(cachep);
				3134	objp = cache_alloc_refill(cachep, flags);
				3135	/*
				3136	* the 'ac' may be updated by cache_alloc_refill(),
				3137	* and kmemleak_erase() requires its correct value.
				3138	*/
				3139	ac = cpu_cache_get(cachep);
				3140
				3141	out:
				3142	/*
				3143	* To avoid a false negative, if an object that is in one of the
				3144	* per-CPU caches is leaked, we need to make sure kmemleak doesn't
				3145	* treat the array pointers as a reference to the object.
				3146	*/
				3147	if (objp)
				3148	kmemleak_erase(&ac->entry[ac->avail]);
				3149	return objp;
				3150	}
				3151
				3152	#ifdef CONFIG_NUMA
				3153	/*
				3154	* Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
				3155	*
				3156	* If we are in_interrupt, then process context, including cpusets and
				3157	* mempolicy, may not apply and should not be used for allocation policy.
				3158	*/
				3159	static void alternate_node_alloc(struct kmem_cache cachep, gfp_t flags)
				3160	{
				3161	int nid_alloc, nid_here;
				3162
				3163	if (in_interrupt() \|\| (flags & __GFP_THISNODE))
				3164	return NULL;
				3165	nid_alloc = nid_here = numa_mem_id();
				3166	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
				3167	nid_alloc = cpuset_slab_spread_node();
				3168	else if (current->mempolicy)
				3169	nid_alloc = mempolicy_slab_node();
				3170	if (nid_alloc != nid_here)
				3171	return ____cache_alloc_node(cachep, flags, nid_alloc);
				3172	return NULL;
				3173	}
				3174
				3175	/*
				3176	* Fallback function if there was no memory available and no objects on a
				3177	* certain node and fall back is permitted. First we scan all the
				3178	* available node for available objects. If that fails then we
				3179	* perform an allocation without specifying a node. This allows the page
				3180	* allocator to do its reclaim / fallback magic. We then insert the
				3181	* slab into the proper nodelist and then allocate from it.
				3182	*/
				3183	static void fallback_alloc(struct kmem_cache cache, gfp_t flags)
				3184	{
				3185	struct zonelist *zonelist;
				3186	struct zoneref *z;
				3187	struct zone *zone;
				3188	enum zone_type high_zoneidx = gfp_zone(flags);
				3189	void *obj = NULL;
				3190	struct page *page;
				3191	int nid;
				3192	unsigned int cpuset_mems_cookie;
				3193
				3194	if (flags & __GFP_THISNODE)
				3195	return NULL;
				3196
				3197	retry_cpuset:
				3198	cpuset_mems_cookie = read_mems_allowed_begin();
				3199	zonelist = node_zonelist(mempolicy_slab_node(), flags);
				3200
				3201	retry:
				3202	/*
				3203	* Look through allowed nodes for objects available
				3204	* from existing per node queues.
				3205	*/
				3206	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
				3207	nid = zone_to_nid(zone);
				3208
				3209	if (cpuset_zone_allowed(zone, flags) &&
				3210	get_node(cache, nid) &&
				3211	get_node(cache, nid)->free_objects) {
				3212	obj = ____cache_alloc_node(cache,
				3213	gfp_exact_node(flags), nid);
				3214	if (obj)
				3215	break;
				3216	}
				3217	}
				3218
				3219	if (!obj) {
				3220	/*
				3221	* This allocation will be performed within the constraints
				3222	* of the current cpuset / memory policy requirements.
				3223	* We may trigger various forms of reclaim on the allowed
				3224	* set and go into memory reserves if necessary.
				3225	*/
				3226	page = cache_grow_begin(cache, flags, numa_mem_id());
				3227	cache_grow_end(cache, page);
				3228	if (page) {
				3229	nid = page_to_nid(page);
				3230	obj = ____cache_alloc_node(cache,
				3231	gfp_exact_node(flags), nid);
				3232
				3233	/*
				3234	* Another processor may allocate the objects in
				3235	* the slab since we are not holding any locks.
				3236	*/
				3237	if (!obj)
				3238	goto retry;
				3239	}
				3240	}
				3241
				3242	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
				3243	goto retry_cpuset;
				3244	return obj;
				3245	}
				3246
				3247	/*
				3248	* A interface to enable slab creation on nodeid
				3249	*/
				3250	static void ____cache_alloc_node(struct kmem_cache cachep, gfp_t flags,
				3251	int nodeid)
				3252	{
				3253	struct page *page;
				3254	struct kmem_cache_node *n;
				3255	void *obj = NULL;
				3256	void *list = NULL;
				3257
				3258	VM_BUG_ON(nodeid < 0 \|\| nodeid >= MAX_NUMNODES);
				3259	n = get_node(cachep, nodeid);
				3260	BUG_ON(!n);
				3261
				3262	check_irq_off();
				3263	spin_lock(&n->list_lock);
				3264	page = get_first_slab(n, false);
				3265	if (!page)
				3266	goto must_grow;
				3267
				3268	check_spinlock_acquired_node(cachep, nodeid);
				3269
				3270	STATS_INC_NODEALLOCS(cachep);
				3271	STATS_INC_ACTIVE(cachep);
				3272	STATS_SET_HIGH(cachep);
				3273
				3274	BUG_ON(page->active == cachep->num);
				3275
				3276	obj = slab_get_obj(cachep, page);
				3277	n->free_objects--;
				3278
				3279	fixup_slab_list(cachep, n, page, &list);
				3280
				3281	spin_unlock(&n->list_lock);
				3282	fixup_objfreelist_debug(cachep, &list);
				3283	return obj;
				3284
				3285	must_grow:
				3286	spin_unlock(&n->list_lock);
				3287	page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
				3288	if (page) {
				3289	/* This slab isn't counted yet so don't update free_objects */
				3290	obj = slab_get_obj(cachep, page);
				3291	}
				3292	cache_grow_end(cachep, page);
				3293
				3294	return obj ? obj : fallback_alloc(cachep, flags);
				3295	}
				3296
				3297	static __always_inline void *
				3298	slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
				3299	unsigned long caller)
				3300	{
				3301	unsigned long save_flags;
				3302	void *ptr;
				3303	int slab_node = numa_mem_id();
				3304
				3305	flags &= gfp_allowed_mask;
				3306	cachep = slab_pre_alloc_hook(cachep, flags);
				3307	if (unlikely(!cachep))
				3308	return NULL;
				3309
				3310	cache_alloc_debugcheck_before(cachep, flags);
				3311	local_irq_save(save_flags);
				3312
				3313	if (nodeid == NUMA_NO_NODE)
				3314	nodeid = slab_node;
				3315
				3316	if (unlikely(!get_node(cachep, nodeid))) {
				3317	/* Node not bootstrapped yet */
				3318	ptr = fallback_alloc(cachep, flags);
				3319	goto out;
				3320	}
				3321
				3322	if (nodeid == slab_node) {
				3323	/*
				3324	* Use the locally cached objects if possible.
				3325	* However ____cache_alloc does not allow fallback
				3326	* to other nodes. It may fail while we still have
				3327	* objects on other nodes available.
				3328	*/
				3329	ptr = ____cache_alloc(cachep, flags);
				3330	if (ptr)
				3331	goto out;
				3332	}
				3333	/* ___cache_alloc_node can fall back to other nodes */
				3334	ptr = ____cache_alloc_node(cachep, flags, nodeid);
				3335	out:
				3336	local_irq_restore(save_flags);
				3337	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
				3338
				3339	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
				3340	memset(ptr, 0, cachep->object_size);
				3341
				3342	slab_post_alloc_hook(cachep, flags, 1, &ptr);
				3343	return ptr;
				3344	}
				3345
				3346	static __always_inline void *
				3347	__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
				3348	{
				3349	void *objp;
				3350
				3351	if (current->mempolicy \|\| cpuset_do_slab_mem_spread()) {
				3352	objp = alternate_node_alloc(cache, flags);
				3353	if (objp)
				3354	goto out;
				3355	}
				3356	objp = ____cache_alloc(cache, flags);
				3357
				3358	/*
				3359	* We may just have run out of memory on the local node.
				3360	* ____cache_alloc_node() knows how to locate memory on other nodes
				3361	*/
				3362	if (!objp)
				3363	objp = ____cache_alloc_node(cache, flags, numa_mem_id());
				3364
				3365	out:
				3366	return objp;
				3367	}
				3368	#else
				3369
				3370	static __always_inline void *
				3371	__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
				3372	{
				3373	return ____cache_alloc(cachep, flags);
				3374	}
				3375
				3376	#endif /* CONFIG_NUMA */
				3377
				3378	static __always_inline void *
				3379	slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
				3380	{
				3381	unsigned long save_flags;
				3382	void *objp;
				3383
				3384	flags &= gfp_allowed_mask;
				3385	cachep = slab_pre_alloc_hook(cachep, flags);
				3386	if (unlikely(!cachep))
				3387	return NULL;
				3388
				3389	cache_alloc_debugcheck_before(cachep, flags);
				3390	local_irq_save(save_flags);
				3391	objp = __do_cache_alloc(cachep, flags);
				3392	local_irq_restore(save_flags);
				3393	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
				3394	prefetchw(objp);
				3395
				3396	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
				3397	memset(objp, 0, cachep->object_size);
				3398
				3399	slab_post_alloc_hook(cachep, flags, 1, &objp);
				3400	return objp;
				3401	}
				3402
				3403	/*
				3404	* Caller needs to acquire correct kmem_cache_node's list_lock
				3405	* @list: List of detached free slabs should be freed by caller
				3406	*/
				3407	static void free_block(struct kmem_cache cachep, void *objpp,
				3408	int nr_objects, int node, struct list_head *list)
				3409	{
				3410	int i;
				3411	struct kmem_cache_node *n = get_node(cachep, node);
				3412	struct page *page;
				3413
				3414	n->free_objects += nr_objects;
				3415
				3416	for (i = 0; i < nr_objects; i++) {
				3417	void *objp;
				3418	struct page *page;
				3419
				3420	objp = objpp[i];
				3421
				3422	page = virt_to_head_page(objp);
				3423	list_del(&page->lru);
				3424	check_spinlock_acquired_node(cachep, node);
				3425	slab_put_obj(cachep, page, objp);
				3426	STATS_DEC_ACTIVE(cachep);
				3427
				3428	/* fixup slab chains */
				3429	if (page->active == 0) {
				3430	list_add(&page->lru, &n->slabs_free);
				3431	n->free_slabs++;
				3432	} else {
				3433	/* Unconditionally move a slab to the end of the
				3434	* partial list on free - maximum time for the
				3435	* other objects to be freed, too.
				3436	*/
				3437	list_add_tail(&page->lru, &n->slabs_partial);
				3438	}
				3439	}
				3440
				3441	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
				3442	n->free_objects -= cachep->num;
				3443
				3444	page = list_last_entry(&n->slabs_free, struct page, lru);
				3445	list_move(&page->lru, list);
				3446	n->free_slabs--;
				3447	n->total_slabs--;
				3448	}
				3449	}
				3450
				3451	static void cache_flusharray(struct kmem_cache cachep, struct array_cache ac)
				3452	{
				3453	int batchcount;
				3454	struct kmem_cache_node *n;
				3455	int node = numa_mem_id();
				3456	LIST_HEAD(list);
				3457
				3458	batchcount = ac->batchcount;
				3459
				3460	check_irq_off();
				3461	n = get_node(cachep, node);
				3462	spin_lock(&n->list_lock);
				3463	if (n->shared) {
				3464	struct array_cache *shared_array = n->shared;
				3465	int max = shared_array->limit - shared_array->avail;
				3466	if (max) {
				3467	if (batchcount > max)
				3468	batchcount = max;
				3469	memcpy(&(shared_array->entry[shared_array->avail]),
				3470	ac->entry, sizeof(void ) batchcount);
				3471	shared_array->avail += batchcount;
				3472	goto free_done;
				3473	}
				3474	}
				3475
				3476	free_block(cachep, ac->entry, batchcount, node, &list);
				3477	free_done:
				3478	#if STATS
				3479	{
				3480	int i = 0;
				3481	struct page *page;
				3482
				3483	list_for_each_entry(page, &n->slabs_free, lru) {
				3484	BUG_ON(page->active);
				3485
				3486	i++;
				3487	}
				3488	STATS_SET_FREEABLE(cachep, i);
				3489	}
				3490	#endif
				3491	spin_unlock(&n->list_lock);
				3492	slabs_destroy(cachep, &list);
				3493	ac->avail -= batchcount;
				3494	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void )ac->avail);
				3495	}
				3496
				3497	/*
				3498	* Release an obj back to its cache. If the obj has a constructed state, it must
				3499	* be in this state _before_ it is released. Called with disabled ints.
				3500	*/
				3501	static __always_inline void __cache_free(struct kmem_cache cachep, void objp,
				3502	unsigned long caller)
				3503	{
				3504	/* Put the object into the quarantine, don't touch it for now. */
				3505	if (kasan_slab_free(cachep, objp, _RET_IP_))
				3506	return;
				3507
				3508	___cache_free(cachep, objp, caller);
				3509	}
				3510
				3511	void ___cache_free(struct kmem_cache cachep, void objp,
				3512	unsigned long caller)
				3513	{
				3514	struct array_cache *ac = cpu_cache_get(cachep);
				3515
				3516	check_irq_off();
				3517	if (unlikely(slab_want_init_on_free(cachep)))
				3518	memset(objp, 0, cachep->object_size);
				3519	kmemleak_free_recursive(objp, cachep->flags);
				3520	objp = cache_free_debugcheck(cachep, objp, caller);
				3521
				3522	/*
				3523	* Skip calling cache_free_alien() when the platform is not numa.
				3524	* This will avoid cache misses that happen while accessing slabp (which
				3525	* is per page memory reference) to get nodeid. Instead use a global
				3526	* variable to skip the call, which is mostly likely to be present in
				3527	* the cache.
				3528	*/
				3529	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
				3530	return;
				3531
				3532	if (ac->avail < ac->limit) {
				3533	STATS_INC_FREEHIT(cachep);
				3534	} else {
				3535	STATS_INC_FREEMISS(cachep);
				3536	cache_flusharray(cachep, ac);
				3537	}
				3538
				3539	if (sk_memalloc_socks()) {
				3540	struct page *page = virt_to_head_page(objp);
				3541
				3542	if (unlikely(PageSlabPfmemalloc(page))) {
				3543	cache_free_pfmemalloc(cachep, page, objp);
				3544	return;
				3545	}
				3546	}
				3547
				3548	ac->entry[ac->avail++] = objp;
				3549	}
				3550
				3551	/**
				3552	* kmem_cache_alloc - Allocate an object
				3553	* @cachep: The cache to allocate from.
				3554	* @flags: See kmalloc().
				3555	*
				3556	* Allocate an object from this cache. The flags are only relevant
				3557	* if the cache has no available objects.
				3558	*/
				3559	void kmem_cache_alloc(struct kmem_cache cachep, gfp_t flags)
				3560	{
				3561	void *ret = slab_alloc(cachep, flags, _RET_IP_);
				3562
				3563	trace_kmem_cache_alloc(_RET_IP_, ret,
				3564	cachep->object_size, cachep->size, flags);
				3565
				3566	return ret;
				3567	}
				3568	EXPORT_SYMBOL(kmem_cache_alloc);
				3569
				3570	static __always_inline void
				3571	cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
				3572	size_t size, void **p, unsigned long caller)
				3573	{
				3574	size_t i;
				3575
				3576	for (i = 0; i < size; i++)
				3577	p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
				3578	}
				3579
				3580	int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
				3581	void **p)
				3582	{
				3583	size_t i;
				3584
				3585	s = slab_pre_alloc_hook(s, flags);
				3586	if (!s)
				3587	return 0;
				3588
				3589	cache_alloc_debugcheck_before(s, flags);
				3590
				3591	local_irq_disable();
				3592	for (i = 0; i < size; i++) {
				3593	void *objp = __do_cache_alloc(s, flags);
				3594
				3595	if (unlikely(!objp))
				3596	goto error;
				3597	p[i] = objp;
				3598	}
				3599	local_irq_enable();
				3600
				3601	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
				3602
				3603	/* Clear memory outside IRQ disabled section */
				3604	if (unlikely(slab_want_init_on_alloc(flags, s)))
				3605	for (i = 0; i < size; i++)
				3606	memset(p[i], 0, s->object_size);
				3607
				3608	slab_post_alloc_hook(s, flags, size, p);
				3609	/* FIXME: Trace call missing. Christoph would like a bulk variant */
				3610	return size;
				3611	error:
				3612	local_irq_enable();
				3613	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
				3614	slab_post_alloc_hook(s, flags, i, p);
				3615	__kmem_cache_free_bulk(s, i, p);
				3616	return 0;
				3617	}
				3618	EXPORT_SYMBOL(kmem_cache_alloc_bulk);
				3619
				3620	#ifdef CONFIG_TRACING
				3621	void *
				3622	kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
				3623	{
				3624	void *ret;
				3625
				3626	ret = slab_alloc(cachep, flags, _RET_IP_);
				3627
				3628	ret = kasan_kmalloc(cachep, ret, size, flags);
				3629	trace_kmalloc(_RET_IP_, ret,
				3630	size, cachep->size, flags);
				3631	return ret;
				3632	}
				3633	EXPORT_SYMBOL(kmem_cache_alloc_trace);
				3634	#endif
				3635
				3636	#ifdef CONFIG_NUMA
				3637	/**
				3638	* kmem_cache_alloc_node - Allocate an object on the specified node
				3639	* @cachep: The cache to allocate from.
				3640	* @flags: See kmalloc().
				3641	* @nodeid: node number of the target node.
				3642	*
				3643	* Identical to kmem_cache_alloc but it will allocate memory on the given
				3644	* node, which can improve the performance for cpu bound structures.
				3645	*
				3646	* Fallback to other node is possible if __GFP_THISNODE is not set.
				3647	*/
				3648	void kmem_cache_alloc_node(struct kmem_cache cachep, gfp_t flags, int nodeid)
				3649	{
				3650	void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
				3651
				3652	trace_kmem_cache_alloc_node(_RET_IP_, ret,
				3653	cachep->object_size, cachep->size,
				3654	flags, nodeid);
				3655
				3656	return ret;
				3657	}
				3658	EXPORT_SYMBOL(kmem_cache_alloc_node);
				3659
				3660	#ifdef CONFIG_TRACING
				3661	void kmem_cache_alloc_node_trace(struct kmem_cache cachep,
				3662	gfp_t flags,
				3663	int nodeid,
				3664	size_t size)
				3665	{
				3666	void *ret;
				3667
				3668	ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
				3669
				3670	ret = kasan_kmalloc(cachep, ret, size, flags);
				3671	trace_kmalloc_node(_RET_IP_, ret,
				3672	size, cachep->size,
				3673	flags, nodeid);
				3674	return ret;
				3675	}
				3676	EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
				3677	#endif
				3678
				3679	static __always_inline void *
				3680	__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
				3681	{
				3682	struct kmem_cache *cachep;
				3683	void *ret;
				3684
				3685	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				3686	return NULL;
				3687	cachep = kmalloc_slab(size, flags);
				3688	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				3689	return cachep;
				3690	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
				3691	ret = kasan_kmalloc(cachep, ret, size, flags);
				3692
				3693	return ret;
				3694	}
				3695
				3696	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3697	{
				3698	return __do_kmalloc_node(size, flags, node, _RET_IP_);
				3699	}
				3700	EXPORT_SYMBOL(__kmalloc_node);
				3701
				3702	void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
				3703	int node, unsigned long caller)
				3704	{
				3705	return __do_kmalloc_node(size, flags, node, caller);
				3706	}
				3707	EXPORT_SYMBOL(__kmalloc_node_track_caller);
				3708	#endif /* CONFIG_NUMA */
				3709
				3710	/**
				3711	* __do_kmalloc - allocate memory
				3712	* @size: how many bytes of memory are required.
				3713	* @flags: the type of memory to allocate (see kmalloc).
				3714	* @caller: function caller for debug tracking of the caller
				3715	*/
				3716	static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
				3717	unsigned long caller)
				3718	{
				3719	struct kmem_cache *cachep;
				3720	void *ret;
				3721
				3722	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				3723	return NULL;
				3724	cachep = kmalloc_slab(size, flags);
				3725	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				3726	return cachep;
				3727	ret = slab_alloc(cachep, flags, caller);
				3728
				3729	ret = kasan_kmalloc(cachep, ret, size, flags);
				3730	trace_kmalloc(caller, ret,
				3731	size, cachep->size, flags);
				3732
				3733	return ret;
				3734	}
				3735
				3736	void *__kmalloc(size_t size, gfp_t flags)
				3737	{
				3738	return __do_kmalloc(size, flags, _RET_IP_);
				3739	}
				3740	EXPORT_SYMBOL(__kmalloc);
				3741
				3742	void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
				3743	{
				3744	return __do_kmalloc(size, flags, caller);
				3745	}
				3746	EXPORT_SYMBOL(__kmalloc_track_caller);
				3747
				3748	/**
				3749	* kmem_cache_free - Deallocate an object
				3750	* @cachep: The cache the allocation was from.
				3751	* @objp: The previously allocated object.
				3752	*
				3753	* Free an object which was previously allocated from this
				3754	* cache.
				3755	*/
				3756	void kmem_cache_free(struct kmem_cache cachep, void objp)
				3757	{
				3758	unsigned long flags;
				3759	cachep = cache_from_obj(cachep, objp);
				3760	if (!cachep)
				3761	return;
				3762
				3763	local_irq_save(flags);
				3764	debug_check_no_locks_freed(objp, cachep->object_size);
				3765	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
				3766	debug_check_no_obj_freed(objp, cachep->object_size);
				3767	__cache_free(cachep, objp, _RET_IP_);
				3768	local_irq_restore(flags);
				3769
				3770	trace_kmem_cache_free(_RET_IP_, objp);
				3771	}
				3772	EXPORT_SYMBOL(kmem_cache_free);
				3773
				3774	void kmem_cache_free_bulk(struct kmem_cache orig_s, size_t size, void *p)
				3775	{
				3776	struct kmem_cache *s;
				3777	size_t i;
				3778
				3779	local_irq_disable();
				3780	for (i = 0; i < size; i++) {
				3781	void *objp = p[i];
				3782
				3783	if (!orig_s) /* called via kfree_bulk */
				3784	s = virt_to_cache(objp);
				3785	else
				3786	s = cache_from_obj(orig_s, objp);
				3787
				3788	debug_check_no_locks_freed(objp, s->object_size);
				3789	if (!(s->flags & SLAB_DEBUG_OBJECTS))
				3790	debug_check_no_obj_freed(objp, s->object_size);
				3791
				3792	__cache_free(s, objp, _RET_IP_);
				3793	}
				3794	local_irq_enable();
				3795
				3796	/* FIXME: add tracing */
				3797	}
				3798	EXPORT_SYMBOL(kmem_cache_free_bulk);
				3799
				3800	/**
				3801	* kfree - free previously allocated memory
				3802	* @objp: pointer returned by kmalloc.
				3803	*
				3804	* If @objp is NULL, no operation is performed.
				3805	*
				3806	* Don't free memory not originally allocated by kmalloc()
				3807	* or you will run into trouble.
				3808	*/
				3809	void kfree(const void *objp)
				3810	{
				3811	struct kmem_cache *c;
				3812	unsigned long flags;
				3813
				3814	trace_kfree(_RET_IP_, objp);
				3815
				3816	if (unlikely(ZERO_OR_NULL_PTR(objp)))
				3817	return;
				3818	local_irq_save(flags);
				3819	kfree_debugcheck(objp);
				3820	c = virt_to_cache(objp);
				3821	debug_check_no_locks_freed(objp, c->object_size);
				3822
				3823	debug_check_no_obj_freed(objp, c->object_size);
				3824	__cache_free(c, (void *)objp, _RET_IP_);
				3825	local_irq_restore(flags);
				3826	}
				3827	EXPORT_SYMBOL(kfree);
				3828
				3829	/*
				3830	* This initializes kmem_cache_node or resizes various caches for all nodes.
				3831	*/
				3832	static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
				3833	{
				3834	int ret;
				3835	int node;
				3836	struct kmem_cache_node *n;
				3837
				3838	for_each_online_node(node) {
				3839	ret = setup_kmem_cache_node(cachep, node, gfp, true);
				3840	if (ret)
				3841	goto fail;
				3842
				3843	}
				3844
				3845	return 0;
				3846
				3847	fail:
				3848	if (!cachep->list.next) {
				3849	/* Cache is not active yet. Roll back what we did */
				3850	node--;
				3851	while (node >= 0) {
				3852	n = get_node(cachep, node);
				3853	if (n) {
				3854	kfree(n->shared);
				3855	free_alien_cache(n->alien);
				3856	kfree(n);
				3857	cachep->node[node] = NULL;
				3858	}
				3859	node--;
				3860	}
				3861	}
				3862	return -ENOMEM;
				3863	}
				3864
				3865	/* Always called with the slab_mutex held */
				3866	static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
				3867	int batchcount, int shared, gfp_t gfp)
				3868	{
				3869	struct array_cache __percpu cpu_cache, prev;
				3870	int cpu;
				3871
				3872	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
				3873	if (!cpu_cache)
				3874	return -ENOMEM;
				3875
				3876	prev = cachep->cpu_cache;
				3877	cachep->cpu_cache = cpu_cache;
				3878	/*
				3879	* Without a previous cpu_cache there's no need to synchronize remote
				3880	* cpus, so skip the IPIs.
				3881	*/
				3882	if (prev)
				3883	kick_all_cpus_sync();
				3884
				3885	check_irq_on();
				3886	cachep->batchcount = batchcount;
				3887	cachep->limit = limit;
				3888	cachep->shared = shared;
				3889
				3890	if (!prev)
				3891	goto setup_node;
				3892
				3893	for_each_online_cpu(cpu) {
				3894	LIST_HEAD(list);
				3895	int node;
				3896	struct kmem_cache_node *n;
				3897	struct array_cache *ac = per_cpu_ptr(prev, cpu);
				3898
				3899	node = cpu_to_mem(cpu);
				3900	n = get_node(cachep, node);
				3901	spin_lock_irq(&n->list_lock);
				3902	free_block(cachep, ac->entry, ac->avail, node, &list);
				3903	spin_unlock_irq(&n->list_lock);
				3904	slabs_destroy(cachep, &list);
				3905	}
				3906	free_percpu(prev);
				3907
				3908	setup_node:
				3909	return setup_kmem_cache_nodes(cachep, gfp);
				3910	}
				3911
				3912	static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
				3913	int batchcount, int shared, gfp_t gfp)
				3914	{
				3915	int ret;
				3916	struct kmem_cache *c;
				3917
				3918	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
				3919
				3920	if (slab_state < FULL)
				3921	return ret;
				3922
				3923	if ((ret < 0) \|\| !is_root_cache(cachep))
				3924	return ret;
				3925
				3926	lockdep_assert_held(&slab_mutex);
				3927	for_each_memcg_cache(c, cachep) {
				3928	/* return value determined by the root cache only */
				3929	__do_tune_cpucache(c, limit, batchcount, shared, gfp);
				3930	}
				3931
				3932	return ret;
				3933	}
				3934
				3935	/* Called with slab_mutex held always */
				3936	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
				3937	{
				3938	int err;
				3939	int limit = 0;
				3940	int shared = 0;
				3941	int batchcount = 0;
				3942
				3943	err = cache_random_seq_create(cachep, cachep->num, gfp);
				3944	if (err)
				3945	goto end;
				3946
				3947	if (!is_root_cache(cachep)) {
				3948	struct kmem_cache *root = memcg_root_cache(cachep);
				3949	limit = root->limit;
				3950	shared = root->shared;
				3951	batchcount = root->batchcount;
				3952	}
				3953
				3954	if (limit && shared && batchcount)
				3955	goto skip_setup;
				3956	/*
				3957	* The head array serves three purposes:
				3958	* - create a LIFO ordering, i.e. return objects that are cache-warm
				3959	* - reduce the number of spinlock operations.
				3960	* - reduce the number of linked list operations on the slab and
				3961	* bufctl chains: array operations are cheaper.
				3962	* The numbers are guessed, we should auto-tune as described by
				3963	* Bonwick.
				3964	*/
				3965	if (cachep->size > 131072)
				3966	limit = 1;
				3967	else if (cachep->size > PAGE_SIZE)
				3968	limit = 8;
				3969	else if (cachep->size > 1024)
				3970	limit = 24;
				3971	else if (cachep->size > 256)
				3972	limit = 54;
				3973	else
				3974	limit = 120;
				3975
				3976	/*
				3977	* CPU bound tasks (e.g. network routing) can exhibit cpu bound
				3978	* allocation behaviour: Most allocs on one cpu, most free operations
				3979	* on another cpu. For these cases, an efficient object passing between
				3980	* cpus is necessary. This is provided by a shared array. The array
				3981	* replaces Bonwick's magazine layer.
				3982	* On uniprocessor, it's functionally equivalent (but less efficient)
				3983	* to a larger limit. Thus disabled by default.
				3984	*/
				3985	shared = 0;
				3986	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
				3987	shared = 8;
				3988
				3989	#if DEBUG
				3990	/*
				3991	* With debugging enabled, large batchcount lead to excessively long
				3992	* periods with disabled local interrupts. Limit the batchcount
				3993	*/
				3994	if (limit > 32)
				3995	limit = 32;
				3996	#endif
				3997	batchcount = (limit + 1) / 2;
				3998	skip_setup:
				3999	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
				4000	end:
				4001	if (err)
				4002	pr_err("enable_cpucache failed for %s, error %d\n",
				4003	cachep->name, -err);
				4004	return err;
				4005	}
				4006
				4007	/*
				4008	* Drain an array if it contains any elements taking the node lock only if
				4009	* necessary. Note that the node listlock also protects the array_cache
				4010	* if drain_array() is used on the shared array.
				4011	*/
				4012	static void drain_array(struct kmem_cache cachep, struct kmem_cache_node n,
				4013	struct array_cache *ac, int node)
				4014	{
				4015	LIST_HEAD(list);
				4016
				4017	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
				4018	check_mutex_acquired();
				4019
				4020	if (!ac \|\| !ac->avail)
				4021	return;
				4022
				4023	if (ac->touched) {
				4024	ac->touched = 0;
				4025	return;
				4026	}
				4027
				4028	spin_lock_irq(&n->list_lock);
				4029	drain_array_locked(cachep, ac, node, false, &list);
				4030	spin_unlock_irq(&n->list_lock);
				4031
				4032	slabs_destroy(cachep, &list);
				4033	}
				4034
				4035	/**
				4036	* cache_reap - Reclaim memory from caches.
				4037	* @w: work descriptor
				4038	*
				4039	* Called from workqueue/eventd every few seconds.
				4040	* Purpose:
				4041	* - clear the per-cpu caches for this CPU.
				4042	* - return freeable pages to the main free memory pool.
				4043	*
				4044	* If we cannot acquire the cache chain mutex then just give up - we'll try
				4045	* again on the next iteration.
				4046	*/
				4047	static void cache_reap(struct work_struct *w)
				4048	{
				4049	struct kmem_cache *searchp;
				4050	struct kmem_cache_node *n;
				4051	int node = numa_mem_id();
				4052	struct delayed_work *work = to_delayed_work(w);
				4053
				4054	if (!mutex_trylock(&slab_mutex))
				4055	/* Give up. Setup the next iteration. */
				4056	goto out;
				4057
				4058	list_for_each_entry(searchp, &slab_caches, list) {
				4059	check_irq_on();
				4060
				4061	/*
				4062	* We only take the node lock if absolutely necessary and we
				4063	* have established with reasonable certainty that
				4064	* we can do some work if the lock was obtained.
				4065	*/
				4066	n = get_node(searchp, node);
				4067
				4068	reap_alien(searchp, n);
				4069
				4070	drain_array(searchp, n, cpu_cache_get(searchp), node);
				4071
				4072	/*
				4073	* These are racy checks but it does not matter
				4074	* if we skip one check or scan twice.
				4075	*/
				4076	if (time_after(n->next_reap, jiffies))
				4077	goto next;
				4078
				4079	n->next_reap = jiffies + REAPTIMEOUT_NODE;
				4080
				4081	drain_array(searchp, n, n->shared, node);
				4082
				4083	if (n->free_touched)
				4084	n->free_touched = 0;
				4085	else {
				4086	int freed;
				4087
				4088	freed = drain_freelist(searchp, n, (n->free_limit +
				4089	5 * searchp->num - 1) / (5 * searchp->num));
				4090	STATS_ADD_REAPED(searchp, freed);
				4091	}
				4092	next:
				4093	cond_resched();
				4094	}
				4095	check_irq_on();
				4096	mutex_unlock(&slab_mutex);
				4097	next_reap_node();
				4098	out:
				4099	/* Set up the next iteration */
				4100	schedule_delayed_work_on(smp_processor_id(), work,
				4101	round_jiffies_relative(REAPTIMEOUT_AC));
				4102	}
				4103
				4104	void get_slabinfo(struct kmem_cache cachep, struct slabinfo sinfo)
				4105	{
				4106	unsigned long active_objs, num_objs, active_slabs;
				4107	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
				4108	unsigned long free_slabs = 0;
				4109	int node;
				4110	struct kmem_cache_node *n;
				4111
				4112	for_each_kmem_cache_node(cachep, node, n) {
				4113	check_irq_on();
				4114	spin_lock_irq(&n->list_lock);
				4115
				4116	total_slabs += n->total_slabs;
				4117	free_slabs += n->free_slabs;
				4118	free_objs += n->free_objects;
				4119
				4120	if (n->shared)
				4121	shared_avail += n->shared->avail;
				4122
				4123	spin_unlock_irq(&n->list_lock);
				4124	}
				4125	num_objs = total_slabs * cachep->num;
				4126	active_slabs = total_slabs - free_slabs;
				4127	active_objs = num_objs - free_objs;
				4128
				4129	sinfo->active_objs = active_objs;
				4130	sinfo->num_objs = num_objs;
				4131	sinfo->active_slabs = active_slabs;
				4132	sinfo->num_slabs = total_slabs;
				4133	sinfo->shared_avail = shared_avail;
				4134	sinfo->limit = cachep->limit;
				4135	sinfo->batchcount = cachep->batchcount;
				4136	sinfo->shared = cachep->shared;
				4137	sinfo->objects_per_slab = cachep->num;
				4138	sinfo->cache_order = cachep->gfporder;
				4139	}
				4140
				/*
				 * Append the extended statistics of @cachep to the seq_file @m: one
				 * " : globalstat ..." group followed by one " : cpustat ..." group.
				 * When STATS is not enabled the function emits nothing.
				 */
				void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
				{
				#if STATS
					{			/* node stats */
						unsigned long high = cachep->high_mark;
						unsigned long allocs = cachep->num_allocations;
						unsigned long grown = cachep->grown;
						unsigned long reaped = cachep->reaped;
						unsigned long errors = cachep->errors;
						unsigned long max_freeable = cachep->max_freeable;
						unsigned long node_allocs = cachep->node_allocs;
						unsigned long node_frees = cachep->node_frees;
						unsigned long overflows = cachep->node_overflow;

						seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
							   allocs, high, grown,
							   reaped, errors, max_freeable, node_allocs,
							   node_frees, overflows);
					}
					/* cpu stats */
					{
						unsigned long allochit = atomic_read(&cachep->allochit);
						unsigned long allocmiss = atomic_read(&cachep->allocmiss);
						unsigned long freehit = atomic_read(&cachep->freehit);
						unsigned long freemiss = atomic_read(&cachep->freemiss);

						seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
							   allochit, allocmiss, freehit, freemiss);
					}
				#endif
				}
				4172
				4173	#define MAX_SLABINFO_WRITE 128
				4174	/**
				4175	* slabinfo_write - Tuning for the slab allocator
				4176	* @file: unused
				4177	* @buffer: user buffer
				4178	* @count: data length
				4179	* @ppos: unused
				4180	*/
				4181	ssize_t slabinfo_write(struct file file, const char __user buffer,
				4182	size_t count, loff_t *ppos)
				4183	{
				4184	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
				4185	int limit, batchcount, shared, res;
				4186	struct kmem_cache *cachep;
				4187
				4188	if (count > MAX_SLABINFO_WRITE)
				4189	return -EINVAL;
				4190	if (copy_from_user(&kbuf, buffer, count))
				4191	return -EFAULT;
				4192	kbuf[MAX_SLABINFO_WRITE] = '\0';
				4193
				4194	tmp = strchr(kbuf, ' ');
				4195	if (!tmp)
				4196	return -EINVAL;
				4197	*tmp = '\0';
				4198	tmp++;
				4199	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
				4200	return -EINVAL;
				4201
				4202	/* Find the cache in the chain of caches. */
				4203	mutex_lock(&slab_mutex);
				4204	res = -EINVAL;
				4205	list_for_each_entry(cachep, &slab_caches, list) {
				4206	if (!strcmp(cachep->name, kbuf)) {
				4207	if (limit < 1 \|\| batchcount < 1 \|\|
				4208	batchcount > limit \|\| shared < 0) {
				4209	res = 0;
				4210	} else {
				4211	res = do_tune_cpucache(cachep, limit,
				4212	batchcount, shared,
				4213	GFP_KERNEL);
				4214	}
				4215	break;
				4216	}
				4217	}
				4218	mutex_unlock(&slab_mutex);
				4219	if (res >= 0)
				4220	res = count;
				4221	return res;
				4222	}
				4223
				4224	#ifdef CONFIG_DEBUG_SLAB_LEAK
				4225
				/*
				 * Record one occurrence of caller address @v in the table @n.
				 *
				 * Table layout: n[0] is the capacity in entries, n[1] the number of entries
				 * in use, and from n[2] on follow (address, hit count) pairs kept sorted by
				 * ascending address.
				 *
				 * Returns 1 if @v was zero (ignored), was already present (its count is
				 * bumped) or was inserted. Returns 0 when the table is full; n[1] has then
				 * been raised to n[0] and nothing was inserted.
				 */
				static inline int add_caller(unsigned long *n, unsigned long v)
				{
					unsigned long *p;
					int l;

					if (!v)
						return 1;
					l = n[1];
					p = n + 2;
					/* Binary search over the l pairs starting at p. */
					while (l) {
						int i = l / 2;
						unsigned long *q = p + 2 * i;

						if (*q == v) {
							q[1]++;
							return 1;
						}
						if (*q > v) {
							l = i;
						} else {
							p = q + 2;
							l -= i + 1;
						}
					}
					if (++n[1] == n[0])
						return 0;
					/*
					 * Open a one-pair gap at p. With n[1] already incremented, the old
					 * table ends n[1] * 2 words after n, so the tail to shift is that
					 * many bytes minus the byte offset of p.
					 */
					memmove(p + 2, p,
						n[1] * 2 * sizeof(unsigned long) - ((char *)p - (char *)n));
					p[0] = v;
					p[1] = 1;
					return 1;
				}
				4255
				4256	static void handle_slab(unsigned long n, struct kmem_cache c,
				4257	struct page *page)
				4258	{
				4259	void *p;
				4260	int i, j;
				4261	unsigned long v;
				4262
				4263	if (n[0] == n[1])
				4264	return;
				4265	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
				4266	bool active = true;
				4267
				4268	for (j = page->active; j < c->num; j++) {
				4269	if (get_free_obj(page, j) == i) {
				4270	active = false;
				4271	break;
				4272	}
				4273	}
				4274
				4275	if (!active)
				4276	continue;
				4277
				4278	/*
				4279	* probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
				4280	* mapping is established when actual object allocation and
				4281	* we could mistakenly access the unmapped object in the cpu
				4282	* cache.
				4283	*/
				4284	if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
				4285	continue;
				4286
				4287	if (!add_caller(n, v))
				4288	return;
				4289	}
				4290	}
				4291
				/*
				 * Print @address to @m. With CONFIG_KALLSYMS, a successful symbol lookup
				 * prints "name+offset/size", followed by " [module]" when a module name was
				 * returned. Otherwise the raw pointer value is printed.
				 */
				static void show_symbol(struct seq_file *m, unsigned long address)
				{
				#ifdef CONFIG_KALLSYMS
					char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
					unsigned long offset, size;

					if (!lookup_symbol_attrs(address, &size, &offset, modname, name)) {
						seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
						if (modname[0] != '\0')
							seq_printf(m, " [%s]", modname);
						return;
					}
				#endif
					/* No symbol information available: fall back to the address. */
					seq_printf(m, "%px", (void *)address);
				}
				4307
				4308	static int leaks_show(struct seq_file m, void p)
				4309	{
				4310	struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
				4311	root_caches_node);
				4312	struct page *page;
				4313	struct kmem_cache_node *n;
				4314	const char *name;
				4315	unsigned long *x = m->private;
				4316	int node;
				4317	int i;
				4318
				4319	if (!(cachep->flags & SLAB_STORE_USER))
				4320	return 0;
				4321	if (!(cachep->flags & SLAB_RED_ZONE))
				4322	return 0;
				4323
				4324	/*
				4325	* Set store_user_clean and start to grab stored user information
				4326	* for all objects on this cache. If some alloc/free requests comes
				4327	* during the processing, information would be wrong so restart
				4328	* whole processing.
				4329	*/
				4330	do {
				4331	drain_cpu_caches(cachep);
				4332	/*
				4333	* drain_cpu_caches() could make kmemleak_object and
				4334	* debug_objects_cache dirty, so reset afterwards.
				4335	*/
				4336	set_store_user_clean(cachep);
				4337
				4338	x[1] = 0;
				4339
				4340	for_each_kmem_cache_node(cachep, node, n) {
				4341
				4342	check_irq_on();
				4343	spin_lock_irq(&n->list_lock);
				4344
				4345	list_for_each_entry(page, &n->slabs_full, lru)
				4346	handle_slab(x, cachep, page);
				4347	list_for_each_entry(page, &n->slabs_partial, lru)
				4348	handle_slab(x, cachep, page);
				4349	spin_unlock_irq(&n->list_lock);
				4350	}
				4351	} while (!is_store_user_clean(cachep));
				4352
				4353	name = cachep->name;
				4354	if (x[0] == x[1]) {
				4355	/* Increase the buffer size */
				4356	mutex_unlock(&slab_mutex);
				4357	m->private = kcalloc(x[0] * 4, sizeof(unsigned long),
				4358	GFP_KERNEL);
				4359	if (!m->private) {
				4360	/* Too bad, we are really out */
				4361	m->private = x;
				4362	mutex_lock(&slab_mutex);
				4363	return -ENOMEM;
				4364	}
				4365	(unsigned long )m->private = x[0] * 2;
				4366	kfree(x);
				4367	mutex_lock(&slab_mutex);
				4368	/* Now make sure this entry will be retried */
				4369	m->count = m->size;
				4370	return 0;
				4371	}
				4372	for (i = 0; i < x[1]; i++) {
				4373	seq_printf(m, "%s: %lu ", name, x[2*i+3]);
				4374	show_symbol(m, x[2*i+2]);
				4375	seq_putc(m, '\n');
				4376	}
				4377
				4378	return 0;
				4379	}
				4380
				4381	static const struct seq_operations slabstats_op = {
				4382	.start = slab_start,
				4383	.next = slab_next,
				4384	.stop = slab_stop,
				4385	.show = leaks_show,
				4386	};
				4387
				4388	static int slabstats_open(struct inode inode, struct file file)
				4389	{
				4390	unsigned long *n;
				4391
				4392	n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
				4393	if (!n)
				4394	return -ENOMEM;
				4395
				4396	n = PAGE_SIZE / (2 sizeof(unsigned long));
				4397
				4398	return 0;
				4399	}
				4400
				4401	static const struct file_operations proc_slabstats_operations = {
				4402	.open = slabstats_open,
				4403	.read = seq_read,
				4404	.llseek = seq_lseek,
				4405	.release = seq_release_private,
				4406	};
				4407	#endif
				4408
				4409	static int __init slab_proc_init(void)
				4410	{
				4411	#ifdef CONFIG_DEBUG_SLAB_LEAK
				4412	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
				4413	#endif
				4414	return 0;
				4415	}
				4416	module_init(slab_proc_init);
				4417
				4418	#ifdef CONFIG_HARDENED_USERCOPY
				4419	/*
				4420	* Rejects incorrectly sized objects and objects that are to be copied
				4421	* to/from userspace but do not fall entirely within the containing slab
				4422	* cache's usercopy region.
				4423	*
				4424	* Returns NULL if check passes, otherwise const char * to name of cache
				4425	* to indicate an error.
				4426	*/
				4427	void __check_heap_object(const void ptr, unsigned long n, struct page page,
				4428	bool to_user)
				4429	{
				4430	struct kmem_cache *cachep;
				4431	unsigned int objnr;
				4432	unsigned long offset;
				4433
				4434	ptr = kasan_reset_tag(ptr);
				4435
				4436	/* Find and validate object. */
				4437	cachep = page->slab_cache;
				4438	objnr = obj_to_index(cachep, page, (void *)ptr);
				4439	BUG_ON(objnr >= cachep->num);
				4440
				4441	/* Find offset within object. */
				4442	offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
				4443
				4444	/* Allow address range falling entirely within usercopy region. */
				4445	if (offset >= cachep->useroffset &&
				4446	offset - cachep->useroffset <= cachep->usersize &&
				4447	n <= cachep->useroffset - offset + cachep->usersize)
				4448	return;
				4449
				4450	/*
				4451	* If the copy is still within the allocated object, produce
				4452	* a warning instead of rejecting the copy. This is intended
				4453	* to be a temporary method to find any missing usercopy
				4454	* whitelists.
				4455	*/
				4456	if (usercopy_fallback &&
				4457	offset <= cachep->object_size &&
				4458	n <= cachep->object_size - offset) {
				4459	usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
				4460	return;
				4461	}
				4462
				4463	usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
				4464	}
				4465	#endif /* CONFIG_HARDENED_USERCOPY */
				4466
				4467	/**
				4468	* ksize - get the actual amount of memory allocated for a given object
				4469	* @objp: Pointer to the object
				4470	*
				4471	* kmalloc may internally round up allocations and return more memory
				4472	* than requested. ksize() can be used to determine the actual amount of
				4473	* memory allocated. The caller may use this additional memory, even though
				4474	* a smaller amount of memory was initially specified with the kmalloc call.
				4475	* The caller must guarantee that objp points to a valid object previously
				4476	* allocated with either kmalloc() or kmem_cache_alloc(). The object
				4477	* must not be freed during the duration of the call.
				4478	*/
				4479	size_t ksize(const void *objp)
				4480	{
				4481	size_t size;
				4482
				4483	BUG_ON(!objp);
				4484	if (unlikely(objp == ZERO_SIZE_PTR))
				4485	return 0;
				4486
				4487	size = virt_to_cache(objp)->object_size;
				4488	/* We assume that ksize callers could use the whole allocated area,
				4489	* so we need to unpoison this area.
				4490	*/
				4491	kasan_unpoison_shadow(objp, size);
				4492
				4493	return size;
				4494	}
				4495	EXPORT_SYMBOL(ksize);