Blame - marvell/linux/mm/slab.c - T108

blob: cdeaf561aa7b47b103cb6d14d828ada02d07d57b [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/slab.c
				4	* Written by Mark Hemment, 1996/97.
				5	* (markhe@nextd.demon.co.uk)
				6	*
				7	* kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
				8	*
				9	* Major cleanup, different bufctl logic, per-cpu arrays
				10	* (c) 2000 Manfred Spraul
				11	*
				12	* Cleanup, make the head arrays unconditional, preparation for NUMA
				13	* (c) 2002 Manfred Spraul
				14	*
				15	* An implementation of the Slab Allocator as described in outline in;
				16	* UNIX Internals: The New Frontiers by Uresh Vahalia
				17	* Pub: Prentice Hall ISBN 0-13-101908-2
				18	* or with a little more detail in;
				19	* The Slab Allocator: An Object-Caching Kernel Memory Allocator
				20	* Jeff Bonwick (Sun Microsystems).
				21	* Presented at: USENIX Summer 1994 Technical Conference
				22	*
				23	* The memory is organized in caches, one cache for each object type.
				24	* (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
				25	* Each cache consists out of many slabs (they are small (usually one
				26	* page long) and always contiguous), and each slab contains multiple
				27	* initialized objects.
				28	*
				29	* This means, that your constructor is used only for newly allocated
				30	* slabs and you must pass objects with the same initializations to
				31	* kmem_cache_free.
				32	*
				33	* Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
				34	* normal). If you need a special memory type, then must create a new
				35	* cache for that memory type.
				36	*
				37	* In order to reduce fragmentation, the slabs are sorted in 3 groups:
				38	* full slabs with 0 free objects
				39	* partial slabs
				40	* empty slabs with no allocated objects
				41	*
				42	* If partial slabs exist, then new allocations come from these slabs,
				43	* otherwise from empty slabs or new slabs are allocated.
				44	*
				45	* kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
				46	* during kmem_cache_destroy(). The caller must prevent concurrent allocs.
				47	*
				48	* Each cache has a short per-cpu head array, most allocs
				49	* and frees go into that array, and if that array overflows, then 1/2
				50	* of the entries in the array are given back into the global cache.
				51	* The head array is strictly LIFO and should improve the cache hit rates.
				52	* On SMP, it additionally reduces the spinlock operations.
				53	*
				54	* The c_cpuarray may not be read with enabled local interrupts -
				55	* it's changed with a smp_call_function().
				56	*
				57	* SMP synchronization:
				58	* constructors and destructors are called without any locking.
				59	* Several members in struct kmem_cache and struct slab never change, they
				60	* are accessed without any locking.
				61	* The per-cpu arrays are never accessed from the wrong cpu, no locking,
				62	* and local interrupts are disabled so slab code is preempt-safe.
				63	* The non-constant members are protected with a per-cache irq spinlock.
				64	*
				65	* Many thanks to Mark Hemment, who wrote another per-cpu slab patch
				66	* in 2000 - many ideas in the current implementation are derived from
				67	* his patch.
				68	*
				69	* Further notes from the original documentation:
				70	*
				71	* 11 April '97. Started multi-threading - markhe
				72	* The global cache-chain is protected by the mutex 'slab_mutex'.
				73	* The sem is only needed when accessing/extending the cache-chain, which
				74	* can never happen inside an interrupt (kmem_cache_create(),
				75	* kmem_cache_shrink() and kmem_cache_reap()).
				76	*
				77	* At present, each engine can be growing a cache. This should be blocked.
				78	*
				79	* 15 March 2005. NUMA slab allocator.
				80	* Shai Fultheim <shai@scalex86.org>.
				81	* Shobhit Dayal <shobhit@calsoftinc.com>
				82	* Alok N Kataria <alokk@calsoftinc.com>
				83	* Christoph Lameter <christoph@lameter.com>
				84	*
				85	* Modified the slab allocator to be node aware on NUMA systems.
				86	* Each node has its own list of partial, free and full slabs.
				87	* All object allocations for a node occur from node specific slab lists.
				88	*/
				89
				90	#include <linux/slab.h>
				91	#include <linux/mm.h>
				92	#include <linux/poison.h>
				93	#include <linux/swap.h>
				94	#include <linux/cache.h>
				95	#include <linux/interrupt.h>
				96	#include <linux/init.h>
				97	#include <linux/compiler.h>
				98	#include <linux/cpuset.h>
				99	#include <linux/proc_fs.h>
				100	#include <linux/seq_file.h>
				101	#include <linux/notifier.h>
				102	#include <linux/kallsyms.h>
				103	#include <linux/kfence.h>
				104	#include <linux/cpu.h>
				105	#include <linux/sysctl.h>
				106	#include <linux/module.h>
				107	#include <linux/rcupdate.h>
				108	#include <linux/string.h>
				109	#include <linux/uaccess.h>
				110	#include <linux/nodemask.h>
				111	#include <linux/kmemleak.h>
				112	#include <linux/mempolicy.h>
				113	#include <linux/mutex.h>
				114	#include <linux/fault-inject.h>
				115	#include <linux/rtmutex.h>
				116	#include <linux/reciprocal_div.h>
				117	#include <linux/debugobjects.h>
				118	#include <linux/memory.h>
				119	#include <linux/prefetch.h>
				120	#include <linux/sched/task_stack.h>
				121
				122	#include <net/sock.h>
				123
				124	#include <asm/cacheflush.h>
				125	#include <asm/tlbflush.h>
				126	#include <asm/page.h>
				127
				128	#include <trace/events/kmem.h>
				129
				130	#include "internal.h"
				131
				132	#include "slab.h"
				133
				134	/*
				135	* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
				136	* 0 for faster, smaller code (especially in the critical paths).
				137	*
				138	* STATS - 1 to collect stats for /proc/slabinfo.
				139	* 0 for faster, smaller code (especially in the critical paths).
				140	*
				141	* FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
				142	*/
				143
				144	#ifdef CONFIG_DEBUG_SLAB
				145	#define DEBUG 1
				146	#define STATS 1
				147	#define FORCED_DEBUG 1
				148	#else
				149	#define DEBUG 0
				150	#define STATS 0
				151	#define FORCED_DEBUG 0
				152	#endif
				153
				154	/* Shouldn't this be in a header file somewhere? */
				155	#define BYTES_PER_WORD sizeof(void *)
				156	#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
				157
				158	#ifndef ARCH_KMALLOC_FLAGS
				159	#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
				160	#endif
				161
				162	#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				163	<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
				164
				165	#if FREELIST_BYTE_INDEX
				166	typedef unsigned char freelist_idx_t;
				167	#else
				168	typedef unsigned short freelist_idx_t;
				169	#endif
				170
				171	#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
				172
				173	/*
				174	* struct array_cache
				175	*
				176	* Purpose:
				177	* - LIFO ordering, to hand out cache-warm objects from _alloc
				178	* - reduce the number of linked list operations
				179	* - reduce spinlock operations
				180	*
				181	* The limit is stored in the per-cpu structure to reduce the data cache
				182	* footprint.
				183	*
				184	*/
				185	struct array_cache {
				186	unsigned int avail;
				187	unsigned int limit;
				188	unsigned int batchcount;
				189	unsigned int touched;
				190	void entry[]; /
				191	* Must have this definition in here for the proper
				192	* alignment of array_cache. Also simplifies accessing
				193	* the entries.
				194	*/
				195	};
				196
				197	struct alien_cache {
				198	spinlock_t lock;
				199	struct array_cache ac;
				200	};
				201
				202	/*
				203	* Need this for bootstrapping a per node allocator.
				204	*/
				205	#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
				206	static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
				207	#define CACHE_CACHE 0
				208	#define SIZE_NODE (MAX_NUMNODES)
				209
				210	static int drain_freelist(struct kmem_cache *cache,
				211	struct kmem_cache_node *n, int tofree);
				212	static void free_block(struct kmem_cache cachep, void *objpp, int len,
				213	int node, struct list_head *list);
				214	static void slabs_destroy(struct kmem_cache cachep, struct list_head list);
				215	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
				216	static void cache_reap(struct work_struct *unused);
				217
				218	static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
				219	void **list);
				220	static inline void fixup_slab_list(struct kmem_cache *cachep,
				221	struct kmem_cache_node n, struct page page,
				222	void **list);
				223	static int slab_early_init = 1;
				224
				225	#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
				226
				227	static void kmem_cache_node_init(struct kmem_cache_node *parent)
				228	{
				229	INIT_LIST_HEAD(&parent->slabs_full);
				230	INIT_LIST_HEAD(&parent->slabs_partial);
				231	INIT_LIST_HEAD(&parent->slabs_free);
				232	parent->total_slabs = 0;
				233	parent->free_slabs = 0;
				234	parent->shared = NULL;
				235	parent->alien = NULL;
				236	parent->colour_next = 0;
				237	spin_lock_init(&parent->list_lock);
				238	parent->free_objects = 0;
				239	parent->free_touched = 0;
				240	}
				241
				242	#define MAKE_LIST(cachep, listp, slab, nodeid) \
				243	do { \
				244	INIT_LIST_HEAD(listp); \
				245	list_splice(&get_node(cachep, nodeid)->slab, listp); \
				246	} while (0)
				247
				248	#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
				249	do { \
				250	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
				251	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
				252	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
				253	} while (0)
				254
				255	#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
				256	#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
				257	#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
				258	#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
				259
				260	#define BATCHREFILL_LIMIT 16
				261	/*
				262	* Optimization question: fewer reaps means less probability for unnessary
				263	* cpucache drain/refill cycles.
				264	*
				265	* OTOH the cpuarrays can contain lots of objects,
				266	* which could lock up otherwise freeable slabs.
				267	*/
				268	#define REAPTIMEOUT_AC (2*HZ)
				269	#define REAPTIMEOUT_NODE (4*HZ)
				270
				271	#if STATS
				272	#define STATS_INC_ACTIVE(x) ((x)->num_active++)
				273	#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
				274	#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
				275	#define STATS_INC_GROWN(x) ((x)->grown++)
				276	#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
				277	#define STATS_SET_HIGH(x) \
				278	do { \
				279	if ((x)->num_active > (x)->high_mark) \
				280	(x)->high_mark = (x)->num_active; \
				281	} while (0)
				282	#define STATS_INC_ERR(x) ((x)->errors++)
				283	#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
				284	#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
				285	#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
				286	#define STATS_SET_FREEABLE(x, i) \
				287	do { \
				288	if ((x)->max_freeable < i) \
				289	(x)->max_freeable = i; \
				290	} while (0)
				291	#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
				292	#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
				293	#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
				294	#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
				295	#else
				296	#define STATS_INC_ACTIVE(x) do { } while (0)
				297	#define STATS_DEC_ACTIVE(x) do { } while (0)
				298	#define STATS_INC_ALLOCED(x) do { } while (0)
				299	#define STATS_INC_GROWN(x) do { } while (0)
				300	#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
				301	#define STATS_SET_HIGH(x) do { } while (0)
				302	#define STATS_INC_ERR(x) do { } while (0)
				303	#define STATS_INC_NODEALLOCS(x) do { } while (0)
				304	#define STATS_INC_NODEFREES(x) do { } while (0)
				305	#define STATS_INC_ACOVERFLOW(x) do { } while (0)
				306	#define STATS_SET_FREEABLE(x, i) do { } while (0)
				307	#define STATS_INC_ALLOCHIT(x) do { } while (0)
				308	#define STATS_INC_ALLOCMISS(x) do { } while (0)
				309	#define STATS_INC_FREEHIT(x) do { } while (0)
				310	#define STATS_INC_FREEMISS(x) do { } while (0)
				311	#endif
				312
				313	#if DEBUG
				314
				315	/*
				316	* memory layout of objects:
				317	* 0 : objp
				318	* 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
				319	* the end of an object is aligned with the end of the real
				320	* allocation. Catches writes behind the end of the allocation.
				321	* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
				322	* redzone word.
				323	* cachep->obj_offset: The real object.
				324	* cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
				325	* cachep->size - 1* BYTES_PER_WORD: last caller address
				326	* [BYTES_PER_WORD long]
				327	*/
				328	static int obj_offset(struct kmem_cache *cachep)
				329	{
				330	return cachep->obj_offset;
				331	}
				332
				333	static unsigned long long dbg_redzone1(struct kmem_cache cachep, void *objp)
				334	{
				335	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				336	return (unsigned long long*) (objp + obj_offset(cachep) -
				337	sizeof(unsigned long long));
				338	}
				339
				340	static unsigned long long dbg_redzone2(struct kmem_cache cachep, void *objp)
				341	{
				342	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				343	if (cachep->flags & SLAB_STORE_USER)
				344	return (unsigned long long *)(objp + cachep->size -
				345	sizeof(unsigned long long) -
				346	REDZONE_ALIGN);
				347	return (unsigned long long *) (objp + cachep->size -
				348	sizeof(unsigned long long));
				349	}
				350
				351	static void *dbg_userword(struct kmem_cache cachep, void *objp)
				352	{
				353	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
				354	return (void **)(objp + cachep->size - BYTES_PER_WORD);
				355	}
				356
				357	#else
				358
				359	#define obj_offset(x) 0
				360	#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				361	#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				362	#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
				363
				364	#endif
				365
				366	/*
				367	* Do not go above this order unless 0 objects fit into the slab or
				368	* overridden on the command line.
				369	*/
				370	#define SLAB_MAX_ORDER_HI 1
				371	#define SLAB_MAX_ORDER_LO 0
				372	static int slab_max_order = SLAB_MAX_ORDER_LO;
				373	static bool slab_max_order_set __initdata;
				374
				375	static inline void index_to_obj(struct kmem_cache cache, struct page *page,
				376	unsigned int idx)
				377	{
				378	return page->s_mem + cache->size * idx;
				379	}
				380
				381	#define BOOT_CPUCACHE_ENTRIES 1
				382	/* internal cache of cache description objs */
				383	static struct kmem_cache kmem_cache_boot = {
				384	.batchcount = 1,
				385	.limit = BOOT_CPUCACHE_ENTRIES,
				386	.shared = 1,
				387	.size = sizeof(struct kmem_cache),
				388	.name = "kmem_cache",
				389	};
				390
				391	static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
				392
				393	static inline struct array_cache cpu_cache_get(struct kmem_cache cachep)
				394	{
				395	return this_cpu_ptr(cachep->cpu_cache);
				396	}
				397
				398	/*
				399	* Calculate the number of objects and left-over bytes for a given buffer size.
				400	*/
				401	static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
				402	slab_flags_t flags, size_t *left_over)
				403	{
				404	unsigned int num;
				405	size_t slab_size = PAGE_SIZE << gfporder;
				406
				407	/*
				408	* The slab management structure can be either off the slab or
				409	* on it. For the latter case, the memory allocated for a
				410	* slab is used for:
				411	*
				412	* - @buffer_size bytes for each object
				413	* - One freelist_idx_t for each object
				414	*
				415	* We don't need to consider alignment of freelist because
				416	* freelist will be at the end of slab page. The objects will be
				417	* at the correct alignment.
				418	*
				419	* If the slab management structure is off the slab, then the
				420	* alignment will already be calculated into the size. Because
				421	* the slabs are all pages aligned, the objects will be at the
				422	* correct alignment when allocated.
				423	*/
				424	if (flags & (CFLGS_OBJFREELIST_SLAB \| CFLGS_OFF_SLAB)) {
				425	num = slab_size / buffer_size;
				426	*left_over = slab_size % buffer_size;
				427	} else {
				428	num = slab_size / (buffer_size + sizeof(freelist_idx_t));
				429	*left_over = slab_size %
				430	(buffer_size + sizeof(freelist_idx_t));
				431	}
				432
				433	return num;
				434	}
				435
				436	#if DEBUG
				437	#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
				438
				439	static void __slab_error(const char function, struct kmem_cache cachep,
				440	char *msg)
				441	{
				442	pr_err("slab error in %s(): cache `%s': %s\n",
				443	function, cachep->name, msg);
				444	dump_stack();
				445	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
				446	}
				447	#endif
				448
				449	/*
				450	* By default on NUMA we use alien caches to stage the freeing of
				451	* objects allocated from other nodes. This causes massive memory
				452	* inefficiencies when using fake NUMA setup to split memory into a
				453	* large number of small nodes, so it can be disabled on the command
				454	* line
				455	*/
				456
				457	static int use_alien_caches __read_mostly = 1;
				458	static int __init noaliencache_setup(char *s)
				459	{
				460	use_alien_caches = 0;
				461	return 1;
				462	}
				463	__setup("noaliencache", noaliencache_setup);
				464
				465	static int __init slab_max_order_setup(char *str)
				466	{
				467	get_option(&str, &slab_max_order);
				468	slab_max_order = slab_max_order < 0 ? 0 :
				469	min(slab_max_order, MAX_ORDER - 1);
				470	slab_max_order_set = true;
				471
				472	return 1;
				473	}
				474	__setup("slab_max_order=", slab_max_order_setup);
				475
				476	#ifdef CONFIG_NUMA
				477	/*
				478	* Special reaping functions for NUMA systems called from cache_reap().
				479	* These take care of doing round robin flushing of alien caches (containing
				480	* objects freed on different nodes from which they were allocated) and the
				481	* flushing of remote pcps by calling drain_node_pages.
				482	*/
				483	static DEFINE_PER_CPU(unsigned long, slab_reap_node);
				484
				485	static void init_reap_node(int cpu)
				486	{
				487	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
				488	node_online_map);
				489	}
				490
				491	static void next_reap_node(void)
				492	{
				493	int node = __this_cpu_read(slab_reap_node);
				494
				495	node = next_node_in(node, node_online_map);
				496	__this_cpu_write(slab_reap_node, node);
				497	}
				498
				499	#else
				500	#define init_reap_node(cpu) do { } while (0)
				501	#define next_reap_node(void) do { } while (0)
				502	#endif
				503
				504	/*
				505	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
				506	* via the workqueue/eventd.
				507	* Add the CPU number into the expiration time to minimize the possibility of
				508	* the CPUs getting into lockstep and contending for the global cache chain
				509	* lock.
				510	*/
				511	static void start_cpu_timer(int cpu)
				512	{
				513	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
				514
				515	if (reap_work->work.func == NULL) {
				516	init_reap_node(cpu);
				517	INIT_DEFERRABLE_WORK(reap_work, cache_reap);
				518	schedule_delayed_work_on(cpu, reap_work,
				519	__round_jiffies_relative(HZ, cpu));
				520	}
				521	}
				522
				523	static void init_arraycache(struct array_cache *ac, int limit, int batch)
				524	{
				525	if (ac) {
				526	ac->avail = 0;
				527	ac->limit = limit;
				528	ac->batchcount = batch;
				529	ac->touched = 0;
				530	}
				531	}
				532
				533	static struct array_cache *alloc_arraycache(int node, int entries,
				534	int batchcount, gfp_t gfp)
				535	{
				536	size_t memsize = sizeof(void ) entries + sizeof(struct array_cache);
				537	struct array_cache *ac = NULL;
				538
				539	ac = kmalloc_node(memsize, gfp, node);
				540	/*
				541	* The array_cache structures contain pointers to free object.
				542	* However, when such objects are allocated or transferred to another
				543	* cache the pointers are not cleared and they could be counted as
				544	* valid references during a kmemleak scan. Therefore, kmemleak must
				545	* not scan such objects.
				546	*/
				547	kmemleak_no_scan(ac);
				548	init_arraycache(ac, entries, batchcount);
				549	return ac;
				550	}
				551
				552	static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
				553	struct page page, void objp)
				554	{
				555	struct kmem_cache_node *n;
				556	int page_node;
				557	LIST_HEAD(list);
				558
				559	page_node = page_to_nid(page);
				560	n = get_node(cachep, page_node);
				561
				562	spin_lock(&n->list_lock);
				563	free_block(cachep, &objp, 1, page_node, &list);
				564	spin_unlock(&n->list_lock);
				565
				566	slabs_destroy(cachep, &list);
				567	}
				568
				569	/*
				570	* Transfer objects in one arraycache to another.
				571	* Locking must be handled by the caller.
				572	*
				573	* Return the number of entries transferred.
				574	*/
				575	static int transfer_objects(struct array_cache *to,
				576	struct array_cache *from, unsigned int max)
				577	{
				578	/* Figure out how many entries to transfer */
				579	int nr = min3(from->avail, max, to->limit - to->avail);
				580
				581	if (!nr)
				582	return 0;
				583
				584	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
				585	sizeof(void ) nr);
				586
				587	from->avail -= nr;
				588	to->avail += nr;
				589	return nr;
				590	}
				591
				592	#ifndef CONFIG_NUMA
				593
				594	#define drain_alien_cache(cachep, alien) do { } while (0)
				595	#define reap_alien(cachep, n) do { } while (0)
				596
				597	static inline struct alien_cache **alloc_alien_cache(int node,
				598	int limit, gfp_t gfp)
				599	{
				600	return NULL;
				601	}
				602
				603	static inline void free_alien_cache(struct alien_cache **ac_ptr)
				604	{
				605	}
				606
				607	static inline int cache_free_alien(struct kmem_cache cachep, void objp)
				608	{
				609	return 0;
				610	}
				611
				612	static inline void alternate_node_alloc(struct kmem_cache cachep,
				613	gfp_t flags)
				614	{
				615	return NULL;
				616	}
				617
				618	static inline void ____cache_alloc_node(struct kmem_cache cachep,
				619	gfp_t flags, int nodeid)
				620	{
				621	return NULL;
				622	}
				623
				624	static inline gfp_t gfp_exact_node(gfp_t flags)
				625	{
				626	return flags & ~__GFP_NOFAIL;
				627	}
				628
				629	#else /* CONFIG_NUMA */
				630
				631	static void ____cache_alloc_node(struct kmem_cache , gfp_t, int);
				632	static void alternate_node_alloc(struct kmem_cache , gfp_t);
				633
				634	static struct alien_cache *__alloc_alien_cache(int node, int entries,
				635	int batch, gfp_t gfp)
				636	{
				637	size_t memsize = sizeof(void ) entries + sizeof(struct alien_cache);
				638	struct alien_cache *alc = NULL;
				639
				640	alc = kmalloc_node(memsize, gfp, node);
				641	if (alc) {
				642	kmemleak_no_scan(alc);
				643	init_arraycache(&alc->ac, entries, batch);
				644	spin_lock_init(&alc->lock);
				645	}
				646	return alc;
				647	}
				648
				649	static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
				650	{
				651	struct alien_cache **alc_ptr;
				652	int i;
				653
				654	if (limit > 1)
				655	limit = 12;
				656	alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
				657	if (!alc_ptr)
				658	return NULL;
				659
				660	for_each_node(i) {
				661	if (i == node \|\| !node_online(i))
				662	continue;
				663	alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
				664	if (!alc_ptr[i]) {
				665	for (i--; i >= 0; i--)
				666	kfree(alc_ptr[i]);
				667	kfree(alc_ptr);
				668	return NULL;
				669	}
				670	}
				671	return alc_ptr;
				672	}
				673
				674	static void free_alien_cache(struct alien_cache **alc_ptr)
				675	{
				676	int i;
				677
				678	if (!alc_ptr)
				679	return;
				680	for_each_node(i)
				681	kfree(alc_ptr[i]);
				682	kfree(alc_ptr);
				683	}
				684
				685	static void __drain_alien_cache(struct kmem_cache *cachep,
				686	struct array_cache *ac, int node,
				687	struct list_head *list)
				688	{
				689	struct kmem_cache_node *n = get_node(cachep, node);
				690
				691	if (ac->avail) {
				692	spin_lock(&n->list_lock);
				693	/*
				694	* Stuff objects into the remote nodes shared array first.
				695	* That way we could avoid the overhead of putting the objects
				696	* into the free lists and getting them back later.
				697	*/
				698	if (n->shared)
				699	transfer_objects(n->shared, ac, ac->limit);
				700
				701	free_block(cachep, ac->entry, ac->avail, node, list);
				702	ac->avail = 0;
				703	spin_unlock(&n->list_lock);
				704	}
				705	}
				706
				707	/*
				708	* Called from cache_reap() to regularly drain alien caches round robin.
				709	*/
				710	static void reap_alien(struct kmem_cache cachep, struct kmem_cache_node n)
				711	{
				712	int node = __this_cpu_read(slab_reap_node);
				713
				714	if (n->alien) {
				715	struct alien_cache *alc = n->alien[node];
				716	struct array_cache *ac;
				717
				718	if (alc) {
				719	ac = &alc->ac;
				720	if (ac->avail && spin_trylock_irq(&alc->lock)) {
				721	LIST_HEAD(list);
				722
				723	__drain_alien_cache(cachep, ac, node, &list);
				724	spin_unlock_irq(&alc->lock);
				725	slabs_destroy(cachep, &list);
				726	}
				727	}
				728	}
				729	}
				730
				731	static void drain_alien_cache(struct kmem_cache *cachep,
				732	struct alien_cache **alien)
				733	{
				734	int i = 0;
				735	struct alien_cache *alc;
				736	struct array_cache *ac;
				737	unsigned long flags;
				738
				739	for_each_online_node(i) {
				740	alc = alien[i];
				741	if (alc) {
				742	LIST_HEAD(list);
				743
				744	ac = &alc->ac;
				745	spin_lock_irqsave(&alc->lock, flags);
				746	__drain_alien_cache(cachep, ac, i, &list);
				747	spin_unlock_irqrestore(&alc->lock, flags);
				748	slabs_destroy(cachep, &list);
				749	}
				750	}
				751	}
				752
				753	static int __cache_free_alien(struct kmem_cache cachep, void objp,
				754	int node, int page_node)
				755	{
				756	struct kmem_cache_node *n;
				757	struct alien_cache *alien = NULL;
				758	struct array_cache *ac;
				759	LIST_HEAD(list);
				760
				761	n = get_node(cachep, node);
				762	STATS_INC_NODEFREES(cachep);
				763	if (n->alien && n->alien[page_node]) {
				764	alien = n->alien[page_node];
				765	ac = &alien->ac;
				766	spin_lock(&alien->lock);
				767	if (unlikely(ac->avail == ac->limit)) {
				768	STATS_INC_ACOVERFLOW(cachep);
				769	__drain_alien_cache(cachep, ac, page_node, &list);
				770	}
				771	ac->entry[ac->avail++] = objp;
				772	spin_unlock(&alien->lock);
				773	slabs_destroy(cachep, &list);
				774	} else {
				775	n = get_node(cachep, page_node);
				776	spin_lock(&n->list_lock);
				777	free_block(cachep, &objp, 1, page_node, &list);
				778	spin_unlock(&n->list_lock);
				779	slabs_destroy(cachep, &list);
				780	}
				781	return 1;
				782	}
				783
				784	static inline int cache_free_alien(struct kmem_cache cachep, void objp)
				785	{
				786	int page_node = page_to_nid(virt_to_page(objp));
				787	int node = numa_mem_id();
				788	/*
				789	* Make sure we are not freeing a object from another node to the array
				790	* cache on this cpu.
				791	*/
				792	if (likely(node == page_node))
				793	return 0;
				794
				795	return __cache_free_alien(cachep, objp, node, page_node);
				796	}
				797
				798	/*
				799	* Construct gfp mask to allocate from a specific node but do not reclaim or
				800	* warn about failures.
				801	*/
				802	static inline gfp_t gfp_exact_node(gfp_t flags)
				803	{
				804	return (flags \| __GFP_THISNODE \| __GFP_NOWARN) & ~(__GFP_RECLAIM\|__GFP_NOFAIL);
				805	}
				806	#endif
				807
				808	static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
				809	{
				810	struct kmem_cache_node *n;
				811
				812	/*
				813	* Set up the kmem_cache_node for cpu before we can
				814	* begin anything. Make sure some other cpu on this
				815	* node has not already allocated this
				816	*/
				817	n = get_node(cachep, node);
				818	if (n) {
				819	spin_lock_irq(&n->list_lock);
				820	n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
				821	cachep->num;
				822	spin_unlock_irq(&n->list_lock);
				823
				824	return 0;
				825	}
				826
				827	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
				828	if (!n)
				829	return -ENOMEM;
				830
				831	kmem_cache_node_init(n);
				832	n->next_reap = jiffies + REAPTIMEOUT_NODE +
				833	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				834
				835	n->free_limit =
				836	(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
				837
				838	/*
				839	* The kmem_cache_nodes don't come and go as CPUs
				840	* come and go. slab_mutex is sufficient
				841	* protection here.
				842	*/
				843	cachep->node[node] = n;
				844
				845	return 0;
				846	}
				847
				848	#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) \|\| defined(CONFIG_SMP)
				849	/*
				850	* Allocates and initializes node for a node on each slab cache, used for
				851	* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
				852	* will be allocated off-node since memory is not yet online for the new node.
				853	* When hotplugging memory or a cpu, existing node are not replaced if
				854	* already in use.
				855	*
				856	* Must hold slab_mutex.
				857	*/
				858	static int init_cache_node_node(int node)
				859	{
				860	int ret;
				861	struct kmem_cache *cachep;
				862
				863	list_for_each_entry(cachep, &slab_caches, list) {
				864	ret = init_cache_node(cachep, node, GFP_KERNEL);
				865	if (ret)
				866	return ret;
				867	}
				868
				869	return 0;
				870	}
				871	#endif
				872
				873	static int setup_kmem_cache_node(struct kmem_cache *cachep,
				874	int node, gfp_t gfp, bool force_change)
				875	{
				876	int ret = -ENOMEM;
				877	struct kmem_cache_node *n;
				878	struct array_cache *old_shared = NULL;
				879	struct array_cache *new_shared = NULL;
				880	struct alien_cache **new_alien = NULL;
				881	LIST_HEAD(list);
				882
				883	if (use_alien_caches) {
				884	new_alien = alloc_alien_cache(node, cachep->limit, gfp);
				885	if (!new_alien)
				886	goto fail;
				887	}
				888
				889	if (cachep->shared) {
				890	new_shared = alloc_arraycache(node,
				891	cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
				892	if (!new_shared)
				893	goto fail;
				894	}
				895
				896	ret = init_cache_node(cachep, node, gfp);
				897	if (ret)
				898	goto fail;
				899
				900	n = get_node(cachep, node);
				901	spin_lock_irq(&n->list_lock);
				902	if (n->shared && force_change) {
				903	free_block(cachep, n->shared->entry,
				904	n->shared->avail, node, &list);
				905	n->shared->avail = 0;
				906	}
				907
				908	if (!n->shared \|\| force_change) {
				909	old_shared = n->shared;
				910	n->shared = new_shared;
				911	new_shared = NULL;
				912	}
				913
				914	if (!n->alien) {
				915	n->alien = new_alien;
				916	new_alien = NULL;
				917	}
				918
				919	spin_unlock_irq(&n->list_lock);
				920	slabs_destroy(cachep, &list);
				921
				922	/*
				923	* To protect lockless access to n->shared during irq disabled context.
				924	* If n->shared isn't NULL in irq disabled context, accessing to it is
				925	* guaranteed to be valid until irq is re-enabled, because it will be
				926	* freed after synchronize_rcu().
				927	*/
				928	if (old_shared && force_change)
				929	synchronize_rcu();
				930
				931	fail:
				932	kfree(old_shared);
				933	kfree(new_shared);
				934	free_alien_cache(new_alien);
				935
				936	return ret;
				937	}
				938
				939	#ifdef CONFIG_SMP
				940
				941	static void cpuup_canceled(long cpu)
				942	{
				943	struct kmem_cache *cachep;
				944	struct kmem_cache_node *n = NULL;
				945	int node = cpu_to_mem(cpu);
				946	const struct cpumask *mask = cpumask_of_node(node);
				947
				948	list_for_each_entry(cachep, &slab_caches, list) {
				949	struct array_cache *nc;
				950	struct array_cache *shared;
				951	struct alien_cache **alien;
				952	LIST_HEAD(list);
				953
				954	n = get_node(cachep, node);
				955	if (!n)
				956	continue;
				957
				958	spin_lock_irq(&n->list_lock);
				959
				960	/* Free limit for this kmem_cache_node */
				961	n->free_limit -= cachep->batchcount;
				962
				963	/* cpu is dead; no one can alloc from it. */
				964	nc = per_cpu_ptr(cachep->cpu_cache, cpu);
				965	free_block(cachep, nc->entry, nc->avail, node, &list);
				966	nc->avail = 0;
				967
				968	if (!cpumask_empty(mask)) {
				969	spin_unlock_irq(&n->list_lock);
				970	goto free_slab;
				971	}
				972
				973	shared = n->shared;
				974	if (shared) {
				975	free_block(cachep, shared->entry,
				976	shared->avail, node, &list);
				977	n->shared = NULL;
				978	}
				979
				980	alien = n->alien;
				981	n->alien = NULL;
				982
				983	spin_unlock_irq(&n->list_lock);
				984
				985	kfree(shared);
				986	if (alien) {
				987	drain_alien_cache(cachep, alien);
				988	free_alien_cache(alien);
				989	}
				990
				991	free_slab:
				992	slabs_destroy(cachep, &list);
				993	}
				994	/*
				995	* In the previous loop, all the objects were freed to
				996	* the respective cache's slabs, now we can go ahead and
				997	* shrink each nodelist to its limit.
				998	*/
				999	list_for_each_entry(cachep, &slab_caches, list) {
				1000	n = get_node(cachep, node);
				1001	if (!n)
				1002	continue;
				1003	drain_freelist(cachep, n, INT_MAX);
				1004	}
				1005	}
				1006
				1007	static int cpuup_prepare(long cpu)
				1008	{
				1009	struct kmem_cache *cachep;
				1010	int node = cpu_to_mem(cpu);
				1011	int err;
				1012
				1013	/*
				1014	* We need to do this right in the beginning since
				1015	* alloc_arraycache's are going to use this list.
				1016	* kmalloc_node allows us to add the slab to the right
				1017	* kmem_cache_node and not this cpu's kmem_cache_node
				1018	*/
				1019	err = init_cache_node_node(node);
				1020	if (err < 0)
				1021	goto bad;
				1022
				1023	/*
				1024	* Now we can go ahead with allocating the shared arrays and
				1025	* array caches
				1026	*/
				1027	list_for_each_entry(cachep, &slab_caches, list) {
				1028	err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
				1029	if (err)
				1030	goto bad;
				1031	}
				1032
				1033	return 0;
				1034	bad:
				1035	cpuup_canceled(cpu);
				1036	return -ENOMEM;
				1037	}
				1038
				1039	int slab_prepare_cpu(unsigned int cpu)
				1040	{
				1041	int err;
				1042
				1043	mutex_lock(&slab_mutex);
				1044	err = cpuup_prepare(cpu);
				1045	mutex_unlock(&slab_mutex);
				1046	return err;
				1047	}
				1048
				1049	/*
				1050	* This is called for a failed online attempt and for a successful
				1051	* offline.
				1052	*
				1053	* Even if all the cpus of a node are down, we don't free the
				1054	* kmem_list3 of any cache. This to avoid a race between cpu_down, and
				1055	* a kmalloc allocation from another cpu for memory from the node of
				1056	* the cpu going down. The list3 structure is usually allocated from
				1057	* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
				1058	*/
				1059	int slab_dead_cpu(unsigned int cpu)
				1060	{
				1061	mutex_lock(&slab_mutex);
				1062	cpuup_canceled(cpu);
				1063	mutex_unlock(&slab_mutex);
				1064	return 0;
				1065	}
				1066	#endif
				1067
				/* Hotplug "online" callback: start the per-cpu timer for @cpu; returns 0. */
				static int slab_online_cpu(unsigned int cpu)
				{
					start_cpu_timer(cpu);

					return 0;
				}
				1073
				1074	static int slab_offline_cpu(unsigned int cpu)
				1075	{
				1076	/*
				1077	* Shutdown cache reaper. Note that the slab_mutex is held so
				1078	* that if cache_reap() is invoked it cannot do anything
				1079	* expensive but will only modify reap_work and reschedule the
				1080	* timer.
				1081	*/
				1082	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
				1083	/* Now the cache_reaper is guaranteed to be not running. */
				1084	per_cpu(slab_reap_work, cpu).work.func = NULL;
				1085	return 0;
				1086	}
				1087
				1088	#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
				1089	/*
				1090	* Drains freelist for a node on each slab cache, used for memory hot-remove.
				1091	* Returns -EBUSY if all objects cannot be drained so that the node is not
				1092	* removed.
				1093	*
				1094	* Must hold slab_mutex.
				1095	*/
				1096	static int __meminit drain_cache_node_node(int node)
				1097	{
				1098	struct kmem_cache *cachep;
				1099	int ret = 0;
				1100
				1101	list_for_each_entry(cachep, &slab_caches, list) {
				1102	struct kmem_cache_node *n;
				1103
				1104	n = get_node(cachep, node);
				1105	if (!n)
				1106	continue;
				1107
				1108	drain_freelist(cachep, n, INT_MAX);
				1109
				1110	if (!list_empty(&n->slabs_full) \|\|
				1111	!list_empty(&n->slabs_partial)) {
				1112	ret = -EBUSY;
				1113	break;
				1114	}
				1115	}
				1116	return ret;
				1117	}
				1118
				1119	static int __meminit slab_memory_callback(struct notifier_block *self,
				1120	unsigned long action, void *arg)
				1121	{
				1122	struct memory_notify *mnb = arg;
				1123	int ret = 0;
				1124	int nid;
				1125
				1126	nid = mnb->status_change_nid;
				1127	if (nid < 0)
				1128	goto out;
				1129
				1130	switch (action) {
				1131	case MEM_GOING_ONLINE:
				1132	mutex_lock(&slab_mutex);
				1133	ret = init_cache_node_node(nid);
				1134	mutex_unlock(&slab_mutex);
				1135	break;
				1136	case MEM_GOING_OFFLINE:
				1137	mutex_lock(&slab_mutex);
				1138	ret = drain_cache_node_node(nid);
				1139	mutex_unlock(&slab_mutex);
				1140	break;
				1141	case MEM_ONLINE:
				1142	case MEM_OFFLINE:
				1143	case MEM_CANCEL_ONLINE:
				1144	case MEM_CANCEL_OFFLINE:
				1145	break;
				1146	}
				1147	out:
				1148	return notifier_from_errno(ret);
				1149	}
				1150	#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
				1151
				1152	/*
				1153	* swap the static kmem_cache_node with kmalloced memory
				1154	*/
				1155	static void __init init_list(struct kmem_cache cachep, struct kmem_cache_node list,
				1156	int nodeid)
				1157	{
				1158	struct kmem_cache_node *ptr;
				1159
				1160	ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid);
				1161	BUG_ON(!ptr);
				1162
				1163	memcpy(ptr, list, sizeof(struct kmem_cache_node));
				1164	/*
				1165	* Do not assume that spinlocks can be initialized via memcpy:
				1166	*/
				1167	spin_lock_init(&ptr->list_lock);
				1168
				1169	MAKE_ALL_LISTS(cachep, ptr, nodeid);
				1170	cachep->node[nodeid] = ptr;
				1171	}
				1172
				1173	/*
				1174	* For setting up all the kmem_cache_node for cache whose buffer_size is same as
				1175	* size of kmem_cache_node.
				1176	*/
				1177	static void __init set_up_node(struct kmem_cache *cachep, int index)
				1178	{
				1179	int node;
				1180
				1181	for_each_online_node(node) {
				1182	cachep->node[node] = &init_kmem_cache_node[index + node];
				1183	cachep->node[node]->next_reap = jiffies +
				1184	REAPTIMEOUT_NODE +
				1185	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				1186	}
				1187	}
				1188
				1189	/*
				1190	* Initialisation. Called after the page allocator have been initialised and
				1191	* before smp_init().
				1192	*/
				1193	void __init kmem_cache_init(void)
				1194	{
				1195	int i;
				1196
				1197	kmem_cache = &kmem_cache_boot;
				1198
				1199	if (!IS_ENABLED(CONFIG_NUMA) \|\| num_possible_nodes() == 1)
				1200	use_alien_caches = 0;
				1201
				1202	for (i = 0; i < NUM_INIT_LISTS; i++)
				1203	kmem_cache_node_init(&init_kmem_cache_node[i]);
				1204
				1205	/*
				1206	* Fragmentation resistance on low memory - only use bigger
				1207	* page orders on machines with more than 32MB of memory if
				1208	* not overridden on the command line.
				1209	*/
				1210	if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
				1211	slab_max_order = SLAB_MAX_ORDER_HI;
				1212
				1213	/* Bootstrap is tricky, because several objects are allocated
				1214	* from caches that do not exist yet:
				1215	* 1) initialize the kmem_cache cache: it contains the struct
				1216	* kmem_cache structures of all caches, except kmem_cache itself:
				1217	* kmem_cache is statically allocated.
				1218	* Initially an __init data area is used for the head array and the
				1219	* kmem_cache_node structures, it's replaced with a kmalloc allocated
				1220	* array at the end of the bootstrap.
				1221	* 2) Create the first kmalloc cache.
				1222	* The struct kmem_cache for the new cache is allocated normally.
				1223	* An __init data area is used for the head array.
				1224	* 3) Create the remaining kmalloc caches, with minimally sized
				1225	* head arrays.
				1226	* 4) Replace the __init data head arrays for kmem_cache and the first
				1227	* kmalloc cache with kmalloc allocated arrays.
				1228	* 5) Replace the __init data for kmem_cache_node for kmem_cache and
				1229	* the other cache's with kmalloc allocated memory.
				1230	* 6) Resize the head arrays of the kmalloc caches to their final sizes.
				1231	*/
				1232
				1233	/* 1) create the kmem_cache */
				1234
				1235	/*
				1236	* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
				1237	*/
				1238	create_boot_cache(kmem_cache, "kmem_cache",
				1239	offsetof(struct kmem_cache, node) +
				1240	nr_node_ids * sizeof(struct kmem_cache_node *),
				1241	SLAB_HWCACHE_ALIGN, 0, 0);
				1242	list_add(&kmem_cache->list, &slab_caches);
				1243	memcg_link_cache(kmem_cache, NULL);
				1244	slab_state = PARTIAL;
				1245
				1246	/*
				1247	* Initialize the caches that provide memory for the kmem_cache_node
				1248	* structures first. Without this, further allocations will bug.
				1249	*/
				1250	kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
				1251	kmalloc_info[INDEX_NODE].name,
				1252	kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
				1253	0, kmalloc_size(INDEX_NODE));
				1254	slab_state = PARTIAL_NODE;
				1255	setup_kmalloc_cache_index_table();
				1256
				1257	slab_early_init = 0;
				1258
				1259	/* 5) Replace the bootstrap kmem_cache_node */
				1260	{
				1261	int nid;
				1262
				1263	for_each_online_node(nid) {
				1264	init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
				1265
				1266	init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
				1267	&init_kmem_cache_node[SIZE_NODE + nid], nid);
				1268	}
				1269	}
				1270
				1271	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
				1272	}
				1273
				1274	void __init kmem_cache_init_late(void)
				1275	{
				1276	struct kmem_cache *cachep;
				1277
				1278	/* 6) resize the head arrays to their final sizes */
				1279	mutex_lock(&slab_mutex);
				1280	list_for_each_entry(cachep, &slab_caches, list)
				1281	if (enable_cpucache(cachep, GFP_NOWAIT))
				1282	BUG();
				1283	mutex_unlock(&slab_mutex);
				1284
				1285	/* Done! */
				1286	slab_state = FULL;
				1287
				1288	#ifdef CONFIG_NUMA
				1289	/*
				1290	* Register a memory hotplug callback that initializes and frees
				1291	* node.
				1292	*/
				1293	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
				1294	#endif
				1295
				1296	/*
				1297	* The reap timers are started later, with a module init call: That part
				1298	* of the kernel is not yet operational.
				1299	*/
				1300	}
				1301
				1302	static int __init cpucache_init(void)
				1303	{
				1304	int ret;
				1305
				1306	/*
				1307	* Register the timers that return unneeded pages to the page allocator
				1308	*/
				1309	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online",
				1310	slab_online_cpu, slab_offline_cpu);
				1311	WARN_ON(ret < 0);
				1312
				1313	return 0;
				1314	}
				1315	__initcall(cpucache_init);
				1316
				1317	static noinline void
				1318	slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
				1319	{
				1320	#if DEBUG
				1321	struct kmem_cache_node *n;
				1322	unsigned long flags;
				1323	int node;
				1324	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				1325	DEFAULT_RATELIMIT_BURST);
				1326
				1327	if ((gfpflags & __GFP_NOWARN) \|\| !__ratelimit(&slab_oom_rs))
				1328	return;
				1329
				1330	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
				1331	nodeid, gfpflags, &gfpflags);
				1332	pr_warn(" cache: %s, object size: %d, order: %d\n",
				1333	cachep->name, cachep->size, cachep->gfporder);
				1334
				1335	for_each_kmem_cache_node(cachep, node, n) {
				1336	unsigned long total_slabs, free_slabs, free_objs;
				1337
				1338	spin_lock_irqsave(&n->list_lock, flags);
				1339	total_slabs = n->total_slabs;
				1340	free_slabs = n->free_slabs;
				1341	free_objs = n->free_objects;
				1342	spin_unlock_irqrestore(&n->list_lock, flags);
				1343
				1344	pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
				1345	node, total_slabs - free_slabs, total_slabs,
				1346	(total_slabs * cachep->num) - free_objs,
				1347	total_slabs * cachep->num);
				1348	}
				1349	#endif
				1350	}
				1351
				1352	/*
				1353	* Interface to system's page allocator. No need to hold the
				1354	* kmem_cache_node ->list_lock.
				1355	*
				1356	* If we requested dmaable memory, we will get it. Even if we
				1357	* did not request dmaable memory, we might get it, but that
				1358	* would be relatively rare and ignorable.
				1359	*/
				1360	static struct page kmem_getpages(struct kmem_cache cachep, gfp_t flags,
				1361	int nodeid)
				1362	{
				1363	struct page *page;
				1364
				1365	flags \|= cachep->allocflags;
				1366
				1367	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
				1368	if (!page) {
				1369	slab_out_of_memory(cachep, flags, nodeid);
				1370	return NULL;
				1371	}
				1372
				1373	if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
				1374	__free_pages(page, cachep->gfporder);
				1375	return NULL;
				1376	}
				1377
				1378	__SetPageSlab(page);
				1379	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
				1380	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
				1381	SetPageSlabPfmemalloc(page);
				1382
				1383	return page;
				1384	}
				1385
				1386	/*
				1387	* Interface to system's page release.
				1388	*/
				1389	static void kmem_freepages(struct kmem_cache cachep, struct page page)
				1390	{
				1391	int order = cachep->gfporder;
				1392
				1393	BUG_ON(!PageSlab(page));
				1394	__ClearPageSlabPfmemalloc(page);
				1395	__ClearPageSlab(page);
				1396	page_mapcount_reset(page);
				1397	page->mapping = NULL;
				1398
				1399	if (current->reclaim_state)
				1400	current->reclaim_state->reclaimed_slab += 1 << order;
				1401	uncharge_slab_page(page, order, cachep);
				1402	__free_pages(page, order);
				1403	}
				1404
				1405	static void kmem_rcu_free(struct rcu_head *head)
				1406	{
				1407	struct kmem_cache *cachep;
				1408	struct page *page;
				1409
				1410	page = container_of(head, struct page, rcu_head);
				1411	cachep = page->slab_cache;
				1412
				1413	kmem_freepages(cachep, page);
				1414	}
				1415
				1416	#if DEBUG
				1417	static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
				1418	{
				1419	if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
				1420	(cachep->size % PAGE_SIZE) == 0)
				1421	return true;
				1422
				1423	return false;
				1424	}
				1425
				#ifdef CONFIG_DEBUG_PAGEALLOC
				/*
				 * Map (@map != 0) or unmap the pages backing @objp via kernel_map_pages().
				 * Only acts on caches accepted by is_debug_pagealloc_cache().
				 */
				static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
				{
					if (!is_debug_pagealloc_cache(cachep))
						return;

					kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
				}

				#else
				/* Without CONFIG_DEBUG_PAGEALLOC there is nothing to map or unmap. */
				static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
								int map) {}

				#endif
				1440
				1441	static void poison_obj(struct kmem_cache cachep, void addr, unsigned char val)
				1442	{
				1443	int size = cachep->object_size;
				1444	addr = &((char *)addr)[obj_offset(cachep)];
				1445
				1446	memset(addr, val, size);
				1447	(unsigned char )(addr + size - 1) = POISON_END;
				1448	}
				1449
				1450	static void dump_line(char *data, int offset, int limit)
				1451	{
				1452	int i;
				1453	unsigned char error = 0;
				1454	int bad_count = 0;
				1455
				1456	pr_err("%03x: ", offset);
				1457	for (i = 0; i < limit; i++) {
				1458	if (data[offset + i] != POISON_FREE) {
				1459	error = data[offset + i];
				1460	bad_count++;
				1461	}
				1462	}
				1463	print_hex_dump(KERN_CONT, "", 0, 16, 1,
				1464	&data[offset], limit, 1);
				1465
				1466	if (bad_count == 1) {
				1467	error ^= POISON_FREE;
				1468	if (!(error & (error - 1))) {
				1469	pr_err("Single bit error detected. Probably bad RAM.\n");
				1470	#ifdef CONFIG_X86
				1471	pr_err("Run memtest86+ or a similar memory test tool.\n");
				1472	#else
				1473	pr_err("Run a memory test tool.\n");
				1474	#endif
				1475	}
				1476	}
				1477	}
				1478	#endif
				1479
				1480	#if DEBUG
				1481
				1482	static void print_objinfo(struct kmem_cache cachep, void objp, int lines)
				1483	{
				1484	int i, size;
				1485	char *realobj;
				1486
				1487	if (cachep->flags & SLAB_RED_ZONE) {
				1488	pr_err("Redzone: 0x%llx/0x%llx\n",
				1489	*dbg_redzone1(cachep, objp),
				1490	*dbg_redzone2(cachep, objp));
				1491	}
				1492
				1493	if (cachep->flags & SLAB_STORE_USER)
				1494	pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
				1495	realobj = (char *)objp + obj_offset(cachep);
				1496	size = cachep->object_size;
				1497	for (i = 0; i < size && lines; i += 16, lines--) {
				1498	int limit;
				1499	limit = 16;
				1500	if (i + limit > size)
				1501	limit = size - i;
				1502	dump_line(realobj, i, limit);
				1503	}
				1504	}
				1505
				1506	static void check_poison_obj(struct kmem_cache cachep, void objp)
				1507	{
				1508	char *realobj;
				1509	int size, i;
				1510	int lines = 0;
				1511
				1512	if (is_debug_pagealloc_cache(cachep))
				1513	return;
				1514
				1515	realobj = (char *)objp + obj_offset(cachep);
				1516	size = cachep->object_size;
				1517
				1518	for (i = 0; i < size; i++) {
				1519	char exp = POISON_FREE;
				1520	if (i == size - 1)
				1521	exp = POISON_END;
				1522	if (realobj[i] != exp) {
				1523	int limit;
				1524	/* Mismatch ! */
				1525	/* Print header */
				1526	if (lines == 0) {
				1527	pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
				1528	print_tainted(), cachep->name,
				1529	realobj, size);
				1530	print_objinfo(cachep, objp, 0);
				1531	}
				1532	/* Hexdump the affected line */
				1533	i = (i / 16) * 16;
				1534	limit = 16;
				1535	if (i + limit > size)
				1536	limit = size - i;
				1537	dump_line(realobj, i, limit);
				1538	i += 16;
				1539	lines++;
				1540	/* Limit to 5 lines */
				1541	if (lines > 5)
				1542	break;
				1543	}
				1544	}
				1545	if (lines != 0) {
				1546	/* Print some data about the neighboring objects, if they
				1547	* exist:
				1548	*/
				1549	struct page *page = virt_to_head_page(objp);
				1550	unsigned int objnr;
				1551
				1552	objnr = obj_to_index(cachep, page, objp);
				1553	if (objnr) {
				1554	objp = index_to_obj(cachep, page, objnr - 1);
				1555	realobj = (char *)objp + obj_offset(cachep);
				1556	pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
				1557	print_objinfo(cachep, objp, 2);
				1558	}
				1559	if (objnr + 1 < cachep->num) {
				1560	objp = index_to_obj(cachep, page, objnr + 1);
				1561	realobj = (char *)objp + obj_offset(cachep);
				1562	pr_err("Next obj: start=%px, len=%d\n", realobj, size);
				1563	print_objinfo(cachep, objp, 2);
				1564	}
				1565	}
				1566	}
				1567	#endif
				1568
				#if DEBUG
				/*
				 * Debug checks run on every object of a slab page that is about to be
				 * destroyed: poison verification (SLAB_POISON) and red-zone verification
				 * (SLAB_RED_ZONE).
				 */
				static void slab_destroy_debugcheck(struct kmem_cache *cachep,
										struct page *page)
				{
					int idx;

					/* Re-poison the object that was hosting the freelist. */
					if (OBJFREELIST_SLAB(cachep) && (cachep->flags & SLAB_POISON))
						poison_obj(cachep, page->freelist - obj_offset(cachep),
							   POISON_FREE);

					for (idx = 0; idx < cachep->num; idx++) {
						void *obj = index_to_obj(cachep, page, idx);

						if (cachep->flags & SLAB_POISON) {
							check_poison_obj(cachep, obj);
							slab_kernel_map(cachep, obj, 1);
						}

						if (!(cachep->flags & SLAB_RED_ZONE))
							continue;

						if (*dbg_redzone1(cachep, obj) != RED_INACTIVE)
							slab_error(cachep, "start of a freed object was overwritten");
						if (*dbg_redzone2(cachep, obj) != RED_INACTIVE)
							slab_error(cachep, "end of a freed object was overwritten");
					}
				}
				#else
				/* Non-debug builds perform no checks. */
				static void slab_destroy_debugcheck(struct kmem_cache *cachep,
										struct page *page)
				{
				}
				#endif
				1601
				1602	/**
				1603	* slab_destroy - destroy and release all objects in a slab
				1604	* @cachep: cache pointer being destroyed
				1605	* @page: page pointer being destroyed
				1606	*
				1607	* Destroy all the objs in a slab page, and release the mem back to the system.
				1608	* Before calling the slab page must have been unlinked from the cache. The
				1609	* kmem_cache_node ->list_lock is not held/needed.
				1610	*/
				1611	static void slab_destroy(struct kmem_cache cachep, struct page page)
				1612	{
				1613	void *freelist;
				1614
				1615	freelist = page->freelist;
				1616	slab_destroy_debugcheck(cachep, page);
				1617	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
				1618	call_rcu(&page->rcu_head, kmem_rcu_free);
				1619	else
				1620	kmem_freepages(cachep, page);
				1621
				1622	/*
				1623	* From now on, we don't use freelist
				1624	* although actual page can be freed in rcu context
				1625	*/
				1626	if (OFF_SLAB(cachep))
				1627	kmem_cache_free(cachep->freelist_cache, freelist);
				1628	}
				1629
				1630	static void slabs_destroy(struct kmem_cache cachep, struct list_head list)
				1631	{
				1632	struct page page, n;
				1633
				1634	list_for_each_entry_safe(page, n, list, slab_list) {
				1635	list_del(&page->slab_list);
				1636	slab_destroy(cachep, page);
				1637	}
				1638	}
				1639
				1640	/**
				1641	* calculate_slab_order - calculate size (page order) of slabs
				1642	* @cachep: pointer to the cache that is being created
				1643	* @size: size of objects to be created in this cache.
				1644	* @flags: slab allocation flags
				1645	*
				1646	* Also calculates the number of objects per slab.
				1647	*
				1648	* This could be made much more intelligent. For now, try to avoid using
				1649	* high order pages for slabs. When the gfp() functions are more friendly
				1650	* towards high-order requests, this should be changed.
				1651	*
				1652	* Return: number of left-over bytes in a slab
				1653	*/
				1654	static size_t calculate_slab_order(struct kmem_cache *cachep,
				1655	size_t size, slab_flags_t flags)
				1656	{
				1657	size_t left_over = 0;
				1658	int gfporder;
				1659
				1660	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
				1661	unsigned int num;
				1662	size_t remainder;
				1663
				1664	num = cache_estimate(gfporder, size, flags, &remainder);
				1665	if (!num)
				1666	continue;
				1667
				1668	/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
				1669	if (num > SLAB_OBJ_MAX_NUM)
				1670	break;
				1671
				1672	if (flags & CFLGS_OFF_SLAB) {
				1673	struct kmem_cache *freelist_cache;
				1674	size_t freelist_size;
				1675
				1676	freelist_size = num * sizeof(freelist_idx_t);
				1677	freelist_cache = kmalloc_slab(freelist_size, 0u);
				1678	if (!freelist_cache)
				1679	continue;
				1680
				1681	/*
				1682	* Needed to avoid possible looping condition
				1683	* in cache_grow_begin()
				1684	*/
				1685	if (OFF_SLAB(freelist_cache))
				1686	continue;
				1687
				1688	/* check if off slab has enough benefit */
				1689	if (freelist_cache->size > cachep->size / 2)
				1690	continue;
				1691	}
				1692
				1693	/* Found something acceptable - save it away */
				1694	cachep->num = num;
				1695	cachep->gfporder = gfporder;
				1696	left_over = remainder;
				1697
				1698	/*
				1699	* A VFS-reclaimable slab tends to have most allocations
				1700	* as GFP_NOFS and we really don't want to have to be allocating
				1701	* higher-order pages when we are unable to shrink dcache.
				1702	*/
				1703	if (flags & SLAB_RECLAIM_ACCOUNT)
				1704	break;
				1705
				1706	/*
				1707	* Large number of objects is good, but very large slabs are
				1708	* currently bad for the gfp()s.
				1709	*/
				1710	if (gfporder >= slab_max_order)
				1711	break;
				1712
				1713	/*
				1714	* Acceptable internal fragmentation?
				1715	*/
				1716	if (left_over * 8 <= (PAGE_SIZE << gfporder))
				1717	break;
				1718	}
				1719	return left_over;
				1720	}
				1721
				1722	static struct array_cache __percpu *alloc_kmem_cache_cpus(
				1723	struct kmem_cache *cachep, int entries, int batchcount)
				1724	{
				1725	int cpu;
				1726	size_t size;
				1727	struct array_cache __percpu *cpu_cache;
				1728
				1729	size = sizeof(void ) entries + sizeof(struct array_cache);
				1730	cpu_cache = __alloc_percpu(size, sizeof(void *));
				1731
				1732	if (!cpu_cache)
				1733	return NULL;
				1734
				1735	for_each_possible_cpu(cpu) {
				1736	init_arraycache(per_cpu_ptr(cpu_cache, cpu),
				1737	entries, batchcount);
				1738	}
				1739
				1740	return cpu_cache;
				1741	}
				1742
				1743	static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
				1744	{
				1745	if (slab_state >= FULL)
				1746	return enable_cpucache(cachep, gfp);
				1747
				1748	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
				1749	if (!cachep->cpu_cache)
				1750	return 1;
				1751
				1752	if (slab_state == DOWN) {
				1753	/* Creation of first cache (kmem_cache). */
				1754	set_up_node(kmem_cache, CACHE_CACHE);
				1755	} else if (slab_state == PARTIAL) {
				1756	/* For kmem_cache_node */
				1757	set_up_node(cachep, SIZE_NODE);
				1758	} else {
				1759	int node;
				1760
				1761	for_each_online_node(node) {
				1762	cachep->node[node] = kmalloc_node(
				1763	sizeof(struct kmem_cache_node), gfp, node);
				1764	BUG_ON(!cachep->node[node]);
				1765	kmem_cache_node_init(cachep->node[node]);
				1766	}
				1767	}
				1768
				1769	cachep->node[numa_mem_id()]->next_reap =
				1770	jiffies + REAPTIMEOUT_NODE +
				1771	((unsigned long)cachep) % REAPTIMEOUT_NODE;
				1772
				1773	cpu_cache_get(cachep)->avail = 0;
				1774	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
				1775	cpu_cache_get(cachep)->batchcount = 1;
				1776	cpu_cache_get(cachep)->touched = 0;
				1777	cachep->batchcount = 1;
				1778	cachep->limit = BOOT_CPUCACHE_ENTRIES;
				1779	return 0;
				1780	}
				1781
				1782	slab_flags_t kmem_cache_flags(unsigned int object_size,
				1783	slab_flags_t flags, const char *name,
				1784	void (ctor)(void ))
				1785	{
				1786	return flags;
				1787	}
				1788
				1789	struct kmem_cache *
				1790	__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
				1791	slab_flags_t flags, void (ctor)(void ))
				1792	{
				1793	struct kmem_cache *cachep;
				1794
				1795	cachep = find_mergeable(size, align, flags, name, ctor);
				1796	if (cachep) {
				1797	cachep->refcount++;
				1798
				1799	/*
				1800	* Adjust the object sizes so that we clear
				1801	* the complete object on kzalloc.
				1802	*/
				1803	cachep->object_size = max_t(int, cachep->object_size, size);
				1804	}
				1805	return cachep;
				1806	}
				1807
				1808	static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
				1809	size_t size, slab_flags_t flags)
				1810	{
				1811	size_t left;
				1812
				1813	cachep->num = 0;
				1814
				1815	/*
				1816	* If slab auto-initialization on free is enabled, store the freelist
				1817	* off-slab, so that its contents don't end up in one of the allocated
				1818	* objects.
				1819	*/
				1820	if (unlikely(slab_want_init_on_free(cachep)))
				1821	return false;
				1822
				1823	if (cachep->ctor \|\| flags & SLAB_TYPESAFE_BY_RCU)
				1824	return false;
				1825
				1826	left = calculate_slab_order(cachep, size,
				1827	flags \| CFLGS_OBJFREELIST_SLAB);
				1828	if (!cachep->num)
				1829	return false;
				1830
				1831	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
				1832	return false;
				1833
				1834	cachep->colour = left / cachep->colour_off;
				1835
				1836	return true;
				1837	}
				1838
				1839	static bool set_off_slab_cache(struct kmem_cache *cachep,
				1840	size_t size, slab_flags_t flags)
				1841	{
				1842	size_t left;
				1843
				1844	cachep->num = 0;
				1845
				1846	/*
				1847	* Always use on-slab management when SLAB_NOLEAKTRACE
				1848	* to avoid recursive calls into kmemleak.
				1849	*/
				1850	if (flags & SLAB_NOLEAKTRACE)
				1851	return false;
				1852
				1853	/*
				1854	* Size is large, assume best to place the slab management obj
				1855	* off-slab (should allow better packing of objs).
				1856	*/
				1857	left = calculate_slab_order(cachep, size, flags \| CFLGS_OFF_SLAB);
				1858	if (!cachep->num)
				1859	return false;
				1860
				1861	/*
				1862	* If the slab has been placed off-slab, and we have enough space then
				1863	* move it on-slab. This is at the expense of any extra colouring.
				1864	*/
				1865	if (left >= cachep->num * sizeof(freelist_idx_t))
				1866	return false;
				1867
				1868	cachep->colour = left / cachep->colour_off;
				1869
				1870	return true;
				1871	}
				1872
				1873	static bool set_on_slab_cache(struct kmem_cache *cachep,
				1874	size_t size, slab_flags_t flags)
				1875	{
				1876	size_t left;
				1877
				1878	cachep->num = 0;
				1879
				1880	left = calculate_slab_order(cachep, size, flags);
				1881	if (!cachep->num)
				1882	return false;
				1883
				1884	cachep->colour = left / cachep->colour_off;
				1885
				1886	return true;
				1887	}
				1888
				1889	/**
				1890	* __kmem_cache_create - Create a cache.
				1891	* @cachep: cache management descriptor
				1892	* @flags: SLAB flags
				1893	*
				1894	* Returns a ptr to the cache on success, NULL on failure.
				1895	* Cannot be called within a int, but can be interrupted.
				1896	* The @ctor is run when new pages are allocated by the cache.
				1897	*
				1898	* The flags are
				1899	*
				1900	* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
				1901	* to catch references to uninitialised memory.
				1902	*
				1903	* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
				1904	* for buffer overruns.
				1905	*
				1906	* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
				1907	* cacheline. This can be beneficial if you're counting cycles as closely
				1908	* as davem.
				1909	*
				1910	* Return: a pointer to the created cache or %NULL in case of error
				1911	*/
				1912	int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
				1913	{
				1914	size_t ralign = BYTES_PER_WORD;
				1915	gfp_t gfp;
				1916	int err;
				1917	unsigned int size = cachep->size;
				1918
				1919	#if DEBUG
				1920	#if FORCED_DEBUG
				1921	/*
				1922	* Enable redzoning and last user accounting, except for caches with
				1923	* large objects, if the increased size would increase the object size
				1924	* above the next power of two: caches with object sizes just above a
				1925	* power of two have a significant amount of internal fragmentation.
				1926	*/
				1927	if (size < 4096 \|\| fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
				1928	2 * sizeof(unsigned long long)))
				1929	flags \|= SLAB_RED_ZONE \| SLAB_STORE_USER;
				1930	if (!(flags & SLAB_TYPESAFE_BY_RCU))
				1931	flags \|= SLAB_POISON;
				1932	#endif
				1933	#endif
				1934
				1935	/*
				1936	* Check that size is in terms of words. This is needed to avoid
				1937	* unaligned accesses for some archs when redzoning is used, and makes
				1938	* sure any on-slab bufctl's are also correctly aligned.
				1939	*/
				1940	size = ALIGN(size, BYTES_PER_WORD);
				1941
				1942	if (flags & SLAB_RED_ZONE) {
				1943	ralign = REDZONE_ALIGN;
				1944	/* If redzoning, ensure that the second redzone is suitably
				1945	* aligned, by adjusting the object size accordingly. */
				1946	size = ALIGN(size, REDZONE_ALIGN);
				1947	}
				1948
				1949	/* 3) caller mandated alignment */
				1950	if (ralign < cachep->align) {
				1951	ralign = cachep->align;
				1952	}
				1953	/* disable debug if necessary */
				1954	if (ralign > __alignof__(unsigned long long))
				1955	flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				1956	/*
				1957	* 4) Store it.
				1958	*/
				1959	cachep->align = ralign;
				1960	cachep->colour_off = cache_line_size();
				1961	/* Offset must be a multiple of the alignment. */
				1962	if (cachep->colour_off < cachep->align)
				1963	cachep->colour_off = cachep->align;
				1964
				1965	if (slab_is_available())
				1966	gfp = GFP_KERNEL;
				1967	else
				1968	gfp = GFP_NOWAIT;
				1969
				1970	#if DEBUG
				1971
				1972	/*
				1973	* Both debugging options require word-alignment which is calculated
				1974	* into align above.
				1975	*/
				1976	if (flags & SLAB_RED_ZONE) {
				1977	/* add space for red zone words */
				1978	cachep->obj_offset += sizeof(unsigned long long);
				1979	size += 2 * sizeof(unsigned long long);
				1980	}
				1981	if (flags & SLAB_STORE_USER) {
				1982	/* user store requires one word storage behind the end of
				1983	* the real object. But if the second red zone needs to be
				1984	* aligned to 64 bits, we must allow that much space.
				1985	*/
				1986	if (flags & SLAB_RED_ZONE)
				1987	size += REDZONE_ALIGN;
				1988	else
				1989	size += BYTES_PER_WORD;
				1990	}
				1991	#endif
				1992
				1993	kasan_cache_create(cachep, &size, &flags);
				1994
				1995	size = ALIGN(size, cachep->align);
				1996	/*
				1997	* We should restrict the number of objects in a slab to implement
				1998	* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
				1999	*/
				2000	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
				2001	size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
				2002
				2003	#if DEBUG
				2004	/*
				2005	* To activate debug pagealloc, off-slab management is necessary
				2006	* requirement. In early phase of initialization, small sized slab
				2007	* doesn't get initialized so it would not be possible. So, we need
				2008	* to check size >= 256. It guarantees that all necessary small
				2009	* sized slab is initialized in current slab initialization sequence.
				2010	*/
				2011	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
				2012	size >= 256 && cachep->object_size > cache_line_size()) {
				2013	if (size < PAGE_SIZE \|\| size % PAGE_SIZE == 0) {
				2014	size_t tmp_size = ALIGN(size, PAGE_SIZE);
				2015
				2016	if (set_off_slab_cache(cachep, tmp_size, flags)) {
				2017	flags \|= CFLGS_OFF_SLAB;
				2018	cachep->obj_offset += tmp_size - size;
				2019	size = tmp_size;
				2020	goto done;
				2021	}
				2022	}
				2023	}
				2024	#endif
				2025
				2026	if (set_objfreelist_slab_cache(cachep, size, flags)) {
				2027	flags \|= CFLGS_OBJFREELIST_SLAB;
				2028	goto done;
				2029	}
				2030
				2031	if (set_off_slab_cache(cachep, size, flags)) {
				2032	flags \|= CFLGS_OFF_SLAB;
				2033	goto done;
				2034	}
				2035
				2036	if (set_on_slab_cache(cachep, size, flags))
				2037	goto done;
				2038
				2039	return -E2BIG;
				2040
				2041	done:
				2042	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
				2043	cachep->flags = flags;
				2044	cachep->allocflags = __GFP_COMP;
				2045	if (flags & SLAB_CACHE_DMA)
				2046	cachep->allocflags \|= GFP_DMA;
				2047	if (flags & SLAB_CACHE_DMA32)
				2048	cachep->allocflags \|= GFP_DMA32;
				2049	if (flags & SLAB_RECLAIM_ACCOUNT)
				2050	cachep->allocflags \|= __GFP_RECLAIMABLE;
				2051	cachep->size = size;
				2052	cachep->reciprocal_buffer_size = reciprocal_value(size);
				2053
				2054	#if DEBUG
				2055	/*
				2056	* If we're going to use the generic kernel_map_pages()
				2057	* poisoning, then it's going to smash the contents of
				2058	* the redzone and userword anyhow, so switch them off.
				2059	*/
				2060	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
				2061	(cachep->flags & SLAB_POISON) &&
				2062	is_debug_pagealloc_cache(cachep))
				2063	cachep->flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				2064	#endif
				2065
				2066	if (OFF_SLAB(cachep)) {
				2067	cachep->freelist_cache =
				2068	kmalloc_slab(cachep->freelist_size, 0u);
				2069	}
				2070
				2071	err = setup_cpu_cache(cachep, gfp);
				2072	if (err) {
				2073	__kmem_cache_release(cachep);
				2074	return err;
				2075	}
				2076
				2077	return 0;
				2078	}
				2079
				#if DEBUG
				/* BUG unless local interrupts are currently disabled. */
				static void check_irq_off(void)
				{
					BUG_ON(!irqs_disabled());
				}

				/* BUG unless local interrupts are currently enabled. */
				static void check_irq_on(void)
				{
					BUG_ON(irqs_disabled());
				}

				/* BUG unless slab_mutex is locked. */
				static void check_mutex_acquired(void)
				{
					BUG_ON(!mutex_is_locked(&slab_mutex));
				}

				/*
				 * On SMP builds: check that interrupts are off and that the list_lock of
				 * the node returned by numa_mem_id() is held.  Empty otherwise.
				 */
				static void check_spinlock_acquired(struct kmem_cache *cachep)
				{
				#ifdef CONFIG_SMP
					check_irq_off();
					assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
				#endif
				}

				/* Same check as check_spinlock_acquired(), for an explicit @node. */
				static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
				{
				#ifdef CONFIG_SMP
					check_irq_off();
					assert_spin_locked(&get_node(cachep, node)->list_lock);
				#endif
				}

				#else
				/* Without DEBUG every check expands to an empty statement. */
				#define check_irq_off()	do { } while (0)
				#define check_irq_on()	do { } while (0)
				#define check_mutex_acquired()	do { } while (0)
				#define check_spinlock_acquired(x) do { } while (0)
				#define check_spinlock_acquired_node(x, y) do { } while (0)
				#endif
				2119
				2120	static void drain_array_locked(struct kmem_cache cachep, struct array_cache ac,
				2121	int node, bool free_all, struct list_head *list)
				2122	{
				2123	int tofree;
				2124
				2125	if (!ac \|\| !ac->avail)
				2126	return;
				2127
				2128	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
				2129	if (tofree > ac->avail)
				2130	tofree = (ac->avail + 1) / 2;
				2131
				2132	free_block(cachep, ac->entry, tofree, node, list);
				2133	ac->avail -= tofree;
				2134	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void ) ac->avail);
				2135	}
				2136
				2137	static void do_drain(void *arg)
				2138	{
				2139	struct kmem_cache *cachep = arg;
				2140	struct array_cache *ac;
				2141	int node = numa_mem_id();
				2142	struct kmem_cache_node *n;
				2143	LIST_HEAD(list);
				2144
				2145	check_irq_off();
				2146	ac = cpu_cache_get(cachep);
				2147	n = get_node(cachep, node);
				2148	spin_lock(&n->list_lock);
				2149	free_block(cachep, ac->entry, ac->avail, node, &list);
				2150	spin_unlock(&n->list_lock);
				2151	slabs_destroy(cachep, &list);
				2152	ac->avail = 0;
				2153	}
				2154
				2155	static void drain_cpu_caches(struct kmem_cache *cachep)
				2156	{
				2157	struct kmem_cache_node *n;
				2158	int node;
				2159	LIST_HEAD(list);
				2160
				2161	on_each_cpu(do_drain, cachep, 1);
				2162	check_irq_on();
				2163	for_each_kmem_cache_node(cachep, node, n)
				2164	if (n->alien)
				2165	drain_alien_cache(cachep, n->alien);
				2166
				2167	for_each_kmem_cache_node(cachep, node, n) {
				2168	spin_lock_irq(&n->list_lock);
				2169	drain_array_locked(cachep, n->shared, node, true, &list);
				2170	spin_unlock_irq(&n->list_lock);
				2171
				2172	slabs_destroy(cachep, &list);
				2173	}
				2174	}
				2175
				2176	/*
				2177	* Remove slabs from the list of free slabs.
				2178	* Specify the number of slabs to drain in tofree.
				2179	*
				2180	* Returns the actual number of slabs released.
				2181	*/
				2182	static int drain_freelist(struct kmem_cache *cache,
				2183	struct kmem_cache_node *n, int tofree)
				2184	{
				2185	struct list_head *p;
				2186	int nr_freed;
				2187	struct page *page;
				2188
				2189	nr_freed = 0;
				2190	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
				2191
				2192	spin_lock_irq(&n->list_lock);
				2193	p = n->slabs_free.prev;
				2194	if (p == &n->slabs_free) {
				2195	spin_unlock_irq(&n->list_lock);
				2196	goto out;
				2197	}
				2198
				2199	page = list_entry(p, struct page, slab_list);
				2200	list_del(&page->slab_list);
				2201	n->free_slabs--;
				2202	n->total_slabs--;
				2203	/*
				2204	* Safe to drop the lock. The slab is no longer linked
				2205	* to the cache.
				2206	*/
				2207	n->free_objects -= cache->num;
				2208	spin_unlock_irq(&n->list_lock);
				2209	slab_destroy(cache, page);
				2210	nr_freed++;
				2211	}
				2212	out:
				2213	return nr_freed;
				2214	}
				2215
				2216	bool __kmem_cache_empty(struct kmem_cache *s)
				2217	{
				2218	int node;
				2219	struct kmem_cache_node *n;
				2220
				2221	for_each_kmem_cache_node(s, node, n)
				2222	if (!list_empty(&n->slabs_full) \|\|
				2223	!list_empty(&n->slabs_partial))
				2224	return false;
				2225	return true;
				2226	}
				2227
				2228	int __kmem_cache_shrink(struct kmem_cache *cachep)
				2229	{
				2230	int ret = 0;
				2231	int node;
				2232	struct kmem_cache_node *n;
				2233
				2234	drain_cpu_caches(cachep);
				2235
				2236	check_irq_on();
				2237	for_each_kmem_cache_node(cachep, node, n) {
				2238	drain_freelist(cachep, n, INT_MAX);
				2239
				2240	ret += !list_empty(&n->slabs_full) \|\|
				2241	!list_empty(&n->slabs_partial);
				2242	}
				2243	return (ret ? 1 : 0);
				2244	}
				2245
				2246	#ifdef CONFIG_MEMCG
				/* Deactivating a memcg cache shrinks it; the shrink result is not used. */
				void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
				{
					(void)__kmem_cache_shrink(cachep);
				}
				2251
				/* Intentionally empty in this allocator. */
				void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
				{
					/* Nothing to do. */
				}
				2255	#endif
				2256
				/*
				 * Shut down @cachep by shrinking it.  The return value is that of
				 * __kmem_cache_shrink(): non-zero while slabs with live objects remain.
				 */
				int __kmem_cache_shutdown(struct kmem_cache *cachep)
				{
					int still_in_use = __kmem_cache_shrink(cachep);

					return still_in_use;
				}
				2261
				2262	void __kmem_cache_release(struct kmem_cache *cachep)
				2263	{
				2264	int i;
				2265	struct kmem_cache_node *n;
				2266
				2267	cache_random_seq_destroy(cachep);
				2268
				2269	free_percpu(cachep->cpu_cache);
				2270
				2271	/* NUMA: free the node structures */
				2272	for_each_kmem_cache_node(cachep, i, n) {
				2273	kfree(n->shared);
				2274	free_alien_cache(n->alien);
				2275	kfree(n);
				2276	cachep->node[i] = NULL;
				2277	}
				2278	}
				2279
				2280	/*
				2281	* Get the memory for a slab management obj.
				2282	*
				2283	* For a slab cache when the slab descriptor is off-slab, the
				2284	* slab descriptor can't come from the same cache which is being created,
				2285	* Because if it is the case, that means we defer the creation of
				2286	* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
				2287	* And we eventually call down to __kmem_cache_create(), which
				2288	* in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
				2289	* This is a "chicken-and-egg" problem.
				2290	*
				2291	* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
				2292	* which are all initialized during kmem_cache_init().
				2293	*/
				2294	static void alloc_slabmgmt(struct kmem_cache cachep,
				2295	struct page *page, int colour_off,
				2296	gfp_t local_flags, int nodeid)
				2297	{
				2298	void *freelist;
				2299	void *addr = page_address(page);
				2300
				2301	page->s_mem = addr + colour_off;
				2302	page->active = 0;
				2303
				2304	if (OBJFREELIST_SLAB(cachep))
				2305	freelist = NULL;
				2306	else if (OFF_SLAB(cachep)) {
				2307	/* Slab management obj is off-slab. */
				2308	freelist = kmem_cache_alloc_node(cachep->freelist_cache,
				2309	local_flags, nodeid);
				2310	if (!freelist)
				2311	return NULL;
				2312	} else {
				2313	/* We will use last bytes at the slab for freelist */
				2314	freelist = addr + (PAGE_SIZE << cachep->gfporder) -
				2315	cachep->freelist_size;
				2316	}
				2317
				2318	return freelist;
				2319	}
				2320
				2321	static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
				2322	{
				2323	return ((freelist_idx_t *)page->freelist)[idx];
				2324	}
				2325
				2326	static inline void set_free_obj(struct page *page,
				2327	unsigned int idx, freelist_idx_t val)
				2328	{
				2329	((freelist_idx_t *)(page->freelist))[idx] = val;
				2330	}
				2331
				/*
				 * DEBUG-only initialisation of every object in a new slab: clear the
				 * user word, arm the red zones, run the constructor (unless the cache is
				 * poisoned), verify the constructor stayed inside the object, and poison
				 * the object when SLAB_POISON is set.  Compiles to nothing without DEBUG.
				 */
				static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
				{
				#if DEBUG
					int i;

					for (i = 0; i < cachep->num; i++) {
						void *objp = index_to_obj(cachep, page, i);

						if (cachep->flags & SLAB_STORE_USER)
							*dbg_userword(cachep, objp) = NULL;

						if (cachep->flags & SLAB_RED_ZONE) {
							*dbg_redzone1(cachep, objp) = RED_INACTIVE;
							*dbg_redzone2(cachep, objp) = RED_INACTIVE;
						}
						/*
						 * Constructors are not allowed to allocate memory from the same
						 * cache which they are a constructor for.  Otherwise, deadlock.
						 * They must also be threaded.
						 */
						if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
							kasan_unpoison_object_data(cachep,
										   objp + obj_offset(cachep));
							cachep->ctor(objp + obj_offset(cachep));
							kasan_poison_object_data(
								cachep, objp + obj_offset(cachep));
						}

						if (cachep->flags & SLAB_RED_ZONE) {
							if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
								slab_error(cachep, "constructor overwrote the end of an object");
							if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
								slab_error(cachep, "constructor overwrote the start of an object");
						}
						/* need to poison the objs? */
						if (cachep->flags & SLAB_POISON) {
							poison_obj(cachep, objp, POISON_FREE);
							slab_kernel_map(cachep, objp, 0);
						}
					}
				#endif
				}
				2374
				2375	#ifdef CONFIG_SLAB_FREELIST_RANDOM
				2376	/* Hold information during a freelist initialization */
				2377	union freelist_init_state {
				2378	struct {
				2379	unsigned int pos;
				2380	unsigned int *list;
				2381	unsigned int count;
				2382	};
				2383	struct rnd_state rnd_state;
				2384	};
				2385
				2386	/*
				2387	* Initialize the state based on the randomization methode available.
				2388	* return true if the pre-computed list is available, false otherwize.
				2389	*/
				2390	static bool freelist_state_initialize(union freelist_init_state *state,
				2391	struct kmem_cache *cachep,
				2392	unsigned int count)
				2393	{
				2394	bool ret;
				2395	unsigned int rand;
				2396
				2397	/* Use best entropy available to define a random shift */
				2398	rand = get_random_int();
				2399
				2400	/* Use a random state if the pre-computed list is not available */
				2401	if (!cachep->random_seq) {
				2402	prandom_seed_state(&state->rnd_state, rand);
				2403	ret = false;
				2404	} else {
				2405	state->list = cachep->random_seq;
				2406	state->count = count;
				2407	state->pos = rand % count;
				2408	ret = true;
				2409	}
				2410	return ret;
				2411	}
				2412
				2413	/* Get the next entry on the list and randomize it using a random shift */
				2414	static freelist_idx_t next_random_slot(union freelist_init_state *state)
				2415	{
				2416	if (state->pos >= state->count)
				2417	state->pos = 0;
				2418	return state->list[state->pos++];
				2419	}
				2420
				2421	/* Swap two freelist entries */
				2422	static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
				2423	{
				2424	swap(((freelist_idx_t *)page->freelist)[a],
				2425	((freelist_idx_t *)page->freelist)[b]);
				2426	}
				2427
				2428	/*
				2429	* Shuffle the freelist initialization state based on pre-computed lists.
				2430	* return true if the list was successfully shuffled, false otherwise.
				2431	*/
				2432	static bool shuffle_freelist(struct kmem_cache cachep, struct page page)
				2433	{
				2434	unsigned int objfreelist = 0, i, rand, count = cachep->num;
				2435	union freelist_init_state state;
				2436	bool precomputed;
				2437
				2438	if (count < 2)
				2439	return false;
				2440
				2441	precomputed = freelist_state_initialize(&state, cachep, count);
				2442
				2443	/* Take a random entry as the objfreelist */
				2444	if (OBJFREELIST_SLAB(cachep)) {
				2445	if (!precomputed)
				2446	objfreelist = count - 1;
				2447	else
				2448	objfreelist = next_random_slot(&state);
				2449	page->freelist = index_to_obj(cachep, page, objfreelist) +
				2450	obj_offset(cachep);
				2451	count--;
				2452	}
				2453
				2454	/*
				2455	* On early boot, generate the list dynamically.
				2456	* Later use a pre-computed list for speed.
				2457	*/
				2458	if (!precomputed) {
				2459	for (i = 0; i < count; i++)
				2460	set_free_obj(page, i, i);
				2461
				2462	/* Fisher-Yates shuffle */
				2463	for (i = count - 1; i > 0; i--) {
				2464	rand = prandom_u32_state(&state.rnd_state);
				2465	rand %= (i + 1);
				2466	swap_free_obj(page, i, rand);
				2467	}
				2468	} else {
				2469	for (i = 0; i < count; i++)
				2470	set_free_obj(page, i, next_random_slot(&state));
				2471	}
				2472
				2473	if (OBJFREELIST_SLAB(cachep))
				2474	set_free_obj(page, cachep->num - 1, objfreelist);
				2475
				2476	return true;
				2477	}
				2478	#else
				2479	static inline bool shuffle_freelist(struct kmem_cache *cachep,
				2480	struct page *page)
				2481	{
				2482	return false;
				2483	}
				2484	#endif /* CONFIG_SLAB_FREELIST_RANDOM */
				2485
				2486	static void cache_init_objs(struct kmem_cache *cachep,
				2487	struct page *page)
				2488	{
				2489	int i;
				2490	void *objp;
				2491	bool shuffled;
				2492
				2493	cache_init_objs_debug(cachep, page);
				2494
				2495	/* Try to randomize the freelist if enabled */
				2496	shuffled = shuffle_freelist(cachep, page);
				2497
				2498	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
				2499	page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
				2500	obj_offset(cachep);
				2501	}
				2502
				2503	for (i = 0; i < cachep->num; i++) {
				2504	objp = index_to_obj(cachep, page, i);
				2505	objp = kasan_init_slab_obj(cachep, objp);
				2506
				2507	/* constructor could break poison info */
				2508	if (DEBUG == 0 && cachep->ctor) {
				2509	kasan_unpoison_object_data(cachep, objp);
				2510	cachep->ctor(objp);
				2511	kasan_poison_object_data(cachep, objp);
				2512	}
				2513
				2514	if (!shuffled)
				2515	set_free_obj(page, i, i);
				2516	}
				2517	}
				2518
				2519	static void slab_get_obj(struct kmem_cache cachep, struct page *page)
				2520	{
				2521	void *objp;
				2522
				2523	objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
				2524	page->active++;
				2525
				2526	return objp;
				2527	}
				2528
				2529	static void slab_put_obj(struct kmem_cache *cachep,
				2530	struct page page, void objp)
				2531	{
				2532	unsigned int objnr = obj_to_index(cachep, page, objp);
				2533	#if DEBUG
				2534	unsigned int i;
				2535
				2536	/* Verify double free bug */
				2537	for (i = page->active; i < cachep->num; i++) {
				2538	if (get_free_obj(page, i) == objnr) {
				2539	pr_err("slab: double free detected in cache '%s', objp %px\n",
				2540	cachep->name, objp);
				2541	BUG();
				2542	}
				2543	}
				2544	#endif
				2545	page->active--;
				2546	if (!page->freelist)
				2547	page->freelist = objp + obj_offset(cachep);
				2548
				2549	set_free_obj(page, page->active, objnr);
				2550	}
				2551
				2552	/*
				2553	* Map pages beginning at addr to the given cache and slab. This is required
				2554	* for the slab allocator to be able to lookup the cache and slab of a
				2555	* virtual address for kfree, ksize, and slab debugging.
				2556	*/
				2557	static void slab_map_pages(struct kmem_cache cache, struct page page,
				2558	void *freelist)
				2559	{
				2560	page->slab_cache = cache;
				2561	page->freelist = freelist;
				2562	}
				2563
				2564	/*
				2565	* Grow (by 1) the number of slabs within a cache. This is called by
				2566	* kmem_cache_alloc() when there are no active objs left in a cache.
				2567	*/
				2568	static struct page cache_grow_begin(struct kmem_cache cachep,
				2569	gfp_t flags, int nodeid)
				2570	{
				2571	void *freelist;
				2572	size_t offset;
				2573	gfp_t local_flags;
				2574	int page_node;
				2575	struct kmem_cache_node *n;
				2576	struct page *page;
				2577
				2578	/*
				2579	* Be lazy and only check for valid flags here, keeping it out of the
				2580	* critical path in kmem_cache_alloc().
				2581	*/
				2582	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
				2583	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
				2584	flags &= ~GFP_SLAB_BUG_MASK;
				2585	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
				2586	invalid_mask, &invalid_mask, flags, &flags);
				2587	dump_stack();
				2588	}
				2589	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
				2590	local_flags = flags & (GFP_CONSTRAINT_MASK\|GFP_RECLAIM_MASK);
				2591
				2592	check_irq_off();
				2593	if (gfpflags_allow_blocking(local_flags))
				2594	local_irq_enable();
				2595
				2596	/*
				2597	* Get mem for the objs. Attempt to allocate a physical page from
				2598	* 'nodeid'.
				2599	*/
				2600	page = kmem_getpages(cachep, local_flags, nodeid);
				2601	if (!page)
				2602	goto failed;
				2603
				2604	page_node = page_to_nid(page);
				2605	n = get_node(cachep, page_node);
				2606
				2607	/* Get colour for the slab, and cal the next value. */
				2608	n->colour_next++;
				2609	if (n->colour_next >= cachep->colour)
				2610	n->colour_next = 0;
				2611
				2612	offset = n->colour_next;
				2613	if (offset >= cachep->colour)
				2614	offset = 0;
				2615
				2616	offset *= cachep->colour_off;
				2617
				2618	/*
				2619	* Call kasan_poison_slab() before calling alloc_slabmgmt(), so
				2620	* page_address() in the latter returns a non-tagged pointer,
				2621	* as it should be for slab pages.
				2622	*/
				2623	kasan_poison_slab(page);
				2624
				2625	/* Get slab management. */
				2626	freelist = alloc_slabmgmt(cachep, page, offset,
				2627	local_flags & ~GFP_CONSTRAINT_MASK, page_node);
				2628	if (OFF_SLAB(cachep) && !freelist)
				2629	goto opps1;
				2630
				2631	slab_map_pages(cachep, page, freelist);
				2632
				2633	cache_init_objs(cachep, page);
				2634
				2635	if (gfpflags_allow_blocking(local_flags))
				2636	local_irq_disable();
				2637
				2638	return page;
				2639
				2640	opps1:
				2641	kmem_freepages(cachep, page);
				2642	failed:
				2643	if (gfpflags_allow_blocking(local_flags))
				2644	local_irq_disable();
				2645	return NULL;
				2646	}
				2647
				2648	static void cache_grow_end(struct kmem_cache cachep, struct page page)
				2649	{
				2650	struct kmem_cache_node *n;
				2651	void *list = NULL;
				2652
				2653	check_irq_off();
				2654
				2655	if (!page)
				2656	return;
				2657
				2658	INIT_LIST_HEAD(&page->slab_list);
				2659	n = get_node(cachep, page_to_nid(page));
				2660
				2661	spin_lock(&n->list_lock);
				2662	n->total_slabs++;
				2663	if (!page->active) {
				2664	list_add_tail(&page->slab_list, &n->slabs_free);
				2665	n->free_slabs++;
				2666	} else
				2667	fixup_slab_list(cachep, n, page, &list);
				2668
				2669	STATS_INC_GROWN(cachep);
				2670	n->free_objects += cachep->num - page->active;
				2671	spin_unlock(&n->list_lock);
				2672
				2673	fixup_objfreelist_debug(cachep, &list);
				2674	}
				2675
				2676	#if DEBUG
				2677
				/*
				 * Perform extra freeing checks:
				 * - detect bad pointers.
				 * - POISON/RED_ZONE checking
				 *
				 * This helper covers the first item: it reports the pointer and calls
				 * BUG() when virt_addr_valid() rejects @objp.
				 */
				static void kfree_debugcheck(const void *objp)
				{
					if (virt_addr_valid(objp))
						return;

					pr_err("kfree_debugcheck: out of range ptr %lxh\n",
					       (unsigned long)objp);
					BUG();
				}
				2691
				2692	static inline void verify_redzone_free(struct kmem_cache cache, void obj)
				2693	{
				2694	unsigned long long redzone1, redzone2;
				2695
				2696	redzone1 = *dbg_redzone1(cache, obj);
				2697	redzone2 = *dbg_redzone2(cache, obj);
				2698
				2699	/*
				2700	* Redzone is ok.
				2701	*/
				2702	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
				2703	return;
				2704
				2705	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
				2706	slab_error(cache, "double free detected");
				2707	else
				2708	slab_error(cache, "memory outside object was overwritten");
				2709
				2710	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
				2711	obj, redzone1, redzone2);
				2712	}
				2713
				2714	static void cache_free_debugcheck(struct kmem_cache cachep, void *objp,
				2715	unsigned long caller)
				2716	{
				2717	unsigned int objnr;
				2718	struct page *page;
				2719
				2720	BUG_ON(virt_to_cache(objp) != cachep);
				2721
				2722	objp -= obj_offset(cachep);
				2723	kfree_debugcheck(objp);
				2724	page = virt_to_head_page(objp);
				2725
				2726	if (cachep->flags & SLAB_RED_ZONE) {
				2727	verify_redzone_free(cachep, objp);
				2728	*dbg_redzone1(cachep, objp) = RED_INACTIVE;
				2729	*dbg_redzone2(cachep, objp) = RED_INACTIVE;
				2730	}
				2731	if (cachep->flags & SLAB_STORE_USER)
				2732	dbg_userword(cachep, objp) = (void )caller;
				2733
				2734	objnr = obj_to_index(cachep, page, objp);
				2735
				2736	BUG_ON(objnr >= cachep->num);
				2737	BUG_ON(objp != index_to_obj(cachep, page, objnr));
				2738
				2739	if (cachep->flags & SLAB_POISON) {
				2740	poison_obj(cachep, objp, POISON_FREE);
				2741	slab_kernel_map(cachep, objp, 0);
				2742	}
				2743	return objp;
				2744	}
				2745
				2746	#else
				/* Without DEBUG the free-time checks vanish; the object passes through. */
				#define kfree_debugcheck(x) do { } while (0)
				#define cache_free_debugcheck(x, objp, z) (objp)
				2749	#endif
				2750
				/*
				 * DEBUG only: poison the objects queued on @list by fixup_slab_list().
				 *
				 * @list is the head of a singly linked chain threaded through the queued
				 * freelist areas: each element's first word points to the next element.
				 * The owning object starts obj_offset() bytes before the element.  The
				 * link is read before the object is poisoned.  Compiles to nothing
				 * without DEBUG.
				 */
				static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
									void **list)
				{
				#if DEBUG
					void *next = *list;
					void *objp;

					while (next) {
						objp = next - obj_offset(cachep);
						next = *(void **)next;
						poison_obj(cachep, objp, POISON_FREE);
					}
				#endif
				}
				2765
				2766	static inline void fixup_slab_list(struct kmem_cache *cachep,
				2767	struct kmem_cache_node n, struct page page,
				2768	void **list)
				2769	{
				2770	/* move slabp to correct slabp list: */
				2771	list_del(&page->slab_list);
				2772	if (page->active == cachep->num) {
				2773	list_add(&page->slab_list, &n->slabs_full);
				2774	if (OBJFREELIST_SLAB(cachep)) {
				2775	#if DEBUG
				2776	/* Poisoning will be done without holding the lock */
				2777	if (cachep->flags & SLAB_POISON) {
				2778	void **objp = page->freelist;
				2779
				2780	objp = list;
				2781	*list = objp;
				2782	}
				2783	#endif
				2784	page->freelist = NULL;
				2785	}
				2786	} else
				2787	list_add(&page->slab_list, &n->slabs_partial);
				2788	}
				2789
				2790	/* Try to find non-pfmemalloc slab if needed */
				2791	static noinline struct page get_valid_first_slab(struct kmem_cache_node n,
				2792	struct page *page, bool pfmemalloc)
				2793	{
				2794	if (!page)
				2795	return NULL;
				2796
				2797	if (pfmemalloc)
				2798	return page;
				2799
				2800	if (!PageSlabPfmemalloc(page))
				2801	return page;
				2802
				2803	/* No need to keep pfmemalloc slab if we have enough free objects */
				2804	if (n->free_objects > n->free_limit) {
				2805	ClearPageSlabPfmemalloc(page);
				2806	return page;
				2807	}
				2808
				2809	/* Move pfmemalloc slab to the end of list to speed up next search */
				2810	list_del(&page->slab_list);
				2811	if (!page->active) {
				2812	list_add_tail(&page->slab_list, &n->slabs_free);
				2813	n->free_slabs++;
				2814	} else
				2815	list_add_tail(&page->slab_list, &n->slabs_partial);
				2816
				2817	list_for_each_entry(page, &n->slabs_partial, slab_list) {
				2818	if (!PageSlabPfmemalloc(page))
				2819	return page;
				2820	}
				2821
				2822	n->free_touched = 1;
				2823	list_for_each_entry(page, &n->slabs_free, slab_list) {
				2824	if (!PageSlabPfmemalloc(page)) {
				2825	n->free_slabs--;
				2826	return page;
				2827	}
				2828	}
				2829
				2830	return NULL;
				2831	}
				2832
				2833	static struct page get_first_slab(struct kmem_cache_node n, bool pfmemalloc)
				2834	{
				2835	struct page *page;
				2836
				2837	assert_spin_locked(&n->list_lock);
				2838	page = list_first_entry_or_null(&n->slabs_partial, struct page,
				2839	slab_list);
				2840	if (!page) {
				2841	n->free_touched = 1;
				2842	page = list_first_entry_or_null(&n->slabs_free, struct page,
				2843	slab_list);
				2844	if (page)
				2845	n->free_slabs--;
				2846	}
				2847
				2848	if (sk_memalloc_socks())
				2849	page = get_valid_first_slab(n, page, pfmemalloc);
				2850
				2851	return page;
				2852	}
				2853
				2854	static noinline void cache_alloc_pfmemalloc(struct kmem_cache cachep,
				2855	struct kmem_cache_node *n, gfp_t flags)
				2856	{
				2857	struct page *page;
				2858	void *obj;
				2859	void *list = NULL;
				2860
				2861	if (!gfp_pfmemalloc_allowed(flags))
				2862	return NULL;
				2863
				2864	spin_lock(&n->list_lock);
				2865	page = get_first_slab(n, true);
				2866	if (!page) {
				2867	spin_unlock(&n->list_lock);
				2868	return NULL;
				2869	}
				2870
				2871	obj = slab_get_obj(cachep, page);
				2872	n->free_objects--;
				2873
				2874	fixup_slab_list(cachep, n, page, &list);
				2875
				2876	spin_unlock(&n->list_lock);
				2877	fixup_objfreelist_debug(cachep, &list);
				2878
				2879	return obj;
				2880	}
				2881
				2882	/*
				2883	* Slab list should be fixed up by fixup_slab_list() for existing slab
				2884	* or cache_grow_end() for new slab
				2885	*/
				2886	static __always_inline int alloc_block(struct kmem_cache *cachep,
				2887	struct array_cache ac, struct page page, int batchcount)
				2888	{
				2889	/*
				2890	* There must be at least one object available for
				2891	* allocation.
				2892	*/
				2893	BUG_ON(page->active >= cachep->num);
				2894
				2895	while (page->active < cachep->num && batchcount--) {
				2896	STATS_INC_ALLOCED(cachep);
				2897	STATS_INC_ACTIVE(cachep);
				2898	STATS_SET_HIGH(cachep);
				2899
				2900	ac->entry[ac->avail++] = slab_get_obj(cachep, page);
				2901	}
				2902
				2903	return batchcount;
				2904	}
				2905
				2906	static void cache_alloc_refill(struct kmem_cache cachep, gfp_t flags)
				2907	{
				2908	int batchcount;
				2909	struct kmem_cache_node *n;
				2910	struct array_cache ac, shared;
				2911	int node;
				2912	void *list = NULL;
				2913	struct page *page;
				2914
				2915	check_irq_off();
				2916	node = numa_mem_id();
				2917
				2918	ac = cpu_cache_get(cachep);
				2919	batchcount = ac->batchcount;
				2920	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
				2921	/*
				2922	* If there was little recent activity on this cache, then
				2923	* perform only a partial refill. Otherwise we could generate
				2924	* refill bouncing.
				2925	*/
				2926	batchcount = BATCHREFILL_LIMIT;
				2927	}
				2928	n = get_node(cachep, node);
				2929
				2930	BUG_ON(ac->avail > 0 \|\| !n);
				2931	shared = READ_ONCE(n->shared);
				2932	if (!n->free_objects && (!shared \|\| !shared->avail))
				2933	goto direct_grow;
				2934
				2935	spin_lock(&n->list_lock);
				2936	shared = READ_ONCE(n->shared);
				2937
				2938	/* See if we can refill from the shared array */
				2939	if (shared && transfer_objects(ac, shared, batchcount)) {
				2940	shared->touched = 1;
				2941	goto alloc_done;
				2942	}
				2943
				2944	while (batchcount > 0) {
				2945	/* Get slab alloc is to come from. */
				2946	page = get_first_slab(n, false);
				2947	if (!page)
				2948	goto must_grow;
				2949
				2950	check_spinlock_acquired(cachep);
				2951
				2952	batchcount = alloc_block(cachep, ac, page, batchcount);
				2953	fixup_slab_list(cachep, n, page, &list);
				2954	}
				2955
				2956	must_grow:
				2957	n->free_objects -= ac->avail;
				2958	alloc_done:
				2959	spin_unlock(&n->list_lock);
				2960	fixup_objfreelist_debug(cachep, &list);
				2961
				2962	direct_grow:
				2963	if (unlikely(!ac->avail)) {
				2964	/* Check if we can use obj in pfmemalloc slab */
				2965	if (sk_memalloc_socks()) {
				2966	void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
				2967
				2968	if (obj)
				2969	return obj;
				2970	}
				2971
				2972	page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
				2973
				2974	/*
				2975	* cache_grow_begin() can reenable interrupts,
				2976	* then ac could change.
				2977	*/
				2978	ac = cpu_cache_get(cachep);
				2979	if (!ac->avail && page)
				2980	alloc_block(cachep, ac, page, batchcount);
				2981	cache_grow_end(cachep, page);
				2982
				2983	if (!ac->avail)
				2984	return NULL;
				2985	}
				2986	ac->touched = 1;
				2987
				2988	return ac->entry[--ac->avail];
				2989	}
				2990
				2991	static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
				2992	gfp_t flags)
				2993	{
				2994	might_sleep_if(gfpflags_allow_blocking(flags));
				2995	}
				2996
				2997	#if DEBUG
				2998	static void cache_alloc_debugcheck_after(struct kmem_cache cachep,
				2999	gfp_t flags, void *objp, unsigned long caller)
				3000	{
				3001	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
				3002	if (!objp)
				3003	return objp;
				3004	if (cachep->flags & SLAB_POISON) {
				3005	check_poison_obj(cachep, objp);
				3006	slab_kernel_map(cachep, objp, 1);
				3007	poison_obj(cachep, objp, POISON_INUSE);
				3008	}
				3009	if (cachep->flags & SLAB_STORE_USER)
				3010	dbg_userword(cachep, objp) = (void )caller;
				3011
				3012	if (cachep->flags & SLAB_RED_ZONE) {
				3013	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE \|\|
				3014	*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
				3015	slab_error(cachep, "double free, or memory outside object was overwritten");
				3016	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
				3017	objp, *dbg_redzone1(cachep, objp),
				3018	*dbg_redzone2(cachep, objp));
				3019	}
				3020	*dbg_redzone1(cachep, objp) = RED_ACTIVE;
				3021	*dbg_redzone2(cachep, objp) = RED_ACTIVE;
				3022	}
				3023
				3024	objp += obj_offset(cachep);
				3025	if (cachep->ctor && cachep->flags & SLAB_POISON)
				3026	cachep->ctor(objp);
				3027	if (ARCH_SLAB_MINALIGN &&
				3028	((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
				3029	pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
				3030	objp, (int)ARCH_SLAB_MINALIGN);
				3031	}
				3032	return objp;
				3033	}
				3034	#else
				3035	#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
				3036	#endif
				3037
				3038	static inline void ____cache_alloc(struct kmem_cache cachep, gfp_t flags)
				3039	{
				3040	void *objp;
				3041	struct array_cache *ac;
				3042
				3043	check_irq_off();
				3044
				3045	ac = cpu_cache_get(cachep);
				3046	if (likely(ac->avail)) {
				3047	ac->touched = 1;
				3048	objp = ac->entry[--ac->avail];
				3049
				3050	STATS_INC_ALLOCHIT(cachep);
				3051	goto out;
				3052	}
				3053
				3054	STATS_INC_ALLOCMISS(cachep);
				3055	objp = cache_alloc_refill(cachep, flags);
				3056	/*
				3057	* the 'ac' may be updated by cache_alloc_refill(),
				3058	* and kmemleak_erase() requires its correct value.
				3059	*/
				3060	ac = cpu_cache_get(cachep);
				3061
				3062	out:
				3063	/*
				3064	* To avoid a false negative, if an object that is in one of the
				3065	* per-CPU caches is leaked, we need to make sure kmemleak doesn't
				3066	* treat the array pointers as a reference to the object.
				3067	*/
				3068	if (objp)
				3069	kmemleak_erase(&ac->entry[ac->avail]);
				3070	return objp;
				3071	}
				3072
				3073	#ifdef CONFIG_NUMA
				3074	/*
				3075	* Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
				3076	*
				3077	* If we are in_interrupt, then process context, including cpusets and
				3078	* mempolicy, may not apply and should not be used for allocation policy.
				3079	*/
				3080	static void alternate_node_alloc(struct kmem_cache cachep, gfp_t flags)
				3081	{
				3082	int nid_alloc, nid_here;
				3083
				3084	if (in_interrupt() \|\| (flags & __GFP_THISNODE))
				3085	return NULL;
				3086	nid_alloc = nid_here = numa_mem_id();
				3087	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
				3088	nid_alloc = cpuset_slab_spread_node();
				3089	else if (current->mempolicy)
				3090	nid_alloc = mempolicy_slab_node();
				3091	if (nid_alloc != nid_here)
				3092	return ____cache_alloc_node(cachep, flags, nid_alloc);
				3093	return NULL;
				3094	}
				3095
				3096	/*
				3097	* Fallback function if there was no memory available and no objects on a
				3098	* certain node and fall back is permitted. First we scan all the
				3099	* available node for available objects. If that fails then we
				3100	* perform an allocation without specifying a node. This allows the page
				3101	* allocator to do its reclaim / fallback magic. We then insert the
				3102	* slab into the proper nodelist and then allocate from it.
				3103	*/
				3104	static void fallback_alloc(struct kmem_cache cache, gfp_t flags)
				3105	{
				3106	struct zonelist *zonelist;
				3107	struct zoneref *z;
				3108	struct zone *zone;
				3109	enum zone_type high_zoneidx = gfp_zone(flags);
				3110	void *obj = NULL;
				3111	struct page *page;
				3112	int nid;
				3113	unsigned int cpuset_mems_cookie;
				3114
				3115	if (flags & __GFP_THISNODE)
				3116	return NULL;
				3117
				3118	retry_cpuset:
				3119	cpuset_mems_cookie = read_mems_allowed_begin();
				3120	zonelist = node_zonelist(mempolicy_slab_node(), flags);
				3121
				3122	retry:
				3123	/*
				3124	* Look through allowed nodes for objects available
				3125	* from existing per node queues.
				3126	*/
				3127	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
				3128	nid = zone_to_nid(zone);
				3129
				3130	if (cpuset_zone_allowed(zone, flags) &&
				3131	get_node(cache, nid) &&
				3132	get_node(cache, nid)->free_objects) {
				3133	obj = ____cache_alloc_node(cache,
				3134	gfp_exact_node(flags), nid);
				3135	if (obj)
				3136	break;
				3137	}
				3138	}
				3139
				3140	if (!obj) {
				3141	/*
				3142	* This allocation will be performed within the constraints
				3143	* of the current cpuset / memory policy requirements.
				3144	* We may trigger various forms of reclaim on the allowed
				3145	* set and go into memory reserves if necessary.
				3146	*/
				3147	page = cache_grow_begin(cache, flags, numa_mem_id());
				3148	cache_grow_end(cache, page);
				3149	if (page) {
				3150	nid = page_to_nid(page);
				3151	obj = ____cache_alloc_node(cache,
				3152	gfp_exact_node(flags), nid);
				3153
				3154	/*
				3155	* Another processor may allocate the objects in
				3156	* the slab since we are not holding any locks.
				3157	*/
				3158	if (!obj)
				3159	goto retry;
				3160	}
				3161	}
				3162
				3163	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
				3164	goto retry_cpuset;
				3165	return obj;
				3166	}
				3167
				3168	/*
				3169	* A interface to enable slab creation on nodeid
				3170	*/
				3171	static void ____cache_alloc_node(struct kmem_cache cachep, gfp_t flags,
				3172	int nodeid)
				3173	{
				3174	struct page *page;
				3175	struct kmem_cache_node *n;
				3176	void *obj = NULL;
				3177	void *list = NULL;
				3178
				3179	VM_BUG_ON(nodeid < 0 \|\| nodeid >= MAX_NUMNODES);
				3180	n = get_node(cachep, nodeid);
				3181	BUG_ON(!n);
				3182
				3183	check_irq_off();
				3184	spin_lock(&n->list_lock);
				3185	page = get_first_slab(n, false);
				3186	if (!page)
				3187	goto must_grow;
				3188
				3189	check_spinlock_acquired_node(cachep, nodeid);
				3190
				3191	STATS_INC_NODEALLOCS(cachep);
				3192	STATS_INC_ACTIVE(cachep);
				3193	STATS_SET_HIGH(cachep);
				3194
				3195	BUG_ON(page->active == cachep->num);
				3196
				3197	obj = slab_get_obj(cachep, page);
				3198	n->free_objects--;
				3199
				3200	fixup_slab_list(cachep, n, page, &list);
				3201
				3202	spin_unlock(&n->list_lock);
				3203	fixup_objfreelist_debug(cachep, &list);
				3204	return obj;
				3205
				3206	must_grow:
				3207	spin_unlock(&n->list_lock);
				3208	page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
				3209	if (page) {
				3210	/* This slab isn't counted yet so don't update free_objects */
				3211	obj = slab_get_obj(cachep, page);
				3212	}
				3213	cache_grow_end(cachep, page);
				3214
				3215	return obj ? obj : fallback_alloc(cachep, flags);
				3216	}
				3217
				3218	static __always_inline void *
				3219	slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
				3220	unsigned long caller)
				3221	{
				3222	unsigned long save_flags;
				3223	void *ptr;
				3224	int slab_node = numa_mem_id();
				3225	struct kmem_cache *orig_cachep = cachep;
				3226
				3227	flags &= gfp_allowed_mask;
				3228	cachep = slab_pre_alloc_hook(cachep, flags);
				3229	if (unlikely(!cachep))
				3230	return NULL;
				3231
				3232	/*
				3233	* 5.4 note: passing in original cachep to avoid problems with memcg
				3234	* accounting. Making KFENCE properly work with memcgs on older kernels
				3235	* is not worth the effort.
				3236	*/
				3237	ptr = kfence_alloc(orig_cachep, orig_size, flags);
				3238	if (unlikely(ptr))
				3239	goto out_hooks;
				3240
				3241	cache_alloc_debugcheck_before(cachep, flags);
				3242	local_irq_save(save_flags);
				3243
				3244	if (nodeid == NUMA_NO_NODE)
				3245	nodeid = slab_node;
				3246
				3247	if (unlikely(!get_node(cachep, nodeid))) {
				3248	/* Node not bootstrapped yet */
				3249	ptr = fallback_alloc(cachep, flags);
				3250	goto out;
				3251	}
				3252
				3253	if (nodeid == slab_node) {
				3254	/*
				3255	* Use the locally cached objects if possible.
				3256	* However ____cache_alloc does not allow fallback
				3257	* to other nodes. It may fail while we still have
				3258	* objects on other nodes available.
				3259	*/
				3260	ptr = ____cache_alloc(cachep, flags);
				3261	if (ptr)
				3262	goto out;
				3263	}
				3264	/* ___cache_alloc_node can fall back to other nodes */
				3265	ptr = ____cache_alloc_node(cachep, flags, nodeid);
				3266	out:
				3267	local_irq_restore(save_flags);
				3268	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
				3269
				3270	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
				3271	memset(ptr, 0, cachep->object_size);
				3272
				3273	out_hooks:
				3274	slab_post_alloc_hook(cachep, flags, 1, &ptr);
				3275	return ptr;
				3276	}
				3277
				3278	static __always_inline void *
				3279	__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
				3280	{
				3281	void *objp;
				3282
				3283	if (current->mempolicy \|\| cpuset_do_slab_mem_spread()) {
				3284	objp = alternate_node_alloc(cache, flags);
				3285	if (objp)
				3286	goto out;
				3287	}
				3288	objp = ____cache_alloc(cache, flags);
				3289
				3290	/*
				3291	* We may just have run out of memory on the local node.
				3292	* ____cache_alloc_node() knows how to locate memory on other nodes
				3293	*/
				3294	if (!objp)
				3295	objp = ____cache_alloc_node(cache, flags, numa_mem_id());
				3296
				3297	out:
				3298	return objp;
				3299	}
				3300	#else
				3301
				3302	static __always_inline void *
				3303	__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
				3304	{
				3305	return ____cache_alloc(cachep, flags);
				3306	}
				3307
				3308	#endif /* CONFIG_NUMA */
				3309
				3310	static __always_inline void *
				3311	slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
				3312	{
				3313	unsigned long save_flags;
				3314	void *objp;
				3315	struct kmem_cache *orig_cachep = cachep;
				3316
				3317	flags &= gfp_allowed_mask;
				3318	cachep = slab_pre_alloc_hook(cachep, flags);
				3319	if (unlikely(!cachep))
				3320	return NULL;
				3321
				3322	/*
				3323	* 5.4 note: passing in original cachep to avoid problems with memcg
				3324	* accounting. Making KFENCE properly work with memcgs on older kernels
				3325	* is not worth the effort.
				3326	*/
				3327	objp = kfence_alloc(orig_cachep, orig_size, flags);
				3328	if (unlikely(objp))
				3329	goto out;
				3330
				3331	cache_alloc_debugcheck_before(cachep, flags);
				3332	local_irq_save(save_flags);
				3333	objp = __do_cache_alloc(cachep, flags);
				3334	local_irq_restore(save_flags);
				3335	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
				3336	prefetchw(objp);
				3337
				3338	if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
				3339	memset(objp, 0, cachep->object_size);
				3340
				3341	out:
				3342	slab_post_alloc_hook(cachep, flags, 1, &objp);
				3343	return objp;
				3344	}
				3345
				3346	/*
				3347	* Caller needs to acquire correct kmem_cache_node's list_lock
				3348	* @list: List of detached free slabs should be freed by caller
				3349	*/
				3350	static void free_block(struct kmem_cache cachep, void *objpp,
				3351	int nr_objects, int node, struct list_head *list)
				3352	{
				3353	int i;
				3354	struct kmem_cache_node *n = get_node(cachep, node);
				3355	struct page *page;
				3356
				3357	n->free_objects += nr_objects;
				3358
				3359	for (i = 0; i < nr_objects; i++) {
				3360	void *objp;
				3361	struct page *page;
				3362
				3363	objp = objpp[i];
				3364
				3365	page = virt_to_head_page(objp);
				3366	list_del(&page->slab_list);
				3367	check_spinlock_acquired_node(cachep, node);
				3368	slab_put_obj(cachep, page, objp);
				3369	STATS_DEC_ACTIVE(cachep);
				3370
				3371	/* fixup slab chains */
				3372	if (page->active == 0) {
				3373	list_add(&page->slab_list, &n->slabs_free);
				3374	n->free_slabs++;
				3375	} else {
				3376	/* Unconditionally move a slab to the end of the
				3377	* partial list on free - maximum time for the
				3378	* other objects to be freed, too.
				3379	*/
				3380	list_add_tail(&page->slab_list, &n->slabs_partial);
				3381	}
				3382	}
				3383
				3384	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
				3385	n->free_objects -= cachep->num;
				3386
				3387	page = list_last_entry(&n->slabs_free, struct page, slab_list);
				3388	list_move(&page->slab_list, list);
				3389	n->free_slabs--;
				3390	n->total_slabs--;
				3391	}
				3392	}
				3393
				3394	static void cache_flusharray(struct kmem_cache cachep, struct array_cache ac)
				3395	{
				3396	int batchcount;
				3397	struct kmem_cache_node *n;
				3398	int node = numa_mem_id();
				3399	LIST_HEAD(list);
				3400
				3401	batchcount = ac->batchcount;
				3402
				3403	check_irq_off();
				3404	n = get_node(cachep, node);
				3405	spin_lock(&n->list_lock);
				3406	if (n->shared) {
				3407	struct array_cache *shared_array = n->shared;
				3408	int max = shared_array->limit - shared_array->avail;
				3409	if (max) {
				3410	if (batchcount > max)
				3411	batchcount = max;
				3412	memcpy(&(shared_array->entry[shared_array->avail]),
				3413	ac->entry, sizeof(void ) batchcount);
				3414	shared_array->avail += batchcount;
				3415	goto free_done;
				3416	}
				3417	}
				3418
				3419	free_block(cachep, ac->entry, batchcount, node, &list);
				3420	free_done:
				3421	#if STATS
				3422	{
				3423	int i = 0;
				3424	struct page *page;
				3425
				3426	list_for_each_entry(page, &n->slabs_free, slab_list) {
				3427	BUG_ON(page->active);
				3428
				3429	i++;
				3430	}
				3431	STATS_SET_FREEABLE(cachep, i);
				3432	}
				3433	#endif
				3434	spin_unlock(&n->list_lock);
				3435	slabs_destroy(cachep, &list);
				3436	ac->avail -= batchcount;
				3437	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void )ac->avail);
				3438	}
				3439
				3440	/*
				3441	* Release an obj back to its cache. If the obj has a constructed state, it must
				3442	* be in this state _before_ it is released. Called with disabled ints.
				3443	*/
				3444	static __always_inline void __cache_free(struct kmem_cache cachep, void objp,
				3445	unsigned long caller)
				3446	{
				3447	if (is_kfence_address(objp)) {
				3448	kmemleak_free_recursive(objp, cachep->flags);
				3449	__kfence_free(objp);
				3450	return;
				3451	}
				3452
				3453	if (unlikely(slab_want_init_on_free(cachep)))
				3454	memset(objp, 0, cachep->object_size);
				3455
				3456	/* Put the object into the quarantine, don't touch it for now. */
				3457	if (kasan_slab_free(cachep, objp, _RET_IP_))
				3458	return;
				3459
				3460	___cache_free(cachep, objp, caller);
				3461	}
				3462
				3463	void ___cache_free(struct kmem_cache cachep, void objp,
				3464	unsigned long caller)
				3465	{
				3466	struct array_cache *ac = cpu_cache_get(cachep);
				3467
				3468	check_irq_off();
				3469	kmemleak_free_recursive(objp, cachep->flags);
				3470	objp = cache_free_debugcheck(cachep, objp, caller);
				3471
				3472	/*
				3473	* Skip calling cache_free_alien() when the platform is not numa.
				3474	* This will avoid cache misses that happen while accessing slabp (which
				3475	* is per page memory reference) to get nodeid. Instead use a global
				3476	* variable to skip the call, which is mostly likely to be present in
				3477	* the cache.
				3478	*/
				3479	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
				3480	return;
				3481
				3482	if (ac->avail < ac->limit) {
				3483	STATS_INC_FREEHIT(cachep);
				3484	} else {
				3485	STATS_INC_FREEMISS(cachep);
				3486	cache_flusharray(cachep, ac);
				3487	}
				3488
				3489	if (sk_memalloc_socks()) {
				3490	struct page *page = virt_to_head_page(objp);
				3491
				3492	if (unlikely(PageSlabPfmemalloc(page))) {
				3493	cache_free_pfmemalloc(cachep, page, objp);
				3494	return;
				3495	}
				3496	}
				3497
				3498	ac->entry[ac->avail++] = objp;
				3499	}
				3500
				3501	/**
				3502	* kmem_cache_alloc - Allocate an object
				3503	* @cachep: The cache to allocate from.
				3504	* @flags: See kmalloc().
				3505	*
				3506	* Allocate an object from this cache. The flags are only relevant
				3507	* if the cache has no available objects.
				3508	*
				3509	* Return: pointer to the new object or %NULL in case of error
				3510	*/
				3511	void kmem_cache_alloc(struct kmem_cache cachep, gfp_t flags)
				3512	{
				3513	void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
				3514
				3515	trace_kmem_cache_alloc(_RET_IP_, ret,
				3516	cachep->object_size, cachep->size, flags);
				3517
				3518	return ret;
				3519	}
				3520	EXPORT_SYMBOL(kmem_cache_alloc);
				3521
				3522	static __always_inline void
				3523	cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
				3524	size_t size, void **p, unsigned long caller)
				3525	{
				3526	size_t i;
				3527
				3528	for (i = 0; i < size; i++)
				3529	p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
				3530	}
				3531
				3532	int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
				3533	void **p)
				3534	{
				3535	size_t i;
				3536	struct kmem_cache *root_s = s;
				3537
				3538	s = slab_pre_alloc_hook(s, flags);
				3539	if (!s)
				3540	return 0;
				3541
				3542	cache_alloc_debugcheck_before(s, flags);
				3543
				3544	local_irq_disable();
				3545	for (i = 0; i < size; i++) {
				3546	/*
				3547	* 5.4 note: passing in original cachep to avoid problems with
				3548	* memcg accounting. Making KFENCE properly work with memcgs on
				3549	* older kernels is not worth the effort.
				3550	*/
				3551	void *objp = kfence_alloc(root_s, s->object_size, flags) ?:
				3552	__do_cache_alloc(s, flags);
				3553
				3554	if (unlikely(!objp))
				3555	goto error;
				3556	p[i] = objp;
				3557	}
				3558	local_irq_enable();
				3559
				3560	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
				3561
				3562	/* Clear memory outside IRQ disabled section */
				3563	if (unlikely(slab_want_init_on_alloc(flags, s)))
				3564	for (i = 0; i < size; i++)
				3565	memset(p[i], 0, s->object_size);
				3566
				3567	slab_post_alloc_hook(s, flags, size, p);
				3568	/* FIXME: Trace call missing. Christoph would like a bulk variant */
				3569	return size;
				3570	error:
				3571	local_irq_enable();
				3572	cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
				3573	slab_post_alloc_hook(s, flags, i, p);
				3574	__kmem_cache_free_bulk(s, i, p);
				3575	return 0;
				3576	}
				3577	EXPORT_SYMBOL(kmem_cache_alloc_bulk);
				3578
				3579	#ifdef CONFIG_TRACING
				3580	void *
				3581	kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
				3582	{
				3583	void *ret;
				3584
				3585	ret = slab_alloc(cachep, flags, size, _RET_IP_);
				3586
				3587	ret = kasan_kmalloc(cachep, ret, size, flags);
				3588	trace_kmalloc(_RET_IP_, ret,
				3589	size, cachep->size, flags);
				3590	return ret;
				3591	}
				3592	EXPORT_SYMBOL(kmem_cache_alloc_trace);
				3593	#endif
				3594
				3595	#ifdef CONFIG_NUMA
				3596	/**
				3597	* kmem_cache_alloc_node - Allocate an object on the specified node
				3598	* @cachep: The cache to allocate from.
				3599	* @flags: See kmalloc().
				3600	* @nodeid: node number of the target node.
				3601	*
				3602	* Identical to kmem_cache_alloc but it will allocate memory on the given
				3603	* node, which can improve the performance for cpu bound structures.
				3604	*
				3605	* Fallback to other node is possible if __GFP_THISNODE is not set.
				3606	*
				3607	* Return: pointer to the new object or %NULL in case of error
				3608	*/
				3609	void kmem_cache_alloc_node(struct kmem_cache cachep, gfp_t flags, int nodeid)
				3610	{
				3611	void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
				3612
				3613	trace_kmem_cache_alloc_node(_RET_IP_, ret,
				3614	cachep->object_size, cachep->size,
				3615	flags, nodeid);
				3616
				3617	return ret;
				3618	}
				3619	EXPORT_SYMBOL(kmem_cache_alloc_node);
				3620
				3621	#ifdef CONFIG_TRACING
				3622	void kmem_cache_alloc_node_trace(struct kmem_cache cachep,
				3623	gfp_t flags,
				3624	int nodeid,
				3625	size_t size)
				3626	{
				3627	void *ret;
				3628
				3629	ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
				3630
				3631	ret = kasan_kmalloc(cachep, ret, size, flags);
				3632	trace_kmalloc_node(_RET_IP_, ret,
				3633	size, cachep->size,
				3634	flags, nodeid);
				3635	return ret;
				3636	}
				3637	EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
				3638	#endif
				3639
				3640	static __always_inline void *
				3641	__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
				3642	{
				3643	struct kmem_cache *cachep;
				3644	void *ret;
				3645
				3646	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				3647	return NULL;
				3648	cachep = kmalloc_slab(size, flags);
				3649	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				3650	return cachep;
				3651	ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
				3652	ret = kasan_kmalloc(cachep, ret, size, flags);
				3653
				3654	return ret;
				3655	}
				3656
				3657	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3658	{
				3659	return __do_kmalloc_node(size, flags, node, _RET_IP_);
				3660	}
				3661	EXPORT_SYMBOL(__kmalloc_node);
				3662
				3663	void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
				3664	int node, unsigned long caller)
				3665	{
				3666	return __do_kmalloc_node(size, flags, node, caller);
				3667	}
				3668	EXPORT_SYMBOL(__kmalloc_node_track_caller);
				3669	#endif /* CONFIG_NUMA */
				3670
				3671	/**
				3672	* __do_kmalloc - allocate memory
				3673	* @size: how many bytes of memory are required.
				3674	* @flags: the type of memory to allocate (see kmalloc).
				3675	* @caller: function caller for debug tracking of the caller
				3676	*
				3677	* Return: pointer to the allocated memory or %NULL in case of error
				3678	*/
				3679	static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
				3680	unsigned long caller)
				3681	{
				3682	struct kmem_cache *cachep;
				3683	void *ret;
				3684
				3685	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				3686	return NULL;
				3687	cachep = kmalloc_slab(size, flags);
				3688	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				3689	return cachep;
				3690	ret = slab_alloc(cachep, flags, size, caller);
				3691
				3692	ret = kasan_kmalloc(cachep, ret, size, flags);
				3693	trace_kmalloc(caller, ret,
				3694	size, cachep->size, flags);
				3695
				3696	return ret;
				3697	}
				3698
				3699	void *__kmalloc(size_t size, gfp_t flags)
				3700	{
				3701	return __do_kmalloc(size, flags, _RET_IP_);
				3702	}
				3703	EXPORT_SYMBOL(__kmalloc);
				3704
				3705	void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
				3706	{
				3707	return __do_kmalloc(size, flags, caller);
				3708	}
				3709	EXPORT_SYMBOL(__kmalloc_track_caller);
				3710
				3711	/**
				3712	* kmem_cache_free - Deallocate an object
				3713	* @cachep: The cache the allocation was from.
				3714	* @objp: The previously allocated object.
				3715	*
				3716	* Free an object which was previously allocated from this
				3717	* cache.
				3718	*/
				3719	void kmem_cache_free(struct kmem_cache cachep, void objp)
				3720	{
				3721	unsigned long flags;
				3722	cachep = cache_from_obj(cachep, objp);
				3723	if (!cachep)
				3724	return;
				3725
				3726	local_irq_save(flags);
				3727	debug_check_no_locks_freed(objp, cachep->object_size);
				3728	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
				3729	debug_check_no_obj_freed(objp, cachep->object_size);
				3730	__cache_free(cachep, objp, _RET_IP_);
				3731	local_irq_restore(flags);
				3732
				3733	trace_kmem_cache_free(_RET_IP_, objp);
				3734	}
				3735	EXPORT_SYMBOL(kmem_cache_free);
				3736
				3737	void kmem_cache_free_bulk(struct kmem_cache orig_s, size_t size, void *p)
				3738	{
				3739	struct kmem_cache *s;
				3740	size_t i;
				3741
				3742	local_irq_disable();
				3743	for (i = 0; i < size; i++) {
				3744	void *objp = p[i];
				3745
				3746	if (!orig_s) /* called via kfree_bulk */
				3747	s = virt_to_cache(objp);
				3748	else
				3749	s = cache_from_obj(orig_s, objp);
				3750	if (!s)
				3751	continue;
				3752
				3753	debug_check_no_locks_freed(objp, s->object_size);
				3754	if (!(s->flags & SLAB_DEBUG_OBJECTS))
				3755	debug_check_no_obj_freed(objp, s->object_size);
				3756
				3757	__cache_free(s, objp, _RET_IP_);
				3758	}
				3759	local_irq_enable();
				3760
				3761	/* FIXME: add tracing */
				3762	}
				3763	EXPORT_SYMBOL(kmem_cache_free_bulk);
				3764
				3765	/**
				3766	* kfree - free previously allocated memory
				3767	* @objp: pointer returned by kmalloc.
				3768	*
				3769	* If @objp is NULL, no operation is performed.
				3770	*
				3771	* Don't free memory not originally allocated by kmalloc()
				3772	* or you will run into trouble.
				3773	*/
				3774	void kfree(const void *objp)
				3775	{
				3776	struct kmem_cache *c;
				3777	unsigned long flags;
				3778
				3779	trace_kfree(_RET_IP_, objp);
				3780
				3781	if (unlikely(ZERO_OR_NULL_PTR(objp)))
				3782	return;
				3783	local_irq_save(flags);
				3784	kfree_debugcheck(objp);
				3785	c = virt_to_cache(objp);
				3786	if (!c) {
				3787	local_irq_restore(flags);
				3788	return;
				3789	}
				3790	debug_check_no_locks_freed(objp, c->object_size);
				3791
				3792	debug_check_no_obj_freed(objp, c->object_size);
				3793	__cache_free(c, (void *)objp, _RET_IP_);
				3794	local_irq_restore(flags);
				3795	}
				3796	EXPORT_SYMBOL(kfree);
				3797
				3798	/*
				3799	* This initializes kmem_cache_node or resizes various caches for all nodes.
				3800	*/
				3801	static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
				3802	{
				3803	int ret;
				3804	int node;
				3805	struct kmem_cache_node *n;
				3806
				3807	for_each_online_node(node) {
				3808	ret = setup_kmem_cache_node(cachep, node, gfp, true);
				3809	if (ret)
				3810	goto fail;
				3811
				3812	}
				3813
				3814	return 0;
				3815
				3816	fail:
				3817	if (!cachep->list.next) {
				3818	/* Cache is not active yet. Roll back what we did */
				3819	node--;
				3820	while (node >= 0) {
				3821	n = get_node(cachep, node);
				3822	if (n) {
				3823	kfree(n->shared);
				3824	free_alien_cache(n->alien);
				3825	kfree(n);
				3826	cachep->node[node] = NULL;
				3827	}
				3828	node--;
				3829	}
				3830	}
				3831	return -ENOMEM;
				3832	}
				3833
				3834	/* Always called with the slab_mutex held */
				3835	static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
				3836	int batchcount, int shared, gfp_t gfp)
				3837	{
				3838	struct array_cache __percpu cpu_cache, prev;
				3839	int cpu;
				3840
				3841	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
				3842	if (!cpu_cache)
				3843	return -ENOMEM;
				3844
				3845	prev = cachep->cpu_cache;
				3846	cachep->cpu_cache = cpu_cache;
				3847	/*
				3848	* Without a previous cpu_cache there's no need to synchronize remote
				3849	* cpus, so skip the IPIs.
				3850	*/
				3851	if (prev)
				3852	kick_all_cpus_sync();
				3853
				3854	check_irq_on();
				3855	cachep->batchcount = batchcount;
				3856	cachep->limit = limit;
				3857	cachep->shared = shared;
				3858
				3859	if (!prev)
				3860	goto setup_node;
				3861
				3862	for_each_online_cpu(cpu) {
				3863	LIST_HEAD(list);
				3864	int node;
				3865	struct kmem_cache_node *n;
				3866	struct array_cache *ac = per_cpu_ptr(prev, cpu);
				3867
				3868	node = cpu_to_mem(cpu);
				3869	n = get_node(cachep, node);
				3870	spin_lock_irq(&n->list_lock);
				3871	free_block(cachep, ac->entry, ac->avail, node, &list);
				3872	spin_unlock_irq(&n->list_lock);
				3873	slabs_destroy(cachep, &list);
				3874	}
				3875	free_percpu(prev);
				3876
				3877	setup_node:
				3878	return setup_kmem_cache_nodes(cachep, gfp);
				3879	}
				3880
				3881	static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
				3882	int batchcount, int shared, gfp_t gfp)
				3883	{
				3884	int ret;
				3885	struct kmem_cache *c;
				3886
				3887	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
				3888
				3889	if (slab_state < FULL)
				3890	return ret;
				3891
				3892	if ((ret < 0) \|\| !is_root_cache(cachep))
				3893	return ret;
				3894
				3895	lockdep_assert_held(&slab_mutex);
				3896	for_each_memcg_cache(c, cachep) {
				3897	/* return value determined by the root cache only */
				3898	__do_tune_cpucache(c, limit, batchcount, shared, gfp);
				3899	}
				3900
				3901	return ret;
				3902	}
				3903
				3904	/* Called with slab_mutex held always */
				3905	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
				3906	{
				3907	int err;
				3908	int limit = 0;
				3909	int shared = 0;
				3910	int batchcount = 0;
				3911
				3912	err = cache_random_seq_create(cachep, cachep->num, gfp);
				3913	if (err)
				3914	goto end;
				3915
				3916	if (!is_root_cache(cachep)) {
				3917	struct kmem_cache *root = memcg_root_cache(cachep);
				3918	limit = root->limit;
				3919	shared = root->shared;
				3920	batchcount = root->batchcount;
				3921	}
				3922
				3923	if (limit && shared && batchcount)
				3924	goto skip_setup;
				3925	/*
				3926	* The head array serves three purposes:
				3927	* - create a LIFO ordering, i.e. return objects that are cache-warm
				3928	* - reduce the number of spinlock operations.
				3929	* - reduce the number of linked list operations on the slab and
				3930	* bufctl chains: array operations are cheaper.
				3931	* The numbers are guessed, we should auto-tune as described by
				3932	* Bonwick.
				3933	*/
				3934	if (cachep->size > 131072)
				3935	limit = 1;
				3936	else if (cachep->size > PAGE_SIZE)
				3937	limit = 8;
				3938	else if (cachep->size > 1024)
				3939	limit = 24;
				3940	else if (cachep->size > 256)
				3941	limit = 54;
				3942	else
				3943	limit = 120;
				3944
				3945	/*
				3946	* CPU bound tasks (e.g. network routing) can exhibit cpu bound
				3947	* allocation behaviour: Most allocs on one cpu, most free operations
				3948	* on another cpu. For these cases, an efficient object passing between
				3949	* cpus is necessary. This is provided by a shared array. The array
				3950	* replaces Bonwick's magazine layer.
				3951	* On uniprocessor, it's functionally equivalent (but less efficient)
				3952	* to a larger limit. Thus disabled by default.
				3953	*/
				3954	shared = 0;
				3955	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
				3956	shared = 8;
				3957
				3958	#if DEBUG
				3959	/*
				3960	* With debugging enabled, large batchcount lead to excessively long
				3961	* periods with disabled local interrupts. Limit the batchcount
				3962	*/
				3963	if (limit > 32)
				3964	limit = 32;
				3965	#endif
				3966	batchcount = (limit + 1) / 2;
				3967	skip_setup:
				3968	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
				3969	end:
				3970	if (err)
				3971	pr_err("enable_cpucache failed for %s, error %d\n",
				3972	cachep->name, -err);
				3973	return err;
				3974	}
				3975
				3976	/*
				3977	* Drain an array if it contains any elements taking the node lock only if
				3978	* necessary. Note that the node listlock also protects the array_cache
				3979	* if drain_array() is used on the shared array.
				3980	*/
				3981	static void drain_array(struct kmem_cache cachep, struct kmem_cache_node n,
				3982	struct array_cache *ac, int node)
				3983	{
				3984	LIST_HEAD(list);
				3985
				3986	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
				3987	check_mutex_acquired();
				3988
				3989	if (!ac \|\| !ac->avail)
				3990	return;
				3991
				3992	if (ac->touched) {
				3993	ac->touched = 0;
				3994	return;
				3995	}
				3996
				3997	spin_lock_irq(&n->list_lock);
				3998	drain_array_locked(cachep, ac, node, false, &list);
				3999	spin_unlock_irq(&n->list_lock);
				4000
				4001	slabs_destroy(cachep, &list);
				4002	}
				4003
				4004	/**
				4005	* cache_reap - Reclaim memory from caches.
				4006	* @w: work descriptor
				4007	*
				4008	* Called from workqueue/eventd every few seconds.
				4009	* Purpose:
				4010	* - clear the per-cpu caches for this CPU.
				4011	* - return freeable pages to the main free memory pool.
				4012	*
				4013	* If we cannot acquire the cache chain mutex then just give up - we'll try
				4014	* again on the next iteration.
				4015	*/
				4016	static void cache_reap(struct work_struct *w)
				4017	{
				4018	struct kmem_cache *searchp;
				4019	struct kmem_cache_node *n;
				4020	int node = numa_mem_id();
				4021	struct delayed_work *work = to_delayed_work(w);
				4022
				4023	if (!mutex_trylock(&slab_mutex))
				4024	/* Give up. Setup the next iteration. */
				4025	goto out;
				4026
				4027	list_for_each_entry(searchp, &slab_caches, list) {
				4028	check_irq_on();
				4029
				4030	/*
				4031	* We only take the node lock if absolutely necessary and we
				4032	* have established with reasonable certainty that
				4033	* we can do some work if the lock was obtained.
				4034	*/
				4035	n = get_node(searchp, node);
				4036
				4037	reap_alien(searchp, n);
				4038
				4039	drain_array(searchp, n, cpu_cache_get(searchp), node);
				4040
				4041	/*
				4042	* These are racy checks but it does not matter
				4043	* if we skip one check or scan twice.
				4044	*/
				4045	if (time_after(n->next_reap, jiffies))
				4046	goto next;
				4047
				4048	n->next_reap = jiffies + REAPTIMEOUT_NODE;
				4049
				4050	drain_array(searchp, n, n->shared, node);
				4051
				4052	if (n->free_touched)
				4053	n->free_touched = 0;
				4054	else {
				4055	int freed;
				4056
				4057	freed = drain_freelist(searchp, n, (n->free_limit +
				4058	5 * searchp->num - 1) / (5 * searchp->num));
				4059	STATS_ADD_REAPED(searchp, freed);
				4060	}
				4061	next:
				4062	cond_resched();
				4063	}
				4064	check_irq_on();
				4065	mutex_unlock(&slab_mutex);
				4066	next_reap_node();
				4067	out:
				4068	/* Set up the next iteration */
				4069	schedule_delayed_work_on(smp_processor_id(), work,
				4070	round_jiffies_relative(REAPTIMEOUT_AC));
				4071	}
				4072
				4073	void get_slabinfo(struct kmem_cache cachep, struct slabinfo sinfo)
				4074	{
				4075	unsigned long active_objs, num_objs, active_slabs;
				4076	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
				4077	unsigned long free_slabs = 0;
				4078	int node;
				4079	struct kmem_cache_node *n;
				4080
				4081	for_each_kmem_cache_node(cachep, node, n) {
				4082	check_irq_on();
				4083	spin_lock_irq(&n->list_lock);
				4084
				4085	total_slabs += n->total_slabs;
				4086	free_slabs += n->free_slabs;
				4087	free_objs += n->free_objects;
				4088
				4089	if (n->shared)
				4090	shared_avail += n->shared->avail;
				4091
				4092	spin_unlock_irq(&n->list_lock);
				4093	}
				4094	num_objs = total_slabs * cachep->num;
				4095	active_slabs = total_slabs - free_slabs;
				4096	active_objs = num_objs - free_objs;
				4097
				4098	sinfo->active_objs = active_objs;
				4099	sinfo->num_objs = num_objs;
				4100	sinfo->active_slabs = active_slabs;
				4101	sinfo->num_slabs = total_slabs;
				4102	sinfo->shared_avail = shared_avail;
				4103	sinfo->limit = cachep->limit;
				4104	sinfo->batchcount = cachep->batchcount;
				4105	sinfo->shared = cachep->shared;
				4106	sinfo->objects_per_slab = cachep->num;
				4107	sinfo->cache_order = cachep->gfporder;
				4108	}
				4109
				/*
				 * Append the extended statistics of @cachep to the seq_file @m.
				 *
				 * With STATS enabled this prints a " : globalstat" group followed by a
				 * " : cpustat" group; without STATS the function prints nothing.
				 */
				void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
				{
				#if STATS
					{			/* node stats */
						unsigned long high = cachep->high_mark;
						unsigned long allocs = cachep->num_allocations;
						unsigned long grown = cachep->grown;
						unsigned long reaped = cachep->reaped;
						unsigned long errors = cachep->errors;
						unsigned long max_freeable = cachep->max_freeable;
						unsigned long node_allocs = cachep->node_allocs;
						unsigned long node_frees = cachep->node_frees;
						unsigned long overflows = cachep->node_overflow;

						seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
							   allocs, high, grown,
							   reaped, errors, max_freeable, node_allocs,
							   node_frees, overflows);
					}
					/* cpu stats */
					{
						unsigned long allochit = atomic_read(&cachep->allochit);
						unsigned long allocmiss = atomic_read(&cachep->allocmiss);
						unsigned long freehit = atomic_read(&cachep->freehit);
						unsigned long freemiss = atomic_read(&cachep->freemiss);

						seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
							   allochit, allocmiss, freehit, freemiss);
					}
				#endif
				}
				4141
				4142	#define MAX_SLABINFO_WRITE 128
				4143	/**
				4144	* slabinfo_write - Tuning for the slab allocator
				4145	* @file: unused
				4146	* @buffer: user buffer
				4147	* @count: data length
				4148	* @ppos: unused
				4149	*
				4150	* Return: %0 on success, negative error code otherwise.
				4151	*/
				4152	ssize_t slabinfo_write(struct file file, const char __user buffer,
				4153	size_t count, loff_t *ppos)
				4154	{
				4155	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
				4156	int limit, batchcount, shared, res;
				4157	struct kmem_cache *cachep;
				4158
				4159	if (count > MAX_SLABINFO_WRITE)
				4160	return -EINVAL;
				4161	if (copy_from_user(&kbuf, buffer, count))
				4162	return -EFAULT;
				4163	kbuf[MAX_SLABINFO_WRITE] = '\0';
				4164
				4165	tmp = strchr(kbuf, ' ');
				4166	if (!tmp)
				4167	return -EINVAL;
				4168	*tmp = '\0';
				4169	tmp++;
				4170	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
				4171	return -EINVAL;
				4172
				4173	/* Find the cache in the chain of caches. */
				4174	mutex_lock(&slab_mutex);
				4175	res = -EINVAL;
				4176	list_for_each_entry(cachep, &slab_caches, list) {
				4177	if (!strcmp(cachep->name, kbuf)) {
				4178	if (limit < 1 \|\| batchcount < 1 \|\|
				4179	batchcount > limit \|\| shared < 0) {
				4180	res = 0;
				4181	} else {
				4182	res = do_tune_cpucache(cachep, limit,
				4183	batchcount, shared,
				4184	GFP_KERNEL);
				4185	}
				4186	break;
				4187	}
				4188	}
				4189	mutex_unlock(&slab_mutex);
				4190	if (res >= 0)
				4191	res = count;
				4192	return res;
				4193	}
				4194
				#ifdef CONFIG_HARDENED_USERCOPY
				/*
				 * Rejects incorrectly sized objects and objects that are to be copied
				 * to/from userspace but do not fall entirely within the containing slab
				 * cache's usercopy region.
				 *
				 * @ptr:     start of the kernel memory being copied
				 * @n:       number of bytes being copied
				 * @page:    slab page that contains @ptr
				 * @to_user: copy direction, passed on to the usercopy reporting helpers
				 *
				 * Returns nothing. The function simply returns when the range lies inside
				 * the usercopy region. A range outside that region but still inside the
				 * object is reported with usercopy_warn() when usercopy_fallback is set;
				 * anything else is handed to usercopy_abort().
				 */
				void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
							 bool to_user)
				{
					struct kmem_cache *cachep;
					unsigned int objnr;
					unsigned long offset;

					ptr = kasan_reset_tag(ptr);

					/* Find and validate object. */
					cachep = page->slab_cache;
					objnr = obj_to_index(cachep, page, (void *)ptr);
					BUG_ON(objnr >= cachep->num);

					/* Find offset within object. */
					if (is_kfence_address(ptr))
						offset = ptr - kfence_object_start(ptr);
					else
						offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);

					/* Allow address range falling entirely within usercopy region. */
					if (offset >= cachep->useroffset &&
					    offset - cachep->useroffset <= cachep->usersize &&
					    n <= cachep->useroffset - offset + cachep->usersize)
						return;

					/*
					 * If the copy is still within the allocated object, produce
					 * a warning instead of rejecting the copy. This is intended
					 * to be a temporary method to find any missing usercopy
					 * whitelists.
					 */
					if (usercopy_fallback &&
					    offset <= cachep->object_size &&
					    n <= cachep->object_size - offset) {
						usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
						return;
					}

					usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
				}
				#endif /* CONFIG_HARDENED_USERCOPY */
				4246
				4247	/**
				4248	* __ksize -- Uninstrumented ksize.
				4249	* @objp: pointer to the object
				4250	*
				4251	* Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
				4252	* safety checks as ksize() with KASAN instrumentation enabled.
				4253	*
				4254	* Return: size of the actual memory used by @objp in bytes
				4255	*/
				4256	size_t __ksize(const void *objp)
				4257	{
				4258	struct kmem_cache *c;
				4259	size_t size;
				4260
				4261	BUG_ON(!objp);
				4262	if (unlikely(objp == ZERO_SIZE_PTR))
				4263	return 0;
				4264
				4265	c = virt_to_cache(objp);
				4266	size = c ? c->object_size : 0;
				4267
				4268	return size;
				4269	}
				4270	EXPORT_SYMBOL(__ksize);