Blame - ap/os/linux/linux-3.4.x/mm/slab.c - T106_DC

blob: fae17645f799e44ed3270ae643e63667bd7ffc25 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* linux/mm/slab.c
				3	* Written by Mark Hemment, 1996/97.
				4	* (markhe@nextd.demon.co.uk)
				5	*
				6	* kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
				7	*
				8	* Major cleanup, different bufctl logic, per-cpu arrays
				9	* (c) 2000 Manfred Spraul
				10	*
				11	* Cleanup, make the head arrays unconditional, preparation for NUMA
				12	* (c) 2002 Manfred Spraul
				13	*
				14	* An implementation of the Slab Allocator as described in outline in;
				15	* UNIX Internals: The New Frontiers by Uresh Vahalia
				16	* Pub: Prentice Hall ISBN 0-13-101908-2
				17	* or with a little more detail in;
				18	* The Slab Allocator: An Object-Caching Kernel Memory Allocator
				19	* Jeff Bonwick (Sun Microsystems).
				20	* Presented at: USENIX Summer 1994 Technical Conference
				21	*
				22	* The memory is organized in caches, one cache for each object type.
				23	* (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
				24	* Each cache consists of many slabs (they are small (usually one
				25	* page long) and always contiguous), and each slab contains multiple
				26	* initialized objects.
				27	*
				28	* This means, that your constructor is used only for newly allocated
				29	* slabs and you must pass objects with the same initializations to
				30	* kmem_cache_free.
				31	*
				32	* Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
				33	* normal). If you need a special memory type, then you must create a new
				34	* cache for that memory type.
				35	*
				36	* In order to reduce fragmentation, the slabs are sorted in 3 groups:
				37	* full slabs with 0 free objects
				38	* partial slabs
				39	* empty slabs with no allocated objects
				40	*
				41	* If partial slabs exist, then new allocations come from these slabs,
				42	* otherwise from empty slabs or new slabs are allocated.
				43	*
				44	* kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
				45	* during kmem_cache_destroy(). The caller must prevent concurrent allocs.
				46	*
				47	* Each cache has a short per-cpu head array, most allocs
				48	* and frees go into that array, and if that array overflows, then 1/2
				49	* of the entries in the array are given back into the global cache.
				50	* The head array is strictly LIFO and should improve the cache hit rates.
				51	* On SMP, it additionally reduces the spinlock operations.
				52	*
				53	* The c_cpuarray may not be read with enabled local interrupts -
				54	* it's changed with a smp_call_function().
				55	*
				56	* SMP synchronization:
				57	* constructors and destructors are called without any locking.
				58	* Several members in struct kmem_cache and struct slab never change, they
				59	* are accessed without any locking.
				60	* The per-cpu arrays are never accessed from the wrong cpu, no locking,
				61	* and local interrupts are disabled so slab code is preempt-safe.
				62	* The non-constant members are protected with a per-cache irq spinlock.
				63	*
				64	* Many thanks to Mark Hemment, who wrote another per-cpu slab patch
				65	* in 2000 - many ideas in the current implementation are derived from
				66	* his patch.
				67	*
				68	* Further notes from the original documentation:
				69	*
				70	* 11 April '97. Started multi-threading - markhe
				71	* The global cache-chain is protected by the mutex 'cache_chain_mutex'.
				72	* The sem is only needed when accessing/extending the cache-chain, which
				73	* can never happen inside an interrupt (kmem_cache_create(),
				74	* kmem_cache_shrink() and kmem_cache_reap()).
				75	*
				76	* At present, each engine can be growing a cache. This should be blocked.
				77	*
				78	* 15 March 2005. NUMA slab allocator.
				79	* Shai Fultheim <shai@scalex86.org>.
				80	* Shobhit Dayal <shobhit@calsoftinc.com>
				81	* Alok N Kataria <alokk@calsoftinc.com>
				82	* Christoph Lameter <christoph@lameter.com>
				83	*
				84	* Modified the slab allocator to be node aware on NUMA systems.
				85	* Each node has its own list of partial, free and full slabs.
				86	* All object allocations for a node occur from node specific slab lists.
				87	*/
				88
				89	#include <linux/slab.h>
				90	#include <linux/mm.h>
				91	#include <linux/poison.h>
				92	#include <linux/swap.h>
				93	#include <linux/cache.h>
				94	#include <linux/interrupt.h>
				95	#include <linux/init.h>
				96	#include <linux/compiler.h>
				97	#include <linux/cpuset.h>
				98	#include <linux/proc_fs.h>
				99	#include <linux/seq_file.h>
				100	#include <linux/notifier.h>
				101	#include <linux/kallsyms.h>
				102	#include <linux/cpu.h>
				103	#include <linux/sysctl.h>
				104	#include <linux/module.h>
				105	#include <linux/rcupdate.h>
				106	#include <linux/string.h>
				107	#include <linux/uaccess.h>
				108	#include <linux/nodemask.h>
				109	#include <linux/kmemleak.h>
				110	#include <linux/mempolicy.h>
				111	#include <linux/mutex.h>
				112	#include <linux/fault-inject.h>
				113	#include <linux/rtmutex.h>
				114	#include <linux/reciprocal_div.h>
				115	#include <linux/debugobjects.h>
				116	#include <linux/kmemcheck.h>
				117	#include <linux/memory.h>
				118	#include <linux/prefetch.h>
				119	#include <linux/locallock.h>
				120
				121	#include <asm/cacheflush.h>
				122	#include <asm/tlbflush.h>
				123	#include <asm/page.h>
				124
				125	#include <trace/events/kmem.h>
				126
				127	#ifdef CONFIG_KMALLOC_TRACKER
				128	#include <linux/mem_tracker_def.h>
				129	#endif
				130
				131	/*
				132	* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
				133	* 0 for faster, smaller code (especially in the critical paths).
				134	*
				135	* STATS - 1 to collect stats for /proc/slabinfo.
				136	* 0 for faster, smaller code (especially in the critical paths).
				137	*
				138	* FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
				139	*/
				140
					/*
					 * CONFIG_DEBUG_SLAB turns the three knobs described above on together;
					 * without it all three are defined to 0.
					 */
				141	#ifdef CONFIG_DEBUG_SLAB
				142	#define DEBUG 1
				143	#define STATS 1
				144	#define FORCED_DEBUG 1
				145	#else
				146	#define DEBUG 0
				147	#define STATS 0
				148	#define FORCED_DEBUG 0
				149	#endif
				150
				151	/* Shouldn't this be in a header file somewhere? */
					/* BYTES_PER_WORD: size of one pointer. */
				152	#define BYTES_PER_WORD sizeof(void *)
					/* REDZONE_ALIGN: the larger of BYTES_PER_WORD and the alignment of unsigned long long. */
				153	#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
				154
					/* Fallback used only when nothing defined ARCH_KMALLOC_FLAGS earlier. */
				155	#ifndef ARCH_KMALLOC_FLAGS
				156	#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
				157	#endif
				158
				159	/* Legal flag mask for kmem_cache_create(). */
					/*
					 * The DEBUG variant additionally contains SLAB_RED_ZONE, SLAB_POISON
					 * and SLAB_STORE_USER; every other flag appears in both variants.
					 */
				160	#if DEBUG
				161	# define CREATE_MASK	(SLAB_RED_ZONE | \
				162				 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
				163				 SLAB_CACHE_DMA | \
				164				 SLAB_STORE_USER | \
				165				 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
				166				 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
				167				 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
				168	#else
				169	# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
				170				 SLAB_CACHE_DMA | \
				171				 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
				172				 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
				173				 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
				174	#endif
				175
				176	/*
				177	* kmem_bufctl_t:
				178	*
				179	* Bufctls are used for linking objs within a slab; they are
				180	* linked offsets.
				181	*
				182	* This implementation relies on "struct page" for locating the cache &
				183	* slab an object belongs to.
				184	* This allows the bufctl structure to be small (one int), but limits
				185	* the number of objects a slab (not a cache) can contain when off-slab
				186	* bufctls are used. The limit is the size of the largest general cache
				187	* that does not use off-slab slabs.
				188	* For 32bit archs with 4 kB pages, this is 56.
				189	* This is not serious, as it is only for large objects, when it is unwise
				190	* to have too many per slab.
				191	* Note: This limit can be raised by introducing a general cache whose size
				192	* is less than 512 (PAGE_SIZE<<3), but greater than 256.
				193	*/
				194
					/* Bufctl entry type; the four highest values are reserved as markers below. */
				195	typedef unsigned int kmem_bufctl_t;
				196	#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)	/* ~0U */
				197	#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)	/* ~0U - 1 */
				198	#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)	/* ~0U - 2 */
				199	#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)	/* ~0U - 3; cache_estimate() caps objects per slab at this */
				200
				201	/*
				202	* struct slab_rcu
				203	*
				204	* slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
				205	* arrange for kmem_freepages to be called via RCU. This is useful if
				206	* we need to approach a kernel structure obliquely, from its address
				207	* obtained without the usual locking. We can lock the structure to
				208	* stabilize it and check it's still at the given address, only if we
				209	* can be sure that the memory has not been meanwhile reused for some
				210	* other kind of object (which our subsystem's lock might corrupt).
				211	*
				212	* rcu_read_lock before reading the address, then rcu_read_unlock after
				213	* taking the spinlock within the structure expected at that address.
				214	*/
					/* Shares storage with the regular slab fields through the union in struct slab. */
				215	struct slab_rcu {
				216	struct rcu_head head;	/* RCU callback linkage */
				217	struct kmem_cache *cachep;	/* NOTE(review): presumably the cache the pages belong to - confirm in slab_destroy() */
				218	void *addr;	/* NOTE(review): presumably the address handed to kmem_freepages() - confirm */
				219	};
				220
				221	/*
				222	* struct slab
				223	*
				224	* Manages the objs in a slab. Placed either at the beginning of mem allocated
				225	* for a slab, or allocated from a general cache.
				226	* Slabs are chained into three lists: fully used, partial, fully free slabs.
				227	*/
				228	struct slab {
				229	union {
				230	struct {
				231	struct list_head list;
				232	unsigned long colouroff;
				233	void s_mem; / including colour offset */
				234	unsigned int inuse; /* num of objs active in slab */
				235	kmem_bufctl_t free;
				236	unsigned short nodeid;
				237	};
				238	struct slab_rcu __slab_cover_slab_rcu;
				239	};
				240	};
				241
				242	/*
				243	* struct array_cache
				244	*
				245	* Purpose:
				246	* - LIFO ordering, to hand out cache-warm objects from _alloc
				247	* - reduce the number of linked list operations
				248	* - reduce spinlock operations
				249	*
				250	* The limit is stored in the per-cpu structure to reduce the data cache
				251	* footprint.
				252	*
				253	*/
				254	struct array_cache {
				255	unsigned int avail;
				256	unsigned int limit;
				257	unsigned int batchcount;
				258	unsigned int touched;
				259	spinlock_t lock;
				260	void entry[]; /
				261	* Must have this definition in here for the proper
				262	* alignment of array_cache. Also simplifies accessing
				263	* the entries.
				264	*/
				265	};
				266
				267	/*
				268	* bootstrap: The caches do not work without cpuarrays anymore, but the
				269	* cpuarrays are allocated from the generic caches...
				270	*/
					/* Number of entry slots in the statically allocated boot-time array caches. */
				271	#define BOOT_CPUCACHE_ENTRIES 1
					/* An array_cache header immediately followed by storage for its entry[] slots. */
				272	struct arraycache_init {
				273	struct array_cache cache;
				274	void *entries[BOOT_CPUCACHE_ENTRIES];
				275	};
				276
				277	/*
				278	* The slab lists for all objects.
				279	*/
				280	struct kmem_list3 {
				281	struct list_head slabs_partial; /* partial list first, better asm code */
				282	struct list_head slabs_full;
				283	struct list_head slabs_free;
				284	unsigned long free_objects;
				285	unsigned int free_limit;
				286	unsigned int colour_next; /* Per-node cache coloring */
				287	spinlock_t list_lock;
				288	struct array_cache shared; / shared per node */
				289	struct array_cache *alien; / on other nodes */
				290	unsigned long next_reap; /* updated without locking */
				291	int free_touched; /* updated without locking */
				292	};
				293
				294	/*
				295	* Need this for bootstrapping a per node allocator.
				296	*/
					/* Three groups of MAX_NUMNODES boot-time list structures. */
				297	#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
				298	static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
					/* NOTE(review): these look like the start index of each group in initkmem_list3[] - confirm at the users. */
				299	#define CACHE_CACHE 0
				300	#define SIZE_AC MAX_NUMNODES
				301	#define SIZE_L3 (2 * MAX_NUMNODES)
				302
				303	static int drain_freelist(struct kmem_cache *cache,
				304	struct kmem_list3 *l3, int tofree);
				305	static void free_block(struct kmem_cache cachep, void *objpp, int len,
				306	int node);
				307	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
				308	static void cache_reap(struct work_struct *unused);
				309
				310	/*
				311	* This function must be completely optimized away if a constant is passed to
				312	* it. Mostly the same as what is in linux/slab.h except it returns an index.
				313	*/
				314	static __always_inline int index_of(const size_t size)
				315	{
					/* Only declared here. NOTE(review): presumably a call that survives optimisation fails the build - confirm. */
				316	extern void __bad_size(void);
				317
				318	if (__builtin_constant_p(size)) {
				319	int i = 0;
				320
					/* Every CACHE(x) line in kmalloc_sizes.h becomes one rung: return i if size fits in x, else try the next. */
				321	#define CACHE(x) \
				322	if (size <=x) \
				323	return i; \
				324	else \
				325	i++;
				326	#include <linux/kmalloc_sizes.h>
				327	#undef CACHE
					/* Reached only when size exceeds every listed size. */
				328	__bad_size();
				329	} else
					/* A non-constant size is not supported. */
				330	__bad_size();
				331	return 0;
				332	}
				333
					/* Starts at 1. NOTE(review): presumably cleared once early boot setup is finished - confirm. */
				334	static int slab_early_init = 1;
				335
					/* index_of() results for the sizes of struct arraycache_init and struct kmem_list3. */
				336	#define INDEX_AC index_of(sizeof(struct arraycache_init))
				337	#define INDEX_L3 index_of(sizeof(struct kmem_list3))
				338
				339	static void kmem_list3_init(struct kmem_list3 *parent)
				340	{
				341	INIT_LIST_HEAD(&parent->slabs_full);
				342	INIT_LIST_HEAD(&parent->slabs_partial);
				343	INIT_LIST_HEAD(&parent->slabs_free);
				344	parent->shared = NULL;
				345	parent->alien = NULL;
				346	parent->colour_next = 0;
				347	spin_lock_init(&parent->list_lock);
				348	parent->free_objects = 0;
				349	parent->free_touched = 0;
				350	}
				351
					/*
					 * MAKE_LIST: initialise @listp as an empty list, then splice the list
					 * member @slab of cachep->nodelists[nodeid] onto it.
					 */
				352	#define MAKE_LIST(cachep, listp, slab, nodeid) \
				353	do { \
				354	INIT_LIST_HEAD(listp); \
				355	list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
				356	} while (0)
				357
					/* MAKE_ALL_LISTS: apply MAKE_LIST to the full, partial and free lists of @ptr. */
				358	#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
				359	do { \
				360	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
				361	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
				362	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
				363	} while (0)
				364
					/* Flag bit tested by OFF_SLAB() and by cache_estimate(). */
				365	#define CFLGS_OFF_SLAB (0x80000000UL)
				366	#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
				367
				368	#define BATCHREFILL_LIMIT 16
				369	/*
				370	* Optimization question: fewer reaps means less probability for unnecessary
				371	* cpucache drain/refill cycles.
				372	*
				373	* OTOH the cpuarrays can contain lots of objects,
				374	* which could lock up otherwise freeable slabs.
				375	*/
				376	#define REAPTIMEOUT_CPUC (2*HZ)
				377	#define REAPTIMEOUT_LIST3 (4*HZ)
				378
					/*
					 * Statistics hooks. With STATS set they update counters on the cache
					 * (x); otherwise they expand to empty statements. STATS_ADD_REAPED
					 * still evaluates its second argument as (void)(y) in that case.
					 */
				379	#if STATS
				380	#define STATS_INC_ACTIVE(x) ((x)->num_active++)
				381	#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
				382	#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
				383	#define STATS_INC_GROWN(x) ((x)->grown++)
				384	#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
					/* Raise high_mark to num_active when num_active exceeds it. */
				385	#define STATS_SET_HIGH(x) \
				386	do { \
				387	if ((x)->num_active > (x)->high_mark) \
				388	(x)->high_mark = (x)->num_active; \
				389	} while (0)
				390	#define STATS_INC_ERR(x) ((x)->errors++)
				391	#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
				392	#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
				393	#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
					/* Raise max_freeable to i when i exceeds it. */
				394	#define STATS_SET_FREEABLE(x, i) \
				395	do { \
				396	if ((x)->max_freeable < i) \
				397	(x)->max_freeable = i; \
				398	} while (0)
					/* The hit/miss counters are atomics. */
				399	#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
				400	#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
				401	#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
				402	#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
				403	#else
				404	#define STATS_INC_ACTIVE(x) do { } while (0)
				405	#define STATS_DEC_ACTIVE(x) do { } while (0)
				406	#define STATS_INC_ALLOCED(x) do { } while (0)
				407	#define STATS_INC_GROWN(x) do { } while (0)
				408	#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
				409	#define STATS_SET_HIGH(x) do { } while (0)
				410	#define STATS_INC_ERR(x) do { } while (0)
				411	#define STATS_INC_NODEALLOCS(x) do { } while (0)
				412	#define STATS_INC_NODEFREES(x) do { } while (0)
				413	#define STATS_INC_ACOVERFLOW(x) do { } while (0)
				414	#define STATS_SET_FREEABLE(x, i) do { } while (0)
				415	#define STATS_INC_ALLOCHIT(x) do { } while (0)
				416	#define STATS_INC_ALLOCMISS(x) do { } while (0)
				417	#define STATS_INC_FREEHIT(x) do { } while (0)
				418	#define STATS_INC_FREEMISS(x) do { } while (0)
				419	#endif
				420
				421	#if DEBUG
				422
				423	/*
				424	* memory layout of objects:
				425	* 0 : objp
				426	* 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
				427	* the end of an object is aligned with the end of the real
				428	* allocation. Catches writes behind the end of the allocation.
				429	* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
				430	* redzone word.
				431	* cachep->obj_offset: The real object.
				432	* cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
				433	* cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
				434	* [BYTES_PER_WORD long]
				435	*/
					/* Offset of the real object inside its buffer. */
				436	static int obj_offset(struct kmem_cache *cachep)
				437	{
				438		return cachep->obj_offset;
				439	}
				440
					/* Object size as recorded in the cache. */
				441	static int obj_size(struct kmem_cache *cachep)
				442	{
				443		return cachep->obj_size;
				444	}
				445
					/*
					 * Address of the leading red-zone word: the unsigned long long that
					 * ends at objp + obj_offset(cachep). BUGs unless the cache has
					 * SLAB_RED_ZONE set.
					 */
				446	static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
				447	{
				448		BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				449		return (unsigned long long*) (objp + obj_offset(cachep) -
				450					      sizeof(unsigned long long));
				451	}
				452
					/*
					 * Address of the trailing red-zone word. It is the last unsigned long
					 * long of the buffer, or sits REDZONE_ALIGN bytes earlier when
					 * SLAB_STORE_USER is also set. BUGs unless the cache has
					 * SLAB_RED_ZONE set.
					 */
				453	static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
				454	{
				455		BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
				456		if (cachep->flags & SLAB_STORE_USER)
				457			return (unsigned long long *)(objp + cachep->buffer_size -
				458						      sizeof(unsigned long long) -
				459						      REDZONE_ALIGN);
				460		return (unsigned long long *) (objp + cachep->buffer_size -
				461					       sizeof(unsigned long long));
				462	}
				463
					/*
					 * Address of the last pointer-sized word of the buffer. BUGs unless
					 * the cache has SLAB_STORE_USER set.
					 */
				464	static void **dbg_userword(struct kmem_cache *cachep, void *objp)
				465	{
				466		BUG_ON(!(cachep->flags & SLAB_STORE_USER));
				467		return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
				468	}
				469
				470	#else
				471
					/* Without DEBUG the object starts at offset 0 and fills the buffer; the dbg_* accessors must not be reached. */
				472	#define obj_offset(x) 0
				473	#define obj_size(cachep) (cachep->buffer_size)
				474	#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				475	#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
				476	#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
				477
				478	#endif
				479
				480	#ifdef CONFIG_DEBUG_SLAB_MARK_HEAD
					/* Address of record slot @index, counted in BYTES_PER_RECORD units from @objp. */
				481	void **dbg_userrecord(void *objp, int index)
				482	{
				483		return (void **)(objp + index * BYTES_PER_RECORD);
				484	}
				485	EXPORT_SYMBOL(dbg_userrecord);
				486
					/* Address RECORD_COUNT records (of BYTES_PER_RECORD bytes each) before @objp. */
				487	void **dbg_userhead(void *objp)
				488	{
				489		return (void **)(objp - RECORD_COUNT * BYTES_PER_RECORD);
				490	}
				491	EXPORT_SYMBOL(dbg_userhead);
				492	#endif
				493
				494	void *dbg_recordtask(struct kmem_cache cachep, void *objp)
				495	{
				496	return (void *)(objp + cachep->buffer_size - 2BYTES_PER_WORD);
				497	}
				498	EXPORT_SYMBOL(dbg_recordtask);
				499
				500	void *dbg_recordcaller(struct kmem_cache cachep, void *objp)
				501	{
				502	return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
				503	}
				504	EXPORT_SYMBOL(dbg_recordcaller);
				505
				506	#ifdef CONFIG_TRACING
					/* Exported accessor: returns the cache's buffer_size. */
				507	size_t slab_buffer_size(struct kmem_cache *cachep)
				508	{
				509	return cachep->buffer_size;
				510	}
				511	EXPORT_SYMBOL(slab_buffer_size);
				512	#endif
				513
				514	/*
				515	* Do not go above this order unless 0 objects fit into the slab or
				516	* overridden on the command line.
				517	*/
				518	#define SLAB_MAX_ORDER_HI 1
				519	#define SLAB_MAX_ORDER_LO 0
					/* Starts at the low limit. */
				520	static int slab_max_order = SLAB_MAX_ORDER_LO;
					/* NOTE(review): presumably records a command-line override of slab_max_order - confirm at the setter. */
				521	static bool slab_max_order_set __initdata;
				522
				523	/*
				524	* Functions for storing/retrieving the cachep and or slab from the page
				525	* allocator. These are used to find the slab an obj belongs to. With kfree(),
				526	* these are used to find the cache which an obj belongs to.
				527	*/
				528	static inline void page_set_cache(struct page page, struct kmem_cache cache)
				529	{
				530	page->lru.next = (struct list_head *)cache;
				531	}
				532
				533	static inline struct kmem_cache page_get_cache(struct page page)
				534	{
				535	page = compound_head(page);
				536	BUG_ON(!PageSlab(page));
				537	return (struct kmem_cache *)page->lru.next;
				538	}
				539
				540	static inline void page_set_slab(struct page page, struct slab slab)
				541	{
				542	page->lru.prev = (struct list_head *)slab;
				543	}
				544
				545	static inline struct slab page_get_slab(struct page page)
				546	{
				547	BUG_ON(!PageSlab(page));
				548	return (struct slab *)page->lru.prev;
				549	}
				550
				551	static inline struct kmem_cache virt_to_cache(const void obj)
				552	{
				553	struct page *page = virt_to_head_page(obj);
				554	return page_get_cache(page);
				555	}
				556
				557	static inline struct slab virt_to_slab(const void obj)
				558	{
				559	struct page *page = virt_to_head_page(obj);
				560	return page_get_slab(page);
				561	}
				562
				563	static inline void index_to_obj(struct kmem_cache cache, struct slab *slab,
				564	unsigned int idx)
				565	{
				566	return slab->s_mem + cache->buffer_size * idx;
				567	}
				568
				569	/*
				570	* We want to avoid an expensive divide : (offset / cache->buffer_size)
				571	* Using the fact that buffer_size is a constant for a particular cache,
				572	* we can replace (offset / cache->buffer_size) by
				573	* reciprocal_divide(offset, cache->reciprocal_buffer_size)
				574	*/
				575	static inline unsigned int obj_to_index(const struct kmem_cache *cache,
				576	const struct slab slab, void obj)
				577	{
				578	u32 offset = (obj - slab->s_mem);
				579	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
				580	}
				581
				582	/*
				583	* These are the default caches for kmalloc. Custom caches can have other sizes.
				584	*/
				585	struct cache_sizes malloc_sizes[] = {
					/* One entry per CACHE() line of kmalloc_sizes.h; only .cs_size is set here. */
				586	#define CACHE(x) { .cs_size = (x) },
				587	#include <linux/kmalloc_sizes.h>
					/* Final ULONG_MAX entry: ends the size search in __find_general_cachep(). */
				588	CACHE(ULONG_MAX)
				589	#undef CACHE
				590	};
				591	EXPORT_SYMBOL(malloc_sizes);
				592
				593	/* Must match cache_sizes above. Out of line to keep cache footprint low. */
				594	struct cache_names {
				595	char *name;	/* "size-<x>" */
				596	char *name_dma;	/* "size-<x>(DMA)" */
				597	};
				598
				599	static struct cache_names __initdata cache_names[] = {
					/* #x stringifies each size listed in kmalloc_sizes.h. */
				600	#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
				601	#include <linux/kmalloc_sizes.h>
					/* NULL-named final entry. */
				602	{NULL,}
				603	#undef CACHE
				604	};
				605
					/*
					 * Static boot-time array caches. The positional initialisers set
					 * avail = 0, limit = BOOT_CPUCACHE_ENTRIES, batchcount = 1, touched = 0.
					 * Only initarray_cache is __initdata.
					 */
				606	static struct arraycache_init initarray_cache __initdata =
				607	{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
				608	static struct arraycache_init initarray_generic =
				609	{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
				610
				611	/* internal cache of cache description objs */
					/* Per-node list pointers for cache_cache. */
				612	static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
					/* Statically defined cache whose objects are struct kmem_cache themselves. */
				613	static struct kmem_cache cache_cache = {
				614	.nodelists = cache_cache_nodelists,
				615	.batchcount = 1,
				616	.limit = BOOT_CPUCACHE_ENTRIES,
				617	.shared = 1,
				618	.buffer_size = sizeof(struct kmem_cache),
				619	.name = "kmem_cache",
				620	};
				621
				622	#define BAD_ALIEN_MAGIC 0x01020304ul	/* marker for "no alien cache"; checked in slab_set_lock_classes() */
				623
				624	/*
				625	* chicken and egg problem: delay the per-cpu array allocation
				626	* until the general caches are up.
				627	*/
					/* The enumerators are compared by order (>= EARLY, < LATE), so their sequence matters. */
				628	static enum {
				629	NONE,
				630	PARTIAL_AC,
				631	PARTIAL_L3,
				632	EARLY,
				633	LATE,
				634	FULL
				635	} g_cpucache_up;
				636
				637	/*
				638	* used by boot code to determine if it can use slab based allocator
				639	*/
				640	int slab_is_available(void)
				641	{
				642	return g_cpucache_up >= EARLY;
				643	}
				644
				645	/*
				646	* Guard access to the cache-chain.
				647	*/
				648	static DEFINE_MUTEX(cache_chain_mutex);
					/* Head of the list of caches; walked through each cache's 'next' member. */
				649	static struct list_head cache_chain;
				650
				651	#ifdef CONFIG_LOCKDEP
				652
				653	/*
				654	* Slab sometimes uses the kmalloc slabs to store the slab headers
				655	* for other slabs "off slab".
				656	* The locking for this is tricky in that it nests within the locks
				657	* of all other slabs in a few places; to deal with this special
				658	* locking we put on-slab caches into a separate lock-class.
				659	*
				660	* We set lock class for alien array caches which are up during init.
				661	* The lock annotation will be lost if all cpus of a node goes down and
				662	* then comes back up during hotplug
				663	*/
					/* Lock classes for on-slab caches: list_lock and alien array-cache locks. */
				664	static struct lock_class_key on_slab_l3_key;
				665	static struct lock_class_key on_slab_alc_key;
				666
					/* The same pair, applied by the slab_set_debugobj_lock_classes*() helpers. */
				667	static struct lock_class_key debugobj_l3_key;
				668	static struct lock_class_key debugobj_alc_key;
				669
				670	static void slab_set_lock_classes(struct kmem_cache *cachep,
				671	struct lock_class_key l3_key, struct lock_class_key alc_key,
				672	int q)
				673	{
				674	struct array_cache **alc;
				675	struct kmem_list3 *l3;
				676	int r;
				677
				678	l3 = cachep->nodelists[q];
				679	if (!l3)
				680	return;
				681
				682	lockdep_set_class(&l3->list_lock, l3_key);
				683	alc = l3->alien;
				684	/*
				685	* FIXME: This check for BAD_ALIEN_MAGIC
				686	* should go away when common slab code is taught to
				687	* work even without alien caches.
				688	* Currently, non NUMA code returns BAD_ALIEN_MAGIC
				689	* for alloc_alien_cache,
				690	*/
				691	if (!alc \|\| (unsigned long)alc == BAD_ALIEN_MAGIC)
				692	return;
				693	for_each_node(r) {
				694	if (alc[r])
				695	lockdep_set_class(&alc[r]->lock, alc_key);
				696	}
				697	}
				698
					/* Apply the debugobj lock classes to @cachep's locks on one node. */
				699	static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
				700	{
				701	slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
				702	}
				703
					/* Apply the debugobj lock classes to @cachep's locks on every online node. */
				704	static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
				705	{
				706	int node;
				707
				708	for_each_online_node(node)
				709	slab_set_debugobj_lock_classes_node(cachep, node);
				710	}
				711
				712	static void init_lock_keys(struct kmem_cache *cachep, int node)
				713	{
				714	struct kmem_list3 *l3;
				715
				716	if (g_cpucache_up < LATE)
				717	return;
				718
				719	l3 = cachep->nodelists[node];
				720	if (!l3 \|\| OFF_SLAB(cachep))
				721	return;
				722
				723	slab_set_lock_classes(cachep, &on_slab_l3_key, &on_slab_alc_key, node);
				724	}
				725
					/* Run init_lock_keys() for @node on every cache on cache_chain. */
				726	static void init_node_lock_keys(int node)
				727	{
				728	struct kmem_cache *cachep;
				729
				730	list_for_each_entry(cachep, &cache_chain, next)
				731	init_lock_keys(cachep, node);
				732	}
				733
					/* Run init_lock_keys() for @cachep on every node. */
				734	static inline void init_cachep_lock_keys(struct kmem_cache *cachep)
				735	{
				736	int node;
				737
				738	for_each_node(node)
				739	init_lock_keys(cachep, node);
				740	}
				741	#else
					/* Without CONFIG_LOCKDEP the four lock-class helpers are empty stubs. */
				742	static void init_node_lock_keys(int node)
				743	{
				744	}
				745
				746	static void init_cachep_lock_keys(struct kmem_cache *cachep)
				747	{
				748	}
				749
				750	static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
				751	{
				752	}
				753
				754	static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
				755	{
				756	}
				757	#endif
				758
					/* Per-CPU delayed work item. NOTE(review): presumably runs cache_reap() - confirm where it is initialised. */
				759	static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
					/* Per-CPU list of pages whose freeing is deferred; emptied by the unlock_*_and_free_delayed() helpers. */
				760	static DEFINE_PER_CPU(struct list_head, slab_free_list);
					/* Local IRQ lock used by the lock/unlock helpers in this file. */
				761	static DEFINE_LOCAL_IRQ_LOCK(slab_lock);
				762
				763	#ifndef CONFIG_PREEMPT_RT_BASE
					/* Non-RT: plain on_each_cpu() with its last argument set to 1. */
				764	# define slab_on_each_cpu(func, cp) on_each_cpu(func, cp, 1)
				765	#else
				766	/*
				767	* execute func() for all CPUs. On PREEMPT_RT we dont actually have
				768	* to run on the remote CPUs - we only have to take their CPU-locks.
				769	* (This is a rare operation, so cacheline bouncing is not an issue.)
				770	*/
					/* @func is called as func(arg, cpu) once per online CPU, all from the calling CPU. */
				771	static void
				772	slab_on_each_cpu(void (*func)(void *arg, int this_cpu), void *arg)
				773	{
				774		unsigned int i;
				775
				776		get_cpu_light();
				777		for_each_online_cpu(i)
				778			func(arg, i);
				779		put_cpu_light();
				780	}
				781
					/* Take slab_lock on behalf of @cpu. */
				782	static void lock_slab_on(unsigned int cpu)
				783	{
				784		local_lock_irq_on(slab_lock, cpu);
				785	}
				786
					/* Release slab_lock taken with lock_slab_on(). */
				787	static void unlock_slab_on(unsigned int cpu)
				788	{
				789		local_unlock_irq_on(slab_lock, cpu);
				790	}
				791	#endif
				792
				793	static void free_delayed(struct list_head *h)
				794	{
				795	while(!list_empty(h)) {
				796	struct page *page = list_first_entry(h, struct page, lru);
				797
				798	list_del(&page->lru);
				799	__free_pages(page, page->index);
				800	}
				801	}
				802
					/*
					 * Release slab_lock together with @list_lock, then free the pages
					 * queued on this CPU's slab_free_list.
					 */
				803	static void unlock_l3_and_free_delayed(spinlock_t *list_lock)
				804	{
				805	LIST_HEAD(tmp);
				806
					/* Move the queued pages to a private list before unlocking. */
				807	list_splice_init(&__get_cpu_var(slab_free_list), &tmp);
				808	local_spin_unlock_irq(slab_lock, list_lock);
					/* The pages are freed only after the unlock. */
				809	free_delayed(&tmp);
				810	}
				811
					/*
					 * Release slab_lock, restoring @flags, then free the pages queued on
					 * this CPU's slab_free_list.
					 */
				812	static void unlock_slab_and_free_delayed(unsigned long flags)
				813	{
				814	LIST_HEAD(tmp);
				815
					/* Move the queued pages to a private list before unlocking. */
				816	list_splice_init(&__get_cpu_var(slab_free_list), &tmp);
				817	local_unlock_irqrestore(slab_lock, flags);
					/* The pages are freed only after the unlock. */
				818	free_delayed(&tmp);
				819	}
				820
				821	static inline struct array_cache cpu_cache_get(struct kmem_cache cachep)
				822	{
				823	return cachep->array[smp_processor_id()];
				824	}
				825
				826	static inline struct array_cache cpu_cache_get_on_cpu(struct kmem_cache cachep,
				827	int cpu)
				828	{
				829	return cachep->array[cpu];
				830	}
				831
				832	static inline struct kmem_cache *__find_general_cachep(size_t size,
				833	gfp_t gfpflags)
				834	{
				835	struct cache_sizes *csizep = malloc_sizes;
				836
				837	#if DEBUG
				838	/* This happens if someone tries to call
				839	* kmem_cache_create(), or __kmalloc(), before
				840	* the generic caches are initialized.
				841	*/
				842	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
				843	#endif
				844	if (!size)
				845	return ZERO_SIZE_PTR;
				846
				847	while (size > csizep->cs_size)
				848	csizep++;
				849
				850	/*
				851	* Really subtle: The last entry with cs->cs_size==ULONG_MAX
				852	* has cs_{dma,}cachep==NULL. Thus no special case
				853	* for large kmalloc calls required.
				854	*/
				855	#ifdef CONFIG_ZONE_DMA
				856	if (unlikely(gfpflags & GFP_DMA))
				857	return csizep->cs_dmacachep;
				858	#endif
				859	return csizep->cs_cachep;
				860	}
				861
					/* Out-of-line wrapper around __find_general_cachep(); same arguments, same result. */
				862	static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
				863	{
				864	return __find_general_cachep(size, gfpflags);
				865	}
				866
				867	static size_t slab_mgmt_size(size_t nr_objs, size_t align)
				868	{
				869	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
				870	}
				871
				872	/*
				873	* Calculate the number of objects and left-over bytes for a given buffer size.
				874	*/
				875	static void cache_estimate(unsigned long gfporder, size_t buffer_size,
				876	size_t align, int flags, size_t *left_over,
				877	unsigned int *num)
				878	{
				879	int nr_objs;
				880	size_t mgmt_size;
				881	size_t slab_size = PAGE_SIZE << gfporder;
				882
				883	/*
				884	* The slab management structure can be either off the slab or
				885	* on it. For the latter case, the memory allocated for a
				886	* slab is used for:
				887	*
				888	* - The struct slab
				889	* - One kmem_bufctl_t for each object
				890	* - Padding to respect alignment of @align
				891	* - @buffer_size bytes for each object
				892	*
				893	* If the slab management structure is off the slab, then the
				894	* alignment will already be calculated into the size. Because
				895	* the slabs are all pages aligned, the objects will be at the
				896	* correct alignment when allocated.
				897	*/
				898	if (flags & CFLGS_OFF_SLAB) {
				899	mgmt_size = 0;
				900	nr_objs = slab_size / buffer_size;
				901
				902	if (nr_objs > SLAB_LIMIT)
				903	nr_objs = SLAB_LIMIT;
				904	} else {
				905	/*
				906	* Ignore padding for the initial guess. The padding
				907	* is at most @align-1 bytes, and @buffer_size is at
				908	* least @align. In the worst case, this result will
				909	* be one greater than the number of objects that fit
				910	* into the memory allocation when taking the padding
				911	* into account.
				912	*/
				913	nr_objs = (slab_size - sizeof(struct slab)) /
				914	(buffer_size + sizeof(kmem_bufctl_t));
				915
				916	/*
				917	* This calculated number will be either the right
				918	* amount, or one greater than what we want.
				919	*/
				920	if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
				921	> slab_size)
				922	nr_objs--;
				923
				924	if (nr_objs > SLAB_LIMIT)
				925	nr_objs = SLAB_LIMIT;
				926
				927	mgmt_size = slab_mgmt_size(nr_objs, align);
				928	}
				929	*num = nr_objs;
				930	left_over = slab_size - nr_objsbuffer_size - mgmt_size;
				931	}
				932
				933	#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
				934
				935	static void __slab_error(const char function, struct kmem_cache cachep,
				936	char *msg)
				937	{
				938	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
				939	function, cachep->name, msg);
				940	dump_stack();
				941	}
				942
				943	/*
				944	* By default on NUMA we use alien caches to stage the freeing of
				945	* objects allocated from other nodes. This causes massive memory
				946	* inefficiencies when using fake NUMA setup to split memory into a
				947	* large number of small nodes, so it can be disabled on the command
				948	* line
				949	*/
				950
				951	static int use_alien_caches __read_mostly = 1;
				952	static int __init noaliencache_setup(char *s)
				953	{
				954	use_alien_caches = 0;
				955	return 1;
				956	}
				957	__setup("noaliencache", noaliencache_setup);
				958
				959	static int __init slab_max_order_setup(char *str)
				960	{
				961	get_option(&str, &slab_max_order);
				962	slab_max_order = slab_max_order < 0 ? 0 :
				963	min(slab_max_order, MAX_ORDER - 1);
				964	slab_max_order_set = true;
				965
				966	return 1;
				967	}
				968	__setup("slab_max_order=", slab_max_order_setup);
				969
				970	#ifdef CONFIG_NUMA
				971	/*
				972	* Special reaping functions for NUMA systems called from cache_reap().
				973	* These take care of doing round robin flushing of alien caches (containing
				974	* objects freed on different nodes from which they were allocated) and the
				975	* flushing of remote pcps by calling drain_node_pages.
				976	*/
				977	static DEFINE_PER_CPU(unsigned long, slab_reap_node);
				978
				979	static void init_reap_node(int cpu)
				980	{
				981	int node;
				982
				983	node = next_node(cpu_to_mem(cpu), node_online_map);
				984	if (node == MAX_NUMNODES)
				985	node = first_node(node_online_map);
				986
				987	per_cpu(slab_reap_node, cpu) = node;
				988	}
				989
				990	static void next_reap_node(void)
				991	{
				992	int node = __this_cpu_read(slab_reap_node);
				993
				994	node = next_node(node, node_online_map);
				995	if (unlikely(node >= MAX_NUMNODES))
				996	node = first_node(node_online_map);
				997	__this_cpu_write(slab_reap_node, node);
				998	}
				999
				1000	#else
				/*
				 * !CONFIG_NUMA variants: there is no reap cursor to maintain, so both
				 * operations expand to empty statements.
				 */
				1001	#define init_reap_node(cpu) do { } while (0)
				1002	#define next_reap_node(void) do { } while (0)
				1003	#endif
				1004
				1005	/*
				1006	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
				1007	* via the workqueue/eventd.
				1008	* Add the CPU number into the expiration time to minimize the possibility of
				1009	* the CPUs getting into lockstep and contending for the global cache chain
				1010	* lock.
				1011	*/
				1012	static void __cpuinit start_cpu_timer(int cpu)
				1013	{
				1014	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
				1015
				1016	/*
				1017	* When this gets called from do_initcalls via cpucache_init(),
				1018	* init_workqueues() has already run, so keventd will be setup
				1019	* at that time.
				1020	*/
				1021	if (keventd_up() && reap_work->work.func == NULL) {
				1022	init_reap_node(cpu);
				1023	INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
				1024	schedule_delayed_work_on(cpu, reap_work,
				1025	__round_jiffies_relative(HZ, cpu));
				1026	}
				1027	}
				1028
				1029	static struct array_cache *alloc_arraycache(int node, int entries,
				1030	int batchcount, gfp_t gfp)
				1031	{
				1032	int memsize = sizeof(void ) entries + sizeof(struct array_cache);
				1033	struct array_cache *nc = NULL;
				1034
				1035	nc = kmalloc_node(memsize, gfp, node);
				1036	/*
				1037	* The array_cache structures contain pointers to free object.
				1038	* However, when such objects are allocated or transferred to another
				1039	* cache the pointers are not cleared and they could be counted as
				1040	* valid references during a kmemleak scan. Therefore, kmemleak must
				1041	* not scan such objects.
				1042	*/
				1043	kmemleak_no_scan(nc);
				1044	if (nc) {
				1045	nc->avail = 0;
				1046	nc->limit = entries;
				1047	nc->batchcount = batchcount;
				1048	nc->touched = 0;
				1049	spin_lock_init(&nc->lock);
				1050	}
				1051	return nc;
				1052	}
				1053
				1054	/*
				1055	* Transfer objects in one arraycache to another.
				1056	* Locking must be handled by the caller.
				1057	*
				1058	* Return the number of entries transferred.
				1059	*/
				1060	static int transfer_objects(struct array_cache *to,
				1061	struct array_cache *from, unsigned int max)
				1062	{
				1063	/* Figure out how many entries to transfer */
				1064	int nr = min3(from->avail, max, to->limit - to->avail);
				1065
				1066	if (!nr)
				1067	return 0;
				1068
				1069	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
				1070	sizeof(void ) nr);
				1071
				1072	from->avail -= nr;
				1073	to->avail += nr;
				1074	return nr;
				1075	}
				1076
				1077	#ifndef CONFIG_NUMA
				1078
				/*
				 * !CONFIG_NUMA variants: draining and reaping alien caches expand to
				 * empty statements.
				 */
				1079	#define drain_alien_cache(cachep, alien) do { } while (0)
				1080	#define reap_alien(cachep, l3) do { } while (0)
				1081
				1082	static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
				1083	{
				1084	return (struct array_cache **)BAD_ALIEN_MAGIC;
				1085	}
				1086
				/* !CONFIG_NUMA stub: there is nothing to release. */
				static inline void free_alien_cache(struct array_cache **ac_ptr)
				{
					/* intentionally empty */
				}
				1090
				/*
				 * !CONFIG_NUMA stub: always returns 0 and leaves @objp untouched, i.e.
				 * the object was not handed to an alien cache.
				 */
				static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
				{
					return 0;
				}
				1095
				1096	static inline void alternate_node_alloc(struct kmem_cache cachep,
				1097	gfp_t flags)
				1098	{
				1099	return NULL;
				1100	}
				1101
				1102	static inline void ____cache_alloc_node(struct kmem_cache cachep,
				1103	gfp_t flags, int nodeid)
				1104	{
				1105	return NULL;
				1106	}
				1107
				1108	#else /* CONFIG_NUMA */
				1109
				1110	static void ____cache_alloc_node(struct kmem_cache , gfp_t, int);
				1111	static void alternate_node_alloc(struct kmem_cache , gfp_t);
				1112
				1113	static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
				1114	{
				1115	struct array_cache **ac_ptr;
				1116	int memsize = sizeof(void ) nr_node_ids;
				1117	int i;
				1118
				1119	if (limit > 1)
				1120	limit = 12;
				1121	ac_ptr = kzalloc_node(memsize, gfp, node);
				1122	if (ac_ptr) {
				1123	for_each_node(i) {
				1124	if (i == node \|\| !node_online(i))
				1125	continue;
				1126	ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
				1127	if (!ac_ptr[i]) {
				1128	for (i--; i >= 0; i--)
				1129	kfree(ac_ptr[i]);
				1130	kfree(ac_ptr);
				1131	return NULL;
				1132	}
				1133	}
				1134	}
				1135	return ac_ptr;
				1136	}
				1137
				1138	static void free_alien_cache(struct array_cache **ac_ptr)
				1139	{
				1140	int i;
				1141
				1142	if (!ac_ptr)
				1143	return;
				1144	for_each_node(i)
				1145	kfree(ac_ptr[i]);
				1146	kfree(ac_ptr);
				1147	}
				1148
				1149	static void __drain_alien_cache(struct kmem_cache *cachep,
				1150	struct array_cache *ac, int node)
				1151	{
				1152	struct kmem_list3 *rl3 = cachep->nodelists[node];
				1153
				1154	if (ac->avail) {
				1155	spin_lock(&rl3->list_lock);
				1156	/*
				1157	* Stuff objects into the remote nodes shared array first.
				1158	* That way we could avoid the overhead of putting the objects
				1159	* into the free lists and getting them back later.
				1160	*/
				1161	if (rl3->shared)
				1162	transfer_objects(rl3->shared, ac, ac->limit);
				1163
				1164	free_block(cachep, ac->entry, ac->avail, node);
				1165	ac->avail = 0;
				1166	spin_unlock(&rl3->list_lock);
				1167	}
				1168	}
				1169
				1170	/*
				1171	* Called from cache_reap() to regularly drain alien caches round robin.
				1172	*/
				1173	static void reap_alien(struct kmem_cache cachep, struct kmem_list3 l3)
				1174	{
				1175	int node = __this_cpu_read(slab_reap_node);
				1176
				1177	if (l3->alien) {
				1178	struct array_cache *ac = l3->alien[node];
				1179
				1180	if (ac && ac->avail &&
				1181	local_spin_trylock_irq(slab_lock, &ac->lock)) {
				1182	__drain_alien_cache(cachep, ac, node);
				1183	local_spin_unlock_irq(slab_lock, &ac->lock);
				1184	}
				1185	}
				1186	}
				1187
				1188	static void drain_alien_cache(struct kmem_cache *cachep,
				1189	struct array_cache **alien)
				1190	{
				1191	int i = 0;
				1192	struct array_cache *ac;
				1193	unsigned long flags;
				1194
				1195	for_each_online_node(i) {
				1196	ac = alien[i];
				1197	if (ac) {
				1198	local_spin_lock_irqsave(slab_lock, &ac->lock, flags);
				1199	__drain_alien_cache(cachep, ac, i);
				1200	local_spin_unlock_irqrestore(slab_lock, &ac->lock, flags);
				1201	}
				1202	}
				1203	}
				1204
				1205	static inline int cache_free_alien(struct kmem_cache cachep, void objp)
				1206	{
				1207	struct slab *slabp = virt_to_slab(objp);
				1208	int nodeid = slabp->nodeid;
				1209	struct kmem_list3 *l3;
				1210	struct array_cache *alien = NULL;
				1211	int node;
				1212
				1213	node = numa_mem_id();
				1214
				1215	/*
				1216	* Make sure we are not freeing a object from another node to the array
				1217	* cache on this cpu.
				1218	*/
				1219	if (likely(slabp->nodeid == node))
				1220	return 0;
				1221
				1222	l3 = cachep->nodelists[node];
				1223	STATS_INC_NODEFREES(cachep);
				1224	if (l3->alien && l3->alien[nodeid]) {
				1225	alien = l3->alien[nodeid];
				1226	spin_lock(&alien->lock);
				1227	if (unlikely(alien->avail == alien->limit)) {
				1228	STATS_INC_ACOVERFLOW(cachep);
				1229	__drain_alien_cache(cachep, alien, nodeid);
				1230	}
				1231	alien->entry[alien->avail++] = objp;
				1232	spin_unlock(&alien->lock);
				1233	} else {
				1234	spin_lock(&(cachep->nodelists[nodeid])->list_lock);
				1235	free_block(cachep, &objp, 1, nodeid);
				1236	spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
				1237	}
				1238	return 1;
				1239	}
				1240	#endif
				1241
				1242	/*
				1243	* Allocates and initializes nodelists for a node on each slab cache, used for
				1244	* either memory or cpu hotplug. If memory is being hot-added, the kmem_list3
				1245	* will be allocated off-node since memory is not yet online for the new node.
				1246	* When hotplugging memory or a cpu, existing nodelists are not replaced if
				1247	* already in use.
				1248	*
				1249	* Must hold cache_chain_mutex.
				1250	*/
				1251	static int init_cache_nodelists_node(int node)
				1252	{
				1253	struct kmem_cache *cachep;
				1254	struct kmem_list3 *l3;
				1255	const int memsize = sizeof(struct kmem_list3);
				1256
				1257	list_for_each_entry(cachep, &cache_chain, next) {
				1258	/*
				1259	* Set up the size64 kmemlist for cpu before we can
				1260	* begin anything. Make sure some other cpu on this
				1261	* node has not already allocated this
				1262	*/
				1263	if (!cachep->nodelists[node]) {
				1264	l3 = kmalloc_node(memsize, GFP_KERNEL, node);
				1265	if (!l3)
				1266	return -ENOMEM;
				1267	kmem_list3_init(l3);
				1268	l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				1269	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
				1270
				1271	/*
				1272	* The l3s don't come and go as CPUs come and
				1273	* go. cache_chain_mutex is sufficient
				1274	* protection here.
				1275	*/
				1276	cachep->nodelists[node] = l3;
				1277	}
				1278
				1279	local_spin_lock_irq(slab_lock, &cachep->nodelists[node]->list_lock);
				1280	cachep->nodelists[node]->free_limit =
				1281	(1 + nr_cpus_node(node)) *
				1282	cachep->batchcount + cachep->num;
				1283	local_spin_unlock_irq(slab_lock, &cachep->nodelists[node]->list_lock);
				1284	}
				1285	return 0;
				1286	}
				1287
				1288	static void __cpuinit cpuup_canceled(long cpu)
				1289	{
				1290	struct kmem_cache *cachep;
				1291	struct kmem_list3 *l3 = NULL;
				1292	int node = cpu_to_mem(cpu);
				1293	const struct cpumask *mask = cpumask_of_node(node);
				1294
				1295	list_for_each_entry(cachep, &cache_chain, next) {
				1296	struct array_cache *nc;
				1297	struct array_cache *shared;
				1298	struct array_cache **alien;
				1299
				1300	/* cpu is dead; no one can alloc from it. */
				1301	nc = cachep->array[cpu];
				1302	cachep->array[cpu] = NULL;
				1303	l3 = cachep->nodelists[node];
				1304
				1305	if (!l3)
				1306	goto free_array_cache;
				1307
				1308	local_spin_lock_irq(slab_lock, &l3->list_lock);
				1309
				1310	/* Free limit for this kmem_list3 */
				1311	l3->free_limit -= cachep->batchcount;
				1312	if (nc)
				1313	free_block(cachep, nc->entry, nc->avail, node);
				1314
				1315	if (!cpumask_empty(mask)) {
				1316	unlock_l3_and_free_delayed(&l3->list_lock);
				1317	goto free_array_cache;
				1318	}
				1319
				1320	shared = l3->shared;
				1321	if (shared) {
				1322	free_block(cachep, shared->entry,
				1323	shared->avail, node);
				1324	l3->shared = NULL;
				1325	}
				1326
				1327	alien = l3->alien;
				1328	l3->alien = NULL;
				1329
				1330	unlock_l3_and_free_delayed(&l3->list_lock);
				1331
				1332	kfree(shared);
				1333	if (alien) {
				1334	drain_alien_cache(cachep, alien);
				1335	free_alien_cache(alien);
				1336	}
				1337	free_array_cache:
				1338	kfree(nc);
				1339	}
				1340	/*
				1341	* In the previous loop, all the objects were freed to
				1342	* the respective cache's slabs, now we can go ahead and
				1343	* shrink each nodelist to its limit.
				1344	*/
				1345	list_for_each_entry(cachep, &cache_chain, next) {
				1346	l3 = cachep->nodelists[node];
				1347	if (!l3)
				1348	continue;
				1349	drain_freelist(cachep, l3, l3->free_objects);
				1350	}
				1351	}
				1352
				1353	static int __cpuinit cpuup_prepare(long cpu)
				1354	{
				1355	struct kmem_cache *cachep;
				1356	struct kmem_list3 *l3 = NULL;
				1357	int node = cpu_to_mem(cpu);
				1358	int err;
				1359
				1360	/*
				1361	* We need to do this right in the beginning since
				1362	* alloc_arraycache's are going to use this list.
				1363	* kmalloc_node allows us to add the slab to the right
				1364	* kmem_list3 and not this cpu's kmem_list3
				1365	*/
				1366	err = init_cache_nodelists_node(node);
				1367	if (err < 0)
				1368	goto bad;
				1369
				1370	/*
				1371	* Now we can go ahead with allocating the shared arrays and
				1372	* array caches
				1373	*/
				1374	list_for_each_entry(cachep, &cache_chain, next) {
				1375	struct array_cache *nc;
				1376	struct array_cache *shared = NULL;
				1377	struct array_cache **alien = NULL;
				1378
				1379	nc = alloc_arraycache(node, cachep->limit,
				1380	cachep->batchcount, GFP_KERNEL);
				1381	if (!nc)
				1382	goto bad;
				1383	if (cachep->shared) {
				1384	shared = alloc_arraycache(node,
				1385	cachep->shared * cachep->batchcount,
				1386	0xbaadf00d, GFP_KERNEL);
				1387	if (!shared) {
				1388	kfree(nc);
				1389	goto bad;
				1390	}
				1391	}
				1392	if (use_alien_caches) {
				1393	alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
				1394	if (!alien) {
				1395	kfree(shared);
				1396	kfree(nc);
				1397	goto bad;
				1398	}
				1399	}
				1400	cachep->array[cpu] = nc;
				1401	l3 = cachep->nodelists[node];
				1402	BUG_ON(!l3);
				1403
				1404	local_spin_lock_irq(slab_lock, &l3->list_lock);
				1405	if (!l3->shared) {
				1406	/*
				1407	* We are serialised from CPU_DEAD or
				1408	* CPU_UP_CANCELLED by the cpucontrol lock
				1409	*/
				1410	l3->shared = shared;
				1411	shared = NULL;
				1412	}
				1413	#ifdef CONFIG_NUMA
				1414	if (!l3->alien) {
				1415	l3->alien = alien;
				1416	alien = NULL;
				1417	}
				1418	#endif
				1419	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				1420	kfree(shared);
				1421	free_alien_cache(alien);
				1422	if (cachep->flags & SLAB_DEBUG_OBJECTS)
				1423	slab_set_debugobj_lock_classes_node(cachep, node);
				1424	}
				1425	init_node_lock_keys(node);
				1426
				1427	return 0;
				1428	bad:
				1429	cpuup_canceled(cpu);
				1430	return -ENOMEM;
				1431	}
				1432
				1433	static int __cpuinit cpuup_callback(struct notifier_block *nfb,
				1434	unsigned long action, void *hcpu)
				1435	{
				1436	long cpu = (long)hcpu;
				1437	int err = 0;
				1438
				1439	switch (action) {
				1440	case CPU_UP_PREPARE:
				1441	case CPU_UP_PREPARE_FROZEN:
				1442	mutex_lock(&cache_chain_mutex);
				1443	err = cpuup_prepare(cpu);
				1444	mutex_unlock(&cache_chain_mutex);
				1445	break;
				1446	case CPU_ONLINE:
				1447	case CPU_ONLINE_FROZEN:
				1448	start_cpu_timer(cpu);
				1449	break;
				1450	#ifdef CONFIG_HOTPLUG_CPU
				1451	case CPU_DOWN_PREPARE:
				1452	case CPU_DOWN_PREPARE_FROZEN:
				1453	/*
				1454	* Shutdown cache reaper. Note that the cache_chain_mutex is
				1455	* held so that if cache_reap() is invoked it cannot do
				1456	* anything expensive but will only modify reap_work
				1457	* and reschedule the timer.
				1458	*/
				1459	cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
				1460	/* Now the cache_reaper is guaranteed to be not running. */
				1461	per_cpu(slab_reap_work, cpu).work.func = NULL;
				1462	break;
				1463	case CPU_DOWN_FAILED:
				1464	case CPU_DOWN_FAILED_FROZEN:
				1465	start_cpu_timer(cpu);
				1466	break;
				1467	case CPU_DEAD:
				1468	case CPU_DEAD_FROZEN:
				1469	/*
				1470	* Even if all the cpus of a node are down, we don't free the
				1471	* kmem_list3 of any cache. This to avoid a race between
				1472	* cpu_down, and a kmalloc allocation from another cpu for
				1473	* memory from the node of the cpu going down. The list3
				1474	* structure is usually allocated from kmem_cache_create() and
				1475	* gets destroyed at kmem_cache_destroy().
				1476	*/
				1477	/* fall through */
				1478	#endif
				1479	case CPU_UP_CANCELED:
				1480	case CPU_UP_CANCELED_FROZEN:
				1481	mutex_lock(&cache_chain_mutex);
				1482	cpuup_canceled(cpu);
				1483	mutex_unlock(&cache_chain_mutex);
				1484	break;
				1485	}
				1486	return notifier_from_errno(err);
				1487	}
				1488
				/*
				 * Notifier block whose first initializer is cpuup_callback; it is
				 * registered with register_cpu_notifier() in kmem_cache_init_late().
				 */
				1489	static struct notifier_block __cpuinitdata cpucache_notifier = {
				1490	&cpuup_callback, NULL, 0
				1491	};
				1492
				1493	#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
				1494	/*
				1495	* Drains freelist for a node on each slab cache, used for memory hot-remove.
				1496	* Returns -EBUSY if all objects cannot be drained so that the node is not
				1497	* removed.
				1498	*
				1499	* Must hold cache_chain_mutex.
				1500	*/
				1501	static int __meminit drain_cache_nodelists_node(int node)
				1502	{
				1503	struct kmem_cache *cachep;
				1504	int ret = 0;
				1505
				1506	list_for_each_entry(cachep, &cache_chain, next) {
				1507	struct kmem_list3 *l3;
				1508
				1509	l3 = cachep->nodelists[node];
				1510	if (!l3)
				1511	continue;
				1512
				1513	drain_freelist(cachep, l3, l3->free_objects);
				1514
				1515	if (!list_empty(&l3->slabs_full) \|\|
				1516	!list_empty(&l3->slabs_partial)) {
				1517	ret = -EBUSY;
				1518	break;
				1519	}
				1520	}
				1521	return ret;
				1522	}
				1523
				1524	static int __meminit slab_memory_callback(struct notifier_block *self,
				1525	unsigned long action, void *arg)
				1526	{
				1527	struct memory_notify *mnb = arg;
				1528	int ret = 0;
				1529	int nid;
				1530
				1531	nid = mnb->status_change_nid;
				1532	if (nid < 0)
				1533	goto out;
				1534
				1535	switch (action) {
				1536	case MEM_GOING_ONLINE:
				1537	mutex_lock(&cache_chain_mutex);
				1538	ret = init_cache_nodelists_node(nid);
				1539	mutex_unlock(&cache_chain_mutex);
				1540	break;
				1541	case MEM_GOING_OFFLINE:
				1542	mutex_lock(&cache_chain_mutex);
				1543	ret = drain_cache_nodelists_node(nid);
				1544	mutex_unlock(&cache_chain_mutex);
				1545	break;
				1546	case MEM_ONLINE:
				1547	case MEM_OFFLINE:
				1548	case MEM_CANCEL_ONLINE:
				1549	case MEM_CANCEL_OFFLINE:
				1550	break;
				1551	}
				1552	out:
				1553	return notifier_from_errno(ret);
				1554	}
				1555	#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
				1556
				1557	/*
				1558	* swap the static kmem_list3 with kmalloced memory
				1559	*/
				1560	static void __init init_list(struct kmem_cache cachep, struct kmem_list3 list,
				1561	int nodeid)
				1562	{
				1563	struct kmem_list3 *ptr;
				1564
				1565	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
				1566	BUG_ON(!ptr);
				1567
				1568	memcpy(ptr, list, sizeof(struct kmem_list3));
				1569	/*
				1570	* Do not assume that spinlocks can be initialized via memcpy:
				1571	*/
				1572	spin_lock_init(&ptr->list_lock);
				1573
				1574	MAKE_ALL_LISTS(cachep, ptr, nodeid);
				1575	cachep->nodelists[nodeid] = ptr;
				1576	}
				1577
				1578	/*
				1579	* For setting up all the kmem_list3s for cache whose buffer_size is same as
				1580	* size of kmem_list3.
				1581	*/
				1582	static void __init set_up_list3s(struct kmem_cache *cachep, int index)
				1583	{
				1584	int node;
				1585
				1586	for_each_online_node(node) {
				1587	cachep->nodelists[node] = &initkmem_list3[index + node];
				1588	cachep->nodelists[node]->next_reap = jiffies +
				1589	REAPTIMEOUT_LIST3 +
				1590	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
				1591	}
				1592	}
				1593
				1594	/*
				1595	* Initialisation. Called after the page allocator have been initialised and
				1596	* before smp_init().
				1597	*/
				1598	void __init kmem_cache_init(void)
				1599	{
				1600	size_t left_over;
				1601	struct cache_sizes *sizes;
				1602	struct cache_names *names;
				1603	int i;
				1604	int order;
				1605	int node;
				1606
				1607	if (num_possible_nodes() == 1)
				1608	use_alien_caches = 0;
				1609
				1610	local_irq_lock_init(slab_lock);
				1611	for_each_possible_cpu(i)
				1612	INIT_LIST_HEAD(&per_cpu(slab_free_list, i));
				1613
				1614	for (i = 0; i < NUM_INIT_LISTS; i++) {
				1615	kmem_list3_init(&initkmem_list3[i]);
				1616	if (i < MAX_NUMNODES)
				1617	cache_cache.nodelists[i] = NULL;
				1618	}
				1619	set_up_list3s(&cache_cache, CACHE_CACHE);
				1620
				1621	/*
				1622	* Fragmentation resistance on low memory - only use bigger
				1623	* page orders on machines with more than 32MB of memory if
				1624	* not overridden on the command line.
				1625	*/
				1626	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
				1627	slab_max_order = SLAB_MAX_ORDER_HI;
				1628
				1629	/* Bootstrap is tricky, because several objects are allocated
				1630	* from caches that do not exist yet:
				1631	* 1) initialize the cache_cache cache: it contains the struct
				1632	* kmem_cache structures of all caches, except cache_cache itself:
				1633	* cache_cache is statically allocated.
				1634	* Initially an __init data area is used for the head array and the
				1635	* kmem_list3 structures, it's replaced with a kmalloc allocated
				1636	* array at the end of the bootstrap.
				1637	* 2) Create the first kmalloc cache.
				1638	* The struct kmem_cache for the new cache is allocated normally.
				1639	* An __init data area is used for the head array.
				1640	* 3) Create the remaining kmalloc caches, with minimally sized
				1641	* head arrays.
				1642	* 4) Replace the __init data head arrays for cache_cache and the first
				1643	* kmalloc cache with kmalloc allocated arrays.
				1644	* 5) Replace the __init data for kmem_list3 for cache_cache and
				1645	* the other cache's with kmalloc allocated memory.
				1646	* 6) Resize the head arrays of the kmalloc caches to their final sizes.
				1647	*/
				1648
				1649	node = numa_mem_id();
				1650
				1651	/* 1) create the cache_cache */
				1652	INIT_LIST_HEAD(&cache_chain);
				1653	list_add(&cache_cache.next, &cache_chain);
				1654	cache_cache.colour_off = cache_line_size();
				1655	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
				1656	cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
				1657
				1658	/*
				1659	* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
				1660	*/
				1661	cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
				1662	nr_node_ids * sizeof(struct kmem_list3 *);
				1663	#if DEBUG
				1664	cache_cache.obj_size = cache_cache.buffer_size;
				1665	#endif
				1666	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
				1667	cache_line_size());
				1668	cache_cache.reciprocal_buffer_size =
				1669	reciprocal_value(cache_cache.buffer_size);
				1670
				1671	for (order = 0; order < MAX_ORDER; order++) {
				1672	cache_estimate(order, cache_cache.buffer_size,
				1673	cache_line_size(), 0, &left_over, &cache_cache.num);
				1674	if (cache_cache.num)
				1675	break;
				1676	}
				1677	BUG_ON(!cache_cache.num);
				1678	cache_cache.gfporder = order;
				1679	cache_cache.colour = left_over / cache_cache.colour_off;
				1680	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
				1681	sizeof(struct slab), cache_line_size());
				1682
				1683	/* 2+3) create the kmalloc caches */
				1684	sizes = malloc_sizes;
				1685	names = cache_names;
				1686
				1687	/*
				1688	* Initialize the caches that provide memory for the array cache and the
				1689	* kmem_list3 structures first. Without this, further allocations will
				1690	* bug.
				1691	*/
				1692
				1693	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
				1694	sizes[INDEX_AC].cs_size,
				1695	ARCH_KMALLOC_MINALIGN,
				1696	ARCH_KMALLOC_FLAGS\|SLAB_PANIC,
				1697	NULL);
				1698
				1699	if (INDEX_AC != INDEX_L3) {
				1700	sizes[INDEX_L3].cs_cachep =
				1701	kmem_cache_create(names[INDEX_L3].name,
				1702	sizes[INDEX_L3].cs_size,
				1703	ARCH_KMALLOC_MINALIGN,
				1704	ARCH_KMALLOC_FLAGS\|SLAB_PANIC,
				1705	NULL);
				1706	}
				1707
				1708	slab_early_init = 0;
				1709
				1710	while (sizes->cs_size != ULONG_MAX) {
				1711	/*
				1712	* For performance, all the general caches are L1 aligned.
				1713	* This should be particularly beneficial on SMP boxes, as it
				1714	* eliminates "false sharing".
				1715	* Note for systems short on memory removing the alignment will
				1716	* allow tighter packing of the smaller caches.
				1717	*/
				1718	if (!sizes->cs_cachep) {
				1719	sizes->cs_cachep = kmem_cache_create(names->name,
				1720	sizes->cs_size,
				1721	ARCH_KMALLOC_MINALIGN,
				1722	ARCH_KMALLOC_FLAGS\|SLAB_PANIC,
				1723	NULL);
				1724	}
				1725	#ifdef CONFIG_ZONE_DMA
				1726	sizes->cs_dmacachep = kmem_cache_create(
				1727	names->name_dma,
				1728	sizes->cs_size,
				1729	ARCH_KMALLOC_MINALIGN,
				1730	ARCH_KMALLOC_FLAGS\|SLAB_CACHE_DMA\|
				1731	SLAB_PANIC,
				1732	NULL);
				1733	#endif
				1734	sizes++;
				1735	names++;
				1736	}
				1737	/* 4) Replace the bootstrap head arrays */
				1738	{
				1739	struct array_cache *ptr;
				1740
				1741	ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
				1742
				1743	BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
				1744	memcpy(ptr, cpu_cache_get(&cache_cache),
				1745	sizeof(struct arraycache_init));
				1746	/*
				1747	* Do not assume that spinlocks can be initialized via memcpy:
				1748	*/
				1749	spin_lock_init(&ptr->lock);
				1750
				1751	cache_cache.array[smp_processor_id()] = ptr;
				1752
				1753	ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
				1754
				1755	BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
				1756	!= &initarray_generic.cache);
				1757	memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
				1758	sizeof(struct arraycache_init));
				1759	/*
				1760	* Do not assume that spinlocks can be initialized via memcpy:
				1761	*/
				1762	spin_lock_init(&ptr->lock);
				1763
				1764	malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
				1765	ptr;
				1766	}
				1767	/* 5) Replace the bootstrap kmem_list3's */
				1768	{
				1769	int nid;
				1770
				1771	for_each_online_node(nid) {
				1772	init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
				1773
				1774	init_list(malloc_sizes[INDEX_AC].cs_cachep,
				1775	&initkmem_list3[SIZE_AC + nid], nid);
				1776
				1777	if (INDEX_AC != INDEX_L3) {
				1778	init_list(malloc_sizes[INDEX_L3].cs_cachep,
				1779	&initkmem_list3[SIZE_L3 + nid], nid);
				1780	}
				1781	}
				1782	}
				1783
				1784	g_cpucache_up = EARLY;
				1785	}
				1786
				1787	void __init kmem_cache_init_late(void)
				1788	{
				1789	struct kmem_cache *cachep;
				1790
				1791	g_cpucache_up = LATE;
				1792
				1793	/* 6) resize the head arrays to their final sizes */
				1794	mutex_lock(&cache_chain_mutex);
				1795	list_for_each_entry(cachep, &cache_chain, next) {
				1796	if (enable_cpucache(cachep, GFP_NOWAIT))
				1797	BUG();
				1798	init_cachep_lock_keys(cachep);
				1799	}
				1800	mutex_unlock(&cache_chain_mutex);
				1801
				1802	/* Done! */
				1803	g_cpucache_up = FULL;
				1804
				1805	/*
				1806	* Register a cpu startup notifier callback that initializes
				1807	* cpu_cache_get for all new cpus
				1808	*/
				1809	register_cpu_notifier(&cpucache_notifier);
				1810
				1811	#ifdef CONFIG_NUMA
				1812	/*
				1813	* Register a memory hotplug callback that initializes and frees
				1814	* nodelists.
				1815	*/
				1816	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
				1817	#endif
				1818
				1819	/*
				1820	* The reap timers are started later, with a module init call: That part
				1821	* of the kernel is not yet operational.
				1822	*/
				1823	}
				1824
				1825	static int __init cpucache_init(void)
				1826	{
				1827	int cpu;
				1828
				1829	/*
				1830	* Register the timers that return unneeded pages to the page allocator
				1831	*/
				1832	for_each_online_cpu(cpu)
				1833	start_cpu_timer(cpu);
				1834	return 0;
				1835	}
				1836	__initcall(cpucache_init);
				1837
				1838	static noinline void
				1839	slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
				1840	{
				1841	struct kmem_list3 *l3;
				1842	struct slab *slabp;
				1843	unsigned long flags;
				1844	int node;
				1845
				1846	printk(KERN_WARNING
				1847	"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
				1848	nodeid, gfpflags);
				1849	printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
				1850	cachep->name, cachep->buffer_size, cachep->gfporder);
				1851
				1852	for_each_online_node(node) {
				1853	unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
				1854	unsigned long active_slabs = 0, num_slabs = 0;
				1855
				1856	l3 = cachep->nodelists[node];
				1857	if (!l3)
				1858	continue;
				1859
				1860	spin_lock_irqsave(&l3->list_lock, flags);
				1861	list_for_each_entry(slabp, &l3->slabs_full, list) {
				1862	active_objs += cachep->num;
				1863	active_slabs++;
				1864	}
				1865	list_for_each_entry(slabp, &l3->slabs_partial, list) {
				1866	active_objs += slabp->inuse;
				1867	active_slabs++;
				1868	}
				1869	list_for_each_entry(slabp, &l3->slabs_free, list)
				1870	num_slabs++;
				1871
				1872	free_objects += l3->free_objects;
				1873	spin_unlock_irqrestore(&l3->list_lock, flags);
				1874
				1875	num_slabs += active_slabs;
				1876	num_objs = num_slabs * cachep->num;
				1877	printk(KERN_WARNING
				1878	" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
				1879	node, active_slabs, num_slabs, active_objs, num_objs,
				1880	free_objects);
				1881	}
				1882	}
				1883
				1884	/*
				1885	* Interface to system's page allocator. No need to hold the cache-lock.
				1886	*
				1887	* If we requested dmaable memory, we will get it. Even if we
				1888	* did not request dmaable memory, we might get it, but that
				1889	* would be relatively rare and ignorable.
				1890	*/
				1891	static void kmem_getpages(struct kmem_cache cachep, gfp_t flags, int nodeid)
				1892	{
				1893	struct page *page;
				1894	int nr_pages;
				1895	int i;
				1896
				1897	#ifndef CONFIG_MMU
				1898	/*
				1899	* Nommu uses slab's for process anonymous memory allocations, and thus
				1900	* requires __GFP_COMP to properly refcount higher order allocations
				1901	*/
				1902	flags \|= __GFP_COMP;
				1903	#endif
				1904
				1905	flags \|= cachep->gfpflags;
				1906	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
				1907	flags \|= __GFP_RECLAIMABLE;
				1908
				1909	page = alloc_pages_exact_node(nodeid, flags \| __GFP_NOTRACK, cachep->gfporder);
				1910	if (!page) {
				1911	if (!(flags & __GFP_NOWARN) && printk_ratelimit())
				1912	slab_out_of_memory(cachep, flags, nodeid);
				1913	return NULL;
				1914	}
				1915
				1916	nr_pages = (1 << cachep->gfporder);
				1917	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
				1918	add_zone_page_state(page_zone(page),
				1919	NR_SLAB_RECLAIMABLE, nr_pages);
				1920	else
				1921	add_zone_page_state(page_zone(page),
				1922	NR_SLAB_UNRECLAIMABLE, nr_pages);
				1923	for (i = 0; i < nr_pages; i++)
				1924	__SetPageSlab(page + i);
				1925
				1926	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
				1927	kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
				1928
				1929	if (cachep->ctor)
				1930	kmemcheck_mark_uninitialized_pages(page, nr_pages);
				1931	else
				1932	kmemcheck_mark_unallocated_pages(page, nr_pages);
				1933	}
				1934
				1935	return page_address(page);
				1936	}
				1937
				1938	/*
				1939	* Interface to system's page release.
				1940	*/
				1941	static void kmem_freepages(struct kmem_cache cachep, void addr, bool delayed)
				1942	{
				1943	unsigned long i = (1 << cachep->gfporder);
				1944	struct page page, basepage = virt_to_page(addr);
				1945	const unsigned long nr_freed = i;
				1946
				1947	page = basepage;
				1948
				1949	kmemcheck_free_shadow(page, cachep->gfporder);
				1950
				1951	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
				1952	sub_zone_page_state(page_zone(page),
				1953	NR_SLAB_RECLAIMABLE, nr_freed);
				1954	else
				1955	sub_zone_page_state(page_zone(page),
				1956	NR_SLAB_UNRECLAIMABLE, nr_freed);
				1957	while (i--) {
				1958	BUG_ON(!PageSlab(page));
				1959	__ClearPageSlab(page);
				1960	page++;
				1961	}
				1962	if (current->reclaim_state)
				1963	current->reclaim_state->reclaimed_slab += nr_freed;
				1964
				1965	if (!delayed) {
				1966	free_pages((unsigned long)addr, cachep->gfporder);
				1967	} else {
				1968	basepage->index = cachep->gfporder;
				1969	list_add(&basepage->lru, &__get_cpu_var(slab_free_list));
				1970	}
				1971	}
				1972
				1973	static void kmem_rcu_free(struct rcu_head *head)
				1974	{
				1975	struct slab_rcu slab_rcu = (struct slab_rcu )head;
				1976	struct kmem_cache *cachep = slab_rcu->cachep;
				1977
				1978	kmem_freepages(cachep, slab_rcu->addr, false);
				1979	if (OFF_SLAB(cachep))
				1980	kmem_cache_free(cachep->slabp_cache, slab_rcu);
				1981	}
				1982
				1983	#if DEBUG
				1984
				1985	#ifdef CONFIG_DEBUG_PAGEALLOC
				/*
				 * Record debugging information inside the payload of an object.
				 *
				 * Starting at the object's obj_offset(), stores: the marker 0x12345678,
				 * @caller, the current cpu id, then every kernel text address found while
				 * walking the stack upwards from &caller (until kstack_end() or the
				 * object runs out of room), and finally the marker 0x87654321.
				 * Objects smaller than five words are left untouched.
				 *
				 * @cachep: cache the object belongs to
				 * @addr:   start of the object (before obj_offset() is applied)
				 * @caller: address to record as the caller
				 */
				static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
							    unsigned long caller)
				{
					int size = obj_size(cachep);

					addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];

					if (size < 5 * sizeof(unsigned long))
						return;

					*addr++ = 0x12345678;
					*addr++ = caller;
					*addr++ = smp_processor_id();
					size -= 3 * sizeof(unsigned long);
					{
						unsigned long *sptr = &caller;
						unsigned long svalue;

						while (!kstack_end(sptr)) {
							svalue = *sptr++;
							if (kernel_text_address(svalue)) {
								*addr++ = svalue;
								size -= sizeof(unsigned long);
								/* keep one word free for the end marker */
								if (size <= sizeof(unsigned long))
									break;
							}
						}

					}
					*addr++ = 0x87654321;
				}
				2017	#endif
				2018
				2019	static void poison_obj(struct kmem_cache cachep, void addr, unsigned char val)
				2020	{
				2021	int size = obj_size(cachep);
				2022	addr = &((char *)addr)[obj_offset(cachep)];
				2023
				2024	memset(addr, val, size);
				2025	(unsigned char )(addr + size - 1) = POISON_END;
				2026	}
				2027
				2028	static void dump_line(char *data, int offset, int limit)
				2029	{
				2030	int i;
				2031	unsigned char error = 0;
				2032	int bad_count = 0;
				2033
				2034	printk(KERN_ERR "%03x: ", offset);
				2035	for (i = 0; i < limit; i++) {
				2036	if (data[offset + i] != POISON_FREE) {
				2037	error = data[offset + i];
				2038	bad_count++;
				2039	}
				2040	}
				2041	print_hex_dump(KERN_CONT, "", 0, 16, 1,
				2042	&data[offset], limit, 1);
				2043
				2044	if (bad_count == 1) {
				2045	error ^= POISON_FREE;
				2046	if (!(error & (error - 1))) {
				2047	printk(KERN_ERR "Single bit error detected. Probably "
				2048	"bad RAM.\n");
				2049	#ifdef CONFIG_X86
				2050	printk(KERN_ERR "Run memtest86+ or a similar memory "
				2051	"test tool.\n");
				2052	#else
				2053	printk(KERN_ERR "Run a memory test tool.\n");
				2054	#endif
				2055	}
				2056	}
				2057	}
				2058	#endif
				2059
				2060	#if DEBUG
				2061
				2062	static void print_objinfo(struct kmem_cache cachep, void objp, int lines)
				2063	{
				2064	int i, size;
				2065	char *realobj;
				2066
				2067	if (cachep->flags & SLAB_RED_ZONE) {
				2068	printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
				2069	*dbg_redzone1(cachep, objp),
				2070	*dbg_redzone2(cachep, objp));
				2071	}
				2072
				2073	if (cachep->flags & SLAB_STORE_USER) {
				2074	printk(KERN_ERR "Last user: [<%p>]",
				2075	*dbg_userword(cachep, objp));
				2076	print_symbol("(%s)",
				2077	(unsigned long)*dbg_userword(cachep, objp));
				2078	printk("\n");
				2079	}
				2080	realobj = (char *)objp + obj_offset(cachep);
				2081	size = obj_size(cachep);
				2082	for (i = 0; i < size && lines; i += 16, lines--) {
				2083	int limit;
				2084	limit = 16;
				2085	if (i + limit > size)
				2086	limit = size - i;
				2087	dump_line(realobj, i, limit);
				2088	}
				2089	}
				2090
				2091	static void check_poison_obj(struct kmem_cache cachep, void objp)
				2092	{
				2093	char *realobj;
				2094	int size, i;
				2095	int lines = 0;
				2096
				2097	realobj = (char *)objp + obj_offset(cachep);
				2098	size = obj_size(cachep);
				2099
				2100	for (i = 0; i < size; i++) {
				2101	char exp = POISON_FREE;
				2102	if (i == size - 1)
				2103	exp = POISON_END;
				2104	if (realobj[i] != exp) {
				2105	int limit;
				2106	/* Mismatch ! */
				2107	/* Print header */
				2108	if (lines == 0) {
				2109	printk(KERN_ERR
				2110	"Slab corruption (%s): %s start=%p, len=%d\n",
				2111	print_tainted(), cachep->name, realobj, size);
				2112	print_objinfo(cachep, objp, 0);
				2113	}
				2114	/* Hexdump the affected line */
				2115	i = (i / 16) * 16;
				2116	limit = 16;
				2117	if (i + limit > size)
				2118	limit = size - i;
				2119	dump_line(realobj, i, limit);
				2120	i += 16;
				2121	lines++;
				2122	/* Limit to 5 lines */
				2123	if (lines > 5)
				2124	break;
				2125	}
				2126	}
				2127	if (lines != 0) {
				2128	/* Print some data about the neighboring objects, if they
				2129	* exist:
				2130	*/
				2131	struct slab *slabp = virt_to_slab(objp);
				2132	unsigned int objnr;
				2133
				2134	objnr = obj_to_index(cachep, slabp, objp);
				2135	if (objnr) {
				2136	objp = index_to_obj(cachep, slabp, objnr - 1);
				2137	realobj = (char *)objp + obj_offset(cachep);
				2138	printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
				2139	realobj, size);
				2140	print_objinfo(cachep, objp, 2);
				2141	}
				2142	if (objnr + 1 < cachep->num) {
				2143	objp = index_to_obj(cachep, slabp, objnr + 1);
				2144	realobj = (char *)objp + obj_offset(cachep);
				2145	printk(KERN_ERR "Next obj: start=%p, len=%d\n",
				2146	realobj, size);
				2147	print_objinfo(cachep, objp, 2);
				2148	}
				2149	}
				2150	}
				2151	#endif
				2152
				#if DEBUG
				/*
				 * Debug-build sanity check run on every object of a slab that is about to
				 * be destroyed: verifies the poison pattern (SLAB_POISON) and that both
				 * red zones still read RED_INACTIVE (SLAB_RED_ZONE).
				 *
				 * @cachep: cache the slab belongs to
				 * @slabp:  slab being destroyed
				 */
				static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
				{
					int i;
					for (i = 0; i < cachep->num; i++) {
						void *objp = index_to_obj(cachep, slabp, i);

						if (cachep->flags & SLAB_POISON) {
				#ifdef CONFIG_DEBUG_PAGEALLOC
							/*
							 * Page-multiple, off-slab objects get kernel_map_pages()
							 * called with enable=1 instead of a poison check.
							 */
							if (cachep->buffer_size % PAGE_SIZE == 0 &&
									OFF_SLAB(cachep))
								kernel_map_pages(virt_to_page(objp),
									cachep->buffer_size / PAGE_SIZE, 1);
							else
								check_poison_obj(cachep, objp);
				#else
							check_poison_obj(cachep, objp);
				#endif
						}
						if (cachep->flags & SLAB_RED_ZONE) {
							if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
								slab_error(cachep, "start of a freed object "
									   "was overwritten");
							if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
								slab_error(cachep, "end of a freed object "
									   "was overwritten");
						}
					}
				}
				#else
				/* Non-debug builds: nothing to check. */
				static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
				{
				}
				#endif
				2187
				2188	/**
				2189	* slab_destroy - destroy and release all objects in a slab
				2190	* @cachep: cache pointer being destroyed
				2191	* @slabp: slab pointer being destroyed
				2192	*
				2193	* Destroy all the objs in a slab, and release the mem back to the system.
				2194	* Before calling the slab must have been unlinked from the cache. The
				2195	* cache-lock is not held/needed.
				2196	*/
				2197	static void slab_destroy(struct kmem_cache cachep, struct slab slabp,
				2198	bool delayed)
				2199	{
				2200	void *addr = slabp->s_mem - slabp->colouroff;
				2201
				2202	slab_destroy_debugcheck(cachep, slabp);
				2203	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
				2204	struct slab_rcu *slab_rcu;
				2205
				2206	slab_rcu = (struct slab_rcu *)slabp;
				2207	slab_rcu->cachep = cachep;
				2208	slab_rcu->addr = addr;
				2209	call_rcu(&slab_rcu->head, kmem_rcu_free);
				2210	} else {
				2211	kmem_freepages(cachep, addr, delayed);
				2212	if (OFF_SLAB(cachep))
				2213	kmem_cache_free(cachep->slabp_cache, slabp);
				2214	}
				2215	}
				2216
				2217	static void __kmem_cache_destroy(struct kmem_cache *cachep)
				2218	{
				2219	int i;
				2220	struct kmem_list3 *l3;
				2221
				2222	for_each_online_cpu(i)
				2223	kfree(cachep->array[i]);
				2224
				2225	/* NUMA: free the list3 structures */
				2226	for_each_online_node(i) {
				2227	l3 = cachep->nodelists[i];
				2228	if (l3) {
				2229	kfree(l3->shared);
				2230	free_alien_cache(l3->alien);
				2231	kfree(l3);
				2232	}
				2233	}
				2234	kmem_cache_free(&cache_cache, cachep);
				2235	}
				2236
				2237
				2238	/**
				2239	* calculate_slab_order - calculate size (page order) of slabs
				2240	* @cachep: pointer to the cache that is being created
				2241	* @size: size of objects to be created in this cache.
				2242	* @align: required alignment for the objects.
				2243	* @flags: slab allocation flags
				2244	*
				2245	* Also calculates the number of objects per slab.
				2246	*
				2247	* This could be made much more intelligent. For now, try to avoid using
				2248	* high order pages for slabs. When the gfp() functions are more friendly
				2249	* towards high-order requests, this should be changed.
				2250	*/
				2251	static size_t calculate_slab_order(struct kmem_cache *cachep,
				2252	size_t size, size_t align, unsigned long flags)
				2253	{
				2254	unsigned long offslab_limit;
				2255	size_t left_over = 0;
				2256	int gfporder;
				2257
				2258	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
				2259	unsigned int num;
				2260	size_t remainder;
				2261
				2262	cache_estimate(gfporder, size, align, flags, &remainder, &num);
				2263	if (!num)
				2264	continue;
				2265
				2266	if (flags & CFLGS_OFF_SLAB) {
				2267	/*
				2268	* Max number of objs-per-slab for caches which
				2269	* use off-slab slabs. Needed to avoid a possible
				2270	* looping condition in cache_grow().
				2271	*/
				2272	offslab_limit = size - sizeof(struct slab);
				2273	offslab_limit /= sizeof(kmem_bufctl_t);
				2274
				2275	if (num > offslab_limit)
				2276	break;
				2277	}
				2278
				2279	/* Found something acceptable - save it away */
				2280	cachep->num = num;
				2281	cachep->gfporder = gfporder;
				2282	left_over = remainder;
				2283
				2284	/*
				2285	* A VFS-reclaimable slab tends to have most allocations
				2286	* as GFP_NOFS and we really don't want to have to be allocating
				2287	* higher-order pages when we are unable to shrink dcache.
				2288	*/
				2289	if (flags & SLAB_RECLAIM_ACCOUNT)
				2290	break;
				2291
				2292	/*
				2293	* Large number of objects is good, but very large slabs are
				2294	* currently bad for the gfp()s.
				2295	*/
				2296	if (gfporder >= slab_max_order)
				2297	break;
				2298
				2299	/*
				2300	* Acceptable internal fragmentation?
				2301	*/
				2302	if (left_over * 8 <= (PAGE_SIZE << gfporder))
				2303	break;
				2304	}
				2305	return left_over;
				2306	}
				2307
				2308	static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
				2309	{
				2310	if (g_cpucache_up == FULL)
				2311	return enable_cpucache(cachep, gfp);
				2312
				2313	if (g_cpucache_up == NONE) {
				2314	/*
				2315	* Note: the first kmem_cache_create must create the cache
				2316	* that's used by kmalloc(24), otherwise the creation of
				2317	* further caches will BUG().
				2318	*/
				2319	cachep->array[smp_processor_id()] = &initarray_generic.cache;
				2320
				2321	/*
				2322	* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
				2323	* the first cache, then we need to set up all its list3s,
				2324	* otherwise the creation of further caches will BUG().
				2325	*/
				2326	set_up_list3s(cachep, SIZE_AC);
				2327	if (INDEX_AC == INDEX_L3)
				2328	g_cpucache_up = PARTIAL_L3;
				2329	else
				2330	g_cpucache_up = PARTIAL_AC;
				2331	} else {
				2332	cachep->array[smp_processor_id()] =
				2333	kmalloc(sizeof(struct arraycache_init), gfp);
				2334
				2335	if (g_cpucache_up == PARTIAL_AC) {
				2336	set_up_list3s(cachep, SIZE_L3);
				2337	g_cpucache_up = PARTIAL_L3;
				2338	} else {
				2339	int node;
				2340	for_each_online_node(node) {
				2341	cachep->nodelists[node] =
				2342	kmalloc_node(sizeof(struct kmem_list3),
				2343	gfp, node);
				2344	BUG_ON(!cachep->nodelists[node]);
				2345	kmem_list3_init(cachep->nodelists[node]);
				2346	}
				2347	}
				2348	}
				2349	cachep->nodelists[numa_mem_id()]->next_reap =
				2350	jiffies + REAPTIMEOUT_LIST3 +
				2351	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
				2352
				2353	cpu_cache_get(cachep)->avail = 0;
				2354	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
				2355	cpu_cache_get(cachep)->batchcount = 1;
				2356	cpu_cache_get(cachep)->touched = 0;
				2357	cachep->batchcount = 1;
				2358	cachep->limit = BOOT_CPUCACHE_ENTRIES;
				2359	return 0;
				2360	}
				2361
				2362	/**
				2363	* kmem_cache_create - Create a cache.
				2364	* @name: A string which is used in /proc/slabinfo to identify this cache.
				2365	* @size: The size of objects to be created in this cache.
				2366	* @align: The required alignment for the objects.
				2367	* @flags: SLAB flags
				2368	* @ctor: A constructor for the objects.
				2369	*
				2370	* Returns a ptr to the cache on success, NULL on failure.
				2371	* Cannot be called within a int, but can be interrupted.
				2372	* The @ctor is run when new pages are allocated by the cache.
				2373	*
				2374	* @name must be valid until the cache is destroyed. This implies that
				2375	* the module calling this has to destroy the cache before getting unloaded.
				2376	*
				2377	* The flags are
				2378	*
				2379	* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
				2380	* to catch references to uninitialised memory.
				2381	*
				2382	* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
				2383	* for buffer overruns.
				2384	*
				2385	* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
				2386	* cacheline. This can be beneficial if you're counting cycles as closely
				2387	* as davem.
				2388	*/
				2389	struct kmem_cache *
				2390	kmem_cache_create (const char *name, size_t size, size_t align,
				2391	unsigned long flags, void (ctor)(void ))
				2392	{
				2393	size_t left_over, slab_size, ralign;
				2394	struct kmem_cache cachep = NULL, pc;
				2395	gfp_t gfp;
				2396
				2397	/*
				2398	* Sanity checks... these are all serious usage bugs.
				2399	*/
				2400	if (!name \|\| in_interrupt() \|\| (size < BYTES_PER_WORD) \|\|
				2401	size > KMALLOC_MAX_SIZE) {
				2402	printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
				2403	name);
				2404	BUG();
				2405	}
				2406
				2407	/*
				2408	* We use cache_chain_mutex to ensure a consistent view of
				2409	* cpu_online_mask as well. Please see cpuup_callback
				2410	*/
				2411	if (slab_is_available()) {
				2412	get_online_cpus();
				2413	mutex_lock(&cache_chain_mutex);
				2414	}
				2415
				2416	list_for_each_entry(pc, &cache_chain, next) {
				2417	char tmp;
				2418	int res;
				2419
				2420	/*
				2421	* This happens when the module gets unloaded and doesn't
				2422	* destroy its slab cache and no-one else reuses the vmalloc
				2423	* area of the module. Print a warning.
				2424	*/
				2425	res = probe_kernel_address(pc->name, tmp);
				2426	if (res) {
				2427	printk(KERN_ERR
				2428	"SLAB: cache with size %d has lost its name\n",
				2429	pc->buffer_size);
				2430	continue;
				2431	}
				2432
				2433	if (!strcmp(pc->name, name)) {
				2434	printk(KERN_ERR
				2435	"kmem_cache_create: duplicate cache %s\n", name);
				2436	dump_stack();
				2437	goto oops;
				2438	}
				2439	}
				2440
				2441	#if DEBUG
				2442	WARN_ON(strchr(name, ' ')); /* It confuses parsers */
				2443	#if FORCED_DEBUG
				2444	/*
				2445	* Enable redzoning and last user accounting, except for caches with
				2446	* large objects, if the increased size would increase the object size
				2447	* above the next power of two: caches with object sizes just above a
				2448	* power of two have a significant amount of internal fragmentation.
				2449	*/
				2450	if (size < 4096 \|\| fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
				2451	2 * sizeof(unsigned long long)))
				2452	flags \|= SLAB_RED_ZONE \| SLAB_STORE_USER;
				2453	if (!(flags & SLAB_DESTROY_BY_RCU))
				2454	flags \|= SLAB_POISON;
				2455	#endif
				2456	if (flags & SLAB_DESTROY_BY_RCU)
				2457	BUG_ON(flags & SLAB_POISON);
				2458	#endif
				2459	/*
				2460	* Always checks flags, a caller might be expecting debug support which
				2461	* isn't available.
				2462	*/
				2463	BUG_ON(flags & ~CREATE_MASK);
				2464
				2465	/*
				2466	* Check that size is in terms of words. This is needed to avoid
				2467	* unaligned accesses for some archs when redzoning is used, and makes
				2468	* sure any on-slab bufctl's are also correctly aligned.
				2469	*/
				2470	if (size & (BYTES_PER_WORD - 1)) {
				2471	size += (BYTES_PER_WORD - 1);
				2472	size &= ~(BYTES_PER_WORD - 1);
				2473	}
				2474
				2475	/* calculate the final buffer alignment: */
				2476
				2477	/* 1) arch recommendation: can be overridden for debug */
				2478	if (flags & SLAB_HWCACHE_ALIGN) {
				2479	/*
				2480	* Default alignment: as specified by the arch code. Except if
				2481	* an object is really small, then squeeze multiple objects into
				2482	* one cacheline.
				2483	*/
				2484	ralign = cache_line_size();
				2485	while (size <= ralign / 2)
				2486	ralign /= 2;
				2487	} else {
				2488	ralign = BYTES_PER_WORD;
				2489	}
				2490
				2491	/*
				2492	* Redzoning and user store require word alignment or possibly larger.
				2493	* Note this will be overridden by architecture or caller mandated
				2494	* alignment if either is greater than BYTES_PER_WORD.
				2495	*/
				2496	if (flags & SLAB_STORE_USER)
				2497	ralign = BYTES_PER_WORD;
				2498
				2499	if (flags & SLAB_RED_ZONE) {
				2500	ralign = REDZONE_ALIGN;
				2501	/* If redzoning, ensure that the second redzone is suitably
				2502	* aligned, by adjusting the object size accordingly. */
				2503	size += REDZONE_ALIGN - 1;
				2504	size &= ~(REDZONE_ALIGN - 1);
				2505	}
				2506
				2507	/* 2) arch mandated alignment */
				2508	if (ralign < ARCH_SLAB_MINALIGN) {
				2509	ralign = ARCH_SLAB_MINALIGN;
				2510	}
				2511	/* 3) caller mandated alignment */
				2512	if (ralign < align) {
				2513	ralign = align;
				2514	}
				2515	/* disable debug if necessary */
				2516	if (ralign > __alignof__(unsigned long long))
				2517	flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				2518	/*
				2519	* 4) Store it.
				2520	*/
				2521	align = ralign;
				2522
				2523	if (slab_is_available())
				2524	gfp = GFP_KERNEL;
				2525	else
				2526	gfp = GFP_NOWAIT;
				2527
				2528	/* Get cache's description obj. */
				2529	cachep = kmem_cache_zalloc(&cache_cache, gfp);
				2530	if (!cachep)
				2531	goto oops;
				2532
				2533	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
				2534	#if DEBUG
				2535	cachep->obj_size = size;
				2536
				2537	/*
				2538	* Both debugging options require word-alignment which is calculated
				2539	* into align above.
				2540	*/
				2541	if (flags & SLAB_RED_ZONE) {
				2542	/* add space for red zone words */
				2543	cachep->obj_offset += sizeof(unsigned long long);
				2544	size += 2 * sizeof(unsigned long long);
				2545	}
				2546	if (flags & SLAB_STORE_USER) {
				2547	/* user store requires one word storage behind the end of
				2548	* the real object. But if the second red zone needs to be
				2549	* aligned to 64 bits, we must allow that much space.
				2550	*/
				2551	if (flags & SLAB_RED_ZONE)
				2552	size += REDZONE_ALIGN;
				2553	else
				2554	size += BYTES_PER_WORD;
				2555	}
				2556	#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
				2557	if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
				2558	&& cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
				2559	cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
				2560	size = PAGE_SIZE;
				2561	}
				2562	#endif
				2563	#endif
				2564
				2565	/*
				2566	* Determine if the slab management is 'on' or 'off' slab.
				2567	* (bootstrapping cannot cope with offslab caches so don't do
				2568	* it too early on. Always use on-slab management when
				2569	* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
				2570	*/
				2571	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
				2572	!(flags & SLAB_NOLEAKTRACE))
				2573	/*
				2574	* Size is large, assume best to place the slab management obj
				2575	* off-slab (should allow better packing of objs).
				2576	*/
				2577	flags \|= CFLGS_OFF_SLAB;
				2578
				2579	size = ALIGN(size, align);
				2580
				2581	left_over = calculate_slab_order(cachep, size, align, flags);
				2582
				2583	if (!cachep->num) {
				2584	printk(KERN_ERR
				2585	"kmem_cache_create: couldn't create cache %s.\n", name);
				2586	kmem_cache_free(&cache_cache, cachep);
				2587	cachep = NULL;
				2588	goto oops;
				2589	}
				2590	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
				2591	+ sizeof(struct slab), align);
				2592
				2593	/*
				2594	* If the slab has been placed off-slab, and we have enough space then
				2595	* move it on-slab. This is at the expense of any extra colouring.
				2596	*/
				2597	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
				2598	flags &= ~CFLGS_OFF_SLAB;
				2599	left_over -= slab_size;
				2600	}
				2601
				2602	if (flags & CFLGS_OFF_SLAB) {
				2603	/* really off slab. No need for manual alignment */
				2604	slab_size =
				2605	cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
				2606
				2607	#ifdef CONFIG_PAGE_POISONING
				2608	/* If we're going to use the generic kernel_map_pages()
				2609	* poisoning, then it's going to smash the contents of
				2610	* the redzone and userword anyhow, so switch them off.
				2611	*/
				2612	if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
				2613	flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
				2614	#endif
				2615	}
				2616
				2617	cachep->colour_off = cache_line_size();
				2618	/* Offset must be a multiple of the alignment. */
				2619	if (cachep->colour_off < align)
				2620	cachep->colour_off = align;
				2621	cachep->colour = left_over / cachep->colour_off;
				2622	cachep->slab_size = slab_size;
				2623	cachep->flags = flags;
				2624	cachep->gfpflags = 0;
				2625	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
				2626	cachep->gfpflags \|= GFP_DMA;
				2627	cachep->buffer_size = size;
				2628	cachep->reciprocal_buffer_size = reciprocal_value(size);
				2629
				2630	if (flags & CFLGS_OFF_SLAB) {
				2631	cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
				2632	/*
				2633	* This is a possibility for one of the malloc_sizes caches.
				2634	* But since we go off slab only for object size greater than
				2635	* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
				2636	* this should not happen at all.
				2637	* But leave a BUG_ON for some lucky dude.
				2638	*/
				2639	BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
				2640	}
				2641	cachep->ctor = ctor;
				2642	cachep->name = name;
				2643
				2644	if (setup_cpu_cache(cachep, gfp)) {
				2645	__kmem_cache_destroy(cachep);
				2646	cachep = NULL;
				2647	goto oops;
				2648	}
				2649
				2650	if (flags & SLAB_DEBUG_OBJECTS) {
				2651	/*
				2652	* Would deadlock through slab_destroy()->call_rcu()->
				2653	* debug_object_activate()->kmem_cache_alloc().
				2654	*/
				2655	WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
				2656
				2657	slab_set_debugobj_lock_classes(cachep);
				2658	}
				2659
				2660	init_cachep_lock_keys(cachep);
				2661
				2662	/* cache setup completed, link it into the list */
				2663	list_add(&cachep->next, &cache_chain);
				2664	oops:
				2665	if (!cachep && (flags & SLAB_PANIC))
				2666	panic("kmem_cache_create(): failed to create slab `%s'\n",
				2667	name);
				2668	if (slab_is_available()) {
				2669	mutex_unlock(&cache_chain_mutex);
				2670	put_online_cpus();
				2671	}
				2672	return cachep;
				2673	}
				2674	EXPORT_SYMBOL(kmem_cache_create);
				2675
				#if DEBUG
				/* Debug assertion: interrupts must be disabled here. */
				static void check_irq_off(void)
				{
					BUG_ON_NONRT(!irqs_disabled());
				}

				/* Debug assertion: interrupts must be enabled here. */
				static void check_irq_on(void)
				{
					BUG_ON(irqs_disabled());
				}

				/*
				 * Debug assertion (SMP only): irqs are off and the list_lock of the
				 * numa_mem_id() node of @cachep is held.
				 */
				static void check_spinlock_acquired(struct kmem_cache *cachep)
				{
				#ifdef CONFIG_SMP
					struct kmem_list3 *l3;

					check_irq_off();
					l3 = cachep->nodelists[numa_mem_id()];
					assert_spin_locked(&l3->list_lock);
				#endif
				}

				/*
				 * Debug assertion (SMP only): irqs are off and the list_lock of node
				 * @node of @cachep is held.
				 */
				static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
				{
				#ifdef CONFIG_SMP
					struct kmem_list3 *l3;

					check_irq_off();
					l3 = cachep->nodelists[node];
					assert_spin_locked(&l3->list_lock);
				#endif
				}

				#else
				/* Non-debug builds: the checks compile away. */
				#define check_irq_off()	do { } while(0)
				#define check_irq_on()	do { } while(0)
				#define check_spinlock_acquired(x) do { } while(0)
				#define check_spinlock_acquired_node(x, y) do { } while(0)
				#endif
				2709
				/* Forward declaration; drain_cpu_caches() calls this before it is defined. */
				static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
							struct array_cache *ac,
							int force, int node);
				2713
				2714	static void __do_drain(void *arg, unsigned int cpu)
				2715	{
				2716	struct kmem_cache *cachep = arg;
				2717	struct array_cache *ac;
				2718	int node = cpu_to_mem(cpu);
				2719
				2720	ac = cpu_cache_get_on_cpu(cachep, cpu);
				2721	spin_lock(&cachep->nodelists[node]->list_lock);
				2722	free_block(cachep, ac->entry, ac->avail, node);
				2723	spin_unlock(&cachep->nodelists[node]->list_lock);
				2724	ac->avail = 0;
				2725	}
				2726
				#ifndef CONFIG_PREEMPT_RT_BASE
				/* Drain the array cache of the cpu this runs on. */
				static void do_drain(void *arg)
				{
					unsigned int this_cpu = smp_processor_id();

					__do_drain(arg, this_cpu);
				}
				#else
				/*
				 * Drain @cpu's array cache under that cpu's slab lock, take over its
				 * list of delayed-free pages while still locked, and free those pages
				 * after the lock has been dropped.
				 */
				static void do_drain(void *arg, int cpu)
				{
					LIST_HEAD(pending);

					lock_slab_on(cpu);
					__do_drain(arg, cpu);
					list_splice_init(&per_cpu(slab_free_list, cpu), &pending);
					unlock_slab_on(cpu);

					free_delayed(&pending);
				}
				#endif
				2744
				2745	static void drain_cpu_caches(struct kmem_cache *cachep)
				2746	{
				2747	struct kmem_list3 *l3;
				2748	int node;
				2749
				2750	slab_on_each_cpu(do_drain, cachep);
				2751	check_irq_on();
				2752	for_each_online_node(node) {
				2753	l3 = cachep->nodelists[node];
				2754	if (l3 && l3->alien)
				2755	drain_alien_cache(cachep, l3->alien);
				2756	}
				2757
				2758	for_each_online_node(node) {
				2759	l3 = cachep->nodelists[node];
				2760	if (l3)
				2761	drain_array(cachep, l3, l3->shared, 1, node);
				2762	}
				2763	}
				2764
				2765	/*
				2766	* Remove slabs from the list of free slabs.
				2767	* Specify the number of slabs to drain in tofree.
				2768	*
				2769	* Returns the actual number of slabs released.
				2770	*/
				2771	static int drain_freelist(struct kmem_cache *cache,
				2772	struct kmem_list3 *l3, int tofree)
				2773	{
				2774	struct list_head *p;
				2775	int nr_freed;
				2776	struct slab *slabp;
				2777
				2778	nr_freed = 0;
				2779	while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
				2780
				2781	local_spin_lock_irq(slab_lock, &l3->list_lock);
				2782	p = l3->slabs_free.prev;
				2783	if (p == &l3->slabs_free) {
				2784	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				2785	goto out;
				2786	}
				2787
				2788	slabp = list_entry(p, struct slab, list);
				2789	#if DEBUG
				2790	BUG_ON(slabp->inuse);
				2791	#endif
				2792	list_del(&slabp->list);
				2793	/*
				2794	* Safe to drop the lock. The slab is no longer linked
				2795	* to the cache.
				2796	*/
				2797	l3->free_objects -= cache->num;
				2798	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				2799	slab_destroy(cache, slabp, false);
				2800	nr_freed++;
				2801	}
				2802	out:
				2803	return nr_freed;
				2804	}
				2805
				2806	/* Called with cache_chain_mutex held to protect against cpu hotplug */
				2807	static int __cache_shrink(struct kmem_cache *cachep)
				2808	{
					/*
					 * Drain the per-cpu caches, then release free slabs on every
					 * online node. Returns 1 if any node still has full or partial
					 * slabs afterwards, 0 otherwise.
					 */
				2809	int ret = 0, i = 0;
				2810	struct kmem_list3 *l3;
				2811
				2812	drain_cpu_caches(cachep);
				2813
				2814	check_irq_on();
				2815	for_each_online_node(i) {
				2816	l3 = cachep->nodelists[i];
				2817	if (!l3)
				2818	continue;
				2819
					/*
					 * free_objects counts objects, not slabs, so it serves as an
					 * upper bound on the number of slabs to release.
					 */
				2820	drain_freelist(cachep, l3, l3->free_objects);
				2821
				2822	ret += !list_empty(&l3->slabs_full) ||
				2823	!list_empty(&l3->slabs_partial);
				2824	}
				2825	return (ret ? 1 : 0);
				2826	}
				2827
				2828	/**
				2829	* kmem_cache_shrink - Shrink a cache.
				2830	* @cachep: The cache to shrink.
				2831	*
				2832	* Releases as many slabs as possible for a cache.
				2833	* To help debugging, a zero exit status indicates all slabs were released.
				2834	*/
				2835	int kmem_cache_shrink(struct kmem_cache *cachep)
				2836	{
				2837	int ret;
					/* Must be given a cache and must not be called from interrupt context. */
				2838	BUG_ON(!cachep || in_interrupt());
				2839
					/*
					 * __cache_shrink() must be called with cache_chain_mutex held to
					 * protect against cpu hotplug.
					 */
				2840	get_online_cpus();
				2841	mutex_lock(&cache_chain_mutex);
				2842	ret = __cache_shrink(cachep);
				2843	mutex_unlock(&cache_chain_mutex);
				2844	put_online_cpus();
				2845	return ret;
				2846	}
				2847	EXPORT_SYMBOL(kmem_cache_shrink);
				2848
				2849	/**
				2850	* kmem_cache_destroy - delete a cache
				2851	* @cachep: the cache to destroy
				2852	*
				2853	* Remove a &struct kmem_cache object from the slab cache.
				2854	*
				2855	* It is expected this function will be called by a module when it is
				2856	* unloaded. This will remove the cache completely, and avoid a duplicate
				2857	* cache being allocated each time a module is loaded and unloaded, if the
				2858	* module doesn't have persistent in-kernel storage across loads and unloads.
				2859	*
				2860	* The cache must be empty before calling this function.
				2861	*
				2862	* The caller must guarantee that no one will allocate memory from the cache
				2863	* during the kmem_cache_destroy().
				2864	*/
				2865	void kmem_cache_destroy(struct kmem_cache *cachep)
				2866	{
					/* Must be given a cache and must not be called from interrupt context. */
				2867	BUG_ON(!cachep || in_interrupt());
				2868
				2869	/* Find the cache in the chain of caches. */
				2870	get_online_cpus();
				2871	mutex_lock(&cache_chain_mutex);
				2872	/*
				2873	* the chain is never empty, cache_cache is never destroyed
				2874	*/
				2875	list_del(&cachep->next);
				2876	if (__cache_shrink(cachep)) {
					/*
					 * Slabs are still in use: report it, put the cache back on
					 * the chain and leave it intact.
					 */
				2877	slab_error(cachep, "Can't free all objects");
				2878	list_add(&cachep->next, &cache_chain);
				2879	mutex_unlock(&cache_chain_mutex);
				2880	put_online_cpus();
				2881	return;
				2882	}
				2883
					/* For SLAB_DESTROY_BY_RCU caches, wait for pending RCU callbacks first. */
				2884	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
				2885	rcu_barrier();
				2886
				2887	__kmem_cache_destroy(cachep);
				2888	mutex_unlock(&cache_chain_mutex);
				2889	put_online_cpus();
				2890	}
				2891	EXPORT_SYMBOL(kmem_cache_destroy);
				2892
				2893	/*
				2894	* Get the memory for a slab management obj.
				2895	* For a slab cache when the slab descriptor is off-slab, slab descriptors
				2896	* always come from malloc_sizes caches. The slab descriptor cannot
				2897	* come from the same cache which is getting created because,
				2898	* when we are searching for an appropriate cache for these
				2899	* descriptors in kmem_cache_create, we search through the malloc_sizes array.
				2900	* If we are creating a malloc_sizes cache here it would not be visible to
				2901	* kmem_find_general_cachep till the initialization is complete.
				2902	* Hence we cannot have slabp_cache same as the original cache.
				2903	*/
				2904	static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
				2905	int colour_off, gfp_t local_flags,
				2906	int nodeid)
				2907	{
					/*
					 * Returns the initialised slab descriptor for the slab whose
					 * memory starts at objp, or NULL when an off-slab descriptor
					 * could not be allocated.
					 */
				2908	struct slab *slabp;
				2909
				2910	if (OFF_SLAB(cachep)) {
				2911	/* Slab management obj is off-slab. */
				2912	slabp = kmem_cache_alloc_node(cachep->slabp_cache,
				2913	local_flags, nodeid);
				2914	/*
				2915	* If the first object in the slab is leaked (it's allocated
				2916	* but no one has a reference to it), we want to make sure
				2917	* kmemleak does not treat the ->s_mem pointer as a reference
				2918	* to the object. Otherwise we will not report the leak.
				2919	*/
					/*
					 * NOTE(review): &slabp->list is computed before the NULL check
					 * below - confirm kmemleak_scan_area() tolerates that.
					 */
				2920	kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
				2921	local_flags);
				2922	if (!slabp)
				2923	return NULL;
				2924	} else {
					/*
					 * On-slab: the descriptor sits at the colour offset inside the
					 * slab, and the objects start slab_size bytes after it.
					 */
				2925	slabp = objp + colour_off;
				2926	colour_off += cachep->slab_size;
				2927	}
				2928	slabp->inuse = 0;
				2929	slabp->colouroff = colour_off;
				2930	slabp->s_mem = objp + colour_off;
				2931	slabp->nodeid = nodeid;
				2932	slabp->free = 0;
				2933	return slabp;
				2934	}
				2935
				2936	static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
				2937	{
					/* The bufctl array starts immediately after the slab descriptor. */
				2938	return (kmem_bufctl_t *) (slabp + 1);
				2939	}
				2940
				2941	static void cache_init_objs(struct kmem_cache *cachep,
				2942	struct slab *slabp)
				2943	{
					/*
					 * Initialise every object of a slab (debug markers and
					 * constructor) and chain all of them into the slab's bufctl
					 * free list.
					 */
				2944	int i;
				2945
				2946	for (i = 0; i < cachep->num; i++) {
				2947	void *objp = index_to_obj(cachep, slabp, i);
				2948	#if DEBUG
				2949	/* need to poison the objs? */
				2950	if (cachep->flags & SLAB_POISON)
				2951	poison_obj(cachep, objp, POISON_FREE);
				2952	if (cachep->flags & SLAB_STORE_USER)
				2953	*dbg_userword(cachep, objp) = NULL;
				2954
				2955	if (cachep->flags & SLAB_RED_ZONE) {
				2956	*dbg_redzone1(cachep, objp) = RED_INACTIVE;
				2957	*dbg_redzone2(cachep, objp) = RED_INACTIVE;
				2958	}
				2959	/*
				2960	* Constructors are not allowed to allocate memory from the same
				2961	* cache which they are a constructor for. Otherwise, deadlock.
				2962	* They must also be threaded.
				2963	*/
					/* With SLAB_POISON set the constructor is not run here. */
				2964	if (cachep->ctor && !(cachep->flags & SLAB_POISON))
				2965	cachep->ctor(objp + obj_offset(cachep));
				2966
					/* Verify that the constructor left both red zones untouched. */
				2967	if (cachep->flags & SLAB_RED_ZONE) {
				2968	if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				2969	slab_error(cachep, "constructor overwrote the"
				2970	" end of an object");
				2971	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				2972	slab_error(cachep, "constructor overwrote the"
				2973	" start of an object");
				2974	}
					/*
					 * Page-multiple, off-slab, poisoned objects: call
					 * kernel_map_pages() with enable == 0 on the object's pages.
					 */
				2975	if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
				2976	OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
				2977	kernel_map_pages(virt_to_page(objp),
				2978	cachep->buffer_size / PAGE_SIZE, 0);
				2979	#else
				2980	if (cachep->ctor)
				2981	cachep->ctor(objp);
				2982	#endif
					/* Each free-list entry points at the next object index. */
				2983	slab_bufctl(slabp)[i] = i + 1;
				2984	}
					/* Terminate the chain at the last object. */
				2985	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
				2986	}
				2987
				2988	static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
				2989	{
					/*
					 * When CONFIG_ZONE_DMA_FLAG is non-zero, BUG unless the GFP_DMA
					 * bit of the request matches the GFP_DMA bit of the cache.
					 */
				2990	if (CONFIG_ZONE_DMA_FLAG) {
				2991	if (flags & GFP_DMA)
				2992	BUG_ON(!(cachep->gfpflags & GFP_DMA));
				2993	else
				2994	BUG_ON(cachep->gfpflags & GFP_DMA);
				2995	}
				2996	}
				2997
				2998	static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
				2999	int nodeid)
				3000	{
					/*
					 * Take the object at the head of the slab's free list
					 * (index slabp->free) and return it.
					 */
				3001	void *objp = index_to_obj(cachep, slabp, slabp->free);
				3002	kmem_bufctl_t next;
				3003
				3004	slabp->inuse++;
				3005	next = slab_bufctl(slabp)[slabp->free];
				3006	#if DEBUG
					/* Mark the bufctl slot and check the slab is on the expected node. */
				3007	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
				3008	WARN_ON(slabp->nodeid != nodeid);
				3009	#endif
					/* Advance the free-list head to the next free object. */
				3010	slabp->free = next;
				3011
				3012	return objp;
				3013	}
				3014
				3015	static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
				3016	void *objp, int nodeid)
				3017	{
					/* Return objp to its slab by pushing its index onto the free list. */
				3018	unsigned int objnr = obj_to_index(cachep, slabp, objp);
				3019
				3020	#if DEBUG
				3021	/* Verify that the slab belongs to the intended node */
				3022	WARN_ON(slabp->nodeid != nodeid);
				3023
					/*
					 * A bufctl value in this range means the object is already
					 * linked into the free list.
					 */
				3024	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
				3025	printk(KERN_ERR "slab: double free detected in cache "
				3026	"'%s', objp %p\n", cachep->name, objp);
				3027	BUG();
				3028	}
				3029	#endif
					/* The freed object becomes the new free-list head. */
				3030	slab_bufctl(slabp)[objnr] = slabp->free;
				3031	slabp->free = objnr;
				3032	slabp->inuse--;
				3033	}
				3034
				3035	/*
				3036	* Map pages beginning at addr to the given cache and slab. This is required
				3037	* for the slab allocator to be able to lookup the cache and slab of a
				3038	* virtual address for kfree, ksize, and slab debugging.
				3039	*/
				3040	static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
				3041	void *addr)
				3042	{
				3043	int nr_pages;
				3044	struct page *page;
				3045
				3046	page = virt_to_page(addr);
				3047
					/*
					 * A compound page is tagged once; otherwise every one of the
					 * 2^gfporder pages is tagged.
					 */
				3048	nr_pages = 1;
				3049	if (likely(!PageCompound(page)))
				3050	nr_pages <<= cache->gfporder;
				3051
				3052	do {
				3053	page_set_cache(page, cache);
				3054	page_set_slab(page, slab);
				3055	page++;
				3056	} while (--nr_pages);
				3057	}
				3058
				3059	/*
				3060	* Grow (by 1) the number of slabs within a cache. This is called by
				3061	* kmem_cache_alloc() when there are no active objs left in a cache.
				3062	*/
				3063	static int cache_grow(struct kmem_cache *cachep,
				3064	gfp_t flags, int nodeid, void *objp)
				3065	{
					/*
					 * Add one slab to the cache on node 'nodeid'. If objp is
					 * non-NULL it is used as the slab memory, otherwise pages are
					 * allocated here. Returns 1 on success and 0 on failure.
					 */
				3066	struct slab *slabp;
				3067	size_t offset;
				3068	gfp_t local_flags;
				3069	struct kmem_list3 *l3;
				3070
				3071	/*
				3072	* Be lazy and only check for valid flags here, keeping it out of the
				3073	* critical path in kmem_cache_alloc().
				3074	*/
				3075	BUG_ON(flags & GFP_SLAB_BUG_MASK);
				3076	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
				3077
				3078	/* Take the l3 list lock to change the colour_next on this node */
				3079	check_irq_off();
				3080	l3 = cachep->nodelists[nodeid];
				3081	spin_lock(&l3->list_lock);
				3082
				3083	/* Get colour for the slab, and calculate the next value. */
				3084	offset = l3->colour_next;
				3085	l3->colour_next++;
				3086	if (l3->colour_next >= cachep->colour)
				3087	l3->colour_next = 0;
				3088	spin_unlock(&l3->list_lock);
				3089
					/* Convert the colour index into a byte offset. */
				3090	offset *= cachep->colour_off;
				3091
					/*
					 * Drop slab_lock while allocating if the caller may sleep; it is
					 * re-taken before the new slab is linked in, or on failure.
					 */
				3092	if (local_flags & __GFP_WAIT)
				3093	local_unlock_irq(slab_lock);
				3094
				3095	/*
				3096	* The test for missing atomic flag is performed here, rather than
				3097	* the more obvious place, simply to reduce the critical path length
				3098	* in kmem_cache_alloc(). If a caller is seriously mis-behaving they
				3099	* will eventually be caught here (where it matters).
				3100	*/
				3101	kmem_flagcheck(cachep, flags);
				3102
				3103	/*
				3104	* Get mem for the objs. Attempt to allocate a physical page from
				3105	* 'nodeid'.
				3106	*/
				3107	if (!objp)
				3108	objp = kmem_getpages(cachep, local_flags, nodeid);
				3109	if (!objp)
				3110	goto failed;
				3111
				3112	/* Get slab management. */
				3113	slabp = alloc_slabmgmt(cachep, objp, offset,
				3114	local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
				3115	if (!slabp)
				3116	goto opps1;
				3117
				3118	slab_map_pages(cachep, slabp, objp);
				3119
				3120	cache_init_objs(cachep, slabp);
				3121
				3122	if (local_flags & __GFP_WAIT)
				3123	local_lock_irq(slab_lock);
				3124	check_irq_off();
				3125	spin_lock(&l3->list_lock);
				3126
				3127	/* Make slab active. */
				3128	list_add_tail(&slabp->list, &(l3->slabs_free));
				3129	STATS_INC_GROWN(cachep);
				3130	l3->free_objects += cachep->num;
				3131	spin_unlock(&l3->list_lock);
				3132	return 1;
				3133	opps1:
					/* Descriptor allocation failed: give the pages back. */
				3134	kmem_freepages(cachep, objp, false);
				3135	failed:
				3136	if (local_flags & __GFP_WAIT)
				3137	local_lock_irq(slab_lock);
				3138	return 0;
				3139	}
				3140
				3141	#if DEBUG
				3142
				3143	/*
				3144	* Perform extra freeing checks:
				3145	* - detect bad pointers.
				3146	* - POISON/RED_ZONE checking
				3147	*/
				3148	static void kfree_debugcheck(const void *objp)
				3149	{
					/* BUG if objp fails virt_addr_valid(). */
				3150	if (!virt_addr_valid(objp)) {
				3151	printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
				3152	(unsigned long)objp);
				3153	BUG();
				3154	}
				3155	}
				3156
				3157	static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
				3158	{
					/*
					 * Check both red zones of an object being freed and report a
					 * double free or an overwrite if they are not RED_ACTIVE.
					 */
				3159	unsigned long long redzone1, redzone2;
				3160
				3161	redzone1 = *dbg_redzone1(cache, obj);
				3162	redzone2 = *dbg_redzone2(cache, obj);
				3163
				3164	/*
				3165	* Redzone is ok.
				3166	*/
				3167	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
				3168	return;
				3169
					/* Both zones already inactive: the object was freed before. */
				3170	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
				3171	slab_error(cache, "double free detected");
				3172	else
				3173	slab_error(cache, "memory outside object was overwritten");
				3174
				3175	printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
				3176	obj, redzone1, redzone2);
				3177	}
				3178
				3179	static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
				3180	void *caller)
				3181	{
					/*
					 * Debug checks for an object being freed. Returns objp moved
					 * back by obj_offset(cachep).
					 */
				3182	struct page *page;
				3183	unsigned int objnr;
				3184	struct slab *slabp;
				3185
					/* The object must belong to the cache it is being freed to. */
				3186	BUG_ON(virt_to_cache(objp) != cachep);
				3187
				3188	objp -= obj_offset(cachep);
				3189	kfree_debugcheck(objp);
				3190	page = virt_to_head_page(objp);
				3191
				3192	slabp = page_get_slab(page);
				3193
				3194	if (cachep->flags & SLAB_RED_ZONE) {
				3195	verify_redzone_free(cachep, objp);
				3196	*dbg_redzone1(cachep, objp) = RED_INACTIVE;
				3197	*dbg_redzone2(cachep, objp) = RED_INACTIVE;
				3198	}
					/* Record who freed the object. */
				3199	if (cachep->flags & SLAB_STORE_USER)
				3200	*dbg_userword(cachep, objp) = caller;
				3201
				3202	objnr = obj_to_index(cachep, slabp, objp);
				3203
					/* The pointer must be exactly the start of a valid object slot. */
				3204	BUG_ON(objnr >= cachep->num);
				3205	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
				3206
				3207	#ifdef CONFIG_DEBUG_SLAB_LEAK
				3208	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
				3209	#endif
				3210	if (cachep->flags & SLAB_POISON) {
				3211	#ifdef CONFIG_DEBUG_PAGEALLOC
					/*
					 * Page-multiple off-slab objects get stack info stored and
					 * kernel_map_pages() called with enable == 0; all others
					 * are poisoned.
					 */
				3212	if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
				3213	store_stackinfo(cachep, objp, (unsigned long)caller);
				3214	kernel_map_pages(virt_to_page(objp),
				3215	cachep->buffer_size / PAGE_SIZE, 0);
				3216	} else {
				3217	poison_obj(cachep, objp, POISON_FREE);
				3218	}
				3219	#else
				3220	poison_obj(cachep, objp, POISON_FREE);
				3221	#endif
				3222	}
				3223	return objp;
				3224	}
				3225
				3226	static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
				3227	{
					/*
					 * Walk the slab's free list and BUG if it is corrupted or if
					 * its length disagrees with the slab's inuse count.
					 */
				3228	kmem_bufctl_t i;
				3229	int entries = 0;
				3230
				3231	/* Check slab's freelist to see if this obj is there. */
				3232	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
				3233	entries++;
					/* More entries than objects, or an index out of range. */
				3234	if (entries > cachep->num || i >= cachep->num)
				3235	goto bad;
				3236	}
				3237	if (entries != cachep->num - slabp->inuse) {
				3238	bad:
				3239	printk(KERN_ERR "slab: Internal list corruption detected in "
				3240	"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
				3241	cachep->name, cachep->num, slabp, slabp->inuse,
				3242	print_tainted());
					/* Dump the descriptor together with its bufctl array. */
				3243	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
				3244	sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
				3245	1);
				3246	BUG();
				3247	}
				3248	}
				3249	#else
				3250	#define kfree_debugcheck(x) do { } while(0)
				3251	#define cache_free_debugcheck(x,objp,z) (objp)
				3252	#define check_slabp(x,y) do { } while(0)
				3253	#endif
				3254
				3255	static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
				3256	{
					/*
					 * Refill this cpu's array cache from the local node (the shared
					 * array first, then partial and free slabs, then a newly grown
					 * slab) and return one object, or NULL if nothing is available.
					 */
				3257	int batchcount;
				3258	struct kmem_list3 *l3;
				3259	struct array_cache *ac;
				3260	int node;
				3261
				3262	retry:
				3263	check_irq_off();
				3264	node = numa_mem_id();
				3265	ac = cpu_cache_get(cachep);
				3266	batchcount = ac->batchcount;
				3267	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
				3268	/*
				3269	* If there was little recent activity on this cache, then
				3270	* perform only a partial refill. Otherwise we could generate
				3271	* refill bouncing.
				3272	*/
				3273	batchcount = BATCHREFILL_LIMIT;
				3274	}
				3275	l3 = cachep->nodelists[node];
				3276
					/* Refill runs only with an empty array cache and a node list. */
				3277	BUG_ON(ac->avail > 0 || !l3);
				3278	spin_lock(&l3->list_lock);
				3279
				3280	/* See if we can refill from the shared array */
				3281	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
				3282	l3->shared->touched = 1;
				3283	goto alloc_done;
				3284	}
				3285
				3286	while (batchcount > 0) {
				3287	struct list_head *entry;
				3288	struct slab *slabp;
				3289	/* Get slab alloc is to come from. */
				3290	entry = l3->slabs_partial.next;
				3291	if (entry == &l3->slabs_partial) {
					/* No partial slab: fall back to the free list. */
				3292	l3->free_touched = 1;
				3293	entry = l3->slabs_free.next;
				3294	if (entry == &l3->slabs_free)
				3295	goto must_grow;
				3296	}
				3297
				3298	slabp = list_entry(entry, struct slab, list);
				3299	check_slabp(cachep, slabp);
				3300	check_spinlock_acquired(cachep);
				3301
				3302	/*
				3303	* The slab was either on partial or free list so
				3304	* there must be at least one object available for
				3305	* allocation.
				3306	*/
				3307	BUG_ON(slabp->inuse >= cachep->num);
				3308
				3309	while (slabp->inuse < cachep->num && batchcount--) {
				3310	STATS_INC_ALLOCED(cachep);
				3311	STATS_INC_ACTIVE(cachep);
				3312	STATS_SET_HIGH(cachep);
				3313
				3314	ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
				3315	node);
				3316	}
				3317	check_slabp(cachep, slabp);
				3318
				3319	/* move slabp to correct slabp list: */
				3320	list_del(&slabp->list);
				3321	if (slabp->free == BUFCTL_END)
				3322	list_add(&slabp->list, &l3->slabs_full);
				3323	else
				3324	list_add(&slabp->list, &l3->slabs_partial);
				3325	}
				3326
				3327	must_grow:
					/* Everything now in the array came off the slab lists. */
				3328	l3->free_objects -= ac->avail;
				3329	alloc_done:
				3330	spin_unlock(&l3->list_lock);
				3331
				3332	if (unlikely(!ac->avail)) {
				3333	int x;
				3334	x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
				3335
				3336	/* cache_grow can reenable interrupts, then ac could change. */
				3337	ac = cpu_cache_get(cachep);
				3338	if (!x && ac->avail == 0) /* no objects in sight? abort */
				3339	return NULL;
				3340
				3341	if (!ac->avail) /* objects refilled by interrupt? */
				3342	goto retry;
				3343	}
				3344	ac->touched = 1;
				3345	return ac->entry[--ac->avail];
				3346	}
				3347
				3348	static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
				3349	gfp_t flags)
				3350	{
					/* Run might_sleep_if() for __GFP_WAIT; with DEBUG also check the flags. */
				3351	might_sleep_if(flags & __GFP_WAIT);
				3352	#if DEBUG
				3353	kmem_flagcheck(cachep, flags);
				3354	#endif
				3355	}
				3356
				3357	#if DEBUG
				3358	static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				3359	gfp_t flags, void *objp, void *caller)
				3360	{
					/*
					 * Debug checks for a freshly allocated object. Returns objp
					 * advanced by obj_offset(cachep), or objp unchanged if NULL.
					 */
				3361	if (!objp)
				3362	return objp;
				3363	if (cachep->flags & SLAB_POISON) {
				3364	#ifdef CONFIG_DEBUG_PAGEALLOC
					/*
					 * Page-multiple off-slab objects get kernel_map_pages()
					 * with enable == 1; all others have their poison checked.
					 */
				3365	if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
				3366	kernel_map_pages(virt_to_page(objp),
				3367	cachep->buffer_size / PAGE_SIZE, 1);
				3368	else
				3369	check_poison_obj(cachep, objp);
				3370	#else
				3371	check_poison_obj(cachep, objp);
				3372	#endif
				3373	poison_obj(cachep, objp, POISON_INUSE);
				3374	}
					/* Record who allocated the object. */
				3375	if (cachep->flags & SLAB_STORE_USER)
				3376	*dbg_userword(cachep, objp) = caller;
				3377
				3378	if (cachep->flags & SLAB_RED_ZONE) {
					/* A free object must carry inactive red zones. */
				3379	if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				3380	*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
				3381	slab_error(cachep, "double free, or memory outside"
				3382	" object was overwritten");
				3383	printk(KERN_ERR
				3384	"%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
				3385	objp, *dbg_redzone1(cachep, objp),
				3386	*dbg_redzone2(cachep, objp));
				3387	}
				3388	*dbg_redzone1(cachep, objp) = RED_ACTIVE;
				3389	*dbg_redzone2(cachep, objp) = RED_ACTIVE;
				3390	}
				3391	#ifdef CONFIG_DEBUG_SLAB_LEAK
				3392	{
				3393	struct slab *slabp;
				3394	unsigned objnr;
				3395
				3396	slabp = page_get_slab(virt_to_head_page(objp));
				3397	objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
				3398	slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
				3399	}
				3400	#endif
				3401	objp += obj_offset(cachep);
					/* With SLAB_POISON the constructor runs here, at allocation time. */
				3402	if (cachep->ctor && cachep->flags & SLAB_POISON)
				3403	cachep->ctor(objp);
				3404	if (ARCH_SLAB_MINALIGN &&
				3405	((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
				3406	printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
				3407	objp, (int)ARCH_SLAB_MINALIGN);
				3408	}
				3409	return objp;
				3410	}
				3411	#else
				3412	#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
				3413	#endif
				3414
				3415	static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
				3416	{
					/* Allocations from cache_cache itself are never failed here. */
				3417	if (cachep == &cache_cache)
				3418	return false;
				3419
				3420	return should_failslab(obj_size(cachep), flags, cachep->flags);
				3421	}
				3422
				3423	static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
				3424	{
					/*
					 * Allocate one object from this cpu's array cache, refilling the
					 * array when it is empty. Returns NULL if the refill fails.
					 */
				3425	void *objp;
				3426	struct array_cache *ac;
				3427
				3428	check_irq_off();
				3429
				3430	ac = cpu_cache_get(cachep);
				3431	if (likely(ac->avail)) {
					/* Hit: take the most recently stored entry. */
				3432	STATS_INC_ALLOCHIT(cachep);
				3433	ac->touched = 1;
				3434	objp = ac->entry[--ac->avail];
				3435	} else {
				3436	STATS_INC_ALLOCMISS(cachep);
				3437	objp = cache_alloc_refill(cachep, flags);
				3438	/*
				3439	* the 'ac' may be updated by cache_alloc_refill(),
				3440	* and kmemleak_erase() requires its correct value.
				3441	*/
				3442	ac = cpu_cache_get(cachep);
				3443	}
				3444	/*
				3445	* To avoid a false negative, if an object that is in one of the
				3446	* per-CPU caches is leaked, we need to make sure kmemleak doesn't
				3447	* treat the array pointers as a reference to the object.
				3448	*/
				3449	if (objp)
				3450	kmemleak_erase(&ac->entry[ac->avail]);
				3451	return objp;
				3452	}
				3453
				3454	#ifdef CONFIG_NUMA
				3455	/*
				3456	* Try allocating on another node if PFA_SPREAD_SLAB|PF_MEMPOLICY.
				3457	*
				3458	* If we are in_interrupt, then process context, including cpusets and
				3459	* mempolicy, may not apply and should not be used for allocation policy.
				3460	*/
				3461	static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
				3462	{
					/*
					 * Returns an object from the node chosen by cpuset spreading or
					 * the task's mempolicy when that node differs from the local
					 * one; returns NULL otherwise.
					 */
				3463	int nid_alloc, nid_here;
				3464
				3465	if (in_interrupt() || (flags & __GFP_THISNODE))
				3466	return NULL;
				3467	nid_alloc = nid_here = numa_mem_id();
				3468	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
				3469	nid_alloc = cpuset_slab_spread_node();
				3470	else if (current->mempolicy)
				3471	nid_alloc = slab_node();
					/* Only go to the node allocator when the policy picked a remote node. */
				3472	if (nid_alloc != nid_here)
				3473	return ____cache_alloc_node(cachep, flags, nid_alloc);
				3474	return NULL;
				3475	}
				3476
				3477	/*
				3478	* Fallback function if there was no memory available and no objects on a
				3479	* certain node and fall back is permitted. First we scan all the
				3480	* available nodelists for available objects. If that fails then we
				3481	* perform an allocation without specifying a node. This allows the page
				3482	* allocator to do its reclaim / fallback magic. We then insert the
				3483	* slab into the proper nodelist and then allocate from it.
				3484	*/
				3485	static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
				3486	{
				3487	struct zonelist *zonelist;
				3488	gfp_t local_flags;
				3489	struct zoneref *z;
				3490	struct zone *zone;
				3491	enum zone_type high_zoneidx = gfp_zone(flags);
				3492	void *obj = NULL;
				3493	int nid;
				3494	unsigned int cpuset_mems_cookie;
				3495
					/* The caller forbade falling back to other nodes. */
				3496	if (flags & __GFP_THISNODE)
				3497	return NULL;
				3498
				3499	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
				3500
				3501	retry_cpuset:
				3502	cpuset_mems_cookie = get_mems_allowed();
				3503	zonelist = node_zonelist(slab_node(), flags);
				3504
				3505	retry:
				3506	/*
				3507	* Look through allowed nodes for objects available
				3508	* from existing per node queues.
				3509	*/
				3510	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
				3511	nid = zone_to_nid(zone);
				3512
				3513	if (cpuset_zone_allowed_hardwall(zone, flags) &&
				3514	cache->nodelists[nid] &&
				3515	cache->nodelists[nid]->free_objects) {
				3516	obj = ____cache_alloc_node(cache,
				3517	flags | GFP_THISNODE, nid);
				3518	if (obj)
				3519	break;
				3520	}
				3521	}
				3522
				3523	if (!obj) {
				3524	/*
				3525	* This allocation will be performed within the constraints
				3526	* of the current cpuset / memory policy requirements.
				3527	* We may trigger various forms of reclaim on the allowed
				3528	* set and go into memory reserves if necessary.
				3529	*/
					/* slab_lock is dropped around the page allocation if it may sleep. */
				3530	if (local_flags & __GFP_WAIT)
				3531	local_unlock_irq(slab_lock);
				3532	kmem_flagcheck(cache, flags);
				3533	obj = kmem_getpages(cache, local_flags, numa_mem_id());
				3534	if (local_flags & __GFP_WAIT)
				3535	local_lock_irq(slab_lock);
				3536	if (obj) {
				3537	/*
				3538	* Insert into the appropriate per node queues
				3539	*/
				3540	nid = page_to_nid(virt_to_page(obj));
				3541	if (cache_grow(cache, flags, nid, obj)) {
				3542	obj = ____cache_alloc_node(cache,
				3543	flags | GFP_THISNODE, nid);
				3544	if (!obj)
				3545	/*
				3546	* Another processor may allocate the
				3547	* objects in the slab since we are
				3548	* not holding any locks.
				3549	*/
				3550	goto retry;
				3551	} else {
				3552	/* cache_grow already freed obj */
				3553	obj = NULL;
				3554	}
				3555	}
				3556	}
				3557
					/* Start over if put_mems_allowed() returns false and nothing was allocated. */
				3558	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
				3559	goto retry_cpuset;
				3560	return obj;
				3561	}
				3562
				3563	/*
				3564	* An interface to enable slab creation on nodeid
				3565	*/
				3566	static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
				3567	int nodeid)
				3568	{
					/*
					 * Allocate one object directly from the slab lists of node
					 * 'nodeid', growing the cache on that node if needed. If growing
					 * fails, the result of fallback_alloc() is returned.
					 */
				3569	struct list_head *entry;
				3570	struct slab *slabp;
				3571	struct kmem_list3 *l3;
				3572	void *obj;
				3573	int x;
				3574
				3575	l3 = cachep->nodelists[nodeid];
				3576	BUG_ON(!l3);
				3577
				3578	retry:
				3579	check_irq_off();
				3580	spin_lock(&l3->list_lock);
					/* Prefer a partial slab; fall back to a free one. */
				3581	entry = l3->slabs_partial.next;
				3582	if (entry == &l3->slabs_partial) {
				3583	l3->free_touched = 1;
				3584	entry = l3->slabs_free.next;
				3585	if (entry == &l3->slabs_free)
				3586	goto must_grow;
				3587	}
				3588
				3589	slabp = list_entry(entry, struct slab, list);
				3590	check_spinlock_acquired_node(cachep, nodeid);
				3591	check_slabp(cachep, slabp);
				3592
				3593	STATS_INC_NODEALLOCS(cachep);
				3594	STATS_INC_ACTIVE(cachep);
				3595	STATS_SET_HIGH(cachep);
				3596
				3597	BUG_ON(slabp->inuse == cachep->num);
				3598
				3599	obj = slab_get_obj(cachep, slabp, nodeid);
				3600	check_slabp(cachep, slabp);
				3601	l3->free_objects--;
				3602	/* move slabp to correct slabp list: */
				3603	list_del(&slabp->list);
				3604
				3605	if (slabp->free == BUFCTL_END)
				3606	list_add(&slabp->list, &l3->slabs_full);
				3607	else
				3608	list_add(&slabp->list, &l3->slabs_partial);
				3609
				3610	spin_unlock(&l3->list_lock);
				3611	goto done;
				3612
				3613	must_grow:
				3614	spin_unlock(&l3->list_lock);
				3615	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
				3616	if (x)
				3617	goto retry;
				3618
					/* Could not grow on this node: try the other nodes. */
				3619	return fallback_alloc(cachep, flags);
				3620
				3621	done:
				3622	return obj;
				3623	}
				3624
				3625	/**
				3626	* kmem_cache_alloc_node - Allocate an object on the specified node
				3627	* @cachep: The cache to allocate from.
				3628	* @flags: See kmalloc().
				3629	* @nodeid: node number of the target node.
				3630	* @caller: return address of caller, used for debug information
				3631	*
				3632	* Identical to kmem_cache_alloc but it will allocate memory on the given
				3633	* node, which can improve the performance for cpu bound structures.
				3634	*
				3635	* Fallback to other node is possible if __GFP_THISNODE is not set.
				3636	*/
				3637	static __always_inline void *
				3638	__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
				3639	void *caller)
				3640	{
				3641	unsigned long save_flags;
				3642	void *ptr;
				3643	int slab_node = numa_mem_id();
				3644
					/* Drop any GFP bits not in gfp_allowed_mask. */
				3645	flags &= gfp_allowed_mask;
				3646
				3647	lockdep_trace_alloc(flags);
				3648
				3649	if (slab_should_failslab(cachep, flags))
				3650	return NULL;
				3651
				3652	cache_alloc_debugcheck_before(cachep, flags);
				3653	local_lock_irqsave(slab_lock, save_flags);
				3654
					/* No node requested: use the local memory node. */
				3655	if (nodeid == NUMA_NO_NODE)
				3656	nodeid = slab_node;
				3657
				3658	if (unlikely(!cachep->nodelists[nodeid])) {
				3659	/* Node not bootstrapped yet */
				3660	ptr = fallback_alloc(cachep, flags);
				3661	goto out;
				3662	}
				3663
				3664	if (nodeid == slab_node) {
				3665	/*
				3666	* Use the locally cached objects if possible.
				3667	* However ____cache_alloc does not allow fallback
				3668	* to other nodes. It may fail while we still have
				3669	* objects on other nodes available.
				3670	*/
				3671	ptr = ____cache_alloc(cachep, flags);
				3672	if (ptr)
				3673	goto out;
				3674	}
				3675	/* ___cache_alloc_node can fall back to other nodes */
				3676	ptr = ____cache_alloc_node(cachep, flags, nodeid);
				3677	out:
				3678	local_unlock_irqrestore(slab_lock, save_flags);
					/* Post-allocation debug checks, leak tracking and zeroing run after the unlock. */
				3679	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
				3680	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
				3681	flags);
				3682
				3683	if (likely(ptr))
				3684	kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
				3685
					/* Honour __GFP_ZERO by clearing the object. */
				3686	if (unlikely((flags & __GFP_ZERO) && ptr))
				3687	memset(ptr, 0, obj_size(cachep));
				3688
				3689	return ptr;
				3690	}
				3691
				3692	static __always_inline void *
				3693	__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
				3694	{
					/*
					 * NUMA variant: try a policy-selected remote node first when a
					 * mempolicy or cpuset spreading is active, then the local cpu
					 * cache, then the local node's slab lists.
					 */
				3695	void *objp;
				3696
				3697	if (unlikely((current->flags & PF_MEMPOLICY) || cpuset_do_slab_mem_spread())) {
				3698	objp = alternate_node_alloc(cache, flags);
				3699	if (objp)
				3700	goto out;
				3701	}
				3702	objp = ____cache_alloc(cache, flags);
				3703
				3704	/*
				3705	* We may just have run out of memory on the local node.
				3706	* ____cache_alloc_node() knows how to locate memory on other nodes
				3707	*/
				3708	if (!objp)
				3709	objp = ____cache_alloc_node(cache, flags, numa_mem_id());
				3710
				3711	out:
				3712	return objp;
				3713	}
				3714	#else
				3715
				3716	static __always_inline void *
				3717	__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
				3718	{
					/* Non-NUMA variant: allocate straight from the per-cpu array cache. */
				3719	return ____cache_alloc(cachep, flags);
				3720	}
				3721
				3722	#endif /* CONFIG_NUMA */
				3723
				3724	static __always_inline void *
				3725	__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
				3726	{
					/*
					 * Allocate one object from cachep under slab_lock, then run the
					 * debug, leak-tracking and zeroing hooks. 'caller' is passed to
					 * the debug check. Returns NULL on failure.
					 */
				3727	unsigned long save_flags;
				3728	void *objp;
				3729
					/* Drop any GFP bits not in gfp_allowed_mask. */
				3730	flags &= gfp_allowed_mask;
				3731
				3732	lockdep_trace_alloc(flags);
				3733
				3734	if (slab_should_failslab(cachep, flags))
				3735	return NULL;
				3736
				3737	#ifdef CONFIG_MEM_CHECK
					/*
					 * Log allocations from caches whose buffer size exceeds
					 * CONFIG_MEM_CHECK_SIZE; dump the stack as well when the current
					 * task is named MEM_CHECK_THREAD_NAME.
					 */
				3738	if (cachep->buffer_size > CONFIG_MEM_CHECK_SIZE) {
				3739	printk(KERN_ALERT"memcheck_slab %d %s (%pS)\n", cachep->buffer_size, current->comm, __builtin_return_address(0));
				3740	if (strcmp(current->comm,MEM_CHECK_THREAD_NAME)==0)
				3741	dump_stack();
				3742	}
				3743	#endif
				3744
				3745	cache_alloc_debugcheck_before(cachep, flags);
				3746	local_lock_irqsave(slab_lock, save_flags);
				3747	objp = __do_cache_alloc(cachep, flags);
				3748	local_unlock_irqrestore(slab_lock, save_flags);
				3749	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
				3750	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
				3751	flags);
				3752	prefetchw(objp);
				3753
				3754	if (likely(objp))
				3755	kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
				3756
					/* Honour __GFP_ZERO by clearing the object. */
				3757	if (unlikely((flags & __GFP_ZERO) && objp))
				3758	memset(objp, 0, obj_size(cachep));
				3759
				3760	return objp;
				3761	}
				3762
				3763	/*
				3764	* Caller needs to acquire correct kmem_list's list_lock
				3765	*/
				3766	static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
				3767	int node)
				3768	{
					/*
					 * Return nr_objects objects from the objpp array to their slabs
					 * on 'node' and move each slab to the list that matches its new
					 * state.
					 */
				3769	int i;
				3770	struct kmem_list3 *l3;
				3771
				3772	for (i = 0; i < nr_objects; i++) {
				3773	void *objp = objpp[i];
				3774	struct slab *slabp;
				3775
				3776	slabp = virt_to_slab(objp);
				3777	l3 = cachep->nodelists[node];
					/* Unlink the slab; it is re-linked (or destroyed) below. */
				3778	list_del(&slabp->list);
				3779	check_spinlock_acquired_node(cachep, node);
				3780	check_slabp(cachep, slabp);
				3781	slab_put_obj(cachep, slabp, objp, node);
				3782	STATS_DEC_ACTIVE(cachep);
				3783	l3->free_objects++;
				3784	check_slabp(cachep, slabp);
				3785
				3786	/* fixup slab chains */
				3787	if (slabp->inuse == 0) {
					/* Slab is now empty: destroy it if the node is over its free limit. */
				3788	if (l3->free_objects > l3->free_limit) {
				3789	l3->free_objects -= cachep->num;
				3790	/* No need to drop any previously held
				3791	* lock here, even if we have a off-slab slab
				3792	* descriptor it is guaranteed to come from
				3793	* a different cache, refer to comments before
				3794	* alloc_slabmgmt.
				3795	*/
				3796	slab_destroy(cachep, slabp, true);
				3797	} else {
				3798	list_add(&slabp->list, &l3->slabs_free);
				3799	}
				3800	} else {
				3801	/* Unconditionally move a slab to the end of the
				3802	* partial list on free - maximum time for the
				3803	* other objects to be freed, too.
				3804	*/
				3805	list_add_tail(&slabp->list, &l3->slabs_partial);
				3806	}
				3807	}
				3808	}
				3809
				3810	static void cache_flusharray(struct kmem_cache cachep, struct array_cache ac)
				3811	{
				3812	int batchcount;
				3813	struct kmem_list3 *l3;
				3814	int node = numa_mem_id();
				3815
				3816	batchcount = ac->batchcount;
				3817	#if DEBUG
				3818	BUG_ON(!batchcount \|\| batchcount > ac->avail);
				3819	#endif
				3820	check_irq_off();
				3821	l3 = cachep->nodelists[node];
				3822	spin_lock(&l3->list_lock);
				3823	if (l3->shared) {
				3824	struct array_cache *shared_array = l3->shared;
				3825	int max = shared_array->limit - shared_array->avail;
				3826	if (max) {
				3827	if (batchcount > max)
				3828	batchcount = max;
				3829	memcpy(&(shared_array->entry[shared_array->avail]),
				3830	ac->entry, sizeof(void ) batchcount);
				3831	shared_array->avail += batchcount;
				3832	goto free_done;
				3833	}
				3834	}
				3835
				3836	free_block(cachep, ac->entry, batchcount, node);
				3837	free_done:
				3838	#if STATS
				3839	{
				3840	int i = 0;
				3841	struct list_head *p;
				3842
				3843	p = l3->slabs_free.next;
				3844	while (p != &(l3->slabs_free)) {
				3845	struct slab *slabp;
				3846
				3847	slabp = list_entry(p, struct slab, list);
				3848	BUG_ON(slabp->inuse);
				3849
				3850	i++;
				3851	p = p->next;
				3852	}
				3853	STATS_SET_FREEABLE(cachep, i);
				3854	}
				3855	#endif
				3856	spin_unlock(&l3->list_lock);
				3857	ac->avail -= batchcount;
				3858	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void )ac->avail);
				3859	}
				3860
				3861	/*
				3862	* Release an obj back to its cache. If the obj has a constructed state, it must
				3863	* be in this state _before_ it is released. Called with disabled ints.
				3864	*/
				3865	static inline void __cache_free(struct kmem_cache cachep, void objp,
				3866	void *caller)
				3867	{
				3868	struct array_cache *ac = cpu_cache_get(cachep);
				3869
				3870	check_irq_off();
				3871	kmemleak_free_recursive(objp, cachep->flags);
				3872	objp = cache_free_debugcheck(cachep, objp, caller);
				3873
				3874	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
				3875
				3876	/*
				3877	* Skip calling cache_free_alien() when the platform is not numa.
				3878	* This will avoid cache misses that happen while accessing slabp (which
				3879	* is per page memory reference) to get nodeid. Instead use a global
				3880	* variable to skip the call, which is mostly likely to be present in
				3881	* the cache.
				3882	*/
				3883	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
				3884	return;
				3885
				3886	if (likely(ac->avail < ac->limit)) {
				3887	STATS_INC_FREEHIT(cachep);
				3888	} else {
				3889	STATS_INC_FREEMISS(cachep);
				3890	cache_flusharray(cachep, ac);
				3891	}
				3892
				3893	ac->entry[ac->avail++] = objp;
				3894	}
				3895
				3896	/**
				3897	* kmem_cache_alloc - Allocate an object
				3898	* @cachep: The cache to allocate from.
				3899	* @flags: See kmalloc().
				3900	*
				3901	* Allocate an object from this cache. The flags are only relevant
				3902	* if the cache has no available objects.
				3903	*/
				3904	void kmem_cache_alloc(struct kmem_cache cachep, gfp_t flags)
				3905	{
				3906	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
				3907
				3908	trace_kmem_cache_alloc(_RET_IP_, ret,
				3909	obj_size(cachep), cachep->buffer_size, flags);
				3910
				3911	return ret;
				3912	}
				3913	EXPORT_SYMBOL(kmem_cache_alloc);
				3914
				3915	#ifdef CONFIG_TRACING
				3916	void *
				3917	kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
				3918	{
				3919	void *ret;
				3920
				3921	ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
				3922
				3923	trace_kmalloc(_RET_IP_, ret,
				3924	size, slab_buffer_size(cachep), flags);
				3925	return ret;
				3926	}
				3927	EXPORT_SYMBOL(kmem_cache_alloc_trace);
				3928	#endif
				3929
				3930	#ifdef CONFIG_NUMA
				3931	void kmem_cache_alloc_node(struct kmem_cache cachep, gfp_t flags, int nodeid)
				3932	{
				3933	void *ret = __cache_alloc_node(cachep, flags, nodeid,
				3934	__builtin_return_address(0));
				3935
				3936	trace_kmem_cache_alloc_node(_RET_IP_, ret,
				3937	obj_size(cachep), cachep->buffer_size,
				3938	flags, nodeid);
				3939
				3940	return ret;
				3941	}
				3942	EXPORT_SYMBOL(kmem_cache_alloc_node);
				3943
				3944	#ifdef CONFIG_TRACING
				3945	void *kmem_cache_alloc_node_trace(size_t size,
				3946	struct kmem_cache *cachep,
				3947	gfp_t flags,
				3948	int nodeid)
				3949	{
				3950	void *ret;
				3951
				3952	ret = __cache_alloc_node(cachep, flags, nodeid,
				3953	__builtin_return_address(0));
				3954	trace_kmalloc_node(_RET_IP_, ret,
				3955	size, slab_buffer_size(cachep),
				3956	flags, nodeid);
				3957	return ret;
				3958	}
				3959	EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
				3960	#endif
				3961
				3962	static __always_inline void *
				3963	__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
				3964	{
				3965	struct kmem_cache *cachep;
				3966
				3967	cachep = kmem_find_general_cachep(size, flags);
				3968	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				3969	return cachep;
				3970	return kmem_cache_alloc_node_trace(size, cachep, flags, node);
				3971	}
				3972
				3973	#if defined(CONFIG_DEBUG_SLAB) \|\| defined(CONFIG_TRACING)
				3974	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3975	{
				3976	return __do_kmalloc_node(size, flags, node,
				3977	__builtin_return_address(0));
				3978	}
				3979	EXPORT_SYMBOL(__kmalloc_node);
				3980
				3981	void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
				3982	int node, unsigned long caller)
				3983	{
				3984	return __do_kmalloc_node(size, flags, node, (void *)caller);
				3985	}
				3986	EXPORT_SYMBOL(__kmalloc_node_track_caller);
				3987	#else
				3988	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3989	{
				3990	return __do_kmalloc_node(size, flags, node, NULL);
				3991	}
				3992	EXPORT_SYMBOL(__kmalloc_node);
				3993	#endif /* CONFIG_DEBUG_SLAB \|\| CONFIG_TRACING */
				3994	#endif /* CONFIG_NUMA */
				3995
				3996	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3997	{
				3998	return __kmalloc(size, flags);
				3999	}
				4000	EXPORT_SYMBOL(__kmalloc_node);
				4001	/**
				4002	* __do_kmalloc - allocate memory
				4003	* @size: how many bytes of memory are required.
				4004	* @flags: the type of memory to allocate (see kmalloc).
				4005	* @caller: function caller for debug tracking of the caller
				4006	*/
				4007	static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
				4008	void *caller)
				4009	{
				4010	struct kmem_cache *cachep;
				4011	void *ret;
				4012
				4013	#ifdef CONFIG_DEBUG_SLAB_MARK
				4014	int mark_flag = 0;
				4015	if(!size)
				4016	return ZERO_SIZE_PTR;
				4017	if(size <= 248){
				4018	size += 2 * sizeof(size_t);
				4019	mark_flag = 1;
				4020	}
				4021
				4022	#endif
				4023
				4024	#ifdef CONFIG_DEBUG_SLAB_MARK_HEAD
				4025	if (size <= PAGE_SIZE)
				4026	size += RECORD_COUNT * BYTES_PER_RECORD;
				4027	#endif
				4028
				4029	#ifdef CONFIG_KMALLOC_TRACKER
				4030	size_t len = 0;
				4031
				4032	if (!size)
				4033	return ZERO_SIZE_PTR;
				4034
				4035	size += HEAP_SUFFIX_SIZE;
				4036	#endif
				4037
				4038	/* If you want to save a few bytes .text space: replace
				4039	* __ with kmem_.
				4040	* Then kmalloc uses the uninlined functions instead of the inline
				4041	* functions.
				4042	*/
				4043	cachep = __find_general_cachep(size, flags);
				4044	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
				4045	return cachep;
				4046	ret = __cache_alloc(cachep, flags, caller);
				4047
				4048	trace_kmalloc((unsigned long) caller, ret,
				4049	size, cachep->buffer_size, flags);
				4050
				4051	#ifdef CONFIG_KMALLOC_TRACKER
				4052	if (ret) {
				4053	kmalloc_alloc_tracker(ret, KMALLOC_ORIGINAL_SIZE(size));
				4054	return KMALLOC_SETUP(ret);
				4055	}
				4056	#endif
				4057
				4058	#ifdef CONFIG_DEBUG_SLAB_MARK
				4059	if(ret && mark_flag){
				4060	*dbg_recordtask(cachep, ret) = current;
				4061	*dbg_recordcaller(cachep, ret) = caller;
				4062	mark_flag = 0;
				4063	}
				4064	else if (unlikely(!(flags & __GFP_ZERO) && ret)){
				4065	*dbg_recordtask(cachep, ret) = current;
				4066	*dbg_recordcaller(cachep, ret) = caller;
				4067	}
				4068	#else
				4069	if (unlikely(!(flags & __GFP_ZERO) && ret)){
				4070	*dbg_recordtask(cachep, ret) = current;
				4071	*dbg_recordcaller(cachep, ret) = caller;
				4072	}
				4073	#endif
				4074
				4075	#ifdef CONFIG_DEBUG_SLAB_MARK_HEAD
				4076	if (ret && (obj_size(cachep) <= PAGE_SIZE)) {
				4077	dbg_userrecord(ret,0) = (void )RECORD_MAGIC;
				4078	*dbg_userrecord(ret,1) = caller;
				4079	*dbg_userrecord(ret,2) = current;
				4080	ret = (void *)dbg_userrecord(ret,RECORD_COUNT);
				4081	}
				4082	#endif
				4083
				4084	return ret;
				4085	}
				4086
				4087
				4088	#if defined(CONFIG_DEBUG_SLAB) \|\| defined(CONFIG_TRACING) \|\| defined(CONFIG_DEBUG_SLAB_MARK) \|\| defined(CONFIG_DEBUG_SLAB_MARK_HEAD)
				4089	void *__kmalloc(size_t size, gfp_t flags)
				4090	{
				4091	return __do_kmalloc(size, flags, __builtin_return_address(0));
				4092	}
				4093	EXPORT_SYMBOL(__kmalloc);
				4094
				4095	void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
				4096	{
				4097	return __do_kmalloc(size, flags, (void *)caller);
				4098	}
				4099	EXPORT_SYMBOL(__kmalloc_track_caller);
				4100
				4101	#else
				4102	void *__kmalloc(size_t size, gfp_t flags)
				4103	{
				4104	return __do_kmalloc(size, flags, NULL);
				4105	}
				4106	EXPORT_SYMBOL(__kmalloc);
				4107	#endif
				4108
				4109	/**
				4110	* kmem_cache_free - Deallocate an object
				4111	* @cachep: The cache the allocation was from.
				4112	* @objp: The previously allocated object.
				4113	*
				4114	* Free an object which was previously allocated from this
				4115	* cache.
				4116	*/
				4117	void kmem_cache_free(struct kmem_cache cachep, void objp)
				4118	{
				4119	unsigned long flags;
				4120
				4121	debug_check_no_locks_freed(objp, obj_size(cachep));
				4122	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
				4123	debug_check_no_obj_freed(objp, obj_size(cachep));
				4124	local_lock_irqsave(slab_lock, flags);
				4125	__cache_free(cachep, objp, __builtin_return_address(0));
				4126	unlock_slab_and_free_delayed(flags);
				4127
				4128	trace_kmem_cache_free(_RET_IP_, objp);
				4129	}
				4130	EXPORT_SYMBOL(kmem_cache_free);
				4131
				4132	/**
				4133	* kfree - free previously allocated memory
				4134	* @objp: pointer returned by kmalloc.
				4135	*
				4136	* If @objp is NULL, no operation is performed.
				4137	*
				4138	* Don't free memory not originally allocated by kmalloc()
				4139	* or you will run into trouble.
				4140	*/
				4141	void kfree(const void *objp)
				4142	{
				4143	struct kmem_cache *c;
				4144	unsigned long flags;
				4145
				4146	trace_kfree(_RET_IP_, objp);
				4147
				4148	if (unlikely(ZERO_OR_NULL_PTR(objp)))
				4149	return;
				4150
				4151	#ifdef CONFIG_KMALLOC_TRACKER
				4152	int entry = 0;
				4153	void *mem = NULL;
				4154
				4155	mem = KMALLOC_BASE(objp);
				4156	entry = (size_t )(mem);
				4157
				4158	if ((entry != 0)&& (MEM_TRUE == check_node_entry(entry)))
				4159	{
				4160	mem_free_tracker((void *)entry, MEM_TRACKER_TYPE_KMALLOC);
				4161	}
				4162	else
				4163	{
				4164	panic("error\n");
				4165	}
				4166
				4167	kfree_debugcheck(mem);
				4168	c = virt_to_cache(mem);
				4169	debug_check_no_locks_freed(mem, obj_size(c));
				4170	debug_check_no_obj_freed(mem, obj_size(c));
				4171
				4172	#else
				4173	kfree_debugcheck(objp);
				4174	c = virt_to_cache(objp);
				4175
				4176	#ifdef CONFIG_DEBUG_SLAB_MARK_HEAD
				4177	if (obj_size(c) <= PAGE_SIZE) {
				4178	if (dbg_userhead(objp) == (void )RECORD_MAGIC) {
				4179	objp = (void *)dbg_userhead(objp);
				4180	} else {
				4181	panic("memmory corruption!!");
				4182	}
				4183	}
				4184	#endif
				4185
				4186	debug_check_no_locks_freed(objp, obj_size(c));
				4187	debug_check_no_obj_freed(objp, obj_size(c));
				4188	#endif
				4189
				4190	local_lock_irqsave(slab_lock, flags);
				4191
				4192	#ifdef CONFIG_KMALLOC_TRACKER
				4193	__cache_free(c, (void *)mem, __builtin_return_address(0));
				4194	#else
				4195	__cache_free(c, (void *)objp, __builtin_return_address(0));
				4196	#endif
				4197	unlock_slab_and_free_delayed(flags);
				4198	}
				4199	EXPORT_SYMBOL(kfree);
				4200
				/* Report obj_size() of @cachep. */
				unsigned int kmem_cache_size(struct kmem_cache *cachep)
				{
					unsigned int size = obj_size(cachep);

					return size;
				}
				EXPORT_SYMBOL(kmem_cache_size);
				4206
				4207	/*
				4208	* This initializes kmem_list3 or resizes various caches for all nodes.
				4209	*/
				4210	static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
				4211	{
				4212	int node;
				4213	struct kmem_list3 *l3;
				4214	struct array_cache *new_shared;
				4215	struct array_cache **new_alien = NULL;
				4216
				4217	for_each_online_node(node) {
				4218
				4219	if (use_alien_caches) {
				4220	new_alien = alloc_alien_cache(node, cachep->limit, gfp);
				4221	if (!new_alien)
				4222	goto fail;
				4223	}
				4224
				4225	new_shared = NULL;
				4226	if (cachep->shared) {
				4227	new_shared = alloc_arraycache(node,
				4228	cachep->shared*cachep->batchcount,
				4229	0xbaadf00d, gfp);
				4230	if (!new_shared) {
				4231	free_alien_cache(new_alien);
				4232	goto fail;
				4233	}
				4234	}
				4235
				4236	l3 = cachep->nodelists[node];
				4237	if (l3) {
				4238	struct array_cache *shared = l3->shared;
				4239
				4240	local_spin_lock_irq(slab_lock, &l3->list_lock);
				4241
				4242	if (shared)
				4243	free_block(cachep, shared->entry,
				4244	shared->avail, node);
				4245
				4246	l3->shared = new_shared;
				4247	if (!l3->alien) {
				4248	l3->alien = new_alien;
				4249	new_alien = NULL;
				4250	}
				4251	l3->free_limit = (1 + nr_cpus_node(node)) *
				4252	cachep->batchcount + cachep->num;
				4253	unlock_l3_and_free_delayed(&l3->list_lock);
				4254
				4255	kfree(shared);
				4256	free_alien_cache(new_alien);
				4257	continue;
				4258	}
				4259	l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
				4260	if (!l3) {
				4261	free_alien_cache(new_alien);
				4262	kfree(new_shared);
				4263	goto fail;
				4264	}
				4265
				4266	kmem_list3_init(l3);
				4267	l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
				4268	((unsigned long)cachep) % REAPTIMEOUT_LIST3;
				4269	l3->shared = new_shared;
				4270	l3->alien = new_alien;
				4271	l3->free_limit = (1 + nr_cpus_node(node)) *
				4272	cachep->batchcount + cachep->num;
				4273	cachep->nodelists[node] = l3;
				4274	}
				4275	return 0;
				4276
				4277	fail:
				4278	if (!cachep->next.next) {
				4279	/* Cache is not active yet. Roll back what we did */
				4280	node--;
				4281	while (node >= 0) {
				4282	if (cachep->nodelists[node]) {
				4283	l3 = cachep->nodelists[node];
				4284
				4285	kfree(l3->shared);
				4286	free_alien_cache(l3->alien);
				4287	kfree(l3);
				4288	cachep->nodelists[node] = NULL;
				4289	}
				4290	node--;
				4291	}
				4292	}
				4293	return -ENOMEM;
				4294	}
				4295
				/*
				 * Argument block for the per-cpu array cache swap: the cache being tuned
				 * followed by one array_cache pointer per CPU (allocated by the user of
				 * this struct).
				 */
				struct ccupdate_struct {
					struct kmem_cache *cachep;
					struct array_cache *new[];	/* C99 flexible array member */
				};
				4300
				4301	static void __do_ccupdate_local(void *info, int cpu)
				4302	{
				4303	struct ccupdate_struct *new = info;
				4304	struct array_cache *old;
				4305
				4306	old = cpu_cache_get_on_cpu(new->cachep, cpu);
				4307
				4308	new->cachep->array[cpu] = new->new[cpu];
				4309	new->new[cpu] = old;
				4310	}
				4311
				#ifndef CONFIG_PREEMPT_RT_BASE
				/* Swap the array cache for the CPU returned by smp_processor_id(). */
				static void do_ccupdate_local(void *info)
				{
					int cpu = smp_processor_id();

					__do_ccupdate_local(info, cpu);
				}
				#else
				/* Swap the array cache of @cpu between lock_slab_on() and unlock_slab_on(). */
				static void do_ccupdate_local(void *info, int cpu)
				{
					lock_slab_on(cpu);
					__do_ccupdate_local(info, cpu);
					unlock_slab_on(cpu);
				}
				#endif
				4325
				4326	/* Always called with the cache_chain_mutex held */
				4327	static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
				4328	int batchcount, int shared, gfp_t gfp)
				4329	{
				4330	struct ccupdate_struct *new;
				4331	int i;
				4332
				4333	new = kzalloc(sizeof(new) + nr_cpu_ids sizeof(struct array_cache *),
				4334	gfp);
				4335	if (!new)
				4336	return -ENOMEM;
				4337
				4338	for_each_online_cpu(i) {
				4339	new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
				4340	batchcount, gfp);
				4341	if (!new->new[i]) {
				4342	for (i--; i >= 0; i--)
				4343	kfree(new->new[i]);
				4344	kfree(new);
				4345	return -ENOMEM;
				4346	}
				4347	}
				4348	new->cachep = cachep;
				4349
				4350	slab_on_each_cpu(do_ccupdate_local, (void *)new);
				4351
				4352	check_irq_on();
				4353	cachep->batchcount = batchcount;
				4354	cachep->limit = limit;
				4355	cachep->shared = shared;
				4356
				4357	for_each_online_cpu(i) {
				4358	struct array_cache *ccold = new->new[i];
				4359	if (!ccold)
				4360	continue;
				4361	local_spin_lock_irq(slab_lock,
				4362	&cachep->nodelists[cpu_to_mem(i)]->list_lock);
				4363	free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
				4364
				4365	unlock_l3_and_free_delayed(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
				4366	kfree(ccold);
				4367	}
				4368	kfree(new);
				4369	return alloc_kmemlist(cachep, gfp);
				4370	}
				4371
				4372	/* Called with cache_chain_mutex held always */
				4373	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
				4374	{
				4375	int err;
				4376	int limit, shared;
				4377
				4378	/*
				4379	* The head array serves three purposes:
				4380	* - create a LIFO ordering, i.e. return objects that are cache-warm
				4381	* - reduce the number of spinlock operations.
				4382	* - reduce the number of linked list operations on the slab and
				4383	* bufctl chains: array operations are cheaper.
				4384	* The numbers are guessed, we should auto-tune as described by
				4385	* Bonwick.
				4386	*/
				4387	if (cachep->buffer_size > 131072)
				4388	limit = 1;
				4389	else if (cachep->buffer_size > PAGE_SIZE)
				4390	limit = 8;
				4391	else if (cachep->buffer_size > 1024)
				4392	limit = 24;
				4393	else if (cachep->buffer_size > 256)
				4394	limit = 54;
				4395	else
				4396	limit = 120;
				4397
				4398	/*
				4399	* CPU bound tasks (e.g. network routing) can exhibit cpu bound
				4400	* allocation behaviour: Most allocs on one cpu, most free operations
				4401	* on another cpu. For these cases, an efficient object passing between
				4402	* cpus is necessary. This is provided by a shared array. The array
				4403	* replaces Bonwick's magazine layer.
				4404	* On uniprocessor, it's functionally equivalent (but less efficient)
				4405	* to a larger limit. Thus disabled by default.
				4406	*/
				4407	shared = 0;
				4408	if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
				4409	shared = 8;
				4410
				4411	#if DEBUG
				4412	/*
				4413	* With debugging enabled, large batchcount lead to excessively long
				4414	* periods with disabled local interrupts. Limit the batchcount
				4415	*/
				4416	if (limit > 32)
				4417	limit = 32;
				4418	#endif
				4419	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
				4420	if (err)
				4421	printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
				4422	cachep->name, -err);
				4423	return err;
				4424	}
				4425
				4426	/*
				4427	* Drain an array if it contains any elements taking the l3 lock only if
				4428	* necessary. Note that the l3 listlock also protects the array_cache
				4429	* if drain_array() is used on the shared array.
				4430	*/
				4431	static void drain_array(struct kmem_cache cachep, struct kmem_list3 l3,
				4432	struct array_cache *ac, int force, int node)
				4433	{
				4434	int tofree;
				4435
				4436	if (!ac \|\| !ac->avail)
				4437	return;
				4438	if (ac->touched && !force) {
				4439	ac->touched = 0;
				4440	} else {
				4441	local_spin_lock_irq(slab_lock, &l3->list_lock);
				4442	if (ac->avail) {
				4443	tofree = force ? ac->avail : (ac->limit + 4) / 5;
				4444	if (tofree > ac->avail)
				4445	tofree = (ac->avail + 1) / 2;
				4446	free_block(cachep, ac->entry, tofree, node);
				4447	ac->avail -= tofree;
				4448	memmove(ac->entry, &(ac->entry[tofree]),
				4449	sizeof(void ) ac->avail);
				4450	}
				4451	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				4452	}
				4453	}
				4454
				4455	/**
				4456	* cache_reap - Reclaim memory from caches.
				4457	* @w: work descriptor
				4458	*
				4459	* Called from workqueue/eventd every few seconds.
				4460	* Purpose:
				4461	* - clear the per-cpu caches for this CPU.
				4462	* - return freeable pages to the main free memory pool.
				4463	*
				4464	* If we cannot acquire the cache chain mutex then just give up - we'll try
				4465	* again on the next iteration.
				4466	*/
				4467	static void cache_reap(struct work_struct *w)
				4468	{
				4469	struct kmem_cache *searchp;
				4470	struct kmem_list3 *l3;
				4471	int node = numa_mem_id();
				4472	struct delayed_work *work = to_delayed_work(w);
				4473
				4474	if (!mutex_trylock(&cache_chain_mutex))
				4475	/* Give up. Setup the next iteration. */
				4476	goto out;
				4477
				4478	list_for_each_entry(searchp, &cache_chain, next) {
				4479	check_irq_on();
				4480
				4481	/*
				4482	* We only take the l3 lock if absolutely necessary and we
				4483	* have established with reasonable certainty that
				4484	* we can do some work if the lock was obtained.
				4485	*/
				4486	l3 = searchp->nodelists[node];
				4487
				4488	reap_alien(searchp, l3);
				4489
				4490	drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
				4491
				4492	/*
				4493	* These are racy checks but it does not matter
				4494	* if we skip one check or scan twice.
				4495	*/
				4496	if (time_after(l3->next_reap, jiffies))
				4497	goto next;
				4498
				4499	l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
				4500
				4501	drain_array(searchp, l3, l3->shared, 0, node);
				4502
				4503	if (l3->free_touched)
				4504	l3->free_touched = 0;
				4505	else {
				4506	int freed;
				4507
				4508	freed = drain_freelist(searchp, l3, (l3->free_limit +
				4509	5 * searchp->num - 1) / (5 * searchp->num));
				4510	STATS_ADD_REAPED(searchp, freed);
				4511	}
				4512	next:
				4513	cond_resched();
				4514	}
				4515	check_irq_on();
				4516	mutex_unlock(&cache_chain_mutex);
				4517	next_reap_node();
				4518	out:
				4519	/* Set up the next iteration */
				4520	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
				4521	}
				4522
				4523	#ifdef CONFIG_SLABINFO
				4524
				/* Emit the column header line of the slabinfo listing into @m. */
				static void print_slabinfo_header(struct seq_file *m)
				{
					/*
					 * Output format version, so at least we can change it
					 * without _too_ many complaints.
					 */
				#if STATS
					static const char banner[] = "slabinfo - version: 2.1 (statistics)\n";
				#else
					static const char banner[] = "slabinfo - version: 2.1\n";
				#endif

					seq_puts(m, banner);
					seq_puts(m, "# name            <active_objs> <num_objs> <objsize> "
						    "<objperslab> <pagesperslab>");
					seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
					seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
				#if STATS
					seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
						    "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
					seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
				#endif
					seq_putc(m, '\n');
				}
				4547
				4548	static void s_start(struct seq_file m, loff_t *pos)
				4549	{
				4550	loff_t n = *pos;
				4551
				4552	mutex_lock(&cache_chain_mutex);
				4553	if (!n)
				4554	print_slabinfo_header(m);
				4555
				4556	return seq_list_start(&cache_chain, *pos);
				4557	}
				4558
				4559	static void s_next(struct seq_file m, void p, loff_t pos)
				4560	{
				4561	return seq_list_next(p, &cache_chain, pos);
				4562	}
				4563
				4564	static void s_stop(struct seq_file m, void p)
				4565	{
				4566	mutex_unlock(&cache_chain_mutex);
				4567	}
				4568
				4569	static int s_show(struct seq_file m, void p)
				4570	{
				4571	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
				4572	struct slab *slabp;
				4573	unsigned long active_objs;
				4574	unsigned long num_objs;
				4575	unsigned long active_slabs = 0;
				4576	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
				4577	const char *name;
				4578	char *error = NULL;
				4579	int node;
				4580	struct kmem_list3 *l3;
				4581
				4582	active_objs = 0;
				4583	num_slabs = 0;
				4584	for_each_online_node(node) {
				4585	l3 = cachep->nodelists[node];
				4586	if (!l3)
				4587	continue;
				4588
				4589	check_irq_on();
				4590	local_spin_lock_irq(slab_lock, &l3->list_lock);
				4591
				4592	list_for_each_entry(slabp, &l3->slabs_full, list) {
				4593	if (slabp->inuse != cachep->num && !error)
				4594	error = "slabs_full accounting error";
				4595	active_objs += cachep->num;
				4596	active_slabs++;
				4597	}
				4598	list_for_each_entry(slabp, &l3->slabs_partial, list) {
				4599	if (slabp->inuse == cachep->num && !error)
				4600	error = "slabs_partial inuse accounting error";
				4601	if (!slabp->inuse && !error)
				4602	error = "slabs_partial/inuse accounting error";
				4603	active_objs += slabp->inuse;
				4604	active_slabs++;
				4605	}
				4606	list_for_each_entry(slabp, &l3->slabs_free, list) {
				4607	if (slabp->inuse && !error)
				4608	error = "slabs_free/inuse accounting error";
				4609	num_slabs++;
				4610	}
				4611	free_objects += l3->free_objects;
				4612	if (l3->shared)
				4613	shared_avail += l3->shared->avail;
				4614
				4615	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				4616	}
				4617	num_slabs += active_slabs;
				4618	num_objs = num_slabs * cachep->num;
				4619	if (num_objs - active_objs != free_objects && !error)
				4620	error = "free_objects accounting error";
				4621
				4622	name = cachep->name;
				4623	if (error)
				4624	printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
				4625
				4626	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
				4627	name, active_objs, num_objs, cachep->buffer_size,
				4628	cachep->num, (1 << cachep->gfporder));
				4629	seq_printf(m, " : tunables %4u %4u %4u",
				4630	cachep->limit, cachep->batchcount, cachep->shared);
				4631	seq_printf(m, " : slabdata %6lu %6lu %6lu",
				4632	active_slabs, num_slabs, shared_avail);
				4633	#if STATS
				4634	{ /* list3 stats */
				4635	unsigned long high = cachep->high_mark;
				4636	unsigned long allocs = cachep->num_allocations;
				4637	unsigned long grown = cachep->grown;
				4638	unsigned long reaped = cachep->reaped;
				4639	unsigned long errors = cachep->errors;
				4640	unsigned long max_freeable = cachep->max_freeable;
				4641	unsigned long node_allocs = cachep->node_allocs;
				4642	unsigned long node_frees = cachep->node_frees;
				4643	unsigned long overflows = cachep->node_overflow;
				4644
				4645	seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
				4646	"%4lu %4lu %4lu %4lu %4lu",
				4647	allocs, high, grown,
				4648	reaped, errors, max_freeable, node_allocs,
				4649	node_frees, overflows);
				4650	}
				4651	/* cpu stats */
				4652	{
				4653	unsigned long allochit = atomic_read(&cachep->allochit);
				4654	unsigned long allocmiss = atomic_read(&cachep->allocmiss);
				4655	unsigned long freehit = atomic_read(&cachep->freehit);
				4656	unsigned long freemiss = atomic_read(&cachep->freemiss);
				4657
				4658	seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
				4659	allochit, allocmiss, freehit, freemiss);
				4660	}
				4661	#endif
				4662	seq_putc(m, '\n');
				4663	return 0;
				4664	}
				4665
				4666	/*
				4667	* slabinfo_op - iterator that generates /proc/slabinfo
				4668	*
				4669	* Output layout:
				4670	* cache-name
				4671	* num-active-objs
				4672	* total-objs
				4673	* object size
				4674	* num-active-slabs
				4675	* total-slabs
				4676	* num-pages-per-slab
				4677	* + further values on SMP and with statistics enabled
				4678	*/
				4679
				4680	static const struct seq_operations slabinfo_op = {
				4681	.start = s_start,
				4682	.next = s_next,
				4683	.stop = s_stop,
				4684	.show = s_show,
				4685	};
				4686
				4687	#define MAX_SLABINFO_WRITE 128
				4688	/**
				4689	* slabinfo_write - Tuning for the slab allocator
				4690	* @file: unused
				4691	* @buffer: user buffer
				4692	* @count: data length
				4693	* @ppos: unused
				4694	*/
				4695	static ssize_t slabinfo_write(struct file file, const char __user buffer,
				4696	size_t count, loff_t *ppos)
				4697	{
				4698	char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
				4699	int limit, batchcount, shared, res;
				4700	struct kmem_cache *cachep;
				4701
				4702	if (count > MAX_SLABINFO_WRITE)
				4703	return -EINVAL;
				4704	if (copy_from_user(&kbuf, buffer, count))
				4705	return -EFAULT;
				4706	kbuf[MAX_SLABINFO_WRITE] = '\0';
				4707
				4708	tmp = strchr(kbuf, ' ');
				4709	if (!tmp)
				4710	return -EINVAL;
				4711	*tmp = '\0';
				4712	tmp++;
				4713	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
				4714	return -EINVAL;
				4715
				4716	/* Find the cache in the chain of caches. */
				4717	mutex_lock(&cache_chain_mutex);
				4718	res = -EINVAL;
				4719	list_for_each_entry(cachep, &cache_chain, next) {
				4720	if (!strcmp(cachep->name, kbuf)) {
				4721	if (limit < 1 \|\| batchcount < 1 \|\|
				4722	batchcount > limit \|\| shared < 0) {
				4723	res = 0;
				4724	} else {
				4725	res = do_tune_cpucache(cachep, limit,
				4726	batchcount, shared,
				4727	GFP_KERNEL);
				4728	}
				4729	break;
				4730	}
				4731	}
				4732	mutex_unlock(&cache_chain_mutex);
				4733	if (res >= 0)
				4734	res = count;
				4735	return res;
				4736	}
				4737
				4738	static int slabinfo_open(struct inode inode, struct file file)
				4739	{
				4740	return seq_open(file, &slabinfo_op);
				4741	}
				4742
				4743	static const struct file_operations proc_slabinfo_operations = {
				4744	.open = slabinfo_open,
				4745	.read = seq_read,
				4746	.write = slabinfo_write,
				4747	.llseek = seq_lseek,
				4748	.release = seq_release,
				4749	};
				4750
				4751	#ifdef CONFIG_DEBUG_SLAB_LEAK
				4752
				4753	static void leaks_start(struct seq_file m, loff_t *pos)
				4754	{
				4755	mutex_lock(&cache_chain_mutex);
				4756	return seq_list_start(&cache_chain, *pos);
				4757	}
				4758
				4759	static inline int add_caller(unsigned long *n, unsigned long v)
				4760	{
				4761	unsigned long *p;
				4762	int l;
				4763	if (!v)
				4764	return 1;
				4765	l = n[1];
				4766	p = n + 2;
				4767	while (l) {
				4768	int i = l/2;
				4769	unsigned long q = p + 2 i;
				4770	if (*q == v) {
				4771	q[1]++;
				4772	return 1;
				4773	}
				4774	if (*q > v) {
				4775	l = i;
				4776	} else {
				4777	p = q + 2;
				4778	l -= i + 1;
				4779	}
				4780	}
				4781	if (++n[1] == n[0])
				4782	return 0;
				4783	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void )p - (void )n));
				4784	p[0] = v;
				4785	p[1] = 1;
				4786	return 1;
				4787	}
				4788
				4789	static void handle_slab(unsigned long n, struct kmem_cache c, struct slab *s)
				4790	{
				4791	void *p;
				4792	int i;
				4793	if (n[0] == n[1])
				4794	return;
				4795	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
				4796	if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
				4797	continue;
				4798	if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
				4799	return;
				4800	}
				4801	}
				4802
				4803	static void show_symbol(struct seq_file *m, unsigned long address)
				4804	{
				4805	#ifdef CONFIG_KALLSYMS
				4806	unsigned long offset, size;
				4807	char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
				4808
				4809	if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
				4810	seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
				4811	if (modname[0])
				4812	seq_printf(m, " [%s]", modname);
				4813	return;
				4814	}
				4815	#endif
				4816	seq_printf(m, "%p", (void *)address);
				4817	}
				4818
				4819	static int leaks_show(struct seq_file m, void p)
				4820	{
				4821	struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
				4822	struct slab *slabp;
				4823	struct kmem_list3 *l3;
				4824	const char *name;
				4825	unsigned long *n = m->private;
				4826	int node;
				4827	int i;
				4828
				4829	if (!(cachep->flags & SLAB_STORE_USER))
				4830	return 0;
				4831	if (!(cachep->flags & SLAB_RED_ZONE))
				4832	return 0;
				4833
				4834	/* OK, we can do it */
				4835
				4836	n[1] = 0;
				4837
				4838	for_each_online_node(node) {
				4839	l3 = cachep->nodelists[node];
				4840	if (!l3)
				4841	continue;
				4842
				4843	check_irq_on();
				4844	local_spin_lock_irq(slab_lock, &l3->list_lock);
				4845
				4846	list_for_each_entry(slabp, &l3->slabs_full, list)
				4847	handle_slab(n, cachep, slabp);
				4848	list_for_each_entry(slabp, &l3->slabs_partial, list)
				4849	handle_slab(n, cachep, slabp);
				4850	local_spin_unlock_irq(slab_lock, &l3->list_lock);
				4851	}
				4852	name = cachep->name;
				4853	if (n[0] == n[1]) {
				4854	/* Increase the buffer size */
				4855	mutex_unlock(&cache_chain_mutex);
				4856	m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
				4857	if (!m->private) {
				4858	/* Too bad, we are really out */
				4859	m->private = n;
				4860	mutex_lock(&cache_chain_mutex);
				4861	return -ENOMEM;
				4862	}
				4863	(unsigned long )m->private = n[0] * 2;
				4864	kfree(n);
				4865	mutex_lock(&cache_chain_mutex);
				4866	/* Now make sure this entry will be retried */
				4867	m->count = m->size;
				4868	return 0;
				4869	}
				4870	for (i = 0; i < n[1]; i++) {
				4871	seq_printf(m, "%s: %lu ", name, n[2*i+3]);
				4872	show_symbol(m, n[2*i+2]);
				4873	seq_putc(m, '\n');
				4874	}
				4875
				4876	return 0;
				4877	}
				4878
				4879	static const struct seq_operations slabstats_op = {
				4880	.start = leaks_start,
				4881	.next = s_next,
				4882	.stop = s_stop,
				4883	.show = leaks_show,
				4884	};
				4885
				4886	static int slabstats_open(struct inode inode, struct file file)
				4887	{
				4888	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
				4889	int ret = -ENOMEM;
				4890	if (n) {
				4891	ret = seq_open(file, &slabstats_op);
				4892	if (!ret) {
				4893	struct seq_file *m = file->private_data;
				4894	n = PAGE_SIZE / (2 sizeof(unsigned long));
				4895	m->private = n;
				4896	n = NULL;
				4897	}
				4898	kfree(n);
				4899	}
				4900	return ret;
				4901	}
				4902
				4903	static const struct file_operations proc_slabstats_operations = {
				4904	.open = slabstats_open,
				4905	.read = seq_read,
				4906	.llseek = seq_lseek,
				4907	.release = seq_release_private,
				4908	};
				4909	#endif
				4910
				4911	static int __init slab_proc_init(void)
				4912	{
				4913	proc_create("slabinfo",S_IWUSR\|S_IRUSR,NULL,&proc_slabinfo_operations);
				4914	#ifdef CONFIG_DEBUG_SLAB_LEAK
				4915	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
				4916	#endif
				4917	return 0;
				4918	}
				4919	module_init(slab_proc_init);
				4920	#endif
				4921
				4922	/**
				4923	* ksize - get the actual amount of memory allocated for a given object
				4924	* @objp: Pointer to the object
				4925	*
				4926	* kmalloc may internally round up allocations and return more memory
				4927	* than requested. ksize() can be used to determine the actual amount of
				4928	* memory allocated. The caller may use this additional memory, even though
				4929	* a smaller amount of memory was initially specified with the kmalloc call.
				4930	* The caller must guarantee that objp points to a valid object previously
				4931	* allocated with either kmalloc() or kmem_cache_alloc(). The object
				4932	* must not be freed during the duration of the call.
				4933	*/
				4934	size_t ksize(const void *objp)
				4935	{
				4936	BUG_ON(!objp);
				4937	if (unlikely(objp == ZERO_SIZE_PTR))
				4938	return 0;
				4939
				4940	#ifdef CONFIG_DEBUG_SLAB_MARK_HEAD
				4941	if (obj_size(virt_to_cache(objp)) <= PAGE_SIZE)
				4942	return (obj_size(virt_to_cache(objp)) - RECORD_COUNT * BYTES_PER_RECORD);
				4943	else
				4944	return obj_size(virt_to_cache(objp));
				4945	#else
				4946	return obj_size(virt_to_cache(objp));
				4947	#endif
				4948	}
				4949	EXPORT_SYMBOL(ksize);