Blame - src/kernel/linux/v4.14/mm/slub.c - T103

blob: db2639832037d0a6dd99e73dd49327e7c0aca52a [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* SLUB: A slab allocator that limits cache line use instead of queuing
				4	* objects in per cpu and per node lists.
				5	*
				6	* The allocator synchronizes using per slab locks or atomic operations
				7	* and only uses a centralized lock to manage a pool of partial slabs.
				8	*
				9	* (C) 2007 SGI, Christoph Lameter
				10	* (C) 2011 Linux Foundation, Christoph Lameter
				11	*/
				12
				13	#include <linux/mm.h>
				14	#include <linux/swap.h> /* struct reclaim_state */
				15	#include <linux/module.h>
				16	#include <linux/bit_spinlock.h>
				17	#include <linux/interrupt.h>
				18	#include <linux/bitops.h>
				19	#include <linux/slab.h>
				20	#include "slab.h"
				21	#include <linux/proc_fs.h>
				22	#include <linux/notifier.h>
				23	#include <linux/seq_file.h>
				24	#include <linux/kasan.h>
				25	#include <linux/cpu.h>
				26	#include <linux/cpuset.h>
				27	#include <linux/mempolicy.h>
				28	#include <linux/ctype.h>
				29	#include <linux/debugobjects.h>
				30	#include <linux/kallsyms.h>
				31	#include <linux/memory.h>
				32	#include <linux/math64.h>
				33	#include <linux/fault-inject.h>
				34	#include <linux/stacktrace.h>
				35	#include <linux/prefetch.h>
				36	#include <linux/memcontrol.h>
				37	#include <linux/random.h>
				38
				39	#include <trace/events/kmem.h>
				40
				41	#include "internal.h"
				42
				43	/*
				44	* Lock order:
				45	* 1. slab_mutex (Global Mutex)
				46	* 2. node->list_lock
				47	* 3. slab_lock(page) (Only on some arches and for debugging)
				48	*
				49	* slab_mutex
				50	*
				51	* The role of the slab_mutex is to protect the list of all the slabs
				52	* and to synchronize major metadata changes to slab cache structures.
				53	*
				54	* The slab_lock is only used for debugging and on arches that do not
				55	* have the ability to do a cmpxchg_double. It only protects the second
				56	* double word in the page struct. Meaning
				57	* A. page->freelist -> List of object free in a page
				58	* B. page->counters -> Counters of objects
				59	* C. page->frozen -> frozen state
				60	*
				61	* If a slab is frozen then it is exempt from list management. It is not
				62	* on any list. The processor that froze the slab is the one who can
				63	* perform list operations on the page. Other processors may put objects
				64	* onto the freelist but the processor that froze the slab is the only
				65	* one that can retrieve the objects from the page's freelist.
				66	*
				67	* The list_lock protects the partial and full list on each node and
				68	* the partial slab counter. If taken then no new slabs may be added or
				69	* removed from the lists nor make the number of partial slabs be modified.
				70	* (Note that the total number of slabs is an atomic value that may be
				71	* modified without taking the list lock).
				72	*
				73	* The list_lock is a centralized lock and thus we avoid taking it as
				74	* much as possible. As long as SLUB does not have to handle partial
				75	* slabs, operations can continue without any centralized lock. F.e.
				76	* allocating a long series of objects that fill up slabs does not require
				77	* the list lock.
				78	* Interrupts are disabled during allocation and deallocation in order to
				79	* make the slab allocator safe to use in the context of an irq. In addition
				80	* interrupts are disabled to ensure that the processor does not change
				81	* while handling per_cpu slabs, due to kernel preemption.
				82	*
				83	* SLUB assigns one slab for allocation to each processor.
				84	* Allocations only occur from these slabs called cpu slabs.
				85	*
				86	* Slabs with free elements are kept on a partial list and during regular
				87	* operations no list for full slabs is used. If an object in a full slab is
				88	* freed then the slab will show up again on the partial lists.
				89	* We track full slabs for debugging purposes though because otherwise we
				90	* cannot scan all objects.
				91	*
				92	* Slabs are freed when they become empty. Teardown and setup is
				93	* minimal so we rely on the page allocators per cpu caches for
				94	* fast frees and allocs.
				95	*
				96	* Overloading of page flags that are otherwise used for LRU management.
				97	*
				98	* PageActive The slab is frozen and exempt from list processing.
				99	* This means that the slab is dedicated to a purpose
				100	* such as satisfying allocations for a specific
				101	* processor. Objects may be freed in the slab while
				102	* it is frozen but slab_free will then skip the usual
				103	* list operations. It is up to the processor holding
				104	* the slab to integrate the slab into the slab lists
				105	* when the slab is no longer needed.
				106	*
				107	* One use of this flag is to mark slabs that are
				108	* used for allocations. Then such a slab becomes a cpu
				109	* slab. The cpu slab may be equipped with an additional
				110	* freelist that allows lockless access to
				111	* free objects in addition to the regular freelist
				112	* that requires the slab lock.
				113	*
				114	* PageError Slab requires special handling due to debug
				115	* options set. This moves slab handling out of
				116	* the fast path and disables lockless freelists.
				117	*/
				118
				/*
				 * Report whether any of the SLAB_DEBUG_FLAGS are set on cache @s.
				 *
				 * Always evaluates to 0 when CONFIG_SLUB_DEBUG is not configured.
				 */
				static inline int kmem_cache_debug(struct kmem_cache *s)
				{
				#ifndef CONFIG_SLUB_DEBUG
					return 0;
				#else
					return unlikely(s->flags & SLAB_DEBUG_FLAGS);
				#endif
				}
				127
				128	void fixup_red_left(struct kmem_cache s, void *p)
				129	{
				130	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
				131	p += s->red_left_pad;
				132
				133	return p;
				134	}
				135
				136	static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
				137	{
				138	#ifdef CONFIG_SLUB_CPU_PARTIAL
				139	return !kmem_cache_debug(s);
				140	#else
				141	return false;
				142	#endif
				143	}
				144
				145	/*
				146	* Issues still to be resolved:
				147	*
				148	* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
				149	*
				150	* - Variable sizing of the per node arrays
				151	*/
				152
				153	/* Enable to test recovery from slab corruption on boot */
				154	#undef SLUB_RESILIENCY_TEST
				155
				156	/* Enable to log cmpxchg failures */
				157	#undef SLUB_DEBUG_CMPXCHG
				158
				159	/*
				160	* Minimum number of partial slabs. These will be left on the partial
				161	* lists even if they are empty. kmem_cache_shrink may reclaim them.
				162	*/
				163	#define MIN_PARTIAL 5
				164
				165	/*
				166	* Maximum number of desirable partial slabs.
				167	* The existence of more partial slabs makes kmem_cache_shrink
				168	* sort the partial list by the number of objects in use.
				169	*/
				170	#define MAX_PARTIAL 10
				171
				/* Debug flags that slub_debug starts with under CONFIG_SLUB_DEBUG_ON. */
				#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
							     SLAB_POISON | SLAB_STORE_USER)
				174
				175	/*
				176	* These debug flags cannot use CMPXCHG because there might be consistency
				177	* issues when checking or reading debug information
				178	*/
				#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
							 SLAB_TRACE)
				181
				182
				183	/*
				184	* Debugging flags that require metadata to be stored in the slab. These get
				185	* disabled when slub_debug=O is used and a cache's min order increases with
				186	* metadata.
				187	*/
				#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
				189
				190	#define OO_SHIFT 16
				191	#define OO_MASK ((1 << OO_SHIFT) - 1)
				192	#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
				193
				194	/* Internal SLUB flags */
				195	#define __OBJECT_POISON 0x80000000UL /* Poison object */
				196	#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
				197
				198	/*
				199	* Tracking user of a slab.
				200	*/
				/* Number of call-chain entries kept in each tracking record. */
				#define TRACK_ADDRS_COUNT 16

				/*
				 * One tracking record: who performed an alloc or free on an object.
				 */
				struct track {
					unsigned long addr;	/* Address the operation was called from */
				#ifdef CONFIG_STACKTRACE
					unsigned long addrs[TRACK_ADDRS_COUNT];	/* Saved call chain */
				#endif
					int cpu;		/* Cpu the operation ran on */
					int pid;		/* Pid of the task context */
					unsigned long when;	/* Time at which the operation occurred */
				};

				/* Index of a tracking record: the alloc one or the free one. */
				enum track_item {
					TRACK_ALLOC,
					TRACK_FREE
				};
				213
				/*
				 * sysfs hooks. With CONFIG_SYSFS they are forward declarations only;
				 * without it they collapse into no-op stubs that report success.
				 */
				#ifdef CONFIG_SYSFS
				static int sysfs_slab_add(struct kmem_cache *);
				static int sysfs_slab_alias(struct kmem_cache *, const char *);
				static void memcg_propagate_slab_attrs(struct kmem_cache *s);
				static void sysfs_slab_remove(struct kmem_cache *s);
				#else
				static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
				static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
											{ return 0; }
				static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
				static inline void sysfs_slab_remove(struct kmem_cache *s) { }
				#endif
				226
				227	static inline void stat(const struct kmem_cache *s, enum stat_item si)
				228	{
				229	#ifdef CONFIG_SLUB_STATS
				230	/*
				231	* The rmw is racy on a preemptible kernel but this is acceptable, so
				232	* avoid this_cpu_add()'s irq-disable overhead.
				233	*/
				234	raw_cpu_inc(s->cpu_slab->stat[si]);
				235	#endif
				236	}
				237
				238	/********************************************************************
				239	* Core slab cache functions
				240	*******************************************************************/
				241
				242	/*
				243	* Returns freelist pointer (ptr). With hardening, this is obfuscated
				244	* with an XOR of the address where the pointer is held and a per-cache
				245	* random number.
				246	*/
				/*
				 * Encode or decode a freelist pointer.
				 *
				 * @s:        cache the pointer belongs to
				 * @ptr:      pointer value to transform
				 * @ptr_addr: address at which the pointer is stored
				 *
				 * With CONFIG_SLAB_FREELIST_HARDENED the value is XORed with s->random
				 * and the byte-swapped storage address; XOR is its own inverse, so the
				 * same call both obfuscates and recovers. Without hardening @ptr is
				 * returned as is.
				 */
				static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
								 unsigned long ptr_addr)
				{
				#ifdef CONFIG_SLAB_FREELIST_HARDENED
					return (void *)((unsigned long)ptr ^ s->random ^ swab(ptr_addr));
				#else
					return ptr;
				#endif
				}
				256
				257	/* Returns the freelist pointer recorded at location ptr_addr. */
				/*
				 * Load the word stored at @ptr_addr and pass it through freelist_ptr()
				 * to obtain the freelist pointer it represents.
				 */
				static inline void *freelist_dereference(const struct kmem_cache *s,
									 void *ptr_addr)
				{
					return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
							    (unsigned long)ptr_addr);
				}
				264
				265	static inline void get_freepointer(struct kmem_cache s, void *object)
				266	{
				267	return freelist_dereference(s, object + s->offset);
				268	}
				269
				270	static void prefetch_freepointer(const struct kmem_cache s, void object)
				271	{
				272	prefetch(object + s->offset);
				273	}
				274
				275	static inline void get_freepointer_safe(struct kmem_cache s, void *object)
				276	{
				277	unsigned long freepointer_addr;
				278	void *p;
				279
				280	if (!debug_pagealloc_enabled())
				281	return get_freepointer(s, object);
				282
				283	freepointer_addr = (unsigned long)object + s->offset;
				284	probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
				285	return freelist_ptr(s, p, freepointer_addr);
				286	}
				287
				288	static inline void set_freepointer(struct kmem_cache s, void object, void *fp)
				289	{
				290	unsigned long freeptr_addr = (unsigned long)object + s->offset;
				291
				292	#ifdef CONFIG_SLAB_FREELIST_HARDENED
				293	BUG_ON(object == fp); /* naive detection of double free or corruption */
				294	#endif
				295
				296	(void *)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
				297	}
				298
				299	/* Loop over all objects in a slab */
				/*
				 * __p walks the objects of the slab starting at __addr, in steps of
				 * (__s)->size, beginning after the left red zone (if any).
				 */
				#define for_each_object(__p, __s, __addr, __objects) \
					for (__p = fixup_red_left(__s, __addr); \
						__p < (__addr) + (__objects) * (__s)->size; \
						__p += (__s)->size)

				/*
				 * Same walk, but bounded by a counter: __idx runs from 1 to __objects
				 * inclusive. __objects is parenthesized so that an expression argument
				 * cannot change the meaning of the comparison.
				 */
				#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
					for (__p = fixup_red_left(__s, __addr), __idx = 1; \
						__idx <= (__objects); \
						__p += (__s)->size, __idx++)
				309
				310	/* Determine object index from a given position */
				311	static inline int slab_index(void p, struct kmem_cache s, void *addr)
				312	{
				313	return (p - addr) / s->size;
				314	}
				315
				316	static inline int order_objects(int order, unsigned long size, int reserved)
				317	{
				318	return ((PAGE_SIZE << order) - reserved) / size;
				319	}
				320
				321	static inline struct kmem_cache_order_objects oo_make(int order,
				322	unsigned long size, int reserved)
				323	{
				324	struct kmem_cache_order_objects x = {
				325	(order << OO_SHIFT) + order_objects(order, size, reserved)
				326	};
				327
				328	return x;
				329	}
				330
				331	static inline int oo_order(struct kmem_cache_order_objects x)
				332	{
				333	return x.x >> OO_SHIFT;
				334	}
				335
				336	static inline int oo_objects(struct kmem_cache_order_objects x)
				337	{
				338	return x.x & OO_MASK;
				339	}
				340
				341	/*
				342	* Per slab locking using the pagelock
				343	*/
				344	static __always_inline void slab_lock(struct page *page)
				345	{
				346	VM_BUG_ON_PAGE(PageTail(page), page);
				347	bit_spin_lock(PG_locked, &page->flags);
				348	}
				349
				350	static __always_inline void slab_unlock(struct page *page)
				351	{
				352	VM_BUG_ON_PAGE(PageTail(page), page);
				353	__bit_spin_unlock(PG_locked, &page->flags);
				354	}
				355
				356	static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
				357	{
				358	struct page tmp;
				359	tmp.counters = counters_new;
				360	/*
				361	* page->counters can cover frozen/inuse/objects as well
				362	* as page->_refcount. If we assign to ->counters directly
				363	* we run the risk of losing updates to page->_refcount, so
				364	* be careful and only assign to the fields we need.
				365	*/
				366	page->frozen = tmp.frozen;
				367	page->inuse = tmp.inuse;
				368	page->objects = tmp.objects;
				369	}
				370
				371	/* Interrupts must be disabled (for the fallback code to work right) */
				372	static inline bool __cmpxchg_double_slab(struct kmem_cache s, struct page page,
				373	void *freelist_old, unsigned long counters_old,
				374	void *freelist_new, unsigned long counters_new,
				375	const char *n)
				376	{
				377	VM_BUG_ON(!irqs_disabled());
				378	#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
				379	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
				380	if (s->flags & __CMPXCHG_DOUBLE) {
				381	if (cmpxchg_double(&page->freelist, &page->counters,
				382	freelist_old, counters_old,
				383	freelist_new, counters_new))
				384	return true;
				385	} else
				386	#endif
				387	{
				388	slab_lock(page);
				389	if (page->freelist == freelist_old &&
				390	page->counters == counters_old) {
				391	page->freelist = freelist_new;
				392	set_page_slub_counters(page, counters_new);
				393	slab_unlock(page);
				394	return true;
				395	}
				396	slab_unlock(page);
				397	}
				398
				399	cpu_relax();
				400	stat(s, CMPXCHG_DOUBLE_FAIL);
				401
				402	#ifdef SLUB_DEBUG_CMPXCHG
				403	pr_info("%s %s: cmpxchg double redo ", n, s->name);
				404	#endif
				405
				406	return false;
				407	}
				408
				409	static inline bool cmpxchg_double_slab(struct kmem_cache s, struct page page,
				410	void *freelist_old, unsigned long counters_old,
				411	void *freelist_new, unsigned long counters_new,
				412	const char *n)
				413	{
				414	#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
				415	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
				416	if (s->flags & __CMPXCHG_DOUBLE) {
				417	if (cmpxchg_double(&page->freelist, &page->counters,
				418	freelist_old, counters_old,
				419	freelist_new, counters_new))
				420	return true;
				421	} else
				422	#endif
				423	{
				424	unsigned long flags;
				425
				426	local_irq_save(flags);
				427	slab_lock(page);
				428	if (page->freelist == freelist_old &&
				429	page->counters == counters_old) {
				430	page->freelist = freelist_new;
				431	set_page_slub_counters(page, counters_new);
				432	slab_unlock(page);
				433	local_irq_restore(flags);
				434	return true;
				435	}
				436	slab_unlock(page);
				437	local_irq_restore(flags);
				438	}
				439
				440	cpu_relax();
				441	stat(s, CMPXCHG_DOUBLE_FAIL);
				442
				443	#ifdef SLUB_DEBUG_CMPXCHG
				444	pr_info("%s %s: cmpxchg double redo ", n, s->name);
				445	#endif
				446
				447	return false;
				448	}
				449
				450	#ifdef CONFIG_SLUB_DEBUG
				451	/*
				452	* Determine a map of object in use on a page.
				453	*
				454	* Node listlock must be held to guarantee that the page does
				455	* not vanish from under us.
				456	*/
				457	static void get_map(struct kmem_cache s, struct page page, unsigned long *map)
				458	{
				459	void *p;
				460	void *addr = page_address(page);
				461
				462	for (p = page->freelist; p; p = get_freepointer(s, p))
				463	set_bit(slab_index(p, s, addr), map);
				464	}
				465
				466	static inline int size_from_object(struct kmem_cache *s)
				467	{
				468	if (s->flags & SLAB_RED_ZONE)
				469	return s->size - s->red_left_pad;
				470
				471	return s->size;
				472	}
				473
				474	static inline void restore_red_left(struct kmem_cache s, void *p)
				475	{
				476	if (s->flags & SLAB_RED_ZONE)
				477	p -= s->red_left_pad;
				478
				479	return p;
				480	}
				481
				482	/*
				483	* Debug settings:
				484	*/
				485	#if defined(CONFIG_SLUB_DEBUG_ON)
				486	static int slub_debug = DEBUG_DEFAULT_FLAGS;
				487	#else
				488	static int slub_debug;
				489	#endif
				490
				491	static char *slub_debug_slabs;
				492	static int disable_higher_order_debug;
				493
				494	/*
				495	* slub is about to manipulate internal object metadata. This memory lies
				496	* outside the range of the allocated object, so accessing it would normally
				497	* be reported by kasan as a bounds error. metadata_access_enable() is used
				498	* to tell kasan that these accesses are OK.
				499	*/
				/* Open a window for metadata accesses by calling kasan_disable_current(). */
				static inline void metadata_access_enable(void)
				{
					kasan_disable_current();
				}
				504
				/* Close the metadata access window by calling kasan_enable_current(). */
				static inline void metadata_access_disable(void)
				{
					kasan_enable_current();
				}
				509
				510	/*
				511	* Object debugging
				512	*/
				513
				514	/* Verify that a pointer has an address that is valid within a slab page */
				515	static inline int check_valid_pointer(struct kmem_cache *s,
				516	struct page page, void object)
				517	{
				518	void *base;
				519
				520	if (!object)
				521	return 1;
				522
				523	base = page_address(page);
				524	object = restore_red_left(s, object);
				525	if (object < base \|\| object >= base + page->objects * s->size \|\|
				526	(object - base) % s->size) {
				527	return 0;
				528	}
				529
				530	return 1;
				531	}
				532
				533	static void print_section(char level, char text, u8 *addr,
				534	unsigned int length)
				535	{
				536	metadata_access_enable();
				537	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
				538	length, 1);
				539	metadata_access_disable();
				540	}
				541
				542	static struct track get_track(struct kmem_cache s, void *object,
				543	enum track_item alloc)
				544	{
				545	struct track *p;
				546
				547	if (s->offset)
				548	p = object + s->offset + sizeof(void *);
				549	else
				550	p = object + s->inuse;
				551
				552	return p + alloc;
				553	}
				554
				555	static void set_track(struct kmem_cache s, void object,
				556	enum track_item alloc, unsigned long addr)
				557	{
				558	struct track *p = get_track(s, object, alloc);
				559
				560	if (addr) {
				561	#ifdef CONFIG_STACKTRACE
				562	struct stack_trace trace;
				563	int i;
				564
				565	trace.nr_entries = 0;
				566	trace.max_entries = TRACK_ADDRS_COUNT;
				567	trace.entries = p->addrs;
				568	trace.skip = 3;
				569	metadata_access_enable();
				570	save_stack_trace(&trace);
				571	metadata_access_disable();
				572
				573	/* See rant in lockdep.c */
				574	if (trace.nr_entries != 0 &&
				575	trace.entries[trace.nr_entries - 1] == ULONG_MAX)
				576	trace.nr_entries--;
				577
				578	for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
				579	p->addrs[i] = 0;
				580	#endif
				581	p->addr = addr;
				582	p->cpu = smp_processor_id();
				583	p->pid = current->pid;
				584	p->when = jiffies;
				585	} else
				586	memset(p, 0, sizeof(struct track));
				587	}
				588
				589	static void init_tracking(struct kmem_cache s, void object)
				590	{
				591	if (!(s->flags & SLAB_STORE_USER))
				592	return;
				593
				594	set_track(s, object, TRACK_FREE, 0UL);
				595	set_track(s, object, TRACK_ALLOC, 0UL);
				596	}
				597
				598	static void print_track(const char s, struct track t)
				599	{
				600	if (!t->addr)
				601	return;
				602
				603	pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
				604	s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
				605	#ifdef CONFIG_STACKTRACE
				606	{
				607	int i;
				608	for (i = 0; i < TRACK_ADDRS_COUNT; i++)
				609	if (t->addrs[i])
				610	pr_err("\t%pS\n", (void *)t->addrs[i]);
				611	else
				612	break;
				613	}
				614	#endif
				615	}
				616
				617	static void print_tracking(struct kmem_cache s, void object)
				618	{
				619	if (!(s->flags & SLAB_STORE_USER))
				620	return;
				621
				622	print_track("Allocated", get_track(s, object, TRACK_ALLOC));
				623	print_track("Freed", get_track(s, object, TRACK_FREE));
				624	}
				625
				626	static void print_page_info(struct page *page)
				627	{
				628	pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
				629	page, page->objects, page->inuse, page->freelist, page->flags);
				630
				631	}
				632
				633	static void slab_bug(struct kmem_cache s, char fmt, ...)
				634	{
				635	struct va_format vaf;
				636	va_list args;
				637
				638	va_start(args, fmt);
				639	vaf.fmt = fmt;
				640	vaf.va = &args;
				641	pr_err("=============================================================================\n");
				642	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
				643	pr_err("-----------------------------------------------------------------------------\n\n");
				644
				645	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
				646	va_end(args);
				647	}
				648
				649	static void slab_fix(struct kmem_cache s, char fmt, ...)
				650	{
				651	struct va_format vaf;
				652	va_list args;
				653
				654	va_start(args, fmt);
				655	vaf.fmt = fmt;
				656	vaf.va = &args;
				657	pr_err("FIX %s: %pV\n", s->name, &vaf);
				658	va_end(args);
				659	}
				660
				661	static bool freelist_corrupted(struct kmem_cache s, struct page page,
				662	void *freelist, void nextfree)
				663	{
				664	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
				665	!check_valid_pointer(s, page, nextfree) && freelist) {
				666	object_err(s, page, *freelist, "Freechain corrupt");
				667	*freelist = NULL;
				668	slab_fix(s, "Isolate corrupted freechain");
				669	return true;
				670	}
				671
				672	return false;
				673	}
				674
				675	static void print_trailer(struct kmem_cache s, struct page page, u8 *p)
				676	{
				677	unsigned int off; /* Offset of last byte */
				678	u8 *addr = page_address(page);
				679
				680	print_tracking(s, p);
				681
				682	print_page_info(page);
				683
				684	pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
				685	p, p - addr, get_freepointer(s, p));
				686
				687	if (s->flags & SLAB_RED_ZONE)
				688	print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
				689	s->red_left_pad);
				690	else if (p > addr + 16)
				691	print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
				692
				693	print_section(KERN_ERR, "Object ", p,
				694	min_t(unsigned long, s->object_size, PAGE_SIZE));
				695	if (s->flags & SLAB_RED_ZONE)
				696	print_section(KERN_ERR, "Redzone ", p + s->object_size,
				697	s->inuse - s->object_size);
				698
				699	if (s->offset)
				700	off = s->offset + sizeof(void *);
				701	else
				702	off = s->inuse;
				703
				704	if (s->flags & SLAB_STORE_USER)
				705	off += 2 * sizeof(struct track);
				706
				707	off += kasan_metadata_size(s);
				708
				709	if (off != size_from_object(s))
				710	/* Beginning of the filler is the free pointer */
				711	print_section(KERN_ERR, "Padding ", p + off,
				712	size_from_object(s) - off);
				713
				714	dump_stack();
				715	}
				716
				717	void object_err(struct kmem_cache s, struct page page,
				718	u8 object, char reason)
				719	{
				720	slab_bug(s, "%s", reason);
				721	print_trailer(s, page, object);
				722	}
				723
				724	static __printf(3, 4) void slab_err(struct kmem_cache s, struct page page,
				725	const char *fmt, ...)
				726	{
				727	va_list args;
				728	char buf[100];
				729
				730	va_start(args, fmt);
				731	vsnprintf(buf, sizeof(buf), fmt, args);
				732	va_end(args);
				733	slab_bug(s, "%s", buf);
				734	print_page_info(page);
				735	dump_stack();
				736	}
				737
				738	static void init_object(struct kmem_cache s, void object, u8 val)
				739	{
				740	u8 *p = object;
				741
				742	if (s->flags & SLAB_RED_ZONE)
				743	memset(p - s->red_left_pad, val, s->red_left_pad);
				744
				745	if (s->flags & __OBJECT_POISON) {
				746	memset(p, POISON_FREE, s->object_size - 1);
				747	p[s->object_size - 1] = POISON_END;
				748	}
				749
				750	if (s->flags & SLAB_RED_ZONE)
				751	memset(p + s->object_size, val, s->inuse - s->object_size);
				752	}
				753
				754	static void restore_bytes(struct kmem_cache s, char message, u8 data,
				755	void from, void to)
				756	{
				757	slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
				758	memset(from, data, to - from);
				759	}
				760
				761	static int check_bytes_and_report(struct kmem_cache s, struct page page,
				762	u8 object, char what,
				763	u8 *start, unsigned int value, unsigned int bytes)
				764	{
				765	u8 *fault;
				766	u8 *end;
				767
				768	metadata_access_enable();
				769	fault = memchr_inv(start, value, bytes);
				770	metadata_access_disable();
				771	if (!fault)
				772	return 1;
				773
				774	end = start + bytes;
				775	while (end > fault && end[-1] == value)
				776	end--;
				777
				778	slab_bug(s, "%s overwritten", what);
				779	pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
				780	fault, end - 1, fault[0], value);
				781	print_trailer(s, page, object);
				782
				783	restore_bytes(s, what, value, fault, end);
				784	return 0;
				785	}
				786
				787	/*
				788	* Object layout:
				789	*
				790	* object address
				791	* Bytes of the object to be managed.
				792	* If the freepointer may overlay the object then the free
				793	* pointer is the first word of the object.
				794	*
				795	* Poisoning uses 0x6b (POISON_FREE) and the last byte is
				796	* 0xa5 (POISON_END)
				797	*
				798	* object + s->object_size
				799	* Padding to reach word boundary. This is also used for Redzoning.
				800	* Padding is extended by another word if Redzoning is enabled and
				801	* object_size == inuse.
				802	*
				803	* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
				804	* 0xcc (RED_ACTIVE) for objects in use.
				805	*
				806	* object + s->inuse
				807	* Meta data starts here.
				808	*
				809	* A. Free pointer (if we cannot overwrite object on free)
				810	* B. Tracking data for SLAB_STORE_USER
				811	* C. Padding to reach required alignment boundary or at minimum
				812	* one word if debugging is on to be able to detect writes
				813	* before the word boundary.
				814	*
				815	* Padding is done using 0x5a (POISON_INUSE)
				816	*
				817	* object + s->size
				818	* Nothing is used beyond s->size.
				819	*
				820	* If slabcaches are merged then the object_size and inuse boundaries are mostly
				821	* ignored. And therefore no slab options that rely on these boundaries
				822	* may be used with merged slabcaches.
				823	*/
				824
				825	static int check_pad_bytes(struct kmem_cache s, struct page page, u8 *p)
				826	{
				827	unsigned long off = s->inuse; /* The end of info */
				828
				829	if (s->offset)
				830	/* Freepointer is placed after the object. */
				831	off += sizeof(void *);
				832
				833	if (s->flags & SLAB_STORE_USER)
				834	/* We also have user information there */
				835	off += 2 * sizeof(struct track);
				836
				837	off += kasan_metadata_size(s);
				838
				839	if (size_from_object(s) == off)
				840	return 1;
				841
				842	return check_bytes_and_report(s, page, p, "Object padding",
				843	p + off, POISON_INUSE, size_from_object(s) - off);
				844	}
				845
				846	/* Check the pad bytes at the end of a slab page */
				847	static int slab_pad_check(struct kmem_cache s, struct page page)
				848	{
				849	u8 *start;
				850	u8 *fault;
				851	u8 *end;
				852	int length;
				853	int remainder;
				854
				855	if (!(s->flags & SLAB_POISON))
				856	return 1;
				857
				858	start = page_address(page);
				859	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
				860	end = start + length;
				861	remainder = length % s->size;
				862	if (!remainder)
				863	return 1;
				864
				865	metadata_access_enable();
				866	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
				867	metadata_access_disable();
				868	if (!fault)
				869	return 1;
				870	while (end > fault && end[-1] == POISON_INUSE)
				871	end--;
				872
				873	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
				874	print_section(KERN_ERR, "Padding ", end - remainder, remainder);
				875
				876	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
				877	return 0;
				878	}
				879
				880	static int check_object(struct kmem_cache s, struct page page,
				881	void *object, u8 val)
				882	{
				883	u8 *p = object;
				884	u8 *endobject = object + s->object_size;
				885
				886	if (s->flags & SLAB_RED_ZONE) {
				887	if (!check_bytes_and_report(s, page, object, "Redzone",
				888	object - s->red_left_pad, val, s->red_left_pad))
				889	return 0;
				890
				891	if (!check_bytes_and_report(s, page, object, "Redzone",
				892	endobject, val, s->inuse - s->object_size))
				893	return 0;
				894	} else {
				895	if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
				896	check_bytes_and_report(s, page, p, "Alignment padding",
				897	endobject, POISON_INUSE,
				898	s->inuse - s->object_size);
				899	}
				900	}
				901
				902	if (s->flags & SLAB_POISON) {
				903	if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
				904	(!check_bytes_and_report(s, page, p, "Poison", p,
				905	POISON_FREE, s->object_size - 1) \|\|
				906	!check_bytes_and_report(s, page, p, "Poison",
				907	p + s->object_size - 1, POISON_END, 1)))
				908	return 0;
				909	/*
				910	* check_pad_bytes cleans up on its own.
				911	*/
				912	check_pad_bytes(s, page, p);
				913	}
				914
				915	if (!s->offset && val == SLUB_RED_ACTIVE)
				916	/*
				917	* Object and freepointer overlap. Cannot check
				918	* freepointer while object is allocated.
				919	*/
				920	return 1;
				921
				922	/* Check free pointer validity */
				923	if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
				924	object_err(s, page, p, "Freepointer corrupt");
				925	/*
				926	* No choice but to zap it and thus lose the remainder
				927	* of the free objects in this slab. May cause
				928	* another error because the object count is now wrong.
				929	*/
				930	set_freepointer(s, p, NULL);
				931	return 0;
				932	}
				933	return 1;
				934	}
				935
				936	static int check_slab(struct kmem_cache s, struct page page)
				937	{
				938	int maxobj;
				939
				940	VM_BUG_ON(!irqs_disabled());
				941
				942	if (!PageSlab(page)) {
				943	slab_err(s, page, "Not a valid slab page");
				944	return 0;
				945	}
				946
				947	maxobj = order_objects(compound_order(page), s->size, s->reserved);
				948	if (page->objects > maxobj) {
				949	slab_err(s, page, "objects %u > max %u",
				950	page->objects, maxobj);
				951	return 0;
				952	}
				953	if (page->inuse > page->objects) {
				954	slab_err(s, page, "inuse %u > max %u",
				955	page->inuse, page->objects);
				956	return 0;
				957	}
				958	/* Slab_pad_check fixes things up after itself */
				959	slab_pad_check(s, page);
				960	return 1;
				961	}
				962
				963	/*
				964	* Determine if a certain object on a page is on the freelist. Must hold the
				965	* slab lock to guarantee that the chains are in a consistent state.
				966	*/
				967	static int on_freelist(struct kmem_cache s, struct page page, void *search)
				968	{
				969	int nr = 0;
				970	void *fp;
				971	void *object = NULL;
				972	int max_objects;
				973
				974	fp = page->freelist;
				975	while (fp && nr <= page->objects) {
				976	if (fp == search)
				977	return 1;
				978	if (!check_valid_pointer(s, page, fp)) {
				979	if (object) {
				980	object_err(s, page, object,
				981	"Freechain corrupt");
				982	set_freepointer(s, object, NULL);
				983	} else {
				984	slab_err(s, page, "Freepointer corrupt");
				985	page->freelist = NULL;
				986	page->inuse = page->objects;
				987	slab_fix(s, "Freelist cleared");
				988	return 0;
				989	}
				990	break;
				991	}
				992	object = fp;
				993	fp = get_freepointer(s, object);
				994	nr++;
				995	}
				996
				997	max_objects = order_objects(compound_order(page), s->size, s->reserved);
				998	if (max_objects > MAX_OBJS_PER_PAGE)
				999	max_objects = MAX_OBJS_PER_PAGE;
				1000
				1001	if (page->objects != max_objects) {
				1002	slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
				1003	page->objects, max_objects);
				1004	page->objects = max_objects;
				1005	slab_fix(s, "Number of objects adjusted.");
				1006	}
				1007	if (page->inuse != page->objects - nr) {
				1008	slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
				1009	page->inuse, page->objects - nr);
				1010	page->inuse = page->objects - nr;
				1011	slab_fix(s, "Object count adjusted.");
				1012	}
				1013	return search == NULL;
				1014	}
				1015
				1016	static void trace(struct kmem_cache s, struct page page, void *object,
				1017	int alloc)
				1018	{
				1019	if (s->flags & SLAB_TRACE) {
				1020	pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
				1021	s->name,
				1022	alloc ? "alloc" : "free",
				1023	object, page->inuse,
				1024	page->freelist);
				1025
				1026	if (!alloc)
				1027	print_section(KERN_INFO, "Object ", (void *)object,
				1028	s->object_size);
				1029
				1030	dump_stack();
				1031	}
				1032	}
				1033
				1034	/*
				1035	* Tracking of fully allocated slabs for debugging purposes.
				1036	*/
				1037	static void add_full(struct kmem_cache *s,
				1038	struct kmem_cache_node n, struct page page)
				1039	{
				1040	if (!(s->flags & SLAB_STORE_USER))
				1041	return;
				1042
				1043	lockdep_assert_held(&n->list_lock);
				1044	list_add(&page->lru, &n->full);
				1045	}
				1046
				1047	static void remove_full(struct kmem_cache s, struct kmem_cache_node n, struct page *page)
				1048	{
				1049	if (!(s->flags & SLAB_STORE_USER))
				1050	return;
				1051
				1052	lockdep_assert_held(&n->list_lock);
				1053	list_del(&page->lru);
				1054	}
				1055
				1056	/* Tracking of the number of slabs for debugging purposes */
				1057	static inline unsigned long slabs_node(struct kmem_cache *s, int node)
				1058	{
				1059	struct kmem_cache_node *n = get_node(s, node);
				1060
				1061	return atomic_long_read(&n->nr_slabs);
				1062	}
				1063
				1064	static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
				1065	{
				1066	return atomic_long_read(&n->nr_slabs);
				1067	}
				1068
				1069	static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
				1070	{
				1071	struct kmem_cache_node *n = get_node(s, node);
				1072
				1073	/*
				1074	* May be called early in order to allocate a slab for the
				1075	* kmem_cache_node structure. Solve the chicken-egg
				1076	* dilemma by deferring the increment of the count during
				1077	* bootstrap (see early_kmem_cache_node_alloc).
				1078	*/
				1079	if (likely(n)) {
				1080	atomic_long_inc(&n->nr_slabs);
				1081	atomic_long_add(objects, &n->total_objects);
				1082	}
				1083	}
				1084	static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
				1085	{
				1086	struct kmem_cache_node *n = get_node(s, node);
				1087
				1088	atomic_long_dec(&n->nr_slabs);
				1089	atomic_long_sub(objects, &n->total_objects);
				1090	}
				1091
				1092	/* Object debug checks for alloc/free paths */
				1093	static void setup_object_debug(struct kmem_cache s, struct page page,
				1094	void *object)
				1095	{
				1096	if (!(s->flags & (SLAB_STORE_USER\|SLAB_RED_ZONE\|__OBJECT_POISON)))
				1097	return;
				1098
				1099	init_object(s, object, SLUB_RED_INACTIVE);
				1100	init_tracking(s, object);
				1101	}
				1102
				1103	static inline int alloc_consistency_checks(struct kmem_cache *s,
				1104	struct page *page,
				1105	void *object, unsigned long addr)
				1106	{
				1107	if (!check_slab(s, page))
				1108	return 0;
				1109
				1110	if (!check_valid_pointer(s, page, object)) {
				1111	object_err(s, page, object, "Freelist Pointer check fails");
				1112	return 0;
				1113	}
				1114
				1115	if (!check_object(s, page, object, SLUB_RED_INACTIVE))
				1116	return 0;
				1117
				1118	return 1;
				1119	}
				1120
				1121	static noinline int alloc_debug_processing(struct kmem_cache *s,
				1122	struct page *page,
				1123	void *object, unsigned long addr)
				1124	{
				1125	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
				1126	if (!alloc_consistency_checks(s, page, object, addr))
				1127	goto bad;
				1128	}
				1129
				1130	/* Success perform special debug activities for allocs */
				1131	if (s->flags & SLAB_STORE_USER)
				1132	set_track(s, object, TRACK_ALLOC, addr);
				1133	trace(s, page, object, 1);
				1134	init_object(s, object, SLUB_RED_ACTIVE);
				1135	return 1;
				1136
				1137	bad:
				1138	if (PageSlab(page)) {
				1139	/*
				1140	* If this is a slab page then lets do the best we can
				1141	* to avoid issues in the future. Marking all objects
				1142	* as used avoids touching the remaining objects.
				1143	*/
				1144	slab_fix(s, "Marking all objects used");
				1145	page->inuse = page->objects;
				1146	page->freelist = NULL;
				1147	}
				1148	return 0;
				1149	}
				1150
				1151	static inline int free_consistency_checks(struct kmem_cache *s,
				1152	struct page page, void object, unsigned long addr)
				1153	{
				1154	if (!check_valid_pointer(s, page, object)) {
				1155	slab_err(s, page, "Invalid object pointer 0x%p", object);
				1156	return 0;
				1157	}
				1158
				1159	if (on_freelist(s, page, object)) {
				1160	object_err(s, page, object, "Object already free");
				1161	return 0;
				1162	}
				1163
				1164	if (!check_object(s, page, object, SLUB_RED_ACTIVE))
				1165	return 0;
				1166
				1167	if (unlikely(s != page->slab_cache)) {
				1168	if (!PageSlab(page)) {
				1169	slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
				1170	object);
				1171	} else if (!page->slab_cache) {
				1172	pr_err("SLUB <none>: no slab for object 0x%p.\n",
				1173	object);
				1174	dump_stack();
				1175	} else
				1176	object_err(s, page, object,
				1177	"page slab pointer corrupt.");
				1178	return 0;
				1179	}
				1180	return 1;
				1181	}
				1182
				1183	/* Supports checking bulk free of a constructed freelist */
				1184	static noinline int free_debug_processing(
				1185	struct kmem_cache s, struct page page,
				1186	void head, void tail, int bulk_cnt,
				1187	unsigned long addr)
				1188	{
				1189	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				1190	void *object = head;
				1191	int cnt = 0;
				1192	unsigned long uninitialized_var(flags);
				1193	int ret = 0;
				1194
				1195	spin_lock_irqsave(&n->list_lock, flags);
				1196	slab_lock(page);
				1197
				1198	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
				1199	if (!check_slab(s, page))
				1200	goto out;
				1201	}
				1202
				1203	next_object:
				1204	cnt++;
				1205
				1206	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
				1207	if (!free_consistency_checks(s, page, object, addr))
				1208	goto out;
				1209	}
				1210
				1211	if (s->flags & SLAB_STORE_USER)
				1212	set_track(s, object, TRACK_FREE, addr);
				1213	trace(s, page, object, 0);
				1214	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
				1215	init_object(s, object, SLUB_RED_INACTIVE);
				1216
				1217	/* Reached end of constructed freelist yet? */
				1218	if (object != tail) {
				1219	object = get_freepointer(s, object);
				1220	goto next_object;
				1221	}
				1222	ret = 1;
				1223
				1224	out:
				1225	if (cnt != bulk_cnt)
				1226	slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
				1227	bulk_cnt, cnt);
				1228
				1229	slab_unlock(page);
				1230	spin_unlock_irqrestore(&n->list_lock, flags);
				1231	if (!ret)
				1232	slab_fix(s, "Object at 0x%p not freed", object);
				1233	return ret;
				1234	}
				1235
				1236	static int __init setup_slub_debug(char *str)
				1237	{
				1238	slub_debug = DEBUG_DEFAULT_FLAGS;
				1239	if (str++ != '=' \|\| !str)
				1240	/*
				1241	* No options specified. Switch on full debugging.
				1242	*/
				1243	goto out;
				1244
				1245	if (*str == ',')
				1246	/*
				1247	* No options but restriction on slabs. This means full
				1248	* debugging for slabs matching a pattern.
				1249	*/
				1250	goto check_slabs;
				1251
				1252	slub_debug = 0;
				1253	if (*str == '-')
				1254	/*
				1255	* Switch off all debugging measures.
				1256	*/
				1257	goto out;
				1258
				1259	/*
				1260	* Determine which debug features should be switched on
				1261	*/
				1262	for (; str && str != ','; str++) {
				1263	switch (tolower(*str)) {
				1264	case 'f':
				1265	slub_debug \|= SLAB_CONSISTENCY_CHECKS;
				1266	break;
				1267	case 'z':
				1268	slub_debug \|= SLAB_RED_ZONE;
				1269	break;
				1270	case 'p':
				1271	slub_debug \|= SLAB_POISON;
				1272	break;
				1273	case 'u':
				1274	slub_debug \|= SLAB_STORE_USER;
				1275	break;
				1276	case 't':
				1277	slub_debug \|= SLAB_TRACE;
				1278	break;
				1279	case 'a':
				1280	slub_debug \|= SLAB_FAILSLAB;
				1281	break;
				1282	case 'o':
				1283	/*
				1284	* Avoid enabling debugging on caches if its minimum
				1285	* order would increase as a result.
				1286	*/
				1287	disable_higher_order_debug = 1;
				1288	break;
				1289	default:
				1290	pr_err("slub_debug option '%c' unknown. skipped\n",
				1291	*str);
				1292	}
				1293	}
				1294
				1295	check_slabs:
				1296	if (*str == ',')
				1297	slub_debug_slabs = str + 1;
				1298	out:
				1299	return 1;
				1300	}
				1301
				1302	__setup("slub_debug", setup_slub_debug);
				1303
				1304	unsigned long kmem_cache_flags(unsigned long object_size,
				1305	unsigned long flags, const char *name,
				1306	void (ctor)(void ))
				1307	{
				1308	/*
				1309	* Enable debugging if selected on the kernel commandline.
				1310	*/
				1311	if (slub_debug && (!slub_debug_slabs \|\| (name &&
				1312	!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
				1313	flags \|= slub_debug;
				1314
				1315	return flags;
				1316	}
				1317	#else /* !CONFIG_SLUB_DEBUG */
				1318	static inline void setup_object_debug(struct kmem_cache *s,
				1319	struct page page, void object) {}
				1320
				1321	static inline int alloc_debug_processing(struct kmem_cache *s,
				1322	struct page page, void object, unsigned long addr) { return 0; }
				1323
				1324	static inline int free_debug_processing(
				1325	struct kmem_cache s, struct page page,
				1326	void head, void tail, int bulk_cnt,
				1327	unsigned long addr) { return 0; }
				1328
				1329	static inline int slab_pad_check(struct kmem_cache s, struct page page)
				1330	{ return 1; }
				1331	static inline int check_object(struct kmem_cache s, struct page page,
				1332	void *object, u8 val) { return 1; }
				1333	static inline void add_full(struct kmem_cache s, struct kmem_cache_node n,
				1334	struct page *page) {}
				1335	static inline void remove_full(struct kmem_cache s, struct kmem_cache_node n,
				1336	struct page *page) {}
				1337	unsigned long kmem_cache_flags(unsigned long object_size,
				1338	unsigned long flags, const char *name,
				1339	void (ctor)(void ))
				1340	{
				1341	return flags;
				1342	}
				1343	#define slub_debug 0
				1344
				1345	#define disable_higher_order_debug 0
				1346
				1347	static inline unsigned long slabs_node(struct kmem_cache *s, int node)
				1348	{ return 0; }
				1349	static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
				1350	{ return 0; }
				1351	static inline void inc_slabs_node(struct kmem_cache *s, int node,
				1352	int objects) {}
				1353	static inline void dec_slabs_node(struct kmem_cache *s, int node,
				1354	int objects) {}
				1355
				1356	static bool freelist_corrupted(struct kmem_cache s, struct page page,
				1357	void *freelist, void nextfree)
				1358	{
				1359	return false;
				1360	}
				1361	#endif /* CONFIG_SLUB_DEBUG */
				1362
				1363	/*
				1364	* Hooks for other subsystems that check memory allocations. In a typical
				1365	* production configuration these hooks all should produce no code at all.
				1366	*/
				1367	static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
				1368	{
				1369	kmemleak_alloc(ptr, size, 1, flags);
				1370	kasan_kmalloc_large(ptr, size, flags);
				1371	}
				1372
				1373	static inline void kfree_hook(const void *x)
				1374	{
				1375	kmemleak_free(x);
				1376	kasan_kfree_large(x);
				1377	}
				1378
				1379	static inline void slab_free_hook(struct kmem_cache s, void *x)
				1380	{
				1381	void *freeptr;
				1382
				1383	kmemleak_free_recursive(x, s->flags);
				1384
				1385	/*
				1386	* Trouble is that we may no longer disable interrupts in the fast path
				1387	* So in order to make the debug calls that expect irqs to be
				1388	* disabled we need to disable interrupts temporarily.
				1389	*/
				1390	#ifdef CONFIG_LOCKDEP
				1391	{
				1392	unsigned long flags;
				1393
				1394	local_irq_save(flags);
				1395	debug_check_no_locks_freed(x, s->object_size);
				1396	local_irq_restore(flags);
				1397	}
				1398	#endif
				1399	if (!(s->flags & SLAB_DEBUG_OBJECTS))
				1400	debug_check_no_obj_freed(x, s->object_size);
				1401
				1402	freeptr = get_freepointer(s, x);
				1403	/*
				1404	* kasan_slab_free() may put x into memory quarantine, delaying its
				1405	* reuse. In this case the object's freelist pointer is changed.
				1406	*/
				1407	kasan_slab_free(s, x);
				1408	return freeptr;
				1409	}
				1410
				1411	static inline void slab_free_freelist_hook(struct kmem_cache *s,
				1412	void head, void tail)
				1413	{
				1414	/*
				1415	* Compiler cannot detect this function can be removed if slab_free_hook()
				1416	* evaluates to nothing. Thus, catch all relevant config debug options here.
				1417	*/
				1418	#if defined(CONFIG_LOCKDEP) \|\| \
				1419	defined(CONFIG_DEBUG_KMEMLEAK) \|\| \
				1420	defined(CONFIG_DEBUG_OBJECTS_FREE) \|\| \
				1421	defined(CONFIG_KASAN)
				1422
				1423	void *object = head;
				1424	void *tail_obj = tail ? : head;
				1425	void *freeptr;
				1426
				1427	do {
				1428	freeptr = slab_free_hook(s, object);
				1429	} while ((object != tail_obj) && (object = freeptr));
				1430	#endif
				1431	}
				1432
				1433	static void setup_object(struct kmem_cache s, struct page page,
				1434	void *object)
				1435	{
				1436	setup_object_debug(s, page, object);
				1437	kasan_init_slab_obj(s, object);
				1438	if (unlikely(s->ctor)) {
				1439	kasan_unpoison_object_data(s, object);
				1440	s->ctor(object);
				1441	kasan_poison_object_data(s, object);
				1442	}
				1443	}
				1444
				1445	/*
				1446	* Slab allocation and freeing
				1447	*/
				1448	static inline struct page alloc_slab_page(struct kmem_cache s,
				1449	gfp_t flags, int node, struct kmem_cache_order_objects oo)
				1450	{
				1451	struct page *page;
				1452	int order = oo_order(oo);
				1453
				1454	if (node == NUMA_NO_NODE)
				1455	page = alloc_pages(flags, order);
				1456	else
				1457	page = __alloc_pages_node(node, flags, order);
				1458
				1459	if (page && memcg_charge_slab(page, flags, order, s)) {
				1460	__free_pages(page, order);
				1461	page = NULL;
				1462	}
				1463
				1464	return page;
				1465	}
				1466
				1467	#ifdef CONFIG_SLAB_FREELIST_RANDOM
				1468	/* Pre-initialize the random sequence cache */
				1469	static int init_cache_random_seq(struct kmem_cache *s)
				1470	{
				1471	int err;
				1472	unsigned long i, count = oo_objects(s->oo);
				1473
				1474	/* Bailout if already initialised */
				1475	if (s->random_seq)
				1476	return 0;
				1477
				1478	err = cache_random_seq_create(s, count, GFP_KERNEL);
				1479	if (err) {
				1480	pr_err("SLUB: Unable to initialize free list for %s\n",
				1481	s->name);
				1482	return err;
				1483	}
				1484
				1485	/* Transform to an offset on the set of pages */
				1486	if (s->random_seq) {
				1487	for (i = 0; i < count; i++)
				1488	s->random_seq[i] *= s->size;
				1489	}
				1490	return 0;
				1491	}
				1492
				1493	/* Initialize each random sequence freelist per cache */
				1494	static void __init init_freelist_randomization(void)
				1495	{
				1496	struct kmem_cache *s;
				1497
				1498	mutex_lock(&slab_mutex);
				1499
				1500	list_for_each_entry(s, &slab_caches, list)
				1501	init_cache_random_seq(s);
				1502
				1503	mutex_unlock(&slab_mutex);
				1504	}
				1505
				1506	/* Get the next entry on the pre-computed freelist randomized */
				1507	static void next_freelist_entry(struct kmem_cache s, struct page *page,
				1508	unsigned long pos, void start,
				1509	unsigned long page_limit,
				1510	unsigned long freelist_count)
				1511	{
				1512	unsigned int idx;
				1513
				1514	/*
				1515	* If the target page allocation failed, the number of objects on the
				1516	* page might be smaller than the usual size defined by the cache.
				1517	*/
				1518	do {
				1519	idx = s->random_seq[*pos];
				1520	*pos += 1;
				1521	if (*pos >= freelist_count)
				1522	*pos = 0;
				1523	} while (unlikely(idx >= page_limit));
				1524
				1525	return (char *)start + idx;
				1526	}
				1527
				1528	/* Shuffle the single linked freelist based on a random pre-computed sequence */
				1529	static bool shuffle_freelist(struct kmem_cache s, struct page page)
				1530	{
				1531	void *start;
				1532	void *cur;
				1533	void *next;
				1534	unsigned long idx, pos, page_limit, freelist_count;
				1535
				1536	if (page->objects < 2 \|\| !s->random_seq)
				1537	return false;
				1538
				1539	freelist_count = oo_objects(s->oo);
				1540	pos = get_random_int() % freelist_count;
				1541
				1542	page_limit = page->objects * s->size;
				1543	start = fixup_red_left(s, page_address(page));
				1544
				1545	/* First entry is used as the base of the freelist */
				1546	cur = next_freelist_entry(s, page, &pos, start, page_limit,
				1547	freelist_count);
				1548	page->freelist = cur;
				1549
				1550	for (idx = 1; idx < page->objects; idx++) {
				1551	setup_object(s, page, cur);
				1552	next = next_freelist_entry(s, page, &pos, start, page_limit,
				1553	freelist_count);
				1554	set_freepointer(s, cur, next);
				1555	cur = next;
				1556	}
				1557	setup_object(s, page, cur);
				1558	set_freepointer(s, cur, NULL);
				1559
				1560	return true;
				1561	}
				1562	#else
				1563	static inline int init_cache_random_seq(struct kmem_cache *s)
				1564	{
				1565	return 0;
				1566	}
				1567	static inline void init_freelist_randomization(void) { }
				1568	static inline bool shuffle_freelist(struct kmem_cache s, struct page page)
				1569	{
				1570	return false;
				1571	}
				1572	#endif /* CONFIG_SLAB_FREELIST_RANDOM */
				1573
				1574	static struct page allocate_slab(struct kmem_cache s, gfp_t flags, int node)
				1575	{
				1576	struct page *page;
				1577	struct kmem_cache_order_objects oo = s->oo;
				1578	gfp_t alloc_gfp;
				1579	void start, p;
				1580	int idx, order;
				1581	bool shuffle;
				1582
				1583	flags &= gfp_allowed_mask;
				1584
				1585	if (gfpflags_allow_blocking(flags))
				1586	local_irq_enable();
				1587
				1588	flags \|= s->allocflags;
				1589
				1590	/*
				1591	* Let the initial higher-order allocation fail under memory pressure
				1592	* so we fall-back to the minimum order allocation.
				1593	*/
				1594	alloc_gfp = (flags \| __GFP_NOWARN \| __GFP_NORETRY) & ~__GFP_NOFAIL;
				1595	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
				1596	alloc_gfp = (alloc_gfp \| __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM\|__GFP_NOFAIL);
				1597
				1598	page = alloc_slab_page(s, alloc_gfp, node, oo);
				1599	if (unlikely(!page)) {
				1600	oo = s->min;
				1601	alloc_gfp = flags;
				1602	/*
				1603	* Allocation may have failed due to fragmentation.
				1604	* Try a lower order alloc if possible
				1605	*/
				1606	page = alloc_slab_page(s, alloc_gfp, node, oo);
				1607	if (unlikely(!page))
				1608	goto out;
				1609	stat(s, ORDER_FALLBACK);
				1610	}
				1611
				1612	page->objects = oo_objects(oo);
				1613
				1614	order = compound_order(page);
				1615	page->slab_cache = s;
				1616	__SetPageSlab(page);
				1617	if (page_is_pfmemalloc(page))
				1618	SetPageSlabPfmemalloc(page);
				1619
				1620	start = page_address(page);
				1621
				1622	if (unlikely(s->flags & SLAB_POISON))
				1623	memset(start, POISON_INUSE, PAGE_SIZE << order);
				1624
				1625	kasan_poison_slab(page);
				1626
				1627	shuffle = shuffle_freelist(s, page);
				1628
				1629	if (!shuffle) {
				1630	for_each_object_idx(p, idx, s, start, page->objects) {
				1631	setup_object(s, page, p);
				1632	if (likely(idx < page->objects))
				1633	set_freepointer(s, p, p + s->size);
				1634	else
				1635	set_freepointer(s, p, NULL);
				1636	}
				1637	page->freelist = fixup_red_left(s, start);
				1638	}
				1639
				1640	page->inuse = page->objects;
				1641	page->frozen = 1;
				1642
				1643	out:
				1644	if (gfpflags_allow_blocking(flags))
				1645	local_irq_disable();
				1646	if (!page)
				1647	return NULL;
				1648
				1649	mod_lruvec_page_state(page,
				1650	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				1651	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				1652	1 << oo_order(oo));
				1653
				1654	inc_slabs_node(s, page_to_nid(page), page->objects);
				1655
				1656	return page;
				1657	}
				1658
				1659	static struct page new_slab(struct kmem_cache s, gfp_t flags, int node)
				1660	{
				1661	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
				1662	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
				1663	flags &= ~GFP_SLAB_BUG_MASK;
				1664	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
				1665	invalid_mask, &invalid_mask, flags, &flags);
				1666	dump_stack();
				1667	}
				1668
				1669	return allocate_slab(s,
				1670	flags & (GFP_RECLAIM_MASK \| GFP_CONSTRAINT_MASK), node);
				1671	}
				1672
				1673	static void __free_slab(struct kmem_cache s, struct page page)
				1674	{
				1675	int order = compound_order(page);
				1676	int pages = 1 << order;
				1677
				1678	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
				1679	void *p;
				1680
				1681	slab_pad_check(s, page);
				1682	for_each_object(p, s, page_address(page),
				1683	page->objects)
				1684	check_object(s, page, p, SLUB_RED_INACTIVE);
				1685	}
				1686
				1687	mod_lruvec_page_state(page,
				1688	(s->flags & SLAB_RECLAIM_ACCOUNT) ?
				1689	NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
				1690	-pages);
				1691
				1692	__ClearPageSlabPfmemalloc(page);
				1693	__ClearPageSlab(page);
				1694
				1695	page_mapcount_reset(page);
				1696	if (current->reclaim_state)
				1697	current->reclaim_state->reclaimed_slab += pages;
				1698	memcg_uncharge_slab(page, order, s);
				1699	__free_pages(page, order);
				1700	}
				1701
				1702	#define need_reserve_slab_rcu \
				1703	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
				1704
				1705	static void rcu_free_slab(struct rcu_head *h)
				1706	{
				1707	struct page *page;
				1708
				1709	if (need_reserve_slab_rcu)
				1710	page = virt_to_head_page(h);
				1711	else
				1712	page = container_of((struct list_head *)h, struct page, lru);
				1713
				1714	__free_slab(page->slab_cache, page);
				1715	}
				1716
				1717	static void free_slab(struct kmem_cache s, struct page page)
				1718	{
				1719	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
				1720	struct rcu_head *head;
				1721
				1722	if (need_reserve_slab_rcu) {
				1723	int order = compound_order(page);
				1724	int offset = (PAGE_SIZE << order) - s->reserved;
				1725
				1726	VM_BUG_ON(s->reserved != sizeof(*head));
				1727	head = page_address(page) + offset;
				1728	} else {
				1729	head = &page->rcu_head;
				1730	}
				1731
				1732	call_rcu(head, rcu_free_slab);
				1733	} else
				1734	__free_slab(s, page);
				1735	}
				1736
				1737	static void discard_slab(struct kmem_cache s, struct page page)
				1738	{
				1739	dec_slabs_node(s, page_to_nid(page), page->objects);
				1740	free_slab(s, page);
				1741	}
				1742
				1743	/*
				1744	* Management of partially allocated slabs.
				1745	*/
				1746	static inline void
				1747	__add_partial(struct kmem_cache_node n, struct page page, int tail)
				1748	{
				1749	n->nr_partial++;
				1750	if (tail == DEACTIVATE_TO_TAIL)
				1751	list_add_tail(&page->lru, &n->partial);
				1752	else
				1753	list_add(&page->lru, &n->partial);
				1754	}
				1755
				1756	static inline void add_partial(struct kmem_cache_node *n,
				1757	struct page *page, int tail)
				1758	{
				1759	lockdep_assert_held(&n->list_lock);
				1760	__add_partial(n, page, tail);
				1761	}
				1762
				1763	static inline void remove_partial(struct kmem_cache_node *n,
				1764	struct page *page)
				1765	{
				1766	lockdep_assert_held(&n->list_lock);
				1767	list_del(&page->lru);
				1768	n->nr_partial--;
				1769	}
				1770
				1771	/*
				1772	* Remove slab from the partial list, freeze it and
				1773	* return the pointer to the freelist.
				1774	*
				1775	* Returns a list of objects or NULL if it fails.
				1776	*/
				1777	static inline void acquire_slab(struct kmem_cache s,
				1778	struct kmem_cache_node n, struct page page,
				1779	int mode, int *objects)
				1780	{
				1781	void *freelist;
				1782	unsigned long counters;
				1783	struct page new;
				1784
				1785	lockdep_assert_held(&n->list_lock);
				1786
				1787	/*
				1788	* Zap the freelist and set the frozen bit.
				1789	* The old freelist is the list of objects for the
				1790	* per cpu allocation list.
				1791	*/
				1792	freelist = page->freelist;
				1793	counters = page->counters;
				1794	new.counters = counters;
				1795	*objects = new.objects - new.inuse;
				1796	if (mode) {
				1797	new.inuse = page->objects;
				1798	new.freelist = NULL;
				1799	} else {
				1800	new.freelist = freelist;
				1801	}
				1802
				1803	VM_BUG_ON(new.frozen);
				1804	new.frozen = 1;
				1805
				1806	if (!__cmpxchg_double_slab(s, page,
				1807	freelist, counters,
				1808	new.freelist, new.counters,
				1809	"acquire_slab"))
				1810	return NULL;
				1811
				1812	remove_partial(n, page);
				1813	WARN_ON(!freelist);
				1814	return freelist;
				1815	}
				1816
				1817	static void put_cpu_partial(struct kmem_cache s, struct page page, int drain);
				1818	static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
				1819
				1820	/*
				1821	* Try to allocate a partial slab from a specific node.
				1822	*/
				1823	static void get_partial_node(struct kmem_cache s, struct kmem_cache_node *n,
				1824	struct kmem_cache_cpu *c, gfp_t flags)
				1825	{
				1826	struct page page, page2;
				1827	void *object = NULL;
				1828	unsigned int available = 0;
				1829	int objects;
				1830
				1831	/*
				1832	* Racy check. If we mistakenly see no partial slabs then we
				1833	* just allocate an empty slab. If we mistakenly try to get a
				1834	* partial slab and there is none available then get_partials()
				1835	* will return NULL.
				1836	*/
				1837	if (!n \|\| !n->nr_partial)
				1838	return NULL;
				1839
				1840	spin_lock(&n->list_lock);
				1841	list_for_each_entry_safe(page, page2, &n->partial, lru) {
				1842	void *t;
				1843
				1844	if (!pfmemalloc_match(page, flags))
				1845	continue;
				1846
				1847	t = acquire_slab(s, n, page, object == NULL, &objects);
				1848	if (!t)
				1849	break;
				1850
				1851	available += objects;
				1852	if (!object) {
				1853	c->page = page;
				1854	stat(s, ALLOC_FROM_PARTIAL);
				1855	object = t;
				1856	} else {
				1857	put_cpu_partial(s, page, 0);
				1858	stat(s, CPU_PARTIAL_NODE);
				1859	}
				1860	if (!kmem_cache_has_cpu_partial(s)
				1861	\|\| available > slub_cpu_partial(s) / 2)
				1862	break;
				1863
				1864	}
				1865	spin_unlock(&n->list_lock);
				1866	return object;
				1867	}
				1868
				1869	/*
				1870	* Get a page from somewhere. Search in increasing NUMA distances.
				1871	*/
				1872	static void get_any_partial(struct kmem_cache s, gfp_t flags,
				1873	struct kmem_cache_cpu *c)
				1874	{
				1875	#ifdef CONFIG_NUMA
				1876	struct zonelist *zonelist;
				1877	struct zoneref *z;
				1878	struct zone *zone;
				1879	enum zone_type high_zoneidx = gfp_zone(flags);
				1880	void *object;
				1881	unsigned int cpuset_mems_cookie;
				1882
				1883	/*
				1884	* The defrag ratio allows a configuration of the tradeoffs between
				1885	* inter node defragmentation and node local allocations. A lower
				1886	* defrag_ratio increases the tendency to do local allocations
				1887	* instead of attempting to obtain partial slabs from other nodes.
				1888	*
				1889	* If the defrag_ratio is set to 0 then kmalloc() always
				1890	* returns node local objects. If the ratio is higher then kmalloc()
				1891	* may return off node objects because partial slabs are obtained
				1892	* from other nodes and filled up.
				1893	*
				1894	* If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
				1895	* (which makes defrag_ratio = 1000) then every (well almost)
				1896	* allocation will first attempt to defrag slab caches on other nodes.
				1897	* This means scanning over all nodes to look for partial slabs which
				1898	* may be expensive if we do it every time we are trying to find a slab
				1899	* with available objects.
				1900	*/
				1901	if (!s->remote_node_defrag_ratio \|\|
				1902	get_cycles() % 1024 > s->remote_node_defrag_ratio)
				1903	return NULL;
				1904
				1905	do {
				1906	cpuset_mems_cookie = read_mems_allowed_begin();
				1907	zonelist = node_zonelist(mempolicy_slab_node(), flags);
				1908	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
				1909	struct kmem_cache_node *n;
				1910
				1911	n = get_node(s, zone_to_nid(zone));
				1912
				1913	if (n && cpuset_zone_allowed(zone, flags) &&
				1914	n->nr_partial > s->min_partial) {
				1915	object = get_partial_node(s, n, c, flags);
				1916	if (object) {
				1917	/*
				1918	* Don't check read_mems_allowed_retry()
				1919	* here - if mems_allowed was updated in
				1920	* parallel, that was a harmless race
				1921	* between allocation and the cpuset
				1922	* update
				1923	*/
				1924	return object;
				1925	}
				1926	}
				1927	}
				1928	} while (read_mems_allowed_retry(cpuset_mems_cookie));
				1929	#endif
				1930	return NULL;
				1931	}
				1932
				1933	/*
				1934	* Get a partial page, lock it and return it.
				1935	*/
				1936	static void get_partial(struct kmem_cache s, gfp_t flags, int node,
				1937	struct kmem_cache_cpu *c)
				1938	{
				1939	void *object;
				1940	int searchnode = node;
				1941
				1942	if (node == NUMA_NO_NODE)
				1943	searchnode = numa_mem_id();
				1944
				1945	object = get_partial_node(s, get_node(s, searchnode), c, flags);
				1946	if (object \|\| node != NUMA_NO_NODE)
				1947	return object;
				1948
				1949	return get_any_partial(s, flags, c);
				1950	}
				1951
				#ifdef CONFIG_PREEMPT
				/*
				 * Calculate the next globally unique transaction for disambiguation
				 * during cmpxchg. The transactions start with the cpu number and are then
				 * incremented by CONFIG_NR_CPUS.
				 */
				#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
				#else
				/*
				 * No preemption supported therefore also no need to check for
				 * different cpus.
				 */
				#define TID_STEP 1
				#endif

				/* Advance a transaction id by one step. */
				static inline unsigned long next_tid(unsigned long tid)
				{
					const unsigned long step = TID_STEP;

					return tid + step;
				}

				/* Extract the cpu part of a transaction id (remainder modulo TID_STEP). */
				static inline unsigned int tid_to_cpu(unsigned long tid)
				{
					const unsigned long step = TID_STEP;

					return tid % step;
				}

				/* Extract the event counter part of a transaction id (tid / TID_STEP). */
				static inline unsigned long tid_to_event(unsigned long tid)
				{
					const unsigned long step = TID_STEP;

					return tid / step;
				}

				/* Initial transaction id of a cpu: the cpu number itself. */
				static inline unsigned int init_tid(int cpu)
				{
					unsigned int first_tid = cpu;

					return first_tid;
				}
				1986
				1987	static inline void note_cmpxchg_failure(const char *n,
				1988	const struct kmem_cache *s, unsigned long tid)
				1989	{
				1990	#ifdef SLUB_DEBUG_CMPXCHG
				1991	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
				1992
				1993	pr_info("%s %s: cmpxchg redo ", n, s->name);
				1994
				1995	#ifdef CONFIG_PREEMPT
				1996	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
				1997	pr_warn("due to cpu change %d -> %d\n",
				1998	tid_to_cpu(tid), tid_to_cpu(actual_tid));
				1999	else
				2000	#endif
				2001	if (tid_to_event(tid) != tid_to_event(actual_tid))
				2002	pr_warn("due to cpu running other code. Event %ld->%ld\n",
				2003	tid_to_event(tid), tid_to_event(actual_tid));
				2004	else
				2005	pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
				2006	actual_tid, tid, next_tid(tid));
				2007	#endif
				2008	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
				2009	}
				2010
				2011	static void init_kmem_cache_cpus(struct kmem_cache *s)
				2012	{
				2013	int cpu;
				2014
				2015	for_each_possible_cpu(cpu)
				2016	per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
				2017	}
				2018
				2019	/*
				2020	* Remove the cpu slab
				2021	*/
				2022	static void deactivate_slab(struct kmem_cache s, struct page page,
				2023	void freelist, struct kmem_cache_cpu c)
				2024	{
				2025	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
				2026	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
				2027	int lock = 0;
				2028	enum slab_modes l = M_NONE, m = M_NONE;
				2029	void *nextfree;
				2030	int tail = DEACTIVATE_TO_HEAD;
				2031	struct page new;
				2032	struct page old;
				2033
				2034	if (page->freelist) {
				2035	stat(s, DEACTIVATE_REMOTE_FREES);
				2036	tail = DEACTIVATE_TO_TAIL;
				2037	}
				2038
				2039	/*
				2040	* Stage one: Free all available per cpu objects back
				2041	* to the page freelist while it is still frozen. Leave the
				2042	* last one.
				2043	*
				2044	* There is no need to take the list->lock because the page
				2045	* is still frozen.
				2046	*/
				2047	while (freelist && (nextfree = get_freepointer(s, freelist))) {
				2048	void *prior;
				2049	unsigned long counters;
				2050
				2051	/*
				2052	* If 'nextfree' is invalid, it is possible that the object at
				2053	* 'freelist' is already corrupted. So isolate all objects
				2054	* starting at 'freelist'.
				2055	*/
				2056	if (freelist_corrupted(s, page, &freelist, nextfree))
				2057	break;
				2058
				2059	do {
				2060	prior = page->freelist;
				2061	counters = page->counters;
				2062	set_freepointer(s, freelist, prior);
				2063	new.counters = counters;
				2064	new.inuse--;
				2065	VM_BUG_ON(!new.frozen);
				2066
				2067	} while (!__cmpxchg_double_slab(s, page,
				2068	prior, counters,
				2069	freelist, new.counters,
				2070	"drain percpu freelist"));
				2071
				2072	freelist = nextfree;
				2073	}
				2074
				2075	/*
				2076	* Stage two: Ensure that the page is unfrozen while the
				2077	* list presence reflects the actual number of objects
				2078	* during unfreeze.
				2079	*
				2080	* We setup the list membership and then perform a cmpxchg
				2081	* with the count. If there is a mismatch then the page
				2082	* is not unfrozen but the page is on the wrong list.
				2083	*
				2084	* Then we restart the process which may have to remove
				2085	* the page from the list that we just put it on again
				2086	* because the number of objects in the slab may have
				2087	* changed.
				2088	*/
				2089	redo:
				2090
				2091	old.freelist = page->freelist;
				2092	old.counters = page->counters;
				2093	VM_BUG_ON(!old.frozen);
				2094
				2095	/* Determine target state of the slab */
				2096	new.counters = old.counters;
				2097	if (freelist) {
				2098	new.inuse--;
				2099	set_freepointer(s, freelist, old.freelist);
				2100	new.freelist = freelist;
				2101	} else
				2102	new.freelist = old.freelist;
				2103
				2104	new.frozen = 0;
				2105
				2106	if (!new.inuse && n->nr_partial >= s->min_partial)
				2107	m = M_FREE;
				2108	else if (new.freelist) {
				2109	m = M_PARTIAL;
				2110	if (!lock) {
				2111	lock = 1;
				2112	/*
				2113	* Taking the spinlock removes the possiblity
				2114	* that acquire_slab() will see a slab page that
				2115	* is frozen
				2116	*/
				2117	spin_lock(&n->list_lock);
				2118	}
				2119	} else {
				2120	m = M_FULL;
				2121	if (kmem_cache_debug(s) && !lock) {
				2122	lock = 1;
				2123	/*
				2124	* This also ensures that the scanning of full
				2125	* slabs from diagnostic functions will not see
				2126	* any frozen slabs.
				2127	*/
				2128	spin_lock(&n->list_lock);
				2129	}
				2130	}
				2131
				2132	if (l != m) {
				2133
				2134	if (l == M_PARTIAL)
				2135
				2136	remove_partial(n, page);
				2137
				2138	else if (l == M_FULL)
				2139
				2140	remove_full(s, n, page);
				2141
				2142	if (m == M_PARTIAL) {
				2143
				2144	add_partial(n, page, tail);
				2145	stat(s, tail);
				2146
				2147	} else if (m == M_FULL) {
				2148
				2149	stat(s, DEACTIVATE_FULL);
				2150	add_full(s, n, page);
				2151
				2152	}
				2153	}
				2154
				2155	l = m;
				2156	if (!__cmpxchg_double_slab(s, page,
				2157	old.freelist, old.counters,
				2158	new.freelist, new.counters,
				2159	"unfreezing slab"))
				2160	goto redo;
				2161
				2162	if (lock)
				2163	spin_unlock(&n->list_lock);
				2164
				2165	if (m == M_FREE) {
				2166	stat(s, DEACTIVATE_EMPTY);
				2167	discard_slab(s, page);
				2168	stat(s, FREE_SLAB);
				2169	}
				2170
				2171	c->page = NULL;
				2172	c->freelist = NULL;
				2173	}
				2174
				/*
				 * Unfreeze all the cpu partial slabs.
				 *
				 * This function must be called with interrupts disabled
				 * for the cpu using c (or some other guarantee must be there
				 * to guarantee no concurrent accesses).
				 *
				 * @s: cache whose per cpu partial list is drained
				 * @c: per cpu structure holding the partial list
				 *
				 * Each page is unfrozen and added to the tail of its node's partial
				 * list, or discarded after the list locks are dropped when it is empty
				 * and the node already holds at least s->min_partial partial slabs.
				 * Compiles to nothing without CONFIG_SLUB_CPU_PARTIAL.
				 */
				static void unfreeze_partials(struct kmem_cache *s,
						struct kmem_cache_cpu *c)
				{
				#ifdef CONFIG_SLUB_CPU_PARTIAL
					struct kmem_cache_node *n = NULL, *n2 = NULL;
					struct page *page, *discard_page = NULL;

					while ((page = c->partial)) {
						struct page new;
						struct page old;

						c->partial = page->next;

						/* Switch list locks only when the page is on another node. */
						n2 = get_node(s, page_to_nid(page));
						if (n != n2) {
							if (n)
								spin_unlock(&n->list_lock);

							n = n2;
							spin_lock(&n->list_lock);
						}

						do {

							old.freelist = page->freelist;
							old.counters = page->counters;
							VM_BUG_ON(!old.frozen);

							new.counters = old.counters;
							new.freelist = old.freelist;

							new.frozen = 0;

						} while (!__cmpxchg_double_slab(s, page,
								old.freelist, old.counters,
								new.freelist, new.counters,
								"unfreezing slab"));

						if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
							/* Chain empty pages; they are discarded below. */
							page->next = discard_page;
							discard_page = page;
						} else {
							add_partial(n, page, DEACTIVATE_TO_TAIL);
							stat(s, FREE_ADD_PARTIAL);
						}
					}

					if (n)
						spin_unlock(&n->list_lock);

					while (discard_page) {
						page = discard_page;
						discard_page = discard_page->next;

						stat(s, DEACTIVATE_EMPTY);
						discard_slab(s, page);
						stat(s, FREE_SLAB);
					}
				#endif
				}
				2242
				/*
				 * Put a page that was just frozen (in __slab_free) into a partial page
				 * slot if available. This is done without interrupts disabled and without
				 * preemption disabled. The cmpxchg is racy and may put the partial page
				 * onto a random cpus partial slot.
				 *
				 * If we did not find a slot then simply move all the partials to the
				 * per node partial list.
				 *
				 * @s:     cache the page belongs to
				 * @page:  frozen page to push onto this cpu's partial list
				 * @drain: when non-zero, an over-full partial list is first moved to the
				 *         per node lists via unfreeze_partials()
				 *
				 * Compiles to nothing without CONFIG_SLUB_CPU_PARTIAL.
				 */
				static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
				{
				#ifdef CONFIG_SLUB_CPU_PARTIAL
					struct page *oldpage;
					int pages;
					int pobjects;

					preempt_disable();
					do {
						pages = 0;
						pobjects = 0;
						oldpage = this_cpu_read(s->cpu_slab->partial);

						if (oldpage) {
							pobjects = oldpage->pobjects;
							pages = oldpage->pages;
							if (drain && pobjects > s->cpu_partial) {
								unsigned long flags;
								/*
								 * partial array is full. Move the existing
								 * set to the per node partial list.
								 */
								local_irq_save(flags);
								unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
								local_irq_restore(flags);
								oldpage = NULL;
								pobjects = 0;
								pages = 0;
								stat(s, CPU_PARTIAL_DRAIN);
							}
						}

						/* The new head carries the running totals for the whole list. */
						pages++;
						pobjects += page->objects - page->inuse;

						page->pages = pages;
						page->pobjects = pobjects;
						page->next = oldpage;

					} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
											!= oldpage);
					if (unlikely(!s->cpu_partial)) {
						unsigned long flags;

						local_irq_save(flags);
						unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
						local_irq_restore(flags);
					}
					preempt_enable();
				#endif
				}
				2303
				2304	static inline void flush_slab(struct kmem_cache s, struct kmem_cache_cpu c)
				2305	{
				2306	stat(s, CPUSLAB_FLUSH);
				2307	deactivate_slab(s, c->page, c->freelist, c);
				2308
				2309	c->tid = next_tid(c->tid);
				2310	}
				2311
				2312	/*
				2313	* Flush cpu slab.
				2314	*
				2315	* Called from IPI handler with interrupts disabled.
				2316	*/
				2317	static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
				2318	{
				2319	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
				2320
				2321	if (likely(c)) {
				2322	if (c->page)
				2323	flush_slab(s, c);
				2324
				2325	unfreeze_partials(s, c);
				2326	}
				2327	}
				2328
				/*
				 * Flush callback for the executing cpu; @d is the struct kmem_cache to
				 * flush, passed as an opaque pointer.
				 */
				static void flush_cpu_slab(void *d)
				{
					__flush_cpu_slab(d, smp_processor_id());
				}
				2335
				2336	static bool has_cpu_slab(int cpu, void *info)
				2337	{
				2338	struct kmem_cache *s = info;
				2339	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
				2340
				2341	return c->page \|\| slub_percpu_partial(c);
				2342	}
				2343
				2344	static void flush_all(struct kmem_cache *s)
				2345	{
				2346	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
				2347	}
				2348
				2349	/*
				2350	* Use the cpu notifier to insure that the cpu slabs are flushed when
				2351	* necessary.
				2352	*/
				2353	static int slub_cpu_dead(unsigned int cpu)
				2354	{
				2355	struct kmem_cache *s;
				2356	unsigned long flags;
				2357
				2358	mutex_lock(&slab_mutex);
				2359	list_for_each_entry(s, &slab_caches, list) {
				2360	local_irq_save(flags);
				2361	__flush_cpu_slab(s, cpu);
				2362	local_irq_restore(flags);
				2363	}
				2364	mutex_unlock(&slab_mutex);
				2365	return 0;
				2366	}
				2367
				/*
				 * Check if the objects in a per cpu structure fit numa
				 * locality expectations.
				 *
				 * With CONFIG_NUMA, returns 0 when @page is NULL or when a specific
				 * @node was requested and @page lives on a different node. Returns 1
				 * otherwise, and always when CONFIG_NUMA is not set.
				 */
				static inline int node_match(struct page *page, int node)
				{
				#ifdef CONFIG_NUMA
					if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
						return 0;
				#endif
					return 1;
				}
				2380
				#ifdef CONFIG_SLUB_DEBUG
				/* Number of objects in @page that are not in use. */
				static int count_free(struct page *page)
				{
					int free_objects = page->objects - page->inuse;

					return free_objects;
				}

				/* Current value of the node's total_objects counter. */
				static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
				{
					unsigned long total = atomic_long_read(&n->total_objects);

					return total;
				}
				#endif /* CONFIG_SLUB_DEBUG */
				2392
				#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
				/*
				 * Sum @get_count() over every page on the partial list of @n.
				 *
				 * @n:         node whose partial list is walked
				 * @get_count: callback returning the per-page value to accumulate
				 *
				 * The walk is done with n->list_lock held and interrupts disabled.
				 */
				static unsigned long count_partial(struct kmem_cache_node *n,
								int (*get_count)(struct page *))
				{
					unsigned long flags;
					unsigned long x = 0;
					struct page *page;

					spin_lock_irqsave(&n->list_lock, flags);
					list_for_each_entry(page, &n->partial, lru)
						x += get_count(page);
					spin_unlock_irqrestore(&n->list_lock, flags);
					return x;
				}
				#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
				2408
				2409	static noinline void
				2410	slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
				2411	{
				2412	#ifdef CONFIG_SLUB_DEBUG
				2413	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				2414	DEFAULT_RATELIMIT_BURST);
				2415	int node;
				2416	struct kmem_cache_node *n;
				2417
				2418	if ((gfpflags & __GFP_NOWARN) \|\| !__ratelimit(&slub_oom_rs))
				2419	return;
				2420
				2421	pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
				2422	nid, gfpflags, &gfpflags);
				2423	pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
				2424	s->name, s->object_size, s->size, oo_order(s->oo),
				2425	oo_order(s->min));
				2426
				2427	if (oo_order(s->min) > get_order(s->object_size))
				2428	pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
				2429	s->name);
				2430
				2431	for_each_kmem_cache_node(s, node, n) {
				2432	unsigned long nr_slabs;
				2433	unsigned long nr_objs;
				2434	unsigned long nr_free;
				2435
				2436	nr_free = count_partial(n, count_free);
				2437	nr_slabs = node_nr_slabs(n);
				2438	nr_objs = node_nr_objs(n);
				2439
				2440	pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
				2441	node, nr_slabs, nr_objs, nr_free);
				2442	}
				2443	#endif
				2444	}
				2445
				2446	static inline void new_slab_objects(struct kmem_cache s, gfp_t flags,
				2447	int node, struct kmem_cache_cpu **pc)
				2448	{
				2449	void *freelist;
				2450	struct kmem_cache_cpu c = pc;
				2451	struct page *page;
				2452
				2453	freelist = get_partial(s, flags, node, c);
				2454
				2455	if (freelist)
				2456	return freelist;
				2457
				2458	page = new_slab(s, flags, node);
				2459	if (page) {
				2460	c = raw_cpu_ptr(s->cpu_slab);
				2461	if (c->page)
				2462	flush_slab(s, c);
				2463
				2464	/*
				2465	* No other reference to the page yet so we can
				2466	* muck around with it freely without cmpxchg
				2467	*/
				2468	freelist = page->freelist;
				2469	page->freelist = NULL;
				2470
				2471	stat(s, ALLOC_SLAB);
				2472	c->page = page;
				2473	*pc = c;
				2474	} else
				2475	freelist = NULL;
				2476
				2477	return freelist;
				2478	}
				2479
				2480	static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
				2481	{
				2482	if (unlikely(PageSlabPfmemalloc(page)))
				2483	return gfp_pfmemalloc_allowed(gfpflags);
				2484
				2485	return true;
				2486	}
				2487
				2488	/*
				2489	* Check the page->freelist of a page and either transfer the freelist to the
				2490	* per cpu freelist or deactivate the page.
				2491	*
				2492	* The page is still frozen if the return value is not NULL.
				2493	*
				2494	* If this function returns NULL then the page has been unfrozen.
				2495	*
				2496	* This function must be called with interrupt disabled.
				2497	*/
				2498	static inline void get_freelist(struct kmem_cache s, struct page *page)
				2499	{
				2500	struct page new;
				2501	unsigned long counters;
				2502	void *freelist;
				2503
				2504	do {
				2505	freelist = page->freelist;
				2506	counters = page->counters;
				2507
				2508	new.counters = counters;
				2509	VM_BUG_ON(!new.frozen);
				2510
				2511	new.inuse = page->objects;
				2512	new.frozen = freelist != NULL;
				2513
				2514	} while (!__cmpxchg_double_slab(s, page,
				2515	freelist, counters,
				2516	NULL, new.counters,
				2517	"get_freelist"));
				2518
				2519	return freelist;
				2520	}
				2521
				2522	/*
				2523	* Slow path. The lockless freelist is empty or we need to perform
				2524	* debugging duties.
				2525	*
				2526	* Processing is still very fast if new objects have been freed to the
				2527	* regular freelist. In that case we simply take over the regular freelist
				2528	* as the lockless freelist and zap the regular freelist.
				2529	*
				2530	* If that is not working then we fall back to the partial lists. We take the
				2531	* first element of the freelist as the object to allocate now and move the
				2532	* rest of the freelist to the lockless freelist.
				2533	*
				2534	* And if we were unable to get a new slab from the partial slab lists then
				2535	* we need to allocate a new slab. This is the slowest path since it involves
				2536	* a call to the page allocator and the setup of a new slab.
				2537	*
				2538	* Version of __slab_alloc to use when we know that interrupts are
				2539	* already disabled (which is the case for bulk allocation).
				2540	*/
				2541	static void ___slab_alloc(struct kmem_cache s, gfp_t gfpflags, int node,
				2542	unsigned long addr, struct kmem_cache_cpu *c)
				2543	{
				2544	void *freelist;
				2545	struct page *page;
				2546
				2547	page = c->page;
				2548	if (!page) {
				2549	/*
				2550	* if the node is not online or has no normal memory, just
				2551	* ignore the node constraint
				2552	*/
				2553	if (unlikely(node != NUMA_NO_NODE &&
				2554	!node_state(node, N_NORMAL_MEMORY)))
				2555	node = NUMA_NO_NODE;
				2556	goto new_slab;
				2557	}
				2558	redo:
				2559
				2560	if (unlikely(!node_match(page, node))) {
				2561	/*
				2562	* same as above but node_match() being false already
				2563	* implies node != NUMA_NO_NODE
				2564	*/
				2565	if (!node_state(node, N_NORMAL_MEMORY)) {
				2566	node = NUMA_NO_NODE;
				2567	goto redo;
				2568	} else {
				2569	stat(s, ALLOC_NODE_MISMATCH);
				2570	deactivate_slab(s, page, c->freelist, c);
				2571	goto new_slab;
				2572	}
				2573	}
				2574
				2575	/*
				2576	* By rights, we should be searching for a slab page that was
				2577	* PFMEMALLOC but right now, we are losing the pfmemalloc
				2578	* information when the page leaves the per-cpu allocator
				2579	*/
				2580	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
				2581	deactivate_slab(s, page, c->freelist, c);
				2582	goto new_slab;
				2583	}
				2584
				2585	/* must check again c->freelist in case of cpu migration or IRQ */
				2586	freelist = c->freelist;
				2587	if (freelist)
				2588	goto load_freelist;
				2589
				2590	freelist = get_freelist(s, page);
				2591
				2592	if (!freelist) {
				2593	c->page = NULL;
				2594	stat(s, DEACTIVATE_BYPASS);
				2595	goto new_slab;
				2596	}
				2597
				2598	stat(s, ALLOC_REFILL);
				2599
				2600	load_freelist:
				2601	/*
				2602	* freelist is pointing to the list of objects to be used.
				2603	* page is pointing to the page from which the objects are obtained.
				2604	* That page must be frozen for per cpu allocations to work.
				2605	*/
				2606	VM_BUG_ON(!c->page->frozen);
				2607	c->freelist = get_freepointer(s, freelist);
				2608	c->tid = next_tid(c->tid);
				2609	return freelist;
				2610
				2611	new_slab:
				2612
				2613	if (slub_percpu_partial(c)) {
				2614	page = c->page = slub_percpu_partial(c);
				2615	slub_set_percpu_partial(c, page);
				2616	stat(s, CPU_PARTIAL_ALLOC);
				2617	goto redo;
				2618	}
				2619
				2620	freelist = new_slab_objects(s, gfpflags, node, &c);
				2621
				2622	if (unlikely(!freelist)) {
				2623	slab_out_of_memory(s, gfpflags, node);
				2624	return NULL;
				2625	}
				2626
				2627	page = c->page;
				2628	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
				2629	goto load_freelist;
				2630
				2631	/* Only entered in the debug case */
				2632	if (kmem_cache_debug(s) &&
				2633	!alloc_debug_processing(s, page, freelist, addr))
				2634	goto new_slab; /* Slab failed checks. Next slab needed */
				2635
				2636	deactivate_slab(s, page, get_freepointer(s, freelist), c);
				2637	return freelist;
				2638	}
				2639
				2640	/*
				2641	* Another one that disabled interrupt and compensates for possible
				2642	* cpu changes by refetching the per cpu area pointer.
				2643	*/
				2644	static void __slab_alloc(struct kmem_cache s, gfp_t gfpflags, int node,
				2645	unsigned long addr, struct kmem_cache_cpu *c)
				2646	{
				2647	void *p;
				2648	unsigned long flags;
				2649
				2650	local_irq_save(flags);
				2651	#ifdef CONFIG_PREEMPT
				2652	/*
				2653	* We may have been preempted and rescheduled on a different
				2654	* cpu before disabling interrupts. Need to reload cpu area
				2655	* pointer.
				2656	*/
				2657	c = this_cpu_ptr(s->cpu_slab);
				2658	#endif
				2659
				2660	p = ___slab_alloc(s, gfpflags, node, addr, c);
				2661	local_irq_restore(flags);
				2662	return p;
				2663	}
				2664
				2665	/*
				2666	* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
				2667	* have the fastpath folded into their functions. So no function call
				2668	* overhead for requests that can be satisfied on the fastpath.
				2669	*
				2670	* The fastpath works by first checking if the lockless freelist can be used.
				2671	* If not then __slab_alloc is called for slow processing.
				2672	*
				2673	* Otherwise we can simply pick the next object from the lockless free list.
				2674	*/
				2675	static __always_inline void slab_alloc_node(struct kmem_cache s,
				2676	gfp_t gfpflags, int node, unsigned long addr)
				2677	{
				2678	void *object;
				2679	struct kmem_cache_cpu *c;
				2680	struct page *page;
				2681	unsigned long tid;
				2682
				2683	s = slab_pre_alloc_hook(s, gfpflags);
				2684	if (!s)
				2685	return NULL;
				2686	redo:
				2687	/*
				2688	* Must read kmem_cache cpu data via this cpu ptr. Preemption is
				2689	* enabled. We may switch back and forth between cpus while
				2690	* reading from one cpu area. That does not matter as long
				2691	* as we end up on the original cpu again when doing the cmpxchg.
				2692	*
				2693	* We should guarantee that tid and kmem_cache are retrieved on
				2694	* the same cpu. It could be different if CONFIG_PREEMPT so we need
				2695	* to check if it is matched or not.
				2696	*/
				2697	do {
				2698	tid = this_cpu_read(s->cpu_slab->tid);
				2699	c = raw_cpu_ptr(s->cpu_slab);
				2700	} while (IS_ENABLED(CONFIG_PREEMPT) &&
				2701	unlikely(tid != READ_ONCE(c->tid)));
				2702
				2703	/*
				2704	* Irqless object alloc/free algorithm used here depends on sequence
				2705	* of fetching cpu_slab's data. tid should be fetched before anything
				2706	* on c to guarantee that object and page associated with previous tid
				2707	* won't be used with current tid. If we fetch tid first, object and
				2708	* page could be one associated with next tid and our alloc/free
				2709	* request will be failed. In this case, we will retry. So, no problem.
				2710	*/
				2711	barrier();
				2712
				2713	/*
				2714	* The transaction ids are globally unique per cpu and per operation on
				2715	* a per cpu queue. Thus they can be guarantee that the cmpxchg_double
				2716	* occurs on the right processor and that there was no operation on the
				2717	* linked list in between.
				2718	*/
				2719
				2720	object = c->freelist;
				2721	page = c->page;
				2722	if (unlikely(!object \|\| !node_match(page, node))) {
				2723	object = __slab_alloc(s, gfpflags, node, addr, c);
				2724	stat(s, ALLOC_SLOWPATH);
				2725	} else {
				2726	void *next_object = get_freepointer_safe(s, object);
				2727
				2728	/*
				2729	* The cmpxchg will only match if there was no additional
				2730	* operation and if we are on the right processor.
				2731	*
				2732	* The cmpxchg does the following atomically (without lock
				2733	* semantics!)
				2734	* 1. Relocate first pointer to the current per cpu area.
				2735	* 2. Verify that tid and freelist have not been changed
				2736	* 3. If they were not changed replace tid and freelist
				2737	*
				2738	* Since this is without lock semantics the protection is only
				2739	* against code executing on this cpu not from access by
				2740	* other cpus.
				2741	*/
				2742	if (unlikely(!this_cpu_cmpxchg_double(
				2743	s->cpu_slab->freelist, s->cpu_slab->tid,
				2744	object, tid,
				2745	next_object, next_tid(tid)))) {
				2746
				2747	note_cmpxchg_failure("slab_alloc", s, tid);
				2748	goto redo;
				2749	}
				2750	prefetch_freepointer(s, next_object);
				2751	stat(s, ALLOC_FASTPATH);
				2752	}
				2753
				2754	if (unlikely(gfpflags & __GFP_ZERO) && object)
				2755	memset(object, 0, s->object_size);
				2756
				2757	slab_post_alloc_hook(s, gfpflags, 1, &object);
				2758
				2759	return object;
				2760	}
				2761
				2762	static __always_inline void slab_alloc(struct kmem_cache s,
				2763	gfp_t gfpflags, unsigned long addr)
				2764	{
				2765	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
				2766	}
				2767
				2768	void kmem_cache_alloc(struct kmem_cache s, gfp_t gfpflags)
				2769	{
				2770	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
				2771
				2772	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
				2773	s->size, gfpflags);
				2774
				2775	return ret;
				2776	}
				2777	EXPORT_SYMBOL(kmem_cache_alloc);
				2778
				#ifdef CONFIG_TRACING
				/*
				 * Same allocation as slab_alloc(), but emits the kmalloc trace event
				 * with the requested @size and passes the object to kasan_kmalloc().
				 * Returns the value produced by slab_alloc(), which may be NULL.
				 */
				void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
				{
					void *ret = slab_alloc(s, gfpflags, _RET_IP_);
					trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
					kasan_kmalloc(s, ret, size, gfpflags);
					return ret;
				}
				EXPORT_SYMBOL(kmem_cache_alloc_trace);
				#endif
				2789
				#ifdef CONFIG_NUMA
				/*
				 * Allocate one object from cache @s on @node and emit the
				 * kmem_cache_alloc_node trace event. Returns the value produced by
				 * slab_alloc_node(), which may be NULL.
				 */
				void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
				{
					void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);

					trace_kmem_cache_alloc_node(_RET_IP_, ret,
								    s->object_size, s->size, gfpflags, node);

					return ret;
				}
				EXPORT_SYMBOL(kmem_cache_alloc_node);

				#ifdef CONFIG_TRACING
				/*
				 * Node-aware allocation that emits the kmalloc_node trace event with
				 * the requested @size and passes the object to kasan_kmalloc().
				 * Returns the value produced by slab_alloc_node(), which may be NULL.
				 */
				void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
								    gfp_t gfpflags,
								    int node, size_t size)
				{
					void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);

					trace_kmalloc_node(_RET_IP_, ret,
							   size, s->size, gfpflags, node);

					kasan_kmalloc(s, ret, size, gfpflags);
					return ret;
				}
				EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
				#endif
				#endif
				2818
				2819	/*
				2820	* Slow path handling. This may still be called frequently since objects
				2821	* have a longer lifetime than the cpu slabs in most processing loads.
				2822	*
				2823	* So we still attempt to reduce cache line usage. Just take the slab
				2824	* lock and free the item. If there is no additional partial page
				2825	* handling required then we can return immediately.
				2826	*/
				2827	static void __slab_free(struct kmem_cache s, struct page page,
				2828	void head, void tail, int cnt,
				2829	unsigned long addr)
				2830
				2831	{
				2832	void *prior;
				2833	int was_frozen;
				2834	struct page new;
				2835	unsigned long counters;
				2836	struct kmem_cache_node *n = NULL;
				2837	unsigned long uninitialized_var(flags);
				2838
				2839	stat(s, FREE_SLOWPATH);
				2840
				2841	if (kmem_cache_debug(s) &&
				2842	!free_debug_processing(s, page, head, tail, cnt, addr))
				2843	return;
				2844
				2845	do {
				2846	if (unlikely(n)) {
				2847	spin_unlock_irqrestore(&n->list_lock, flags);
				2848	n = NULL;
				2849	}
				2850	prior = page->freelist;
				2851	counters = page->counters;
				2852	set_freepointer(s, tail, prior);
				2853	new.counters = counters;
				2854	was_frozen = new.frozen;
				2855	new.inuse -= cnt;
				2856	if ((!new.inuse \|\| !prior) && !was_frozen) {
				2857
				2858	if (kmem_cache_has_cpu_partial(s) && !prior) {
				2859
				2860	/*
				2861	* Slab was on no list before and will be
				2862	* partially empty
				2863	* We can defer the list move and instead
				2864	* freeze it.
				2865	*/
				2866	new.frozen = 1;
				2867
				2868	} else { /* Needs to be taken off a list */
				2869
				2870	n = get_node(s, page_to_nid(page));
				2871	/*
				2872	* Speculatively acquire the list_lock.
				2873	* If the cmpxchg does not succeed then we may
				2874	* drop the list_lock without any processing.
				2875	*
				2876	* Otherwise the list_lock will synchronize with
				2877	* other processors updating the list of slabs.
				2878	*/
				2879	spin_lock_irqsave(&n->list_lock, flags);
				2880
				2881	}
				2882	}
				2883
				2884	} while (!cmpxchg_double_slab(s, page,
				2885	prior, counters,
				2886	head, new.counters,
				2887	"__slab_free"));
				2888
				2889	if (likely(!n)) {
				2890
				2891	/*
				2892	* If we just froze the page then put it onto the
				2893	* per cpu partial list.
				2894	*/
				2895	if (new.frozen && !was_frozen) {
				2896	put_cpu_partial(s, page, 1);
				2897	stat(s, CPU_PARTIAL_FREE);
				2898	}
				2899	/*
				2900	* The list lock was not taken therefore no list
				2901	* activity can be necessary.
				2902	*/
				2903	if (was_frozen)
				2904	stat(s, FREE_FROZEN);
				2905	return;
				2906	}
				2907
				2908	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
				2909	goto slab_empty;
				2910
				2911	/*
				2912	* Objects left in the slab. If it was not on the partial list before
				2913	* then add it.
				2914	*/
				2915	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
				2916	if (kmem_cache_debug(s))
				2917	remove_full(s, n, page);
				2918	add_partial(n, page, DEACTIVATE_TO_TAIL);
				2919	stat(s, FREE_ADD_PARTIAL);
				2920	}
				2921	spin_unlock_irqrestore(&n->list_lock, flags);
				2922	return;
				2923
				2924	slab_empty:
				2925	if (prior) {
				2926	/*
				2927	* Slab on the partial list.
				2928	*/
				2929	remove_partial(n, page);
				2930	stat(s, FREE_REMOVE_PARTIAL);
				2931	} else {
				2932	/* Slab must be on the full list */
				2933	remove_full(s, n, page);
				2934	}
				2935
				2936	spin_unlock_irqrestore(&n->list_lock, flags);
				2937	stat(s, FREE_SLAB);
				2938	discard_slab(s, page);
				2939	}
				2940
				2941	/*
				2942	* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
				2943	* can perform fastpath freeing without additional function calls.
				2944	*
				2945	* The fastpath is only possible if we are freeing to the current cpu slab
				2946	* of this processor. This typically the case if we have just allocated
				2947	* the item before.
				2948	*
				2949	* If fastpath is not possible then fall back to __slab_free where we deal
				2950	* with all sorts of special processing.
				2951	*
				2952	* Bulk free of a freelist with several objects (all pointing to the
				2953	* same page) possible by specifying head and tail ptr, plus objects
				2954	* count (cnt). Bulk free indicated by tail pointer being set.
				2955	*/
				2956	static __always_inline void do_slab_free(struct kmem_cache *s,
				2957	struct page page, void head, void *tail,
				2958	int cnt, unsigned long addr)
				2959	{
				2960	void *tail_obj = tail ? : head;
				2961	struct kmem_cache_cpu *c;
				2962	unsigned long tid;
				2963	redo:
				2964	/*
				2965	* Determine the currently cpus per cpu slab.
				2966	* The cpu may change afterward. However that does not matter since
				2967	* data is retrieved via this pointer. If we are on the same cpu
				2968	* during the cmpxchg then the free will succeed.
				2969	*/
				2970	do {
				2971	tid = this_cpu_read(s->cpu_slab->tid);
				2972	c = raw_cpu_ptr(s->cpu_slab);
				2973	} while (IS_ENABLED(CONFIG_PREEMPT) &&
				2974	unlikely(tid != READ_ONCE(c->tid)));
				2975
				2976	/* Same with comment on barrier() in slab_alloc_node() */
				2977	barrier();
				2978
				2979	if (likely(page == c->page)) {
				2980	void **freelist = READ_ONCE(c->freelist);
				2981
				2982	set_freepointer(s, tail_obj, freelist);
				2983
				2984	if (unlikely(!this_cpu_cmpxchg_double(
				2985	s->cpu_slab->freelist, s->cpu_slab->tid,
				2986	freelist, tid,
				2987	head, next_tid(tid)))) {
				2988
				2989	note_cmpxchg_failure("slab_free", s, tid);
				2990	goto redo;
				2991	}
				2992	stat(s, FREE_FASTPATH);
				2993	} else
				2994	__slab_free(s, page, head, tail_obj, cnt, addr);
				2995
				2996	}
				2997
				2998	static __always_inline void slab_free(struct kmem_cache s, struct page page,
				2999	void head, void tail, int cnt,
				3000	unsigned long addr)
				3001	{
				3002	slab_free_freelist_hook(s, head, tail);
				3003	/*
				3004	* slab_free_freelist_hook() could have put the items into quarantine.
				3005	* If so, no need to free them.
				3006	*/
				3007	if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
				3008	return;
				3009	do_slab_free(s, page, head, tail, cnt, addr);
				3010	}
				3011
				#ifdef CONFIG_KASAN
				/*
				 * Free a single object straight through do_slab_free(), i.e. without
				 * running slab_free_freelist_hook() on it again.
				 *
				 * @cache: cache the object belongs to
				 * @x:     object to free
				 * @addr:  caller address
				 */
				void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
				{
					do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
				}
				#endif
				3018
				3019	void kmem_cache_free(struct kmem_cache s, void x)
				3020	{
				3021	s = cache_from_obj(s, x);
				3022	if (!s)
				3023	return;
				3024	slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
				3025	trace_kmem_cache_free(_RET_IP_, x);
				3026	}
				3027	EXPORT_SYMBOL(kmem_cache_free);
				3028
				/*
				 * A freelist assembled outside of any slab by build_detached_freelist().
				 * All linked objects live on the same page, so the whole list can be
				 * passed to slab_free() in one go.
				 */
				struct detached_freelist {
					struct page *page;	/* page the linked objects belong to; NULL if none */
					void *tail;		/* last object of the list */
					void *freelist;		/* first object of the list */
					int cnt;		/* number of linked objects */
					struct kmem_cache *s;	/* cache the objects are freed to */
				};
				3036
				3037	/*
				3038	* This function progressively scans the array with free objects (with
				3039	* a limited look ahead) and extract objects belonging to the same
				3040	* page. It builds a detached freelist directly within the given
				3041	* page/objects. This can happen without any need for
				3042	* synchronization, because the objects are owned by running process.
				3043	* The freelist is build up as a single linked list in the objects.
				3044	* The idea is, that this detached freelist can then be bulk
				3045	* transferred to the real freelist(s), but only requiring a single
				3046	* synchronization primitive. Look ahead in the array is limited due
				3047	* to performance reasons.
				3048	*/
				3049	static inline
				3050	int build_detached_freelist(struct kmem_cache *s, size_t size,
				3051	void *p, struct detached_freelist df)
				3052	{
				3053	size_t first_skipped_index = 0;
				3054	int lookahead = 3;
				3055	void *object;
				3056	struct page *page;
				3057
				3058	/* Always re-init detached_freelist */
				3059	df->page = NULL;
				3060
				3061	do {
				3062	object = p[--size];
				3063	/* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
				3064	} while (!object && size);
				3065
				3066	if (!object)
				3067	return 0;
				3068
				3069	page = virt_to_head_page(object);
				3070	if (!s) {
				3071	/* Handle kalloc'ed objects */
				3072	if (unlikely(!PageSlab(page))) {
				3073	BUG_ON(!PageCompound(page));
				3074	kfree_hook(object);
				3075	__free_pages(page, compound_order(page));
				3076	p[size] = NULL; /* mark object processed */
				3077	return size;
				3078	}
				3079	/* Derive kmem_cache from object */
				3080	df->s = page->slab_cache;
				3081	} else {
				3082	df->s = cache_from_obj(s, object); /* Support for memcg */
				3083	}
				3084
				3085	/* Start new detached freelist */
				3086	df->page = page;
				3087	set_freepointer(df->s, object, NULL);
				3088	df->tail = object;
				3089	df->freelist = object;
				3090	p[size] = NULL; /* mark object processed */
				3091	df->cnt = 1;
				3092
				3093	while (size) {
				3094	object = p[--size];
				3095	if (!object)
				3096	continue; /* Skip processed objects */
				3097
				3098	/* df->page is always set at this point */
				3099	if (df->page == virt_to_head_page(object)) {
				3100	/* Opportunity build freelist */
				3101	set_freepointer(df->s, object, df->freelist);
				3102	df->freelist = object;
				3103	df->cnt++;
				3104	p[size] = NULL; /* mark object processed */
				3105
				3106	continue;
				3107	}
				3108
				3109	/* Limit look ahead search */
				3110	if (!--lookahead)
				3111	break;
				3112
				3113	if (!first_skipped_index)
				3114	first_skipped_index = size + 1;
				3115	}
				3116
				3117	return first_skipped_index;
				3118	}
				3119
				3120	/* Note that interrupts must be enabled when calling this function. */
				3121	void kmem_cache_free_bulk(struct kmem_cache s, size_t size, void *p)
				3122	{
				3123	if (WARN_ON(!size))
				3124	return;
				3125
				3126	do {
				3127	struct detached_freelist df;
				3128
				3129	size = build_detached_freelist(s, size, p, &df);
				3130	if (!df.page)
				3131	continue;
				3132
				3133	slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
				3134	} while (likely(size));
				3135	}
				3136	EXPORT_SYMBOL(kmem_cache_free_bulk);
				3137
				3138	/* Note that interrupts must be enabled when calling this function. */
				3139	int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
				3140	void **p)
				3141	{
				3142	struct kmem_cache_cpu *c;
				3143	int i;
				3144
				3145	/* memcg and kmem_cache debug support */
				3146	s = slab_pre_alloc_hook(s, flags);
				3147	if (unlikely(!s))
				3148	return false;
				3149	/*
				3150	* Drain objects in the per cpu slab, while disabling local
				3151	* IRQs, which protects against PREEMPT and interrupts
				3152	* handlers invoking normal fastpath.
				3153	*/
				3154	local_irq_disable();
				3155	c = this_cpu_ptr(s->cpu_slab);
				3156
				3157	for (i = 0; i < size; i++) {
				3158	void *object = c->freelist;
				3159
				3160	if (unlikely(!object)) {
				3161	/*
				3162	* We may have removed an object from c->freelist using
				3163	* the fastpath in the previous iteration; in that case,
				3164	* c->tid has not been bumped yet.
				3165	* Since ___slab_alloc() may reenable interrupts while
				3166	* allocating memory, we should bump c->tid now.
				3167	*/
				3168	c->tid = next_tid(c->tid);
				3169
				3170	/*
				3171	* Invoking slow path likely have side-effect
				3172	* of re-populating per CPU c->freelist
				3173	*/
				3174	p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
				3175	_RET_IP_, c);
				3176	if (unlikely(!p[i]))
				3177	goto error;
				3178
				3179	c = this_cpu_ptr(s->cpu_slab);
				3180	continue; /* goto for-loop */
				3181	}
				3182	c->freelist = get_freepointer(s, object);
				3183	p[i] = object;
				3184	}
				3185	c->tid = next_tid(c->tid);
				3186	local_irq_enable();
				3187
				3188	/* Clear memory outside IRQ disabled fastpath loop */
				3189	if (unlikely(flags & __GFP_ZERO)) {
				3190	int j;
				3191
				3192	for (j = 0; j < i; j++)
				3193	memset(p[j], 0, s->object_size);
				3194	}
				3195
				3196	/* memcg and kmem_cache debug support */
				3197	slab_post_alloc_hook(s, flags, size, p);
				3198	return i;
				3199	error:
				3200	local_irq_enable();
				3201	slab_post_alloc_hook(s, flags, i, p);
				3202	__kmem_cache_free_bulk(s, i, p);
				3203	return 0;
				3204	}
				3205	EXPORT_SYMBOL(kmem_cache_alloc_bulk);
				3206
				3207
				3208	/*
				3209	* Object placement in a slab is made very easy because we always start at
				3210	* offset 0. If we tune the size of the object to the alignment then we can
				3211	* get the required alignment by putting one properly sized object after
				3212	* another.
				3213	*
				3214	* Notice that the allocation order determines the sizes of the per cpu
				3215	* caches. Each processor has always one slab available for allocations.
				3216	* Increasing the allocation order reduces the number of times that slabs
				3217	* must be moved on and off the partial lists and is therefore a factor in
				3218	* locking overhead.
				3219	*/
				3220
				3221	/*
				3222	* Mininum / Maximum order of slab pages. This influences locking overhead
				3223	* and slab fragmentation. A higher order reduces the number of partial slabs
				3224	* and increases the number of allocations possible without having to
				3225	* take the list_lock.
				3226	*/
				3227	static int slub_min_order;
				3228	static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
				3229	static int slub_min_objects;
				3230
				3231	/*
				3232	* Calculate the order of allocation given an slab object size.
				3233	*
				3234	* The order of allocation has significant impact on performance and other
				3235	* system components. Generally order 0 allocations should be preferred since
				3236	* order 0 does not cause fragmentation in the page allocator. Larger objects
				3237	* be problematic to put into order 0 slabs because there may be too much
				3238	* unused space left. We go to a higher order if more than 1/16th of the slab
				3239	* would be wasted.
				3240	*
				3241	* In order to reach satisfactory performance we must ensure that a minimum
				3242	* number of objects is in one slab. Otherwise we may generate too much
				3243	* activity on the partial lists which requires taking the list_lock. This is
				3244	* less a concern for large slabs though which are rarely used.
				3245	*
				3246	* slub_max_order specifies the order where we begin to stop considering the
				3247	* number of objects in a slab as critical. If we reach slub_max_order then
				3248	* we try to keep the page order as low as possible. So we accept more waste
				3249	* of space in favor of a small page order.
				3250	*
				3251	* Higher order allocations also allow the placement of more objects in a
				3252	* slab and thereby reduce object handling overhead. If the user has
				3253	* requested a higher mininum order then we start with that one instead of
				3254	* the smallest order which will fit the object.
				3255	*/
				3256	static inline int slab_order(int size, int min_objects,
				3257	int max_order, int fract_leftover, int reserved)
				3258	{
				3259	int order;
				3260	int rem;
				3261	int min_order = slub_min_order;
				3262
				3263	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
				3264	return get_order(size * MAX_OBJS_PER_PAGE) - 1;
				3265
				3266	for (order = max(min_order, get_order(min_objects * size + reserved));
				3267	order <= max_order; order++) {
				3268
				3269	unsigned long slab_size = PAGE_SIZE << order;
				3270
				3271	rem = (slab_size - reserved) % size;
				3272
				3273	if (rem <= slab_size / fract_leftover)
				3274	break;
				3275	}
				3276
				3277	return order;
				3278	}
				3279
				3280	static inline int calculate_order(int size, int reserved)
				3281	{
				3282	int order;
				3283	int min_objects;
				3284	int fraction;
				3285	int max_objects;
				3286
				3287	/*
				3288	* Attempt to find best configuration for a slab. This
				3289	* works by first attempting to generate a layout with
				3290	* the best configuration and backing off gradually.
				3291	*
				3292	* First we increase the acceptable waste in a slab. Then
				3293	* we reduce the minimum objects required in a slab.
				3294	*/
				3295	min_objects = slub_min_objects;
				3296	if (!min_objects)
				3297	min_objects = 4 * (fls(nr_cpu_ids) + 1);
				3298	max_objects = order_objects(slub_max_order, size, reserved);
				3299	min_objects = min(min_objects, max_objects);
				3300
				3301	while (min_objects > 1) {
				3302	fraction = 16;
				3303	while (fraction >= 4) {
				3304	order = slab_order(size, min_objects,
				3305	slub_max_order, fraction, reserved);
				3306	if (order <= slub_max_order)
				3307	return order;
				3308	fraction /= 2;
				3309	}
				3310	min_objects--;
				3311	}
				3312
				3313	/*
				3314	* We were unable to place multiple objects in a slab. Now
				3315	* lets see if we can place a single object there.
				3316	*/
				3317	order = slab_order(size, 1, slub_max_order, 1, reserved);
				3318	if (order <= slub_max_order)
				3319	return order;
				3320
				3321	/*
				3322	* Doh this slab cannot be placed using slub_max_order.
				3323	*/
				3324	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
				3325	if (order < MAX_ORDER)
				3326	return order;
				3327	return -ENOSYS;
				3328	}
				3329
				3330	static void
				3331	init_kmem_cache_node(struct kmem_cache_node *n)
				3332	{
				3333	n->nr_partial = 0;
				3334	spin_lock_init(&n->list_lock);
				3335	INIT_LIST_HEAD(&n->partial);
				3336	#ifdef CONFIG_SLUB_DEBUG
				3337	atomic_long_set(&n->nr_slabs, 0);
				3338	atomic_long_set(&n->total_objects, 0);
				3339	INIT_LIST_HEAD(&n->full);
				3340	#endif
				3341	}
				3342
				3343	static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
				3344	{
				3345	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
				3346	KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu));
				3347
				3348	/*
				3349	* Must align to double word boundary for the double cmpxchg
				3350	* instructions to work; see __pcpu_double_call_return_bool().
				3351	*/
				3352	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
				3353	2 * sizeof(void *));
				3354
				3355	if (!s->cpu_slab)
				3356	return 0;
				3357
				3358	init_kmem_cache_cpus(s);
				3359
				3360	return 1;
				3361	}
				3362
				/* Cache that the per node struct kmem_cache_node objects are allocated from. */
				static struct kmem_cache *kmem_cache_node;
				3364
				3365	/*
				3366	* No kmalloc_node yet so do it by hand. We know that this is the first
				3367	* slab on the node for this slabcache. There are no concurrent accesses
				3368	* possible.
				3369	*
				3370	* Note that this function only works on the kmem_cache_node
				3371	* when allocating for the kmem_cache_node. This is used for bootstrapping
				3372	* memory on a fresh node that has no slab structures yet.
				3373	*/
				3374	static void early_kmem_cache_node_alloc(int node)
				3375	{
				3376	struct page *page;
				3377	struct kmem_cache_node *n;
				3378
				3379	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
				3380
				3381	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
				3382
				3383	BUG_ON(!page);
				3384	if (page_to_nid(page) != node) {
				3385	pr_err("SLUB: Unable to allocate memory from node %d\n", node);
				3386	pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
				3387	}
				3388
				3389	n = page->freelist;
				3390	BUG_ON(!n);
				3391	page->freelist = get_freepointer(kmem_cache_node, n);
				3392	page->inuse = 1;
				3393	page->frozen = 0;
				3394	kmem_cache_node->node[node] = n;
				3395	#ifdef CONFIG_SLUB_DEBUG
				3396	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
				3397	init_tracking(kmem_cache_node, n);
				3398	#endif
				3399	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
				3400	GFP_KERNEL);
				3401	init_kmem_cache_node(n);
				3402	inc_slabs_node(kmem_cache_node, node, page->objects);
				3403
				3404	/*
				3405	* No locks need to be taken here as it has just been
				3406	* initialized and there is no concurrent access.
				3407	*/
				3408	__add_partial(n, page, DEACTIVATE_TO_HEAD);
				3409	}
				3410
				3411	static void free_kmem_cache_nodes(struct kmem_cache *s)
				3412	{
				3413	int node;
				3414	struct kmem_cache_node *n;
				3415
				3416	for_each_kmem_cache_node(s, node, n) {
				3417	s->node[node] = NULL;
				3418	kmem_cache_free(kmem_cache_node, n);
				3419	}
				3420	}
				3421
				3422	void __kmem_cache_release(struct kmem_cache *s)
				3423	{
				3424	cache_random_seq_destroy(s);
				3425	free_percpu(s->cpu_slab);
				3426	free_kmem_cache_nodes(s);
				3427	}
				3428
				3429	static int init_kmem_cache_nodes(struct kmem_cache *s)
				3430	{
				3431	int node;
				3432
				3433	for_each_node_state(node, N_NORMAL_MEMORY) {
				3434	struct kmem_cache_node *n;
				3435
				3436	if (slab_state == DOWN) {
				3437	early_kmem_cache_node_alloc(node);
				3438	continue;
				3439	}
				3440	n = kmem_cache_alloc_node(kmem_cache_node,
				3441	GFP_KERNEL, node);
				3442
				3443	if (!n) {
				3444	free_kmem_cache_nodes(s);
				3445	return 0;
				3446	}
				3447
				3448	init_kmem_cache_node(n);
				3449	s->node[node] = n;
				3450	}
				3451	return 1;
				3452	}
				3453
				3454	static void set_min_partial(struct kmem_cache *s, unsigned long min)
				3455	{
				3456	if (min < MIN_PARTIAL)
				3457	min = MIN_PARTIAL;
				3458	else if (min > MAX_PARTIAL)
				3459	min = MAX_PARTIAL;
				3460	s->min_partial = min;
				3461	}
				3462
				/*
				 * Choose s->cpu_partial from the object size of the cache. Does nothing
				 * unless CONFIG_SLUB_CPU_PARTIAL is set.
				 */
				static void set_cpu_partial(struct kmem_cache *s)
				{
				#ifdef CONFIG_SLUB_CPU_PARTIAL
					int nr_objects;

					/*
					 * cpu_partial determines the maximum number of objects kept in the
					 * per cpu partial lists of a processor.
					 *
					 * Per cpu partial lists mainly contain slabs that just have one
					 * object freed. If they are used for allocation then they can be
					 * filled up again with minimal effort. The slab will never hit the
					 * per node partial lists and therefore no locking will be required.
					 *
					 * This setting also determines
					 *
					 * A) The number of objects from per cpu partial slabs dumped to the
					 *    per node list when we reach the limit.
					 * B) The number of objects in cpu partial slabs to extract from the
					 *    per node list when we run out of per cpu objects. We only fetch
					 *    50% to keep some capacity around for frees.
					 */
					if (!kmem_cache_has_cpu_partial(s))
						nr_objects = 0;
					else if (s->size >= PAGE_SIZE)
						nr_objects = 2;
					else if (s->size >= 1024)
						nr_objects = 6;
					else if (s->size >= 256)
						nr_objects = 13;
					else
						nr_objects = 30;

					s->cpu_partial = nr_objects;
				#endif
				}
				3495
				3496	/*
				3497	* calculate_sizes() determines the order and the distribution of data within
				3498	* a slab object.
				3499	*/
				3500	static int calculate_sizes(struct kmem_cache *s, int forced_order)
				3501	{
				3502	unsigned long flags = s->flags;
				3503	size_t size = s->object_size;
				3504	int order;
				3505
				3506	/*
				3507	* Round up object size to the next word boundary. We can only
				3508	* place the free pointer at word boundaries and this determines
				3509	* the possible location of the free pointer.
				3510	*/
				3511	size = ALIGN(size, sizeof(void *));
				3512
				3513	#ifdef CONFIG_SLUB_DEBUG
				3514	/*
				3515	* Determine if we can poison the object itself. If the user of
				3516	* the slab may touch the object after free or before allocation
				3517	* then we should never poison the object itself.
				3518	*/
				3519	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
				3520	!s->ctor)
				3521	s->flags \|= __OBJECT_POISON;
				3522	else
				3523	s->flags &= ~__OBJECT_POISON;
				3524
				3525
				3526	/*
				3527	* If we are Redzoning then check if there is some space between the
				3528	* end of the object and the free pointer. If not then add an
				3529	* additional word to have some bytes to store Redzone information.
				3530	*/
				3531	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
				3532	size += sizeof(void *);
				3533	#endif
				3534
				3535	/*
				3536	* With that we have determined the number of bytes in actual use
				3537	* by the object. This is the potential offset to the free pointer.
				3538	*/
				3539	s->inuse = size;
				3540
				3541	if (((flags & (SLAB_TYPESAFE_BY_RCU \| SLAB_POISON)) \|\|
				3542	s->ctor)) {
				3543	/*
				3544	* Relocate free pointer after the object if it is not
				3545	* permitted to overwrite the first word of the object on
				3546	* kmem_cache_free.
				3547	*
				3548	* This is the case if we do RCU, have a constructor or
				3549	* destructor or are poisoning the objects.
				3550	*/
				3551	s->offset = size;
				3552	size += sizeof(void *);
				3553	}
				3554
				3555	#ifdef CONFIG_SLUB_DEBUG
				3556	if (flags & SLAB_STORE_USER)
				3557	/*
				3558	* Need to store information about allocs and frees after
				3559	* the object.
				3560	*/
				3561	size += 2 * sizeof(struct track);
				3562	#endif
				3563
				3564	kasan_cache_create(s, &size, &s->flags);
				3565	#ifdef CONFIG_SLUB_DEBUG
				3566	if (flags & SLAB_RED_ZONE) {
				3567	/*
				3568	* Add some empty padding so that we can catch
				3569	* overwrites from earlier objects rather than let
				3570	* tracking information or the free pointer be
				3571	* corrupted if a user writes before the start
				3572	* of the object.
				3573	*/
				3574	size += sizeof(void *);
				3575
				3576	s->red_left_pad = sizeof(void *);
				3577	s->red_left_pad = ALIGN(s->red_left_pad, s->align);
				3578	size += s->red_left_pad;
				3579	}
				3580	#endif
				3581
				3582	/*
				3583	* SLUB stores one object immediately after another beginning from
				3584	* offset 0. In order to align the objects we have to simply size
				3585	* each object to conform to the alignment.
				3586	*/
				3587	size = ALIGN(size, s->align);
				3588	s->size = size;
				3589	if (forced_order >= 0)
				3590	order = forced_order;
				3591	else
				3592	order = calculate_order(size, s->reserved);
				3593
				3594	if (order < 0)
				3595	return 0;
				3596
				3597	s->allocflags = 0;
				3598	if (order)
				3599	s->allocflags \|= __GFP_COMP;
				3600
				3601	if (s->flags & SLAB_CACHE_DMA)
				3602	s->allocflags \|= GFP_DMA;
				3603
				3604	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				3605	s->allocflags \|= __GFP_RECLAIMABLE;
				3606
				3607	/*
				3608	* Determine the number of objects per slab
				3609	*/
				3610	s->oo = oo_make(order, size, s->reserved);
				3611	s->min = oo_make(get_order(size), size, s->reserved);
				3612	if (oo_objects(s->oo) > oo_objects(s->max))
				3613	s->max = s->oo;
				3614
				3615	return !!oo_objects(s->oo);
				3616	}
				3617
				3618	static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
				3619	{
				3620	s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
				3621	s->reserved = 0;
				3622	#ifdef CONFIG_SLAB_FREELIST_HARDENED
				3623	s->random = get_random_long();
				3624	#endif
				3625
				3626	if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
				3627	s->reserved = sizeof(struct rcu_head);
				3628
				3629	if (!calculate_sizes(s, -1))
				3630	goto error;
				3631	if (disable_higher_order_debug) {
				3632	/*
				3633	* Disable debugging flags that store metadata if the min slab
				3634	* order increased.
				3635	*/
				3636	if (get_order(s->size) > get_order(s->object_size)) {
				3637	s->flags &= ~DEBUG_METADATA_FLAGS;
				3638	s->offset = 0;
				3639	if (!calculate_sizes(s, -1))
				3640	goto error;
				3641	}
				3642	}
				3643
				3644	#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
				3645	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
				3646	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
				3647	/* Enable fast mode */
				3648	s->flags \|= __CMPXCHG_DOUBLE;
				3649	#endif
				3650
				3651	/*
				3652	* The larger the object size is, the more pages we want on the partial
				3653	* list to avoid pounding the page allocator excessively.
				3654	*/
				3655	set_min_partial(s, ilog2(s->size) / 2);
				3656
				3657	set_cpu_partial(s);
				3658
				3659	#ifdef CONFIG_NUMA
				3660	s->remote_node_defrag_ratio = 1000;
				3661	#endif
				3662
				3663	/* Initialize the pre-computed randomized freelist if slab is up */
				3664	if (slab_state >= UP) {
				3665	if (init_cache_random_seq(s))
				3666	goto error;
				3667	}
				3668
				3669	if (!init_kmem_cache_nodes(s))
				3670	goto error;
				3671
				3672	if (alloc_kmem_cache_cpus(s))
				3673	return 0;
				3674
				3675	free_kmem_cache_nodes(s);
				3676	error:
				3677	if (flags & SLAB_PANIC)
				3678	panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
				3679	s->name, (unsigned long)s->size, s->size,
				3680	oo_order(s->oo), s->offset, flags);
				3681	return -EINVAL;
				3682	}
				3683
				/*
				 * Report a slab error for @page and log every object of the page whose
				 * bit is not set in the map filled in by get_map().
				 *
				 * @s:    cache the page belongs to
				 * @page: slab page to report on
				 * @text: format string for slab_err(); it is given s->name as argument
				 *
				 * Does nothing without CONFIG_SLUB_DEBUG or if the bitmap cannot be
				 * allocated.
				 */
				static void list_slab_objects(struct kmem_cache *s, struct page *page,
							      const char *text)
				{
				#ifdef CONFIG_SLUB_DEBUG
					void *addr = page_address(page);
					void *p;
					/* One bit per object in the page. */
					unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
								     sizeof(long), GFP_ATOMIC);
					if (!map)
						return;
					slab_err(s, page, text, s->name);
					slab_lock(page);

					get_map(s, page, map);
					for_each_object(p, s, addr, page->objects) {

						if (!test_bit(slab_index(p, s, addr), map)) {
							pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
							print_tracking(s, p);
						}
					}
					slab_unlock(page);
					kfree(map);
				#endif
				}
				3709
				3710	/*
				3711	* Attempt to free all partial slabs on a node.
				3712	* This is called from __kmem_cache_shutdown(). We must take list_lock
				3713	* because sysfs file might still access partial list after the shutdowning.
				3714	*/
				3715	static void free_partial(struct kmem_cache s, struct kmem_cache_node n)
				3716	{
				3717	LIST_HEAD(discard);
				3718	struct page page, h;
				3719
				3720	BUG_ON(irqs_disabled());
				3721	spin_lock_irq(&n->list_lock);
				3722	list_for_each_entry_safe(page, h, &n->partial, lru) {
				3723	if (!page->inuse) {
				3724	remove_partial(n, page);
				3725	list_add(&page->lru, &discard);
				3726	} else {
				3727	list_slab_objects(s, page,
				3728	"Objects remaining in %s on __kmem_cache_shutdown()");
				3729	}
				3730	}
				3731	spin_unlock_irq(&n->list_lock);
				3732
				3733	list_for_each_entry_safe(page, h, &discard, lru)
				3734	discard_slab(s, page);
				3735	}
				3736
				3737	/*
				3738	* Release all resources used by a slab cache.
				3739	*/
				3740	int __kmem_cache_shutdown(struct kmem_cache *s)
				3741	{
				3742	int node;
				3743	struct kmem_cache_node *n;
				3744
				3745	flush_all(s);
				3746	/* Attempt to free all objects */
				3747	for_each_kmem_cache_node(s, node, n) {
				3748	free_partial(s, n);
				3749	if (n->nr_partial \|\| slabs_node(s, node))
				3750	return 1;
				3751	}
				3752	sysfs_slab_remove(s);
				3753	return 0;
				3754	}
				3755
				3756	/********************************************************************
				3757	* Kmalloc subsystem
				3758	*******************************************************************/
				3759
				3760	static int __init setup_slub_min_order(char *str)
				3761	{
				3762	get_option(&str, &slub_min_order);
				3763
				3764	return 1;
				3765	}
				3766
				3767	__setup("slub_min_order=", setup_slub_min_order);
				3768
				3769	static int __init setup_slub_max_order(char *str)
				3770	{
				3771	get_option(&str, &slub_max_order);
				3772	slub_max_order = min(slub_max_order, MAX_ORDER - 1);
				3773
				3774	return 1;
				3775	}
				3776
				3777	__setup("slub_max_order=", setup_slub_max_order);
				3778
				3779	static int __init setup_slub_min_objects(char *str)
				3780	{
				3781	get_option(&str, &slub_min_objects);
				3782
				3783	return 1;
				3784	}
				3785
				3786	__setup("slub_min_objects=", setup_slub_min_objects);
				3787
				3788	void *__kmalloc(size_t size, gfp_t flags)
				3789	{
				3790	struct kmem_cache *s;
				3791	void *ret;
				3792
				3793	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				3794	return kmalloc_large(size, flags);
				3795
				3796	s = kmalloc_slab(size, flags);
				3797
				3798	if (unlikely(ZERO_OR_NULL_PTR(s)))
				3799	return s;
				3800
				3801	ret = slab_alloc(s, flags, _RET_IP_);
				3802
				3803	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
				3804
				3805	kasan_kmalloc(s, ret, size, flags);
				3806
				3807	return ret;
				3808	}
				3809	EXPORT_SYMBOL(__kmalloc);
				3810
				3811	#ifdef CONFIG_NUMA
				3812	static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
				3813	{
				3814	struct page *page;
				3815	void *ptr = NULL;
				3816
				3817	flags \|= __GFP_COMP;
				3818	page = alloc_pages_node(node, flags, get_order(size));
				3819	if (page)
				3820	ptr = page_address(page);
				3821
				3822	kmalloc_large_node_hook(ptr, size, flags);
				3823	return ptr;
				3824	}
				3825
				3826	void *__kmalloc_node(size_t size, gfp_t flags, int node)
				3827	{
				3828	struct kmem_cache *s;
				3829	void *ret;
				3830
				3831	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
				3832	ret = kmalloc_large_node(size, flags, node);
				3833
				3834	trace_kmalloc_node(_RET_IP_, ret,
				3835	size, PAGE_SIZE << get_order(size),
				3836	flags, node);
				3837
				3838	return ret;
				3839	}
				3840
				3841	s = kmalloc_slab(size, flags);
				3842
				3843	if (unlikely(ZERO_OR_NULL_PTR(s)))
				3844	return s;
				3845
				3846	ret = slab_alloc_node(s, flags, node, _RET_IP_);
				3847
				3848	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
				3849
				3850	kasan_kmalloc(s, ret, size, flags);
				3851
				3852	return ret;
				3853	}
				3854	EXPORT_SYMBOL(__kmalloc_node);
				3855	#endif
				3856
				3857	#ifdef CONFIG_HARDENED_USERCOPY
				3858	/*
				3859	* Rejects objects that are incorrectly sized.
				3860	*
				3861	* Returns NULL if check passes, otherwise const char * to name of cache
				3862	* to indicate an error.
				3863	*/
				3864	const char __check_heap_object(const void ptr, unsigned long n,
				3865	struct page *page)
				3866	{
				3867	struct kmem_cache *s;
				3868	unsigned long offset;
				3869	size_t object_size;
				3870
				3871	/* Find object and usable object size. */
				3872	s = page->slab_cache;
				3873	object_size = slab_ksize(s);
				3874
				3875	/* Reject impossible pointers. */
				3876	if (ptr < page_address(page))
				3877	return s->name;
				3878
				3879	/* Find offset within object. */
				3880	offset = (ptr - page_address(page)) % s->size;
				3881
				3882	/* Adjust for redzone and reject if within the redzone. */
				3883	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
				3884	if (offset < s->red_left_pad)
				3885	return s->name;
				3886	offset -= s->red_left_pad;
				3887	}
				3888
				3889	/* Allow address range falling entirely within object size. */
				3890	if (offset <= object_size && n <= object_size - offset)
				3891	return NULL;
				3892
				3893	return s->name;
				3894	}
				3895	#endif /* CONFIG_HARDENED_USERCOPY */
				3896
				3897	static size_t __ksize(const void *object)
				3898	{
				3899	struct page *page;
				3900
				3901	if (unlikely(object == ZERO_SIZE_PTR))
				3902	return 0;
				3903
				3904	page = virt_to_head_page(object);
				3905
				3906	if (unlikely(!PageSlab(page))) {
				3907	WARN_ON(!PageCompound(page));
				3908	return PAGE_SIZE << compound_order(page);
				3909	}
				3910
				3911	return slab_ksize(page->slab_cache);
				3912	}
				3913
				3914	size_t ksize(const void *object)
				3915	{
				3916	size_t size = __ksize(object);
				3917	/* We assume that ksize callers could use whole allocated area,
				3918	* so we need to unpoison this area.
				3919	*/
				3920	kasan_unpoison_shadow(object, size);
				3921	return size;
				3922	}
				3923	EXPORT_SYMBOL(ksize);
				3924
				3925	void kfree(const void *x)
				3926	{
				3927	struct page *page;
				3928	void object = (void )x;
				3929
				3930	trace_kfree(_RET_IP_, x);
				3931
				3932	if (unlikely(ZERO_OR_NULL_PTR(x)))
				3933	return;
				3934
				3935	page = virt_to_head_page(x);
				3936	if (unlikely(!PageSlab(page))) {
				3937	BUG_ON(!PageCompound(page));
				3938	kfree_hook(x);
				3939	__free_pages(page, compound_order(page));
				3940	return;
				3941	}
				3942	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
				3943	}
				3944	EXPORT_SYMBOL(kfree);
				3945
				3946	#define SHRINK_PROMOTE_MAX 32
				3947
				3948	/*
				3949	* kmem_cache_shrink discards empty slabs and promotes the slabs filled
				3950	* up most to the head of the partial lists. New allocations will then
				3951	* fill those up and thus they can be removed from the partial lists.
				3952	*
				3953	* The slabs with the least items are placed last. This results in them
				3954	* being allocated from last increasing the chance that the last objects
				3955	* are freed in them.
				3956	*/
				3957	int __kmem_cache_shrink(struct kmem_cache *s)
				3958	{
				3959	int node;
				3960	int i;
				3961	struct kmem_cache_node *n;
				3962	struct page *page;
				3963	struct page *t;
				3964	struct list_head discard;
				3965	struct list_head promote[SHRINK_PROMOTE_MAX];
				3966	unsigned long flags;
				3967	int ret = 0;
				3968
				3969	flush_all(s);
				3970	for_each_kmem_cache_node(s, node, n) {
				3971	INIT_LIST_HEAD(&discard);
				3972	for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
				3973	INIT_LIST_HEAD(promote + i);
				3974
				3975	spin_lock_irqsave(&n->list_lock, flags);
				3976
				3977	/*
				3978	* Build lists of slabs to discard or promote.
				3979	*
				3980	* Note that concurrent frees may occur while we hold the
				3981	* list_lock. page->inuse here is the upper limit.
				3982	*/
				3983	list_for_each_entry_safe(page, t, &n->partial, lru) {
				3984	int free = page->objects - page->inuse;
				3985
				3986	/* Do not reread page->inuse */
				3987	barrier();
				3988
				3989	/* We do not keep full slabs on the list */
				3990	BUG_ON(free <= 0);
				3991
				3992	if (free == page->objects) {
				3993	list_move(&page->lru, &discard);
				3994	n->nr_partial--;
				3995	} else if (free <= SHRINK_PROMOTE_MAX)
				3996	list_move(&page->lru, promote + free - 1);
				3997	}
				3998
				3999	/*
				4000	* Promote the slabs filled up most to the head of the
				4001	* partial list.
				4002	*/
				4003	for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
				4004	list_splice(promote + i, &n->partial);
				4005
				4006	spin_unlock_irqrestore(&n->list_lock, flags);
				4007
				4008	/* Release empty slabs */
				4009	list_for_each_entry_safe(page, t, &discard, lru)
				4010	discard_slab(s, page);
				4011
				4012	if (slabs_node(s, node))
				4013	ret = 1;
				4014	}
				4015
				4016	return ret;
				4017	}
				4018
				4019	#ifdef CONFIG_MEMCG
				/*
				 * Second stage of memcg cache deactivation, passed as the callback to
				 * slab_deactivate_memcg_cache_rcu_sched().
				 *
				 * Called with all the locks held after a sched RCU grace period.
				 * Even when @s is empty after shrinking, allocations may already be
				 * in flight, so @s itself cannot be destroyed until the associated
				 * memcg is released.
				 *
				 * What can be done is removing the sysfs files of an empty cache: each
				 * cache has many interface files that are of little use for an empty,
				 * draining cache, and systems with lots of memory and transient cgroups
				 * could otherwise pile up millions of them.
				 */
				static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
				{
					/* Non-zero means some node still holds slabs. */
					if (__kmem_cache_shrink(s))
						return;

					sysfs_slab_remove(s);
				}
				4037
				4038	void __kmemcg_cache_deactivate(struct kmem_cache *s)
				4039	{
				4040	/*
				4041	* Disable empty slabs caching. Used to avoid pinning offline
				4042	* memory cgroups by kmem pages that can be freed.
				4043	*/
				4044	slub_set_cpu_partial(s, 0);
				4045	s->min_partial = 0;
				4046
				4047	/*
				4048	* s->cpu_partial is checked locklessly (see put_cpu_partial), so
				4049	* we have to make sure the change is visible before shrinking.
				4050	*/
				4051	slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
				4052	}
				4053	#endif
				4054
				4055	static int slab_mem_going_offline_callback(void *arg)
				4056	{
				4057	struct kmem_cache *s;
				4058
				4059	mutex_lock(&slab_mutex);
				4060	list_for_each_entry(s, &slab_caches, list)
				4061	__kmem_cache_shrink(s);
				4062	mutex_unlock(&slab_mutex);
				4063
				4064	return 0;
				4065	}
				4066
				4067	static void slab_mem_offline_callback(void *arg)
				4068	{
				4069	struct kmem_cache_node *n;
				4070	struct kmem_cache *s;
				4071	struct memory_notify *marg = arg;
				4072	int offline_node;
				4073
				4074	offline_node = marg->status_change_nid_normal;
				4075
				4076	/*
				4077	* If the node still has available memory. we need kmem_cache_node
				4078	* for it yet.
				4079	*/
				4080	if (offline_node < 0)
				4081	return;
				4082
				4083	mutex_lock(&slab_mutex);
				4084	list_for_each_entry(s, &slab_caches, list) {
				4085	n = get_node(s, offline_node);
				4086	if (n) {
				4087	/*
				4088	* if n->nr_slabs > 0, slabs still exist on the node
				4089	* that is going down. We were unable to free them,
				4090	* and offline_pages() function shouldn't call this
				4091	* callback. So, we must fail.
				4092	*/
				4093	BUG_ON(slabs_node(s, offline_node));
				4094
				4095	s->node[offline_node] = NULL;
				4096	kmem_cache_free(kmem_cache_node, n);
				4097	}
				4098	}
				4099	mutex_unlock(&slab_mutex);
				4100	}
				4101
				4102	static int slab_mem_going_online_callback(void *arg)
				4103	{
				4104	struct kmem_cache_node *n;
				4105	struct kmem_cache *s;
				4106	struct memory_notify *marg = arg;
				4107	int nid = marg->status_change_nid_normal;
				4108	int ret = 0;
				4109
				4110	/*
				4111	* If the node's memory is already available, then kmem_cache_node is
				4112	* already created. Nothing to do.
				4113	*/
				4114	if (nid < 0)
				4115	return 0;
				4116
				4117	/*
				4118	* We are bringing a node online. No memory is available yet. We must
				4119	* allocate a kmem_cache_node structure in order to bring the node
				4120	* online.
				4121	*/
				4122	mutex_lock(&slab_mutex);
				4123	list_for_each_entry(s, &slab_caches, list) {
				4124	/*
				4125	* XXX: kmem_cache_alloc_node will fallback to other nodes
				4126	* since memory is not yet available from the node that
				4127	* is brought up.
				4128	*/
				4129	n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
				4130	if (!n) {
				4131	ret = -ENOMEM;
				4132	goto out;
				4133	}
				4134	init_kmem_cache_node(n);
				4135	s->node[nid] = n;
				4136	}
				4137	out:
				4138	mutex_unlock(&slab_mutex);
				4139	return ret;
				4140	}
				4141
				4142	static int slab_memory_callback(struct notifier_block *self,
				4143	unsigned long action, void *arg)
				4144	{
				4145	int ret = 0;
				4146
				4147	switch (action) {
				4148	case MEM_GOING_ONLINE:
				4149	ret = slab_mem_going_online_callback(arg);
				4150	break;
				4151	case MEM_GOING_OFFLINE:
				4152	ret = slab_mem_going_offline_callback(arg);
				4153	break;
				4154	case MEM_OFFLINE:
				4155	case MEM_CANCEL_ONLINE:
				4156	slab_mem_offline_callback(arg);
				4157	break;
				4158	case MEM_ONLINE:
				4159	case MEM_CANCEL_OFFLINE:
				4160	break;
				4161	}
				4162	if (ret)
				4163	ret = notifier_from_errno(ret);
				4164	else
				4165	ret = NOTIFY_OK;
				4166	return ret;
				4167	}
				4168
				4169	static struct notifier_block slab_memory_callback_nb = {
				4170	.notifier_call = slab_memory_callback,
				4171	.priority = SLAB_CALLBACK_PRI,
				4172	};
				4173
				4174	/********************************************************************
				4175	* Basic setup of slabs
				4176	*******************************************************************/
				4177
				4178	/*
				4179	* Used for early kmem_cache structures that were allocated using
				4180	* the page allocator. Allocate them properly then fix up the pointers
				4181	* that may be pointing to the wrong kmem_cache structure.
				4182	*/
				4183
				4184	static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
				4185	{
				4186	int node;
				4187	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
				4188	struct kmem_cache_node *n;
				4189
				4190	memcpy(s, static_cache, kmem_cache->object_size);
				4191
				4192	/*
				4193	* This runs very early, and only the boot processor is supposed to be
				4194	* up. Even if it weren't true, IRQs are not up so we couldn't fire
				4195	* IPIs around.
				4196	*/
				4197	__flush_cpu_slab(s, smp_processor_id());
				4198	for_each_kmem_cache_node(s, node, n) {
				4199	struct page *p;
				4200
				4201	list_for_each_entry(p, &n->partial, lru)
				4202	p->slab_cache = s;
				4203
				4204	#ifdef CONFIG_SLUB_DEBUG
				4205	list_for_each_entry(p, &n->full, lru)
				4206	p->slab_cache = s;
				4207	#endif
				4208	}
				4209	slab_init_memcg_params(s);
				4210	list_add(&s->list, &slab_caches);
				4211	memcg_link_cache(s);
				4212	return s;
				4213	}
				4214
				4215	void __init kmem_cache_init(void)
				4216	{
				4217	static __initdata struct kmem_cache boot_kmem_cache,
				4218	boot_kmem_cache_node;
				4219
				4220	if (debug_guardpage_minorder())
				4221	slub_max_order = 0;
				4222
				4223	kmem_cache_node = &boot_kmem_cache_node;
				4224	kmem_cache = &boot_kmem_cache;
				4225
				4226	create_boot_cache(kmem_cache_node, "kmem_cache_node",
				4227	sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
				4228
				4229	register_hotmemory_notifier(&slab_memory_callback_nb);
				4230
				4231	/* Able to allocate the per node structures */
				4232	slab_state = PARTIAL;
				4233
				4234	create_boot_cache(kmem_cache, "kmem_cache",
				4235	offsetof(struct kmem_cache, node) +
				4236	nr_node_ids * sizeof(struct kmem_cache_node *),
				4237	SLAB_HWCACHE_ALIGN);
				4238
				4239	kmem_cache = bootstrap(&boot_kmem_cache);
				4240
				4241	/*
				4242	* Allocate kmem_cache_node properly from the kmem_cache slab.
				4243	* kmem_cache_node is separately allocated so no need to
				4244	* update any list pointers.
				4245	*/
				4246	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
				4247
				4248	/* Now we can use the kmem_cache to allocate kmalloc slabs */
				4249	setup_kmalloc_cache_index_table();
				4250	create_kmalloc_caches(0);
				4251
				4252	/* Setup random freelists for each cache */
				4253	init_freelist_randomization();
				4254
				4255	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				4256	slub_cpu_dead);
				4257
				4258	pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n",
				4259	cache_line_size(),
				4260	slub_min_order, slub_max_order, slub_min_objects,
				4261	nr_cpu_ids, nr_node_ids);
				4262	}
				4263
				4264	void __init kmem_cache_init_late(void)
				4265	{
				4266	}
				4267
				4268	struct kmem_cache *
				4269	__kmem_cache_alias(const char *name, size_t size, size_t align,
				4270	unsigned long flags, void (ctor)(void ))
				4271	{
				4272	struct kmem_cache s, c;
				4273
				4274	s = find_mergeable(size, align, flags, name, ctor);
				4275	if (s) {
				4276	s->refcount++;
				4277
				4278	/*
				4279	* Adjust the object sizes so that we clear
				4280	* the complete object on kzalloc.
				4281	*/
				4282	s->object_size = max(s->object_size, (int)size);
				4283	s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
				4284
				4285	for_each_memcg_cache(c, s) {
				4286	c->object_size = s->object_size;
				4287	c->inuse = max_t(int, c->inuse,
				4288	ALIGN(size, sizeof(void *)));
				4289	}
				4290
				4291	if (sysfs_slab_alias(s, name)) {
				4292	s->refcount--;
				4293	s = NULL;
				4294	}
				4295	}
				4296
				4297	return s;
				4298	}
				4299
				4300	int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
				4301	{
				4302	int err;
				4303
				4304	err = kmem_cache_open(s, flags);
				4305	if (err)
				4306	return err;
				4307
				4308	/* Mutex is not taken during early boot */
				4309	if (slab_state <= UP)
				4310	return 0;
				4311
				4312	memcg_propagate_slab_attrs(s);
				4313	err = sysfs_slab_add(s);
				4314	if (err)
				4315	__kmem_cache_release(s);
				4316
				4317	return err;
				4318	}
				4319
				4320	void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
				4321	{
				4322	struct kmem_cache *s;
				4323	void *ret;
				4324
				4325	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
				4326	return kmalloc_large(size, gfpflags);
				4327
				4328	s = kmalloc_slab(size, gfpflags);
				4329
				4330	if (unlikely(ZERO_OR_NULL_PTR(s)))
				4331	return s;
				4332
				4333	ret = slab_alloc(s, gfpflags, caller);
				4334
				4335	/* Honor the call site pointer we received. */
				4336	trace_kmalloc(caller, ret, size, s->size, gfpflags);
				4337
				4338	return ret;
				4339	}
				4340
				4341	#ifdef CONFIG_NUMA
				4342	void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
				4343	int node, unsigned long caller)
				4344	{
				4345	struct kmem_cache *s;
				4346	void *ret;
				4347
				4348	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
				4349	ret = kmalloc_large_node(size, gfpflags, node);
				4350
				4351	trace_kmalloc_node(caller, ret,
				4352	size, PAGE_SIZE << get_order(size),
				4353	gfpflags, node);
				4354
				4355	return ret;
				4356	}
				4357
				4358	s = kmalloc_slab(size, gfpflags);
				4359
				4360	if (unlikely(ZERO_OR_NULL_PTR(s)))
				4361	return s;
				4362
				4363	ret = slab_alloc_node(s, gfpflags, node, caller);
				4364
				4365	/* Honor the call site pointer we received. */
				4366	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
				4367
				4368	return ret;
				4369	}
				4370	#endif
				4371
				4372	#ifdef CONFIG_SYSFS
				4373	static int count_inuse(struct page *page)
				4374	{
				4375	return page->inuse;
				4376	}
				4377
				4378	static int count_total(struct page *page)
				4379	{
				4380	return page->objects;
				4381	}
				4382	#endif
				4383
				4384	#ifdef CONFIG_SLUB_DEBUG
				4385	static int validate_slab(struct kmem_cache s, struct page page,
				4386	unsigned long *map)
				4387	{
				4388	void *p;
				4389	void *addr = page_address(page);
				4390
				4391	if (!check_slab(s, page) \|\|
				4392	!on_freelist(s, page, NULL))
				4393	return 0;
				4394
				4395	/* Now we know that a valid freelist exists */
				4396	bitmap_zero(map, page->objects);
				4397
				4398	get_map(s, page, map);
				4399	for_each_object(p, s, addr, page->objects) {
				4400	if (test_bit(slab_index(p, s, addr), map))
				4401	if (!check_object(s, page, p, SLUB_RED_INACTIVE))
				4402	return 0;
				4403	}
				4404
				4405	for_each_object(p, s, addr, page->objects)
				4406	if (!test_bit(slab_index(p, s, addr), map))
				4407	if (!check_object(s, page, p, SLUB_RED_ACTIVE))
				4408	return 0;
				4409	return 1;
				4410	}
				4411
				/* Run validate_slab() on @page with the slab lock held; the result is discarded. */
				static void validate_slab_slab(struct kmem_cache *s, struct page *page,
							       unsigned long *map)
				{
					slab_lock(page);
					validate_slab(s, page, map);
					slab_unlock(page);
				}
				4419
				4420	static int validate_slab_node(struct kmem_cache *s,
				4421	struct kmem_cache_node n, unsigned long map)
				4422	{
				4423	unsigned long count = 0;
				4424	struct page *page;
				4425	unsigned long flags;
				4426
				4427	spin_lock_irqsave(&n->list_lock, flags);
				4428
				4429	list_for_each_entry(page, &n->partial, lru) {
				4430	validate_slab_slab(s, page, map);
				4431	count++;
				4432	}
				4433	if (count != n->nr_partial)
				4434	pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
				4435	s->name, count, n->nr_partial);
				4436
				4437	if (!(s->flags & SLAB_STORE_USER))
				4438	goto out;
				4439
				4440	list_for_each_entry(page, &n->full, lru) {
				4441	validate_slab_slab(s, page, map);
				4442	count++;
				4443	}
				4444	if (count != atomic_long_read(&n->nr_slabs))
				4445	pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
				4446	s->name, count, atomic_long_read(&n->nr_slabs));
				4447
				4448	out:
				4449	spin_unlock_irqrestore(&n->list_lock, flags);
				4450	return count;
				4451	}
				4452
				4453	static long validate_slab_cache(struct kmem_cache *s)
				4454	{
				4455	int node;
				4456	unsigned long count = 0;
				4457	unsigned long map = kmalloc(BITS_TO_LONGS(oo_objects(s->max))
				4458	sizeof(unsigned long), GFP_KERNEL);
				4459	struct kmem_cache_node *n;
				4460
				4461	if (!map)
				4462	return -ENOMEM;
				4463
				4464	flush_all(s);
				4465	for_each_kmem_cache_node(s, node, n)
				4466	count += validate_slab_node(s, n, map);
				4467	kfree(map);
				4468	return count;
				4469	}
				4470	/*
				4471	* Generate lists of code addresses where slabcache objects are allocated
				4472	* and freed.
				4473	*/
				4474
				4475	struct location {
				4476	unsigned long count;
				4477	unsigned long addr;
				4478	long long sum_time;
				4479	long min_time;
				4480	long max_time;
				4481	long min_pid;
				4482	long max_pid;
				4483	DECLARE_BITMAP(cpus, NR_CPUS);
				4484	nodemask_t nodes;
				4485	};
				4486
				/* Growable array of struct location, kept sorted by address. */
				struct loc_track {
					unsigned long max;	/* entries the loc buffer has room for */
					unsigned long count;	/* entries currently in use */
					struct location *loc;	/* page-allocated buffer, see alloc_loc_track() */
				};
				4492
				4493	static void free_loc_track(struct loc_track *t)
				4494	{
				4495	if (t->max)
				4496	free_pages((unsigned long)t->loc,
				4497	get_order(sizeof(struct location) * t->max));
				4498	}
				4499
				4500	static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
				4501	{
				4502	struct location *l;
				4503	int order;
				4504
				4505	order = get_order(sizeof(struct location) * max);
				4506
				4507	l = (void *)__get_free_pages(flags, order);
				4508	if (!l)
				4509	return 0;
				4510
				4511	if (t->count) {
				4512	memcpy(l, t->loc, sizeof(struct location) * t->count);
				4513	free_loc_track(t);
				4514	}
				4515	t->max = max;
				4516	t->loc = l;
				4517	return 1;
				4518	}
				4519
				4520	static int add_location(struct loc_track t, struct kmem_cache s,
				4521	const struct track *track)
				4522	{
				4523	long start, end, pos;
				4524	struct location *l;
				4525	unsigned long caddr;
				4526	unsigned long age = jiffies - track->when;
				4527
				4528	start = -1;
				4529	end = t->count;
				4530
				4531	for ( ; ; ) {
				4532	pos = start + (end - start + 1) / 2;
				4533
				4534	/*
				4535	* There is nothing at "end". If we end up there
				4536	* we need to add something to before end.
				4537	*/
				4538	if (pos == end)
				4539	break;
				4540
				4541	caddr = t->loc[pos].addr;
				4542	if (track->addr == caddr) {
				4543
				4544	l = &t->loc[pos];
				4545	l->count++;
				4546	if (track->when) {
				4547	l->sum_time += age;
				4548	if (age < l->min_time)
				4549	l->min_time = age;
				4550	if (age > l->max_time)
				4551	l->max_time = age;
				4552
				4553	if (track->pid < l->min_pid)
				4554	l->min_pid = track->pid;
				4555	if (track->pid > l->max_pid)
				4556	l->max_pid = track->pid;
				4557
				4558	cpumask_set_cpu(track->cpu,
				4559	to_cpumask(l->cpus));
				4560	}
				4561	node_set(page_to_nid(virt_to_page(track)), l->nodes);
				4562	return 1;
				4563	}
				4564
				4565	if (track->addr < caddr)
				4566	end = pos;
				4567	else
				4568	start = pos;
				4569	}
				4570
				4571	/*
				4572	* Not found. Insert new tracking element.
				4573	*/
				4574	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
				4575	return 0;
				4576
				4577	l = t->loc + pos;
				4578	if (pos < t->count)
				4579	memmove(l + 1, l,
				4580	(t->count - pos) * sizeof(struct location));
				4581	t->count++;
				4582	l->count = 1;
				4583	l->addr = track->addr;
				4584	l->sum_time = age;
				4585	l->min_time = age;
				4586	l->max_time = age;
				4587	l->min_pid = track->pid;
				4588	l->max_pid = track->pid;
				4589	cpumask_clear(to_cpumask(l->cpus));
				4590	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
				4591	nodes_clear(l->nodes);
				4592	node_set(page_to_nid(virt_to_page(track)), l->nodes);
				4593	return 1;
				4594	}
				4595
				4596	static void process_slab(struct loc_track t, struct kmem_cache s,
				4597	struct page *page, enum track_item alloc,
				4598	unsigned long *map)
				4599	{
				4600	void *addr = page_address(page);
				4601	void *p;
				4602
				4603	bitmap_zero(map, page->objects);
				4604	get_map(s, page, map);
				4605
				4606	for_each_object(p, s, addr, page->objects)
				4607	if (!test_bit(slab_index(p, s, addr), map))
				4608	add_location(t, s, get_track(s, p, alloc));
				4609	}
				4610
				4611	static int list_locations(struct kmem_cache s, char buf,
				4612	enum track_item alloc)
				4613	{
				4614	int len = 0;
				4615	unsigned long i;
				4616	struct loc_track t = { 0, 0, NULL };
				4617	int node;
				4618	unsigned long map = kmalloc(BITS_TO_LONGS(oo_objects(s->max))
				4619	sizeof(unsigned long), GFP_KERNEL);
				4620	struct kmem_cache_node *n;
				4621
				4622	if (!map \|\| !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
				4623	GFP_KERNEL)) {
				4624	kfree(map);
				4625	return sprintf(buf, "Out of memory\n");
				4626	}
				4627	/* Push back cpu slabs */
				4628	flush_all(s);
				4629
				4630	for_each_kmem_cache_node(s, node, n) {
				4631	unsigned long flags;
				4632	struct page *page;
				4633
				4634	if (!atomic_long_read(&n->nr_slabs))
				4635	continue;
				4636
				4637	spin_lock_irqsave(&n->list_lock, flags);
				4638	list_for_each_entry(page, &n->partial, lru)
				4639	process_slab(&t, s, page, alloc, map);
				4640	list_for_each_entry(page, &n->full, lru)
				4641	process_slab(&t, s, page, alloc, map);
				4642	spin_unlock_irqrestore(&n->list_lock, flags);
				4643	}
				4644
				4645	for (i = 0; i < t.count; i++) {
				4646	struct location *l = &t.loc[i];
				4647
				4648	if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
				4649	break;
				4650	len += sprintf(buf + len, "%7ld ", l->count);
				4651
				4652	if (l->addr)
				4653	len += sprintf(buf + len, "%pS", (void *)l->addr);
				4654	else
				4655	len += sprintf(buf + len, "<not-available>");
				4656
				4657	if (l->sum_time != l->min_time) {
				4658	len += sprintf(buf + len, " age=%ld/%ld/%ld",
				4659	l->min_time,
				4660	(long)div_u64(l->sum_time, l->count),
				4661	l->max_time);
				4662	} else
				4663	len += sprintf(buf + len, " age=%ld",
				4664	l->min_time);
				4665
				4666	if (l->min_pid != l->max_pid)
				4667	len += sprintf(buf + len, " pid=%ld-%ld",
				4668	l->min_pid, l->max_pid);
				4669	else
				4670	len += sprintf(buf + len, " pid=%ld",
				4671	l->min_pid);
				4672
				4673	if (num_online_cpus() > 1 &&
				4674	!cpumask_empty(to_cpumask(l->cpus)) &&
				4675	len < PAGE_SIZE - 60)
				4676	len += scnprintf(buf + len, PAGE_SIZE - len - 50,
				4677	" cpus=%*pbl",
				4678	cpumask_pr_args(to_cpumask(l->cpus)));
				4679
				4680	if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
				4681	len < PAGE_SIZE - 60)
				4682	len += scnprintf(buf + len, PAGE_SIZE - len - 50,
				4683	" nodes=%*pbl",
				4684	nodemask_pr_args(&l->nodes));
				4685
				4686	len += sprintf(buf + len, "\n");
				4687	}
				4688
				4689	free_loc_track(&t);
				4690	kfree(map);
				4691	if (!t.count)
				4692	len += sprintf(buf, "No data\n");
				4693	return len;
				4694	}
				4695	#endif
				4696
				4697	#ifdef SLUB_RESILIENCY_TEST
				4698	static void __init resiliency_test(void)
				4699	{
				4700	u8 *p;
				4701
				4702	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 \|\| KMALLOC_SHIFT_HIGH < 10);
				4703
				4704	pr_err("SLUB resiliency testing\n");
				4705	pr_err("-----------------------\n");
				4706	pr_err("A. Corruption after allocation\n");
				4707
				4708	p = kzalloc(16, GFP_KERNEL);
				4709	p[16] = 0x12;
				4710	pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
				4711	p + 16);
				4712
				4713	validate_slab_cache(kmalloc_caches[4]);
				4714
				4715	/* Hmmm... The next two are dangerous */
				4716	p = kzalloc(32, GFP_KERNEL);
				4717	p[32 + sizeof(void *)] = 0x34;
				4718	pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
				4719	p);
				4720	pr_err("If allocated object is overwritten then not detectable\n\n");
				4721
				4722	validate_slab_cache(kmalloc_caches[5]);
				4723	p = kzalloc(64, GFP_KERNEL);
				4724	p += 64 + (get_cycles() & 0xff) * sizeof(void *);
				4725	*p = 0x56;
				4726	pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
				4727	p);
				4728	pr_err("If allocated object is overwritten then not detectable\n\n");
				4729	validate_slab_cache(kmalloc_caches[6]);
				4730
				4731	pr_err("\nB. Corruption after free\n");
				4732	p = kzalloc(128, GFP_KERNEL);
				4733	kfree(p);
				4734	*p = 0x78;
				4735	pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
				4736	validate_slab_cache(kmalloc_caches[7]);
				4737
				4738	p = kzalloc(256, GFP_KERNEL);
				4739	kfree(p);
				4740	p[50] = 0x9a;
				4741	pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
				4742	validate_slab_cache(kmalloc_caches[8]);
				4743
				4744	p = kzalloc(512, GFP_KERNEL);
				4745	kfree(p);
				4746	p[512] = 0xab;
				4747	pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
				4748	validate_slab_cache(kmalloc_caches[9]);
				4749	}
				4750	#else
				4751	#ifdef CONFIG_SYSFS
				/* No-op stand-in used when SLUB_RESILIENCY_TEST is not defined. */
				static void resiliency_test(void) {}
				4753	#endif
				4754	#endif
				4755
				4756	#ifdef CONFIG_SYSFS
				/*
				 * Selectors for show_slab_objects(). Each enumerator is a bit position;
				 * the SO_* masks below are what callers OR together.
				 */
				enum slab_stat_type {
					SL_ALL,		/* All slabs */
					SL_PARTIAL,	/* Only partially allocated slabs */
					SL_CPU,		/* Only slabs used for cpu caches */
					SL_OBJECTS,	/* Determine allocated objects not slabs */
					SL_TOTAL	/* Determine object capacity not slabs */
				};

				#define SO_ALL		(1 << SL_ALL)
				#define SO_PARTIAL	(1 << SL_PARTIAL)
				#define SO_CPU		(1 << SL_CPU)
				#define SO_OBJECTS	(1 << SL_OBJECTS)
				#define SO_TOTAL	(1 << SL_TOTAL)
				4770
				4771	#ifdef CONFIG_MEMCG
				4772	static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
				4773
				4774	static int __init setup_slub_memcg_sysfs(char *str)
				4775	{
				4776	int v;
				4777
				4778	if (get_option(&str, &v) > 0)
				4779	memcg_sysfs_enabled = v;
				4780
				4781	return 1;
				4782	}
				4783
				4784	__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
				4785	#endif
				4786
				4787	static ssize_t show_slab_objects(struct kmem_cache *s,
				4788	char *buf, unsigned long flags)
				4789	{
				4790	unsigned long total = 0;
				4791	int node;
				4792	int x;
				4793	unsigned long *nodes;
				4794
				4795	nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
				4796	if (!nodes)
				4797	return -ENOMEM;
				4798
				4799	if (flags & SO_CPU) {
				4800	int cpu;
				4801
				4802	for_each_possible_cpu(cpu) {
				4803	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
				4804	cpu);
				4805	int node;
				4806	struct page *page;
				4807
				4808	page = READ_ONCE(c->page);
				4809	if (!page)
				4810	continue;
				4811
				4812	node = page_to_nid(page);
				4813	if (flags & SO_TOTAL)
				4814	x = page->objects;
				4815	else if (flags & SO_OBJECTS)
				4816	x = page->inuse;
				4817	else
				4818	x = 1;
				4819
				4820	total += x;
				4821	nodes[node] += x;
				4822
				4823	page = slub_percpu_partial_read_once(c);
				4824	if (page) {
				4825	node = page_to_nid(page);
				4826	if (flags & SO_TOTAL)
				4827	WARN_ON_ONCE(1);
				4828	else if (flags & SO_OBJECTS)
				4829	WARN_ON_ONCE(1);
				4830	else
				4831	x = page->pages;
				4832	total += x;
				4833	nodes[node] += x;
				4834	}
				4835	}
				4836	}
				4837
				4838	/*
				4839	* It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
				4840	* already held which will conflict with an existing lock order:
				4841	*
				4842	* mem_hotplug_lock->slab_mutex->kernfs_mutex
				4843	*
				4844	* We don't really need mem_hotplug_lock (to hold off
				4845	* slab_mem_going_offline_callback) here because slab's memory hot
				4846	* unplug code doesn't destroy the kmem_cache->node[] data.
				4847	*/
				4848
				4849	#ifdef CONFIG_SLUB_DEBUG
				4850	if (flags & SO_ALL) {
				4851	struct kmem_cache_node *n;
				4852
				4853	for_each_kmem_cache_node(s, node, n) {
				4854
				4855	if (flags & SO_TOTAL)
				4856	x = atomic_long_read(&n->total_objects);
				4857	else if (flags & SO_OBJECTS)
				4858	x = atomic_long_read(&n->total_objects) -
				4859	count_partial(n, count_free);
				4860	else
				4861	x = atomic_long_read(&n->nr_slabs);
				4862	total += x;
				4863	nodes[node] += x;
				4864	}
				4865
				4866	} else
				4867	#endif
				4868	if (flags & SO_PARTIAL) {
				4869	struct kmem_cache_node *n;
				4870
				4871	for_each_kmem_cache_node(s, node, n) {
				4872	if (flags & SO_TOTAL)
				4873	x = count_partial(n, count_total);
				4874	else if (flags & SO_OBJECTS)
				4875	x = count_partial(n, count_inuse);
				4876	else
				4877	x = n->nr_partial;
				4878	total += x;
				4879	nodes[node] += x;
				4880	}
				4881	}
				4882	x = sprintf(buf, "%lu", total);
				4883	#ifdef CONFIG_NUMA
				4884	for (node = 0; node < nr_node_ids; node++)
				4885	if (nodes[node])
				4886	x += sprintf(buf + x, " N%d=%lu",
				4887	node, nodes[node]);
				4888	#endif
				4889	kfree(nodes);
				4890	return x + sprintf(buf + x, "\n");
				4891	}
				4892
				4893	#ifdef CONFIG_SLUB_DEBUG
				4894	static int any_slab_objects(struct kmem_cache *s)
				4895	{
				4896	int node;
				4897	struct kmem_cache_node *n;
				4898
				4899	for_each_kmem_cache_node(s, node, n)
				4900	if (atomic_long_read(&n->total_objects))
				4901	return 1;
				4902
				4903	return 0;
				4904	}
				4905	#endif
				4906
				4907	#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
				4908	#define to_slab(n) container_of(n, struct kmem_cache, kobj)
				4909
				4910	struct slab_attribute {
				4911	struct attribute attr;
				4912	ssize_t (show)(struct kmem_cache s, char *buf);
				4913	ssize_t (store)(struct kmem_cache s, const char *x, size_t count);
				4914	};
				4915
				4916	#define SLAB_ATTR_RO(_name) \
				4917	static struct slab_attribute _name##_attr = \
				4918	__ATTR(_name, 0400, _name##_show, NULL)
				4919
				4920	#define SLAB_ATTR(_name) \
				4921	static struct slab_attribute _name##_attr = \
				4922	__ATTR(_name, 0600, _name##_show, _name##_store)
				4923
				4924	static ssize_t slab_size_show(struct kmem_cache s, char buf)
				4925	{
				4926	return sprintf(buf, "%d\n", s->size);
				4927	}
				4928	SLAB_ATTR_RO(slab_size);
				4929
				4930	static ssize_t align_show(struct kmem_cache s, char buf)
				4931	{
				4932	return sprintf(buf, "%d\n", s->align);
				4933	}
				4934	SLAB_ATTR_RO(align);
				4935
				4936	static ssize_t object_size_show(struct kmem_cache s, char buf)
				4937	{
				4938	return sprintf(buf, "%d\n", s->object_size);
				4939	}
				4940	SLAB_ATTR_RO(object_size);
				4941
				4942	static ssize_t objs_per_slab_show(struct kmem_cache s, char buf)
				4943	{
				4944	return sprintf(buf, "%d\n", oo_objects(s->oo));
				4945	}
				4946	SLAB_ATTR_RO(objs_per_slab);
				4947
				4948	static ssize_t order_store(struct kmem_cache *s,
				4949	const char *buf, size_t length)
				4950	{
				4951	unsigned long order;
				4952	int err;
				4953
				4954	err = kstrtoul(buf, 10, &order);
				4955	if (err)
				4956	return err;
				4957
				4958	if (order > slub_max_order \|\| order < slub_min_order)
				4959	return -EINVAL;
				4960
				4961	calculate_sizes(s, order);
				4962	return length;
				4963	}
				4964
				4965	static ssize_t order_show(struct kmem_cache s, char buf)
				4966	{
				4967	return sprintf(buf, "%d\n", oo_order(s->oo));
				4968	}
				4969	SLAB_ATTR(order);
				4970
				4971	static ssize_t min_partial_show(struct kmem_cache s, char buf)
				4972	{
				4973	return sprintf(buf, "%lu\n", s->min_partial);
				4974	}
				4975
				4976	static ssize_t min_partial_store(struct kmem_cache s, const char buf,
				4977	size_t length)
				4978	{
				4979	unsigned long min;
				4980	int err;
				4981
				4982	err = kstrtoul(buf, 10, &min);
				4983	if (err)
				4984	return err;
				4985
				4986	set_min_partial(s, min);
				4987	return length;
				4988	}
				4989	SLAB_ATTR(min_partial);
				4990
				4991	static ssize_t cpu_partial_show(struct kmem_cache s, char buf)
				4992	{
				4993	return sprintf(buf, "%u\n", slub_cpu_partial(s));
				4994	}
				4995
				4996	static ssize_t cpu_partial_store(struct kmem_cache s, const char buf,
				4997	size_t length)
				4998	{
				4999	unsigned int objects;
				5000	int err;
				5001
				5002	err = kstrtouint(buf, 10, &objects);
				5003	if (err)
				5004	return err;
				5005	if (objects && !kmem_cache_has_cpu_partial(s))
				5006	return -EINVAL;
				5007
				5008	slub_set_cpu_partial(s, objects);
				5009	flush_all(s);
				5010	return length;
				5011	}
				5012	SLAB_ATTR(cpu_partial);
				5013
				5014	static ssize_t ctor_show(struct kmem_cache s, char buf)
				5015	{
				5016	if (!s->ctor)
				5017	return 0;
				5018	return sprintf(buf, "%pS\n", s->ctor);
				5019	}
				5020	SLAB_ATTR_RO(ctor);
				5021
				5022	static ssize_t aliases_show(struct kmem_cache s, char buf)
				5023	{
				5024	return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
				5025	}
				5026	SLAB_ATTR_RO(aliases);
				5027
				5028	static ssize_t partial_show(struct kmem_cache s, char buf)
				5029	{
				5030	return show_slab_objects(s, buf, SO_PARTIAL);
				5031	}
				5032	SLAB_ATTR_RO(partial);
				5033
				5034	static ssize_t cpu_slabs_show(struct kmem_cache s, char buf)
				5035	{
				5036	return show_slab_objects(s, buf, SO_CPU);
				5037	}
				5038	SLAB_ATTR_RO(cpu_slabs);
				5039
				5040	static ssize_t objects_show(struct kmem_cache s, char buf)
				5041	{
				5042	return show_slab_objects(s, buf, SO_ALL\|SO_OBJECTS);
				5043	}
				5044	SLAB_ATTR_RO(objects);
				5045
				5046	static ssize_t objects_partial_show(struct kmem_cache s, char buf)
				5047	{
				5048	return show_slab_objects(s, buf, SO_PARTIAL\|SO_OBJECTS);
				5049	}
				5050	SLAB_ATTR_RO(objects_partial);
				5051
				5052	static ssize_t slabs_cpu_partial_show(struct kmem_cache s, char buf)
				5053	{
				5054	int objects = 0;
				5055	int pages = 0;
				5056	int cpu;
				5057	int len;
				5058
				5059	for_each_online_cpu(cpu) {
				5060	struct page *page;
				5061
				5062	page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
				5063
				5064	if (page) {
				5065	pages += page->pages;
				5066	objects += page->pobjects;
				5067	}
				5068	}
				5069
				5070	len = sprintf(buf, "%d(%d)", objects, pages);
				5071
				5072	#ifdef CONFIG_SMP
				5073	for_each_online_cpu(cpu) {
				5074	struct page *page;
				5075
				5076	page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
				5077
				5078	if (page && len < PAGE_SIZE - 20)
				5079	len += sprintf(buf + len, " C%d=%d(%d)", cpu,
				5080	page->pobjects, page->pages);
				5081	}
				5082	#endif
				5083	return len + sprintf(buf + len, "\n");
				5084	}
				5085	SLAB_ATTR_RO(slabs_cpu_partial);
				5086
				5087	static ssize_t reclaim_account_show(struct kmem_cache s, char buf)
				5088	{
				5089	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
				5090	}
				5091
				5092	static ssize_t reclaim_account_store(struct kmem_cache *s,
				5093	const char *buf, size_t length)
				5094	{
				5095	s->flags &= ~SLAB_RECLAIM_ACCOUNT;
				5096	if (buf[0] == '1')
				5097	s->flags \|= SLAB_RECLAIM_ACCOUNT;
				5098	return length;
				5099	}
				5100	SLAB_ATTR(reclaim_account);
				5101
				5102	static ssize_t hwcache_align_show(struct kmem_cache s, char buf)
				5103	{
				5104	return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
				5105	}
				5106	SLAB_ATTR_RO(hwcache_align);
				5107
				5108	#ifdef CONFIG_ZONE_DMA
				5109	static ssize_t cache_dma_show(struct kmem_cache s, char buf)
				5110	{
				5111	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
				5112	}
				5113	SLAB_ATTR_RO(cache_dma);
				5114	#endif
				5115
				5116	static ssize_t destroy_by_rcu_show(struct kmem_cache s, char buf)
				5117	{
				5118	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
				5119	}
				5120	SLAB_ATTR_RO(destroy_by_rcu);
				5121
				5122	static ssize_t reserved_show(struct kmem_cache s, char buf)
				5123	{
				5124	return sprintf(buf, "%d\n", s->reserved);
				5125	}
				5126	SLAB_ATTR_RO(reserved);
				5127
				5128	#ifdef CONFIG_SLUB_DEBUG
				5129	static ssize_t slabs_show(struct kmem_cache s, char buf)
				5130	{
				5131	return show_slab_objects(s, buf, SO_ALL);
				5132	}
				5133	SLAB_ATTR_RO(slabs);
				5134
				5135	static ssize_t total_objects_show(struct kmem_cache s, char buf)
				5136	{
				5137	return show_slab_objects(s, buf, SO_ALL\|SO_TOTAL);
				5138	}
				5139	SLAB_ATTR_RO(total_objects);
				5140
				5141	static ssize_t sanity_checks_show(struct kmem_cache s, char buf)
				5142	{
				5143	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
				5144	}
				5145
				5146	static ssize_t sanity_checks_store(struct kmem_cache *s,
				5147	const char *buf, size_t length)
				5148	{
				5149	s->flags &= ~SLAB_CONSISTENCY_CHECKS;
				5150	if (buf[0] == '1') {
				5151	s->flags &= ~__CMPXCHG_DOUBLE;
				5152	s->flags \|= SLAB_CONSISTENCY_CHECKS;
				5153	}
				5154	return length;
				5155	}
				5156	SLAB_ATTR(sanity_checks);
				5157
				5158	static ssize_t trace_show(struct kmem_cache s, char buf)
				5159	{
				5160	return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
				5161	}
				5162
				5163	static ssize_t trace_store(struct kmem_cache s, const char buf,
				5164	size_t length)
				5165	{
				5166	/*
				5167	* Tracing a merged cache is going to give confusing results
				5168	* as well as cause other issues like converting a mergeable
				5169	* cache into an umergeable one.
				5170	*/
				5171	if (s->refcount > 1)
				5172	return -EINVAL;
				5173
				5174	s->flags &= ~SLAB_TRACE;
				5175	if (buf[0] == '1') {
				5176	s->flags &= ~__CMPXCHG_DOUBLE;
				5177	s->flags \|= SLAB_TRACE;
				5178	}
				5179	return length;
				5180	}
				5181	SLAB_ATTR(trace);
				5182
				5183	static ssize_t red_zone_show(struct kmem_cache s, char buf)
				5184	{
				5185	return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
				5186	}
				5187
				5188	static ssize_t red_zone_store(struct kmem_cache *s,
				5189	const char *buf, size_t length)
				5190	{
				5191	if (any_slab_objects(s))
				5192	return -EBUSY;
				5193
				5194	s->flags &= ~SLAB_RED_ZONE;
				5195	if (buf[0] == '1') {
				5196	s->flags \|= SLAB_RED_ZONE;
				5197	}
				5198	calculate_sizes(s, -1);
				5199	return length;
				5200	}
				5201	SLAB_ATTR(red_zone);
				5202
				5203	static ssize_t poison_show(struct kmem_cache s, char buf)
				5204	{
				5205	return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
				5206	}
				5207
				5208	static ssize_t poison_store(struct kmem_cache *s,
				5209	const char *buf, size_t length)
				5210	{
				5211	if (any_slab_objects(s))
				5212	return -EBUSY;
				5213
				5214	s->flags &= ~SLAB_POISON;
				5215	if (buf[0] == '1') {
				5216	s->flags \|= SLAB_POISON;
				5217	}
				5218	calculate_sizes(s, -1);
				5219	return length;
				5220	}
				5221	SLAB_ATTR(poison);
				5222
				5223	static ssize_t store_user_show(struct kmem_cache s, char buf)
				5224	{
				5225	return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
				5226	}
				5227
				5228	static ssize_t store_user_store(struct kmem_cache *s,
				5229	const char *buf, size_t length)
				5230	{
				5231	if (any_slab_objects(s))
				5232	return -EBUSY;
				5233
				5234	s->flags &= ~SLAB_STORE_USER;
				5235	if (buf[0] == '1') {
				5236	s->flags &= ~__CMPXCHG_DOUBLE;
				5237	s->flags \|= SLAB_STORE_USER;
				5238	}
				5239	calculate_sizes(s, -1);
				5240	return length;
				5241	}
				5242	SLAB_ATTR(store_user);
				5243
				5244	static ssize_t validate_show(struct kmem_cache s, char buf)
				5245	{
				5246	return 0;
				5247	}
				5248
				5249	static ssize_t validate_store(struct kmem_cache *s,
				5250	const char *buf, size_t length)
				5251	{
				5252	int ret = -EINVAL;
				5253
				5254	if (buf[0] == '1') {
				5255	ret = validate_slab_cache(s);
				5256	if (ret >= 0)
				5257	ret = length;
				5258	}
				5259	return ret;
				5260	}
				5261	SLAB_ATTR(validate);
				5262
				5263	static ssize_t alloc_calls_show(struct kmem_cache s, char buf)
				5264	{
				5265	if (!(s->flags & SLAB_STORE_USER))
				5266	return -ENOSYS;
				5267	return list_locations(s, buf, TRACK_ALLOC);
				5268	}
				5269	SLAB_ATTR_RO(alloc_calls);
				5270
				5271	static ssize_t free_calls_show(struct kmem_cache s, char buf)
				5272	{
				5273	if (!(s->flags & SLAB_STORE_USER))
				5274	return -ENOSYS;
				5275	return list_locations(s, buf, TRACK_FREE);
				5276	}
				5277	SLAB_ATTR_RO(free_calls);
				5278	#endif /* CONFIG_SLUB_DEBUG */
				5279
				5280	#ifdef CONFIG_FAILSLAB
				5281	static ssize_t failslab_show(struct kmem_cache s, char buf)
				5282	{
				5283	return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
				5284	}
				5285
				5286	static ssize_t failslab_store(struct kmem_cache s, const char buf,
				5287	size_t length)
				5288	{
				5289	if (s->refcount > 1)
				5290	return -EINVAL;
				5291
				5292	s->flags &= ~SLAB_FAILSLAB;
				5293	if (buf[0] == '1')
				5294	s->flags \|= SLAB_FAILSLAB;
				5295	return length;
				5296	}
				5297	SLAB_ATTR(failslab);
				5298	#endif
				5299
				5300	static ssize_t shrink_show(struct kmem_cache s, char buf)
				5301	{
				5302	return 0;
				5303	}
				5304
				5305	static ssize_t shrink_store(struct kmem_cache *s,
				5306	const char *buf, size_t length)
				5307	{
				5308	if (buf[0] == '1')
				5309	kmem_cache_shrink(s);
				5310	else
				5311	return -EINVAL;
				5312	return length;
				5313	}
				5314	SLAB_ATTR(shrink);
				5315
				5316	#ifdef CONFIG_NUMA
				5317	static ssize_t remote_node_defrag_ratio_show(struct kmem_cache s, char buf)
				5318	{
				5319	return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
				5320	}
				5321
				5322	static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
				5323	const char *buf, size_t length)
				5324	{
				5325	unsigned long ratio;
				5326	int err;
				5327
				5328	err = kstrtoul(buf, 10, &ratio);
				5329	if (err)
				5330	return err;
				5331
				5332	if (ratio <= 100)
				5333	s->remote_node_defrag_ratio = ratio * 10;
				5334
				5335	return length;
				5336	}
				5337	SLAB_ATTR(remote_node_defrag_ratio);
				5338	#endif
				5339
				5340	#ifdef CONFIG_SLUB_STATS
				5341	static int show_stat(struct kmem_cache s, char buf, enum stat_item si)
				5342	{
				5343	unsigned long sum = 0;
				5344	int cpu;
				5345	int len;
				5346	int data = kmalloc(nr_cpu_ids sizeof(int), GFP_KERNEL);
				5347
				5348	if (!data)
				5349	return -ENOMEM;
				5350
				5351	for_each_online_cpu(cpu) {
				5352	unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
				5353
				5354	data[cpu] = x;
				5355	sum += x;
				5356	}
				5357
				5358	len = sprintf(buf, "%lu", sum);
				5359
				5360	#ifdef CONFIG_SMP
				5361	for_each_online_cpu(cpu) {
				5362	if (data[cpu] && len < PAGE_SIZE - 20)
				5363	len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
				5364	}
				5365	#endif
				5366	kfree(data);
				5367	return len + sprintf(buf + len, "\n");
				5368	}
				5369
				5370	static void clear_stat(struct kmem_cache *s, enum stat_item si)
				5371	{
				5372	int cpu;
				5373
				5374	for_each_online_cpu(cpu)
				5375	per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
				5376	}
				5377
				/*
				 * STAT_ATTR(si, text): define the sysfs attribute text for statistics
				 * counter si. Reading prints the counter through show_stat(). Writing
				 * a string that starts with '0' clears it through clear_stat(); any
				 * other input yields -EINVAL. The invocation supplies the terminating
				 * semicolon.
				 */
				#define STAT_ATTR(si, text)					\
				static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
				{								\
					return show_stat(s, buf, si);				\
				}								\
				static ssize_t text##_store(struct kmem_cache *s,		\
							const char *buf, size_t length)		\
				{								\
					if (buf[0] != '0')					\
						return -EINVAL;					\
					clear_stat(s, si);					\
					return length;						\
				}								\
				SLAB_ATTR(text)
				5392
				5393	STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
				5394	STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
				5395	STAT_ATTR(FREE_FASTPATH, free_fastpath);
				5396	STAT_ATTR(FREE_SLOWPATH, free_slowpath);
				5397	STAT_ATTR(FREE_FROZEN, free_frozen);
				5398	STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
				5399	STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
				5400	STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
				5401	STAT_ATTR(ALLOC_SLAB, alloc_slab);
				5402	STAT_ATTR(ALLOC_REFILL, alloc_refill);
				5403	STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
				5404	STAT_ATTR(FREE_SLAB, free_slab);
				5405	STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
				5406	STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
				5407	STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
				5408	STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
				5409	STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
				5410	STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
				5411	STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
				5412	STAT_ATTR(ORDER_FALLBACK, order_fallback);
				5413	STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
				5414	STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
				5415	STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
				5416	STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
				5417	STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
				5418	STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
				5419	#endif
				5420
				5421	static struct attribute *slab_attrs[] = {
				5422	&slab_size_attr.attr,
				5423	&object_size_attr.attr,
				5424	&objs_per_slab_attr.attr,
				5425	&order_attr.attr,
				5426	&min_partial_attr.attr,
				5427	&cpu_partial_attr.attr,
				5428	&objects_attr.attr,
				5429	&objects_partial_attr.attr,
				5430	&partial_attr.attr,
				5431	&cpu_slabs_attr.attr,
				5432	&ctor_attr.attr,
				5433	&aliases_attr.attr,
				5434	&align_attr.attr,
				5435	&hwcache_align_attr.attr,
				5436	&reclaim_account_attr.attr,
				5437	&destroy_by_rcu_attr.attr,
				5438	&shrink_attr.attr,
				5439	&reserved_attr.attr,
				5440	&slabs_cpu_partial_attr.attr,
				5441	#ifdef CONFIG_SLUB_DEBUG
				5442	&total_objects_attr.attr,
				5443	&slabs_attr.attr,
				5444	&sanity_checks_attr.attr,
				5445	&trace_attr.attr,
				5446	&red_zone_attr.attr,
				5447	&poison_attr.attr,
				5448	&store_user_attr.attr,
				5449	&validate_attr.attr,
				5450	&alloc_calls_attr.attr,
				5451	&free_calls_attr.attr,
				5452	#endif
				5453	#ifdef CONFIG_ZONE_DMA
				5454	&cache_dma_attr.attr,
				5455	#endif
				5456	#ifdef CONFIG_NUMA
				5457	&remote_node_defrag_ratio_attr.attr,
				5458	#endif
				5459	#ifdef CONFIG_SLUB_STATS
				5460	&alloc_fastpath_attr.attr,
				5461	&alloc_slowpath_attr.attr,
				5462	&free_fastpath_attr.attr,
				5463	&free_slowpath_attr.attr,
				5464	&free_frozen_attr.attr,
				5465	&free_add_partial_attr.attr,
				5466	&free_remove_partial_attr.attr,
				5467	&alloc_from_partial_attr.attr,
				5468	&alloc_slab_attr.attr,
				5469	&alloc_refill_attr.attr,
				5470	&alloc_node_mismatch_attr.attr,
				5471	&free_slab_attr.attr,
				5472	&cpuslab_flush_attr.attr,
				5473	&deactivate_full_attr.attr,
				5474	&deactivate_empty_attr.attr,
				5475	&deactivate_to_head_attr.attr,
				5476	&deactivate_to_tail_attr.attr,
				5477	&deactivate_remote_frees_attr.attr,
				5478	&deactivate_bypass_attr.attr,
				5479	&order_fallback_attr.attr,
				5480	&cmpxchg_double_fail_attr.attr,
				5481	&cmpxchg_double_cpu_fail_attr.attr,
				5482	&cpu_partial_alloc_attr.attr,
				5483	&cpu_partial_free_attr.attr,
				5484	&cpu_partial_node_attr.attr,
				5485	&cpu_partial_drain_attr.attr,
				5486	#endif
				5487	#ifdef CONFIG_FAILSLAB
				5488	&failslab_attr.attr,
				5489	#endif
				5490
				5491	NULL
				5492	};
				5493
				5494	static const struct attribute_group slab_attr_group = {
				5495	.attrs = slab_attrs,
				5496	};
				5497
				5498	static ssize_t slab_attr_show(struct kobject *kobj,
				5499	struct attribute *attr,
				5500	char *buf)
				5501	{
				5502	struct slab_attribute *attribute;
				5503	struct kmem_cache *s;
				5504	int err;
				5505
				5506	attribute = to_slab_attr(attr);
				5507	s = to_slab(kobj);
				5508
				5509	if (!attribute->show)
				5510	return -EIO;
				5511
				5512	err = attribute->show(s, buf);
				5513
				5514	return err;
				5515	}
				5516
				5517	static ssize_t slab_attr_store(struct kobject *kobj,
				5518	struct attribute *attr,
				5519	const char *buf, size_t len)
				5520	{
				5521	struct slab_attribute *attribute;
				5522	struct kmem_cache *s;
				5523	int err;
				5524
				5525	attribute = to_slab_attr(attr);
				5526	s = to_slab(kobj);
				5527
				5528	if (!attribute->store)
				5529	return -EIO;
				5530
				5531	err = attribute->store(s, buf, len);
				5532	#ifdef CONFIG_MEMCG
				5533	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
				5534	struct kmem_cache *c;
				5535
				5536	mutex_lock(&slab_mutex);
				5537	if (s->max_attr_size < len)
				5538	s->max_attr_size = len;
				5539
				5540	/*
				5541	* This is a best effort propagation, so this function's return
				5542	* value will be determined by the parent cache only. This is
				5543	* basically because not all attributes will have a well
				5544	* defined semantics for rollbacks - most of the actions will
				5545	* have permanent effects.
				5546	*
				5547	* Returning the error value of any of the children that fail
				5548	* is not 100 % defined, in the sense that users seeing the
				5549	* error code won't be able to know anything about the state of
				5550	* the cache.
				5551	*
				5552	* Only returning the error code for the parent cache at least
				5553	* has well defined semantics. The cache being written to
				5554	* directly either failed or succeeded, in which case we loop
				5555	* through the descendants with best-effort propagation.
				5556	*/
				5557	for_each_memcg_cache(c, s)
				5558	attribute->store(c, buf, len);
				5559	mutex_unlock(&slab_mutex);
				5560	}
				5561	#endif
				5562	return err;
				5563	}
				5564
				/*
				 * Copy the attribute values of a root cache to memcg cache @s.
				 *
				 * For each attribute that has both handlers, the root cache's value is
				 * formatted with show and replayed into @s with store. Nothing is done
				 * for root caches, for roots whose max_attr_size is zero, or without
				 * CONFIG_MEMCG.
				 */
				static void memcg_propagate_slab_attrs(struct kmem_cache *s)
				{
				#ifdef CONFIG_MEMCG
					int i;
					char *buffer = NULL;
					struct kmem_cache *root_cache;

					if (is_root_cache(s))
						return;

					root_cache = s->memcg_params.root_cache;

					/*
					 * This means this cache had no attribute written. Therefore, no point
					 * in copying default values around
					 */
					if (!root_cache->max_attr_size)
						return;

					for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
						char mbuf[64];
						char *buf;
						struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
						ssize_t len;

						/*
						 * The !attr test skips the NULL terminator of slab_attrs;
						 * this relies on attr being the first member of
						 * struct slab_attribute.
						 */
						if (!attr || !attr->store || !attr->show)
							continue;

						/*
						 * It is really bad that we have to allocate here, so we will
						 * do it only as a fallback. If we actually allocate, though,
						 * we can just use the allocated buffer until the end.
						 *
						 * Most of the slub attributes will tend to be very small in
						 * size, but sysfs allows buffers up to a page, so they can
						 * theoretically happen.
						 */
						if (buffer)
							buf = buffer;
						else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
							 !IS_ENABLED(CONFIG_SLUB_STATS))
							buf = mbuf;
						else {
							buffer = (char *) get_zeroed_page(GFP_KERNEL);
							if (WARN_ON(!buffer))
								continue;
							buf = buffer;
						}

						len = attr->show(root_cache, buf);
						if (len > 0)
							attr->store(s, buf, len);
					}

					if (buffer)
						free_page((unsigned long)buffer);
				#endif
				}
				5623
				/* kobject release callback: hand the owning cache to slab_kmem_cache_release(). */
				static void kmem_cache_release(struct kobject *k)
				{
					struct kmem_cache *s = to_slab(k);

					slab_kmem_cache_release(s);
				}
				5628
				5629	static const struct sysfs_ops slab_sysfs_ops = {
				5630	.show = slab_attr_show,
				5631	.store = slab_attr_store,
				5632	};
				5633
				5634	static struct kobj_type slab_ktype = {
				5635	.sysfs_ops = &slab_sysfs_ops,
				5636	.release = kmem_cache_release,
				5637	};
				5638
				5639	static int uevent_filter(struct kset kset, struct kobject kobj)
				5640	{
				5641	struct kobj_type *ktype = get_ktype(kobj);
				5642
				5643	if (ktype == &slab_ktype)
				5644	return 1;
				5645	return 0;
				5646	}
				5647
				5648	static const struct kset_uevent_ops slab_uevent_ops = {
				5649	.filter = uevent_filter,
				5650	};
				5651
				/* The "slab" kset, created by slab_sysfs_init(). */
				static struct kset *slab_kset;

				/*
				 * Return the kset that the kobject of cache @s belongs to: the root
				 * cache's memcg_kset for a non-root cache (CONFIG_MEMCG only),
				 * slab_kset otherwise.
				 */
				static inline struct kset *cache_kset(struct kmem_cache *s)
				{
				#ifdef CONFIG_MEMCG
					if (!is_root_cache(s))
						return s->memcg_params.root_cache->memcg_kset;
				#endif
					return slab_kset;
				}
				5662
				5663	#define ID_STR_LENGTH 64
				5664
				5665	/* Create a unique string id for a slab cache:
				5666	*
				5667	* Format :[flags-]size
				5668	*/
				5669	static char create_unique_id(struct kmem_cache s)
				5670	{
				5671	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
				5672	char *p = name;
				5673
				5674	BUG_ON(!name);
				5675
				5676	*p++ = ':';
				5677	/*
				5678	* First flags affecting slabcache operations. We will only
				5679	* get here for aliasable slabs so we do not need to support
				5680	* too many flags. The flags here must cover all flags that
				5681	* are matched during merging to guarantee that the id is
				5682	* unique.
				5683	*/
				5684	if (s->flags & SLAB_CACHE_DMA)
				5685	*p++ = 'd';
				5686	if (s->flags & SLAB_RECLAIM_ACCOUNT)
				5687	*p++ = 'a';
				5688	if (s->flags & SLAB_CONSISTENCY_CHECKS)
				5689	*p++ = 'F';
				5690	if (s->flags & SLAB_ACCOUNT)
				5691	*p++ = 'A';
				5692	if (p != name + 1)
				5693	*p++ = '-';
				5694	p += sprintf(p, "%07d", s->size);
				5695
				5696	BUG_ON(p > name + ID_STR_LENGTH - 1);
				5697	return name;
				5698	}
				5699
				5700	static void sysfs_slab_remove_workfn(struct work_struct *work)
				5701	{
				5702	struct kmem_cache *s =
				5703	container_of(work, struct kmem_cache, kobj_remove_work);
				5704
				5705	if (!s->kobj.state_in_sysfs)
				5706	/*
				5707	* For a memcg cache, this may be called during
				5708	* deactivation and again on shutdown. Remove only once.
				5709	* A cache is never shut down before deactivation is
				5710	* complete, so no need to worry about synchronization.
				5711	*/
				5712	goto out;
				5713
				5714	#ifdef CONFIG_MEMCG
				5715	kset_unregister(s->memcg_kset);
				5716	#endif
				5717	kobject_uevent(&s->kobj, KOBJ_REMOVE);
				5718	out:
				5719	kobject_put(&s->kobj);
				5720	}
				5721
				5722	static int sysfs_slab_add(struct kmem_cache *s)
				5723	{
				5724	int err;
				5725	const char *name;
				5726	struct kset *kset = cache_kset(s);
				5727	int unmergeable = slab_unmergeable(s);
				5728
				5729	INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
				5730
				5731	if (!kset) {
				5732	kobject_init(&s->kobj, &slab_ktype);
				5733	return 0;
				5734	}
				5735
				5736	if (!unmergeable && disable_higher_order_debug &&
				5737	(slub_debug & DEBUG_METADATA_FLAGS))
				5738	unmergeable = 1;
				5739
				5740	if (unmergeable) {
				5741	/*
				5742	* Slabcache can never be merged so we can use the name proper.
				5743	* This is typically the case for debug situations. In that
				5744	* case we can catch duplicate names easily.
				5745	*/
				5746	sysfs_remove_link(&slab_kset->kobj, s->name);
				5747	name = s->name;
				5748	} else {
				5749	/*
				5750	* Create a unique name for the slab as a target
				5751	* for the symlinks.
				5752	*/
				5753	name = create_unique_id(s);
				5754	}
				5755
				5756	s->kobj.kset = kset;
				5757	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
				5758	if (err) {
				5759	kobject_put(&s->kobj);
				5760	goto out;
				5761	}
				5762
				5763	err = sysfs_create_group(&s->kobj, &slab_attr_group);
				5764	if (err)
				5765	goto out_del_kobj;
				5766
				5767	#ifdef CONFIG_MEMCG
				5768	if (is_root_cache(s) && memcg_sysfs_enabled) {
				5769	s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
				5770	if (!s->memcg_kset) {
				5771	err = -ENOMEM;
				5772	goto out_del_kobj;
				5773	}
				5774	}
				5775	#endif
				5776
				5777	kobject_uevent(&s->kobj, KOBJ_ADD);
				5778	if (!unmergeable) {
				5779	/* Setup first alias */
				5780	sysfs_slab_alias(s, s->name);
				5781	}
				5782	out:
				5783	if (!unmergeable)
				5784	kfree(name);
				5785	return err;
				5786	out_del_kobj:
				5787	kobject_del(&s->kobj);
				5788	goto out;
				5789	}
				5790
				5791	static void sysfs_slab_remove(struct kmem_cache *s)
				5792	{
				5793	if (slab_state < FULL)
				5794	/*
				5795	* Sysfs has not been setup yet so no need to remove the
				5796	* cache from sysfs.
				5797	*/
				5798	return;
				5799
				5800	kobject_get(&s->kobj);
				5801	schedule_work(&s->kobj_remove_work);
				5802	}
				5803
				5804	void sysfs_slab_unlink(struct kmem_cache *s)
				5805	{
				5806	if (slab_state >= FULL)
				5807	kobject_del(&s->kobj);
				5808	}
				5809
				5810	void sysfs_slab_release(struct kmem_cache *s)
				5811	{
				5812	if (slab_state >= FULL)
				5813	kobject_put(&s->kobj);
				5814	}
				5815
				/*
				 * Aliases requested during bootup have to be buffered until sysfs
				 * becomes available, otherwise that information would be lost.
				 *
				 * Each entry records one (cache, alias name) pair; entries form a
				 * singly linked list headed by alias_list.
				 */
				struct saved_alias {
					struct kmem_cache *s;		/* cache the alias refers to */
					const char *name;		/* alias name */
					struct saved_alias *next;	/* next buffered alias or NULL */
				};

				static struct saved_alias *alias_list;
				5827
				5828	static int sysfs_slab_alias(struct kmem_cache s, const char name)
				5829	{
				5830	struct saved_alias *al;
				5831
				5832	if (slab_state == FULL) {
				5833	/*
				5834	* If we have a leftover link then remove it.
				5835	*/
				5836	sysfs_remove_link(&slab_kset->kobj, name);
				5837	return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
				5838	}
				5839
				5840	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
				5841	if (!al)
				5842	return -ENOMEM;
				5843
				5844	al->s = s;
				5845	al->name = name;
				5846	al->next = alias_list;
				5847	alias_list = al;
				5848	return 0;
				5849	}
				5850
				5851	static int __init slab_sysfs_init(void)
				5852	{
				5853	struct kmem_cache *s;
				5854	int err;
				5855
				5856	mutex_lock(&slab_mutex);
				5857
				5858	slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
				5859	if (!slab_kset) {
				5860	mutex_unlock(&slab_mutex);
				5861	pr_err("Cannot register slab subsystem.\n");
				5862	return -ENOSYS;
				5863	}
				5864
				5865	slab_state = FULL;
				5866
				5867	list_for_each_entry(s, &slab_caches, list) {
				5868	err = sysfs_slab_add(s);
				5869	if (err)
				5870	pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
				5871	s->name);
				5872	}
				5873
				5874	while (alias_list) {
				5875	struct saved_alias *al = alias_list;
				5876
				5877	alias_list = alias_list->next;
				5878	err = sysfs_slab_alias(al->s, al->name);
				5879	if (err)
				5880	pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
				5881	al->name);
				5882	kfree(al);
				5883	}
				5884
				5885	mutex_unlock(&slab_mutex);
				5886	resiliency_test();
				5887	return 0;
				5888	}
				5889
				5890	__initcall(slab_sysfs_init);
				5891	#endif /* CONFIG_SYSFS */
				5892
				5893	/*
				5894	* The /proc/slabinfo ABI
				5895	*/
				5896	#ifdef CONFIG_SLABINFO
				5897	void get_slabinfo(struct kmem_cache s, struct slabinfo sinfo)
				5898	{
				5899	unsigned long nr_slabs = 0;
				5900	unsigned long nr_objs = 0;
				5901	unsigned long nr_free = 0;
				5902	int node;
				5903	struct kmem_cache_node *n;
				5904
				5905	for_each_kmem_cache_node(s, node, n) {
				5906	nr_slabs += node_nr_slabs(n);
				5907	nr_objs += node_nr_objs(n);
				5908	nr_free += count_partial(n, count_free);
				5909	}
				5910
				5911	sinfo->active_objs = nr_objs - nr_free;
				5912	sinfo->num_objs = nr_objs;
				5913	sinfo->active_slabs = nr_slabs;
				5914	sinfo->num_slabs = nr_slabs;
				5915	sinfo->objects_per_slab = oo_objects(s->oo);
				5916	sinfo->cache_order = oo_order(s->oo);
				5917	}
				5918
				/* SLUB emits no extra per-cache statistics here: intentionally empty. */
				void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
				{
				}
				5922
				5923	ssize_t slabinfo_write(struct file file, const char __user buffer,
				5924	size_t count, loff_t *ppos)
				5925	{
				5926	return -EIO;
				5927	}
				5928	#endif /* CONFIG_SLABINFO */