Blame - ap/os/linux/linux-3.4.x/mm/page_cgroup.c - T106_DC

blob: 90120387778952150ce6481e5cb7d318d6fccd67 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	#include <linux/mm.h>
				2	#include <linux/mmzone.h>
				3	#include <linux/bootmem.h>
				4	#include <linux/bit_spinlock.h>
				5	#include <linux/page_cgroup.h>
				6	#include <linux/hash.h>
				7	#include <linux/slab.h>
				8	#include <linux/memory.h>
				9	#include <linux/vmalloc.h>
				10	#include <linux/cgroup.h>
				11	#include <linux/swapops.h>
				12	#include <linux/kmemleak.h>
				13
				/*
				 * Running total, in bytes, of the page_cgroup tables allocated by this
				 * file; it is only read back by the "allocated %ld bytes" boot messages.
				 */
				14	static unsigned long total_usage;
				15
				/*
				 * page_cgroup_lock_init - initialise the pcg_lock of each entry in a table.
				 * @pc: first page_cgroup entry
				 * @nr_pages: number of consecutive entries to initialise, starting at @pc
				 *
				 * The loop is only compiled in when CONFIG_PREEMPT_RT_BASE is defined;
				 * without it the function body is empty.
				 */
				static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages)
				{
				#ifdef CONFIG_PREEMPT_RT_BASE
					while (nr_pages) {
						spin_lock_init(&pc->pcg_lock);
						pc++;
						nr_pages--;
					}
				#endif
				}
				23
				24	#if !defined(CONFIG_SPARSEMEM)
				25
				26
				27	void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
				28	{
				29	pgdat->node_page_cgroup = NULL;
				30	}
				31
				32	struct page_cgroup lookup_page_cgroup(struct page page)
				33	{
				34	unsigned long pfn = page_to_pfn(page);
				35	unsigned long offset;
				36	struct page_cgroup *base;
				37
				38	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
				39	#ifdef CONFIG_DEBUG_VM
				40	/*
				41	* The sanity checks the page allocator does upon freeing a
				42	* page can reach here before the page_cgroup arrays are
				43	* allocated when feeding a range of pages to the allocator
				44	* for the first time during bootup or memory hotplug.
				45	*/
				46	if (unlikely(!base))
				47	return NULL;
				48	#endif
				49	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
				50	return base + offset;
				51	}
				52
				53	static int __init alloc_node_page_cgroup(int nid)
				54	{
				55	struct page_cgroup *base;
				56	unsigned long table_size;
				57	unsigned long nr_pages;
				58
				59	nr_pages = NODE_DATA(nid)->node_spanned_pages;
				60	if (!nr_pages)
				61	return 0;
				62
				63	table_size = sizeof(struct page_cgroup) * nr_pages;
				64
				65	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
				66	table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
				67	if (!base)
				68	return -ENOMEM;
				69	NODE_DATA(nid)->node_page_cgroup = base;
				70	total_usage += table_size;
				71	page_cgroup_lock_init(base, nr_pages);
				72	return 0;
				73	}
				74
				75	void __init page_cgroup_init_flatmem(void)
				76	{
				77
				78	int nid, fail;
				79
				80	if (mem_cgroup_disabled())
				81	return;
				82
				83	for_each_online_node(nid) {
				84	fail = alloc_node_page_cgroup(nid);
				85	if (fail)
				86	goto fail;
				87	}
				88	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
				89	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
				90	" don't want memory cgroups\n");
				91	return;
				92	fail:
				93	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
				94	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
				95	panic("Out of memory");
				96	}
				97
				98	#else /* CONFIG_FLAT_NODE_MEM_MAP */
				99
				100	struct page_cgroup lookup_page_cgroup(struct page page)
				101	{
				102	unsigned long pfn = page_to_pfn(page);
				103	struct mem_section *section = __pfn_to_section(pfn);
				104	#ifdef CONFIG_DEBUG_VM
				105	/*
				106	* The sanity checks the page allocator does upon freeing a
				107	* page can reach here before the page_cgroup arrays are
				108	* allocated when feeding a range of pages to the allocator
				109	* for the first time during bootup or memory hotplug.
				110	*/
				111	if (!section->page_cgroup)
				112	return NULL;
				113	#endif
				114	return section->page_cgroup + pfn;
				115	}
				116
				117	static void *__meminit alloc_page_cgroup(size_t size, int nid)
				118	{
				119	gfp_t flags = GFP_KERNEL \| __GFP_ZERO \| __GFP_NOWARN;
				120	void *addr = NULL;
				121
				122	addr = alloc_pages_exact_nid(nid, size, flags);
				123	if (addr) {
				124	kmemleak_alloc(addr, size, 1, flags);
				125	return addr;
				126	}
				127
				128	if (node_state(nid, N_HIGH_MEMORY))
				129	addr = vzalloc_node(size, nid);
				130	else
				131	addr = vzalloc(size);
				132
				133	return addr;
				134	}
				135
				136	static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
				137	{
				138	struct mem_section *section;
				139	struct page_cgroup *base;
				140	unsigned long table_size;
				141
				142	section = __pfn_to_section(pfn);
				143
				144	if (section->page_cgroup)
				145	return 0;
				146
				147	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
				148	base = alloc_page_cgroup(table_size, nid);
				149
				150	/*
				151	* The value stored in section->page_cgroup is (base - pfn)
				152	* and it does not point to the memory block allocated above,
				153	* causing kmemleak false positives.
				154	*/
				155	kmemleak_not_leak(base);
				156
				157	if (!base) {
				158	printk(KERN_ERR "page cgroup allocation failure\n");
				159	return -ENOMEM;
				160	}
				161
				162	page_cgroup_lock_init(base, PAGES_PER_SECTION);
				163
				164	/*
				165	* The passed "pfn" may not be aligned to SECTION. For the calculation
				166	* we need to apply a mask.
				167	*/
				168	pfn &= PAGE_SECTION_MASK;
				169	section->page_cgroup = base - pfn;
				170	total_usage += table_size;
				171	return 0;
				172	}
				173	#ifdef CONFIG_MEMORY_HOTPLUG
				174	static void free_page_cgroup(void *addr)
				175	{
				176	if (is_vmalloc_addr(addr)) {
				177	vfree(addr);
				178	} else {
				179	struct page *page = virt_to_page(addr);
				180	size_t table_size =
				181	sizeof(struct page_cgroup) * PAGES_PER_SECTION;
				182
				183	BUG_ON(PageReserved(page));
				184	kmemleak_free(addr);
				185	free_pages_exact(addr, table_size);
				186	}
				187	}
				188
				189	void __free_page_cgroup(unsigned long pfn)
				190	{
				191	struct mem_section *ms;
				192	struct page_cgroup *base;
				193
				194	ms = __pfn_to_section(pfn);
				195	if (!ms \|\| !ms->page_cgroup)
				196	return;
				197	base = ms->page_cgroup + pfn;
				198	free_page_cgroup(base);
				199	ms->page_cgroup = NULL;
				200	}
				201
				202	int __meminit online_page_cgroup(unsigned long start_pfn,
				203	unsigned long nr_pages,
				204	int nid)
				205	{
				206	unsigned long start, end, pfn;
				207	int fail = 0;
				208
				209	start = SECTION_ALIGN_DOWN(start_pfn);
				210	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
				211
				212	if (nid == -1) {
				213	/*
				214	* In this case, "nid" already exists and contains valid memory.
				215	* "start_pfn" passed to us is a pfn which is an arg for
				216	* online__pages(), and start_pfn should exist.
				217	*/
				218	nid = pfn_to_nid(start_pfn);
				219	VM_BUG_ON(!node_state(nid, N_ONLINE));
				220	}
				221
				222	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
				223	if (!pfn_present(pfn))
				224	continue;
				225	fail = init_section_page_cgroup(pfn, nid);
				226	}
				227	if (!fail)
				228	return 0;
				229
				230	/* rollback */
				231	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
				232	__free_page_cgroup(pfn);
				233
				234	return -ENOMEM;
				235	}
				236
				237	int __meminit offline_page_cgroup(unsigned long start_pfn,
				238	unsigned long nr_pages, int nid)
				239	{
				240	unsigned long start, end, pfn;
				241
				242	start = SECTION_ALIGN_DOWN(start_pfn);
				243	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
				244
				245	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
				246	__free_page_cgroup(pfn);
				247	return 0;
				248
				249	}
				250
				251	static int __meminit page_cgroup_callback(struct notifier_block *self,
				252	unsigned long action, void *arg)
				253	{
				254	struct memory_notify *mn = arg;
				255	int ret = 0;
				256	switch (action) {
				257	case MEM_GOING_ONLINE:
				258	ret = online_page_cgroup(mn->start_pfn,
				259	mn->nr_pages, mn->status_change_nid);
				260	break;
				261	case MEM_OFFLINE:
				262	offline_page_cgroup(mn->start_pfn,
				263	mn->nr_pages, mn->status_change_nid);
				264	break;
				265	case MEM_CANCEL_ONLINE:
				266	case MEM_GOING_OFFLINE:
				267	break;
				268	case MEM_ONLINE:
				269	case MEM_CANCEL_OFFLINE:
				270	break;
				271	}
				272
				273	return notifier_from_errno(ret);
				274	}
				275
				276	#endif
				277
				278	void __init page_cgroup_init(void)
				279	{
				280	unsigned long pfn;
				281	int nid;
				282
				283	if (mem_cgroup_disabled())
				284	return;
				285
				286	for_each_node_state(nid, N_HIGH_MEMORY) {
				287	unsigned long start_pfn, end_pfn;
				288
				289	start_pfn = node_start_pfn(nid);
				290	end_pfn = node_end_pfn(nid);
				291	/*
				292	* start_pfn and end_pfn may not be aligned to SECTION and the
				293	* page->flags of out of node pages are not initialized. So we
				294	* scan [start_pfn, the biggest section's pfn < end_pfn) here.
				295	*/
				296	for (pfn = start_pfn;
				297	pfn < end_pfn;
				298	pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
				299
				300	if (!pfn_valid(pfn))
				301	continue;
				302	/*
				303	* Nodes's pfns can be overlapping.
				304	* We know some arch can have a nodes layout such as
				305	* -------------pfn-------------->
				306	* N0 \| N1 \| N2 \| N0 \| N1 \| N2\|....
				307	*/
				308	if (pfn_to_nid(pfn) != nid)
				309	continue;
				310	if (init_section_page_cgroup(pfn, nid))
				311	goto oom;
				312	}
				313	}
				314	hotplug_memory_notifier(page_cgroup_callback, 0);
				315	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
				316	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
				317	"don't want memory cgroups\n");
				318	return;
				319	oom:
				320	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
				321	panic("Out of memory");
				322	}
				323
				324	void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
				325	{
				326	return;
				327	}
				328
				329	#endif
				330
				331
				332	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
				333
				/*
				 * Held by swap_cgroup_swapon() and swap_cgroup_swapoff() while they
				 * update the map and length of a swap_cgroup_ctrl[] entry.
				 */
				334	static DEFINE_MUTEX(swap_cgroup_mutex);
				/* Bookkeeping for the swap_cgroup records of one swap type. */
				335	struct swap_cgroup_ctrl {
				/* Array of pages that hold the struct swap_cgroup records. */
				336	struct page **map;
				/* Number of entries in map. */
				337	unsigned long length;
				/* Taken around reads/writes of a record's id in cmpxchg and record. */
				338	spinlock_t lock;
				339	};
				340
				/* One control block per swap type; indexed by the swap type number. */
				341	static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
				342
				/* Per-swap-entry record: the id stored for that entry. */
				343	struct swap_cgroup {
				344	unsigned short id;
				345	};
				/* Number of struct swap_cgroup records that fit in one page. */
				346	#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
				347
				348	/*
				349	* SwapCgroup implements "lookup" and "exchange" operations.
				350	* In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
				351	* against SwapCache. At swap_free(), this is accessed directly from swap.
				352	*
				353	* This means,
				354	* - we have no race in "exchange" when we're accessed via SwapCache because
				355	* SwapCache(and its swp_entry) is under lock.
				356	* - When called via swap_free(), there is no user of this entry and no race.
				357	* Then, we don't need lock around "exchange".
				358	*
				359	* TODO: we can push these buffers out to HIGHMEM.
				360	*/
				361
				362	/*
				363	* allocate buffer for swap_cgroup.
				364	*/
				365	static int swap_cgroup_prepare(int type)
				366	{
				367	struct page *page;
				368	struct swap_cgroup_ctrl *ctrl;
				369	unsigned long idx, max;
				370
				371	ctrl = &swap_cgroup_ctrl[type];
				372
				373	for (idx = 0; idx < ctrl->length; idx++) {
				374	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				375	if (!page)
				376	goto not_enough_page;
				377	ctrl->map[idx] = page;
				378	}
				379	return 0;
				380	not_enough_page:
				381	max = idx;
				382	for (idx = 0; idx < max; idx++)
				383	__free_page(ctrl->map[idx]);
				384
				385	return -ENOMEM;
				386	}
				387
				388	static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
				389	struct swap_cgroup_ctrl **ctrlp)
				390	{
				391	pgoff_t offset = swp_offset(ent);
				392	struct swap_cgroup_ctrl *ctrl;
				393	struct page *mappage;
				394	struct swap_cgroup *sc;
				395
				396	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
				397	if (ctrlp)
				398	*ctrlp = ctrl;
				399
				400	mappage = ctrl->map[offset / SC_PER_PAGE];
				401	sc = page_address(mappage);
				402	return sc + offset % SC_PER_PAGE;
				403	}
				404
				405	/**
				406	* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
				407	* @end: swap entry to be cmpxchged
				408	* @old: old id
				409	* @new: new id
				410	*
				411	* Returns old id at success, 0 at failure.
				412	* (There is no mem_cgroup using 0 as its id)
				413	*/
				414	unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
				415	unsigned short old, unsigned short new)
				416	{
				417	struct swap_cgroup_ctrl *ctrl;
				418	struct swap_cgroup *sc;
				419	unsigned long flags;
				420	unsigned short retval;
				421
				422	sc = lookup_swap_cgroup(ent, &ctrl);
				423
				424	spin_lock_irqsave(&ctrl->lock, flags);
				425	retval = sc->id;
				426	if (retval == old)
				427	sc->id = new;
				428	else
				429	retval = 0;
				430	spin_unlock_irqrestore(&ctrl->lock, flags);
				431	return retval;
				432	}
				433
				434	/**
				435	* swap_cgroup_record - record mem_cgroup for this swp_entry.
				436	* @ent: swap entry to be recorded into
				437	* @mem: mem_cgroup to be recorded
				438	*
				439	* Returns old value at success, 0 at failure.
				440	* (Of course, old value can be 0.)
				441	*/
				442	unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
				443	{
				444	struct swap_cgroup_ctrl *ctrl;
				445	struct swap_cgroup *sc;
				446	unsigned short old;
				447	unsigned long flags;
				448
				449	sc = lookup_swap_cgroup(ent, &ctrl);
				450
				451	spin_lock_irqsave(&ctrl->lock, flags);
				452	old = sc->id;
				453	sc->id = id;
				454	spin_unlock_irqrestore(&ctrl->lock, flags);
				455
				456	return old;
				457	}
				458
				459	/**
				460	* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
				461	* @ent: swap entry to be looked up.
				462	*
				463	* Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
				464	*/
				465	unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
				466	{
				467	return lookup_swap_cgroup(ent, NULL)->id;
				468	}
				469
				470	int swap_cgroup_swapon(int type, unsigned long max_pages)
				471	{
				472	void *array;
				473	unsigned long array_size;
				474	unsigned long length;
				475	struct swap_cgroup_ctrl *ctrl;
				476
				477	if (!do_swap_account)
				478	return 0;
				479
				480	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
				481	array_size = length * sizeof(void *);
				482
				483	array = vzalloc(array_size);
				484	if (!array)
				485	goto nomem;
				486
				487	ctrl = &swap_cgroup_ctrl[type];
				488	mutex_lock(&swap_cgroup_mutex);
				489	ctrl->length = length;
				490	ctrl->map = array;
				491	spin_lock_init(&ctrl->lock);
				492	if (swap_cgroup_prepare(type)) {
				493	/* memory shortage */
				494	ctrl->map = NULL;
				495	ctrl->length = 0;
				496	mutex_unlock(&swap_cgroup_mutex);
				497	vfree(array);
				498	goto nomem;
				499	}
				500	mutex_unlock(&swap_cgroup_mutex);
				501
				502	return 0;
				503	nomem:
				504	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
				505	printk(KERN_INFO
				506	"swap_cgroup can be disabled by swapaccount=0 boot option\n");
				507	return -ENOMEM;
				508	}
				509
				510	void swap_cgroup_swapoff(int type)
				511	{
				512	struct page **map;
				513	unsigned long i, length;
				514	struct swap_cgroup_ctrl *ctrl;
				515
				516	if (!do_swap_account)
				517	return;
				518
				519	mutex_lock(&swap_cgroup_mutex);
				520	ctrl = &swap_cgroup_ctrl[type];
				521	map = ctrl->map;
				522	length = ctrl->length;
				523	ctrl->map = NULL;
				524	ctrl->length = 0;
				525	mutex_unlock(&swap_cgroup_mutex);
				526
				527	if (map) {
				528	for (i = 0; i < length; i++) {
				529	struct page *page = map[i];
				530	if (page)
				531	__free_page(page);
				532	}
				533	vfree(map);
				534	}
				535	}
				536
				537	#endif