blob: 90120387778952150ce6481e5cb7d318d6fccd67 [file] [log] [blame]
lh9ed821d2023-04-07 01:36:19 -07001#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11#include <linux/swapops.h>
12#include <linux/kmemleak.h>
13
/*
 * Running total, in bytes, of the page_cgroup tables allocated by this file.
 * It is only ever increased (at allocation time) and is reported once the
 * boot-time initialisation has finished.
 */
static unsigned long total_usage;
15
/*
 * Initialise the pcg_lock spinlock of @nr_pages consecutive page_cgroup
 * entries, starting with the entry at @pc.
 *
 * The loop is only compiled in when CONFIG_PREEMPT_RT_BASE is defined;
 * without it this function has an empty body.
 */
static void page_cgroup_lock_init(struct page_cgroup *pc, int nr_pages)
{
#ifdef CONFIG_PREEMPT_RT_BASE
	while (nr_pages) {
		spin_lock_init(&pc->pcg_lock);
		pc++;
		nr_pages--;
	}
#endif
}
23
24#if !defined(CONFIG_SPARSEMEM)
25
26
27void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
28{
29 pgdat->node_page_cgroup = NULL;
30}
31
32struct page_cgroup *lookup_page_cgroup(struct page *page)
33{
34 unsigned long pfn = page_to_pfn(page);
35 unsigned long offset;
36 struct page_cgroup *base;
37
38 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
39#ifdef CONFIG_DEBUG_VM
40 /*
41 * The sanity checks the page allocator does upon freeing a
42 * page can reach here before the page_cgroup arrays are
43 * allocated when feeding a range of pages to the allocator
44 * for the first time during bootup or memory hotplug.
45 */
46 if (unlikely(!base))
47 return NULL;
48#endif
49 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
50 return base + offset;
51}
52
53static int __init alloc_node_page_cgroup(int nid)
54{
55 struct page_cgroup *base;
56 unsigned long table_size;
57 unsigned long nr_pages;
58
59 nr_pages = NODE_DATA(nid)->node_spanned_pages;
60 if (!nr_pages)
61 return 0;
62
63 table_size = sizeof(struct page_cgroup) * nr_pages;
64
65 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
66 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
67 if (!base)
68 return -ENOMEM;
69 NODE_DATA(nid)->node_page_cgroup = base;
70 total_usage += table_size;
71 page_cgroup_lock_init(base, nr_pages);
72 return 0;
73}
74
75void __init page_cgroup_init_flatmem(void)
76{
77
78 int nid, fail;
79
80 if (mem_cgroup_disabled())
81 return;
82
83 for_each_online_node(nid) {
84 fail = alloc_node_page_cgroup(nid);
85 if (fail)
86 goto fail;
87 }
88 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
89 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
90 " don't want memory cgroups\n");
91 return;
92fail:
93 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
94 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
95 panic("Out of memory");
96}
97
#else /* CONFIG_SPARSEMEM */
99
100struct page_cgroup *lookup_page_cgroup(struct page *page)
101{
102 unsigned long pfn = page_to_pfn(page);
103 struct mem_section *section = __pfn_to_section(pfn);
104#ifdef CONFIG_DEBUG_VM
105 /*
106 * The sanity checks the page allocator does upon freeing a
107 * page can reach here before the page_cgroup arrays are
108 * allocated when feeding a range of pages to the allocator
109 * for the first time during bootup or memory hotplug.
110 */
111 if (!section->page_cgroup)
112 return NULL;
113#endif
114 return section->page_cgroup + pfn;
115}
116
117static void *__meminit alloc_page_cgroup(size_t size, int nid)
118{
119 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
120 void *addr = NULL;
121
122 addr = alloc_pages_exact_nid(nid, size, flags);
123 if (addr) {
124 kmemleak_alloc(addr, size, 1, flags);
125 return addr;
126 }
127
128 if (node_state(nid, N_HIGH_MEMORY))
129 addr = vzalloc_node(size, nid);
130 else
131 addr = vzalloc(size);
132
133 return addr;
134}
135
136static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
137{
138 struct mem_section *section;
139 struct page_cgroup *base;
140 unsigned long table_size;
141
142 section = __pfn_to_section(pfn);
143
144 if (section->page_cgroup)
145 return 0;
146
147 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
148 base = alloc_page_cgroup(table_size, nid);
149
150 /*
151 * The value stored in section->page_cgroup is (base - pfn)
152 * and it does not point to the memory block allocated above,
153 * causing kmemleak false positives.
154 */
155 kmemleak_not_leak(base);
156
157 if (!base) {
158 printk(KERN_ERR "page cgroup allocation failure\n");
159 return -ENOMEM;
160 }
161
162 page_cgroup_lock_init(base, PAGES_PER_SECTION);
163
164 /*
165 * The passed "pfn" may not be aligned to SECTION. For the calculation
166 * we need to apply a mask.
167 */
168 pfn &= PAGE_SECTION_MASK;
169 section->page_cgroup = base - pfn;
170 total_usage += table_size;
171 return 0;
172}
173#ifdef CONFIG_MEMORY_HOTPLUG
174static void free_page_cgroup(void *addr)
175{
176 if (is_vmalloc_addr(addr)) {
177 vfree(addr);
178 } else {
179 struct page *page = virt_to_page(addr);
180 size_t table_size =
181 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
182
183 BUG_ON(PageReserved(page));
184 kmemleak_free(addr);
185 free_pages_exact(addr, table_size);
186 }
187}
188
189void __free_page_cgroup(unsigned long pfn)
190{
191 struct mem_section *ms;
192 struct page_cgroup *base;
193
194 ms = __pfn_to_section(pfn);
195 if (!ms || !ms->page_cgroup)
196 return;
197 base = ms->page_cgroup + pfn;
198 free_page_cgroup(base);
199 ms->page_cgroup = NULL;
200}
201
202int __meminit online_page_cgroup(unsigned long start_pfn,
203 unsigned long nr_pages,
204 int nid)
205{
206 unsigned long start, end, pfn;
207 int fail = 0;
208
209 start = SECTION_ALIGN_DOWN(start_pfn);
210 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
211
212 if (nid == -1) {
213 /*
214 * In this case, "nid" already exists and contains valid memory.
215 * "start_pfn" passed to us is a pfn which is an arg for
216 * online__pages(), and start_pfn should exist.
217 */
218 nid = pfn_to_nid(start_pfn);
219 VM_BUG_ON(!node_state(nid, N_ONLINE));
220 }
221
222 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
223 if (!pfn_present(pfn))
224 continue;
225 fail = init_section_page_cgroup(pfn, nid);
226 }
227 if (!fail)
228 return 0;
229
230 /* rollback */
231 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
232 __free_page_cgroup(pfn);
233
234 return -ENOMEM;
235}
236
237int __meminit offline_page_cgroup(unsigned long start_pfn,
238 unsigned long nr_pages, int nid)
239{
240 unsigned long start, end, pfn;
241
242 start = SECTION_ALIGN_DOWN(start_pfn);
243 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
244
245 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
246 __free_page_cgroup(pfn);
247 return 0;
248
249}
250
251static int __meminit page_cgroup_callback(struct notifier_block *self,
252 unsigned long action, void *arg)
253{
254 struct memory_notify *mn = arg;
255 int ret = 0;
256 switch (action) {
257 case MEM_GOING_ONLINE:
258 ret = online_page_cgroup(mn->start_pfn,
259 mn->nr_pages, mn->status_change_nid);
260 break;
261 case MEM_OFFLINE:
262 offline_page_cgroup(mn->start_pfn,
263 mn->nr_pages, mn->status_change_nid);
264 break;
265 case MEM_CANCEL_ONLINE:
266 case MEM_GOING_OFFLINE:
267 break;
268 case MEM_ONLINE:
269 case MEM_CANCEL_OFFLINE:
270 break;
271 }
272
273 return notifier_from_errno(ret);
274}
275
276#endif
277
278void __init page_cgroup_init(void)
279{
280 unsigned long pfn;
281 int nid;
282
283 if (mem_cgroup_disabled())
284 return;
285
286 for_each_node_state(nid, N_HIGH_MEMORY) {
287 unsigned long start_pfn, end_pfn;
288
289 start_pfn = node_start_pfn(nid);
290 end_pfn = node_end_pfn(nid);
291 /*
292 * start_pfn and end_pfn may not be aligned to SECTION and the
293 * page->flags of out of node pages are not initialized. So we
294 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
295 */
296 for (pfn = start_pfn;
297 pfn < end_pfn;
298 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
299
300 if (!pfn_valid(pfn))
301 continue;
302 /*
303 * Nodes's pfns can be overlapping.
304 * We know some arch can have a nodes layout such as
305 * -------------pfn-------------->
306 * N0 | N1 | N2 | N0 | N1 | N2|....
307 */
308 if (pfn_to_nid(pfn) != nid)
309 continue;
310 if (init_section_page_cgroup(pfn, nid))
311 goto oom;
312 }
313 }
314 hotplug_memory_notifier(page_cgroup_callback, 0);
315 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
316 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
317 "don't want memory cgroups\n");
318 return;
319oom:
320 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
321 panic("Out of memory");
322}
323
324void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
325{
326 return;
327}
328
329#endif
330
331
332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
333
334static DEFINE_MUTEX(swap_cgroup_mutex);
335struct swap_cgroup_ctrl {
336 struct page **map;
337 unsigned long length;
338 spinlock_t lock;
339};
340
341static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
342
343struct swap_cgroup {
344 unsigned short id;
345};
346#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
347
/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means:
 *  - there is no race in "exchange" when we are accessed via SwapCache,
 *    because SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no other user of this entry and
 *    hence no race.
 * So, in principle, no lock is needed around "exchange".
 *
 * NOTE(review): swap_cgroup_cmpxchg() and swap_cgroup_record() below
 * nevertheless update the id under ctrl->lock; this comment may predate
 * that lock and should be re-checked.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
361
362/*
363 * allocate buffer for swap_cgroup.
364 */
365static int swap_cgroup_prepare(int type)
366{
367 struct page *page;
368 struct swap_cgroup_ctrl *ctrl;
369 unsigned long idx, max;
370
371 ctrl = &swap_cgroup_ctrl[type];
372
373 for (idx = 0; idx < ctrl->length; idx++) {
374 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
375 if (!page)
376 goto not_enough_page;
377 ctrl->map[idx] = page;
378 }
379 return 0;
380not_enough_page:
381 max = idx;
382 for (idx = 0; idx < max; idx++)
383 __free_page(ctrl->map[idx]);
384
385 return -ENOMEM;
386}
387
388static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
389 struct swap_cgroup_ctrl **ctrlp)
390{
391 pgoff_t offset = swp_offset(ent);
392 struct swap_cgroup_ctrl *ctrl;
393 struct page *mappage;
394 struct swap_cgroup *sc;
395
396 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
397 if (ctrlp)
398 *ctrlp = ctrl;
399
400 mappage = ctrl->map[offset / SC_PER_PAGE];
401 sc = page_address(mappage);
402 return sc + offset % SC_PER_PAGE;
403}
404
405/**
406 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
407 * @end: swap entry to be cmpxchged
408 * @old: old id
409 * @new: new id
410 *
411 * Returns old id at success, 0 at failure.
412 * (There is no mem_cgroup using 0 as its id)
413 */
414unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
415 unsigned short old, unsigned short new)
416{
417 struct swap_cgroup_ctrl *ctrl;
418 struct swap_cgroup *sc;
419 unsigned long flags;
420 unsigned short retval;
421
422 sc = lookup_swap_cgroup(ent, &ctrl);
423
424 spin_lock_irqsave(&ctrl->lock, flags);
425 retval = sc->id;
426 if (retval == old)
427 sc->id = new;
428 else
429 retval = 0;
430 spin_unlock_irqrestore(&ctrl->lock, flags);
431 return retval;
432}
433
434/**
435 * swap_cgroup_record - record mem_cgroup for this swp_entry.
436 * @ent: swap entry to be recorded into
437 * @mem: mem_cgroup to be recorded
438 *
439 * Returns old value at success, 0 at failure.
440 * (Of course, old value can be 0.)
441 */
442unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
443{
444 struct swap_cgroup_ctrl *ctrl;
445 struct swap_cgroup *sc;
446 unsigned short old;
447 unsigned long flags;
448
449 sc = lookup_swap_cgroup(ent, &ctrl);
450
451 spin_lock_irqsave(&ctrl->lock, flags);
452 old = sc->id;
453 sc->id = id;
454 spin_unlock_irqrestore(&ctrl->lock, flags);
455
456 return old;
457}
458
459/**
460 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
461 * @ent: swap entry to be looked up.
462 *
463 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
464 */
465unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
466{
467 return lookup_swap_cgroup(ent, NULL)->id;
468}
469
470int swap_cgroup_swapon(int type, unsigned long max_pages)
471{
472 void *array;
473 unsigned long array_size;
474 unsigned long length;
475 struct swap_cgroup_ctrl *ctrl;
476
477 if (!do_swap_account)
478 return 0;
479
480 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
481 array_size = length * sizeof(void *);
482
483 array = vzalloc(array_size);
484 if (!array)
485 goto nomem;
486
487 ctrl = &swap_cgroup_ctrl[type];
488 mutex_lock(&swap_cgroup_mutex);
489 ctrl->length = length;
490 ctrl->map = array;
491 spin_lock_init(&ctrl->lock);
492 if (swap_cgroup_prepare(type)) {
493 /* memory shortage */
494 ctrl->map = NULL;
495 ctrl->length = 0;
496 mutex_unlock(&swap_cgroup_mutex);
497 vfree(array);
498 goto nomem;
499 }
500 mutex_unlock(&swap_cgroup_mutex);
501
502 return 0;
503nomem:
504 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
505 printk(KERN_INFO
506 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
507 return -ENOMEM;
508}
509
510void swap_cgroup_swapoff(int type)
511{
512 struct page **map;
513 unsigned long i, length;
514 struct swap_cgroup_ctrl *ctrl;
515
516 if (!do_swap_account)
517 return;
518
519 mutex_lock(&swap_cgroup_mutex);
520 ctrl = &swap_cgroup_ctrl[type];
521 map = ctrl->map;
522 length = ctrl->length;
523 ctrl->map = NULL;
524 ctrl->length = 0;
525 mutex_unlock(&swap_cgroup_mutex);
526
527 if (map) {
528 for (i = 0; i < length; i++) {
529 struct page *page = map[i];
530 if (page)
531 __free_page(page);
532 }
533 vfree(map);
534 }
535}
536
537#endif