// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>

#include <asm/elf.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	anon = get_mm_counter(mm, MM_ANONPAGES);
	file = get_mm_counter(mm, MM_FILEPAGES);
	shmem = get_mm_counter(mm, MM_SHMEMPAGES);

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss. Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher. Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = anon + file + shmem;
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
	swap = get_mm_counter(mm, MM_SWAPENTS);
	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"RssAnon:\t%8lu kB\n"
		"RssFile:\t%8lu kB\n"
		"RssShmem:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmPMD:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		total_vm << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		anon << (PAGE_SHIFT-10),
		file << (PAGE_SHIFT-10),
		shmem << (PAGE_SHIFT-10),
		mm->data_vm << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		ptes >> 10,
		pmds >> 10,
		swap << (PAGE_SHIFT-10));
	hugetlb_report_usage(m, mm);
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES) +
			get_mm_counter(mm, MM_SHMEMPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}
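
/*
 * Illustrative sketch only (not from the original source): the values
 * filled in above surface as the space-separated, page-unit fields of
 * /proc/PID/statm, roughly
 *
 *	size resident shared text lib data dt
 *
 * where "lib" and "dt" are reported as 0. A hypothetical read:
 *
 *	$ cat /proc/self/statm
 *	2458 1031 906 163 0 341 0
 */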

#ifdef CONFIG_NUMA
/*
 * Save get_task_policy() for show_numa_map().
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
{
	const char __user *name = vma_get_anon_name(vma);
	struct mm_struct *mm = vma->vm_mm;

	unsigned long page_start_vaddr;
	unsigned long page_offset;
	unsigned long num_pages;
	unsigned long max_len = NAME_MAX;
	int i;

	page_start_vaddr = (unsigned long)name & PAGE_MASK;
	page_offset = (unsigned long)name - page_start_vaddr;
	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);

	seq_puts(m, "[anon:");

	for (i = 0; i < num_pages; i++) {
		int len;
		int write_len;
		const char *kaddr;
		long pages_pinned;
		struct page *page;

		pages_pinned = get_user_pages_remote(current, mm,
				page_start_vaddr, 1, 0, &page, NULL, NULL);
		if (pages_pinned < 1) {
			seq_puts(m, "<fault>]");
			return;
		}

		kaddr = (const char *)kmap(page);
		len = min(max_len, PAGE_SIZE - page_offset);
		write_len = strnlen(kaddr + page_offset, len);
		seq_write(m, kaddr + page_offset, write_len);
		kunmap(page);
		put_page(page);

		/* if strnlen hit a null terminator then we're done */
		if (write_len != len)
			break;

		max_len -= len;
		page_offset = 0;
		page_start_vaddr += PAGE_SIZE;
	}

	seq_putc(m, ']');
}

static void vma_stop(struct proc_maps_private *priv)
{
	struct mm_struct *mm = priv->mm;

	release_task_mempolicy(priv);
	up_read(&mm->mmap_sem);
	mmput(mm);
}

static struct vm_area_struct *
m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma == priv->tail_vma)
		return NULL;
	return vma->vm_next ?: priv->tail_vma;
}

static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
{
	if (m->count < m->size)	/* vma is copied successfully */
		m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL;
}

static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned int pos = *ppos;

	/* See m_cache_vma(). Zero at the start or after lseek. */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = priv->mm;
	if (!mm || !mmget_not_zero(mm))
		return NULL;

	down_read(&mm->mmap_sem);
	hold_task_mempolicy(priv);
	priv->tail_vma = get_gate_vma(mm);

	if (last_addr) {
		vma = find_vma(mm, last_addr - 1);
		if (vma && vma->vm_start <= last_addr)
			vma = m_next_vma(priv, vma);
		if (vma)
			return vma;
	}

	m->version = 0;
	if (pos < mm->map_count) {
		for (vma = mm->mmap; pos; pos--) {
			m->version = vma->vm_start;
			vma = vma->vm_next;
		}
		return vma;
	}

	/* we do not bother to update m->version in this case */
	if (pos == mm->map_count && priv->tail_vma)
		return priv->tail_vma;

	vma_stop(priv);
	return NULL;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *next;

	(*pos)++;
	next = m_next_vma(priv, v);
	if (!next)
		vma_stop(priv);
	return next;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;

	if (!IS_ERR_OR_NULL(v))
		vma_stop(priv);
	if (priv->task) {
		put_task_struct(priv->task);
		priv->task = NULL;
	}
}

static int proc_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops, int psize)
{
	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);

	if (!priv)
		return -ENOMEM;

	priv->inode = inode;
	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(priv->mm)) {
		int err = PTR_ERR(priv->mm);

		seq_release_private(inode, file);
		return err;
	}

	return 0;
}

static int proc_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->mm)
		mmdrop(priv->mm);

	kfree(priv->rollup);
	return seq_release_private(inode, file);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct proc_maps_private));
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 */
static int is_stack(struct vm_area_struct *vma)
{
	/*
	 * We make no effort to guess what a given thread considers to be
	 * its "stack". It's not even well-defined for programs written in
	 * languages like Go.
	 */
	return vma->vm_start <= vma->vm_mm->start_stack &&
		vma->vm_end >= vma->vm_mm->start_stack;
}

static void show_vma_header_prefix(struct seq_file *m,
				   unsigned long start, unsigned long end,
				   vm_flags_t flags, unsigned long long pgoff,
				   dev_t dev, unsigned long ino)
{
	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
		   start,
		   end,
		   flags & VM_READ ? 'r' : '-',
		   flags & VM_WRITE ? 'w' : '-',
		   flags & VM_EXEC ? 'x' : '-',
		   flags & VM_MAYSHARE ? 's' : 'p',
		   pgoff,
		   MAJOR(dev), MINOR(dev), ino);
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	const char *name = NULL;

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	start = vma->vm_start;
	end = vma->vm_end;
	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		seq_pad(m, ' ');
		seq_file_path(m, file, "\n");
		goto done;
	}

	if (vma->vm_ops && vma->vm_ops->name) {
		name = vma->vm_ops->name(vma);
		if (name)
			goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		if (is_stack(vma)) {
			name = "[stack]";
			goto done;
		}

		if (vma_get_anon_name(vma)) {
			seq_pad(m, ' ');
			seq_print_vma_name(m, vma);
		}
	}

done:
	if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	show_map_vma(m, v, is_pid);
	m_cache_vma(m, v);
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * The PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we use a 64-bit fixed-point
 * pss counter, so (pss >> PSS_SHIFT) is the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12
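
/*
 * Worked example of the fixed-point accounting (illustrative only,
 * assuming a 4K page size): a page mapped by three processes contributes
 *
 *	(PAGE_SIZE << PSS_SHIFT) / 3 = (4096 << 12) / 3 = 5592405
 *
 * to each mapper's pss counter. When reported, the accumulated counter is
 * converted to kB with "pss >> (10 + PSS_SHIFT)", so the truncation error
 * per accounted page stays well under a byte.
 */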

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	bool first;
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long lazyfree;
	unsigned long anonymous_thp;
	unsigned long shmem_thp;
	unsigned long swap;
	unsigned long shared_hugetlb;
	unsigned long private_hugetlb;
	unsigned long first_vma_start;
	u64 pss;
	u64 pss_locked;
	u64 swap_pss;
	bool check_shmem_swap;
};

static void smaps_account(struct mem_size_stats *mss, struct page *page,
		bool compound, bool young, bool dirty, bool locked)
{
	int i, nr = compound ? 1 << compound_order(page) : 1;
	unsigned long size = nr * PAGE_SIZE;

	if (PageAnon(page)) {
		mss->anonymous += size;
		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
			mss->lazyfree += size;
	}

	mss->resident += size;
	/* Accumulate the size in pages that have been accessed. */
	if (young || page_is_young(page) || PageReferenced(page))
		mss->referenced += size;

	/*
	 * page_count(page) == 1 guarantees the page is mapped exactly once.
	 * If any subpage of the compound page is mapped with a PTE, it
	 * would elevate page_count().
	 */
	if (page_count(page) == 1) {
		if (dirty || PageDirty(page))
			mss->private_dirty += size;
		else
			mss->private_clean += size;
		mss->pss += (u64)size << PSS_SHIFT;
		if (locked)
			mss->pss_locked += (u64)size << PSS_SHIFT;
		return;
	}

	for (i = 0; i < nr; i++, page++) {
		int mapcount = page_mapcount(page);
		unsigned long pss = (PAGE_SIZE << PSS_SHIFT);

		if (mapcount >= 2) {
			if (dirty || PageDirty(page))
				mss->shared_dirty += PAGE_SIZE;
			else
				mss->shared_clean += PAGE_SIZE;
			mss->pss += pss / mapcount;
			if (locked)
				mss->pss_locked += pss / mapcount;
		} else {
			if (dirty || PageDirty(page))
				mss->private_dirty += PAGE_SIZE;
			else
				mss->private_clean += PAGE_SIZE;
			mss->pss += pss;
			if (locked)
				mss->pss_locked += pss;
		}
	}
}

#ifdef CONFIG_SHMEM
static int smaps_pte_hole(unsigned long addr, unsigned long end,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;

	mss->swap += shmem_partial_swap_usage(
			walk->vma->vm_file->f_mapping, addr, end);

	return 0;
}
#endif

static void smaps_pte_entry(pte_t *pte, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page = NULL;

	if (pte_present(*pte)) {
		page = vm_normal_page(vma, addr, *pte);
	} else if (is_swap_pte(*pte)) {
		swp_entry_t swpent = pte_to_swp_entry(*pte);

		if (!non_swap_entry(swpent)) {
			int mapcount;

			mss->swap += PAGE_SIZE;
			mapcount = swp_swapcount(swpent);
			if (mapcount >= 2) {
				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;

				do_div(pss_delta, mapcount);
				mss->swap_pss += pss_delta;
			} else {
				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
			}
		} else if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
		else if (is_device_private_entry(swpent))
			page = device_private_entry_to_page(swpent);
	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
							&& pte_none(*pte))) {
		page = find_get_entry(vma->vm_file->f_mapping,
						linear_page_index(vma, addr));
		if (!page)
			return;

		if (radix_tree_exceptional_entry(page))
			mss->swap += PAGE_SIZE;
		else
			put_page(page);

		return;
	}

	if (!page)
		return;

	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page;

	/* FOLL_DUMP will return -EFAULT on huge zero page */
	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
	if (IS_ERR_OR_NULL(page))
		return;
	if (PageAnon(page))
		mss->anonymous_thp += HPAGE_PMD_SIZE;
	else if (PageSwapBacked(page))
		mss->shmem_thp += HPAGE_PMD_SIZE;
	else if (is_zone_device_page(page))
		/* pass */;
	else
		VM_BUG_ON_PAGE(1, page);
	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
}
#endif

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (pmd_present(*pmd))
			smaps_pmd_entry(pmd, addr, walk);
		spin_unlock(ptl);
		goto out;
	}

	if (pmd_trans_unstable(pmd))
		goto out;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(pte, addr, walk);
	pte_unmap_unlock(pte - 1, ptl);
out:
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 */
	static const char mnemonics[BITS_PER_LONG][2] = {
		/*
		 * In case we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_DENYWRITE)]	= "dw",
#ifdef CONFIG_X86_INTEL_MPX
		[ilog2(VM_MPX)]		= "mp",
#endif
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_WIPEONFORK)]	= "wf",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
		[ilog2(VM_UFFD_MISSING)]= "um",
		[ilog2(VM_UFFD_WP)]	= "uw",
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
		/* These come out via ProtectionKey: */
		[ilog2(VM_PKEY_BIT0)]	= "",
		[ilog2(VM_PKEY_BIT1)]	= "",
		[ilog2(VM_PKEY_BIT2)]	= "",
		[ilog2(VM_PKEY_BIT3)]	= "",
#endif
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (!mnemonics[i][0])
			continue;
		if (vma->vm_flags & (1UL << i)) {
			seq_printf(m, "%c%c ",
				   mnemonics[i][0], mnemonics[i][1]);
		}
	}
	seq_putc(m, '\n');
}

#ifdef CONFIG_HUGETLB_PAGE
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct page *page = NULL;

	if (pte_present(*pte)) {
		page = vm_normal_page(vma, addr, *pte);
	} else if (is_swap_pte(*pte)) {
		swp_entry_t swpent = pte_to_swp_entry(*pte);

		if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
		else if (is_device_private_entry(swpent))
			page = device_private_entry_to_page(swpent);
	}
	if (page) {
		int mapcount = page_mapcount(page);

		if (mapcount >= 2)
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
	}
	return 0;
}
#endif /* HUGETLB_PAGE */

void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
{
}

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss_stack;
	struct mem_size_stats *mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
		.hugetlb_entry = smaps_hugetlb_range,
#endif
		.mm = vma->vm_mm,
	};
	int ret = 0;
	bool rollup_mode;
	bool last_vma;

	if (priv->rollup) {
		rollup_mode = true;
		mss = priv->rollup;
		if (mss->first) {
			mss->first_vma_start = vma->vm_start;
			mss->first = false;
		}
		last_vma = !m_next_vma(priv, vma);
	} else {
		rollup_mode = false;
		memset(&mss_stack, 0, sizeof(mss_stack));
		mss = &mss_stack;
	}

	smaps_walk.private = mss;

#ifdef CONFIG_SHMEM
	/* In case of smaps_rollup, reset the value from previous vma */
	mss->check_shmem_swap = false;
	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
		/*
		 * For shared or readonly shmem mappings we know that all
		 * swapped out pages belong to the shmem object, and we can
		 * obtain the swap value much more efficiently. For private
		 * writable mappings, we might have COW pages that are
		 * not affected by the parent swapped out pages of the shmem
		 * object, so we have to distinguish them during the page walk.
		 * Unless we know that the shmem object (or the part mapped by
		 * our VMA) has no swapped out pages at all.
		 */
		unsigned long shmem_swapped = shmem_swap_usage(vma);

		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
					!(vma->vm_flags & VM_WRITE)) {
			mss->swap += shmem_swapped;
		} else {
			mss->check_shmem_swap = true;
			smaps_walk.pte_hole = smaps_pte_hole;
		}
	}
#endif
	/* mmap_sem is held in m_start */
	walk_page_vma(vma, &smaps_walk);

	if (!rollup_mode) {
		show_map_vma(m, vma, is_pid);
		if (vma_get_anon_name(vma)) {
			seq_puts(m, "Name: ");
			seq_print_vma_name(m, vma);
			seq_putc(m, '\n');
		}
	} else if (last_vma) {
		show_vma_header_prefix(
			m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
		seq_pad(m, ' ');
		seq_puts(m, "[rollup]\n");
	} else {
		ret = SEQ_SKIP;
	}

	if (!rollup_mode)
		seq_printf(m,
			   "Size:           %8lu kB\n"
			   "KernelPageSize: %8lu kB\n"
			   "MMUPageSize:    %8lu kB\n",
			   (vma->vm_end - vma->vm_start) >> 10,
			   vma_kernel_pagesize(vma) >> 10,
			   vma_mmu_pagesize(vma) >> 10);


	if (!rollup_mode || last_vma)
		seq_printf(m,
			   "Rss:            %8lu kB\n"
			   "Pss:            %8lu kB\n"
			   "Shared_Clean:   %8lu kB\n"
			   "Shared_Dirty:   %8lu kB\n"
			   "Private_Clean:  %8lu kB\n"
			   "Private_Dirty:  %8lu kB\n"
			   "Referenced:     %8lu kB\n"
			   "Anonymous:      %8lu kB\n"
			   "LazyFree:       %8lu kB\n"
			   "AnonHugePages:  %8lu kB\n"
			   "ShmemPmdMapped: %8lu kB\n"
			   "Shared_Hugetlb: %8lu kB\n"
			   "Private_Hugetlb: %7lu kB\n"
			   "Swap:           %8lu kB\n"
			   "SwapPss:        %8lu kB\n"
			   "Locked:         %8lu kB\n",
			   mss->resident >> 10,
			   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
			   mss->shared_clean >> 10,
			   mss->shared_dirty >> 10,
			   mss->private_clean >> 10,
			   mss->private_dirty >> 10,
			   mss->referenced >> 10,
			   mss->anonymous >> 10,
			   mss->lazyfree >> 10,
			   mss->anonymous_thp >> 10,
			   mss->shmem_thp >> 10,
			   mss->shared_hugetlb >> 10,
			   mss->private_hugetlb >> 10,
			   mss->swap >> 10,
			   (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
			   (unsigned long)(mss->pss_locked >> (10 + PSS_SHIFT)));

	if (!rollup_mode) {
		arch_show_smap(m, vma);
		show_smap_vma_flags(m, vma);
	}
	m_cache_vma(m, vma);
	return ret;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	struct proc_maps_private *priv;
	int ret = do_maps_open(inode, file, &proc_pid_smaps_op);

	if (ret < 0)
		return ret;
	seq = file->private_data;
	priv = seq->private;
	priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
	if (!priv->rollup) {
		proc_map_release(inode, file);
		return -ENOMEM;
	}
	priv->rollup->first = true;
	return 0;
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_pid_smaps_rollup_operations = {
	.open		= pid_smaps_rollup_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};
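
/*
 * Usage sketch (hypothetical output): /proc/PID/smaps_rollup reuses the
 * walk above but accumulates every VMA into priv->rollup and emits one
 * "[rollup]" record spanning the whole address space, e.g.:
 *
 *	$ head -3 /proc/self/smaps_rollup
 *	5624a0000000-7ffc12345000 ---p 00000000 00:00 0    [rollup]
 *	Rss:                1024 kB
 *	Pss:                 512 kB
 */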

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_MM_HIWATER_RSS,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	enum clear_refs_types type;
};

#ifdef CONFIG_MEM_SOFT_DIRTY
static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
	/*
	 * The soft-dirty tracker uses page faults (#PF) to catch writes
	 * to pages, so write-protect the pte as well. See
	 * Documentation/vm/soft-dirty.txt for a full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = *pte;

	if (pte_present(ptent)) {
		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
		ptent = pte_wrprotect(ptent);
		ptent = pte_clear_soft_dirty(ptent);
		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
		set_pte_at(vma->vm_mm, addr, pte, ptent);
	}
}
#else
static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
}
#endif

#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
	pmd_t pmd = *pmdp;

	if (pmd_present(pmd)) {
		/* See comment in change_huge_pmd() */
		pmdp_invalidate(vma, addr, pmdp);
		if (pmd_dirty(*pmdp))
			pmd = pmd_mkdirty(pmd);
		if (pmd_young(*pmdp))
			pmd = pmd_mkyoung(pmd);

		pmd = pmd_wrprotect(pmd);
		pmd = pmd_clear_soft_dirty(pmd);

		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
		pmd = pmd_swp_clear_soft_dirty(pmd);
		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
	}
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
		unsigned long addr, pmd_t *pmdp)
{
}
#endif

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty_pmd(vma, addr, pmd);
			goto out;
		}

		if (!pmd_present(*pmd))
			goto out;

		page = pmd_page(*pmd);

		/* Clear accessed and referenced bits. */
		pmdp_test_and_clear_young(vma, addr, pmd);
		test_and_clear_page_young(page);
		ClearPageReferenced(page);
out:
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		test_and_clear_page_young(page);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static int clear_refs_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = walk->vma;

	if (vma->vm_flags & VM_PFNMAP)
		return 1;

	/*
	 * Writing 1 to /proc/pid/clear_refs affects all pages.
	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
	 * Writing 4 to /proc/pid/clear_refs affects all pages.
	 */
	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
		return 1;
	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
		return 1;
	return 0;
}

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	struct mmu_gather tlb;
	int itype;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct clear_refs_private cp = {
			.type = type,
		};
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.test_walk = clear_refs_test_walk,
			.mm = mm,
			.private = &cp,
		};

		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
			if (down_write_killable(&mm->mmap_sem)) {
				count = -EINTR;
				goto out_mm;
			}

			/*
			 * Writing 5 to /proc/pid/clear_refs resets the peak
			 * resident set size to this mm's current rss value.
			 */
			reset_mm_hiwater_rss(mm);
			up_write(&mm->mmap_sem);
			goto out_mm;
		}

		down_read(&mm->mmap_sem);
		tlb_gather_mmu(&tlb, mm, 0, -1);
		if (type == CLEAR_REFS_SOFT_DIRTY) {
			for (vma = mm->mmap; vma; vma = vma->vm_next) {
				if (!(vma->vm_flags & VM_SOFTDIRTY))
					continue;
				up_read(&mm->mmap_sem);
				if (down_write_killable(&mm->mmap_sem)) {
					count = -EINTR;
					goto out_mm;
				}
				/*
				 * Avoid modifying vma->vm_flags
				 * without locked ops while the
				 * coredump reads the vm_flags.
				 */
				if (!mmget_still_valid(mm)) {
					/*
					 * Silently return "count",
					 * as if get_task_mm() had
					 * failed. FIXME: should this
					 * function return -ESRCH if
					 * get_task_mm() fails, like
					 * when get_proc_task() fails?
					 */
					up_write(&mm->mmap_sem);
					goto out_mm;
				}
				for (vma = mm->mmap; vma; vma = vma->vm_next) {
					vma->vm_flags &= ~VM_SOFTDIRTY;
					vma_set_page_prot(vma);
				}
				downgrade_write(&mm->mmap_sem);
				break;
			}
			mmu_notifier_invalidate_range_start(mm, 0, -1);
		}
		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_end(mm, 0, -1);
		tlb_finish_mmu(&tlb, 0, -1);
		up_read(&mm->mmap_sem);
out_mm:
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};
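
/*
 * Usage sketch (illustrative only, not from the original source):
 *
 *	# echo 1 > /proc/$pid/clear_refs   - clear the referenced bits that
 *					     show up as "Referenced:" in smaps
 *	# echo 4 > /proc/$pid/clear_refs   - clear the soft-dirty bits, so a
 *					     later pagemap read (bit 55) shows
 *					     only pages written since this point
 */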

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool show_pfn;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_PFRAME_BITS		55
#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY		BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_FILE			BIT_ULL(61)
#define PM_SWAP			BIT_ULL(62)
#define PM_PRESENT		BIT_ULL(63)

#define PM_END_OF_BUFFER	1

static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr = start;
	int err = 0;

	while (addr < end) {
		struct vm_area_struct *vma = find_vma(walk->mm, addr);
		pagemap_entry_t pme = make_pme(0, 0);
		/* End of address space hole, which we mark as non-present. */
		unsigned long hole_end;

		if (vma)
			hole_end = min(end, vma->vm_start);
		else
			hole_end = end;

		for (; addr < hole_end; addr += PAGE_SIZE) {
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				goto out;
		}

		if (!vma)
			break;

		/* Addresses in the VMA. */
		if (vma->vm_flags & VM_SOFTDIRTY)
			pme = make_pme(0, PM_SOFT_DIRTY);
		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				goto out;
		}
	}
out:
	return err;
}

static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame = 0, flags = 0;
	struct page *page = NULL;

	if (pte_present(pte)) {
		if (pm->show_pfn)
			frame = pte_pfn(pte);
		flags |= PM_PRESENT;
		page = _vm_normal_page(vma, addr, pte, true);
		if (pte_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags |= PM_SOFT_DIRTY;
		entry = pte_to_swp_entry(pte);
		if (pm->show_pfn)
			frame = swp_type(entry) |
				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
		flags |= PM_SWAP;
		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);

		if (is_device_private_entry(entry))
			page = device_private_entry_to_page(entry);
	}

	if (page && !PageAnon(page))
		flags |= PM_FILE;
	if (page && page_mapcount(page) == 1)
		flags |= PM_MMAP_EXCLUSIVE;
	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	return make_pme(frame, flags);
}

static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct pagemapread *pm = walk->private;
	spinlock_t *ptl;
	pte_t *pte, *orig_pte;
	int err = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptl = pmd_trans_huge_lock(pmdp, vma);
	if (ptl) {
		u64 flags = 0, frame = 0;
		pmd_t pmd = *pmdp;
		struct page *page = NULL;

		if (vma->vm_flags & VM_SOFTDIRTY)
			flags |= PM_SOFT_DIRTY;

		if (pmd_present(pmd)) {
			page = pmd_page(pmd);

			flags |= PM_PRESENT;
			if (pmd_soft_dirty(pmd))
				flags |= PM_SOFT_DIRTY;
			if (pm->show_pfn)
				frame = pmd_pfn(pmd) +
					((addr & ~PMD_MASK) >> PAGE_SHIFT);
		}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
		else if (is_swap_pmd(pmd)) {
			swp_entry_t entry = pmd_to_swp_entry(pmd);
			unsigned long offset;

			if (pm->show_pfn) {
				offset = swp_offset(entry) +
					((addr & ~PMD_MASK) >> PAGE_SHIFT);
				frame = swp_type(entry) |
					(offset << MAX_SWAPFILES_SHIFT);
			}
			flags |= PM_SWAP;
			if (pmd_swp_soft_dirty(pmd))
				flags |= PM_SOFT_DIRTY;
			VM_BUG_ON(!is_pmd_migration_entry(pmd));
			page = migration_entry_to_page(entry);
		}
#endif

		if (page && page_mapcount(page) == 1)
			flags |= PM_MMAP_EXCLUSIVE;

		for (; addr != end; addr += PAGE_SIZE) {
			pagemap_entry_t pme = make_pme(frame, flags);

			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
			if (pm->show_pfn) {
				if (flags & PM_PRESENT)
					frame++;
				else if (flags & PM_SWAP)
					frame += (1 << MAX_SWAPFILES_SHIFT);
			}
		}
		spin_unlock(ptl);
		return err;
	}

	if (pmd_trans_unstable(pmdp))
		return 0;
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

	/*
	 * We can assume that @vma always points to a valid one and @end never
	 * goes beyond vma->vm_end.
	 */
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		pagemap_entry_t pme;

		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	pte_unmap_unlock(orig_pte, ptl);

	cond_resched();

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
/* This function walks within one hugetlb entry in the single call */
static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	struct vm_area_struct *vma = walk->vma;
	u64 flags = 0, frame = 0;
	int err = 0;
	pte_t pte;

	if (vma->vm_flags & VM_SOFTDIRTY)
		flags |= PM_SOFT_DIRTY;

	pte = huge_ptep_get(ptep);
	if (pte_present(pte)) {
		struct page *page = pte_page(pte);

		if (!PageAnon(page))
			flags |= PM_FILE;

		if (page_mapcount(page) == 1)
			flags |= PM_MMAP_EXCLUSIVE;

		flags |= PM_PRESENT;
		if (pm->show_pfn)
			frame = pte_pfn(pte) +
				((addr & ~hmask) >> PAGE_SHIFT);
	}

	for (; addr != end; addr += PAGE_SIZE) {
		pagemap_entry_t pme = make_pme(frame, flags);

		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
		if (pm->show_pfn && (flags & PM_PRESENT))
			frame++;
	}

	cond_resched();

	return err;
}
#endif /* HUGETLB_PAGE */

/*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
 * For each page in the address space, this file contains one 64-bit entry
 * consisting of the following:
 *
 * Bits 0-54  page frame number (PFN) if present
 * Bits 0-4   swap type if swapped
 * Bits 5-54  swap offset if swapped
 * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
 * Bit  56    page exclusively mapped
 * Bits 57-60 zero
 * Bit  61    page is file-page or shared-anon
 * Bit  62    page swapped
 * Bit  63    page present
 *
 * If the page is not present but in swap, then the PFN contains an
 * encoding of the swap file number and the page's offset into the
 * swap. Unmapped pages return a null PFN. This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
 * Efficient users of this interface will use /proc/pid/maps to
 * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
 */
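/*
 * Decoding sketch for userspace (illustrative only; error handling and the
 * hypothetical "vaddr"/"fd" setup omitted): the entry for a virtual address
 * lives at byte offset (vaddr / PAGE_SIZE) * 8 in /proc/PID/pagemap.
 *
 *	uint64_t ent;
 *	pread(fd, &ent, sizeof(ent), (vaddr / sysconf(_SC_PAGESIZE)) * 8);
 *	if (ent & (1ULL << 63))			// page present
 *		pfn = ent & ((1ULL << 55) - 1);	// bits 0-54
 *	else if (ent & (1ULL << 62))		// page swapped
 *		swap_type = ent & 0x1f;		// bits 0-4
 *
 * Note that the PFN field is zeroed unless the reader has CAP_SYS_ADMIN
 * (see pm.show_pfn below).
 */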
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	struct pagemapread pm;
	struct mm_walk pagemap_walk = {};
	unsigned long src;
	unsigned long svpfn;
	unsigned long start_vaddr;
	unsigned long end_vaddr;
	int ret = 0, copied = 0;

	if (!mm || !mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	/* file position must be aligned */
	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
		goto out_mm;

	ret = 0;
	if (!count)
		goto out_mm;

	/* do not disclose physical addresses: attack vector */
	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);

	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_KERNEL);
	ret = -ENOMEM;
	if (!pm.buffer)
		goto out_mm;

	pagemap_walk.pmd_entry = pagemap_pmd_range;
	pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
#endif
	pagemap_walk.mm = mm;
	pagemap_walk.private = &pm;

	src = *ppos;
	svpfn = src / PM_ENTRY_BYTES;
	start_vaddr = svpfn << PAGE_SHIFT;
	end_vaddr = mm->task_size;

	/* watch out for wraparound */
	if (svpfn > mm->task_size >> PAGE_SHIFT)
		start_vaddr = end_vaddr;

	/*
	 * The odds are that this will stop walking way
	 * before end_vaddr, because the length of the
	 * user buffer is tracked in "pm", and the walk
	 * will stop when we hit the end of the buffer.
	 */
	ret = 0;
	while (count && (start_vaddr < end_vaddr)) {
		int len;
		unsigned long end;

		pm.pos = 0;
		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
		/* overflow ? */
		if (end < start_vaddr || end > end_vaddr)
			end = end_vaddr;
		down_read(&mm->mmap_sem);
		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
		up_read(&mm->mmap_sem);
		start_vaddr = end;

		len = min(count, PM_ENTRY_BYTES * pm.pos);
		if (copy_to_user(buf, pm.buffer, len)) {
			ret = -EFAULT;
			goto out_free;
		}
		copied += len;
		buf += len;
		count -= len;
	}
	*ppos += copied;
	if (!ret || ret == PM_END_OF_BUFFER)
		ret = copied;

out_free:
	kfree(pm.buffer);
out_mm:
	mmput(mm);
out:
	return ret;
}

static int pagemap_open(struct inode *inode, struct file *file)
{
	struct mm_struct *mm;

	mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(mm))
		return PTR_ERR(mm);
	file->private_data = mm;
	return 0;
}

static int pagemap_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;

	if (mm)
		mmdrop(mm);
	return 0;
}

const struct file_operations proc_pagemap_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= pagemap_read,
	.open		= pagemap_open,
	.release	= pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

#ifdef CONFIG_NUMA

struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

struct numa_maps_private {
	struct proc_maps_private proc_maps;
	struct numa_maps md;
};

static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
			unsigned long nr_pages)
{
	int count = page_mapcount(page);

	md->pages += nr_pages;
	if (pte_dirty || PageDirty(page))
		md->dirty += nr_pages;

	if (PageSwapCache(page))
		md->swapcache += nr_pages;

	if (PageActive(page) || PageUnevictable(page))
		md->active += nr_pages;

	if (PageWriteback(page))
		md->writeback += nr_pages;

	if (PageAnon(page))
		md->anon += nr_pages;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[page_to_nid(page)] += nr_pages;
}

static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
		unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pte_present(pte))
		return NULL;

	page = vm_normal_page(vma, addr, pte);
	if (!page)
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_MEMORY]))
		return NULL;

	return page;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
					      struct vm_area_struct *vma,
					      unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pmd_present(pmd))
		return NULL;

	page = vm_normal_page_pmd(vma, addr, pmd);
	if (!page)
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_MEMORY]))
		return NULL;

	return page;
}
#endif

static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
		unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md = walk->private;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte;
	pte_t *pte;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		struct page *page;

		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
		if (page)
			gather_stats(page, md, pmd_dirty(*pmd),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	do {
		struct page *page = can_gather_numa_stats(*pte, vma, addr);
		if (!page)
			continue;
		gather_stats(page, md, pte_dirty(*pte), 1);

	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
	return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	pte_t huge_pte = huge_ptep_get(pte);
	struct numa_maps *md;
	struct page *page;

	if (!pte_present(huge_pte))
		return 0;

	page = pte_page(huge_pte);
	if (!page)
		return 0;

	md = walk->private;
	gather_stats(page, md, pte_dirty(huge_pte), 1);
	return 0;
}

#else
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	return 0;
}
#endif

/*
 * Display pages allocated per node and memory policy via /proc.
 */
static int show_numa_map(struct seq_file *m, void *v, int is_pid)
{
	struct numa_maps_private *numa_priv = m->private;
	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
	struct vm_area_struct *vma = v;
	struct numa_maps *md = &numa_priv->md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct mm_walk walk = {
		.hugetlb_entry = gather_hugetlb_stats,
		.pmd_entry = gather_pte_stats,
		.private = md,
		.mm = mm,
	};
	struct mempolicy *pol;
	char buffer[64];
	int nid;

	if (!mm)
		return 0;

	/* Ensure we start with an empty set of numa_maps statistics. */
	memset(md, 0, sizeof(*md));

	pol = __get_vma_policy(vma, vma->vm_start);
	if (pol) {
		mpol_to_str(buffer, sizeof(buffer), pol);
		mpol_cond_put(pol);
	} else {
		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
	}

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_puts(m, " file=");
		seq_file_path(m, file, "\n\t= ");
	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
		seq_puts(m, " heap");
	} else if (is_stack(vma)) {
		seq_puts(m, " stack");
	}

	if (is_vm_hugetlb_page(vma))
		seq_puts(m, " huge");

	/* mmap_sem is held by m_start */
	walk_page_vma(vma, &walk);

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(nid, N_MEMORY)
		if (md->node[nid])
			seq_printf(m, " N%d=%lu", nid, md->node[nid]);

	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
	seq_putc(m, '\n');
	m_cache_vma(m, vma);
	return 0;
}

static int show_pid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 1);
}

static int show_tid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 0);
}

static const struct seq_operations proc_pid_numa_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_numa_map,
};

static const struct seq_operations proc_tid_numa_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_numa_map,
};

static int numa_maps_open(struct inode *inode, struct file *file,
			  const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct numa_maps_private));
}

static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
}

static int tid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
}

const struct file_operations proc_pid_numa_maps_operations = {
	.open		= pid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};

const struct file_operations proc_tid_numa_maps_operations = {
	.open		= tid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
};
#endif /* CONFIG_NUMA */