#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <asm/elf.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"

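/*
 * Back end for the memory fields of /proc/PID/status.  All mm counters
 * are kept in pages; "x << (PAGE_SHIFT-10)" converts pages to kB (with
 * 4K pages PAGE_SHIFT is 12, so the shift is by 2, i.e. x * 4 kB).
 */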
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long data, text, lib, swap;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher.  Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = get_mm_rss(mm);
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
	swap = get_mm_counter(mm, MM_SWAPENTS);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		data << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
		swap << (PAGE_SHIFT-10));
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->total_vm - mm->shared_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

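/*
 * Pad the current output line out to a fixed column (25 + 6 pointer
 * widths, i.e. column 73 on 64-bit) so that mapping names line up.
 */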
static void pad_len_spaces(struct seq_file *m, int len)
{
	len = 25 + sizeof(void*) * 6 - len;
	if (len < 1)
		len = 1;
	seq_printf(m, "%*c", len, ' ');
}

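/*
 * Release the mmap_sem and the mm reference taken in m_start(), unless
 * @vma is the gate-vma sentinel, which does not pin the mm.
 */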
static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma && vma != priv->tail_vma) {
		struct mm_struct *mm = vma->vm_mm;
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
}

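/*
 * seq_file iterator over the per-mm vma list.  m->version caches the
 * start address of the last vma shown, so a subsequent read can resume
 * with a single find_vma() instead of rescanning the whole list.
 */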
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma, *tail_vma = NULL;
	loff_t l = *pos;

	/* Clear the per-syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr so that the
	 * mmap_cache lookup usually hits.  last_addr is zero at the
	 * beginning and after an lseek, and -1 once the end of the
	 * vmas has been reached.
	 */

	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = mm_for_maps(priv->task);
	if (!mm || IS_ERR(mm))
		return mm;
	down_read(&mm->mmap_sem);

	tail_vma = get_gate_vma(priv->task->mm);
	priv->tail_vma = tail_vma;

	/* Start with the last-addr hint */
	vma = find_vma(mm, last_addr);
	if (last_addr && vma) {
		vma = vma->vm_next;
		goto out;
	}

	/*
	 * Check that the vma index is within range and scan
	 * sequentially up to it.
	 */
	vma = NULL;
	if ((unsigned long)l < mm->map_count) {
		vma = mm->mmap;
		while (l-- && vma)
			vma = vma->vm_next;
		goto out;
	}

	if (l != mm->map_count)
		tail_vma = NULL; /* After gate vma */

out:
	if (vma)
		return vma;

	/* The end of the vmas has been reached */
	m->version = (tail_vma != NULL) ? 0 : -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return tail_vma;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct vm_area_struct *tail_vma = priv->tail_vma;

	(*pos)++;
	if (vma && (vma != tail_vma) && vma->vm_next)
		return vma->vm_next;
	vma_stop(priv, vma);
	return (vma != tail_vma) ? tail_vma : NULL;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;

	if (!IS_ERR(vma))
		vma_stop(priv, vma);
	if (priv->task)
		put_task_struct(priv->task);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	struct proc_maps_private *priv;
	int ret = -ENOMEM;
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv) {
		priv->pid = proc_pid(inode);
		ret = seq_open(file, ops);
		if (!ret) {
			struct seq_file *m = file->private_data;
			m->private = priv;
		} else {
			kfree(priv);
		}
	}
	return ret;
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	int len;
	const char *name = NULL;

	if (file) {
		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	/* We don't show the stack guard page in /proc/maps */
	start = vma->vm_start;
	if (stack_guard_page_start(vma, start))
		start += PAGE_SIZE;
	end = vma->vm_end;
	if (stack_guard_page_end(vma, end))
		end -= PAGE_SIZE;

	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
			start,
			end,
			flags & VM_READ ? 'r' : '-',
			flags & VM_WRITE ? 'w' : '-',
			flags & VM_EXEC ? 'x' : '-',
			flags & VM_MAYSHARE ? 's' : 'p',
			pgoff,
			MAJOR(dev), MINOR(dev), ino, &len);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		pad_len_spaces(m, len);
		seq_path(m, &file->f_path, "\n");
		goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		pid_t tid;

		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		tid = vm_is_stack(task, vma, is_pid);

		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack)) {
				name = "[stack]";
			} else {
				/* Thread stack in /proc/PID/maps */
				pad_len_spaces(m, len);
				seq_printf(m, "[stack:%d]", tid);
			}
		}
	}

done:
	if (name) {
		pad_len_spaces(m, len);
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	struct vm_area_struct *vma = v;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;

	show_map_vma(m, vma, is_pid);

	if (m->count < m->size)  /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * Proportional Set Size (PSS): this process's share of its RSS.
 *
 * The PSS of a process is the count of pages it has in memory, where
 * each page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep accumulated division errors low, we use a 64-bit fixed-point
 * pss counter: (pss >> PSS_SHIFT) is the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12
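/*
 * Worked example: a 4K page shared by three processes adds
 * (4096 << PSS_SHIFT) / 3 = 5592405 to pss; pss >> PSS_SHIFT then
 * yields 1365 bytes, i.e. 4096/3 rounded down.
 */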

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	struct vm_area_struct *vma;
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long anonymous_thp;
	unsigned long swap;
	u64 pss;
};

static void smaps_pte_entry(pte_t ptent, unsigned long addr,
		unsigned long ptent_size, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	struct page *page;
	int mapcount;

	if (is_swap_pte(ptent)) {
		mss->swap += ptent_size;
		return;
	}

	if (!pte_present(ptent))
		return;

	page = vm_normal_page(vma, addr, ptent);
	if (!page)
		return;

	if (PageAnon(page))
		mss->anonymous += ptent_size;

	mss->resident += ptent_size;
	/* Accumulate the size in pages that have been accessed. */
	if (pte_young(ptent) || PageReferenced(page))
		mss->referenced += ptent_size;
	mapcount = page_mapcount(page);
	if (mapcount >= 2) {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->shared_dirty += ptent_size;
		else
			mss->shared_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
	} else {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->private_dirty += ptent_size;
		else
			mss->private_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT);
	}
}

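/*
 * Per-pmd callback: a transparent huge page is accounted in one shot
 * under the page table lock; otherwise the individual ptes are walked.
 */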
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
		spin_unlock(&walk->mm->page_table_lock);
		mss->anonymous_thp += HPAGE_PMD_SIZE;
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
		.mm = vma->vm_mm,
		.private = &mss,
	};

	memset(&mss, 0, sizeof mss);
	mss.vma = vma;
	/* mmap_sem is held in m_start */
	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);

	show_map_vma(m, vma, is_pid);

	seq_printf(m,
		   "Size:           %8lu kB\n"
		   "Rss:            %8lu kB\n"
		   "Pss:            %8lu kB\n"
		   "Shared_Clean:   %8lu kB\n"
		   "Shared_Dirty:   %8lu kB\n"
		   "Private_Clean:  %8lu kB\n"
		   "Private_Dirty:  %8lu kB\n"
		   "Referenced:     %8lu kB\n"
		   "Anonymous:      %8lu kB\n"
		   "AnonHugePages:  %8lu kB\n"
		   "Swap:           %8lu kB\n"
		   "KernelPageSize: %8lu kB\n"
		   "MMUPageSize:    %8lu kB\n"
		   "Locked:         %8lu kB\n",
		   (vma->vm_end - vma->vm_start) >> 10,
		   mss.resident >> 10,
		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
		   mss.shared_clean  >> 10,
		   mss.shared_dirty  >> 10,
		   mss.private_clean >> 10,
		   mss.private_dirty >> 10,
		   mss.referenced >> 10,
		   mss.anonymous >> 10,
		   mss.anonymous_thp >> 10,
		   mss.swap >> 10,
		   vma_kernel_pagesize(vma) >> 10,
		   vma_mmu_pagesize(vma) >> 10,
		   (vma->vm_flags & VM_LOCKED) ?
			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

	if (m->count < m->size)  /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	split_huge_page_pmd(walk->mm, pmd);
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;
		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

#define CLEAR_REFS_ALL		1
#define CLEAR_REFS_ANON		2
#define CLEAR_REFS_MAPPED	3
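/*
 * Usage sketch (from userspace), e.g. to clear the referenced bits of
 * anonymous pages only:
 *
 *	echo 2 > /proc/PID/clear_refs
 */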

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int type;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &type);
	if (rv < 0)
		return rv;
	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
		return -EINVAL;
	task = get_proc_task(file->f_path.dentry->d_inode);
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.mm = mm,
		};
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			clear_refs_walk.private = vma;
			if (is_vm_hugetlb_page(vma))
				continue;
			/*
			 * Writing 1 to /proc/pid/clear_refs affects all
			 * pages.
			 *
			 * Writing 2 to /proc/pid/clear_refs only affects
			 * anonymous pages.
			 *
			 * Writing 3 to /proc/pid/clear_refs only affects
			 * file-mapped pages.
			 */
			if (type == CLEAR_REFS_ANON && vma->vm_file)
				continue;
			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
				continue;
			walk_page_range(vma->vm_start, vma->vm_end,
					&clear_refs_walk);
		}
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_STATUS_BITS		3
#define PM_STATUS_OFFSET	(64 - PM_STATUS_BITS)
#define PM_STATUS_MASK		(((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr)		(((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS		6
#define PM_PSHIFT_OFFSET	(PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK		(((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define PM_PSHIFT(x)		(((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK		((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)

#define PM_PRESENT		PM_STATUS(4LL)
#define PM_SWAP			PM_STATUS(2LL)
#define PM_NOT_PRESENT		PM_PSHIFT(PAGE_SHIFT)
#define PM_END_OF_BUFFER	1
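/*
 * With the values above, PM_STATUS_OFFSET is 61 and PM_PSHIFT_OFFSET is
 * 55, so an entry decomposes into bits 0-54 (PFN), bits 55-60 (page
 * shift) and bits 61-63 (status: PM_PRESENT is bit 63, PM_SWAP bit 62).
 */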

static inline pagemap_entry_t make_pme(u64 val)
{
	return (pagemap_entry_t) { .pme = val };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
			    struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	return err;
}

static u64 swap_pte_to_pagemap_entry(pte_t pte)
{
	swp_entry_t e = pte_to_swp_entry(pte);
	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}

static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte)
{
	if (is_swap_pte(pte))
		*pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte))
				| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP);
	else if (pte_present(pte))
		*pme = make_pme(PM_PFRAME(pte_pfn(pte))
				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
				     pmd_t pmd, int offset)
{
	/*
	 * Currently a pmd for thp is always present, because thp cannot
	 * be swapped out, migrated or HWPOISONed (it is split in those
	 * cases instead).  This if-check is just to prepare for a
	 * future implementation.
	 */
	if (pmd_present(pmd))
		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT);
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
					    pmd_t pmd, int offset)
{
}
#endif

static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma;
	struct pagemapread *pm = walk->private;
	pte_t *pte;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);

	/* find the first VMA at or above 'addr' */
	vma = find_vma(walk->mm, addr);
	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
		for (; addr != end; addr += PAGE_SIZE) {
			unsigned long offset;

			offset = (addr & ~PAGEMAP_WALK_MASK) >>
					PAGE_SHIFT;
			thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
		}
		spin_unlock(&walk->mm->page_table_lock);
		return err;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {

		/* check to see if we've left 'vma' behind
		 * and need a new, higher one */
		if (vma && (addr >= vma->vm_end)) {
			vma = find_vma(walk->mm, addr);
			pme = make_pme(PM_NOT_PRESENT);
		}

		/* check that 'vma' actually covers this address,
		 * and that it isn't a huge page vma */
		if (vma && (vma->vm_start <= addr) &&
		    !is_vm_hugetlb_page(vma)) {
			pte = pte_offset_map(pmd, addr);
			pte_to_pagemap_entry(&pme, *pte);
			/* unmap before userspace copy */
			pte_unmap(pte);
		}
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
	}

	cond_resched();

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
				      pte_t pte, int offset)
{
	if (pte_present(pte))
		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT);
}

/* This function handles one hugetlb entry per call */
static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	int err = 0;
	pagemap_entry_t pme;

	for (; addr != end; addr += PAGE_SIZE) {
		int offset = (addr & ~hmask) >> PAGE_SHIFT;
		huge_pte_to_pagemap_entry(&pme, *pte, offset);
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			return err;
	}

	cond_resched();

	return err;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
 *
 * For each page in the address space, this file contains one 64-bit
 * entry consisting of the following:
 *
 * Bits 0-54  page frame number (PFN) if present
 * Bits 0-4   swap type if swapped
 * Bits 5-54  swap offset if swapped
 * Bits 55-60 page shift (page size = 1<<page shift)
 * Bit  61    reserved for future use
 * Bit  62    page swapped
 * Bit  63    page present
 *
 * If the page is not present but in swap, then the PFN field contains
 * an encoding of the swap file number and the page's offset into the
 * swap.  Unmapped pages return a null PFN.  This allows determining
 * precisely which pages are mapped (or in swap) and comparing mapped
 * pages between processes.
 *
 * Efficient users of this interface will use /proc/pid/maps to
 * determine which areas of memory are actually mapped and llseek to
 * skip over unmapped regions.
 */
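/*
 * A minimal userspace sketch (an illustration only, not part of this
 * file) for fetching the entry that covers virtual address "vaddr",
 * assuming the layout above:
 *
 *	uint64_t ent;
 *	int fd = open("/proc/self/pagemap", O_RDONLY);
 *	pread(fd, &ent, sizeof(ent), (vaddr / page_size) * sizeof(ent));
 *	if (ent & (1ULL << 63))			// present
 *		pfn = ent & ((1ULL << 55) - 1);
 *	else if (ent & (1ULL << 62))		// swapped
 *		swap_type = ent & 0x1f;
 */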
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
	struct mm_struct *mm;
	struct pagemapread pm;
	int ret = -ESRCH;
	struct mm_walk pagemap_walk = {};
	unsigned long src;
	unsigned long svpfn;
	unsigned long start_vaddr;
	unsigned long end_vaddr;
	int copied = 0;

	if (!task)
		goto out;

	ret = -EINVAL;
	/* file position must be aligned */
	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
		goto out_task;

	ret = 0;
	if (!count)
		goto out_task;

	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
	ret = -ENOMEM;
	if (!pm.buffer)
		goto out_task;

	mm = mm_for_maps(task);
	ret = PTR_ERR(mm);
	if (!mm || IS_ERR(mm))
		goto out_free;

	pagemap_walk.pmd_entry = pagemap_pte_range;
	pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
#endif
	pagemap_walk.mm = mm;
	pagemap_walk.private = &pm;

	src = *ppos;
	svpfn = src / PM_ENTRY_BYTES;
	start_vaddr = svpfn << PAGE_SHIFT;
	end_vaddr = TASK_SIZE_OF(task);

	/* watch out for wraparound */
	if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
		start_vaddr = end_vaddr;

	/*
	 * The odds are that this will stop walking way
	 * before end_vaddr, because the length of the
	 * user buffer is tracked in "pm", and the walk
	 * will stop when we hit the end of the buffer.
	 */
	ret = 0;
	while (count && (start_vaddr < end_vaddr)) {
		int len;
		unsigned long end;

		pm.pos = 0;
		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
		/* overflow ? */
		if (end < start_vaddr || end > end_vaddr)
			end = end_vaddr;
		down_read(&mm->mmap_sem);
		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
		up_read(&mm->mmap_sem);
		start_vaddr = end;

		len = min(count, PM_ENTRY_BYTES * pm.pos);
		if (copy_to_user(buf, pm.buffer, len)) {
			ret = -EFAULT;
			goto out_mm;
		}
		copied += len;
		buf += len;
		count -= len;
	}
	*ppos += copied;
	if (!ret || ret == PM_END_OF_BUFFER)
		ret = copied;

out_mm:
	mmput(mm);
out_free:
	kfree(pm.buffer);
out_task:
	put_task_struct(task);
out:
	return ret;
}

static int pagemap_open(struct inode *inode, struct file *file)
{
	/*
	 * Do not disclose physical addresses to unprivileged userspace
	 * (closes a Rowhammer attack vector).
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}

const struct file_operations proc_pagemap_operations = {
	.llseek		= mem_lseek, /* borrow this */
	.read		= pagemap_read,
	.open		= pagemap_open,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */

#ifdef CONFIG_NUMA

struct numa_maps {
	struct vm_area_struct *vma;
	unsigned long pages;
	unsigned long anon;
	unsigned long active;
	unsigned long writeback;
	unsigned long mapcount_max;
	unsigned long dirty;
	unsigned long swapcache;
	unsigned long node[MAX_NUMNODES];
};

struct numa_maps_private {
	struct proc_maps_private proc_maps;
	struct numa_maps md;
};

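/*
 * Accumulate the statistics of one page (or nr_pages worth of a huge
 * page) into the per-vma numa_maps totals, keyed by the page's node.
 */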
static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
			 unsigned long nr_pages)
{
	int count = page_mapcount(page);

	md->pages += nr_pages;
	if (pte_dirty || PageDirty(page))
		md->dirty += nr_pages;

	if (PageSwapCache(page))
		md->swapcache += nr_pages;

	if (PageActive(page) || PageUnevictable(page))
		md->active += nr_pages;

	if (PageWriteback(page))
		md->writeback += nr_pages;

	if (PageAnon(page))
		md->anon += nr_pages;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->node[page_to_nid(page)] += nr_pages;
}

static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct page *page;
	int nid;

	if (!pte_present(pte))
		return NULL;

	page = vm_normal_page(vma, addr, pte);
	if (!page)
		return NULL;

	if (PageReserved(page))
		return NULL;

	nid = page_to_nid(page);
	if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
		return NULL;

	return page;
}

static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md;
	spinlock_t *ptl;
	pte_t *orig_pte;
	pte_t *pte;

	md = walk->private;

	if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
		pte_t huge_pte = *(pte_t *)pmd;
		struct page *page;

		page = can_gather_numa_stats(huge_pte, md->vma, addr);
		if (page)
			gather_stats(page, md, pte_dirty(huge_pte),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(&walk->mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	do {
		struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
		if (!page)
			continue;
		gather_stats(page, md, pte_dirty(*pte), 1);

	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	struct numa_maps *md;
	struct page *page;

	if (pte_none(*pte))
		return 0;

	page = pte_page(*pte);
	if (!page)
		return 0;

	md = walk->private;
	gather_stats(page, md, pte_dirty(*pte), 1);
	return 0;
}

#else
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
		unsigned long addr, unsigned long end, struct mm_walk *walk)
{
	return 0;
}
#endif

/*
 * Display pages allocated per node and memory policy via /proc.
 */
static int show_numa_map(struct seq_file *m, void *v, int is_pid)
{
	struct numa_maps_private *numa_priv = m->private;
	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
	struct vm_area_struct *vma = v;
	struct numa_maps *md = &numa_priv->md;
	struct file *file = vma->vm_file;
	struct mm_struct *mm = vma->vm_mm;
	struct mm_walk walk = {};
	struct mempolicy *pol;
	int n;
	char buffer[50];

	if (!mm)
		return 0;

	/* Ensure we start with an empty set of numa_maps statistics. */
	memset(md, 0, sizeof(*md));

	md->vma = vma;

	walk.hugetlb_entry = gather_hugetlb_stats;
	walk.pmd_entry = gather_pte_stats;
	walk.private = md;
	walk.mm = mm;

	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
	mpol_to_str(buffer, sizeof(buffer), pol, 0);
	mpol_cond_put(pol);

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);

	if (file) {
		seq_printf(m, " file=");
		seq_path(m, &file->f_path, "\n\t= ");
	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
		seq_printf(m, " heap");
	} else {
		pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack))
				seq_printf(m, " stack");
			else
				seq_printf(m, " stack:%d", tid);
		}
	}

	if (is_vm_hugetlb_page(vma))
		seq_printf(m, " huge");

	walk_page_range(vma->vm_start, vma->vm_end, &walk);

	if (!md->pages)
		goto out;

	if (md->anon)
		seq_printf(m, " anon=%lu", md->anon);

	if (md->dirty)
		seq_printf(m, " dirty=%lu", md->dirty);

	if (md->pages != md->anon && md->pages != md->dirty)
		seq_printf(m, " mapped=%lu", md->pages);

	if (md->mapcount_max > 1)
		seq_printf(m, " mapmax=%lu", md->mapcount_max);

	if (md->swapcache)
		seq_printf(m, " swapcache=%lu", md->swapcache);

	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
		seq_printf(m, " active=%lu", md->active);

	if (md->writeback)
		seq_printf(m, " writeback=%lu", md->writeback);

	for_each_node_state(n, N_HIGH_MEMORY)
		if (md->node[n])
			seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
	seq_putc(m, '\n');

	if (m->count < m->size)
		m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
	return 0;
}

static int show_pid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 1);
}

static int show_tid_numa_map(struct seq_file *m, void *v)
{
	return show_numa_map(m, v, 0);
}

static const struct seq_operations proc_pid_numa_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_numa_map,
};

static const struct seq_operations proc_tid_numa_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_numa_map,
};

static int numa_maps_open(struct inode *inode, struct file *file,
			  const struct seq_operations *ops)
{
	struct numa_maps_private *priv;
	int ret = -ENOMEM;
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv) {
		priv->proc_maps.pid = proc_pid(inode);
		ret = seq_open(file, ops);
		if (!ret) {
			struct seq_file *m = file->private_data;
			m->private = priv;
		} else {
			kfree(priv);
		}
	}
	return ret;
}

static int pid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
}

static int tid_numa_maps_open(struct inode *inode, struct file *file)
{
	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
}

const struct file_operations proc_pid_numa_maps_operations = {
	.open		= pid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_numa_maps_operations = {
	.open		= tid_numa_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
#endif /* CONFIG_NUMA */