Blame - marvell/linux/mm/madvise.c - T108

blob: ead99081f0a25ecdac6923e4d342c79b94efb304 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/madvise.c
				4	*
				5	* Copyright (C) 1999 Linus Torvalds
				6	* Copyright (C) 2002 Christoph Hellwig
				7	*/
				8
				9	#include <linux/mman.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/syscalls.h>
				12	#include <linux/mempolicy.h>
				13	#include <linux/page-isolation.h>
				14	#include <linux/page_idle.h>
				15	#include <linux/userfaultfd_k.h>
				16	#include <linux/hugetlb.h>
				17	#include <linux/falloc.h>
				18	#include <linux/fadvise.h>
				19	#include <linux/sched.h>
				20	#include <linux/sched/mm.h>
				21	#include <linux/ksm.h>
				22	#include <linux/fs.h>
				23	#include <linux/file.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/backing-dev.h>
				26	#include <linux/pagewalk.h>
				27	#include <linux/swap.h>
				28	#include <linux/swapops.h>
				29	#include <linux/shmem_fs.h>
				30	#include <linux/mmu_notifier.h>
				31	#include <linux/uio.h>
				32	#include <asm/tlb.h>
				33
				34	#include "internal.h"
				35
				36	struct madvise_walk_private {
				37	struct mmu_gather *tlb;
				38	bool pageout;
				39	};
				40
				41	/*
				42	* Any behaviour which results in changes to the vma->vm_flags needs to
				43	* take mmap_sem for writing. Others, which simply traverse vmas, need
				44	* to only take it for reading.
				45	*/
				46	static int madvise_need_mmap_write(int behavior)
				47	{
				48	switch (behavior) {
				49	case MADV_REMOVE:
				50	case MADV_WILLNEED:
				51	case MADV_DONTNEED:
				52	case MADV_COLD:
				53	case MADV_PAGEOUT:
				54	case MADV_FREE:
				55	return 0;
				56	default:
				57	/* be safe, default to 1. list exceptions explicitly */
				58	return 1;
				59	}
				60	}
				61
				62	/*
				63	* We can potentially split a vm area into separate
				64	* areas, each area with its own behavior.
				65	*/
				66	static long madvise_behavior(struct vm_area_struct *vma,
				67	struct vm_area_struct **prev,
				68	unsigned long start, unsigned long end, int behavior)
				69	{
				70	struct mm_struct *mm = vma->vm_mm;
				71	int error = 0;
				72	pgoff_t pgoff;
				73	unsigned long new_flags = vma->vm_flags;
				74
				75	switch (behavior) {
				76	case MADV_NORMAL:
				77	new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
				78	break;
				79	case MADV_SEQUENTIAL:
				80	new_flags = (new_flags & ~VM_RAND_READ) \| VM_SEQ_READ;
				81	break;
				82	case MADV_RANDOM:
				83	new_flags = (new_flags & ~VM_SEQ_READ) \| VM_RAND_READ;
				84	break;
				85	case MADV_DONTFORK:
				86	new_flags \|= VM_DONTCOPY;
				87	break;
				88	case MADV_DOFORK:
				89	if (vma->vm_flags & VM_IO) {
				90	error = -EINVAL;
				91	goto out;
				92	}
				93	new_flags &= ~VM_DONTCOPY;
				94	break;
				95	case MADV_WIPEONFORK:
				96	/* MADV_WIPEONFORK is only supported on anonymous memory. */
				97	if (vma->vm_file \|\| vma->vm_flags & VM_SHARED) {
				98	error = -EINVAL;
				99	goto out;
				100	}
				101	new_flags \|= VM_WIPEONFORK;
				102	break;
				103	case MADV_KEEPONFORK:
				104	new_flags &= ~VM_WIPEONFORK;
				105	break;
				106	case MADV_DONTDUMP:
				107	new_flags \|= VM_DONTDUMP;
				108	break;
				109	case MADV_DODUMP:
				110	if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
				111	error = -EINVAL;
				112	goto out;
				113	}
				114	new_flags &= ~VM_DONTDUMP;
				115	break;
				116	case MADV_MERGEABLE:
				117	case MADV_UNMERGEABLE:
				118	error = ksm_madvise(vma, start, end, behavior, &new_flags);
				119	if (error)
				120	goto out_convert_errno;
				121	break;
				122	case MADV_HUGEPAGE:
				123	case MADV_NOHUGEPAGE:
				124	error = hugepage_madvise(vma, &new_flags, behavior);
				125	if (error)
				126	goto out_convert_errno;
				127	break;
				128	}
				129
				130	if (new_flags == vma->vm_flags) {
				131	*prev = vma;
				132	goto out;
				133	}
				134
				135	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
				136	prev = vma_merge(mm, prev, start, end, new_flags, vma->anon_vma,
				137	vma->vm_file, pgoff, vma_policy(vma),
				138	vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
				139	if (*prev) {
				140	vma = *prev;
				141	goto success;
				142	}
				143
				144	*prev = vma;
				145
				146	if (start != vma->vm_start) {
				147	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				148	error = -ENOMEM;
				149	goto out;
				150	}
				151	error = __split_vma(mm, vma, start, 1);
				152	if (error)
				153	goto out_convert_errno;
				154	}
				155
				156	if (end != vma->vm_end) {
				157	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				158	error = -ENOMEM;
				159	goto out;
				160	}
				161	error = __split_vma(mm, vma, end, 0);
				162	if (error)
				163	goto out_convert_errno;
				164	}
				165
				166	success:
				167	/*
				168	* vm_flags is protected by the mmap_sem held in write mode.
				169	*/
				170	vma->vm_flags = new_flags;
				171
				172	out_convert_errno:
				173	/*
				174	* madvise() returns EAGAIN if kernel resources, such as
				175	* slab, are temporarily unavailable.
				176	*/
				177	if (error == -ENOMEM)
				178	error = -EAGAIN;
				179	out:
				180	return error;
				181	}
				182
				183	#ifdef CONFIG_SWAP
				184	static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				185	unsigned long end, struct mm_walk *walk)
				186	{
				187	pte_t *orig_pte;
				188	struct vm_area_struct *vma = walk->private;
				189	unsigned long index;
				190
				191	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				192	return 0;
				193
				194	for (index = start; index != end; index += PAGE_SIZE) {
				195	pte_t pte;
				196	swp_entry_t entry;
				197	struct page *page;
				198	spinlock_t *ptl;
				199
				200	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
				201	pte = *(orig_pte + ((index - start) / PAGE_SIZE));
				202	pte_unmap_unlock(orig_pte, ptl);
				203
				204	if (pte_present(pte) \|\| pte_none(pte))
				205	continue;
				206	entry = pte_to_swp_entry(pte);
				207	if (unlikely(non_swap_entry(entry)))
				208	continue;
				209
				210	page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
				211	vma, index, false);
				212	if (page)
				213	put_page(page);
				214	}
				215
				216	return 0;
				217	}
				218
				219	static const struct mm_walk_ops swapin_walk_ops = {
				220	.pmd_entry = swapin_walk_pmd_entry,
				221	};
				222
				223	static void force_shm_swapin_readahead(struct vm_area_struct *vma,
				224	unsigned long start, unsigned long end,
				225	struct address_space *mapping)
				226	{
				227	pgoff_t index;
				228	struct page *page;
				229	swp_entry_t swap;
				230
				231	for (; start < end; start += PAGE_SIZE) {
				232	index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
				233
				234	page = find_get_entry(mapping, index);
				235	if (!xa_is_value(page)) {
				236	if (page)
				237	put_page(page);
				238	continue;
				239	}
				240	swap = radix_to_swp_entry(page);
				241	page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
				242	NULL, 0, false);
				243	if (page)
				244	put_page(page);
				245	}
				246
				247	lru_add_drain(); /* Push any new pages onto the LRU now */
				248	}
				249	#endif /* CONFIG_SWAP */
				250
				251	/*
				252	* Schedule all required I/O operations. Do not wait for completion.
				253	*/
				254	static long madvise_willneed(struct vm_area_struct *vma,
				255	struct vm_area_struct **prev,
				256	unsigned long start, unsigned long end)
				257	{
				258	struct mm_struct *mm = vma->vm_mm;
				259	struct file *file = vma->vm_file;
				260	loff_t offset;
				261
				262	*prev = vma;
				263	#ifdef CONFIG_SWAP
				264	if (!file) {
				265	walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
				266	lru_add_drain(); /* Push any new pages onto the LRU now */
				267	return 0;
				268	}
				269
				270	if (shmem_mapping(file->f_mapping)) {
				271	force_shm_swapin_readahead(vma, start, end,
				272	file->f_mapping);
				273	return 0;
				274	}
				275	#else
				276	if (!file)
				277	return -EBADF;
				278	#endif
				279
				280	if (IS_DAX(file_inode(file))) {
				281	/* no bad return value, but ignore advice */
				282	return 0;
				283	}
				284
				285	/*
				286	* Filesystem's fadvise may need to take various locks. We need to
				287	* explicitly grab a reference because the vma (and hence the
				288	* vma's reference to the file) can go away as soon as we drop
				289	* mmap_sem.
				290	*/
				291	prev = NULL; / tell sys_madvise we drop mmap_sem */
				292	get_file(file);
				293	offset = (loff_t)(start - vma->vm_start)
				294	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
				295	up_read(&mm->mmap_sem);
				296	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
				297	fput(file);
				298	down_read(&mm->mmap_sem);
				299	return 0;
				300	}
				301
				302	static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				303	unsigned long addr, unsigned long end,
				304	struct mm_walk *walk)
				305	{
				306	struct madvise_walk_private *private = walk->private;
				307	struct mmu_gather *tlb = private->tlb;
				308	bool pageout = private->pageout;
				309	struct mm_struct *mm = tlb->mm;
				310	struct vm_area_struct *vma = walk->vma;
				311	pte_t orig_pte, pte, ptent;
				312	spinlock_t *ptl;
				313	struct page *page = NULL;
				314	LIST_HEAD(page_list);
				315
				316	if (fatal_signal_pending(current))
				317	return -EINTR;
				318
				319	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				320	if (pmd_trans_huge(*pmd)) {
				321	pmd_t orig_pmd;
				322	unsigned long next = pmd_addr_end(addr, end);
				323
				324	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
				325	ptl = pmd_trans_huge_lock(pmd, vma);
				326	if (!ptl)
				327	return 0;
				328
				329	orig_pmd = *pmd;
				330	if (is_huge_zero_pmd(orig_pmd))
				331	goto huge_unlock;
				332
				333	if (unlikely(!pmd_present(orig_pmd))) {
				334	VM_BUG_ON(thp_migration_supported() &&
				335	!is_pmd_migration_entry(orig_pmd));
				336	goto huge_unlock;
				337	}
				338
				339	page = pmd_page(orig_pmd);
				340
				341	/* Do not interfere with other mappings of this page */
				342	if (page_mapcount(page) != 1)
				343	goto huge_unlock;
				344
				345	if (next - addr != HPAGE_PMD_SIZE) {
				346	int err;
				347
				348	get_page(page);
				349	spin_unlock(ptl);
				350	lock_page(page);
				351	err = split_huge_page(page);
				352	unlock_page(page);
				353	put_page(page);
				354	if (!err)
				355	goto regular_page;
				356	return 0;
				357	}
				358
				359	if (pmd_young(orig_pmd)) {
				360	pmdp_invalidate(vma, addr, pmd);
				361	orig_pmd = pmd_mkold(orig_pmd);
				362
				363	set_pmd_at(mm, addr, pmd, orig_pmd);
				364	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
				365	}
				366
				367	ClearPageReferenced(page);
				368	test_and_clear_page_young(page);
				369	if (pageout) {
				370	if (!isolate_lru_page(page)) {
				371	if (PageUnevictable(page))
				372	putback_lru_page(page);
				373	else
				374	list_add(&page->lru, &page_list);
				375	}
				376	} else
				377	deactivate_page(page);
				378	huge_unlock:
				379	spin_unlock(ptl);
				380	if (pageout)
				381	reclaim_pages(&page_list);
				382	return 0;
				383	}
				384
				385	regular_page:
				386	if (pmd_trans_unstable(pmd))
				387	return 0;
				388	#endif
				389	tlb_change_page_size(tlb, PAGE_SIZE);
				390	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
				391	flush_tlb_batched_pending(mm);
				392	arch_enter_lazy_mmu_mode();
				393	for (; addr < end; pte++, addr += PAGE_SIZE) {
				394	ptent = *pte;
				395
				396	if (pte_none(ptent))
				397	continue;
				398
				399	if (!pte_present(ptent))
				400	continue;
				401
				402	page = vm_normal_page(vma, addr, ptent);
				403	if (!page)
				404	continue;
				405
				406	/*
				407	* Creating a THP page is expensive so split it only if we
				408	* are sure it's worth. Split it if we are only owner.
				409	*/
				410	if (PageTransCompound(page)) {
				411	if (page_mapcount(page) != 1)
				412	break;
				413	get_page(page);
				414	if (!trylock_page(page)) {
				415	put_page(page);
				416	break;
				417	}
				418	pte_unmap_unlock(orig_pte, ptl);
				419	if (split_huge_page(page)) {
				420	unlock_page(page);
				421	put_page(page);
				422	pte_offset_map_lock(mm, pmd, addr, &ptl);
				423	break;
				424	}
				425	unlock_page(page);
				426	put_page(page);
				427	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				428	pte--;
				429	addr -= PAGE_SIZE;
				430	continue;
				431	}
				432
				433	/*
				434	* Do not interfere with other mappings of this page and
				435	* non-LRU page.
				436	*/
				437	if (!PageLRU(page) \|\| page_mapcount(page) != 1)
				438	continue;
				439
				440	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				441
				442	if (pte_young(ptent)) {
				443	ptent = ptep_get_and_clear_full(mm, addr, pte,
				444	tlb->fullmm);
				445	ptent = pte_mkold(ptent);
				446	set_pte_at(mm, addr, pte, ptent);
				447	tlb_remove_tlb_entry(tlb, pte, addr);
				448	}
				449
				450	/*
				451	* We are deactivating a page for accelerating reclaiming.
				452	* VM couldn't reclaim the page unless we clear PG_young.
				453	* As a side effect, it makes confuse idle-page tracking
				454	* because they will miss recent referenced history.
				455	*/
				456	ClearPageReferenced(page);
				457	test_and_clear_page_young(page);
				458	if (pageout) {
				459	if (!isolate_lru_page(page)) {
				460	if (PageUnevictable(page))
				461	putback_lru_page(page);
				462	else
				463	list_add(&page->lru, &page_list);
				464	}
				465	} else
				466	deactivate_page(page);
				467	}
				468
				469	arch_leave_lazy_mmu_mode();
				470	pte_unmap_unlock(orig_pte, ptl);
				471	if (pageout)
				472	reclaim_pages(&page_list);
				473	cond_resched();
				474
				475	return 0;
				476	}
				477
				478	static const struct mm_walk_ops cold_walk_ops = {
				479	.pmd_entry = madvise_cold_or_pageout_pte_range,
				480	};
				481
				482	static void madvise_cold_page_range(struct mmu_gather *tlb,
				483	struct vm_area_struct *vma,
				484	unsigned long addr, unsigned long end)
				485	{
				486	struct madvise_walk_private walk_private = {
				487	.pageout = false,
				488	.tlb = tlb,
				489	};
				490
				491	tlb_start_vma(tlb, vma);
				492	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				493	tlb_end_vma(tlb, vma);
				494	}
				495
				496	static long madvise_cold(struct vm_area_struct *vma,
				497	struct vm_area_struct **prev,
				498	unsigned long start_addr, unsigned long end_addr)
				499	{
				500	struct mm_struct *mm = vma->vm_mm;
				501	struct mmu_gather tlb;
				502
				503	*prev = vma;
				504	if (!can_madv_lru_vma(vma))
				505	return -EINVAL;
				506
				507	lru_add_drain();
				508	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				509	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
				510	tlb_finish_mmu(&tlb, start_addr, end_addr);
				511
				512	return 0;
				513	}
				514
				515	static void madvise_pageout_page_range(struct mmu_gather *tlb,
				516	struct vm_area_struct *vma,
				517	unsigned long addr, unsigned long end)
				518	{
				519	struct madvise_walk_private walk_private = {
				520	.pageout = true,
				521	.tlb = tlb,
				522	};
				523
				524	tlb_start_vma(tlb, vma);
				525	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				526	tlb_end_vma(tlb, vma);
				527	}
				528
				529	static inline bool can_do_pageout(struct vm_area_struct *vma)
				530	{
				531	if (vma_is_anonymous(vma))
				532	return true;
				533	if (!vma->vm_file)
				534	return false;
				535	/*
				536	* paging out pagecache only for non-anonymous mappings that correspond
				537	* to the files the calling process could (if tried) open for writing;
				538	* otherwise we'd be including shared non-exclusive mappings, which
				539	* opens a side channel.
				540	*/
				541	return inode_owner_or_capable(file_inode(vma->vm_file)) \|\|
				542	inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
				543	}
				544
				545	static long madvise_pageout(struct vm_area_struct *vma,
				546	struct vm_area_struct **prev,
				547	unsigned long start_addr, unsigned long end_addr)
				548	{
				549	struct mm_struct *mm = vma->vm_mm;
				550	struct mmu_gather tlb;
				551
				552	*prev = vma;
				553	if (!can_madv_lru_vma(vma))
				554	return -EINVAL;
				555
				556	if (!can_do_pageout(vma))
				557	return 0;
				558
				559	lru_add_drain();
				560	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				561	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
				562	tlb_finish_mmu(&tlb, start_addr, end_addr);
				563
				564	return 0;
				565	}
				566
				567	static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				568	unsigned long end, struct mm_walk *walk)
				569
				570	{
				571	struct mmu_gather *tlb = walk->private;
				572	struct mm_struct *mm = tlb->mm;
				573	struct vm_area_struct *vma = walk->vma;
				574	spinlock_t *ptl;
				575	pte_t orig_pte, pte, ptent;
				576	struct page *page;
				577	int nr_swap = 0;
				578	unsigned long next;
				579
				580	next = pmd_addr_end(addr, end);
				581	if (pmd_trans_huge(*pmd))
				582	if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
				583	goto next;
				584
				585	if (pmd_trans_unstable(pmd))
				586	return 0;
				587
				588	tlb_change_page_size(tlb, PAGE_SIZE);
				589	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				590	flush_tlb_batched_pending(mm);
				591	arch_enter_lazy_mmu_mode();
				592	for (; addr != end; pte++, addr += PAGE_SIZE) {
				593	ptent = *pte;
				594
				595	if (pte_none(ptent))
				596	continue;
				597	/*
				598	* If the pte has swp_entry, just clear page table to
				599	* prevent swap-in which is more expensive rather than
				600	* (page allocation + zeroing).
				601	*/
				602	if (!pte_present(ptent)) {
				603	swp_entry_t entry;
				604
				605	entry = pte_to_swp_entry(ptent);
				606	if (non_swap_entry(entry))
				607	continue;
				608	nr_swap--;
				609	free_swap_and_cache(entry);
				610	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
				611	continue;
				612	}
				613
				614	page = vm_normal_page(vma, addr, ptent);
				615	if (!page)
				616	continue;
				617
				618	/*
				619	* If pmd isn't transhuge but the page is THP and
				620	* is owned by only this process, split it and
				621	* deactivate all pages.
				622	*/
				623	if (PageTransCompound(page)) {
				624	if (page_mapcount(page) != 1)
				625	goto out;
				626	get_page(page);
				627	if (!trylock_page(page)) {
				628	put_page(page);
				629	goto out;
				630	}
				631	pte_unmap_unlock(orig_pte, ptl);
				632	if (split_huge_page(page)) {
				633	unlock_page(page);
				634	put_page(page);
				635	pte_offset_map_lock(mm, pmd, addr, &ptl);
				636	goto out;
				637	}
				638	unlock_page(page);
				639	put_page(page);
				640	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				641	pte--;
				642	addr -= PAGE_SIZE;
				643	continue;
				644	}
				645
				646	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				647
				648	if (PageSwapCache(page) \|\| PageDirty(page)) {
				649	if (!trylock_page(page))
				650	continue;
				651	/*
				652	* If page is shared with others, we couldn't clear
				653	* PG_dirty of the page.
				654	*/
				655	if (page_mapcount(page) != 1) {
				656	unlock_page(page);
				657	continue;
				658	}
				659
				660	if (PageSwapCache(page) && !try_to_free_swap(page)) {
				661	unlock_page(page);
				662	continue;
				663	}
				664
				665	ClearPageDirty(page);
				666	unlock_page(page);
				667	}
				668
				669	if (pte_young(ptent) \|\| pte_dirty(ptent)) {
				670	/*
				671	* Some of architecture(ex, PPC) don't update TLB
				672	* with set_pte_at and tlb_remove_tlb_entry so for
				673	* the portability, remap the pte with old\|clean
				674	* after pte clearing.
				675	*/
				676	ptent = ptep_get_and_clear_full(mm, addr, pte,
				677	tlb->fullmm);
				678
				679	ptent = pte_mkold(ptent);
				680	ptent = pte_mkclean(ptent);
				681	set_pte_at(mm, addr, pte, ptent);
				682	tlb_remove_tlb_entry(tlb, pte, addr);
				683	}
				684	mark_page_lazyfree(page);
				685	}
				686	out:
				687	if (nr_swap) {
				688	if (current->mm == mm)
				689	sync_mm_rss(mm);
				690
				691	add_mm_counter(mm, MM_SWAPENTS, nr_swap);
				692	}
				693	arch_leave_lazy_mmu_mode();
				694	pte_unmap_unlock(orig_pte, ptl);
				695	cond_resched();
				696	next:
				697	return 0;
				698	}
				699
				700	static const struct mm_walk_ops madvise_free_walk_ops = {
				701	.pmd_entry = madvise_free_pte_range,
				702	};
				703
				704	static int madvise_free_single_vma(struct vm_area_struct *vma,
				705	unsigned long start_addr, unsigned long end_addr)
				706	{
				707	struct mm_struct *mm = vma->vm_mm;
				708	struct mmu_notifier_range range;
				709	struct mmu_gather tlb;
				710
				711	/* MADV_FREE works for only anon vma at the moment */
				712	if (!vma_is_anonymous(vma))
				713	return -EINVAL;
				714
				715	range.start = max(vma->vm_start, start_addr);
				716	if (range.start >= vma->vm_end)
				717	return -EINVAL;
				718	range.end = min(vma->vm_end, end_addr);
				719	if (range.end <= vma->vm_start)
				720	return -EINVAL;
				721	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				722	range.start, range.end);
				723
				724	lru_add_drain();
				725	tlb_gather_mmu(&tlb, mm, range.start, range.end);
				726	update_hiwater_rss(mm);
				727
				728	mmu_notifier_invalidate_range_start(&range);
				729	tlb_start_vma(&tlb, vma);
				730	walk_page_range(vma->vm_mm, range.start, range.end,
				731	&madvise_free_walk_ops, &tlb);
				732	tlb_end_vma(&tlb, vma);
				733	mmu_notifier_invalidate_range_end(&range);
				734	tlb_finish_mmu(&tlb, range.start, range.end);
				735
				736	return 0;
				737	}
				738
				739	/*
				740	* Application no longer needs these pages. If the pages are dirty,
				741	* it's OK to just throw them away. The app will be more careful about
				742	* data it wants to keep. Be sure to free swap resources too. The
				743	* zap_page_range call sets things up for shrink_active_list to actually free
				744	* these pages later if no one else has touched them in the meantime,
				745	* although we could add these pages to a global reuse list for
				746	* shrink_active_list to pick up before reclaiming other pages.
				747	*
				748	* NB: This interface discards data rather than pushes it out to swap,
				749	* as some implementations do. This has performance implications for
				750	* applications like large transactional databases which want to discard
				751	* pages in anonymous maps after committing to backing store the data
				752	* that was kept in them. There is no reason to write this data out to
				753	* the swap area if the application is discarding it.
				754	*
				755	* An interface that causes the system to free clean pages and flush
				756	* dirty pages is already available as msync(MS_INVALIDATE).
				757	*/
				758	static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
				759	unsigned long start, unsigned long end)
				760	{
				761	zap_page_range(vma, start, end - start);
				762	return 0;
				763	}
				764
				765	static long madvise_dontneed_free(struct vm_area_struct *vma,
				766	struct vm_area_struct **prev,
				767	unsigned long start, unsigned long end,
				768	int behavior)
				769	{
				770	struct mm_struct *mm = vma->vm_mm;
				771
				772	*prev = vma;
				773	if (!can_madv_lru_vma(vma))
				774	return -EINVAL;
				775
				776	if (!userfaultfd_remove(vma, start, end)) {
				777	prev = NULL; / mmap_sem has been dropped, prev is stale */
				778
				779	down_read(&mm->mmap_sem);
				780	vma = find_vma(mm, start);
				781	if (!vma)
				782	return -ENOMEM;
				783	if (start < vma->vm_start) {
				784	/*
				785	* This "vma" under revalidation is the one
				786	* with the lowest vma->vm_start where start
				787	* is also < vma->vm_end. If start <
				788	* vma->vm_start it means an hole materialized
				789	* in the user address space within the
				790	* virtual range passed to MADV_DONTNEED
				791	* or MADV_FREE.
				792	*/
				793	return -ENOMEM;
				794	}
				795	if (!can_madv_lru_vma(vma))
				796	return -EINVAL;
				797	if (end > vma->vm_end) {
				798	/*
				799	* Don't fail if end > vma->vm_end. If the old
				800	* vma was splitted while the mmap_sem was
				801	* released the effect of the concurrent
				802	* operation may not cause madvise() to
				803	* have an undefined result. There may be an
				804	* adjacent next vma that we'll walk
				805	* next. userfaultfd_remove() will generate an
				806	* UFFD_EVENT_REMOVE repetition on the
				807	* end-vma->vm_end range, but the manager can
				808	* handle a repetition fine.
				809	*/
				810	end = vma->vm_end;
				811	}
				812	VM_WARN_ON(start >= end);
				813	}
				814
				815	if (behavior == MADV_DONTNEED)
				816	return madvise_dontneed_single_vma(vma, start, end);
				817	else if (behavior == MADV_FREE)
				818	return madvise_free_single_vma(vma, start, end);
				819	else
				820	return -EINVAL;
				821	}
				822
				823	/*
				824	* Application wants to free up the pages and associated backing store.
				825	* This is effectively punching a hole into the middle of a file.
				826	*/
				827	static long madvise_remove(struct vm_area_struct *vma,
				828	struct vm_area_struct **prev,
				829	unsigned long start, unsigned long end)
				830	{
				831	loff_t offset;
				832	int error;
				833	struct file *f;
				834	struct mm_struct *mm = vma->vm_mm;
				835
				836	prev = NULL; / tell sys_madvise we drop mmap_sem */
				837
				838	if (vma->vm_flags & VM_LOCKED)
				839	return -EINVAL;
				840
				841	f = vma->vm_file;
				842
				843	if (!f \|\| !f->f_mapping \|\| !f->f_mapping->host) {
				844	return -EINVAL;
				845	}
				846
				847	if ((vma->vm_flags & (VM_SHARED\|VM_WRITE)) != (VM_SHARED\|VM_WRITE))
				848	return -EACCES;
				849
				850	offset = (loff_t)(start - vma->vm_start)
				851	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
				852
				853	/*
				854	* Filesystem's fallocate may need to take i_mutex. We need to
				855	* explicitly grab a reference because the vma (and hence the
				856	* vma's reference to the file) can go away as soon as we drop
				857	* mmap_sem.
				858	*/
				859	get_file(f);
				860	if (userfaultfd_remove(vma, start, end)) {
				861	/* mmap_sem was not released by userfaultfd_remove() */
				862	up_read(&mm->mmap_sem);
				863	}
				864	error = vfs_fallocate(f,
				865	FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
				866	offset, end - start);
				867	fput(f);
				868	down_read(&mm->mmap_sem);
				869	return error;
				870	}
				871
				872	#ifdef CONFIG_MEMORY_FAILURE
				873	/*
				874	* Error injection support for memory error handling.
				875	*/
				876	static int madvise_inject_error(int behavior,
				877	unsigned long start, unsigned long end)
				878	{
				879	struct page *page;
				880	struct zone *zone;
				881	unsigned int order;
				882
				883	if (!capable(CAP_SYS_ADMIN))
				884	return -EPERM;
				885
				886
				887	for (; start < end; start += PAGE_SIZE << order) {
				888	unsigned long pfn;
				889	int ret;
				890
				891	ret = get_user_pages_fast(start, 1, 0, &page);
				892	if (ret != 1)
				893	return ret;
				894	pfn = page_to_pfn(page);
				895
				896	/*
				897	* When soft offlining hugepages, after migrating the page
				898	* we dissolve it, therefore in the second loop "page" will
				899	* no longer be a compound page, and order will be 0.
				900	*/
				901	order = compound_order(compound_head(page));
				902
				903	if (PageHWPoison(page)) {
				904	put_page(page);
				905	continue;
				906	}
				907
				908	if (behavior == MADV_SOFT_OFFLINE) {
				909	pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				910	pfn, start);
				911
				912	ret = soft_offline_page(page, MF_COUNT_INCREASED);
				913	if (ret)
				914	return ret;
				915	continue;
				916	}
				917
				918	pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				919	pfn, start);
				920
				921	/*
				922	* Drop the page reference taken by get_user_pages_fast(). In
				923	* the absence of MF_COUNT_INCREASED the memory_failure()
				924	* routine is responsible for pinning the page to prevent it
				925	* from being released back to the page allocator.
				926	*/
				927	put_page(page);
				928	ret = memory_failure(pfn, 0);
				929	if (ret)
				930	return ret;
				931	}
				932
				933	/* Ensure that all poisoned pages are removed from per-cpu lists */
				934	for_each_populated_zone(zone)
				935	drain_all_pages(zone);
				936
				937	return 0;
				938	}
				939	#endif
				940
				941	static long
				942	madvise_vma(struct vm_area_struct vma, struct vm_area_struct *prev,
				943	unsigned long start, unsigned long end, int behavior)
				944	{
				945	switch (behavior) {
				946	case MADV_REMOVE:
				947	return madvise_remove(vma, prev, start, end);
				948	case MADV_WILLNEED:
				949	return madvise_willneed(vma, prev, start, end);
				950	case MADV_COLD:
				951	return madvise_cold(vma, prev, start, end);
				952	case MADV_PAGEOUT:
				953	return madvise_pageout(vma, prev, start, end);
				954	case MADV_FREE:
				955	case MADV_DONTNEED:
				956	return madvise_dontneed_free(vma, prev, start, end, behavior);
				957	default:
				958	return madvise_behavior(vma, prev, start, end, behavior);
				959	}
				960	}
				961
				962	static bool
				963	madvise_behavior_valid(int behavior)
				964	{
				965	switch (behavior) {
				966	case MADV_DOFORK:
				967	case MADV_DONTFORK:
				968	case MADV_NORMAL:
				969	case MADV_SEQUENTIAL:
				970	case MADV_RANDOM:
				971	case MADV_REMOVE:
				972	case MADV_WILLNEED:
				973	case MADV_DONTNEED:
				974	case MADV_FREE:
				975	case MADV_COLD:
				976	case MADV_PAGEOUT:
				977	#ifdef CONFIG_KSM
				978	case MADV_MERGEABLE:
				979	case MADV_UNMERGEABLE:
				980	#endif
				981	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				982	case MADV_HUGEPAGE:
				983	case MADV_NOHUGEPAGE:
				984	#endif
				985	case MADV_DONTDUMP:
				986	case MADV_DODUMP:
				987	case MADV_WIPEONFORK:
				988	case MADV_KEEPONFORK:
				989	#ifdef CONFIG_MEMORY_FAILURE
				990	case MADV_SOFT_OFFLINE:
				991	case MADV_HWPOISON:
				992	#endif
				993	return true;
				994
				995	default:
				996	return false;
				997	}
				998	}
				999
				1000	static bool
				1001	process_madvise_behavior_valid(int behavior)
				1002	{
				1003	switch (behavior) {
				1004	case MADV_COLD:
				1005	case MADV_PAGEOUT:
				1006	return true;
				1007	default:
				1008	return false;
				1009	}
				1010	}
				1011
				1012	/*
				1013	* The madvise(2) system call.
				1014	*
				1015	* Applications can use madvise() to advise the kernel how it should
				1016	* handle paging I/O in this VM area. The idea is to help the kernel
				1017	* use appropriate read-ahead and caching techniques. The information
				1018	* provided is advisory only, and can be safely disregarded by the
				1019	* kernel without affecting the correct operation of the application.
				1020	*
				1021	* behavior values:
				1022	* MADV_NORMAL - the default behavior is to read clusters. This
				1023	* results in some read-ahead and read-behind.
				1024	* MADV_RANDOM - the system should read the minimum amount of data
				1025	* on any access, since it is unlikely that the appli-
				1026	* cation will need more than what it asks for.
				1027	* MADV_SEQUENTIAL - pages in the given range will probably be accessed
				1028	* once, so they can be aggressively read ahead, and
				1029	* can be freed soon after they are accessed.
				1030	* MADV_WILLNEED - the application is notifying the system to read
				1031	* some pages ahead.
				1032	* MADV_DONTNEED - the application is finished with the given range,
				1033	* so the kernel can free resources associated with it.
				1034	* MADV_FREE - the application marks pages in the given range as lazy free,
				1035	* where actual purges are postponed until memory pressure happens.
				1036	* MADV_REMOVE - the application wants to free up the given range of
				1037	* pages and associated backing store.
				1038	* MADV_DONTFORK - omit this area from child's address space when forking:
				1039	* typically, to avoid COWing pages pinned by get_user_pages().
				1040	* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
				1041	* MADV_WIPEONFORK - present the child process with zero-filled memory in this
				1042	* range after a fork.
				1043	* MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
				1044	* MADV_HWPOISON - trigger memory error handler as if the given memory range
				1045	* were corrupted by unrecoverable hardware memory failure.
				1046	* MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
				1047	* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
				1048	* this area with pages of identical content from other such areas.
				1049	* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
				1050	* MADV_HUGEPAGE - the application wants to back the given range by transparent
				1051	* huge pages in the future. Existing pages might be coalesced and
				1052	* new pages might be allocated as THP.
				1053	* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
				1054	* transparent huge pages so the existing pages will not be
				1055	* coalesced into THP and new pages will not be allocated as THP.
				1056	* MADV_DONTDUMP - the application wants to prevent pages in the given range
				1057	* from being included in its core dump.
				1058	* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
				1059	* MADV_COLD - the application is not expected to use this memory soon,
				1060	* deactivate pages in this range so that they can be reclaimed
				1061	* easily if memory pressure hanppens.
				1062	* MADV_PAGEOUT - the application is not expected to use this memory soon,
				1063	* page out the pages in this range immediately.
				1064	*
				1065	* return values:
				1066	* zero - success
				1067	* -EINVAL - start + len < 0, start is not page-aligned,
				1068	* "behavior" is not a valid value, or application
				1069	* is attempting to release locked or shared pages,
				1070	* or the specified address range includes file, Huge TLB,
				1071	* MAP_SHARED or VMPFNMAP range.
				1072	* -ENOMEM - addresses in the specified range are not currently
				1073	* mapped, or are outside the AS of the process.
				1074	* -EIO - an I/O error occurred while paging in data.
				1075	* -EBADF - map exists, but area maps something that isn't a file.
				1076	* -EAGAIN - a kernel resource was temporarily unavailable.
				1077	*/
				1078	int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
				1079	{
				1080	unsigned long end, tmp;
				1081	struct vm_area_struct vma, prev;
				1082	int unmapped_error = 0;
				1083	int error = -EINVAL;
				1084	int write;
				1085	size_t len;
				1086	struct blk_plug plug;
				1087
				1088	start = untagged_addr(start);
				1089
				1090	if (!madvise_behavior_valid(behavior))
				1091	return error;
				1092
				1093	if (start & ~PAGE_MASK)
				1094	return error;
				1095	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
				1096
				1097	/* Check to see whether len was rounded up from small -ve to zero */
				1098	if (len_in && !len)
				1099	return error;
				1100
				1101	end = start + len;
				1102	if (end < start)
				1103	return error;
				1104
				1105	error = 0;
				1106	if (end == start)
				1107	return error;
				1108
				1109	#ifdef CONFIG_MEMORY_FAILURE
				1110	if (behavior == MADV_HWPOISON \|\| behavior == MADV_SOFT_OFFLINE)
				1111	return madvise_inject_error(behavior, start, start + len_in);
				1112	#endif
				1113
				1114	write = madvise_need_mmap_write(behavior);
				1115	if (write) {
				1116	if (down_write_killable(&mm->mmap_sem))
				1117	return -EINTR;
				1118
				1119	/*
				1120	* We may have stolen the mm from another process
				1121	* that is undergoing core dumping.
				1122	*
				1123	* Right now that's io_ring, in the future it may
				1124	* be remote process management and not "current"
				1125	* at all.
				1126	*
				1127	* We need to fix core dumping to not do this,
				1128	* but for now we have the mmget_still_valid()
				1129	* model.
				1130	*/
				1131	if (!mmget_still_valid(mm)) {
				1132	up_write(&mm->mmap_sem);
				1133	return -EINTR;
				1134	}
				1135	} else {
				1136	down_read(&mm->mmap_sem);
				1137	}
				1138
				1139	/*
				1140	* If the interval [start,end) covers some unmapped address
				1141	* ranges, just ignore them, but return -ENOMEM at the end.
				1142	* - different from the way of handling in mlock etc.
				1143	*/
				1144	vma = find_vma_prev(mm, start, &prev);
				1145	if (vma && start > vma->vm_start)
				1146	prev = vma;
				1147
				1148	blk_start_plug(&plug);
				1149	for (;;) {
				1150	/* Still start < end. */
				1151	error = -ENOMEM;
				1152	if (!vma)
				1153	goto out;
				1154
				1155	/* Here start < (end\|vma->vm_end). */
				1156	if (start < vma->vm_start) {
				1157	unmapped_error = -ENOMEM;
				1158	start = vma->vm_start;
				1159	if (start >= end)
				1160	goto out;
				1161	}
				1162
				1163	/* Here vma->vm_start <= start < (end\|vma->vm_end) */
				1164	tmp = vma->vm_end;
				1165	if (end < tmp)
				1166	tmp = end;
				1167
				1168	/* Here vma->vm_start <= start < tmp <= (end\|vma->vm_end). */
				1169	error = madvise_vma(vma, &prev, start, tmp, behavior);
				1170	if (error)
				1171	goto out;
				1172	start = tmp;
				1173	if (prev && start < prev->vm_end)
				1174	start = prev->vm_end;
				1175	error = unmapped_error;
				1176	if (start >= end)
				1177	goto out;
				1178	if (prev)
				1179	vma = prev->vm_next;
				1180	else /* madvise_remove dropped mmap_sem */
				1181	vma = find_vma(mm, start);
				1182	}
				1183	out:
				1184	blk_finish_plug(&plug);
				1185	if (write)
				1186	up_write(&mm->mmap_sem);
				1187	else
				1188	up_read(&mm->mmap_sem);
				1189
				1190	return error;
				1191	}
				1192
				1193	SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
				1194	{
				1195	return do_madvise(current->mm, start, len_in, behavior);
				1196	}
				1197
				1198	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
				1199	size_t, vlen, int, behavior, unsigned int, flags)
				1200	{
				1201	ssize_t ret;
				1202	struct iovec iovstack[UIO_FASTIOV], iovec;
				1203	struct iovec *iov = iovstack;
				1204	struct iov_iter iter;
				1205	struct pid *pid;
				1206	struct task_struct *task;
				1207	struct mm_struct *mm;
				1208	size_t total_len;
				1209	unsigned int f_flags;
				1210
				1211	if (flags != 0) {
				1212	ret = -EINVAL;
				1213	goto out;
				1214	}
				1215
				1216	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
				1217	if (ret < 0)
				1218	goto out;
				1219
				1220	pid = pidfd_get_pid(pidfd, &f_flags);
				1221	if (IS_ERR(pid)) {
				1222	ret = PTR_ERR(pid);
				1223	goto free_iov;
				1224	}
				1225
				1226	task = get_pid_task(pid, PIDTYPE_PID);
				1227	if (!task) {
				1228	ret = -ESRCH;
				1229	goto put_pid;
				1230	}
				1231
				1232	if (!process_madvise_behavior_valid(behavior)) {
				1233	ret = -EINVAL;
				1234	goto release_task;
				1235	}
				1236
				1237	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
				1238	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
				1239	if (IS_ERR_OR_NULL(mm)) {
				1240	ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
				1241	goto release_task;
				1242	}
				1243
				1244	/*
				1245	* Require CAP_SYS_NICE for influencing process performance. Note that
				1246	* only non-destructive hints are currently supported.
				1247	*/
				1248	if (!capable(CAP_SYS_NICE)) {
				1249	ret = -EPERM;
				1250	goto release_mm;
				1251	}
				1252
				1253	total_len = iov_iter_count(&iter);
				1254
				1255	while (iov_iter_count(&iter)) {
				1256	iovec = iov_iter_iovec(&iter);
				1257	ret = do_madvise(mm, (unsigned long)iovec.iov_base,
				1258	iovec.iov_len, behavior);
				1259	if (ret < 0)
				1260	break;
				1261	iov_iter_advance(&iter, iovec.iov_len);
				1262	}
				1263
				1264	if (ret == 0)
				1265	ret = total_len - iov_iter_count(&iter);
				1266
				1267	release_mm:
				1268	mmput(mm);
				1269
				1270	release_task:
				1271	put_task_struct(task);
				1272	put_pid:
				1273	put_pid(pid);
				1274	free_iov:
				1275	kfree(iov);
				1276	out:
				1277	return ret;
				1278	}