Blame - marvell/linux/mm/shmem.c - T108

blob: e2f72b22098876fed0d4d7fa6594d86478726b3f [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* Resizable virtual memory filesystem for Linux.
				3	*
				4	* Copyright (C) 2000 Linus Torvalds.
				5	* 2000 Transmeta Corp.
				6	* 2000-2001 Christoph Rohland
				7	* 2000-2001 SAP AG
				8	* 2002 Red Hat Inc.
				9	* Copyright (C) 2002-2011 Hugh Dickins.
				10	* Copyright (C) 2011 Google Inc.
				11	* Copyright (C) 2002-2005 VERITAS Software Corporation.
				12	* Copyright (C) 2004 Andi Kleen, SuSE Labs
				13	*
				14	* Extended attribute support for tmpfs:
				15	* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
				16	* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
				17	*
				18	* tiny-shmem:
				19	* Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
				20	*
				21	* This file is released under the GPL.
				22	*/
				23
				24	#include <linux/fs.h>
				25	#include <linux/init.h>
				26	#include <linux/vfs.h>
				27	#include <linux/mount.h>
				28	#include <linux/ramfs.h>
				29	#include <linux/pagemap.h>
				30	#include <linux/file.h>
				31	#include <linux/mm.h>
				32	#include <linux/random.h>
				33	#include <linux/sched/signal.h>
				34	#include <linux/export.h>
				35	#include <linux/swap.h>
				36	#include <linux/uio.h>
				37	#include <linux/khugepaged.h>
				38	#include <linux/hugetlb.h>
				39	#include <linux/frontswap.h>
				40	#include <linux/fs_parser.h>
				41
				42	#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
				43
				44	static struct vfsmount *shm_mnt;
				45
				46	#ifdef CONFIG_SHMEM
				47	/*
				48	* This virtual memory filesystem is heavily based on the ramfs. It
				49	* extends ramfs by the ability to use swap and honor resource limits
				50	* which makes it a completely usable filesystem.
				51	*/
				52
				53	#include <linux/xattr.h>
				54	#include <linux/exportfs.h>
				55	#include <linux/posix_acl.h>
				56	#include <linux/posix_acl_xattr.h>
				57	#include <linux/mman.h>
				58	#include <linux/string.h>
				59	#include <linux/slab.h>
				60	#include <linux/backing-dev.h>
				61	#include <linux/shmem_fs.h>
				62	#include <linux/writeback.h>
				63	#include <linux/blkdev.h>
				64	#include <linux/pagevec.h>
				65	#include <linux/percpu_counter.h>
				66	#include <linux/falloc.h>
				67	#include <linux/splice.h>
				68	#include <linux/security.h>
				69	#include <linux/swapops.h>
				70	#include <linux/mempolicy.h>
				71	#include <linux/namei.h>
				72	#include <linux/ctype.h>
				73	#include <linux/migrate.h>
				74	#include <linux/highmem.h>
				75	#include <linux/seq_file.h>
				76	#include <linux/magic.h>
				77	#include <linux/syscalls.h>
				78	#include <linux/fcntl.h>
				79	#include <uapi/linux/memfd.h>
				80	#include <linux/userfaultfd_k.h>
				81	#include <linux/rmap.h>
				82	#include <linux/uuid.h>
				83
				84	#include <linux/uaccess.h>
				85	#include <asm/pgtable.h>
				86
				87	#include "internal.h"
				88
				89	#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
				90	#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
				91
				92	/* Pretend that each entry is of this size in directory's i_size */
				93	#define BOGO_DIRENT_SIZE 20
				94
				95	/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
				96	#define SHORT_SYMLINK_LEN 128
				97
				98	/*
				99	* shmem_fallocate communicates with shmem_fault or shmem_writepage via
				100	* inode->i_private (with i_mutex making sure that it has only one user at
				101	* a time): we would prefer not to enlarge the shmem inode just for that.
				102	*/
				103	struct shmem_falloc {
				104	wait_queue_head_t waitq; / faults into hole wait for punch to end */
				105	pgoff_t start; /* start of range currently being fallocated */
				106	pgoff_t next; /* the next page offset to be fallocated */
				107	pgoff_t nr_falloced; /* how many new pages have been fallocated */
				108	pgoff_t nr_unswapped; /* how often writepage refused to swap out */
				109	};
				110
				111	struct shmem_options {
				112	unsigned long long blocks;
				113	unsigned long long inodes;
				114	struct mempolicy *mpol;
				115	kuid_t uid;
				116	kgid_t gid;
				117	umode_t mode;
				118	int huge;
				119	int seen;
				120	#define SHMEM_SEEN_BLOCKS 1
				121	#define SHMEM_SEEN_INODES 2
				122	#define SHMEM_SEEN_HUGE 4
				123	};
				124
				125	#ifdef CONFIG_TMPFS
				126	static unsigned long shmem_default_max_blocks(void)
				127	{
				128	return totalram_pages() / 2;
				129	}
				130
				131	static unsigned long shmem_default_max_inodes(void)
				132	{
				133	unsigned long nr_pages = totalram_pages();
				134
				135	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
				136	}
				137	#endif
				138
				139	static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
				140	static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				141	struct shmem_inode_info *info, pgoff_t index);
				142	static int shmem_swapin_page(struct inode *inode, pgoff_t index,
				143	struct page **pagep, enum sgp_type sgp,
				144	gfp_t gfp, struct vm_area_struct *vma,
				145	vm_fault_t *fault_type);
				146	static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
				147	struct page **pagep, enum sgp_type sgp,
				148	gfp_t gfp, struct vm_area_struct *vma,
				149	struct vm_fault vmf, vm_fault_t fault_type);
				150
				151	int shmem_getpage(struct inode *inode, pgoff_t index,
				152	struct page **pagep, enum sgp_type sgp)
				153	{
				154	return shmem_getpage_gfp(inode, index, pagep, sgp,
				155	mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
				156	}
				157
				158	static inline struct shmem_sb_info SHMEM_SB(struct super_block sb)
				159	{
				160	return sb->s_fs_info;
				161	}
				162
				163	/*
				164	* shmem_file_setup pre-accounts the whole fixed size of a VM object,
				165	* for shared memory and for shared anonymous (/dev/zero) mappings
				166	* (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
				167	* consistent with the pre-accounting of private mappings ...
				168	*/
				169	static inline int shmem_acct_size(unsigned long flags, loff_t size)
				170	{
				171	return (flags & VM_NORESERVE) ?
				172	0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
				173	}
				174
				175	static inline void shmem_unacct_size(unsigned long flags, loff_t size)
				176	{
				177	if (!(flags & VM_NORESERVE))
				178	vm_unacct_memory(VM_ACCT(size));
				179	}
				180
				181	static inline int shmem_reacct_size(unsigned long flags,
				182	loff_t oldsize, loff_t newsize)
				183	{
				184	if (!(flags & VM_NORESERVE)) {
				185	if (VM_ACCT(newsize) > VM_ACCT(oldsize))
				186	return security_vm_enough_memory_mm(current->mm,
				187	VM_ACCT(newsize) - VM_ACCT(oldsize));
				188	else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
				189	vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
				190	}
				191	return 0;
				192	}
				193
				194	/*
				195	* ... whereas tmpfs objects are accounted incrementally as
				196	* pages are allocated, in order to allow large sparse files.
				197	* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
				198	* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
				199	*/
				200	static inline int shmem_acct_block(unsigned long flags, long pages)
				201	{
				202	if (!(flags & VM_NORESERVE))
				203	return 0;
				204
				205	return security_vm_enough_memory_mm(current->mm,
				206	pages * VM_ACCT(PAGE_SIZE));
				207	}
				208
				209	static inline void shmem_unacct_blocks(unsigned long flags, long pages)
				210	{
				211	if (flags & VM_NORESERVE)
				212	vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
				213	}
				214
				215	static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
				216	{
				217	struct shmem_inode_info *info = SHMEM_I(inode);
				218	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				219
				220	if (shmem_acct_block(info->flags, pages))
				221	return false;
				222
				223	if (sbinfo->max_blocks) {
				224	if (percpu_counter_compare(&sbinfo->used_blocks,
				225	sbinfo->max_blocks - pages) > 0)
				226	goto unacct;
				227	percpu_counter_add(&sbinfo->used_blocks, pages);
				228	}
				229
				230	return true;
				231
				232	unacct:
				233	shmem_unacct_blocks(info->flags, pages);
				234	return false;
				235	}
				236
				237	static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
				238	{
				239	struct shmem_inode_info *info = SHMEM_I(inode);
				240	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				241
				242	if (sbinfo->max_blocks)
				243	percpu_counter_sub(&sbinfo->used_blocks, pages);
				244	shmem_unacct_blocks(info->flags, pages);
				245	}
				246
				247	static const struct super_operations shmem_ops;
				248	static const struct address_space_operations shmem_aops;
				249	static const struct file_operations shmem_file_operations;
				250	static const struct inode_operations shmem_inode_operations;
				251	static const struct inode_operations shmem_dir_inode_operations;
				252	static const struct inode_operations shmem_special_inode_operations;
				253	static const struct vm_operations_struct shmem_vm_ops;
				254	static struct file_system_type shmem_fs_type;
				255
				256	bool vma_is_shmem(struct vm_area_struct *vma)
				257	{
				258	return vma->vm_ops == &shmem_vm_ops;
				259	}
				260
				261	static LIST_HEAD(shmem_swaplist);
				262	static DEFINE_MUTEX(shmem_swaplist_mutex);
				263
				264	static int shmem_reserve_inode(struct super_block *sb)
				265	{
				266	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				267	if (sbinfo->max_inodes) {
				268	spin_lock(&sbinfo->stat_lock);
				269	if (!sbinfo->free_inodes) {
				270	spin_unlock(&sbinfo->stat_lock);
				271	return -ENOSPC;
				272	}
				273	sbinfo->free_inodes--;
				274	spin_unlock(&sbinfo->stat_lock);
				275	}
				276	return 0;
				277	}
				278
				279	static void shmem_free_inode(struct super_block *sb)
				280	{
				281	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				282	if (sbinfo->max_inodes) {
				283	spin_lock(&sbinfo->stat_lock);
				284	sbinfo->free_inodes++;
				285	spin_unlock(&sbinfo->stat_lock);
				286	}
				287	}
				288
				289	/**
				290	* shmem_recalc_inode - recalculate the block usage of an inode
				291	* @inode: inode to recalc
				292	*
				293	* We have to calculate the free blocks since the mm can drop
				294	* undirtied hole pages behind our back.
				295	*
				296	* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
				297	* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
				298	*
				299	* It has to be called with the spinlock held.
				300	*/
				301	static void shmem_recalc_inode(struct inode *inode)
				302	{
				303	struct shmem_inode_info *info = SHMEM_I(inode);
				304	long freed;
				305
				306	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
				307	if (freed > 0) {
				308	info->alloced -= freed;
				309	inode->i_blocks -= freed * BLOCKS_PER_PAGE;
				310	shmem_inode_unacct_blocks(inode, freed);
				311	}
				312	}
				313
				314	bool shmem_charge(struct inode *inode, long pages)
				315	{
				316	struct shmem_inode_info *info = SHMEM_I(inode);
				317	unsigned long flags;
				318
				319	if (!shmem_inode_acct_block(inode, pages))
				320	return false;
				321
				322	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
				323	inode->i_mapping->nrpages += pages;
				324
				325	spin_lock_irqsave(&info->lock, flags);
				326	info->alloced += pages;
				327	inode->i_blocks += pages * BLOCKS_PER_PAGE;
				328	shmem_recalc_inode(inode);
				329	spin_unlock_irqrestore(&info->lock, flags);
				330
				331	return true;
				332	}
				333
				334	void shmem_uncharge(struct inode *inode, long pages)
				335	{
				336	struct shmem_inode_info *info = SHMEM_I(inode);
				337	unsigned long flags;
				338
				339	/* nrpages adjustment done by __delete_from_page_cache() or caller */
				340
				341	spin_lock_irqsave(&info->lock, flags);
				342	info->alloced -= pages;
				343	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
				344	shmem_recalc_inode(inode);
				345	spin_unlock_irqrestore(&info->lock, flags);
				346
				347	shmem_inode_unacct_blocks(inode, pages);
				348	}
				349
				350	/*
				351	* Replace item expected in xarray by a new item, while holding xa_lock.
				352	*/
				353	static int shmem_replace_entry(struct address_space *mapping,
				354	pgoff_t index, void expected, void replacement)
				355	{
				356	XA_STATE(xas, &mapping->i_pages, index);
				357	void *item;
				358
				359	VM_BUG_ON(!expected);
				360	VM_BUG_ON(!replacement);
				361	item = xas_load(&xas);
				362	if (item != expected)
				363	return -ENOENT;
				364	xas_store(&xas, replacement);
				365	return 0;
				366	}
				367
				368	/*
				369	* Sometimes, before we decide whether to proceed or to fail, we must check
				370	* that an entry was not already brought back from swap by a racing thread.
				371	*
				372	* Checking page is not enough: by the time a SwapCache page is locked, it
				373	* might be reused, and again be SwapCache, using the same swap as before.
				374	*/
				375	static bool shmem_confirm_swap(struct address_space *mapping,
				376	pgoff_t index, swp_entry_t swap)
				377	{
				378	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
				379	}
				380
				381	/*
				382	* Definitions for "huge tmpfs": tmpfs mounted with the huge= option
				383	*
				384	* SHMEM_HUGE_NEVER:
				385	* disables huge pages for the mount;
				386	* SHMEM_HUGE_ALWAYS:
				387	* enables huge pages for the mount;
				388	* SHMEM_HUGE_WITHIN_SIZE:
				389	* only allocate huge pages if the page will be fully within i_size,
				390	* also respect fadvise()/madvise() hints;
				391	* SHMEM_HUGE_ADVISE:
				392	* only allocate huge pages if requested with fadvise()/madvise();
				393	*/
				394
				395	#define SHMEM_HUGE_NEVER 0
				396	#define SHMEM_HUGE_ALWAYS 1
				397	#define SHMEM_HUGE_WITHIN_SIZE 2
				398	#define SHMEM_HUGE_ADVISE 3
				399
				400	/*
				401	* Special values.
				402	* Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
				403	*
				404	* SHMEM_HUGE_DENY:
				405	* disables huge on shm_mnt and all mounts, for emergency use;
				406	* SHMEM_HUGE_FORCE:
				407	* enables huge on shm_mnt and all mounts, w/o needing option, for testing;
				408	*
				409	*/
				410	#define SHMEM_HUGE_DENY (-1)
				411	#define SHMEM_HUGE_FORCE (-2)
				412
				413	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				414	/* ifdef here to avoid bloating shmem.o when not necessary */
				415
				416	static int shmem_huge __read_mostly;
				417
				418	#if defined(CONFIG_SYSFS)
				419	static int shmem_parse_huge(const char *str)
				420	{
				421	if (!strcmp(str, "never"))
				422	return SHMEM_HUGE_NEVER;
				423	if (!strcmp(str, "always"))
				424	return SHMEM_HUGE_ALWAYS;
				425	if (!strcmp(str, "within_size"))
				426	return SHMEM_HUGE_WITHIN_SIZE;
				427	if (!strcmp(str, "advise"))
				428	return SHMEM_HUGE_ADVISE;
				429	if (!strcmp(str, "deny"))
				430	return SHMEM_HUGE_DENY;
				431	if (!strcmp(str, "force"))
				432	return SHMEM_HUGE_FORCE;
				433	return -EINVAL;
				434	}
				435	#endif
				436
				437	#if defined(CONFIG_SYSFS) \|\| defined(CONFIG_TMPFS)
				438	static const char *shmem_format_huge(int huge)
				439	{
				440	switch (huge) {
				441	case SHMEM_HUGE_NEVER:
				442	return "never";
				443	case SHMEM_HUGE_ALWAYS:
				444	return "always";
				445	case SHMEM_HUGE_WITHIN_SIZE:
				446	return "within_size";
				447	case SHMEM_HUGE_ADVISE:
				448	return "advise";
				449	case SHMEM_HUGE_DENY:
				450	return "deny";
				451	case SHMEM_HUGE_FORCE:
				452	return "force";
				453	default:
				454	VM_BUG_ON(1);
				455	return "bad_val";
				456	}
				457	}
				458	#endif
				459
				460	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
				461	struct shrink_control *sc, unsigned long nr_to_split)
				462	{
				463	LIST_HEAD(list), pos, next;
				464	LIST_HEAD(to_remove);
				465	struct inode *inode;
				466	struct shmem_inode_info *info;
				467	struct page *page;
				468	unsigned long batch = sc ? sc->nr_to_scan : 128;
				469	int split = 0;
				470
				471	if (list_empty(&sbinfo->shrinklist))
				472	return SHRINK_STOP;
				473
				474	spin_lock(&sbinfo->shrinklist_lock);
				475	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
				476	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				477
				478	/* pin the inode */
				479	inode = igrab(&info->vfs_inode);
				480
				481	/* inode is about to be evicted */
				482	if (!inode) {
				483	list_del_init(&info->shrinklist);
				484	goto next;
				485	}
				486
				487	/* Check if there's anything to gain */
				488	if (round_up(inode->i_size, PAGE_SIZE) ==
				489	round_up(inode->i_size, HPAGE_PMD_SIZE)) {
				490	list_move(&info->shrinklist, &to_remove);
				491	goto next;
				492	}
				493
				494	list_move(&info->shrinklist, &list);
				495	next:
				496	sbinfo->shrinklist_len--;
				497	if (!--batch)
				498	break;
				499	}
				500	spin_unlock(&sbinfo->shrinklist_lock);
				501
				502	list_for_each_safe(pos, next, &to_remove) {
				503	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				504	inode = &info->vfs_inode;
				505	list_del_init(&info->shrinklist);
				506	iput(inode);
				507	}
				508
				509	list_for_each_safe(pos, next, &list) {
				510	int ret;
				511
				512	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				513	inode = &info->vfs_inode;
				514
				515	if (nr_to_split && split >= nr_to_split)
				516	goto move_back;
				517
				518	page = find_get_page(inode->i_mapping,
				519	(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
				520	if (!page)
				521	goto drop;
				522
				523	/* No huge page at the end of the file: nothing to split */
				524	if (!PageTransHuge(page)) {
				525	put_page(page);
				526	goto drop;
				527	}
				528
				529	/*
				530	* Move the inode on the list back to shrinklist if we failed
				531	* to lock the page at this time.
				532	*
				533	* Waiting for the lock may lead to deadlock in the
				534	* reclaim path.
				535	*/
				536	if (!trylock_page(page)) {
				537	put_page(page);
				538	goto move_back;
				539	}
				540
				541	ret = split_huge_page(page);
				542	unlock_page(page);
				543	put_page(page);
				544
				545	/* If split failed move the inode on the list back to shrinklist */
				546	if (ret)
				547	goto move_back;
				548
				549	split++;
				550	drop:
				551	list_del_init(&info->shrinklist);
				552	goto put;
				553	move_back:
				554	/*
				555	* Make sure the inode is either on the global list or deleted
				556	* from any local list before iput() since it could be deleted
				557	* in another thread once we put the inode (then the local list
				558	* is corrupted).
				559	*/
				560	spin_lock(&sbinfo->shrinklist_lock);
				561	list_move(&info->shrinklist, &sbinfo->shrinklist);
				562	sbinfo->shrinklist_len++;
				563	spin_unlock(&sbinfo->shrinklist_lock);
				564	put:
				565	iput(inode);
				566	}
				567
				568	return split;
				569	}
				570
				571	static long shmem_unused_huge_scan(struct super_block *sb,
				572	struct shrink_control *sc)
				573	{
				574	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				575
				576	if (!READ_ONCE(sbinfo->shrinklist_len))
				577	return SHRINK_STOP;
				578
				579	return shmem_unused_huge_shrink(sbinfo, sc, 0);
				580	}
				581
				582	static long shmem_unused_huge_count(struct super_block *sb,
				583	struct shrink_control *sc)
				584	{
				585	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				586	return READ_ONCE(sbinfo->shrinklist_len);
				587	}
				588	#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				589
				590	#define shmem_huge SHMEM_HUGE_DENY
				591
				592	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
				593	struct shrink_control *sc, unsigned long nr_to_split)
				594	{
				595	return 0;
				596	}
				597	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				598
				599	static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
				600	{
				601	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				602	(shmem_huge == SHMEM_HUGE_FORCE \|\| sbinfo->huge) &&
				603	shmem_huge != SHMEM_HUGE_DENY)
				604	return true;
				605	return false;
				606	}
				607
				608	/*
				609	* Like add_to_page_cache_locked, but error if expected item has gone.
				610	*/
				611	static int shmem_add_to_page_cache(struct page *page,
				612	struct address_space *mapping,
				613	pgoff_t index, void *expected, gfp_t gfp)
				614	{
				615	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
				616	unsigned long i = 0;
				617	unsigned long nr = compound_nr(page);
				618
				619	VM_BUG_ON_PAGE(PageTail(page), page);
				620	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
				621	VM_BUG_ON_PAGE(!PageLocked(page), page);
				622	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
				623	VM_BUG_ON(expected && PageTransHuge(page));
				624
				625	page_ref_add(page, nr);
				626	page->mapping = mapping;
				627	page->index = index;
				628
				629	do {
				630	void *entry;
				631	xas_lock_irq(&xas);
				632	entry = xas_find_conflict(&xas);
				633	if (entry != expected)
				634	xas_set_err(&xas, -EEXIST);
				635	xas_create_range(&xas);
				636	if (xas_error(&xas))
				637	goto unlock;
				638	next:
				639	xas_store(&xas, page);
				640	if (++i < nr) {
				641	xas_next(&xas);
				642	goto next;
				643	}
				644	if (PageTransHuge(page)) {
				645	count_vm_event(THP_FILE_ALLOC);
				646	__inc_node_page_state(page, NR_SHMEM_THPS);
				647	}
				648	mapping->nrpages += nr;
				649	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
				650	__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
				651	unlock:
				652	xas_unlock_irq(&xas);
				653	} while (xas_nomem(&xas, gfp));
				654
				655	if (xas_error(&xas)) {
				656	page->mapping = NULL;
				657	page_ref_sub(page, nr);
				658	return xas_error(&xas);
				659	}
				660
				661	return 0;
				662	}
				663
				664	/*
				665	* Like delete_from_page_cache, but substitutes swap for page.
				666	*/
				667	static void shmem_delete_from_page_cache(struct page page, void radswap)
				668	{
				669	struct address_space *mapping = page->mapping;
				670	int error;
				671
				672	VM_BUG_ON_PAGE(PageCompound(page), page);
				673
				674	xa_lock_irq(&mapping->i_pages);
				675	error = shmem_replace_entry(mapping, page->index, page, radswap);
				676	page->mapping = NULL;
				677	mapping->nrpages--;
				678	__dec_node_page_state(page, NR_FILE_PAGES);
				679	__dec_node_page_state(page, NR_SHMEM);
				680	xa_unlock_irq(&mapping->i_pages);
				681	put_page(page);
				682	BUG_ON(error);
				683	}
				684
				685	/*
				686	* Remove swap entry from page cache, free the swap and its page cache.
				687	*/
				688	static int shmem_free_swap(struct address_space *mapping,
				689	pgoff_t index, void *radswap)
				690	{
				691	void *old;
				692
				693	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
				694	if (old != radswap)
				695	return -ENOENT;
				696	free_swap_and_cache(radix_to_swp_entry(radswap));
				697	return 0;
				698	}
				699
				700	/*
				701	* Determine (in bytes) how many of the shmem object's pages mapped by the
				702	* given offsets are swapped out.
				703	*
				704	* This is safe to call without i_mutex or the i_pages lock thanks to RCU,
				705	* as long as the inode doesn't go away and racy results are not a problem.
				706	*/
				707	unsigned long shmem_partial_swap_usage(struct address_space *mapping,
				708	pgoff_t start, pgoff_t end)
				709	{
				710	XA_STATE(xas, &mapping->i_pages, start);
				711	struct page *page;
				712	unsigned long swapped = 0;
				713
				714	rcu_read_lock();
				715	xas_for_each(&xas, page, end - 1) {
				716	if (xas_retry(&xas, page))
				717	continue;
				718	if (xa_is_value(page))
				719	swapped++;
				720
				721	if (need_resched()) {
				722	xas_pause(&xas);
				723	cond_resched_rcu();
				724	}
				725	}
				726
				727	rcu_read_unlock();
				728
				729	return swapped << PAGE_SHIFT;
				730	}
				731
				732	/*
				733	* Determine (in bytes) how many of the shmem object's pages mapped by the
				734	* given vma is swapped out.
				735	*
				736	* This is safe to call without i_mutex or the i_pages lock thanks to RCU,
				737	* as long as the inode doesn't go away and racy results are not a problem.
				738	*/
				739	unsigned long shmem_swap_usage(struct vm_area_struct *vma)
				740	{
				741	struct inode *inode = file_inode(vma->vm_file);
				742	struct shmem_inode_info *info = SHMEM_I(inode);
				743	struct address_space *mapping = inode->i_mapping;
				744	unsigned long swapped;
				745
				746	/* Be careful as we don't hold info->lock */
				747	swapped = READ_ONCE(info->swapped);
				748
				749	/*
				750	* The easier cases are when the shmem object has nothing in swap, or
				751	* the vma maps it whole. Then we can simply use the stats that we
				752	* already track.
				753	*/
				754	if (!swapped)
				755	return 0;
				756
				757	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
				758	return swapped << PAGE_SHIFT;
				759
				760	/* Here comes the more involved part */
				761	return shmem_partial_swap_usage(mapping,
				762	linear_page_index(vma, vma->vm_start),
				763	linear_page_index(vma, vma->vm_end));
				764	}
				765
				766	/*
				767	* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
				768	*/
				769	void shmem_unlock_mapping(struct address_space *mapping)
				770	{
				771	struct pagevec pvec;
				772	pgoff_t indices[PAGEVEC_SIZE];
				773	pgoff_t index = 0;
				774
				775	pagevec_init(&pvec);
				776	/*
				777	* Minor point, but we might as well stop if someone else SHM_LOCKs it.
				778	*/
				779	while (!mapping_unevictable(mapping)) {
				780	/*
				781	* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
				782	* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
				783	*/
				784	pvec.nr = find_get_entries(mapping, index,
				785	PAGEVEC_SIZE, pvec.pages, indices);
				786	if (!pvec.nr)
				787	break;
				788	index = indices[pvec.nr - 1] + 1;
				789	pagevec_remove_exceptionals(&pvec);
				790	check_move_unevictable_pages(&pvec);
				791	pagevec_release(&pvec);
				792	cond_resched();
				793	}
				794	}
				795
				796	/*
				797	* Remove range of pages and swap entries from page cache, and free them.
				798	* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
				799	*/
				800	static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
				801	bool unfalloc)
				802	{
				803	struct address_space *mapping = inode->i_mapping;
				804	struct shmem_inode_info *info = SHMEM_I(inode);
				805	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
				806	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
				807	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
				808	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
				809	struct pagevec pvec;
				810	pgoff_t indices[PAGEVEC_SIZE];
				811	long nr_swaps_freed = 0;
				812	pgoff_t index;
				813	int i;
				814
				815	if (lend == -1)
				816	end = -1; /* unsigned, so actually very big */
				817
				818	pagevec_init(&pvec);
				819	index = start;
				820	while (index < end) {
				821	pvec.nr = find_get_entries(mapping, index,
				822	min(end - index, (pgoff_t)PAGEVEC_SIZE),
				823	pvec.pages, indices);
				824	if (!pvec.nr)
				825	break;
				826	for (i = 0; i < pagevec_count(&pvec); i++) {
				827	struct page *page = pvec.pages[i];
				828
				829	index = indices[i];
				830	if (index >= end)
				831	break;
				832
				833	if (xa_is_value(page)) {
				834	if (unfalloc)
				835	continue;
				836	nr_swaps_freed += !shmem_free_swap(mapping,
				837	index, page);
				838	continue;
				839	}
				840
				841	VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
				842
				843	if (!trylock_page(page))
				844	continue;
				845
				846	if (PageTransTail(page)) {
				847	/* Middle of THP: zero out the page */
				848	clear_highpage(page);
				849	unlock_page(page);
				850	continue;
				851	} else if (PageTransHuge(page)) {
				852	if (index == round_down(end, HPAGE_PMD_NR)) {
				853	/*
				854	* Range ends in the middle of THP:
				855	* zero out the page
				856	*/
				857	clear_highpage(page);
				858	unlock_page(page);
				859	continue;
				860	}
				861	index += HPAGE_PMD_NR - 1;
				862	i += HPAGE_PMD_NR - 1;
				863	}
				864
				865	if (!unfalloc \|\| !PageUptodate(page)) {
				866	VM_BUG_ON_PAGE(PageTail(page), page);
				867	if (page_mapping(page) == mapping) {
				868	VM_BUG_ON_PAGE(PageWriteback(page), page);
				869	truncate_inode_page(mapping, page);
				870	}
				871	}
				872	unlock_page(page);
				873	}
				874	pagevec_remove_exceptionals(&pvec);
				875	pagevec_release(&pvec);
				876	cond_resched();
				877	index++;
				878	}
				879
				880	if (partial_start) {
				881	struct page *page = NULL;
				882	shmem_getpage(inode, start - 1, &page, SGP_READ);
				883	if (page) {
				884	unsigned int top = PAGE_SIZE;
				885	if (start > end) {
				886	top = partial_end;
				887	partial_end = 0;
				888	}
				889	zero_user_segment(page, partial_start, top);
				890	set_page_dirty(page);
				891	unlock_page(page);
				892	put_page(page);
				893	}
				894	}
				895	if (partial_end) {
				896	struct page *page = NULL;
				897	shmem_getpage(inode, end, &page, SGP_READ);
				898	if (page) {
				899	zero_user_segment(page, 0, partial_end);
				900	set_page_dirty(page);
				901	unlock_page(page);
				902	put_page(page);
				903	}
				904	}
				905	if (start >= end)
				906	return;
				907
				908	index = start;
				909	while (index < end) {
				910	cond_resched();
				911
				912	pvec.nr = find_get_entries(mapping, index,
				913	min(end - index, (pgoff_t)PAGEVEC_SIZE),
				914	pvec.pages, indices);
				915	if (!pvec.nr) {
				916	/* If all gone or hole-punch or unfalloc, we're done */
				917	if (index == start \|\| end != -1)
				918	break;
				919	/* But if truncating, restart to make sure all gone */
				920	index = start;
				921	continue;
				922	}
				923	for (i = 0; i < pagevec_count(&pvec); i++) {
				924	struct page *page = pvec.pages[i];
				925
				926	index = indices[i];
				927	if (index >= end)
				928	break;
				929
				930	if (xa_is_value(page)) {
				931	if (unfalloc)
				932	continue;
				933	if (shmem_free_swap(mapping, index, page)) {
				934	/* Swap was replaced by page: retry */
				935	index--;
				936	break;
				937	}
				938	nr_swaps_freed++;
				939	continue;
				940	}
				941
				942	lock_page(page);
				943
				944	if (PageTransTail(page)) {
				945	/* Middle of THP: zero out the page */
				946	clear_highpage(page);
				947	unlock_page(page);
				948	/*
				949	* Partial thp truncate due 'start' in middle
				950	* of THP: don't need to look on these pages
				951	* again on !pvec.nr restart.
				952	*/
				953	if (index != round_down(end, HPAGE_PMD_NR))
				954	start++;
				955	continue;
				956	} else if (PageTransHuge(page)) {
				957	if (index == round_down(end, HPAGE_PMD_NR)) {
				958	/*
				959	* Range ends in the middle of THP:
				960	* zero out the page
				961	*/
				962	clear_highpage(page);
				963	unlock_page(page);
				964	continue;
				965	}
				966	index += HPAGE_PMD_NR - 1;
				967	i += HPAGE_PMD_NR - 1;
				968	}
				969
				970	if (!unfalloc \|\| !PageUptodate(page)) {
				971	VM_BUG_ON_PAGE(PageTail(page), page);
				972	if (page_mapping(page) == mapping) {
				973	VM_BUG_ON_PAGE(PageWriteback(page), page);
				974	truncate_inode_page(mapping, page);
				975	} else {
				976	/* Page was replaced by swap: retry */
				977	unlock_page(page);
				978	index--;
				979	break;
				980	}
				981	}
				982	unlock_page(page);
				983	}
				984	pagevec_remove_exceptionals(&pvec);
				985	pagevec_release(&pvec);
				986	index++;
				987	}
				988
				989	spin_lock_irq(&info->lock);
				990	info->swapped -= nr_swaps_freed;
				991	shmem_recalc_inode(inode);
				992	spin_unlock_irq(&info->lock);
				993	}
				994
				995	void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
				996	{
				997	shmem_undo_range(inode, lstart, lend, false);
				998	inode->i_ctime = inode->i_mtime = current_time(inode);
				999	}
				1000	EXPORT_SYMBOL_GPL(shmem_truncate_range);
				1001
				1002	static int shmem_getattr(const struct path path, struct kstat stat,
				1003	u32 request_mask, unsigned int query_flags)
				1004	{
				1005	struct inode *inode = path->dentry->d_inode;
				1006	struct shmem_inode_info *info = SHMEM_I(inode);
				1007	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
				1008
				1009	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
				1010	spin_lock_irq(&info->lock);
				1011	shmem_recalc_inode(inode);
				1012	spin_unlock_irq(&info->lock);
				1013	}
				1014	generic_fillattr(inode, stat);
				1015
				1016	if (is_huge_enabled(sb_info))
				1017	stat->blksize = HPAGE_PMD_SIZE;
				1018
				1019	return 0;
				1020	}
				1021
				1022	static int shmem_setattr(struct dentry dentry, struct iattr attr)
				1023	{
				1024	struct inode *inode = d_inode(dentry);
				1025	struct shmem_inode_info *info = SHMEM_I(inode);
				1026	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				1027	int error;
				1028
				1029	error = setattr_prepare(dentry, attr);
				1030	if (error)
				1031	return error;
				1032
				1033	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
				1034	loff_t oldsize = inode->i_size;
				1035	loff_t newsize = attr->ia_size;
				1036
				1037	/* protected by i_mutex */
				1038	if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) \|\|
				1039	(newsize > oldsize && (info->seals & F_SEAL_GROW)))
				1040	return -EPERM;
				1041
				1042	if (newsize != oldsize) {
				1043	error = shmem_reacct_size(SHMEM_I(inode)->flags,
				1044	oldsize, newsize);
				1045	if (error)
				1046	return error;
				1047	i_size_write(inode, newsize);
				1048	inode->i_ctime = inode->i_mtime = current_time(inode);
				1049	}
				1050	if (newsize <= oldsize) {
				1051	loff_t holebegin = round_up(newsize, PAGE_SIZE);
				1052	if (oldsize > holebegin)
				1053	unmap_mapping_range(inode->i_mapping,
				1054	holebegin, 0, 1);
				1055	if (info->alloced)
				1056	shmem_truncate_range(inode,
				1057	newsize, (loff_t)-1);
				1058	/* unmap again to remove racily COWed private pages */
				1059	if (oldsize > holebegin)
				1060	unmap_mapping_range(inode->i_mapping,
				1061	holebegin, 0, 1);
				1062
				1063	/*
				1064	* Part of the huge page can be beyond i_size: subject
				1065	* to shrink under memory pressure.
				1066	*/
				1067	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				1068	spin_lock(&sbinfo->shrinklist_lock);
				1069	/*
				1070	* _careful to defend against unlocked access to
				1071	* ->shrink_list in shmem_unused_huge_shrink()
				1072	*/
				1073	if (list_empty_careful(&info->shrinklist)) {
				1074	list_add_tail(&info->shrinklist,
				1075	&sbinfo->shrinklist);
				1076	sbinfo->shrinklist_len++;
				1077	}
				1078	spin_unlock(&sbinfo->shrinklist_lock);
				1079	}
				1080	}
				1081	}
				1082
				1083	setattr_copy(inode, attr);
				1084	if (attr->ia_valid & ATTR_MODE)
				1085	error = posix_acl_chmod(inode, inode->i_mode);
				1086	return error;
				1087	}
				1088
				1089	static void shmem_evict_inode(struct inode *inode)
				1090	{
				1091	struct shmem_inode_info *info = SHMEM_I(inode);
				1092	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				1093
				1094	if (inode->i_mapping->a_ops == &shmem_aops) {
				1095	shmem_unacct_size(info->flags, inode->i_size);
				1096	inode->i_size = 0;
				1097	shmem_truncate_range(inode, 0, (loff_t)-1);
				1098	if (!list_empty(&info->shrinklist)) {
				1099	spin_lock(&sbinfo->shrinklist_lock);
				1100	if (!list_empty(&info->shrinklist)) {
				1101	list_del_init(&info->shrinklist);
				1102	sbinfo->shrinklist_len--;
				1103	}
				1104	spin_unlock(&sbinfo->shrinklist_lock);
				1105	}
				1106	while (!list_empty(&info->swaplist)) {
				1107	/* Wait while shmem_unuse() is scanning this inode... */
				1108	wait_var_event(&info->stop_eviction,
				1109	!atomic_read(&info->stop_eviction));
				1110	mutex_lock(&shmem_swaplist_mutex);
				1111	/* ...but beware of the race if we peeked too early */
				1112	if (!atomic_read(&info->stop_eviction))
				1113	list_del_init(&info->swaplist);
				1114	mutex_unlock(&shmem_swaplist_mutex);
				1115	}
				1116	}
				1117
				1118	simple_xattrs_free(&info->xattrs);
				1119	WARN_ON(inode->i_blocks);
				1120	shmem_free_inode(inode->i_sb);
				1121	clear_inode(inode);
				1122	}
				1123
				1124	extern struct swap_info_struct *swap_info[];
				1125
				1126	static int shmem_find_swap_entries(struct address_space *mapping,
				1127	pgoff_t start, unsigned int nr_entries,
				1128	struct page *entries, pgoff_t indices,
				1129	unsigned int type, bool frontswap)
				1130	{
				1131	XA_STATE(xas, &mapping->i_pages, start);
				1132	struct page *page;
				1133	swp_entry_t entry;
				1134	unsigned int ret = 0;
				1135
				1136	if (!nr_entries)
				1137	return 0;
				1138
				1139	rcu_read_lock();
				1140	xas_for_each(&xas, page, ULONG_MAX) {
				1141	if (xas_retry(&xas, page))
				1142	continue;
				1143
				1144	if (!xa_is_value(page))
				1145	continue;
				1146
				1147	entry = radix_to_swp_entry(page);
				1148	if (swp_type(entry) != type)
				1149	continue;
				1150	if (frontswap &&
				1151	!frontswap_test(swap_info[type], swp_offset(entry)))
				1152	continue;
				1153
				1154	indices[ret] = xas.xa_index;
				1155	entries[ret] = page;
				1156
				1157	if (need_resched()) {
				1158	xas_pause(&xas);
				1159	cond_resched_rcu();
				1160	}
				1161	if (++ret == nr_entries)
				1162	break;
				1163	}
				1164	rcu_read_unlock();
				1165
				1166	return ret;
				1167	}
				1168
				1169	/*
				1170	* Move the swapped pages for an inode to page cache. Returns the count
				1171	* of pages swapped in, or the error in case of failure.
				1172	*/
				1173	static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
				1174	pgoff_t *indices)
				1175	{
				1176	int i = 0;
				1177	int ret = 0;
				1178	int error = 0;
				1179	struct address_space *mapping = inode->i_mapping;
				1180
				1181	for (i = 0; i < pvec.nr; i++) {
				1182	struct page *page = pvec.pages[i];
				1183
				1184	if (!xa_is_value(page))
				1185	continue;
				1186	error = shmem_swapin_page(inode, indices[i],
				1187	&page, SGP_CACHE,
				1188	mapping_gfp_mask(mapping),
				1189	NULL, NULL);
				1190	if (error == 0) {
				1191	unlock_page(page);
				1192	put_page(page);
				1193	ret++;
				1194	}
				1195	if (error == -ENOMEM)
				1196	break;
				1197	error = 0;
				1198	}
				1199	return error ? error : ret;
				1200	}
				1201
				1202	/*
				1203	* If swap found in inode, free it and move page from swapcache to filecache.
				1204	*/
				1205	static int shmem_unuse_inode(struct inode *inode, unsigned int type,
				1206	bool frontswap, unsigned long *fs_pages_to_unuse)
				1207	{
				1208	struct address_space *mapping = inode->i_mapping;
				1209	pgoff_t start = 0;
				1210	struct pagevec pvec;
				1211	pgoff_t indices[PAGEVEC_SIZE];
				1212	bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
				1213	int ret = 0;
				1214
				1215	pagevec_init(&pvec);
				1216	do {
				1217	unsigned int nr_entries = PAGEVEC_SIZE;
				1218
				1219	if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
				1220	nr_entries = *fs_pages_to_unuse;
				1221
				1222	pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
				1223	pvec.pages, indices,
				1224	type, frontswap);
				1225	if (pvec.nr == 0) {
				1226	ret = 0;
				1227	break;
				1228	}
				1229
				1230	ret = shmem_unuse_swap_entries(inode, pvec, indices);
				1231	if (ret < 0)
				1232	break;
				1233
				1234	if (frontswap_partial) {
				1235	*fs_pages_to_unuse -= ret;
				1236	if (*fs_pages_to_unuse == 0) {
				1237	ret = FRONTSWAP_PAGES_UNUSED;
				1238	break;
				1239	}
				1240	}
				1241
				1242	start = indices[pvec.nr - 1];
				1243	} while (true);
				1244
				1245	return ret;
				1246	}
				1247
				1248	/*
				1249	* Read all the shared memory data that resides in the swap
				1250	* device 'type' back into memory, so the swap device can be
				1251	* unused.
				1252	*/
				1253	int shmem_unuse(unsigned int type, bool frontswap,
				1254	unsigned long *fs_pages_to_unuse)
				1255	{
				1256	struct shmem_inode_info info, next;
				1257	int error = 0;
				1258
				1259	if (list_empty(&shmem_swaplist))
				1260	return 0;
				1261
				1262	mutex_lock(&shmem_swaplist_mutex);
				1263	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
				1264	if (!info->swapped) {
				1265	list_del_init(&info->swaplist);
				1266	continue;
				1267	}
				1268	/*
				1269	* Drop the swaplist mutex while searching the inode for swap;
				1270	* but before doing so, make sure shmem_evict_inode() will not
				1271	* remove placeholder inode from swaplist, nor let it be freed
				1272	* (igrab() would protect from unlink, but not from unmount).
				1273	*/
				1274	atomic_inc(&info->stop_eviction);
				1275	mutex_unlock(&shmem_swaplist_mutex);
				1276
				1277	error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
				1278	fs_pages_to_unuse);
				1279	cond_resched();
				1280
				1281	mutex_lock(&shmem_swaplist_mutex);
				1282	next = list_next_entry(info, swaplist);
				1283	if (!info->swapped)
				1284	list_del_init(&info->swaplist);
				1285	if (atomic_dec_and_test(&info->stop_eviction))
				1286	wake_up_var(&info->stop_eviction);
				1287	if (error)
				1288	break;
				1289	}
				1290	mutex_unlock(&shmem_swaplist_mutex);
				1291
				1292	return error;
				1293	}
				1294
				1295	/*
				1296	* Move the page from the page cache to the swap cache.
				1297	*/
				1298	static int shmem_writepage(struct page page, struct writeback_control wbc)
				1299	{
				1300	struct shmem_inode_info *info;
				1301	struct address_space *mapping;
				1302	struct inode *inode;
				1303	swp_entry_t swap;
				1304	pgoff_t index;
				1305
				1306	VM_BUG_ON_PAGE(PageCompound(page), page);
				1307	BUG_ON(!PageLocked(page));
				1308	mapping = page->mapping;
				1309	index = page->index;
				1310	inode = mapping->host;
				1311	info = SHMEM_I(inode);
				1312	if (info->flags & VM_LOCKED)
				1313	goto redirty;
				1314	if (!total_swap_pages)
				1315	goto redirty;
				1316
				1317	/*
				1318	* Our capabilities prevent regular writeback or sync from ever calling
				1319	* shmem_writepage; but a stacking filesystem might use ->writepage of
				1320	* its underlying filesystem, in which case tmpfs should write out to
				1321	* swap only in response to memory pressure, and not for the writeback
				1322	* threads or sync.
				1323	*/
				1324	if (!wbc->for_reclaim) {
				1325	WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
				1326	goto redirty;
				1327	}
				1328
				1329	/*
				1330	* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
				1331	* value into swapfile.c, the only way we can correctly account for a
				1332	* fallocated page arriving here is now to initialize it and write it.
				1333	*
				1334	* That's okay for a page already fallocated earlier, but if we have
				1335	* not yet completed the fallocation, then (a) we want to keep track
				1336	* of this page in case we have to undo it, and (b) it may not be a
				1337	* good idea to continue anyway, once we're pushing into swap. So
				1338	* reactivate the page, and let shmem_fallocate() quit when too many.
				1339	*/
				1340	if (!PageUptodate(page)) {
				1341	if (inode->i_private) {
				1342	struct shmem_falloc *shmem_falloc;
				1343	spin_lock(&inode->i_lock);
				1344	shmem_falloc = inode->i_private;
				1345	if (shmem_falloc &&
				1346	!shmem_falloc->waitq &&
				1347	index >= shmem_falloc->start &&
				1348	index < shmem_falloc->next)
				1349	shmem_falloc->nr_unswapped++;
				1350	else
				1351	shmem_falloc = NULL;
				1352	spin_unlock(&inode->i_lock);
				1353	if (shmem_falloc)
				1354	goto redirty;
				1355	}
				1356	clear_highpage(page);
				1357	flush_dcache_page(page);
				1358	SetPageUptodate(page);
				1359	}
				1360
				1361	swap = get_swap_page(page);
				1362	if (!swap.val)
				1363	goto redirty;
				1364
				1365	/*
				1366	* Add inode to shmem_unuse()'s list of swapped-out inodes,
				1367	* if it's not already there. Do it now before the page is
				1368	* moved to swap cache, when its pagelock no longer protects
				1369	* the inode from eviction. But don't unlock the mutex until
				1370	* we've incremented swapped, because shmem_unuse_inode() will
				1371	* prune a !swapped inode from the swaplist under this mutex.
				1372	*/
				1373	mutex_lock(&shmem_swaplist_mutex);
				1374	if (list_empty(&info->swaplist))
				1375	list_add(&info->swaplist, &shmem_swaplist);
				1376
				1377	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
				1378	spin_lock_irq(&info->lock);
				1379	shmem_recalc_inode(inode);
				1380	info->swapped++;
				1381	spin_unlock_irq(&info->lock);
				1382
				1383	swap_shmem_alloc(swap);
				1384	shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
				1385
				1386	mutex_unlock(&shmem_swaplist_mutex);
				1387	BUG_ON(page_mapped(page));
				1388	swap_writepage(page, wbc);
				1389	return 0;
				1390	}
				1391
				1392	mutex_unlock(&shmem_swaplist_mutex);
				1393	put_swap_page(page, swap);
				1394	redirty:
				1395	set_page_dirty(page);
				1396	if (wbc->for_reclaim)
				1397	return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
				1398	unlock_page(page);
				1399	return 0;
				1400	}
				1401
				1402	#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
				1403	static void shmem_show_mpol(struct seq_file seq, struct mempolicy mpol)
				1404	{
				1405	char buffer[64];
				1406
				1407	if (!mpol \|\| mpol->mode == MPOL_DEFAULT)
				1408	return; /* show nothing */
				1409
				1410	mpol_to_str(buffer, sizeof(buffer), mpol);
				1411
				1412	seq_printf(seq, ",mpol=%s", buffer);
				1413	}
				1414
				1415	static struct mempolicy shmem_get_sbmpol(struct shmem_sb_info sbinfo)
				1416	{
				1417	struct mempolicy *mpol = NULL;
				1418	if (sbinfo->mpol) {
				1419	spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
				1420	mpol = sbinfo->mpol;
				1421	mpol_get(mpol);
				1422	spin_unlock(&sbinfo->stat_lock);
				1423	}
				1424	return mpol;
				1425	}
				1426	#else /* !CONFIG_NUMA \|\| !CONFIG_TMPFS */
				1427	static inline void shmem_show_mpol(struct seq_file seq, struct mempolicy mpol)
				1428	{
				1429	}
				1430	static inline struct mempolicy shmem_get_sbmpol(struct shmem_sb_info sbinfo)
				1431	{
				1432	return NULL;
				1433	}
				1434	#endif /* CONFIG_NUMA && CONFIG_TMPFS */
				1435	#ifndef CONFIG_NUMA
				1436	#define vm_policy vm_private_data
				1437	#endif
				1438
				1439	static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
				1440	struct shmem_inode_info *info, pgoff_t index)
				1441	{
				1442	/* Create a pseudo vma that just contains the policy */
				1443	vma_init(vma, NULL);
				1444	/* Bias interleave by inode number to distribute better across nodes */
				1445	vma->vm_pgoff = index + info->vfs_inode.i_ino;
				1446	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
				1447	}
				1448
				1449	static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
				1450	{
				1451	/* Drop reference taken by mpol_shared_policy_lookup() */
				1452	mpol_cond_put(vma->vm_policy);
				1453	}
				1454
				1455	static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
				1456	struct shmem_inode_info *info, pgoff_t index)
				1457	{
				1458	struct vm_area_struct pvma;
				1459	struct page *page;
				1460	struct vm_fault vmf;
				1461
				1462	shmem_pseudo_vma_init(&pvma, info, index);
				1463	vmf.vma = &pvma;
				1464	vmf.address = 0;
				1465	page = swap_cluster_readahead(swap, gfp, &vmf);
				1466	shmem_pseudo_vma_destroy(&pvma);
				1467
				1468	return page;
				1469	}
				1470
				1471	static struct page *shmem_alloc_hugepage(gfp_t gfp,
				1472	struct shmem_inode_info *info, pgoff_t index)
				1473	{
				1474	struct vm_area_struct pvma;
				1475	struct address_space *mapping = info->vfs_inode.i_mapping;
				1476	pgoff_t hindex;
				1477	struct page *page;
				1478
				1479	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				1480	return NULL;
				1481
				1482	hindex = round_down(index, HPAGE_PMD_NR);
				1483	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
				1484	XA_PRESENT))
				1485	return NULL;
				1486
				1487	shmem_pseudo_vma_init(&pvma, info, hindex);
				1488	page = alloc_pages_vma(gfp \| __GFP_COMP \| __GFP_NORETRY \| __GFP_NOWARN,
				1489	HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
				1490	shmem_pseudo_vma_destroy(&pvma);
				1491	if (page)
				1492	prep_transhuge_page(page);
				1493	return page;
				1494	}
				1495
				1496	static struct page *shmem_alloc_page(gfp_t gfp,
				1497	struct shmem_inode_info *info, pgoff_t index)
				1498	{
				1499	struct vm_area_struct pvma;
				1500	struct page *page;
				1501
				1502	shmem_pseudo_vma_init(&pvma, info, index);
				1503	page = alloc_page_vma(gfp, &pvma, 0);
				1504	shmem_pseudo_vma_destroy(&pvma);
				1505
				1506	return page;
				1507	}
				1508
				1509	static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
				1510	struct inode *inode,
				1511	pgoff_t index, bool huge)
				1512	{
				1513	struct shmem_inode_info *info = SHMEM_I(inode);
				1514	struct page *page;
				1515	int nr;
				1516	int err = -ENOSPC;
				1517
				1518	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				1519	huge = false;
				1520	nr = huge ? HPAGE_PMD_NR : 1;
				1521
				1522	if (!shmem_inode_acct_block(inode, nr))
				1523	goto failed;
				1524
				1525	if (huge)
				1526	page = shmem_alloc_hugepage(gfp, info, index);
				1527	else
				1528	page = shmem_alloc_page(gfp, info, index);
				1529	if (page) {
				1530	__SetPageLocked(page);
				1531	__SetPageSwapBacked(page);
				1532	return page;
				1533	}
				1534
				1535	err = -ENOMEM;
				1536	shmem_inode_unacct_blocks(inode, nr);
				1537	failed:
				1538	return ERR_PTR(err);
				1539	}
				1540
				1541	/*
				1542	* When a page is moved from swapcache to shmem filecache (either by the
				1543	* usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
				1544	* shmem_unuse_inode()), it may have been read in earlier from swap, in
				1545	* ignorance of the mapping it belongs to. If that mapping has special
				1546	* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
				1547	* we may need to copy to a suitable page before moving to filecache.
				1548	*
				1549	* In a future release, this may well be extended to respect cpuset and
				1550	* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
				1551	* but for now it is a simple matter of zone.
				1552	*/
				1553	static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
				1554	{
				1555	return page_zonenum(page) > gfp_zone(gfp);
				1556	}
				1557
				1558	static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				1559	struct shmem_inode_info *info, pgoff_t index)
				1560	{
				1561	struct page oldpage, newpage;
				1562	struct address_space *swap_mapping;
				1563	swp_entry_t entry;
				1564	pgoff_t swap_index;
				1565	int error;
				1566
				1567	oldpage = *pagep;
				1568	entry.val = page_private(oldpage);
				1569	swap_index = swp_offset(entry);
				1570	swap_mapping = page_mapping(oldpage);
				1571
				1572	/*
				1573	* We have arrived here because our zones are constrained, so don't
				1574	* limit chance of success by further cpuset and node constraints.
				1575	*/
				1576	gfp &= ~GFP_CONSTRAINT_MASK;
				1577	newpage = shmem_alloc_page(gfp, info, index);
				1578	if (!newpage)
				1579	return -ENOMEM;
				1580
				1581	get_page(newpage);
				1582	copy_highpage(newpage, oldpage);
				1583	flush_dcache_page(newpage);
				1584
				1585	__SetPageLocked(newpage);
				1586	__SetPageSwapBacked(newpage);
				1587	SetPageUptodate(newpage);
				1588	set_page_private(newpage, entry.val);
				1589	SetPageSwapCache(newpage);
				1590
				1591	/*
				1592	* Our caller will very soon move newpage out of swapcache, but it's
				1593	* a nice clean interface for us to replace oldpage by newpage there.
				1594	*/
				1595	xa_lock_irq(&swap_mapping->i_pages);
				1596	error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
				1597	if (!error) {
				1598	__inc_node_page_state(newpage, NR_FILE_PAGES);
				1599	__dec_node_page_state(oldpage, NR_FILE_PAGES);
				1600	}
				1601	xa_unlock_irq(&swap_mapping->i_pages);
				1602
				1603	if (unlikely(error)) {
				1604	/*
				1605	* Is this possible? I think not, now that our callers check
				1606	* both PageSwapCache and page_private after getting page lock;
				1607	* but be defensive. Reverse old to newpage for clear and free.
				1608	*/
				1609	oldpage = newpage;
				1610	} else {
				1611	mem_cgroup_migrate(oldpage, newpage);
				1612	lru_cache_add_anon(newpage);
				1613	*pagep = newpage;
				1614	}
				1615
				1616	ClearPageSwapCache(oldpage);
				1617	set_page_private(oldpage, 0);
				1618
				1619	unlock_page(oldpage);
				1620	put_page(oldpage);
				1621	put_page(oldpage);
				1622	return error;
				1623	}
				1624
				1625	/*
				1626	* Swap in the page pointed to by *pagep.
				1627	* Caller has to make sure that *pagep contains a valid swapped page.
				1628	* Returns 0 and the page in pagep if success. On failure, returns the
				1629	* the error code and NULL in *pagep.
				1630	*/
				1631	static int shmem_swapin_page(struct inode *inode, pgoff_t index,
				1632	struct page **pagep, enum sgp_type sgp,
				1633	gfp_t gfp, struct vm_area_struct *vma,
				1634	vm_fault_t *fault_type)
				1635	{
				1636	struct address_space *mapping = inode->i_mapping;
				1637	struct shmem_inode_info *info = SHMEM_I(inode);
				1638	struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
				1639	struct mem_cgroup *memcg;
				1640	struct page *page;
				1641	swp_entry_t swap;
				1642	int error;
				1643
				1644	VM_BUG_ON(!pagep \|\| !xa_is_value(pagep));
				1645	swap = radix_to_swp_entry(*pagep);
				1646	*pagep = NULL;
				1647
				1648	/* Look it up and read it in.. */
				1649	page = lookup_swap_cache(swap, NULL, 0);
				1650	if (!page) {
				1651	/* Or update major stats only when swapin succeeds?? */
				1652	if (fault_type) {
				1653	*fault_type \|= VM_FAULT_MAJOR;
				1654	count_vm_event(PGMAJFAULT);
				1655	count_memcg_event_mm(charge_mm, PGMAJFAULT);
				1656	}
				1657	/* Here we actually start the io */
				1658	page = shmem_swapin(swap, gfp, info, index);
				1659	if (!page) {
				1660	error = -ENOMEM;
				1661	goto failed;
				1662	}
				1663	}
				1664
				1665	/* We have to do this with page locked to prevent races */
				1666	lock_page(page);
				1667	if (!PageSwapCache(page) \|\| page_private(page) != swap.val \|\|
				1668	!shmem_confirm_swap(mapping, index, swap)) {
				1669	error = -EEXIST;
				1670	goto unlock;
				1671	}
				1672	if (!PageUptodate(page)) {
				1673	error = -EIO;
				1674	goto failed;
				1675	}
				1676	wait_on_page_writeback(page);
				1677
				1678	if (shmem_should_replace_page(page, gfp)) {
				1679	error = shmem_replace_page(&page, gfp, info, index);
				1680	if (error)
				1681	goto failed;
				1682	}
				1683
				1684	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
				1685	false);
				1686	if (!error) {
				1687	error = shmem_add_to_page_cache(page, mapping, index,
				1688	swp_to_radix_entry(swap), gfp);
				1689	/*
				1690	* We already confirmed swap under page lock, and make
				1691	* no memory allocation here, so usually no possibility
				1692	* of error; but free_swap_and_cache() only trylocks a
				1693	* page, so it is just possible that the entry has been
				1694	* truncated or holepunched since swap was confirmed.
				1695	* shmem_undo_range() will have done some of the
				1696	* unaccounting, now delete_from_swap_cache() will do
				1697	* the rest.
				1698	*/
				1699	if (error) {
				1700	mem_cgroup_cancel_charge(page, memcg, false);
				1701	delete_from_swap_cache(page);
				1702	}
				1703	}
				1704	if (error)
				1705	goto failed;
				1706
				1707	mem_cgroup_commit_charge(page, memcg, true, false);
				1708
				1709	spin_lock_irq(&info->lock);
				1710	info->swapped--;
				1711	shmem_recalc_inode(inode);
				1712	spin_unlock_irq(&info->lock);
				1713
				1714	if (sgp == SGP_WRITE)
				1715	mark_page_accessed(page);
				1716
				1717	delete_from_swap_cache(page);
				1718	set_page_dirty(page);
				1719	swap_free(swap);
				1720
				1721	*pagep = page;
				1722	return 0;
				1723	failed:
				1724	if (!shmem_confirm_swap(mapping, index, swap))
				1725	error = -EEXIST;
				1726	unlock:
				1727	if (page) {
				1728	unlock_page(page);
				1729	put_page(page);
				1730	}
				1731
				1732	return error;
				1733	}
				1734
				1735	/*
				1736	* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
				1737	*
				1738	* If we allocate a new one we do not mark it dirty. That's up to the
				1739	* vm. If we swap it in we mark it dirty since we also free the swap
				1740	* entry since a page cannot live in both the swap and page cache.
				1741	*
				1742	* vma, vmf, and fault_type are only supplied by shmem_fault:
				1743	* otherwise they are NULL.
				1744	*/
				1745	static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
				1746	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
				1747	struct vm_area_struct vma, struct vm_fault vmf,
				1748	vm_fault_t *fault_type)
				1749	{
				1750	struct address_space *mapping = inode->i_mapping;
				1751	struct shmem_inode_info *info = SHMEM_I(inode);
				1752	struct shmem_sb_info *sbinfo;
				1753	struct mm_struct *charge_mm;
				1754	struct mem_cgroup *memcg;
				1755	struct page *page;
				1756	enum sgp_type sgp_huge = sgp;
				1757	pgoff_t hindex = index;
				1758	int error;
				1759	int once = 0;
				1760	int alloced = 0;
				1761
				1762	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
				1763	return -EFBIG;
				1764	if (sgp == SGP_NOHUGE \|\| sgp == SGP_HUGE)
				1765	sgp = SGP_CACHE;
				1766	repeat:
				1767	if (sgp <= SGP_CACHE &&
				1768	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
				1769	return -EINVAL;
				1770	}
				1771
				1772	sbinfo = SHMEM_SB(inode->i_sb);
				1773	charge_mm = vma ? vma->vm_mm : current->mm;
				1774
				1775	page = find_lock_entry(mapping, index);
				1776
				1777	if (page && vma && userfaultfd_minor(vma)) {
				1778	if (!xa_is_value(page)) {
				1779	unlock_page(page);
				1780	put_page(page);
				1781	}
				1782	*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
				1783	return 0;
				1784	}
				1785
				1786	if (xa_is_value(page)) {
				1787	error = shmem_swapin_page(inode, index, &page,
				1788	sgp, gfp, vma, fault_type);
				1789	if (error == -EEXIST)
				1790	goto repeat;
				1791
				1792	*pagep = page;
				1793	return error;
				1794	}
				1795
				1796	if (page && sgp == SGP_WRITE)
				1797	mark_page_accessed(page);
				1798
				1799	/* fallocated page? */
				1800	if (page && !PageUptodate(page)) {
				1801	if (sgp != SGP_READ)
				1802	goto clear;
				1803	unlock_page(page);
				1804	put_page(page);
				1805	page = NULL;
				1806	}
				1807	if (page \|\| sgp == SGP_READ) {
				1808	*pagep = page;
				1809	return 0;
				1810	}
				1811
				1812	/*
				1813	* Fast cache lookup did not find it:
				1814	* bring it back from swap or allocate.
				1815	*/
				1816
				1817	if (vma && userfaultfd_missing(vma)) {
				1818	*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
				1819	return 0;
				1820	}
				1821
				1822	/* shmem_symlink() */
				1823	if (mapping->a_ops != &shmem_aops)
				1824	goto alloc_nohuge;
				1825	if (shmem_huge == SHMEM_HUGE_DENY \|\| sgp_huge == SGP_NOHUGE)
				1826	goto alloc_nohuge;
				1827	if (shmem_huge == SHMEM_HUGE_FORCE)
				1828	goto alloc_huge;
				1829	switch (sbinfo->huge) {
				1830	loff_t i_size;
				1831	pgoff_t off;
				1832	case SHMEM_HUGE_NEVER:
				1833	goto alloc_nohuge;
				1834	case SHMEM_HUGE_WITHIN_SIZE:
				1835	off = round_up(index, HPAGE_PMD_NR);
				1836	i_size = round_up(i_size_read(inode), PAGE_SIZE);
				1837	if (i_size >= HPAGE_PMD_SIZE &&
				1838	i_size >> PAGE_SHIFT >= off)
				1839	goto alloc_huge;
				1840	/* fallthrough */
				1841	case SHMEM_HUGE_ADVISE:
				1842	if (sgp_huge == SGP_HUGE)
				1843	goto alloc_huge;
				1844	/* TODO: implement fadvise() hints */
				1845	goto alloc_nohuge;
				1846	}
				1847
				1848	alloc_huge:
				1849	page = shmem_alloc_and_acct_page(gfp, inode, index, true);
				1850	if (IS_ERR(page)) {
				1851	alloc_nohuge:
				1852	page = shmem_alloc_and_acct_page(gfp, inode,
				1853	index, false);
				1854	}
				1855	if (IS_ERR(page)) {
				1856	int retry = 5;
				1857
				1858	error = PTR_ERR(page);
				1859	page = NULL;
				1860	if (error != -ENOSPC)
				1861	goto unlock;
				1862	/*
				1863	* Try to reclaim some space by splitting a huge page
				1864	* beyond i_size on the filesystem.
				1865	*/
				1866	while (retry--) {
				1867	int ret;
				1868
				1869	ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
				1870	if (ret == SHRINK_STOP)
				1871	break;
				1872	if (ret)
				1873	goto alloc_nohuge;
				1874	}
				1875	goto unlock;
				1876	}
				1877
				1878	if (PageTransHuge(page))
				1879	hindex = round_down(index, HPAGE_PMD_NR);
				1880	else
				1881	hindex = index;
				1882
				1883	if (sgp == SGP_WRITE)
				1884	__SetPageReferenced(page);
				1885
				1886	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
				1887	PageTransHuge(page));
				1888	if (error)
				1889	goto unacct;
				1890	error = shmem_add_to_page_cache(page, mapping, hindex,
				1891	NULL, gfp & GFP_RECLAIM_MASK);
				1892	if (error) {
				1893	mem_cgroup_cancel_charge(page, memcg,
				1894	PageTransHuge(page));
				1895	goto unacct;
				1896	}
				1897	mem_cgroup_commit_charge(page, memcg, false,
				1898	PageTransHuge(page));
				1899	lru_cache_add_anon(page);
				1900
				1901	spin_lock_irq(&info->lock);
				1902	info->alloced += compound_nr(page);
				1903	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
				1904	shmem_recalc_inode(inode);
				1905	spin_unlock_irq(&info->lock);
				1906	alloced = true;
				1907
				1908	if (PageTransHuge(page) &&
				1909	DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
				1910	hindex + HPAGE_PMD_NR - 1) {
				1911	/*
				1912	* Part of the huge page is beyond i_size: subject
				1913	* to shrink under memory pressure.
				1914	*/
				1915	spin_lock(&sbinfo->shrinklist_lock);
				1916	/*
				1917	* _careful to defend against unlocked access to
				1918	* ->shrink_list in shmem_unused_huge_shrink()
				1919	*/
				1920	if (list_empty_careful(&info->shrinklist)) {
				1921	list_add_tail(&info->shrinklist,
				1922	&sbinfo->shrinklist);
				1923	sbinfo->shrinklist_len++;
				1924	}
				1925	spin_unlock(&sbinfo->shrinklist_lock);
				1926	}
				1927
				1928	/*
				1929	* Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
				1930	*/
				1931	if (sgp == SGP_FALLOC)
				1932	sgp = SGP_WRITE;
				1933	clear:
				1934	/*
				1935	* Let SGP_WRITE caller clear ends if write does not fill page;
				1936	* but SGP_FALLOC on a page fallocated earlier must initialize
				1937	* it now, lest undo on failure cancel our earlier guarantee.
				1938	*/
				1939	if (sgp != SGP_WRITE && !PageUptodate(page)) {
				1940	struct page *head = compound_head(page);
				1941	int i;
				1942
				1943	for (i = 0; i < compound_nr(head); i++) {
				1944	clear_highpage(head + i);
				1945	flush_dcache_page(head + i);
				1946	}
				1947	SetPageUptodate(head);
				1948	}
				1949
				1950	/* Perhaps the file has been truncated since we checked */
				1951	if (sgp <= SGP_CACHE &&
				1952	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
				1953	if (alloced) {
				1954	ClearPageDirty(page);
				1955	delete_from_page_cache(page);
				1956	spin_lock_irq(&info->lock);
				1957	shmem_recalc_inode(inode);
				1958	spin_unlock_irq(&info->lock);
				1959	}
				1960	error = -EINVAL;
				1961	goto unlock;
				1962	}
				1963	*pagep = page + index - hindex;
				1964	return 0;
				1965
				1966	/*
				1967	* Error recovery.
				1968	*/
				1969	unacct:
				1970	shmem_inode_unacct_blocks(inode, compound_nr(page));
				1971
				1972	if (PageTransHuge(page)) {
				1973	unlock_page(page);
				1974	put_page(page);
				1975	goto alloc_nohuge;
				1976	}
				1977	unlock:
				1978	if (page) {
				1979	unlock_page(page);
				1980	put_page(page);
				1981	}
				1982	if (error == -ENOSPC && !once++) {
				1983	spin_lock_irq(&info->lock);
				1984	shmem_recalc_inode(inode);
				1985	spin_unlock_irq(&info->lock);
				1986	goto repeat;
				1987	}
				1988	if (error == -EEXIST)
				1989	goto repeat;
				1990	return error;
				1991	}
				1992
				1993	/*
				1994	* This is like autoremove_wake_function, but it removes the wait queue
				1995	* entry unconditionally - even if something else had already woken the
				1996	* target.
				1997	*/
				1998	static int synchronous_wake_function(wait_queue_entry_t wait, unsigned mode, int sync, void key)
				1999	{
				2000	int ret = default_wake_function(wait, mode, sync, key);
				2001	list_del_init(&wait->entry);
				2002	return ret;
				2003	}
				2004
				2005	static vm_fault_t shmem_fault(struct vm_fault *vmf)
				2006	{
				2007	struct vm_area_struct *vma = vmf->vma;
				2008	struct inode *inode = file_inode(vma->vm_file);
				2009	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
				2010	enum sgp_type sgp;
				2011	int err;
				2012	vm_fault_t ret = VM_FAULT_LOCKED;
				2013
				2014	/*
				2015	* Trinity finds that probing a hole which tmpfs is punching can
				2016	* prevent the hole-punch from ever completing: which in turn
				2017	* locks writers out with its hold on i_mutex. So refrain from
				2018	* faulting pages into the hole while it's being punched. Although
				2019	* shmem_undo_range() does remove the additions, it may be unable to
				2020	* keep up, as each new page needs its own unmap_mapping_range() call,
				2021	* and the i_mmap tree grows ever slower to scan if new vmas are added.
				2022	*
				2023	* It does not matter if we sometimes reach this check just before the
				2024	* hole-punch begins, so that one fault then races with the punch:
				2025	* we just need to make racing faults a rare case.
				2026	*
				2027	* The implementation below would be much simpler if we just used a
				2028	* standard mutex or completion: but we cannot take i_mutex in fault,
				2029	* and bloating every shmem inode for this unlikely case would be sad.
				2030	*/
				2031	if (unlikely(inode->i_private)) {
				2032	struct shmem_falloc *shmem_falloc;
				2033
				2034	spin_lock(&inode->i_lock);
				2035	shmem_falloc = inode->i_private;
				2036	if (shmem_falloc &&
				2037	shmem_falloc->waitq &&
				2038	vmf->pgoff >= shmem_falloc->start &&
				2039	vmf->pgoff < shmem_falloc->next) {
				2040	struct file *fpin;
				2041	wait_queue_head_t *shmem_falloc_waitq;
				2042	DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
				2043
				2044	ret = VM_FAULT_NOPAGE;
				2045	fpin = maybe_unlock_mmap_for_io(vmf, NULL);
				2046	if (fpin)
				2047	ret = VM_FAULT_RETRY;
				2048
				2049	shmem_falloc_waitq = shmem_falloc->waitq;
				2050	prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
				2051	TASK_UNINTERRUPTIBLE);
				2052	spin_unlock(&inode->i_lock);
				2053	schedule();
				2054
				2055	/*
				2056	* shmem_falloc_waitq points into the shmem_fallocate()
				2057	* stack of the hole-punching task: shmem_falloc_waitq
				2058	* is usually invalid by the time we reach here, but
				2059	* finish_wait() does not dereference it in that case;
				2060	* though i_lock needed lest racing with wake_up_all().
				2061	*/
				2062	spin_lock(&inode->i_lock);
				2063	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
				2064	spin_unlock(&inode->i_lock);
				2065
				2066	if (fpin)
				2067	fput(fpin);
				2068	return ret;
				2069	}
				2070	spin_unlock(&inode->i_lock);
				2071	}
				2072
				2073	sgp = SGP_CACHE;
				2074
				2075	if ((vma->vm_flags & VM_NOHUGEPAGE) \|\|
				2076	test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
				2077	sgp = SGP_NOHUGE;
				2078	else if (vma->vm_flags & VM_HUGEPAGE)
				2079	sgp = SGP_HUGE;
				2080
				2081	err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
				2082	gfp, vma, vmf, &ret);
				2083	if (err)
				2084	return vmf_error(err);
				2085	return ret;
				2086	}
				2087
				2088	unsigned long shmem_get_unmapped_area(struct file *file,
				2089	unsigned long uaddr, unsigned long len,
				2090	unsigned long pgoff, unsigned long flags)
				2091	{
				2092	unsigned long (get_area)(struct file ,
				2093	unsigned long, unsigned long, unsigned long, unsigned long);
				2094	unsigned long addr;
				2095	unsigned long offset;
				2096	unsigned long inflated_len;
				2097	unsigned long inflated_addr;
				2098	unsigned long inflated_offset;
				2099
				2100	if (len > TASK_SIZE)
				2101	return -ENOMEM;
				2102
				2103	get_area = current->mm->get_unmapped_area;
				2104	addr = get_area(file, uaddr, len, pgoff, flags);
				2105
				2106	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				2107	return addr;
				2108	if (IS_ERR_VALUE(addr))
				2109	return addr;
				2110	if (addr & ~PAGE_MASK)
				2111	return addr;
				2112	if (addr > TASK_SIZE - len)
				2113	return addr;
				2114
				2115	if (shmem_huge == SHMEM_HUGE_DENY)
				2116	return addr;
				2117	if (len < HPAGE_PMD_SIZE)
				2118	return addr;
				2119	if (flags & MAP_FIXED)
				2120	return addr;
				2121	/*
				2122	* Our priority is to support MAP_SHARED mapped hugely;
				2123	* and support MAP_PRIVATE mapped hugely too, until it is COWed.
				2124	* But if caller specified an address hint and we allocated area there
				2125	* successfully, respect that as before.
				2126	*/
				2127	if (uaddr == addr)
				2128	return addr;
				2129
				2130	if (shmem_huge != SHMEM_HUGE_FORCE) {
				2131	struct super_block *sb;
				2132
				2133	if (file) {
				2134	VM_BUG_ON(file->f_op != &shmem_file_operations);
				2135	sb = file_inode(file)->i_sb;
				2136	} else {
				2137	/*
				2138	* Called directly from mm/mmap.c, or drivers/char/mem.c
				2139	* for "/dev/zero", to create a shared anonymous object.
				2140	*/
				2141	if (IS_ERR(shm_mnt))
				2142	return addr;
				2143	sb = shm_mnt->mnt_sb;
				2144	}
				2145	if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
				2146	return addr;
				2147	}
				2148
				2149	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
				2150	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
				2151	return addr;
				2152	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
				2153	return addr;
				2154
				2155	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
				2156	if (inflated_len > TASK_SIZE)
				2157	return addr;
				2158	if (inflated_len < len)
				2159	return addr;
				2160
				2161	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
				2162	if (IS_ERR_VALUE(inflated_addr))
				2163	return addr;
				2164	if (inflated_addr & ~PAGE_MASK)
				2165	return addr;
				2166
				2167	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
				2168	inflated_addr += offset - inflated_offset;
				2169	if (inflated_offset > offset)
				2170	inflated_addr += HPAGE_PMD_SIZE;
				2171
				2172	if (inflated_addr > TASK_SIZE - len)
				2173	return addr;
				2174	return inflated_addr;
				2175	}
				2176
				2177	#ifdef CONFIG_NUMA
				2178	static int shmem_set_policy(struct vm_area_struct vma, struct mempolicy mpol)
				2179	{
				2180	struct inode *inode = file_inode(vma->vm_file);
				2181	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
				2182	}
				2183
				2184	static struct mempolicy shmem_get_policy(struct vm_area_struct vma,
				2185	unsigned long addr)
				2186	{
				2187	struct inode *inode = file_inode(vma->vm_file);
				2188	pgoff_t index;
				2189
				2190	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
				2191	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
				2192	}
				2193	#endif
				2194
				2195	int shmem_lock(struct file file, int lock, struct user_struct user)
				2196	{
				2197	struct inode *inode = file_inode(file);
				2198	struct shmem_inode_info *info = SHMEM_I(inode);
				2199	int retval = -ENOMEM;
				2200
				2201	/*
				2202	* What serializes the accesses to info->flags?
				2203	* ipc_lock_object() when called from shmctl_do_lock(),
				2204	* no serialization needed when called from shm_destroy().
				2205	*/
				2206	if (lock && !(info->flags & VM_LOCKED)) {
				2207	if (!user_shm_lock(inode->i_size, user))
				2208	goto out_nomem;
				2209	info->flags \|= VM_LOCKED;
				2210	mapping_set_unevictable(file->f_mapping);
				2211	}
				2212	if (!lock && (info->flags & VM_LOCKED) && user) {
				2213	user_shm_unlock(inode->i_size, user);
				2214	info->flags &= ~VM_LOCKED;
				2215	mapping_clear_unevictable(file->f_mapping);
				2216	}
				2217	retval = 0;
				2218
				2219	out_nomem:
				2220	return retval;
				2221	}
				2222
				2223	static int shmem_mmap(struct file file, struct vm_area_struct vma)
				2224	{
				2225	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
				2226	int ret;
				2227
				2228	ret = seal_check_future_write(info->seals, vma);
				2229	if (ret)
				2230	return ret;
				2231
				2232	file_accessed(file);
				2233	vma->vm_ops = &shmem_vm_ops;
				2234	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				2235	((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
				2236	(vma->vm_end & HPAGE_PMD_MASK)) {
				2237	khugepaged_enter(vma, vma->vm_flags);
				2238	}
				2239	return 0;
				2240	}
				2241
				2242	static struct inode shmem_get_inode(struct super_block sb, const struct inode *dir,
				2243	umode_t mode, dev_t dev, unsigned long flags)
				2244	{
				2245	struct inode *inode;
				2246	struct shmem_inode_info *info;
				2247	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				2248
				2249	if (shmem_reserve_inode(sb))
				2250	return NULL;
				2251
				2252	inode = new_inode(sb);
				2253	if (inode) {
				2254	inode->i_ino = get_next_ino();
				2255	inode_init_owner(inode, dir, mode);
				2256	inode->i_blocks = 0;
				2257	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
				2258	inode->i_generation = prandom_u32();
				2259	info = SHMEM_I(inode);
				2260	memset(info, 0, (char )inode - (char )info);
				2261	spin_lock_init(&info->lock);
				2262	atomic_set(&info->stop_eviction, 0);
				2263	info->seals = F_SEAL_SEAL;
				2264	info->flags = flags & VM_NORESERVE;
				2265	INIT_LIST_HEAD(&info->shrinklist);
				2266	INIT_LIST_HEAD(&info->swaplist);
				2267	simple_xattrs_init(&info->xattrs);
				2268	cache_no_acl(inode);
				2269
				2270	switch (mode & S_IFMT) {
				2271	default:
				2272	inode->i_op = &shmem_special_inode_operations;
				2273	init_special_inode(inode, mode, dev);
				2274	break;
				2275	case S_IFREG:
				2276	inode->i_mapping->a_ops = &shmem_aops;
				2277	inode->i_op = &shmem_inode_operations;
				2278	inode->i_fop = &shmem_file_operations;
				2279	mpol_shared_policy_init(&info->policy,
				2280	shmem_get_sbmpol(sbinfo));
				2281	break;
				2282	case S_IFDIR:
				2283	inc_nlink(inode);
				2284	/* Some things misbehave if size == 0 on a directory */
				2285	inode->i_size = 2 * BOGO_DIRENT_SIZE;
				2286	inode->i_op = &shmem_dir_inode_operations;
				2287	inode->i_fop = &simple_dir_operations;
				2288	break;
				2289	case S_IFLNK:
				2290	/*
				2291	* Must not load anything in the rbtree,
				2292	* mpol_free_shared_policy will not be called.
				2293	*/
				2294	mpol_shared_policy_init(&info->policy, NULL);
				2295	break;
				2296	}
				2297
				2298	lockdep_annotate_inode_mutex_key(inode);
				2299	} else
				2300	shmem_free_inode(sb);
				2301	return inode;
				2302	}
				2303
				2304	bool shmem_mapping(struct address_space *mapping)
				2305	{
				2306	return mapping->a_ops == &shmem_aops;
				2307	}
				2308
				2309	#ifdef CONFIG_USERFAULTFD
				2310	int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
				2311	pmd_t *dst_pmd,
				2312	struct vm_area_struct *dst_vma,
				2313	unsigned long dst_addr,
				2314	unsigned long src_addr,
				2315	bool zeropage,
				2316	struct page **pagep)
				2317	{
				2318	struct inode *inode = file_inode(dst_vma->vm_file);
				2319	struct shmem_inode_info *info = SHMEM_I(inode);
				2320	struct address_space *mapping = inode->i_mapping;
				2321	gfp_t gfp = mapping_gfp_mask(mapping);
				2322	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
				2323	struct mem_cgroup *memcg;
				2324	void *page_kaddr;
				2325	struct page *page;
				2326	int ret;
				2327	pgoff_t max_off;
				2328
				2329	if (!shmem_inode_acct_block(inode, 1)) {
				2330	/*
				2331	* We may have got a page, returned -ENOENT triggering a retry,
				2332	* and now we find ourselves with -ENOMEM. Release the page, to
				2333	* avoid a BUG_ON in our caller.
				2334	*/
				2335	if (unlikely(*pagep)) {
				2336	put_page(*pagep);
				2337	*pagep = NULL;
				2338	}
				2339	return -ENOMEM;
				2340	}
				2341
				2342	if (!*pagep) {
				2343	ret = -ENOMEM;
				2344	page = shmem_alloc_page(gfp, info, pgoff);
				2345	if (!page)
				2346	goto out_unacct_blocks;
				2347
				2348	if (!zeropage) { /* COPY */
				2349	page_kaddr = kmap_atomic(page);
				2350	ret = copy_from_user(page_kaddr,
				2351	(const void __user *)src_addr,
				2352	PAGE_SIZE);
				2353	kunmap_atomic(page_kaddr);
				2354
				2355	/* fallback to copy_from_user outside mmap_sem */
				2356	if (unlikely(ret)) {
				2357	*pagep = page;
				2358	ret = -ENOENT;
				2359	/* don't free the page */
				2360	goto out_unacct_blocks;
				2361	}
				2362	} else { /* ZEROPAGE */
				2363	clear_highpage(page);
				2364	}
				2365	} else {
				2366	page = *pagep;
				2367	*pagep = NULL;
				2368	}
				2369
				2370	VM_BUG_ON(PageLocked(page));
				2371	VM_BUG_ON(PageSwapBacked(page));
				2372	__SetPageLocked(page);
				2373	__SetPageSwapBacked(page);
				2374	__SetPageUptodate(page);
				2375
				2376	ret = -EFAULT;
				2377	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
				2378	if (unlikely(pgoff >= max_off))
				2379	goto out_release;
				2380
				2381	ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
				2382	if (ret)
				2383	goto out_release;
				2384
				2385	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
				2386	gfp & GFP_RECLAIM_MASK);
				2387	if (ret)
				2388	goto out_release_uncharge;
				2389
				2390	mem_cgroup_commit_charge(page, memcg, false, false);
				2391
				2392	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				2393	page, true);
				2394	if (ret)
				2395	goto out_delete_from_cache;
				2396
				2397	spin_lock_irq(&info->lock);
				2398	info->alloced++;
				2399	inode->i_blocks += BLOCKS_PER_PAGE;
				2400	shmem_recalc_inode(inode);
				2401	spin_unlock_irq(&info->lock);
				2402
				2403	SetPageDirty(page);
				2404	unlock_page(page);
				2405	return 0;
				2406	out_delete_from_cache:
				2407	delete_from_page_cache(page);
				2408	out_release_uncharge:
				2409	mem_cgroup_cancel_charge(page, memcg, false);
				2410	out_release:
				2411	unlock_page(page);
				2412	put_page(page);
				2413	out_unacct_blocks:
				2414	shmem_inode_unacct_blocks(inode, 1);
				2415	return ret;
				2416	}
				2417	#endif /* CONFIG_USERFAULTFD */
				2418
				2419	#ifdef CONFIG_TMPFS
				2420	static const struct inode_operations shmem_symlink_inode_operations;
				2421	static const struct inode_operations shmem_short_symlink_operations;
				2422
				2423	#ifdef CONFIG_TMPFS_XATTR
				2424	static int shmem_initxattrs(struct inode , const struct xattr , void *);
				2425	#else
				2426	#define shmem_initxattrs NULL
				2427	#endif
				2428
				2429	static int
				2430	shmem_write_begin(struct file file, struct address_space mapping,
				2431	loff_t pos, unsigned len, unsigned flags,
				2432	struct page pagep, void fsdata)
				2433	{
				2434	struct inode *inode = mapping->host;
				2435	struct shmem_inode_info *info = SHMEM_I(inode);
				2436	pgoff_t index = pos >> PAGE_SHIFT;
				2437
				2438	/* i_mutex is held by caller */
				2439	if (unlikely(info->seals & (F_SEAL_GROW \|
				2440	F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))) {
				2441	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))
				2442	return -EPERM;
				2443	if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
				2444	return -EPERM;
				2445	}
				2446
				2447	return shmem_getpage(inode, index, pagep, SGP_WRITE);
				2448	}
				2449
				2450	static int
				2451	shmem_write_end(struct file file, struct address_space mapping,
				2452	loff_t pos, unsigned len, unsigned copied,
				2453	struct page page, void fsdata)
				2454	{
				2455	struct inode *inode = mapping->host;
				2456
				2457	if (pos + copied > inode->i_size)
				2458	i_size_write(inode, pos + copied);
				2459
				2460	if (!PageUptodate(page)) {
				2461	struct page *head = compound_head(page);
				2462	if (PageTransCompound(page)) {
				2463	int i;
				2464
				2465	for (i = 0; i < HPAGE_PMD_NR; i++) {
				2466	if (head + i == page)
				2467	continue;
				2468	clear_highpage(head + i);
				2469	flush_dcache_page(head + i);
				2470	}
				2471	}
				2472	if (copied < PAGE_SIZE) {
				2473	unsigned from = pos & (PAGE_SIZE - 1);
				2474	zero_user_segments(page, 0, from,
				2475	from + copied, PAGE_SIZE);
				2476	}
				2477	SetPageUptodate(head);
				2478	}
				2479	set_page_dirty(page);
				2480	unlock_page(page);
				2481	put_page(page);
				2482
				2483	return copied;
				2484	}
				2485
				2486	static ssize_t shmem_file_read_iter(struct kiocb iocb, struct iov_iter to)
				2487	{
				2488	struct file *file = iocb->ki_filp;
				2489	struct inode *inode = file_inode(file);
				2490	struct address_space *mapping = inode->i_mapping;
				2491	pgoff_t index;
				2492	unsigned long offset;
				2493	enum sgp_type sgp = SGP_READ;
				2494	int error = 0;
				2495	ssize_t retval = 0;
				2496	loff_t *ppos = &iocb->ki_pos;
				2497
				2498	/*
				2499	* Might this read be for a stacking filesystem? Then when reading
				2500	* holes of a sparse file, we actually need to allocate those pages,
				2501	* and even mark them dirty, so it cannot exceed the max_blocks limit.
				2502	*/
				2503	if (!iter_is_iovec(to))
				2504	sgp = SGP_CACHE;
				2505
				2506	index = *ppos >> PAGE_SHIFT;
				2507	offset = *ppos & ~PAGE_MASK;
				2508
				2509	for (;;) {
				2510	struct page *page = NULL;
				2511	pgoff_t end_index;
				2512	unsigned long nr, ret;
				2513	loff_t i_size = i_size_read(inode);
				2514
				2515	end_index = i_size >> PAGE_SHIFT;
				2516	if (index > end_index)
				2517	break;
				2518	if (index == end_index) {
				2519	nr = i_size & ~PAGE_MASK;
				2520	if (nr <= offset)
				2521	break;
				2522	}
				2523
				2524	error = shmem_getpage(inode, index, &page, sgp);
				2525	if (error) {
				2526	if (error == -EINVAL)
				2527	error = 0;
				2528	break;
				2529	}
				2530	if (page) {
				2531	if (sgp == SGP_CACHE)
				2532	set_page_dirty(page);
				2533	unlock_page(page);
				2534	}
				2535
				2536	/*
				2537	* We must evaluate after, since reads (unlike writes)
				2538	* are called without i_mutex protection against truncate
				2539	*/
				2540	nr = PAGE_SIZE;
				2541	i_size = i_size_read(inode);
				2542	end_index = i_size >> PAGE_SHIFT;
				2543	if (index == end_index) {
				2544	nr = i_size & ~PAGE_MASK;
				2545	if (nr <= offset) {
				2546	if (page)
				2547	put_page(page);
				2548	break;
				2549	}
				2550	}
				2551	nr -= offset;
				2552
				2553	if (page) {
				2554	/*
				2555	* If users can be writing to this page using arbitrary
				2556	* virtual addresses, take care about potential aliasing
				2557	* before reading the page on the kernel side.
				2558	*/
				2559	if (mapping_writably_mapped(mapping))
				2560	flush_dcache_page(page);
				2561	/*
				2562	* Mark the page accessed if we read the beginning.
				2563	*/
				2564	if (!offset)
				2565	mark_page_accessed(page);
				2566	} else {
				2567	page = ZERO_PAGE(0);
				2568	get_page(page);
				2569	}
				2570
				2571	/*
				2572	* Ok, we have the page, and it's up-to-date, so
				2573	* now we can copy it to user space...
				2574	*/
				2575	ret = copy_page_to_iter(page, offset, nr, to);
				2576	retval += ret;
				2577	offset += ret;
				2578	index += offset >> PAGE_SHIFT;
				2579	offset &= ~PAGE_MASK;
				2580
				2581	put_page(page);
				2582	if (!iov_iter_count(to))
				2583	break;
				2584	if (ret < nr) {
				2585	error = -EFAULT;
				2586	break;
				2587	}
				2588	cond_resched();
				2589	}
				2590
				2591	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
				2592	file_accessed(file);
				2593	return retval ? retval : error;
				2594	}
				2595
				2596	/*
				2597	* llseek SEEK_DATA or SEEK_HOLE through the page cache.
				2598	*/
				2599	static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
				2600	pgoff_t index, pgoff_t end, int whence)
				2601	{
				2602	struct page *page;
				2603	struct pagevec pvec;
				2604	pgoff_t indices[PAGEVEC_SIZE];
				2605	bool done = false;
				2606	int i;
				2607
				2608	pagevec_init(&pvec);
				2609	pvec.nr = 1; /* start small: we may be there already */
				2610	while (!done) {
				2611	pvec.nr = find_get_entries(mapping, index,
				2612	pvec.nr, pvec.pages, indices);
				2613	if (!pvec.nr) {
				2614	if (whence == SEEK_DATA)
				2615	index = end;
				2616	break;
				2617	}
				2618	for (i = 0; i < pvec.nr; i++, index++) {
				2619	if (index < indices[i]) {
				2620	if (whence == SEEK_HOLE) {
				2621	done = true;
				2622	break;
				2623	}
				2624	index = indices[i];
				2625	}
				2626	page = pvec.pages[i];
				2627	if (page && !xa_is_value(page)) {
				2628	if (!PageUptodate(page))
				2629	page = NULL;
				2630	}
				2631	if (index >= end \|\|
				2632	(page && whence == SEEK_DATA) \|\|
				2633	(!page && whence == SEEK_HOLE)) {
				2634	done = true;
				2635	break;
				2636	}
				2637	}
				2638	pagevec_remove_exceptionals(&pvec);
				2639	pagevec_release(&pvec);
				2640	pvec.nr = PAGEVEC_SIZE;
				2641	cond_resched();
				2642	}
				2643	return index;
				2644	}
				2645
				2646	static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
				2647	{
				2648	struct address_space *mapping = file->f_mapping;
				2649	struct inode *inode = mapping->host;
				2650	pgoff_t start, end;
				2651	loff_t new_offset;
				2652
				2653	if (whence != SEEK_DATA && whence != SEEK_HOLE)
				2654	return generic_file_llseek_size(file, offset, whence,
				2655	MAX_LFS_FILESIZE, i_size_read(inode));
				2656	inode_lock(inode);
				2657	/* We're holding i_mutex so we can access i_size directly */
				2658
				2659	if (offset < 0 \|\| offset >= inode->i_size)
				2660	offset = -ENXIO;
				2661	else {
				2662	start = offset >> PAGE_SHIFT;
				2663	end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2664	new_offset = shmem_seek_hole_data(mapping, start, end, whence);
				2665	new_offset <<= PAGE_SHIFT;
				2666	if (new_offset > offset) {
				2667	if (new_offset < inode->i_size)
				2668	offset = new_offset;
				2669	else if (whence == SEEK_DATA)
				2670	offset = -ENXIO;
				2671	else
				2672	offset = inode->i_size;
				2673	}
				2674	}
				2675
				2676	if (offset >= 0)
				2677	offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
				2678	inode_unlock(inode);
				2679	return offset;
				2680	}
				2681
				2682	static long shmem_fallocate(struct file *file, int mode, loff_t offset,
				2683	loff_t len)
				2684	{
				2685	struct inode *inode = file_inode(file);
				2686	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				2687	struct shmem_inode_info *info = SHMEM_I(inode);
				2688	struct shmem_falloc shmem_falloc;
				2689	pgoff_t start, index, end;
				2690	int error;
				2691
				2692	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				2693	return -EOPNOTSUPP;
				2694
				2695	inode_lock(inode);
				2696
				2697	if (mode & FALLOC_FL_PUNCH_HOLE) {
				2698	struct address_space *mapping = file->f_mapping;
				2699	loff_t unmap_start = round_up(offset, PAGE_SIZE);
				2700	loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
				2701	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
				2702
				2703	/* protected by i_mutex */
				2704	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE)) {
				2705	error = -EPERM;
				2706	goto out;
				2707	}
				2708
				2709	shmem_falloc.waitq = &shmem_falloc_waitq;
				2710	shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
				2711	shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
				2712	spin_lock(&inode->i_lock);
				2713	inode->i_private = &shmem_falloc;
				2714	spin_unlock(&inode->i_lock);
				2715
				2716	if ((u64)unmap_end > (u64)unmap_start)
				2717	unmap_mapping_range(mapping, unmap_start,
				2718	1 + unmap_end - unmap_start, 0);
				2719	shmem_truncate_range(inode, offset, offset + len - 1);
				2720	/* No need to unmap again: hole-punching leaves COWed pages */
				2721
				2722	spin_lock(&inode->i_lock);
				2723	inode->i_private = NULL;
				2724	wake_up_all(&shmem_falloc_waitq);
				2725	WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
				2726	spin_unlock(&inode->i_lock);
				2727	error = 0;
				2728	goto out;
				2729	}
				2730
				2731	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
				2732	error = inode_newsize_ok(inode, offset + len);
				2733	if (error)
				2734	goto out;
				2735
				2736	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
				2737	error = -EPERM;
				2738	goto out;
				2739	}
				2740
				2741	start = offset >> PAGE_SHIFT;
				2742	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2743	/* Try to avoid a swapstorm if len is impossible to satisfy */
				2744	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
				2745	error = -ENOSPC;
				2746	goto out;
				2747	}
				2748
				2749	shmem_falloc.waitq = NULL;
				2750	shmem_falloc.start = start;
				2751	shmem_falloc.next = start;
				2752	shmem_falloc.nr_falloced = 0;
				2753	shmem_falloc.nr_unswapped = 0;
				2754	spin_lock(&inode->i_lock);
				2755	inode->i_private = &shmem_falloc;
				2756	spin_unlock(&inode->i_lock);
				2757
				2758	for (index = start; index < end; index++) {
				2759	struct page *page;
				2760
				2761	/*
				2762	* Good, the fallocate(2) manpage permits EINTR: we may have
				2763	* been interrupted because we are using up too much memory.
				2764	*/
				2765	if (signal_pending(current))
				2766	error = -EINTR;
				2767	else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
				2768	error = -ENOMEM;
				2769	else
				2770	error = shmem_getpage(inode, index, &page, SGP_FALLOC);
				2771	if (error) {
				2772	/* Remove the !PageUptodate pages we added */
				2773	if (index > start) {
				2774	shmem_undo_range(inode,
				2775	(loff_t)start << PAGE_SHIFT,
				2776	((loff_t)index << PAGE_SHIFT) - 1, true);
				2777	}
				2778	goto undone;
				2779	}
				2780
				2781	/*
				2782	* Inform shmem_writepage() how far we have reached.
				2783	* No need for lock or barrier: we have the page lock.
				2784	*/
				2785	shmem_falloc.next++;
				2786	if (!PageUptodate(page))
				2787	shmem_falloc.nr_falloced++;
				2788
				2789	/*
				2790	* If !PageUptodate, leave it that way so that freeable pages
				2791	* can be recognized if we need to rollback on error later.
				2792	* But set_page_dirty so that memory pressure will swap rather
				2793	* than free the pages we are allocating (and SGP_CACHE pages
				2794	* might still be clean: we now need to mark those dirty too).
				2795	*/
				2796	set_page_dirty(page);
				2797	unlock_page(page);
				2798	put_page(page);
				2799	cond_resched();
				2800	}
				2801
				2802	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
				2803	i_size_write(inode, offset + len);
				2804	inode->i_ctime = current_time(inode);
				2805	undone:
				2806	spin_lock(&inode->i_lock);
				2807	inode->i_private = NULL;
				2808	spin_unlock(&inode->i_lock);
				2809	out:
				2810	inode_unlock(inode);
				2811	return error;
				2812	}
				2813
				2814	static int shmem_statfs(struct dentry dentry, struct kstatfs buf)
				2815	{
				2816	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
				2817
				2818	buf->f_type = TMPFS_MAGIC;
				2819	buf->f_bsize = PAGE_SIZE;
				2820	buf->f_namelen = NAME_MAX;
				2821	if (sbinfo->max_blocks) {
				2822	buf->f_blocks = sbinfo->max_blocks;
				2823	buf->f_bavail =
				2824	buf->f_bfree = sbinfo->max_blocks -
				2825	percpu_counter_sum(&sbinfo->used_blocks);
				2826	}
				2827	if (sbinfo->max_inodes) {
				2828	buf->f_files = sbinfo->max_inodes;
				2829	buf->f_ffree = sbinfo->free_inodes;
				2830	}
				2831	/* else leave those fields 0 like simple_statfs */
				2832	return 0;
				2833	}
				2834
				2835	/*
				2836	* File creation. Allocate an inode, and we're done..
				2837	*/
				2838	static int
				2839	shmem_mknod(struct inode dir, struct dentry dentry, umode_t mode, dev_t dev)
				2840	{
				2841	struct inode *inode;
				2842	int error = -ENOSPC;
				2843
				2844	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
				2845	if (inode) {
				2846	error = simple_acl_create(dir, inode);
				2847	if (error)
				2848	goto out_iput;
				2849	error = security_inode_init_security(inode, dir,
				2850	&dentry->d_name,
				2851	shmem_initxattrs, NULL);
				2852	if (error && error != -EOPNOTSUPP)
				2853	goto out_iput;
				2854
				2855	error = 0;
				2856	dir->i_size += BOGO_DIRENT_SIZE;
				2857	dir->i_ctime = dir->i_mtime = current_time(dir);
				2858	d_instantiate(dentry, inode);
				2859	dget(dentry); /* Extra count - pin the dentry in core */
				2860	}
				2861	return error;
				2862	out_iput:
				2863	iput(inode);
				2864	return error;
				2865	}
				2866
				2867	static int
				2868	shmem_tmpfile(struct inode dir, struct dentry dentry, umode_t mode)
				2869	{
				2870	struct inode *inode;
				2871	int error = -ENOSPC;
				2872
				2873	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
				2874	if (inode) {
				2875	error = security_inode_init_security(inode, dir,
				2876	NULL,
				2877	shmem_initxattrs, NULL);
				2878	if (error && error != -EOPNOTSUPP)
				2879	goto out_iput;
				2880	error = simple_acl_create(dir, inode);
				2881	if (error)
				2882	goto out_iput;
				2883	d_tmpfile(dentry, inode);
				2884	}
				2885	return error;
				2886	out_iput:
				2887	iput(inode);
				2888	return error;
				2889	}
				2890
				2891	static int shmem_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				2892	{
				2893	int error;
				2894
				2895	if ((error = shmem_mknod(dir, dentry, mode \| S_IFDIR, 0)))
				2896	return error;
				2897	inc_nlink(dir);
				2898	return 0;
				2899	}
				2900
				2901	static int shmem_create(struct inode dir, struct dentry dentry, umode_t mode,
				2902	bool excl)
				2903	{
				2904	return shmem_mknod(dir, dentry, mode \| S_IFREG, 0);
				2905	}
				2906
				2907	/*
				2908	* Link a file..
				2909	*/
				2910	static int shmem_link(struct dentry old_dentry, struct inode dir, struct dentry *dentry)
				2911	{
				2912	struct inode *inode = d_inode(old_dentry);
				2913	int ret = 0;
				2914
				2915	/*
				2916	* No ordinary (disk based) filesystem counts links as inodes;
				2917	* but each new link needs a new dentry, pinning lowmem, and
				2918	* tmpfs dentries cannot be pruned until they are unlinked.
				2919	* But if an O_TMPFILE file is linked into the tmpfs, the
				2920	* first link must skip that, to get the accounting right.
				2921	*/
				2922	if (inode->i_nlink) {
				2923	ret = shmem_reserve_inode(inode->i_sb);
				2924	if (ret)
				2925	goto out;
				2926	}
				2927
				2928	dir->i_size += BOGO_DIRENT_SIZE;
				2929	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
				2930	inc_nlink(inode);
				2931	ihold(inode); /* New dentry reference */
				2932	dget(dentry); /* Extra pinning count for the created dentry */
				2933	d_instantiate(dentry, inode);
				2934	out:
				2935	return ret;
				2936	}
				2937
				2938	static int shmem_unlink(struct inode dir, struct dentry dentry)
				2939	{
				2940	struct inode *inode = d_inode(dentry);
				2941
				2942	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
				2943	shmem_free_inode(inode->i_sb);
				2944
				2945	dir->i_size -= BOGO_DIRENT_SIZE;
				2946	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
				2947	drop_nlink(inode);
				2948	dput(dentry); /* Undo the count from "create" - this does all the work */
				2949	return 0;
				2950	}
				2951
				2952	static int shmem_rmdir(struct inode dir, struct dentry dentry)
				2953	{
				2954	if (!simple_empty(dentry))
				2955	return -ENOTEMPTY;
				2956
				2957	drop_nlink(d_inode(dentry));
				2958	drop_nlink(dir);
				2959	return shmem_unlink(dir, dentry);
				2960	}
				2961
				2962	static int shmem_exchange(struct inode old_dir, struct dentry old_dentry, struct inode new_dir, struct dentry new_dentry)
				2963	{
				2964	bool old_is_dir = d_is_dir(old_dentry);
				2965	bool new_is_dir = d_is_dir(new_dentry);
				2966
				2967	if (old_dir != new_dir && old_is_dir != new_is_dir) {
				2968	if (old_is_dir) {
				2969	drop_nlink(old_dir);
				2970	inc_nlink(new_dir);
				2971	} else {
				2972	drop_nlink(new_dir);
				2973	inc_nlink(old_dir);
				2974	}
				2975	}
				2976	old_dir->i_ctime = old_dir->i_mtime =
				2977	new_dir->i_ctime = new_dir->i_mtime =
				2978	d_inode(old_dentry)->i_ctime =
				2979	d_inode(new_dentry)->i_ctime = current_time(old_dir);
				2980
				2981	return 0;
				2982	}
				2983
				2984	static int shmem_whiteout(struct inode old_dir, struct dentry old_dentry)
				2985	{
				2986	struct dentry *whiteout;
				2987	int error;
				2988
				2989	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
				2990	if (!whiteout)
				2991	return -ENOMEM;
				2992
				2993	error = shmem_mknod(old_dir, whiteout,
				2994	S_IFCHR \| WHITEOUT_MODE, WHITEOUT_DEV);
				2995	dput(whiteout);
				2996	if (error)
				2997	return error;
				2998
				2999	/*
				3000	* Cheat and hash the whiteout while the old dentry is still in
				3001	* place, instead of playing games with FS_RENAME_DOES_D_MOVE.
				3002	*
				3003	* d_lookup() will consistently find one of them at this point,
				3004	* not sure which one, but that isn't even important.
				3005	*/
				3006	d_rehash(whiteout);
				3007	return 0;
				3008	}
				3009
				3010	/*
				3011	* The VFS layer already does all the dentry stuff for rename,
				3012	* we just have to decrement the usage count for the target if
				3013	* it exists so that the VFS layer correctly free's it when it
				3014	* gets overwritten.
				3015	*/
				3016	static int shmem_rename2(struct inode old_dir, struct dentry old_dentry, struct inode new_dir, struct dentry new_dentry, unsigned int flags)
				3017	{
				3018	struct inode *inode = d_inode(old_dentry);
				3019	int they_are_dirs = S_ISDIR(inode->i_mode);
				3020
				3021	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
				3022	return -EINVAL;
				3023
				3024	if (flags & RENAME_EXCHANGE)
				3025	return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
				3026
				3027	if (!simple_empty(new_dentry))
				3028	return -ENOTEMPTY;
				3029
				3030	if (flags & RENAME_WHITEOUT) {
				3031	int error;
				3032
				3033	error = shmem_whiteout(old_dir, old_dentry);
				3034	if (error)
				3035	return error;
				3036	}
				3037
				3038	if (d_really_is_positive(new_dentry)) {
				3039	(void) shmem_unlink(new_dir, new_dentry);
				3040	if (they_are_dirs) {
				3041	drop_nlink(d_inode(new_dentry));
				3042	drop_nlink(old_dir);
				3043	}
				3044	} else if (they_are_dirs) {
				3045	drop_nlink(old_dir);
				3046	inc_nlink(new_dir);
				3047	}
				3048
				3049	old_dir->i_size -= BOGO_DIRENT_SIZE;
				3050	new_dir->i_size += BOGO_DIRENT_SIZE;
				3051	old_dir->i_ctime = old_dir->i_mtime =
				3052	new_dir->i_ctime = new_dir->i_mtime =
				3053	inode->i_ctime = current_time(old_dir);
				3054	return 0;
				3055	}
				3056
				3057	static int shmem_symlink(struct inode dir, struct dentry dentry, const char *symname)
				3058	{
				3059	int error;
				3060	int len;
				3061	struct inode *inode;
				3062	struct page *page;
				3063
				3064	len = strlen(symname) + 1;
				3065	if (len > PAGE_SIZE)
				3066	return -ENAMETOOLONG;
				3067
				3068	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK \| 0777, 0,
				3069	VM_NORESERVE);
				3070	if (!inode)
				3071	return -ENOSPC;
				3072
				3073	error = security_inode_init_security(inode, dir, &dentry->d_name,
				3074	shmem_initxattrs, NULL);
				3075	if (error) {
				3076	if (error != -EOPNOTSUPP) {
				3077	iput(inode);
				3078	return error;
				3079	}
				3080	error = 0;
				3081	}
				3082
				3083	inode->i_size = len-1;
				3084	if (len <= SHORT_SYMLINK_LEN) {
				3085	inode->i_link = kmemdup(symname, len, GFP_KERNEL);
				3086	if (!inode->i_link) {
				3087	iput(inode);
				3088	return -ENOMEM;
				3089	}
				3090	inode->i_op = &shmem_short_symlink_operations;
				3091	} else {
				3092	inode_nohighmem(inode);
				3093	error = shmem_getpage(inode, 0, &page, SGP_WRITE);
				3094	if (error) {
				3095	iput(inode);
				3096	return error;
				3097	}
				3098	inode->i_mapping->a_ops = &shmem_aops;
				3099	inode->i_op = &shmem_symlink_inode_operations;
				3100	memcpy(page_address(page), symname, len);
				3101	SetPageUptodate(page);
				3102	set_page_dirty(page);
				3103	unlock_page(page);
				3104	put_page(page);
				3105	}
				3106	dir->i_size += BOGO_DIRENT_SIZE;
				3107	dir->i_ctime = dir->i_mtime = current_time(dir);
				3108	d_instantiate(dentry, inode);
				3109	dget(dentry);
				3110	return 0;
				3111	}
				3112
				3113	static void shmem_put_link(void *arg)
				3114	{
				3115	mark_page_accessed(arg);
				3116	put_page(arg);
				3117	}
				3118
				3119	static const char shmem_get_link(struct dentry dentry,
				3120	struct inode *inode,
				3121	struct delayed_call *done)
				3122	{
				3123	struct page *page = NULL;
				3124	int error;
				3125	if (!dentry) {
				3126	page = find_get_page(inode->i_mapping, 0);
				3127	if (!page)
				3128	return ERR_PTR(-ECHILD);
				3129	if (!PageUptodate(page)) {
				3130	put_page(page);
				3131	return ERR_PTR(-ECHILD);
				3132	}
				3133	} else {
				3134	error = shmem_getpage(inode, 0, &page, SGP_READ);
				3135	if (error)
				3136	return ERR_PTR(error);
				3137	unlock_page(page);
				3138	}
				3139	set_delayed_call(done, shmem_put_link, page);
				3140	return page_address(page);
				3141	}
				3142
				3143	#ifdef CONFIG_TMPFS_XATTR
				3144	/*
				3145	* Superblocks without xattr inode operations may get some security.* xattr
				3146	* support from the LSM "for free". As soon as we have any other xattrs
				3147	* like ACLs, we also need to implement the security.* handlers at
				3148	* filesystem level, though.
				3149	*/
				3150
				3151	/*
				3152	* Callback for security_inode_init_security() for acquiring xattrs.
				3153	*/
				3154	static int shmem_initxattrs(struct inode *inode,
				3155	const struct xattr *xattr_array,
				3156	void *fs_info)
				3157	{
				3158	struct shmem_inode_info *info = SHMEM_I(inode);
				3159	const struct xattr *xattr;
				3160	struct simple_xattr *new_xattr;
				3161	size_t len;
				3162
				3163	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
				3164	new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
				3165	if (!new_xattr)
				3166	return -ENOMEM;
				3167
				3168	len = strlen(xattr->name) + 1;
				3169	new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
				3170	GFP_KERNEL);
				3171	if (!new_xattr->name) {
				3172	kfree(new_xattr);
				3173	return -ENOMEM;
				3174	}
				3175
				3176	memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
				3177	XATTR_SECURITY_PREFIX_LEN);
				3178	memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
				3179	xattr->name, len);
				3180
				3181	simple_xattr_list_add(&info->xattrs, new_xattr);
				3182	}
				3183
				3184	return 0;
				3185	}
				3186
				3187	static int shmem_xattr_handler_get(const struct xattr_handler *handler,
				3188	struct dentry unused, struct inode inode,
				3189	const char name, void buffer, size_t size,
				3190	int flags)
				3191	{
				3192	struct shmem_inode_info *info = SHMEM_I(inode);
				3193
				3194	name = xattr_full_name(handler, name);
				3195	return simple_xattr_get(&info->xattrs, name, buffer, size);
				3196	}
				3197
				3198	static int shmem_xattr_handler_set(const struct xattr_handler *handler,
				3199	struct dentry unused, struct inode inode,
				3200	const char name, const void value,
				3201	size_t size, int flags)
				3202	{
				3203	struct shmem_inode_info *info = SHMEM_I(inode);
				3204
				3205	name = xattr_full_name(handler, name);
				3206	return simple_xattr_set(&info->xattrs, name, value, size, flags);
				3207	}
				3208
				3209	static const struct xattr_handler shmem_security_xattr_handler = {
				3210	.prefix = XATTR_SECURITY_PREFIX,
				3211	.get = shmem_xattr_handler_get,
				3212	.set = shmem_xattr_handler_set,
				3213	};
				3214
				3215	static const struct xattr_handler shmem_trusted_xattr_handler = {
				3216	.prefix = XATTR_TRUSTED_PREFIX,
				3217	.get = shmem_xattr_handler_get,
				3218	.set = shmem_xattr_handler_set,
				3219	};
				3220
				3221	static const struct xattr_handler *shmem_xattr_handlers[] = {
				3222	#ifdef CONFIG_TMPFS_POSIX_ACL
				3223	&posix_acl_access_xattr_handler,
				3224	&posix_acl_default_xattr_handler,
				3225	#endif
				3226	&shmem_security_xattr_handler,
				3227	&shmem_trusted_xattr_handler,
				3228	NULL
				3229	};
				3230
				3231	static ssize_t shmem_listxattr(struct dentry dentry, char buffer, size_t size)
				3232	{
				3233	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
				3234	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
				3235	}
				3236	#endif /* CONFIG_TMPFS_XATTR */
				3237
				3238	static const struct inode_operations shmem_short_symlink_operations = {
				3239	.get_link = simple_get_link,
				3240	#ifdef CONFIG_TMPFS_XATTR
				3241	.listxattr = shmem_listxattr,
				3242	#endif
				3243	};
				3244
				3245	static const struct inode_operations shmem_symlink_inode_operations = {
				3246	.get_link = shmem_get_link,
				3247	#ifdef CONFIG_TMPFS_XATTR
				3248	.listxattr = shmem_listxattr,
				3249	#endif
				3250	};
				3251
				3252	static struct dentry shmem_get_parent(struct dentry child)
				3253	{
				3254	return ERR_PTR(-ESTALE);
				3255	}
				3256
				3257	static int shmem_match(struct inode ino, void vfh)
				3258	{
				3259	__u32 *fh = vfh;
				3260	__u64 inum = fh[2];
				3261	inum = (inum << 32) \| fh[1];
				3262	return ino->i_ino == inum && fh[0] == ino->i_generation;
				3263	}
				3264
				3265	/* Find any alias of inode, but prefer a hashed alias */
				3266	static struct dentry shmem_find_alias(struct inode inode)
				3267	{
				3268	struct dentry *alias = d_find_alias(inode);
				3269
				3270	return alias ?: d_find_any_alias(inode);
				3271	}
				3272
				3273
				3274	static struct dentry shmem_fh_to_dentry(struct super_block sb,
				3275	struct fid *fid, int fh_len, int fh_type)
				3276	{
				3277	struct inode *inode;
				3278	struct dentry *dentry = NULL;
				3279	u64 inum;
				3280
				3281	if (fh_len < 3)
				3282	return NULL;
				3283
				3284	inum = fid->raw[2];
				3285	inum = (inum << 32) \| fid->raw[1];
				3286
				3287	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
				3288	shmem_match, fid->raw);
				3289	if (inode) {
				3290	dentry = shmem_find_alias(inode);
				3291	iput(inode);
				3292	}
				3293
				3294	return dentry;
				3295	}
				3296
				3297	static int shmem_encode_fh(struct inode inode, __u32 fh, int *len,
				3298	struct inode *parent)
				3299	{
				3300	if (*len < 3) {
				3301	*len = 3;
				3302	return FILEID_INVALID;
				3303	}
				3304
				3305	if (inode_unhashed(inode)) {
				3306	/* Unfortunately insert_inode_hash is not idempotent,
				3307	* so as we hash inodes here rather than at creation
				3308	* time, we need a lock to ensure we only try
				3309	* to do it once
				3310	*/
				3311	static DEFINE_SPINLOCK(lock);
				3312	spin_lock(&lock);
				3313	if (inode_unhashed(inode))
				3314	__insert_inode_hash(inode,
				3315	inode->i_ino + inode->i_generation);
				3316	spin_unlock(&lock);
				3317	}
				3318
				3319	fh[0] = inode->i_generation;
				3320	fh[1] = inode->i_ino;
				3321	fh[2] = ((__u64)inode->i_ino) >> 32;
				3322
				3323	*len = 3;
				3324	return 1;
				3325	}
				3326
				3327	static const struct export_operations shmem_export_ops = {
				3328	.get_parent = shmem_get_parent,
				3329	.encode_fh = shmem_encode_fh,
				3330	.fh_to_dentry = shmem_fh_to_dentry,
				3331	};
				3332
				3333	enum shmem_param {
				3334	Opt_gid,
				3335	Opt_huge,
				3336	Opt_mode,
				3337	Opt_mpol,
				3338	Opt_nr_blocks,
				3339	Opt_nr_inodes,
				3340	Opt_size,
				3341	Opt_uid,
				3342	};
				3343
				3344	static const struct fs_parameter_spec shmem_param_specs[] = {
				3345	fsparam_u32 ("gid", Opt_gid),
				3346	fsparam_enum ("huge", Opt_huge),
				3347	fsparam_u32oct("mode", Opt_mode),
				3348	fsparam_string("mpol", Opt_mpol),
				3349	fsparam_string("nr_blocks", Opt_nr_blocks),
				3350	fsparam_string("nr_inodes", Opt_nr_inodes),
				3351	fsparam_string("size", Opt_size),
				3352	fsparam_u32 ("uid", Opt_uid),
				3353	{}
				3354	};
				3355
				3356	static const struct fs_parameter_enum shmem_param_enums[] = {
				3357	{ Opt_huge, "never", SHMEM_HUGE_NEVER },
				3358	{ Opt_huge, "always", SHMEM_HUGE_ALWAYS },
				3359	{ Opt_huge, "within_size", SHMEM_HUGE_WITHIN_SIZE },
				3360	{ Opt_huge, "advise", SHMEM_HUGE_ADVISE },
				3361	{}
				3362	};
				3363
				3364	const struct fs_parameter_description shmem_fs_parameters = {
				3365	.name = "tmpfs",
				3366	.specs = shmem_param_specs,
				3367	.enums = shmem_param_enums,
				3368	};
				3369
				3370	static int shmem_parse_one(struct fs_context fc, struct fs_parameter param)
				3371	{
				3372	struct shmem_options *ctx = fc->fs_private;
				3373	struct fs_parse_result result;
				3374	unsigned long long size;
				3375	char *rest;
				3376	int opt;
				3377	kuid_t kuid;
				3378	kgid_t kgid;
				3379
				3380	opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
				3381	if (opt < 0)
				3382	return opt;
				3383
				3384	switch (opt) {
				3385	case Opt_size:
				3386	size = memparse(param->string, &rest);
				3387	if (*rest == '%') {
				3388	size <<= PAGE_SHIFT;
				3389	size *= totalram_pages();
				3390	do_div(size, 100);
				3391	rest++;
				3392	}
				3393	if (*rest)
				3394	goto bad_value;
				3395	ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
				3396	ctx->seen \|= SHMEM_SEEN_BLOCKS;
				3397	break;
				3398	case Opt_nr_blocks:
				3399	ctx->blocks = memparse(param->string, &rest);
				3400	if (*rest)
				3401	goto bad_value;
				3402	ctx->seen \|= SHMEM_SEEN_BLOCKS;
				3403	break;
				3404	case Opt_nr_inodes:
				3405	ctx->inodes = memparse(param->string, &rest);
				3406	if (*rest)
				3407	goto bad_value;
				3408	ctx->seen \|= SHMEM_SEEN_INODES;
				3409	break;
				3410	case Opt_mode:
				3411	ctx->mode = result.uint_32 & 07777;
				3412	break;
				3413	case Opt_uid:
				3414	kuid = make_kuid(current_user_ns(), result.uint_32);
				3415	if (!uid_valid(kuid))
				3416	goto bad_value;
				3417
				3418	/*
				3419	* The requested uid must be representable in the
				3420	* filesystem's idmapping.
				3421	*/
				3422	if (!kuid_has_mapping(fc->user_ns, kuid))
				3423	goto bad_value;
				3424
				3425	ctx->uid = kuid;
				3426	break;
				3427	case Opt_gid:
				3428	kgid = make_kgid(current_user_ns(), result.uint_32);
				3429	if (!gid_valid(kgid))
				3430	goto bad_value;
				3431
				3432	/*
				3433	* The requested gid must be representable in the
				3434	* filesystem's idmapping.
				3435	*/
				3436	if (!kgid_has_mapping(fc->user_ns, kgid))
				3437	goto bad_value;
				3438
				3439	ctx->gid = kgid;
				3440	break;
				3441	case Opt_huge:
				3442	ctx->huge = result.uint_32;
				3443	if (ctx->huge != SHMEM_HUGE_NEVER &&
				3444	!(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				3445	has_transparent_hugepage()))
				3446	goto unsupported_parameter;
				3447	ctx->seen \|= SHMEM_SEEN_HUGE;
				3448	break;
				3449	case Opt_mpol:
				3450	if (IS_ENABLED(CONFIG_NUMA)) {
				3451	mpol_put(ctx->mpol);
				3452	ctx->mpol = NULL;
				3453	if (mpol_parse_str(param->string, &ctx->mpol))
				3454	goto bad_value;
				3455	break;
				3456	}
				3457	goto unsupported_parameter;
				3458	}
				3459	return 0;
				3460
				3461	unsupported_parameter:
				3462	return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
				3463	bad_value:
				3464	return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
				3465	}
				3466
				3467	static int shmem_parse_options(struct fs_context fc, void data)
				3468	{
				3469	char *options = data;
				3470
				3471	if (options) {
				3472	int err = security_sb_eat_lsm_opts(options, &fc->security);
				3473	if (err)
				3474	return err;
				3475	}
				3476
				3477	while (options != NULL) {
				3478	char *this_char = options;
				3479	for (;;) {
				3480	/*
				3481	* NUL-terminate this option: unfortunately,
				3482	* mount options form a comma-separated list,
				3483	* but mpol's nodelist may also contain commas.
				3484	*/
				3485	options = strchr(options, ',');
				3486	if (options == NULL)
				3487	break;
				3488	options++;
				3489	if (!isdigit(*options)) {
				3490	options[-1] = '\0';
				3491	break;
				3492	}
				3493	}
				3494	if (*this_char) {
				3495	char *value = strchr(this_char,'=');
				3496	size_t len = 0;
				3497	int err;
				3498
				3499	if (value) {
				3500	*value++ = '\0';
				3501	len = strlen(value);
				3502	}
				3503	err = vfs_parse_fs_string(fc, this_char, value, len);
				3504	if (err < 0)
				3505	return err;
				3506	}
				3507	}
				3508	return 0;
				3509	}
				3510
				3511	/*
				3512	* Reconfigure a shmem filesystem.
				3513	*
				3514	* Note that we disallow change from limited->unlimited blocks/inodes while any
				3515	* are in use; but we must separately disallow unlimited->limited, because in
				3516	* that case we have no record of how much is already in use.
				3517	*/
				3518	static int shmem_reconfigure(struct fs_context *fc)
				3519	{
				3520	struct shmem_options *ctx = fc->fs_private;
				3521	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
				3522	unsigned long inodes;
				3523	const char *err;
				3524
				3525	spin_lock(&sbinfo->stat_lock);
				3526	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
				3527	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
				3528	if (!sbinfo->max_blocks) {
				3529	err = "Cannot retroactively limit size";
				3530	goto out;
				3531	}
				3532	if (percpu_counter_compare(&sbinfo->used_blocks,
				3533	ctx->blocks) > 0) {
				3534	err = "Too small a size for current use";
				3535	goto out;
				3536	}
				3537	}
				3538	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
				3539	if (!sbinfo->max_inodes) {
				3540	err = "Cannot retroactively limit inodes";
				3541	goto out;
				3542	}
				3543	if (ctx->inodes < inodes) {
				3544	err = "Too few inodes for current use";
				3545	goto out;
				3546	}
				3547	}
				3548
				3549	if (ctx->seen & SHMEM_SEEN_HUGE)
				3550	sbinfo->huge = ctx->huge;
				3551	if (ctx->seen & SHMEM_SEEN_BLOCKS)
				3552	sbinfo->max_blocks = ctx->blocks;
				3553	if (ctx->seen & SHMEM_SEEN_INODES) {
				3554	sbinfo->max_inodes = ctx->inodes;
				3555	sbinfo->free_inodes = ctx->inodes - inodes;
				3556	}
				3557
				3558	/*
				3559	* Preserve previous mempolicy unless mpol remount option was specified.
				3560	*/
				3561	if (ctx->mpol) {
				3562	mpol_put(sbinfo->mpol);
				3563	sbinfo->mpol = ctx->mpol; /* transfers initial ref */
				3564	ctx->mpol = NULL;
				3565	}
				3566	spin_unlock(&sbinfo->stat_lock);
				3567	return 0;
				3568	out:
				3569	spin_unlock(&sbinfo->stat_lock);
				3570	return invalf(fc, "tmpfs: %s", err);
				3571	}
				3572
				3573	static int shmem_show_options(struct seq_file seq, struct dentry root)
				3574	{
				3575	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
				3576
				3577	if (sbinfo->max_blocks != shmem_default_max_blocks())
				3578	seq_printf(seq, ",size=%luk",
				3579	sbinfo->max_blocks << (PAGE_SHIFT - 10));
				3580	if (sbinfo->max_inodes != shmem_default_max_inodes())
				3581	seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
				3582	if (sbinfo->mode != (0777 \| S_ISVTX))
				3583	seq_printf(seq, ",mode=%03ho", sbinfo->mode);
				3584	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
				3585	seq_printf(seq, ",uid=%u",
				3586	from_kuid_munged(&init_user_ns, sbinfo->uid));
				3587	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
				3588	seq_printf(seq, ",gid=%u",
				3589	from_kgid_munged(&init_user_ns, sbinfo->gid));
				3590	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3591	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
				3592	if (sbinfo->huge)
				3593	seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
				3594	#endif
				3595	shmem_show_mpol(seq, sbinfo->mpol);
				3596	return 0;
				3597	}
				3598
				3599	#endif /* CONFIG_TMPFS */
				3600
				3601	static void shmem_put_super(struct super_block *sb)
				3602	{
				3603	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				3604
				3605	percpu_counter_destroy(&sbinfo->used_blocks);
				3606	mpol_put(sbinfo->mpol);
				3607	kfree(sbinfo);
				3608	sb->s_fs_info = NULL;
				3609	}
				3610
				3611	static int shmem_fill_super(struct super_block sb, struct fs_context fc)
				3612	{
				3613	struct shmem_options *ctx = fc->fs_private;
				3614	struct inode *inode;
				3615	struct shmem_sb_info *sbinfo;
				3616	int err = -ENOMEM;
				3617
				3618	/* Round up to L1_CACHE_BYTES to resist false sharing */
				3619	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				3620	L1_CACHE_BYTES), GFP_KERNEL);
				3621	if (!sbinfo)
				3622	return -ENOMEM;
				3623
				3624	sb->s_fs_info = sbinfo;
				3625
				3626	#ifdef CONFIG_TMPFS
				3627	/*
				3628	* Per default we only allow half of the physical ram per
				3629	* tmpfs instance, limiting inodes to one per page of lowmem;
				3630	* but the internal instance is left unlimited.
				3631	*/
				3632	if (!(sb->s_flags & SB_KERNMOUNT)) {
				3633	if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
				3634	ctx->blocks = shmem_default_max_blocks();
				3635	if (!(ctx->seen & SHMEM_SEEN_INODES))
				3636	ctx->inodes = shmem_default_max_inodes();
				3637	} else {
				3638	sb->s_flags \|= SB_NOUSER;
				3639	}
				3640	sb->s_export_op = &shmem_export_ops;
				3641	sb->s_flags \|= SB_NOSEC;
				3642	#else
				3643	sb->s_flags \|= SB_NOUSER;
				3644	#endif
				3645	sbinfo->max_blocks = ctx->blocks;
				3646	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
				3647	sbinfo->uid = ctx->uid;
				3648	sbinfo->gid = ctx->gid;
				3649	sbinfo->mode = ctx->mode;
				3650	sbinfo->huge = ctx->huge;
				3651	sbinfo->mpol = ctx->mpol;
				3652	ctx->mpol = NULL;
				3653
				3654	spin_lock_init(&sbinfo->stat_lock);
				3655	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
				3656	goto failed;
				3657	spin_lock_init(&sbinfo->shrinklist_lock);
				3658	INIT_LIST_HEAD(&sbinfo->shrinklist);
				3659
				3660	sb->s_maxbytes = MAX_LFS_FILESIZE;
				3661	sb->s_blocksize = PAGE_SIZE;
				3662	sb->s_blocksize_bits = PAGE_SHIFT;
				3663	sb->s_magic = TMPFS_MAGIC;
				3664	sb->s_op = &shmem_ops;
				3665	sb->s_time_gran = 1;
				3666	#ifdef CONFIG_TMPFS_XATTR
				3667	sb->s_xattr = shmem_xattr_handlers;
				3668	#endif
				3669	#ifdef CONFIG_TMPFS_POSIX_ACL
				3670	sb->s_flags \|= SB_POSIXACL;
				3671	#endif
				3672	uuid_gen(&sb->s_uuid);
				3673
				3674	inode = shmem_get_inode(sb, NULL, S_IFDIR \| sbinfo->mode, 0, VM_NORESERVE);
				3675	if (!inode)
				3676	goto failed;
				3677	inode->i_uid = sbinfo->uid;
				3678	inode->i_gid = sbinfo->gid;
				3679	sb->s_root = d_make_root(inode);
				3680	if (!sb->s_root)
				3681	goto failed;
				3682	return 0;
				3683
				3684	failed:
				3685	shmem_put_super(sb);
				3686	return err;
				3687	}
				3688
				3689	static int shmem_get_tree(struct fs_context *fc)
				3690	{
				3691	return get_tree_nodev(fc, shmem_fill_super);
				3692	}
				3693
				3694	static void shmem_free_fc(struct fs_context *fc)
				3695	{
				3696	struct shmem_options *ctx = fc->fs_private;
				3697
				3698	if (ctx) {
				3699	mpol_put(ctx->mpol);
				3700	kfree(ctx);
				3701	}
				3702	}
				3703
				3704	static const struct fs_context_operations shmem_fs_context_ops = {
				3705	.free = shmem_free_fc,
				3706	.get_tree = shmem_get_tree,
				3707	#ifdef CONFIG_TMPFS
				3708	.parse_monolithic = shmem_parse_options,
				3709	.parse_param = shmem_parse_one,
				3710	.reconfigure = shmem_reconfigure,
				3711	#endif
				3712	};
				3713
				3714	static struct kmem_cache *shmem_inode_cachep;
				3715
				3716	static struct inode shmem_alloc_inode(struct super_block sb)
				3717	{
				3718	struct shmem_inode_info *info;
				3719	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
				3720	if (!info)
				3721	return NULL;
				3722	return &info->vfs_inode;
				3723	}
				3724
				3725	static void shmem_free_in_core_inode(struct inode *inode)
				3726	{
				3727	if (S_ISLNK(inode->i_mode))
				3728	kfree(inode->i_link);
				3729	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
				3730	}
				3731
				3732	static void shmem_destroy_inode(struct inode *inode)
				3733	{
				3734	if (S_ISREG(inode->i_mode))
				3735	mpol_free_shared_policy(&SHMEM_I(inode)->policy);
				3736	}
				3737
				3738	static void shmem_init_inode(void *foo)
				3739	{
				3740	struct shmem_inode_info *info = foo;
				3741	inode_init_once(&info->vfs_inode);
				3742	}
				3743
				3744	static void shmem_init_inodecache(void)
				3745	{
				3746	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				3747	sizeof(struct shmem_inode_info),
				3748	0, SLAB_PANIC\|SLAB_ACCOUNT, shmem_init_inode);
				3749	}
				3750
				3751	static void shmem_destroy_inodecache(void)
				3752	{
				3753	kmem_cache_destroy(shmem_inode_cachep);
				3754	}
				3755
				3756	static const struct address_space_operations shmem_aops = {
				3757	.writepage = shmem_writepage,
				3758	.set_page_dirty = __set_page_dirty_no_writeback,
				3759	#ifdef CONFIG_TMPFS
				3760	.write_begin = shmem_write_begin,
				3761	.write_end = shmem_write_end,
				3762	#endif
				3763	#ifdef CONFIG_MIGRATION
				3764	.migratepage = migrate_page,
				3765	#endif
				3766	.error_remove_page = generic_error_remove_page,
				3767	};
				3768
				3769	static const struct file_operations shmem_file_operations = {
				3770	.mmap = shmem_mmap,
				3771	.get_unmapped_area = shmem_get_unmapped_area,
				3772	#ifdef CONFIG_TMPFS
				3773	.llseek = shmem_file_llseek,
				3774	.read_iter = shmem_file_read_iter,
				3775	.write_iter = generic_file_write_iter,
				3776	.fsync = noop_fsync,
				3777	.splice_read = generic_file_splice_read,
				3778	.splice_write = iter_file_splice_write,
				3779	.fallocate = shmem_fallocate,
				3780	#endif
				3781	};
				3782
				3783	static const struct inode_operations shmem_inode_operations = {
				3784	.getattr = shmem_getattr,
				3785	.setattr = shmem_setattr,
				3786	#ifdef CONFIG_TMPFS_XATTR
				3787	.listxattr = shmem_listxattr,
				3788	.set_acl = simple_set_acl,
				3789	#endif
				3790	};
				3791
				3792	static const struct inode_operations shmem_dir_inode_operations = {
				3793	#ifdef CONFIG_TMPFS
				3794	.create = shmem_create,
				3795	.lookup = simple_lookup,
				3796	.link = shmem_link,
				3797	.unlink = shmem_unlink,
				3798	.symlink = shmem_symlink,
				3799	.mkdir = shmem_mkdir,
				3800	.rmdir = shmem_rmdir,
				3801	.mknod = shmem_mknod,
				3802	.rename = shmem_rename2,
				3803	.tmpfile = shmem_tmpfile,
				3804	#endif
				3805	#ifdef CONFIG_TMPFS_XATTR
				3806	.listxattr = shmem_listxattr,
				3807	#endif
				3808	#ifdef CONFIG_TMPFS_POSIX_ACL
				3809	.setattr = shmem_setattr,
				3810	.set_acl = simple_set_acl,
				3811	#endif
				3812	};
				3813
				3814	static const struct inode_operations shmem_special_inode_operations = {
				3815	#ifdef CONFIG_TMPFS_XATTR
				3816	.listxattr = shmem_listxattr,
				3817	#endif
				3818	#ifdef CONFIG_TMPFS_POSIX_ACL
				3819	.setattr = shmem_setattr,
				3820	.set_acl = simple_set_acl,
				3821	#endif
				3822	};
				3823
				3824	static const struct super_operations shmem_ops = {
				3825	.alloc_inode = shmem_alloc_inode,
				3826	.free_inode = shmem_free_in_core_inode,
				3827	.destroy_inode = shmem_destroy_inode,
				3828	#ifdef CONFIG_TMPFS
				3829	.statfs = shmem_statfs,
				3830	.show_options = shmem_show_options,
				3831	#endif
				3832	.evict_inode = shmem_evict_inode,
				3833	.drop_inode = generic_delete_inode,
				3834	.put_super = shmem_put_super,
				3835	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3836	.nr_cached_objects = shmem_unused_huge_count,
				3837	.free_cached_objects = shmem_unused_huge_scan,
				3838	#endif
				3839	};
				3840
				3841	static const struct vm_operations_struct shmem_vm_ops = {
				3842	.fault = shmem_fault,
				3843	.map_pages = filemap_map_pages,
				3844	#ifdef CONFIG_NUMA
				3845	.set_policy = shmem_set_policy,
				3846	.get_policy = shmem_get_policy,
				3847	#endif
				3848	};
				3849
				3850	int shmem_init_fs_context(struct fs_context *fc)
				3851	{
				3852	struct shmem_options *ctx;
				3853
				3854	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
				3855	if (!ctx)
				3856	return -ENOMEM;
				3857
				3858	ctx->mode = 0777 \| S_ISVTX;
				3859	ctx->uid = current_fsuid();
				3860	ctx->gid = current_fsgid();
				3861
				3862	fc->fs_private = ctx;
				3863	fc->ops = &shmem_fs_context_ops;
				3864	return 0;
				3865	}
				3866
				3867	static struct file_system_type shmem_fs_type = {
				3868	.owner = THIS_MODULE,
				3869	.name = "tmpfs",
				3870	.init_fs_context = shmem_init_fs_context,
				3871	#ifdef CONFIG_TMPFS
				3872	.parameters = &shmem_fs_parameters,
				3873	#endif
				3874	.kill_sb = kill_litter_super,
				3875	.fs_flags = FS_USERNS_MOUNT,
				3876	};
				3877
				3878	int __init shmem_init(void)
				3879	{
				3880	int error;
				3881
				3882	shmem_init_inodecache();
				3883
				3884	error = register_filesystem(&shmem_fs_type);
				3885	if (error) {
				3886	pr_err("Could not register tmpfs\n");
				3887	goto out2;
				3888	}
				3889
				3890	shm_mnt = kern_mount(&shmem_fs_type);
				3891	if (IS_ERR(shm_mnt)) {
				3892	error = PTR_ERR(shm_mnt);
				3893	pr_err("Could not kern_mount tmpfs\n");
				3894	goto out1;
				3895	}
				3896
				3897	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3898	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
				3899	SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
				3900	else
				3901	shmem_huge = 0; /* just in case it was patched */
				3902	#endif
				3903	return 0;
				3904
				3905	out1:
				3906	unregister_filesystem(&shmem_fs_type);
				3907	out2:
				3908	shmem_destroy_inodecache();
				3909	shm_mnt = ERR_PTR(error);
				3910	return error;
				3911	}
				3912
				3913	#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
				3914	static ssize_t shmem_enabled_show(struct kobject *kobj,
				3915	struct kobj_attribute attr, char buf)
				3916	{
				3917	int values[] = {
				3918	SHMEM_HUGE_ALWAYS,
				3919	SHMEM_HUGE_WITHIN_SIZE,
				3920	SHMEM_HUGE_ADVISE,
				3921	SHMEM_HUGE_NEVER,
				3922	SHMEM_HUGE_DENY,
				3923	SHMEM_HUGE_FORCE,
				3924	};
				3925	int i, count;
				3926
				3927	for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
				3928	const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
				3929
				3930	count += sprintf(buf + count, fmt,
				3931	shmem_format_huge(values[i]));
				3932	}
				3933	buf[count - 1] = '\n';
				3934	return count;
				3935	}
				3936
				3937	static ssize_t shmem_enabled_store(struct kobject *kobj,
				3938	struct kobj_attribute attr, const char buf, size_t count)
				3939	{
				3940	char tmp[16];
				3941	int huge;
				3942
				3943	if (count + 1 > sizeof(tmp))
				3944	return -EINVAL;
				3945	memcpy(tmp, buf, count);
				3946	tmp[count] = '\0';
				3947	if (count && tmp[count - 1] == '\n')
				3948	tmp[count - 1] = '\0';
				3949
				3950	huge = shmem_parse_huge(tmp);
				3951	if (huge == -EINVAL)
				3952	return -EINVAL;
				3953	if (!has_transparent_hugepage() &&
				3954	huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
				3955	return -EINVAL;
				3956
				3957	shmem_huge = huge;
				3958	if (shmem_huge > SHMEM_HUGE_DENY)
				3959	SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
				3960	return count;
				3961	}
				3962
				3963	struct kobj_attribute shmem_enabled_attr =
				3964	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
				3965	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
				3966
				3967	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3968	bool shmem_huge_enabled(struct vm_area_struct *vma)
				3969	{
				3970	struct inode *inode = file_inode(vma->vm_file);
				3971	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				3972	loff_t i_size;
				3973	pgoff_t off;
				3974
				3975	if ((vma->vm_flags & VM_NOHUGEPAGE) \|\|
				3976	test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
				3977	return false;
				3978	if (shmem_huge == SHMEM_HUGE_FORCE)
				3979	return true;
				3980	if (shmem_huge == SHMEM_HUGE_DENY)
				3981	return false;
				3982	switch (sbinfo->huge) {
				3983	case SHMEM_HUGE_NEVER:
				3984	return false;
				3985	case SHMEM_HUGE_ALWAYS:
				3986	return true;
				3987	case SHMEM_HUGE_WITHIN_SIZE:
				3988	off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
				3989	i_size = round_up(i_size_read(inode), PAGE_SIZE);
				3990	if (i_size >= HPAGE_PMD_SIZE &&
				3991	i_size >> PAGE_SHIFT >= off)
				3992	return true;
				3993	/* fall through */
				3994	case SHMEM_HUGE_ADVISE:
				3995	/* TODO: implement fadvise() hints */
				3996	return (vma->vm_flags & VM_HUGEPAGE);
				3997	default:
				3998	VM_BUG_ON(1);
				3999	return false;
				4000	}
				4001	}
				4002	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				4003
				4004	#else /* !CONFIG_SHMEM */
				4005
				4006	/*
				4007	* tiny-shmem: simple shmemfs and tmpfs using ramfs code
				4008	*
				4009	* This is intended for small system where the benefits of the full
				4010	* shmem code (swap-backed and resource-limited) are outweighed by
				4011	* their complexity. On systems without swap this code should be
				4012	* effectively equivalent, but much lighter weight.
				4013	*/
				4014
				4015	static struct file_system_type shmem_fs_type = {
				4016	.name = "tmpfs",
				4017	.init_fs_context = ramfs_init_fs_context,
				4018	.parameters = &ramfs_fs_parameters,
				4019	.kill_sb = kill_litter_super,
				4020	.fs_flags = FS_USERNS_MOUNT,
				4021	};
				4022
				4023	int __init shmem_init(void)
				4024	{
				4025	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
				4026
				4027	shm_mnt = kern_mount(&shmem_fs_type);
				4028	BUG_ON(IS_ERR(shm_mnt));
				4029
				4030	return 0;
				4031	}
				4032
				4033	int shmem_unuse(unsigned int type, bool frontswap,
				4034	unsigned long *fs_pages_to_unuse)
				4035	{
				4036	return 0;
				4037	}
				4038
				4039	int shmem_lock(struct file file, int lock, struct user_struct user)
				4040	{
				4041	return 0;
				4042	}
				4043
				4044	void shmem_unlock_mapping(struct address_space *mapping)
				4045	{
				4046	}
				4047
				4048	#ifdef CONFIG_MMU
				4049	unsigned long shmem_get_unmapped_area(struct file *file,
				4050	unsigned long addr, unsigned long len,
				4051	unsigned long pgoff, unsigned long flags)
				4052	{
				4053	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
				4054	}
				4055	#endif
				4056
				4057	void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
				4058	{
				4059	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
				4060	}
				4061	EXPORT_SYMBOL_GPL(shmem_truncate_range);
				4062
				4063	#define shmem_vm_ops generic_file_vm_ops
				4064	#define shmem_file_operations ramfs_file_operations
				4065	#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
				4066	#define shmem_acct_size(flags, size) 0
				4067	#define shmem_unacct_size(flags, size) do {} while (0)
				4068
				4069	#endif /* CONFIG_SHMEM */
				4070
				4071	/* common code */
				4072
				4073	static struct file __shmem_file_setup(struct vfsmount mnt, const char *name, loff_t size,
				4074	unsigned long flags, unsigned int i_flags)
				4075	{
				4076	struct inode *inode;
				4077	struct file *res;
				4078
				4079	if (IS_ERR(mnt))
				4080	return ERR_CAST(mnt);
				4081
				4082	if (size < 0 \|\| size > MAX_LFS_FILESIZE)
				4083	return ERR_PTR(-EINVAL);
				4084
				4085	if (shmem_acct_size(flags, size))
				4086	return ERR_PTR(-ENOMEM);
				4087
				4088	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG \| S_IRWXUGO, 0,
				4089	flags);
				4090	if (unlikely(!inode)) {
				4091	shmem_unacct_size(flags, size);
				4092	return ERR_PTR(-ENOSPC);
				4093	}
				4094	inode->i_flags \|= i_flags;
				4095	inode->i_size = size;
				4096	clear_nlink(inode); /* It is unlinked */
				4097	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
				4098	if (!IS_ERR(res))
				4099	res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				4100	&shmem_file_operations);
				4101	if (IS_ERR(res))
				4102	iput(inode);
				4103	return res;
				4104	}
				4105
				4106	/**
				4107	* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
				4108	* kernel internal. There will be NO LSM permission checks against the
				4109	* underlying inode. So users of this interface must do LSM checks at a
				4110	* higher layer. The users are the big_key and shm implementations. LSM
				4111	* checks are provided at the key or shm level rather than the inode.
				4112	* @name: name for dentry (to be seen in /proc/<pid>/maps
				4113	* @size: size to be set for the file
				4114	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				4115	*/
				4116	struct file shmem_kernel_file_setup(const char name, loff_t size, unsigned long flags)
				4117	{
				4118	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
				4119	}
				4120
				4121	/**
				4122	* shmem_file_setup - get an unlinked file living in tmpfs
				4123	* @name: name for dentry (to be seen in /proc/<pid>/maps
				4124	* @size: size to be set for the file
				4125	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				4126	*/
				4127	struct file shmem_file_setup(const char name, loff_t size, unsigned long flags)
				4128	{
				4129	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
				4130	}
				4131	EXPORT_SYMBOL_GPL(shmem_file_setup);
				4132
				4133	/**
				4134	* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
				4135	* @mnt: the tmpfs mount where the file will be created
				4136	* @name: name for dentry (to be seen in /proc/<pid>/maps
				4137	* @size: size to be set for the file
				4138	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				4139	*/
				4140	struct file shmem_file_setup_with_mnt(struct vfsmount mnt, const char *name,
				4141	loff_t size, unsigned long flags)
				4142	{
				4143	return __shmem_file_setup(mnt, name, size, flags, 0);
				4144	}
				4145	EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
				4146
				4147	/**
				4148	* shmem_zero_setup - setup a shared anonymous mapping
				4149	* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
				4150	*/
				4151	int shmem_zero_setup(struct vm_area_struct *vma)
				4152	{
				4153	struct file *file;
				4154	loff_t size = vma->vm_end - vma->vm_start;
				4155
				4156	/*
				4157	* Cloning a new file under mmap_sem leads to a lock ordering conflict
				4158	* between XFS directory reading and selinux: since this file is only
				4159	* accessible to the user through its mapping, use S_PRIVATE flag to
				4160	* bypass file security, in the same way as shmem_kernel_file_setup().
				4161	*/
				4162	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
				4163	if (IS_ERR(file))
				4164	return PTR_ERR(file);
				4165
				4166	if (vma->vm_file)
				4167	fput(vma->vm_file);
				4168	vma->vm_file = file;
				4169	vma->vm_ops = &shmem_vm_ops;
				4170
				4171	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				4172	((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
				4173	(vma->vm_end & HPAGE_PMD_MASK)) {
				4174	khugepaged_enter(vma, vma->vm_flags);
				4175	}
				4176
				4177	return 0;
				4178	}
				4179
				4180	/**
				4181	* shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
				4182	* @mapping: the page's address_space
				4183	* @index: the page index
				4184	* @gfp: the page allocator flags to use if allocating
				4185	*
				4186	* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
				4187	* with any new page allocations done using the specified allocation flags.
				4188	* But read_cache_page_gfp() uses the ->readpage() method: which does not
				4189	* suit tmpfs, since it may have pages in swapcache, and needs to find those
				4190	* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
				4191	*
				4192	* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY \| __GFP_NOWARN in
				4193	* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
				4194	*/
				4195	struct page shmem_read_mapping_page_gfp(struct address_space mapping,
				4196	pgoff_t index, gfp_t gfp)
				4197	{
				4198	#ifdef CONFIG_SHMEM
				4199	struct inode *inode = mapping->host;
				4200	struct page *page;
				4201	int error;
				4202
				4203	BUG_ON(mapping->a_ops != &shmem_aops);
				4204	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
				4205	gfp, NULL, NULL, NULL);
				4206	if (error)
				4207	page = ERR_PTR(error);
				4208	else
				4209	unlock_page(page);
				4210	return page;
				4211	#else
				4212	/*
				4213	* The tiny !SHMEM case uses ramfs without swap
				4214	*/
				4215	return read_cache_page_gfp(mapping, index, gfp);
				4216	#endif
				4217	}
				4218	EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);