Blame - src/kernel/linux/v4.19/mm/shmem.c - T800

blob: f730e2eeda96a0a7a9eb0ffbb33f1a026150abc7 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Resizable virtual memory filesystem for Linux.
				3	*
				4	* Copyright (C) 2000 Linus Torvalds.
				5	* 2000 Transmeta Corp.
				6	* 2000-2001 Christoph Rohland
				7	* 2000-2001 SAP AG
				8	* 2002 Red Hat Inc.
				9	* Copyright (C) 2002-2011 Hugh Dickins.
				10	* Copyright (C) 2011 Google Inc.
				11	* Copyright (C) 2002-2005 VERITAS Software Corporation.
				12	* Copyright (C) 2004 Andi Kleen, SuSE Labs
				13	*
				14	* Extended attribute support for tmpfs:
				15	* Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
				16	* Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
				17	*
				18	* tiny-shmem:
				19	* Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
				20	*
				21	* This file is released under the GPL.
				22	*/
				23
				24	#include <linux/fs.h>
				25	#include <linux/init.h>
				26	#include <linux/vfs.h>
				27	#include <linux/mount.h>
				28	#include <linux/ramfs.h>
				29	#include <linux/pagemap.h>
				30	#include <linux/file.h>
				31	#include <linux/mm.h>
				32	#include <linux/random.h>
				33	#include <linux/sched/signal.h>
				34	#include <linux/export.h>
				35	#include <linux/swap.h>
				36	#include <linux/uio.h>
				37	#include <linux/khugepaged.h>
				38	#include <linux/hugetlb.h>
				39
				40	#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
				41
				42	static struct vfsmount *shm_mnt; /* mount handle for this filesystem; assigned outside this section (NOTE(review): presumably the kernel-internal shmem mount — confirm) */
				43
				44	#ifdef CONFIG_SHMEM
				45	/*
				46	* This virtual memory filesystem is heavily based on the ramfs. It
				47	* extends ramfs by the ability to use swap and honor resource limits
				48	* which makes it a completely usable filesystem.
				49	*/
				50
				51	#include <linux/xattr.h>
				52	#include <linux/exportfs.h>
				53	#include <linux/posix_acl.h>
				54	#include <linux/posix_acl_xattr.h>
				55	#include <linux/mman.h>
				56	#include <linux/string.h>
				57	#include <linux/slab.h>
				58	#include <linux/backing-dev.h>
				59	#include <linux/shmem_fs.h>
				60	#include <linux/writeback.h>
				61	#include <linux/blkdev.h>
				62	#include <linux/pagevec.h>
				63	#include <linux/percpu_counter.h>
				64	#include <linux/falloc.h>
				65	#include <linux/splice.h>
				66	#include <linux/security.h>
				67	#include <linux/swapops.h>
				68	#include <linux/mempolicy.h>
				69	#include <linux/namei.h>
				70	#include <linux/ctype.h>
				71	#include <linux/migrate.h>
				72	#include <linux/highmem.h>
				73	#include <linux/seq_file.h>
				74	#include <linux/magic.h>
				75	#include <linux/syscalls.h>
				76	#include <linux/fcntl.h>
				77	#include <uapi/linux/memfd.h>
				78	#include <linux/userfaultfd_k.h>
				79	#include <linux/rmap.h>
				80	#include <linux/uuid.h>
				81
				82	#include <linux/uaccess.h>
				83	#include <asm/pgtable.h>
				84
				85	#include "internal.h"
				86
				87	#define BLOCKS_PER_PAGE (PAGE_SIZE/512) /* number of 512-byte units in one page; used to scale i_blocks updates */
				88	#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* byte size rounded up to whole pages, expressed as a page count */
				89
				90	/* Pretend that each entry is of this size in directory's i_size */
				91	#define BOGO_DIRENT_SIZE 20
				92
				93	/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
				94	#define SHORT_SYMLINK_LEN 128
				95
				96	/*
				97	* shmem_fallocate communicates with shmem_fault or shmem_writepage via
				98	* inode->i_private (with i_mutex making sure that it has only one user at
				99	* a time): we would prefer not to enlarge the shmem inode just for that.
				100	*/
				101	struct shmem_falloc {
				102	wait_queue_head_t waitq; / faults into hole wait for punch to end */
				103	pgoff_t start; /* start of range currently being fallocated */
				104	pgoff_t next; /* the next page offset to be fallocated */
				105	pgoff_t nr_falloced; /* how many new pages have been fallocated */
				106	pgoff_t nr_unswapped; /* how often writepage refused to swap out */
				107	};
				108
				109	#ifdef CONFIG_TMPFS
				110	static unsigned long shmem_default_max_blocks(void)
				111	{
				112	return totalram_pages / 2;
				113	}
				114
				115	static unsigned long shmem_default_max_inodes(void)
				116	{
				117	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
				118	}
				119	#endif
				120
				121	static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
				122	static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				123	struct shmem_inode_info *info, pgoff_t index);
				124	static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
				125	struct page **pagep, enum sgp_type sgp,
				126	gfp_t gfp, struct vm_area_struct *vma,
				127	struct vm_fault vmf, vm_fault_t fault_type);
				128
				129	int shmem_getpage(struct inode *inode, pgoff_t index,
				130	struct page **pagep, enum sgp_type sgp)
				131	{
				132	return shmem_getpage_gfp(inode, index, pagep, sgp,
				133	mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
				134	}
				135
				136	static inline struct shmem_sb_info SHMEM_SB(struct super_block sb)
				137	{
				138	return sb->s_fs_info;
				139	}
				140
				141	/*
				142	* shmem_file_setup pre-accounts the whole fixed size of a VM object,
				143	* for shared memory and for shared anonymous (/dev/zero) mappings
				144	* (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
				145	* consistent with the pre-accounting of private mappings ...
				146	*/
				147	static inline int shmem_acct_size(unsigned long flags, loff_t size)
				148	{
				149	return (flags & VM_NORESERVE) ?
				150	0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
				151	}
				152
				153	static inline void shmem_unacct_size(unsigned long flags, loff_t size)
				154	{
				155	if (!(flags & VM_NORESERVE))
				156	vm_unacct_memory(VM_ACCT(size));
				157	}
				158
				159	static inline int shmem_reacct_size(unsigned long flags,
				160	loff_t oldsize, loff_t newsize)
				161	{
				162	if (!(flags & VM_NORESERVE)) {
				163	if (VM_ACCT(newsize) > VM_ACCT(oldsize))
				164	return security_vm_enough_memory_mm(current->mm,
				165	VM_ACCT(newsize) - VM_ACCT(oldsize));
				166	else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
				167	vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
				168	}
				169	return 0;
				170	}
				171
				172	/*
				173	* ... whereas tmpfs objects are accounted incrementally as
				174	* pages are allocated, in order to allow large sparse files.
				175	* shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
				176	* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
				177	*/
				178	static inline int shmem_acct_block(unsigned long flags, long pages)
				179	{
				180	if (!(flags & VM_NORESERVE))
				181	return 0;
				182
				183	return security_vm_enough_memory_mm(current->mm,
				184	pages * VM_ACCT(PAGE_SIZE));
				185	}
				186
				187	static inline void shmem_unacct_blocks(unsigned long flags, long pages)
				188	{
				189	if (flags & VM_NORESERVE)
				190	vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
				191	}
				192
				193	static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
				194	{
				195	struct shmem_inode_info *info = SHMEM_I(inode);
				196	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				197
				198	if (shmem_acct_block(info->flags, pages))
				199	return false;
				200
				201	if (sbinfo->max_blocks) {
				202	if (percpu_counter_compare(&sbinfo->used_blocks,
				203	sbinfo->max_blocks - pages) > 0)
				204	goto unacct;
				205	percpu_counter_add(&sbinfo->used_blocks, pages);
				206	}
				207
				208	return true;
				209
				210	unacct:
				211	shmem_unacct_blocks(info->flags, pages);
				212	return false;
				213	}
				214
				215	static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
				216	{
				217	struct shmem_inode_info *info = SHMEM_I(inode);
				218	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				219
				220	if (sbinfo->max_blocks)
				221	percpu_counter_sub(&sbinfo->used_blocks, pages);
				222	shmem_unacct_blocks(info->flags, pages);
				223	}
				224
				225	static const struct super_operations shmem_ops; /* this and the following lines declare the file's operation tables and fs type ahead of use; their initializers are not in this section */
				226	static const struct address_space_operations shmem_aops;
				227	static const struct file_operations shmem_file_operations;
				228	static const struct inode_operations shmem_inode_operations;
				229	static const struct inode_operations shmem_dir_inode_operations;
				230	static const struct inode_operations shmem_special_inode_operations;
				231	static const struct vm_operations_struct shmem_vm_ops; /* compared against vma->vm_ops by vma_is_shmem() */
				232	static struct file_system_type shmem_fs_type;
				233
				234	bool vma_is_shmem(struct vm_area_struct *vma)
				235	{
				236	return vma->vm_ops == &shmem_vm_ops;
				237	}
				238
				239	static LIST_HEAD(shmem_swaplist); /* NOTE(review): presumably links shmem inodes that have pages in swap; users are outside this section — confirm */
				240	static DEFINE_MUTEX(shmem_swaplist_mutex); /* NOTE(review): presumably serializes access to shmem_swaplist — confirm */
				241
				242	static int shmem_reserve_inode(struct super_block *sb)
				243	{
				244	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				245	if (sbinfo->max_inodes) {
				246	spin_lock(&sbinfo->stat_lock);
				247	if (!sbinfo->free_inodes) {
				248	spin_unlock(&sbinfo->stat_lock);
				249	return -ENOSPC;
				250	}
				251	sbinfo->free_inodes--;
				252	spin_unlock(&sbinfo->stat_lock);
				253	}
				254	return 0;
				255	}
				256
				257	static void shmem_free_inode(struct super_block *sb)
				258	{
				259	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				260	if (sbinfo->max_inodes) {
				261	spin_lock(&sbinfo->stat_lock);
				262	sbinfo->free_inodes++;
				263	spin_unlock(&sbinfo->stat_lock);
				264	}
				265	}
				266
				267	/**
				268	* shmem_recalc_inode - recalculate the block usage of an inode
				269	* @inode: inode to recalc
				270	*
				271	* We have to calculate the free blocks since the mm can drop
				272	* undirtied hole pages behind our back.
				273	*
				274	* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
				275	* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
				276	*
				277	* It has to be called with the spinlock held.
				278	*/
				279	static void shmem_recalc_inode(struct inode *inode)
				280	{
				281	struct shmem_inode_info *info = SHMEM_I(inode);
				282	long freed;
				283
				284	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
				285	if (freed > 0) {
				286	info->alloced -= freed;
				287	inode->i_blocks -= freed * BLOCKS_PER_PAGE;
				288	shmem_inode_unacct_blocks(inode, freed);
				289	}
				290	}
				291
				292	bool shmem_charge(struct inode *inode, long pages)
				293	{
				294	struct shmem_inode_info *info = SHMEM_I(inode);
				295	unsigned long flags;
				296
				297	if (!shmem_inode_acct_block(inode, pages))
				298	return false;
				299
				300	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
				301	inode->i_mapping->nrpages += pages;
				302
				303	spin_lock_irqsave(&info->lock, flags);
				304	info->alloced += pages;
				305	inode->i_blocks += pages * BLOCKS_PER_PAGE;
				306	shmem_recalc_inode(inode);
				307	spin_unlock_irqrestore(&info->lock, flags);
				308
				309	return true;
				310	}
				311
				312	void shmem_uncharge(struct inode *inode, long pages)
				313	{
				314	struct shmem_inode_info *info = SHMEM_I(inode);
				315	unsigned long flags;
				316
				317	/* nrpages adjustment done by __delete_from_page_cache() or caller */
				318
				319	spin_lock_irqsave(&info->lock, flags);
				320	info->alloced -= pages;
				321	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
				322	shmem_recalc_inode(inode);
				323	spin_unlock_irqrestore(&info->lock, flags);
				324
				325	shmem_inode_unacct_blocks(inode, pages);
				326	}
				327
				328	/*
				329	* Replace item expected in radix tree by a new item, while holding tree lock.
				330	*/
				331	static int shmem_radix_tree_replace(struct address_space *mapping,
				332	pgoff_t index, void expected, void replacement)
				333	{
				334	struct radix_tree_node *node;
				335	void __rcu **pslot;
				336	void *item;
				337
				338	VM_BUG_ON(!expected);
				339	VM_BUG_ON(!replacement);
				340	item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
				341	if (!item)
				342	return -ENOENT;
				343	if (item != expected)
				344	return -ENOENT;
				345	__radix_tree_replace(&mapping->i_pages, node, pslot,
				346	replacement, NULL);
				347	return 0;
				348	}
				349
				350	/*
				351	* Sometimes, before we decide whether to proceed or to fail, we must check
				352	* that an entry was not already brought back from swap by a racing thread.
				353	*
				354	* Checking page is not enough: by the time a SwapCache page is locked, it
				355	* might be reused, and again be SwapCache, using the same swap as before.
				356	*/
				357	static bool shmem_confirm_swap(struct address_space *mapping,
				358	pgoff_t index, swp_entry_t swap)
				359	{
				360	void *item;
				361
				362	rcu_read_lock();
				363	item = radix_tree_lookup(&mapping->i_pages, index);
				364	rcu_read_unlock();
				365	return item == swp_to_radix_entry(swap);
				366	}
				367
				368	/*
				369	* Definitions for "huge tmpfs": tmpfs mounted with the huge= option
				370	*
				371	* SHMEM_HUGE_NEVER:
				372	* disables huge pages for the mount;
				373	* SHMEM_HUGE_ALWAYS:
				374	* enables huge pages for the mount;
				375	* SHMEM_HUGE_WITHIN_SIZE:
				376	* only allocate huge pages if the page will be fully within i_size,
				377	* also respect fadvise()/madvise() hints;
				378	* SHMEM_HUGE_ADVISE:
				379	* only allocate huge pages if requested with fadvise()/madvise();
				380	*/
				381
				382	#define SHMEM_HUGE_NEVER 0 /* huge pages disabled for the mount */
				383	#define SHMEM_HUGE_ALWAYS 1 /* huge pages enabled for the mount */
				384	#define SHMEM_HUGE_WITHIN_SIZE 2 /* huge only when the page lies fully within i_size; also honours fadvise()/madvise() hints */
				385	#define SHMEM_HUGE_ADVISE 3 /* huge only when requested with fadvise()/madvise() */
				386
				387	/*
				388	* Special values.
				389	* Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
				390	*
				391	* SHMEM_HUGE_DENY:
				392	* disables huge on shm_mnt and all mounts, for emergency use;
				393	* SHMEM_HUGE_FORCE:
				394	* enables huge on shm_mnt and all mounts, w/o needing option, for testing;
				395	*
				396	*/
				397	#define SHMEM_HUGE_DENY (-1) /* emergency override: no huge pages on shm_mnt or any mount */
				398	#define SHMEM_HUGE_FORCE (-2) /* testing override: huge pages on shm_mnt and every mount */
				399
				400	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				401	/* ifdef here to avoid bloating shmem.o when not necessary */
				402
				403	static int shmem_huge __read_mostly; /* global SHMEM_HUGE_* setting; is_huge_enabled() tests it against SHMEM_HUGE_FORCE and SHMEM_HUGE_DENY */
				404
				405	#if defined(CONFIG_SYSFS) \|\| defined(CONFIG_TMPFS)
				406	static int shmem_parse_huge(const char *str)
				407	{
				408	if (!strcmp(str, "never"))
				409	return SHMEM_HUGE_NEVER;
				410	if (!strcmp(str, "always"))
				411	return SHMEM_HUGE_ALWAYS;
				412	if (!strcmp(str, "within_size"))
				413	return SHMEM_HUGE_WITHIN_SIZE;
				414	if (!strcmp(str, "advise"))
				415	return SHMEM_HUGE_ADVISE;
				416	if (!strcmp(str, "deny"))
				417	return SHMEM_HUGE_DENY;
				418	if (!strcmp(str, "force"))
				419	return SHMEM_HUGE_FORCE;
				420	return -EINVAL;
				421	}
				422
				423	static const char *shmem_format_huge(int huge)
				424	{
				425	switch (huge) {
				426	case SHMEM_HUGE_NEVER:
				427	return "never";
				428	case SHMEM_HUGE_ALWAYS:
				429	return "always";
				430	case SHMEM_HUGE_WITHIN_SIZE:
				431	return "within_size";
				432	case SHMEM_HUGE_ADVISE:
				433	return "advise";
				434	case SHMEM_HUGE_DENY:
				435	return "deny";
				436	case SHMEM_HUGE_FORCE:
				437	return "force";
				438	default:
				439	VM_BUG_ON(1);
				440	return "bad_val";
				441	}
				442	}
				443	#endif
				444
				445	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
				446	struct shrink_control *sc, unsigned long nr_to_split)
				447	{
				448	LIST_HEAD(list), pos, next;
				449	LIST_HEAD(to_remove);
				450	struct inode *inode;
				451	struct shmem_inode_info *info;
				452	struct page *page;
				453	unsigned long batch = sc ? sc->nr_to_scan : 128;
				454	int removed = 0, split = 0;
				455
				456	if (list_empty(&sbinfo->shrinklist))
				457	return SHRINK_STOP;
				458
				459	spin_lock(&sbinfo->shrinklist_lock);
				460	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
				461	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				462
				463	/* pin the inode */
				464	inode = igrab(&info->vfs_inode);
				465
				466	/* inode is about to be evicted */
				467	if (!inode) {
				468	list_del_init(&info->shrinklist);
				469	removed++;
				470	goto next;
				471	}
				472
				473	/* Check if there's anything to gain */
				474	if (round_up(inode->i_size, PAGE_SIZE) ==
				475	round_up(inode->i_size, HPAGE_PMD_SIZE)) {
				476	list_move(&info->shrinklist, &to_remove);
				477	removed++;
				478	goto next;
				479	}
				480
				481	list_move(&info->shrinklist, &list);
				482	next:
				483	if (!--batch)
				484	break;
				485	}
				486	spin_unlock(&sbinfo->shrinklist_lock);
				487
				488	list_for_each_safe(pos, next, &to_remove) {
				489	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				490	inode = &info->vfs_inode;
				491	list_del_init(&info->shrinklist);
				492	iput(inode);
				493	}
				494
				495	list_for_each_safe(pos, next, &list) {
				496	int ret;
				497
				498	info = list_entry(pos, struct shmem_inode_info, shrinklist);
				499	inode = &info->vfs_inode;
				500
				501	if (nr_to_split && split >= nr_to_split)
				502	goto leave;
				503
				504	page = find_get_page(inode->i_mapping,
				505	(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
				506	if (!page)
				507	goto drop;
				508
				509	/* No huge page at the end of the file: nothing to split */
				510	if (!PageTransHuge(page)) {
				511	put_page(page);
				512	goto drop;
				513	}
				514
				515	/*
				516	* Leave the inode on the list if we failed to lock
				517	* the page at this time.
				518	*
				519	* Waiting for the lock may lead to deadlock in the
				520	* reclaim path.
				521	*/
				522	if (!trylock_page(page)) {
				523	put_page(page);
				524	goto leave;
				525	}
				526
				527	ret = split_huge_page(page);
				528	unlock_page(page);
				529	put_page(page);
				530
				531	/* If split failed leave the inode on the list */
				532	if (ret)
				533	goto leave;
				534
				535	split++;
				536	drop:
				537	list_del_init(&info->shrinklist);
				538	removed++;
				539	leave:
				540	iput(inode);
				541	}
				542
				543	spin_lock(&sbinfo->shrinklist_lock);
				544	list_splice_tail(&list, &sbinfo->shrinklist);
				545	sbinfo->shrinklist_len -= removed;
				546	spin_unlock(&sbinfo->shrinklist_lock);
				547
				548	return split;
				549	}
				550
				551	static long shmem_unused_huge_scan(struct super_block *sb,
				552	struct shrink_control *sc)
				553	{
				554	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				555
				556	if (!READ_ONCE(sbinfo->shrinklist_len))
				557	return SHRINK_STOP;
				558
				559	return shmem_unused_huge_shrink(sbinfo, sc, 0);
				560	}
				561
				562	static long shmem_unused_huge_count(struct super_block *sb,
				563	struct shrink_control *sc)
				564	{
				565	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				566	return READ_ONCE(sbinfo->shrinklist_len);
				567	}
				568	#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				569
				570	#define shmem_huge SHMEM_HUGE_DENY /* built without CONFIG_TRANSPARENT_HUGE_PAGECACHE: the global setting is fixed at "deny" */
				571
				572	static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
				573	struct shrink_control *sc, unsigned long nr_to_split)
				574	{
					/* Stub for builds without CONFIG_TRANSPARENT_HUGE_PAGECACHE: all arguments are ignored and 0 is reported. */
				575	return 0;
				576	}
				577	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				578
				579	static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
				580	{
				581	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				582	(shmem_huge == SHMEM_HUGE_FORCE \|\| sbinfo->huge) &&
				583	shmem_huge != SHMEM_HUGE_DENY)
				584	return true;
				585	return false;
				586	}
				587
				588	/*
				589	* Like add_to_page_cache_locked, but error if expected item has gone.
				590	*/
				591	static int shmem_add_to_page_cache(struct page *page,
				592	struct address_space *mapping,
				593	pgoff_t index, void *expected)
				594	{
				595	int error, nr = hpage_nr_pages(page);
				596
				597	VM_BUG_ON_PAGE(PageTail(page), page);
				598	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
				599	VM_BUG_ON_PAGE(!PageLocked(page), page);
				600	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
				601	VM_BUG_ON(expected && PageTransHuge(page));
				602
				603	page_ref_add(page, nr);
				604	page->mapping = mapping;
				605	page->index = index;
				606
				607	xa_lock_irq(&mapping->i_pages);
				608	if (PageTransHuge(page)) {
				609	void __rcu **results;
				610	pgoff_t idx;
				611	int i;
				612
				613	error = 0;
				614	if (radix_tree_gang_lookup_slot(&mapping->i_pages,
				615	&results, &idx, index, 1) &&
				616	idx < index + HPAGE_PMD_NR) {
				617	error = -EEXIST;
				618	}
				619
				620	if (!error) {
				621	for (i = 0; i < HPAGE_PMD_NR; i++) {
				622	error = radix_tree_insert(&mapping->i_pages,
				623	index + i, page + i);
				624	VM_BUG_ON(error);
				625	}
				626	count_vm_event(THP_FILE_ALLOC);
				627	}
				628	} else if (!expected) {
				629	error = radix_tree_insert(&mapping->i_pages, index, page);
				630	} else {
				631	error = shmem_radix_tree_replace(mapping, index, expected,
				632	page);
				633	}
				634
				635	if (!error) {
				636	mapping->nrpages += nr;
				637	if (PageTransHuge(page))
				638	__inc_node_page_state(page, NR_SHMEM_THPS);
				639	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
				640	__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
				641	xa_unlock_irq(&mapping->i_pages);
				642	} else {
				643	page->mapping = NULL;
				644	xa_unlock_irq(&mapping->i_pages);
				645	page_ref_sub(page, nr);
				646	}
				647	return error;
				648	}
				649
				650	/*
				651	* Like delete_from_page_cache, but substitutes swap for page.
				652	*/
				653	static void shmem_delete_from_page_cache(struct page page, void radswap)
				654	{
				655	struct address_space *mapping = page->mapping;
				656	int error;
				657
				658	VM_BUG_ON_PAGE(PageCompound(page), page);
				659
				660	xa_lock_irq(&mapping->i_pages);
				661	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
				662	page->mapping = NULL;
				663	mapping->nrpages--;
				664	__dec_node_page_state(page, NR_FILE_PAGES);
				665	__dec_node_page_state(page, NR_SHMEM);
				666	xa_unlock_irq(&mapping->i_pages);
				667	put_page(page);
				668	BUG_ON(error);
				669	}
				670
				671	/*
				672	* Remove swap entry from radix tree, free the swap and its page cache.
				673	*/
				674	static int shmem_free_swap(struct address_space *mapping,
				675	pgoff_t index, void *radswap)
				676	{
				677	void *old;
				678
				679	xa_lock_irq(&mapping->i_pages);
				680	old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
				681	xa_unlock_irq(&mapping->i_pages);
				682	if (old != radswap)
				683	return -ENOENT;
				684	free_swap_and_cache(radix_to_swp_entry(radswap));
				685	return 0;
				686	}
				687
				688	/*
				689	* Determine (in bytes) how many of the shmem object's pages mapped by the
				690	* given offsets are swapped out.
				691	*
				692	* This is safe to call without i_mutex or the i_pages lock thanks to RCU,
				693	* as long as the inode doesn't go away and racy results are not a problem.
				694	*/
				695	unsigned long shmem_partial_swap_usage(struct address_space *mapping,
				696	pgoff_t start, pgoff_t end)
				697	{
				698	struct radix_tree_iter iter;
				699	void __rcu **slot;
				700	struct page *page;
				701	unsigned long swapped = 0;
				702
				703	rcu_read_lock();
				704
				705	radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
				706	if (iter.index >= end)
				707	break;
				708
				709	page = radix_tree_deref_slot(slot);
				710
				711	if (radix_tree_deref_retry(page)) {
				712	slot = radix_tree_iter_retry(&iter);
				713	continue;
				714	}
				715
				716	if (radix_tree_exceptional_entry(page))
				717	swapped++;
				718
				719	if (need_resched()) {
				720	slot = radix_tree_iter_resume(slot, &iter);
				721	cond_resched_rcu();
				722	}
				723	}
				724
				725	rcu_read_unlock();
				726
				727	return swapped << PAGE_SHIFT;
				728	}
				729
				730	/*
				731	* Determine (in bytes) how many of the shmem object's pages mapped by the
				732	* given vma is swapped out.
				733	*
				734	* This is safe to call without i_mutex or the i_pages lock thanks to RCU,
				735	* as long as the inode doesn't go away and racy results are not a problem.
				736	*/
				737	unsigned long shmem_swap_usage(struct vm_area_struct *vma)
				738	{
				739	struct inode *inode = file_inode(vma->vm_file);
				740	struct shmem_inode_info *info = SHMEM_I(inode);
				741	struct address_space *mapping = inode->i_mapping;
				742	unsigned long swapped;
				743
				744	/* Be careful as we don't hold info->lock */
				745	swapped = READ_ONCE(info->swapped);
				746
				747	/*
				748	* The easier cases are when the shmem object has nothing in swap, or
				749	* the vma maps it whole. Then we can simply use the stats that we
				750	* already track.
				751	*/
				752	if (!swapped)
				753	return 0;
				754
				755	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
				756	return swapped << PAGE_SHIFT;
				757
				758	/* Here comes the more involved part */
				759	return shmem_partial_swap_usage(mapping,
				760	linear_page_index(vma, vma->vm_start),
				761	linear_page_index(vma, vma->vm_end));
				762	}
				763
				764	/*
				765	* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
				766	*/
				767	void shmem_unlock_mapping(struct address_space *mapping)
				768	{
				769	struct pagevec pvec;
				770	pgoff_t indices[PAGEVEC_SIZE];
				771	pgoff_t index = 0;
				772
				773	pagevec_init(&pvec);
				774	/*
				775	* Minor point, but we might as well stop if someone else SHM_LOCKs it.
				776	*/
				777	while (!mapping_unevictable(mapping)) {
				778	/*
				779	* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
				780	* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
				781	*/
				782	pvec.nr = find_get_entries(mapping, index,
				783	PAGEVEC_SIZE, pvec.pages, indices);
				784	if (!pvec.nr)
				785	break;
				786	index = indices[pvec.nr - 1] + 1;
				787	pagevec_remove_exceptionals(&pvec);
				788	check_move_unevictable_pages(pvec.pages, pvec.nr);
				789	pagevec_release(&pvec);
				790	cond_resched();
				791	}
				792	}
				793
				794	/*
				795	* Remove range of pages and swap entries from radix tree, and free them.
				796	* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
				797	*/
				798	static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
				799	bool unfalloc)
				800	{
				801	struct address_space *mapping = inode->i_mapping;
				802	struct shmem_inode_info *info = SHMEM_I(inode);
				803	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
				804	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
				805	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
				806	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
				807	struct pagevec pvec;
				808	pgoff_t indices[PAGEVEC_SIZE];
				809	long nr_swaps_freed = 0;
				810	pgoff_t index;
				811	int i;
				812
				813	if (lend == -1)
				814	end = -1; /* unsigned, so actually very big */
				815
				816	pagevec_init(&pvec);
				817	index = start;
				818	while (index < end) {
				819	pvec.nr = find_get_entries(mapping, index,
				820	min(end - index, (pgoff_t)PAGEVEC_SIZE),
				821	pvec.pages, indices);
				822	if (!pvec.nr)
				823	break;
				824	for (i = 0; i < pagevec_count(&pvec); i++) {
				825	struct page *page = pvec.pages[i];
				826
				827	index = indices[i];
				828	if (index >= end)
				829	break;
				830
				831	if (radix_tree_exceptional_entry(page)) {
				832	if (unfalloc)
				833	continue;
				834	nr_swaps_freed += !shmem_free_swap(mapping,
				835	index, page);
				836	continue;
				837	}
				838
				839	VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
				840
				841	if (!trylock_page(page))
				842	continue;
				843
				844	if (PageTransTail(page)) {
				845	/* Middle of THP: zero out the page */
				846	clear_highpage(page);
				847	unlock_page(page);
				848	continue;
				849	} else if (PageTransHuge(page)) {
				850	if (index == round_down(end, HPAGE_PMD_NR)) {
				851	/*
				852	* Range ends in the middle of THP:
				853	* zero out the page
				854	*/
				855	clear_highpage(page);
				856	unlock_page(page);
				857	continue;
				858	}
				859	index += HPAGE_PMD_NR - 1;
				860	i += HPAGE_PMD_NR - 1;
				861	}
				862
				863	if (!unfalloc \|\| !PageUptodate(page)) {
				864	VM_BUG_ON_PAGE(PageTail(page), page);
				865	if (page_mapping(page) == mapping) {
				866	VM_BUG_ON_PAGE(PageWriteback(page), page);
				867	truncate_inode_page(mapping, page);
				868	}
				869	}
				870	unlock_page(page);
				871	}
				872	pagevec_remove_exceptionals(&pvec);
				873	pagevec_release(&pvec);
				874	cond_resched();
				875	index++;
				876	}
				877
				878	if (partial_start) {
				879	struct page *page = NULL;
				880	shmem_getpage(inode, start - 1, &page, SGP_READ);
				881	if (page) {
				882	unsigned int top = PAGE_SIZE;
				883	if (start > end) {
				884	top = partial_end;
				885	partial_end = 0;
				886	}
				887	zero_user_segment(page, partial_start, top);
				888	set_page_dirty(page);
				889	unlock_page(page);
				890	put_page(page);
				891	}
				892	}
				893	if (partial_end) {
				894	struct page *page = NULL;
				895	shmem_getpage(inode, end, &page, SGP_READ);
				896	if (page) {
				897	zero_user_segment(page, 0, partial_end);
				898	set_page_dirty(page);
				899	unlock_page(page);
				900	put_page(page);
				901	}
				902	}
				903	if (start >= end)
				904	return;
				905
				906	index = start;
				907	while (index < end) {
				908	cond_resched();
				909
				910	pvec.nr = find_get_entries(mapping, index,
				911	min(end - index, (pgoff_t)PAGEVEC_SIZE),
				912	pvec.pages, indices);
				913	if (!pvec.nr) {
				914	/* If all gone or hole-punch or unfalloc, we're done */
				915	if (index == start \|\| end != -1)
				916	break;
				917	/* But if truncating, restart to make sure all gone */
				918	index = start;
				919	continue;
				920	}
				921	for (i = 0; i < pagevec_count(&pvec); i++) {
				922	struct page *page = pvec.pages[i];
				923
				924	index = indices[i];
				925	if (index >= end)
				926	break;
				927
				928	if (radix_tree_exceptional_entry(page)) {
				929	if (unfalloc)
				930	continue;
				931	if (shmem_free_swap(mapping, index, page)) {
				932	/* Swap was replaced by page: retry */
				933	index--;
				934	break;
				935	}
				936	nr_swaps_freed++;
				937	continue;
				938	}
				939
				940	lock_page(page);
				941
				942	if (PageTransTail(page)) {
				943	/* Middle of THP: zero out the page */
				944	clear_highpage(page);
				945	unlock_page(page);
				946	/*
				947	* Partial thp truncate due 'start' in middle
				948	* of THP: don't need to look on these pages
				949	* again on !pvec.nr restart.
				950	*/
				951	if (index != round_down(end, HPAGE_PMD_NR))
				952	start++;
				953	continue;
				954	} else if (PageTransHuge(page)) {
				955	if (index == round_down(end, HPAGE_PMD_NR)) {
				956	/*
				957	* Range ends in the middle of THP:
				958	* zero out the page
				959	*/
				960	clear_highpage(page);
				961	unlock_page(page);
				962	continue;
				963	}
				964	index += HPAGE_PMD_NR - 1;
				965	i += HPAGE_PMD_NR - 1;
				966	}
				967
				968	if (!unfalloc \|\| !PageUptodate(page)) {
				969	VM_BUG_ON_PAGE(PageTail(page), page);
				970	if (page_mapping(page) == mapping) {
				971	VM_BUG_ON_PAGE(PageWriteback(page), page);
				972	truncate_inode_page(mapping, page);
				973	} else {
				974	/* Page was replaced by swap: retry */
				975	unlock_page(page);
				976	index--;
				977	break;
				978	}
				979	}
				980	unlock_page(page);
				981	}
				982	pagevec_remove_exceptionals(&pvec);
				983	pagevec_release(&pvec);
				984	index++;
				985	}
				986
				987	spin_lock_irq(&info->lock);
				988	info->swapped -= nr_swaps_freed;
				989	shmem_recalc_inode(inode);
				990	spin_unlock_irq(&info->lock);
				991	}
				992
				993	void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
				994	{
				995	shmem_undo_range(inode, lstart, lend, false);
				996	inode->i_ctime = inode->i_mtime = current_time(inode);
				997	}
				998	EXPORT_SYMBOL_GPL(shmem_truncate_range);
				999
				1000	static int shmem_getattr(const struct path path, struct kstat stat,
				1001	u32 request_mask, unsigned int query_flags)
				1002	{
				1003	struct inode *inode = path->dentry->d_inode;
				1004	struct shmem_inode_info *info = SHMEM_I(inode);
				1005	struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
				1006
				1007	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
				1008	spin_lock_irq(&info->lock);
				1009	shmem_recalc_inode(inode);
				1010	spin_unlock_irq(&info->lock);
				1011	}
				1012	generic_fillattr(inode, stat);
				1013
				1014	if (is_huge_enabled(sb_info))
				1015	stat->blksize = HPAGE_PMD_SIZE;
				1016
				1017	return 0;
				1018	}
				1019
				1020	static int shmem_setattr(struct dentry dentry, struct iattr attr)
				1021	{
				1022	struct inode *inode = d_inode(dentry);
				1023	struct shmem_inode_info *info = SHMEM_I(inode);
				1024	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				1025	int error;
				1026
				1027	error = setattr_prepare(dentry, attr);
				1028	if (error)
				1029	return error;
				1030
				1031	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
				1032	loff_t oldsize = inode->i_size;
				1033	loff_t newsize = attr->ia_size;
				1034
				1035	/* protected by i_mutex */
				1036	if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) \|\|
				1037	(newsize > oldsize && (info->seals & F_SEAL_GROW)))
				1038	return -EPERM;
				1039
				1040	if (newsize != oldsize) {
				1041	error = shmem_reacct_size(SHMEM_I(inode)->flags,
				1042	oldsize, newsize);
				1043	if (error)
				1044	return error;
				1045	i_size_write(inode, newsize);
				1046	inode->i_ctime = inode->i_mtime = current_time(inode);
				1047	}
				1048	if (newsize <= oldsize) {
				1049	loff_t holebegin = round_up(newsize, PAGE_SIZE);
				1050	if (oldsize > holebegin)
				1051	unmap_mapping_range(inode->i_mapping,
				1052	holebegin, 0, 1);
				1053	if (info->alloced)
				1054	shmem_truncate_range(inode,
				1055	newsize, (loff_t)-1);
				1056	/* unmap again to remove racily COWed private pages */
				1057	if (oldsize > holebegin)
				1058	unmap_mapping_range(inode->i_mapping,
				1059	holebegin, 0, 1);
				1060
				1061	/*
				1062	* Part of the huge page can be beyond i_size: subject
				1063	* to shrink under memory pressure.
				1064	*/
				1065	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
				1066	spin_lock(&sbinfo->shrinklist_lock);
				1067	/*
				1068	* _careful to defend against unlocked access to
				1069	* ->shrink_list in shmem_unused_huge_shrink()
				1070	*/
				1071	if (list_empty_careful(&info->shrinklist)) {
				1072	list_add_tail(&info->shrinklist,
				1073	&sbinfo->shrinklist);
				1074	sbinfo->shrinklist_len++;
				1075	}
				1076	spin_unlock(&sbinfo->shrinklist_lock);
				1077	}
				1078	}
				1079	}
				1080
				1081	setattr_copy(inode, attr);
				1082	if (attr->ia_valid & ATTR_MODE)
				1083	error = posix_acl_chmod(inode, inode->i_mode);
				1084	return error;
				1085	}
				1086
				1087	static void shmem_evict_inode(struct inode *inode)
				1088	{
				1089	struct shmem_inode_info *info = SHMEM_I(inode);
				1090	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				1091
				1092	if (inode->i_mapping->a_ops == &shmem_aops) {
				1093	shmem_unacct_size(info->flags, inode->i_size);
				1094	inode->i_size = 0;
				1095	shmem_truncate_range(inode, 0, (loff_t)-1);
				1096	if (!list_empty(&info->shrinklist)) {
				1097	spin_lock(&sbinfo->shrinklist_lock);
				1098	if (!list_empty(&info->shrinklist)) {
				1099	list_del_init(&info->shrinklist);
				1100	sbinfo->shrinklist_len--;
				1101	}
				1102	spin_unlock(&sbinfo->shrinklist_lock);
				1103	}
				1104	if (!list_empty(&info->swaplist)) {
				1105	mutex_lock(&shmem_swaplist_mutex);
				1106	list_del_init(&info->swaplist);
				1107	mutex_unlock(&shmem_swaplist_mutex);
				1108	}
				1109	}
				1110
				1111	simple_xattrs_free(&info->xattrs);
				1112	WARN_ON(inode->i_blocks);
				1113	shmem_free_inode(inode->i_sb);
				1114	clear_inode(inode);
				1115	}
				1116
				1117	static unsigned long find_swap_entry(struct radix_tree_root root, void item)
				1118	{
				1119	struct radix_tree_iter iter;
				1120	void __rcu **slot;
				1121	unsigned long found = -1;
				1122	unsigned int checked = 0;
				1123
				1124	rcu_read_lock();
				1125	radix_tree_for_each_slot(slot, root, &iter, 0) {
				1126	void *entry = radix_tree_deref_slot(slot);
				1127
				1128	if (radix_tree_deref_retry(entry)) {
				1129	slot = radix_tree_iter_retry(&iter);
				1130	continue;
				1131	}
				1132	if (entry == item) {
				1133	found = iter.index;
				1134	break;
				1135	}
				1136	checked++;
				1137	if ((checked % 4096) != 0)
				1138	continue;
				1139	slot = radix_tree_iter_resume(slot, &iter);
				1140	cond_resched_rcu();
				1141	}
				1142
				1143	rcu_read_unlock();
				1144	return found;
				1145	}
				1146
				1147	/*
				1148	* If swap found in inode, free it and move page from swapcache to filecache.
				1149	*/
				1150	static int shmem_unuse_inode(struct shmem_inode_info *info,
				1151	swp_entry_t swap, struct page **pagep)
				1152	{
				1153	struct address_space *mapping = info->vfs_inode.i_mapping;
				1154	void *radswap;
				1155	pgoff_t index;
				1156	gfp_t gfp;
				1157	int error = 0;
				1158
				1159	radswap = swp_to_radix_entry(swap);
				1160	index = find_swap_entry(&mapping->i_pages, radswap);
				1161	if (index == -1)
				1162	return -EAGAIN; /* tell shmem_unuse we found nothing */
				1163
				1164	/*
				1165	* Move _head_ to start search for next from here.
				1166	* But be careful: shmem_evict_inode checks list_empty without taking
				1167	* mutex, and there's an instant in list_move_tail when info->swaplist
				1168	* would appear empty, if it were the only one on shmem_swaplist.
				1169	*/
				1170	if (shmem_swaplist.next != &info->swaplist)
				1171	list_move_tail(&shmem_swaplist, &info->swaplist);
				1172
				1173	gfp = mapping_gfp_mask(mapping);
				1174	if (shmem_should_replace_page(*pagep, gfp)) {
				1175	mutex_unlock(&shmem_swaplist_mutex);
				1176	error = shmem_replace_page(pagep, gfp, info, index);
				1177	mutex_lock(&shmem_swaplist_mutex);
				1178	/*
				1179	* We needed to drop mutex to make that restrictive page
				1180	* allocation, but the inode might have been freed while we
				1181	* dropped it: although a racing shmem_evict_inode() cannot
				1182	* complete without emptying the radix_tree, our page lock
				1183	* on this swapcache page is not enough to prevent that -
				1184	* free_swap_and_cache() of our swap entry will only
				1185	* trylock_page(), removing swap from radix_tree whatever.
				1186	*
				1187	* We must not proceed to shmem_add_to_page_cache() if the
				1188	* inode has been freed, but of course we cannot rely on
				1189	* inode or mapping or info to check that. However, we can
				1190	* safely check if our swap entry is still in use (and here
				1191	* it can't have got reused for another page): if it's still
				1192	* in use, then the inode cannot have been freed yet, and we
				1193	* can safely proceed (if it's no longer in use, that tells
				1194	* nothing about the inode, but we don't need to unuse swap).
				1195	*/
				1196	if (!page_swapcount(*pagep))
				1197	error = -ENOENT;
				1198	}
				1199
				1200	/*
				1201	* We rely on shmem_swaplist_mutex, not only to protect the swaplist,
				1202	* but also to hold up shmem_evict_inode(): so inode cannot be freed
				1203	* beneath us (pagelock doesn't help until the page is in pagecache).
				1204	*/
				1205	if (!error)
				1206	error = shmem_add_to_page_cache(*pagep, mapping, index,
				1207	radswap);
				1208	if (error != -ENOMEM) {
				1209	/*
				1210	* Truncation and eviction use free_swap_and_cache(), which
				1211	* only does trylock page: if we raced, best clean up here.
				1212	*/
				1213	delete_from_swap_cache(*pagep);
				1214	set_page_dirty(*pagep);
				1215	if (!error) {
				1216	spin_lock_irq(&info->lock);
				1217	info->swapped--;
				1218	spin_unlock_irq(&info->lock);
				1219	swap_free(swap);
				1220	}
				1221	}
				1222	return error;
				1223	}
				1224
				1225	/*
				1226	* Search through swapped inodes to find and replace swap by page.
				1227	*/
				1228	int shmem_unuse(swp_entry_t swap, struct page *page)
				1229	{
				1230	struct list_head this, next;
				1231	struct shmem_inode_info *info;
				1232	struct mem_cgroup *memcg;
				1233	int error = 0;
				1234
				1235	/*
				1236	* There's a faint possibility that swap page was replaced before
				1237	* caller locked it: caller will come back later with the right page.
				1238	*/
				1239	if (unlikely(!PageSwapCache(page) \|\| page_private(page) != swap.val))
				1240	goto out;
				1241
				1242	/*
				1243	* Charge page using GFP_KERNEL while we can wait, before taking
				1244	* the shmem_swaplist_mutex which might hold up shmem_writepage().
				1245	* Charged back to the user (not to caller) when swap account is used.
				1246	*/
				1247	error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
				1248	&memcg, false);
				1249	if (error)
				1250	goto out;
				1251	/* No radix_tree_preload: swap entry keeps a place for page in tree */
				1252	error = -EAGAIN;
				1253
				1254	mutex_lock(&shmem_swaplist_mutex);
				1255	list_for_each_safe(this, next, &shmem_swaplist) {
				1256	info = list_entry(this, struct shmem_inode_info, swaplist);
				1257	if (info->swapped)
				1258	error = shmem_unuse_inode(info, swap, &page);
				1259	else
				1260	list_del_init(&info->swaplist);
				1261	cond_resched();
				1262	if (error != -EAGAIN)
				1263	break;
				1264	/* found nothing in this: move on to search the next */
				1265	}
				1266	mutex_unlock(&shmem_swaplist_mutex);
				1267
				1268	if (error) {
				1269	if (error != -ENOMEM)
				1270	error = 0;
				1271	mem_cgroup_cancel_charge(page, memcg, false);
				1272	} else
				1273	mem_cgroup_commit_charge(page, memcg, true, false);
				1274	out:
				1275	unlock_page(page);
				1276	put_page(page);
				1277	return error;
				1278	}
				1279
				1280	/*
				1281	* Move the page from the page cache to the swap cache.
				1282	*/
				1283	static int shmem_writepage(struct page page, struct writeback_control wbc)
				1284	{
				1285	struct shmem_inode_info *info;
				1286	struct address_space *mapping;
				1287	struct inode *inode;
				1288	swp_entry_t swap;
				1289	pgoff_t index;
				1290
				1291	VM_BUG_ON_PAGE(PageCompound(page), page);
				1292	BUG_ON(!PageLocked(page));
				1293	mapping = page->mapping;
				1294	index = page->index;
				1295	inode = mapping->host;
				1296	info = SHMEM_I(inode);
				1297	if (info->flags & VM_LOCKED)
				1298	goto redirty;
				1299	if (!total_swap_pages)
				1300	goto redirty;
				1301
				1302	/*
				1303	* Our capabilities prevent regular writeback or sync from ever calling
				1304	* shmem_writepage; but a stacking filesystem might use ->writepage of
				1305	* its underlying filesystem, in which case tmpfs should write out to
				1306	* swap only in response to memory pressure, and not for the writeback
				1307	* threads or sync.
				1308	*/
				1309	if (!wbc->for_reclaim) {
				1310	WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
				1311	goto redirty;
				1312	}
				1313
				1314	/*
				1315	* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
				1316	* value into swapfile.c, the only way we can correctly account for a
				1317	* fallocated page arriving here is now to initialize it and write it.
				1318	*
				1319	* That's okay for a page already fallocated earlier, but if we have
				1320	* not yet completed the fallocation, then (a) we want to keep track
				1321	* of this page in case we have to undo it, and (b) it may not be a
				1322	* good idea to continue anyway, once we're pushing into swap. So
				1323	* reactivate the page, and let shmem_fallocate() quit when too many.
				1324	*/
				1325	if (!PageUptodate(page)) {
				1326	if (inode->i_private) {
				1327	struct shmem_falloc *shmem_falloc;
				1328	spin_lock(&inode->i_lock);
				1329	shmem_falloc = inode->i_private;
				1330	if (shmem_falloc &&
				1331	!shmem_falloc->waitq &&
				1332	index >= shmem_falloc->start &&
				1333	index < shmem_falloc->next)
				1334	shmem_falloc->nr_unswapped++;
				1335	else
				1336	shmem_falloc = NULL;
				1337	spin_unlock(&inode->i_lock);
				1338	if (shmem_falloc)
				1339	goto redirty;
				1340	}
				1341	clear_highpage(page);
				1342	flush_dcache_page(page);
				1343	SetPageUptodate(page);
				1344	}
				1345
				1346	swap = get_swap_page(page);
				1347	if (!swap.val)
				1348	goto redirty;
				1349
				1350	/*
				1351	* Add inode to shmem_unuse()'s list of swapped-out inodes,
				1352	* if it's not already there. Do it now before the page is
				1353	* moved to swap cache, when its pagelock no longer protects
				1354	* the inode from eviction. But don't unlock the mutex until
				1355	* we've incremented swapped, because shmem_unuse_inode() will
				1356	* prune a !swapped inode from the swaplist under this mutex.
				1357	*/
				1358	mutex_lock(&shmem_swaplist_mutex);
				1359	if (list_empty(&info->swaplist))
				1360	list_add_tail(&info->swaplist, &shmem_swaplist);
				1361
				1362	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
				1363	spin_lock_irq(&info->lock);
				1364	shmem_recalc_inode(inode);
				1365	info->swapped++;
				1366	spin_unlock_irq(&info->lock);
				1367
				1368	swap_shmem_alloc(swap);
				1369	shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
				1370
				1371	mutex_unlock(&shmem_swaplist_mutex);
				1372	BUG_ON(page_mapped(page));
				1373	swap_writepage(page, wbc);
				1374	return 0;
				1375	}
				1376
				1377	mutex_unlock(&shmem_swaplist_mutex);
				1378	put_swap_page(page, swap);
				1379	redirty:
				1380	set_page_dirty(page);
				1381	if (wbc->for_reclaim)
				1382	return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
				1383	unlock_page(page);
				1384	return 0;
				1385	}
				1386
				1387	#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
				1388	static void shmem_show_mpol(struct seq_file seq, struct mempolicy mpol)
				1389	{
				1390	char buffer[64];
				1391
				1392	if (!mpol \|\| mpol->mode == MPOL_DEFAULT)
				1393	return; /* show nothing */
				1394
				1395	mpol_to_str(buffer, sizeof(buffer), mpol);
				1396
				1397	seq_printf(seq, ",mpol=%s", buffer);
				1398	}
				1399
				1400	static struct mempolicy shmem_get_sbmpol(struct shmem_sb_info sbinfo)
				1401	{
				1402	struct mempolicy *mpol = NULL;
				1403	if (sbinfo->mpol) {
				1404	spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
				1405	mpol = sbinfo->mpol;
				1406	mpol_get(mpol);
				1407	spin_unlock(&sbinfo->stat_lock);
				1408	}
				1409	return mpol;
				1410	}
				1411	#else /* !CONFIG_NUMA \|\| !CONFIG_TMPFS */
				1412	static inline void shmem_show_mpol(struct seq_file seq, struct mempolicy mpol)
				1413	{
				1414	}
				1415	static inline struct mempolicy shmem_get_sbmpol(struct shmem_sb_info sbinfo)
				1416	{
				1417	return NULL;
				1418	}
				1419	#endif /* CONFIG_NUMA && CONFIG_TMPFS */
				1420	#ifndef CONFIG_NUMA
				1421	#define vm_policy vm_private_data
				1422	#endif
				1423
				1424	static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
				1425	struct shmem_inode_info *info, pgoff_t index)
				1426	{
				1427	/* Create a pseudo vma that just contains the policy */
				1428	vma_init(vma, NULL);
				1429	/* Bias interleave by inode number to distribute better across nodes */
				1430	vma->vm_pgoff = index + info->vfs_inode.i_ino;
				1431	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
				1432	}
				1433
				1434	static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
				1435	{
				1436	/* Drop reference taken by mpol_shared_policy_lookup() */
				1437	mpol_cond_put(vma->vm_policy);
				1438	}
				1439
				1440	static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
				1441	struct shmem_inode_info *info, pgoff_t index)
				1442	{
				1443	struct vm_area_struct pvma;
				1444	struct page *page;
				1445	struct vm_fault vmf;
				1446
				1447	shmem_pseudo_vma_init(&pvma, info, index);
				1448	vmf.vma = &pvma;
				1449	vmf.address = 0;
				1450	page = swap_cluster_readahead(swap, gfp, &vmf);
				1451	shmem_pseudo_vma_destroy(&pvma);
				1452
				1453	return page;
				1454	}
				1455
				1456	static struct page *shmem_alloc_hugepage(gfp_t gfp,
				1457	struct shmem_inode_info *info, pgoff_t index)
				1458	{
				1459	struct vm_area_struct pvma;
				1460	struct inode *inode = &info->vfs_inode;
				1461	struct address_space *mapping = inode->i_mapping;
				1462	pgoff_t idx, hindex;
				1463	void __rcu **results;
				1464	struct page *page;
				1465
				1466	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				1467	return NULL;
				1468
				1469	hindex = round_down(index, HPAGE_PMD_NR);
				1470	rcu_read_lock();
				1471	if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
				1472	hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
				1473	rcu_read_unlock();
				1474	return NULL;
				1475	}
				1476	rcu_read_unlock();
				1477
				1478	shmem_pseudo_vma_init(&pvma, info, hindex);
				1479	page = alloc_pages_vma(gfp \| __GFP_COMP \| __GFP_NORETRY \| __GFP_NOWARN,
				1480	HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
				1481	shmem_pseudo_vma_destroy(&pvma);
				1482	if (page)
				1483	prep_transhuge_page(page);
				1484	return page;
				1485	}
				1486
				1487	static struct page *shmem_alloc_page(gfp_t gfp,
				1488	struct shmem_inode_info *info, pgoff_t index)
				1489	{
				1490	struct vm_area_struct pvma;
				1491	struct page *page;
				1492
				1493	shmem_pseudo_vma_init(&pvma, info, index);
				1494	page = alloc_page_vma(gfp, &pvma, 0);
				1495	shmem_pseudo_vma_destroy(&pvma);
				1496
				1497	return page;
				1498	}
				1499
				1500	static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
				1501	struct inode *inode,
				1502	pgoff_t index, bool huge)
				1503	{
				1504	struct shmem_inode_info *info = SHMEM_I(inode);
				1505	struct page *page;
				1506	int nr;
				1507	int err = -ENOSPC;
				1508
				1509	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				1510	huge = false;
				1511	nr = huge ? HPAGE_PMD_NR : 1;
				1512
				1513	if (!shmem_inode_acct_block(inode, nr))
				1514	goto failed;
				1515
				1516	if (huge)
				1517	page = shmem_alloc_hugepage(gfp, info, index);
				1518	else
				1519	page = shmem_alloc_page(gfp, info, index);
				1520	if (page) {
				1521	__SetPageLocked(page);
				1522	__SetPageSwapBacked(page);
				1523	return page;
				1524	}
				1525
				1526	err = -ENOMEM;
				1527	shmem_inode_unacct_blocks(inode, nr);
				1528	failed:
				1529	return ERR_PTR(err);
				1530	}
				1531
				1532	/*
				1533	* When a page is moved from swapcache to shmem filecache (either by the
				1534	* usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
				1535	* shmem_unuse_inode()), it may have been read in earlier from swap, in
				1536	* ignorance of the mapping it belongs to. If that mapping has special
				1537	* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
				1538	* we may need to copy to a suitable page before moving to filecache.
				1539	*
				1540	* In a future release, this may well be extended to respect cpuset and
				1541	* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
				1542	* but for now it is a simple matter of zone.
				1543	*/
				1544	static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
				1545	{
				1546	return page_zonenum(page) > gfp_zone(gfp);
				1547	}
				1548
				1549	static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				1550	struct shmem_inode_info *info, pgoff_t index)
				1551	{
				1552	struct page oldpage, newpage;
				1553	struct address_space *swap_mapping;
				1554	swp_entry_t entry;
				1555	pgoff_t swap_index;
				1556	int error;
				1557
				1558	oldpage = *pagep;
				1559	entry.val = page_private(oldpage);
				1560	swap_index = swp_offset(entry);
				1561	swap_mapping = page_mapping(oldpage);
				1562
				1563	/*
				1564	* We have arrived here because our zones are constrained, so don't
				1565	* limit chance of success by further cpuset and node constraints.
				1566	*/
				1567	gfp &= ~GFP_CONSTRAINT_MASK;
				1568	newpage = shmem_alloc_page(gfp, info, index);
				1569	if (!newpage)
				1570	return -ENOMEM;
				1571
				1572	get_page(newpage);
				1573	copy_highpage(newpage, oldpage);
				1574	flush_dcache_page(newpage);
				1575
				1576	__SetPageLocked(newpage);
				1577	__SetPageSwapBacked(newpage);
				1578	SetPageUptodate(newpage);
				1579	set_page_private(newpage, entry.val);
				1580	SetPageSwapCache(newpage);
				1581
				1582	/*
				1583	* Our caller will very soon move newpage out of swapcache, but it's
				1584	* a nice clean interface for us to replace oldpage by newpage there.
				1585	*/
				1586	xa_lock_irq(&swap_mapping->i_pages);
				1587	error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
				1588	newpage);
				1589	if (!error) {
				1590	__inc_node_page_state(newpage, NR_FILE_PAGES);
				1591	__dec_node_page_state(oldpage, NR_FILE_PAGES);
				1592	}
				1593	xa_unlock_irq(&swap_mapping->i_pages);
				1594
				1595	if (unlikely(error)) {
				1596	/*
				1597	* Is this possible? I think not, now that our callers check
				1598	* both PageSwapCache and page_private after getting page lock;
				1599	* but be defensive. Reverse old to newpage for clear and free.
				1600	*/
				1601	oldpage = newpage;
				1602	} else {
				1603	mem_cgroup_migrate(oldpage, newpage);
				1604	lru_cache_add_anon(newpage);
				1605	*pagep = newpage;
				1606	}
				1607
				1608	ClearPageSwapCache(oldpage);
				1609	set_page_private(oldpage, 0);
				1610
				1611	unlock_page(oldpage);
				1612	put_page(oldpage);
				1613	put_page(oldpage);
				1614	return error;
				1615	}
				1616
				1617	/*
				1618	* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
				1619	*
				1620	* If we allocate a new one we do not mark it dirty. That's up to the
				1621	* vm. If we swap it in we mark it dirty since we also free the swap
				1622	* entry since a page cannot live in both the swap and page cache.
				1623	*
				1624	* fault_mm and fault_type are only supplied by shmem_fault:
				1625	* otherwise they are NULL.
				1626	*/
				1627	static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
				1628	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
				1629	struct vm_area_struct vma, struct vm_fault vmf,
				1630	vm_fault_t *fault_type)
				1631	{
				1632	struct address_space *mapping = inode->i_mapping;
				1633	struct shmem_inode_info *info = SHMEM_I(inode);
				1634	struct shmem_sb_info *sbinfo;
				1635	struct mm_struct *charge_mm;
				1636	struct mem_cgroup *memcg;
				1637	struct page *page;
				1638	swp_entry_t swap;
				1639	enum sgp_type sgp_huge = sgp;
				1640	pgoff_t hindex = index;
				1641	int error;
				1642	int once = 0;
				1643	int alloced = 0;
				1644
				1645	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
				1646	return -EFBIG;
				1647	if (sgp == SGP_NOHUGE \|\| sgp == SGP_HUGE)
				1648	sgp = SGP_CACHE;
				1649	repeat:
				1650	swap.val = 0;
				1651	page = find_lock_entry(mapping, index);
				1652	if (radix_tree_exceptional_entry(page)) {
				1653	swap = radix_to_swp_entry(page);
				1654	page = NULL;
				1655	}
				1656
				1657	if (sgp <= SGP_CACHE &&
				1658	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
				1659	error = -EINVAL;
				1660	goto unlock;
				1661	}
				1662
				1663	if (page && sgp == SGP_WRITE)
				1664	mark_page_accessed(page);
				1665
				1666	/* fallocated page? */
				1667	if (page && !PageUptodate(page)) {
				1668	if (sgp != SGP_READ)
				1669	goto clear;
				1670	unlock_page(page);
				1671	put_page(page);
				1672	page = NULL;
				1673	}
				1674	if (page \|\| (sgp == SGP_READ && !swap.val)) {
				1675	*pagep = page;
				1676	return 0;
				1677	}
				1678
				1679	/*
				1680	* Fast cache lookup did not find it:
				1681	* bring it back from swap or allocate.
				1682	*/
				1683	sbinfo = SHMEM_SB(inode->i_sb);
				1684	charge_mm = vma ? vma->vm_mm : current->mm;
				1685
				1686	if (swap.val) {
				1687	/* Look it up and read it in.. */
				1688	page = lookup_swap_cache(swap, NULL, 0);
				1689	if (!page) {
				1690	/* Or update major stats only when swapin succeeds?? */
				1691	if (fault_type) {
				1692	*fault_type \|= VM_FAULT_MAJOR;
				1693	count_vm_event(PGMAJFAULT);
				1694	count_memcg_event_mm(charge_mm, PGMAJFAULT);
				1695	}
				1696	/* Here we actually start the io */
				1697	page = shmem_swapin(swap, gfp, info, index);
				1698	if (!page) {
				1699	error = -ENOMEM;
				1700	goto failed;
				1701	}
				1702	}
				1703
				1704	/* We have to do this with page locked to prevent races */
				1705	lock_page(page);
				1706	if (!PageSwapCache(page) \|\| page_private(page) != swap.val \|\|
				1707	!shmem_confirm_swap(mapping, index, swap)) {
				1708	error = -EEXIST; /* try again */
				1709	goto unlock;
				1710	}
				1711	if (!PageUptodate(page)) {
				1712	error = -EIO;
				1713	goto failed;
				1714	}
				1715	wait_on_page_writeback(page);
				1716
				1717	if (shmem_should_replace_page(page, gfp)) {
				1718	error = shmem_replace_page(&page, gfp, info, index);
				1719	if (error)
				1720	goto failed;
				1721	}
				1722
				1723	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
				1724	false);
				1725	if (!error) {
				1726	error = shmem_add_to_page_cache(page, mapping, index,
				1727	swp_to_radix_entry(swap));
				1728	/*
				1729	* We already confirmed swap under page lock, and make
				1730	* no memory allocation here, so usually no possibility
				1731	* of error; but free_swap_and_cache() only trylocks a
				1732	* page, so it is just possible that the entry has been
				1733	* truncated or holepunched since swap was confirmed.
				1734	* shmem_undo_range() will have done some of the
				1735	* unaccounting, now delete_from_swap_cache() will do
				1736	* the rest.
				1737	* Reset swap.val? No, leave it so "failed" goes back to
				1738	* "repeat": reading a hole and writing should succeed.
				1739	*/
				1740	if (error) {
				1741	mem_cgroup_cancel_charge(page, memcg, false);
				1742	delete_from_swap_cache(page);
				1743	}
				1744	}
				1745	if (error)
				1746	goto failed;
				1747
				1748	mem_cgroup_commit_charge(page, memcg, true, false);
				1749
				1750	spin_lock_irq(&info->lock);
				1751	info->swapped--;
				1752	shmem_recalc_inode(inode);
				1753	spin_unlock_irq(&info->lock);
				1754
				1755	if (sgp == SGP_WRITE)
				1756	mark_page_accessed(page);
				1757
				1758	delete_from_swap_cache(page);
				1759	set_page_dirty(page);
				1760	swap_free(swap);
				1761
				1762	} else {
				1763	if (vma && userfaultfd_missing(vma)) {
				1764	*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
				1765	return 0;
				1766	}
				1767
				1768	/* shmem_symlink() */
				1769	if (mapping->a_ops != &shmem_aops)
				1770	goto alloc_nohuge;
				1771	if (shmem_huge == SHMEM_HUGE_DENY \|\| sgp_huge == SGP_NOHUGE)
				1772	goto alloc_nohuge;
				1773	if (shmem_huge == SHMEM_HUGE_FORCE)
				1774	goto alloc_huge;
				1775	switch (sbinfo->huge) {
				1776	loff_t i_size;
				1777	pgoff_t off;
				1778	case SHMEM_HUGE_NEVER:
				1779	goto alloc_nohuge;
				1780	case SHMEM_HUGE_WITHIN_SIZE:
				1781	off = round_up(index, HPAGE_PMD_NR);
				1782	i_size = round_up(i_size_read(inode), PAGE_SIZE);
				1783	if (i_size >= HPAGE_PMD_SIZE &&
				1784	i_size >> PAGE_SHIFT >= off)
				1785	goto alloc_huge;
				1786	/* fallthrough */
				1787	case SHMEM_HUGE_ADVISE:
				1788	if (sgp_huge == SGP_HUGE)
				1789	goto alloc_huge;
				1790	/* TODO: implement fadvise() hints */
				1791	goto alloc_nohuge;
				1792	}
				1793
				1794	alloc_huge:
				1795	page = shmem_alloc_and_acct_page(gfp, inode, index, true);
				1796	if (IS_ERR(page)) {
				1797	alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
				1798	index, false);
				1799	}
				1800	if (IS_ERR(page)) {
				1801	int retry = 5;
				1802	error = PTR_ERR(page);
				1803	page = NULL;
				1804	if (error != -ENOSPC)
				1805	goto failed;
				1806	/*
				1807	* Try to reclaim some spece by splitting a huge page
				1808	* beyond i_size on the filesystem.
				1809	*/
				1810	while (retry--) {
				1811	int ret;
				1812	ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
				1813	if (ret == SHRINK_STOP)
				1814	break;
				1815	if (ret)
				1816	goto alloc_nohuge;
				1817	}
				1818	goto failed;
				1819	}
				1820
				1821	if (PageTransHuge(page))
				1822	hindex = round_down(index, HPAGE_PMD_NR);
				1823	else
				1824	hindex = index;
				1825
				1826	if (sgp == SGP_WRITE)
				1827	__SetPageReferenced(page);
				1828
				1829	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
				1830	PageTransHuge(page));
				1831	if (error)
				1832	goto unacct;
				1833	error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
				1834	compound_order(page));
				1835	if (!error) {
				1836	error = shmem_add_to_page_cache(page, mapping, hindex,
				1837	NULL);
				1838	radix_tree_preload_end();
				1839	}
				1840	if (error) {
				1841	mem_cgroup_cancel_charge(page, memcg,
				1842	PageTransHuge(page));
				1843	goto unacct;
				1844	}
				1845	mem_cgroup_commit_charge(page, memcg, false,
				1846	PageTransHuge(page));
				1847	lru_cache_add_anon(page);
				1848
				1849	spin_lock_irq(&info->lock);
				1850	info->alloced += 1 << compound_order(page);
				1851	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
				1852	shmem_recalc_inode(inode);
				1853	spin_unlock_irq(&info->lock);
				1854	alloced = true;
				1855
				1856	if (PageTransHuge(page) &&
				1857	DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
				1858	hindex + HPAGE_PMD_NR - 1) {
				1859	/*
				1860	* Part of the huge page is beyond i_size: subject
				1861	* to shrink under memory pressure.
				1862	*/
				1863	spin_lock(&sbinfo->shrinklist_lock);
				1864	/*
				1865	* _careful to defend against unlocked access to
				1866	* ->shrink_list in shmem_unused_huge_shrink()
				1867	*/
				1868	if (list_empty_careful(&info->shrinklist)) {
				1869	list_add_tail(&info->shrinklist,
				1870	&sbinfo->shrinklist);
				1871	sbinfo->shrinklist_len++;
				1872	}
				1873	spin_unlock(&sbinfo->shrinklist_lock);
				1874	}
				1875
				1876	/*
				1877	* Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
				1878	*/
				1879	if (sgp == SGP_FALLOC)
				1880	sgp = SGP_WRITE;
				1881	clear:
				1882	/*
				1883	* Let SGP_WRITE caller clear ends if write does not fill page;
				1884	* but SGP_FALLOC on a page fallocated earlier must initialize
				1885	* it now, lest undo on failure cancel our earlier guarantee.
				1886	*/
				1887	if (sgp != SGP_WRITE && !PageUptodate(page)) {
				1888	struct page *head = compound_head(page);
				1889	int i;
				1890
				1891	for (i = 0; i < (1 << compound_order(head)); i++) {
				1892	clear_highpage(head + i);
				1893	flush_dcache_page(head + i);
				1894	}
				1895	SetPageUptodate(head);
				1896	}
				1897	}
				1898
				1899	/* Perhaps the file has been truncated since we checked */
				1900	if (sgp <= SGP_CACHE &&
				1901	((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
				1902	if (alloced) {
				1903	ClearPageDirty(page);
				1904	delete_from_page_cache(page);
				1905	spin_lock_irq(&info->lock);
				1906	shmem_recalc_inode(inode);
				1907	spin_unlock_irq(&info->lock);
				1908	}
				1909	error = -EINVAL;
				1910	goto unlock;
				1911	}
				1912	*pagep = page + index - hindex;
				1913	return 0;
				1914
				1915	/*
				1916	* Error recovery.
				1917	*/
				1918	unacct:
				1919	shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
				1920
				1921	if (PageTransHuge(page)) {
				1922	unlock_page(page);
				1923	put_page(page);
				1924	goto alloc_nohuge;
				1925	}
				1926	failed:
				1927	if (swap.val && !shmem_confirm_swap(mapping, index, swap))
				1928	error = -EEXIST;
				1929	unlock:
				1930	if (page) {
				1931	unlock_page(page);
				1932	put_page(page);
				1933	}
				1934	if (error == -ENOSPC && !once++) {
				1935	spin_lock_irq(&info->lock);
				1936	shmem_recalc_inode(inode);
				1937	spin_unlock_irq(&info->lock);
				1938	goto repeat;
				1939	}
				1940	if (error == -EEXIST) /* from above or from radix_tree_insert */
				1941	goto repeat;
				1942	return error;
				1943	}
				1944
				1945	/*
				1946	* This is like autoremove_wake_function, but it removes the wait queue
				1947	* entry unconditionally - even if something else had already woken the
				1948	* target.
				1949	*/
				1950	static int synchronous_wake_function(wait_queue_entry_t wait, unsigned mode, int sync, void key)
				1951	{
				1952	int ret = default_wake_function(wait, mode, sync, key);
				1953	list_del_init(&wait->entry);
				1954	return ret;
				1955	}
				1956
				1957	static vm_fault_t shmem_fault(struct vm_fault *vmf)
				1958	{
				1959	struct vm_area_struct *vma = vmf->vma;
				1960	struct inode *inode = file_inode(vma->vm_file);
				1961	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
				1962	enum sgp_type sgp;
				1963	int err;
				1964	vm_fault_t ret = VM_FAULT_LOCKED;
				1965
				1966	/*
				1967	* Trinity finds that probing a hole which tmpfs is punching can
				1968	* prevent the hole-punch from ever completing: which in turn
				1969	* locks writers out with its hold on i_mutex. So refrain from
				1970	* faulting pages into the hole while it's being punched. Although
				1971	* shmem_undo_range() does remove the additions, it may be unable to
				1972	* keep up, as each new page needs its own unmap_mapping_range() call,
				1973	* and the i_mmap tree grows ever slower to scan if new vmas are added.
				1974	*
				1975	* It does not matter if we sometimes reach this check just before the
				1976	* hole-punch begins, so that one fault then races with the punch:
				1977	* we just need to make racing faults a rare case.
				1978	*
				1979	* The implementation below would be much simpler if we just used a
				1980	* standard mutex or completion: but we cannot take i_mutex in fault,
				1981	* and bloating every shmem inode for this unlikely case would be sad.
				1982	*/
				1983	if (unlikely(inode->i_private)) {
				1984	struct shmem_falloc *shmem_falloc;
				1985
				1986	spin_lock(&inode->i_lock);
				1987	shmem_falloc = inode->i_private;
				1988	if (shmem_falloc &&
				1989	shmem_falloc->waitq &&
				1990	vmf->pgoff >= shmem_falloc->start &&
				1991	vmf->pgoff < shmem_falloc->next) {
				1992	wait_queue_head_t *shmem_falloc_waitq;
				1993	DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
				1994
				1995	ret = VM_FAULT_NOPAGE;
				1996	if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
				1997	!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
				1998	/* It's polite to up mmap_sem if we can */
				1999	up_read(&vma->vm_mm->mmap_sem);
				2000	ret = VM_FAULT_RETRY;
				2001	}
				2002
				2003	shmem_falloc_waitq = shmem_falloc->waitq;
				2004	prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
				2005	TASK_UNINTERRUPTIBLE);
				2006	spin_unlock(&inode->i_lock);
				2007	schedule();
				2008
				2009	/*
				2010	* shmem_falloc_waitq points into the shmem_fallocate()
				2011	* stack of the hole-punching task: shmem_falloc_waitq
				2012	* is usually invalid by the time we reach here, but
				2013	* finish_wait() does not dereference it in that case;
				2014	* though i_lock needed lest racing with wake_up_all().
				2015	*/
				2016	spin_lock(&inode->i_lock);
				2017	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
				2018	spin_unlock(&inode->i_lock);
				2019	return ret;
				2020	}
				2021	spin_unlock(&inode->i_lock);
				2022	}
				2023
				2024	sgp = SGP_CACHE;
				2025
				2026	if ((vma->vm_flags & VM_NOHUGEPAGE) \|\|
				2027	test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
				2028	sgp = SGP_NOHUGE;
				2029	else if (vma->vm_flags & VM_HUGEPAGE)
				2030	sgp = SGP_HUGE;
				2031
				2032	err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
				2033	gfp, vma, vmf, &ret);
				2034	if (err)
				2035	return vmf_error(err);
				2036	return ret;
				2037	}
				2038
				2039	unsigned long shmem_get_unmapped_area(struct file *file,
				2040	unsigned long uaddr, unsigned long len,
				2041	unsigned long pgoff, unsigned long flags)
				2042	{
				2043	unsigned long (get_area)(struct file ,
				2044	unsigned long, unsigned long, unsigned long, unsigned long);
				2045	unsigned long addr;
				2046	unsigned long offset;
				2047	unsigned long inflated_len;
				2048	unsigned long inflated_addr;
				2049	unsigned long inflated_offset;
				2050
				2051	if (len > TASK_SIZE)
				2052	return -ENOMEM;
				2053
				2054	get_area = current->mm->get_unmapped_area;
				2055	addr = get_area(file, uaddr, len, pgoff, flags);
				2056
				2057	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
				2058	return addr;
				2059	if (IS_ERR_VALUE(addr))
				2060	return addr;
				2061	if (addr & ~PAGE_MASK)
				2062	return addr;
				2063	if (addr > TASK_SIZE - len)
				2064	return addr;
				2065
				2066	if (shmem_huge == SHMEM_HUGE_DENY)
				2067	return addr;
				2068	if (len < HPAGE_PMD_SIZE)
				2069	return addr;
				2070	if (flags & MAP_FIXED)
				2071	return addr;
				2072	/*
				2073	* Our priority is to support MAP_SHARED mapped hugely;
				2074	* and support MAP_PRIVATE mapped hugely too, until it is COWed.
				2075	* But if caller specified an address hint and we allocated area there
				2076	* successfully, respect that as before.
				2077	*/
				2078	if (uaddr == addr)
				2079	return addr;
				2080
				2081	if (shmem_huge != SHMEM_HUGE_FORCE) {
				2082	struct super_block *sb;
				2083
				2084	if (file) {
				2085	VM_BUG_ON(file->f_op != &shmem_file_operations);
				2086	sb = file_inode(file)->i_sb;
				2087	} else {
				2088	/*
				2089	* Called directly from mm/mmap.c, or drivers/char/mem.c
				2090	* for "/dev/zero", to create a shared anonymous object.
				2091	*/
				2092	if (IS_ERR(shm_mnt))
				2093	return addr;
				2094	sb = shm_mnt->mnt_sb;
				2095	}
				2096	if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
				2097	return addr;
				2098	}
				2099
				2100	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
				2101	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
				2102	return addr;
				2103	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
				2104	return addr;
				2105
				2106	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
				2107	if (inflated_len > TASK_SIZE)
				2108	return addr;
				2109	if (inflated_len < len)
				2110	return addr;
				2111
				2112	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
				2113	if (IS_ERR_VALUE(inflated_addr))
				2114	return addr;
				2115	if (inflated_addr & ~PAGE_MASK)
				2116	return addr;
				2117
				2118	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
				2119	inflated_addr += offset - inflated_offset;
				2120	if (inflated_offset > offset)
				2121	inflated_addr += HPAGE_PMD_SIZE;
				2122
				2123	if (inflated_addr > TASK_SIZE - len)
				2124	return addr;
				2125	return inflated_addr;
				2126	}
				2127
				2128	#ifdef CONFIG_NUMA
				2129	static int shmem_set_policy(struct vm_area_struct vma, struct mempolicy mpol)
				2130	{
				2131	struct inode *inode = file_inode(vma->vm_file);
				2132	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
				2133	}
				2134
				2135	static struct mempolicy shmem_get_policy(struct vm_area_struct vma,
				2136	unsigned long addr)
				2137	{
				2138	struct inode *inode = file_inode(vma->vm_file);
				2139	pgoff_t index;
				2140
				2141	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
				2142	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
				2143	}
				2144	#endif
				2145
				2146	int shmem_lock(struct file file, int lock, struct user_struct user)
				2147	{
				2148	struct inode *inode = file_inode(file);
				2149	struct shmem_inode_info *info = SHMEM_I(inode);
				2150	int retval = -ENOMEM;
				2151
				2152	spin_lock_irq(&info->lock);
				2153	if (lock && !(info->flags & VM_LOCKED)) {
				2154	if (!user_shm_lock(inode->i_size, user))
				2155	goto out_nomem;
				2156	info->flags \|= VM_LOCKED;
				2157	mapping_set_unevictable(file->f_mapping);
				2158	}
				2159	if (!lock && (info->flags & VM_LOCKED) && user) {
				2160	user_shm_unlock(inode->i_size, user);
				2161	info->flags &= ~VM_LOCKED;
				2162	mapping_clear_unevictable(file->f_mapping);
				2163	}
				2164	retval = 0;
				2165
				2166	out_nomem:
				2167	spin_unlock_irq(&info->lock);
				2168	return retval;
				2169	}
				2170
				2171	static int shmem_mmap(struct file file, struct vm_area_struct vma)
				2172	{
				2173	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
				2174
				2175	if (info->seals & F_SEAL_FUTURE_WRITE) {
				2176	/*
				2177	* New PROT_WRITE and MAP_SHARED mmaps are not allowed when
				2178	* "future write" seal active.
				2179	*/
				2180	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
				2181	return -EPERM;
				2182
				2183	/*
				2184	* Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
				2185	* read-only mapping, take care to not allow mprotect to revert
				2186	* protections.
				2187	*/
				2188	vma->vm_flags &= ~(VM_MAYWRITE);
				2189	}
				2190
				2191	file_accessed(file);
				2192	vma->vm_ops = &shmem_vm_ops;
				2193	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				2194	((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
				2195	(vma->vm_end & HPAGE_PMD_MASK)) {
				2196	khugepaged_enter(vma, vma->vm_flags);
				2197	}
				2198	return 0;
				2199	}
				2200
				2201	static struct inode shmem_get_inode(struct super_block sb, const struct inode *dir,
				2202	umode_t mode, dev_t dev, unsigned long flags)
				2203	{
				2204	struct inode *inode;
				2205	struct shmem_inode_info *info;
				2206	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				2207
				2208	if (shmem_reserve_inode(sb))
				2209	return NULL;
				2210
				2211	inode = new_inode(sb);
				2212	if (inode) {
				2213	inode->i_ino = get_next_ino();
				2214	inode_init_owner(inode, dir, mode);
				2215	inode->i_blocks = 0;
				2216	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
				2217	inode->i_generation = prandom_u32();
				2218	info = SHMEM_I(inode);
				2219	memset(info, 0, (char )inode - (char )info);
				2220	spin_lock_init(&info->lock);
				2221	info->seals = F_SEAL_SEAL;
				2222	info->flags = flags & VM_NORESERVE;
				2223	INIT_LIST_HEAD(&info->shrinklist);
				2224	INIT_LIST_HEAD(&info->swaplist);
				2225	simple_xattrs_init(&info->xattrs);
				2226	cache_no_acl(inode);
				2227
				2228	switch (mode & S_IFMT) {
				2229	default:
				2230	inode->i_op = &shmem_special_inode_operations;
				2231	init_special_inode(inode, mode, dev);
				2232	break;
				2233	case S_IFREG:
				2234	inode->i_mapping->a_ops = &shmem_aops;
				2235	inode->i_op = &shmem_inode_operations;
				2236	inode->i_fop = &shmem_file_operations;
				2237	mpol_shared_policy_init(&info->policy,
				2238	shmem_get_sbmpol(sbinfo));
				2239	break;
				2240	case S_IFDIR:
				2241	inc_nlink(inode);
				2242	/* Some things misbehave if size == 0 on a directory */
				2243	inode->i_size = 2 * BOGO_DIRENT_SIZE;
				2244	inode->i_op = &shmem_dir_inode_operations;
				2245	inode->i_fop = &simple_dir_operations;
				2246	break;
				2247	case S_IFLNK:
				2248	/*
				2249	* Must not load anything in the rbtree,
				2250	* mpol_free_shared_policy will not be called.
				2251	*/
				2252	mpol_shared_policy_init(&info->policy, NULL);
				2253	break;
				2254	}
				2255
				2256	lockdep_annotate_inode_mutex_key(inode);
				2257	} else
				2258	shmem_free_inode(sb);
				2259	return inode;
				2260	}
				2261
				2262	bool shmem_mapping(struct address_space *mapping)
				2263	{
				2264	return mapping->a_ops == &shmem_aops;
				2265	}
				2266
				2267	static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
				2268	pmd_t *dst_pmd,
				2269	struct vm_area_struct *dst_vma,
				2270	unsigned long dst_addr,
				2271	unsigned long src_addr,
				2272	bool zeropage,
				2273	struct page **pagep)
				2274	{
				2275	struct inode *inode = file_inode(dst_vma->vm_file);
				2276	struct shmem_inode_info *info = SHMEM_I(inode);
				2277	struct address_space *mapping = inode->i_mapping;
				2278	gfp_t gfp = mapping_gfp_mask(mapping);
				2279	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
				2280	struct mem_cgroup *memcg;
				2281	spinlock_t *ptl;
				2282	void *page_kaddr;
				2283	struct page *page;
				2284	pte_t _dst_pte, *dst_pte;
				2285	int ret;
				2286	pgoff_t offset, max_off;
				2287
				2288	ret = -ENOMEM;
				2289	if (!shmem_inode_acct_block(inode, 1))
				2290	goto out;
				2291
				2292	if (!*pagep) {
				2293	page = shmem_alloc_page(gfp, info, pgoff);
				2294	if (!page)
				2295	goto out_unacct_blocks;
				2296
				2297	if (!zeropage) { /* mcopy_atomic */
				2298	page_kaddr = kmap_atomic(page);
				2299	ret = copy_from_user(page_kaddr,
				2300	(const void __user *)src_addr,
				2301	PAGE_SIZE);
				2302	kunmap_atomic(page_kaddr);
				2303
				2304	/* fallback to copy_from_user outside mmap_sem */
				2305	if (unlikely(ret)) {
				2306	*pagep = page;
				2307	shmem_inode_unacct_blocks(inode, 1);
				2308	/* don't free the page */
				2309	return -ENOENT;
				2310	}
				2311	} else { /* mfill_zeropage_atomic */
				2312	clear_highpage(page);
				2313	}
				2314	} else {
				2315	page = *pagep;
				2316	*pagep = NULL;
				2317	}
				2318
				2319	VM_BUG_ON(PageLocked(page) \|\| PageSwapBacked(page));
				2320	__SetPageLocked(page);
				2321	__SetPageSwapBacked(page);
				2322	__SetPageUptodate(page);
				2323
				2324	ret = -EFAULT;
				2325	offset = linear_page_index(dst_vma, dst_addr);
				2326	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
				2327	if (unlikely(offset >= max_off))
				2328	goto out_release;
				2329
				2330	ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
				2331	if (ret)
				2332	goto out_release;
				2333
				2334	ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
				2335	if (!ret) {
				2336	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
				2337	radix_tree_preload_end();
				2338	}
				2339	if (ret)
				2340	goto out_release_uncharge;
				2341
				2342	mem_cgroup_commit_charge(page, memcg, false, false);
				2343
				2344	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
				2345	if (dst_vma->vm_flags & VM_WRITE)
				2346	_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
				2347	else {
				2348	/*
				2349	* We don't set the pte dirty if the vma has no
				2350	* VM_WRITE permission, so mark the page dirty or it
				2351	* could be freed from under us. We could do it
				2352	* unconditionally before unlock_page(), but doing it
				2353	* only if VM_WRITE is not set is faster.
				2354	*/
				2355	set_page_dirty(page);
				2356	}
				2357
				2358	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
				2359
				2360	ret = -EFAULT;
				2361	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
				2362	if (unlikely(offset >= max_off))
				2363	goto out_release_uncharge_unlock;
				2364
				2365	ret = -EEXIST;
				2366	if (!pte_none(*dst_pte))
				2367	goto out_release_uncharge_unlock;
				2368
				2369	lru_cache_add_anon(page);
				2370
				2371	spin_lock(&info->lock);
				2372	info->alloced++;
				2373	inode->i_blocks += BLOCKS_PER_PAGE;
				2374	shmem_recalc_inode(inode);
				2375	spin_unlock(&info->lock);
				2376
				2377	inc_mm_counter(dst_mm, mm_counter_file(page));
				2378	page_add_file_rmap(page, false);
				2379	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
				2380
				2381	/* No need to invalidate - it was non-present before */
				2382	update_mmu_cache(dst_vma, dst_addr, dst_pte);
				2383	pte_unmap_unlock(dst_pte, ptl);
				2384	unlock_page(page);
				2385	ret = 0;
				2386	out:
				2387	return ret;
				2388	out_release_uncharge_unlock:
				2389	pte_unmap_unlock(dst_pte, ptl);
				2390	ClearPageDirty(page);
				2391	delete_from_page_cache(page);
				2392	out_release_uncharge:
				2393	mem_cgroup_cancel_charge(page, memcg, false);
				2394	out_release:
				2395	unlock_page(page);
				2396	put_page(page);
				2397	out_unacct_blocks:
				2398	shmem_inode_unacct_blocks(inode, 1);
				2399	goto out;
				2400	}
				2401
				2402	int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
				2403	pmd_t *dst_pmd,
				2404	struct vm_area_struct *dst_vma,
				2405	unsigned long dst_addr,
				2406	unsigned long src_addr,
				2407	struct page **pagep)
				2408	{
				2409	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
				2410	dst_addr, src_addr, false, pagep);
				2411	}
				2412
				2413	int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
				2414	pmd_t *dst_pmd,
				2415	struct vm_area_struct *dst_vma,
				2416	unsigned long dst_addr)
				2417	{
				2418	struct page *page = NULL;
				2419
				2420	return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
				2421	dst_addr, 0, true, &page);
				2422	}
				2423
				2424	#ifdef CONFIG_TMPFS
				2425	static const struct inode_operations shmem_symlink_inode_operations;
				2426	static const struct inode_operations shmem_short_symlink_operations;
				2427
				2428	#ifdef CONFIG_TMPFS_XATTR
				2429	static int shmem_initxattrs(struct inode , const struct xattr , void *);
				2430	#else
				2431	#define shmem_initxattrs NULL
				2432	#endif
				2433
				2434	static int
				2435	shmem_write_begin(struct file file, struct address_space mapping,
				2436	loff_t pos, unsigned len, unsigned flags,
				2437	struct page pagep, void fsdata)
				2438	{
				2439	struct inode *inode = mapping->host;
				2440	struct shmem_inode_info *info = SHMEM_I(inode);
				2441	pgoff_t index = pos >> PAGE_SHIFT;
				2442
				2443	/* i_mutex is held by caller */
				2444	if (unlikely(info->seals & (F_SEAL_GROW \|
				2445	F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))) {
				2446	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE))
				2447	return -EPERM;
				2448	if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
				2449	return -EPERM;
				2450	}
				2451
				2452	return shmem_getpage(inode, index, pagep, SGP_WRITE);
				2453	}
				2454
				2455	static int
				2456	shmem_write_end(struct file file, struct address_space mapping,
				2457	loff_t pos, unsigned len, unsigned copied,
				2458	struct page page, void fsdata)
				2459	{
				2460	struct inode *inode = mapping->host;
				2461
				2462	if (pos + copied > inode->i_size)
				2463	i_size_write(inode, pos + copied);
				2464
				2465	if (!PageUptodate(page)) {
				2466	struct page *head = compound_head(page);
				2467	if (PageTransCompound(page)) {
				2468	int i;
				2469
				2470	for (i = 0; i < HPAGE_PMD_NR; i++) {
				2471	if (head + i == page)
				2472	continue;
				2473	clear_highpage(head + i);
				2474	flush_dcache_page(head + i);
				2475	}
				2476	}
				2477	if (copied < PAGE_SIZE) {
				2478	unsigned from = pos & (PAGE_SIZE - 1);
				2479	zero_user_segments(page, 0, from,
				2480	from + copied, PAGE_SIZE);
				2481	}
				2482	SetPageUptodate(head);
				2483	}
				2484	set_page_dirty(page);
				2485	unlock_page(page);
				2486	put_page(page);
				2487
				2488	return copied;
				2489	}
				2490
				2491	static ssize_t shmem_file_read_iter(struct kiocb iocb, struct iov_iter to)
				2492	{
				2493	struct file *file = iocb->ki_filp;
				2494	struct inode *inode = file_inode(file);
				2495	struct address_space *mapping = inode->i_mapping;
				2496	pgoff_t index;
				2497	unsigned long offset;
				2498	enum sgp_type sgp = SGP_READ;
				2499	int error = 0;
				2500	ssize_t retval = 0;
				2501	loff_t *ppos = &iocb->ki_pos;
				2502
				2503	/*
				2504	* Might this read be for a stacking filesystem? Then when reading
				2505	* holes of a sparse file, we actually need to allocate those pages,
				2506	* and even mark them dirty, so it cannot exceed the max_blocks limit.
				2507	*/
				2508	if (!iter_is_iovec(to))
				2509	sgp = SGP_CACHE;
				2510
				2511	index = *ppos >> PAGE_SHIFT;
				2512	offset = *ppos & ~PAGE_MASK;
				2513
				2514	for (;;) {
				2515	struct page *page = NULL;
				2516	pgoff_t end_index;
				2517	unsigned long nr, ret;
				2518	loff_t i_size = i_size_read(inode);
				2519
				2520	end_index = i_size >> PAGE_SHIFT;
				2521	if (index > end_index)
				2522	break;
				2523	if (index == end_index) {
				2524	nr = i_size & ~PAGE_MASK;
				2525	if (nr <= offset)
				2526	break;
				2527	}
				2528
				2529	error = shmem_getpage(inode, index, &page, sgp);
				2530	if (error) {
				2531	if (error == -EINVAL)
				2532	error = 0;
				2533	break;
				2534	}
				2535	if (page) {
				2536	if (sgp == SGP_CACHE)
				2537	set_page_dirty(page);
				2538	unlock_page(page);
				2539	}
				2540
				2541	/*
				2542	* We must evaluate after, since reads (unlike writes)
				2543	* are called without i_mutex protection against truncate
				2544	*/
				2545	nr = PAGE_SIZE;
				2546	i_size = i_size_read(inode);
				2547	end_index = i_size >> PAGE_SHIFT;
				2548	if (index == end_index) {
				2549	nr = i_size & ~PAGE_MASK;
				2550	if (nr <= offset) {
				2551	if (page)
				2552	put_page(page);
				2553	break;
				2554	}
				2555	}
				2556	nr -= offset;
				2557
				2558	if (page) {
				2559	/*
				2560	* If users can be writing to this page using arbitrary
				2561	* virtual addresses, take care about potential aliasing
				2562	* before reading the page on the kernel side.
				2563	*/
				2564	if (mapping_writably_mapped(mapping))
				2565	flush_dcache_page(page);
				2566	/*
				2567	* Mark the page accessed if we read the beginning.
				2568	*/
				2569	if (!offset)
				2570	mark_page_accessed(page);
				2571	} else {
				2572	page = ZERO_PAGE(0);
				2573	get_page(page);
				2574	}
				2575
				2576	/*
				2577	* Ok, we have the page, and it's up-to-date, so
				2578	* now we can copy it to user space...
				2579	*/
				2580	ret = copy_page_to_iter(page, offset, nr, to);
				2581	retval += ret;
				2582	offset += ret;
				2583	index += offset >> PAGE_SHIFT;
				2584	offset &= ~PAGE_MASK;
				2585
				2586	put_page(page);
				2587	if (!iov_iter_count(to))
				2588	break;
				2589	if (ret < nr) {
				2590	error = -EFAULT;
				2591	break;
				2592	}
				2593	cond_resched();
				2594	}
				2595
				2596	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
				2597	file_accessed(file);
				2598	return retval ? retval : error;
				2599	}
				2600
				2601	/*
				2602	* llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
				2603	*/
				2604	static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
				2605	pgoff_t index, pgoff_t end, int whence)
				2606	{
				2607	struct page *page;
				2608	struct pagevec pvec;
				2609	pgoff_t indices[PAGEVEC_SIZE];
				2610	bool done = false;
				2611	int i;
				2612
				2613	pagevec_init(&pvec);
				2614	pvec.nr = 1; /* start small: we may be there already */
				2615	while (!done) {
				2616	pvec.nr = find_get_entries(mapping, index,
				2617	pvec.nr, pvec.pages, indices);
				2618	if (!pvec.nr) {
				2619	if (whence == SEEK_DATA)
				2620	index = end;
				2621	break;
				2622	}
				2623	for (i = 0; i < pvec.nr; i++, index++) {
				2624	if (index < indices[i]) {
				2625	if (whence == SEEK_HOLE) {
				2626	done = true;
				2627	break;
				2628	}
				2629	index = indices[i];
				2630	}
				2631	page = pvec.pages[i];
				2632	if (page && !radix_tree_exceptional_entry(page)) {
				2633	if (!PageUptodate(page))
				2634	page = NULL;
				2635	}
				2636	if (index >= end \|\|
				2637	(page && whence == SEEK_DATA) \|\|
				2638	(!page && whence == SEEK_HOLE)) {
				2639	done = true;
				2640	break;
				2641	}
				2642	}
				2643	pagevec_remove_exceptionals(&pvec);
				2644	pagevec_release(&pvec);
				2645	pvec.nr = PAGEVEC_SIZE;
				2646	cond_resched();
				2647	}
				2648	return index;
				2649	}
				2650
				2651	static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
				2652	{
				2653	struct address_space *mapping = file->f_mapping;
				2654	struct inode *inode = mapping->host;
				2655	pgoff_t start, end;
				2656	loff_t new_offset;
				2657
				2658	if (whence != SEEK_DATA && whence != SEEK_HOLE)
				2659	return generic_file_llseek_size(file, offset, whence,
				2660	MAX_LFS_FILESIZE, i_size_read(inode));
				2661	inode_lock(inode);
				2662	/* We're holding i_mutex so we can access i_size directly */
				2663
				2664	if (offset < 0 \|\| offset >= inode->i_size)
				2665	offset = -ENXIO;
				2666	else {
				2667	start = offset >> PAGE_SHIFT;
				2668	end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2669	new_offset = shmem_seek_hole_data(mapping, start, end, whence);
				2670	new_offset <<= PAGE_SHIFT;
				2671	if (new_offset > offset) {
				2672	if (new_offset < inode->i_size)
				2673	offset = new_offset;
				2674	else if (whence == SEEK_DATA)
				2675	offset = -ENXIO;
				2676	else
				2677	offset = inode->i_size;
				2678	}
				2679	}
				2680
				2681	if (offset >= 0)
				2682	offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
				2683	inode_unlock(inode);
				2684	return offset;
				2685	}
				2686
				2687	static long shmem_fallocate(struct file *file, int mode, loff_t offset,
				2688	loff_t len)
				2689	{
				2690	struct inode *inode = file_inode(file);
				2691	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				2692	struct shmem_inode_info *info = SHMEM_I(inode);
				2693	struct shmem_falloc shmem_falloc;
				2694	pgoff_t start, index, end;
				2695	int error;
				2696
				2697	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
				2698	return -EOPNOTSUPP;
				2699
				2700	inode_lock(inode);
				2701
				2702	if (mode & FALLOC_FL_PUNCH_HOLE) {
				2703	struct address_space *mapping = file->f_mapping;
				2704	loff_t unmap_start = round_up(offset, PAGE_SIZE);
				2705	loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
				2706	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
				2707
				2708	/* protected by i_mutex */
				2709	if (info->seals & (F_SEAL_WRITE \| F_SEAL_FUTURE_WRITE)) {
				2710	error = -EPERM;
				2711	goto out;
				2712	}
				2713
				2714	shmem_falloc.waitq = &shmem_falloc_waitq;
				2715	shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
				2716	shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
				2717	spin_lock(&inode->i_lock);
				2718	inode->i_private = &shmem_falloc;
				2719	spin_unlock(&inode->i_lock);
				2720
				2721	if ((u64)unmap_end > (u64)unmap_start)
				2722	unmap_mapping_range(mapping, unmap_start,
				2723	1 + unmap_end - unmap_start, 0);
				2724	shmem_truncate_range(inode, offset, offset + len - 1);
				2725	/* No need to unmap again: hole-punching leaves COWed pages */
				2726
				2727	spin_lock(&inode->i_lock);
				2728	inode->i_private = NULL;
				2729	wake_up_all(&shmem_falloc_waitq);
				2730	WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
				2731	spin_unlock(&inode->i_lock);
				2732	error = 0;
				2733	goto out;
				2734	}
				2735
				2736	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
				2737	error = inode_newsize_ok(inode, offset + len);
				2738	if (error)
				2739	goto out;
				2740
				2741	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
				2742	error = -EPERM;
				2743	goto out;
				2744	}
				2745
				2746	start = offset >> PAGE_SHIFT;
				2747	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
				2748	/* Try to avoid a swapstorm if len is impossible to satisfy */
				2749	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
				2750	error = -ENOSPC;
				2751	goto out;
				2752	}
				2753
				2754	shmem_falloc.waitq = NULL;
				2755	shmem_falloc.start = start;
				2756	shmem_falloc.next = start;
				2757	shmem_falloc.nr_falloced = 0;
				2758	shmem_falloc.nr_unswapped = 0;
				2759	spin_lock(&inode->i_lock);
				2760	inode->i_private = &shmem_falloc;
				2761	spin_unlock(&inode->i_lock);
				2762
				2763	for (index = start; index < end; index++) {
				2764	struct page *page;
				2765
				2766	/*
				2767	* Good, the fallocate(2) manpage permits EINTR: we may have
				2768	* been interrupted because we are using up too much memory.
				2769	*/
				2770	if (signal_pending(current))
				2771	error = -EINTR;
				2772	else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
				2773	error = -ENOMEM;
				2774	else
				2775	error = shmem_getpage(inode, index, &page, SGP_FALLOC);
				2776	if (error) {
				2777	/* Remove the !PageUptodate pages we added */
				2778	if (index > start) {
				2779	shmem_undo_range(inode,
				2780	(loff_t)start << PAGE_SHIFT,
				2781	((loff_t)index << PAGE_SHIFT) - 1, true);
				2782	}
				2783	goto undone;
				2784	}
				2785
				2786	/*
				2787	* Inform shmem_writepage() how far we have reached.
				2788	* No need for lock or barrier: we have the page lock.
				2789	*/
				2790	shmem_falloc.next++;
				2791	if (!PageUptodate(page))
				2792	shmem_falloc.nr_falloced++;
				2793
				2794	/*
				2795	* If !PageUptodate, leave it that way so that freeable pages
				2796	* can be recognized if we need to rollback on error later.
				2797	* But set_page_dirty so that memory pressure will swap rather
				2798	* than free the pages we are allocating (and SGP_CACHE pages
				2799	* might still be clean: we now need to mark those dirty too).
				2800	*/
				2801	set_page_dirty(page);
				2802	unlock_page(page);
				2803	put_page(page);
				2804	cond_resched();
				2805	}
				2806
				2807	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
				2808	i_size_write(inode, offset + len);
				2809	inode->i_ctime = current_time(inode);
				2810	undone:
				2811	spin_lock(&inode->i_lock);
				2812	inode->i_private = NULL;
				2813	spin_unlock(&inode->i_lock);
				2814	out:
				2815	inode_unlock(inode);
				2816	return error;
				2817	}
				2818
				2819	static int shmem_statfs(struct dentry dentry, struct kstatfs buf)
				2820	{
				2821	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
				2822
				2823	buf->f_type = TMPFS_MAGIC;
				2824	buf->f_bsize = PAGE_SIZE;
				2825	buf->f_namelen = NAME_MAX;
				2826	if (sbinfo->max_blocks) {
				2827	buf->f_blocks = sbinfo->max_blocks;
				2828	buf->f_bavail =
				2829	buf->f_bfree = sbinfo->max_blocks -
				2830	percpu_counter_sum(&sbinfo->used_blocks);
				2831	}
				2832	if (sbinfo->max_inodes) {
				2833	buf->f_files = sbinfo->max_inodes;
				2834	buf->f_ffree = sbinfo->free_inodes;
				2835	}
				2836	/* else leave those fields 0 like simple_statfs */
				2837	return 0;
				2838	}
				2839
				2840	/*
				2841	* File creation. Allocate an inode, and we're done..
				2842	*/
				2843	static int
				2844	shmem_mknod(struct inode dir, struct dentry dentry, umode_t mode, dev_t dev)
				2845	{
				2846	struct inode *inode;
				2847	int error = -ENOSPC;
				2848
				2849	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
				2850	if (inode) {
				2851	error = simple_acl_create(dir, inode);
				2852	if (error)
				2853	goto out_iput;
				2854	error = security_inode_init_security(inode, dir,
				2855	&dentry->d_name,
				2856	shmem_initxattrs, NULL);
				2857	if (error && error != -EOPNOTSUPP)
				2858	goto out_iput;
				2859
				2860	error = 0;
				2861	dir->i_size += BOGO_DIRENT_SIZE;
				2862	dir->i_ctime = dir->i_mtime = current_time(dir);
				2863	d_instantiate(dentry, inode);
				2864	dget(dentry); /* Extra count - pin the dentry in core */
				2865	}
				2866	return error;
				2867	out_iput:
				2868	iput(inode);
				2869	return error;
				2870	}
				2871
				2872	static int
				2873	shmem_tmpfile(struct inode dir, struct dentry dentry, umode_t mode)
				2874	{
				2875	struct inode *inode;
				2876	int error = -ENOSPC;
				2877
				2878	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
				2879	if (inode) {
				2880	error = security_inode_init_security(inode, dir,
				2881	NULL,
				2882	shmem_initxattrs, NULL);
				2883	if (error && error != -EOPNOTSUPP)
				2884	goto out_iput;
				2885	error = simple_acl_create(dir, inode);
				2886	if (error)
				2887	goto out_iput;
				2888	d_tmpfile(dentry, inode);
				2889	}
				2890	return error;
				2891	out_iput:
				2892	iput(inode);
				2893	return error;
				2894	}
				2895
				2896	static int shmem_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				2897	{
				2898	int error;
				2899
				2900	if ((error = shmem_mknod(dir, dentry, mode \| S_IFDIR, 0)))
				2901	return error;
				2902	inc_nlink(dir);
				2903	return 0;
				2904	}
				2905
				2906	static int shmem_create(struct inode dir, struct dentry dentry, umode_t mode,
				2907	bool excl)
				2908	{
				2909	return shmem_mknod(dir, dentry, mode \| S_IFREG, 0);
				2910	}
				2911
				2912	/*
				2913	* Link a file..
				2914	*/
				2915	static int shmem_link(struct dentry old_dentry, struct inode dir, struct dentry *dentry)
				2916	{
				2917	struct inode *inode = d_inode(old_dentry);
				2918	int ret = 0;
				2919
				2920	/*
				2921	* No ordinary (disk based) filesystem counts links as inodes;
				2922	* but each new link needs a new dentry, pinning lowmem, and
				2923	* tmpfs dentries cannot be pruned until they are unlinked.
				2924	* But if an O_TMPFILE file is linked into the tmpfs, the
				2925	* first link must skip that, to get the accounting right.
				2926	*/
				2927	if (inode->i_nlink) {
				2928	ret = shmem_reserve_inode(inode->i_sb);
				2929	if (ret)
				2930	goto out;
				2931	}
				2932
				2933	dir->i_size += BOGO_DIRENT_SIZE;
				2934	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
				2935	inc_nlink(inode);
				2936	ihold(inode); /* New dentry reference */
				2937	dget(dentry); /* Extra pinning count for the created dentry */
				2938	d_instantiate(dentry, inode);
				2939	out:
				2940	return ret;
				2941	}
				2942
				2943	static int shmem_unlink(struct inode dir, struct dentry dentry)
				2944	{
				2945	struct inode *inode = d_inode(dentry);
				2946
				2947	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
				2948	shmem_free_inode(inode->i_sb);
				2949
				2950	dir->i_size -= BOGO_DIRENT_SIZE;
				2951	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
				2952	drop_nlink(inode);
				2953	dput(dentry); /* Undo the count from "create" - this does all the work */
				2954	return 0;
				2955	}
				2956
				2957	static int shmem_rmdir(struct inode dir, struct dentry dentry)
				2958	{
				2959	if (!simple_empty(dentry))
				2960	return -ENOTEMPTY;
				2961
				2962	drop_nlink(d_inode(dentry));
				2963	drop_nlink(dir);
				2964	return shmem_unlink(dir, dentry);
				2965	}
				2966
				2967	static int shmem_exchange(struct inode old_dir, struct dentry old_dentry, struct inode new_dir, struct dentry new_dentry)
				2968	{
				2969	bool old_is_dir = d_is_dir(old_dentry);
				2970	bool new_is_dir = d_is_dir(new_dentry);
				2971
				2972	if (old_dir != new_dir && old_is_dir != new_is_dir) {
				2973	if (old_is_dir) {
				2974	drop_nlink(old_dir);
				2975	inc_nlink(new_dir);
				2976	} else {
				2977	drop_nlink(new_dir);
				2978	inc_nlink(old_dir);
				2979	}
				2980	}
				2981	old_dir->i_ctime = old_dir->i_mtime =
				2982	new_dir->i_ctime = new_dir->i_mtime =
				2983	d_inode(old_dentry)->i_ctime =
				2984	d_inode(new_dentry)->i_ctime = current_time(old_dir);
				2985
				2986	return 0;
				2987	}
				2988
				2989	static int shmem_whiteout(struct inode old_dir, struct dentry old_dentry)
				2990	{
				2991	struct dentry *whiteout;
				2992	int error;
				2993
				2994	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
				2995	if (!whiteout)
				2996	return -ENOMEM;
				2997
				2998	error = shmem_mknod(old_dir, whiteout,
				2999	S_IFCHR \| WHITEOUT_MODE, WHITEOUT_DEV);
				3000	dput(whiteout);
				3001	if (error)
				3002	return error;
				3003
				3004	/*
				3005	* Cheat and hash the whiteout while the old dentry is still in
				3006	* place, instead of playing games with FS_RENAME_DOES_D_MOVE.
				3007	*
				3008	* d_lookup() will consistently find one of them at this point,
				3009	* not sure which one, but that isn't even important.
				3010	*/
				3011	d_rehash(whiteout);
				3012	return 0;
				3013	}
				3014
				3015	/*
				3016	* The VFS layer already does all the dentry stuff for rename,
				3017	* we just have to decrement the usage count for the target if
				3018	* it exists so that the VFS layer correctly free's it when it
				3019	* gets overwritten.
				3020	*/
				3021	static int shmem_rename2(struct inode old_dir, struct dentry old_dentry, struct inode new_dir, struct dentry new_dentry, unsigned int flags)
				3022	{
				3023	struct inode *inode = d_inode(old_dentry);
				3024	int they_are_dirs = S_ISDIR(inode->i_mode);
				3025
				3026	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
				3027	return -EINVAL;
				3028
				3029	if (flags & RENAME_EXCHANGE)
				3030	return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
				3031
				3032	if (!simple_empty(new_dentry))
				3033	return -ENOTEMPTY;
				3034
				3035	if (flags & RENAME_WHITEOUT) {
				3036	int error;
				3037
				3038	error = shmem_whiteout(old_dir, old_dentry);
				3039	if (error)
				3040	return error;
				3041	}
				3042
				3043	if (d_really_is_positive(new_dentry)) {
				3044	(void) shmem_unlink(new_dir, new_dentry);
				3045	if (they_are_dirs) {
				3046	drop_nlink(d_inode(new_dentry));
				3047	drop_nlink(old_dir);
				3048	}
				3049	} else if (they_are_dirs) {
				3050	drop_nlink(old_dir);
				3051	inc_nlink(new_dir);
				3052	}
				3053
				3054	old_dir->i_size -= BOGO_DIRENT_SIZE;
				3055	new_dir->i_size += BOGO_DIRENT_SIZE;
				3056	old_dir->i_ctime = old_dir->i_mtime =
				3057	new_dir->i_ctime = new_dir->i_mtime =
				3058	inode->i_ctime = current_time(old_dir);
				3059	return 0;
				3060	}
				3061
				3062	static int shmem_symlink(struct inode dir, struct dentry dentry, const char *symname)
				3063	{
				3064	int error;
				3065	int len;
				3066	struct inode *inode;
				3067	struct page *page;
				3068
				3069	len = strlen(symname) + 1;
				3070	if (len > PAGE_SIZE)
				3071	return -ENAMETOOLONG;
				3072
				3073	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK \| 0777, 0,
				3074	VM_NORESERVE);
				3075	if (!inode)
				3076	return -ENOSPC;
				3077
				3078	error = security_inode_init_security(inode, dir, &dentry->d_name,
				3079	shmem_initxattrs, NULL);
				3080	if (error) {
				3081	if (error != -EOPNOTSUPP) {
				3082	iput(inode);
				3083	return error;
				3084	}
				3085	error = 0;
				3086	}
				3087
				3088	inode->i_size = len-1;
				3089	if (len <= SHORT_SYMLINK_LEN) {
				3090	inode->i_link = kmemdup(symname, len, GFP_KERNEL);
				3091	if (!inode->i_link) {
				3092	iput(inode);
				3093	return -ENOMEM;
				3094	}
				3095	inode->i_op = &shmem_short_symlink_operations;
				3096	} else {
				3097	inode_nohighmem(inode);
				3098	error = shmem_getpage(inode, 0, &page, SGP_WRITE);
				3099	if (error) {
				3100	iput(inode);
				3101	return error;
				3102	}
				3103	inode->i_mapping->a_ops = &shmem_aops;
				3104	inode->i_op = &shmem_symlink_inode_operations;
				3105	memcpy(page_address(page), symname, len);
				3106	SetPageUptodate(page);
				3107	set_page_dirty(page);
				3108	unlock_page(page);
				3109	put_page(page);
				3110	}
				3111	dir->i_size += BOGO_DIRENT_SIZE;
				3112	dir->i_ctime = dir->i_mtime = current_time(dir);
				3113	d_instantiate(dentry, inode);
				3114	dget(dentry);
				3115	return 0;
				3116	}
				3117
				/*
				 * Delayed-call callback installed by shmem_get_link(): marks the page
				 * holding the link target as accessed and drops the reference on it.
				 * @arg is the struct page pointer registered with set_delayed_call().
				 */
				static void shmem_put_link(void *arg)
				{
					struct page *page = arg;

					mark_page_accessed(page);
					put_page(page);
				}
				3123
				3124	static const char shmem_get_link(struct dentry dentry,
				3125	struct inode *inode,
				3126	struct delayed_call *done)
				3127	{
				3128	struct page *page = NULL;
				3129	int error;
				3130	if (!dentry) {
				3131	page = find_get_page(inode->i_mapping, 0);
				3132	if (!page)
				3133	return ERR_PTR(-ECHILD);
				3134	if (!PageUptodate(page)) {
				3135	put_page(page);
				3136	return ERR_PTR(-ECHILD);
				3137	}
				3138	} else {
				3139	error = shmem_getpage(inode, 0, &page, SGP_READ);
				3140	if (error)
				3141	return ERR_PTR(error);
				3142	unlock_page(page);
				3143	}
				3144	set_delayed_call(done, shmem_put_link, page);
				3145	return page_address(page);
				3146	}
				3147
				3148	#ifdef CONFIG_TMPFS_XATTR
				3149	/*
				3150	* Superblocks without xattr inode operations may get some security.* xattr
				3151	* support from the LSM "for free". As soon as we have any other xattrs
				3152	* like ACLs, we also need to implement the security.* handlers at
				3153	* filesystem level, though.
				3154	*/
				3155
				3156	/*
				3157	* Callback for security_inode_init_security() for acquiring xattrs.
				3158	*/
				3159	static int shmem_initxattrs(struct inode *inode,
				3160	const struct xattr *xattr_array,
				3161	void *fs_info)
				3162	{
				3163	struct shmem_inode_info *info = SHMEM_I(inode);
				3164	const struct xattr *xattr;
				3165	struct simple_xattr *new_xattr;
				3166	size_t len;
				3167
				3168	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
				3169	new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
				3170	if (!new_xattr)
				3171	return -ENOMEM;
				3172
				3173	len = strlen(xattr->name) + 1;
				3174	new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
				3175	GFP_KERNEL);
				3176	if (!new_xattr->name) {
				3177	kfree(new_xattr);
				3178	return -ENOMEM;
				3179	}
				3180
				3181	memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
				3182	XATTR_SECURITY_PREFIX_LEN);
				3183	memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
				3184	xattr->name, len);
				3185
				3186	simple_xattr_list_add(&info->xattrs, new_xattr);
				3187	}
				3188
				3189	return 0;
				3190	}
				3191
				3192	static int shmem_xattr_handler_get(const struct xattr_handler *handler,
				3193	struct dentry unused, struct inode inode,
				3194	const char name, void buffer, size_t size)
				3195	{
				3196	struct shmem_inode_info *info = SHMEM_I(inode);
				3197
				3198	name = xattr_full_name(handler, name);
				3199	return simple_xattr_get(&info->xattrs, name, buffer, size);
				3200	}
				3201
				3202	static int shmem_xattr_handler_set(const struct xattr_handler *handler,
				3203	struct dentry unused, struct inode inode,
				3204	const char name, const void value,
				3205	size_t size, int flags)
				3206	{
				3207	struct shmem_inode_info *info = SHMEM_I(inode);
				3208
				3209	name = xattr_full_name(handler, name);
				3210	return simple_xattr_set(&info->xattrs, name, value, size, flags);
				3211	}
				3212
				3213	static const struct xattr_handler shmem_security_xattr_handler = {
				3214	.prefix = XATTR_SECURITY_PREFIX,
				3215	.get = shmem_xattr_handler_get,
				3216	.set = shmem_xattr_handler_set,
				3217	};
				3218
				3219	static const struct xattr_handler shmem_trusted_xattr_handler = {
				3220	.prefix = XATTR_TRUSTED_PREFIX,
				3221	.get = shmem_xattr_handler_get,
				3222	.set = shmem_xattr_handler_set,
				3223	};
				3224
				3225	static const struct xattr_handler *shmem_xattr_handlers[] = {
				3226	#ifdef CONFIG_TMPFS_POSIX_ACL
				3227	&posix_acl_access_xattr_handler,
				3228	&posix_acl_default_xattr_handler,
				3229	#endif
				3230	&shmem_security_xattr_handler,
				3231	&shmem_trusted_xattr_handler,
				3232	NULL
				3233	};
				3234
				3235	static ssize_t shmem_listxattr(struct dentry dentry, char buffer, size_t size)
				3236	{
				3237	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
				3238	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
				3239	}
				3240	#endif /* CONFIG_TMPFS_XATTR */
				3241
				3242	static const struct inode_operations shmem_short_symlink_operations = {
				3243	.get_link = simple_get_link,
				3244	#ifdef CONFIG_TMPFS_XATTR
				3245	.listxattr = shmem_listxattr,
				3246	#endif
				3247	};
				3248
				3249	static const struct inode_operations shmem_symlink_inode_operations = {
				3250	.get_link = shmem_get_link,
				3251	#ifdef CONFIG_TMPFS_XATTR
				3252	.listxattr = shmem_listxattr,
				3253	#endif
				3254	};
				3255
				3256	static struct dentry shmem_get_parent(struct dentry child)
				3257	{
				3258	return ERR_PTR(-ESTALE);
				3259	}
				3260
				3261	static int shmem_match(struct inode ino, void vfh)
				3262	{
				3263	__u32 *fh = vfh;
				3264	__u64 inum = fh[2];
				3265	inum = (inum << 32) \| fh[1];
				3266	return ino->i_ino == inum && fh[0] == ino->i_generation;
				3267	}
				3268
				/*
				 * Find any alias of inode, but prefer a hashed alias: try d_find_alias()
				 * first and fall back to d_find_any_alias() only if it returns NULL.
				 */
				static struct dentry *shmem_find_alias(struct inode *inode)
				{
					struct dentry *alias = d_find_alias(inode);

					return alias ?: d_find_any_alias(inode);
				}
				3276
				3277
				3278	static struct dentry shmem_fh_to_dentry(struct super_block sb,
				3279	struct fid *fid, int fh_len, int fh_type)
				3280	{
				3281	struct inode *inode;
				3282	struct dentry *dentry = NULL;
				3283	u64 inum;
				3284
				3285	if (fh_len < 3)
				3286	return NULL;
				3287
				3288	inum = fid->raw[2];
				3289	inum = (inum << 32) \| fid->raw[1];
				3290
				3291	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
				3292	shmem_match, fid->raw);
				3293	if (inode) {
				3294	dentry = shmem_find_alias(inode);
				3295	iput(inode);
				3296	}
				3297
				3298	return dentry;
				3299	}
				3300
				3301	static int shmem_encode_fh(struct inode inode, __u32 fh, int *len,
				3302	struct inode *parent)
				3303	{
				3304	if (*len < 3) {
				3305	*len = 3;
				3306	return FILEID_INVALID;
				3307	}
				3308
				3309	if (inode_unhashed(inode)) {
				3310	/* Unfortunately insert_inode_hash is not idempotent,
				3311	* so as we hash inodes here rather than at creation
				3312	* time, we need a lock to ensure we only try
				3313	* to do it once
				3314	*/
				3315	static DEFINE_SPINLOCK(lock);
				3316	spin_lock(&lock);
				3317	if (inode_unhashed(inode))
				3318	__insert_inode_hash(inode,
				3319	inode->i_ino + inode->i_generation);
				3320	spin_unlock(&lock);
				3321	}
				3322
				3323	fh[0] = inode->i_generation;
				3324	fh[1] = inode->i_ino;
				3325	fh[2] = ((__u64)inode->i_ino) >> 32;
				3326
				3327	*len = 3;
				3328	return 1;
				3329	}
				3330
				3331	static const struct export_operations shmem_export_ops = {
				3332	.get_parent = shmem_get_parent,
				3333	.encode_fh = shmem_encode_fh,
				3334	.fh_to_dentry = shmem_fh_to_dentry,
				3335	};
				3336
				3337	static int shmem_parse_options(char options, struct shmem_sb_info sbinfo,
				3338	bool remount)
				3339	{
				3340	char this_char, value, *rest;
				3341	struct mempolicy *mpol = NULL;
				3342	uid_t uid;
				3343	gid_t gid;
				3344
				3345	while (options != NULL) {
				3346	this_char = options;
				3347	for (;;) {
				3348	/*
				3349	* NUL-terminate this option: unfortunately,
				3350	* mount options form a comma-separated list,
				3351	* but mpol's nodelist may also contain commas.
				3352	*/
				3353	options = strchr(options, ',');
				3354	if (options == NULL)
				3355	break;
				3356	options++;
				3357	if (!isdigit(*options)) {
				3358	options[-1] = '\0';
				3359	break;
				3360	}
				3361	}
				3362	if (!*this_char)
				3363	continue;
				3364	if ((value = strchr(this_char,'=')) != NULL) {
				3365	*value++ = 0;
				3366	} else {
				3367	pr_err("tmpfs: No value for mount option '%s'\n",
				3368	this_char);
				3369	goto error;
				3370	}
				3371
				3372	if (!strcmp(this_char,"size")) {
				3373	unsigned long long size;
				3374	size = memparse(value,&rest);
				3375	if (*rest == '%') {
				3376	size <<= PAGE_SHIFT;
				3377	size *= totalram_pages;
				3378	do_div(size, 100);
				3379	rest++;
				3380	}
				3381	if (*rest)
				3382	goto bad_val;
				3383	sbinfo->max_blocks =
				3384	DIV_ROUND_UP(size, PAGE_SIZE);
				3385	} else if (!strcmp(this_char,"nr_blocks")) {
				3386	sbinfo->max_blocks = memparse(value, &rest);
				3387	if (*rest)
				3388	goto bad_val;
				3389	} else if (!strcmp(this_char,"nr_inodes")) {
				3390	sbinfo->max_inodes = memparse(value, &rest);
				3391	if (*rest)
				3392	goto bad_val;
				3393	} else if (!strcmp(this_char,"mode")) {
				3394	if (remount)
				3395	continue;
				3396	sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
				3397	if (*rest)
				3398	goto bad_val;
				3399	} else if (!strcmp(this_char,"uid")) {
				3400	if (remount)
				3401	continue;
				3402	uid = simple_strtoul(value, &rest, 0);
				3403	if (*rest)
				3404	goto bad_val;
				3405	sbinfo->uid = make_kuid(current_user_ns(), uid);
				3406	if (!uid_valid(sbinfo->uid))
				3407	goto bad_val;
				3408	} else if (!strcmp(this_char,"gid")) {
				3409	if (remount)
				3410	continue;
				3411	gid = simple_strtoul(value, &rest, 0);
				3412	if (*rest)
				3413	goto bad_val;
				3414	sbinfo->gid = make_kgid(current_user_ns(), gid);
				3415	if (!gid_valid(sbinfo->gid))
				3416	goto bad_val;
				3417	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3418	} else if (!strcmp(this_char, "huge")) {
				3419	int huge;
				3420	huge = shmem_parse_huge(value);
				3421	if (huge < 0)
				3422	goto bad_val;
				3423	if (!has_transparent_hugepage() &&
				3424	huge != SHMEM_HUGE_NEVER)
				3425	goto bad_val;
				3426	sbinfo->huge = huge;
				3427	#endif
				3428	#ifdef CONFIG_NUMA
				3429	} else if (!strcmp(this_char,"mpol")) {
				3430	mpol_put(mpol);
				3431	mpol = NULL;
				3432	if (mpol_parse_str(value, &mpol))
				3433	goto bad_val;
				3434	#endif
				3435	} else {
				3436	pr_err("tmpfs: Bad mount option %s\n", this_char);
				3437	goto error;
				3438	}
				3439	}
				3440	sbinfo->mpol = mpol;
				3441	return 0;
				3442
				3443	bad_val:
				3444	pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
				3445	value, this_char);
				3446	error:
				3447	mpol_put(mpol);
				3448	return 1;
				3449
				3450	}
				3451
				3452	static int shmem_remount_fs(struct super_block sb, int flags, char *data)
				3453	{
				3454	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				3455	struct shmem_sb_info config = *sbinfo;
				3456	unsigned long inodes;
				3457	int error = -EINVAL;
				3458
				3459	config.mpol = NULL;
				3460	if (shmem_parse_options(data, &config, true))
				3461	return error;
				3462
				3463	spin_lock(&sbinfo->stat_lock);
				3464	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
				3465	if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
				3466	goto out;
				3467	if (config.max_inodes < inodes)
				3468	goto out;
				3469	/*
				3470	* Those tests disallow limited->unlimited while any are in use;
				3471	* but we must separately disallow unlimited->limited, because
				3472	* in that case we have no record of how much is already in use.
				3473	*/
				3474	if (config.max_blocks && !sbinfo->max_blocks)
				3475	goto out;
				3476	if (config.max_inodes && !sbinfo->max_inodes)
				3477	goto out;
				3478
				3479	error = 0;
				3480	sbinfo->huge = config.huge;
				3481	sbinfo->max_blocks = config.max_blocks;
				3482	sbinfo->max_inodes = config.max_inodes;
				3483	sbinfo->free_inodes = config.max_inodes - inodes;
				3484
				3485	/*
				3486	* Preserve previous mempolicy unless mpol remount option was specified.
				3487	*/
				3488	if (config.mpol) {
				3489	mpol_put(sbinfo->mpol);
				3490	sbinfo->mpol = config.mpol; /* transfers initial ref */
				3491	}
				3492	out:
				3493	spin_unlock(&sbinfo->stat_lock);
				3494	return error;
				3495	}
				3496
				3497	static int shmem_show_options(struct seq_file seq, struct dentry root)
				3498	{
				3499	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
				3500
				3501	if (sbinfo->max_blocks != shmem_default_max_blocks())
				3502	seq_printf(seq, ",size=%luk",
				3503	sbinfo->max_blocks << (PAGE_SHIFT - 10));
				3504	if (sbinfo->max_inodes != shmem_default_max_inodes())
				3505	seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
				3506	if (sbinfo->mode != (0777 \| S_ISVTX))
				3507	seq_printf(seq, ",mode=%03ho", sbinfo->mode);
				3508	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
				3509	seq_printf(seq, ",uid=%u",
				3510	from_kuid_munged(&init_user_ns, sbinfo->uid));
				3511	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
				3512	seq_printf(seq, ",gid=%u",
				3513	from_kgid_munged(&init_user_ns, sbinfo->gid));
				3514	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3515	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
				3516	if (sbinfo->huge)
				3517	seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
				3518	#endif
				3519	shmem_show_mpol(seq, sbinfo->mpol);
				3520	return 0;
				3521	}
				3522
				3523	#endif /* CONFIG_TMPFS */
				3524
				3525	static void shmem_put_super(struct super_block *sb)
				3526	{
				3527	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
				3528
				3529	percpu_counter_destroy(&sbinfo->used_blocks);
				3530	mpol_put(sbinfo->mpol);
				3531	kfree(sbinfo);
				3532	sb->s_fs_info = NULL;
				3533	}
				3534
				3535	int shmem_fill_super(struct super_block sb, void data, int silent)
				3536	{
				3537	struct inode *inode;
				3538	struct shmem_sb_info *sbinfo;
				3539	int err = -ENOMEM;
				3540
				3541	/* Round up to L1_CACHE_BYTES to resist false sharing */
				3542	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				3543	L1_CACHE_BYTES), GFP_KERNEL);
				3544	if (!sbinfo)
				3545	return -ENOMEM;
				3546
				3547	sbinfo->mode = 0777 \| S_ISVTX;
				3548	sbinfo->uid = current_fsuid();
				3549	sbinfo->gid = current_fsgid();
				3550	sb->s_fs_info = sbinfo;
				3551
				3552	#ifdef CONFIG_TMPFS
				3553	/*
				3554	* Per default we only allow half of the physical ram per
				3555	* tmpfs instance, limiting inodes to one per page of lowmem;
				3556	* but the internal instance is left unlimited.
				3557	*/
				3558	if (!(sb->s_flags & SB_KERNMOUNT)) {
				3559	sbinfo->max_blocks = shmem_default_max_blocks();
				3560	sbinfo->max_inodes = shmem_default_max_inodes();
				3561	if (shmem_parse_options(data, sbinfo, false)) {
				3562	err = -EINVAL;
				3563	goto failed;
				3564	}
				3565	} else {
				3566	sb->s_flags \|= SB_NOUSER;
				3567	}
				3568	sb->s_export_op = &shmem_export_ops;
				3569	sb->s_flags \|= SB_NOSEC;
				3570	#else
				3571	sb->s_flags \|= SB_NOUSER;
				3572	#endif
				3573
				3574	spin_lock_init(&sbinfo->stat_lock);
				3575	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
				3576	goto failed;
				3577	sbinfo->free_inodes = sbinfo->max_inodes;
				3578	spin_lock_init(&sbinfo->shrinklist_lock);
				3579	INIT_LIST_HEAD(&sbinfo->shrinklist);
				3580
				3581	sb->s_maxbytes = MAX_LFS_FILESIZE;
				3582	sb->s_blocksize = PAGE_SIZE;
				3583	sb->s_blocksize_bits = PAGE_SHIFT;
				3584	sb->s_magic = TMPFS_MAGIC;
				3585	sb->s_op = &shmem_ops;
				3586	sb->s_time_gran = 1;
				3587	#ifdef CONFIG_TMPFS_XATTR
				3588	sb->s_xattr = shmem_xattr_handlers;
				3589	#endif
				3590	#ifdef CONFIG_TMPFS_POSIX_ACL
				3591	sb->s_flags \|= SB_POSIXACL;
				3592	#endif
				3593	uuid_gen(&sb->s_uuid);
				3594
				3595	inode = shmem_get_inode(sb, NULL, S_IFDIR \| sbinfo->mode, 0, VM_NORESERVE);
				3596	if (!inode)
				3597	goto failed;
				3598	inode->i_uid = sbinfo->uid;
				3599	inode->i_gid = sbinfo->gid;
				3600	sb->s_root = d_make_root(inode);
				3601	if (!sb->s_root)
				3602	goto failed;
				3603	return 0;
				3604
				3605	failed:
				3606	shmem_put_super(sb);
				3607	return err;
				3608	}
				3609
				3610	static struct kmem_cache *shmem_inode_cachep;
				3611
				3612	static struct inode shmem_alloc_inode(struct super_block sb)
				3613	{
				3614	struct shmem_inode_info *info;
				3615	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
				3616	if (!info)
				3617	return NULL;
				3618	return &info->vfs_inode;
				3619	}
				3620
				3621	static void shmem_destroy_callback(struct rcu_head *head)
				3622	{
				3623	struct inode *inode = container_of(head, struct inode, i_rcu);
				3624	if (S_ISLNK(inode->i_mode))
				3625	kfree(inode->i_link);
				3626	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
				3627	}
				3628
				3629	static void shmem_destroy_inode(struct inode *inode)
				3630	{
				3631	if (S_ISREG(inode->i_mode))
				3632	mpol_free_shared_policy(&SHMEM_I(inode)->policy);
				3633	call_rcu(&inode->i_rcu, shmem_destroy_callback);
				3634	}
				3635
				3636	static void shmem_init_inode(void *foo)
				3637	{
				3638	struct shmem_inode_info *info = foo;
				3639	inode_init_once(&info->vfs_inode);
				3640	}
				3641
				3642	static void shmem_init_inodecache(void)
				3643	{
				3644	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				3645	sizeof(struct shmem_inode_info),
				3646	0, SLAB_PANIC\|SLAB_ACCOUNT, shmem_init_inode);
				3647	}
				3648
				3649	static void shmem_destroy_inodecache(void)
				3650	{
				3651	kmem_cache_destroy(shmem_inode_cachep);
				3652	}
				3653
				3654	static const struct address_space_operations shmem_aops = {
				3655	.writepage = shmem_writepage,
				3656	.set_page_dirty = __set_page_dirty_no_writeback,
				3657	#ifdef CONFIG_TMPFS
				3658	.write_begin = shmem_write_begin,
				3659	.write_end = shmem_write_end,
				3660	#endif
				3661	#ifdef CONFIG_MIGRATION
				3662	.migratepage = migrate_page,
				3663	#endif
				3664	.error_remove_page = generic_error_remove_page,
				3665	};
				3666
				3667	static const struct file_operations shmem_file_operations = {
				3668	.mmap = shmem_mmap,
				3669	.get_unmapped_area = shmem_get_unmapped_area,
				3670	#ifdef CONFIG_TMPFS
				3671	.llseek = shmem_file_llseek,
				3672	.read_iter = shmem_file_read_iter,
				3673	.write_iter = generic_file_write_iter,
				3674	.fsync = noop_fsync,
				3675	.splice_read = generic_file_splice_read,
				3676	.splice_write = iter_file_splice_write,
				3677	.fallocate = shmem_fallocate,
				3678	#endif
				3679	};
				3680
				3681	static const struct inode_operations shmem_inode_operations = {
				3682	.getattr = shmem_getattr,
				3683	.setattr = shmem_setattr,
				3684	#ifdef CONFIG_TMPFS_XATTR
				3685	.listxattr = shmem_listxattr,
				3686	.set_acl = simple_set_acl,
				3687	#endif
				3688	};
				3689
				3690	static const struct inode_operations shmem_dir_inode_operations = {
				3691	#ifdef CONFIG_TMPFS
				3692	.create = shmem_create,
				3693	.lookup = simple_lookup,
				3694	.link = shmem_link,
				3695	.unlink = shmem_unlink,
				3696	.symlink = shmem_symlink,
				3697	.mkdir = shmem_mkdir,
				3698	.rmdir = shmem_rmdir,
				3699	.mknod = shmem_mknod,
				3700	.rename = shmem_rename2,
				3701	.tmpfile = shmem_tmpfile,
				3702	#endif
				3703	#ifdef CONFIG_TMPFS_XATTR
				3704	.listxattr = shmem_listxattr,
				3705	#endif
				3706	#ifdef CONFIG_TMPFS_POSIX_ACL
				3707	.setattr = shmem_setattr,
				3708	.set_acl = simple_set_acl,
				3709	#endif
				3710	};
				3711
				3712	static const struct inode_operations shmem_special_inode_operations = {
				3713	#ifdef CONFIG_TMPFS_XATTR
				3714	.listxattr = shmem_listxattr,
				3715	#endif
				3716	#ifdef CONFIG_TMPFS_POSIX_ACL
				3717	.setattr = shmem_setattr,
				3718	.set_acl = simple_set_acl,
				3719	#endif
				3720	};
				3721
				3722	static const struct super_operations shmem_ops = {
				3723	.alloc_inode = shmem_alloc_inode,
				3724	.destroy_inode = shmem_destroy_inode,
				3725	#ifdef CONFIG_TMPFS
				3726	.statfs = shmem_statfs,
				3727	.remount_fs = shmem_remount_fs,
				3728	.show_options = shmem_show_options,
				3729	#endif
				3730	.evict_inode = shmem_evict_inode,
				3731	.drop_inode = generic_delete_inode,
				3732	.put_super = shmem_put_super,
				3733	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3734	.nr_cached_objects = shmem_unused_huge_count,
				3735	.free_cached_objects = shmem_unused_huge_scan,
				3736	#endif
				3737	};
				3738
				3739	static const struct vm_operations_struct shmem_vm_ops = {
				3740	.fault = shmem_fault,
				3741	.map_pages = filemap_map_pages,
				3742	#ifdef CONFIG_NUMA
				3743	.set_policy = shmem_set_policy,
				3744	.get_policy = shmem_get_policy,
				3745	#endif
				3746	};
				3747
				3748	static struct dentry shmem_mount(struct file_system_type fs_type,
				3749	int flags, const char dev_name, void data)
				3750	{
				3751	return mount_nodev(fs_type, flags, data, shmem_fill_super);
				3752	}
				3753
				3754	static struct file_system_type shmem_fs_type = {
				3755	.owner = THIS_MODULE,
				3756	.name = "tmpfs",
				3757	.mount = shmem_mount,
				3758	.kill_sb = kill_litter_super,
				3759	.fs_flags = FS_USERNS_MOUNT,
				3760	};
				3761
				3762	int __init shmem_init(void)
				3763	{
				3764	int error;
				3765
				3766	/* If rootfs called this, don't re-init */
				3767	if (shmem_inode_cachep)
				3768	return 0;
				3769
				3770	shmem_init_inodecache();
				3771
				3772	error = register_filesystem(&shmem_fs_type);
				3773	if (error) {
				3774	pr_err("Could not register tmpfs\n");
				3775	goto out2;
				3776	}
				3777
				3778	shm_mnt = kern_mount(&shmem_fs_type);
				3779	if (IS_ERR(shm_mnt)) {
				3780	error = PTR_ERR(shm_mnt);
				3781	pr_err("Could not kern_mount tmpfs\n");
				3782	goto out1;
				3783	}
				3784
				3785	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3786	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
				3787	SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
				3788	else
				3789	shmem_huge = 0; /* just in case it was patched */
				3790	#endif
				3791	return 0;
				3792
				3793	out1:
				3794	unregister_filesystem(&shmem_fs_type);
				3795	out2:
				3796	shmem_destroy_inodecache();
				3797	shm_mnt = ERR_PTR(error);
				3798	return error;
				3799	}
				3800
				3801	#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
				3802	static ssize_t shmem_enabled_show(struct kobject *kobj,
				3803	struct kobj_attribute attr, char buf)
				3804	{
				3805	int values[] = {
				3806	SHMEM_HUGE_ALWAYS,
				3807	SHMEM_HUGE_WITHIN_SIZE,
				3808	SHMEM_HUGE_ADVISE,
				3809	SHMEM_HUGE_NEVER,
				3810	SHMEM_HUGE_DENY,
				3811	SHMEM_HUGE_FORCE,
				3812	};
				3813	int i, count;
				3814
				3815	for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
				3816	const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
				3817
				3818	count += sprintf(buf + count, fmt,
				3819	shmem_format_huge(values[i]));
				3820	}
				3821	buf[count - 1] = '\n';
				3822	return count;
				3823	}
				3824
				3825	static ssize_t shmem_enabled_store(struct kobject *kobj,
				3826	struct kobj_attribute attr, const char buf, size_t count)
				3827	{
				3828	char tmp[16];
				3829	int huge;
				3830
				3831	if (count + 1 > sizeof(tmp))
				3832	return -EINVAL;
				3833	memcpy(tmp, buf, count);
				3834	tmp[count] = '\0';
				3835	if (count && tmp[count - 1] == '\n')
				3836	tmp[count - 1] = '\0';
				3837
				3838	huge = shmem_parse_huge(tmp);
				3839	if (huge == -EINVAL)
				3840	return -EINVAL;
				3841	if (!has_transparent_hugepage() &&
				3842	huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
				3843	return -EINVAL;
				3844
				3845	shmem_huge = huge;
				3846	if (shmem_huge > SHMEM_HUGE_DENY)
				3847	SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
				3848	return count;
				3849	}
				3850
				3851	struct kobj_attribute shmem_enabled_attr =
				3852	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
				3853	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
				3854
				3855	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
				3856	bool shmem_huge_enabled(struct vm_area_struct *vma)
				3857	{
				3858	struct inode *inode = file_inode(vma->vm_file);
				3859	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
				3860	loff_t i_size;
				3861	pgoff_t off;
				3862
				3863	if (shmem_huge == SHMEM_HUGE_FORCE)
				3864	return true;
				3865	if (shmem_huge == SHMEM_HUGE_DENY)
				3866	return false;
				3867	switch (sbinfo->huge) {
				3868	case SHMEM_HUGE_NEVER:
				3869	return false;
				3870	case SHMEM_HUGE_ALWAYS:
				3871	return true;
				3872	case SHMEM_HUGE_WITHIN_SIZE:
				3873	off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
				3874	i_size = round_up(i_size_read(inode), PAGE_SIZE);
				3875	if (i_size >= HPAGE_PMD_SIZE &&
				3876	i_size >> PAGE_SHIFT >= off)
				3877	return true;
				3878	/* fall through */
				3879	case SHMEM_HUGE_ADVISE:
				3880	/* TODO: implement fadvise() hints */
				3881	return (vma->vm_flags & VM_HUGEPAGE);
				3882	default:
				3883	VM_BUG_ON(1);
				3884	return false;
				3885	}
				3886	}
				3887	#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
				3888
				3889	#else /* !CONFIG_SHMEM */
				3890
				3891	/*
				3892	* tiny-shmem: simple shmemfs and tmpfs using ramfs code
				3893	*
				3894	* This is intended for small systems where the benefits of the full
				3895	* shmem code (swap-backed and resource-limited) are outweighed by
				3896	* their complexity. On systems without swap this code should be
				3897	* effectively equivalent, but much lighter weight.
				3898	*/
				3899
				3900	static struct file_system_type shmem_fs_type = {
				3901	.name = "tmpfs",
				3902	.mount = ramfs_mount,
				3903	.kill_sb = kill_litter_super,
				3904	.fs_flags = FS_USERNS_MOUNT,
				3905	};
				3906
				3907	int __init shmem_init(void)
				3908	{
				3909	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
				3910
				3911	shm_mnt = kern_mount(&shmem_fs_type);
				3912	BUG_ON(IS_ERR(shm_mnt));
				3913
				3914	return 0;
				3915	}
				3916
				3917	int shmem_unuse(swp_entry_t swap, struct page *page)
				3918	{
				3919	return 0;
				3920	}
				3921
				/*
				 * !CONFIG_SHMEM stub: ignores @file, @lock and @user and always
				 * returns 0.
				 */
				int shmem_lock(struct file *file, int lock, struct user_struct *user)
				{
					return 0;
				}
				3926
				/* !CONFIG_SHMEM stub: does nothing with @mapping. */
				void shmem_unlock_mapping(struct address_space *mapping)
				{
				}
				3930
				3931	#ifdef CONFIG_MMU
				3932	unsigned long shmem_get_unmapped_area(struct file *file,
				3933	unsigned long addr, unsigned long len,
				3934	unsigned long pgoff, unsigned long flags)
				3935	{
				3936	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
				3937	}
				3938	#endif
				3939
				3940	void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
				3941	{
				3942	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
				3943	}
				3944	EXPORT_SYMBOL_GPL(shmem_truncate_range);
				3945
				/*
				 * !CONFIG_SHMEM aliases used by the common code below: the shmem names
				 * map onto their generic/ramfs counterparts, shmem_get_inode() drops its
				 * flags argument, and size accounting becomes a no-op (acct evaluates to
				 * 0, unacct to an empty statement).
				 */
				#define shmem_vm_ops					generic_file_vm_ops
				#define shmem_file_operations				ramfs_file_operations
				#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
				#define shmem_acct_size(flags, size)			0
				#define shmem_unacct_size(flags, size)			do {} while (0)
				3951
				3952	#endif /* CONFIG_SHMEM */
				3953
				3954	/* common code */
				3955
				3956	static struct file __shmem_file_setup(struct vfsmount mnt, const char *name, loff_t size,
				3957	unsigned long flags, unsigned int i_flags)
				3958	{
				3959	struct inode *inode;
				3960	struct file *res;
				3961
				3962	if (IS_ERR(mnt))
				3963	return ERR_CAST(mnt);
				3964
				3965	if (size < 0 \|\| size > MAX_LFS_FILESIZE)
				3966	return ERR_PTR(-EINVAL);
				3967
				3968	if (shmem_acct_size(flags, size))
				3969	return ERR_PTR(-ENOMEM);
				3970
				3971	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG \| S_IRWXUGO, 0,
				3972	flags);
				3973	if (unlikely(!inode)) {
				3974	shmem_unacct_size(flags, size);
				3975	return ERR_PTR(-ENOSPC);
				3976	}
				3977	inode->i_flags \|= i_flags;
				3978	inode->i_size = size;
				3979	clear_nlink(inode); /* It is unlinked */
				3980	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
				3981	if (!IS_ERR(res))
				3982	res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				3983	&shmem_file_operations);
				3984	if (IS_ERR(res))
				3985	iput(inode);
				3986	return res;
				3987	}
				3988
				3989	/**
				3990	* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
				3991	* kernel internal. There will be NO LSM permission checks against the
				3992	* underlying inode. So users of this interface must do LSM checks at a
				3993	* higher layer. The users are the big_key and shm implementations. LSM
				3994	* checks are provided at the key or shm level rather than the inode.
				3995	* @name: name for dentry (to be seen in /proc/<pid>/maps
				3996	* @size: size to be set for the file
				3997	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				3998	*/
				3999	struct file shmem_kernel_file_setup(const char name, loff_t size, unsigned long flags)
				4000	{
				4001	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
				4002	}
				4003
				4004	/**
				4005	* shmem_file_setup - get an unlinked file living in tmpfs
				4006	* @name: name for dentry (to be seen in /proc/<pid>/maps
				4007	* @size: size to be set for the file
				4008	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				4009	*/
				4010	struct file shmem_file_setup(const char name, loff_t size, unsigned long flags)
				4011	{
				4012	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
				4013	}
				4014	EXPORT_SYMBOL_GPL(shmem_file_setup);
				4015
				4016	/**
				4017	* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
				4018	* @mnt: the tmpfs mount where the file will be created
				4019	* @name: name for dentry (to be seen in /proc/<pid>/maps
				4020	* @size: size to be set for the file
				4021	* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
				4022	*/
				4023	struct file shmem_file_setup_with_mnt(struct vfsmount mnt, const char *name,
				4024	loff_t size, unsigned long flags)
				4025	{
				4026	return __shmem_file_setup(mnt, name, size, flags, 0);
				4027	}
				4028	EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
				4029
				4030	/**
				4031	* shmem_zero_setup - setup a shared anonymous mapping
				4032	* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
				4033	*/
				4034	int shmem_zero_setup(struct vm_area_struct *vma)
				4035	{
				4036	struct file *file;
				4037	loff_t size = vma->vm_end - vma->vm_start;
				4038
				4039	/*
				4040	* Cloning a new file under mmap_sem leads to a lock ordering conflict
				4041	* between XFS directory reading and selinux: since this file is only
				4042	* accessible to the user through its mapping, use S_PRIVATE flag to
				4043	* bypass file security, in the same way as shmem_kernel_file_setup().
				4044	*/
				4045	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
				4046	if (IS_ERR(file))
				4047	return PTR_ERR(file);
				4048
				4049	if (vma->vm_file)
				4050	fput(vma->vm_file);
				4051	vma->vm_file = file;
				4052	vma->vm_ops = &shmem_vm_ops;
				4053
				4054	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
				4055	((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
				4056	(vma->vm_end & HPAGE_PMD_MASK)) {
				4057	khugepaged_enter(vma, vma->vm_flags);
				4058	}
				4059
				4060	return 0;
				4061	}
				4062
				4063	/**
				4064	* shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
				4065	* @mapping: the page's address_space
				4066	* @index: the page index
				4067	* @gfp: the page allocator flags to use if allocating
				4068	*
				4069	* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
				4070	* with any new page allocations done using the specified allocation flags.
				4071	* But read_cache_page_gfp() uses the ->readpage() method: which does not
				4072	* suit tmpfs, since it may have pages in swapcache, and needs to find those
				4073	* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
				4074	*
				4075	* i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY \| __GFP_NOWARN in
				4076	* with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
				4077	*/
				4078	struct page shmem_read_mapping_page_gfp(struct address_space mapping,
				4079	pgoff_t index, gfp_t gfp)
				4080	{
				4081	#ifdef CONFIG_SHMEM
				4082	struct inode *inode = mapping->host;
				4083	struct page *page;
				4084	int error;
				4085
				4086	BUG_ON(mapping->a_ops != &shmem_aops);
				4087	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
				4088	gfp, NULL, NULL, NULL);
				4089	if (error)
				4090	page = ERR_PTR(error);
				4091	else
				4092	unlock_page(page);
				4093	return page;
				4094	#else
				4095	/*
				4096	* The tiny !SHMEM case uses ramfs without swap
				4097	*/
				4098	return read_cache_page_gfp(mapping, index, gfp);
				4099	#endif
				4100	}
				4101	EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);