Blame - marvell/linux/fs/inode.c - T108

blob: 3d1b3165963182a8796feb768161fbff3bfeb631 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* (C) 1997 Linus Torvalds
				4	* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
				5	*/
				6	#include <linux/export.h>
				7	#include <linux/fs.h>
				8	#include <linux/mm.h>
				9	#include <linux/backing-dev.h>
				10	#include <linux/hash.h>
				11	#include <linux/swap.h>
				12	#include <linux/security.h>
				13	#include <linux/cdev.h>
				14	#include <linux/memblock.h>
				15	#include <linux/fscrypt.h>
				16	#include <linux/fsnotify.h>
				17	#include <linux/mount.h>
				18	#include <linux/posix_acl.h>
				19	#include <linux/prefetch.h>
				20	#include <linux/buffer_head.h> /* for inode_has_buffers */
				21	#include <linux/ratelimit.h>
				22	#include <linux/list_lru.h>
				23	#include <linux/iversion.h>
				24	#include <trace/events/writeback.h>
				25	#include "internal.h"
				26
/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering (an indented lock nests inside the lock above it):
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */
				57
				58	static unsigned int i_hash_mask __read_mostly;
				59	static unsigned int i_hash_shift __read_mostly;
				60	static struct hlist_head *inode_hashtable __read_mostly;
				61	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
				62
				63	/*
				64	* Empty aops. Can be used for the cases where the user does not
				65	* define any of the address_space operations.
				66	*/
				67	const struct address_space_operations empty_aops = {
				68	};
				69	EXPORT_SYMBOL(empty_aops);
				70
				71	/*
				72	* Statistics gathering..
				73	*/
				74	struct inodes_stat_t inodes_stat;
				75
				76	static DEFINE_PER_CPU(unsigned long, nr_inodes);
				77	static DEFINE_PER_CPU(unsigned long, nr_unused);
				78
				79	static struct kmem_cache *inode_cachep __read_mostly;
				80
				81	static long get_nr_inodes(void)
				82	{
				83	int i;
				84	long sum = 0;
				85	for_each_possible_cpu(i)
				86	sum += per_cpu(nr_inodes, i);
				87	return sum < 0 ? 0 : sum;
				88	}
				89
				90	static inline long get_nr_inodes_unused(void)
				91	{
				92	int i;
				93	long sum = 0;
				94	for_each_possible_cpu(i)
				95	sum += per_cpu(nr_unused, i);
				96	return sum < 0 ? 0 : sum;
				97	}
				98
				99	long get_nr_dirty_inodes(void)
				100	{
				101	/* not actually dirty inodes, but a wild approximation */
				102	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
				103	return nr_dirty > 0 ? nr_dirty : 0;
				104	}
				105
				106	/*
				107	* Handle nr_inode sysctl
				108	*/
				109	#ifdef CONFIG_SYSCTL
				110	int proc_nr_inodes(struct ctl_table *table, int write,
				111	void __user buffer, size_t lenp, loff_t *ppos)
				112	{
				113	inodes_stat.nr_inodes = get_nr_inodes();
				114	inodes_stat.nr_unused = get_nr_inodes_unused();
				115	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
				116	}
				117	#endif
				118
				119	static int no_open(struct inode inode, struct file file)
				120	{
				121	return -ENXIO;
				122	}
				123
				124	/**
				125	* inode_init_always - perform inode structure initialisation
				126	* @sb: superblock inode belongs to
				127	* @inode: inode to initialise
				128	*
				129	* These are initializations that need to be done on every inode
				130	* allocation as the fields are not initialised by slab allocation.
				131	*/
				132	int inode_init_always(struct super_block sb, struct inode inode)
				133	{
				134	static const struct inode_operations empty_iops;
				135	static const struct file_operations no_open_fops = {.open = no_open};
				136	struct address_space *const mapping = &inode->i_data;
				137
				138	inode->i_sb = sb;
				139	inode->i_blkbits = sb->s_blocksize_bits;
				140	inode->i_flags = 0;
				141	atomic64_set(&inode->i_sequence, 0);
				142	atomic_set(&inode->i_count, 1);
				143	inode->i_op = &empty_iops;
				144	inode->i_fop = &no_open_fops;
				145	inode->__i_nlink = 1;
				146	inode->i_opflags = 0;
				147	if (sb->s_xattr)
				148	inode->i_opflags \|= IOP_XATTR;
				149	i_uid_write(inode, 0);
				150	i_gid_write(inode, 0);
				151	atomic_set(&inode->i_writecount, 0);
				152	inode->i_size = 0;
				153	inode->i_write_hint = WRITE_LIFE_NOT_SET;
				154	inode->i_blocks = 0;
				155	inode->i_bytes = 0;
				156	inode->i_generation = 0;
				157	inode->i_pipe = NULL;
				158	inode->i_bdev = NULL;
				159	inode->i_cdev = NULL;
				160	inode->i_link = NULL;
				161	inode->i_dir_seq = 0;
				162	inode->i_rdev = 0;
				163	inode->dirtied_when = 0;
				164
				165	#ifdef CONFIG_CGROUP_WRITEBACK
				166	inode->i_wb_frn_winner = 0;
				167	inode->i_wb_frn_avg_time = 0;
				168	inode->i_wb_frn_history = 0;
				169	#endif
				170
				171	spin_lock_init(&inode->i_lock);
				172	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
				173
				174	init_rwsem(&inode->i_rwsem);
				175	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
				176
				177	atomic_set(&inode->i_dio_count, 0);
				178
				179	mapping->a_ops = &empty_aops;
				180	mapping->host = inode;
				181	mapping->flags = 0;
				182	mapping->wb_err = 0;
				183	atomic_set(&mapping->i_mmap_writable, 0);
				184	#ifdef CONFIG_READ_ONLY_THP_FOR_FS
				185	atomic_set(&mapping->nr_thps, 0);
				186	#endif
				187	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
				188	mapping->private_data = NULL;
				189	mapping->writeback_index = 0;
				190	inode->i_private = NULL;
				191	inode->i_mapping = mapping;
				192	INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
				193	#ifdef CONFIG_FS_POSIX_ACL
				194	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
				195	#endif
				196
				197	#ifdef CONFIG_FSNOTIFY
				198	inode->i_fsnotify_mask = 0;
				199	#endif
				200	inode->i_flctx = NULL;
				201
				202	if (unlikely(security_inode_alloc(inode)))
				203	return -ENOMEM;
				204	this_cpu_inc(nr_inodes);
				205
				206	return 0;
				207	}
				208	EXPORT_SYMBOL(inode_init_always);
				209
				210	void free_inode_nonrcu(struct inode *inode)
				211	{
				212	kmem_cache_free(inode_cachep, inode);
				213	}
				214	EXPORT_SYMBOL(free_inode_nonrcu);
				215
				216	static void i_callback(struct rcu_head *head)
				217	{
				218	struct inode *inode = container_of(head, struct inode, i_rcu);
				219	if (inode->free_inode)
				220	inode->free_inode(inode);
				221	else
				222	free_inode_nonrcu(inode);
				223	}
				224
				225	static struct inode alloc_inode(struct super_block sb)
				226	{
				227	const struct super_operations *ops = sb->s_op;
				228	struct inode *inode;
				229
				230	if (ops->alloc_inode)
				231	inode = ops->alloc_inode(sb);
				232	else
				233	inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
				234
				235	if (!inode)
				236	return NULL;
				237
				238	if (unlikely(inode_init_always(sb, inode))) {
				239	if (ops->destroy_inode) {
				240	ops->destroy_inode(inode);
				241	if (!ops->free_inode)
				242	return NULL;
				243	}
				244	inode->free_inode = ops->free_inode;
				245	i_callback(&inode->i_rcu);
				246	return NULL;
				247	}
				248
				249	return inode;
				250	}
				251
				252	void __destroy_inode(struct inode *inode)
				253	{
				254	BUG_ON(inode_has_buffers(inode));
				255	inode_detach_wb(inode);
				256	security_inode_free(inode);
				257	fsnotify_inode_delete(inode);
				258	locks_free_lock_context(inode);
				259	if (!inode->i_nlink) {
				260	WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
				261	atomic_long_dec(&inode->i_sb->s_remove_count);
				262	}
				263
				264	#ifdef CONFIG_FS_POSIX_ACL
				265	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
				266	posix_acl_release(inode->i_acl);
				267	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
				268	posix_acl_release(inode->i_default_acl);
				269	#endif
				270	this_cpu_dec(nr_inodes);
				271	}
				272	EXPORT_SYMBOL(__destroy_inode);
				273
				274	static void destroy_inode(struct inode *inode)
				275	{
				276	const struct super_operations *ops = inode->i_sb->s_op;
				277
				278	BUG_ON(!list_empty(&inode->i_lru));
				279	__destroy_inode(inode);
				280	if (ops->destroy_inode) {
				281	ops->destroy_inode(inode);
				282	if (!ops->free_inode)
				283	return;
				284	}
				285	inode->free_inode = ops->free_inode;
				286	call_rcu(&inode->i_rcu, i_callback);
				287	}
				288
				289	/**
				290	* drop_nlink - directly drop an inode's link count
				291	* @inode: inode
				292	*
				293	* This is a low-level filesystem helper to replace any
				294	* direct filesystem manipulation of i_nlink. In cases
				295	* where we are attempting to track writes to the
				296	* filesystem, a decrement to zero means an imminent
				297	* write when the file is truncated and actually unlinked
				298	* on the filesystem.
				299	*/
				300	void drop_nlink(struct inode *inode)
				301	{
				302	WARN_ON(inode->i_nlink == 0);
				303	inode->__i_nlink--;
				304	if (!inode->i_nlink)
				305	atomic_long_inc(&inode->i_sb->s_remove_count);
				306	}
				307	EXPORT_SYMBOL(drop_nlink);
				308
				309	/**
				310	* clear_nlink - directly zero an inode's link count
				311	* @inode: inode
				312	*
				313	* This is a low-level filesystem helper to replace any
				314	* direct filesystem manipulation of i_nlink. See
				315	* drop_nlink() for why we care about i_nlink hitting zero.
				316	*/
				317	void clear_nlink(struct inode *inode)
				318	{
				319	if (inode->i_nlink) {
				320	inode->__i_nlink = 0;
				321	atomic_long_inc(&inode->i_sb->s_remove_count);
				322	}
				323	}
				324	EXPORT_SYMBOL(clear_nlink);
				325
				326	/**
				327	* set_nlink - directly set an inode's link count
				328	* @inode: inode
				329	* @nlink: new nlink (should be non-zero)
				330	*
				331	* This is a low-level filesystem helper to replace any
				332	* direct filesystem manipulation of i_nlink.
				333	*/
				334	void set_nlink(struct inode *inode, unsigned int nlink)
				335	{
				336	if (!nlink) {
				337	clear_nlink(inode);
				338	} else {
				339	/* Yes, some filesystems do change nlink from zero to one */
				340	if (inode->i_nlink == 0)
				341	atomic_long_dec(&inode->i_sb->s_remove_count);
				342
				343	inode->__i_nlink = nlink;
				344	}
				345	}
				346	EXPORT_SYMBOL(set_nlink);
				347
				348	/**
				349	* inc_nlink - directly increment an inode's link count
				350	* @inode: inode
				351	*
				352	* This is a low-level filesystem helper to replace any
				353	* direct filesystem manipulation of i_nlink. Currently,
				354	* it is only here for parity with dec_nlink().
				355	*/
				356	void inc_nlink(struct inode *inode)
				357	{
				358	if (unlikely(inode->i_nlink == 0)) {
				359	WARN_ON(!(inode->i_state & I_LINKABLE));
				360	atomic_long_dec(&inode->i_sb->s_remove_count);
				361	}
				362
				363	inode->__i_nlink++;
				364	}
				365	EXPORT_SYMBOL(inc_nlink);
				366
				367	static void __address_space_init_once(struct address_space *mapping)
				368	{
				369	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ \| XA_FLAGS_ACCOUNT);
				370	init_rwsem(&mapping->i_mmap_rwsem);
				371	INIT_LIST_HEAD(&mapping->private_list);
				372	spin_lock_init(&mapping->private_lock);
				373	mapping->i_mmap = RB_ROOT_CACHED;
				374	}
				375
				376	void address_space_init_once(struct address_space *mapping)
				377	{
				378	memset(mapping, 0, sizeof(*mapping));
				379	__address_space_init_once(mapping);
				380	}
				381	EXPORT_SYMBOL(address_space_init_once);
				382
				383	/*
				384	* These are initializations that only need to be done
				385	* once, because the fields are idempotent across use
				386	* of the inode, so let the slab aware of that.
				387	*/
				388	void inode_init_once(struct inode *inode)
				389	{
				390	memset(inode, 0, sizeof(*inode));
				391	INIT_HLIST_NODE(&inode->i_hash);
				392	INIT_LIST_HEAD(&inode->i_devices);
				393	INIT_LIST_HEAD(&inode->i_io_list);
				394	INIT_LIST_HEAD(&inode->i_wb_list);
				395	INIT_LIST_HEAD(&inode->i_lru);
				396	__address_space_init_once(&inode->i_data);
				397	i_size_ordered_init(inode);
				398	}
				399	EXPORT_SYMBOL(inode_init_once);
				400
				401	static void init_once(void *foo)
				402	{
				403	struct inode inode = (struct inode ) foo;
				404
				405	inode_init_once(inode);
				406	}
				407
				408	/*
				409	* inode->i_lock must be held
				410	*/
				411	void __iget(struct inode *inode)
				412	{
				413	atomic_inc(&inode->i_count);
				414	}
				415
				416	/*
				417	* get additional reference to inode; caller must already hold one.
				418	*/
				419	void ihold(struct inode *inode)
				420	{
				421	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
				422	}
				423	EXPORT_SYMBOL(ihold);
				424
				425	static void inode_lru_list_add(struct inode *inode)
				426	{
				427	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
				428	this_cpu_inc(nr_unused);
				429	else
				430	inode->i_state \|= I_REFERENCED;
				431	}
				432
				433	/*
				434	* Add inode to LRU if needed (inode is unused and clean).
				435	*
				436	* Needs inode->i_lock held.
				437	*/
				438	void inode_add_lru(struct inode *inode)
				439	{
				440	if (!(inode->i_state & (I_DIRTY_ALL \| I_SYNC \|
				441	I_FREEING \| I_WILL_FREE)) &&
				442	!atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
				443	inode_lru_list_add(inode);
				444	}
				445
				446
				447	static void inode_lru_list_del(struct inode *inode)
				448	{
				449
				450	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
				451	this_cpu_dec(nr_unused);
				452	}
				453
				454	static void inode_pin_lru_isolating(struct inode *inode)
				455	{
				456	lockdep_assert_held(&inode->i_lock);
				457	WARN_ON(inode->i_state & (I_LRU_ISOLATING \| I_FREEING \| I_WILL_FREE));
				458	inode->i_state \|= I_LRU_ISOLATING;
				459	}
				460
				461	static void inode_unpin_lru_isolating(struct inode *inode)
				462	{
				463	spin_lock(&inode->i_lock);
				464	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
				465	inode->i_state &= ~I_LRU_ISOLATING;
				466	smp_mb();
				467	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
				468	spin_unlock(&inode->i_lock);
				469	}
				470
				471	static void inode_wait_for_lru_isolating(struct inode *inode)
				472	{
				473	spin_lock(&inode->i_lock);
				474	if (inode->i_state & I_LRU_ISOLATING) {
				475	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
				476	wait_queue_head_t *wqh;
				477
				478	wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
				479	spin_unlock(&inode->i_lock);
				480	__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
				481	spin_lock(&inode->i_lock);
				482	WARN_ON(inode->i_state & I_LRU_ISOLATING);
				483	}
				484	spin_unlock(&inode->i_lock);
				485	}
				486
				487	/**
				488	* inode_sb_list_add - add inode to the superblock list of inodes
				489	* @inode: inode to add
				490	*/
				491	void inode_sb_list_add(struct inode *inode)
				492	{
				493	spin_lock(&inode->i_sb->s_inode_list_lock);
				494	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
				495	spin_unlock(&inode->i_sb->s_inode_list_lock);
				496	}
				497	EXPORT_SYMBOL_GPL(inode_sb_list_add);
				498
				499	static inline void inode_sb_list_del(struct inode *inode)
				500	{
				501	if (!list_empty(&inode->i_sb_list)) {
				502	spin_lock(&inode->i_sb->s_inode_list_lock);
				503	list_del_init(&inode->i_sb_list);
				504	spin_unlock(&inode->i_sb->s_inode_list_lock);
				505	}
				506	}
				507
				508	static unsigned long hash(struct super_block *sb, unsigned long hashval)
				509	{
				510	unsigned long tmp;
				511
				512	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
				513	L1_CACHE_BYTES;
				514	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
				515	return tmp & i_hash_mask;
				516	}
				517
				518	/**
				519	* __insert_inode_hash - hash an inode
				520	* @inode: unhashed inode
				521	* @hashval: unsigned long value used to locate this object in the
				522	* inode_hashtable.
				523	*
				524	* Add an inode to the inode hash for this superblock.
				525	*/
				526	void __insert_inode_hash(struct inode *inode, unsigned long hashval)
				527	{
				528	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
				529
				530	spin_lock(&inode_hash_lock);
				531	spin_lock(&inode->i_lock);
				532	hlist_add_head(&inode->i_hash, b);
				533	spin_unlock(&inode->i_lock);
				534	spin_unlock(&inode_hash_lock);
				535	}
				536	EXPORT_SYMBOL(__insert_inode_hash);
				537
				538	/**
				539	* __remove_inode_hash - remove an inode from the hash
				540	* @inode: inode to unhash
				541	*
				542	* Remove an inode from the superblock.
				543	*/
				544	void __remove_inode_hash(struct inode *inode)
				545	{
				546	spin_lock(&inode_hash_lock);
				547	spin_lock(&inode->i_lock);
				548	hlist_del_init(&inode->i_hash);
				549	spin_unlock(&inode->i_lock);
				550	spin_unlock(&inode_hash_lock);
				551	}
				552	EXPORT_SYMBOL(__remove_inode_hash);
				553
				554	void clear_inode(struct inode *inode)
				555	{
				556	/*
				557	* We have to cycle the i_pages lock here because reclaim can be in the
				558	* process of removing the last page (in __delete_from_page_cache())
				559	* and we must not free the mapping under it.
				560	*/
				561	xa_lock_irq(&inode->i_data.i_pages);
				562	BUG_ON(inode->i_data.nrpages);
				563	BUG_ON(inode->i_data.nrexceptional);
				564	xa_unlock_irq(&inode->i_data.i_pages);
				565	BUG_ON(!list_empty(&inode->i_data.private_list));
				566	BUG_ON(!(inode->i_state & I_FREEING));
				567	BUG_ON(inode->i_state & I_CLEAR);
				568	BUG_ON(!list_empty(&inode->i_wb_list));
				569	/* don't need i_lock here, no concurrent mods to i_state */
				570	inode->i_state = I_FREEING \| I_CLEAR;
				571	}
				572	EXPORT_SYMBOL(clear_inode);
				573
				574	/*
				575	* Free the inode passed in, removing it from the lists it is still connected
				576	* to. We remove any pages still attached to the inode and wait for any IO that
				577	* is still in progress before finally destroying the inode.
				578	*
				579	* An inode must already be marked I_FREEING so that we avoid the inode being
				580	* moved back onto lists if we race with other code that manipulates the lists
				581	* (e.g. writeback_single_inode). The caller is responsible for setting this.
				582	*
				583	* An inode must already be removed from the LRU list before being evicted from
				584	* the cache. This should occur atomically with setting the I_FREEING state
				585	* flag, so no inodes here should ever be on the LRU when being evicted.
				586	*/
				587	static void evict(struct inode *inode)
				588	{
				589	const struct super_operations *op = inode->i_sb->s_op;
				590
				591	BUG_ON(!(inode->i_state & I_FREEING));
				592	BUG_ON(!list_empty(&inode->i_lru));
				593
				594	if (!list_empty(&inode->i_io_list))
				595	inode_io_list_del(inode);
				596
				597	inode_sb_list_del(inode);
				598
				599	inode_wait_for_lru_isolating(inode);
				600
				601	/*
				602	* Wait for flusher thread to be done with the inode so that filesystem
				603	* does not start destroying it while writeback is still running. Since
				604	* the inode has I_FREEING set, flusher thread won't start new work on
				605	* the inode. We just have to wait for running writeback to finish.
				606	*/
				607	inode_wait_for_writeback(inode);
				608
				609	if (op->evict_inode) {
				610	op->evict_inode(inode);
				611	} else {
				612	truncate_inode_pages_final(&inode->i_data);
				613	clear_inode(inode);
				614	}
				615	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
				616	bd_forget(inode);
				617	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
				618	cd_forget(inode);
				619
				620	remove_inode_hash(inode);
				621
				622	spin_lock(&inode->i_lock);
				623	wake_up_bit(&inode->i_state, __I_NEW);
				624	BUG_ON(inode->i_state != (I_FREEING \| I_CLEAR));
				625	spin_unlock(&inode->i_lock);
				626
				627	destroy_inode(inode);
				628	}
				629
				630	/*
				631	* dispose_list - dispose of the contents of a local list
				632	* @head: the head of the list to free
				633	*
				634	* Dispose-list gets a local list with local inodes in it, so it doesn't
				635	* need to worry about list corruption and SMP locks.
				636	*/
				637	static void dispose_list(struct list_head *head)
				638	{
				639	while (!list_empty(head)) {
				640	struct inode *inode;
				641
				642	inode = list_first_entry(head, struct inode, i_lru);
				643	list_del_init(&inode->i_lru);
				644
				645	evict(inode);
				646	cond_resched();
				647	}
				648	}
				649
				650	/**
				651	* evict_inodes - evict all evictable inodes for a superblock
				652	* @sb: superblock to operate on
				653	*
				654	* Make sure that no inodes with zero refcount are retained. This is
				655	* called by superblock shutdown after having SB_ACTIVE flag removed,
				656	* so any inode reaching zero refcount during or after that call will
				657	* be immediately evicted.
				658	*/
				659	void evict_inodes(struct super_block *sb)
				660	{
				661	struct inode inode, next;
				662	LIST_HEAD(dispose);
				663
				664	again:
				665	spin_lock(&sb->s_inode_list_lock);
				666	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				667	if (atomic_read(&inode->i_count))
				668	continue;
				669
				670	spin_lock(&inode->i_lock);
				671	if (atomic_read(&inode->i_count)) {
				672	spin_unlock(&inode->i_lock);
				673	continue;
				674	}
				675	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				676	spin_unlock(&inode->i_lock);
				677	continue;
				678	}
				679
				680	inode->i_state \|= I_FREEING;
				681	inode_lru_list_del(inode);
				682	spin_unlock(&inode->i_lock);
				683	list_add(&inode->i_lru, &dispose);
				684
				685	/*
				686	* We can have a ton of inodes to evict at unmount time given
				687	* enough memory, check to see if we need to go to sleep for a
				688	* bit so we don't livelock.
				689	*/
				690	if (need_resched()) {
				691	spin_unlock(&sb->s_inode_list_lock);
				692	cond_resched();
				693	dispose_list(&dispose);
				694	goto again;
				695	}
				696	}
				697	spin_unlock(&sb->s_inode_list_lock);
				698
				699	dispose_list(&dispose);
				700	}
				701	EXPORT_SYMBOL_GPL(evict_inodes);
				702
				703	/**
				704	* invalidate_inodes - attempt to free all inodes on a superblock
				705	* @sb: superblock to operate on
				706	* @kill_dirty: flag to guide handling of dirty inodes
				707	*
				708	* Attempts to free all inodes for a given superblock. If there were any
				709	* busy inodes return a non-zero value, else zero.
				710	* If @kill_dirty is set, discard dirty inodes too, otherwise treat
				711	* them as busy.
				712	*/
				713	int invalidate_inodes(struct super_block *sb, bool kill_dirty)
				714	{
				715	int busy = 0;
				716	struct inode inode, next;
				717	LIST_HEAD(dispose);
				718
				719	again:
				720	spin_lock(&sb->s_inode_list_lock);
				721	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				722	spin_lock(&inode->i_lock);
				723	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				724	spin_unlock(&inode->i_lock);
				725	continue;
				726	}
				727	if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
				728	spin_unlock(&inode->i_lock);
				729	busy = 1;
				730	continue;
				731	}
				732	if (atomic_read(&inode->i_count)) {
				733	spin_unlock(&inode->i_lock);
				734	busy = 1;
				735	continue;
				736	}
				737
				738	inode->i_state \|= I_FREEING;
				739	inode_lru_list_del(inode);
				740	spin_unlock(&inode->i_lock);
				741	list_add(&inode->i_lru, &dispose);
				742	if (need_resched()) {
				743	spin_unlock(&sb->s_inode_list_lock);
				744	cond_resched();
				745	dispose_list(&dispose);
				746	goto again;
				747	}
				748	}
				749	spin_unlock(&sb->s_inode_list_lock);
				750
				751	dispose_list(&dispose);
				752
				753	return busy;
				754	}
				755
				756	/*
				757	* Isolate the inode from the LRU in preparation for freeing it.
				758	*
				759	* Any inodes which are pinned purely because of attached pagecache have their
				760	* pagecache removed. If the inode has metadata buffers attached to
				761	* mapping->private_list then try to remove them.
				762	*
				763	* If the inode has the I_REFERENCED flag set, then it means that it has been
				764	* used recently - the flag is set in iput_final(). When we encounter such an
				765	* inode, clear the flag and move it to the back of the LRU so it gets another
				766	* pass through the LRU before it gets reclaimed. This is necessary because of
				767	* the fact we are doing lazy LRU updates to minimise lock contention so the
				768	* LRU does not have strict ordering. Hence we don't want to reclaim inodes
				769	* with this flag set because they are the inodes that are out of order.
				770	*/
				771	static enum lru_status inode_lru_isolate(struct list_head *item,
				772	struct list_lru_one lru, spinlock_t lru_lock, void *arg)
				773	{
				774	struct list_head *freeable = arg;
				775	struct inode *inode = container_of(item, struct inode, i_lru);
				776
				777	/*
				778	* we are inverting the lru lock/inode->i_lock here, so use a trylock.
				779	* If we fail to get the lock, just skip it.
				780	*/
				781	if (!spin_trylock(&inode->i_lock))
				782	return LRU_SKIP;
				783
				784	/*
				785	* Referenced or dirty inodes are still in use. Give them another pass
				786	* through the LRU as we canot reclaim them now.
				787	*/
				788	if (atomic_read(&inode->i_count) \|\|
				789	(inode->i_state & ~I_REFERENCED)) {
				790	list_lru_isolate(lru, &inode->i_lru);
				791	spin_unlock(&inode->i_lock);
				792	this_cpu_dec(nr_unused);
				793	return LRU_REMOVED;
				794	}
				795
				796	/* recently referenced inodes get one more pass */
				797	if (inode->i_state & I_REFERENCED) {
				798	inode->i_state &= ~I_REFERENCED;
				799	spin_unlock(&inode->i_lock);
				800	return LRU_ROTATE;
				801	}
				802
				803	if (inode_has_buffers(inode) \|\| inode->i_data.nrpages) {
				804	inode_pin_lru_isolating(inode);
				805	spin_unlock(&inode->i_lock);
				806	spin_unlock(lru_lock);
				807	if (remove_inode_buffers(inode)) {
				808	unsigned long reap;
				809	reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
				810	if (current_is_kswapd())
				811	__count_vm_events(KSWAPD_INODESTEAL, reap);
				812	else
				813	__count_vm_events(PGINODESTEAL, reap);
				814	if (current->reclaim_state)
				815	current->reclaim_state->reclaimed_slab += reap;
				816	}
				817	inode_unpin_lru_isolating(inode);
				818	spin_lock(lru_lock);
				819	return LRU_RETRY;
				820	}
				821
				822	WARN_ON(inode->i_state & I_NEW);
				823	inode->i_state \|= I_FREEING;
				824	list_lru_isolate_move(lru, &inode->i_lru, freeable);
				825	spin_unlock(&inode->i_lock);
				826
				827	this_cpu_dec(nr_unused);
				828	return LRU_REMOVED;
				829	}
				830
				831	/*
				832	* Walk the superblock inode LRU for freeable inodes and attempt to free them.
				833	* This is called from the superblock shrinker function with a number of inodes
				834	* to trim from the LRU. Inodes to be freed are moved to a temporary list and
				835	* then are freed outside inode_lock by dispose_list().
				836	*/
				837	long prune_icache_sb(struct super_block sb, struct shrink_control sc)
				838	{
				839	LIST_HEAD(freeable);
				840	long freed;
				841
				842	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				843	inode_lru_isolate, &freeable);
				844	dispose_list(&freeable);
				845	return freed;
				846	}
				847
				848	static void __wait_on_freeing_inode(struct inode *inode);
				849	/*
				850	* Called with the inode lock held.
				851	*/
				852	static struct inode find_inode(struct super_block sb,
				853	struct hlist_head *head,
				854	int (test)(struct inode , void *),
				855	void *data)
				856	{
				857	struct inode *inode = NULL;
				858
				859	repeat:
				860	hlist_for_each_entry(inode, head, i_hash) {
				861	if (inode->i_sb != sb)
				862	continue;
				863	if (!test(inode, data))
				864	continue;
				865	spin_lock(&inode->i_lock);
				866	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				867	__wait_on_freeing_inode(inode);
				868	goto repeat;
				869	}
				870	if (unlikely(inode->i_state & I_CREATING)) {
				871	spin_unlock(&inode->i_lock);
				872	return ERR_PTR(-ESTALE);
				873	}
				874	__iget(inode);
				875	spin_unlock(&inode->i_lock);
				876	return inode;
				877	}
				878	return NULL;
				879	}
				880
				881	/*
				882	* find_inode_fast is the fast path version of find_inode, see the comment at
				883	* iget_locked for details.
				884	*/
				885	static struct inode find_inode_fast(struct super_block sb,
				886	struct hlist_head *head, unsigned long ino)
				887	{
				888	struct inode *inode = NULL;
				889
				890	repeat:
				891	hlist_for_each_entry(inode, head, i_hash) {
				892	if (inode->i_ino != ino)
				893	continue;
				894	if (inode->i_sb != sb)
				895	continue;
				896	spin_lock(&inode->i_lock);
				897	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				898	__wait_on_freeing_inode(inode);
				899	goto repeat;
				900	}
				901	if (unlikely(inode->i_state & I_CREATING)) {
				902	spin_unlock(&inode->i_lock);
				903	return ERR_PTR(-ESTALE);
				904	}
				905	__iget(inode);
				906	spin_unlock(&inode->i_lock);
				907	return inode;
				908	}
				909	return NULL;
				910	}
				911
				912	/*
				913	* Each cpu owns a range of LAST_INO_BATCH numbers.
				914	* 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
				915	* to renew the exhausted range.
				916	*
				917	* This does not significantly increase overflow rate because every CPU can
				918	* consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
				919	* NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
				920	* 2^32 range, and is a worst-case. Even a 50% wastage would only increase
				921	* overflow rate by 2x, which does not seem too significant.
				922	*
				923	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				924	* error if st_ino won't fit in target struct field. Use 32bit counter
				925	* here to attempt to avoid that.
				926	*/
				927	#define LAST_INO_BATCH 1024
				928	static DEFINE_PER_CPU(unsigned int, last_ino);
				929
				930	unsigned int get_next_ino(void)
				931	{
				932	unsigned int *p = &get_cpu_var(last_ino);
				933	unsigned int res = *p;
				934
				935	#ifdef CONFIG_SMP
				936	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
				937	static atomic_t shared_last_ino;
				938	int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
				939
				940	res = next - LAST_INO_BATCH;
				941	}
				942	#endif
				943
				944	res++;
				945	/* get_next_ino should not provide a 0 inode number */
				946	if (unlikely(!res))
				947	res++;
				948	*p = res;
				949	put_cpu_var(last_ino);
				950	return res;
				951	}
				952	EXPORT_SYMBOL(get_next_ino);
				953
				954	/**
				955	* new_inode_pseudo - obtain an inode
				956	* @sb: superblock
				957	*
				958	* Allocates a new inode for given superblock.
				959	* Inode wont be chained in superblock s_inodes list
				960	* This means :
				961	* - fs can't be unmount
				962	* - quotas, fsnotify, writeback can't work
				963	*/
				964	struct inode new_inode_pseudo(struct super_block sb)
				965	{
				966	struct inode *inode = alloc_inode(sb);
				967
				968	if (inode) {
				969	spin_lock(&inode->i_lock);
				970	inode->i_state = 0;
				971	spin_unlock(&inode->i_lock);
				972	INIT_LIST_HEAD(&inode->i_sb_list);
				973	}
				974	return inode;
				975	}
				976
				977	/**
				978	* new_inode - obtain an inode
				979	* @sb: superblock
				980	*
				981	* Allocates a new inode for given superblock. The default gfp_mask
				982	* for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
				983	* If HIGHMEM pages are unsuitable or it is known that pages allocated
				984	* for the page cache are not reclaimable or migratable,
				985	* mapping_set_gfp_mask() must be called with suitable flags on the
				986	* newly created inode's mapping
				987	*
				988	*/
				989	struct inode new_inode(struct super_block sb)
				990	{
				991	struct inode *inode;
				992
				993	spin_lock_prefetch(&sb->s_inode_list_lock);
				994
				995	inode = new_inode_pseudo(sb);
				996	if (inode)
				997	inode_sb_list_add(inode);
				998	return inode;
				999	}
				1000	EXPORT_SYMBOL(new_inode);
				1001
				#ifdef CONFIG_DEBUG_LOCK_ALLOC
				/*
				 * Give the i_rwsem of a directory inode the filesystem type's directory
				 * lock class, unless the filesystem has already assigned its own class.
				 * Non-directories are left untouched.
				 */
				void lockdep_annotate_inode_mutex_key(struct inode *inode)
				{
					struct file_system_type *fstype;

					if (!S_ISDIR(inode->i_mode))
						return;

					fstype = inode->i_sb->s_type;

					/* Set new key only if filesystem hasn't already changed it */
					if (!lockdep_match_class(&inode->i_rwsem, &fstype->i_mutex_key))
						return;

					/*
					 * The lock is re-initialised before its class is switched, so
					 * nobody may actually be holding it at this point.
					 */
					init_rwsem(&inode->i_rwsem);
					lockdep_set_class(&inode->i_rwsem, &fstype->i_mutex_dir_key);
				}
				EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
				#endif
				1022
				1023	/**
				1024	* unlock_new_inode - clear the I_NEW state and wake up any waiters
				1025	* @inode: new inode to unlock
				1026	*
				1027	* Called when the inode is fully initialised to clear the new state of the
				1028	* inode and wake up anyone waiting for the inode to finish initialisation.
				1029	*/
				1030	void unlock_new_inode(struct inode *inode)
				1031	{
				1032	lockdep_annotate_inode_mutex_key(inode);
				1033	spin_lock(&inode->i_lock);
				1034	WARN_ON(!(inode->i_state & I_NEW));
				1035	inode->i_state &= ~I_NEW & ~I_CREATING;
				1036	smp_mb();
				1037	wake_up_bit(&inode->i_state, __I_NEW);
				1038	spin_unlock(&inode->i_lock);
				1039	}
				1040	EXPORT_SYMBOL(unlock_new_inode);
				1041
				1042	void discard_new_inode(struct inode *inode)
				1043	{
				1044	lockdep_annotate_inode_mutex_key(inode);
				1045	spin_lock(&inode->i_lock);
				1046	WARN_ON(!(inode->i_state & I_NEW));
				1047	inode->i_state &= ~I_NEW;
				1048	smp_mb();
				1049	wake_up_bit(&inode->i_state, __I_NEW);
				1050	spin_unlock(&inode->i_lock);
				1051	iput(inode);
				1052	}
				1053	EXPORT_SYMBOL(discard_new_inode);
				1054
				1055	/**
				1056	* lock_two_inodes - lock two inodes (may be regular files but also dirs)
				1057	*
				1058	* Lock any non-NULL argument. The caller must make sure that if he is passing
				1059	* in two directories, one is not ancestor of the other. Zero, one or two
				1060	* objects may be locked by this function.
				1061	*
				1062	* @inode1: first inode to lock
				1063	* @inode2: second inode to lock
				1064	* @subclass1: inode lock subclass for the first lock obtained
				1065	* @subclass2: inode lock subclass for the second lock obtained
				1066	*/
				1067	void lock_two_inodes(struct inode inode1, struct inode inode2,
				1068	unsigned subclass1, unsigned subclass2)
				1069	{
				1070	if (!inode1 \|\| !inode2) {
				1071	/*
				1072	* Make sure @subclass1 will be used for the acquired lock.
				1073	* This is not strictly necessary (no current caller cares) but
				1074	* let's keep things consistent.
				1075	*/
				1076	if (!inode1)
				1077	swap(inode1, inode2);
				1078	goto lock;
				1079	}
				1080
				1081	/*
				1082	* If one object is directory and the other is not, we must make sure
				1083	* to lock directory first as the other object may be its child.
				1084	*/
				1085	if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
				1086	if (inode1 > inode2)
				1087	swap(inode1, inode2);
				1088	} else if (!S_ISDIR(inode1->i_mode))
				1089	swap(inode1, inode2);
				1090	lock:
				1091	if (inode1)
				1092	inode_lock_nested(inode1, subclass1);
				1093	if (inode2 && inode2 != inode1)
				1094	inode_lock_nested(inode2, subclass2);
				1095	}
				1096
				1097	/**
				1098	* lock_two_nondirectories - take two i_mutexes on non-directory objects
				1099	*
				1100	* Lock any non-NULL argument that is not a directory.
				1101	* Zero, one or two objects may be locked by this function.
				1102	*
				1103	* @inode1: first inode to lock
				1104	* @inode2: second inode to lock
				1105	*/
				1106	void lock_two_nondirectories(struct inode inode1, struct inode inode2)
				1107	{
				1108	if (inode1 > inode2)
				1109	swap(inode1, inode2);
				1110
				1111	if (inode1 && !S_ISDIR(inode1->i_mode))
				1112	inode_lock(inode1);
				1113	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1114	inode_lock_nested(inode2, I_MUTEX_NONDIR2);
				1115	}
				1116	EXPORT_SYMBOL(lock_two_nondirectories);
				1117
				1118	/**
				1119	* unlock_two_nondirectories - release locks from lock_two_nondirectories()
				1120	* @inode1: first inode to unlock
				1121	* @inode2: second inode to unlock
				1122	*/
				1123	void unlock_two_nondirectories(struct inode inode1, struct inode inode2)
				1124	{
				1125	if (inode1 && !S_ISDIR(inode1->i_mode))
				1126	inode_unlock(inode1);
				1127	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1128	inode_unlock(inode2);
				1129	}
				1130	EXPORT_SYMBOL(unlock_two_nondirectories);
				1131
				1132	/**
				1133	* inode_insert5 - obtain an inode from a mounted file system
				1134	* @inode: pre-allocated inode to use for insert to cache
				1135	* @hashval: hash value (usually inode number) to get
				1136	* @test: callback used for comparisons between inodes
				1137	* @set: callback used to initialize a new struct inode
				1138	* @data: opaque data pointer to pass to @test and @set
				1139	*
				1140	* Search for the inode specified by @hashval and @data in the inode cache,
				1141	* and if present it is return it with an increased reference count. This is
				1142	* a variant of iget5_locked() for callers that don't want to fail on memory
				1143	* allocation of inode.
				1144	*
				1145	* If the inode is not in cache, insert the pre-allocated inode to cache and
				1146	* return it locked, hashed, and with the I_NEW flag set. The file system gets
				1147	* to fill it in before unlocking it via unlock_new_inode().
				1148	*
				1149	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1150	* sleep.
				1151	*/
				1152	struct inode inode_insert5(struct inode inode, unsigned long hashval,
				1153	int (test)(struct inode , void *),
				1154	int (set)(struct inode , void ), void data)
				1155	{
				1156	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
				1157	struct inode *old;
				1158	bool creating = inode->i_state & I_CREATING;
				1159
				1160	again:
				1161	spin_lock(&inode_hash_lock);
				1162	old = find_inode(inode->i_sb, head, test, data);
				1163	if (unlikely(old)) {
				1164	/*
				1165	* Uhhuh, somebody else created the same inode under us.
				1166	* Use the old inode instead of the preallocated one.
				1167	*/
				1168	spin_unlock(&inode_hash_lock);
				1169	if (IS_ERR(old))
				1170	return NULL;
				1171	wait_on_inode(old);
				1172	if (unlikely(inode_unhashed(old))) {
				1173	iput(old);
				1174	goto again;
				1175	}
				1176	return old;
				1177	}
				1178
				1179	if (set && unlikely(set(inode, data))) {
				1180	inode = NULL;
				1181	goto unlock;
				1182	}
				1183
				1184	/*
				1185	* Return the locked inode with I_NEW set, the
				1186	* caller is responsible for filling in the contents
				1187	*/
				1188	spin_lock(&inode->i_lock);
				1189	inode->i_state \|= I_NEW;
				1190	hlist_add_head(&inode->i_hash, head);
				1191	spin_unlock(&inode->i_lock);
				1192	if (!creating)
				1193	inode_sb_list_add(inode);
				1194	unlock:
				1195	spin_unlock(&inode_hash_lock);
				1196
				1197	return inode;
				1198	}
				1199	EXPORT_SYMBOL(inode_insert5);
				1200
				1201	/**
				1202	* iget5_locked - obtain an inode from a mounted file system
				1203	* @sb: super block of file system
				1204	* @hashval: hash value (usually inode number) to get
				1205	* @test: callback used for comparisons between inodes
				1206	* @set: callback used to initialize a new struct inode
				1207	* @data: opaque data pointer to pass to @test and @set
				1208	*
				1209	* Search for the inode specified by @hashval and @data in the inode cache,
				1210	* and if present it is return it with an increased reference count. This is
				1211	* a generalized version of iget_locked() for file systems where the inode
				1212	* number is not sufficient for unique identification of an inode.
				1213	*
				1214	* If the inode is not in cache, allocate a new inode and return it locked,
				1215	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1216	* before unlocking it via unlock_new_inode().
				1217	*
				1218	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1219	* sleep.
				1220	*/
				1221	struct inode iget5_locked(struct super_block sb, unsigned long hashval,
				1222	int (test)(struct inode , void *),
				1223	int (set)(struct inode , void ), void data)
				1224	{
				1225	struct inode *inode = ilookup5(sb, hashval, test, data);
				1226
				1227	if (!inode) {
				1228	struct inode *new = alloc_inode(sb);
				1229
				1230	if (new) {
				1231	new->i_state = 0;
				1232	inode = inode_insert5(new, hashval, test, set, data);
				1233	if (unlikely(inode != new))
				1234	destroy_inode(new);
				1235	}
				1236	}
				1237	return inode;
				1238	}
				1239	EXPORT_SYMBOL(iget5_locked);
				1240
				1241	/**
				1242	* iget_locked - obtain an inode from a mounted file system
				1243	* @sb: super block of file system
				1244	* @ino: inode number to get
				1245	*
				1246	* Search for the inode specified by @ino in the inode cache and if present
				1247	* return it with an increased reference count. This is for file systems
				1248	* where the inode number is sufficient for unique identification of an inode.
				1249	*
				1250	* If the inode is not in cache, allocate a new inode and return it locked,
				1251	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1252	* before unlocking it via unlock_new_inode().
				1253	*/
				1254	struct inode iget_locked(struct super_block sb, unsigned long ino)
				1255	{
				1256	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1257	struct inode *inode;
				1258	again:
				1259	spin_lock(&inode_hash_lock);
				1260	inode = find_inode_fast(sb, head, ino);
				1261	spin_unlock(&inode_hash_lock);
				1262	if (inode) {
				1263	if (IS_ERR(inode))
				1264	return NULL;
				1265	wait_on_inode(inode);
				1266	if (unlikely(inode_unhashed(inode))) {
				1267	iput(inode);
				1268	goto again;
				1269	}
				1270	return inode;
				1271	}
				1272
				1273	inode = alloc_inode(sb);
				1274	if (inode) {
				1275	struct inode *old;
				1276
				1277	spin_lock(&inode_hash_lock);
				1278	/* We released the lock, so.. */
				1279	old = find_inode_fast(sb, head, ino);
				1280	if (!old) {
				1281	inode->i_ino = ino;
				1282	spin_lock(&inode->i_lock);
				1283	inode->i_state = I_NEW;
				1284	hlist_add_head(&inode->i_hash, head);
				1285	spin_unlock(&inode->i_lock);
				1286	inode_sb_list_add(inode);
				1287	spin_unlock(&inode_hash_lock);
				1288
				1289	/* Return the locked inode with I_NEW set, the
				1290	* caller is responsible for filling in the contents
				1291	*/
				1292	return inode;
				1293	}
				1294
				1295	/*
				1296	* Uhhuh, somebody else created the same inode under
				1297	* us. Use the old inode instead of the one we just
				1298	* allocated.
				1299	*/
				1300	spin_unlock(&inode_hash_lock);
				1301	destroy_inode(inode);
				1302	if (IS_ERR(old))
				1303	return NULL;
				1304	inode = old;
				1305	wait_on_inode(inode);
				1306	if (unlikely(inode_unhashed(inode))) {
				1307	iput(inode);
				1308	goto again;
				1309	}
				1310	}
				1311	return inode;
				1312	}
				1313	EXPORT_SYMBOL(iget_locked);
				1314
				1315	/*
				1316	* search the inode cache for a matching inode number.
				1317	* If we find one, then the inode number we are trying to
				1318	* allocate is not unique and so we should not use it.
				1319	*
				1320	* Returns 1 if the inode number is unique, 0 if it is not.
				1321	*/
				1322	static int test_inode_iunique(struct super_block *sb, unsigned long ino)
				1323	{
				1324	struct hlist_head *b = inode_hashtable + hash(sb, ino);
				1325	struct inode *inode;
				1326
				1327	spin_lock(&inode_hash_lock);
				1328	hlist_for_each_entry(inode, b, i_hash) {
				1329	if (inode->i_ino == ino && inode->i_sb == sb) {
				1330	spin_unlock(&inode_hash_lock);
				1331	return 0;
				1332	}
				1333	}
				1334	spin_unlock(&inode_hash_lock);
				1335
				1336	return 1;
				1337	}
				1338
				1339	/**
				1340	* iunique - get a unique inode number
				1341	* @sb: superblock
				1342	* @max_reserved: highest reserved inode number
				1343	*
				1344	* Obtain an inode number that is unique on the system for a given
				1345	* superblock. This is used by file systems that have no natural
				1346	* permanent inode numbering system. An inode number is returned that
				1347	* is higher than the reserved limit but unique.
				1348	*
				1349	* BUGS:
				1350	* With a large number of inodes live on the file system this function
				1351	* currently becomes quite slow.
				1352	*/
				1353	ino_t iunique(struct super_block *sb, ino_t max_reserved)
				1354	{
				1355	/*
				1356	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				1357	* error if st_ino won't fit in target struct field. Use 32bit counter
				1358	* here to attempt to avoid that.
				1359	*/
				1360	static DEFINE_SPINLOCK(iunique_lock);
				1361	static unsigned int counter;
				1362	ino_t res;
				1363
				1364	spin_lock(&iunique_lock);
				1365	do {
				1366	if (counter <= max_reserved)
				1367	counter = max_reserved + 1;
				1368	res = counter++;
				1369	} while (!test_inode_iunique(sb, res));
				1370	spin_unlock(&iunique_lock);
				1371
				1372	return res;
				1373	}
				1374	EXPORT_SYMBOL(iunique);
				1375
				1376	struct inode igrab(struct inode inode)
				1377	{
				1378	spin_lock(&inode->i_lock);
				1379	if (!(inode->i_state & (I_FREEING\|I_WILL_FREE))) {
				1380	__iget(inode);
				1381	spin_unlock(&inode->i_lock);
				1382	} else {
				1383	spin_unlock(&inode->i_lock);
				1384	/*
				1385	* Handle the case where s_op->clear_inode is not been
				1386	* called yet, and somebody is calling igrab
				1387	* while the inode is getting freed.
				1388	*/
				1389	inode = NULL;
				1390	}
				1391	return inode;
				1392	}
				1393	EXPORT_SYMBOL(igrab);
				1394
				1395	/**
				1396	* ilookup5_nowait - search for an inode in the inode cache
				1397	* @sb: super block of file system to search
				1398	* @hashval: hash value (usually inode number) to search for
				1399	* @test: callback used for comparisons between inodes
				1400	* @data: opaque data pointer to pass to @test
				1401	*
				1402	* Search for the inode specified by @hashval and @data in the inode cache.
				1403	* If the inode is in the cache, the inode is returned with an incremented
				1404	* reference count.
				1405	*
				1406	* Note: I_NEW is not waited upon so you have to be very careful what you do
				1407	* with the returned inode. You probably should be using ilookup5() instead.
				1408	*
				1409	* Note2: @test is called with the inode_hash_lock held, so can't sleep.
				1410	*/
				1411	struct inode ilookup5_nowait(struct super_block sb, unsigned long hashval,
				1412	int (test)(struct inode , void ), void data)
				1413	{
				1414	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1415	struct inode *inode;
				1416
				1417	spin_lock(&inode_hash_lock);
				1418	inode = find_inode(sb, head, test, data);
				1419	spin_unlock(&inode_hash_lock);
				1420
				1421	return IS_ERR(inode) ? NULL : inode;
				1422	}
				1423	EXPORT_SYMBOL(ilookup5_nowait);
				1424
				/**
				 * ilookup5 - search for an inode in the inode cache
				 * @sb:		super block of file system to search
				 * @hashval:	hash value (usually inode number) to search for
				 * @test:	callback used for comparisons between inodes
				 * @data:	opaque data pointer to pass to @test
				 *
				 * Search for the inode specified by @hashval and @data in the inode cache,
				 * and if the inode is in the cache, return the inode with an incremented
				 * reference count.  Waits on I_NEW before returning the inode.
				 *
				 * This is a generalized version of ilookup() for file systems where the
				 * inode number is not sufficient for unique identification of an inode.
				 *
				 * Note: @test is called with the inode_hash_lock held, so can't sleep.
				 *
				 * Return: the inode, or NULL if ilookup5_nowait() found nothing.
				 */
				struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
						int (*test)(struct inode *, void *), void *data)
				{
					struct inode *inode;
				again:
					inode = ilookup5_nowait(sb, hashval, test, data);
					if (inode) {
						wait_on_inode(inode);
						/* It went away while we waited: drop it and search again */
						if (unlikely(inode_unhashed(inode))) {
							iput(inode);
							goto again;
						}
					}
					return inode;
				}
				EXPORT_SYMBOL(ilookup5);
				1458
				1459	/**
				1460	* ilookup - search for an inode in the inode cache
				1461	* @sb: super block of file system to search
				1462	* @ino: inode number to search for
				1463	*
				1464	* Search for the inode @ino in the inode cache, and if the inode is in the
				1465	* cache, the inode is returned with an incremented reference count.
				1466	*/
				1467	struct inode ilookup(struct super_block sb, unsigned long ino)
				1468	{
				1469	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1470	struct inode *inode;
				1471	again:
				1472	spin_lock(&inode_hash_lock);
				1473	inode = find_inode_fast(sb, head, ino);
				1474	spin_unlock(&inode_hash_lock);
				1475
				1476	if (inode) {
				1477	if (IS_ERR(inode))
				1478	return NULL;
				1479	wait_on_inode(inode);
				1480	if (unlikely(inode_unhashed(inode))) {
				1481	iput(inode);
				1482	goto again;
				1483	}
				1484	}
				1485	return inode;
				1486	}
				1487	EXPORT_SYMBOL(ilookup);
				1488
				1489	/**
				1490	* find_inode_nowait - find an inode in the inode cache
				1491	* @sb: super block of file system to search
				1492	* @hashval: hash value (usually inode number) to search for
				1493	* @match: callback used for comparisons between inodes
				1494	* @data: opaque data pointer to pass to @match
				1495	*
				1496	* Search for the inode specified by @hashval and @data in the inode
				1497	* cache, where the helper function @match will return 0 if the inode
				1498	* does not match, 1 if the inode does match, and -1 if the search
				1499	* should be stopped. The @match function must be responsible for
				1500	* taking the i_lock spin_lock and checking i_state for an inode being
				1501	* freed or being initialized, and incrementing the reference count
				1502	* before returning 1. It also must not sleep, since it is called with
				1503	* the inode_hash_lock spinlock held.
				1504	*
				1505	* This is a even more generalized version of ilookup5() when the
				1506	* function must never block --- find_inode() can block in
				1507	* __wait_on_freeing_inode() --- or when the caller can not increment
				1508	* the reference count because the resulting iput() might cause an
				1509	* inode eviction. The tradeoff is that the @match funtion must be
				1510	* very carefully implemented.
				1511	*/
				1512	struct inode find_inode_nowait(struct super_block sb,
				1513	unsigned long hashval,
				1514	int (match)(struct inode , unsigned long,
				1515	void *),
				1516	void *data)
				1517	{
				1518	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1519	struct inode inode, ret_inode = NULL;
				1520	int mval;
				1521
				1522	spin_lock(&inode_hash_lock);
				1523	hlist_for_each_entry(inode, head, i_hash) {
				1524	if (inode->i_sb != sb)
				1525	continue;
				1526	mval = match(inode, hashval, data);
				1527	if (mval == 0)
				1528	continue;
				1529	if (mval == 1)
				1530	ret_inode = inode;
				1531	goto out;
				1532	}
				1533	out:
				1534	spin_unlock(&inode_hash_lock);
				1535	return ret_inode;
				1536	}
				1537	EXPORT_SYMBOL(find_inode_nowait);
				1538
				1539	int insert_inode_locked(struct inode *inode)
				1540	{
				1541	struct super_block *sb = inode->i_sb;
				1542	ino_t ino = inode->i_ino;
				1543	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1544
				1545	while (1) {
				1546	struct inode *old = NULL;
				1547	spin_lock(&inode_hash_lock);
				1548	hlist_for_each_entry(old, head, i_hash) {
				1549	if (old->i_ino != ino)
				1550	continue;
				1551	if (old->i_sb != sb)
				1552	continue;
				1553	spin_lock(&old->i_lock);
				1554	if (old->i_state & (I_FREEING\|I_WILL_FREE)) {
				1555	spin_unlock(&old->i_lock);
				1556	continue;
				1557	}
				1558	break;
				1559	}
				1560	if (likely(!old)) {
				1561	spin_lock(&inode->i_lock);
				1562	inode->i_state \|= I_NEW \| I_CREATING;
				1563	hlist_add_head(&inode->i_hash, head);
				1564	spin_unlock(&inode->i_lock);
				1565	spin_unlock(&inode_hash_lock);
				1566	return 0;
				1567	}
				1568	if (unlikely(old->i_state & I_CREATING)) {
				1569	spin_unlock(&old->i_lock);
				1570	spin_unlock(&inode_hash_lock);
				1571	return -EBUSY;
				1572	}
				1573	__iget(old);
				1574	spin_unlock(&old->i_lock);
				1575	spin_unlock(&inode_hash_lock);
				1576	wait_on_inode(old);
				1577	if (unlikely(!inode_unhashed(old))) {
				1578	iput(old);
				1579	return -EBUSY;
				1580	}
				1581	iput(old);
				1582	}
				1583	}
				1584	EXPORT_SYMBOL(insert_inode_locked);
				1585
				1586	int insert_inode_locked4(struct inode *inode, unsigned long hashval,
				1587	int (test)(struct inode , void ), void data)
				1588	{
				1589	struct inode *old;
				1590
				1591	inode->i_state \|= I_CREATING;
				1592	old = inode_insert5(inode, hashval, test, NULL, data);
				1593
				1594	if (old != inode) {
				1595	iput(old);
				1596	return -EBUSY;
				1597	}
				1598	return 0;
				1599	}
				1600	EXPORT_SYMBOL(insert_inode_locked4);
				1601
				1602
				/*
				 * Unconditionally returns 1, whatever @inode is.
				 * NOTE(review): presumably intended as a ->drop_inode() implementation,
				 * where a non-zero result makes iput_final() evict the inode - confirm
				 * against the callers.
				 */
				int generic_delete_inode(struct inode *inode)
				{
					return 1;
				}
				EXPORT_SYMBOL(generic_delete_inode);
				1608
				1609	/*
				1610	* Called when we're dropping the last reference
				1611	* to an inode.
				1612	*
				1613	* Call the FS "drop_inode()" function, defaulting to
				1614	* the legacy UNIX filesystem behaviour. If it tells
				1615	* us to evict inode, do so. Otherwise, retain inode
				1616	* in cache if fs is alive, sync and evict if fs is
				1617	* shutting down.
				1618	*/
				1619	static void iput_final(struct inode *inode)
				1620	{
				1621	struct super_block *sb = inode->i_sb;
				1622	const struct super_operations *op = inode->i_sb->s_op;
				1623	int drop;
				1624
				1625	WARN_ON(inode->i_state & I_NEW);
				1626
				1627	if (op->drop_inode)
				1628	drop = op->drop_inode(inode);
				1629	else
				1630	drop = generic_drop_inode(inode);
				1631
				1632	if (!drop && (sb->s_flags & SB_ACTIVE)) {
				1633	inode_add_lru(inode);
				1634	spin_unlock(&inode->i_lock);
				1635	return;
				1636	}
				1637
				1638	if (!drop) {
				1639	inode->i_state \|= I_WILL_FREE;
				1640	spin_unlock(&inode->i_lock);
				1641	write_inode_now(inode, 1);
				1642	spin_lock(&inode->i_lock);
				1643	WARN_ON(inode->i_state & I_NEW);
				1644	inode->i_state &= ~I_WILL_FREE;
				1645	}
				1646
				1647	inode->i_state \|= I_FREEING;
				1648	if (!list_empty(&inode->i_lru))
				1649	inode_lru_list_del(inode);
				1650	spin_unlock(&inode->i_lock);
				1651
				1652	evict(inode);
				1653	}
				1654
				1655	/**
				1656	* iput - put an inode
				1657	* @inode: inode to put
				1658	*
				1659	* Puts an inode, dropping its usage count. If the inode use count hits
				1660	* zero, the inode is then freed and may also be destroyed.
				1661	*
				1662	* Consequently, iput() can sleep.
				1663	*/
				1664	void iput(struct inode *inode)
				1665	{
				1666	if (!inode)
				1667	return;
				1668	BUG_ON(inode->i_state & I_CLEAR);
				1669	retry:
				1670	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
				1671	if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
				1672	atomic_inc(&inode->i_count);
				1673	spin_unlock(&inode->i_lock);
				1674	trace_writeback_lazytime_iput(inode);
				1675	mark_inode_dirty_sync(inode);
				1676	goto retry;
				1677	}
				1678	iput_final(inode);
				1679	}
				1680	}
				1681	EXPORT_SYMBOL(iput);
				1682
				#ifdef CONFIG_BLOCK
				/**
				 * bmap - find a block number in a file
				 * @inode: inode owning the block number being requested
				 * @block: pointer containing the block to find
				 *
				 * Replaces the value in *block with the block number on the device holding
				 * corresponding to the requested block number in the file.
				 * That is, asked for block 4 of inode 1 the function will replace the
				 * 4 in *block, with disk block relative to the disk start that holds that
				 * block of the file.
				 *
				 * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
				 * hole, returns 0 and *block is also set to 0.
				 */
				int bmap(struct inode *inode, sector_t *block)
				{
					/* The address space has no ->bmap() method: nothing to map with */
					if (!inode->i_mapping->a_ops->bmap)
						return -EINVAL;

					*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
					return 0;
				}
				EXPORT_SYMBOL(bmap);
				#endif
				1708
				1709	/*
				1710	* With relative atime, only update atime if the previous atime is
				1711	* earlier than either the ctime or mtime or if at least a day has
				1712	* passed since the last atime update.
				1713	*/
				1714	static int relatime_need_update(struct vfsmount mnt, struct inode inode,
				1715	struct timespec64 now)
				1716	{
				1717
				1718	if (!(mnt->mnt_flags & MNT_RELATIME))
				1719	return 1;
				1720	/*
				1721	* Is mtime younger than atime? If yes, update atime:
				1722	*/
				1723	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
				1724	return 1;
				1725	/*
				1726	* Is ctime younger than atime? If yes, update atime:
				1727	*/
				1728	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
				1729	return 1;
				1730
				1731	/*
				1732	* Is the previous atime value older than a day? If yes,
				1733	* update atime:
				1734	*/
				1735	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 246060)
				1736	return 1;
				1737	/*
				1738	* Good, we can skip the atime update:
				1739	*/
				1740	return 0;
				1741	}
				1742
				1743	int generic_update_time(struct inode inode, struct timespec64 time, int flags)
				1744	{
				1745	int iflags = I_DIRTY_TIME;
				1746	bool dirty = false;
				1747
				1748	if (flags & S_ATIME)
				1749	inode->i_atime = *time;
				1750	if (flags & S_VERSION)
				1751	dirty = inode_maybe_inc_iversion(inode, false);
				1752	if (flags & S_CTIME)
				1753	inode->i_ctime = *time;
				1754	if (flags & S_MTIME)
				1755	inode->i_mtime = *time;
				1756	if ((flags & (S_ATIME \| S_CTIME \| S_MTIME)) &&
				1757	!(inode->i_sb->s_flags & SB_LAZYTIME))
				1758	dirty = true;
				1759
				1760	if (dirty)
				1761	iflags \|= I_DIRTY_SYNC;
				1762	__mark_inode_dirty(inode, iflags);
				1763	return 0;
				1764	}
				1765	EXPORT_SYMBOL(generic_update_time);
				1766
				1767	/*
				1768	* This does the actual work of updating an inodes time or version. Must have
				1769	* had called mnt_want_write() before calling this.
				1770	*/
				1771	static int update_time(struct inode inode, struct timespec64 time, int flags)
				1772	{
				1773	int (update_time)(struct inode , struct timespec64 *, int);
				1774
				1775	update_time = inode->i_op->update_time ? inode->i_op->update_time :
				1776	generic_update_time;
				1777
				1778	return update_time(inode, time, flags);
				1779	}
				1780
				1781	/**
				1782	* touch_atime - update the access time
				1783	* @path: the &struct path to update
				1784	* @inode: inode to update
				1785	*
				1786	* Update the accessed time on an inode and mark it for writeback.
				1787	* This function automatically handles read only file systems and media,
				1788	* as well as the "noatime" flag and inode specific "noatime" markers.
				1789	*/
				1790	bool atime_needs_update(const struct path path, struct inode inode)
				1791	{
				1792	struct vfsmount *mnt = path->mnt;
				1793	struct timespec64 now;
				1794
				1795	if (inode->i_flags & S_NOATIME)
				1796	return false;
				1797
				1798	/* Atime updates will likely cause i_uid and i_gid to be written
				1799	* back improprely if their true value is unknown to the vfs.
				1800	*/
				1801	if (HAS_UNMAPPED_ID(inode))
				1802	return false;
				1803
				1804	if (IS_NOATIME(inode))
				1805	return false;
				1806	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
				1807	return false;
				1808
				1809	if (mnt->mnt_flags & MNT_NOATIME)
				1810	return false;
				1811	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
				1812	return false;
				1813
				1814	now = current_time(inode);
				1815
				1816	if (!relatime_need_update(mnt, inode, now))
				1817	return false;
				1818
				1819	if (timespec64_equal(&inode->i_atime, &now))
				1820	return false;
				1821
				1822	return true;
				1823	}
				1824
				1825	void touch_atime(const struct path *path)
				1826	{
				1827	struct vfsmount *mnt = path->mnt;
				1828	struct inode *inode = d_inode(path->dentry);
				1829	struct timespec64 now;
				1830
				1831	if (!atime_needs_update(path, inode))
				1832	return;
				1833
				1834	if (!sb_start_write_trylock(inode->i_sb))
				1835	return;
				1836
				1837	if (__mnt_want_write(mnt) != 0)
				1838	goto skip_update;
				1839	/*
				1840	* File systems can error out when updating inodes if they need to
				1841	* allocate new space to modify an inode (such is the case for
				1842	* Btrfs), but since we touch atime while walking down the path we
				1843	* really don't care if we failed to update the atime of the file,
				1844	* so just ignore the return value.
				1845	* We may also fail on filesystems that have the ability to make parts
				1846	* of the fs read only, e.g. subvolumes in Btrfs.
				1847	*/
				1848	now = current_time(inode);
				1849	update_time(inode, &now, S_ATIME);
				1850	__mnt_drop_write(mnt);
				1851	skip_update:
				1852	sb_end_write(inode->i_sb);
				1853	}
				1854	EXPORT_SYMBOL(touch_atime);
				1855
				1856	/*
				1857	* The logic we want is
				1858	*
				1859	* if suid or (sgid and xgrp)
				1860	* remove privs
				1861	*/
				1862	int should_remove_suid(struct dentry *dentry)
				1863	{
				1864	umode_t mode = d_inode(dentry)->i_mode;
				1865	int kill = 0;
				1866
				1867	/* suid always must be killed */
				1868	if (unlikely(mode & S_ISUID))
				1869	kill = ATTR_KILL_SUID;
				1870
				1871	/*
				1872	* sgid without any exec bits is just a mandatory locking mark; leave
				1873	* it alone. If some exec bits are set, it's a real sgid; kill it.
				1874	*/
				1875	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				1876	kill \|= ATTR_KILL_SGID;
				1877
				1878	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
				1879	return kill;
				1880
				1881	return 0;
				1882	}
				1883	EXPORT_SYMBOL(should_remove_suid);
				1884
				1885	/*
				1886	* Return mask of changes for notify_change() that need to be done as a
				1887	* response to write or truncate. Return 0 if nothing has to be changed.
				1888	* Negative value on error (change should be denied).
				1889	*/
				1890	int dentry_needs_remove_privs(struct dentry *dentry)
				1891	{
				1892	struct inode *inode = d_inode(dentry);
				1893	int mask = 0;
				1894	int ret;
				1895
				1896	if (IS_NOSEC(inode))
				1897	return 0;
				1898
				1899	mask = should_remove_suid(dentry);
				1900	ret = security_inode_need_killpriv(dentry);
				1901	if (ret < 0)
				1902	return ret;
				1903	if (ret)
				1904	mask \|= ATTR_KILL_PRIV;
				1905	return mask;
				1906	}
				1907
				1908	static int __remove_privs(struct dentry *dentry, int kill)
				1909	{
				1910	struct iattr newattrs;
				1911
				1912	newattrs.ia_valid = ATTR_FORCE \| kill;
				1913	/*
				1914	* Note we call this on write, so notify_change will not
				1915	* encounter any conflicting delegations:
				1916	*/
				1917	return notify_change(dentry, &newattrs, NULL);
				1918	}
				1919
				1920	/*
				1921	* Remove special file priviledges (suid, capabilities) when file is written
				1922	* to or truncated.
				1923	*/
				1924	int file_remove_privs(struct file *file)
				1925	{
				1926	struct dentry *dentry = file_dentry(file);
				1927	struct inode *inode = file_inode(file);
				1928	int kill;
				1929	int error = 0;
				1930
				1931	/*
				1932	* Fast path for nothing security related.
				1933	* As well for non-regular files, e.g. blkdev inodes.
				1934	* For example, blkdev_write_iter() might get here
				1935	* trying to remove privs which it is not allowed to.
				1936	*/
				1937	if (IS_NOSEC(inode) \|\| !S_ISREG(inode->i_mode))
				1938	return 0;
				1939
				1940	kill = dentry_needs_remove_privs(dentry);
				1941	if (kill < 0)
				1942	return kill;
				1943	if (kill)
				1944	error = __remove_privs(dentry, kill);
				1945	if (!error)
				1946	inode_has_no_xattr(inode);
				1947
				1948	return error;
				1949	}
				1950	EXPORT_SYMBOL(file_remove_privs);
				1951
				1952	/**
				1953	* file_update_time - update mtime and ctime time
				1954	* @file: file accessed
				1955	*
				1956	* Update the mtime and ctime members of an inode and mark the inode
				1957	* for writeback. Note that this function is meant exclusively for
				1958	* usage in the file write path of filesystems, and filesystems may
				1959	* choose to explicitly ignore update via this function with the
				1960	* S_NOCMTIME inode flag, e.g. for network filesystem where these
				1961	* timestamps are handled by the server. This can return an error for
				1962	* file systems who need to allocate space in order to update an inode.
				1963	*/
				1964
				1965	int file_update_time(struct file *file)
				1966	{
				1967	struct inode *inode = file_inode(file);
				1968	struct timespec64 now;
				1969	int sync_it = 0;
				1970	int ret;
				1971
				1972	/* First try to exhaust all avenues to not sync */
				1973	if (IS_NOCMTIME(inode))
				1974	return 0;
				1975
				1976	now = current_time(inode);
				1977	if (!timespec64_equal(&inode->i_mtime, &now))
				1978	sync_it = S_MTIME;
				1979
				1980	if (!timespec64_equal(&inode->i_ctime, &now))
				1981	sync_it \|= S_CTIME;
				1982
				1983	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
				1984	sync_it \|= S_VERSION;
				1985
				1986	if (!sync_it)
				1987	return 0;
				1988
				1989	/* Finally allowed to write? Takes lock. */
				1990	if (__mnt_want_write_file(file))
				1991	return 0;
				1992
				1993	ret = update_time(inode, &now, sync_it);
				1994	__mnt_drop_write_file(file);
				1995
				1996	return ret;
				1997	}
				1998	EXPORT_SYMBOL(file_update_time);
				1999
				2000	/* Caller must hold the file's inode lock */
				2001	int file_modified(struct file *file)
				2002	{
				2003	int err;
				2004
				2005	/*
				2006	* Clear the security bits if the process is not being run by root.
				2007	* This keeps people from modifying setuid and setgid binaries.
				2008	*/
				2009	err = file_remove_privs(file);
				2010	if (err)
				2011	return err;
				2012
				2013	if (unlikely(file->f_mode & FMODE_NOCMTIME))
				2014	return 0;
				2015
				2016	return file_update_time(file);
				2017	}
				2018	EXPORT_SYMBOL(file_modified);
				2019
				2020	int inode_needs_sync(struct inode *inode)
				2021	{
				2022	if (IS_SYNC(inode))
				2023	return 1;
				2024	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
				2025	return 1;
				2026	return 0;
				2027	}
				2028	EXPORT_SYMBOL(inode_needs_sync);
				2029
				2030	/*
				2031	* If we try to find an inode in the inode hash while it is being
				2032	* deleted, we have to wait until the filesystem completes its
				2033	* deletion before reporting that it isn't found. This function waits
				2034	* until the deletion _might_ have completed. Callers are responsible
				2035	* to recheck inode state.
				2036	*
				2037	* It doesn't matter if I_NEW is not set initially, a call to
				2038	* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
				2039	* will DTRT.
				2040	*/
				2041	static void __wait_on_freeing_inode(struct inode *inode)
				2042	{
				2043	wait_queue_head_t *wq;
				2044	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
				2045	wq = bit_waitqueue(&inode->i_state, __I_NEW);
				2046	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				2047	spin_unlock(&inode->i_lock);
				2048	spin_unlock(&inode_hash_lock);
				2049	schedule();
				2050	finish_wait(wq, &wait.wq_entry);
				2051	spin_lock(&inode_hash_lock);
				2052	}
				2053
				2054	static __initdata unsigned long ihash_entries;
				2055	static int __init set_ihash_entries(char *str)
				2056	{
				2057	if (!str)
				2058	return 0;
				2059	ihash_entries = simple_strtoul(str, &str, 0);
				2060	return 1;
				2061	}
				2062	__setup("ihash_entries=", set_ihash_entries);
				2063
				2064	/*
				2065	* Initialize the waitqueues and inode hash table.
				2066	*/
				2067	void __init inode_init_early(void)
				2068	{
				2069	/* If hashes are distributed across NUMA nodes, defer
				2070	* hash allocation until vmalloc space is available.
				2071	*/
				2072	if (hashdist)
				2073	return;
				2074
				2075	inode_hashtable =
				2076	alloc_large_system_hash("Inode-cache",
				2077	sizeof(struct hlist_head),
				2078	ihash_entries,
				2079	14,
				2080	HASH_EARLY \| HASH_ZERO,
				2081	&i_hash_shift,
				2082	&i_hash_mask,
				2083	0,
				2084	0);
				2085	}
				2086
				2087	void __init inode_init(void)
				2088	{
				2089	/* inode slab cache */
				2090	inode_cachep = kmem_cache_create("inode_cache",
				2091	sizeof(struct inode),
				2092	0,
				2093	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				2094	SLAB_MEM_SPREAD\|SLAB_ACCOUNT),
				2095	init_once);
				2096
				2097	/* Hash may have been set up in inode_init_early */
				2098	if (!hashdist)
				2099	return;
				2100
				2101	inode_hashtable =
				2102	alloc_large_system_hash("Inode-cache",
				2103	sizeof(struct hlist_head),
				2104	ihash_entries,
				2105	14,
				2106	HASH_ZERO,
				2107	&i_hash_shift,
				2108	&i_hash_mask,
				2109	0,
				2110	0);
				2111	}
				2112
				2113	void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
				2114	{
				2115	inode->i_mode = mode;
				2116	if (S_ISCHR(mode)) {
				2117	inode->i_fop = &def_chr_fops;
				2118	inode->i_rdev = rdev;
				2119	} else if (S_ISBLK(mode)) {
				2120	inode->i_fop = &def_blk_fops;
				2121	inode->i_rdev = rdev;
				2122	} else if (S_ISFIFO(mode))
				2123	inode->i_fop = &pipefifo_fops;
				2124	else if (S_ISSOCK(mode))
				2125	; /* leave it no_open_fops */
				2126	else
				2127	printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				2128	" inode %s:%lu\n", mode, inode->i_sb->s_id,
				2129	inode->i_ino);
				2130	}
				2131	EXPORT_SYMBOL(init_special_inode);
				2132
				2133	/**
				2134	* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
				2135	* @inode: New inode
				2136	* @dir: Directory inode
				2137	* @mode: mode of the new inode
				2138	*/
				2139	void inode_init_owner(struct inode inode, const struct inode dir,
				2140	umode_t mode)
				2141	{
				2142	inode->i_uid = current_fsuid();
				2143	if (dir && dir->i_mode & S_ISGID) {
				2144	inode->i_gid = dir->i_gid;
				2145
				2146	/* Directories are special, and always inherit S_ISGID */
				2147	if (S_ISDIR(mode))
				2148	mode \|= S_ISGID;
				2149	} else
				2150	inode->i_gid = current_fsgid();
				2151	inode->i_mode = mode;
				2152	}
				2153	EXPORT_SYMBOL(inode_init_owner);
				2154
				2155	/**
				2156	* inode_owner_or_capable - check current task permissions to inode
				2157	* @inode: inode being checked
				2158	*
				2159	* Return true if current either has CAP_FOWNER in a namespace with the
				2160	* inode owner uid mapped, or owns the file.
				2161	*/
				2162	bool inode_owner_or_capable(const struct inode *inode)
				2163	{
				2164	struct user_namespace *ns;
				2165
				2166	if (uid_eq(current_fsuid(), inode->i_uid))
				2167	return true;
				2168
				2169	ns = current_user_ns();
				2170	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
				2171	return true;
				2172	return false;
				2173	}
				2174	EXPORT_SYMBOL(inode_owner_or_capable);
				2175
				2176	/*
				2177	* Direct i/o helper functions
				2178	*/
				2179	static void __inode_dio_wait(struct inode *inode)
				2180	{
				2181	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
				2182	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
				2183
				2184	do {
				2185	prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
				2186	if (atomic_read(&inode->i_dio_count))
				2187	schedule();
				2188	} while (atomic_read(&inode->i_dio_count));
				2189	finish_wait(wq, &q.wq_entry);
				2190	}
				2191
				2192	/**
				2193	* inode_dio_wait - wait for outstanding DIO requests to finish
				2194	* @inode: inode to wait for
				2195	*
				2196	* Waits for all pending direct I/O requests to finish so that we can
				2197	* proceed with a truncate or equivalent operation.
				2198	*
				2199	* Must be called under a lock that serializes taking new references
				2200	* to i_dio_count, usually by inode->i_mutex.
				2201	*/
				2202	void inode_dio_wait(struct inode *inode)
				2203	{
				2204	if (atomic_read(&inode->i_dio_count))
				2205	__inode_dio_wait(inode);
				2206	}
				2207	EXPORT_SYMBOL(inode_dio_wait);
				2208
				2209	/*
				2210	* inode_set_flags - atomically set some inode flags
				2211	*
				2212	* Note: the caller should be holding i_mutex, or else be sure that
				2213	* they have exclusive access to the inode structure (i.e., while the
				2214	* inode is being instantiated). The reason for the cmpxchg() loop
				2215	* --- which wouldn't be necessary if all code paths which modify
				2216	* i_flags actually followed this rule, is that there is at least one
				2217	* code path which doesn't today so we use cmpxchg() out of an abundance
				2218	* of caution.
				2219	*
				2220	* In the long run, i_mutex is overkill, and we should probably look
				2221	* at using the i_lock spinlock to protect i_flags, and then make sure
				2222	* it is so documented in include/linux/fs.h and that all code follows
				2223	* the locking convention!!
				2224	*/
				2225	void inode_set_flags(struct inode *inode, unsigned int flags,
				2226	unsigned int mask)
				2227	{
				2228	WARN_ON_ONCE(flags & ~mask);
				2229	set_mask_bits(&inode->i_flags, mask, flags);
				2230	}
				2231	EXPORT_SYMBOL(inode_set_flags);
				2232
				2233	void inode_nohighmem(struct inode *inode)
				2234	{
				2235	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
				2236	}
				2237	EXPORT_SYMBOL(inode_nohighmem);
				2238
				2239	/**
				2240	* timespec64_trunc - Truncate timespec64 to a granularity
				2241	* @t: Timespec64
				2242	* @gran: Granularity in ns.
				2243	*
				2244	* Truncate a timespec64 to a granularity. Always rounds down. gran must
				2245	* not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
				2246	*/
				2247	struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran)
				2248	{
				2249	/* Avoid division in the common cases 1 ns and 1 s. */
				2250	if (gran == 1) {
				2251	/* nothing */
				2252	} else if (gran == NSEC_PER_SEC) {
				2253	t.tv_nsec = 0;
				2254	} else if (gran > 1 && gran < NSEC_PER_SEC) {
				2255	t.tv_nsec -= t.tv_nsec % gran;
				2256	} else {
				2257	WARN(1, "illegal file time granularity: %u", gran);
				2258	}
				2259	return t;
				2260	}
				2261	EXPORT_SYMBOL(timespec64_trunc);
				2262
				2263	/**
				2264	* timestamp_truncate - Truncate timespec to a granularity
				2265	* @t: Timespec
				2266	* @inode: inode being updated
				2267	*
				2268	* Truncate a timespec to the granularity supported by the fs
				2269	* containing the inode. Always rounds down. gran must
				2270	* not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
				2271	*/
				2272	struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
				2273	{
				2274	struct super_block *sb = inode->i_sb;
				2275	unsigned int gran = sb->s_time_gran;
				2276
				2277	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
				2278	if (unlikely(t.tv_sec == sb->s_time_max \|\| t.tv_sec == sb->s_time_min))
				2279	t.tv_nsec = 0;
				2280
				2281	/* Avoid division in the common cases 1 ns and 1 s. */
				2282	if (gran == 1)
				2283	; /* nothing */
				2284	else if (gran == NSEC_PER_SEC)
				2285	t.tv_nsec = 0;
				2286	else if (gran > 1 && gran < NSEC_PER_SEC)
				2287	t.tv_nsec -= t.tv_nsec % gran;
				2288	else
				2289	WARN(1, "invalid file time granularity: %u", gran);
				2290	return t;
				2291	}
				2292	EXPORT_SYMBOL(timestamp_truncate);
				2293
				2294	/**
				2295	* current_time - Return FS time
				2296	* @inode: inode.
				2297	*
				2298	* Return the current time truncated to the time granularity supported by
				2299	* the fs.
				2300	*
				2301	* Note that inode and inode->sb cannot be NULL.
				2302	* Otherwise, the function warns and returns time without truncation.
				2303	*/
				2304	struct timespec64 current_time(struct inode *inode)
				2305	{
				2306	struct timespec64 now;
				2307
				2308	ktime_get_coarse_real_ts64(&now);
				2309
				2310	if (unlikely(!inode->i_sb)) {
				2311	WARN(1, "current_time() called with uninitialized super_block in the inode");
				2312	return now;
				2313	}
				2314
				2315	return timestamp_truncate(now, inode);
				2316	}
				2317	EXPORT_SYMBOL(current_time);
				2318
				2319	/*
				2320	* Generic function to check FS_IOC_SETFLAGS values and reject any invalid
				2321	* configurations.
				2322	*
				2323	* Note: the caller should be holding i_mutex, or else be sure that they have
				2324	* exclusive access to the inode structure.
				2325	*/
				2326	int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
				2327	unsigned int flags)
				2328	{
				2329	/*
				2330	* The IMMUTABLE and APPEND_ONLY flags can only be changed by
				2331	* the relevant capability.
				2332	*
				2333	* This test looks nicer. Thanks to Pauline Middelink
				2334	*/
				2335	if ((flags ^ oldflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL) &&
				2336	!capable(CAP_LINUX_IMMUTABLE))
				2337	return -EPERM;
				2338
				2339	return fscrypt_prepare_setflags(inode, oldflags, flags);
				2340	}
				2341	EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
				2342
				2343	/*
				2344	* Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
				2345	* configurations.
				2346	*
				2347	* Note: the caller should be holding i_mutex, or else be sure that they have
				2348	* exclusive access to the inode structure.
				2349	*/
				2350	int vfs_ioc_fssetxattr_check(struct inode inode, const struct fsxattr old_fa,
				2351	struct fsxattr *fa)
				2352	{
				2353	/*
				2354	* Can't modify an immutable/append-only file unless we have
				2355	* appropriate permission.
				2356	*/
				2357	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2358	(FS_XFLAG_IMMUTABLE \| FS_XFLAG_APPEND) &&
				2359	!capable(CAP_LINUX_IMMUTABLE))
				2360	return -EPERM;
				2361
				2362	/*
				2363	* Project Quota ID state is only allowed to change from within the init
				2364	* namespace. Enforce that restriction only if we are trying to change
				2365	* the quota ID state. Everything else is allowed in user namespaces.
				2366	*/
				2367	if (current_user_ns() != &init_user_ns) {
				2368	if (old_fa->fsx_projid != fa->fsx_projid)
				2369	return -EINVAL;
				2370	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2371	FS_XFLAG_PROJINHERIT)
				2372	return -EINVAL;
				2373	}
				2374
				2375	/* Check extent size hints. */
				2376	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
				2377	return -EINVAL;
				2378
				2379	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
				2380	!S_ISDIR(inode->i_mode))
				2381	return -EINVAL;
				2382
				2383	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
				2384	!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
				2385	return -EINVAL;
				2386
				2387	/*
				2388	* It is only valid to set the DAX flag on regular files and
				2389	* directories on filesystems.
				2390	*/
				2391	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
				2392	!(S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode)))
				2393	return -EINVAL;
				2394
				2395	/* Extent size hints of zero turn off the flags. */
				2396	if (fa->fsx_extsize == 0)
				2397	fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE \| FS_XFLAG_EXTSZINHERIT);
				2398	if (fa->fsx_cowextsize == 0)
				2399	fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
				2400
				2401	return 0;
				2402	}
				2403	EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
				2404
				2405	/**
				2406	* mode_strip_sgid - handle the sgid bit for non-directories
				2407	* @dir: parent directory inode
				2408	* @mode: mode of the file to be created in @dir
				2409	*
				2410	* If the @mode of the new file has both the S_ISGID and S_IXGRP bit
				2411	* raised and @dir has the S_ISGID bit raised ensure that the caller is
				2412	* either in the group of the parent directory or they have CAP_FSETID
				2413	* in their user namespace and are privileged over the parent directory.
				2414	* In all other cases, strip the S_ISGID bit from @mode.
				2415	*
				2416	* Return: the new mode to use for the file
				2417	*/
				2418	umode_t mode_strip_sgid(const struct inode *dir, umode_t mode)
				2419	{
				2420	if ((mode & (S_ISGID \| S_IXGRP)) != (S_ISGID \| S_IXGRP))
				2421	return mode;
				2422	if (S_ISDIR(mode) \|\| !dir \|\| !(dir->i_mode & S_ISGID))
				2423	return mode;
				2424	if (in_group_p(dir->i_gid))
				2425	return mode;
				2426	if (capable_wrt_inode_uidgid(dir, CAP_FSETID))
				2427	return mode;
				2428
				2429	return mode & ~S_ISGID;
				2430	}
				2431	EXPORT_SYMBOL(mode_strip_sgid);