Blame - src/kernel/linux/v4.19/fs/inode.c - T800

blob: befe45f21123e0d8308e12df8f7eecce9378e4ff [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	/*
				2	* (C) 1997 Linus Torvalds
				3	* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
				4	*/
				5	#include <linux/export.h>
				6	#include <linux/fs.h>
				7	#include <linux/mm.h>
				8	#include <linux/backing-dev.h>
				9	#include <linux/hash.h>
				10	#include <linux/swap.h>
				11	#include <linux/security.h>
				12	#include <linux/cdev.h>
				13	#include <linux/bootmem.h>
				14	#include <linux/fsnotify.h>
				15	#include <linux/mount.h>
				16	#include <linux/posix_acl.h>
				17	#include <linux/prefetch.h>
				18	#include <linux/buffer_head.h> /* for inode_has_buffers */
				19	#include <linux/ratelimit.h>
				20	#include <linux/list_lru.h>
				21	#include <linux/iversion.h>
				22	#include <trace/events/writeback.h>
				23	#include "internal.h"
				24
				25	/*
				26	* Inode locking rules:
				27	*
				28	* inode->i_lock protects:
				29	* inode->i_state, inode->i_hash, __iget()
				30	* Inode LRU list locks protect:
				31	* inode->i_sb->s_inode_lru, inode->i_lru
				32	* inode->i_sb->s_inode_list_lock protects:
				33	* inode->i_sb->s_inodes, inode->i_sb_list
				34	* bdi->wb.list_lock protects:
				35	* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
				36	* inode_hash_lock protects:
				37	* inode_hashtable, inode->i_hash
				38	*
				39	* Lock ordering:
				40	*
				41	* inode->i_sb->s_inode_list_lock
				42	* inode->i_lock
				43	* Inode LRU list locks
				44	*
				45	* bdi->wb.list_lock
				46	* inode->i_lock
				47	*
				48	* inode_hash_lock
				49	* inode->i_sb->s_inode_list_lock
				50	* inode->i_lock
				51	*
				52	* iunique_lock
				53	* inode_hash_lock
				54	*/
				55
				56	static unsigned int i_hash_mask __read_mostly;
				57	static unsigned int i_hash_shift __read_mostly;
				58	static struct hlist_head *inode_hashtable __read_mostly;
				59	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
				60
				61	/*
				62	* Empty aops. Can be used for the cases where the user does not
				63	* define any of the address_space operations.
				64	*/
				65	const struct address_space_operations empty_aops = {
				66	};
				67	EXPORT_SYMBOL(empty_aops);
				68
				69	/*
				70	* Statistics gathering..
				71	*/
				72	struct inodes_stat_t inodes_stat;
				73
				74	static DEFINE_PER_CPU(unsigned long, nr_inodes);
				75	static DEFINE_PER_CPU(unsigned long, nr_unused);
				76
				77	static struct kmem_cache *inode_cachep __read_mostly;
				78
				79	static long get_nr_inodes(void)
				80	{
				81	int i;
				82	long sum = 0;
				83	for_each_possible_cpu(i)
				84	sum += per_cpu(nr_inodes, i);
				85	return sum < 0 ? 0 : sum;
				86	}
				87
				88	static inline long get_nr_inodes_unused(void)
				89	{
				90	int i;
				91	long sum = 0;
				92	for_each_possible_cpu(i)
				93	sum += per_cpu(nr_unused, i);
				94	return sum < 0 ? 0 : sum;
				95	}
				96
				/*
				 * Not actually the number of dirty inodes, but a wild approximation:
				 * total inodes minus unused inodes, floored at 0.
				 */
				long get_nr_dirty_inodes(void)
				{
					long in_use = get_nr_inodes() - get_nr_inodes_unused();

					if (in_use <= 0)
						return 0;
					return in_use;
				}
				103
				/*
				 * Handle nr_inode sysctl.
				 *
				 * Refreshes inodes_stat.nr_inodes and inodes_stat.nr_unused from the
				 * per-CPU counters, then hands all arguments unchanged to
				 * proc_doulongvec_minmax() and returns its result.
				 */
				#ifdef CONFIG_SYSCTL
				int proc_nr_inodes(struct ctl_table *table, int write,
						   void __user *buffer, size_t *lenp, loff_t *ppos)
				{
					inodes_stat.nr_inodes = get_nr_inodes();
					inodes_stat.nr_unused = get_nr_inodes_unused();
					return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
				}
				#endif
				116
				117	static int no_open(struct inode inode, struct file file)
				118	{
				119	return -ENXIO;
				120	}
				121
				122	/**
				123	* inode_init_always - perform inode structure initialisation
				124	* @sb: superblock inode belongs to
				125	* @inode: inode to initialise
				126	*
				127	* These are initializations that need to be done on every inode
				128	* allocation as the fields are not initialised by slab allocation.
				129	*/
				130	int inode_init_always(struct super_block sb, struct inode inode)
				131	{
				132	static const struct inode_operations empty_iops;
				133	static const struct file_operations no_open_fops = {.open = no_open};
				134	struct address_space *const mapping = &inode->i_data;
				135
				136	inode->i_sb = sb;
				137	inode->i_blkbits = sb->s_blocksize_bits;
				138	inode->i_flags = 0;
				139	atomic_set(&inode->i_count, 1);
				140	inode->i_op = &empty_iops;
				141	inode->i_fop = &no_open_fops;
				142	inode->__i_nlink = 1;
				143	inode->i_opflags = 0;
				144	if (sb->s_xattr)
				145	inode->i_opflags \|= IOP_XATTR;
				146	i_uid_write(inode, 0);
				147	i_gid_write(inode, 0);
				148	atomic_set(&inode->i_writecount, 0);
				149	inode->i_size = 0;
				150	inode->i_write_hint = WRITE_LIFE_NOT_SET;
				151	inode->i_blocks = 0;
				152	inode->i_bytes = 0;
				153	inode->i_generation = 0;
				154	inode->i_pipe = NULL;
				155	inode->i_bdev = NULL;
				156	inode->i_cdev = NULL;
				157	inode->i_link = NULL;
				158	inode->i_dir_seq = 0;
				159	inode->i_rdev = 0;
				160	inode->dirtied_when = 0;
				161
				162	#ifdef CONFIG_CGROUP_WRITEBACK
				163	inode->i_wb_frn_winner = 0;
				164	inode->i_wb_frn_avg_time = 0;
				165	inode->i_wb_frn_history = 0;
				166	#endif
				167
				168	if (security_inode_alloc(inode))
				169	goto out;
				170	spin_lock_init(&inode->i_lock);
				171	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
				172
				173	init_rwsem(&inode->i_rwsem);
				174	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
				175
				176	atomic_set(&inode->i_dio_count, 0);
				177
				178	mapping->a_ops = &empty_aops;
				179	mapping->host = inode;
				180	mapping->flags = 0;
				181	mapping->wb_err = 0;
				182	atomic_set(&mapping->i_mmap_writable, 0);
				183	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
				184	mapping->private_data = NULL;
				185	mapping->writeback_index = 0;
				186	inode->i_private = NULL;
				187	inode->i_mapping = mapping;
				188	INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
				189	#ifdef CONFIG_FS_POSIX_ACL
				190	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
				191	#endif
				192
				193	#ifdef CONFIG_FSNOTIFY
				194	inode->i_fsnotify_mask = 0;
				195	#endif
				196	inode->i_flctx = NULL;
				197	this_cpu_inc(nr_inodes);
				198
				199	return 0;
				200	out:
				201	return -ENOMEM;
				202	}
				203	EXPORT_SYMBOL(inode_init_always);
				204
				205	static struct inode alloc_inode(struct super_block sb)
				206	{
				207	struct inode *inode;
				208
				209	if (sb->s_op->alloc_inode)
				210	inode = sb->s_op->alloc_inode(sb);
				211	else
				212	inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
				213
				214	if (!inode)
				215	return NULL;
				216
				217	if (unlikely(inode_init_always(sb, inode))) {
				218	if (inode->i_sb->s_op->destroy_inode)
				219	inode->i_sb->s_op->destroy_inode(inode);
				220	else
				221	kmem_cache_free(inode_cachep, inode);
				222	return NULL;
				223	}
				224
				225	return inode;
				226	}
				227
				228	void free_inode_nonrcu(struct inode *inode)
				229	{
				230	kmem_cache_free(inode_cachep, inode);
				231	}
				232	EXPORT_SYMBOL(free_inode_nonrcu);
				233
				234	void __destroy_inode(struct inode *inode)
				235	{
				236	BUG_ON(inode_has_buffers(inode));
				237	inode_detach_wb(inode);
				238	security_inode_free(inode);
				239	fsnotify_inode_delete(inode);
				240	locks_free_lock_context(inode);
				241	if (!inode->i_nlink) {
				242	WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
				243	atomic_long_dec(&inode->i_sb->s_remove_count);
				244	}
				245
				246	#ifdef CONFIG_FS_POSIX_ACL
				247	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
				248	posix_acl_release(inode->i_acl);
				249	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
				250	posix_acl_release(inode->i_default_acl);
				251	#endif
				252	this_cpu_dec(nr_inodes);
				253	}
				254	EXPORT_SYMBOL(__destroy_inode);
				255
				256	static void i_callback(struct rcu_head *head)
				257	{
				258	struct inode *inode = container_of(head, struct inode, i_rcu);
				259	kmem_cache_free(inode_cachep, inode);
				260	}
				261
				262	static void destroy_inode(struct inode *inode)
				263	{
				264	BUG_ON(!list_empty(&inode->i_lru));
				265	__destroy_inode(inode);
				266	if (inode->i_sb->s_op->destroy_inode)
				267	inode->i_sb->s_op->destroy_inode(inode);
				268	else
				269	call_rcu(&inode->i_rcu, i_callback);
				270	}
				271
				272	/**
				273	* drop_nlink - directly drop an inode's link count
				274	* @inode: inode
				275	*
				276	* This is a low-level filesystem helper to replace any
				277	* direct filesystem manipulation of i_nlink. In cases
				278	* where we are attempting to track writes to the
				279	* filesystem, a decrement to zero means an imminent
				280	* write when the file is truncated and actually unlinked
				281	* on the filesystem.
				282	*/
				283	void drop_nlink(struct inode *inode)
				284	{
				285	WARN_ON(inode->i_nlink == 0);
				286	inode->__i_nlink--;
				287	if (!inode->i_nlink)
				288	atomic_long_inc(&inode->i_sb->s_remove_count);
				289	}
				290	EXPORT_SYMBOL(drop_nlink);
				291
				292	/**
				293	* clear_nlink - directly zero an inode's link count
				294	* @inode: inode
				295	*
				296	* This is a low-level filesystem helper to replace any
				297	* direct filesystem manipulation of i_nlink. See
				298	* drop_nlink() for why we care about i_nlink hitting zero.
				299	*/
				300	void clear_nlink(struct inode *inode)
				301	{
				302	if (inode->i_nlink) {
				303	inode->__i_nlink = 0;
				304	atomic_long_inc(&inode->i_sb->s_remove_count);
				305	}
				306	}
				307	EXPORT_SYMBOL(clear_nlink);
				308
				309	/**
				310	* set_nlink - directly set an inode's link count
				311	* @inode: inode
				312	* @nlink: new nlink (should be non-zero)
				313	*
				314	* This is a low-level filesystem helper to replace any
				315	* direct filesystem manipulation of i_nlink.
				316	*/
				317	void set_nlink(struct inode *inode, unsigned int nlink)
				318	{
				319	if (!nlink) {
				320	clear_nlink(inode);
				321	} else {
				322	/* Yes, some filesystems do change nlink from zero to one */
				323	if (inode->i_nlink == 0)
				324	atomic_long_dec(&inode->i_sb->s_remove_count);
				325
				326	inode->__i_nlink = nlink;
				327	}
				328	}
				329	EXPORT_SYMBOL(set_nlink);
				330
				331	/**
				332	* inc_nlink - directly increment an inode's link count
				333	* @inode: inode
				334	*
				335	* This is a low-level filesystem helper to replace any
				336	* direct filesystem manipulation of i_nlink. Currently,
				337	* it is only here for parity with dec_nlink().
				338	*/
				339	void inc_nlink(struct inode *inode)
				340	{
				341	if (unlikely(inode->i_nlink == 0)) {
				342	WARN_ON(!(inode->i_state & I_LINKABLE));
				343	atomic_long_dec(&inode->i_sb->s_remove_count);
				344	}
				345
				346	inode->__i_nlink++;
				347	}
				348	EXPORT_SYMBOL(inc_nlink);
				349
				350	static void __address_space_init_once(struct address_space *mapping)
				351	{
				352	INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC \| __GFP_ACCOUNT);
				353	init_rwsem(&mapping->i_mmap_rwsem);
				354	INIT_LIST_HEAD(&mapping->private_list);
				355	spin_lock_init(&mapping->private_lock);
				356	mapping->i_mmap = RB_ROOT_CACHED;
				357	}
				358
				359	void address_space_init_once(struct address_space *mapping)
				360	{
				361	memset(mapping, 0, sizeof(*mapping));
				362	__address_space_init_once(mapping);
				363	}
				364	EXPORT_SYMBOL(address_space_init_once);
				365
				366	/*
				367	* These are initializations that only need to be done
				368	* once, because the fields are idempotent across use
				369	* of the inode, so let the slab aware of that.
				370	*/
				371	void inode_init_once(struct inode *inode)
				372	{
				373	memset(inode, 0, sizeof(*inode));
				374	INIT_HLIST_NODE(&inode->i_hash);
				375	INIT_LIST_HEAD(&inode->i_devices);
				376	INIT_LIST_HEAD(&inode->i_io_list);
				377	INIT_LIST_HEAD(&inode->i_wb_list);
				378	INIT_LIST_HEAD(&inode->i_lru);
				379	__address_space_init_once(&inode->i_data);
				380	i_size_ordered_init(inode);
				381	}
				382	EXPORT_SYMBOL(inode_init_once);
				383
				/*
				 * Constructor-style wrapper taking an untyped object pointer: @foo is
				 * treated as a struct inode and passed to inode_init_once().
				 */
				static void init_once(void *foo)
				{
					struct inode *inode = (struct inode *) foo;

					inode_init_once(inode);
				}
				390
				391	/*
				392	* inode->i_lock must be held
				393	*/
				394	void __iget(struct inode *inode)
				395	{
				396	atomic_inc(&inode->i_count);
				397	}
				398
				399	/*
				400	* get additional reference to inode; caller must already hold one.
				401	*/
				402	void ihold(struct inode *inode)
				403	{
				404	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
				405	}
				406	EXPORT_SYMBOL(ihold);
				407
				408	static void inode_lru_list_add(struct inode *inode)
				409	{
				410	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
				411	this_cpu_inc(nr_unused);
				412	else
				413	inode->i_state \|= I_REFERENCED;
				414	}
				415
				416	/*
				417	* Add inode to LRU if needed (inode is unused and clean).
				418	*
				419	* Needs inode->i_lock held.
				420	*/
				421	void inode_add_lru(struct inode *inode)
				422	{
				423	if (!(inode->i_state & (I_DIRTY_ALL \| I_SYNC \|
				424	I_FREEING \| I_WILL_FREE)) &&
				425	!atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
				426	inode_lru_list_add(inode);
				427	}
				428
				429
				430	static void inode_lru_list_del(struct inode *inode)
				431	{
				432
				433	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
				434	this_cpu_dec(nr_unused);
				435	}
				436
				437	/**
				438	* inode_sb_list_add - add inode to the superblock list of inodes
				439	* @inode: inode to add
				440	*/
				441	void inode_sb_list_add(struct inode *inode)
				442	{
				443	spin_lock(&inode->i_sb->s_inode_list_lock);
				444	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
				445	spin_unlock(&inode->i_sb->s_inode_list_lock);
				446	}
				447	EXPORT_SYMBOL_GPL(inode_sb_list_add);
				448
				449	static inline void inode_sb_list_del(struct inode *inode)
				450	{
				451	if (!list_empty(&inode->i_sb_list)) {
				452	spin_lock(&inode->i_sb->s_inode_list_lock);
				453	list_del_init(&inode->i_sb_list);
				454	spin_unlock(&inode->i_sb->s_inode_list_lock);
				455	}
				456	}
				457
				458	static unsigned long hash(struct super_block *sb, unsigned long hashval)
				459	{
				460	unsigned long tmp;
				461
				462	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
				463	L1_CACHE_BYTES;
				464	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
				465	return tmp & i_hash_mask;
				466	}
				467
				468	/**
				469	* __insert_inode_hash - hash an inode
				470	* @inode: unhashed inode
				471	* @hashval: unsigned long value used to locate this object in the
				472	* inode_hashtable.
				473	*
				474	* Add an inode to the inode hash for this superblock.
				475	*/
				476	void __insert_inode_hash(struct inode *inode, unsigned long hashval)
				477	{
				478	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
				479
				480	spin_lock(&inode_hash_lock);
				481	spin_lock(&inode->i_lock);
				482	hlist_add_head(&inode->i_hash, b);
				483	spin_unlock(&inode->i_lock);
				484	spin_unlock(&inode_hash_lock);
				485	}
				486	EXPORT_SYMBOL(__insert_inode_hash);
				487
				488	/**
				489	* __remove_inode_hash - remove an inode from the hash
				490	* @inode: inode to unhash
				491	*
				492	* Remove an inode from the superblock.
				493	*/
				494	void __remove_inode_hash(struct inode *inode)
				495	{
				496	spin_lock(&inode_hash_lock);
				497	spin_lock(&inode->i_lock);
				498	hlist_del_init(&inode->i_hash);
				499	spin_unlock(&inode->i_lock);
				500	spin_unlock(&inode_hash_lock);
				501	}
				502	EXPORT_SYMBOL(__remove_inode_hash);
				503
				504	void clear_inode(struct inode *inode)
				505	{
				506	/*
				507	* We have to cycle the i_pages lock here because reclaim can be in the
				508	* process of removing the last page (in __delete_from_page_cache())
				509	* and we must not free the mapping under it.
				510	*/
				511	xa_lock_irq(&inode->i_data.i_pages);
				512	BUG_ON(inode->i_data.nrpages);
				513	BUG_ON(inode->i_data.nrexceptional);
				514	xa_unlock_irq(&inode->i_data.i_pages);
				515	BUG_ON(!list_empty(&inode->i_data.private_list));
				516	BUG_ON(!(inode->i_state & I_FREEING));
				517	BUG_ON(inode->i_state & I_CLEAR);
				518	BUG_ON(!list_empty(&inode->i_wb_list));
				519	/* don't need i_lock here, no concurrent mods to i_state */
				520	inode->i_state = I_FREEING \| I_CLEAR;
				521	}
				522	EXPORT_SYMBOL(clear_inode);
				523
				524	/*
				525	* Free the inode passed in, removing it from the lists it is still connected
				526	* to. We remove any pages still attached to the inode and wait for any IO that
				527	* is still in progress before finally destroying the inode.
				528	*
				529	* An inode must already be marked I_FREEING so that we avoid the inode being
				530	* moved back onto lists if we race with other code that manipulates the lists
				531	* (e.g. writeback_single_inode). The caller is responsible for setting this.
				532	*
				533	* An inode must already be removed from the LRU list before being evicted from
				534	* the cache. This should occur atomically with setting the I_FREEING state
				535	* flag, so no inodes here should ever be on the LRU when being evicted.
				536	*/
				537	static void evict(struct inode *inode)
				538	{
				539	const struct super_operations *op = inode->i_sb->s_op;
				540
				541	BUG_ON(!(inode->i_state & I_FREEING));
				542	BUG_ON(!list_empty(&inode->i_lru));
				543
				544	if (!list_empty(&inode->i_io_list))
				545	inode_io_list_del(inode);
				546
				547	inode_sb_list_del(inode);
				548
				549	/*
				550	* Wait for flusher thread to be done with the inode so that filesystem
				551	* does not start destroying it while writeback is still running. Since
				552	* the inode has I_FREEING set, flusher thread won't start new work on
				553	* the inode. We just have to wait for running writeback to finish.
				554	*/
				555	inode_wait_for_writeback(inode);
				556
				557	if (op->evict_inode) {
				558	op->evict_inode(inode);
				559	} else {
				560	truncate_inode_pages_final(&inode->i_data);
				561	clear_inode(inode);
				562	}
				563	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
				564	bd_forget(inode);
				565	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
				566	cd_forget(inode);
				567
				568	remove_inode_hash(inode);
				569
				570	spin_lock(&inode->i_lock);
				571	wake_up_bit(&inode->i_state, __I_NEW);
				572	BUG_ON(inode->i_state != (I_FREEING \| I_CLEAR));
				573	spin_unlock(&inode->i_lock);
				574
				575	destroy_inode(inode);
				576	}
				577
				578	/*
				579	* dispose_list - dispose of the contents of a local list
				580	* @head: the head of the list to free
				581	*
				582	* Dispose-list gets a local list with local inodes in it, so it doesn't
				583	* need to worry about list corruption and SMP locks.
				584	*/
				585	static void dispose_list(struct list_head *head)
				586	{
				587	while (!list_empty(head)) {
				588	struct inode *inode;
				589
				590	inode = list_first_entry(head, struct inode, i_lru);
				591	list_del_init(&inode->i_lru);
				592
				593	evict(inode);
				594	cond_resched();
				595	}
				596	}
				597
				598	/**
				599	* evict_inodes - evict all evictable inodes for a superblock
				600	* @sb: superblock to operate on
				601	*
				602	* Make sure that no inodes with zero refcount are retained. This is
				603	* called by superblock shutdown after having SB_ACTIVE flag removed,
				604	* so any inode reaching zero refcount during or after that call will
				605	* be immediately evicted.
				606	*/
				607	void evict_inodes(struct super_block *sb)
				608	{
				609	struct inode inode, next;
				610	LIST_HEAD(dispose);
				611
				612	again:
				613	spin_lock(&sb->s_inode_list_lock);
				614	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				615	if (atomic_read(&inode->i_count))
				616	continue;
				617
				618	spin_lock(&inode->i_lock);
				619	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				620	spin_unlock(&inode->i_lock);
				621	continue;
				622	}
				623
				624	inode->i_state \|= I_FREEING;
				625	inode_lru_list_del(inode);
				626	spin_unlock(&inode->i_lock);
				627	list_add(&inode->i_lru, &dispose);
				628
				629	/*
				630	* We can have a ton of inodes to evict at unmount time given
				631	* enough memory, check to see if we need to go to sleep for a
				632	* bit so we don't livelock.
				633	*/
				634	if (need_resched()) {
				635	spin_unlock(&sb->s_inode_list_lock);
				636	cond_resched();
				637	dispose_list(&dispose);
				638	goto again;
				639	}
				640	}
				641	spin_unlock(&sb->s_inode_list_lock);
				642
				643	dispose_list(&dispose);
				644	}
				645	EXPORT_SYMBOL_GPL(evict_inodes);
				646
				647	/**
				648	* invalidate_inodes - attempt to free all inodes on a superblock
				649	* @sb: superblock to operate on
				650	* @kill_dirty: flag to guide handling of dirty inodes
				651	*
				652	* Attempts to free all inodes for a given superblock. If there were any
				653	* busy inodes return a non-zero value, else zero.
				654	* If @kill_dirty is set, discard dirty inodes too, otherwise treat
				655	* them as busy.
				656	*/
				657	int invalidate_inodes(struct super_block *sb, bool kill_dirty)
				658	{
				659	int busy = 0;
				660	struct inode inode, next;
				661	LIST_HEAD(dispose);
				662
				663	again:
				664	spin_lock(&sb->s_inode_list_lock);
				665	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				666	spin_lock(&inode->i_lock);
				667	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				668	spin_unlock(&inode->i_lock);
				669	continue;
				670	}
				671	if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
				672	spin_unlock(&inode->i_lock);
				673	busy = 1;
				674	continue;
				675	}
				676	if (atomic_read(&inode->i_count)) {
				677	spin_unlock(&inode->i_lock);
				678	busy = 1;
				679	continue;
				680	}
				681
				682	inode->i_state \|= I_FREEING;
				683	inode_lru_list_del(inode);
				684	spin_unlock(&inode->i_lock);
				685	list_add(&inode->i_lru, &dispose);
				686	if (need_resched()) {
				687	spin_unlock(&sb->s_inode_list_lock);
				688	cond_resched();
				689	dispose_list(&dispose);
				690	goto again;
				691	}
				692	}
				693	spin_unlock(&sb->s_inode_list_lock);
				694
				695	dispose_list(&dispose);
				696
				697	return busy;
				698	}
				699
				700	/*
				701	* Isolate the inode from the LRU in preparation for freeing it.
				702	*
				703	* Any inodes which are pinned purely because of attached pagecache have their
				704	* pagecache removed. If the inode has metadata buffers attached to
				705	* mapping->private_list then try to remove them.
				706	*
				707	* If the inode has the I_REFERENCED flag set, then it means that it has been
				708	* used recently - the flag is set in iput_final(). When we encounter such an
				709	* inode, clear the flag and move it to the back of the LRU so it gets another
				710	* pass through the LRU before it gets reclaimed. This is necessary because of
				711	* the fact we are doing lazy LRU updates to minimise lock contention so the
				712	* LRU does not have strict ordering. Hence we don't want to reclaim inodes
				713	* with this flag set because they are the inodes that are out of order.
				714	*/
				715	static enum lru_status inode_lru_isolate(struct list_head *item,
				716	struct list_lru_one lru, spinlock_t lru_lock, void *arg)
				717	{
				718	struct list_head *freeable = arg;
				719	struct inode *inode = container_of(item, struct inode, i_lru);
				720
				721	/*
				722	* we are inverting the lru lock/inode->i_lock here, so use a trylock.
				723	* If we fail to get the lock, just skip it.
				724	*/
				725	if (!spin_trylock(&inode->i_lock))
				726	return LRU_SKIP;
				727
				728	/*
				729	* Referenced or dirty inodes are still in use. Give them another pass
				730	* through the LRU as we canot reclaim them now.
				731	*/
				732	if (atomic_read(&inode->i_count) \|\|
				733	(inode->i_state & ~I_REFERENCED)) {
				734	list_lru_isolate(lru, &inode->i_lru);
				735	spin_unlock(&inode->i_lock);
				736	this_cpu_dec(nr_unused);
				737	return LRU_REMOVED;
				738	}
				739
				740	/* recently referenced inodes get one more pass */
				741	if (inode->i_state & I_REFERENCED) {
				742	inode->i_state &= ~I_REFERENCED;
				743	spin_unlock(&inode->i_lock);
				744	return LRU_ROTATE;
				745	}
				746
				747	if (inode_has_buffers(inode) \|\| inode->i_data.nrpages) {
				748	__iget(inode);
				749	spin_unlock(&inode->i_lock);
				750	spin_unlock(lru_lock);
				751	if (remove_inode_buffers(inode)) {
				752	unsigned long reap;
				753	reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
				754	if (current_is_kswapd())
				755	__count_vm_events(KSWAPD_INODESTEAL, reap);
				756	else
				757	__count_vm_events(PGINODESTEAL, reap);
				758	if (current->reclaim_state)
				759	current->reclaim_state->reclaimed_slab += reap;
				760	}
				761	iput(inode);
				762	spin_lock(lru_lock);
				763	return LRU_RETRY;
				764	}
				765
				766	WARN_ON(inode->i_state & I_NEW);
				767	inode->i_state \|= I_FREEING;
				768	list_lru_isolate_move(lru, &inode->i_lru, freeable);
				769	spin_unlock(&inode->i_lock);
				770
				771	this_cpu_dec(nr_unused);
				772	return LRU_REMOVED;
				773	}
				774
				775	/*
				776	* Walk the superblock inode LRU for freeable inodes and attempt to free them.
				777	* This is called from the superblock shrinker function with a number of inodes
				778	* to trim from the LRU. Inodes to be freed are moved to a temporary list and
				779	* then are freed outside inode_lock by dispose_list().
				780	*/
				781	long prune_icache_sb(struct super_block sb, struct shrink_control sc)
				782	{
				783	LIST_HEAD(freeable);
				784	long freed;
				785
				786	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				787	inode_lru_isolate, &freeable);
				788	dispose_list(&freeable);
				789	return freed;
				790	}
				791
				792	static void __wait_on_freeing_inode(struct inode *inode);
				793	/*
				794	* Called with the inode lock held.
				795	*/
				796	static struct inode find_inode(struct super_block sb,
				797	struct hlist_head *head,
				798	int (test)(struct inode , void *),
				799	void *data)
				800	{
				801	struct inode *inode = NULL;
				802
				803	repeat:
				804	hlist_for_each_entry(inode, head, i_hash) {
				805	if (inode->i_sb != sb)
				806	continue;
				807	if (!test(inode, data))
				808	continue;
				809	spin_lock(&inode->i_lock);
				810	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				811	__wait_on_freeing_inode(inode);
				812	goto repeat;
				813	}
				814	if (unlikely(inode->i_state & I_CREATING)) {
				815	spin_unlock(&inode->i_lock);
				816	return ERR_PTR(-ESTALE);
				817	}
				818	__iget(inode);
				819	spin_unlock(&inode->i_lock);
				820	return inode;
				821	}
				822	return NULL;
				823	}
				824
				825	/*
				826	* find_inode_fast is the fast path version of find_inode, see the comment at
				827	* iget_locked for details.
				828	*/
				829	static struct inode find_inode_fast(struct super_block sb,
				830	struct hlist_head *head, unsigned long ino)
				831	{
				832	struct inode *inode = NULL;
				833
				834	repeat:
				835	hlist_for_each_entry(inode, head, i_hash) {
				836	if (inode->i_ino != ino)
				837	continue;
				838	if (inode->i_sb != sb)
				839	continue;
				840	spin_lock(&inode->i_lock);
				841	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				842	__wait_on_freeing_inode(inode);
				843	goto repeat;
				844	}
				845	if (unlikely(inode->i_state & I_CREATING)) {
				846	spin_unlock(&inode->i_lock);
				847	return ERR_PTR(-ESTALE);
				848	}
				849	__iget(inode);
				850	spin_unlock(&inode->i_lock);
				851	return inode;
				852	}
				853	return NULL;
				854	}
				855
				856	/*
				857	* Each cpu owns a range of LAST_INO_BATCH numbers.
				858	* 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
				859	* to renew the exhausted range.
				860	*
				861	* This does not significantly increase overflow rate because every CPU can
				862	* consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
				863	* NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
				864	* 2^32 range, and is a worst-case. Even a 50% wastage would only increase
				865	* overflow rate by 2x, which does not seem too significant.
				866	*
				867	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				868	* error if st_ino won't fit in target struct field. Use 32bit counter
				869	* here to attempt to avoid that.
				870	*/
				871	#define LAST_INO_BATCH 1024
				872	static DEFINE_PER_CPU(unsigned int, last_ino);
				873
				874	unsigned int get_next_ino(void)
				875	{
				876	unsigned int *p = &get_cpu_var(last_ino);
				877	unsigned int res = *p;
				878
				879	#ifdef CONFIG_SMP
				880	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
				881	static atomic_t shared_last_ino;
				882	int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
				883
				884	res = next - LAST_INO_BATCH;
				885	}
				886	#endif
				887
				888	res++;
				889	/* get_next_ino should not provide a 0 inode number */
				890	if (unlikely(!res))
				891	res++;
				892	*p = res;
				893	put_cpu_var(last_ino);
				894	return res;
				895	}
				896	EXPORT_SYMBOL(get_next_ino);
				897
				898	/**
				899	* new_inode_pseudo - obtain an inode
				900	* @sb: superblock
				901	*
				902	* Allocates a new inode for given superblock.
				903	* Inode wont be chained in superblock s_inodes list
				904	* This means :
				905	* - fs can't be unmount
				906	* - quotas, fsnotify, writeback can't work
				907	*/
				908	struct inode new_inode_pseudo(struct super_block sb)
				909	{
				910	struct inode *inode = alloc_inode(sb);
				911
				912	if (inode) {
				913	spin_lock(&inode->i_lock);
				914	inode->i_state = 0;
				915	spin_unlock(&inode->i_lock);
				916	INIT_LIST_HEAD(&inode->i_sb_list);
				917	}
				918	return inode;
				919	}
				920
				921	/**
				922	* new_inode - obtain an inode
				923	* @sb: superblock
				924	*
				925	* Allocates a new inode for given superblock. The default gfp_mask
				926	* for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
				927	* If HIGHMEM pages are unsuitable or it is known that pages allocated
				928	* for the page cache are not reclaimable or migratable,
				929	* mapping_set_gfp_mask() must be called with suitable flags on the
				930	* newly created inode's mapping
				931	*
				932	*/
				933	struct inode new_inode(struct super_block sb)
				934	{
				935	struct inode *inode;
				936
				937	spin_lock_prefetch(&sb->s_inode_list_lock);
				938
				939	inode = new_inode_pseudo(sb);
				940	if (inode)
				941	inode_sb_list_add(inode);
				942	return inode;
				943	}
				944	EXPORT_SYMBOL(new_inode);
				945
				#ifdef CONFIG_DEBUG_LOCK_ALLOC
				/*
				 * For directory inodes, switch the lockdep class of i_rwsem from the
				 * filesystem type's i_mutex_key to its i_mutex_dir_key.  Non-directories
				 * are left untouched.
				 */
				void lockdep_annotate_inode_mutex_key(struct inode *inode)
				{
					if (S_ISDIR(inode->i_mode)) {
						struct file_system_type *type = inode->i_sb->s_type;

						/* Set new key only if filesystem hasn't already changed it */
						if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
							/*
							 * i_rwsem is re-initialised here with no check
							 * that it is unheld, so nobody may be holding it.
							 */
							init_rwsem(&inode->i_rwsem);
							lockdep_set_class(&inode->i_rwsem,
									  &type->i_mutex_dir_key);
						}
					}
				}
				EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
				#endif
				966
				967	/**
				968	* unlock_new_inode - clear the I_NEW state and wake up any waiters
				969	* @inode: new inode to unlock
				970	*
				971	* Called when the inode is fully initialised to clear the new state of the
				972	* inode and wake up anyone waiting for the inode to finish initialisation.
				973	*/
				974	void unlock_new_inode(struct inode *inode)
				975	{
				976	lockdep_annotate_inode_mutex_key(inode);
				977	spin_lock(&inode->i_lock);
				978	WARN_ON(!(inode->i_state & I_NEW));
				979	inode->i_state &= ~I_NEW & ~I_CREATING;
				980	smp_mb();
				981	wake_up_bit(&inode->i_state, __I_NEW);
				982	spin_unlock(&inode->i_lock);
				983	}
				984	EXPORT_SYMBOL(unlock_new_inode);
				985
				986	void discard_new_inode(struct inode *inode)
				987	{
				988	lockdep_annotate_inode_mutex_key(inode);
				989	spin_lock(&inode->i_lock);
				990	WARN_ON(!(inode->i_state & I_NEW));
				991	inode->i_state &= ~I_NEW;
				992	smp_mb();
				993	wake_up_bit(&inode->i_state, __I_NEW);
				994	spin_unlock(&inode->i_lock);
				995	iput(inode);
				996	}
				997	EXPORT_SYMBOL(discard_new_inode);
				998
				999	/**
				1000	* lock_two_nondirectories - take two i_mutexes on non-directory objects
				1001	*
				1002	* Lock any non-NULL argument that is not a directory.
				1003	* Zero, one or two objects may be locked by this function.
				1004	*
				1005	* @inode1: first inode to lock
				1006	* @inode2: second inode to lock
				1007	*/
				1008	void lock_two_nondirectories(struct inode inode1, struct inode inode2)
				1009	{
				1010	if (inode1 > inode2)
				1011	swap(inode1, inode2);
				1012
				1013	if (inode1 && !S_ISDIR(inode1->i_mode))
				1014	inode_lock(inode1);
				1015	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1016	inode_lock_nested(inode2, I_MUTEX_NONDIR2);
				1017	}
				1018	EXPORT_SYMBOL(lock_two_nondirectories);
				1019
				1020	/**
				1021	* unlock_two_nondirectories - release locks from lock_two_nondirectories()
				1022	* @inode1: first inode to unlock
				1023	* @inode2: second inode to unlock
				1024	*/
				1025	void unlock_two_nondirectories(struct inode inode1, struct inode inode2)
				1026	{
				1027	if (inode1 && !S_ISDIR(inode1->i_mode))
				1028	inode_unlock(inode1);
				1029	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1030	inode_unlock(inode2);
				1031	}
				1032	EXPORT_SYMBOL(unlock_two_nondirectories);
				1033
				1034	/**
				1035	* inode_insert5 - obtain an inode from a mounted file system
				1036	* @inode: pre-allocated inode to use for insert to cache
				1037	* @hashval: hash value (usually inode number) to get
				1038	* @test: callback used for comparisons between inodes
				1039	* @set: callback used to initialize a new struct inode
				1040	* @data: opaque data pointer to pass to @test and @set
				1041	*
				1042	* Search for the inode specified by @hashval and @data in the inode cache,
				1043	* and if present it is return it with an increased reference count. This is
				1044	* a variant of iget5_locked() for callers that don't want to fail on memory
				1045	* allocation of inode.
				1046	*
				1047	* If the inode is not in cache, insert the pre-allocated inode to cache and
				1048	* return it locked, hashed, and with the I_NEW flag set. The file system gets
				1049	* to fill it in before unlocking it via unlock_new_inode().
				1050	*
				1051	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1052	* sleep.
				1053	*/
				1054	struct inode inode_insert5(struct inode inode, unsigned long hashval,
				1055	int (test)(struct inode , void *),
				1056	int (set)(struct inode , void ), void data)
				1057	{
				1058	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
				1059	struct inode *old;
				1060	bool creating = inode->i_state & I_CREATING;
				1061
				1062	again:
				1063	spin_lock(&inode_hash_lock);
				1064	old = find_inode(inode->i_sb, head, test, data);
				1065	if (unlikely(old)) {
				1066	/*
				1067	* Uhhuh, somebody else created the same inode under us.
				1068	* Use the old inode instead of the preallocated one.
				1069	*/
				1070	spin_unlock(&inode_hash_lock);
				1071	if (IS_ERR(old))
				1072	return NULL;
				1073	wait_on_inode(old);
				1074	if (unlikely(inode_unhashed(old))) {
				1075	iput(old);
				1076	goto again;
				1077	}
				1078	return old;
				1079	}
				1080
				1081	if (set && unlikely(set(inode, data))) {
				1082	inode = NULL;
				1083	goto unlock;
				1084	}
				1085
				1086	/*
				1087	* Return the locked inode with I_NEW set, the
				1088	* caller is responsible for filling in the contents
				1089	*/
				1090	spin_lock(&inode->i_lock);
				1091	inode->i_state \|= I_NEW;
				1092	hlist_add_head(&inode->i_hash, head);
				1093	spin_unlock(&inode->i_lock);
				1094	if (!creating)
				1095	inode_sb_list_add(inode);
				1096	unlock:
				1097	spin_unlock(&inode_hash_lock);
				1098
				1099	return inode;
				1100	}
				1101	EXPORT_SYMBOL(inode_insert5);
				1102
				1103	/**
				1104	* iget5_locked - obtain an inode from a mounted file system
				1105	* @sb: super block of file system
				1106	* @hashval: hash value (usually inode number) to get
				1107	* @test: callback used for comparisons between inodes
				1108	* @set: callback used to initialize a new struct inode
				1109	* @data: opaque data pointer to pass to @test and @set
				1110	*
				1111	* Search for the inode specified by @hashval and @data in the inode cache,
				1112	* and if present it is return it with an increased reference count. This is
				1113	* a generalized version of iget_locked() for file systems where the inode
				1114	* number is not sufficient for unique identification of an inode.
				1115	*
				1116	* If the inode is not in cache, allocate a new inode and return it locked,
				1117	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1118	* before unlocking it via unlock_new_inode().
				1119	*
				1120	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1121	* sleep.
				1122	*/
				1123	struct inode iget5_locked(struct super_block sb, unsigned long hashval,
				1124	int (test)(struct inode , void *),
				1125	int (set)(struct inode , void ), void data)
				1126	{
				1127	struct inode *inode = ilookup5(sb, hashval, test, data);
				1128
				1129	if (!inode) {
				1130	struct inode *new = alloc_inode(sb);
				1131
				1132	if (new) {
				1133	new->i_state = 0;
				1134	inode = inode_insert5(new, hashval, test, set, data);
				1135	if (unlikely(inode != new))
				1136	destroy_inode(new);
				1137	}
				1138	}
				1139	return inode;
				1140	}
				1141	EXPORT_SYMBOL(iget5_locked);
				1142
				1143	/**
				1144	* iget_locked - obtain an inode from a mounted file system
				1145	* @sb: super block of file system
				1146	* @ino: inode number to get
				1147	*
				1148	* Search for the inode specified by @ino in the inode cache and if present
				1149	* return it with an increased reference count. This is for file systems
				1150	* where the inode number is sufficient for unique identification of an inode.
				1151	*
				1152	* If the inode is not in cache, allocate a new inode and return it locked,
				1153	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1154	* before unlocking it via unlock_new_inode().
				1155	*/
				1156	struct inode iget_locked(struct super_block sb, unsigned long ino)
				1157	{
				1158	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1159	struct inode *inode;
				1160	again:
				1161	spin_lock(&inode_hash_lock);
				1162	inode = find_inode_fast(sb, head, ino);
				1163	spin_unlock(&inode_hash_lock);
				1164	if (inode) {
				1165	if (IS_ERR(inode))
				1166	return NULL;
				1167	wait_on_inode(inode);
				1168	if (unlikely(inode_unhashed(inode))) {
				1169	iput(inode);
				1170	goto again;
				1171	}
				1172	return inode;
				1173	}
				1174
				1175	inode = alloc_inode(sb);
				1176	if (inode) {
				1177	struct inode *old;
				1178
				1179	spin_lock(&inode_hash_lock);
				1180	/* We released the lock, so.. */
				1181	old = find_inode_fast(sb, head, ino);
				1182	if (!old) {
				1183	inode->i_ino = ino;
				1184	spin_lock(&inode->i_lock);
				1185	inode->i_state = I_NEW;
				1186	hlist_add_head(&inode->i_hash, head);
				1187	spin_unlock(&inode->i_lock);
				1188	inode_sb_list_add(inode);
				1189	spin_unlock(&inode_hash_lock);
				1190
				1191	/* Return the locked inode with I_NEW set, the
				1192	* caller is responsible for filling in the contents
				1193	*/
				1194	return inode;
				1195	}
				1196
				1197	/*
				1198	* Uhhuh, somebody else created the same inode under
				1199	* us. Use the old inode instead of the one we just
				1200	* allocated.
				1201	*/
				1202	spin_unlock(&inode_hash_lock);
				1203	destroy_inode(inode);
				1204	if (IS_ERR(old))
				1205	return NULL;
				1206	inode = old;
				1207	wait_on_inode(inode);
				1208	if (unlikely(inode_unhashed(inode))) {
				1209	iput(inode);
				1210	goto again;
				1211	}
				1212	}
				1213	return inode;
				1214	}
				1215	EXPORT_SYMBOL(iget_locked);
				1216
				1217	/*
				1218	* search the inode cache for a matching inode number.
				1219	* If we find one, then the inode number we are trying to
				1220	* allocate is not unique and so we should not use it.
				1221	*
				1222	* Returns 1 if the inode number is unique, 0 if it is not.
				1223	*/
				1224	static int test_inode_iunique(struct super_block *sb, unsigned long ino)
				1225	{
				1226	struct hlist_head *b = inode_hashtable + hash(sb, ino);
				1227	struct inode *inode;
				1228
				1229	spin_lock(&inode_hash_lock);
				1230	hlist_for_each_entry(inode, b, i_hash) {
				1231	if (inode->i_ino == ino && inode->i_sb == sb) {
				1232	spin_unlock(&inode_hash_lock);
				1233	return 0;
				1234	}
				1235	}
				1236	spin_unlock(&inode_hash_lock);
				1237
				1238	return 1;
				1239	}
				1240
				1241	/**
				1242	* iunique - get a unique inode number
				1243	* @sb: superblock
				1244	* @max_reserved: highest reserved inode number
				1245	*
				1246	* Obtain an inode number that is unique on the system for a given
				1247	* superblock. This is used by file systems that have no natural
				1248	* permanent inode numbering system. An inode number is returned that
				1249	* is higher than the reserved limit but unique.
				1250	*
				1251	* BUGS:
				1252	* With a large number of inodes live on the file system this function
				1253	* currently becomes quite slow.
				1254	*/
				1255	ino_t iunique(struct super_block *sb, ino_t max_reserved)
				1256	{
				1257	/*
				1258	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				1259	* error if st_ino won't fit in target struct field. Use 32bit counter
				1260	* here to attempt to avoid that.
				1261	*/
				1262	static DEFINE_SPINLOCK(iunique_lock);
				1263	static unsigned int counter;
				1264	ino_t res;
				1265
				1266	spin_lock(&iunique_lock);
				1267	do {
				1268	if (counter <= max_reserved)
				1269	counter = max_reserved + 1;
				1270	res = counter++;
				1271	} while (!test_inode_iunique(sb, res));
				1272	spin_unlock(&iunique_lock);
				1273
				1274	return res;
				1275	}
				1276	EXPORT_SYMBOL(iunique);
				1277
				1278	struct inode igrab(struct inode inode)
				1279	{
				1280	spin_lock(&inode->i_lock);
				1281	if (!(inode->i_state & (I_FREEING\|I_WILL_FREE))) {
				1282	__iget(inode);
				1283	spin_unlock(&inode->i_lock);
				1284	} else {
				1285	spin_unlock(&inode->i_lock);
				1286	/*
				1287	* Handle the case where s_op->clear_inode is not been
				1288	* called yet, and somebody is calling igrab
				1289	* while the inode is getting freed.
				1290	*/
				1291	inode = NULL;
				1292	}
				1293	return inode;
				1294	}
				1295	EXPORT_SYMBOL(igrab);
				1296
				1297	/**
				1298	* ilookup5_nowait - search for an inode in the inode cache
				1299	* @sb: super block of file system to search
				1300	* @hashval: hash value (usually inode number) to search for
				1301	* @test: callback used for comparisons between inodes
				1302	* @data: opaque data pointer to pass to @test
				1303	*
				1304	* Search for the inode specified by @hashval and @data in the inode cache.
				1305	* If the inode is in the cache, the inode is returned with an incremented
				1306	* reference count.
				1307	*
				1308	* Note: I_NEW is not waited upon so you have to be very careful what you do
				1309	* with the returned inode. You probably should be using ilookup5() instead.
				1310	*
				1311	* Note2: @test is called with the inode_hash_lock held, so can't sleep.
				1312	*/
				1313	struct inode ilookup5_nowait(struct super_block sb, unsigned long hashval,
				1314	int (test)(struct inode , void ), void data)
				1315	{
				1316	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1317	struct inode *inode;
				1318
				1319	spin_lock(&inode_hash_lock);
				1320	inode = find_inode(sb, head, test, data);
				1321	spin_unlock(&inode_hash_lock);
				1322
				1323	return IS_ERR(inode) ? NULL : inode;
				1324	}
				1325	EXPORT_SYMBOL(ilookup5_nowait);
				1326
				1327	/**
				1328	* ilookup5 - search for an inode in the inode cache
				1329	* @sb: super block of file system to search
				1330	* @hashval: hash value (usually inode number) to search for
				1331	* @test: callback used for comparisons between inodes
				1332	* @data: opaque data pointer to pass to @test
				1333	*
				1334	* Search for the inode specified by @hashval and @data in the inode cache,
				1335	* and if the inode is in the cache, return the inode with an incremented
				1336	* reference count. Waits on I_NEW before returning the inode.
				1337	* returned with an incremented reference count.
				1338	*
				1339	* This is a generalized version of ilookup() for file systems where the
				1340	* inode number is not sufficient for unique identification of an inode.
				1341	*
				1342	* Note: @test is called with the inode_hash_lock held, so can't sleep.
				1343	*/
				1344	struct inode ilookup5(struct super_block sb, unsigned long hashval,
				1345	int (test)(struct inode , void ), void data)
				1346	{
				1347	struct inode *inode;
				1348	again:
				1349	inode = ilookup5_nowait(sb, hashval, test, data);
				1350	if (inode) {
				1351	wait_on_inode(inode);
				1352	if (unlikely(inode_unhashed(inode))) {
				1353	iput(inode);
				1354	goto again;
				1355	}
				1356	}
				1357	return inode;
				1358	}
				1359	EXPORT_SYMBOL(ilookup5);
				1360
				1361	/**
				1362	* ilookup - search for an inode in the inode cache
				1363	* @sb: super block of file system to search
				1364	* @ino: inode number to search for
				1365	*
				1366	* Search for the inode @ino in the inode cache, and if the inode is in the
				1367	* cache, the inode is returned with an incremented reference count.
				1368	*/
				1369	struct inode ilookup(struct super_block sb, unsigned long ino)
				1370	{
				1371	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1372	struct inode *inode;
				1373	again:
				1374	spin_lock(&inode_hash_lock);
				1375	inode = find_inode_fast(sb, head, ino);
				1376	spin_unlock(&inode_hash_lock);
				1377
				1378	if (inode) {
				1379	if (IS_ERR(inode))
				1380	return NULL;
				1381	wait_on_inode(inode);
				1382	if (unlikely(inode_unhashed(inode))) {
				1383	iput(inode);
				1384	goto again;
				1385	}
				1386	}
				1387	return inode;
				1388	}
				1389	EXPORT_SYMBOL(ilookup);
				1390
				1391	/**
				1392	* find_inode_nowait - find an inode in the inode cache
				1393	* @sb: super block of file system to search
				1394	* @hashval: hash value (usually inode number) to search for
				1395	* @match: callback used for comparisons between inodes
				1396	* @data: opaque data pointer to pass to @match
				1397	*
				1398	* Search for the inode specified by @hashval and @data in the inode
				1399	* cache, where the helper function @match will return 0 if the inode
				1400	* does not match, 1 if the inode does match, and -1 if the search
				1401	* should be stopped. The @match function must be responsible for
				1402	* taking the i_lock spin_lock and checking i_state for an inode being
				1403	* freed or being initialized, and incrementing the reference count
				1404	* before returning 1. It also must not sleep, since it is called with
				1405	* the inode_hash_lock spinlock held.
				1406	*
				1407	* This is a even more generalized version of ilookup5() when the
				1408	* function must never block --- find_inode() can block in
				1409	* __wait_on_freeing_inode() --- or when the caller can not increment
				1410	* the reference count because the resulting iput() might cause an
				1411	* inode eviction. The tradeoff is that the @match funtion must be
				1412	* very carefully implemented.
				1413	*/
				1414	struct inode find_inode_nowait(struct super_block sb,
				1415	unsigned long hashval,
				1416	int (match)(struct inode , unsigned long,
				1417	void *),
				1418	void *data)
				1419	{
				1420	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1421	struct inode inode, ret_inode = NULL;
				1422	int mval;
				1423
				1424	spin_lock(&inode_hash_lock);
				1425	hlist_for_each_entry(inode, head, i_hash) {
				1426	if (inode->i_sb != sb)
				1427	continue;
				1428	mval = match(inode, hashval, data);
				1429	if (mval == 0)
				1430	continue;
				1431	if (mval == 1)
				1432	ret_inode = inode;
				1433	goto out;
				1434	}
				1435	out:
				1436	spin_unlock(&inode_hash_lock);
				1437	return ret_inode;
				1438	}
				1439	EXPORT_SYMBOL(find_inode_nowait);
				1440
				1441	int insert_inode_locked(struct inode *inode)
				1442	{
				1443	struct super_block *sb = inode->i_sb;
				1444	ino_t ino = inode->i_ino;
				1445	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1446
				1447	while (1) {
				1448	struct inode *old = NULL;
				1449	spin_lock(&inode_hash_lock);
				1450	hlist_for_each_entry(old, head, i_hash) {
				1451	if (old->i_ino != ino)
				1452	continue;
				1453	if (old->i_sb != sb)
				1454	continue;
				1455	spin_lock(&old->i_lock);
				1456	if (old->i_state & (I_FREEING\|I_WILL_FREE)) {
				1457	spin_unlock(&old->i_lock);
				1458	continue;
				1459	}
				1460	break;
				1461	}
				1462	if (likely(!old)) {
				1463	spin_lock(&inode->i_lock);
				1464	inode->i_state \|= I_NEW \| I_CREATING;
				1465	hlist_add_head(&inode->i_hash, head);
				1466	spin_unlock(&inode->i_lock);
				1467	spin_unlock(&inode_hash_lock);
				1468	return 0;
				1469	}
				1470	if (unlikely(old->i_state & I_CREATING)) {
				1471	spin_unlock(&old->i_lock);
				1472	spin_unlock(&inode_hash_lock);
				1473	return -EBUSY;
				1474	}
				1475	__iget(old);
				1476	spin_unlock(&old->i_lock);
				1477	spin_unlock(&inode_hash_lock);
				1478	wait_on_inode(old);
				1479	if (unlikely(!inode_unhashed(old))) {
				1480	iput(old);
				1481	return -EBUSY;
				1482	}
				1483	iput(old);
				1484	}
				1485	}
				1486	EXPORT_SYMBOL(insert_inode_locked);
				1487
				1488	int insert_inode_locked4(struct inode *inode, unsigned long hashval,
				1489	int (test)(struct inode , void ), void data)
				1490	{
				1491	struct inode *old;
				1492
				1493	inode->i_state \|= I_CREATING;
				1494	old = inode_insert5(inode, hashval, test, NULL, data);
				1495
				1496	if (old != inode) {
				1497	iput(old);
				1498	return -EBUSY;
				1499	}
				1500	return 0;
				1501	}
				1502	EXPORT_SYMBOL(insert_inode_locked4);
				1503
				1504
				1505	int generic_delete_inode(struct inode *inode)
				1506	{
				1507	return 1;
				1508	}
				1509	EXPORT_SYMBOL(generic_delete_inode);
				1510
				1511	/*
				1512	* Called when we're dropping the last reference
				1513	* to an inode.
				1514	*
				1515	* Call the FS "drop_inode()" function, defaulting to
				1516	* the legacy UNIX filesystem behaviour. If it tells
				1517	* us to evict inode, do so. Otherwise, retain inode
				1518	* in cache if fs is alive, sync and evict if fs is
				1519	* shutting down.
				1520	*/
				1521	static void iput_final(struct inode *inode)
				1522	{
				1523	struct super_block *sb = inode->i_sb;
				1524	const struct super_operations *op = inode->i_sb->s_op;
				1525	int drop;
				1526
				1527	WARN_ON(inode->i_state & I_NEW);
				1528
				1529	if (op->drop_inode)
				1530	drop = op->drop_inode(inode);
				1531	else
				1532	drop = generic_drop_inode(inode);
				1533
				1534	if (!drop && (sb->s_flags & SB_ACTIVE)) {
				1535	inode_add_lru(inode);
				1536	spin_unlock(&inode->i_lock);
				1537	return;
				1538	}
				1539
				1540	if (!drop) {
				1541	inode->i_state \|= I_WILL_FREE;
				1542	spin_unlock(&inode->i_lock);
				1543	write_inode_now(inode, 1);
				1544	spin_lock(&inode->i_lock);
				1545	WARN_ON(inode->i_state & I_NEW);
				1546	inode->i_state &= ~I_WILL_FREE;
				1547	}
				1548
				1549	inode->i_state \|= I_FREEING;
				1550	if (!list_empty(&inode->i_lru))
				1551	inode_lru_list_del(inode);
				1552	spin_unlock(&inode->i_lock);
				1553
				1554	evict(inode);
				1555	}
				1556
				1557	/**
				1558	* iput - put an inode
				1559	* @inode: inode to put
				1560	*
				1561	* Puts an inode, dropping its usage count. If the inode use count hits
				1562	* zero, the inode is then freed and may also be destroyed.
				1563	*
				1564	* Consequently, iput() can sleep.
				1565	*/
				1566	void iput(struct inode *inode)
				1567	{
				1568	if (!inode)
				1569	return;
				1570	BUG_ON(inode->i_state & I_CLEAR);
				1571	retry:
				1572	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
				1573	if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
				1574	atomic_inc(&inode->i_count);
				1575	spin_unlock(&inode->i_lock);
				1576	trace_writeback_lazytime_iput(inode);
				1577	mark_inode_dirty_sync(inode);
				1578	goto retry;
				1579	}
				1580	iput_final(inode);
				1581	}
				1582	}
				1583	EXPORT_SYMBOL(iput);
				1584
				1585	/**
				1586	* bmap - find a block number in a file
				1587	* @inode: inode of file
				1588	* @block: block to find
				1589	*
				1590	* Returns the block number on the device holding the inode that
				1591	* is the disk block number for the block of the file requested.
				1592	* That is, asked for block 4 of inode 1 the function will return the
				1593	* disk block relative to the disk start that holds that block of the
				1594	* file.
				1595	*/
				1596	sector_t bmap(struct inode *inode, sector_t block)
				1597	{
				1598	sector_t res = 0;
				1599	if (inode->i_mapping->a_ops->bmap)
				1600	res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
				1601	return res;
				1602	}
				1603	EXPORT_SYMBOL(bmap);
				1604
				1605	/*
				1606	* With relative atime, only update atime if the previous atime is
				1607	* earlier than either the ctime or mtime or if at least a day has
				1608	* passed since the last atime update.
				1609	*/
				1610	static int relatime_need_update(struct vfsmount mnt, struct inode inode,
				1611	struct timespec now)
				1612	{
				1613
				1614	if (!(mnt->mnt_flags & MNT_RELATIME))
				1615	return 1;
				1616	/*
				1617	* Is mtime younger than atime? If yes, update atime:
				1618	*/
				1619	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
				1620	return 1;
				1621	/*
				1622	* Is ctime younger than atime? If yes, update atime:
				1623	*/
				1624	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
				1625	return 1;
				1626
				1627	/*
				1628	* Is the previous atime value older than a day? If yes,
				1629	* update atime:
				1630	*/
				1631	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 246060)
				1632	return 1;
				1633	/*
				1634	* Good, we can skip the atime update:
				1635	*/
				1636	return 0;
				1637	}
				1638
				1639	int generic_update_time(struct inode inode, struct timespec64 time, int flags)
				1640	{
				1641	int iflags = I_DIRTY_TIME;
				1642	bool dirty = false;
				1643
				1644	if (flags & S_ATIME)
				1645	inode->i_atime = *time;
				1646	if (flags & S_VERSION)
				1647	dirty = inode_maybe_inc_iversion(inode, false);
				1648	if (flags & S_CTIME)
				1649	inode->i_ctime = *time;
				1650	if (flags & S_MTIME)
				1651	inode->i_mtime = *time;
				1652	if ((flags & (S_ATIME \| S_CTIME \| S_MTIME)) &&
				1653	!(inode->i_sb->s_flags & SB_LAZYTIME))
				1654	dirty = true;
				1655
				1656	if (dirty)
				1657	iflags \|= I_DIRTY_SYNC;
				1658	__mark_inode_dirty(inode, iflags);
				1659	return 0;
				1660	}
				1661	EXPORT_SYMBOL(generic_update_time);
				1662
				1663	/*
				1664	* This does the actual work of updating an inodes time or version. Must have
				1665	* had called mnt_want_write() before calling this.
				1666	*/
				1667	static int update_time(struct inode inode, struct timespec64 time, int flags)
				1668	{
				1669	int (update_time)(struct inode , struct timespec64 *, int);
				1670
				1671	update_time = inode->i_op->update_time ? inode->i_op->update_time :
				1672	generic_update_time;
				1673
				1674	return update_time(inode, time, flags);
				1675	}
				1676
				1677	/**
				1678	* touch_atime - update the access time
				1679	* @path: the &struct path to update
				1680	* @inode: inode to update
				1681	*
				1682	* Update the accessed time on an inode and mark it for writeback.
				1683	* This function automatically handles read only file systems and media,
				1684	* as well as the "noatime" flag and inode specific "noatime" markers.
				1685	*/
				1686	bool atime_needs_update(const struct path path, struct inode inode)
				1687	{
				1688	struct vfsmount *mnt = path->mnt;
				1689	struct timespec64 now;
				1690
				1691	if (inode->i_flags & S_NOATIME)
				1692	return false;
				1693
				1694	/* Atime updates will likely cause i_uid and i_gid to be written
				1695	* back improprely if their true value is unknown to the vfs.
				1696	*/
				1697	if (HAS_UNMAPPED_ID(inode))
				1698	return false;
				1699
				1700	if (IS_NOATIME(inode))
				1701	return false;
				1702	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
				1703	return false;
				1704
				1705	if (mnt->mnt_flags & MNT_NOATIME)
				1706	return false;
				1707	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
				1708	return false;
				1709
				1710	now = current_time(inode);
				1711
				1712	if (!relatime_need_update(mnt, inode, timespec64_to_timespec(now)))
				1713	return false;
				1714
				1715	if (timespec64_equal(&inode->i_atime, &now))
				1716	return false;
				1717
				1718	return true;
				1719	}
				1720
				1721	void touch_atime(const struct path *path)
				1722	{
				1723	struct vfsmount *mnt = path->mnt;
				1724	struct inode *inode = d_inode(path->dentry);
				1725	struct timespec64 now;
				1726
				1727	if (!atime_needs_update(path, inode))
				1728	return;
				1729
				1730	if (!sb_start_write_trylock(inode->i_sb))
				1731	return;
				1732
				1733	if (__mnt_want_write(mnt) != 0)
				1734	goto skip_update;
				1735	/*
				1736	* File systems can error out when updating inodes if they need to
				1737	* allocate new space to modify an inode (such is the case for
				1738	* Btrfs), but since we touch atime while walking down the path we
				1739	* really don't care if we failed to update the atime of the file,
				1740	* so just ignore the return value.
				1741	* We may also fail on filesystems that have the ability to make parts
				1742	* of the fs read only, e.g. subvolumes in Btrfs.
				1743	*/
				1744	now = current_time(inode);
				1745	update_time(inode, &now, S_ATIME);
				1746	__mnt_drop_write(mnt);
				1747	skip_update:
				1748	sb_end_write(inode->i_sb);
				1749	}
				1750	EXPORT_SYMBOL(touch_atime);
				1751
				1752	/*
				1753	* The logic we want is
				1754	*
				1755	* if suid or (sgid and xgrp)
				1756	* remove privs
				1757	*/
				1758	int should_remove_suid(struct dentry *dentry)
				1759	{
				1760	umode_t mode = d_inode(dentry)->i_mode;
				1761	int kill = 0;
				1762
				1763	/* suid always must be killed */
				1764	if (unlikely(mode & S_ISUID))
				1765	kill = ATTR_KILL_SUID;
				1766
				1767	/*
				1768	* sgid without any exec bits is just a mandatory locking mark; leave
				1769	* it alone. If some exec bits are set, it's a real sgid; kill it.
				1770	*/
				1771	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				1772	kill \|= ATTR_KILL_SGID;
				1773
				1774	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
				1775	return kill;
				1776
				1777	return 0;
				1778	}
				1779	EXPORT_SYMBOL(should_remove_suid);
				1780
				1781	/*
				1782	* Return mask of changes for notify_change() that need to be done as a
				1783	* response to write or truncate. Return 0 if nothing has to be changed.
				1784	* Negative value on error (change should be denied).
				1785	*/
				1786	int dentry_needs_remove_privs(struct dentry *dentry)
				1787	{
				1788	struct inode *inode = d_inode(dentry);
				1789	int mask = 0;
				1790	int ret;
				1791
				1792	if (IS_NOSEC(inode))
				1793	return 0;
				1794
				1795	mask = should_remove_suid(dentry);
				1796	ret = security_inode_need_killpriv(dentry);
				1797	if (ret < 0)
				1798	return ret;
				1799	if (ret)
				1800	mask \|= ATTR_KILL_PRIV;
				1801	return mask;
				1802	}
				1803
				1804	static int __remove_privs(struct vfsmount mnt, struct dentry dentry, int kill)
				1805	{
				1806	struct iattr newattrs;
				1807
				1808	newattrs.ia_valid = ATTR_FORCE \| kill;
				1809	/*
				1810	* Note we call this on write, so notify_change will not
				1811	* encounter any conflicting delegations:
				1812	*/
				1813	return notify_change2(mnt, dentry, &newattrs, NULL);
				1814	}
				1815
				1816	/*
				1817	* Remove special file priviledges (suid, capabilities) when file is written
				1818	* to or truncated.
				1819	*/
				1820	int file_remove_privs(struct file *file)
				1821	{
				1822	struct dentry *dentry = file_dentry(file);
				1823	struct inode *inode = file_inode(file);
				1824	int kill;
				1825	int error = 0;
				1826
				1827	/*
				1828	* Fast path for nothing security related.
				1829	* As well for non-regular files, e.g. blkdev inodes.
				1830	* For example, blkdev_write_iter() might get here
				1831	* trying to remove privs which it is not allowed to.
				1832	*/
				1833	if (IS_NOSEC(inode) \|\| !S_ISREG(inode->i_mode))
				1834	return 0;
				1835
				1836	kill = dentry_needs_remove_privs(dentry);
				1837	if (kill < 0)
				1838	return kill;
				1839	if (kill)
				1840	error = __remove_privs(file->f_path.mnt, dentry, kill);
				1841	if (!error)
				1842	inode_has_no_xattr(inode);
				1843
				1844	return error;
				1845	}
				1846	EXPORT_SYMBOL(file_remove_privs);
				1847
				1848	/**
				1849	* file_update_time - update mtime and ctime time
				1850	* @file: file accessed
				1851	*
				1852	* Update the mtime and ctime members of an inode and mark the inode
				1853	* for writeback. Note that this function is meant exclusively for
				1854	* usage in the file write path of filesystems, and filesystems may
				1855	* choose to explicitly ignore update via this function with the
				1856	* S_NOCMTIME inode flag, e.g. for network filesystem where these
				1857	* timestamps are handled by the server. This can return an error for
				1858	* file systems who need to allocate space in order to update an inode.
				1859	*/
				1860
				1861	int file_update_time(struct file *file)
				1862	{
				1863	struct inode *inode = file_inode(file);
				1864	struct timespec64 now;
				1865	int sync_it = 0;
				1866	int ret;
				1867
				1868	/* First try to exhaust all avenues to not sync */
				1869	if (IS_NOCMTIME(inode))
				1870	return 0;
				1871
				1872	now = current_time(inode);
				1873	if (!timespec64_equal(&inode->i_mtime, &now))
				1874	sync_it = S_MTIME;
				1875
				1876	if (!timespec64_equal(&inode->i_ctime, &now))
				1877	sync_it \|= S_CTIME;
				1878
				1879	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
				1880	sync_it \|= S_VERSION;
				1881
				1882	if (!sync_it)
				1883	return 0;
				1884
				1885	/* Finally allowed to write? Takes lock. */
				1886	if (__mnt_want_write_file(file))
				1887	return 0;
				1888
				1889	ret = update_time(inode, &now, sync_it);
				1890	__mnt_drop_write_file(file);
				1891
				1892	return ret;
				1893	}
				1894	EXPORT_SYMBOL(file_update_time);
				1895
				1896	int inode_needs_sync(struct inode *inode)
				1897	{
				1898	if (IS_SYNC(inode))
				1899	return 1;
				1900	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
				1901	return 1;
				1902	return 0;
				1903	}
				1904	EXPORT_SYMBOL(inode_needs_sync);
				1905
				1906	/*
				1907	* If we try to find an inode in the inode hash while it is being
				1908	* deleted, we have to wait until the filesystem completes its
				1909	* deletion before reporting that it isn't found. This function waits
				1910	* until the deletion _might_ have completed. Callers are responsible
				1911	* to recheck inode state.
				1912	*
				1913	* It doesn't matter if I_NEW is not set initially, a call to
				1914	* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
				1915	* will DTRT.
				1916	*/
				1917	static void __wait_on_freeing_inode(struct inode *inode)
				1918	{
				1919	wait_queue_head_t *wq;
				1920	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
				1921	wq = bit_waitqueue(&inode->i_state, __I_NEW);
				1922	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				1923	spin_unlock(&inode->i_lock);
				1924	spin_unlock(&inode_hash_lock);
				1925	schedule();
				1926	finish_wait(wq, &wait.wq_entry);
				1927	spin_lock(&inode_hash_lock);
				1928	}
				1929
				1930	static __initdata unsigned long ihash_entries;
				1931	static int __init set_ihash_entries(char *str)
				1932	{
				1933	if (!str)
				1934	return 0;
				1935	ihash_entries = simple_strtoul(str, &str, 0);
				1936	return 1;
				1937	}
				1938	__setup("ihash_entries=", set_ihash_entries);
				1939
				1940	/*
				1941	* Initialize the waitqueues and inode hash table.
				1942	*/
				1943	void __init inode_init_early(void)
				1944	{
				1945	/* If hashes are distributed across NUMA nodes, defer
				1946	* hash allocation until vmalloc space is available.
				1947	*/
				1948	if (hashdist)
				1949	return;
				1950
				1951	inode_hashtable =
				1952	alloc_large_system_hash("Inode-cache",
				1953	sizeof(struct hlist_head),
				1954	ihash_entries,
				1955	14,
				1956	HASH_EARLY \| HASH_ZERO,
				1957	&i_hash_shift,
				1958	&i_hash_mask,
				1959	0,
				1960	0);
				1961	}
				1962
				1963	void __init inode_init(void)
				1964	{
				1965	/* inode slab cache */
				1966	inode_cachep = kmem_cache_create("inode_cache",
				1967	sizeof(struct inode),
				1968	0,
				1969	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				1970	SLAB_MEM_SPREAD\|SLAB_ACCOUNT),
				1971	init_once);
				1972
				1973	/* Hash may have been set up in inode_init_early */
				1974	if (!hashdist)
				1975	return;
				1976
				1977	inode_hashtable =
				1978	alloc_large_system_hash("Inode-cache",
				1979	sizeof(struct hlist_head),
				1980	ihash_entries,
				1981	14,
				1982	HASH_ZERO,
				1983	&i_hash_shift,
				1984	&i_hash_mask,
				1985	0,
				1986	0);
				1987	}
				1988
				1989	void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
				1990	{
				1991	inode->i_mode = mode;
				1992	if (S_ISCHR(mode)) {
				1993	inode->i_fop = &def_chr_fops;
				1994	inode->i_rdev = rdev;
				1995	} else if (S_ISBLK(mode)) {
				1996	inode->i_fop = &def_blk_fops;
				1997	inode->i_rdev = rdev;
				1998	} else if (S_ISFIFO(mode))
				1999	inode->i_fop = &pipefifo_fops;
				2000	else if (S_ISSOCK(mode))
				2001	; /* leave it no_open_fops */
				2002	else
				2003	printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				2004	" inode %s:%lu\n", mode, inode->i_sb->s_id,
				2005	inode->i_ino);
				2006	}
				2007	EXPORT_SYMBOL(init_special_inode);
				2008
				2009	/**
				2010	* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
				2011	* @inode: New inode
				2012	* @dir: Directory inode
				2013	* @mode: mode of the new inode
				2014	*/
				2015	void inode_init_owner(struct inode inode, const struct inode dir,
				2016	umode_t mode)
				2017	{
				2018	inode->i_uid = current_fsuid();
				2019	if (dir && dir->i_mode & S_ISGID) {
				2020	inode->i_gid = dir->i_gid;
				2021
				2022	/* Directories are special, and always inherit S_ISGID */
				2023	if (S_ISDIR(mode))
				2024	mode \|= S_ISGID;
				2025	else if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP) &&
				2026	!in_group_p(inode->i_gid) &&
				2027	!capable_wrt_inode_uidgid(dir, CAP_FSETID))
				2028	mode &= ~S_ISGID;
				2029	} else
				2030	inode->i_gid = current_fsgid();
				2031	inode->i_mode = mode;
				2032	}
				2033	EXPORT_SYMBOL(inode_init_owner);
				2034
				2035	/**
				2036	* inode_owner_or_capable - check current task permissions to inode
				2037	* @inode: inode being checked
				2038	*
				2039	* Return true if current either has CAP_FOWNER in a namespace with the
				2040	* inode owner uid mapped, or owns the file.
				2041	*/
				2042	bool inode_owner_or_capable(const struct inode *inode)
				2043	{
				2044	struct user_namespace *ns;
				2045
				2046	if (uid_eq(current_fsuid(), inode->i_uid))
				2047	return true;
				2048
				2049	ns = current_user_ns();
				2050	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
				2051	return true;
				2052	return false;
				2053	}
				2054	EXPORT_SYMBOL(inode_owner_or_capable);
				2055
				2056	/*
				2057	* Direct i/o helper functions
				2058	*/
				2059	static void __inode_dio_wait(struct inode *inode)
				2060	{
				2061	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
				2062	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
				2063
				2064	do {
				2065	prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
				2066	if (atomic_read(&inode->i_dio_count))
				2067	schedule();
				2068	} while (atomic_read(&inode->i_dio_count));
				2069	finish_wait(wq, &q.wq_entry);
				2070	}
				2071
				2072	/**
				2073	* inode_dio_wait - wait for outstanding DIO requests to finish
				2074	* @inode: inode to wait for
				2075	*
				2076	* Waits for all pending direct I/O requests to finish so that we can
				2077	* proceed with a truncate or equivalent operation.
				2078	*
				2079	* Must be called under a lock that serializes taking new references
				2080	* to i_dio_count, usually by inode->i_mutex.
				2081	*/
				2082	void inode_dio_wait(struct inode *inode)
				2083	{
				2084	if (atomic_read(&inode->i_dio_count))
				2085	__inode_dio_wait(inode);
				2086	}
				2087	EXPORT_SYMBOL(inode_dio_wait);
				2088
				2089	/*
				2090	* inode_set_flags - atomically set some inode flags
				2091	*
				2092	* Note: the caller should be holding i_mutex, or else be sure that
				2093	* they have exclusive access to the inode structure (i.e., while the
				2094	* inode is being instantiated). The reason for the cmpxchg() loop
				2095	* --- which wouldn't be necessary if all code paths which modify
				2096	* i_flags actually followed this rule, is that there is at least one
				2097	* code path which doesn't today so we use cmpxchg() out of an abundance
				2098	* of caution.
				2099	*
				2100	* In the long run, i_mutex is overkill, and we should probably look
				2101	* at using the i_lock spinlock to protect i_flags, and then make sure
				2102	* it is so documented in include/linux/fs.h and that all code follows
				2103	* the locking convention!!
				2104	*/
				2105	void inode_set_flags(struct inode *inode, unsigned int flags,
				2106	unsigned int mask)
				2107	{
				2108	unsigned int old_flags, new_flags;
				2109
				2110	WARN_ON_ONCE(flags & ~mask);
				2111	do {
				2112	old_flags = READ_ONCE(inode->i_flags);
				2113	new_flags = (old_flags & ~mask) \| flags;
				2114	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
				2115	new_flags) != old_flags));
				2116	}
				2117	EXPORT_SYMBOL(inode_set_flags);
				2118
				2119	void inode_nohighmem(struct inode *inode)
				2120	{
				2121	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
				2122	}
				2123	EXPORT_SYMBOL(inode_nohighmem);
				2124
				2125	/**
				2126	* timespec64_trunc - Truncate timespec64 to a granularity
				2127	* @t: Timespec64
				2128	* @gran: Granularity in ns.
				2129	*
				2130	* Truncate a timespec64 to a granularity. Always rounds down. gran must
				2131	* not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
				2132	*/
				2133	struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran)
				2134	{
				2135	/* Avoid division in the common cases 1 ns and 1 s. */
				2136	if (gran == 1) {
				2137	/* nothing */
				2138	} else if (gran == NSEC_PER_SEC) {
				2139	t.tv_nsec = 0;
				2140	} else if (gran > 1 && gran < NSEC_PER_SEC) {
				2141	t.tv_nsec -= t.tv_nsec % gran;
				2142	} else {
				2143	WARN(1, "illegal file time granularity: %u", gran);
				2144	}
				2145	return t;
				2146	}
				2147	EXPORT_SYMBOL(timespec64_trunc);
				2148
				2149	/**
				2150	* current_time - Return FS time
				2151	* @inode: inode.
				2152	*
				2153	* Return the current time truncated to the time granularity supported by
				2154	* the fs.
				2155	*
				2156	* Note that inode and inode->sb cannot be NULL.
				2157	* Otherwise, the function warns and returns time without truncation.
				2158	*/
				2159	struct timespec64 current_time(struct inode *inode)
				2160	{
				2161	struct timespec64 now = current_kernel_time64();
				2162
				2163	if (unlikely(!inode->i_sb)) {
				2164	WARN(1, "current_time() called with uninitialized super_block in the inode");
				2165	return now;
				2166	}
				2167
				2168	return timespec64_trunc(now, inode->i_sb->s_time_gran);
				2169	}
				2170	EXPORT_SYMBOL(current_time);
				2171
				2172	/*
				2173	* Generic function to check FS_IOC_SETFLAGS values and reject any invalid
				2174	* configurations.
				2175	*
				2176	* Note: the caller should be holding i_mutex, or else be sure that they have
				2177	* exclusive access to the inode structure.
				2178	*/
				2179	int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
				2180	unsigned int flags)
				2181	{
				2182	/*
				2183	* The IMMUTABLE and APPEND_ONLY flags can only be changed by
				2184	* the relevant capability.
				2185	*
				2186	* This test looks nicer. Thanks to Pauline Middelink
				2187	*/
				2188	if ((flags ^ oldflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL) &&
				2189	!capable(CAP_LINUX_IMMUTABLE))
				2190	return -EPERM;
				2191
				2192	return 0;
				2193	}
				2194	EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
				2195
				2196	/*
				2197	* Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
				2198	* configurations.
				2199	*
				2200	* Note: the caller should be holding i_mutex, or else be sure that they have
				2201	* exclusive access to the inode structure.
				2202	*/
				2203	int vfs_ioc_fssetxattr_check(struct inode inode, const struct fsxattr old_fa,
				2204	struct fsxattr *fa)
				2205	{
				2206	/*
				2207	* Can't modify an immutable/append-only file unless we have
				2208	* appropriate permission.
				2209	*/
				2210	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2211	(FS_XFLAG_IMMUTABLE \| FS_XFLAG_APPEND) &&
				2212	!capable(CAP_LINUX_IMMUTABLE))
				2213	return -EPERM;
				2214
				2215	/*
				2216	* Project Quota ID state is only allowed to change from within the init
				2217	* namespace. Enforce that restriction only if we are trying to change
				2218	* the quota ID state. Everything else is allowed in user namespaces.
				2219	*/
				2220	if (current_user_ns() != &init_user_ns) {
				2221	if (old_fa->fsx_projid != fa->fsx_projid)
				2222	return -EINVAL;
				2223	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				2224	FS_XFLAG_PROJINHERIT)
				2225	return -EINVAL;
				2226	}
				2227
				2228	/* Check extent size hints. */
				2229	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
				2230	return -EINVAL;
				2231
				2232	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
				2233	!S_ISDIR(inode->i_mode))
				2234	return -EINVAL;
				2235
				2236	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
				2237	!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
				2238	return -EINVAL;
				2239
				2240	/*
				2241	* It is only valid to set the DAX flag on regular files and
				2242	* directories on filesystems.
				2243	*/
				2244	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
				2245	!(S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode)))
				2246	return -EINVAL;
				2247
				2248	/* Extent size hints of zero turn off the flags. */
				2249	if (fa->fsx_extsize == 0)
				2250	fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE \| FS_XFLAG_EXTSZINHERIT);
				2251	if (fa->fsx_cowextsize == 0)
				2252	fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
				2253
				2254	return 0;
				2255	}
				2256	EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);