Blame - src/kernel/linux/v4.14/fs/inode.c - T103

blob: e23b561dc8807b209a6a06f527f1093eb09d94ab [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* (C) 1997 Linus Torvalds
				3	* (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
				4	*/
				5	#include <linux/export.h>
				6	#include <linux/fs.h>
				7	#include <linux/mm.h>
				8	#include <linux/backing-dev.h>
				9	#include <linux/hash.h>
				10	#include <linux/swap.h>
				11	#include <linux/security.h>
				12	#include <linux/cdev.h>
				13	#include <linux/bootmem.h>
				14	#include <linux/fsnotify.h>
				15	#include <linux/mount.h>
				16	#include <linux/posix_acl.h>
				17	#include <linux/prefetch.h>
				18	#include <linux/buffer_head.h> /* for inode_has_buffers */
				19	#include <linux/ratelimit.h>
				20	#include <linux/list_lru.h>
				21	#include <trace/events/writeback.h>
				22	#include "internal.h"
				23
/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering (an indented lock nests inside the lock above it):
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */
				54
				55	static unsigned int i_hash_mask __read_mostly;
				56	static unsigned int i_hash_shift __read_mostly;
				57	static struct hlist_head *inode_hashtable __read_mostly;
				58	static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
				59
				60	/*
				61	* Empty aops. Can be used for the cases where the user does not
				62	* define any of the address_space operations.
				63	*/
				64	const struct address_space_operations empty_aops = {
				65	};
				66	EXPORT_SYMBOL(empty_aops);
				67
				68	/*
				69	* Statistics gathering..
				70	*/
				71	struct inodes_stat_t inodes_stat;
				72
				73	static DEFINE_PER_CPU(unsigned long, nr_inodes);
				74	static DEFINE_PER_CPU(unsigned long, nr_unused);
				75
				76	static struct kmem_cache *inode_cachep __read_mostly;
				77
				78	static long get_nr_inodes(void)
				79	{
				80	int i;
				81	long sum = 0;
				82	for_each_possible_cpu(i)
				83	sum += per_cpu(nr_inodes, i);
				84	return sum < 0 ? 0 : sum;
				85	}
				86
				87	static inline long get_nr_inodes_unused(void)
				88	{
				89	int i;
				90	long sum = 0;
				91	for_each_possible_cpu(i)
				92	sum += per_cpu(nr_unused, i);
				93	return sum < 0 ? 0 : sum;
				94	}
				95
				96	long get_nr_dirty_inodes(void)
				97	{
				98	/* not actually dirty inodes, but a wild approximation */
				99	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
				100	return nr_dirty > 0 ? nr_dirty : 0;
				101	}
				102
				103	/*
				104	* Handle nr_inode sysctl
				105	*/
				106	#ifdef CONFIG_SYSCTL
				107	int proc_nr_inodes(struct ctl_table *table, int write,
				108	void __user buffer, size_t lenp, loff_t *ppos)
				109	{
				110	inodes_stat.nr_inodes = get_nr_inodes();
				111	inodes_stat.nr_unused = get_nr_inodes_unused();
				112	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
				113	}
				114	#endif
				115
				116	static int no_open(struct inode inode, struct file file)
				117	{
				118	return -ENXIO;
				119	}
				120
				121	/**
				122	* inode_init_always - perform inode structure initialisation
				123	* @sb: superblock inode belongs to
				124	* @inode: inode to initialise
				125	*
				126	* These are initializations that need to be done on every inode
				127	* allocation as the fields are not initialised by slab allocation.
				128	*/
				129	int inode_init_always(struct super_block sb, struct inode inode)
				130	{
				131	static const struct inode_operations empty_iops;
				132	static const struct file_operations no_open_fops = {.open = no_open};
				133	struct address_space *const mapping = &inode->i_data;
				134
				135	inode->i_sb = sb;
				136	inode->i_blkbits = sb->s_blocksize_bits;
				137	inode->i_flags = 0;
				138	atomic64_set(&inode->i_sequence, 0);
				139	atomic_set(&inode->i_count, 1);
				140	inode->i_op = &empty_iops;
				141	inode->i_fop = &no_open_fops;
				142	inode->__i_nlink = 1;
				143	inode->i_opflags = 0;
				144	if (sb->s_xattr)
				145	inode->i_opflags \|= IOP_XATTR;
				146	i_uid_write(inode, 0);
				147	i_gid_write(inode, 0);
				148	atomic_set(&inode->i_writecount, 0);
				149	inode->i_size = 0;
				150	inode->i_write_hint = WRITE_LIFE_NOT_SET;
				151	inode->i_blocks = 0;
				152	inode->i_bytes = 0;
				153	inode->i_generation = 0;
				154	inode->i_pipe = NULL;
				155	inode->i_bdev = NULL;
				156	inode->i_cdev = NULL;
				157	inode->i_link = NULL;
				158	inode->i_dir_seq = 0;
				159	inode->i_rdev = 0;
				160	inode->dirtied_when = 0;
				161
				162	#ifdef CONFIG_CGROUP_WRITEBACK
				163	inode->i_wb_frn_winner = 0;
				164	inode->i_wb_frn_avg_time = 0;
				165	inode->i_wb_frn_history = 0;
				166	#endif
				167
				168	if (security_inode_alloc(inode))
				169	goto out;
				170	spin_lock_init(&inode->i_lock);
				171	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
				172
				173	init_rwsem(&inode->i_rwsem);
				174	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
				175
				176	atomic_set(&inode->i_dio_count, 0);
				177
				178	mapping->a_ops = &empty_aops;
				179	mapping->host = inode;
				180	mapping->flags = 0;
				181	mapping->wb_err = 0;
				182	atomic_set(&mapping->i_mmap_writable, 0);
				183	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
				184	mapping->private_data = NULL;
				185	mapping->writeback_index = 0;
				186	inode->i_private = NULL;
				187	inode->i_mapping = mapping;
				188	INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
				189	#ifdef CONFIG_FS_POSIX_ACL
				190	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
				191	#endif
				192
				193	#ifdef CONFIG_FSNOTIFY
				194	inode->i_fsnotify_mask = 0;
				195	#endif
				196	inode->i_flctx = NULL;
				197	this_cpu_inc(nr_inodes);
				198
				199	return 0;
				200	out:
				201	return -ENOMEM;
				202	}
				203	EXPORT_SYMBOL(inode_init_always);
				204
				205	static struct inode alloc_inode(struct super_block sb)
				206	{
				207	struct inode *inode;
				208
				209	if (sb->s_op->alloc_inode)
				210	inode = sb->s_op->alloc_inode(sb);
				211	else
				212	inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
				213
				214	if (!inode)
				215	return NULL;
				216
				217	if (unlikely(inode_init_always(sb, inode))) {
				218	if (inode->i_sb->s_op->destroy_inode)
				219	inode->i_sb->s_op->destroy_inode(inode);
				220	else
				221	kmem_cache_free(inode_cachep, inode);
				222	return NULL;
				223	}
				224
				225	return inode;
				226	}
				227
				228	void free_inode_nonrcu(struct inode *inode)
				229	{
				230	kmem_cache_free(inode_cachep, inode);
				231	}
				232	EXPORT_SYMBOL(free_inode_nonrcu);
				233
				234	void __destroy_inode(struct inode *inode)
				235	{
				236	BUG_ON(inode_has_buffers(inode));
				237	inode_detach_wb(inode);
				238	security_inode_free(inode);
				239	fsnotify_inode_delete(inode);
				240	locks_free_lock_context(inode);
				241	if (!inode->i_nlink) {
				242	WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
				243	atomic_long_dec(&inode->i_sb->s_remove_count);
				244	}
				245
				246	#ifdef CONFIG_FS_POSIX_ACL
				247	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
				248	posix_acl_release(inode->i_acl);
				249	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
				250	posix_acl_release(inode->i_default_acl);
				251	#endif
				252	this_cpu_dec(nr_inodes);
				253	}
				254	EXPORT_SYMBOL(__destroy_inode);
				255
				256	static void i_callback(struct rcu_head *head)
				257	{
				258	struct inode *inode = container_of(head, struct inode, i_rcu);
				259	kmem_cache_free(inode_cachep, inode);
				260	}
				261
				262	static void destroy_inode(struct inode *inode)
				263	{
				264	BUG_ON(!list_empty(&inode->i_lru));
				265	__destroy_inode(inode);
				266	if (inode->i_sb->s_op->destroy_inode)
				267	inode->i_sb->s_op->destroy_inode(inode);
				268	else
				269	call_rcu(&inode->i_rcu, i_callback);
				270	}
				271
				272	/**
				273	* drop_nlink - directly drop an inode's link count
				274	* @inode: inode
				275	*
				276	* This is a low-level filesystem helper to replace any
				277	* direct filesystem manipulation of i_nlink. In cases
				278	* where we are attempting to track writes to the
				279	* filesystem, a decrement to zero means an imminent
				280	* write when the file is truncated and actually unlinked
				281	* on the filesystem.
				282	*/
				283	void drop_nlink(struct inode *inode)
				284	{
				285	WARN_ON(inode->i_nlink == 0);
				286	inode->__i_nlink--;
				287	if (!inode->i_nlink)
				288	atomic_long_inc(&inode->i_sb->s_remove_count);
				289	}
				290	EXPORT_SYMBOL(drop_nlink);
				291
				292	/**
				293	* clear_nlink - directly zero an inode's link count
				294	* @inode: inode
				295	*
				296	* This is a low-level filesystem helper to replace any
				297	* direct filesystem manipulation of i_nlink. See
				298	* drop_nlink() for why we care about i_nlink hitting zero.
				299	*/
				300	void clear_nlink(struct inode *inode)
				301	{
				302	if (inode->i_nlink) {
				303	inode->__i_nlink = 0;
				304	atomic_long_inc(&inode->i_sb->s_remove_count);
				305	}
				306	}
				307	EXPORT_SYMBOL(clear_nlink);
				308
				309	/**
				310	* set_nlink - directly set an inode's link count
				311	* @inode: inode
				312	* @nlink: new nlink (should be non-zero)
				313	*
				314	* This is a low-level filesystem helper to replace any
				315	* direct filesystem manipulation of i_nlink.
				316	*/
				317	void set_nlink(struct inode *inode, unsigned int nlink)
				318	{
				319	if (!nlink) {
				320	clear_nlink(inode);
				321	} else {
				322	/* Yes, some filesystems do change nlink from zero to one */
				323	if (inode->i_nlink == 0)
				324	atomic_long_dec(&inode->i_sb->s_remove_count);
				325
				326	inode->__i_nlink = nlink;
				327	}
				328	}
				329	EXPORT_SYMBOL(set_nlink);
				330
				331	/**
				332	* inc_nlink - directly increment an inode's link count
				333	* @inode: inode
				334	*
				335	* This is a low-level filesystem helper to replace any
				336	* direct filesystem manipulation of i_nlink. Currently,
				337	* it is only here for parity with dec_nlink().
				338	*/
				339	void inc_nlink(struct inode *inode)
				340	{
				341	if (unlikely(inode->i_nlink == 0)) {
				342	WARN_ON(!(inode->i_state & I_LINKABLE));
				343	atomic_long_dec(&inode->i_sb->s_remove_count);
				344	}
				345
				346	inode->__i_nlink++;
				347	}
				348	EXPORT_SYMBOL(inc_nlink);
				349
				350	void address_space_init_once(struct address_space *mapping)
				351	{
				352	memset(mapping, 0, sizeof(*mapping));
				353	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC \| __GFP_ACCOUNT);
				354	spin_lock_init(&mapping->tree_lock);
				355	init_rwsem(&mapping->i_mmap_rwsem);
				356	INIT_LIST_HEAD(&mapping->private_list);
				357	spin_lock_init(&mapping->private_lock);
				358	mapping->i_mmap = RB_ROOT_CACHED;
				359	}
				360	EXPORT_SYMBOL(address_space_init_once);
				361
				362	/*
				363	* These are initializations that only need to be done
				364	* once, because the fields are idempotent across use
				365	* of the inode, so let the slab aware of that.
				366	*/
				367	void inode_init_once(struct inode *inode)
				368	{
				369	memset(inode, 0, sizeof(*inode));
				370	INIT_HLIST_NODE(&inode->i_hash);
				371	INIT_LIST_HEAD(&inode->i_devices);
				372	INIT_LIST_HEAD(&inode->i_io_list);
				373	INIT_LIST_HEAD(&inode->i_wb_list);
				374	INIT_LIST_HEAD(&inode->i_lru);
				375	address_space_init_once(&inode->i_data);
				376	i_size_ordered_init(inode);
				377	}
				378	EXPORT_SYMBOL(inode_init_once);
				379
				380	static void init_once(void *foo)
				381	{
				382	struct inode inode = (struct inode ) foo;
				383
				384	inode_init_once(inode);
				385	}
				386
				387	/*
				388	* inode->i_lock must be held
				389	*/
				390	void __iget(struct inode *inode)
				391	{
				392	atomic_inc(&inode->i_count);
				393	}
				394
				395	/*
				396	* get additional reference to inode; caller must already hold one.
				397	*/
				398	void ihold(struct inode *inode)
				399	{
				400	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
				401	}
				402	EXPORT_SYMBOL(ihold);
				403
				404	static void inode_lru_list_add(struct inode *inode)
				405	{
				406	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
				407	this_cpu_inc(nr_unused);
				408	else
				409	inode->i_state \|= I_REFERENCED;
				410	}
				411
				412	/*
				413	* Add inode to LRU if needed (inode is unused and clean).
				414	*
				415	* Needs inode->i_lock held.
				416	*/
				417	void inode_add_lru(struct inode *inode)
				418	{
				419	if (!(inode->i_state & (I_DIRTY_ALL \| I_SYNC \|
				420	I_FREEING \| I_WILL_FREE)) &&
				421	!atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
				422	inode_lru_list_add(inode);
				423	}
				424
				425
				426	static void inode_lru_list_del(struct inode *inode)
				427	{
				428
				429	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
				430	this_cpu_dec(nr_unused);
				431	}
				432
				433	/**
				434	* inode_sb_list_add - add inode to the superblock list of inodes
				435	* @inode: inode to add
				436	*/
				437	void inode_sb_list_add(struct inode *inode)
				438	{
				439	spin_lock(&inode->i_sb->s_inode_list_lock);
				440	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
				441	spin_unlock(&inode->i_sb->s_inode_list_lock);
				442	}
				443	EXPORT_SYMBOL_GPL(inode_sb_list_add);
				444
				445	static inline void inode_sb_list_del(struct inode *inode)
				446	{
				447	if (!list_empty(&inode->i_sb_list)) {
				448	spin_lock(&inode->i_sb->s_inode_list_lock);
				449	list_del_init(&inode->i_sb_list);
				450	spin_unlock(&inode->i_sb->s_inode_list_lock);
				451	}
				452	}
				453
				454	static unsigned long hash(struct super_block *sb, unsigned long hashval)
				455	{
				456	unsigned long tmp;
				457
				458	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
				459	L1_CACHE_BYTES;
				460	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
				461	return tmp & i_hash_mask;
				462	}
				463
				464	/**
				465	* __insert_inode_hash - hash an inode
				466	* @inode: unhashed inode
				467	* @hashval: unsigned long value used to locate this object in the
				468	* inode_hashtable.
				469	*
				470	* Add an inode to the inode hash for this superblock.
				471	*/
				472	void __insert_inode_hash(struct inode *inode, unsigned long hashval)
				473	{
				474	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
				475
				476	spin_lock(&inode_hash_lock);
				477	spin_lock(&inode->i_lock);
				478	hlist_add_head(&inode->i_hash, b);
				479	spin_unlock(&inode->i_lock);
				480	spin_unlock(&inode_hash_lock);
				481	}
				482	EXPORT_SYMBOL(__insert_inode_hash);
				483
				484	/**
				485	* __remove_inode_hash - remove an inode from the hash
				486	* @inode: inode to unhash
				487	*
				488	* Remove an inode from the superblock.
				489	*/
				490	void __remove_inode_hash(struct inode *inode)
				491	{
				492	spin_lock(&inode_hash_lock);
				493	spin_lock(&inode->i_lock);
				494	hlist_del_init(&inode->i_hash);
				495	spin_unlock(&inode->i_lock);
				496	spin_unlock(&inode_hash_lock);
				497	}
				498	EXPORT_SYMBOL(__remove_inode_hash);
				499
				500	void clear_inode(struct inode *inode)
				501	{
				502	might_sleep();
				503	/*
				504	* We have to cycle tree_lock here because reclaim can be still in the
				505	* process of removing the last page (in __delete_from_page_cache())
				506	* and we must not free mapping under it.
				507	*/
				508	spin_lock_irq(&inode->i_data.tree_lock);
				509	BUG_ON(inode->i_data.nrpages);
				510	BUG_ON(inode->i_data.nrexceptional);
				511	spin_unlock_irq(&inode->i_data.tree_lock);
				512	BUG_ON(!list_empty(&inode->i_data.private_list));
				513	BUG_ON(!(inode->i_state & I_FREEING));
				514	BUG_ON(inode->i_state & I_CLEAR);
				515	BUG_ON(!list_empty(&inode->i_wb_list));
				516	/* don't need i_lock here, no concurrent mods to i_state */
				517	inode->i_state = I_FREEING \| I_CLEAR;
				518	}
				519	EXPORT_SYMBOL(clear_inode);
				520
				521	/*
				522	* Free the inode passed in, removing it from the lists it is still connected
				523	* to. We remove any pages still attached to the inode and wait for any IO that
				524	* is still in progress before finally destroying the inode.
				525	*
				526	* An inode must already be marked I_FREEING so that we avoid the inode being
				527	* moved back onto lists if we race with other code that manipulates the lists
				528	* (e.g. writeback_single_inode). The caller is responsible for setting this.
				529	*
				530	* An inode must already be removed from the LRU list before being evicted from
				531	* the cache. This should occur atomically with setting the I_FREEING state
				532	* flag, so no inodes here should ever be on the LRU when being evicted.
				533	*/
				534	static void evict(struct inode *inode)
				535	{
				536	const struct super_operations *op = inode->i_sb->s_op;
				537
				538	BUG_ON(!(inode->i_state & I_FREEING));
				539	BUG_ON(!list_empty(&inode->i_lru));
				540
				541	if (!list_empty(&inode->i_io_list))
				542	inode_io_list_del(inode);
				543
				544	inode_sb_list_del(inode);
				545
				546	/*
				547	* Wait for flusher thread to be done with the inode so that filesystem
				548	* does not start destroying it while writeback is still running. Since
				549	* the inode has I_FREEING set, flusher thread won't start new work on
				550	* the inode. We just have to wait for running writeback to finish.
				551	*/
				552	inode_wait_for_writeback(inode);
				553
				554	if (op->evict_inode) {
				555	op->evict_inode(inode);
				556	} else {
				557	truncate_inode_pages_final(&inode->i_data);
				558	clear_inode(inode);
				559	}
				560	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
				561	bd_forget(inode);
				562	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
				563	cd_forget(inode);
				564
				565	remove_inode_hash(inode);
				566
				567	spin_lock(&inode->i_lock);
				568	wake_up_bit(&inode->i_state, __I_NEW);
				569	BUG_ON(inode->i_state != (I_FREEING \| I_CLEAR));
				570	spin_unlock(&inode->i_lock);
				571
				572	destroy_inode(inode);
				573	}
				574
				575	/*
				576	* dispose_list - dispose of the contents of a local list
				577	* @head: the head of the list to free
				578	*
				579	* Dispose-list gets a local list with local inodes in it, so it doesn't
				580	* need to worry about list corruption and SMP locks.
				581	*/
				582	static void dispose_list(struct list_head *head)
				583	{
				584	while (!list_empty(head)) {
				585	struct inode *inode;
				586
				587	inode = list_first_entry(head, struct inode, i_lru);
				588	list_del_init(&inode->i_lru);
				589
				590	evict(inode);
				591	cond_resched();
				592	}
				593	}
				594
				595	/**
				596	* evict_inodes - evict all evictable inodes for a superblock
				597	* @sb: superblock to operate on
				598	*
				599	* Make sure that no inodes with zero refcount are retained. This is
				600	* called by superblock shutdown after having MS_ACTIVE flag removed,
				601	* so any inode reaching zero refcount during or after that call will
				602	* be immediately evicted.
				603	*/
				604	void evict_inodes(struct super_block *sb)
				605	{
				606	struct inode inode, next;
				607	LIST_HEAD(dispose);
				608
				609	again:
				610	spin_lock(&sb->s_inode_list_lock);
				611	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				612	if (atomic_read(&inode->i_count))
				613	continue;
				614
				615	spin_lock(&inode->i_lock);
				616	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				617	spin_unlock(&inode->i_lock);
				618	continue;
				619	}
				620
				621	inode->i_state \|= I_FREEING;
				622	inode_lru_list_del(inode);
				623	spin_unlock(&inode->i_lock);
				624	list_add(&inode->i_lru, &dispose);
				625
				626	/*
				627	* We can have a ton of inodes to evict at unmount time given
				628	* enough memory, check to see if we need to go to sleep for a
				629	* bit so we don't livelock.
				630	*/
				631	if (need_resched()) {
				632	spin_unlock(&sb->s_inode_list_lock);
				633	cond_resched();
				634	dispose_list(&dispose);
				635	goto again;
				636	}
				637	}
				638	spin_unlock(&sb->s_inode_list_lock);
				639
				640	dispose_list(&dispose);
				641	}
				642	EXPORT_SYMBOL_GPL(evict_inodes);
				643
				644	/**
				645	* invalidate_inodes - attempt to free all inodes on a superblock
				646	* @sb: superblock to operate on
				647	* @kill_dirty: flag to guide handling of dirty inodes
				648	*
				649	* Attempts to free all inodes for a given superblock. If there were any
				650	* busy inodes return a non-zero value, else zero.
				651	* If @kill_dirty is set, discard dirty inodes too, otherwise treat
				652	* them as busy.
				653	*/
				654	int invalidate_inodes(struct super_block *sb, bool kill_dirty)
				655	{
				656	int busy = 0;
				657	struct inode inode, next;
				658	LIST_HEAD(dispose);
				659
				660	again:
				661	spin_lock(&sb->s_inode_list_lock);
				662	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
				663	spin_lock(&inode->i_lock);
				664	if (inode->i_state & (I_NEW \| I_FREEING \| I_WILL_FREE)) {
				665	spin_unlock(&inode->i_lock);
				666	continue;
				667	}
				668	if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
				669	spin_unlock(&inode->i_lock);
				670	busy = 1;
				671	continue;
				672	}
				673	if (atomic_read(&inode->i_count)) {
				674	spin_unlock(&inode->i_lock);
				675	busy = 1;
				676	continue;
				677	}
				678
				679	inode->i_state \|= I_FREEING;
				680	inode_lru_list_del(inode);
				681	spin_unlock(&inode->i_lock);
				682	list_add(&inode->i_lru, &dispose);
				683	if (need_resched()) {
				684	spin_unlock(&sb->s_inode_list_lock);
				685	cond_resched();
				686	dispose_list(&dispose);
				687	goto again;
				688	}
				689	}
				690	spin_unlock(&sb->s_inode_list_lock);
				691
				692	dispose_list(&dispose);
				693
				694	return busy;
				695	}
				696
				697	/*
				698	* Isolate the inode from the LRU in preparation for freeing it.
				699	*
				700	* Any inodes which are pinned purely because of attached pagecache have their
				701	* pagecache removed. If the inode has metadata buffers attached to
				702	* mapping->private_list then try to remove them.
				703	*
				704	* If the inode has the I_REFERENCED flag set, then it means that it has been
				705	* used recently - the flag is set in iput_final(). When we encounter such an
				706	* inode, clear the flag and move it to the back of the LRU so it gets another
				707	* pass through the LRU before it gets reclaimed. This is necessary because of
				708	* the fact we are doing lazy LRU updates to minimise lock contention so the
				709	* LRU does not have strict ordering. Hence we don't want to reclaim inodes
				710	* with this flag set because they are the inodes that are out of order.
				711	*/
				712	static enum lru_status inode_lru_isolate(struct list_head *item,
				713	struct list_lru_one lru, spinlock_t lru_lock, void *arg)
				714	{
				715	struct list_head *freeable = arg;
				716	struct inode *inode = container_of(item, struct inode, i_lru);
				717
				718	/*
				719	* we are inverting the lru lock/inode->i_lock here, so use a trylock.
				720	* If we fail to get the lock, just skip it.
				721	*/
				722	if (!spin_trylock(&inode->i_lock))
				723	return LRU_SKIP;
				724
				725	/*
				726	* Referenced or dirty inodes are still in use. Give them another pass
				727	* through the LRU as we canot reclaim them now.
				728	*/
				729	if (atomic_read(&inode->i_count) \|\|
				730	(inode->i_state & ~I_REFERENCED)) {
				731	list_lru_isolate(lru, &inode->i_lru);
				732	spin_unlock(&inode->i_lock);
				733	this_cpu_dec(nr_unused);
				734	return LRU_REMOVED;
				735	}
				736
				737	/* recently referenced inodes get one more pass */
				738	if (inode->i_state & I_REFERENCED) {
				739	inode->i_state &= ~I_REFERENCED;
				740	spin_unlock(&inode->i_lock);
				741	return LRU_ROTATE;
				742	}
				743
				744	if (inode_has_buffers(inode) \|\| inode->i_data.nrpages) {
				745	__iget(inode);
				746	spin_unlock(&inode->i_lock);
				747	spin_unlock(lru_lock);
				748	if (remove_inode_buffers(inode)) {
				749	unsigned long reap;
				750	reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
				751	if (current_is_kswapd())
				752	__count_vm_events(KSWAPD_INODESTEAL, reap);
				753	else
				754	__count_vm_events(PGINODESTEAL, reap);
				755	if (current->reclaim_state)
				756	current->reclaim_state->reclaimed_slab += reap;
				757	}
				758	iput(inode);
				759	spin_lock(lru_lock);
				760	return LRU_RETRY;
				761	}
				762
				763	WARN_ON(inode->i_state & I_NEW);
				764	inode->i_state \|= I_FREEING;
				765	list_lru_isolate_move(lru, &inode->i_lru, freeable);
				766	spin_unlock(&inode->i_lock);
				767
				768	this_cpu_dec(nr_unused);
				769	return LRU_REMOVED;
				770	}
				771
				772	/*
				773	* Walk the superblock inode LRU for freeable inodes and attempt to free them.
				774	* This is called from the superblock shrinker function with a number of inodes
				775	* to trim from the LRU. Inodes to be freed are moved to a temporary list and
				776	* then are freed outside inode_lock by dispose_list().
				777	*/
				778	long prune_icache_sb(struct super_block sb, struct shrink_control sc)
				779	{
				780	LIST_HEAD(freeable);
				781	long freed;
				782
				783	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				784	inode_lru_isolate, &freeable);
				785	dispose_list(&freeable);
				786	return freed;
				787	}
				788
				789	static void __wait_on_freeing_inode(struct inode *inode);
				790	/*
				791	* Called with the inode lock held.
				792	*/
				793	static struct inode find_inode(struct super_block sb,
				794	struct hlist_head *head,
				795	int (test)(struct inode , void *),
				796	void *data)
				797	{
				798	struct inode *inode = NULL;
				799
				800	repeat:
				801	hlist_for_each_entry(inode, head, i_hash) {
				802	if (inode->i_sb != sb)
				803	continue;
				804	if (!test(inode, data))
				805	continue;
				806	spin_lock(&inode->i_lock);
				807	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				808	__wait_on_freeing_inode(inode);
				809	goto repeat;
				810	}
				811	__iget(inode);
				812	spin_unlock(&inode->i_lock);
				813	return inode;
				814	}
				815	return NULL;
				816	}
				817
				818	/*
				819	* find_inode_fast is the fast path version of find_inode, see the comment at
				820	* iget_locked for details.
				821	*/
				822	static struct inode find_inode_fast(struct super_block sb,
				823	struct hlist_head *head, unsigned long ino)
				824	{
				825	struct inode *inode = NULL;
				826
				827	repeat:
				828	hlist_for_each_entry(inode, head, i_hash) {
				829	if (inode->i_ino != ino)
				830	continue;
				831	if (inode->i_sb != sb)
				832	continue;
				833	spin_lock(&inode->i_lock);
				834	if (inode->i_state & (I_FREEING\|I_WILL_FREE)) {
				835	__wait_on_freeing_inode(inode);
				836	goto repeat;
				837	}
				838	__iget(inode);
				839	spin_unlock(&inode->i_lock);
				840	return inode;
				841	}
				842	return NULL;
				843	}
				844
				845	/*
				846	* Each cpu owns a range of LAST_INO_BATCH numbers.
				847	* 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
				848	* to renew the exhausted range.
				849	*
				850	* This does not significantly increase overflow rate because every CPU can
				851	* consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
				852	* NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
				853	* 2^32 range, and is a worst-case. Even a 50% wastage would only increase
				854	* overflow rate by 2x, which does not seem too significant.
				855	*
				856	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				857	* error if st_ino won't fit in target struct field. Use 32bit counter
				858	* here to attempt to avoid that.
				859	*/
				860	#define LAST_INO_BATCH 1024
				861	static DEFINE_PER_CPU(unsigned int, last_ino);
				862
				863	unsigned int get_next_ino(void)
				864	{
				865	unsigned int *p = &get_cpu_var(last_ino);
				866	unsigned int res = *p;
				867
				868	#ifdef CONFIG_SMP
				869	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
				870	static atomic_t shared_last_ino;
				871	int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
				872
				873	res = next - LAST_INO_BATCH;
				874	}
				875	#endif
				876
				877	res++;
				878	/* get_next_ino should not provide a 0 inode number */
				879	if (unlikely(!res))
				880	res++;
				881	*p = res;
				882	put_cpu_var(last_ino);
				883	return res;
				884	}
				885	EXPORT_SYMBOL(get_next_ino);
				886
				887	/**
				888	* new_inode_pseudo - obtain an inode
				889	* @sb: superblock
				890	*
				891	* Allocates a new inode for given superblock.
				892	* Inode wont be chained in superblock s_inodes list
				893	* This means :
				894	* - fs can't be unmount
				895	* - quotas, fsnotify, writeback can't work
				896	*/
				897	struct inode new_inode_pseudo(struct super_block sb)
				898	{
				899	struct inode *inode = alloc_inode(sb);
				900
				901	if (inode) {
				902	spin_lock(&inode->i_lock);
				903	inode->i_state = 0;
				904	spin_unlock(&inode->i_lock);
				905	INIT_LIST_HEAD(&inode->i_sb_list);
				906	}
				907	return inode;
				908	}
				909
				910	/**
				911	* new_inode - obtain an inode
				912	* @sb: superblock
				913	*
				914	* Allocates a new inode for given superblock. The default gfp_mask
				915	* for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
				916	* If HIGHMEM pages are unsuitable or it is known that pages allocated
				917	* for the page cache are not reclaimable or migratable,
				918	* mapping_set_gfp_mask() must be called with suitable flags on the
				919	* newly created inode's mapping
				920	*
				921	*/
				922	struct inode new_inode(struct super_block sb)
				923	{
				924	struct inode *inode;
				925
				926	spin_lock_prefetch(&sb->s_inode_list_lock);
				927
				928	inode = new_inode_pseudo(sb);
				929	if (inode)
				930	inode_sb_list_add(inode);
				931	return inode;
				932	}
				933	EXPORT_SYMBOL(new_inode);
				934
				935	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				936	void lockdep_annotate_inode_mutex_key(struct inode *inode)
				937	{
				938	if (S_ISDIR(inode->i_mode)) {
				939	struct file_system_type *type = inode->i_sb->s_type;
				940
				941	/* Set new key only if filesystem hasn't already changed it */
				942	if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
				943	/*
				944	* ensure nobody is actually holding i_mutex
				945	*/
				946	// mutex_destroy(&inode->i_mutex);
				947	init_rwsem(&inode->i_rwsem);
				948	lockdep_set_class(&inode->i_rwsem,
				949	&type->i_mutex_dir_key);
				950	}
				951	}
				952	}
				953	EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
				954	#endif
				955
				956	/**
				957	* unlock_new_inode - clear the I_NEW state and wake up any waiters
				958	* @inode: new inode to unlock
				959	*
				960	* Called when the inode is fully initialised to clear the new state of the
				961	* inode and wake up anyone waiting for the inode to finish initialisation.
				962	*/
				963	void unlock_new_inode(struct inode *inode)
				964	{
				965	lockdep_annotate_inode_mutex_key(inode);
				966	spin_lock(&inode->i_lock);
				967	WARN_ON(!(inode->i_state & I_NEW));
				968	inode->i_state &= ~I_NEW;
				969	smp_mb();
				970	wake_up_bit(&inode->i_state, __I_NEW);
				971	spin_unlock(&inode->i_lock);
				972	}
				973	EXPORT_SYMBOL(unlock_new_inode);
				974
				975	/**
				976	* lock_two_nondirectories - take two i_mutexes on non-directory objects
				977	*
				978	* Lock any non-NULL argument that is not a directory.
				979	* Zero, one or two objects may be locked by this function.
				980	*
				981	* @inode1: first inode to lock
				982	* @inode2: second inode to lock
				983	*/
				984	void lock_two_nondirectories(struct inode inode1, struct inode inode2)
				985	{
				986	if (inode1 > inode2)
				987	swap(inode1, inode2);
				988
				989	if (inode1 && !S_ISDIR(inode1->i_mode))
				990	inode_lock(inode1);
				991	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				992	inode_lock_nested(inode2, I_MUTEX_NONDIR2);
				993	}
				994	EXPORT_SYMBOL(lock_two_nondirectories);
				995
				996	/**
				997	* unlock_two_nondirectories - release locks from lock_two_nondirectories()
				998	* @inode1: first inode to unlock
				999	* @inode2: second inode to unlock
				1000	*/
				1001	void unlock_two_nondirectories(struct inode inode1, struct inode inode2)
				1002	{
				1003	if (inode1 && !S_ISDIR(inode1->i_mode))
				1004	inode_unlock(inode1);
				1005	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
				1006	inode_unlock(inode2);
				1007	}
				1008	EXPORT_SYMBOL(unlock_two_nondirectories);
				1009
				1010	/**
				1011	* iget5_locked - obtain an inode from a mounted file system
				1012	* @sb: super block of file system
				1013	* @hashval: hash value (usually inode number) to get
				1014	* @test: callback used for comparisons between inodes
				1015	* @set: callback used to initialize a new struct inode
				1016	* @data: opaque data pointer to pass to @test and @set
				1017	*
				1018	* Search for the inode specified by @hashval and @data in the inode cache,
				1019	* and if present it is return it with an increased reference count. This is
				1020	* a generalized version of iget_locked() for file systems where the inode
				1021	* number is not sufficient for unique identification of an inode.
				1022	*
				1023	* If the inode is not in cache, allocate a new inode and return it locked,
				1024	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1025	* before unlocking it via unlock_new_inode().
				1026	*
				1027	* Note both @test and @set are called with the inode_hash_lock held, so can't
				1028	* sleep.
				1029	*/
				1030	struct inode iget5_locked(struct super_block sb, unsigned long hashval,
				1031	int (test)(struct inode , void *),
				1032	int (set)(struct inode , void ), void data)
				1033	{
				1034	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1035	struct inode *inode;
				1036	again:
				1037	spin_lock(&inode_hash_lock);
				1038	inode = find_inode(sb, head, test, data);
				1039	spin_unlock(&inode_hash_lock);
				1040
				1041	if (inode) {
				1042	wait_on_inode(inode);
				1043	if (unlikely(inode_unhashed(inode))) {
				1044	iput(inode);
				1045	goto again;
				1046	}
				1047	return inode;
				1048	}
				1049
				1050	inode = alloc_inode(sb);
				1051	if (inode) {
				1052	struct inode *old;
				1053
				1054	spin_lock(&inode_hash_lock);
				1055	/* We released the lock, so.. */
				1056	old = find_inode(sb, head, test, data);
				1057	if (!old) {
				1058	if (set(inode, data))
				1059	goto set_failed;
				1060
				1061	spin_lock(&inode->i_lock);
				1062	inode->i_state = I_NEW;
				1063	hlist_add_head(&inode->i_hash, head);
				1064	spin_unlock(&inode->i_lock);
				1065	inode_sb_list_add(inode);
				1066	spin_unlock(&inode_hash_lock);
				1067
				1068	/* Return the locked inode with I_NEW set, the
				1069	* caller is responsible for filling in the contents
				1070	*/
				1071	return inode;
				1072	}
				1073
				1074	/*
				1075	* Uhhuh, somebody else created the same inode under
				1076	* us. Use the old inode instead of the one we just
				1077	* allocated.
				1078	*/
				1079	spin_unlock(&inode_hash_lock);
				1080	destroy_inode(inode);
				1081	inode = old;
				1082	wait_on_inode(inode);
				1083	if (unlikely(inode_unhashed(inode))) {
				1084	iput(inode);
				1085	goto again;
				1086	}
				1087	}
				1088	return inode;
				1089
				1090	set_failed:
				1091	spin_unlock(&inode_hash_lock);
				1092	destroy_inode(inode);
				1093	return NULL;
				1094	}
				1095	EXPORT_SYMBOL(iget5_locked);
				1096
				1097	/**
				1098	* iget_locked - obtain an inode from a mounted file system
				1099	* @sb: super block of file system
				1100	* @ino: inode number to get
				1101	*
				1102	* Search for the inode specified by @ino in the inode cache and if present
				1103	* return it with an increased reference count. This is for file systems
				1104	* where the inode number is sufficient for unique identification of an inode.
				1105	*
				1106	* If the inode is not in cache, allocate a new inode and return it locked,
				1107	* hashed, and with the I_NEW flag set. The file system gets to fill it in
				1108	* before unlocking it via unlock_new_inode().
				1109	*/
				1110	struct inode iget_locked(struct super_block sb, unsigned long ino)
				1111	{
				1112	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1113	struct inode *inode;
				1114	again:
				1115	spin_lock(&inode_hash_lock);
				1116	inode = find_inode_fast(sb, head, ino);
				1117	spin_unlock(&inode_hash_lock);
				1118	if (inode) {
				1119	wait_on_inode(inode);
				1120	if (unlikely(inode_unhashed(inode))) {
				1121	iput(inode);
				1122	goto again;
				1123	}
				1124	return inode;
				1125	}
				1126
				1127	inode = alloc_inode(sb);
				1128	if (inode) {
				1129	struct inode *old;
				1130
				1131	spin_lock(&inode_hash_lock);
				1132	/* We released the lock, so.. */
				1133	old = find_inode_fast(sb, head, ino);
				1134	if (!old) {
				1135	inode->i_ino = ino;
				1136	spin_lock(&inode->i_lock);
				1137	inode->i_state = I_NEW;
				1138	hlist_add_head(&inode->i_hash, head);
				1139	spin_unlock(&inode->i_lock);
				1140	inode_sb_list_add(inode);
				1141	spin_unlock(&inode_hash_lock);
				1142
				1143	/* Return the locked inode with I_NEW set, the
				1144	* caller is responsible for filling in the contents
				1145	*/
				1146	return inode;
				1147	}
				1148
				1149	/*
				1150	* Uhhuh, somebody else created the same inode under
				1151	* us. Use the old inode instead of the one we just
				1152	* allocated.
				1153	*/
				1154	spin_unlock(&inode_hash_lock);
				1155	destroy_inode(inode);
				1156	inode = old;
				1157	wait_on_inode(inode);
				1158	if (unlikely(inode_unhashed(inode))) {
				1159	iput(inode);
				1160	goto again;
				1161	}
				1162	}
				1163	return inode;
				1164	}
				1165	EXPORT_SYMBOL(iget_locked);
				1166
				1167	/*
				1168	* search the inode cache for a matching inode number.
				1169	* If we find one, then the inode number we are trying to
				1170	* allocate is not unique and so we should not use it.
				1171	*
				1172	* Returns 1 if the inode number is unique, 0 if it is not.
				1173	*/
				1174	static int test_inode_iunique(struct super_block *sb, unsigned long ino)
				1175	{
				1176	struct hlist_head *b = inode_hashtable + hash(sb, ino);
				1177	struct inode *inode;
				1178
				1179	spin_lock(&inode_hash_lock);
				1180	hlist_for_each_entry(inode, b, i_hash) {
				1181	if (inode->i_ino == ino && inode->i_sb == sb) {
				1182	spin_unlock(&inode_hash_lock);
				1183	return 0;
				1184	}
				1185	}
				1186	spin_unlock(&inode_hash_lock);
				1187
				1188	return 1;
				1189	}
				1190
				1191	/**
				1192	* iunique - get a unique inode number
				1193	* @sb: superblock
				1194	* @max_reserved: highest reserved inode number
				1195	*
				1196	* Obtain an inode number that is unique on the system for a given
				1197	* superblock. This is used by file systems that have no natural
				1198	* permanent inode numbering system. An inode number is returned that
				1199	* is higher than the reserved limit but unique.
				1200	*
				1201	* BUGS:
				1202	* With a large number of inodes live on the file system this function
				1203	* currently becomes quite slow.
				1204	*/
				1205	ino_t iunique(struct super_block *sb, ino_t max_reserved)
				1206	{
				1207	/*
				1208	* On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
				1209	* error if st_ino won't fit in target struct field. Use 32bit counter
				1210	* here to attempt to avoid that.
				1211	*/
				1212	static DEFINE_SPINLOCK(iunique_lock);
				1213	static unsigned int counter;
				1214	ino_t res;
				1215
				1216	spin_lock(&iunique_lock);
				1217	do {
				1218	if (counter <= max_reserved)
				1219	counter = max_reserved + 1;
				1220	res = counter++;
				1221	} while (!test_inode_iunique(sb, res));
				1222	spin_unlock(&iunique_lock);
				1223
				1224	return res;
				1225	}
				1226	EXPORT_SYMBOL(iunique);
				1227
				1228	struct inode igrab(struct inode inode)
				1229	{
				1230	spin_lock(&inode->i_lock);
				1231	if (!(inode->i_state & (I_FREEING\|I_WILL_FREE))) {
				1232	__iget(inode);
				1233	spin_unlock(&inode->i_lock);
				1234	} else {
				1235	spin_unlock(&inode->i_lock);
				1236	/*
				1237	* Handle the case where s_op->clear_inode is not been
				1238	* called yet, and somebody is calling igrab
				1239	* while the inode is getting freed.
				1240	*/
				1241	inode = NULL;
				1242	}
				1243	return inode;
				1244	}
				1245	EXPORT_SYMBOL(igrab);
				1246
				1247	/**
				1248	* ilookup5_nowait - search for an inode in the inode cache
				1249	* @sb: super block of file system to search
				1250	* @hashval: hash value (usually inode number) to search for
				1251	* @test: callback used for comparisons between inodes
				1252	* @data: opaque data pointer to pass to @test
				1253	*
				1254	* Search for the inode specified by @hashval and @data in the inode cache.
				1255	* If the inode is in the cache, the inode is returned with an incremented
				1256	* reference count.
				1257	*
				1258	* Note: I_NEW is not waited upon so you have to be very careful what you do
				1259	* with the returned inode. You probably should be using ilookup5() instead.
				1260	*
				1261	* Note2: @test is called with the inode_hash_lock held, so can't sleep.
				1262	*/
				1263	struct inode ilookup5_nowait(struct super_block sb, unsigned long hashval,
				1264	int (test)(struct inode , void ), void data)
				1265	{
				1266	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1267	struct inode *inode;
				1268
				1269	spin_lock(&inode_hash_lock);
				1270	inode = find_inode(sb, head, test, data);
				1271	spin_unlock(&inode_hash_lock);
				1272
				1273	return inode;
				1274	}
				1275	EXPORT_SYMBOL(ilookup5_nowait);
				1276
				1277	/**
				1278	* ilookup5 - search for an inode in the inode cache
				1279	* @sb: super block of file system to search
				1280	* @hashval: hash value (usually inode number) to search for
				1281	* @test: callback used for comparisons between inodes
				1282	* @data: opaque data pointer to pass to @test
				1283	*
				1284	* Search for the inode specified by @hashval and @data in the inode cache,
				1285	* and if the inode is in the cache, return the inode with an incremented
				1286	* reference count. Waits on I_NEW before returning the inode.
				1287	* returned with an incremented reference count.
				1288	*
				1289	* This is a generalized version of ilookup() for file systems where the
				1290	* inode number is not sufficient for unique identification of an inode.
				1291	*
				1292	* Note: @test is called with the inode_hash_lock held, so can't sleep.
				1293	*/
				1294	struct inode ilookup5(struct super_block sb, unsigned long hashval,
				1295	int (test)(struct inode , void ), void data)
				1296	{
				1297	struct inode *inode;
				1298	again:
				1299	inode = ilookup5_nowait(sb, hashval, test, data);
				1300	if (inode) {
				1301	wait_on_inode(inode);
				1302	if (unlikely(inode_unhashed(inode))) {
				1303	iput(inode);
				1304	goto again;
				1305	}
				1306	}
				1307	return inode;
				1308	}
				1309	EXPORT_SYMBOL(ilookup5);
				1310
				1311	/**
				1312	* ilookup - search for an inode in the inode cache
				1313	* @sb: super block of file system to search
				1314	* @ino: inode number to search for
				1315	*
				1316	* Search for the inode @ino in the inode cache, and if the inode is in the
				1317	* cache, the inode is returned with an incremented reference count.
				1318	*/
				1319	struct inode ilookup(struct super_block sb, unsigned long ino)
				1320	{
				1321	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1322	struct inode *inode;
				1323	again:
				1324	spin_lock(&inode_hash_lock);
				1325	inode = find_inode_fast(sb, head, ino);
				1326	spin_unlock(&inode_hash_lock);
				1327
				1328	if (inode) {
				1329	wait_on_inode(inode);
				1330	if (unlikely(inode_unhashed(inode))) {
				1331	iput(inode);
				1332	goto again;
				1333	}
				1334	}
				1335	return inode;
				1336	}
				1337	EXPORT_SYMBOL(ilookup);
				1338
				1339	/**
				1340	* find_inode_nowait - find an inode in the inode cache
				1341	* @sb: super block of file system to search
				1342	* @hashval: hash value (usually inode number) to search for
				1343	* @match: callback used for comparisons between inodes
				1344	* @data: opaque data pointer to pass to @match
				1345	*
				1346	* Search for the inode specified by @hashval and @data in the inode
				1347	* cache, where the helper function @match will return 0 if the inode
				1348	* does not match, 1 if the inode does match, and -1 if the search
				1349	* should be stopped. The @match function must be responsible for
				1350	* taking the i_lock spin_lock and checking i_state for an inode being
				1351	* freed or being initialized, and incrementing the reference count
				1352	* before returning 1. It also must not sleep, since it is called with
				1353	* the inode_hash_lock spinlock held.
				1354	*
				1355	* This is a even more generalized version of ilookup5() when the
				1356	* function must never block --- find_inode() can block in
				1357	* __wait_on_freeing_inode() --- or when the caller can not increment
				1358	* the reference count because the resulting iput() might cause an
				1359	* inode eviction. The tradeoff is that the @match funtion must be
				1360	* very carefully implemented.
				1361	*/
				1362	struct inode find_inode_nowait(struct super_block sb,
				1363	unsigned long hashval,
				1364	int (match)(struct inode , unsigned long,
				1365	void *),
				1366	void *data)
				1367	{
				1368	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1369	struct inode inode, ret_inode = NULL;
				1370	int mval;
				1371
				1372	spin_lock(&inode_hash_lock);
				1373	hlist_for_each_entry(inode, head, i_hash) {
				1374	if (inode->i_sb != sb)
				1375	continue;
				1376	mval = match(inode, hashval, data);
				1377	if (mval == 0)
				1378	continue;
				1379	if (mval == 1)
				1380	ret_inode = inode;
				1381	goto out;
				1382	}
				1383	out:
				1384	spin_unlock(&inode_hash_lock);
				1385	return ret_inode;
				1386	}
				1387	EXPORT_SYMBOL(find_inode_nowait);
				1388
				1389	int insert_inode_locked(struct inode *inode)
				1390	{
				1391	struct super_block *sb = inode->i_sb;
				1392	ino_t ino = inode->i_ino;
				1393	struct hlist_head *head = inode_hashtable + hash(sb, ino);
				1394
				1395	while (1) {
				1396	struct inode *old = NULL;
				1397	spin_lock(&inode_hash_lock);
				1398	hlist_for_each_entry(old, head, i_hash) {
				1399	if (old->i_ino != ino)
				1400	continue;
				1401	if (old->i_sb != sb)
				1402	continue;
				1403	spin_lock(&old->i_lock);
				1404	if (old->i_state & (I_FREEING\|I_WILL_FREE)) {
				1405	spin_unlock(&old->i_lock);
				1406	continue;
				1407	}
				1408	break;
				1409	}
				1410	if (likely(!old)) {
				1411	spin_lock(&inode->i_lock);
				1412	inode->i_state \|= I_NEW;
				1413	hlist_add_head(&inode->i_hash, head);
				1414	spin_unlock(&inode->i_lock);
				1415	spin_unlock(&inode_hash_lock);
				1416	return 0;
				1417	}
				1418	__iget(old);
				1419	spin_unlock(&old->i_lock);
				1420	spin_unlock(&inode_hash_lock);
				1421	wait_on_inode(old);
				1422	if (unlikely(!inode_unhashed(old))) {
				1423	iput(old);
				1424	return -EBUSY;
				1425	}
				1426	iput(old);
				1427	}
				1428	}
				1429	EXPORT_SYMBOL(insert_inode_locked);
				1430
				1431	int insert_inode_locked4(struct inode *inode, unsigned long hashval,
				1432	int (test)(struct inode , void ), void data)
				1433	{
				1434	struct super_block *sb = inode->i_sb;
				1435	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
				1436
				1437	while (1) {
				1438	struct inode *old = NULL;
				1439
				1440	spin_lock(&inode_hash_lock);
				1441	hlist_for_each_entry(old, head, i_hash) {
				1442	if (old->i_sb != sb)
				1443	continue;
				1444	if (!test(old, data))
				1445	continue;
				1446	spin_lock(&old->i_lock);
				1447	if (old->i_state & (I_FREEING\|I_WILL_FREE)) {
				1448	spin_unlock(&old->i_lock);
				1449	continue;
				1450	}
				1451	break;
				1452	}
				1453	if (likely(!old)) {
				1454	spin_lock(&inode->i_lock);
				1455	inode->i_state \|= I_NEW;
				1456	hlist_add_head(&inode->i_hash, head);
				1457	spin_unlock(&inode->i_lock);
				1458	spin_unlock(&inode_hash_lock);
				1459	return 0;
				1460	}
				1461	__iget(old);
				1462	spin_unlock(&old->i_lock);
				1463	spin_unlock(&inode_hash_lock);
				1464	wait_on_inode(old);
				1465	if (unlikely(!inode_unhashed(old))) {
				1466	iput(old);
				1467	return -EBUSY;
				1468	}
				1469	iput(old);
				1470	}
				1471	}
				1472	EXPORT_SYMBOL(insert_inode_locked4);
				1473
				1474
				1475	int generic_delete_inode(struct inode *inode)
				1476	{
				1477	return 1;
				1478	}
				1479	EXPORT_SYMBOL(generic_delete_inode);
				1480
				1481	/*
				1482	* Called when we're dropping the last reference
				1483	* to an inode.
				1484	*
				1485	* Call the FS "drop_inode()" function, defaulting to
				1486	* the legacy UNIX filesystem behaviour. If it tells
				1487	* us to evict inode, do so. Otherwise, retain inode
				1488	* in cache if fs is alive, sync and evict if fs is
				1489	* shutting down.
				1490	*/
				1491	static void iput_final(struct inode *inode)
				1492	{
				1493	struct super_block *sb = inode->i_sb;
				1494	const struct super_operations *op = inode->i_sb->s_op;
				1495	int drop;
				1496
				1497	WARN_ON(inode->i_state & I_NEW);
				1498
				1499	if (op->drop_inode)
				1500	drop = op->drop_inode(inode);
				1501	else
				1502	drop = generic_drop_inode(inode);
				1503
				1504	if (!drop && (sb->s_flags & MS_ACTIVE)) {
				1505	inode_add_lru(inode);
				1506	spin_unlock(&inode->i_lock);
				1507	return;
				1508	}
				1509
				1510	if (!drop) {
				1511	inode->i_state \|= I_WILL_FREE;
				1512	spin_unlock(&inode->i_lock);
				1513	write_inode_now(inode, 1);
				1514	spin_lock(&inode->i_lock);
				1515	WARN_ON(inode->i_state & I_NEW);
				1516	inode->i_state &= ~I_WILL_FREE;
				1517	}
				1518
				1519	inode->i_state \|= I_FREEING;
				1520	if (!list_empty(&inode->i_lru))
				1521	inode_lru_list_del(inode);
				1522	spin_unlock(&inode->i_lock);
				1523
				1524	evict(inode);
				1525	}
				1526
				1527	/**
				1528	* iput - put an inode
				1529	* @inode: inode to put
				1530	*
				1531	* Puts an inode, dropping its usage count. If the inode use count hits
				1532	* zero, the inode is then freed and may also be destroyed.
				1533	*
				1534	* Consequently, iput() can sleep.
				1535	*/
				1536	void iput(struct inode *inode)
				1537	{
				1538	if (!inode)
				1539	return;
				1540	BUG_ON(inode->i_state & I_CLEAR);
				1541	retry:
				1542	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
				1543	if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
				1544	atomic_inc(&inode->i_count);
				1545	inode->i_state &= ~I_DIRTY_TIME;
				1546	spin_unlock(&inode->i_lock);
				1547	trace_writeback_lazytime_iput(inode);
				1548	mark_inode_dirty_sync(inode);
				1549	goto retry;
				1550	}
				1551	iput_final(inode);
				1552	}
				1553	}
				1554	EXPORT_SYMBOL(iput);
				1555
				1556	/**
				1557	* bmap - find a block number in a file
				1558	* @inode: inode of file
				1559	* @block: block to find
				1560	*
				1561	* Returns the block number on the device holding the inode that
				1562	* is the disk block number for the block of the file requested.
				1563	* That is, asked for block 4 of inode 1 the function will return the
				1564	* disk block relative to the disk start that holds that block of the
				1565	* file.
				1566	*/
				1567	sector_t bmap(struct inode *inode, sector_t block)
				1568	{
				1569	sector_t res = 0;
				1570	if (inode->i_mapping->a_ops->bmap)
				1571	res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
				1572	return res;
				1573	}
				1574	EXPORT_SYMBOL(bmap);
				1575
				1576	/*
				1577	* Update times in overlayed inode from underlying real inode
				1578	*/
				1579	static void update_ovl_inode_times(struct dentry dentry, struct inode inode,
				1580	bool rcu)
				1581	{
				1582	struct dentry *upperdentry;
				1583
				1584	/*
				1585	* Nothing to do if in rcu or if non-overlayfs
				1586	*/
				1587	if (rcu \|\| likely(!(dentry->d_flags & DCACHE_OP_REAL)))
				1588	return;
				1589
				1590	upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
				1591
				1592	/*
				1593	* If file is on lower then we can't update atime, so no worries about
				1594	* stale mtime/ctime.
				1595	*/
				1596	if (upperdentry) {
				1597	struct inode *realinode = d_inode(upperdentry);
				1598
				1599	if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) \|\|
				1600	!timespec_equal(&inode->i_ctime, &realinode->i_ctime))) {
				1601	inode->i_mtime = realinode->i_mtime;
				1602	inode->i_ctime = realinode->i_ctime;
				1603	}
				1604	}
				1605	}
				1606
				1607	/*
				1608	* With relative atime, only update atime if the previous atime is
				1609	* earlier than either the ctime or mtime or if at least a day has
				1610	* passed since the last atime update.
				1611	*/
				1612	static int relatime_need_update(const struct path path, struct inode inode,
				1613	struct timespec now, bool rcu)
				1614	{
				1615
				1616	if (!(path->mnt->mnt_flags & MNT_RELATIME))
				1617	return 1;
				1618
				1619	update_ovl_inode_times(path->dentry, inode, rcu);
				1620	/*
				1621	* Is mtime younger than atime? If yes, update atime:
				1622	*/
				1623	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
				1624	return 1;
				1625	/*
				1626	* Is ctime younger than atime? If yes, update atime:
				1627	*/
				1628	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
				1629	return 1;
				1630
				1631	/*
				1632	* Is the previous atime value older than a day? If yes,
				1633	* update atime:
				1634	*/
				1635	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 246060)
				1636	return 1;
				1637	/*
				1638	* Good, we can skip the atime update:
				1639	*/
				1640	return 0;
				1641	}
				1642
				1643	int generic_update_time(struct inode inode, struct timespec time, int flags)
				1644	{
				1645	int iflags = I_DIRTY_TIME;
				1646
				1647	if (flags & S_ATIME)
				1648	inode->i_atime = *time;
				1649	if (flags & S_VERSION)
				1650	inode_inc_iversion(inode);
				1651	if (flags & S_CTIME)
				1652	inode->i_ctime = *time;
				1653	if (flags & S_MTIME)
				1654	inode->i_mtime = *time;
				1655
				1656	if (!(inode->i_sb->s_flags & MS_LAZYTIME) \|\| (flags & S_VERSION))
				1657	iflags \|= I_DIRTY_SYNC;
				1658	__mark_inode_dirty(inode, iflags);
				1659	return 0;
				1660	}
				1661	EXPORT_SYMBOL(generic_update_time);
				1662
				1663	/*
				1664	* This does the actual work of updating an inodes time or version. Must have
				1665	* had called mnt_want_write() before calling this.
				1666	*/
				1667	static int update_time(struct inode inode, struct timespec time, int flags)
				1668	{
				1669	int (update_time)(struct inode , struct timespec *, int);
				1670
				1671	update_time = inode->i_op->update_time ? inode->i_op->update_time :
				1672	generic_update_time;
				1673
				1674	return update_time(inode, time, flags);
				1675	}
				1676
				1677	/**
				1678	* touch_atime - update the access time
				1679	* @path: the &struct path to update
				1680	* @inode: inode to update
				1681	*
				1682	* Update the accessed time on an inode and mark it for writeback.
				1683	* This function automatically handles read only file systems and media,
				1684	* as well as the "noatime" flag and inode specific "noatime" markers.
				1685	*/
				1686	bool __atime_needs_update(const struct path path, struct inode inode,
				1687	bool rcu)
				1688	{
				1689	struct vfsmount *mnt = path->mnt;
				1690	struct timespec now;
				1691
				1692	if (inode->i_flags & S_NOATIME)
				1693	return false;
				1694
				1695	/* Atime updates will likely cause i_uid and i_gid to be written
				1696	* back improprely if their true value is unknown to the vfs.
				1697	*/
				1698	if (HAS_UNMAPPED_ID(inode))
				1699	return false;
				1700
				1701	if (IS_NOATIME(inode))
				1702	return false;
				1703	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
				1704	return false;
				1705
				1706	if (mnt->mnt_flags & MNT_NOATIME)
				1707	return false;
				1708	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
				1709	return false;
				1710
				1711	now = current_time(inode);
				1712
				1713	if (!relatime_need_update(path, inode, now, rcu))
				1714	return false;
				1715
				1716	if (timespec_equal(&inode->i_atime, &now))
				1717	return false;
				1718
				1719	return true;
				1720	}
				1721
				1722	void touch_atime(const struct path *path)
				1723	{
				1724	struct vfsmount *mnt = path->mnt;
				1725	struct inode *inode = d_inode(path->dentry);
				1726	struct timespec now;
				1727
				1728	if (!__atime_needs_update(path, inode, false))
				1729	return;
				1730
				1731	if (!sb_start_write_trylock(inode->i_sb))
				1732	return;
				1733
				1734	if (__mnt_want_write(mnt) != 0)
				1735	goto skip_update;
				1736	/*
				1737	* File systems can error out when updating inodes if they need to
				1738	* allocate new space to modify an inode (such is the case for
				1739	* Btrfs), but since we touch atime while walking down the path we
				1740	* really don't care if we failed to update the atime of the file,
				1741	* so just ignore the return value.
				1742	* We may also fail on filesystems that have the ability to make parts
				1743	* of the fs read only, e.g. subvolumes in Btrfs.
				1744	*/
				1745	now = current_time(inode);
				1746	update_time(inode, &now, S_ATIME);
				1747	__mnt_drop_write(mnt);
				1748	skip_update:
				1749	sb_end_write(inode->i_sb);
				1750	}
				1751	EXPORT_SYMBOL(touch_atime);
				1752
				1753	/*
				1754	* The logic we want is
				1755	*
				1756	* if suid or (sgid and xgrp)
				1757	* remove privs
				1758	*/
				1759	int should_remove_suid(struct dentry *dentry)
				1760	{
				1761	umode_t mode = d_inode(dentry)->i_mode;
				1762	int kill = 0;
				1763
				1764	/* suid always must be killed */
				1765	if (unlikely(mode & S_ISUID))
				1766	kill = ATTR_KILL_SUID;
				1767
				1768	/*
				1769	* sgid without any exec bits is just a mandatory locking mark; leave
				1770	* it alone. If some exec bits are set, it's a real sgid; kill it.
				1771	*/
				1772	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				1773	kill \|= ATTR_KILL_SGID;
				1774
				1775	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
				1776	return kill;
				1777
				1778	return 0;
				1779	}
				1780	EXPORT_SYMBOL(should_remove_suid);
				1781
				1782	/*
				1783	* Return mask of changes for notify_change() that need to be done as a
				1784	* response to write or truncate. Return 0 if nothing has to be changed.
				1785	* Negative value on error (change should be denied).
				1786	*/
				1787	int dentry_needs_remove_privs(struct dentry *dentry)
				1788	{
				1789	struct inode *inode = d_inode(dentry);
				1790	int mask = 0;
				1791	int ret;
				1792
				1793	if (IS_NOSEC(inode))
				1794	return 0;
				1795
				1796	mask = should_remove_suid(dentry);
				1797	ret = security_inode_need_killpriv(dentry);
				1798	if (ret < 0)
				1799	return ret;
				1800	if (ret)
				1801	mask \|= ATTR_KILL_PRIV;
				1802	return mask;
				1803	}
				1804
				1805	static int __remove_privs(struct vfsmount mnt, struct dentry dentry, int kill)
				1806	{
				1807	struct iattr newattrs;
				1808
				1809	newattrs.ia_valid = ATTR_FORCE \| kill;
				1810	/*
				1811	* Note we call this on write, so notify_change will not
				1812	* encounter any conflicting delegations:
				1813	*/
				1814	return notify_change2(mnt, dentry, &newattrs, NULL);
				1815	}
				1816
				1817	/*
				1818	* Remove special file priviledges (suid, capabilities) when file is written
				1819	* to or truncated.
				1820	*/
				1821	int file_remove_privs(struct file *file)
				1822	{
				1823	struct dentry *dentry = file_dentry(file);
				1824	struct inode *inode = file_inode(file);
				1825	int kill;
				1826	int error = 0;
				1827
				1828	/*
				1829	* Fast path for nothing security related.
				1830	* As well for non-regular files, e.g. blkdev inodes.
				1831	* For example, blkdev_write_iter() might get here
				1832	* trying to remove privs which it is not allowed to.
				1833	*/
				1834	if (IS_NOSEC(inode) \|\| !S_ISREG(inode->i_mode))
				1835	return 0;
				1836
				1837	kill = dentry_needs_remove_privs(dentry);
				1838	if (kill < 0)
				1839	return kill;
				1840	if (kill)
				1841	error = __remove_privs(file->f_path.mnt, dentry, kill);
				1842	if (!error)
				1843	inode_has_no_xattr(inode);
				1844
				1845	return error;
				1846	}
				1847	EXPORT_SYMBOL(file_remove_privs);
				1848
				1849	/**
				1850	* file_update_time - update mtime and ctime time
				1851	* @file: file accessed
				1852	*
				1853	* Update the mtime and ctime members of an inode and mark the inode
				1854	* for writeback. Note that this function is meant exclusively for
				1855	* usage in the file write path of filesystems, and filesystems may
				1856	* choose to explicitly ignore update via this function with the
				1857	* S_NOCMTIME inode flag, e.g. for network filesystem where these
				1858	* timestamps are handled by the server. This can return an error for
				1859	* file systems who need to allocate space in order to update an inode.
				1860	*/
				1861
				1862	int file_update_time(struct file *file)
				1863	{
				1864	struct inode *inode = file_inode(file);
				1865	struct timespec now;
				1866	int sync_it = 0;
				1867	int ret;
				1868
				1869	/* First try to exhaust all avenues to not sync */
				1870	if (IS_NOCMTIME(inode))
				1871	return 0;
				1872
				1873	now = current_time(inode);
				1874	if (!timespec_equal(&inode->i_mtime, &now))
				1875	sync_it = S_MTIME;
				1876
				1877	if (!timespec_equal(&inode->i_ctime, &now))
				1878	sync_it \|= S_CTIME;
				1879
				1880	if (IS_I_VERSION(inode))
				1881	sync_it \|= S_VERSION;
				1882
				1883	if (!sync_it)
				1884	return 0;
				1885
				1886	/* Finally allowed to write? Takes lock. */
				1887	if (__mnt_want_write_file(file))
				1888	return 0;
				1889
				1890	ret = update_time(inode, &now, sync_it);
				1891	__mnt_drop_write_file(file);
				1892
				1893	return ret;
				1894	}
				1895	EXPORT_SYMBOL(file_update_time);
				1896
				1897	int inode_needs_sync(struct inode *inode)
				1898	{
				1899	if (IS_SYNC(inode))
				1900	return 1;
				1901	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
				1902	return 1;
				1903	return 0;
				1904	}
				1905	EXPORT_SYMBOL(inode_needs_sync);
				1906
				1907	/*
				1908	* If we try to find an inode in the inode hash while it is being
				1909	* deleted, we have to wait until the filesystem completes its
				1910	* deletion before reporting that it isn't found. This function waits
				1911	* until the deletion _might_ have completed. Callers are responsible
				1912	* to recheck inode state.
				1913	*
				1914	* It doesn't matter if I_NEW is not set initially, a call to
				1915	* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
				1916	* will DTRT.
				1917	*/
				1918	static void __wait_on_freeing_inode(struct inode *inode)
				1919	{
				1920	wait_queue_head_t *wq;
				1921	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
				1922	wq = bit_waitqueue(&inode->i_state, __I_NEW);
				1923	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
				1924	spin_unlock(&inode->i_lock);
				1925	spin_unlock(&inode_hash_lock);
				1926	schedule();
				1927	finish_wait(wq, &wait.wq_entry);
				1928	spin_lock(&inode_hash_lock);
				1929	}
				1930
				1931	static __initdata unsigned long ihash_entries;
				1932	static int __init set_ihash_entries(char *str)
				1933	{
				1934	if (!str)
				1935	return 0;
				1936	ihash_entries = simple_strtoul(str, &str, 0);
				1937	return 1;
				1938	}
				1939	__setup("ihash_entries=", set_ihash_entries);
				1940
				1941	/*
				1942	* Initialize the waitqueues and inode hash table.
				1943	*/
				1944	void __init inode_init_early(void)
				1945	{
				1946	/* If hashes are distributed across NUMA nodes, defer
				1947	* hash allocation until vmalloc space is available.
				1948	*/
				1949	if (hashdist)
				1950	return;
				1951
				1952	inode_hashtable =
				1953	alloc_large_system_hash("Inode-cache",
				1954	sizeof(struct hlist_head),
				1955	ihash_entries,
				1956	14,
				1957	HASH_EARLY \| HASH_ZERO,
				1958	&i_hash_shift,
				1959	&i_hash_mask,
				1960	0,
				1961	0);
				1962	}
				1963
				1964	void __init inode_init(void)
				1965	{
				1966	/* inode slab cache */
				1967	inode_cachep = kmem_cache_create("inode_cache",
				1968	sizeof(struct inode),
				1969	0,
				1970	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				1971	SLAB_MEM_SPREAD\|SLAB_ACCOUNT),
				1972	init_once);
				1973
				1974	/* Hash may have been set up in inode_init_early */
				1975	if (!hashdist)
				1976	return;
				1977
				1978	inode_hashtable =
				1979	alloc_large_system_hash("Inode-cache",
				1980	sizeof(struct hlist_head),
				1981	ihash_entries,
				1982	14,
				1983	HASH_ZERO,
				1984	&i_hash_shift,
				1985	&i_hash_mask,
				1986	0,
				1987	0);
				1988	}
				1989
				1990	void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
				1991	{
				1992	inode->i_mode = mode;
				1993	if (S_ISCHR(mode)) {
				1994	inode->i_fop = &def_chr_fops;
				1995	inode->i_rdev = rdev;
				1996	} else if (S_ISBLK(mode)) {
				1997	inode->i_fop = &def_blk_fops;
				1998	inode->i_rdev = rdev;
				1999	} else if (S_ISFIFO(mode))
				2000	inode->i_fop = &pipefifo_fops;
				2001	else if (S_ISSOCK(mode))
				2002	; /* leave it no_open_fops */
				2003	else
				2004	printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				2005	" inode %s:%lu\n", mode, inode->i_sb->s_id,
				2006	inode->i_ino);
				2007	}
				2008	EXPORT_SYMBOL(init_special_inode);
				2009
				2010	/**
				2011	* inode_init_owner - Init uid,gid,mode for new inode according to posix standards
				2012	* @inode: New inode
				2013	* @dir: Directory inode
				2014	* @mode: mode of the new inode
				2015	*/
				2016	void inode_init_owner(struct inode inode, const struct inode dir,
				2017	umode_t mode)
				2018	{
				2019	inode->i_uid = current_fsuid();
				2020	if (dir && dir->i_mode & S_ISGID) {
				2021	inode->i_gid = dir->i_gid;
				2022
				2023	/* Directories are special, and always inherit S_ISGID */
				2024	if (S_ISDIR(mode))
				2025	mode \|= S_ISGID;
				2026	else if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP) &&
				2027	!in_group_p(inode->i_gid) &&
				2028	!capable_wrt_inode_uidgid(dir, CAP_FSETID))
				2029	mode &= ~S_ISGID;
				2030	} else
				2031	inode->i_gid = current_fsgid();
				2032	inode->i_mode = mode;
				2033	}
				2034	EXPORT_SYMBOL(inode_init_owner);
				2035
				2036	/**
				2037	* inode_owner_or_capable - check current task permissions to inode
				2038	* @inode: inode being checked
				2039	*
				2040	* Return true if current either has CAP_FOWNER in a namespace with the
				2041	* inode owner uid mapped, or owns the file.
				2042	*/
				2043	bool inode_owner_or_capable(const struct inode *inode)
				2044	{
				2045	struct user_namespace *ns;
				2046
				2047	if (uid_eq(current_fsuid(), inode->i_uid))
				2048	return true;
				2049
				2050	ns = current_user_ns();
				2051	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
				2052	return true;
				2053	return false;
				2054	}
				2055	EXPORT_SYMBOL(inode_owner_or_capable);
				2056
				2057	/*
				2058	* Direct i/o helper functions
				2059	*/
				2060	static void __inode_dio_wait(struct inode *inode)
				2061	{
				2062	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
				2063	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
				2064
				2065	do {
				2066	prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
				2067	if (atomic_read(&inode->i_dio_count))
				2068	schedule();
				2069	} while (atomic_read(&inode->i_dio_count));
				2070	finish_wait(wq, &q.wq_entry);
				2071	}
				2072
				2073	/**
				2074	* inode_dio_wait - wait for outstanding DIO requests to finish
				2075	* @inode: inode to wait for
				2076	*
				2077	* Waits for all pending direct I/O requests to finish so that we can
				2078	* proceed with a truncate or equivalent operation.
				2079	*
				2080	* Must be called under a lock that serializes taking new references
				2081	* to i_dio_count, usually by inode->i_mutex.
				2082	*/
				2083	void inode_dio_wait(struct inode *inode)
				2084	{
				2085	if (atomic_read(&inode->i_dio_count))
				2086	__inode_dio_wait(inode);
				2087	}
				2088	EXPORT_SYMBOL(inode_dio_wait);
				2089
				2090	/*
				2091	* inode_set_flags - atomically set some inode flags
				2092	*
				2093	* Note: the caller should be holding i_mutex, or else be sure that
				2094	* they have exclusive access to the inode structure (i.e., while the
				2095	* inode is being instantiated). The reason for the cmpxchg() loop
				2096	* --- which wouldn't be necessary if all code paths which modify
				2097	* i_flags actually followed this rule, is that there is at least one
				2098	* code path which doesn't today so we use cmpxchg() out of an abundance
				2099	* of caution.
				2100	*
				2101	* In the long run, i_mutex is overkill, and we should probably look
				2102	* at using the i_lock spinlock to protect i_flags, and then make sure
				2103	* it is so documented in include/linux/fs.h and that all code follows
				2104	* the locking convention!!
				2105	*/
				2106	void inode_set_flags(struct inode *inode, unsigned int flags,
				2107	unsigned int mask)
				2108	{
				2109	unsigned int old_flags, new_flags;
				2110
				2111	WARN_ON_ONCE(flags & ~mask);
				2112	do {
				2113	old_flags = ACCESS_ONCE(inode->i_flags);
				2114	new_flags = (old_flags & ~mask) \| flags;
				2115	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
				2116	new_flags) != old_flags));
				2117	}
				2118	EXPORT_SYMBOL(inode_set_flags);
				2119
				2120	void inode_nohighmem(struct inode *inode)
				2121	{
				2122	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
				2123	}
				2124	EXPORT_SYMBOL(inode_nohighmem);
				2125
				2126	/**
				2127	* current_time - Return FS time
				2128	* @inode: inode.
				2129	*
				2130	* Return the current time truncated to the time granularity supported by
				2131	* the fs.
				2132	*
				2133	* Note that inode and inode->sb cannot be NULL.
				2134	* Otherwise, the function warns and returns time without truncation.
				2135	*/
				2136	struct timespec current_time(struct inode *inode)
				2137	{
				2138	struct timespec now = current_kernel_time();
				2139
				2140	if (unlikely(!inode->i_sb)) {
				2141	WARN(1, "current_time() called with uninitialized super_block in the inode");
				2142	return now;
				2143	}
				2144
				2145	return timespec_trunc(now, inode->i_sb->s_time_gran);
				2146	}
				2147	EXPORT_SYMBOL(current_time);