Blame - marvell/linux/fs/ocfs2/dlmglue.c - T108

blob: 97409312ebb1a27223f84b03c00becddce0d98b1 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-or-later
				2	/* -*- mode: c; c-basic-offset: 8; -*-
				3	* vim: noexpandtab sw=8 ts=8 sts=0:
				4	*
				5	* dlmglue.c
				6	*
				7	* Code which implements an OCFS2 specific interface to our DLM.
				8	*
				9	* Copyright (C) 2003, 2004 Oracle. All rights reserved.
				10	*/
				11
				12	#include <linux/types.h>
				13	#include <linux/slab.h>
				14	#include <linux/highmem.h>
				15	#include <linux/mm.h>
				16	#include <linux/kthread.h>
				17	#include <linux/pagemap.h>
				18	#include <linux/debugfs.h>
				19	#include <linux/seq_file.h>
				20	#include <linux/time.h>
				21	#include <linux/quotaops.h>
				22	#include <linux/sched/signal.h>
				23
				24	#define MLOG_MASK_PREFIX ML_DLM_GLUE
				25	#include <cluster/masklog.h>
				26
				27	#include "ocfs2.h"
				28	#include "ocfs2_lockingver.h"
				29
				30	#include "alloc.h"
				31	#include "dcache.h"
				32	#include "dlmglue.h"
				33	#include "extent_map.h"
				34	#include "file.h"
				35	#include "heartbeat.h"
				36	#include "inode.h"
				37	#include "journal.h"
				38	#include "stackglue.h"
				39	#include "slot_map.h"
				40	#include "super.h"
				41	#include "uptodate.h"
				42	#include "quota.h"
				43	#include "refcounttree.h"
				44	#include "acl.h"
				45
				46	#include "buffer_head_io.h"
				47
				48	struct ocfs2_mask_waiter {
				49	struct list_head mw_item;
				50	int mw_status;
				51	struct completion mw_complete;
				52	unsigned long mw_mask;
				53	unsigned long mw_goal;
				54	#ifdef CONFIG_OCFS2_FS_STATS
				55	ktime_t mw_lock_start;
				56	#endif
				57	};
				58
				59	static struct ocfs2_super ocfs2_get_dentry_osb(struct ocfs2_lock_res lockres);
				60	static struct ocfs2_super ocfs2_get_inode_osb(struct ocfs2_lock_res lockres);
				61	static struct ocfs2_super ocfs2_get_file_osb(struct ocfs2_lock_res lockres);
				62	static struct ocfs2_super ocfs2_get_qinfo_osb(struct ocfs2_lock_res lockres);
				63
				64	/*
				65	* Return value from ->downconvert_worker functions.
				66	*
				67	* These control the precise actions of ocfs2_unblock_lock()
				68	* and ocfs2_process_blocked_lock()
				69	*
				70	*/
				71	enum ocfs2_unblock_action {
				72	UNBLOCK_CONTINUE = 0, /* Continue downconvert */
				73	UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire
				74	* ->post_unlock callback */
				75	UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire
				76	* ->post_unlock() callback. */
				77	};
				78
				79	struct ocfs2_unblock_ctl {
				80	int requeue;
				81	enum ocfs2_unblock_action unblock_action;
				82	};
				83
				84	/* Lockdep class keys */
				85	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				86	static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
				87	#endif
				88
				89	static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
				90	int new_level);
				91	static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
				92
				93	static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				94	int blocking);
				95
				96	static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
				97	int blocking);
				98
				99	static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
				100	struct ocfs2_lock_res *lockres);
				101
				102	static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
				103
				104	static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
				105	int new_level);
				106	static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
				107	int blocking);
				108
				109	#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
				110
				111	/* This aids in debugging situations where a bad LVB might be involved. */
				112	static void ocfs2_dump_meta_lvb_info(u64 level,
				113	const char *function,
				114	unsigned int line,
				115	struct ocfs2_lock_res *lockres)
				116	{
				117	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				118
				119	mlog(level, "LVB information for %s (called from %s:%u):\n",
				120	lockres->l_name, function, line);
				121	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
				122	lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
				123	be32_to_cpu(lvb->lvb_igeneration));
				124	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
				125	(unsigned long long)be64_to_cpu(lvb->lvb_isize),
				126	be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
				127	be16_to_cpu(lvb->lvb_imode));
				128	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
				129	"mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
				130	(long long)be64_to_cpu(lvb->lvb_iatime_packed),
				131	(long long)be64_to_cpu(lvb->lvb_ictime_packed),
				132	(long long)be64_to_cpu(lvb->lvb_imtime_packed),
				133	be32_to_cpu(lvb->lvb_iattr));
				134	}
				135
				136
				137	/*
				138	* OCFS2 Lock Resource Operations
				139	*
				140	* These fine tune the behavior of the generic dlmglue locking infrastructure.
				141	*
				142	* The most basic of lock types can point ->l_priv to their respective
				143	* struct ocfs2_super and allow the default actions to manage things.
				144	*
				145	* Right now, each lock type also needs to implement an init function,
				146	* and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
				147	* should be called when the lock is no longer needed (i.e., object
				148	* destruction time).
				149	*/
				150	struct ocfs2_lock_res_ops {
				151	/*
				152	* Translate an ocfs2_lock_res * into an ocfs2_super *. Define
				153	* this callback if ->l_priv is not an ocfs2_super pointer
				154	*/
				155	struct ocfs2_super * (get_osb)(struct ocfs2_lock_res );
				156
				157	/*
				158	* Optionally called in the downconvert thread after a
				159	* successful downconvert. The lockres will not be referenced
				160	* after this callback is called, so it is safe to free
				161	* memory, etc.
				162	*
				163	* The exact semantics of when this is called are controlled
				164	* by ->downconvert_worker()
				165	*/
				166	void (post_unlock)(struct ocfs2_super , struct ocfs2_lock_res *);
				167
				168	/*
				169	* Allow a lock type to add checks to determine whether it is
				170	* safe to downconvert a lock. Return 0 to re-queue the
				171	* downconvert at a later time, nonzero to continue.
				172	*
				173	* For most locks, the default checks that there are no
				174	* incompatible holders are sufficient.
				175	*
				176	* Called with the lockres spinlock held.
				177	*/
				178	int (check_downconvert)(struct ocfs2_lock_res , int);
				179
				180	/*
				181	* Allows a lock type to populate the lock value block. This
				182	* is called on downconvert, and when we drop a lock.
				183	*
				184	* Locks that want to use this should set LOCK_TYPE_USES_LVB
				185	* in the flags field.
				186	*
				187	* Called with the lockres spinlock held.
				188	*/
				189	void (set_lvb)(struct ocfs2_lock_res );
				190
				191	/*
				192	* Called from the downconvert thread when it is determined
				193	* that a lock will be downconverted. This is called without
				194	* any locks held so the function can do work that might
				195	* schedule (syncing out data, etc).
				196	*
				197	* This should return any one of the ocfs2_unblock_action
				198	* values, depending on what it wants the thread to do.
				199	*/
				200	int (downconvert_worker)(struct ocfs2_lock_res , int);
				201
				202	/*
				203	* LOCK_TYPE_* flags which describe the specific requirements
				204	* of a lock type. Descriptions of each individual flag follow.
				205	*/
				206	int flags;
				207	};
				208
				209	/*
				210	* Some locks want to "refresh" potentially stale data when a
				211	* meaningful (PRMODE or EXMODE) lock level is first obtained. If this
				212	* flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
				213	* individual lockres l_flags member from the ast function. It is
				214	* expected that the locking wrapper will clear the
				215	* OCFS2_LOCK_NEEDS_REFRESH flag when done.
				216	*/
				217	#define LOCK_TYPE_REQUIRES_REFRESH 0x1
				218
				219	/*
				220	* Indicate that a lock type makes use of the lock value block. The
				221	* ->set_lvb lock type callback must be defined.
				222	*/
				223	#define LOCK_TYPE_USES_LVB 0x2
				224
				225	static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
				226	.get_osb = ocfs2_get_inode_osb,
				227	.flags = 0,
				228	};
				229
				230	static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
				231	.get_osb = ocfs2_get_inode_osb,
				232	.check_downconvert = ocfs2_check_meta_downconvert,
				233	.set_lvb = ocfs2_set_meta_lvb,
				234	.downconvert_worker = ocfs2_data_convert_worker,
				235	.flags = LOCK_TYPE_REQUIRES_REFRESH\|LOCK_TYPE_USES_LVB,
				236	};
				237
				238	static struct ocfs2_lock_res_ops ocfs2_super_lops = {
				239	.flags = LOCK_TYPE_REQUIRES_REFRESH,
				240	};
				241
				242	static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
				243	.flags = 0,
				244	};
				245
				246	static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
				247	.flags = 0,
				248	};
				249
				250	static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
				251	.flags = LOCK_TYPE_REQUIRES_REFRESH\|LOCK_TYPE_USES_LVB,
				252	};
				253
				254	static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
				255	.flags = LOCK_TYPE_REQUIRES_REFRESH\|LOCK_TYPE_USES_LVB,
				256	};
				257
				258	static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
				259	.get_osb = ocfs2_get_dentry_osb,
				260	.post_unlock = ocfs2_dentry_post_unlock,
				261	.downconvert_worker = ocfs2_dentry_convert_worker,
				262	.flags = 0,
				263	};
				264
				265	static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
				266	.get_osb = ocfs2_get_inode_osb,
				267	.flags = 0,
				268	};
				269
				270	static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
				271	.get_osb = ocfs2_get_file_osb,
				272	.flags = 0,
				273	};
				274
				275	static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
				276	.set_lvb = ocfs2_set_qinfo_lvb,
				277	.get_osb = ocfs2_get_qinfo_osb,
				278	.flags = LOCK_TYPE_REQUIRES_REFRESH \| LOCK_TYPE_USES_LVB,
				279	};
				280
				281	static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
				282	.check_downconvert = ocfs2_check_refcount_downconvert,
				283	.downconvert_worker = ocfs2_refcount_convert_worker,
				284	.flags = 0,
				285	};
				286
				287	static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
				288	{
				289	return lockres->l_type == OCFS2_LOCK_TYPE_META \|\|
				290	lockres->l_type == OCFS2_LOCK_TYPE_RW \|\|
				291	lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
				292	}
				293
				294	static inline struct ocfs2_lock_res ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb lksb)
				295	{
				296	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
				297	}
				298
				299	static inline struct inode ocfs2_lock_res_inode(struct ocfs2_lock_res lockres)
				300	{
				301	BUG_ON(!ocfs2_is_inode_lock(lockres));
				302
				303	return (struct inode *) lockres->l_priv;
				304	}
				305
				306	static inline struct ocfs2_dentry_lock ocfs2_lock_res_dl(struct ocfs2_lock_res lockres)
				307	{
				308	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
				309
				310	return (struct ocfs2_dentry_lock *)lockres->l_priv;
				311	}
				312
				313	static inline struct ocfs2_mem_dqinfo ocfs2_lock_res_qinfo(struct ocfs2_lock_res lockres)
				314	{
				315	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
				316
				317	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
				318	}
				319
				320	static inline struct ocfs2_refcount_tree *
				321	ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
				322	{
				323	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
				324	}
				325
				326	static inline struct ocfs2_super ocfs2_get_lockres_osb(struct ocfs2_lock_res lockres)
				327	{
				328	if (lockres->l_ops->get_osb)
				329	return lockres->l_ops->get_osb(lockres);
				330
				331	return (struct ocfs2_super *)lockres->l_priv;
				332	}
				333
				334	static int ocfs2_lock_create(struct ocfs2_super *osb,
				335	struct ocfs2_lock_res *lockres,
				336	int level,
				337	u32 dlm_flags);
				338	static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
				339	int wanted);
				340	static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
				341	struct ocfs2_lock_res *lockres,
				342	int level, unsigned long caller_ip);
				343	static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				344	struct ocfs2_lock_res *lockres,
				345	int level)
				346	{
				347	__ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
				348	}
				349
				350	static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
				351	static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
				352	static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
				353	static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
				354	static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
				355	struct ocfs2_lock_res *lockres);
				356	static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
				357	int convert);
				358	#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
				359	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \
				360	mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
				361	_err, _func, _lockres->l_name); \
				362	else \
				363	mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \
				364	_err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \
				365	(unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \
				366	} while (0)
				367	static int ocfs2_downconvert_thread(void *arg);
				368	static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
				369	struct ocfs2_lock_res *lockres);
				370	static int ocfs2_inode_lock_update(struct inode *inode,
				371	struct buffer_head **bh);
				372	static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
				373	static inline int ocfs2_highest_compat_lock_level(int level);
				374	static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				375	int new_level);
				376	static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				377	struct ocfs2_lock_res *lockres,
				378	int new_level,
				379	int lvb,
				380	unsigned int generation);
				381	static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
				382	struct ocfs2_lock_res *lockres);
				383	static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				384	struct ocfs2_lock_res *lockres);
				385
				386
				387	static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
				388	u64 blkno,
				389	u32 generation,
				390	char *name)
				391	{
				392	int len;
				393
				394	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
				395
				396	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
				397	ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
				398	(long long)blkno, generation);
				399
				400	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
				401
				402	mlog(0, "built lock resource with name: %s\n", name);
				403	}
				404
				405	static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
				406
				407	static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
				408	struct ocfs2_dlm_debug *dlm_debug)
				409	{
				410	mlog(0, "Add tracking for lockres %s\n", res->l_name);
				411
				412	spin_lock(&ocfs2_dlm_tracking_lock);
				413	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
				414	spin_unlock(&ocfs2_dlm_tracking_lock);
				415	}
				416
				417	static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
				418	{
				419	spin_lock(&ocfs2_dlm_tracking_lock);
				420	if (!list_empty(&res->l_debug_list))
				421	list_del_init(&res->l_debug_list);
				422	spin_unlock(&ocfs2_dlm_tracking_lock);
				423	}
				424
				425	#ifdef CONFIG_OCFS2_FS_STATS
				426	static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
				427	{
				428	res->l_lock_refresh = 0;
				429	res->l_lock_wait = 0;
				430	memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
				431	memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
				432	}
				433
				434	static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
				435	struct ocfs2_mask_waiter *mw, int ret)
				436	{
				437	u32 usec;
				438	ktime_t kt;
				439	struct ocfs2_lock_stats *stats;
				440
				441	if (level == LKM_PRMODE)
				442	stats = &res->l_lock_prmode;
				443	else if (level == LKM_EXMODE)
				444	stats = &res->l_lock_exmode;
				445	else
				446	return;
				447
				448	kt = ktime_sub(ktime_get(), mw->mw_lock_start);
				449	usec = ktime_to_us(kt);
				450
				451	stats->ls_gets++;
				452	stats->ls_total += ktime_to_ns(kt);
				453	/* overflow */
				454	if (unlikely(stats->ls_gets == 0)) {
				455	stats->ls_gets++;
				456	stats->ls_total = ktime_to_ns(kt);
				457	}
				458
				459	if (stats->ls_max < usec)
				460	stats->ls_max = usec;
				461
				462	if (ret)
				463	stats->ls_fail++;
				464
				465	stats->ls_last = ktime_to_us(ktime_get_real());
				466	}
				467
				468	static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
				469	{
				470	lockres->l_lock_refresh++;
				471	}
				472
				473	static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
				474	{
				475	struct ocfs2_mask_waiter *mw;
				476
				477	if (list_empty(&lockres->l_mask_waiters)) {
				478	lockres->l_lock_wait = 0;
				479	return;
				480	}
				481
				482	mw = list_first_entry(&lockres->l_mask_waiters,
				483	struct ocfs2_mask_waiter, mw_item);
				484	lockres->l_lock_wait =
				485	ktime_to_us(ktime_mono_to_real(mw->mw_lock_start));
				486	}
				487
				488	static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
				489	{
				490	mw->mw_lock_start = ktime_get();
				491	}
				492	#else
				493	static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
				494	{
				495	}
				496	static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
				497	int level, struct ocfs2_mask_waiter *mw, int ret)
				498	{
				499	}
				500	static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
				501	{
				502	}
				503	static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
				504	{
				505	}
				506	static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
				507	{
				508	}
				509	#endif
				510
				511	static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
				512	struct ocfs2_lock_res *res,
				513	enum ocfs2_lock_type type,
				514	struct ocfs2_lock_res_ops *ops,
				515	void *priv)
				516	{
				517	res->l_type = type;
				518	res->l_ops = ops;
				519	res->l_priv = priv;
				520
				521	res->l_level = DLM_LOCK_IV;
				522	res->l_requested = DLM_LOCK_IV;
				523	res->l_blocking = DLM_LOCK_IV;
				524	res->l_action = OCFS2_AST_INVALID;
				525	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
				526
				527	res->l_flags = OCFS2_LOCK_INITIALIZED;
				528
				529	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
				530
				531	ocfs2_init_lock_stats(res);
				532	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				533	if (type != OCFS2_LOCK_TYPE_OPEN)
				534	lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
				535	&lockdep_keys[type], 0);
				536	else
				537	res->l_lockdep_map.key = NULL;
				538	#endif
				539	}
				540
				541	void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
				542	{
				543	/* This also clears out the lock status block */
				544	memset(res, 0, sizeof(struct ocfs2_lock_res));
				545	spin_lock_init(&res->l_lock);
				546	init_waitqueue_head(&res->l_event);
				547	INIT_LIST_HEAD(&res->l_blocked_list);
				548	INIT_LIST_HEAD(&res->l_mask_waiters);
				549	INIT_LIST_HEAD(&res->l_holders);
				550	}
				551
				552	void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
				553	enum ocfs2_lock_type type,
				554	unsigned int generation,
				555	struct inode *inode)
				556	{
				557	struct ocfs2_lock_res_ops *ops;
				558
				559	switch(type) {
				560	case OCFS2_LOCK_TYPE_RW:
				561	ops = &ocfs2_inode_rw_lops;
				562	break;
				563	case OCFS2_LOCK_TYPE_META:
				564	ops = &ocfs2_inode_inode_lops;
				565	break;
				566	case OCFS2_LOCK_TYPE_OPEN:
				567	ops = &ocfs2_inode_open_lops;
				568	break;
				569	default:
				570	mlog_bug_on_msg(1, "type: %d\n", type);
				571	ops = NULL; /* thanks, gcc */
				572	break;
				573	};
				574
				575	ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
				576	generation, res->l_name);
				577	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
				578	}
				579
				580	static struct ocfs2_super ocfs2_get_inode_osb(struct ocfs2_lock_res lockres)
				581	{
				582	struct inode *inode = ocfs2_lock_res_inode(lockres);
				583
				584	return OCFS2_SB(inode->i_sb);
				585	}
				586
				587	static struct ocfs2_super ocfs2_get_qinfo_osb(struct ocfs2_lock_res lockres)
				588	{
				589	struct ocfs2_mem_dqinfo *info = lockres->l_priv;
				590
				591	return OCFS2_SB(info->dqi_gi.dqi_sb);
				592	}
				593
				594	static struct ocfs2_super ocfs2_get_file_osb(struct ocfs2_lock_res lockres)
				595	{
				596	struct ocfs2_file_private *fp = lockres->l_priv;
				597
				598	return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
				599	}
				600
				601	static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
				602	{
				603	__be64 inode_blkno_be;
				604
				605	memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
				606	sizeof(__be64));
				607
				608	return be64_to_cpu(inode_blkno_be);
				609	}
				610
				611	static struct ocfs2_super ocfs2_get_dentry_osb(struct ocfs2_lock_res lockres)
				612	{
				613	struct ocfs2_dentry_lock *dl = lockres->l_priv;
				614
				615	return OCFS2_SB(dl->dl_inode->i_sb);
				616	}
				617
				618	void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
				619	u64 parent, struct inode *inode)
				620	{
				621	int len;
				622	u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
				623	__be64 inode_blkno_be = cpu_to_be64(inode_blkno);
				624	struct ocfs2_lock_res *lockres = &dl->dl_lockres;
				625
				626	ocfs2_lock_res_init_once(lockres);
				627
				628	/*
				629	* Unfortunately, the standard lock naming scheme won't work
				630	* here because we have two 16 byte values to use. Instead,
				631	* we'll stuff the inode number as a binary value. We still
				632	* want error prints to show something without garbling the
				633	* display, so drop a null byte in there before the inode
				634	* number. A future version of OCFS2 will likely use all
				635	* binary lock names. The stringified names have been a
				636	* tremendous aid in debugging, but now that the debugfs
				637	* interface exists, we can mangle things there if need be.
				638	*
				639	* NOTE: We also drop the standard "pad" value (the total lock
				640	* name size stays the same though - the last part is all
				641	* zeros due to the memset in ocfs2_lock_res_init_once()
				642	*/
				643	len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
				644	"%c%016llx",
				645	ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
				646	(long long)parent);
				647
				648	BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
				649
				650	memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
				651	sizeof(__be64));
				652
				653	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
				654	OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
				655	dl);
				656	}
				657
				658	static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
				659	struct ocfs2_super *osb)
				660	{
				661	/* Superblock lockres doesn't come from a slab so we call init
				662	* once on it manually. */
				663	ocfs2_lock_res_init_once(res);
				664	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
				665	0, res->l_name);
				666	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
				667	&ocfs2_super_lops, osb);
				668	}
				669
				670	static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
				671	struct ocfs2_super *osb)
				672	{
				673	/* Rename lockres doesn't come from a slab so we call init
				674	* once on it manually. */
				675	ocfs2_lock_res_init_once(res);
				676	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
				677	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
				678	&ocfs2_rename_lops, osb);
				679	}
				680
				681	static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
				682	struct ocfs2_super *osb)
				683	{
				684	/* nfs_sync lockres doesn't come from a slab so we call init
				685	* once on it manually. */
				686	ocfs2_lock_res_init_once(res);
				687	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
				688	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
				689	&ocfs2_nfs_sync_lops, osb);
				690	}
				691
				692	static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
				693	{
				694	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
				695	init_rwsem(&osb->nfs_sync_rwlock);
				696	}
				697
				698	void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
				699	{
				700	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
				701
				702	/* Only one trimfs thread are allowed to work at the same time. */
				703	mutex_lock(&osb->obs_trim_fs_mutex);
				704
				705	ocfs2_lock_res_init_once(lockres);
				706	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
				707	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
				708	&ocfs2_trim_fs_lops, osb);
				709	}
				710
				711	void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
				712	{
				713	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
				714
				715	ocfs2_simple_drop_lockres(osb, lockres);
				716	ocfs2_lock_res_free(lockres);
				717
				718	mutex_unlock(&osb->obs_trim_fs_mutex);
				719	}
				720
				721	static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
				722	struct ocfs2_super *osb)
				723	{
				724	ocfs2_lock_res_init_once(res);
				725	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
				726	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
				727	&ocfs2_orphan_scan_lops, osb);
				728	}
				729
				730	void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
				731	struct ocfs2_file_private *fp)
				732	{
				733	struct inode *inode = fp->fp_file->f_mapping->host;
				734	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				735
				736	ocfs2_lock_res_init_once(lockres);
				737	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
				738	inode->i_generation, lockres->l_name);
				739	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
				740	OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
				741	fp);
				742	lockres->l_flags \|= OCFS2_LOCK_NOCACHE;
				743	}
				744
				745	void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
				746	struct ocfs2_mem_dqinfo *info)
				747	{
				748	ocfs2_lock_res_init_once(lockres);
				749	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
				750	0, lockres->l_name);
				751	ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
				752	OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
				753	info);
				754	}
				755
				756	void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
				757	struct ocfs2_super *osb, u64 ref_blkno,
				758	unsigned int generation)
				759	{
				760	ocfs2_lock_res_init_once(lockres);
				761	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
				762	generation, lockres->l_name);
				763	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
				764	&ocfs2_refcount_block_lops, osb);
				765	}
				766
				767	void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
				768	{
				769	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
				770	return;
				771
				772	ocfs2_remove_lockres_tracking(res);
				773
				774	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
				775	"Lockres %s is on the blocked list\n",
				776	res->l_name);
				777	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
				778	"Lockres %s has mask waiters pending\n",
				779	res->l_name);
				780	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
				781	"Lockres %s is locked\n",
				782	res->l_name);
				783	mlog_bug_on_msg(res->l_ro_holders,
				784	"Lockres %s has %u ro holders\n",
				785	res->l_name, res->l_ro_holders);
				786	mlog_bug_on_msg(res->l_ex_holders,
				787	"Lockres %s has %u ex holders\n",
				788	res->l_name, res->l_ex_holders);
				789
				790	/* Need to clear out the lock status block for the dlm */
				791	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
				792
				793	res->l_flags = 0UL;
				794	}
				795
				796	/*
				797	* Keep a list of processes who have interest in a lockres.
				798	* Note: this is now only used to check for recursive cluster locking.
				799	*/
				800	static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
				801	struct ocfs2_lock_holder *oh)
				802	{
				803	INIT_LIST_HEAD(&oh->oh_list);
				804	oh->oh_owner_pid = get_pid(task_pid(current));
				805
				806	spin_lock(&lockres->l_lock);
				807	list_add_tail(&oh->oh_list, &lockres->l_holders);
				808	spin_unlock(&lockres->l_lock);
				809	}
				810
				811	static struct ocfs2_lock_holder *
				812	ocfs2_pid_holder(struct ocfs2_lock_res *lockres,
				813	struct pid *pid)
				814	{
				815	struct ocfs2_lock_holder *oh;
				816
				817	spin_lock(&lockres->l_lock);
				818	list_for_each_entry(oh, &lockres->l_holders, oh_list) {
				819	if (oh->oh_owner_pid == pid) {
				820	spin_unlock(&lockres->l_lock);
				821	return oh;
				822	}
				823	}
				824	spin_unlock(&lockres->l_lock);
				825	return NULL;
				826	}
				827
				828	static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
				829	struct ocfs2_lock_holder *oh)
				830	{
				831	spin_lock(&lockres->l_lock);
				832	list_del(&oh->oh_list);
				833	spin_unlock(&lockres->l_lock);
				834
				835	put_pid(oh->oh_owner_pid);
				836	}
				837
				838
				839	static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
				840	int level)
				841	{
				842	BUG_ON(!lockres);
				843
				844	switch(level) {
				845	case DLM_LOCK_EX:
				846	lockres->l_ex_holders++;
				847	break;
				848	case DLM_LOCK_PR:
				849	lockres->l_ro_holders++;
				850	break;
				851	default:
				852	BUG();
				853	}
				854	}
				855
				856	static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
				857	int level)
				858	{
				859	BUG_ON(!lockres);
				860
				861	switch(level) {
				862	case DLM_LOCK_EX:
				863	BUG_ON(!lockres->l_ex_holders);
				864	lockres->l_ex_holders--;
				865	break;
				866	case DLM_LOCK_PR:
				867	BUG_ON(!lockres->l_ro_holders);
				868	lockres->l_ro_holders--;
				869	break;
				870	default:
				871	BUG();
				872	}
				873	}
				874
				875	/* WARNING: This function lives in a world where the only three lock
				876	* levels are EX, PR, and NL. It will have to be adjusted when more
				877	* lock types are added. */
				878	static inline int ocfs2_highest_compat_lock_level(int level)
				879	{
				880	int new_level = DLM_LOCK_EX;
				881
				882	if (level == DLM_LOCK_EX)
				883	new_level = DLM_LOCK_NL;
				884	else if (level == DLM_LOCK_PR)
				885	new_level = DLM_LOCK_PR;
				886	return new_level;
				887	}
				888
				889	static void lockres_set_flags(struct ocfs2_lock_res *lockres,
				890	unsigned long newflags)
				891	{
				892	struct ocfs2_mask_waiter mw, tmp;
				893
				894	assert_spin_locked(&lockres->l_lock);
				895
				896	lockres->l_flags = newflags;
				897
				898	list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
				899	if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
				900	continue;
				901
				902	list_del_init(&mw->mw_item);
				903	mw->mw_status = 0;
				904	complete(&mw->mw_complete);
				905	ocfs2_track_lock_wait(lockres);
				906	}
				907	}
				908	static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
				909	{
				910	lockres_set_flags(lockres, lockres->l_flags \| or);
				911	}
				912	static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
				913	unsigned long clear)
				914	{
				915	lockres_set_flags(lockres, lockres->l_flags & ~clear);
				916	}
				917
				918	static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
				919	{
				920	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
				921	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
				922	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
				923	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
				924
				925	lockres->l_level = lockres->l_requested;
				926	if (lockres->l_level <=
				927	ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
				928	lockres->l_blocking = DLM_LOCK_NL;
				929	lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
				930	}
				931	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				932	}
				933
				934	static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
				935	{
				936	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
				937	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
				938
				939	/* Convert from RO to EX doesn't really need anything as our
				940	* information is already up to data. Convert from NL to
				941	* anything however should mark ourselves as needing an
				942	* update */
				943	if (lockres->l_level == DLM_LOCK_NL &&
				944	lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
				945	lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
				946
				947	lockres->l_level = lockres->l_requested;
				948
				949	/*
				950	* We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
				951	* the OCFS2_LOCK_BUSY flag to prevent the dc thread from
				952	* downconverting the lock before the upconvert has fully completed.
				953	* Do not prevent the dc thread from downconverting if NONBLOCK lock
				954	* had already returned.
				955	*/
				956	if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
				957	lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
				958	else
				959	lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);
				960
				961	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				962	}
				963
				964	static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
				965	{
				966	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
				967	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
				968
				969	if (lockres->l_requested > DLM_LOCK_NL &&
				970	!(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
				971	lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
				972	lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
				973
				974	lockres->l_level = lockres->l_requested;
				975	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
				976	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				977	}
				978
				979	static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
				980	int level)
				981	{
				982	int needs_downconvert = 0;
				983
				984	assert_spin_locked(&lockres->l_lock);
				985
				986	if (level > lockres->l_blocking) {
				987	/* only schedule a downconvert if we haven't already scheduled
				988	* one that goes low enough to satisfy the level we're
				989	* blocking. this also catches the case where we get
				990	* duplicate BASTs */
				991	if (ocfs2_highest_compat_lock_level(level) <
				992	ocfs2_highest_compat_lock_level(lockres->l_blocking))
				993	needs_downconvert = 1;
				994
				995	lockres->l_blocking = level;
				996	}
				997
				998	mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
				999	lockres->l_name, level, lockres->l_level, lockres->l_blocking,
				1000	needs_downconvert);
				1001
				1002	if (needs_downconvert)
				1003	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
				1004	mlog(0, "needs_downconvert = %d\n", needs_downconvert);
				1005	return needs_downconvert;
				1006	}
				1007
				1008	/*
				1009	* OCFS2_LOCK_PENDING and l_pending_gen.
				1010	*
				1011	* Why does OCFS2_LOCK_PENDING exist? To close a race between setting
				1012	* OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
				1013	* for more details on the race.
				1014	*
				1015	* OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
				1016	* a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
				1017	* returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
				1018	* OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
				1019	* the caller is going to try to clear PENDING again. If nothing else is
				1020	* happening, __lockres_clear_pending() sees PENDING is unset and does
				1021	* nothing.
				1022	*
				1023	* But what if another path (eg downconvert thread) has just started a
				1024	* new locking action? The other path has re-set PENDING. Our path
				1025	* cannot clear PENDING, because that will re-open the original race
				1026	* window.
				1027	*
				1028	* [Example]
				1029	*
				1030	* ocfs2_meta_lock()
				1031	*  ocfs2_cluster_lock()
				1032	*   set BUSY
				1033	*   set PENDING
				1034	*   drop l_lock
				1035	*   ocfs2_dlm_lock()
				1036	*    ocfs2_locking_ast()          ocfs2_downconvert_thread()
				1037	*     clear PENDING                ocfs2_unblock_lock()
				1038	*                                   take_l_lock
				1039	*                                   !BUSY
				1040	*                                   ocfs2_prepare_downconvert()
				1041	*                                    set BUSY
				1042	*                                    set PENDING
				1043	*                                   drop l_lock
				1044	*   take l_lock
				1045	*   clear PENDING
				1046	*   drop l_lock
				1047	*                  <window>
				1048	*                                   ocfs2_dlm_lock()
				1049	*
				1050	* So as you can see, we now have a window where l_lock is not held,
				1051	* PENDING is not set, and ocfs2_dlm_lock() has not been called.
				1052	*
				1053	* The core problem is that ocfs2_cluster_lock() has cleared the PENDING
				1054	* set by ocfs2_prepare_downconvert(). That wasn't nice.
				1055	*
				1056	* To solve this we introduce l_pending_gen. A call to
				1057	* lockres_clear_pending() will only do so when it is passed a generation
				1058	* number that matches the lockres. lockres_set_pending() will return the
				1059	* current generation number. When ocfs2_cluster_lock() goes to clear
				1060	* PENDING, it passes the generation it got from set_pending(). In our
				1061	* example above, the generation numbers will not match. Thus,
				1062	* ocfs2_cluster_lock() will not clear the PENDING set by
				1063	* ocfs2_prepare_downconvert().
				1064	*/
				1065
				1066	/* Unlocked version for ocfs2_locking_ast() */
				1067	static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
				1068	unsigned int generation,
				1069	struct ocfs2_super *osb)
				1070	{
				1071	assert_spin_locked(&lockres->l_lock);
				1072
				1073	/*
				1074	* The ast and locking functions can race us here. The winner
				1075	* will clear pending, the loser will not.
				1076	*/
				1077	if (!(lockres->l_flags & OCFS2_LOCK_PENDING) \|\|
				1078	(lockres->l_pending_gen != generation))
				1079	return;
				1080
				1081	lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
				1082	lockres->l_pending_gen++;
				1083
				1084	/*
				1085	* The downconvert thread may have skipped us because we
				1086	* were PENDING. Wake it up.
				1087	*/
				1088	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
				1089	ocfs2_wake_downconvert_thread(osb);
				1090	}
				1091
				1092	/* Locked version for callers of ocfs2_dlm_lock() */
				1093	static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
				1094	unsigned int generation,
				1095	struct ocfs2_super *osb)
				1096	{
				1097	unsigned long flags;
				1098
				1099	spin_lock_irqsave(&lockres->l_lock, flags);
				1100	__lockres_clear_pending(lockres, generation, osb);
				1101	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1102	}
				1103
				1104	static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
				1105	{
				1106	assert_spin_locked(&lockres->l_lock);
				1107	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
				1108
				1109	lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
				1110
				1111	return lockres->l_pending_gen;
				1112	}
				1113
				1114	static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
				1115	{
				1116	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
				1117	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
				1118	int needs_downconvert;
				1119	unsigned long flags;
				1120
				1121	BUG_ON(level <= DLM_LOCK_NL);
				1122
				1123	mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
				1124	"type %s\n", lockres->l_name, level, lockres->l_level,
				1125	ocfs2_lock_type_string(lockres->l_type));
				1126
				1127	/*
				1128	* We can skip the bast for locks which don't enable caching -
				1129	* they'll be dropped at the earliest possible time anyway.
				1130	*/
				1131	if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
				1132	return;
				1133
				1134	spin_lock_irqsave(&lockres->l_lock, flags);
				1135	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
				1136	if (needs_downconvert)
				1137	ocfs2_schedule_blocked_lock(osb, lockres);
				1138	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1139
				1140	wake_up(&lockres->l_event);
				1141
				1142	ocfs2_wake_downconvert_thread(osb);
				1143	}
				1144
				1145	static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
				1146	{
				1147	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
				1148	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
				1149	unsigned long flags;
				1150	int status;
				1151
				1152	spin_lock_irqsave(&lockres->l_lock, flags);
				1153
				1154	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
				1155
				1156	if (status == -EAGAIN) {
				1157	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				1158	goto out;
				1159	}
				1160
				1161	if (status) {
				1162	mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
				1163	lockres->l_name, status);
				1164	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1165	return;
				1166	}
				1167
				1168	mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
				1169	"level %d => %d\n", lockres->l_name, lockres->l_action,
				1170	lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
				1171
				1172	switch(lockres->l_action) {
				1173	case OCFS2_AST_ATTACH:
				1174	ocfs2_generic_handle_attach_action(lockres);
				1175	lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
				1176	break;
				1177	case OCFS2_AST_CONVERT:
				1178	ocfs2_generic_handle_convert_action(lockres);
				1179	break;
				1180	case OCFS2_AST_DOWNCONVERT:
				1181	ocfs2_generic_handle_downconvert_action(lockres);
				1182	break;
				1183	default:
				1184	mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
				1185	"flags 0x%lx, unlock: %u\n",
				1186	lockres->l_name, lockres->l_action, lockres->l_flags,
				1187	lockres->l_unlock_action);
				1188	BUG();
				1189	}
				1190	out:
				1191	/* set it to something invalid so if we get called again we
				1192	* can catch it. */
				1193	lockres->l_action = OCFS2_AST_INVALID;
				1194
				1195	/* Did we try to cancel this lock? Clear that state */
				1196	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
				1197	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
				1198
				1199	/*
				1200	* We may have beaten the locking functions here. We certainly
				1201	* know that dlm_lock() has been called :-)
				1202	* Because we can't have two lock calls in flight at once, we
				1203	* can use lockres->l_pending_gen.
				1204	*/
				1205	__lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
				1206
				1207	wake_up(&lockres->l_event);
				1208	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1209	}
				1210
				1211	static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
				1212	{
				1213	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
				1214	unsigned long flags;
				1215
				1216	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
				1217	lockres->l_name, lockres->l_unlock_action);
				1218
				1219	spin_lock_irqsave(&lockres->l_lock, flags);
				1220	if (error) {
				1221	mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
				1222	"unlock_action %d\n", error, lockres->l_name,
				1223	lockres->l_unlock_action);
				1224	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1225	return;
				1226	}
				1227
				1228	switch(lockres->l_unlock_action) {
				1229	case OCFS2_UNLOCK_CANCEL_CONVERT:
				1230	mlog(0, "Cancel convert success for %s\n", lockres->l_name);
				1231	lockres->l_action = OCFS2_AST_INVALID;
				1232	/* Downconvert thread may have requeued this lock, we
				1233	* need to wake it. */
				1234	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
				1235	ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
				1236	break;
				1237	case OCFS2_UNLOCK_DROP_LOCK:
				1238	lockres->l_level = DLM_LOCK_IV;
				1239	break;
				1240	default:
				1241	BUG();
				1242	}
				1243
				1244	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				1245	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
				1246	wake_up(&lockres->l_event);
				1247	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1248	}
				1249
				1250	/*
				1251	* This is the filesystem locking protocol. It provides the lock handling
				1252	* hooks for the underlying DLM. It has a maximum version number.
				1253	* The version number allows interoperability with systems running at
				1254	* the same major number and an equal or smaller minor number.
				1255	*
				1256	* Whenever the filesystem does new things with locks (adds or removes a
				1257	* lock, orders them differently, does different things underneath a lock),
				1258	* the version must be changed. The protocol is negotiated when joining
				1259	* the dlm domain. A node may join the domain if its major version is
				1260	* identical to all other nodes and its minor version is greater than
				1261	* or equal to all other nodes. When its minor version is greater than
				1262	* the other nodes, it will run at the minor version specified by the
				1263	* other nodes.
				1264	*
				1265	* If a locking change is made that will not be compatible with older
				1266	* versions, the major number must be increased and the minor version set
				1267	* to zero. If a change merely adds a behavior that can be disabled when
				1268	* speaking to older versions, the minor version must be increased. If a
				1269	* change adds a fully backwards compatible change (eg, LVB changes that
				1270	* are just ignored by older versions), the version does not need to be
				1271	* updated.
				1272	*/
				1273	static struct ocfs2_locking_protocol lproto = {
				1274	.lp_max_version = {
				1275	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
				1276	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
				1277	},
				1278	.lp_lock_ast = ocfs2_locking_ast,
				1279	.lp_blocking_ast = ocfs2_blocking_ast,
				1280	.lp_unlock_ast = ocfs2_unlock_ast,
				1281	};
				1282
				1283	void ocfs2_set_locking_protocol(void)
				1284	{
				1285	ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
				1286	}
				1287
				1288	static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
				1289	int convert)
				1290	{
				1291	unsigned long flags;
				1292
				1293	spin_lock_irqsave(&lockres->l_lock, flags);
				1294	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
				1295	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
				1296	if (convert)
				1297	lockres->l_action = OCFS2_AST_INVALID;
				1298	else
				1299	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
				1300	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1301
				1302	wake_up(&lockres->l_event);
				1303	}
				1304
				1305	/* Note: If we detect another process working on the lock (i.e.,
				1306	* OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
				1307	* to do the right thing in that case.
				1308	*/
				1309	static int ocfs2_lock_create(struct ocfs2_super *osb,
				1310	struct ocfs2_lock_res *lockres,
				1311	int level,
				1312	u32 dlm_flags)
				1313	{
				1314	int ret = 0;
				1315	unsigned long flags;
				1316	unsigned int gen;
				1317
				1318	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
				1319	dlm_flags);
				1320
				1321	spin_lock_irqsave(&lockres->l_lock, flags);
				1322	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) \|\|
				1323	(lockres->l_flags & OCFS2_LOCK_BUSY)) {
				1324	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1325	goto bail;
				1326	}
				1327
				1328	lockres->l_action = OCFS2_AST_ATTACH;
				1329	lockres->l_requested = level;
				1330	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
				1331	gen = lockres_set_pending(lockres);
				1332	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1333
				1334	ret = ocfs2_dlm_lock(osb->cconn,
				1335	level,
				1336	&lockres->l_lksb,
				1337	dlm_flags,
				1338	lockres->l_name,
				1339	OCFS2_LOCK_ID_MAX_LEN - 1);
				1340	lockres_clear_pending(lockres, gen, osb);
				1341	if (ret) {
				1342	ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
				1343	ocfs2_recover_from_dlm_error(lockres, 1);
				1344	}
				1345
				1346	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
				1347
				1348	bail:
				1349	return ret;
				1350	}
				1351
				1352	static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
				1353	int flag)
				1354	{
				1355	unsigned long flags;
				1356	int ret;
				1357
				1358	spin_lock_irqsave(&lockres->l_lock, flags);
				1359	ret = lockres->l_flags & flag;
				1360	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1361
				1362	return ret;
				1363	}
				1364
				1365	static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
				1366
				1367	{
				1368	wait_event(lockres->l_event,
				1369	!ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
				1370	}
				1371
				1372	static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
				1373
				1374	{
				1375	wait_event(lockres->l_event,
				1376	!ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
				1377	}
				1378
				1379	/* predict what lock level we'll be dropping down to on behalf
				1380	* of another node, and return true if the currently wanted
				1381	* level will be compatible with it. */
				1382	static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
				1383	int wanted)
				1384	{
				1385	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
				1386
				1387	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
				1388	}
				1389
				1390	static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
				1391	{
				1392	INIT_LIST_HEAD(&mw->mw_item);
				1393	init_completion(&mw->mw_complete);
				1394	ocfs2_init_start_time(mw);
				1395	}
				1396
				1397	static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
				1398	{
				1399	wait_for_completion(&mw->mw_complete);
				1400	/* Re-arm the completion in case we want to wait on it again */
				1401	reinit_completion(&mw->mw_complete);
				1402	return mw->mw_status;
				1403	}
				1404
				1405	static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
				1406	struct ocfs2_mask_waiter *mw,
				1407	unsigned long mask,
				1408	unsigned long goal)
				1409	{
				1410	BUG_ON(!list_empty(&mw->mw_item));
				1411
				1412	assert_spin_locked(&lockres->l_lock);
				1413
				1414	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
				1415	mw->mw_mask = mask;
				1416	mw->mw_goal = goal;
				1417	ocfs2_track_lock_wait(lockres);
				1418	}
				1419
				1420	/* returns 0 if the mw that was removed was already satisfied, -EBUSY
				1421	* if the mask still hadn't reached its goal */
				1422	static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
				1423	struct ocfs2_mask_waiter *mw)
				1424	{
				1425	int ret = 0;
				1426
				1427	assert_spin_locked(&lockres->l_lock);
				1428	if (!list_empty(&mw->mw_item)) {
				1429	if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
				1430	ret = -EBUSY;
				1431
				1432	list_del_init(&mw->mw_item);
				1433	init_completion(&mw->mw_complete);
				1434	ocfs2_track_lock_wait(lockres);
				1435	}
				1436
				1437	return ret;
				1438	}
				1439
				1440	static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
				1441	struct ocfs2_mask_waiter *mw)
				1442	{
				1443	unsigned long flags;
				1444	int ret = 0;
				1445
				1446	spin_lock_irqsave(&lockres->l_lock, flags);
				1447	ret = __lockres_remove_mask_waiter(lockres, mw);
				1448	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1449
				1450	return ret;
				1451
				1452	}
				1453
				1454	static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
				1455	struct ocfs2_lock_res *lockres)
				1456	{
				1457	int ret;
				1458
				1459	ret = wait_for_completion_interruptible(&mw->mw_complete);
				1460	if (ret)
				1461	lockres_remove_mask_waiter(lockres, mw);
				1462	else
				1463	ret = mw->mw_status;
				1464	/* Re-arm the completion in case we want to wait on it again */
				1465	reinit_completion(&mw->mw_complete);
				1466	return ret;
				1467	}
				1468
				1469	static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
				1470	struct ocfs2_lock_res *lockres,
				1471	int level,
				1472	u32 lkm_flags,
				1473	int arg_flags,
				1474	int l_subclass,
				1475	unsigned long caller_ip)
				1476	{
				1477	struct ocfs2_mask_waiter mw;
				1478	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
				1479	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
				1480	unsigned long flags;
				1481	unsigned int gen;
				1482	int noqueue_attempted = 0;
				1483	int dlm_locked = 0;
				1484	int kick_dc = 0;
				1485
				1486	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
				1487	mlog_errno(-EINVAL);
				1488	return -EINVAL;
				1489	}
				1490
				1491	ocfs2_init_mask_waiter(&mw);
				1492
				1493	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
				1494	lkm_flags \|= DLM_LKF_VALBLK;
				1495
				1496	again:
				1497	wait = 0;
				1498
				1499	spin_lock_irqsave(&lockres->l_lock, flags);
				1500
				1501	if (catch_signals && signal_pending(current)) {
				1502	ret = -ERESTARTSYS;
				1503	goto unlock;
				1504	}
				1505
				1506	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
				1507	"Cluster lock called on freeing lockres %s! flags "
				1508	"0x%lx\n", lockres->l_name, lockres->l_flags);
				1509
				1510	/* We only compare against the currently granted level
				1511	* here. If the lock is blocked waiting on a downconvert,
				1512	* we'll get caught below. */
				1513	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
				1514	level > lockres->l_level) {
				1515	/* is someone sitting in dlm_lock? If so, wait on
				1516	* them. */
				1517	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
				1518	wait = 1;
				1519	goto unlock;
				1520	}
				1521
				1522	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
				1523	/*
				1524	* We've upconverted. If the lock now has a level we can
				1525	* work with, we take it. If, however, the lock is not at the
				1526	* required level, we go thru the full cycle. One way this could
				1527	* happen is if a process requesting an upconvert to PR is
				1528	* closely followed by another requesting upconvert to an EX.
				1529	* If the process requesting EX lands here, we want it to
				1530	* continue attempting to upconvert and let the process
				1531	* requesting PR take the lock.
				1532	* If multiple processes request upconvert to PR, the first one
				1533	* here will take the lock. The others will have to go thru the
				1534	* OCFS2_LOCK_BLOCKED check to ensure that there is no pending
				1535	* downconvert request.
				1536	*/
				1537	if (level <= lockres->l_level)
				1538	goto update_holders;
				1539	}
				1540
				1541	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
				1542	!ocfs2_may_continue_on_blocked_lock(lockres, level)) {
				1543	/* is the lock is currently blocked on behalf of
				1544	* another node */
				1545	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
				1546	wait = 1;
				1547	goto unlock;
				1548	}
				1549
				1550	if (level > lockres->l_level) {
				1551	if (noqueue_attempted > 0) {
				1552	ret = -EAGAIN;
				1553	goto unlock;
				1554	}
				1555	if (lkm_flags & DLM_LKF_NOQUEUE)
				1556	noqueue_attempted = 1;
				1557
				1558	if (lockres->l_action != OCFS2_AST_INVALID)
				1559	mlog(ML_ERROR, "lockres %s has action %u pending\n",
				1560	lockres->l_name, lockres->l_action);
				1561
				1562	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
				1563	lockres->l_action = OCFS2_AST_ATTACH;
				1564	lkm_flags &= ~DLM_LKF_CONVERT;
				1565	} else {
				1566	lockres->l_action = OCFS2_AST_CONVERT;
				1567	lkm_flags \|= DLM_LKF_CONVERT;
				1568	}
				1569
				1570	lockres->l_requested = level;
				1571	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
				1572	gen = lockres_set_pending(lockres);
				1573	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1574
				1575	BUG_ON(level == DLM_LOCK_IV);
				1576	BUG_ON(level == DLM_LOCK_NL);
				1577
				1578	mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
				1579	lockres->l_name, lockres->l_level, level);
				1580
				1581	/* call dlm_lock to upgrade lock now */
				1582	ret = ocfs2_dlm_lock(osb->cconn,
				1583	level,
				1584	&lockres->l_lksb,
				1585	lkm_flags,
				1586	lockres->l_name,
				1587	OCFS2_LOCK_ID_MAX_LEN - 1);
				1588	lockres_clear_pending(lockres, gen, osb);
				1589	if (ret) {
				1590	if (!(lkm_flags & DLM_LKF_NOQUEUE) \|\|
				1591	(ret != -EAGAIN)) {
				1592	ocfs2_log_dlm_error("ocfs2_dlm_lock",
				1593	ret, lockres);
				1594	}
				1595	ocfs2_recover_from_dlm_error(lockres, 1);
				1596	goto out;
				1597	}
				1598	dlm_locked = 1;
				1599
				1600	mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
				1601	lockres->l_name);
				1602
				1603	/* At this point we've gone inside the dlm and need to
				1604	* complete our work regardless. */
				1605	catch_signals = 0;
				1606
				1607	/* wait for busy to clear and carry on */
				1608	goto again;
				1609	}
				1610
				1611	update_holders:
				1612	/* Ok, if we get here then we're good to go. */
				1613	ocfs2_inc_holders(lockres, level);
				1614
				1615	ret = 0;
				1616	unlock:
				1617	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
				1618
				1619	/* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
				1620	kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
				1621
				1622	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1623	if (kick_dc)
				1624	ocfs2_wake_downconvert_thread(osb);
				1625	out:
				1626	/*
				1627	* This is helping work around a lock inversion between the page lock
				1628	* and dlm locks. One path holds the page lock while calling aops
				1629	* which block acquiring dlm locks. The voting thread holds dlm
				1630	* locks while acquiring page locks while down converting data locks.
				1631	* This block is helping an aop path notice the inversion and back
				1632	* off to unlock its page lock before trying the dlm lock again.
				1633	*/
				1634	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
				1635	mw.mw_mask & (OCFS2_LOCK_BUSY\|OCFS2_LOCK_BLOCKED)) {
				1636	wait = 0;
				1637	spin_lock_irqsave(&lockres->l_lock, flags);
				1638	if (__lockres_remove_mask_waiter(lockres, &mw)) {
				1639	if (dlm_locked)
				1640	lockres_or_flags(lockres,
				1641	OCFS2_LOCK_NONBLOCK_FINISHED);
				1642	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1643	ret = -EAGAIN;
				1644	} else {
				1645	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1646	goto again;
				1647	}
				1648	}
				1649	if (wait) {
				1650	ret = ocfs2_wait_for_mask(&mw);
				1651	if (ret == 0)
				1652	goto again;
				1653	mlog_errno(ret);
				1654	}
				1655	ocfs2_update_lock_stats(lockres, level, &mw, ret);
				1656
				1657	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				1658	if (!ret && lockres->l_lockdep_map.key != NULL) {
				1659	if (level == DLM_LOCK_PR)
				1660	rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
				1661	!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
				1662	caller_ip);
				1663	else
				1664	rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
				1665	!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
				1666	caller_ip);
				1667	}
				1668	#endif
				1669	return ret;
				1670	}
				1671
				1672	static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
				1673	struct ocfs2_lock_res *lockres,
				1674	int level,
				1675	u32 lkm_flags,
				1676	int arg_flags)
				1677	{
				1678	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
				1679	0, _RET_IP_);
				1680	}
				1681
				1682
				1683	static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
				1684	struct ocfs2_lock_res *lockres,
				1685	int level,
				1686	unsigned long caller_ip)
				1687	{
				1688	unsigned long flags;
				1689
				1690	spin_lock_irqsave(&lockres->l_lock, flags);
				1691	ocfs2_dec_holders(lockres, level);
				1692	ocfs2_downconvert_on_unlock(osb, lockres);
				1693	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1694	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				1695	if (lockres->l_lockdep_map.key != NULL)
				1696	rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
				1697	#endif
				1698	}
				1699
				1700	static int ocfs2_create_new_lock(struct ocfs2_super *osb,
				1701	struct ocfs2_lock_res *lockres,
				1702	int ex,
				1703	int local)
				1704	{
				1705	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				1706	unsigned long flags;
				1707	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
				1708
				1709	spin_lock_irqsave(&lockres->l_lock, flags);
				1710	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
				1711	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
				1712	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1713
				1714	return ocfs2_lock_create(osb, lockres, level, lkm_flags);
				1715	}
				1716
				1717	/* Grants us an EX lock on the data and metadata resources, skipping
				1718	* the normal cluster directory lookup. Use this ONLY on newly created
				1719	* inodes which other nodes can't possibly see, and which haven't been
				1720	* hashed in the inode hash yet. This can give us a good performance
				1721	* increase as it'll skip the network broadcast normally associated
				1722	* with creating a new lock resource. */
				1723	int ocfs2_create_new_inode_locks(struct inode *inode)
				1724	{
				1725	int ret;
				1726	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1727
				1728	BUG_ON(!ocfs2_inode_is_new(inode));
				1729
				1730	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
				1731
				1732	/* NOTE: That we don't increment any of the holder counts, nor
				1733	* do we add anything to a journal handle. Since this is
				1734	* supposed to be a new inode which the cluster doesn't know
				1735	* about yet, there is no need to. As far as the LVB handling
				1736	* is concerned, this is basically like acquiring an EX lock
				1737	* on a resource which has an invalid one -- we'll set it
				1738	* valid when we release the EX. */
				1739
				1740	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
				1741	if (ret) {
				1742	mlog_errno(ret);
				1743	goto bail;
				1744	}
				1745
				1746	/*
				1747	* We don't want to use DLM_LKF_LOCAL on a meta data lock as they
				1748	* don't use a generation in their lock names.
				1749	*/
				1750	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
				1751	if (ret) {
				1752	mlog_errno(ret);
				1753	goto bail;
				1754	}
				1755
				1756	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
				1757	if (ret)
				1758	mlog_errno(ret);
				1759
				1760	bail:
				1761	return ret;
				1762	}
				1763
				1764	int ocfs2_rw_lock(struct inode *inode, int write)
				1765	{
				1766	int status, level;
				1767	struct ocfs2_lock_res *lockres;
				1768	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1769
				1770	mlog(0, "inode %llu take %s RW lock\n",
				1771	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1772	write ? "EXMODE" : "PRMODE");
				1773
				1774	if (ocfs2_mount_local(osb))
				1775	return 0;
				1776
				1777	lockres = &OCFS2_I(inode)->ip_rw_lockres;
				1778
				1779	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
				1780
				1781	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
				1782	if (status < 0)
				1783	mlog_errno(status);
				1784
				1785	return status;
				1786	}
				1787
				1788	int ocfs2_try_rw_lock(struct inode *inode, int write)
				1789	{
				1790	int status, level;
				1791	struct ocfs2_lock_res *lockres;
				1792	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1793
				1794	mlog(0, "inode %llu try to take %s RW lock\n",
				1795	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1796	write ? "EXMODE" : "PRMODE");
				1797
				1798	if (ocfs2_mount_local(osb))
				1799	return 0;
				1800
				1801	lockres = &OCFS2_I(inode)->ip_rw_lockres;
				1802
				1803	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
				1804
				1805	status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
				1806	return status;
				1807	}
				1808
				1809	void ocfs2_rw_unlock(struct inode *inode, int write)
				1810	{
				1811	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
				1812	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
				1813	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1814
				1815	mlog(0, "inode %llu drop %s RW lock\n",
				1816	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1817	write ? "EXMODE" : "PRMODE");
				1818
				1819	if (!ocfs2_mount_local(osb))
				1820	ocfs2_cluster_unlock(osb, lockres, level);
				1821	}
				1822
				1823	/*
				1824	* ocfs2_open_lock always get PR mode lock.
				1825	*/
				1826	int ocfs2_open_lock(struct inode *inode)
				1827	{
				1828	int status = 0;
				1829	struct ocfs2_lock_res *lockres;
				1830	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1831
				1832	mlog(0, "inode %llu take PRMODE open lock\n",
				1833	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				1834
				1835	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_mount_local(osb))
				1836	goto out;
				1837
				1838	lockres = &OCFS2_I(inode)->ip_open_lockres;
				1839
				1840	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
				1841	if (status < 0)
				1842	mlog_errno(status);
				1843
				1844	out:
				1845	return status;
				1846	}
				1847
				1848	int ocfs2_try_open_lock(struct inode *inode, int write)
				1849	{
				1850	int status = 0, level;
				1851	struct ocfs2_lock_res *lockres;
				1852	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1853
				1854	mlog(0, "inode %llu try to take %s open lock\n",
				1855	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1856	write ? "EXMODE" : "PRMODE");
				1857
				1858	if (ocfs2_is_hard_readonly(osb)) {
				1859	if (write)
				1860	status = -EROFS;
				1861	goto out;
				1862	}
				1863
				1864	if (ocfs2_mount_local(osb))
				1865	goto out;
				1866
				1867	lockres = &OCFS2_I(inode)->ip_open_lockres;
				1868
				1869	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
				1870
				1871	/*
				1872	* The file system may already holding a PRMODE/EXMODE open lock.
				1873	* Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
				1874	* other nodes and the -EAGAIN will indicate to the caller that
				1875	* this inode is still in use.
				1876	*/
				1877	status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
				1878
				1879	out:
				1880	return status;
				1881	}
				1882
				1883	/*
				1884	* ocfs2_open_unlock unlock PR and EX mode open locks.
				1885	*/
				1886	void ocfs2_open_unlock(struct inode *inode)
				1887	{
				1888	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
				1889	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1890
				1891	mlog(0, "inode %llu drop open lock\n",
				1892	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				1893
				1894	if (ocfs2_mount_local(osb))
				1895	goto out;
				1896
				1897	if(lockres->l_ro_holders)
				1898	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
				1899	if(lockres->l_ex_holders)
				1900	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
				1901
				1902	out:
				1903	return;
				1904	}
				1905
				1906	static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
				1907	int level)
				1908	{
				1909	int ret;
				1910	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
				1911	unsigned long flags;
				1912	struct ocfs2_mask_waiter mw;
				1913
				1914	ocfs2_init_mask_waiter(&mw);
				1915
				1916	retry_cancel:
				1917	spin_lock_irqsave(&lockres->l_lock, flags);
				1918	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
				1919	ret = ocfs2_prepare_cancel_convert(osb, lockres);
				1920	if (ret) {
				1921	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1922	ret = ocfs2_cancel_convert(osb, lockres);
				1923	if (ret < 0) {
				1924	mlog_errno(ret);
				1925	goto out;
				1926	}
				1927	goto retry_cancel;
				1928	}
				1929	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
				1930	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1931
				1932	ocfs2_wait_for_mask(&mw);
				1933	goto retry_cancel;
				1934	}
				1935
				1936	ret = -ERESTARTSYS;
				1937	/*
				1938	* We may still have gotten the lock, in which case there's no
				1939	* point to restarting the syscall.
				1940	*/
				1941	if (lockres->l_level == level)
				1942	ret = 0;
				1943
				1944	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
				1945	lockres->l_flags, lockres->l_level, lockres->l_action);
				1946
				1947	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1948
				1949	out:
				1950	return ret;
				1951	}
				1952
				1953	/*
				1954	* ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
				1955	* flock() calls. The locking approach this requires is sufficiently
				1956	* different from all other cluster lock types that we implement a
				1957	* separate path to the "low-level" dlm calls. In particular:
				1958	*
				1959	* - No optimization of lock levels is done - we take at exactly
				1960	* what's been requested.
				1961	*
				1962	* - No lock caching is employed. We immediately downconvert to
				1963	* no-lock at unlock time. This also means flock locks never go on
				1964	* the blocking list).
				1965	*
				1966	* - Since userspace can trivially deadlock itself with flock, we make
				1967	* sure to allow cancellation of a misbehaving applications flock()
				1968	* request.
				1969	*
				1970	* - Access to any flock lockres doesn't require concurrency, so we
				1971	* can simplify the code by requiring the caller to guarantee
				1972	* serialization of dlmglue flock calls.
				1973	*/
				1974	int ocfs2_file_lock(struct file *file, int ex, int trylock)
				1975	{
				1976	int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				1977	unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
				1978	unsigned long flags;
				1979	struct ocfs2_file_private *fp = file->private_data;
				1980	struct ocfs2_lock_res *lockres = &fp->fp_flock;
				1981	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
				1982	struct ocfs2_mask_waiter mw;
				1983
				1984	ocfs2_init_mask_waiter(&mw);
				1985
				1986	if ((lockres->l_flags & OCFS2_LOCK_BUSY) \|\|
				1987	(lockres->l_level > DLM_LOCK_NL)) {
				1988	mlog(ML_ERROR,
				1989	"File lock \"%s\" has busy or locked state: flags: 0x%lx, "
				1990	"level: %u\n", lockres->l_name, lockres->l_flags,
				1991	lockres->l_level);
				1992	return -EINVAL;
				1993	}
				1994
				1995	spin_lock_irqsave(&lockres->l_lock, flags);
				1996	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
				1997	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
				1998	spin_unlock_irqrestore(&lockres->l_lock, flags);
				1999
				2000	/*
				2001	* Get the lock at NLMODE to start - that way we
				2002	* can cancel the upconvert request if need be.
				2003	*/
				2004	ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
				2005	if (ret < 0) {
				2006	mlog_errno(ret);
				2007	goto out;
				2008	}
				2009
				2010	ret = ocfs2_wait_for_mask(&mw);
				2011	if (ret) {
				2012	mlog_errno(ret);
				2013	goto out;
				2014	}
				2015	spin_lock_irqsave(&lockres->l_lock, flags);
				2016	}
				2017
				2018	lockres->l_action = OCFS2_AST_CONVERT;
				2019	lkm_flags \|= DLM_LKF_CONVERT;
				2020	lockres->l_requested = level;
				2021	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
				2022
				2023	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
				2024	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2025
				2026	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
				2027	lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
				2028	if (ret) {
				2029	if (!trylock \|\| (ret != -EAGAIN)) {
				2030	ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
				2031	ret = -EINVAL;
				2032	}
				2033
				2034	ocfs2_recover_from_dlm_error(lockres, 1);
				2035	lockres_remove_mask_waiter(lockres, &mw);
				2036	goto out;
				2037	}
				2038
				2039	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
				2040	if (ret == -ERESTARTSYS) {
				2041	/*
				2042	* Userspace can cause deadlock itself with
				2043	* flock(). Current behavior locally is to allow the
				2044	* deadlock, but abort the system call if a signal is
				2045	* received. We follow this example, otherwise a
				2046	* poorly written program could sit in kernel until
				2047	* reboot.
				2048	*
				2049	* Handling this is a bit more complicated for Ocfs2
				2050	* though. We can't exit this function with an
				2051	* outstanding lock request, so a cancel convert is
				2052	* required. We intentionally overwrite 'ret' - if the
				2053	* cancel fails and the lock was granted, it's easier
				2054	* to just bubble success back up to the user.
				2055	*/
				2056	ret = ocfs2_flock_handle_signal(lockres, level);
				2057	} else if (!ret && (level > lockres->l_level)) {
				2058	/* Trylock failed asynchronously */
				2059	BUG_ON(!trylock);
				2060	ret = -EAGAIN;
				2061	}
				2062
				2063	out:
				2064
				2065	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
				2066	lockres->l_name, ex, trylock, ret);
				2067	return ret;
				2068	}
				2069
				2070	void ocfs2_file_unlock(struct file *file)
				2071	{
				2072	int ret;
				2073	unsigned int gen;
				2074	unsigned long flags;
				2075	struct ocfs2_file_private *fp = file->private_data;
				2076	struct ocfs2_lock_res *lockres = &fp->fp_flock;
				2077	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
				2078	struct ocfs2_mask_waiter mw;
				2079
				2080	ocfs2_init_mask_waiter(&mw);
				2081
				2082	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
				2083	return;
				2084
				2085	if (lockres->l_level == DLM_LOCK_NL)
				2086	return;
				2087
				2088	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
				2089	lockres->l_name, lockres->l_flags, lockres->l_level,
				2090	lockres->l_action);
				2091
				2092	spin_lock_irqsave(&lockres->l_lock, flags);
				2093	/*
				2094	* Fake a blocking ast for the downconvert code.
				2095	*/
				2096	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
				2097	lockres->l_blocking = DLM_LOCK_EX;
				2098
				2099	gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
				2100	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
				2101	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2102
				2103	ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
				2104	if (ret) {
				2105	mlog_errno(ret);
				2106	return;
				2107	}
				2108
				2109	ret = ocfs2_wait_for_mask(&mw);
				2110	if (ret)
				2111	mlog_errno(ret);
				2112	}
				2113
				2114	static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
				2115	struct ocfs2_lock_res *lockres)
				2116	{
				2117	int kick = 0;
				2118
				2119	/* If we know that another node is waiting on our lock, kick
				2120	* the downconvert thread * pre-emptively when we reach a release
				2121	* condition. */
				2122	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
				2123	switch(lockres->l_blocking) {
				2124	case DLM_LOCK_EX:
				2125	if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				2126	kick = 1;
				2127	break;
				2128	case DLM_LOCK_PR:
				2129	if (!lockres->l_ex_holders)
				2130	kick = 1;
				2131	break;
				2132	default:
				2133	BUG();
				2134	}
				2135	}
				2136
				2137	if (kick)
				2138	ocfs2_wake_downconvert_thread(osb);
				2139	}
				2140
				2141	#define OCFS2_SEC_BITS 34
				2142	#define OCFS2_SEC_SHIFT (64 - 34)
				2143	#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
				2144
				2145	/* LVB only has room for 64 bits of time here so we pack it for
				2146	* now. */
				2147	static u64 ocfs2_pack_timespec(struct timespec64 *spec)
				2148	{
				2149	u64 res;
				2150	u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
				2151	u32 nsec = spec->tv_nsec;
				2152
				2153	res = (sec << OCFS2_SEC_SHIFT) \| (nsec & OCFS2_NSEC_MASK);
				2154
				2155	return res;
				2156	}
				2157
				2158	/* Call this with the lockres locked. I am reasonably sure we don't
				2159	* need ip_lock in this function as anyone who would be changing those
				2160	* values is supposed to be blocked in ocfs2_inode_lock right now. */
				2161	static void __ocfs2_stuff_meta_lvb(struct inode *inode)
				2162	{
				2163	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				2164	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
				2165	struct ocfs2_meta_lvb *lvb;
				2166
				2167	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2168
				2169	/*
				2170	* Invalidate the LVB of a deleted inode - this way other
				2171	* nodes are forced to go to disk and discover the new inode
				2172	* status.
				2173	*/
				2174	if (oi->ip_flags & OCFS2_INODE_DELETED) {
				2175	lvb->lvb_version = 0;
				2176	goto out;
				2177	}
				2178
				2179	lvb->lvb_version = OCFS2_LVB_VERSION;
				2180	lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
				2181	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
				2182	lvb->lvb_iuid = cpu_to_be32(i_uid_read(inode));
				2183	lvb->lvb_igid = cpu_to_be32(i_gid_read(inode));
				2184	lvb->lvb_imode = cpu_to_be16(inode->i_mode);
				2185	lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
				2186	lvb->lvb_iatime_packed =
				2187	cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
				2188	lvb->lvb_ictime_packed =
				2189	cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
				2190	lvb->lvb_imtime_packed =
				2191	cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
				2192	lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
				2193	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
				2194	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
				2195
				2196	out:
				2197	mlog_meta_lvb(0, lockres);
				2198	}
				2199
				2200	static void ocfs2_unpack_timespec(struct timespec64 *spec,
				2201	u64 packed_time)
				2202	{
				2203	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
				2204	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
				2205	}
				2206
				2207	static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
				2208	{
				2209	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				2210	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
				2211	struct ocfs2_meta_lvb *lvb;
				2212
				2213	mlog_meta_lvb(0, lockres);
				2214
				2215	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2216
				2217	/* We're safe here without the lockres lock... */
				2218	spin_lock(&oi->ip_lock);
				2219	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
				2220	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
				2221
				2222	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
				2223	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
				2224	ocfs2_set_inode_flags(inode);
				2225
				2226	/* fast-symlinks are a special case */
				2227	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
				2228	inode->i_blocks = 0;
				2229	else
				2230	inode->i_blocks = ocfs2_inode_sector_count(inode);
				2231
				2232	i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
				2233	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
				2234	inode->i_mode = be16_to_cpu(lvb->lvb_imode);
				2235	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
				2236	ocfs2_unpack_timespec(&inode->i_atime,
				2237	be64_to_cpu(lvb->lvb_iatime_packed));
				2238	ocfs2_unpack_timespec(&inode->i_mtime,
				2239	be64_to_cpu(lvb->lvb_imtime_packed));
				2240	ocfs2_unpack_timespec(&inode->i_ctime,
				2241	be64_to_cpu(lvb->lvb_ictime_packed));
				2242	spin_unlock(&oi->ip_lock);
				2243	}
				2244
				2245	static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
				2246	struct ocfs2_lock_res *lockres)
				2247	{
				2248	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2249
				2250	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
				2251	&& lvb->lvb_version == OCFS2_LVB_VERSION
				2252	&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
				2253	return 1;
				2254	return 0;
				2255	}
				2256
				2257	/* Determine whether a lock resource needs to be refreshed, and
				2258	* arbitrate who gets to refresh it.
				2259	*
				2260	* 0 means no refresh needed.
				2261	*
				2262	* > 0 means you need to refresh this and you MUST call
				2263	* ocfs2_complete_lock_res_refresh afterwards. */
				2264	static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
				2265	{
				2266	unsigned long flags;
				2267	int status = 0;
				2268
				2269	refresh_check:
				2270	spin_lock_irqsave(&lockres->l_lock, flags);
				2271	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
				2272	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2273	goto bail;
				2274	}
				2275
				2276	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
				2277	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2278
				2279	ocfs2_wait_on_refreshing_lock(lockres);
				2280	goto refresh_check;
				2281	}
				2282
				2283	/* Ok, I'll be the one to refresh this lock. */
				2284	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
				2285	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2286
				2287	status = 1;
				2288	bail:
				2289	mlog(0, "status %d\n", status);
				2290	return status;
				2291	}
				2292
				2293	/* If status is non zero, I'll mark it as not being in refresh
				2294	* anymroe, but i won't clear the needs refresh flag. */
				2295	static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
				2296	int status)
				2297	{
				2298	unsigned long flags;
				2299
				2300	spin_lock_irqsave(&lockres->l_lock, flags);
				2301	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
				2302	if (!status)
				2303	lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
				2304	spin_unlock_irqrestore(&lockres->l_lock, flags);
				2305
				2306	wake_up(&lockres->l_event);
				2307	}
				2308
				2309	/* may or may not return a bh if it went to disk. */
				2310	static int ocfs2_inode_lock_update(struct inode *inode,
				2311	struct buffer_head **bh)
				2312	{
				2313	int status = 0;
				2314	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				2315	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
				2316	struct ocfs2_dinode *fe;
				2317	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2318
				2319	if (ocfs2_mount_local(osb))
				2320	goto bail;
				2321
				2322	spin_lock(&oi->ip_lock);
				2323	if (oi->ip_flags & OCFS2_INODE_DELETED) {
				2324	mlog(0, "Orphaned inode %llu was deleted while we "
				2325	"were waiting on a lock. ip_flags = 0x%x\n",
				2326	(unsigned long long)oi->ip_blkno, oi->ip_flags);
				2327	spin_unlock(&oi->ip_lock);
				2328	status = -ENOENT;
				2329	goto bail;
				2330	}
				2331	spin_unlock(&oi->ip_lock);
				2332
				2333	if (!ocfs2_should_refresh_lock_res(lockres))
				2334	goto bail;
				2335
				2336	/* This will discard any caching information we might have had
				2337	* for the inode metadata. */
				2338	ocfs2_metadata_cache_purge(INODE_CACHE(inode));
				2339
				2340	ocfs2_extent_map_trunc(inode, 0);
				2341
				2342	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
				2343	mlog(0, "Trusting LVB on inode %llu\n",
				2344	(unsigned long long)oi->ip_blkno);
				2345	ocfs2_refresh_inode_from_lvb(inode);
				2346	} else {
				2347	/* Boo, we have to go to disk. */
				2348	/* read bh, cast, ocfs2_refresh_inode */
				2349	status = ocfs2_read_inode_block(inode, bh);
				2350	if (status < 0) {
				2351	mlog_errno(status);
				2352	goto bail_refresh;
				2353	}
				2354	fe = (struct ocfs2_dinode ) (bh)->b_data;
				2355
				2356	/* This is a good chance to make sure we're not
				2357	* locking an invalid object. ocfs2_read_inode_block()
				2358	* already checked that the inode block is sane.
				2359	*
				2360	* We bug on a stale inode here because we checked
				2361	* above whether it was wiped from disk. The wiping
				2362	* node provides a guarantee that we receive that
				2363	* message and can mark the inode before dropping any
				2364	* locks associated with it. */
				2365	mlog_bug_on_msg(inode->i_generation !=
				2366	le32_to_cpu(fe->i_generation),
				2367	"Invalid dinode %llu disk generation: %u "
				2368	"inode->i_generation: %u\n",
				2369	(unsigned long long)oi->ip_blkno,
				2370	le32_to_cpu(fe->i_generation),
				2371	inode->i_generation);
				2372	mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) \|\|
				2373	!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				2374	"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				2375	(unsigned long long)oi->ip_blkno,
				2376	(unsigned long long)le64_to_cpu(fe->i_dtime),
				2377	le32_to_cpu(fe->i_flags));
				2378
				2379	ocfs2_refresh_inode(inode, fe);
				2380	ocfs2_track_lock_refresh(lockres);
				2381	}
				2382
				2383	status = 0;
				2384	bail_refresh:
				2385	ocfs2_complete_lock_res_refresh(lockres, status);
				2386	bail:
				2387	return status;
				2388	}
				2389
				/*
				 * Hand the caller a referenced inode buffer_head in *@ret_bh.
				 *
				 * If @passed_bh is non-NULL (the update already went to disk for us),
				 * an extra reference is taken on it and it is returned.  Otherwise the
				 * inode block is read.  Returns 0 or the ocfs2_read_inode_block()
				 * status; negative values are logged.
				 */
				static int ocfs2_assign_bh(struct inode *inode,
						struct buffer_head **ret_bh,
						struct buffer_head *passed_bh)
				{
					int status;

					if (!passed_bh) {
						status = ocfs2_read_inode_block(inode, ret_bh);
						if (status < 0)
							mlog_errno(status);
						return status;
					}

					/* Ok, the update went to disk for us, use the returned bh. */
					*ret_bh = passed_bh;
					get_bh(passed_bh);

					return 0;
				}
				2411
				2412	/*
				2413	* returns < 0 error if the callback will never be called, otherwise
				2414	* the result of the lock will be communicated via the callback.
				2415	*/
				2416	int ocfs2_inode_lock_full_nested(struct inode *inode,
				2417	struct buffer_head **ret_bh,
				2418	int ex,
				2419	int arg_flags,
				2420	int subclass)
				2421	{
				2422	int status, level, acquired;
				2423	u32 dlm_flags;
				2424	struct ocfs2_lock_res *lockres = NULL;
				2425	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2426	struct buffer_head *local_bh = NULL;
				2427
				2428	mlog(0, "inode %llu, take %s META lock\n",
				2429	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				2430	ex ? "EXMODE" : "PRMODE");
				2431
				2432	status = 0;
				2433	acquired = 0;
				2434	/* We'll allow faking a readonly metadata lock for
				2435	* rodevices. */
				2436	if (ocfs2_is_hard_readonly(osb)) {
				2437	if (ex)
				2438	status = -EROFS;
				2439	goto getbh;
				2440	}
				2441
				2442	if ((arg_flags & OCFS2_META_LOCK_GETBH) \|\|
				2443	ocfs2_mount_local(osb))
				2444	goto update;
				2445
				2446	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
				2447	ocfs2_wait_for_recovery(osb);
				2448
				2449	lockres = &OCFS2_I(inode)->ip_inode_lockres;
				2450	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2451	dlm_flags = 0;
				2452	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
				2453	dlm_flags \|= DLM_LKF_NOQUEUE;
				2454
				2455	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
				2456	arg_flags, subclass, _RET_IP_);
				2457	if (status < 0) {
				2458	if (status != -EAGAIN)
				2459	mlog_errno(status);
				2460	goto bail;
				2461	}
				2462
				2463	/* Notify the error cleanup path to drop the cluster lock. */
				2464	acquired = 1;
				2465
				2466	/* We wait twice because a node may have died while we were in
				2467	* the lower dlm layers. The second time though, we've
				2468	* committed to owning this lock so we don't allow signals to
				2469	* abort the operation. */
				2470	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
				2471	ocfs2_wait_for_recovery(osb);
				2472
				2473	update:
				2474	/*
				2475	* We only see this flag if we're being called from
				2476	* ocfs2_read_locked_inode(). It means we're locking an inode
				2477	* which hasn't been populated yet, so clear the refresh flag
				2478	* and let the caller handle it.
				2479	*/
				2480	if (inode->i_state & I_NEW) {
				2481	status = 0;
				2482	if (lockres)
				2483	ocfs2_complete_lock_res_refresh(lockres, 0);
				2484	goto bail;
				2485	}
				2486
				2487	/* This is fun. The caller may want a bh back, or it may
				2488	* not. ocfs2_inode_lock_update definitely wants one in, but
				2489	* may or may not read one, depending on what's in the
				2490	* LVB. The result of all of this is that we've only gone to
				2491	* disk if we have to, so the complexity is worthwhile. */
				2492	status = ocfs2_inode_lock_update(inode, &local_bh);
				2493	if (status < 0) {
				2494	if (status != -ENOENT)
				2495	mlog_errno(status);
				2496	goto bail;
				2497	}
				2498	getbh:
				2499	if (ret_bh) {
				2500	status = ocfs2_assign_bh(inode, ret_bh, local_bh);
				2501	if (status < 0) {
				2502	mlog_errno(status);
				2503	goto bail;
				2504	}
				2505	}
				2506
				2507	bail:
				2508	if (status < 0) {
				2509	if (ret_bh && (*ret_bh)) {
				2510	brelse(*ret_bh);
				2511	*ret_bh = NULL;
				2512	}
				2513	if (acquired)
				2514	ocfs2_inode_unlock(inode, ex);
				2515	}
				2516
				2517	brelse(local_bh);
				2518	return status;
				2519	}
				2520
				2521	/*
				2522	* This is working around a lock inversion between tasks acquiring DLM
				2523	* locks while holding a page lock and the downconvert thread which
				2524	* blocks dlm lock acquiry while acquiring page locks.
				2525	*
				2526	* ** These _with_page variantes are only intended to be called from aop
				2527	* methods that hold page locks and return a very specific positive error
				2528	* code that aop methods pass up to the VFS -- test for errors with != 0. **
				2529	*
				2530	* The DLM is called such that it returns -EAGAIN if it would have
				2531	* blocked waiting for the downconvert thread. In that case we unlock
				2532	* our page so the downconvert thread can make progress. Once we've
				2533	* done this we have to return AOP_TRUNCATED_PAGE so the aop method
				2534	* that called us can bubble that back up into the VFS who will then
				2535	* immediately retry the aop call.
				2536	*/
				2537	int ocfs2_inode_lock_with_page(struct inode *inode,
				2538	struct buffer_head **ret_bh,
				2539	int ex,
				2540	struct page *page)
				2541	{
				2542	int ret;
				2543
				2544	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
				2545	if (ret == -EAGAIN) {
				2546	unlock_page(page);
				2547	/*
				2548	* If we can't get inode lock immediately, we should not return
				2549	* directly here, since this will lead to a softlockup problem.
				2550	* The method is to get a blocking lock and immediately unlock
				2551	* before returning, this can avoid CPU resource waste due to
				2552	* lots of retries, and benefits fairness in getting lock.
				2553	*/
				2554	if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
				2555	ocfs2_inode_unlock(inode, ex);
				2556	ret = AOP_TRUNCATED_PAGE;
				2557	}
				2558
				2559	return ret;
				2560	}
				2561
				2562	int ocfs2_inode_lock_atime(struct inode *inode,
				2563	struct vfsmount *vfsmnt,
				2564	int *level, int wait)
				2565	{
				2566	int ret;
				2567
				2568	if (wait)
				2569	ret = ocfs2_inode_lock(inode, NULL, 0);
				2570	else
				2571	ret = ocfs2_try_inode_lock(inode, NULL, 0);
				2572
				2573	if (ret < 0) {
				2574	if (ret != -EAGAIN)
				2575	mlog_errno(ret);
				2576	return ret;
				2577	}
				2578
				2579	/*
				2580	* If we should update atime, we will get EX lock,
				2581	* otherwise we just get PR lock.
				2582	*/
				2583	if (ocfs2_should_update_atime(inode, vfsmnt)) {
				2584	struct buffer_head *bh = NULL;
				2585
				2586	ocfs2_inode_unlock(inode, 0);
				2587	if (wait)
				2588	ret = ocfs2_inode_lock(inode, &bh, 1);
				2589	else
				2590	ret = ocfs2_try_inode_lock(inode, &bh, 1);
				2591
				2592	if (ret < 0) {
				2593	if (ret != -EAGAIN)
				2594	mlog_errno(ret);
				2595	return ret;
				2596	}
				2597	*level = 1;
				2598	if (ocfs2_should_update_atime(inode, vfsmnt))
				2599	ocfs2_update_inode_atime(inode, bh);
				2600	brelse(bh);
				2601	} else
				2602	*level = 0;
				2603
				2604	return ret;
				2605	}
				2606
				2607	void ocfs2_inode_unlock(struct inode *inode,
				2608	int ex)
				2609	{
				2610	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2611	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
				2612	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				2613
				2614	mlog(0, "inode %llu drop %s META lock\n",
				2615	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				2616	ex ? "EXMODE" : "PRMODE");
				2617
				2618	if (!ocfs2_is_hard_readonly(osb) &&
				2619	!ocfs2_mount_local(osb))
				2620	ocfs2_cluster_unlock(osb, lockres, level);
				2621	}
				2622
				/*
				 * These _tracker variants are introduced to deal with the recursive cluster
				 * locking issue. The idea is to keep track of a lock holder on the stack of
				 * the current process. If there's a lock holder on the stack, we know the
				 * task context is already protected by cluster locking. Currently, they're
				 * used in some VFS entry routines.
				 *
				 * return < 0 on error, return == 0 if there's no lock holder on the stack
				 * before this call, return == 1 if this call would be a recursive locking.
				 * return == -EINVAL if this lock attempt would cause an upgrade, which is
				 * forbidden.
				 *
				 * When taking lock levels into account, we face some different situations.
				 *
				 * 1. no lock is held
				 *    In this case, just lock the inode as requested and return 0
				 *
				 * 2. We are holding a lock
				 *    For this situation, things diverge into several cases
				 *
				 *    wanted     holding     what to do
				 *    ex         ex          see 2.1 below
				 *    ex         pr          see 2.2 below
				 *    pr         ex          see 2.1 below
				 *    pr         pr          see 2.1 below
				 *
				 *    2.1 The lock level that is being held is compatible
				 *    with the wanted level, so no lock action will be taken.
				 *
				 *    2.2 Otherwise, an upgrade is needed, but it is forbidden.
				 *
				 * The reason why an upgrade within a process is forbidden is that
				 * a lock upgrade may cause a deadlock. The following illustrates
				 * how it happens.
				 *
				 *         thread on node1                             thread on node2
				 * ocfs2_inode_lock_tracker(ex=0)
				 *
				 *                                <======   ocfs2_inode_lock_tracker(ex=1)
				 *
				 * ocfs2_inode_lock_tracker(ex=1)
				 */
				2664	int ocfs2_inode_lock_tracker(struct inode *inode,
				2665	struct buffer_head **ret_bh,
				2666	int ex,
				2667	struct ocfs2_lock_holder *oh)
				2668	{
				2669	int status = 0;
				2670	struct ocfs2_lock_res *lockres;
				2671	struct ocfs2_lock_holder *tmp_oh;
				2672	struct pid *pid = task_pid(current);
				2673
				2674
				2675	lockres = &OCFS2_I(inode)->ip_inode_lockres;
				2676	tmp_oh = ocfs2_pid_holder(lockres, pid);
				2677
				2678	if (!tmp_oh) {
				2679	/*
				2680	* This corresponds to the case 1.
				2681	* We haven't got any lock before.
				2682	*/
				2683	status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0);
				2684	if (status < 0) {
				2685	if (status != -ENOENT)
				2686	mlog_errno(status);
				2687	return status;
				2688	}
				2689
				2690	oh->oh_ex = ex;
				2691	ocfs2_add_holder(lockres, oh);
				2692	return 0;
				2693	}
				2694
				2695	if (unlikely(ex && !tmp_oh->oh_ex)) {
				2696	/*
				2697	* case 2.2 upgrade may cause dead lock, forbid it.
				2698	*/
				2699	mlog(ML_ERROR, "Recursive locking is not permitted to "
				2700	"upgrade to EX level from PR level.\n");
				2701	dump_stack();
				2702	return -EINVAL;
				2703	}
				2704
				2705	/*
				2706	* case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full.
				2707	* ignore the lock level and just update it.
				2708	*/
				2709	if (ret_bh) {
				2710	status = ocfs2_inode_lock_full(inode, ret_bh, ex,
				2711	OCFS2_META_LOCK_GETBH);
				2712	if (status < 0) {
				2713	if (status != -ENOENT)
				2714	mlog_errno(status);
				2715	return status;
				2716	}
				2717	}
				2718	return tmp_oh ? 1 : 0;
				2719	}
				2720
				2721	void ocfs2_inode_unlock_tracker(struct inode *inode,
				2722	int ex,
				2723	struct ocfs2_lock_holder *oh,
				2724	int had_lock)
				2725	{
				2726	struct ocfs2_lock_res *lockres;
				2727
				2728	lockres = &OCFS2_I(inode)->ip_inode_lockres;
				2729	/* had_lock means that the currect process already takes the cluster
				2730	* lock previously.
				2731	* If had_lock is 1, we have nothing to do here.
				2732	* If had_lock is 0, we will release the lock.
				2733	*/
				2734	if (!had_lock) {
				2735	ocfs2_inode_unlock(inode, oh->oh_ex);
				2736	ocfs2_remove_holder(lockres, oh);
				2737	}
				2738	}
				2739
				2740	int ocfs2_orphan_scan_lock(struct ocfs2_super osb, u32 seqno)
				2741	{
				2742	struct ocfs2_lock_res *lockres;
				2743	struct ocfs2_orphan_scan_lvb *lvb;
				2744	int status = 0;
				2745
				2746	if (ocfs2_is_hard_readonly(osb))
				2747	return -EROFS;
				2748
				2749	if (ocfs2_mount_local(osb))
				2750	return 0;
				2751
				2752	lockres = &osb->osb_orphan_scan.os_lockres;
				2753	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
				2754	if (status < 0)
				2755	return status;
				2756
				2757	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2758	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
				2759	lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
				2760	*seqno = be32_to_cpu(lvb->lvb_os_seqno);
				2761	else
				2762	*seqno = osb->osb_orphan_scan.os_seqno + 1;
				2763
				2764	return status;
				2765	}
				2766
				2767	void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
				2768	{
				2769	struct ocfs2_lock_res *lockres;
				2770	struct ocfs2_orphan_scan_lvb *lvb;
				2771
				2772	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
				2773	lockres = &osb->osb_orphan_scan.os_lockres;
				2774	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2775	lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
				2776	lvb->lvb_os_seqno = cpu_to_be32(seqno);
				2777	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
				2778	}
				2779	}
				2780
				2781	int ocfs2_super_lock(struct ocfs2_super *osb,
				2782	int ex)
				2783	{
				2784	int status = 0;
				2785	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2786	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
				2787
				2788	if (ocfs2_is_hard_readonly(osb))
				2789	return -EROFS;
				2790
				2791	if (ocfs2_mount_local(osb))
				2792	goto bail;
				2793
				2794	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
				2795	if (status < 0) {
				2796	mlog_errno(status);
				2797	goto bail;
				2798	}
				2799
				2800	/* The super block lock path is really in the best position to
				2801	* know when resources covered by the lock need to be
				2802	* refreshed, so we do it here. Of course, making sense of
				2803	* everything is up to the caller :) */
				2804	status = ocfs2_should_refresh_lock_res(lockres);
				2805	if (status) {
				2806	status = ocfs2_refresh_slot_info(osb);
				2807
				2808	ocfs2_complete_lock_res_refresh(lockres, status);
				2809
				2810	if (status < 0) {
				2811	ocfs2_cluster_unlock(osb, lockres, level);
				2812	mlog_errno(status);
				2813	}
				2814	ocfs2_track_lock_refresh(lockres);
				2815	}
				2816	bail:
				2817	return status;
				2818	}
				2819
				2820	void ocfs2_super_unlock(struct ocfs2_super *osb,
				2821	int ex)
				2822	{
				2823	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2824	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
				2825
				2826	if (!ocfs2_mount_local(osb))
				2827	ocfs2_cluster_unlock(osb, lockres, level);
				2828	}
				2829
				2830	int ocfs2_rename_lock(struct ocfs2_super *osb)
				2831	{
				2832	int status;
				2833	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
				2834
				2835	if (ocfs2_is_hard_readonly(osb))
				2836	return -EROFS;
				2837
				2838	if (ocfs2_mount_local(osb))
				2839	return 0;
				2840
				2841	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
				2842	if (status < 0)
				2843	mlog_errno(status);
				2844
				2845	return status;
				2846	}
				2847
				2848	void ocfs2_rename_unlock(struct ocfs2_super *osb)
				2849	{
				2850	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
				2851
				2852	if (!ocfs2_mount_local(osb))
				2853	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
				2854	}
				2855
				2856	int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
				2857	{
				2858	int status;
				2859	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
				2860
				2861	if (ocfs2_is_hard_readonly(osb))
				2862	return -EROFS;
				2863
				2864	if (ex)
				2865	down_write(&osb->nfs_sync_rwlock);
				2866	else
				2867	down_read(&osb->nfs_sync_rwlock);
				2868
				2869	if (ocfs2_mount_local(osb))
				2870	return 0;
				2871
				2872	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
				2873	0, 0);
				2874	if (status < 0) {
				2875	mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
				2876
				2877	if (ex)
				2878	up_write(&osb->nfs_sync_rwlock);
				2879	else
				2880	up_read(&osb->nfs_sync_rwlock);
				2881	}
				2882
				2883	return status;
				2884	}
				2885
				2886	void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
				2887	{
				2888	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
				2889
				2890	if (!ocfs2_mount_local(osb))
				2891	ocfs2_cluster_unlock(osb, lockres,
				2892	ex ? LKM_EXMODE : LKM_PRMODE);
				2893	if (ex)
				2894	up_write(&osb->nfs_sync_rwlock);
				2895	else
				2896	up_read(&osb->nfs_sync_rwlock);
				2897	}
				2898
				2899	int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
				2900	struct ocfs2_trim_fs_info *info, int trylock)
				2901	{
				2902	int status;
				2903	struct ocfs2_trim_fs_lvb *lvb;
				2904	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
				2905
				2906	if (info)
				2907	info->tf_valid = 0;
				2908
				2909	if (ocfs2_is_hard_readonly(osb))
				2910	return -EROFS;
				2911
				2912	if (ocfs2_mount_local(osb))
				2913	return 0;
				2914
				2915	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
				2916	trylock ? DLM_LKF_NOQUEUE : 0, 0);
				2917	if (status < 0) {
				2918	if (status != -EAGAIN)
				2919	mlog_errno(status);
				2920	return status;
				2921	}
				2922
				2923	if (info) {
				2924	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2925	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
				2926	lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
				2927	info->tf_valid = 1;
				2928	info->tf_success = lvb->lvb_success;
				2929	info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
				2930	info->tf_start = be64_to_cpu(lvb->lvb_start);
				2931	info->tf_len = be64_to_cpu(lvb->lvb_len);
				2932	info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
				2933	info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
				2934	}
				2935	}
				2936
				2937	return status;
				2938	}
				2939
				2940	void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
				2941	struct ocfs2_trim_fs_info *info)
				2942	{
				2943	struct ocfs2_trim_fs_lvb *lvb;
				2944	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
				2945
				2946	if (ocfs2_mount_local(osb))
				2947	return;
				2948
				2949	if (info) {
				2950	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				2951	lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
				2952	lvb->lvb_success = info->tf_success;
				2953	lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
				2954	lvb->lvb_start = cpu_to_be64(info->tf_start);
				2955	lvb->lvb_len = cpu_to_be64(info->tf_len);
				2956	lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
				2957	lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
				2958	}
				2959
				2960	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
				2961	}
				2962
				2963	int ocfs2_dentry_lock(struct dentry *dentry, int ex)
				2964	{
				2965	int ret;
				2966	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2967	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
				2968	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
				2969
				2970	BUG_ON(!dl);
				2971
				2972	if (ocfs2_is_hard_readonly(osb)) {
				2973	if (ex)
				2974	return -EROFS;
				2975	return 0;
				2976	}
				2977
				2978	if (ocfs2_mount_local(osb))
				2979	return 0;
				2980
				2981	ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
				2982	if (ret < 0)
				2983	mlog_errno(ret);
				2984
				2985	return ret;
				2986	}
				2987
				2988	void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
				2989	{
				2990	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				2991	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
				2992	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
				2993
				2994	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
				2995	ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
				2996	}
				2997
				2998	/* Reference counting of the dlm debug structure. We want this because
				2999	* open references on the debug inodes can live on after a mount, so
				3000	* we can't rely on the ocfs2_super to always exist. */
				3001	static void ocfs2_dlm_debug_free(struct kref *kref)
				3002	{
				3003	struct ocfs2_dlm_debug *dlm_debug;
				3004
				3005	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
				3006
				3007	kfree(dlm_debug);
				3008	}
				3009
				3010	void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
				3011	{
				3012	if (dlm_debug)
				3013	kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
				3014	}
				3015
				3016	static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
				3017	{
				3018	kref_get(&debug->d_refcnt);
				3019	}
				3020
				3021	struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
				3022	{
				3023	struct ocfs2_dlm_debug *dlm_debug;
				3024
				3025	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
				3026	if (!dlm_debug) {
				3027	mlog_errno(-ENOMEM);
				3028	goto out;
				3029	}
				3030
				3031	kref_init(&dlm_debug->d_refcnt);
				3032	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
				3033	dlm_debug->d_filter_secs = 0;
				3034	out:
				3035	return dlm_debug;
				3036	}
				3037
				3038	/* Access to this is arbitrated for us via seq_file->sem. */
				3039	struct ocfs2_dlm_seq_priv {
				3040	struct ocfs2_dlm_debug *p_dlm_debug;
				3041	struct ocfs2_lock_res p_iter_res;
				3042	struct ocfs2_lock_res p_tmp_res;
				3043	};
				3044
				3045	static struct ocfs2_lock_res ocfs2_dlm_next_res(struct ocfs2_lock_res start,
				3046	struct ocfs2_dlm_seq_priv *priv)
				3047	{
				3048	struct ocfs2_lock_res iter, ret = NULL;
				3049	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
				3050
				3051	assert_spin_locked(&ocfs2_dlm_tracking_lock);
				3052
				3053	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
				3054	/* discover the head of the list */
				3055	if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
				3056	mlog(0, "End of list found, %p\n", ret);
				3057	break;
				3058	}
				3059
				3060	/* We track our "dummy" iteration lockres' by a NULL
				3061	* l_ops field. */
				3062	if (iter->l_ops != NULL) {
				3063	ret = iter;
				3064	break;
				3065	}
				3066	}
				3067
				3068	return ret;
				3069	}
				3070
				3071	static void ocfs2_dlm_seq_start(struct seq_file m, loff_t *pos)
				3072	{
				3073	struct ocfs2_dlm_seq_priv *priv = m->private;
				3074	struct ocfs2_lock_res *iter;
				3075
				3076	spin_lock(&ocfs2_dlm_tracking_lock);
				3077	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
				3078	if (iter) {
				3079	/* Since lockres' have the lifetime of their container
				3080	* (which can be inodes, ocfs2_supers, etc) we want to
				3081	* copy this out to a temporary lockres while still
				3082	* under the spinlock. Obviously after this we can't
				3083	* trust any pointers on the copy returned, but that's
				3084	* ok as the information we want isn't typically held
				3085	* in them. */
				3086	priv->p_tmp_res = *iter;
				3087	iter = &priv->p_tmp_res;
				3088	}
				3089	spin_unlock(&ocfs2_dlm_tracking_lock);
				3090
				3091	return iter;
				3092	}
				3093
				/* seq_file ->stop callback; there is nothing to undo. */
				static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
				{
				}
				3097
				3098	static void ocfs2_dlm_seq_next(struct seq_file m, void v, loff_t pos)
				3099	{
				3100	struct ocfs2_dlm_seq_priv *priv = m->private;
				3101	struct ocfs2_lock_res *iter = v;
				3102	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
				3103
				3104	(*pos)++;
				3105	spin_lock(&ocfs2_dlm_tracking_lock);
				3106	iter = ocfs2_dlm_next_res(iter, priv);
				3107	list_del_init(&dummy->l_debug_list);
				3108	if (iter) {
				3109	list_add(&dummy->l_debug_list, &iter->l_debug_list);
				3110	priv->p_tmp_res = *iter;
				3111	iter = &priv->p_tmp_res;
				3112	}
				3113	spin_unlock(&ocfs2_dlm_tracking_lock);
				3114
				3115	return iter;
				3116	}
				3117
				3118	/*
				3119	* Version is used by debugfs.ocfs2 to determine the format being used
				3120	*
				3121	* New in version 2
				3122	* - Lock stats printed
				3123	* New in version 3
				3124	* - Max time in lock stats is in usecs (instead of nsecs)
				3125	* New in version 4
				3126	* - Add last pr/ex unlock times and first lock wait time in usecs
				3127	*/
				3128	#define OCFS2_DLM_DEBUG_STR_VERSION 4
				3129	static int ocfs2_dlm_seq_show(struct seq_file m, void v)
				3130	{
				3131	int i;
				3132	char *lvb;
				3133	struct ocfs2_lock_res *lockres = v;
				3134	#ifdef CONFIG_OCFS2_FS_STATS
				3135	u64 now, last;
				3136	struct ocfs2_dlm_debug *dlm_debug =
				3137	((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug;
				3138	#endif
				3139
				3140	if (!lockres)
				3141	return -EINVAL;
				3142
				3143	#ifdef CONFIG_OCFS2_FS_STATS
				3144	if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
				3145	now = ktime_to_us(ktime_get_real());
				3146	if (lockres->l_lock_prmode.ls_last >
				3147	lockres->l_lock_exmode.ls_last)
				3148	last = lockres->l_lock_prmode.ls_last;
				3149	else
				3150	last = lockres->l_lock_exmode.ls_last;
				3151	/*
				3152	* Use d_filter_secs field to filter lock resources dump,
				3153	* the default d_filter_secs(0) value filters nothing,
				3154	* otherwise, only dump the last N seconds active lock
				3155	* resources.
				3156	*/
				3157	if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs)
				3158	return 0;
				3159	}
				3160	#endif
				3161
				3162	seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
				3163
				3164	if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
				3165	seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
				3166	lockres->l_name,
				3167	(unsigned int)ocfs2_get_dentry_lock_ino(lockres));
				3168	else
				3169	seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);
				3170
				3171	seq_printf(m, "%d\t"
				3172	"0x%lx\t"
				3173	"0x%x\t"
				3174	"0x%x\t"
				3175	"%u\t"
				3176	"%u\t"
				3177	"%d\t"
				3178	"%d\t",
				3179	lockres->l_level,
				3180	lockres->l_flags,
				3181	lockres->l_action,
				3182	lockres->l_unlock_action,
				3183	lockres->l_ro_holders,
				3184	lockres->l_ex_holders,
				3185	lockres->l_requested,
				3186	lockres->l_blocking);
				3187
				3188	/* Dump the raw LVB */
				3189	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				3190	for(i = 0; i < DLM_LVB_LEN; i++)
				3191	seq_printf(m, "0x%x\t", lvb[i]);
				3192
				3193	#ifdef CONFIG_OCFS2_FS_STATS
				3194	# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
				3195	# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
				3196	# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
				3197	# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
				3198	# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
				3199	# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
				3200	# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
				3201	# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
				3202	# define lock_refresh(_l) ((_l)->l_lock_refresh)
				3203	# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last)
				3204	# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last)
				3205	# define lock_wait(_l) ((_l)->l_lock_wait)
				3206	#else
				3207	# define lock_num_prmode(_l) (0)
				3208	# define lock_num_exmode(_l) (0)
				3209	# define lock_num_prmode_failed(_l) (0)
				3210	# define lock_num_exmode_failed(_l) (0)
				3211	# define lock_total_prmode(_l) (0ULL)
				3212	# define lock_total_exmode(_l) (0ULL)
				3213	# define lock_max_prmode(_l) (0)
				3214	# define lock_max_exmode(_l) (0)
				3215	# define lock_refresh(_l) (0)
				3216	# define lock_last_prmode(_l) (0ULL)
				3217	# define lock_last_exmode(_l) (0ULL)
				3218	# define lock_wait(_l) (0ULL)
				3219	#endif
				3220	/* The following seq_print was added in version 2 of this output */
				3221	seq_printf(m, "%u\t"
				3222	"%u\t"
				3223	"%u\t"
				3224	"%u\t"
				3225	"%llu\t"
				3226	"%llu\t"
				3227	"%u\t"
				3228	"%u\t"
				3229	"%u\t"
				3230	"%llu\t"
				3231	"%llu\t"
				3232	"%llu\t",
				3233	lock_num_prmode(lockres),
				3234	lock_num_exmode(lockres),
				3235	lock_num_prmode_failed(lockres),
				3236	lock_num_exmode_failed(lockres),
				3237	lock_total_prmode(lockres),
				3238	lock_total_exmode(lockres),
				3239	lock_max_prmode(lockres),
				3240	lock_max_exmode(lockres),
				3241	lock_refresh(lockres),
				3242	lock_last_prmode(lockres),
				3243	lock_last_exmode(lockres),
				3244	lock_wait(lockres));
				3245
				3246	/* End the line */
				3247	seq_printf(m, "\n");
				3248	return 0;
				3249	}
				3250
				3251	static const struct seq_operations ocfs2_dlm_seq_ops = {
				3252	.start = ocfs2_dlm_seq_start,
				3253	.stop = ocfs2_dlm_seq_stop,
				3254	.next = ocfs2_dlm_seq_next,
				3255	.show = ocfs2_dlm_seq_show,
				3256	};
				3257
				3258	static int ocfs2_dlm_debug_release(struct inode inode, struct file file)
				3259	{
				3260	struct seq_file *seq = file->private_data;
				3261	struct ocfs2_dlm_seq_priv *priv = seq->private;
				3262	struct ocfs2_lock_res *res = &priv->p_iter_res;
				3263
				3264	ocfs2_remove_lockres_tracking(res);
				3265	ocfs2_put_dlm_debug(priv->p_dlm_debug);
				3266	return seq_release_private(inode, file);
				3267	}
				3268
				3269	static int ocfs2_dlm_debug_open(struct inode inode, struct file file)
				3270	{
				3271	struct ocfs2_dlm_seq_priv *priv;
				3272	struct ocfs2_super *osb;
				3273
				3274	priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
				3275	if (!priv) {
				3276	mlog_errno(-ENOMEM);
				3277	return -ENOMEM;
				3278	}
				3279
				3280	osb = inode->i_private;
				3281	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
				3282	priv->p_dlm_debug = osb->osb_dlm_debug;
				3283	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
				3284
				3285	ocfs2_add_lockres_tracking(&priv->p_iter_res,
				3286	priv->p_dlm_debug);
				3287
				3288	return 0;
				3289	}
				3290
				3291	static const struct file_operations ocfs2_dlm_debug_fops = {
				3292	.open = ocfs2_dlm_debug_open,
				3293	.release = ocfs2_dlm_debug_release,
				3294	.read = seq_read,
				3295	.llseek = seq_lseek,
				3296	};
				3297
				3298	static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
				3299	{
				3300	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
				3301
				3302	debugfs_create_file("locking_state", S_IFREG\|S_IRUSR,
				3303	osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
				3304
				3305	debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
				3306	&dlm_debug->d_filter_secs);
				3307	ocfs2_get_dlm_debug(dlm_debug);
				3308	}
				3309
				3310	static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
				3311	{
				3312	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
				3313
				3314	if (dlm_debug)
				3315	ocfs2_put_dlm_debug(dlm_debug);
				3316	}
				3317
				3318	int ocfs2_dlm_init(struct ocfs2_super *osb)
				3319	{
				3320	int status = 0;
				3321	struct ocfs2_cluster_connection *conn = NULL;
				3322
				3323	if (ocfs2_mount_local(osb)) {
				3324	osb->node_num = 0;
				3325	goto local;
				3326	}
				3327
				3328	ocfs2_dlm_init_debug(osb);
				3329
				3330	/* launch downconvert thread */
				3331	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
				3332	osb->uuid_str);
				3333	if (IS_ERR(osb->dc_task)) {
				3334	status = PTR_ERR(osb->dc_task);
				3335	osb->dc_task = NULL;
				3336	mlog_errno(status);
				3337	goto bail;
				3338	}
				3339
				3340	/* for now, uuid == domain */
				3341	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
				3342	osb->osb_cluster_name,
				3343	strlen(osb->osb_cluster_name),
				3344	osb->uuid_str,
				3345	strlen(osb->uuid_str),
				3346	&lproto, ocfs2_do_node_down, osb,
				3347	&conn);
				3348	if (status) {
				3349	mlog_errno(status);
				3350	goto bail;
				3351	}
				3352
				3353	status = ocfs2_cluster_this_node(conn, &osb->node_num);
				3354	if (status < 0) {
				3355	mlog_errno(status);
				3356	mlog(ML_ERROR,
				3357	"could not find this host's node number\n");
				3358	ocfs2_cluster_disconnect(conn, 0);
				3359	goto bail;
				3360	}
				3361
				3362	local:
				3363	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
				3364	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
				3365	ocfs2_nfs_sync_lock_init(osb);
				3366	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
				3367
				3368	osb->cconn = conn;
				3369	bail:
				3370	if (status < 0) {
				3371	ocfs2_dlm_shutdown_debug(osb);
				3372	if (osb->dc_task)
				3373	kthread_stop(osb->dc_task);
				3374	}
				3375
				3376	return status;
				3377	}
				3378
				3379	void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
				3380	int hangup_pending)
				3381	{
				3382	ocfs2_drop_osb_locks(osb);
				3383
				3384	/*
				3385	* Now that we have dropped all locks and ocfs2_dismount_volume()
				3386	* has disabled recovery, the DLM won't be talking to us. It's
				3387	* safe to tear things down before disconnecting the cluster.
				3388	*/
				3389
				3390	if (osb->dc_task) {
				3391	kthread_stop(osb->dc_task);
				3392	osb->dc_task = NULL;
				3393	}
				3394
				3395	ocfs2_lock_res_free(&osb->osb_super_lockres);
				3396	ocfs2_lock_res_free(&osb->osb_rename_lockres);
				3397	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
				3398	ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
				3399
				3400	if (osb->cconn) {
				3401	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
				3402	osb->cconn = NULL;
				3403
				3404	ocfs2_dlm_shutdown_debug(osb);
				3405	}
				3406	}
				3407
				3408	static int ocfs2_drop_lock(struct ocfs2_super *osb,
				3409	struct ocfs2_lock_res *lockres)
				3410	{
				3411	int ret;
				3412	unsigned long flags;
				3413	u32 lkm_flags = 0;
				3414
				3415	/* We didn't get anywhere near actually using this lockres. */
				3416	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
				3417	goto out;
				3418
				3419	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
				3420	lkm_flags \|= DLM_LKF_VALBLK;
				3421
				3422	spin_lock_irqsave(&lockres->l_lock, flags);
				3423
				3424	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
				3425	"lockres %s, flags 0x%lx\n",
				3426	lockres->l_name, lockres->l_flags);
				3427
				3428	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
				3429	mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
				3430	"%u, unlock_action = %u\n",
				3431	lockres->l_name, lockres->l_flags, lockres->l_action,
				3432	lockres->l_unlock_action);
				3433
				3434	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3435
				3436	/* XXX: Today we just wait on any busy
				3437	* locks... Perhaps we need to cancel converts in the
				3438	* future? */
				3439	ocfs2_wait_on_busy_lock(lockres);
				3440
				3441	spin_lock_irqsave(&lockres->l_lock, flags);
				3442	}
				3443
				3444	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
				3445	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
				3446	lockres->l_level == DLM_LOCK_EX &&
				3447	!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
				3448	lockres->l_ops->set_lvb(lockres);
				3449	}
				3450
				3451	if (lockres->l_flags & OCFS2_LOCK_BUSY)
				3452	mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
				3453	lockres->l_name);
				3454	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
				3455	mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
				3456
				3457	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
				3458	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3459	goto out;
				3460	}
				3461
				3462	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
				3463
				3464	/* make sure we never get here while waiting for an ast to
				3465	* fire. */
				3466	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
				3467
				3468	/* is this necessary? */
				3469	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
				3470	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
				3471	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3472
				3473	mlog(0, "lock %s\n", lockres->l_name);
				3474
				3475	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
				3476	if (ret) {
				3477	ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
				3478	mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
				3479	ocfs2_dlm_dump_lksb(&lockres->l_lksb);
				3480	BUG();
				3481	}
				3482	mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
				3483	lockres->l_name);
				3484
				3485	ocfs2_wait_on_busy_lock(lockres);
				3486	out:
				3487	return 0;
				3488	}
				3489
				3490	static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				3491	struct ocfs2_lock_res *lockres);
				3492
				3493	/* Mark the lockres as being dropped. It will no longer be
				3494	* queued if blocking, but we still may have to wait on it
				3495	* being dequeued from the downconvert thread before we can consider
				3496	* it safe to drop.
				3497	*
				3498	* You can not attempt to call cluster_lock on this lockres anymore. */
				3499	void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
				3500	struct ocfs2_lock_res *lockres)
				3501	{
				3502	int status;
				3503	struct ocfs2_mask_waiter mw;
				3504	unsigned long flags, flags2;
				3505
				3506	ocfs2_init_mask_waiter(&mw);
				3507
				3508	spin_lock_irqsave(&lockres->l_lock, flags);
				3509	lockres->l_flags \|= OCFS2_LOCK_FREEING;
				3510	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
				3511	/*
				3512	* We know the downconvert is queued but not in progress
				3513	* because we are the downconvert thread and processing
				3514	* different lock. So we can just remove the lock from the
				3515	* queue. This is not only an optimization but also a way
				3516	* to avoid the following deadlock:
				3517	* ocfs2_dentry_post_unlock()
				3518	* ocfs2_dentry_lock_put()
				3519	* ocfs2_drop_dentry_lock()
				3520	* iput()
				3521	* ocfs2_evict_inode()
				3522	* ocfs2_clear_inode()
				3523	* ocfs2_mark_lockres_freeing()
				3524	* ... blocks waiting for OCFS2_LOCK_QUEUED
				3525	* since we are the downconvert thread which
				3526	* should clear the flag.
				3527	*/
				3528	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3529	spin_lock_irqsave(&osb->dc_task_lock, flags2);
				3530	list_del_init(&lockres->l_blocked_list);
				3531	osb->blocked_lock_count--;
				3532	spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
				3533	/*
				3534	* Warn if we recurse into another post_unlock call. Strictly
				3535	* speaking it isn't a problem but we need to be careful if
				3536	* that happens (stack overflow, deadlocks, ...) so warn if
				3537	* ocfs2 grows a path for which this can happen.
				3538	*/
				3539	WARN_ON_ONCE(lockres->l_ops->post_unlock);
				3540	/* Since the lock is freeing we don't do much in the fn below */
				3541	ocfs2_process_blocked_lock(osb, lockres);
				3542	return;
				3543	}
				3544	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
				3545	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
				3546	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3547
				3548	mlog(0, "Waiting on lockres %s\n", lockres->l_name);
				3549
				3550	status = ocfs2_wait_for_mask(&mw);
				3551	if (status)
				3552	mlog_errno(status);
				3553
				3554	spin_lock_irqsave(&lockres->l_lock, flags);
				3555	}
				3556	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3557	}
				3558
				/*
				 * Mark @lockres as freeing and then drop its DLM lock, logging any
				 * non-zero status returned by the drop.
				 */
				void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
							       struct ocfs2_lock_res *lockres)
				{
					int err;

					ocfs2_mark_lockres_freeing(osb, lockres);

					err = ocfs2_drop_lock(osb, lockres);
					if (!err)
						return;

					mlog_errno(err);
				}
				3569
				3570	static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
				3571	{
				3572	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
				3573	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
				3574	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
				3575	ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
				3576	}
				3577
				3578	int ocfs2_drop_inode_locks(struct inode *inode)
				3579	{
				3580	int status, err;
				3581
				3582	/* No need to call ocfs2_mark_lockres_freeing here -
				3583	* ocfs2_clear_inode has done it for us. */
				3584
				3585	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
				3586	&OCFS2_I(inode)->ip_open_lockres);
				3587	if (err < 0)
				3588	mlog_errno(err);
				3589
				3590	status = err;
				3591
				3592	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
				3593	&OCFS2_I(inode)->ip_inode_lockres);
				3594	if (err < 0)
				3595	mlog_errno(err);
				3596	if (err < 0 && !status)
				3597	status = err;
				3598
				3599	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
				3600	&OCFS2_I(inode)->ip_rw_lockres);
				3601	if (err < 0)
				3602	mlog_errno(err);
				3603	if (err < 0 && !status)
				3604	status = err;
				3605
				3606	return status;
				3607	}
				3608
				3609	static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				3610	int new_level)
				3611	{
				3612	assert_spin_locked(&lockres->l_lock);
				3613
				3614	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
				3615
				3616	if (lockres->l_level <= new_level) {
				3617	mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
				3618	"type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
				3619	"block %d, pgen %d\n", lockres->l_name, lockres->l_level,
				3620	new_level, list_empty(&lockres->l_blocked_list),
				3621	list_empty(&lockres->l_mask_waiters), lockres->l_type,
				3622	lockres->l_flags, lockres->l_ro_holders,
				3623	lockres->l_ex_holders, lockres->l_action,
				3624	lockres->l_unlock_action, lockres->l_requested,
				3625	lockres->l_blocking, lockres->l_pending_gen);
				3626	BUG();
				3627	}
				3628
				3629	mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
				3630	lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
				3631
				3632	lockres->l_action = OCFS2_AST_DOWNCONVERT;
				3633	lockres->l_requested = new_level;
				3634	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
				3635	return lockres_set_pending(lockres);
				3636	}
				3637
				3638	static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				3639	struct ocfs2_lock_res *lockres,
				3640	int new_level,
				3641	int lvb,
				3642	unsigned int generation)
				3643	{
				3644	int ret;
				3645	u32 dlm_flags = DLM_LKF_CONVERT;
				3646
				3647	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
				3648	lockres->l_level, new_level);
				3649
				3650	/*
				3651	* On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
				3652	* expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
				3653	* we can recover correctly from node failure. Otherwise, we may get
				3654	* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
				3655	*/
				3656	if (ocfs2_userspace_stack(osb) &&
				3657	lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
				3658	lvb = 1;
				3659
				3660	if (lvb)
				3661	dlm_flags \|= DLM_LKF_VALBLK;
				3662
				3663	ret = ocfs2_dlm_lock(osb->cconn,
				3664	new_level,
				3665	&lockres->l_lksb,
				3666	dlm_flags,
				3667	lockres->l_name,
				3668	OCFS2_LOCK_ID_MAX_LEN - 1);
				3669	lockres_clear_pending(lockres, generation, osb);
				3670	if (ret) {
				3671	ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
				3672	ocfs2_recover_from_dlm_error(lockres, 1);
				3673	goto bail;
				3674	}
				3675
				3676	ret = 0;
				3677	bail:
				3678	return ret;
				3679	}
				3680
				3681	/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
				3682	static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
				3683	struct ocfs2_lock_res *lockres)
				3684	{
				3685	assert_spin_locked(&lockres->l_lock);
				3686
				3687	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
				3688	/* If we're already trying to cancel a lock conversion
				3689	* then just drop the spinlock and allow the caller to
				3690	* requeue this lock. */
				3691	mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
				3692	return 0;
				3693	}
				3694
				3695	/* were we in a convert when we got the bast fire? */
				3696	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
				3697	lockres->l_action != OCFS2_AST_DOWNCONVERT);
				3698	/* set things up for the unlockast to know to just
				3699	* clear out the ast_action and unset busy, etc. */
				3700	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
				3701
				3702	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
				3703	"lock %s, invalid flags: 0x%lx\n",
				3704	lockres->l_name, lockres->l_flags);
				3705
				3706	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
				3707
				3708	return 1;
				3709	}
				3710
				3711	static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				3712	struct ocfs2_lock_res *lockres)
				3713	{
				3714	int ret;
				3715
				3716	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
				3717	DLM_LKF_CANCEL);
				3718	if (ret) {
				3719	ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
				3720	ocfs2_recover_from_dlm_error(lockres, 0);
				3721	}
				3722
				3723	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
				3724
				3725	return ret;
				3726	}
				3727
				3728	static int ocfs2_unblock_lock(struct ocfs2_super *osb,
				3729	struct ocfs2_lock_res *lockres,
				3730	struct ocfs2_unblock_ctl *ctl)
				3731	{
				3732	unsigned long flags;
				3733	int blocking;
				3734	int new_level;
				3735	int level;
				3736	int ret = 0;
				3737	int set_lvb = 0;
				3738	unsigned int gen;
				3739
				3740	spin_lock_irqsave(&lockres->l_lock, flags);
				3741
				3742	recheck:
				3743	/*
				3744	* Is it still blocking? If not, we have no more work to do.
				3745	*/
				3746	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
				3747	BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
				3748	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3749	ret = 0;
				3750	goto leave;
				3751	}
				3752
				3753	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
				3754	/* XXX
				3755	* This is a big race. The OCFS2_LOCK_PENDING flag
				3756	* exists entirely for one reason - another thread has set
				3757	* OCFS2_LOCK_BUSY, but has NOT yet called dlm_lock().
				3758	*
				3759	* If we do ocfs2_cancel_convert() before the other thread
				3760	* calls dlm_lock(), our cancel will do nothing. We will
				3761	* get no ast, and we will have no way of knowing the
				3762	* cancel failed. Meanwhile, the other thread will call
				3763	* into dlm_lock() and wait...forever.
				3764	*
				3765	* Why forever? Because another node has asked for the
				3766	* lock first; that's why we're here in unblock_lock().
				3767	*
				3768	* The solution is OCFS2_LOCK_PENDING. When PENDING is
				3769	* set, we just requeue the unblock. Only when the other
				3770	* thread has called dlm_lock() and cleared PENDING will
				3771	* we then cancel their request.
				3772	*
				3773	* All callers of dlm_lock() must set OCFS2_DLM_PENDING
				3774	* at the same time they set OCFS2_DLM_BUSY. They must
				3775	* clear OCFS2_DLM_PENDING after dlm_lock() returns.
				3776	*/
				3777	if (lockres->l_flags & OCFS2_LOCK_PENDING) {
				3778	mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
				3779	lockres->l_name);
				3780	goto leave_requeue;
				3781	}
				3782
				3783	ctl->requeue = 1;
				3784	ret = ocfs2_prepare_cancel_convert(osb, lockres);
				3785	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3786	if (ret) {
				3787	ret = ocfs2_cancel_convert(osb, lockres);
				3788	if (ret < 0)
				3789	mlog_errno(ret);
				3790	}
				3791	goto leave;
				3792	}
				3793
				3794	/*
				3795	* This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
				3796	* set when the ast is received for an upconvert just before the
				3797	* OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
				3798	* on the heels of the ast, we want to delay the downconvert just
				3799	* enough to allow the up requestor to do its task. Because this
				3800	* lock is in the blocked queue, the lock will be downconverted
				3801	* as soon as the requestor is done with the lock.
				3802	*/
				3803	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
				3804	goto leave_requeue;
				3805
				3806	/*
				3807	* How can we block and yet be at NL? We were trying to upconvert
				3808	* from NL and got canceled. The code comes back here, and now
				3809	* we notice and clear BLOCKING.
				3810	*/
				3811	if (lockres->l_level == DLM_LOCK_NL) {
				3812	BUG_ON(lockres->l_ex_holders \|\| lockres->l_ro_holders);
				3813	mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
				3814	lockres->l_blocking = DLM_LOCK_NL;
				3815	lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
				3816	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3817	goto leave;
				3818	}
				3819
				3820	/* if we're blocking an exclusive and we have any holders,
				3821	* then requeue. */
				3822	if ((lockres->l_blocking == DLM_LOCK_EX)
				3823	&& (lockres->l_ex_holders \|\| lockres->l_ro_holders)) {
				3824	mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
				3825	lockres->l_name, lockres->l_ex_holders,
				3826	lockres->l_ro_holders);
				3827	goto leave_requeue;
				3828	}
				3829
				3830	/* If it's a PR we're blocking, then only
				3831	* requeue if we've got any EX holders */
				3832	if (lockres->l_blocking == DLM_LOCK_PR &&
				3833	lockres->l_ex_holders) {
				3834	mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
				3835	lockres->l_name, lockres->l_ex_holders);
				3836	goto leave_requeue;
				3837	}
				3838
				3839	/*
				3840	* Can we get a lock in this state if the holder counts are
				3841	* zero? The meta data unblock code used to check this.
				3842	*/
				3843	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
				3844	&& (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
				3845	mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
				3846	lockres->l_name);
				3847	goto leave_requeue;
				3848	}
				3849
				3850	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
				3851
				3852	if (lockres->l_ops->check_downconvert
				3853	&& !lockres->l_ops->check_downconvert(lockres, new_level)) {
				3854	mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
				3855	lockres->l_name);
				3856	goto leave_requeue;
				3857	}
				3858
				3859	/* If we get here, then we know that there are no more
				3860	* incompatible holders (and anyone asking for an incompatible
				3861	* lock is blocked). We can now downconvert the lock */
				3862	if (!lockres->l_ops->downconvert_worker)
				3863	goto downconvert;
				3864
				3865	/* Some lockres types want to do a bit of work before
				3866	* downconverting a lock. Allow that here. The worker function
				3867	* may sleep, so we save off a copy of what we're blocking as
				3868	* it may change while we're not holding the spin lock. */
				3869	blocking = lockres->l_blocking;
				3870	level = lockres->l_level;
				3871	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3872
				3873	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
				3874
				3875	if (ctl->unblock_action == UNBLOCK_STOP_POST) {
				3876	mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
				3877	lockres->l_name);
				3878	goto leave;
				3879	}
				3880
				3881	spin_lock_irqsave(&lockres->l_lock, flags);
				3882	if ((blocking != lockres->l_blocking) \|\| (level != lockres->l_level)) {
				3883	/* If this changed underneath us, then we can't drop
				3884	* it just yet. */
				3885	mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
				3886	"Recheck\n", lockres->l_name, blocking,
				3887	lockres->l_blocking, level, lockres->l_level);
				3888	goto recheck;
				3889	}
				3890
				3891	downconvert:
				3892	ctl->requeue = 0;
				3893
				3894	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
				3895	if (lockres->l_level == DLM_LOCK_EX)
				3896	set_lvb = 1;
				3897
				3898	/*
				3899	* We only set the lvb if the lock has been fully
				3900	* refreshed - otherwise we risk setting stale
				3901	* data. Otherwise, there's no need to actually clear
				3902	* out the lvb here as it's value is still valid.
				3903	*/
				3904	if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
				3905	lockres->l_ops->set_lvb(lockres);
				3906	}
				3907
				3908	gen = ocfs2_prepare_downconvert(lockres, new_level);
				3909	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3910	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
				3911	gen);
				3912
				3913	leave:
				3914	if (ret)
				3915	mlog_errno(ret);
				3916	return ret;
				3917
				3918	leave_requeue:
				3919	spin_unlock_irqrestore(&lockres->l_lock, flags);
				3920	ctl->requeue = 1;
				3921
				3922	return 0;
				3923	}
				3924
				3925	static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				3926	int blocking)
				3927	{
				3928	struct inode *inode;
				3929	struct address_space *mapping;
				3930	struct ocfs2_inode_info *oi;
				3931
				3932	inode = ocfs2_lock_res_inode(lockres);
				3933	mapping = inode->i_mapping;
				3934
				3935	if (S_ISDIR(inode->i_mode)) {
				3936	oi = OCFS2_I(inode);
				3937	oi->ip_dir_lock_gen++;
				3938	mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
				3939	goto out_forget;
				3940	}
				3941
				3942	if (!S_ISREG(inode->i_mode))
				3943	goto out;
				3944
				3945	/*
				3946	* We need this before the filemap_fdatawrite() so that it can
				3947	* transfer the dirty bit from the PTE to the
				3948	* page. Unfortunately this means that even for EX->PR
				3949	* downconverts, we'll lose our mappings and have to build
				3950	* them up again.
				3951	*/
				3952	unmap_mapping_range(mapping, 0, 0, 0);
				3953
				3954	if (filemap_fdatawrite(mapping)) {
				3955	mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
				3956	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				3957	}
				3958	sync_mapping_buffers(mapping);
				3959	if (blocking == DLM_LOCK_EX) {
				3960	truncate_inode_pages(mapping, 0);
				3961	} else {
				3962	/* We only need to wait on the I/O if we're not also
				3963	* truncating pages because truncate_inode_pages waits
				3964	* for us above. We don't truncate pages if we're
				3965	* blocking anything < EXMODE because we want to keep
				3966	* them around in that case. */
				3967	filemap_fdatawait(mapping);
				3968	}
				3969
				3970	out_forget:
				3971	forget_all_cached_acls(inode);
				3972
				3973	out:
				3974	return UNBLOCK_CONTINUE;
				3975	}
				3976
				3977	static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
				3978	struct ocfs2_lock_res *lockres,
				3979	int new_level)
				3980	{
				3981	int checkpointed = ocfs2_ci_fully_checkpointed(ci);
				3982
				3983	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
				3984	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
				3985
				3986	if (checkpointed)
				3987	return 1;
				3988
				3989	ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
				3990	return 0;
				3991	}
				3992
				/*
				 * check_downconvert hook for inode metadata locks: runs the checkpoint
				 * test on the metadata cache of the inode behind @lockres.
				 */
				static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
									int new_level)
				{
					struct ocfs2_caching_info *ci;

					ci = INODE_CACHE(ocfs2_lock_res_inode(lockres));

					return ocfs2_ci_checkpointed(ci, lockres, new_level);
				}
				4000
				/*
				 * set_lvb hook for inode metadata locks: hands the inode behind @lockres
				 * to __ocfs2_stuff_meta_lvb().
				 */
				static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
				{
					__ocfs2_stuff_meta_lvb(ocfs2_lock_res_inode(lockres));
				}
				4007
				/*
				 * Perform the final reference drop on our dentry lock. For now this runs
				 * in the downconvert thread; the dlmglue API could be simplified later by
				 * pushing these drops off to the ocfs2_wq.
				 */
				static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
								     struct ocfs2_lock_res *lockres)
				{
					ocfs2_dentry_lock_put(osb, ocfs2_lock_res_dl(lockres));
				}
				4019
				4020	/*
				4021	* d_delete() matching dentries before the lock downconvert.
				4022	*
				4023	* At this point, any process waiting to destroy the
				4024	* dentry_lock due to last ref count is stopped by the
				4025	* OCFS2_LOCK_QUEUED flag.
				4026	*
				4027	* We have two potential problems
				4028	*
				4029	* 1) If we do the last reference drop on our dentry_lock (via dput)
				4030	* we'll wind up in ocfs2_release_dentry_lock(), waiting on
				4031	* the downconvert to finish. Instead we take an elevated
				4032	* reference and push the drop until after we've completed our
				4033	* unblock processing.
				4034	*
				4035	* 2) There might be another process with a final reference,
				4036	* waiting on us to finish processing. If this is the case, we
				4037	* detect it and exit out - there's no more dentries anyway.
				4038	*/
				4039	static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
				4040	int blocking)
				4041	{
				4042	struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
				4043	struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
				4044	struct dentry *dentry;
				4045	unsigned long flags;
				4046	int extra_ref = 0;
				4047
				4048	/*
				4049	* This node is blocking another node from getting a read
				4050	* lock. This happens when we've renamed within a
				4051	* directory. We've forced the other nodes to d_delete(), but
				4052	* we never actually dropped our lock because it's still
				4053	* valid. The downconvert code will retain a PR for this node,
				4054	* so there's no further work to do.
				4055	*/
				4056	if (blocking == DLM_LOCK_PR)
				4057	return UNBLOCK_CONTINUE;
				4058
				4059	/*
				4060	* Mark this inode as potentially orphaned. The code in
				4061	* ocfs2_delete_inode() will figure out whether it actually
				4062	* needs to be freed or not.
				4063	*/
				4064	spin_lock(&oi->ip_lock);
				4065	oi->ip_flags \|= OCFS2_INODE_MAYBE_ORPHANED;
				4066	spin_unlock(&oi->ip_lock);
				4067
				4068	/*
				4069	* Yuck. We need to make sure however that the check of
				4070	* OCFS2_LOCK_FREEING and the extra reference are atomic with
				4071	* respect to a reference decrement or the setting of that
				4072	* flag.
				4073	*/
				4074	spin_lock_irqsave(&lockres->l_lock, flags);
				4075	spin_lock(&dentry_attach_lock);
				4076	if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
				4077	&& dl->dl_count) {
				4078	dl->dl_count++;
				4079	extra_ref = 1;
				4080	}
				4081	spin_unlock(&dentry_attach_lock);
				4082	spin_unlock_irqrestore(&lockres->l_lock, flags);
				4083
				4084	mlog(0, "extra_ref = %d\n", extra_ref);
				4085
				4086	/*
				4087	* We have a process waiting on us in ocfs2_dentry_iput(),
				4088	* which means we can't have any more outstanding
				4089	* aliases. There's no need to do any more work.
				4090	*/
				4091	if (!extra_ref)
				4092	return UNBLOCK_CONTINUE;
				4093
				4094	spin_lock(&dentry_attach_lock);
				4095	while (1) {
				4096	dentry = ocfs2_find_local_alias(dl->dl_inode,
				4097	dl->dl_parent_blkno, 1);
				4098	if (!dentry)
				4099	break;
				4100	spin_unlock(&dentry_attach_lock);
				4101
				4102	if (S_ISDIR(dl->dl_inode->i_mode))
				4103	shrink_dcache_parent(dentry);
				4104
				4105	mlog(0, "d_delete(%pd);\n", dentry);
				4106
				4107	/*
				4108	* The following dcache calls may do an
				4109	* iput(). Normally we don't want that from the
				4110	* downconverting thread, but in this case it's ok
				4111	* because the requesting node already has an
				4112	* exclusive lock on the inode, so it can't be queued
				4113	* for a downconvert.
				4114	*/
				4115	d_delete(dentry);
				4116	dput(dentry);
				4117
				4118	spin_lock(&dentry_attach_lock);
				4119	}
				4120	spin_unlock(&dentry_attach_lock);
				4121
				4122	/*
				4123	* If we are the last holder of this dentry lock, there is no
				4124	* reason to downconvert so skip straight to the unlock.
				4125	*/
				4126	if (dl->dl_count == 1)
				4127	return UNBLOCK_STOP_POST;
				4128
				4129	return UNBLOCK_CONTINUE_POST;
				4130	}
				4131
				4132	static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
				4133	int new_level)
				4134	{
				4135	struct ocfs2_refcount_tree *tree =
				4136	ocfs2_lock_res_refcount_tree(lockres);
				4137
				4138	return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
				4139	}
				4140
				4141	static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
				4142	int blocking)
				4143	{
				4144	struct ocfs2_refcount_tree *tree =
				4145	ocfs2_lock_res_refcount_tree(lockres);
				4146
				4147	ocfs2_metadata_cache_purge(&tree->rf_ci);
				4148
				4149	return UNBLOCK_CONTINUE;
				4150	}
				4151
				4152	static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
				4153	{
				4154	struct ocfs2_qinfo_lvb *lvb;
				4155	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
				4156	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
				4157	oinfo->dqi_gi.dqi_type);
				4158
				4159	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				4160	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
				4161	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
				4162	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
				4163	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
				4164	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
				4165	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
				4166	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
				4167	}
				4168
				4169	void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
				4170	{
				4171	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
				4172	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
				4173	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				4174
				4175	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
				4176	ocfs2_cluster_unlock(osb, lockres, level);
				4177	}
				4178
				4179	static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
				4180	{
				4181	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
				4182	oinfo->dqi_gi.dqi_type);
				4183	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
				4184	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
				4185	struct buffer_head *bh = NULL;
				4186	struct ocfs2_global_disk_dqinfo *gdinfo;
				4187	int status = 0;
				4188
				4189	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
				4190	lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
				4191	info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
				4192	info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
				4193	oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
				4194	oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
				4195	oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
				4196	oinfo->dqi_gi.dqi_free_entry =
				4197	be32_to_cpu(lvb->lvb_free_entry);
				4198	} else {
				4199	status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
				4200	oinfo->dqi_giblk, &bh);
				4201	if (status) {
				4202	mlog_errno(status);
				4203	goto bail;
				4204	}
				4205	gdinfo = (struct ocfs2_global_disk_dqinfo *)
				4206	(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
				4207	info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
				4208	info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
				4209	oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
				4210	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
				4211	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
				4212	oinfo->dqi_gi.dqi_free_entry =
				4213	le32_to_cpu(gdinfo->dqi_free_entry);
				4214	brelse(bh);
				4215	ocfs2_track_lock_refresh(lockres);
				4216	}
				4217
				4218	bail:
				4219	return status;
				4220	}
				4221
				4222	/* Lock quota info, this function expects at least shared lock on the quota file
				4223	* so that we can safely refresh quota info from disk. */
				4224	int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
				4225	{
				4226	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
				4227	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
				4228	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				4229	int status = 0;
				4230
				4231	/* On RO devices, locking really isn't needed... */
				4232	if (ocfs2_is_hard_readonly(osb)) {
				4233	if (ex)
				4234	status = -EROFS;
				4235	goto bail;
				4236	}
				4237	if (ocfs2_mount_local(osb))
				4238	goto bail;
				4239
				4240	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
				4241	if (status < 0) {
				4242	mlog_errno(status);
				4243	goto bail;
				4244	}
				4245	if (!ocfs2_should_refresh_lock_res(lockres))
				4246	goto bail;
				4247	/* OK, we have the lock but we need to refresh the quota info */
				4248	status = ocfs2_refresh_qinfo(oinfo);
				4249	if (status)
				4250	ocfs2_qinfo_unlock(oinfo, ex);
				4251	ocfs2_complete_lock_res_refresh(lockres, status);
				4252	bail:
				4253	return status;
				4254	}
				4255
				4256	int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
				4257	{
				4258	int status;
				4259	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				4260	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
				4261	struct ocfs2_super *osb = lockres->l_priv;
				4262
				4263
				4264	if (ocfs2_is_hard_readonly(osb))
				4265	return -EROFS;
				4266
				4267	if (ocfs2_mount_local(osb))
				4268	return 0;
				4269
				4270	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
				4271	if (status < 0)
				4272	mlog_errno(status);
				4273
				4274	return status;
				4275	}
				4276
				4277	void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
				4278	{
				4279	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
				4280	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
				4281	struct ocfs2_super *osb = lockres->l_priv;
				4282
				4283	if (!ocfs2_mount_local(osb))
				4284	ocfs2_cluster_unlock(osb, lockres, level);
				4285	}
				4286
				4287	static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				4288	struct ocfs2_lock_res *lockres)
				4289	{
				4290	int status;
				4291	struct ocfs2_unblock_ctl ctl = {0, 0,};
				4292	unsigned long flags;
				4293
				4294	/* Our reference to the lockres in this function can be
				4295	* considered valid until we remove the OCFS2_LOCK_QUEUED
				4296	* flag. */
				4297
				4298	BUG_ON(!lockres);
				4299	BUG_ON(!lockres->l_ops);
				4300
				4301	mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
				4302
				4303	/* Detect whether a lock has been marked as going away while
				4304	* the downconvert thread was processing other things. A lock can
				4305	* still be marked with OCFS2_LOCK_FREEING after this check,
				4306	* but short circuiting here will still save us some
				4307	* performance. */
				4308	spin_lock_irqsave(&lockres->l_lock, flags);
				4309	if (lockres->l_flags & OCFS2_LOCK_FREEING)
				4310	goto unqueue;
				4311	spin_unlock_irqrestore(&lockres->l_lock, flags);
				4312
				4313	status = ocfs2_unblock_lock(osb, lockres, &ctl);
				4314	if (status < 0)
				4315	mlog_errno(status);
				4316
				4317	spin_lock_irqsave(&lockres->l_lock, flags);
				4318	unqueue:
				4319	if (lockres->l_flags & OCFS2_LOCK_FREEING \|\| !ctl.requeue) {
				4320	lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
				4321	} else
				4322	ocfs2_schedule_blocked_lock(osb, lockres);
				4323
				4324	mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
				4325	ctl.requeue ? "yes" : "no");
				4326	spin_unlock_irqrestore(&lockres->l_lock, flags);
				4327
				4328	if (ctl.unblock_action != UNBLOCK_CONTINUE
				4329	&& lockres->l_ops->post_unlock)
				4330	lockres->l_ops->post_unlock(osb, lockres);
				4331	}
				4332
				4333	static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
				4334	struct ocfs2_lock_res *lockres)
				4335	{
				4336	unsigned long flags;
				4337
				4338	assert_spin_locked(&lockres->l_lock);
				4339
				4340	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
				4341	/* Do not schedule a lock for downconvert when it's on
				4342	* the way to destruction - any nodes wanting access
				4343	* to the resource will get it soon. */
				4344	mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
				4345	lockres->l_name, lockres->l_flags);
				4346	return;
				4347	}
				4348
				4349	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
				4350
				4351	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4352	if (list_empty(&lockres->l_blocked_list)) {
				4353	list_add_tail(&lockres->l_blocked_list,
				4354	&osb->blocked_lock_list);
				4355	osb->blocked_lock_count++;
				4356	}
				4357	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4358	}
				4359
				4360	static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
				4361	{
				4362	unsigned long processed;
				4363	unsigned long flags;
				4364	struct ocfs2_lock_res *lockres;
				4365
				4366	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4367	/* grab this early so we know to try again if a state change and
				4368	* wake happens part-way through our work */
				4369	osb->dc_work_sequence = osb->dc_wake_sequence;
				4370
				4371	processed = osb->blocked_lock_count;
				4372	/*
				4373	* blocked lock processing in this loop might call iput which can
				4374	* remove items off osb->blocked_lock_list. Downconvert up to
				4375	* 'processed' number of locks, but stop short if we had some
				4376	* removed in ocfs2_mark_lockres_freeing when downconverting.
				4377	*/
				4378	while (processed && !list_empty(&osb->blocked_lock_list)) {
				4379	lockres = list_entry(osb->blocked_lock_list.next,
				4380	struct ocfs2_lock_res, l_blocked_list);
				4381	list_del_init(&lockres->l_blocked_list);
				4382	osb->blocked_lock_count--;
				4383	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4384
				4385	BUG_ON(!processed);
				4386	processed--;
				4387
				4388	ocfs2_process_blocked_lock(osb, lockres);
				4389
				4390	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4391	}
				4392	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4393	}
				4394
				4395	static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
				4396	{
				4397	int empty = 0;
				4398	unsigned long flags;
				4399
				4400	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4401	if (list_empty(&osb->blocked_lock_list))
				4402	empty = 1;
				4403
				4404	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4405	return empty;
				4406	}
				4407
				4408	static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
				4409	{
				4410	int should_wake = 0;
				4411	unsigned long flags;
				4412
				4413	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4414	if (osb->dc_work_sequence != osb->dc_wake_sequence)
				4415	should_wake = 1;
				4416	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4417
				4418	return should_wake;
				4419	}
				4420
				4421	static int ocfs2_downconvert_thread(void *arg)
				4422	{
				4423	struct ocfs2_super *osb = arg;
				4424
				4425	/* only quit once we've been asked to stop and there is no more
				4426	* work available */
				4427	while (!(kthread_should_stop() &&
				4428	ocfs2_downconvert_thread_lists_empty(osb))) {
				4429
				4430	wait_event_interruptible(osb->dc_event,
				4431	ocfs2_downconvert_thread_should_wake(osb) \|\|
				4432	kthread_should_stop());
				4433
				4434	mlog(0, "downconvert_thread: awoken\n");
				4435
				4436	ocfs2_downconvert_thread_do_work(osb);
				4437	}
				4438
				4439	osb->dc_task = NULL;
				4440	return 0;
				4441	}
				4442
				4443	void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
				4444	{
				4445	unsigned long flags;
				4446
				4447	spin_lock_irqsave(&osb->dc_task_lock, flags);
				4448	/* make sure the voting thread gets a swipe at whatever changes
				4449	* the caller may have made to the voting state */
				4450	osb->dc_wake_sequence++;
				4451	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
				4452	wake_up(&osb->dc_event);
				4453	}