/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmmod.c
 *
 * standalone DLM module
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */


#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>


#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"

#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#include "dlmdebug.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"

static void dlm_mle_node_down(struct dlm_ctxt *dlm,
                              struct dlm_master_list_entry *mle,
                              struct o2nm_node *node,
                              int idx);
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
                            struct dlm_master_list_entry *mle,
                            struct o2nm_node *node,
                            int idx);

static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
static int dlm_do_assert_master(struct dlm_ctxt *dlm,
                                struct dlm_lock_resource *res,
                                void *nodemap, u32 flags);
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);

static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
                                struct dlm_master_list_entry *mle,
                                const char *name,
                                unsigned int namelen)
{
        if (dlm != mle->dlm)
                return 0;

        if (namelen != mle->mnamelen ||
            memcmp(name, mle->mname, namelen) != 0)
                return 0;

        return 1;
}

static struct kmem_cache *dlm_lockres_cache = NULL;
static struct kmem_cache *dlm_lockname_cache = NULL;
static struct kmem_cache *dlm_mle_cache = NULL;

static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
                         enum dlm_mle_type type,
                         struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res,
                         const char *name,
                         unsigned int namelen);
static void dlm_put_mle(struct dlm_master_list_entry *mle);
static void __dlm_put_mle(struct dlm_master_list_entry *mle);
static int dlm_find_mle(struct dlm_ctxt *dlm,
                        struct dlm_master_list_entry **mle,
                        char *name, unsigned int namelen);

static int dlm_do_master_request(struct dlm_lock_resource *res,
                                 struct dlm_master_list_entry *mle, int to);


static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
                                     struct dlm_lock_resource *res,
                                     struct dlm_master_list_entry *mle,
                                     int *blocked);
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                                    struct dlm_lock_resource *res,
                                    struct dlm_master_list_entry *mle,
                                    int blocked);
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                                 struct dlm_lock_resource *res,
                                 struct dlm_master_list_entry *mle,
                                 struct dlm_master_list_entry **oldmle,
                                 const char *name, unsigned int namelen,
                                 u8 new_master, u8 master);

static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                                    struct dlm_lock_resource *res);
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res);
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
                                      struct dlm_lock_resource *res,
                                      u8 target);
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
                                       struct dlm_lock_resource *res);


int dlm_is_host_down(int errno)
{
        switch (errno) {
        case -EBADF:
        case -ECONNREFUSED:
        case -ENOTCONN:
        case -ECONNRESET:
        case -EPIPE:
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ETIMEDOUT:
        case -ECONNABORTED:
        case -ENETDOWN:
        case -ENETUNREACH:
        case -ENETRESET:
        case -ESHUTDOWN:
        case -ENOPROTOOPT:
        case -EINVAL:   /* if returned from our tcp code,
                           this means there is no socket */
                return 1;
        }
        return 0;
}
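
/*
 * Illustration only, kept out of the build: a sketch of how callers in
 * this file consult dlm_is_host_down() to classify a failed network send,
 * mirroring the checks in dlm_do_master_request() and
 * dlm_do_assert_master() below.  The helper name is hypothetical.
 */
#if 0
static int example_classify_send_error(u8 to, int ret)
{
        if (ret >= 0)
                return 0;
        if (!dlm_is_host_down(ret)) {
                /* not a network error: a real bug on this node */
                mlog_errno(ret);
                BUG();
        }
        /* network error: assume node 'to' is dead and carry on */
        mlog(0, "link to %u went down!\n", to);
        return ret;
}
#endif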


/*
 * MASTER LIST FUNCTIONS
 */


/*
 * regarding master list entries and heartbeat callbacks:
 *
 * in order to avoid sleeping and allocation that occurs in
 * heartbeat, master list entries are simply attached to the
 * dlm's established heartbeat callbacks.  the mle is attached
 * when it is created, and since the dlm->spinlock is held at
 * that time, any heartbeat event will be properly discovered
 * by the mle.  the mle needs to be detached from the
 * dlm->mle_hb_events list as soon as heartbeat events are no
 * longer useful to the mle, and before the mle is freed.
 *
 * as a general rule, heartbeat events are no longer needed by
 * the mle once an "answer" regarding the lock master has been
 * received.
 */
static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
                                              struct dlm_master_list_entry *mle)
{
        assert_spin_locked(&dlm->spinlock);

        list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
}


static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
                                              struct dlm_master_list_entry *mle)
{
        if (!list_empty(&mle->hb_events))
                list_del_init(&mle->hb_events);
}


static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
                                            struct dlm_master_list_entry *mle)
{
        spin_lock(&dlm->spinlock);
        __dlm_mle_detach_hb_events(dlm, mle);
        spin_unlock(&dlm->spinlock);
}
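
/*
 * Illustration only, kept out of the build: the attach/detach lifecycle
 * described in the comment above.  Attachment happens inside
 * dlm_init_mle() while dlm->spinlock is held; once an answer about the
 * master arrives, the mle is detached and its reference dropped, as
 * dlm_get_lock_resource() does below.  'mle' and 'res' are assumed to
 * come from the caller.
 */
#if 0
        spin_lock(&dlm->spinlock);
        dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
        /* __dlm_mle_attach_hb_events() already ran inside dlm_init_mle() */
        spin_unlock(&dlm->spinlock);

        /* ... wait for mastery; heartbeat keeps mle->node_map current ... */

        /* master is known: heartbeat events are no longer needed */
        dlm_mle_detach_hb_events(dlm, mle);
        dlm_put_mle(mle);
#endif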

static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
{
        struct dlm_ctxt *dlm;
        dlm = mle->dlm;

        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);
        mle->inuse++;
        kref_get(&mle->mle_refs);
}

static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
{
        struct dlm_ctxt *dlm;
        dlm = mle->dlm;

        spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
        mle->inuse--;
        __dlm_put_mle(mle);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);

}

/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
        struct dlm_ctxt *dlm;
        dlm = mle->dlm;

        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);
        if (!atomic_read(&mle->mle_refs.refcount)) {
                /* this may or may not crash, but who cares.
                 * it's a BUG. */
                mlog(ML_ERROR, "bad mle: %p\n", mle);
                dlm_print_one_mle(mle);
                BUG();
        } else
                kref_put(&mle->mle_refs, dlm_mle_release);
}


/* must not have any spinlocks coming in */
static void dlm_put_mle(struct dlm_master_list_entry *mle)
{
        struct dlm_ctxt *dlm;
        dlm = mle->dlm;

        spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
        __dlm_put_mle(mle);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);
}

static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
{
        kref_get(&mle->mle_refs);
}

static void dlm_init_mle(struct dlm_master_list_entry *mle,
                         enum dlm_mle_type type,
                         struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res,
                         const char *name,
                         unsigned int namelen)
{
        assert_spin_locked(&dlm->spinlock);

        mle->dlm = dlm;
        mle->type = type;
        INIT_HLIST_NODE(&mle->master_hash_node);
        INIT_LIST_HEAD(&mle->hb_events);
        memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
        spin_lock_init(&mle->spinlock);
        init_waitqueue_head(&mle->wq);
        atomic_set(&mle->woken, 0);
        kref_init(&mle->mle_refs);
        memset(mle->response_map, 0, sizeof(mle->response_map));
        mle->master = O2NM_MAX_NODES;
        mle->new_master = O2NM_MAX_NODES;
        mle->inuse = 0;

        BUG_ON(mle->type != DLM_MLE_BLOCK &&
               mle->type != DLM_MLE_MASTER &&
               mle->type != DLM_MLE_MIGRATION);

        if (mle->type == DLM_MLE_MASTER) {
                BUG_ON(!res);
                mle->mleres = res;
                memcpy(mle->mname, res->lockname.name, res->lockname.len);
                mle->mnamelen = res->lockname.len;
                mle->mnamehash = res->lockname.hash;
        } else {
                BUG_ON(!name);
                mle->mleres = NULL;
                memcpy(mle->mname, name, namelen);
                mle->mnamelen = namelen;
                mle->mnamehash = dlm_lockid_hash(name, namelen);
        }

        atomic_inc(&dlm->mle_tot_count[mle->type]);
        atomic_inc(&dlm->mle_cur_count[mle->type]);

        /* copy off the node_map and register hb callbacks on our copy */
        memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
        memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
        clear_bit(dlm->node_num, mle->vote_map);
        clear_bit(dlm->node_num, mle->node_map);

        /* attach the mle to the domain node up/down events */
        __dlm_mle_attach_hb_events(dlm, mle);
}

void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);

        if (!hlist_unhashed(&mle->master_hash_node))
                hlist_del_init(&mle->master_hash_node);
}

void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
        struct hlist_head *bucket;

        assert_spin_locked(&dlm->master_lock);

        bucket = dlm_master_hash(dlm, mle->mnamehash);
        hlist_add_head(&mle->master_hash_node, bucket);
}

/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
                        struct dlm_master_list_entry **mle,
                        char *name, unsigned int namelen)
{
        struct dlm_master_list_entry *tmpmle;
        struct hlist_head *bucket;
        struct hlist_node *list;
        unsigned int hash;

        assert_spin_locked(&dlm->master_lock);

        hash = dlm_lockid_hash(name, namelen);
        bucket = dlm_master_hash(dlm, hash);
        hlist_for_each(list, bucket) {
                tmpmle = hlist_entry(list, struct dlm_master_list_entry,
                                     master_hash_node);
                if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
                        continue;
                dlm_get_mle(tmpmle);
                *mle = tmpmle;
                return 1;
        }
        return 0;
}

void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
        struct dlm_master_list_entry *mle;

        assert_spin_locked(&dlm->spinlock);

        list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
                if (node_up)
                        dlm_mle_node_up(dlm, mle, NULL, idx);
                else
                        dlm_mle_node_down(dlm, mle, NULL, idx);
        }
}

static void dlm_mle_node_down(struct dlm_ctxt *dlm,
                              struct dlm_master_list_entry *mle,
                              struct o2nm_node *node, int idx)
{
        spin_lock(&mle->spinlock);

        if (!test_bit(idx, mle->node_map))
                mlog(0, "node %u already removed from nodemap!\n", idx);
        else
                clear_bit(idx, mle->node_map);

        spin_unlock(&mle->spinlock);
}

static void dlm_mle_node_up(struct dlm_ctxt *dlm,
                            struct dlm_master_list_entry *mle,
                            struct o2nm_node *node, int idx)
{
        spin_lock(&mle->spinlock);

        if (test_bit(idx, mle->node_map))
                mlog(0, "node %u already in node map!\n", idx);
        else
                set_bit(idx, mle->node_map);

        spin_unlock(&mle->spinlock);
}


int dlm_init_mle_cache(void)
{
        dlm_mle_cache = kmem_cache_create("o2dlm_mle",
                                          sizeof(struct dlm_master_list_entry),
                                          0, SLAB_HWCACHE_ALIGN,
                                          NULL);
        if (dlm_mle_cache == NULL)
                return -ENOMEM;
        return 0;
}

void dlm_destroy_mle_cache(void)
{
        if (dlm_mle_cache)
                kmem_cache_destroy(dlm_mle_cache);
}

static void dlm_mle_release(struct kref *kref)
{
        struct dlm_master_list_entry *mle;
        struct dlm_ctxt *dlm;

        mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
        dlm = mle->dlm;

        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);

        mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
             mle->type);

        /* remove from list if not already */
        __dlm_unlink_mle(dlm, mle);

        /* detach the mle from the domain node up/down events */
        __dlm_mle_detach_hb_events(dlm, mle);

        atomic_dec(&dlm->mle_cur_count[mle->type]);

        /* NOTE: kfree under spinlock here.
         * if this is bad, we can move this to a freelist. */
        kmem_cache_free(dlm_mle_cache, mle);
}


/*
 * LOCK RESOURCE FUNCTIONS
 */

int dlm_init_master_caches(void)
{
        dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
                                              sizeof(struct dlm_lock_resource),
                                              0, SLAB_HWCACHE_ALIGN, NULL);
        if (!dlm_lockres_cache)
                goto bail;

        dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
                                               DLM_LOCKID_NAME_MAX, 0,
                                               SLAB_HWCACHE_ALIGN, NULL);
        if (!dlm_lockname_cache)
                goto bail;

        return 0;
bail:
        dlm_destroy_master_caches();
        return -ENOMEM;
}

void dlm_destroy_master_caches(void)
{
        if (dlm_lockname_cache)
                kmem_cache_destroy(dlm_lockname_cache);

        if (dlm_lockres_cache)
                kmem_cache_destroy(dlm_lockres_cache);
}

static void dlm_lockres_release(struct kref *kref)
{
        struct dlm_lock_resource *res;
        struct dlm_ctxt *dlm;

        res = container_of(kref, struct dlm_lock_resource, refs);
        dlm = res->dlm;

        /* This should not happen -- all lockres' have a name
         * associated with them at init time. */
        BUG_ON(!res->lockname.name);

        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);

        spin_lock(&dlm->track_lock);
        if (!list_empty(&res->tracking))
                list_del_init(&res->tracking);
        else {
                mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
                     res->lockname.len, res->lockname.name);
                dlm_print_one_lock_resource(res);
        }
        spin_unlock(&dlm->track_lock);

        atomic_dec(&dlm->res_cur_count);

        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
            !list_empty(&res->converting) ||
            !list_empty(&res->blocked) ||
            !list_empty(&res->dirty) ||
            !list_empty(&res->recovering) ||
            !list_empty(&res->purge)) {
                mlog(ML_ERROR,
                     "Going to BUG for resource %.*s."
                     " We're on a list! [%c%c%c%c%c%c%c]\n",
                     res->lockname.len, res->lockname.name,
                     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
                     !list_empty(&res->granted) ? 'G' : ' ',
                     !list_empty(&res->converting) ? 'C' : ' ',
                     !list_empty(&res->blocked) ? 'B' : ' ',
                     !list_empty(&res->dirty) ? 'D' : ' ',
                     !list_empty(&res->recovering) ? 'R' : ' ',
                     !list_empty(&res->purge) ? 'P' : ' ');

                dlm_print_one_lock_resource(res);
        }

        /* By the time we're ready to blow this guy away, we shouldn't
         * be on any lists. */
        BUG_ON(!hlist_unhashed(&res->hash_node));
        BUG_ON(!list_empty(&res->granted));
        BUG_ON(!list_empty(&res->converting));
        BUG_ON(!list_empty(&res->blocked));
        BUG_ON(!list_empty(&res->dirty));
        BUG_ON(!list_empty(&res->recovering));
        BUG_ON(!list_empty(&res->purge));

        kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);

        kmem_cache_free(dlm_lockres_cache, res);
}

void dlm_lockres_put(struct dlm_lock_resource *res)
{
        kref_put(&res->refs, dlm_lockres_release);
}

static void dlm_init_lockres(struct dlm_ctxt *dlm,
                             struct dlm_lock_resource *res,
                             const char *name, unsigned int namelen)
{
        char *qname;

        /* If we memset here, we lose our reference to the kmalloc'd
         * res->lockname.name, so be sure to init every field
         * correctly! */

        qname = (char *) res->lockname.name;
        memcpy(qname, name, namelen);

        res->lockname.len = namelen;
        res->lockname.hash = dlm_lockid_hash(name, namelen);

        init_waitqueue_head(&res->wq);
        spin_lock_init(&res->spinlock);
        INIT_HLIST_NODE(&res->hash_node);
        INIT_LIST_HEAD(&res->granted);
        INIT_LIST_HEAD(&res->converting);
        INIT_LIST_HEAD(&res->blocked);
        INIT_LIST_HEAD(&res->dirty);
        INIT_LIST_HEAD(&res->recovering);
        INIT_LIST_HEAD(&res->purge);
        INIT_LIST_HEAD(&res->tracking);
        atomic_set(&res->asts_reserved, 0);
        res->migration_pending = 0;
        res->inflight_locks = 0;

        res->dlm = dlm;

        kref_init(&res->refs);

        atomic_inc(&dlm->res_tot_count);
        atomic_inc(&dlm->res_cur_count);

        /* just for consistency */
        spin_lock(&res->spinlock);
        dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
        spin_unlock(&res->spinlock);

        res->state = DLM_LOCK_RES_IN_PROGRESS;

        res->last_used = 0;

        spin_lock(&dlm->spinlock);
        list_add_tail(&res->tracking, &dlm->tracking_list);
        spin_unlock(&dlm->spinlock);

        memset(res->lvb, 0, DLM_LVB_LEN);
        memset(res->refmap, 0, sizeof(res->refmap));
}

struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
                                          const char *name,
                                          unsigned int namelen)
{
        struct dlm_lock_resource *res = NULL;

        res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
        if (!res)
                goto error;

        res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
        if (!res->lockname.name)
                goto error;

        dlm_init_lockres(dlm, res, name, namelen);
        return res;

error:
        if (res && res->lockname.name)
                kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);

        if (res)
                kmem_cache_free(dlm_lockres_cache, res);
        return NULL;
}
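
/*
 * Illustration only, kept out of the build: allocating a fresh lockres
 * and immediately claiming local ownership, roughly the LKM_LOCAL fast
 * path of dlm_get_lock_resource() below.  A real caller must first check
 * the hash for an existing lockres; that lookup is omitted here, and the
 * helper name is hypothetical.
 */
#if 0
static struct dlm_lock_resource *example_new_local_lockres(struct dlm_ctxt *dlm,
                                                           const char *lockid,
                                                           unsigned int namelen)
{
        struct dlm_lock_resource *res;

        res = dlm_new_lockres(dlm, lockid, namelen);
        if (!res)
                return NULL;

        spin_lock(&dlm->spinlock);
        spin_lock(&res->spinlock);
        dlm_change_lockres_owner(dlm, res, dlm->node_num);
        __dlm_insert_lockres(dlm, res);
        dlm_lockres_grab_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);
        spin_unlock(&dlm->spinlock);

        return res;
}
#endif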

void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
                                struct dlm_lock_resource *res, int bit)
{
        assert_spin_locked(&res->spinlock);

        mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
             res->lockname.name, bit, __builtin_return_address(0));

        set_bit(bit, res->refmap);
}

void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
                                  struct dlm_lock_resource *res, int bit)
{
        assert_spin_locked(&res->spinlock);

        mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
             res->lockname.name, bit, __builtin_return_address(0));

        clear_bit(bit, res->refmap);
}

static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
                                            struct dlm_lock_resource *res)
{
        res->inflight_locks++;

        mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
             res->lockname.len, res->lockname.name, res->inflight_locks,
             __builtin_return_address(0));
}

void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
                                   struct dlm_lock_resource *res)
{
        assert_spin_locked(&res->spinlock);
        __dlm_lockres_grab_inflight_ref(dlm, res);
}

void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
                                   struct dlm_lock_resource *res)
{
        assert_spin_locked(&res->spinlock);

        BUG_ON(res->inflight_locks == 0);

        res->inflight_locks--;

        mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
             res->lockname.len, res->lockname.name, res->inflight_locks,
             __builtin_return_address(0));

        wake_up(&res->wq);
}

/*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
 * lockid is null terminated
 *
 * if not, allocate enough for the lockres and for
 * the temporary structure used in doing the mastering.
 *
 * also, do a lookup in the dlm->master_list to see
 * if another node has begun mastering the same lock.
 * if so, there should be a block entry in there
 * for this name, and we should *not* attempt to master
 * the lock here.  need to wait around for that node
 * to assert_master (or die).
 *
 */
struct dlm_lock_resource *dlm_get_lock_resource(struct dlm_ctxt *dlm,
                                                const char *lockid,
                                                int namelen,
                                                int flags)
{
        struct dlm_lock_resource *tmpres = NULL, *res = NULL;
        struct dlm_master_list_entry *mle = NULL;
        struct dlm_master_list_entry *alloc_mle = NULL;
        int blocked = 0;
        int ret, nodenum;
        struct dlm_node_iter iter;
        unsigned int hash;
        int tries = 0;
        int bit, wait_on_recovery = 0;

        BUG_ON(!lockid);

        hash = dlm_lockid_hash(lockid, namelen);

        mlog(0, "get lockres %s (len %d)\n", lockid, namelen);

lookup:
        spin_lock(&dlm->spinlock);
        tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
        if (tmpres) {
                spin_unlock(&dlm->spinlock);
                spin_lock(&tmpres->spinlock);

                /*
                 * Right after dlm spinlock was released, dlm_thread could have
                 * purged the lockres. Check if lockres got unhashed. If so
                 * start over.
                 */
                if (hlist_unhashed(&tmpres->hash_node)) {
                        spin_unlock(&tmpres->spinlock);
                        dlm_lockres_put(tmpres);
                        tmpres = NULL;
                        goto lookup;
                }

                /* Wait on the thread that is mastering the resource */
                if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                        __dlm_wait_on_lockres(tmpres);
                        BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
                        spin_unlock(&tmpres->spinlock);
                        dlm_lockres_put(tmpres);
                        tmpres = NULL;
                        goto lookup;
                }

                /* Wait on the resource purge to complete before continuing */
                if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
                        BUG_ON(tmpres->owner == dlm->node_num);
                        __dlm_wait_on_lockres_flags(tmpres,
                                                    DLM_LOCK_RES_DROPPING_REF);
                        spin_unlock(&tmpres->spinlock);
                        dlm_lockres_put(tmpres);
                        tmpres = NULL;
                        goto lookup;
                }

                /* Grab inflight ref to pin the resource */
                dlm_lockres_grab_inflight_ref(dlm, tmpres);

                spin_unlock(&tmpres->spinlock);
                if (res)
                        dlm_lockres_put(res);
                res = tmpres;
                goto leave;
        }

        if (!res) {
                spin_unlock(&dlm->spinlock);
                mlog(0, "allocating a new resource\n");
                /* nothing found and we need to allocate one. */
                alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
                if (!alloc_mle)
                        goto leave;
                res = dlm_new_lockres(dlm, lockid, namelen);
                if (!res)
                        goto leave;
                goto lookup;
        }

        mlog(0, "no lockres found, allocated our own: %p\n", res);

        if (flags & LKM_LOCAL) {
                /* caller knows it's safe to assume it's not mastered elsewhere
                 * DONE!  return right away */
                spin_lock(&res->spinlock);
                dlm_change_lockres_owner(dlm, res, dlm->node_num);
                __dlm_insert_lockres(dlm, res);
                dlm_lockres_grab_inflight_ref(dlm, res);
                spin_unlock(&res->spinlock);
                spin_unlock(&dlm->spinlock);
                /* lockres still marked IN_PROGRESS */
                goto wake_waiters;
        }

        /* check master list to see if another node has started mastering it */
        spin_lock(&dlm->master_lock);

        /* if we found a block, wait for lock to be mastered by another node */
        blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
        if (blocked) {
                int mig;
                if (mle->type == DLM_MLE_MASTER) {
                        mlog(ML_ERROR, "master entry for nonexistent lock!\n");
                        BUG();
                }
                mig = (mle->type == DLM_MLE_MIGRATION);
                /* if there is a migration in progress, let the migration
                 * finish before continuing.  we can wait for the absence
                 * of the MIGRATION mle: either the migrate finished or
                 * one of the nodes died and the mle was cleaned up.
                 * if there is a BLOCK here, but it already has a master
                 * set, we are too late.  the master does not have a ref
                 * for us in the refmap.  detach the mle and drop it.
                 * either way, go back to the top and start over. */
                if (mig || mle->master != O2NM_MAX_NODES) {
                        BUG_ON(mig && mle->master == dlm->node_num);
                        /* we arrived too late.  the master does not
                         * have a ref for us. retry. */
                        mlog(0, "%s:%.*s: late on %s\n",
                             dlm->name, namelen, lockid,
                             mig ? "MIGRATION" : "BLOCK");
                        spin_unlock(&dlm->master_lock);
                        spin_unlock(&dlm->spinlock);

                        /* master is known, detach */
                        if (!mig)
                                dlm_mle_detach_hb_events(dlm, mle);
                        dlm_put_mle(mle);
                        mle = NULL;
                        /* this is lame, but we can't wait on either
                         * the mle or lockres waitqueue here */
                        if (mig)
                                msleep(100);
                        goto lookup;
                }
        } else {
                /* go ahead and try to master lock on this node */
                mle = alloc_mle;
                /* make sure this does not get freed below */
                alloc_mle = NULL;
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
                __dlm_insert_mle(dlm, mle);

                /* still holding the dlm spinlock, check the recovery map
                 * to see if there are any nodes that still need to be
                 * considered.  these will not appear in the mle nodemap
                 * but they might own this lockres.  wait on them. */
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
                        mlog(0, "%s: res %.*s, At least one node (%d) "
                             "to recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
                }
        }

        /* at this point there is either a DLM_MLE_BLOCK or a
         * DLM_MLE_MASTER on the master list, so it's safe to add the
         * lockres to the hashtable.  anyone who finds the lock will
         * still have to wait on the IN_PROGRESS. */

        /* finally add the lockres to its hash bucket */
        __dlm_insert_lockres(dlm, res);

        /* since this lockres is new it does not require the spinlock */
        __dlm_lockres_grab_inflight_ref(dlm, res);

        /* get an extra ref on the mle in case this is a BLOCK
         * if so, the creator of the BLOCK may try to put the last
         * ref at this time in the assert master handler, so we
         * need an extra one to keep from a bad ptr deref. */
        dlm_get_mle_inuse(mle);
        spin_unlock(&dlm->master_lock);
        spin_unlock(&dlm->spinlock);

redo_request:
        while (wait_on_recovery) {
                /* any cluster changes that occurred after dropping the
                 * dlm spinlock would be detectable by a change on the mle,
                 * so we only need to clear out the recovery map once. */
                if (dlm_is_recovery_lock(lockid, namelen)) {
                        mlog(0, "%s: Recovery map is not empty, but must "
                             "master $RECOVERY lock now\n", dlm->name);
                        if (!dlm_pre_master_reco_lockres(dlm, res))
                                wait_on_recovery = 0;
                        else {
                                mlog(0, "%s: waiting 500ms for heartbeat state "
                                     "change\n", dlm->name);
                                msleep(500);
                        }
                        continue;
                }

                dlm_kick_recovery_thread(dlm);
                msleep(1000);
                dlm_wait_for_recovery(dlm);

                spin_lock(&dlm->spinlock);
                bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                if (bit < O2NM_MAX_NODES) {
                        mlog(0, "%s: res %.*s, At least one node (%d) "
                             "to recover before lock mastery can begin\n",
                             dlm->name, namelen, (char *)lockid, bit);
                        wait_on_recovery = 1;
                } else
                        wait_on_recovery = 0;
                spin_unlock(&dlm->spinlock);

                if (wait_on_recovery)
                        dlm_wait_for_node_recovery(dlm, bit, 10000);
        }

        /* must wait for lock to be mastered elsewhere */
        if (blocked)
                goto wait;

        ret = -EINVAL;
        dlm_node_iter_init(mle->vote_map, &iter);
        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
                ret = dlm_do_master_request(res, mle, nodenum);
                if (ret < 0)
                        mlog_errno(ret);
                if (mle->master != O2NM_MAX_NODES) {
                        /* found a master ! */
                        if (mle->master <= nodenum)
                                break;
                        /* if our master request has not reached the master
                         * yet, keep going until it does.  this is how the
                         * master will know that asserts are needed back to
                         * the lower nodes. */
                        mlog(0, "%s: res %.*s, Requests only up to %u but "
                             "master is %u, keep going\n", dlm->name, namelen,
                             lockid, nodenum, mle->master);
                }
        }

wait:
        /* keep going until the response map includes all nodes */
        ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
        if (ret < 0) {
                wait_on_recovery = 1;
                mlog(0, "%s: res %.*s, Node map changed, redo the master "
                     "request now, blocked=%d\n", dlm->name, res->lockname.len,
                     res->lockname.name, blocked);
                if (++tries > 20) {
                        mlog(ML_ERROR, "%s: res %.*s, Spinning on "
                             "dlm_wait_for_lock_mastery, blocked = %d\n",
                             dlm->name, res->lockname.len,
                             res->lockname.name, blocked);
                        dlm_print_one_lock_resource(res);
                        dlm_print_one_mle(mle);
                        tries = 0;
                }
                goto redo_request;
        }

        mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
             res->lockname.name, res->owner);
        /* make sure we never continue without this */
        BUG_ON(res->owner == O2NM_MAX_NODES);

        /* master is known, detach if not already detached */
        dlm_mle_detach_hb_events(dlm, mle);
        dlm_put_mle(mle);
        /* put the extra ref */
        dlm_put_mle_inuse(mle);

wake_waiters:
        spin_lock(&res->spinlock);
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
        spin_unlock(&res->spinlock);
        wake_up(&res->wq);

leave:
        /* need to free the unused mle */
        if (alloc_mle)
                kmem_cache_free(dlm_mle_cache, alloc_mle);

        return res;
}
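
/*
 * Illustration only, kept out of the build: a minimal caller of
 * dlm_get_lock_resource().  The lockres comes back with a kref and an
 * inflight reference held.  The failure status shown (DLM_IVLOCKID) is
 * an assumption borrowed from how dlmlock() reports a NULL lockres.
 */
#if 0
        struct dlm_lock_resource *res;

        res = dlm_get_lock_resource(dlm, lockid, namelen, flags);
        if (!res)
                return DLM_IVLOCKID;

        /* ... attach the new lock to res and wait for it to be granted ... */

        dlm_lockres_put(res);
#endif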
| 991 | |
| 992 | |
| 993 | #define DLM_MASTERY_TIMEOUT_MS 5000 |
| 994 | |
| 995 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, |
| 996 | struct dlm_lock_resource *res, |
| 997 | struct dlm_master_list_entry *mle, |
| 998 | int *blocked) |
| 999 | { |
| 1000 | u8 m; |
| 1001 | int ret, bit; |
| 1002 | int map_changed, voting_done; |
| 1003 | int assert, sleep; |
| 1004 | |
| 1005 | recheck: |
| 1006 | ret = 0; |
| 1007 | assert = 0; |
| 1008 | |
| 1009 | /* check if another node has already become the owner */ |
| 1010 | spin_lock(&res->spinlock); |
| 1011 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
| 1012 | mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, |
| 1013 | res->lockname.len, res->lockname.name, res->owner); |
| 1014 | spin_unlock(&res->spinlock); |
| 1015 | /* this will cause the master to re-assert across |
| 1016 | * the whole cluster, freeing up mles */ |
| 1017 | if (res->owner != dlm->node_num) { |
| 1018 | ret = dlm_do_master_request(res, mle, res->owner); |
| 1019 | if (ret < 0) { |
| 1020 | /* give recovery a chance to run */ |
| 1021 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); |
| 1022 | msleep(500); |
| 1023 | goto recheck; |
| 1024 | } |
| 1025 | } |
| 1026 | ret = 0; |
| 1027 | goto leave; |
| 1028 | } |
| 1029 | spin_unlock(&res->spinlock); |
| 1030 | |
| 1031 | spin_lock(&mle->spinlock); |
| 1032 | m = mle->master; |
| 1033 | map_changed = (memcmp(mle->vote_map, mle->node_map, |
| 1034 | sizeof(mle->vote_map)) != 0); |
| 1035 | voting_done = (memcmp(mle->vote_map, mle->response_map, |
| 1036 | sizeof(mle->vote_map)) == 0); |
| 1037 | |
| 1038 | /* restart if we hit any errors */ |
| 1039 | if (map_changed) { |
| 1040 | int b; |
| 1041 | mlog(0, "%s: %.*s: node map changed, restarting\n", |
| 1042 | dlm->name, res->lockname.len, res->lockname.name); |
| 1043 | ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); |
| 1044 | b = (mle->type == DLM_MLE_BLOCK); |
| 1045 | if ((*blocked && !b) || (!*blocked && b)) { |
| 1046 | mlog(0, "%s:%.*s: status change: old=%d new=%d\n", |
| 1047 | dlm->name, res->lockname.len, res->lockname.name, |
| 1048 | *blocked, b); |
| 1049 | *blocked = b; |
| 1050 | } |
| 1051 | spin_unlock(&mle->spinlock); |
| 1052 | if (ret < 0) { |
| 1053 | mlog_errno(ret); |
| 1054 | goto leave; |
| 1055 | } |
| 1056 | mlog(0, "%s:%.*s: restart lock mastery succeeded, " |
| 1057 | "rechecking now\n", dlm->name, res->lockname.len, |
| 1058 | res->lockname.name); |
| 1059 | goto recheck; |
| 1060 | } else { |
| 1061 | if (!voting_done) { |
| 1062 | mlog(0, "map not changed and voting not done " |
| 1063 | "for %s:%.*s\n", dlm->name, res->lockname.len, |
| 1064 | res->lockname.name); |
| 1065 | } |
| 1066 | } |
| 1067 | |
| 1068 | if (m != O2NM_MAX_NODES) { |
| 1069 | /* another node has done an assert! |
| 1070 | * all done! */ |
| 1071 | sleep = 0; |
| 1072 | } else { |
| 1073 | sleep = 1; |
| 1074 | /* have all nodes responded? */ |
| 1075 | if (voting_done && !*blocked) { |
| 1076 | bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); |
| 1077 | if (dlm->node_num <= bit) { |
| 1078 | /* my node number is lowest. |
| 1079 | * now tell other nodes that I am |
| 1080 | * mastering this. */ |
| 1081 | mle->master = dlm->node_num; |
| 1082 | /* ref was grabbed in get_lock_resource |
| 1083 | * will be dropped in dlmlock_master */ |
| 1084 | assert = 1; |
| 1085 | sleep = 0; |
| 1086 | } |
| 1087 | /* if voting is done, but we have not received |
| 1088 | * an assert master yet, we must sleep */ |
| 1089 | } |
| 1090 | } |
| 1091 | |
| 1092 | spin_unlock(&mle->spinlock); |
| 1093 | |
| 1094 | /* sleep if we haven't finished voting yet */ |
| 1095 | if (sleep) { |
| 1096 | unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); |
| 1097 | |
| 1098 | /* |
| 1099 | if (atomic_read(&mle->mle_refs.refcount) < 2) |
| 1100 | mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, |
| 1101 | atomic_read(&mle->mle_refs.refcount), |
| 1102 | res->lockname.len, res->lockname.name); |
| 1103 | */ |
| 1104 | atomic_set(&mle->woken, 0); |
| 1105 | (void)wait_event_timeout(mle->wq, |
| 1106 | (atomic_read(&mle->woken) == 1), |
| 1107 | timeo); |
| 1108 | if (res->owner == O2NM_MAX_NODES) { |
| 1109 | mlog(0, "%s:%.*s: waiting again\n", dlm->name, |
| 1110 | res->lockname.len, res->lockname.name); |
| 1111 | goto recheck; |
| 1112 | } |
| 1113 | mlog(0, "done waiting, master is %u\n", res->owner); |
| 1114 | ret = 0; |
| 1115 | goto leave; |
| 1116 | } |
| 1117 | |
| 1118 | ret = 0; /* done */ |
| 1119 | if (assert) { |
| 1120 | m = dlm->node_num; |
| 1121 | mlog(0, "about to master %.*s here, this=%u\n", |
| 1122 | res->lockname.len, res->lockname.name, m); |
| 1123 | ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); |
| 1124 | if (ret) { |
| 1125 | /* This is a failure in the network path, |
| 1126 | * not in the response to the assert_master |
| 1127 | * (any nonzero response is a BUG on this node). |
| 1128 | * Most likely a socket just got disconnected |
| 1129 | * due to node death. */ |
| 1130 | mlog_errno(ret); |
| 1131 | } |
| 1132 | /* no longer need to restart lock mastery. |
| 1133 | * all living nodes have been contacted. */ |
| 1134 | ret = 0; |
| 1135 | } |
| 1136 | |
| 1137 | /* set the lockres owner */ |
| 1138 | spin_lock(&res->spinlock); |
| 1139 | /* mastery reference obtained either during |
| 1140 | * assert_master_handler or in get_lock_resource */ |
| 1141 | dlm_change_lockres_owner(dlm, res, m); |
| 1142 | spin_unlock(&res->spinlock); |
| 1143 | |
| 1144 | leave: |
| 1145 | return ret; |
| 1146 | } |
| 1147 | |
| 1148 | struct dlm_bitmap_diff_iter |
| 1149 | { |
| 1150 | int curnode; |
| 1151 | unsigned long *orig_bm; |
| 1152 | unsigned long *cur_bm; |
| 1153 | unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
| 1154 | }; |
| 1155 | |
| 1156 | enum dlm_node_state_change |
| 1157 | { |
| 1158 | NODE_DOWN = -1, |
| 1159 | NODE_NO_CHANGE = 0, |
| 1160 | NODE_UP |
| 1161 | }; |
| 1162 | |
| 1163 | static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, |
| 1164 | unsigned long *orig_bm, |
| 1165 | unsigned long *cur_bm) |
| 1166 | { |
| 1167 | unsigned long p1, p2; |
| 1168 | int i; |
| 1169 | |
| 1170 | iter->curnode = -1; |
| 1171 | iter->orig_bm = orig_bm; |
| 1172 | iter->cur_bm = cur_bm; |
| 1173 | |
| 1174 | for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { |
| 1175 | p1 = *(iter->orig_bm + i); |
| 1176 | p2 = *(iter->cur_bm + i); |
| 1177 | iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); |
| 1178 | } |
| 1179 | } |
| 1180 | |
| 1181 | static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, |
| 1182 | enum dlm_node_state_change *state) |
| 1183 | { |
| 1184 | int bit; |
| 1185 | |
| 1186 | if (iter->curnode >= O2NM_MAX_NODES) |
| 1187 | return -ENOENT; |
| 1188 | |
| 1189 | bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, |
| 1190 | iter->curnode+1); |
| 1191 | if (bit >= O2NM_MAX_NODES) { |
| 1192 | iter->curnode = O2NM_MAX_NODES; |
| 1193 | return -ENOENT; |
| 1194 | } |
| 1195 | |
| 1196 | /* if it was there in the original then this node died */ |
| 1197 | if (test_bit(bit, iter->orig_bm)) |
| 1198 | *state = NODE_DOWN; |
| 1199 | else |
| 1200 | *state = NODE_UP; |
| 1201 | |
| 1202 | iter->curnode = bit; |
| 1203 | return bit; |
| 1204 | } |
| 1205 | |
| 1206 | |
| 1207 | static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, |
| 1208 | struct dlm_lock_resource *res, |
| 1209 | struct dlm_master_list_entry *mle, |
| 1210 | int blocked) |
| 1211 | { |
| 1212 | struct dlm_bitmap_diff_iter bdi; |
| 1213 | enum dlm_node_state_change sc; |
| 1214 | int node; |
| 1215 | int ret = 0; |
| 1216 | |
| 1217 | mlog(0, "something happened such that the " |
| 1218 | "master process may need to be restarted!\n"); |
| 1219 | |
| 1220 | assert_spin_locked(&mle->spinlock); |
| 1221 | |
| 1222 | dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); |
| 1223 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); |
| 1224 | while (node >= 0) { |
| 1225 | if (sc == NODE_UP) { |
| 1226 | /* a node came up. clear any old vote from |
| 1227 | * the response map and set it in the vote map |
| 1228 | * then restart the mastery. */ |
| 1229 | mlog(ML_NOTICE, "node %d up while restarting\n", node); |
| 1230 | |
| 1231 | /* redo the master request, but only for the new node */ |
| 1232 | mlog(0, "sending request to new node\n"); |
| 1233 | clear_bit(node, mle->response_map); |
| 1234 | set_bit(node, mle->vote_map); |
| 1235 | } else { |
| 1236 | mlog(ML_ERROR, "node down! %d\n", node); |
| 1237 | if (blocked) { |
| 1238 | int lowest = find_next_bit(mle->maybe_map, |
| 1239 | O2NM_MAX_NODES, 0); |
| 1240 | |
| 1241 | /* act like it was never there */ |
| 1242 | clear_bit(node, mle->maybe_map); |
| 1243 | |
| 1244 | if (node == lowest) { |
| 1245 | mlog(0, "expected master %u died" |
| 1246 | " while this node was blocked " |
| 1247 | "waiting on it!\n", node); |
| 1248 | lowest = find_next_bit(mle->maybe_map, |
| 1249 | O2NM_MAX_NODES, |
| 1250 | lowest+1); |
| 1251 | if (lowest < O2NM_MAX_NODES) { |
| 1252 | mlog(0, "%s:%.*s:still " |
| 1253 | "blocked. waiting on %u " |
| 1254 | "now\n", dlm->name, |
| 1255 | res->lockname.len, |
| 1256 | res->lockname.name, |
| 1257 | lowest); |
| 1258 | } else { |
| 1259 | /* mle is an MLE_BLOCK, but |
| 1260 | * there is now nothing left to |
| 1261 | * block on. we need to return |
| 1262 | * all the way back out and try |
| 1263 | * again with an MLE_MASTER. |
| 1264 | * dlm_do_local_recovery_cleanup |
| 1265 | * has already run, so the mle |
| 1266 | * refcount is ok */ |
| 1267 | mlog(0, "%s:%.*s: no " |
| 1268 | "longer blocking. try to " |
| 1269 | "master this here\n", |
| 1270 | dlm->name, |
| 1271 | res->lockname.len, |
| 1272 | res->lockname.name); |
| 1273 | mle->type = DLM_MLE_MASTER; |
| 1274 | mle->mleres = res; |
| 1275 | } |
| 1276 | } |
| 1277 | } |
| 1278 | |
| 1279 | /* now blank out everything, as if we had never |
| 1280 | * contacted anyone */ |
| 1281 | memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); |
| 1282 | memset(mle->response_map, 0, sizeof(mle->response_map)); |
| 1283 | /* reset the vote_map to the current node_map */ |
| 1284 | memcpy(mle->vote_map, mle->node_map, |
| 1285 | sizeof(mle->node_map)); |
| 1286 | /* put myself into the maybe map */ |
| 1287 | if (mle->type != DLM_MLE_BLOCK) |
| 1288 | set_bit(dlm->node_num, mle->maybe_map); |
| 1289 | } |
| 1290 | ret = -EAGAIN; |
| 1291 | node = dlm_bitmap_diff_iter_next(&bdi, &sc); |
| 1292 | } |
| 1293 | return ret; |
| 1294 | } |
| 1295 | |
| 1296 | |
| 1297 | /* |
| 1298 | * DLM_MASTER_REQUEST_MSG |
| 1299 | * |
| 1300 | * returns: 0 on success, |
| 1301 | * -errno on a network error |
| 1302 | * |
| 1303 | * on error, the caller should assume the target node is "dead" |
| 1304 | * |
| 1305 | */ |
| 1306 | |
| 1307 | static int dlm_do_master_request(struct dlm_lock_resource *res, |
| 1308 | struct dlm_master_list_entry *mle, int to) |
| 1309 | { |
| 1310 | struct dlm_ctxt *dlm = mle->dlm; |
| 1311 | struct dlm_master_request request; |
| 1312 | int ret, response=0, resend; |
| 1313 | |
| 1314 | memset(&request, 0, sizeof(request)); |
| 1315 | request.node_idx = dlm->node_num; |
| 1316 | |
| 1317 | BUG_ON(mle->type == DLM_MLE_MIGRATION); |
| 1318 | |
| 1319 | request.namelen = (u8)mle->mnamelen; |
| 1320 | memcpy(request.name, mle->mname, request.namelen); |
| 1321 | |
| 1322 | again: |
| 1323 | ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, |
| 1324 | sizeof(request), to, &response); |
| 1325 | if (ret < 0) { |
| 1326 | if (ret == -ESRCH) { |
| 1327 | /* should never happen */ |
| 1328 | mlog(ML_ERROR, "TCP stack not ready!\n"); |
| 1329 | BUG(); |
| 1330 | } else if (ret == -EINVAL) { |
| 1331 | mlog(ML_ERROR, "bad args passed to o2net!\n"); |
| 1332 | BUG(); |
| 1333 | } else if (ret == -ENOMEM) { |
| 1334 | mlog(ML_ERROR, "out of memory while trying to send " |
| 1335 | "network message! retrying\n"); |
| 1336 | /* this is totally crude */ |
| 1337 | msleep(50); |
| 1338 | goto again; |
| 1339 | } else if (!dlm_is_host_down(ret)) { |
| 1340 | /* not a network error. bad. */ |
| 1341 | mlog_errno(ret); |
| 1342 | mlog(ML_ERROR, "unhandled error!"); |
| 1343 | BUG(); |
| 1344 | } |
| 1345 | /* all other errors should be network errors, |
| 1346 | * and likely indicate node death */ |
| 1347 | mlog(ML_ERROR, "link to %d went down!\n", to); |
| 1348 | goto out; |
| 1349 | } |
| 1350 | |
| 1351 | ret = 0; |
| 1352 | resend = 0; |
| 1353 | spin_lock(&mle->spinlock); |
| 1354 | switch (response) { |
| 1355 | case DLM_MASTER_RESP_YES: |
| 1356 | set_bit(to, mle->response_map); |
| 1357 | mlog(0, "node %u is the master, response=YES\n", to); |
| 1358 | mlog(0, "%s:%.*s: master node %u now knows I have a " |
| 1359 | "reference\n", dlm->name, res->lockname.len, |
| 1360 | res->lockname.name, to); |
| 1361 | mle->master = to; |
| 1362 | break; |
| 1363 | case DLM_MASTER_RESP_NO: |
| 1364 | mlog(0, "node %u not master, response=NO\n", to); |
| 1365 | set_bit(to, mle->response_map); |
| 1366 | break; |
| 1367 | case DLM_MASTER_RESP_MAYBE: |
| 1368 | mlog(0, "node %u not master, response=MAYBE\n", to); |
| 1369 | set_bit(to, mle->response_map); |
| 1370 | set_bit(to, mle->maybe_map); |
| 1371 | break; |
| 1372 | case DLM_MASTER_RESP_ERROR: |
| 1373 | mlog(0, "node %u hit an error, resending\n", to); |
| 1374 | resend = 1; |
| 1375 | response = 0; |
| 1376 | break; |
| 1377 | default: |
| 1378 | mlog(ML_ERROR, "bad response! %u\n", response); |
| 1379 | BUG(); |
| 1380 | } |
| 1381 | spin_unlock(&mle->spinlock); |
| 1382 | if (resend) { |
| 1383 | /* this is also totally crude */ |
| 1384 | msleep(50); |
| 1385 | goto again; |
| 1386 | } |
| 1387 | |
| 1388 | out: |
| 1389 | return ret; |
| 1390 | } |
| 1391 | |
| 1392 | /* |
| 1393 | * locks that can be taken here: |
| 1394 | * dlm->spinlock |
| 1395 | * res->spinlock |
| 1396 | * mle->spinlock |
| 1397 | * dlm->master_list |
| 1398 | * |
| 1399 | * if possible, TRIM THIS DOWN!!! |
| 1400 | */ |
| 1401 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, |
| 1402 | void **ret_data) |
| 1403 | { |
| 1404 | u8 response = DLM_MASTER_RESP_MAYBE; |
| 1405 | struct dlm_ctxt *dlm = data; |
| 1406 | struct dlm_lock_resource *res = NULL; |
| 1407 | struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; |
| 1408 | struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; |
| 1409 | char *name; |
| 1410 | unsigned int namelen, hash; |
| 1411 | int found, ret; |
| 1412 | int set_maybe; |
| 1413 | int dispatch_assert = 0; |
| 1414 | |
| 1415 | if (!dlm_grab(dlm)) |
| 1416 | return DLM_MASTER_RESP_NO; |
| 1417 | |
| 1418 | if (!dlm_domain_fully_joined(dlm)) { |
| 1419 | response = DLM_MASTER_RESP_NO; |
| 1420 | goto send_response; |
| 1421 | } |
| 1422 | |
| 1423 | name = request->name; |
| 1424 | namelen = request->namelen; |
| 1425 | hash = dlm_lockid_hash(name, namelen); |
| 1426 | |
| 1427 | if (namelen > DLM_LOCKID_NAME_MAX) { |
| 1428 | response = DLM_IVBUFLEN; |
| 1429 | goto send_response; |
| 1430 | } |
| 1431 | |
| 1432 | way_up_top: |
| 1433 | spin_lock(&dlm->spinlock); |
| 1434 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
| 1435 | if (res) { |
| 1436 | spin_unlock(&dlm->spinlock); |
| 1437 | |
| 1438 | /* take care of the easy cases up front */ |
| 1439 | spin_lock(&res->spinlock); |
| 1440 | if (res->state & (DLM_LOCK_RES_RECOVERING| |
| 1441 | DLM_LOCK_RES_MIGRATING)) { |
| 1442 | spin_unlock(&res->spinlock); |
| 1443 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " |
| 1444 | "being recovered/migrated\n"); |
| 1445 | response = DLM_MASTER_RESP_ERROR; |
| 1446 | if (mle) |
| 1447 | kmem_cache_free(dlm_mle_cache, mle); |
| 1448 | goto send_response; |
| 1449 | } |
| 1450 | |
| 1451 | if (res->owner == dlm->node_num) { |
| 1452 | dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); |
| 1453 | spin_unlock(&res->spinlock); |
| 1454 | response = DLM_MASTER_RESP_YES; |
| 1455 | if (mle) |
| 1456 | kmem_cache_free(dlm_mle_cache, mle); |
| 1457 | |
| 1458 | /* this node is the owner. |
| 1459 | * there is some extra work that needs to |
| 1460 | * happen now. the requesting node has |
| 1461 | * caused all nodes up to this one to |
| 1462 | * create mles. this node now needs to |
| 1463 | * go back and clean those up. */ |
| 1464 | dispatch_assert = 1; |
| 1465 | goto send_response; |
| 1466 | } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
| 1467 | spin_unlock(&res->spinlock); |
| 1468 | // mlog(0, "node %u is the master\n", res->owner); |
| 1469 | response = DLM_MASTER_RESP_NO; |
| 1470 | if (mle) |
| 1471 | kmem_cache_free(dlm_mle_cache, mle); |
| 1472 | goto send_response; |
| 1473 | } |
| 1474 | |
| 1475 | /* ok, there is no owner. either this node is |
| 1476 | * being blocked, or it is actively trying to |
| 1477 | * master this lock. */ |
| 1478 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { |
| 1479 | mlog(ML_ERROR, "lock with no owner should be " |
| 1480 | "in-progress!\n"); |
| 1481 | BUG(); |
| 1482 | } |
| 1483 | |
| 1484 | // mlog(0, "lockres is in progress...\n"); |
| 1485 | spin_lock(&dlm->master_lock); |
| 1486 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); |
| 1487 | if (!found) { |
| 1488 | mlog(ML_ERROR, "no mle found for this lock!\n"); |
| 1489 | BUG(); |
| 1490 | } |
| 1491 | set_maybe = 1; |
| 1492 | spin_lock(&tmpmle->spinlock); |
| 1493 | if (tmpmle->type == DLM_MLE_BLOCK) { |
| 1494 | // mlog(0, "this node is waiting for " |
| 1495 | // "lockres to be mastered\n"); |
| 1496 | response = DLM_MASTER_RESP_NO; |
| 1497 | } else if (tmpmle->type == DLM_MLE_MIGRATION) { |
| 1498 | mlog(0, "node %u is master, but trying to migrate to " |
| 1499 | "node %u.\n", tmpmle->master, tmpmle->new_master); |
| 1500 | if (tmpmle->master == dlm->node_num) { |
| 1501 | mlog(ML_ERROR, "no owner on lockres, but this " |
| 1502 | "node is trying to migrate it to %u?!\n", |
| 1503 | tmpmle->new_master); |
| 1504 | BUG(); |
| 1505 | } else { |
| 1506 | /* the real master can respond on its own */ |
| 1507 | response = DLM_MASTER_RESP_NO; |
| 1508 | } |
| 1509 | } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { |
| 1510 | set_maybe = 0; |
| 1511 | if (tmpmle->master == dlm->node_num) { |
| 1512 | response = DLM_MASTER_RESP_YES; |
| 1513 | /* this node will be the owner. |
| 1514 | * go back and clean the mles on any |
| 1515 | * other nodes */ |
| 1516 | dispatch_assert = 1; |
| 1517 | dlm_lockres_set_refmap_bit(dlm, res, |
| 1518 | request->node_idx); |
| 1519 | } else |
| 1520 | response = DLM_MASTER_RESP_NO; |
| 1521 | } else { |
| 1522 | // mlog(0, "this node is attempting to " |
| 1523 | // "master lockres\n"); |
| 1524 | response = DLM_MASTER_RESP_MAYBE; |
| 1525 | } |
| 1526 | if (set_maybe) |
| 1527 | set_bit(request->node_idx, tmpmle->maybe_map); |
| 1528 | spin_unlock(&tmpmle->spinlock); |
| 1529 | |
| 1530 | spin_unlock(&dlm->master_lock); |
| 1531 | spin_unlock(&res->spinlock); |
| 1532 | |
| 1533 | /* keep the mle attached to heartbeat events */ |
| 1534 | dlm_put_mle(tmpmle); |
| 1535 | if (mle) |
| 1536 | kmem_cache_free(dlm_mle_cache, mle); |
| 1537 | goto send_response; |
| 1538 | } |
| 1539 | |
| 1540 | /* |
| 1541 | * lockres doesn't exist on this node |
| 1542 | * if there is an MLE_BLOCK, return NO |
| 1543 | * if there is an MLE_MASTER, return MAYBE |
| 1544 | * otherwise, add an MLE_BLOCK, return NO |
| 1545 | */ |
| 1546 | spin_lock(&dlm->master_lock); |
| 1547 | found = dlm_find_mle(dlm, &tmpmle, name, namelen); |
| 1548 | if (!found) { |
| 1549 | /* this lockid has never been seen on this node yet */ |
| 1550 | // mlog(0, "no mle found\n"); |
| 1551 | if (!mle) { |
| 1552 | spin_unlock(&dlm->master_lock); |
| 1553 | spin_unlock(&dlm->spinlock); |
| 1554 | |
| 1555 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
| 1556 | if (!mle) { |
| 1557 | response = DLM_MASTER_RESP_ERROR; |
| 1558 | mlog_errno(-ENOMEM); |
| 1559 | goto send_response; |
| 1560 | } |
| 1561 | goto way_up_top; |
| 1562 | } |
| 1563 | |
| 1564 | // mlog(0, "this is second time thru, already allocated, " |
| 1565 | // "add the block.\n"); |
| 1566 | dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); |
| 1567 | set_bit(request->node_idx, mle->maybe_map); |
| 1568 | __dlm_insert_mle(dlm, mle); |
| 1569 | response = DLM_MASTER_RESP_NO; |
| 1570 | } else { |
| 1571 | // mlog(0, "mle was found\n"); |
| 1572 | set_maybe = 1; |
| 1573 | spin_lock(&tmpmle->spinlock); |
| 1574 | if (tmpmle->master == dlm->node_num) { |
| 1575 | mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); |
| 1576 | BUG(); |
| 1577 | } |
| 1578 | if (tmpmle->type == DLM_MLE_BLOCK) |
| 1579 | response = DLM_MASTER_RESP_NO; |
| 1580 | else if (tmpmle->type == DLM_MLE_MIGRATION) { |
| 1581 | mlog(0, "migration mle was found (%u->%u)\n", |
| 1582 | tmpmle->master, tmpmle->new_master); |
| 1583 | /* real master can respond on its own */ |
| 1584 | response = DLM_MASTER_RESP_NO; |
| 1585 | } else |
| 1586 | response = DLM_MASTER_RESP_MAYBE; |
| 1587 | if (set_maybe) |
| 1588 | set_bit(request->node_idx, tmpmle->maybe_map); |
| 1589 | spin_unlock(&tmpmle->spinlock); |
| 1590 | } |
| 1591 | spin_unlock(&dlm->master_lock); |
| 1592 | spin_unlock(&dlm->spinlock); |
| 1593 | |
| 1594 | if (found) { |
| 1595 | /* keep the mle attached to heartbeat events */ |
| 1596 | dlm_put_mle(tmpmle); |
| 1597 | } |
| 1598 | send_response: |
| 1599 | /* |
| 1600 | * __dlm_lookup_lockres() grabbed a reference to this lockres. |
| 1601 | * The reference is released by dlm_assert_master_worker() under |
| 1602 | * the call to dlm_dispatch_assert_master(). If |
| 1603 | * dlm_assert_master_worker() isn't called, we drop it here. |
| 1604 | */ |
| 1605 | if (dispatch_assert) { |
| 1606 | if (response != DLM_MASTER_RESP_YES) |
| 1607 | mlog(ML_ERROR, "invalid response %d\n", response); |
| 1608 | if (!res) { |
| 1609 | mlog(ML_ERROR, "bad lockres while trying to assert!\n"); |
| 1610 | BUG(); |
| 1611 | } |
| 1612 | mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", |
| 1613 | dlm->node_num, res->lockname.len, res->lockname.name); |
| 1614 | ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, |
| 1615 | DLM_ASSERT_MASTER_MLE_CLEANUP); |
| 1616 | if (ret < 0) { |
| 1617 | mlog(ML_ERROR, "failed to dispatch assert master work\n"); |
| 1618 | response = DLM_MASTER_RESP_ERROR; |
| 1619 | dlm_lockres_put(res); |
| 1620 | } |
| 1621 | } else { |
| 1622 | if (res) |
| 1623 | dlm_lockres_put(res); |
| 1624 | } |
| 1625 | |
| 1626 | dlm_put(dlm); |
| 1627 | return response; |
| 1628 | } |
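| | |
| | /* |
| | * Illustration only (a sketch, not built): how a requesting node might |
| | * fold one DLM_MASTER_RESP_* reply into its mle state. The real logic |
| | * lives in the master request path; the names below follow the mle |
| | * fields used in the handler above. |
| | */ |
| | #if 0 |
| | static void dlm_example_track_response(struct dlm_master_list_entry *mle, |
| | int from, int resp) |
| | { |
| | spin_lock(&mle->spinlock); |
| | switch (resp) { |
| | case DLM_MASTER_RESP_YES: |
| | set_bit(from, mle->response_map); |
| | mle->master = from; /* mastery is decided */ |
| | break; |
| | case DLM_MASTER_RESP_NO: |
| | set_bit(from, mle->response_map); |
| | clear_bit(from, mle->maybe_map); |
| | break; |
| | case DLM_MASTER_RESP_MAYBE: |
| | set_bit(from, mle->response_map); |
| | set_bit(from, mle->maybe_map); /* that node is contending too */ |
| | break; |
| | default: |
| | break; /* DLM_MASTER_RESP_ERROR: caller just retries */ |
| | } |
| | spin_unlock(&mle->spinlock); |
| | } |
| | #endif |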
| 1629 | |
| 1630 | /* |
| 1631 | * DLM_ASSERT_MASTER_MSG |
| 1632 | */ |
| 1633 | |
| 1635 | /* |
| 1636 | * NOTE: this can also be used for debugging: a caller could |
| 1637 | * periodically walk all locks owned by this node and re-assert |
| 1638 | * mastery across the cluster (see the sketch after this function). |
| 1639 | */ |
| 1640 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, |
| 1641 | struct dlm_lock_resource *res, |
| 1642 | void *nodemap, u32 flags) |
| 1643 | { |
| 1644 | struct dlm_assert_master assert; |
| 1645 | int to, tmpret; |
| 1646 | struct dlm_node_iter iter; |
| 1647 | int ret = 0; |
| 1648 | int reassert; |
| 1649 | const char *lockname = res->lockname.name; |
| 1650 | unsigned int namelen = res->lockname.len; |
| 1651 | |
| 1652 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
| 1653 | |
| 1654 | spin_lock(&res->spinlock); |
| 1655 | res->state |= DLM_LOCK_RES_SETREF_INPROG; |
| 1656 | spin_unlock(&res->spinlock); |
| 1657 | |
| 1658 | again: |
| 1659 | reassert = 0; |
| 1660 | |
| 1661 | /* note that if this nodemap is empty, the loop below is skipped and we return 0 */ |
| 1662 | dlm_node_iter_init(nodemap, &iter); |
| 1663 | while ((to = dlm_node_iter_next(&iter)) >= 0) { |
| 1664 | int r = 0; |
| 1665 | struct dlm_master_list_entry *mle = NULL; |
| 1666 | |
| 1667 | mlog(0, "sending assert master to %d (%.*s)\n", to, |
| 1668 | namelen, lockname); |
| 1669 | memset(&assert, 0, sizeof(assert)); |
| 1670 | assert.node_idx = dlm->node_num; |
| 1671 | assert.namelen = namelen; |
| 1672 | memcpy(assert.name, lockname, namelen); |
| 1673 | assert.flags = cpu_to_be32(flags); |
| 1674 | |
| 1675 | tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, |
| 1676 | &assert, sizeof(assert), to, &r); |
| 1677 | if (tmpret < 0) { |
| 1678 | mlog(ML_ERROR, "Error %d when sending message %u (key " |
| 1679 | "0x%x) to node %u\n", tmpret, |
| 1680 | DLM_ASSERT_MASTER_MSG, dlm->key, to); |
| 1681 | if (!dlm_is_host_down(tmpret)) { |
| 1682 | mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); |
| 1683 | BUG(); |
| 1684 | } |
| 1685 | /* a node died. finish out the rest of the nodes. */ |
| 1686 | mlog(0, "link to %d went down!\n", to); |
| 1687 | /* any nonzero status return will do */ |
| 1688 | ret = tmpret; |
| 1689 | r = 0; |
| 1690 | } else if (r < 0) { |
| 1691 | /* ok, something is horribly messed up. kill thyself. */ |
| 1692 | mlog(ML_ERROR,"during assert master of %.*s to %u, " |
| 1693 | "got %d.\n", namelen, lockname, to, r); |
| 1694 | spin_lock(&dlm->spinlock); |
| 1695 | spin_lock(&dlm->master_lock); |
| 1696 | if (dlm_find_mle(dlm, &mle, (char *)lockname, |
| 1697 | namelen)) { |
| 1698 | dlm_print_one_mle(mle); |
| 1699 | __dlm_put_mle(mle); |
| 1700 | } |
| 1701 | spin_unlock(&dlm->master_lock); |
| 1702 | spin_unlock(&dlm->spinlock); |
| 1703 | BUG(); |
| 1704 | } |
| 1705 | |
| 1706 | if (r & DLM_ASSERT_RESPONSE_REASSERT && |
| 1707 | !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { |
| 1708 | mlog(ML_ERROR, "%.*s: very strange, " |
| 1709 | "master MLE but no lockres on %u\n", |
| 1710 | namelen, lockname, to); |
| 1711 | } |
| 1712 | |
| 1713 | if (r & DLM_ASSERT_RESPONSE_REASSERT) { |
| 1714 | mlog(0, "%.*s: node %u create mles on other " |
| 1715 | "nodes and requests a re-assert\n", |
| 1716 | namelen, lockname, to); |
| 1717 | reassert = 1; |
| 1718 | } |
| 1719 | if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { |
| 1720 | mlog(0, "%.*s: node %u has a reference to this " |
| 1721 | "lockres, set the bit in the refmap\n", |
| 1722 | namelen, lockname, to); |
| 1723 | spin_lock(&res->spinlock); |
| 1724 | dlm_lockres_set_refmap_bit(dlm, res, to); |
| 1725 | spin_unlock(&res->spinlock); |
| 1726 | } |
| 1727 | } |
| 1728 | |
| 1729 | if (reassert) |
| 1730 | goto again; |
| 1731 | |
| 1732 | spin_lock(&res->spinlock); |
| 1733 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; |
| 1734 | spin_unlock(&res->spinlock); |
| 1735 | wake_up(&res->wq); |
| 1736 | |
| 1737 | return ret; |
| 1738 | } |
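| | |
| | /* |
| | * Sketch for the NOTE above (illustration only, not built): a debugging |
| | * caller could snapshot the domain map, drop itself, and re-assert. |
| | * This mirrors what dlm_assert_master_worker() does further below. |
| | */ |
| | #if 0 |
| | static int dlm_example_reassert(struct dlm_ctxt *dlm, |
| | struct dlm_lock_resource *res) |
| | { |
| | unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
| | |
| | spin_lock(&dlm->spinlock); |
| | memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); |
| | spin_unlock(&dlm->spinlock); |
| | |
| | clear_bit(dlm->node_num, nodemap); /* never assert to self */ |
| | return dlm_do_assert_master(dlm, res, nodemap, 0); |
| | } |
| | #endif |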
| 1739 | |
| 1740 | /* |
| 1741 | * locks that can be taken here: |
| 1742 | * dlm->spinlock |
| 1743 | * res->spinlock |
| 1744 | * mle->spinlock |
| 1745 | * dlm->master_lock |
| 1746 | * |
| 1747 | * if possible, TRIM THIS DOWN!!! |
| 1748 | */ |
| 1749 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, |
| 1750 | void **ret_data) |
| 1751 | { |
| 1752 | struct dlm_ctxt *dlm = data; |
| 1753 | struct dlm_master_list_entry *mle = NULL; |
| 1754 | struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; |
| 1755 | struct dlm_lock_resource *res = NULL; |
| 1756 | char *name; |
| 1757 | unsigned int namelen, hash; |
| 1758 | u32 flags; |
| 1759 | int master_request = 0, have_lockres_ref = 0; |
| 1760 | int ret = 0; |
| 1761 | |
| 1762 | if (!dlm_grab(dlm)) |
| 1763 | return 0; |
| 1764 | |
| 1765 | name = assert->name; |
| 1766 | namelen = assert->namelen; |
| 1767 | hash = dlm_lockid_hash(name, namelen); |
| 1768 | flags = be32_to_cpu(assert->flags); |
| 1769 | |
| 1770 | if (namelen > DLM_LOCKID_NAME_MAX) { |
| 1771 | mlog(ML_ERROR, "Invalid name length!"); |
| 1772 | goto done; |
| 1773 | } |
| 1774 | |
| 1775 | spin_lock(&dlm->spinlock); |
| 1776 | |
| 1777 | if (flags) |
| 1778 | mlog(0, "assert_master with flags: %u\n", flags); |
| 1779 | |
| 1780 | /* find the MLE */ |
| 1781 | spin_lock(&dlm->master_lock); |
| 1782 | if (!dlm_find_mle(dlm, &mle, name, namelen)) { |
| 1783 | /* not an error, could be master just re-asserting */ |
| 1784 | mlog(0, "just got an assert_master from %u, but no " |
| 1785 | "MLE for it! (%.*s)\n", assert->node_idx, |
| 1786 | namelen, name); |
| 1787 | } else { |
| 1788 | int bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); |
| 1789 | if (bit >= O2NM_MAX_NODES) { |
| 1790 | /* not necessarily an error, though less likely. |
| 1791 | * could be master just re-asserting. */ |
| 1792 | mlog(0, "no bits set in the maybe_map, but %u " |
| 1793 | "is asserting! (%.*s)\n", assert->node_idx, |
| 1794 | namelen, name); |
| 1795 | } else if (bit != assert->node_idx) { |
| 1796 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { |
| 1797 | mlog(0, "master %u was found, %u should " |
| 1798 | "back off\n", assert->node_idx, bit); |
| 1799 | } else { |
| 1800 | /* with the fix for bug 569, a higher node |
| 1801 | * number winning the mastery will respond |
| 1802 | * YES to mastery requests, but this node |
| 1803 | * had no way of knowing. let it pass. */ |
| 1804 | mlog(0, "%u is the lowest node, " |
| 1805 | "%u is asserting. (%.*s) %u must " |
| 1806 | "have begun after %u won.\n", bit, |
| 1807 | assert->node_idx, namelen, name, bit, |
| 1808 | assert->node_idx); |
| 1809 | } |
| 1810 | } |
| 1811 | if (mle->type == DLM_MLE_MIGRATION) { |
| 1812 | if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { |
| 1813 | mlog(0, "%s:%.*s: got cleanup assert" |
| 1814 | " from %u for migration\n", |
| 1815 | dlm->name, namelen, name, |
| 1816 | assert->node_idx); |
| 1817 | } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { |
| 1818 | mlog(0, "%s:%.*s: got unrelated assert" |
| 1819 | " from %u for migration, ignoring\n", |
| 1820 | dlm->name, namelen, name, |
| 1821 | assert->node_idx); |
| 1822 | __dlm_put_mle(mle); |
| 1823 | spin_unlock(&dlm->master_lock); |
| 1824 | spin_unlock(&dlm->spinlock); |
| 1825 | goto done; |
| 1826 | } |
| 1827 | } |
| 1828 | } |
| 1829 | spin_unlock(&dlm->master_lock); |
| 1830 | |
| 1831 | /* ok everything checks out with the MLE |
| 1832 | * now check to see if there is a lockres */ |
| 1833 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
| 1834 | if (res) { |
| 1835 | spin_lock(&res->spinlock); |
| 1836 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
| 1837 | mlog(ML_ERROR, "%u asserting but %.*s is " |
| 1838 | "RECOVERING!\n", assert->node_idx, namelen, name); |
| 1839 | goto kill; |
| 1840 | } |
| 1841 | if (!mle) { |
| 1842 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && |
| 1843 | res->owner != assert->node_idx) { |
| 1844 | mlog(ML_ERROR, "DIE! Mastery assert from %u, " |
| 1845 | "but current owner is %u! (%.*s)\n", |
| 1846 | assert->node_idx, res->owner, namelen, |
| 1847 | name); |
| 1848 | __dlm_print_one_lock_resource(res); |
| 1849 | BUG(); |
| 1850 | } |
| 1851 | } else if (mle->type != DLM_MLE_MIGRATION) { |
| 1852 | if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { |
| 1853 | /* owner is just re-asserting */ |
| 1854 | if (res->owner == assert->node_idx) { |
| 1855 | mlog(0, "owner %u re-asserting on " |
| 1856 | "lock %.*s\n", assert->node_idx, |
| 1857 | namelen, name); |
| 1858 | goto ok; |
| 1859 | } |
| 1860 | mlog(ML_ERROR, "got assert_master from " |
| 1861 | "node %u, but %u is the owner! " |
| 1862 | "(%.*s)\n", assert->node_idx, |
| 1863 | res->owner, namelen, name); |
| 1864 | goto kill; |
| 1865 | } |
| 1866 | if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { |
| 1867 | mlog(ML_ERROR, "got assert from %u, but lock " |
| 1868 | "with no owner should be " |
| 1869 | "in-progress! (%.*s)\n", |
| 1870 | assert->node_idx, |
| 1871 | namelen, name); |
| 1872 | goto kill; |
| 1873 | } |
| 1874 | } else /* mle->type == DLM_MLE_MIGRATION */ { |
| 1875 | /* should only be getting an assert from new master */ |
| 1876 | if (assert->node_idx != mle->new_master) { |
| 1877 | mlog(ML_ERROR, "got assert from %u, but " |
| 1878 | "new master is %u, and old master " |
| 1879 | "was %u (%.*s)\n", |
| 1880 | assert->node_idx, mle->new_master, |
| 1881 | mle->master, namelen, name); |
| 1882 | goto kill; |
| 1883 | } |
| 1884 | |
| 1885 | } |
| 1886 | ok: |
| 1887 | spin_unlock(&res->spinlock); |
| 1888 | } |
| 1889 | |
| 1890 | // mlog(0, "woo! got an assert_master from node %u!\n", |
| 1891 | // assert->node_idx); |
| 1892 | if (mle) { |
| 1893 | int extra_ref = 0; |
| 1894 | int nn = -1; |
| 1895 | int rr, err = 0; |
| 1896 | |
| 1897 | spin_lock(&mle->spinlock); |
| 1898 | if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) |
| 1899 | extra_ref = 1; |
| 1900 | else { |
| 1901 | /* MASTER mle: if any bits set in the response map |
| 1902 | * then the calling node needs to re-assert to clear |
| 1903 | * up nodes that this node contacted */ |
| 1904 | while ((nn = find_next_bit(mle->response_map, O2NM_MAX_NODES, |
| 1905 | nn+1)) < O2NM_MAX_NODES) { |
| 1906 | if (nn != dlm->node_num && nn != assert->node_idx) |
| 1907 | master_request = 1; |
| 1908 | } |
| 1909 | } |
| 1910 | mle->master = assert->node_idx; |
| 1911 | atomic_set(&mle->woken, 1); |
| 1912 | wake_up(&mle->wq); |
| 1913 | spin_unlock(&mle->spinlock); |
| 1914 | |
| 1915 | if (res) { |
| 1916 | int wake = 0; |
| 1917 | spin_lock(&res->spinlock); |
| 1918 | if (mle->type == DLM_MLE_MIGRATION) { |
| 1919 | mlog(0, "finishing off migration of lockres %.*s, " |
| 1920 | "from %u to %u\n", |
| 1921 | res->lockname.len, res->lockname.name, |
| 1922 | dlm->node_num, mle->new_master); |
| 1923 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
| 1924 | wake = 1; |
| 1925 | dlm_change_lockres_owner(dlm, res, mle->new_master); |
| 1926 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); |
| 1927 | } else { |
| 1928 | dlm_change_lockres_owner(dlm, res, mle->master); |
| 1929 | } |
| 1930 | spin_unlock(&res->spinlock); |
| 1931 | have_lockres_ref = 1; |
| 1932 | if (wake) |
| 1933 | wake_up(&res->wq); |
| 1934 | } |
| 1935 | |
| 1936 | /* master is known, detach if not already detached. |
| 1937 | * ensures that only one assert_master call will happen |
| 1938 | * on this mle. */ |
| 1939 | spin_lock(&dlm->master_lock); |
| 1940 | |
| 1941 | rr = atomic_read(&mle->mle_refs.refcount); |
| 1942 | if (mle->inuse > 0) { |
| 1943 | if (extra_ref && rr < 3) |
| 1944 | err = 1; |
| 1945 | else if (!extra_ref && rr < 2) |
| 1946 | err = 1; |
| 1947 | } else { |
| 1948 | if (extra_ref && rr < 2) |
| 1949 | err = 1; |
| 1950 | else if (!extra_ref && rr < 1) |
| 1951 | err = 1; |
| 1952 | } |
| 1953 | if (err) { |
| 1954 | mlog(ML_ERROR, "%s:%.*s: got assert master from %u " |
| 1955 | "that will mess up this node, refs=%d, extra=%d, " |
| 1956 | "inuse=%d\n", dlm->name, namelen, name, |
| 1957 | assert->node_idx, rr, extra_ref, mle->inuse); |
| 1958 | dlm_print_one_mle(mle); |
| 1959 | } |
| 1960 | __dlm_unlink_mle(dlm, mle); |
| 1961 | __dlm_mle_detach_hb_events(dlm, mle); |
| 1962 | __dlm_put_mle(mle); |
| 1963 | if (extra_ref) { |
| 1964 | /* the assert master message now balances the extra |
| 1965 | * ref given by the master / migration request message. |
| 1966 | * if this is the last put, it will be removed |
| 1967 | * from the list. */ |
| 1968 | __dlm_put_mle(mle); |
| 1969 | } |
| 1970 | spin_unlock(&dlm->master_lock); |
| 1971 | } else if (res) { |
| 1972 | if (res->owner != assert->node_idx) { |
| 1973 | mlog(0, "assert_master from %u, but current " |
| 1974 | "owner is %u (%.*s), no mle\n", assert->node_idx, |
| 1975 | res->owner, namelen, name); |
| 1976 | } |
| 1977 | } |
| 1978 | spin_unlock(&dlm->spinlock); |
| 1979 | |
| 1980 | done: |
| 1981 | ret = 0; |
| 1982 | if (res) { |
| 1983 | spin_lock(&res->spinlock); |
| 1984 | res->state |= DLM_LOCK_RES_SETREF_INPROG; |
| 1985 | spin_unlock(&res->spinlock); |
| 1986 | *ret_data = (void *)res; |
| 1987 | } |
| 1988 | dlm_put(dlm); |
| 1989 | if (master_request) { |
| 1990 | mlog(0, "need to tell master to reassert\n"); |
| 1991 | /* positive. negative would shoot down the node. */ |
| 1992 | ret |= DLM_ASSERT_RESPONSE_REASSERT; |
| 1993 | if (!have_lockres_ref) { |
| 1994 | mlog(ML_ERROR, "strange, got assert from %u, MASTER " |
| 1995 | "mle present here for %s:%.*s, but no lockres!\n", |
| 1996 | assert->node_idx, dlm->name, namelen, name); |
| 1997 | } |
| 1998 | } |
| 1999 | if (have_lockres_ref) { |
| 2000 | /* let the master know we have a reference to the lockres */ |
| 2001 | ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; |
| 2002 | mlog(0, "%s:%.*s: got assert from %u, need a ref\n", |
| 2003 | dlm->name, namelen, name, assert->node_idx); |
| 2004 | } |
| 2005 | return ret; |
| 2006 | |
| 2007 | kill: |
| 2008 | /* kill the caller! */ |
| 2009 | mlog(ML_ERROR, "Bad message received from another node. Dumping state " |
| 2010 | "and killing the other node now! This node is OK and can continue.\n"); |
| 2011 | __dlm_print_one_lock_resource(res); |
| 2012 | spin_unlock(&res->spinlock); |
| 2013 | spin_unlock(&dlm->spinlock); |
| 2014 | *ret_data = (void *)res; |
| 2015 | dlm_put(dlm); |
| 2016 | return -EINVAL; |
| 2017 | } |
| 2018 | |
| 2019 | void dlm_assert_master_post_handler(int status, void *data, void *ret_data) |
| 2020 | { |
| 2021 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; |
| 2022 | |
| 2023 | if (ret_data) { |
| 2024 | spin_lock(&res->spinlock); |
| 2025 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; |
| 2026 | spin_unlock(&res->spinlock); |
| 2027 | wake_up(&res->wq); |
| 2028 | dlm_lockres_put(res); |
| 2029 | } |
| 2031 | } |
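| | |
| | /* |
| | * For context (illustration only): the handler/post-handler pair above is |
| | * wired up at domain init, roughly as below. The exact argument order of |
| | * o2net_register_handler() is assumed from cluster/tcp.h. |
| | */ |
| | #if 0 |
| | status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, |
| | sizeof(struct dlm_assert_master), |
| | dlm_assert_master_handler, |
| | dlm, dlm_assert_master_post_handler, |
| | &dlm->dlm_domain_handlers); |
| | #endif |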
| 2032 | |
| 2033 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, |
| 2034 | struct dlm_lock_resource *res, |
| 2035 | int ignore_higher, u8 request_from, u32 flags) |
| 2036 | { |
| 2037 | struct dlm_work_item *item; |
| 2038 | item = kzalloc(sizeof(*item), GFP_NOFS); |
| 2039 | if (!item) |
| 2040 | return -ENOMEM; |
| 2041 | |
| 2043 | /* queue up work for dlm_assert_master_worker */ |
| 2044 | dlm_grab(dlm); /* get an extra ref for the work item */ |
| 2045 | dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); |
| 2046 | item->u.am.lockres = res; /* already have a ref */ |
| 2047 | /* can optionally ignore node numbers higher than this node */ |
| 2048 | item->u.am.ignore_higher = ignore_higher; |
| 2049 | item->u.am.request_from = request_from; |
| 2050 | item->u.am.flags = flags; |
| 2051 | |
| 2052 | if (ignore_higher) |
| 2053 | mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, |
| 2054 | res->lockname.name); |
| 2055 | |
| 2056 | spin_lock(&dlm->work_lock); |
| 2057 | list_add_tail(&item->list, &dlm->work_list); |
| 2058 | spin_unlock(&dlm->work_lock); |
| 2059 | |
| 2060 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
| 2061 | return 0; |
| 2062 | } |
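| | |
| | /* |
| | * Illustration only: items queued above are drained by the domain |
| | * workqueue function, which (roughly, assuming the dlm_work_item layout |
| | * used in this file) splices dlm->work_list and runs each item's func: |
| | */ |
| | #if 0 |
| | struct dlm_work_item *item, *next; |
| | LIST_HEAD(tmp_list); |
| | |
| | spin_lock(&dlm->work_lock); |
| | list_splice_init(&dlm->work_list, &tmp_list); |
| | spin_unlock(&dlm->work_lock); |
| | |
| | list_for_each_entry_safe(item, next, &tmp_list, list) { |
| | list_del_init(&item->list); |
| | item->func(item, item->data); /* e.g. dlm_assert_master_worker */ |
| | dlm_put(item->dlm); /* drop the extra ref taken at dispatch */ |
| | kfree(item); |
| | } |
| | #endif |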
| 2063 | |
| 2064 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) |
| 2065 | { |
| 2066 | struct dlm_ctxt *dlm = data; |
| 2067 | int ret = 0; |
| 2068 | struct dlm_lock_resource *res; |
| 2069 | unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
| 2070 | int ignore_higher; |
| 2071 | int bit; |
| 2072 | u8 request_from; |
| 2073 | u32 flags; |
| 2074 | |
| 2075 | dlm = item->dlm; |
| 2076 | res = item->u.am.lockres; |
| 2077 | ignore_higher = item->u.am.ignore_higher; |
| 2078 | request_from = item->u.am.request_from; |
| 2079 | flags = item->u.am.flags; |
| 2080 | |
| 2081 | spin_lock(&dlm->spinlock); |
| 2082 | memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); |
| 2083 | spin_unlock(&dlm->spinlock); |
| 2084 | |
| 2085 | clear_bit(dlm->node_num, nodemap); |
| 2086 | if (ignore_higher) { |
| 2087 | /* if this is just to clear up mles for nodes below |
| 2088 | * this node, do not send the message to the original |
| 2089 | * caller or to any node numbered higher than this one */ |
| 2090 | clear_bit(request_from, nodemap); |
| 2091 | bit = dlm->node_num; |
| 2092 | while (1) { |
| 2093 | bit = find_next_bit(nodemap, O2NM_MAX_NODES, |
| 2094 | bit+1); |
| 2095 | if (bit >= O2NM_MAX_NODES) |
| 2096 | break; |
| 2097 | clear_bit(bit, nodemap); |
| 2098 | } |
| 2099 | } |
| 2100 | |
| 2101 | /* |
| 2102 | * If we're migrating this lock to someone else, we are no |
| 2103 | * longer allowed to assert our own mastery. OTOH, we need to |
| 2104 | * prevent migration from starting while we're still asserting |
| 2105 | * our dominance. The reserved ast delays migration. |
| 2106 | */ |
| 2107 | spin_lock(&res->spinlock); |
| 2108 | if (res->state & DLM_LOCK_RES_MIGRATING) { |
| 2109 | mlog(0, "Someone asked us to assert mastery, but we're " |
| 2110 | "in the middle of migration. Skipping assert, " |
| 2111 | "the new master will handle that.\n"); |
| 2112 | spin_unlock(&res->spinlock); |
| 2113 | goto put; |
| 2114 | } else |
| 2115 | __dlm_lockres_reserve_ast(res); |
| 2116 | spin_unlock(&res->spinlock); |
| 2117 | |
| 2118 | /* this call now finishes out the nodemap |
| 2119 | * even if one or more nodes die */ |
| 2120 | mlog(0, "worker about to master %.*s here, this=%u\n", |
| 2121 | res->lockname.len, res->lockname.name, dlm->node_num); |
| 2122 | ret = dlm_do_assert_master(dlm, res, nodemap, flags); |
| 2123 | if (ret < 0) { |
| 2124 | /* no need to restart, we are done */ |
| 2125 | if (!dlm_is_host_down(ret)) |
| 2126 | mlog_errno(ret); |
| 2127 | } |
| 2128 | |
| 2129 | /* Ok, we've asserted ourselves. Let's let migration start. */ |
| 2130 | dlm_lockres_release_ast(dlm, res); |
| 2131 | |
| 2132 | put: |
| 2133 | dlm_lockres_put(res); |
| 2134 | |
| 2135 | mlog(0, "finished with dlm_assert_master_worker\n"); |
| 2136 | } |
| 2137 | |
| 2138 | /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. |
| 2139 | * We cannot wait for node recovery to complete to begin mastering this |
| 2140 | * lockres because this lockres is used to kick off recovery! ;-) |
| 2141 | * So, do a pre-check on all living nodes to see if any of those nodes |
| 2142 | * think that $RECOVERY is currently mastered by a dead node. If so, |
| 2143 | * we wait a short time to allow that node to get notified by its own |
| 2144 | * heartbeat stack, then check again. All $RECOVERY lock resources |
| 2145 | * mastered by dead nodes are purged when the heartbeat callback is |
| 2146 | * fired, so we can know for sure that it is safe to continue once |
| 2147 | * the node returns a live node or no node. */ |
| 2148 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, |
| 2149 | struct dlm_lock_resource *res) |
| 2150 | { |
| 2151 | struct dlm_node_iter iter; |
| 2152 | int nodenum; |
| 2153 | int ret = 0; |
| 2154 | u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; |
| 2155 | |
| 2156 | spin_lock(&dlm->spinlock); |
| 2157 | dlm_node_iter_init(dlm->domain_map, &iter); |
| 2158 | spin_unlock(&dlm->spinlock); |
| 2159 | |
| 2160 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
| 2161 | /* do not send to self */ |
| 2162 | if (nodenum == dlm->node_num) |
| 2163 | continue; |
| 2164 | ret = dlm_do_master_requery(dlm, res, nodenum, &master); |
| 2165 | if (ret < 0) { |
| 2166 | mlog_errno(ret); |
| 2167 | if (!dlm_is_host_down(ret)) |
| 2168 | BUG(); |
| 2169 | /* host is down, so answer for that node would be |
| 2170 | * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ |
| 2171 | ret = 0; |
| 2172 | } |
| 2173 | |
| 2174 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { |
| 2175 | /* check to see if this master is in the recovery map */ |
| 2176 | spin_lock(&dlm->spinlock); |
| 2177 | if (test_bit(master, dlm->recovery_map)) { |
| 2178 | mlog(ML_NOTICE, "%s: node %u has not seen " |
| 2179 | "node %u go down yet, and thinks the " |
| 2180 | "dead node is mastering the recovery " |
| 2181 | "lock. must wait.\n", dlm->name, |
| 2182 | nodenum, master); |
| 2183 | ret = -EAGAIN; |
| 2184 | } |
| 2185 | spin_unlock(&dlm->spinlock); |
| 2186 | mlog(0, "%s: reco lock master is %u\n", dlm->name, |
| 2187 | master); |
| 2188 | break; |
| 2189 | } |
| 2190 | } |
| 2191 | return ret; |
| 2192 | } |
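| | |
| | /* |
| | * Caller-side sketch (illustration only): -EAGAIN means some node still |
| | * believes a dead node masters $RECOVERY, so back off briefly and re-run |
| | * the pre-check. The delay below is an assumed, illustrative value. |
| | */ |
| | #if 0 |
| | while (dlm_pre_master_reco_lockres(dlm, res) == -EAGAIN) |
| | msleep(100); /* let heartbeat catch up, then recheck */ |
| | #endif |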
| 2193 | |
| 2194 | /* |
| 2195 | * DLM_DEREF_LOCKRES_MSG |
| 2196 | */ |
| 2197 | |
| 2198 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
| 2199 | { |
| 2200 | struct dlm_deref_lockres deref; |
| 2201 | int ret = 0, r; |
| 2202 | const char *lockname; |
| 2203 | unsigned int namelen; |
| 2204 | |
| 2205 | lockname = res->lockname.name; |
| 2206 | namelen = res->lockname.len; |
| 2207 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
| 2208 | |
| 2209 | memset(&deref, 0, sizeof(deref)); |
| 2210 | deref.node_idx = dlm->node_num; |
| 2211 | deref.namelen = namelen; |
| 2212 | memcpy(deref.name, lockname, namelen); |
| 2213 | |
| 2214 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, |
| 2215 | &deref, sizeof(deref), res->owner, &r); |
| 2216 | if (ret < 0) |
| 2217 | mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", |
| 2218 | dlm->name, namelen, lockname, ret, res->owner); |
| 2219 | else if (r < 0) { |
| 2220 | /* BAD. other node says I did not have a ref. */ |
| 2221 | mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", |
| 2222 | dlm->name, namelen, lockname, res->owner, r); |
| 2223 | dlm_print_one_lock_resource(res); |
| 2224 | BUG(); |
| 2225 | } |
| 2226 | return ret; |
| 2227 | } |
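| | |
| | /* |
| | * Usage sketch (illustration only): a node that is not the master drops |
| | * its refmap bit on the master once it stops using a lockres, roughly: |
| | */ |
| | #if 0 |
| | if (res->owner != dlm->node_num) |
| | ret = dlm_drop_lockres_ref(dlm, res); |
| | #endif |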
| 2228 | |
| 2229 | int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, |
| 2230 | void **ret_data) |
| 2231 | { |
| 2232 | struct dlm_ctxt *dlm = data; |
| 2233 | struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; |
| 2234 | struct dlm_lock_resource *res = NULL; |
| 2235 | char *name; |
| 2236 | unsigned int namelen; |
| 2237 | int ret = -EINVAL; |
| 2238 | u8 node; |
| 2239 | unsigned int hash; |
| 2240 | struct dlm_work_item *item; |
| 2241 | int cleared = 0; |
| 2242 | int dispatch = 0; |
| 2243 | |
| 2244 | if (!dlm_grab(dlm)) |
| 2245 | return 0; |
| 2246 | |
| 2247 | name = deref->name; |
| 2248 | namelen = deref->namelen; |
| 2249 | node = deref->node_idx; |
| 2250 | |
| 2251 | if (namelen > DLM_LOCKID_NAME_MAX) { |
| 2252 | mlog(ML_ERROR, "Invalid name length!"); |
| 2253 | goto done; |
| 2254 | } |
| 2255 | if (deref->node_idx >= O2NM_MAX_NODES) { |
| 2256 | mlog(ML_ERROR, "Invalid node number: %u\n", node); |
| 2257 | goto done; |
| 2258 | } |
| 2259 | |
| 2260 | hash = dlm_lockid_hash(name, namelen); |
| 2261 | |
| 2262 | spin_lock(&dlm->spinlock); |
| 2263 | res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); |
| 2264 | if (!res) { |
| 2265 | spin_unlock(&dlm->spinlock); |
| 2266 | mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", |
| 2267 | dlm->name, namelen, name); |
| 2268 | goto done; |
| 2269 | } |
| 2270 | spin_unlock(&dlm->spinlock); |
| 2271 | |
| 2272 | spin_lock(&res->spinlock); |
| 2273 | if (res->state & DLM_LOCK_RES_SETREF_INPROG) |
| 2274 | dispatch = 1; |
| 2275 | else { |
| 2276 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
| 2277 | if (test_bit(node, res->refmap)) { |
| 2278 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
| 2279 | cleared = 1; |
| 2280 | } |
| 2281 | } |
| 2282 | spin_unlock(&res->spinlock); |
| 2283 | |
| 2284 | if (!dispatch) { |
| 2285 | if (cleared) |
| 2286 | dlm_lockres_calc_usage(dlm, res); |
| 2287 | else { |
| 2288 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " |
| 2289 | "but it is already dropped!\n", dlm->name, |
| 2290 | res->lockname.len, res->lockname.name, node); |
| 2291 | dlm_print_one_lock_resource(res); |
| 2292 | } |
| 2293 | ret = 0; |
| 2294 | goto done; |
| 2295 | } |
| 2296 | |
| 2297 | item = kzalloc(sizeof(*item), GFP_NOFS); |
| 2298 | if (!item) { |
| 2299 | ret = -ENOMEM; |
| 2300 | mlog_errno(ret); |
| 2301 | goto done; |
| 2302 | } |
| 2303 | |
| 2304 | dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); |
| 2305 | item->u.dl.deref_res = res; |
| 2306 | item->u.dl.deref_node = node; |
| 2307 | |
| 2308 | spin_lock(&dlm->work_lock); |
| 2309 | list_add_tail(&item->list, &dlm->work_list); |
| 2310 | spin_unlock(&dlm->work_lock); |
| 2311 | |
| 2312 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); |
| 2313 | return 0; |
| 2314 | |
| 2315 | done: |
| 2316 | if (res) |
| 2317 | dlm_lockres_put(res); |
| 2318 | dlm_put(dlm); |
| 2319 | |
| 2320 | return ret; |
| 2321 | } |
| 2322 | |
| 2323 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) |
| 2324 | { |
| 2325 | struct dlm_ctxt *dlm; |
| 2326 | struct dlm_lock_resource *res; |
| 2327 | u8 node; |
| 2328 | u8 cleared = 0; |
| 2329 | |
| 2330 | dlm = item->dlm; |
| 2331 | res = item->u.dl.deref_res; |
| 2332 | node = item->u.dl.deref_node; |
| 2333 | |
| 2334 | spin_lock(&res->spinlock); |
| 2335 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
| 2336 | if (test_bit(node, res->refmap)) { |
| 2337 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); |
| 2338 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
| 2339 | cleared = 1; |
| 2340 | } |
| 2341 | spin_unlock(&res->spinlock); |
| 2342 | |
| 2343 | if (cleared) { |
| 2344 | mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", |
| 2345 | dlm->name, res->lockname.len, res->lockname.name, node); |
| 2346 | dlm_lockres_calc_usage(dlm, res); |
| 2347 | } else { |
| 2348 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " |
| 2349 | "but it is already dropped!\n", dlm->name, |
| 2350 | res->lockname.len, res->lockname.name, node); |
| 2351 | dlm_print_one_lock_resource(res); |
| 2352 | } |
| 2353 | |
| 2354 | dlm_lockres_put(res); |
| 2355 | } |
| 2356 | |
| 2357 | /* |
| 2358 | * A migrateable resource is one that is: |
| 2359 | * 1. locally mastered, and, |
| 2360 | * 2. zero local locks, and, |
| 2361 | * 3. one or more non-local locks, or, one or more references |
| 2362 | * Returns 1 if yes, 0 if not. |
| 2363 | */ |
| 2364 | static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, |
| 2365 | struct dlm_lock_resource *res) |
| 2366 | { |
| 2367 | enum dlm_lockres_list idx; |
| 2368 | int nonlocal = 0, node_ref; |
| 2369 | struct list_head *queue; |
| 2370 | struct dlm_lock *lock; |
| 2371 | u64 cookie; |
| 2372 | |
| 2373 | assert_spin_locked(&res->spinlock); |
| 2374 | |
| 2375 | if (res->owner != dlm->node_num) |
| 2376 | return 0; |
| 2377 | |
| 2378 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { |
| 2379 | queue = dlm_list_idx_to_ptr(res, idx); |
| 2380 | list_for_each_entry(lock, queue, list) { |
| 2381 | if (lock->ml.node != dlm->node_num) { |
| 2382 | nonlocal++; |
| 2383 | continue; |
| 2384 | } |
| 2385 | cookie = be64_to_cpu(lock->ml.cookie); |
| 2386 | mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " |
| 2387 | "%s list\n", dlm->name, res->lockname.len, |
| 2388 | res->lockname.name, |
| 2389 | dlm_get_lock_cookie_node(cookie), |
| 2390 | dlm_get_lock_cookie_seq(cookie), |
| 2391 | dlm_list_in_text(idx)); |
| 2392 | return 0; |
| 2393 | } |
| 2394 | } |
| 2395 | |
| 2396 | if (!nonlocal) { |
| 2397 | node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); |
| 2398 | if (node_ref >= O2NM_MAX_NODES) |
| 2399 | return 0; |
| 2400 | } |
| 2401 | |
| 2402 | mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, |
| 2403 | res->lockname.name); |
| 2404 | |
| 2405 | return 1; |
| 2406 | } |
| 2407 | |
| 2408 | /* |
| 2409 | * DLM_MIGRATE_LOCKRES |
| 2410 | */ |
| 2411 | |
| 2413 | static int dlm_migrate_lockres(struct dlm_ctxt *dlm, |
| 2414 | struct dlm_lock_resource *res, u8 target) |
| 2415 | { |
| 2416 | struct dlm_master_list_entry *mle = NULL; |
| 2417 | struct dlm_master_list_entry *oldmle = NULL; |
| 2418 | struct dlm_migratable_lockres *mres = NULL; |
| 2419 | int ret = 0; |
| 2420 | const char *name; |
| 2421 | unsigned int namelen; |
| 2422 | int mle_added = 0; |
| 2423 | int wake = 0; |
| 2424 | |
| 2425 | if (!dlm_grab(dlm)) |
| 2426 | return -EINVAL; |
| 2427 | |
| 2428 | BUG_ON(target == O2NM_MAX_NODES); |
| 2429 | |
| 2430 | name = res->lockname.name; |
| 2431 | namelen = res->lockname.len; |
| 2432 | |
| 2433 | mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, |
| 2434 | target); |
| 2435 | |
| 2436 | /* preallocate up front. if this fails, abort */ |
| 2437 | ret = -ENOMEM; |
| 2438 | mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); |
| 2439 | if (!mres) { |
| 2440 | mlog_errno(ret); |
| 2441 | goto leave; |
| 2442 | } |
| 2443 | |
| 2444 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
| 2445 | if (!mle) { |
| 2446 | mlog_errno(ret); |
| 2447 | goto leave; |
| 2448 | } |
| 2449 | ret = 0; |
| 2450 | |
| 2451 | /* |
| 2452 | * clear any existing master requests and |
| 2453 | * add the migration mle to the list |
| 2454 | */ |
| 2455 | spin_lock(&dlm->spinlock); |
| 2456 | spin_lock(&dlm->master_lock); |
| 2457 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, |
| 2458 | namelen, target, dlm->node_num); |
| 2459 | spin_unlock(&dlm->master_lock); |
| 2460 | spin_unlock(&dlm->spinlock); |
| 2461 | |
| 2462 | if (ret == -EEXIST) { |
| 2463 | mlog(0, "another process is already migrating it\n"); |
| 2464 | goto fail; |
| 2465 | } |
| 2466 | mle_added = 1; |
| 2467 | |
| 2468 | /* |
| 2469 | * set the MIGRATING flag and flush asts |
| 2470 | * if we fail after this we need to re-dirty the lockres |
| 2471 | */ |
| 2472 | if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { |
| 2473 | mlog(ML_ERROR, "tried to migrate %.*s to %u, but " |
| 2474 | "the target went down.\n", res->lockname.len, |
| 2475 | res->lockname.name, target); |
| 2476 | spin_lock(&res->spinlock); |
| 2477 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
| 2478 | wake = 1; |
| 2479 | spin_unlock(&res->spinlock); |
| 2480 | ret = -EINVAL; |
| 2481 | } |
| 2482 | |
| 2483 | fail: |
| 2484 | if (oldmle) { |
| 2485 | /* master is known, detach if not already detached */ |
| 2486 | dlm_mle_detach_hb_events(dlm, oldmle); |
| 2487 | dlm_put_mle(oldmle); |
| 2488 | } |
| 2489 | |
| 2490 | if (ret < 0) { |
| 2491 | if (mle_added) { |
| 2492 | dlm_mle_detach_hb_events(dlm, mle); |
| 2493 | dlm_put_mle(mle); |
| 2494 | } else if (mle) { |
| 2495 | kmem_cache_free(dlm_mle_cache, mle); |
| 2496 | mle = NULL; |
| 2497 | } |
| 2498 | goto leave; |
| 2499 | } |
| 2500 | |
| 2501 | /* |
| 2502 | * at this point, we have a migration target, an mle |
| 2503 | * in the master list, and the MIGRATING flag set on |
| 2504 | * the lockres |
| 2505 | */ |
| 2506 | |
| 2507 | /* now that remote nodes are spinning on the MIGRATING flag, |
| 2508 | * ensure that all assert_master work is flushed. */ |
| 2509 | flush_workqueue(dlm->dlm_worker); |
| 2510 | |
| 2511 | /* get an extra reference on the mle. |
| 2512 | * otherwise the assert_master from the new |
| 2513 | * master will destroy this. |
| 2514 | * also, make sure that all callers of dlm_get_mle |
| 2515 | * take both dlm->spinlock and dlm->master_lock */ |
| 2516 | spin_lock(&dlm->spinlock); |
| 2517 | spin_lock(&dlm->master_lock); |
| 2518 | dlm_get_mle_inuse(mle); |
| 2519 | spin_unlock(&dlm->master_lock); |
| 2520 | spin_unlock(&dlm->spinlock); |
| 2521 | |
| 2522 | /* notify new node and send all lock state */ |
| 2523 | /* call send_one_lockres with migration flag. |
| 2524 | * this serves as notice to the target node that a |
| 2525 | * migration is starting. */ |
| 2526 | ret = dlm_send_one_lockres(dlm, res, mres, target, |
| 2527 | DLM_MRES_MIGRATION); |
| 2528 | |
| 2529 | if (ret < 0) { |
| 2530 | mlog(0, "migration to node %u failed with %d\n", |
| 2531 | target, ret); |
| 2532 | /* migration failed, detach and clean up mle */ |
| 2533 | dlm_mle_detach_hb_events(dlm, mle); |
| 2534 | dlm_put_mle(mle); |
| 2535 | dlm_put_mle_inuse(mle); |
| 2536 | spin_lock(&res->spinlock); |
| 2537 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
| 2538 | wake = 1; |
| 2539 | spin_unlock(&res->spinlock); |
| 2540 | if (dlm_is_host_down(ret)) |
| 2541 | dlm_wait_for_node_death(dlm, target, |
| 2542 | DLM_NODE_DEATH_WAIT_MAX); |
| 2543 | goto leave; |
| 2544 | } |
| 2545 | |
| 2546 | /* at this point, the target sends a message to all nodes, |
| 2547 | * (using dlm_do_migrate_request). this node is skipped since |
| 2548 | * we had to put an mle in the list to begin the process. this |
| 2549 | * node now waits for target to do an assert master. this node |
| 2550 | * will be the last one notified, ensuring that the migration |
| 2551 | * is complete everywhere. if the target dies while this is |
| 2552 | * going on, some nodes could potentially see the target as the |
| 2553 | * master, so it is important that my recovery finds the migration |
| 2554 | * mle and sets the master to UNKNOWN. */ |
| 2555 | |
| 2557 | /* wait for new node to assert master */ |
| 2558 | while (1) { |
| 2559 | ret = wait_event_interruptible_timeout(mle->wq, |
| 2560 | (atomic_read(&mle->woken) == 1), |
| 2561 | msecs_to_jiffies(5000)); |
| 2562 | |
| 2563 | if (ret >= 0) { |
| 2564 | if (atomic_read(&mle->woken) == 1 || |
| 2565 | res->owner == target) |
| 2566 | break; |
| 2567 | |
| 2568 | mlog(0, "%s:%.*s: timed out during migration\n", |
| 2569 | dlm->name, res->lockname.len, res->lockname.name); |
| 2570 | /* avoid hang during shutdown when migrating lockres |
| 2571 | * to a node which also goes down */ |
| 2572 | if (dlm_is_node_dead(dlm, target)) { |
| 2573 | mlog(0, "%s:%.*s: expected migration " |
| 2574 | "target %u is no longer up, restarting\n", |
| 2575 | dlm->name, res->lockname.len, |
| 2576 | res->lockname.name, target); |
| 2577 | ret = -EINVAL; |
| 2578 | /* migration failed, detach and clean up mle */ |
| 2579 | dlm_mle_detach_hb_events(dlm, mle); |
| 2580 | dlm_put_mle(mle); |
| 2581 | dlm_put_mle_inuse(mle); |
| 2582 | spin_lock(&res->spinlock); |
| 2583 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
| 2584 | wake = 1; |
| 2585 | spin_unlock(&res->spinlock); |
| 2586 | goto leave; |
| 2587 | } |
| 2588 | } else |
| 2589 | mlog(0, "%s:%.*s: caught signal during migration\n", |
| 2590 | dlm->name, res->lockname.len, res->lockname.name); |
| 2591 | } |
| 2592 | |
| 2593 | /* all done, set the owner, clear the flag */ |
| 2594 | spin_lock(&res->spinlock); |
| 2595 | dlm_set_lockres_owner(dlm, res, target); |
| 2596 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
| 2597 | dlm_remove_nonlocal_locks(dlm, res); |
| 2598 | spin_unlock(&res->spinlock); |
| 2599 | wake_up(&res->wq); |
| 2600 | |
| 2601 | /* master is known, detach if not already detached */ |
| 2602 | dlm_mle_detach_hb_events(dlm, mle); |
| 2603 | dlm_put_mle_inuse(mle); |
| 2604 | ret = 0; |
| 2605 | |
| 2606 | dlm_lockres_calc_usage(dlm, res); |
| 2607 | |
| 2608 | leave: |
| 2609 | /* re-dirty the lockres if we failed */ |
| 2610 | if (ret < 0) |
| 2611 | dlm_kick_thread(dlm, res); |
| 2612 | |
| 2613 | /* wake up waiters if the MIGRATING flag got set |
| 2614 | * but migration failed */ |
| 2615 | if (wake) |
| 2616 | wake_up(&res->wq); |
| 2617 | |
| 2618 | if (mres) |
| 2619 | free_page((unsigned long)mres); |
| 2620 | |
| 2621 | dlm_put(dlm); |
| 2622 | |
| 2623 | mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, |
| 2624 | name, target, ret); |
| 2625 | return ret; |
| 2626 | } |
| 2627 | |
| 2628 | #define DLM_MIGRATION_RETRY_MS 100 |
| 2629 | |
| 2630 | /* |
| 2631 | * Should be called only after beginning the domain leave process. |
| 2632 | * There should not be any remaining locks on nonlocal lock resources, |
| 2633 | * and there should be no local locks left on locally mastered resources. |
| 2634 | * |
| 2635 | * Called with the dlm spinlock held, may drop it to do migration, but |
| 2636 | * will re-acquire before exit. |
| 2637 | * |
| 2638 | * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped |
| 2639 | */ |
| 2640 | int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
| 2641 | { |
| 2642 | int ret; |
| 2643 | int lock_dropped = 0; |
| 2644 | u8 target = O2NM_MAX_NODES; |
| 2645 | |
| 2646 | assert_spin_locked(&dlm->spinlock); |
| 2647 | |
| 2648 | spin_lock(&res->spinlock); |
| 2649 | if (dlm_is_lockres_migrateable(dlm, res)) |
| 2650 | target = dlm_pick_migration_target(dlm, res); |
| 2651 | spin_unlock(&res->spinlock); |
| 2652 | |
| 2653 | if (target == O2NM_MAX_NODES) |
| 2654 | goto leave; |
| 2655 | |
| 2656 | /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ |
| 2657 | spin_unlock(&dlm->spinlock); |
| 2658 | lock_dropped = 1; |
| 2659 | ret = dlm_migrate_lockres(dlm, res, target); |
| 2660 | if (ret) |
| 2661 | mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", |
| 2662 | dlm->name, res->lockname.len, res->lockname.name, |
| 2663 | target, ret); |
| 2664 | spin_lock(&dlm->spinlock); |
| 2665 | leave: |
| 2666 | return lock_dropped; |
| 2667 | } |
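| | |
| | /* |
| | * Caller-side sketch (illustration only): the domain-leave path walks the |
| | * resource hash under dlm->spinlock; a return of 1 means the lock was |
| | * dropped for migration, so the now-stale bucket scan must restart. |
| | * redo_bucket is a hypothetical label in that caller. |
| | */ |
| | #if 0 |
| | if (dlm_empty_lockres(dlm, res)) |
| | goto redo_bucket; |
| | #endif |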
| 2668 | |
| 2669 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 2670 | { |
| 2671 | int ret; |
| 2672 | spin_lock(&dlm->ast_lock); |
| 2673 | spin_lock(&lock->spinlock); |
| 2674 | ret = (list_empty(&lock->bast_list) && !lock->bast_pending); |
| 2675 | spin_unlock(&lock->spinlock); |
| 2676 | spin_unlock(&dlm->ast_lock); |
| 2677 | return ret; |
| 2678 | } |
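| | |
| | /* |
| | * Illustration only: this predicate is intended for wait_event(), letting |
| | * a path block until any queued bast for the lock has been run: |
| | */ |
| | #if 0 |
| | wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock)); |
| | #endif |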
| 2679 | |
| 2680 | static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, |
| 2681 | struct dlm_lock_resource *res, |
| 2682 | u8 mig_target) |
| 2683 | { |
| 2684 | int can_proceed; |
| 2685 | spin_lock(&res->spinlock); |
| 2686 | can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); |
| 2687 | spin_unlock(&res->spinlock); |
| 2688 | |
| 2689 | /* target has died, so make the caller break out of the |
| 2690 | * wait_event, but caller must recheck the domain_map */ |
| 2691 | spin_lock(&dlm->spinlock); |
| 2692 | if (!test_bit(mig_target, dlm->domain_map)) |
| 2693 | can_proceed = 1; |
| 2694 | spin_unlock(&dlm->spinlock); |
| 2695 | return can_proceed; |
| 2696 | } |
| 2697 | |
| 2698 | static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, |
| 2699 | struct dlm_lock_resource *res) |
| 2700 | { |
| 2701 | int ret; |
| 2702 | spin_lock(&res->spinlock); |
| 2703 | ret = !!(res->state & DLM_LOCK_RES_DIRTY); |
| 2704 | spin_unlock(&res->spinlock); |
| 2705 | return ret; |
| 2706 | } |
| 2707 | |
| 2709 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, |
| 2710 | struct dlm_lock_resource *res, |
| 2711 | u8 target) |
| 2712 | { |
| 2713 | int ret = 0; |
| 2714 | |
| 2715 | mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", |
| 2716 | res->lockname.len, res->lockname.name, dlm->node_num, |
| 2717 | target); |
| 2718 | /* need to set MIGRATING flag on lockres. this is done by |
| 2719 | * ensuring that all asts have been flushed for this lockres. */ |
| 2720 | spin_lock(&res->spinlock); |
| 2721 | BUG_ON(res->migration_pending); |
| 2722 | res->migration_pending = 1; |
| 2723 | /* strategy is to reserve an extra ast then release |
| 2724 | * it below, letting the release do all of the work */ |
| 2725 | __dlm_lockres_reserve_ast(res); |
| 2726 | spin_unlock(&res->spinlock); |
| 2727 | |
| 2728 | /* now flush all the pending asts */ |
| 2729 | dlm_kick_thread(dlm, res); |
| 2730 | /* before waiting on DIRTY, block processes which may |
| 2731 | * try to dirty the lockres before MIGRATING is set */ |
| 2732 | spin_lock(&res->spinlock); |
| 2733 | BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); |
| 2734 | res->state |= DLM_LOCK_RES_BLOCK_DIRTY; |
| 2735 | spin_unlock(&res->spinlock); |
| 2736 | /* now wait on any pending asts and the DIRTY state */ |
| 2737 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); |
| 2738 | dlm_lockres_release_ast(dlm, res); |
| 2739 | |
| 2740 | mlog(0, "about to wait on migration_wq, dirty=%s\n", |
| 2741 | res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); |
| 2742 | /* if the extra ref we just put was the final one, this |
| 2743 | * will pass thru immediately. otherwise, we need to wait |
| 2744 | * for the last ast to finish. */ |
| 2745 | again: |
| 2746 | ret = wait_event_interruptible_timeout(dlm->migration_wq, |
| 2747 | dlm_migration_can_proceed(dlm, res, target), |
| 2748 | msecs_to_jiffies(1000)); |
| 2749 | if (ret < 0) { |
| 2750 | mlog(0, "woken again: migrating? %s, dead? %s\n", |
| 2751 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", |
| 2752 | test_bit(target, dlm->domain_map) ? "no":"yes"); |
| 2753 | } else { |
| 2754 | mlog(0, "all is well: migrating? %s, dead? %s\n", |
| 2755 | res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", |
| 2756 | test_bit(target, dlm->domain_map) ? "no":"yes"); |
| 2757 | } |
| 2758 | if (!dlm_migration_can_proceed(dlm, res, target)) { |
| 2759 | mlog(0, "trying again...\n"); |
| 2760 | goto again; |
| 2761 | } |
| 2762 | |
| 2763 | ret = 0; |
| 2764 | /* did the target go down or die? */ |
| 2765 | spin_lock(&dlm->spinlock); |
| 2766 | if (!test_bit(target, dlm->domain_map)) { |
| 2767 | mlog(ML_ERROR, "aha. migration target %u just went down\n", |
| 2768 | target); |
| 2769 | ret = -EHOSTDOWN; |
| 2770 | } |
| 2771 | spin_unlock(&dlm->spinlock); |
| 2772 | |
| 2773 | /* |
| 2774 | * if the target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for |
| 2775 | * another try; otherwise, the MIGRATING state is certainly set, so |
| 2776 | * drop the unneeded state which blocked threads trying to DIRTY |
| 2777 | */ |
| 2778 | spin_lock(&res->spinlock); |
| 2779 | BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); |
| 2780 | res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; |
| 2781 | if (!ret) |
| 2782 | BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); |
| 2783 | spin_unlock(&res->spinlock); |
| 2784 | |
| 2785 | /* |
| 2786 | * at this point: |
| 2787 | * |
| 2788 | * o the DLM_LOCK_RES_MIGRATING flag is set if target not down |
| 2789 | * o there are no pending asts on this lockres |
| 2790 | * o all processes trying to reserve an ast on this |
| 2791 | * lockres must wait for the MIGRATING flag to clear |
| 2792 | */ |
| 2793 | return ret; |
| 2794 | } |
| 2795 | |
| 2796 | /* last step in the migration process. |
| 2797 | * original master calls this to free all of the dlm_lock |
| 2798 | * structures that used to be for other nodes. */ |
| 2799 | static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, |
| 2800 | struct dlm_lock_resource *res) |
| 2801 | { |
| 2802 | struct list_head *queue = &res->granted; |
| 2803 | int i, bit; |
| 2804 | struct dlm_lock *lock, *next; |
| 2805 | |
| 2806 | assert_spin_locked(&res->spinlock); |
| 2807 | |
| 2808 | BUG_ON(res->owner == dlm->node_num); |
| 2809 | |
| 2810 | for (i=0; i<3; i++) { |
| 2811 | list_for_each_entry_safe(lock, next, queue, list) { |
| 2812 | if (lock->ml.node != dlm->node_num) { |
| 2813 | mlog(0, "putting lock for node %u\n", |
| 2814 | lock->ml.node); |
| 2815 | /* be extra careful */ |
| 2816 | BUG_ON(!list_empty(&lock->ast_list)); |
| 2817 | BUG_ON(!list_empty(&lock->bast_list)); |
| 2818 | BUG_ON(lock->ast_pending); |
| 2819 | BUG_ON(lock->bast_pending); |
| 2820 | dlm_lockres_clear_refmap_bit(dlm, res, |
| 2821 | lock->ml.node); |
| 2822 | list_del_init(&lock->list); |
| 2823 | dlm_lock_put(lock); |
| 2824 | /* In a normal unlock, we would have added a |
| 2825 | * DLM_UNLOCK_FREE_LOCK action. Force it. */ |
| 2826 | dlm_lock_put(lock); |
| 2827 | } |
| 2828 | } |
| 2829 | queue++; |
| 2830 | } |
| 2831 | bit = 0; |
| 2832 | while (1) { |
| 2833 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); |
| 2834 | if (bit >= O2NM_MAX_NODES) |
| 2835 | break; |
| 2836 | /* do not clear the local node reference, if there is a |
| 2837 | * process holding this, let it drop the ref itself */ |
| 2838 | if (bit != dlm->node_num) { |
| 2839 | mlog(0, "%s:%.*s: node %u had a ref to this " |
| 2840 | "migrating lockres, clearing\n", dlm->name, |
| 2841 | res->lockname.len, res->lockname.name, bit); |
| 2842 | dlm_lockres_clear_refmap_bit(dlm, res, bit); |
| 2843 | } |
| 2844 | bit++; |
| 2845 | } |
| 2846 | } |
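| | |
| | /* |
| | * An equivalent, more self-documenting form of the three-queue walk above |
| | * (sketch only), using the same helpers as dlm_is_lockres_migrateable(): |
| | */ |
| | #if 0 |
| | enum dlm_lockres_list idx; |
| | |
| | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { |
| | queue = dlm_list_idx_to_ptr(res, idx); |
| | list_for_each_entry_safe(lock, next, queue, list) { |
| | /* ... same body as the loop above ... */ |
| | } |
| | } |
| | #endif |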
| 2847 | |
| 2848 | /* |
| 2849 | * Pick a node to migrate the lock resource to. This function selects a |
| 2850 | * potential target based first on the locks and then on refmap. It skips |
| 2851 | * nodes that are in the process of exiting the domain. |
| 2852 | */ |
| 2853 | static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, |
| 2854 | struct dlm_lock_resource *res) |
| 2855 | { |
| 2856 | enum dlm_lockres_list idx; |
| 2857 | struct list_head *queue = &res->granted; |
| 2858 | struct dlm_lock *lock; |
| 2859 | int noderef; |
| 2860 | u8 nodenum = O2NM_MAX_NODES; |
| 2861 | |
| 2862 | assert_spin_locked(&dlm->spinlock); |
| 2863 | assert_spin_locked(&res->spinlock); |
| 2864 | |
| 2865 | /* Go through all the locks */ |
| 2866 | for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { |
| 2867 | queue = dlm_list_idx_to_ptr(res, idx); |
| 2868 | list_for_each_entry(lock, queue, list) { |
| 2869 | if (lock->ml.node == dlm->node_num) |
| 2870 | continue; |
| 2871 | if (test_bit(lock->ml.node, dlm->exit_domain_map)) |
| 2872 | continue; |
| 2873 | nodenum = lock->ml.node; |
| 2874 | goto bail; |
| 2875 | } |
| 2876 | } |
| 2877 | |
| 2878 | /* Go through the refmap */ |
| 2879 | noderef = -1; |
| 2880 | while (1) { |
| 2881 | noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, |
| 2882 | noderef + 1); |
| 2883 | if (noderef >= O2NM_MAX_NODES) |
| 2884 | break; |
| 2885 | if (noderef == dlm->node_num) |
| 2886 | continue; |
| 2887 | if (test_bit(noderef, dlm->exit_domain_map)) |
| 2888 | continue; |
| 2889 | nodenum = noderef; |
| 2890 | goto bail; |
| 2891 | } |
| 2892 | |
| 2893 | bail: |
| 2894 | return nodenum; |
| 2895 | } |
| 2896 | |
| 2897 | /* this is called by the new master once all lockres |
| 2898 | * data has been received */ |
| 2899 | static int dlm_do_migrate_request(struct dlm_ctxt *dlm, |
| 2900 | struct dlm_lock_resource *res, |
| 2901 | u8 master, u8 new_master, |
| 2902 | struct dlm_node_iter *iter) |
| 2903 | { |
| 2904 | struct dlm_migrate_request migrate; |
| 2905 | int ret, skip, status = 0; |
| 2906 | int nodenum; |
| 2907 | |
| 2908 | memset(&migrate, 0, sizeof(migrate)); |
| 2909 | migrate.namelen = res->lockname.len; |
| 2910 | memcpy(migrate.name, res->lockname.name, migrate.namelen); |
| 2911 | migrate.new_master = new_master; |
| 2912 | migrate.master = master; |
| 2913 | |
| 2914 | ret = 0; |
| 2915 | |
| 2916 | /* send message to all nodes, except the master and myself */ |
| 2917 | while ((nodenum = dlm_node_iter_next(iter)) >= 0) { |
| 2918 | if (nodenum == master || |
| 2919 | nodenum == new_master) |
| 2920 | continue; |
| 2921 | |
| 2922 | /* We could race with a node exiting the domain. If it has exited, skip it. */ |
| 2923 | spin_lock(&dlm->spinlock); |
| 2924 | skip = (!test_bit(nodenum, dlm->domain_map)); |
| 2925 | spin_unlock(&dlm->spinlock); |
| 2926 | if (skip) { |
| 2927 | clear_bit(nodenum, iter->node_map); |
| 2928 | continue; |
| 2929 | } |
| 2930 | |
| 2931 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, |
| 2932 | &migrate, sizeof(migrate), nodenum, |
| 2933 | &status); |
| 2934 | if (ret < 0) { |
| 2935 | mlog(ML_ERROR, "%s: res %.*s, Error %d send " |
| 2936 | "MIGRATE_REQUEST to node %u\n", dlm->name, |
| 2937 | migrate.namelen, migrate.name, ret, nodenum); |
| 2938 | if (!dlm_is_host_down(ret)) { |
| 2939 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); |
| 2940 | BUG(); |
| 2941 | } |
| 2942 | clear_bit(nodenum, iter->node_map); |
| 2943 | ret = 0; |
| 2944 | } else if (status < 0) { |
| 2945 | mlog(0, "migrate request (node %u) returned %d!\n", |
| 2946 | nodenum, status); |
| 2947 | ret = status; |
| 2948 | } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { |
| 2949 | /* during the migration request we short-circuited |
| 2950 | * the mastery of the lockres. make sure we have |
| 2951 | * a mastery ref for nodenum */ |
| 2952 | mlog(0, "%s:%.*s: need ref for node %u\n", |
| 2953 | dlm->name, res->lockname.len, res->lockname.name, |
| 2954 | nodenum); |
| 2955 | spin_lock(&res->spinlock); |
| 2956 | dlm_lockres_set_refmap_bit(dlm, res, nodenum); |
| 2957 | spin_unlock(&res->spinlock); |
| 2958 | } |
| 2959 | } |
| 2960 | |
| 2961 | if (ret < 0) |
| 2962 | mlog_errno(ret); |
| 2963 | |
| 2964 | mlog(0, "returning ret=%d\n", ret); |
| 2965 | return ret; |
| 2966 | } |
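| | |
| | /* |
| | * Caller-side sketch (illustration only): the iterator handed in above is |
| | * initialized from a snapshot of the domain map; old_master here is a |
| | * hypothetical variable naming the outgoing master. |
| | */ |
| | #if 0 |
| | struct dlm_node_iter iter; |
| | |
| | spin_lock(&dlm->spinlock); |
| | dlm_node_iter_init(dlm->domain_map, &iter); |
| | spin_unlock(&dlm->spinlock); |
| | |
| | ret = dlm_do_migrate_request(dlm, res, old_master, |
| | dlm->node_num, &iter); |
| | #endif |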
| 2967 | |
| 2969 | /* if there is an existing mle for this lockres, we now know who the master is. |
| 2970 | * (the one who sent us *this* message) we can clear it up right away. |
| 2971 | * since the process that put the mle on the list still has a reference to it, |
| 2972 | * we can unhash it now, set the master and wake the process. as a result, |
| 2973 | * we will have no mle in the list to start with. now we can add an mle for |
| 2974 | * the migration and this should be the only one found for those scanning the |
| 2975 | * list. */ |
| 2976 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, |
| 2977 | void **ret_data) |
| 2978 | { |
| 2979 | struct dlm_ctxt *dlm = data; |
| 2980 | struct dlm_lock_resource *res = NULL; |
| 2981 | struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; |
| 2982 | struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; |
| 2983 | const char *name; |
| 2984 | unsigned int namelen, hash; |
| 2985 | int ret = 0; |
| 2986 | |
| 2987 | if (!dlm_grab(dlm)) |
| 2988 | return -EINVAL; |
| 2989 | |
| 2990 | name = migrate->name; |
| 2991 | namelen = migrate->namelen; |
| 2992 | hash = dlm_lockid_hash(name, namelen); |
| 2993 | |
| 2994 | /* preallocate.. if this fails, abort */ |
| 2995 | mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); |
| 2996 | |
| 2997 | if (!mle) { |
| 2998 | ret = -ENOMEM; |
| 2999 | goto leave; |
| 3000 | } |
| 3001 | |
| 3002 | /* check for pre-existing lock */ |
| 3003 | spin_lock(&dlm->spinlock); |
| 3004 | res = __dlm_lookup_lockres(dlm, name, namelen, hash); |
| 3005 | if (res) { |
| 3006 | spin_lock(&res->spinlock); |
| 3007 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
| 3008 | /* if all is working ok, this can only mean that we got |
| 3009 | * a migrate request from a node that we now see as |
| 3010 | * dead. what can we do here? drop it to the floor? */ |
| 3011 | spin_unlock(&res->spinlock); |
| 3012 | mlog(ML_ERROR, "Got a migrate request, but the " |
| 3013 | "lockres is marked as recovering!"); |
| 3014 | kmem_cache_free(dlm_mle_cache, mle); |
| 3015 | ret = -EINVAL; /* need a better solution */ |
| 3016 | goto unlock; |
| 3017 | } |
| 3018 | res->state |= DLM_LOCK_RES_MIGRATING; |
| 3019 | spin_unlock(&res->spinlock); |
| 3020 | } |
| 3021 | |
| 3022 | spin_lock(&dlm->master_lock); |
| 3023 | /* ignore error status: a conflicting migration BUG()s inside; a nonzero return here is just the MASTERY_REF hint */ |
| 3024 | ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, |
| 3025 | name, namelen, |
| 3026 | migrate->new_master, |
| 3027 | migrate->master); |
| 3028 | |
| 3029 | spin_unlock(&dlm->master_lock); |
| 3030 | unlock: |
| 3031 | spin_unlock(&dlm->spinlock); |
| 3032 | |
| 3033 | if (oldmle) { |
| 3034 | /* master is known, detach if not already detached */ |
| 3035 | dlm_mle_detach_hb_events(dlm, oldmle); |
| 3036 | dlm_put_mle(oldmle); |
| 3037 | } |
| 3038 | |
| 3039 | if (res) |
| 3040 | dlm_lockres_put(res); |
| 3041 | leave: |
| 3042 | dlm_put(dlm); |
| 3043 | return ret; |
| 3044 | } |
| 3045 | |
| 3046 | /* must be holding dlm->spinlock and dlm->master_lock |
| 3047 | * when adding a migration mle, we can clear any other mles |
| 3048 | * in the master list because we know with certainty that |
| 3049 | * the master is "master". so we remove any old mle from |
| 3050 | * the list after setting its master field, and then add |
| 3051 | * the new migration mle. this way we can hold to the rule |
| 3052 | * of having only one mle for a given lock name at all times. */ |
| 3053 | static int dlm_add_migration_mle(struct dlm_ctxt *dlm, |
| 3054 | struct dlm_lock_resource *res, |
| 3055 | struct dlm_master_list_entry *mle, |
| 3056 | struct dlm_master_list_entry **oldmle, |
| 3057 | const char *name, unsigned int namelen, |
| 3058 | u8 new_master, u8 master) |
| 3059 | { |
| 3060 | int found; |
| 3061 | int ret = 0; |
| 3062 | |
| 3063 | *oldmle = NULL; |
| 3064 | |
| 3065 | assert_spin_locked(&dlm->spinlock); |
| 3066 | assert_spin_locked(&dlm->master_lock); |
| 3067 | |
| 3068 | /* caller is responsible for any ref taken here on oldmle */ |
| 3069 | found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); |
| 3070 | if (found) { |
| 3071 | struct dlm_master_list_entry *tmp = *oldmle; |
| 3072 | spin_lock(&tmp->spinlock); |
| 3073 | if (tmp->type == DLM_MLE_MIGRATION) { |
| 3074 | if (master == dlm->node_num) { |
| 3075 | /* ah another process raced me to it */ |
| 3076 | mlog(0, "tried to migrate %.*s, but some " |
| 3077 | "process beat me to it\n", |
| 3078 | namelen, name); |
| 3079 | ret = -EEXIST; |
| 3080 | } else { |
| 3081 | /* bad. 2 NODES are trying to migrate! */ |
| 3082 | mlog(ML_ERROR, "migration error mle: " |
| 3083 | "master=%u new_master=%u // request: " |
| 3084 | "master=%u new_master=%u // " |
| 3085 | "lockres=%.*s\n", |
| 3086 | tmp->master, tmp->new_master, |
| 3087 | master, new_master, |
| 3088 | namelen, name); |
| 3089 | BUG(); |
| 3090 | } |
| 3091 | } else { |
| 3092 | /* this is essentially what assert_master does */ |
| 3093 | tmp->master = master; |
| 3094 | atomic_set(&tmp->woken, 1); |
| 3095 | wake_up(&tmp->wq); |
| 3096 | /* remove it so that only one mle will be found */ |
| 3097 | __dlm_unlink_mle(dlm, tmp); |
| 3098 | __dlm_mle_detach_hb_events(dlm, tmp); |
| 3099 | ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; |
| 3100 | mlog(0, "%s:%.*s: master=%u, newmaster=%u, " |
| 3101 | "telling master to get ref for cleared out mle " |
| 3102 | "during migration\n", dlm->name, namelen, name, |
| 3103 | master, new_master); |
| 3104 | } |
| 3105 | spin_unlock(&tmp->spinlock); |
| 3106 | } |
| 3107 | |
| 3108 | /* now add a migration mle to the tail of the list */ |
| 3109 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); |
| 3110 | mle->new_master = new_master; |
| 3111 | /* the new master will be sending an assert master for this. |
| 3112 | * at that point we will get the refmap reference */ |
| 3113 | mle->master = master; |
| 3114 | /* do this for consistency with other mle types */ |
| 3115 | set_bit(new_master, mle->maybe_map); |
| 3116 | __dlm_insert_mle(dlm, mle); |
| 3117 | |
| 3118 | return ret; |
| 3119 | } |

/*
 * Sets the owner of the lockres, associated with the mle, to UNKNOWN
 */
static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
					struct dlm_master_list_entry *mle)
{
	struct dlm_lock_resource *res;

	/* Find the lockres associated with the mle and set its owner to UNK */
	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
				   mle->mnamehash);
	if (res) {
		spin_unlock(&dlm->master_lock);

		/* move lockres onto recovery list */
		spin_lock(&res->spinlock);
		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
		dlm_move_lockres_to_recovery_list(dlm, res);
		spin_unlock(&res->spinlock);
		dlm_lockres_put(res);

		/* about to get rid of mle, detach from heartbeat */
		__dlm_mle_detach_hb_events(dlm, mle);

		/* dump the mle */
		spin_lock(&dlm->master_lock);
		__dlm_put_mle(mle);
		spin_unlock(&dlm->master_lock);
	}

	return res;
}

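/*
 * Unlink a MIGRATION mle from heartbeat events and the master hash,
 * then wake anyone waiting on it.  Caller must hold dlm->spinlock and
 * dlm->master_lock, and still owns its mle reference afterward.
 */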
static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
				    struct dlm_master_list_entry *mle)
{
	__dlm_mle_detach_hb_events(dlm, mle);

	spin_lock(&mle->spinlock);
	__dlm_unlink_mle(dlm, mle);
	atomic_set(&mle->woken, 1);
	spin_unlock(&mle->spinlock);

	wake_up(&mle->wq);
}

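/*
 * Handle a BLOCK mle on node death: if the dead node is the one this
 * mle expected to become master, the assert_master it was waiting for
 * can never arrive, so drop that reference and wake the waiter.
 * Otherwise the mle is left untouched.
 */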
static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle, u8 dead_node)
{
	int bit;

	BUG_ON(mle->type != DLM_MLE_BLOCK);

	spin_lock(&mle->spinlock);
	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
	if (bit != dead_node) {
		mlog(0, "mle found, but dead node %u would not have been "
		     "master\n", dead_node);
		spin_unlock(&mle->spinlock);
	} else {
		/* Must drop the refcount by one since the assert_master will
		 * never arrive. This may result in the mle being unlinked and
		 * freed, but there may still be a process waiting in the
		 * dlmlock path which is fine. */
		mlog(0, "node %u was expected master\n", dead_node);
		atomic_set(&mle->woken, 1);
		spin_unlock(&mle->spinlock);
		wake_up(&mle->wq);

		/* Do not need events any longer, so detach from heartbeat */
		__dlm_mle_detach_hb_events(dlm, mle);
		__dlm_put_mle(mle);
	}
}

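/*
 * Walk the entire master hash and clean up mles that involve the dead
 * node: MASTER mles are skipped (the local waiter will notice the node
 * map change), BLOCK mles are handed to dlm_clean_block_mle(), and
 * MIGRATION mles are unlinked if either end of the migration died.
 * Caller must hold dlm->spinlock.
 */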
void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
	struct dlm_master_list_entry *mle;
	struct dlm_lock_resource *res;
	struct hlist_head *bucket;
	struct hlist_node *list;
	unsigned int i;

	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
	assert_spin_locked(&dlm->spinlock);

	/* clean the master list */
	spin_lock(&dlm->master_lock);
	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
		bucket = dlm_master_hash(dlm, i);
		hlist_for_each(list, bucket) {
			mle = hlist_entry(list, struct dlm_master_list_entry,
					  master_hash_node);

			BUG_ON(mle->type != DLM_MLE_BLOCK &&
			       mle->type != DLM_MLE_MASTER &&
			       mle->type != DLM_MLE_MIGRATION);

			/* MASTER mles are initiated locally. The waiting
			 * process will notice the node map change shortly.
			 * Let that happen as normal. */
			if (mle->type == DLM_MLE_MASTER)
				continue;

			/* BLOCK mles are initiated by other nodes. Need to
			 * clean up if the dead node would have been the
			 * master. */
			if (mle->type == DLM_MLE_BLOCK) {
				dlm_clean_block_mle(dlm, mle, dead_node);
				continue;
			}

			/* Everything else is a MIGRATION mle */

			/* The rule for MIGRATION mles is that the master
			 * becomes UNKNOWN if *either* the original or the new
			 * master dies. All UNKNOWN lockres' are sent to
			 * whichever node becomes the recovery master. The new
			 * master is responsible for determining if there is
			 * still a master for this lockres, or if it needs to
			 * take over mastery. Either way, this node should
			 * expect another message to resolve this. */

			if (mle->master != dead_node &&
			    mle->new_master != dead_node)
				continue;

			/* If we have reached this point, this mle needs to be
			 * removed from the list and freed. */
			dlm_clean_migration_mle(dlm, mle);

			mlog(0, "%s: node %u died during migration from "
			     "%u to %u!\n", dlm->name, dead_node, mle->master,
			     mle->new_master);

			/* If we find a lockres associated with the mle, we've
			 * hit this rare case that messes up our lock ordering.
			 * If so, we need to drop the master lock so that we can
			 * take the lockres lock, meaning that we will have to
			 * restart from the head of list. */
			res = dlm_reset_mleres_owner(dlm, mle);
			if (res)
				/* restart */
				goto top;

			/* This may be the last reference */
			__dlm_put_mle(mle);
		}
	}
	spin_unlock(&dlm->master_lock);
}

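/*
 * Complete a migration of a lockres to this node: announce the new
 * master to every other live node, assert mastery (once to everyone
 * else, once back to old_master), then take ownership and clear
 * DLM_LOCK_RES_MIGRATING.  Failures of the assert stage are tolerated
 * (they only mean a node died); only a failed migrate request is fatal.
 */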
int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			 u8 old_master)
{
	struct dlm_node_iter iter;
	int ret = 0;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	clear_bit(old_master, iter.node_map);
	clear_bit(dlm->node_num, iter.node_map);
	spin_unlock(&dlm->spinlock);

	/* ownership of the lockres is changing. account for the
	 * mastery reference here since old_master will briefly have
	 * a reference after the migration completes */
	spin_lock(&res->spinlock);
	dlm_lockres_set_refmap_bit(dlm, res, old_master);
	spin_unlock(&res->spinlock);

	mlog(0, "now time to do a migrate request to other nodes\n");
	ret = dlm_do_migrate_request(dlm, res, old_master,
				     dlm->node_num, &iter);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	mlog(0, "doing assert master of %.*s to all except the original node\n",
	     res->lockname.len, res->lockname.name);
	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		/* no longer need to retry. all living nodes contacted. */
		mlog_errno(ret);
		ret = 0;
	}

	memset(iter.node_map, 0, sizeof(iter.node_map));
	set_bit(old_master, iter.node_map);
	mlog(0, "doing assert master of %.*s back to %u\n",
	     res->lockname.len, res->lockname.name, old_master);
	ret = dlm_do_assert_master(dlm, res, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		mlog(0, "assert master to original master failed "
		     "with %d.\n", ret);
		/* the only nonzero status here would be because of
		 * a dead original node. we're done. */
		ret = 0;
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, dlm->node_num);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	/* re-dirty it on the new master */
	dlm_kick_thread(dlm, res);
	wake_up(&res->wq);
leave:
	return ret;
}

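/*
 * For reference: a node map like the one dlm_finish_migration() builds
 * is normally walked with dlm_node_iter_next() (see dlmcommon.h).  A
 * minimal sketch -- the message-sending step in the loop body is
 * hypothetical:
 *
 *	struct dlm_node_iter iter;
 *	int nodenum;
 *
 *	spin_lock(&dlm->spinlock);
 *	dlm_node_iter_init(dlm->domain_map, &iter);
 *	spin_unlock(&dlm->spinlock);
 *
 *	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 *		[send a message to nodenum, tolerating nodes that
 *		 die while we iterate]
 *	}
 */
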
/*
 * LOCKRES AST REFCOUNT
 * this is integral to migration
 */

/* for future intent to call an ast, reserve one ahead of time.
 * this should be called only after waiting on the lockres
 * with dlm_wait_on_lockres, and while still holding the
 * spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		__dlm_print_one_lock_resource(res);
	}
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);

	atomic_inc(&res->asts_reserved);
}

/*
 * used to drop the reserved ast, either because it went unused,
 * or because the ast/bast was actually called.
 *
 * also, if there is a pending migration on this lockres,
 * and this was the last pending ast on the lockres,
 * atomically set the MIGRATING flag before we drop the lock.
 * this is how we ensure that migration can proceed with no
 * asts in progress. note that it is ok if the state of the
 * queues is such that a lock should be granted in the future
 * or that a bast should be fired, because the new master will
 * shuffle the lists on this lockres as soon as it is migrated.
 */
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res)
{
	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
		return;

	if (!res->migration_pending) {
		spin_unlock(&res->spinlock);
		return;
	}

	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
	res->migration_pending = 0;
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);
	wake_up(&dlm->migration_wq);
}
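
/*
 * A minimal sketch of the reserve/release pairing above, following the
 * comment on __dlm_lockres_reserve_ast().  The __dlm_wait_on_lockres()
 * call is assumed to behave as in dlmcommon.h (it may drop and retake
 * the spinlock internally but returns with it held); the "..." is
 * whatever ast/bast work the caller intends:
 *
 *	spin_lock(&res->spinlock);
 *	__dlm_wait_on_lockres(res);
 *	__dlm_lockres_reserve_ast(res);
 *	spin_unlock(&res->spinlock);
 *	...
 *	[whether the ast/bast fired or went unused:]
 *	dlm_lockres_release_ast(dlm, res);
 */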

void dlm_force_free_mles(struct dlm_ctxt *dlm)
{
	int i;
	struct hlist_head *bucket;
	struct dlm_master_list_entry *mle;
	struct hlist_node *tmp, *list;

	/*
	 * We notified all other nodes that we are exiting the domain and
	 * set the dlm state to DLM_CTXT_LEAVING. If any mles are still
	 * around, we force free them and wake any processes that are
	 * waiting on the mles.
	 */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);

	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
	BUG_ON(find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES);

	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
		bucket = dlm_master_hash(dlm, i);
		hlist_for_each_safe(list, tmp, bucket) {
			mle = hlist_entry(list, struct dlm_master_list_entry,
					  master_hash_node);
			if (mle->type != DLM_MLE_BLOCK) {
				mlog(ML_ERROR, "bad mle: %p\n", mle);
				dlm_print_one_mle(mle);
			}
			atomic_set(&mle->woken, 1);
			wake_up(&mle->wq);

			__dlm_unlink_mle(dlm, mle);
			__dlm_mle_detach_hb_events(dlm, mle);
			__dlm_put_mle(mle);
		}
	}
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}