Blame - marvell/linux/drivers/nvme/host/multipath.c - T108

blob: 36d63da71b2f9b0a63b6ac0496e67e1c358df429 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2017-2018 Christoph Hellwig.
				4	*/
				5
				6	#include <linux/backing-dev.h>
				7	#include <linux/moduleparam.h>
				8	#include <trace/events/block.h>
				9	#include "nvme.h"
				10
				11	static bool multipath = true;
				12	module_param(multipath, bool, 0444);
				13	MODULE_PARM_DESC(multipath,
				14	"turn on native support for multiple controllers per subsystem");
				15
				16	void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
				17	{
				18	struct nvme_ns_head *h;
				19
				20	lockdep_assert_held(&subsys->lock);
				21	list_for_each_entry(h, &subsys->nsheads, entry)
				22	if (h->disk)
				23	blk_mq_unfreeze_queue(h->disk->queue);
				24	}
				25
				26	void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
				27	{
				28	struct nvme_ns_head *h;
				29
				30	lockdep_assert_held(&subsys->lock);
				31	list_for_each_entry(h, &subsys->nsheads, entry)
				32	if (h->disk)
				33	blk_mq_freeze_queue_wait(h->disk->queue);
				34	}
				35
				36	void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
				37	{
				38	struct nvme_ns_head *h;
				39
				40	lockdep_assert_held(&subsys->lock);
				41	list_for_each_entry(h, &subsys->nsheads, entry)
				42	if (h->disk)
				43	blk_freeze_queue_start(h->disk->queue);
				44	}
				45
				46	/*
				47	* If multipathing is enabled we need to always use the subsystem instance
				48	* number for numbering our devices to avoid conflicts between subsystems that
				49	* have multiple controllers and thus use the multipath-aware subsystem node
				50	* and those that have a single controller and use the controller node
				51	* directly.
				52	*/
				53	void nvme_set_disk_name(char disk_name, struct nvme_ns ns,
				54	struct nvme_ctrl ctrl, int flags)
				55	{
				56	if (!multipath) {
				57	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
				58	} else if (ns->head->disk) {
				59	sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
				60	ctrl->instance, ns->head->instance);
				61	*flags = GENHD_FL_HIDDEN;
				62	} else {
				63	sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
				64	ns->head->instance);
				65	}
				66	}
				67
				68	bool nvme_failover_req(struct request *req)
				69	{
				70	struct nvme_ns *ns = req->q->queuedata;
				71	u16 status = nvme_req(req)->status;
				72	unsigned long flags;
				73
				74	switch (status & 0x7ff) {
				75	case NVME_SC_ANA_TRANSITION:
				76	case NVME_SC_ANA_INACCESSIBLE:
				77	case NVME_SC_ANA_PERSISTENT_LOSS:
				78	/*
				79	* If we got back an ANA error we know the controller is alive,
				80	* but not ready to serve this namespaces. The spec suggests
				81	* we should update our general state here, but due to the fact
				82	* that the admin and I/O queues are not serialized that is
				83	* fundamentally racy. So instead just clear the current path,
				84	* mark the the path as pending and kick of a re-read of the ANA
				85	* log page ASAP.
				86	*/
				87	nvme_mpath_clear_current_path(ns);
				88	if (ns->ctrl->ana_log_buf) {
				89	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
				90	queue_work(nvme_wq, &ns->ctrl->ana_work);
				91	}
				92	break;
				93	case NVME_SC_HOST_PATH_ERROR:
				94	case NVME_SC_HOST_ABORTED_CMD:
				95	/*
				96	* Temporary transport disruption in talking to the controller.
				97	* Try to send on a new path.
				98	*/
				99	nvme_mpath_clear_current_path(ns);
				100	break;
				101	default:
				102	/* This was a non-ANA error so follow the normal error path. */
				103	return false;
				104	}
				105
				106	spin_lock_irqsave(&ns->head->requeue_lock, flags);
				107	blk_steal_bios(&ns->head->requeue_list, req);
				108	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
				109	blk_mq_end_request(req, 0);
				110
				111	kblockd_schedule_work(&ns->head->requeue_work);
				112	return true;
				113	}
				114
				115	void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
				116	{
				117	struct nvme_ns *ns;
				118
				119	down_read(&ctrl->namespaces_rwsem);
				120	list_for_each_entry(ns, &ctrl->namespaces, list) {
				121	if (ns->head->disk)
				122	kblockd_schedule_work(&ns->head->requeue_work);
				123	}
				124	up_read(&ctrl->namespaces_rwsem);
				125	}
				126
				127	static const char *nvme_ana_state_names[] = {
				128	[0] = "invalid state",
				129	[NVME_ANA_OPTIMIZED] = "optimized",
				130	[NVME_ANA_NONOPTIMIZED] = "non-optimized",
				131	[NVME_ANA_INACCESSIBLE] = "inaccessible",
				132	[NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
				133	[NVME_ANA_CHANGE] = "change",
				134	};
				135
				136	bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
				137	{
				138	struct nvme_ns_head *head = ns->head;
				139	bool changed = false;
				140	int node;
				141
				142	if (!head)
				143	goto out;
				144
				145	for_each_node(node) {
				146	if (ns == rcu_access_pointer(head->current_path[node])) {
				147	rcu_assign_pointer(head->current_path[node], NULL);
				148	changed = true;
				149	}
				150	}
				151	out:
				152	return changed;
				153	}
				154
				155	void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
				156	{
				157	struct nvme_ns *ns;
				158
				159	down_read(&ctrl->namespaces_rwsem);
				160	list_for_each_entry(ns, &ctrl->namespaces, list) {
				161	nvme_mpath_clear_current_path(ns);
				162	kblockd_schedule_work(&ns->head->requeue_work);
				163	}
				164	up_read(&ctrl->namespaces_rwsem);
				165	}
				166
				167	static bool nvme_path_is_disabled(struct nvme_ns *ns)
				168	{
				169	return ns->ctrl->state != NVME_CTRL_LIVE \|\|
				170	test_bit(NVME_NS_ANA_PENDING, &ns->flags) \|\|
				171	test_bit(NVME_NS_REMOVING, &ns->flags);
				172	}
				173
				174	static struct nvme_ns __nvme_find_path(struct nvme_ns_head head, int node)
				175	{
				176	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
				177	struct nvme_ns found = NULL, fallback = NULL, *ns;
				178
				179	list_for_each_entry_rcu(ns, &head->list, siblings) {
				180	if (nvme_path_is_disabled(ns))
				181	continue;
				182
				183	if (ns->ctrl->numa_node != NUMA_NO_NODE &&
				184	READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
				185	distance = node_distance(node, ns->ctrl->numa_node);
				186	else
				187	distance = LOCAL_DISTANCE;
				188
				189	switch (ns->ana_state) {
				190	case NVME_ANA_OPTIMIZED:
				191	if (distance < found_distance) {
				192	found_distance = distance;
				193	found = ns;
				194	}
				195	break;
				196	case NVME_ANA_NONOPTIMIZED:
				197	if (distance < fallback_distance) {
				198	fallback_distance = distance;
				199	fallback = ns;
				200	}
				201	break;
				202	default:
				203	break;
				204	}
				205	}
				206
				207	if (!found)
				208	found = fallback;
				209	if (found)
				210	rcu_assign_pointer(head->current_path[node], found);
				211	return found;
				212	}
				213
				214	static struct nvme_ns nvme_next_ns(struct nvme_ns_head head,
				215	struct nvme_ns *ns)
				216	{
				217	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
				218	siblings);
				219	if (ns)
				220	return ns;
				221	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
				222	}
				223
				224	static struct nvme_ns nvme_round_robin_path(struct nvme_ns_head head,
				225	int node, struct nvme_ns *old)
				226	{
				227	struct nvme_ns ns, found, *fallback = NULL;
				228
				229	if (list_is_singular(&head->list)) {
				230	if (nvme_path_is_disabled(old))
				231	return NULL;
				232	return old;
				233	}
				234
				235	for (ns = nvme_next_ns(head, old);
				236	ns && ns != old;
				237	ns = nvme_next_ns(head, ns)) {
				238	if (nvme_path_is_disabled(ns))
				239	continue;
				240
				241	if (ns->ana_state == NVME_ANA_OPTIMIZED) {
				242	found = ns;
				243	goto out;
				244	}
				245	if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
				246	fallback = ns;
				247	}
				248
				249	/*
				250	* The loop above skips the current path for round-robin semantics.
				251	* Fall back to the current path if either:
				252	* - no other optimized path found and current is optimized,
				253	* - no other usable path found and current is usable.
				254	*/
				255	if (!nvme_path_is_disabled(old) &&
				256	(old->ana_state == NVME_ANA_OPTIMIZED \|\|
				257	(!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED)))
				258	return old;
				259
				260	if (!fallback)
				261	return NULL;
				262	found = fallback;
				263	out:
				264	rcu_assign_pointer(head->current_path[node], found);
				265	return found;
				266	}
				267
				268	static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
				269	{
				270	return ns->ctrl->state == NVME_CTRL_LIVE &&
				271	ns->ana_state == NVME_ANA_OPTIMIZED;
				272	}
				273
				274	inline struct nvme_ns nvme_find_path(struct nvme_ns_head head)
				275	{
				276	int node = numa_node_id();
				277	struct nvme_ns *ns;
				278
				279	ns = srcu_dereference(head->current_path[node], &head->srcu);
				280	if (unlikely(!ns))
				281	return __nvme_find_path(head, node);
				282
				283	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
				284	return nvme_round_robin_path(head, node, ns);
				285	if (unlikely(!nvme_path_is_optimized(ns)))
				286	return __nvme_find_path(head, node);
				287	return ns;
				288	}
				289
				290	static bool nvme_available_path(struct nvme_ns_head *head)
				291	{
				292	struct nvme_ns *ns;
				293
				294	list_for_each_entry_rcu(ns, &head->list, siblings) {
				295	switch (ns->ctrl->state) {
				296	case NVME_CTRL_LIVE:
				297	case NVME_CTRL_RESETTING:
				298	case NVME_CTRL_CONNECTING:
				299	/* fallthru */
				300	return true;
				301	default:
				302	break;
				303	}
				304	}
				305	return false;
				306	}
				307
				308	static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
				309	struct bio *bio)
				310	{
				311	struct nvme_ns_head *head = q->queuedata;
				312	struct device *dev = disk_to_dev(head->disk);
				313	struct nvme_ns *ns;
				314	blk_qc_t ret = BLK_QC_T_NONE;
				315	int srcu_idx;
				316
				317	/*
				318	* The namespace might be going away and the bio might
				319	* be moved to a different queue via blk_steal_bios(),
				320	* so we need to use the bio_split pool from the original
				321	* queue to allocate the bvecs from.
				322	*/
				323	blk_queue_split(q, &bio);
				324
				325	srcu_idx = srcu_read_lock(&head->srcu);
				326	ns = nvme_find_path(head);
				327	if (likely(ns)) {
				328	bio->bi_disk = ns->disk;
				329	bio->bi_opf \|= REQ_NVME_MPATH;
				330	trace_block_bio_remap(bio->bi_disk->queue, bio,
				331	disk_devt(ns->head->disk),
				332	bio->bi_iter.bi_sector);
				333	ret = generic_make_request(bio);
				334	} else if (nvme_available_path(head)) {
				335	dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
				336
				337	spin_lock_irq(&head->requeue_lock);
				338	bio_list_add(&head->requeue_list, bio);
				339	spin_unlock_irq(&head->requeue_lock);
				340	} else {
				341	dev_warn_ratelimited(dev, "no available path - failing I/O\n");
				342
				343	bio->bi_status = BLK_STS_IOERR;
				344	bio_endio(bio);
				345	}
				346
				347	srcu_read_unlock(&head->srcu, srcu_idx);
				348	return ret;
				349	}
				350
				351	static void nvme_requeue_work(struct work_struct *work)
				352	{
				353	struct nvme_ns_head *head =
				354	container_of(work, struct nvme_ns_head, requeue_work);
				355	struct bio bio, next;
				356
				357	spin_lock_irq(&head->requeue_lock);
				358	next = bio_list_get(&head->requeue_list);
				359	spin_unlock_irq(&head->requeue_lock);
				360
				361	while ((bio = next) != NULL) {
				362	next = bio->bi_next;
				363	bio->bi_next = NULL;
				364
				365	/*
				366	* Reset disk to the mpath node and resubmit to select a new
				367	* path.
				368	*/
				369	bio->bi_disk = head->disk;
				370	generic_make_request(bio);
				371	}
				372	}
				373
				374	int nvme_mpath_alloc_disk(struct nvme_ctrl ctrl, struct nvme_ns_head head)
				375	{
				376	struct request_queue *q;
				377	bool vwc = false;
				378
				379	mutex_init(&head->lock);
				380	bio_list_init(&head->requeue_list);
				381	spin_lock_init(&head->requeue_lock);
				382	INIT_WORK(&head->requeue_work, nvme_requeue_work);
				383
				384	/*
				385	* Add a multipath node if the subsystems supports multiple controllers.
				386	* We also do this for private namespaces as the namespace sharing data could
				387	* change after a rescan.
				388	*/
				389	if (!(ctrl->subsys->cmic & (1 << 1)) \|\| !multipath)
				390	return 0;
				391
				392	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
				393	if (!q)
				394	goto out;
				395	q->queuedata = head;
				396	blk_queue_make_request(q, nvme_ns_head_make_request);
				397	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
				398	/* set to a default value for 512 until disk is validated */
				399	blk_queue_logical_block_size(q, 512);
				400	blk_set_stacking_limits(&q->limits);
				401
				402	/* we need to propagate up the VMC settings */
				403	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
				404	vwc = true;
				405	blk_queue_write_cache(q, vwc, vwc);
				406
				407	head->disk = alloc_disk(0);
				408	if (!head->disk)
				409	goto out_cleanup_queue;
				410	head->disk->fops = &nvme_ns_head_ops;
				411	head->disk->private_data = head;
				412	head->disk->queue = q;
				413	head->disk->flags = GENHD_FL_EXT_DEVT;
				414	sprintf(head->disk->disk_name, "nvme%dn%d",
				415	ctrl->subsys->instance, head->instance);
				416	return 0;
				417
				418	out_cleanup_queue:
				419	blk_cleanup_queue(q);
				420	out:
				421	return -ENOMEM;
				422	}
				423
				424	static void nvme_mpath_set_live(struct nvme_ns *ns)
				425	{
				426	struct nvme_ns_head *head = ns->head;
				427
				428	if (!head->disk)
				429	return;
				430
				431	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
				432	device_add_disk(&head->subsys->dev, head->disk,
				433	nvme_ns_id_attr_groups);
				434
				435	mutex_lock(&head->lock);
				436	if (nvme_path_is_optimized(ns)) {
				437	int node, srcu_idx;
				438
				439	srcu_idx = srcu_read_lock(&head->srcu);
				440	for_each_online_node(node)
				441	__nvme_find_path(head, node);
				442	srcu_read_unlock(&head->srcu, srcu_idx);
				443	}
				444	mutex_unlock(&head->lock);
				445
				446	synchronize_srcu(&head->srcu);
				447	kblockd_schedule_work(&head->requeue_work);
				448	}
				449
				450	static int nvme_parse_ana_log(struct nvme_ctrl ctrl, void data,
				451	int (cb)(struct nvme_ctrl ctrl, struct nvme_ana_group_desc *,
				452	void *))
				453	{
				454	void *base = ctrl->ana_log_buf;
				455	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
				456	int error, i;
				457
				458	lockdep_assert_held(&ctrl->ana_lock);
				459
				460	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
				461	struct nvme_ana_group_desc *desc = base + offset;
				462	u32 nr_nsids;
				463	size_t nsid_buf_size;
				464
				465	if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
				466	return -EINVAL;
				467
				468	nr_nsids = le32_to_cpu(desc->nnsids);
				469	nsid_buf_size = nr_nsids * sizeof(__le32);
				470
				471	if (WARN_ON_ONCE(desc->grpid == 0))
				472	return -EINVAL;
				473	if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
				474	return -EINVAL;
				475	if (WARN_ON_ONCE(desc->state == 0))
				476	return -EINVAL;
				477	if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
				478	return -EINVAL;
				479
				480	offset += sizeof(*desc);
				481	if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
				482	return -EINVAL;
				483
				484	error = cb(ctrl, desc, data);
				485	if (error)
				486	return error;
				487
				488	offset += nsid_buf_size;
				489	}
				490
				491	return 0;
				492	}
				493
				494	static inline bool nvme_state_is_live(enum nvme_ana_state state)
				495	{
				496	return state == NVME_ANA_OPTIMIZED \|\| state == NVME_ANA_NONOPTIMIZED;
				497	}
				498
				499	static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
				500	struct nvme_ns *ns)
				501	{
				502	ns->ana_grpid = le32_to_cpu(desc->grpid);
				503	ns->ana_state = desc->state;
				504	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
				505	/*
				506	* nvme_mpath_set_live() will trigger I/O to the multipath path device
				507	* and in turn to this path device. However we cannot accept this I/O
				508	* if the controller is not live. This may deadlock if called from
				509	* nvme_mpath_init_identify() and the ctrl will never complete
				510	* initialization, preventing I/O from completing. For this case we
				511	* will reprocess the ANA log page in nvme_mpath_update() once the
				512	* controller is ready.
				513	*/
				514	if (nvme_state_is_live(ns->ana_state) &&
				515	ns->ctrl->state == NVME_CTRL_LIVE)
				516	nvme_mpath_set_live(ns);
				517	}
				518
				519	static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
				520	struct nvme_ana_group_desc desc, void data)
				521	{
				522	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
				523	unsigned *nr_change_groups = data;
				524	struct nvme_ns *ns;
				525
				526	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
				527	le32_to_cpu(desc->grpid),
				528	nvme_ana_state_names[desc->state]);
				529
				530	if (desc->state == NVME_ANA_CHANGE)
				531	(*nr_change_groups)++;
				532
				533	if (!nr_nsids)
				534	return 0;
				535
				536	down_read(&ctrl->namespaces_rwsem);
				537	list_for_each_entry(ns, &ctrl->namespaces, list) {
				538	unsigned nsid;
				539	again:
				540	nsid = le32_to_cpu(desc->nsids[n]);
				541	if (ns->head->ns_id < nsid)
				542	continue;
				543	if (ns->head->ns_id == nsid)
				544	nvme_update_ns_ana_state(desc, ns);
				545	if (++n == nr_nsids)
				546	break;
				547	if (ns->head->ns_id > nsid)
				548	goto again;
				549	}
				550	up_read(&ctrl->namespaces_rwsem);
				551	return 0;
				552	}
				553
				554	static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
				555	{
				556	u32 nr_change_groups = 0;
				557	int error;
				558
				559	mutex_lock(&ctrl->ana_lock);
				560	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
				561	ctrl->ana_log_buf, ctrl->ana_log_size, 0);
				562	if (error) {
				563	dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
				564	goto out_unlock;
				565	}
				566
				567	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
				568	nvme_update_ana_state);
				569	if (error)
				570	goto out_unlock;
				571
				572	/*
				573	* In theory we should have an ANATT timer per group as they might enter
				574	* the change state at different times. But that is a lot of overhead
				575	* just to protect against a target that keeps entering new changes
				576	* states while never finishing previous ones. But we'll still
				577	* eventually time out once all groups are in change state, so this
				578	* isn't a big deal.
				579	*
				580	* We also double the ANATT value to provide some slack for transports
				581	* or AEN processing overhead.
				582	*/
				583	if (nr_change_groups)
				584	mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
				585	else
				586	del_timer_sync(&ctrl->anatt_timer);
				587	out_unlock:
				588	mutex_unlock(&ctrl->ana_lock);
				589	return error;
				590	}
				591
				592	static void nvme_ana_work(struct work_struct *work)
				593	{
				594	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
				595
				596	nvme_read_ana_log(ctrl);
				597	}
				598
				599	void nvme_mpath_update(struct nvme_ctrl *ctrl)
				600	{
				601	u32 nr_change_groups = 0;
				602
				603	if (!ctrl->ana_log_buf)
				604	return;
				605
				606	mutex_lock(&ctrl->ana_lock);
				607	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
				608	mutex_unlock(&ctrl->ana_lock);
				609	}
				610
				611	static void nvme_anatt_timeout(struct timer_list *t)
				612	{
				613	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
				614
				615	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
				616	nvme_reset_ctrl(ctrl);
				617	}
				618
				619	void nvme_mpath_stop(struct nvme_ctrl *ctrl)
				620	{
				621	if (!nvme_ctrl_use_ana(ctrl))
				622	return;
				623	del_timer_sync(&ctrl->anatt_timer);
				624	cancel_work_sync(&ctrl->ana_work);
				625	}
				626
				/*
				 * Define a struct device_attribute named subsys_attr_<_name> with the given
				 * mode and show/store handlers.
				 */
				#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)		\
					struct device_attribute subsys_attr_##_name =		\
						__ATTR(_name, _mode, _show, _store)
				630
				631	static const char *nvme_iopolicy_names[] = {
				632	[NVME_IOPOLICY_NUMA] = "numa",
				633	[NVME_IOPOLICY_RR] = "round-robin",
				634	};
				635
				636	static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
				637	struct device_attribute attr, char buf)
				638	{
				639	struct nvme_subsystem *subsys =
				640	container_of(dev, struct nvme_subsystem, dev);
				641
				642	return sprintf(buf, "%s\n",
				643	nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
				644	}
				645
				646	static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
				647	struct device_attribute attr, const char buf, size_t count)
				648	{
				649	struct nvme_subsystem *subsys =
				650	container_of(dev, struct nvme_subsystem, dev);
				651	int i;
				652
				653	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
				654	if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
				655	WRITE_ONCE(subsys->iopolicy, i);
				656	return count;
				657	}
				658	}
				659
				660	return -EINVAL;
				661	}
				662	SUBSYS_ATTR_RW(iopolicy, S_IRUGO \| S_IWUSR,
				663	nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
				664
				665	static ssize_t ana_grpid_show(struct device dev, struct device_attribute attr,
				666	char *buf)
				667	{
				668	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
				669	}
				670	DEVICE_ATTR_RO(ana_grpid);
				671
				672	static ssize_t ana_state_show(struct device dev, struct device_attribute attr,
				673	char *buf)
				674	{
				675	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
				676
				677	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
				678	}
				679	DEVICE_ATTR_RO(ana_state);
				680
				681	static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
				682	struct nvme_ana_group_desc desc, void data)
				683	{
				684	struct nvme_ana_group_desc *dst = data;
				685
				686	if (desc->grpid != dst->grpid)
				687	return 0;
				688
				689	dst = desc;
				690	return -ENXIO; /* just break out of the loop */
				691	}
				692
				693	void nvme_mpath_add_disk(struct nvme_ns ns, struct nvme_id_ns id)
				694	{
				695	if (nvme_ctrl_use_ana(ns->ctrl)) {
				696	struct nvme_ana_group_desc desc = {
				697	.grpid = id->anagrpid,
				698	.state = 0,
				699	};
				700
				701	mutex_lock(&ns->ctrl->ana_lock);
				702	ns->ana_grpid = le32_to_cpu(id->anagrpid);
				703	nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
				704	mutex_unlock(&ns->ctrl->ana_lock);
				705	if (desc.state) {
				706	/* found the group desc: update */
				707	nvme_update_ns_ana_state(&desc, ns);
				708	} else {
				709	/* group desc not found: trigger a re-read */
				710	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
				711	queue_work(nvme_wq, &ns->ctrl->ana_work);
				712	}
				713	} else {
				714	ns->ana_state = NVME_ANA_OPTIMIZED;
				715	nvme_mpath_set_live(ns);
				716	}
				717
				718	if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
				719	struct gendisk *disk = ns->head->disk;
				720
				721	if (disk)
				722	disk->queue->backing_dev_info->capabilities \|=
				723	BDI_CAP_STABLE_WRITES;
				724	}
				725	}
				726
				727	void nvme_mpath_remove_disk(struct nvme_ns_head *head)
				728	{
				729	if (!head->disk)
				730	return;
				731	if (head->disk->flags & GENHD_FL_UP)
				732	del_gendisk(head->disk);
				733	blk_set_queue_dying(head->disk->queue);
				734	/* make sure all pending bios are cleaned up */
				735	kblockd_schedule_work(&head->requeue_work);
				736	flush_work(&head->requeue_work);
				737	blk_cleanup_queue(head->disk->queue);
				738	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
				739	/*
				740	* if device_add_disk wasn't called, prevent
				741	* disk release to put a bogus reference on the
				742	* request queue
				743	*/
				744	head->disk->queue = NULL;
				745	}
				746	put_disk(head->disk);
				747	}
				748
				749	void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
				750	{
				751	mutex_init(&ctrl->ana_lock);
				752	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
				753	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
				754	}
				755
				756	int nvme_mpath_init_identify(struct nvme_ctrl ctrl, struct nvme_id_ctrl id)
				757	{
				758	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
				759	size_t ana_log_size;
				760	int error = 0;
				761
				762	/* check if multipath is enabled and we have the capability */
				763	if (!multipath \|\| !ctrl->subsys \|\| !(ctrl->subsys->cmic & (1 << 3)))
				764	return 0;
				765
				766	ctrl->anacap = id->anacap;
				767	ctrl->anatt = id->anatt;
				768	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
				769	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
				770
				771	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
				772	ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
				773	ctrl->max_namespaces * sizeof(__le32);
				774	if (ana_log_size > max_transfer_size) {
				775	dev_err(ctrl->device,
				776	"ANA log page size (%zd) larger than MDTS (%zd).\n",
				777	ana_log_size, max_transfer_size);
				778	dev_err(ctrl->device, "disabling ANA support.\n");
				779	goto out_uninit;
				780	}
				781	if (ana_log_size > ctrl->ana_log_size) {
				782	nvme_mpath_stop(ctrl);
				783	kfree(ctrl->ana_log_buf);
				784	ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
				785	if (!ctrl->ana_log_buf)
				786	return -ENOMEM;
				787	}
				788	ctrl->ana_log_size = ana_log_size;
				789	error = nvme_read_ana_log(ctrl);
				790	if (error)
				791	goto out_uninit;
				792	return 0;
				793
				794	out_uninit:
				795	nvme_mpath_uninit(ctrl);
				796	return error;
				797	}
				798
				799	void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
				800	{
				801	kfree(ctrl->ana_log_buf);
				802	ctrl->ana_log_buf = NULL;
				803	}
				804