Blame - marvell/linux/drivers/nvme/host/pci.c - T108

blob: 11df63d21490070b92320ff408934838944d2d6c [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* NVM Express device driver
				4	* Copyright (c) 2011-2014, Intel Corporation.
				5	*/
				6
				7	#include <linux/aer.h>
				8	#include <linux/async.h>
				9	#include <linux/blkdev.h>
				10	#include <linux/blk-mq.h>
				11	#include <linux/blk-mq-pci.h>
				12	#include <linux/dmi.h>
				13	#include <linux/init.h>
				14	#include <linux/interrupt.h>
				15	#include <linux/io.h>
				16	#include <linux/mm.h>
				17	#include <linux/module.h>
				18	#include <linux/mutex.h>
				19	#include <linux/once.h>
				20	#include <linux/pci.h>
				21	#include <linux/suspend.h>
				22	#include <linux/t10-pi.h>
				23	#include <linux/types.h>
				24	#include <linux/io-64-nonatomic-lo-hi.h>
				25	#include <linux/sed-opal.h>
				26	#include <linux/pci-p2pdma.h>
				27
				28	#include "trace.h"
				29	#include "nvme.h"
				30
				31	#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
				32	#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
				33
				34	#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
				35
				36	/*
				37	* These can be higher, but we need to ensure that any command doesn't
				38	* require an sg allocation that needs more than a page of data.
				39	*/
				40	#define NVME_MAX_KB_SZ 4096
				41	#define NVME_MAX_SEGS 127
				42
				43	static int use_threaded_interrupts;
				44	module_param(use_threaded_interrupts, int, 0);
				45
				46	static bool use_cmb_sqes = true;
				47	module_param(use_cmb_sqes, bool, 0444);
				48	MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
				49
				50	static unsigned int max_host_mem_size_mb = 128;
				51	module_param(max_host_mem_size_mb, uint, 0444);
				52	MODULE_PARM_DESC(max_host_mem_size_mb,
				53	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
				54
				55	static unsigned int sgl_threshold = SZ_32K;
				56	module_param(sgl_threshold, uint, 0644);
				57	MODULE_PARM_DESC(sgl_threshold,
				58	"Use SGLs when average request segment size is larger or equal to "
				59	"this size. Use 0 to disable SGLs.");
				60
				61	static int io_queue_depth_set(const char val, const struct kernel_param kp);
				62	static const struct kernel_param_ops io_queue_depth_ops = {
				63	.set = io_queue_depth_set,
				64	.get = param_get_int,
				65	};
				66
				67	static int io_queue_depth = 1024;
				68	module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
				69	MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
				70
				71	static unsigned int write_queues;
				72	module_param(write_queues, uint, 0644);
				73	MODULE_PARM_DESC(write_queues,
				74	"Number of queues to use for writes. If not set, reads and writes "
				75	"will share a queue set.");
				76
				77	static unsigned int poll_queues;
				78	module_param(poll_queues, uint, 0644);
				79	MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
				80
				81	struct nvme_dev;
				82	struct nvme_queue;
				83
				84	static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
				85	static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
				86
				87	/*
				88	* Represents an NVM Express device. Each nvme_dev is a PCI function.
				89	*/
				90	struct nvme_dev {
				91	struct nvme_queue *queues;
				92	struct blk_mq_tag_set tagset;
				93	struct blk_mq_tag_set admin_tagset;
				94	u32 __iomem *dbs;
				95	struct device *dev;
				96	struct dma_pool *prp_page_pool;
				97	struct dma_pool *prp_small_pool;
				98	unsigned online_queues;
				99	unsigned max_qid;
				100	unsigned io_queues[HCTX_MAX_TYPES];
				101	unsigned int num_vecs;
				102	int q_depth;
				103	int io_sqes;
				104	u32 db_stride;
				105	void __iomem *bar;
				106	unsigned long bar_mapped_size;
				107	struct work_struct remove_work;
				108	struct mutex shutdown_lock;
				109	bool subsystem;
				110	u64 cmb_size;
				111	bool cmb_use_sqes;
				112	u32 cmbsz;
				113	u32 cmbloc;
				114	struct nvme_ctrl ctrl;
				115	u32 last_ps;
				116
				117	mempool_t *iod_mempool;
				118
				119	/* shadow doorbell buffer support: */
				120	__le32 *dbbuf_dbs;
				121	dma_addr_t dbbuf_dbs_dma_addr;
				122	__le32 *dbbuf_eis;
				123	dma_addr_t dbbuf_eis_dma_addr;
				124
				125	/* host memory buffer support: */
				126	u64 host_mem_size;
				127	u32 nr_host_mem_descs;
				128	u32 host_mem_descs_size;
				129	dma_addr_t host_mem_descs_dma;
				130	struct nvme_host_mem_buf_desc *host_mem_descs;
				131	void **host_mem_desc_bufs;
				132	unsigned int nr_allocated_queues;
				133	unsigned int nr_write_queues;
				134	unsigned int nr_poll_queues;
				135	};
				136
				137	static int io_queue_depth_set(const char val, const struct kernel_param kp)
				138	{
				139	int n = 0, ret;
				140
				141	ret = kstrtoint(val, 10, &n);
				142	if (ret != 0 \|\| n < 2)
				143	return -EINVAL;
				144
				145	return param_set_int(val, kp);
				146	}
				147
				148	static inline unsigned int sq_idx(unsigned int qid, u32 stride)
				149	{
				150	return qid * 2 * stride;
				151	}
				152
				153	static inline unsigned int cq_idx(unsigned int qid, u32 stride)
				154	{
				155	return (qid * 2 + 1) * stride;
				156	}
				157
				158	static inline struct nvme_dev to_nvme_dev(struct nvme_ctrl ctrl)
				159	{
				160	return container_of(ctrl, struct nvme_dev, ctrl);
				161	}
				162
				163	/*
				164	* An NVM Express queue. Each device has at least two (one for admin
				165	* commands and one for I/O commands).
				166	*/
				167	struct nvme_queue {
				168	struct nvme_dev *dev;
				169	spinlock_t sq_lock;
				170	void *sq_cmds;
				171	/* only used for poll queues: */
				172	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
				173	volatile struct nvme_completion *cqes;
				174	dma_addr_t sq_dma_addr;
				175	dma_addr_t cq_dma_addr;
				176	u32 __iomem *q_db;
				177	u16 q_depth;
				178	u16 cq_vector;
				179	u16 sq_tail;
				180	u16 last_sq_tail;
				181	u16 cq_head;
				182	u16 last_cq_head;
				183	u16 qid;
				184	u8 cq_phase;
				185	u8 sqes;
				186	unsigned long flags;
				187	#define NVMEQ_ENABLED 0
				188	#define NVMEQ_SQ_CMB 1
				189	#define NVMEQ_DELETE_ERROR 2
				190	#define NVMEQ_POLLED 3
				191	__le32 *dbbuf_sq_db;
				192	__le32 *dbbuf_cq_db;
				193	__le32 *dbbuf_sq_ei;
				194	__le32 *dbbuf_cq_ei;
				195	struct completion delete_done;
				196	};
				197
				198	/*
				199	* The nvme_iod describes the data in an I/O.
				200	*
				201	* The sg pointer contains the list of PRP/SGL chunk allocations in addition
				202	* to the actual struct scatterlist.
				203	*/
				204	struct nvme_iod {
				205	struct nvme_request req;
				206	struct nvme_queue *nvmeq;
				207	bool use_sgl;
				208	int aborted;
				209	int npages; /* In the PRP list. 0 means small pool in use */
				210	int nents; /* Used in scatterlist */
				211	dma_addr_t first_dma;
				212	unsigned int dma_len; /* length of single DMA segment mapping */
				213	dma_addr_t meta_dma;
				214	struct scatterlist *sg;
				215	};
				216
				217	static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
				218	{
				219	return dev->nr_allocated_queues * 8 * dev->db_stride;
				220	}
				221
				222	static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
				223	{
				224	unsigned int mem_size = nvme_dbbuf_size(dev);
				225
				226	if (dev->dbbuf_dbs)
				227	return 0;
				228
				229	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
				230	&dev->dbbuf_dbs_dma_addr,
				231	GFP_KERNEL);
				232	if (!dev->dbbuf_dbs)
				233	return -ENOMEM;
				234	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
				235	&dev->dbbuf_eis_dma_addr,
				236	GFP_KERNEL);
				237	if (!dev->dbbuf_eis) {
				238	dma_free_coherent(dev->dev, mem_size,
				239	dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
				240	dev->dbbuf_dbs = NULL;
				241	return -ENOMEM;
				242	}
				243
				244	return 0;
				245	}
				246
				247	static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
				248	{
				249	unsigned int mem_size = nvme_dbbuf_size(dev);
				250
				251	if (dev->dbbuf_dbs) {
				252	dma_free_coherent(dev->dev, mem_size,
				253	dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
				254	dev->dbbuf_dbs = NULL;
				255	}
				256	if (dev->dbbuf_eis) {
				257	dma_free_coherent(dev->dev, mem_size,
				258	dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
				259	dev->dbbuf_eis = NULL;
				260	}
				261	}
				262
				263	static void nvme_dbbuf_init(struct nvme_dev *dev,
				264	struct nvme_queue *nvmeq, int qid)
				265	{
				266	if (!dev->dbbuf_dbs \|\| !qid)
				267	return;
				268
				269	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
				270	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
				271	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
				272	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
				273	}
				274
				275	static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
				276	{
				277	if (!nvmeq->qid)
				278	return;
				279
				280	nvmeq->dbbuf_sq_db = NULL;
				281	nvmeq->dbbuf_cq_db = NULL;
				282	nvmeq->dbbuf_sq_ei = NULL;
				283	nvmeq->dbbuf_cq_ei = NULL;
				284	}
				285
				286	static void nvme_dbbuf_set(struct nvme_dev *dev)
				287	{
				288	struct nvme_command c;
				289	unsigned int i;
				290
				291	if (!dev->dbbuf_dbs)
				292	return;
				293
				294	memset(&c, 0, sizeof(c));
				295	c.dbbuf.opcode = nvme_admin_dbbuf;
				296	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
				297	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
				298
				299	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
				300	dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
				301	/* Free memory and continue on */
				302	nvme_dbbuf_dma_free(dev);
				303
				304	for (i = 1; i <= dev->online_queues; i++)
				305	nvme_dbbuf_free(&dev->queues[i]);
				306	}
				307	}
				308
				309	static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
				310	{
				311	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
				312	}
				313
				314	/* Update dbbuf and return true if an MMIO is required */
				315	static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
				316	volatile __le32 *dbbuf_ei)
				317	{
				318	if (dbbuf_db) {
				319	u16 old_value, event_idx;
				320
				321	/*
				322	* Ensure that the queue is written before updating
				323	* the doorbell in memory
				324	*/
				325	wmb();
				326
				327	old_value = le32_to_cpu(*dbbuf_db);
				328	*dbbuf_db = cpu_to_le32(value);
				329
				330	/*
				331	* Ensure that the doorbell is updated before reading the event
				332	* index from memory. The controller needs to provide similar
				333	* ordering to ensure the envent index is updated before reading
				334	* the doorbell.
				335	*/
				336	mb();
				337
				338	event_idx = le32_to_cpu(*dbbuf_ei);
				339	if (!nvme_dbbuf_need_event(event_idx, value, old_value))
				340	return false;
				341	}
				342
				343	return true;
				344	}
				345
				346	/*
				347	* Will slightly overestimate the number of pages needed. This is OK
				348	* as it only leads to a small amount of wasted memory for the lifetime of
				349	* the I/O.
				350	*/
				351	static int nvme_npages(unsigned size, struct nvme_dev *dev)
				352	{
				353	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				354	dev->ctrl.page_size);
				355	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
				356	}
				357
				358	/*
				359	* Calculates the number of pages needed for the SGL segments. For example a 4k
				360	* page can accommodate 256 SGL descriptors.
				361	*/
				362	static int nvme_pci_npages_sgl(unsigned int num_seg)
				363	{
				364	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
				365	}
				366
				367	static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
				368	unsigned int size, unsigned int nseg, bool use_sgl)
				369	{
				370	size_t alloc_size;
				371
				372	if (use_sgl)
				373	alloc_size = sizeof(__le64 ) nvme_pci_npages_sgl(nseg);
				374	else
				375	alloc_size = sizeof(__le64 ) nvme_npages(size, dev);
				376
				377	return alloc_size + sizeof(struct scatterlist) * nseg;
				378	}
				379
				380	static int nvme_admin_init_hctx(struct blk_mq_hw_ctx hctx, void data,
				381	unsigned int hctx_idx)
				382	{
				383	struct nvme_dev *dev = data;
				384	struct nvme_queue *nvmeq = &dev->queues[0];
				385
				386	WARN_ON(hctx_idx != 0);
				387	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
				388
				389	hctx->driver_data = nvmeq;
				390	return 0;
				391	}
				392
				393	static int nvme_init_hctx(struct blk_mq_hw_ctx hctx, void data,
				394	unsigned int hctx_idx)
				395	{
				396	struct nvme_dev *dev = data;
				397	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
				398
				399	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
				400	hctx->driver_data = nvmeq;
				401	return 0;
				402	}
				403
				404	static int nvme_init_request(struct blk_mq_tag_set set, struct request req,
				405	unsigned int hctx_idx, unsigned int numa_node)
				406	{
				407	struct nvme_dev *dev = set->driver_data;
				408	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				409	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
				410	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
				411
				412	BUG_ON(!nvmeq);
				413	iod->nvmeq = nvmeq;
				414
				415	nvme_req(req)->ctrl = &dev->ctrl;
				416	return 0;
				417	}
				418
				419	static int queue_irq_offset(struct nvme_dev *dev)
				420	{
				421	/* if we have more than 1 vec, admin queue offsets us by 1 */
				422	if (dev->num_vecs > 1)
				423	return 1;
				424
				425	return 0;
				426	}
				427
				428	static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
				429	{
				430	struct nvme_dev *dev = set->driver_data;
				431	int i, qoff, offset;
				432
				433	offset = queue_irq_offset(dev);
				434	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
				435	struct blk_mq_queue_map *map = &set->map[i];
				436
				437	map->nr_queues = dev->io_queues[i];
				438	if (!map->nr_queues) {
				439	BUG_ON(i == HCTX_TYPE_DEFAULT);
				440	continue;
				441	}
				442
				443	/*
				444	* The poll queue(s) doesn't have an IRQ (and hence IRQ
				445	* affinity), so use the regular blk-mq cpu mapping
				446	*/
				447	map->queue_offset = qoff;
				448	if (i != HCTX_TYPE_POLL && offset)
				449	blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
				450	else
				451	blk_mq_map_queues(map);
				452	qoff += map->nr_queues;
				453	offset += map->nr_queues;
				454	}
				455
				456	return 0;
				457	}
				458
				459	/*
				460	* Write sq tail if we are asked to, or if the next command would wrap.
				461	*/
				462	static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
				463	{
				464	if (!write_sq) {
				465	u16 next_tail = nvmeq->sq_tail + 1;
				466
				467	if (next_tail == nvmeq->q_depth)
				468	next_tail = 0;
				469	if (next_tail != nvmeq->last_sq_tail)
				470	return;
				471	}
				472
				473	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
				474	nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
				475	writel(nvmeq->sq_tail, nvmeq->q_db);
				476	nvmeq->last_sq_tail = nvmeq->sq_tail;
				477	}
				478
				479	/**
				480	* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
				481	* @nvmeq: The queue to use
				482	* @cmd: The command to send
				483	* @write_sq: whether to write to the SQ doorbell
				484	*/
				485	static void nvme_submit_cmd(struct nvme_queue nvmeq, struct nvme_command cmd,
				486	bool write_sq)
				487	{
				488	spin_lock(&nvmeq->sq_lock);
				489	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
				490	cmd, sizeof(*cmd));
				491	if (++nvmeq->sq_tail == nvmeq->q_depth)
				492	nvmeq->sq_tail = 0;
				493	nvme_write_sq_db(nvmeq, write_sq);
				494	spin_unlock(&nvmeq->sq_lock);
				495	}
				496
				497	static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
				498	{
				499	struct nvme_queue *nvmeq = hctx->driver_data;
				500
				501	spin_lock(&nvmeq->sq_lock);
				502	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
				503	nvme_write_sq_db(nvmeq, true);
				504	spin_unlock(&nvmeq->sq_lock);
				505	}
				506
				507	static void *nvme_pci_iod_list(struct request req)
				508	{
				509	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				510	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
				511	}
				512
				513	static inline bool nvme_pci_use_sgls(struct nvme_dev dev, struct request req)
				514	{
				515	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				516	int nseg = blk_rq_nr_phys_segments(req);
				517	unsigned int avg_seg_size;
				518
				519	if (nseg == 0)
				520	return false;
				521
				522	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
				523
				524	if (!(dev->ctrl.sgls & ((1 << 0) \| (1 << 1))))
				525	return false;
				526	if (!iod->nvmeq->qid)
				527	return false;
				528	if (!sgl_threshold \|\| avg_seg_size < sgl_threshold)
				529	return false;
				530	return true;
				531	}
				532
				533	static void nvme_free_prps(struct nvme_dev dev, struct request req)
				534	{
				535	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
				536	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				537	dma_addr_t dma_addr = iod->first_dma;
				538	int i;
				539
				540	for (i = 0; i < iod->npages; i++) {
				541	__le64 *prp_list = nvme_pci_iod_list(req)[i];
				542	dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
				543
				544	dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
				545	dma_addr = next_dma_addr;
				546	}
				547
				548	}
				549
				550	static void nvme_free_sgls(struct nvme_dev dev, struct request req)
				551	{
				552	const int last_sg = SGES_PER_PAGE - 1;
				553	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				554	dma_addr_t dma_addr = iod->first_dma;
				555	int i;
				556
				557	for (i = 0; i < iod->npages; i++) {
				558	struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
				559	dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
				560
				561	dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
				562	dma_addr = next_dma_addr;
				563	}
				564
				565	}
				566
				567	static void nvme_unmap_sg(struct nvme_dev dev, struct request req)
				568	{
				569	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				570
				571	if (is_pci_p2pdma_page(sg_page(iod->sg)))
				572	pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				573	rq_dma_dir(req));
				574	else
				575	dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
				576	}
				577
				578	static void nvme_unmap_data(struct nvme_dev dev, struct request req)
				579	{
				580	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				581
				582	if (iod->dma_len) {
				583	dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
				584	rq_dma_dir(req));
				585	return;
				586	}
				587
				588	WARN_ON_ONCE(!iod->nents);
				589
				590	nvme_unmap_sg(dev, req);
				591	if (iod->npages == 0)
				592	dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
				593	iod->first_dma);
				594	else if (iod->use_sgl)
				595	nvme_free_sgls(dev, req);
				596	else
				597	nvme_free_prps(dev, req);
				598	mempool_free(iod->sg, dev->iod_mempool);
				599	}
				600
				601	static void nvme_print_sgl(struct scatterlist *sgl, int nents)
				602	{
				603	int i;
				604	struct scatterlist *sg;
				605
				606	for_each_sg(sgl, sg, nents, i) {
				607	dma_addr_t phys = sg_phys(sg);
				608	pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
				609	"dma_address:%pad dma_length:%d\n",
				610	i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
				611	sg_dma_len(sg));
				612	}
				613	}
				614
				615	static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
				616	struct request req, struct nvme_rw_command cmnd)
				617	{
				618	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				619	struct dma_pool *pool;
				620	int length = blk_rq_payload_bytes(req);
				621	struct scatterlist *sg = iod->sg;
				622	int dma_len = sg_dma_len(sg);
				623	u64 dma_addr = sg_dma_address(sg);
				624	u32 page_size = dev->ctrl.page_size;
				625	int offset = dma_addr & (page_size - 1);
				626	__le64 *prp_list;
				627	void **list = nvme_pci_iod_list(req);
				628	dma_addr_t prp_dma;
				629	int nprps, i;
				630
				631	length -= (page_size - offset);
				632	if (length <= 0) {
				633	iod->first_dma = 0;
				634	goto done;
				635	}
				636
				637	dma_len -= (page_size - offset);
				638	if (dma_len) {
				639	dma_addr += (page_size - offset);
				640	} else {
				641	sg = sg_next(sg);
				642	dma_addr = sg_dma_address(sg);
				643	dma_len = sg_dma_len(sg);
				644	}
				645
				646	if (length <= page_size) {
				647	iod->first_dma = dma_addr;
				648	goto done;
				649	}
				650
				651	nprps = DIV_ROUND_UP(length, page_size);
				652	if (nprps <= (256 / 8)) {
				653	pool = dev->prp_small_pool;
				654	iod->npages = 0;
				655	} else {
				656	pool = dev->prp_page_pool;
				657	iod->npages = 1;
				658	}
				659
				660	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
				661	if (!prp_list) {
				662	iod->first_dma = dma_addr;
				663	iod->npages = -1;
				664	return BLK_STS_RESOURCE;
				665	}
				666	list[0] = prp_list;
				667	iod->first_dma = prp_dma;
				668	i = 0;
				669	for (;;) {
				670	if (i == page_size >> 3) {
				671	__le64 *old_prp_list = prp_list;
				672	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
				673	if (!prp_list)
				674	goto free_prps;
				675	list[iod->npages++] = prp_list;
				676	prp_list[0] = old_prp_list[i - 1];
				677	old_prp_list[i - 1] = cpu_to_le64(prp_dma);
				678	i = 1;
				679	}
				680	prp_list[i++] = cpu_to_le64(dma_addr);
				681	dma_len -= page_size;
				682	dma_addr += page_size;
				683	length -= page_size;
				684	if (length <= 0)
				685	break;
				686	if (dma_len > 0)
				687	continue;
				688	if (unlikely(dma_len < 0))
				689	goto bad_sgl;
				690	sg = sg_next(sg);
				691	dma_addr = sg_dma_address(sg);
				692	dma_len = sg_dma_len(sg);
				693	}
				694	done:
				695	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
				696	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
				697	return BLK_STS_OK;
				698	free_prps:
				699	nvme_free_prps(dev, req);
				700	return BLK_STS_RESOURCE;
				701	bad_sgl:
				702	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
				703	"Invalid SGL for payload:%d nents:%d\n",
				704	blk_rq_payload_bytes(req), iod->nents);
				705	return BLK_STS_IOERR;
				706	}
				707
				708	static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
				709	struct scatterlist *sg)
				710	{
				711	sge->addr = cpu_to_le64(sg_dma_address(sg));
				712	sge->length = cpu_to_le32(sg_dma_len(sg));
				713	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
				714	}
				715
				716	static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
				717	dma_addr_t dma_addr, int entries)
				718	{
				719	sge->addr = cpu_to_le64(dma_addr);
				720	if (entries < SGES_PER_PAGE) {
				721	sge->length = cpu_to_le32(entries * sizeof(*sge));
				722	sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
				723	} else {
				724	sge->length = cpu_to_le32(PAGE_SIZE);
				725	sge->type = NVME_SGL_FMT_SEG_DESC << 4;
				726	}
				727	}
				728
				729	static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
				730	struct request req, struct nvme_rw_command cmd, int entries)
				731	{
				732	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				733	struct dma_pool *pool;
				734	struct nvme_sgl_desc *sg_list;
				735	struct scatterlist *sg = iod->sg;
				736	dma_addr_t sgl_dma;
				737	int i = 0;
				738
				739	/* setting the transfer type as SGL */
				740	cmd->flags = NVME_CMD_SGL_METABUF;
				741
				742	if (entries == 1) {
				743	nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
				744	return BLK_STS_OK;
				745	}
				746
				747	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
				748	pool = dev->prp_small_pool;
				749	iod->npages = 0;
				750	} else {
				751	pool = dev->prp_page_pool;
				752	iod->npages = 1;
				753	}
				754
				755	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
				756	if (!sg_list) {
				757	iod->npages = -1;
				758	return BLK_STS_RESOURCE;
				759	}
				760
				761	nvme_pci_iod_list(req)[0] = sg_list;
				762	iod->first_dma = sgl_dma;
				763
				764	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
				765
				766	do {
				767	if (i == SGES_PER_PAGE) {
				768	struct nvme_sgl_desc *old_sg_desc = sg_list;
				769	struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
				770
				771	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
				772	if (!sg_list)
				773	goto free_sgls;
				774
				775	i = 0;
				776	nvme_pci_iod_list(req)[iod->npages++] = sg_list;
				777	sg_list[i++] = *link;
				778	nvme_pci_sgl_set_seg(link, sgl_dma, entries);
				779	}
				780
				781	nvme_pci_sgl_set_data(&sg_list[i++], sg);
				782	sg = sg_next(sg);
				783	} while (--entries > 0);
				784
				785	return BLK_STS_OK;
				786	free_sgls:
				787	nvme_free_sgls(dev, req);
				788	return BLK_STS_RESOURCE;
				789	}
				790
				791	static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
				792	struct request req, struct nvme_rw_command cmnd,
				793	struct bio_vec *bv)
				794	{
				795	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				796	unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
				797	unsigned int first_prp_len = dev->ctrl.page_size - offset;
				798
				799	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
				800	if (dma_mapping_error(dev->dev, iod->first_dma))
				801	return BLK_STS_RESOURCE;
				802	iod->dma_len = bv->bv_len;
				803
				804	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
				805	if (bv->bv_len > first_prp_len)
				806	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
				807	return 0;
				808	}
				809
				810	static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
				811	struct request req, struct nvme_rw_command cmnd,
				812	struct bio_vec *bv)
				813	{
				814	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				815
				816	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
				817	if (dma_mapping_error(dev->dev, iod->first_dma))
				818	return BLK_STS_RESOURCE;
				819	iod->dma_len = bv->bv_len;
				820
				821	cmnd->flags = NVME_CMD_SGL_METABUF;
				822	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
				823	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
				824	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
				825	return 0;
				826	}
				827
				828	static blk_status_t nvme_map_data(struct nvme_dev dev, struct request req,
				829	struct nvme_command *cmnd)
				830	{
				831	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				832	blk_status_t ret = BLK_STS_RESOURCE;
				833	int nr_mapped;
				834
				835	if (blk_rq_nr_phys_segments(req) == 1) {
				836	struct bio_vec bv = req_bvec(req);
				837
				838	if (!is_pci_p2pdma_page(bv.bv_page)) {
				839	if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
				840	return nvme_setup_prp_simple(dev, req,
				841	&cmnd->rw, &bv);
				842
				843	if (iod->nvmeq->qid && sgl_threshold &&
				844	dev->ctrl.sgls & ((1 << 0) \| (1 << 1)))
				845	return nvme_setup_sgl_simple(dev, req,
				846	&cmnd->rw, &bv);
				847	}
				848	}
				849
				850	iod->dma_len = 0;
				851	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
				852	if (!iod->sg)
				853	return BLK_STS_RESOURCE;
				854	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
				855	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
				856	if (!iod->nents)
				857	goto out_free_sg;
				858
				859	if (is_pci_p2pdma_page(sg_page(iod->sg)))
				860	nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				861	iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
				862	else
				863	nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
				864	rq_dma_dir(req), DMA_ATTR_NO_WARN);
				865	if (!nr_mapped)
				866	goto out_free_sg;
				867
				868	iod->use_sgl = nvme_pci_use_sgls(dev, req);
				869	if (iod->use_sgl)
				870	ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
				871	else
				872	ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
				873	if (ret != BLK_STS_OK)
				874	goto out_unmap_sg;
				875	return BLK_STS_OK;
				876
				877	out_unmap_sg:
				878	nvme_unmap_sg(dev, req);
				879	out_free_sg:
				880	mempool_free(iod->sg, dev->iod_mempool);
				881	return ret;
				882	}
				883
				884	static blk_status_t nvme_map_metadata(struct nvme_dev dev, struct request req,
				885	struct nvme_command *cmnd)
				886	{
				887	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				888
				889	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
				890	rq_dma_dir(req), 0);
				891	if (dma_mapping_error(dev->dev, iod->meta_dma))
				892	return BLK_STS_IOERR;
				893	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
				894	return 0;
				895	}
				896
				897	/*
				898	* NOTE: ns is NULL when called on the admin queue.
				899	*/
				900	static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
				901	const struct blk_mq_queue_data *bd)
				902	{
				903	struct nvme_ns *ns = hctx->queue->queuedata;
				904	struct nvme_queue *nvmeq = hctx->driver_data;
				905	struct nvme_dev *dev = nvmeq->dev;
				906	struct request *req = bd->rq;
				907	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				908	struct nvme_command cmnd;
				909	blk_status_t ret;
				910
				911	iod->aborted = 0;
				912	iod->npages = -1;
				913	iod->nents = 0;
				914
				915	/*
				916	* We should not need to do this, but we're still using this to
				917	* ensure we can drain requests on a dying queue.
				918	*/
				919	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
				920	return BLK_STS_IOERR;
				921
				922	ret = nvme_setup_cmd(ns, req, &cmnd);
				923	if (ret)
				924	return ret;
				925
				926	if (blk_rq_nr_phys_segments(req)) {
				927	ret = nvme_map_data(dev, req, &cmnd);
				928	if (ret)
				929	goto out_free_cmd;
				930	}
				931
				932	if (blk_integrity_rq(req)) {
				933	ret = nvme_map_metadata(dev, req, &cmnd);
				934	if (ret)
				935	goto out_unmap_data;
				936	}
				937
				938	blk_mq_start_request(req);
				939	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
				940	return BLK_STS_OK;
				941	out_unmap_data:
				942	if (blk_rq_nr_phys_segments(req))
				943	nvme_unmap_data(dev, req);
				944	out_free_cmd:
				945	nvme_cleanup_cmd(req);
				946	return ret;
				947	}
				948
				949	static void nvme_pci_complete_rq(struct request *req)
				950	{
				951	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				952	struct nvme_dev *dev = iod->nvmeq->dev;
				953
				954	nvme_cleanup_cmd(req);
				955	if (blk_integrity_rq(req))
				956	dma_unmap_page(dev->dev, iod->meta_dma,
				957	rq_integrity_vec(req)->bv_len, rq_data_dir(req));
				958	if (blk_rq_nr_phys_segments(req))
				959	nvme_unmap_data(dev, req);
				960	nvme_complete_rq(req);
				961	}
				962
				963	/* We read the CQE phase first to check if the rest of the entry is valid */
				964	static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
				965	{
				966	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
				967	nvmeq->cq_phase;
				968	}
				969
				970	static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
				971	{
				972	u16 head = nvmeq->cq_head;
				973
				974	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
				975	nvmeq->dbbuf_cq_ei))
				976	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
				977	}
				978
				979	static inline struct blk_mq_tags nvme_queue_tagset(struct nvme_queue nvmeq)
				980	{
				981	if (!nvmeq->qid)
				982	return nvmeq->dev->admin_tagset.tags[0];
				983	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
				984	}
				985
				986	static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
				987	{
				988	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
				989	struct request *req;
				990
				991	/*
				992	* AEN requests are special as they don't time out and can
				993	* survive any kind of queue freeze and often don't respond to
				994	* aborts. We don't even bother to allocate a struct request
				995	* for them but rather special case them here.
				996	*/
				997	if (unlikely(nvmeq->qid == 0 &&
				998	cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
				999	nvme_complete_async_event(&nvmeq->dev->ctrl,
				1000	cqe->status, &cqe->result);
				1001	return;
				1002	}
				1003
				1004	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
				1005	if (unlikely(!req)) {
				1006	dev_warn(nvmeq->dev->ctrl.device,
				1007	"invalid id %d completed on queue %d\n",
				1008	cqe->command_id, le16_to_cpu(cqe->sq_id));
				1009	return;
				1010	}
				1011
				1012	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
				1013	nvme_end_request(req, cqe->status, cqe->result);
				1014	}
				1015
				1016	static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
				1017	{
				1018	while (start != end) {
				1019	nvme_handle_cqe(nvmeq, start);
				1020	if (++start == nvmeq->q_depth)
				1021	start = 0;
				1022	}
				1023	}
				1024
				1025	static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
				1026	{
				1027	if (nvmeq->cq_head == nvmeq->q_depth - 1) {
				1028	nvmeq->cq_head = 0;
				1029	nvmeq->cq_phase = !nvmeq->cq_phase;
				1030	} else {
				1031	nvmeq->cq_head++;
				1032	}
				1033	}
				1034
				1035	static inline int nvme_process_cq(struct nvme_queue nvmeq, u16 start,
				1036	u16 *end, unsigned int tag)
				1037	{
				1038	int found = 0;
				1039
				1040	*start = nvmeq->cq_head;
				1041	while (nvme_cqe_pending(nvmeq)) {
				1042	if (tag == -1U \|\| nvmeq->cqes[nvmeq->cq_head].command_id == tag)
				1043	found++;
				1044	nvme_update_cq_head(nvmeq);
				1045	}
				1046	*end = nvmeq->cq_head;
				1047
				1048	if (start != end)
				1049	nvme_ring_cq_doorbell(nvmeq);
				1050	return found;
				1051	}
				1052
				1053	static irqreturn_t nvme_irq(int irq, void *data)
				1054	{
				1055	struct nvme_queue *nvmeq = data;
				1056	irqreturn_t ret = IRQ_NONE;
				1057	u16 start, end;
				1058
				1059	/*
				1060	* The rmb/wmb pair ensures we see all updates from a previous run of
				1061	* the irq handler, even if that was on another CPU.
				1062	*/
				1063	rmb();
				1064	if (nvmeq->cq_head != nvmeq->last_cq_head)
				1065	ret = IRQ_HANDLED;
				1066	nvme_process_cq(nvmeq, &start, &end, -1);
				1067	nvmeq->last_cq_head = nvmeq->cq_head;
				1068	wmb();
				1069
				1070	if (start != end) {
				1071	nvme_complete_cqes(nvmeq, start, end);
				1072	return IRQ_HANDLED;
				1073	}
				1074
				1075	return ret;
				1076	}
				1077
				1078	static irqreturn_t nvme_irq_check(int irq, void *data)
				1079	{
				1080	struct nvme_queue *nvmeq = data;
				1081	if (nvme_cqe_pending(nvmeq))
				1082	return IRQ_WAKE_THREAD;
				1083	return IRQ_NONE;
				1084	}
				1085
				1086	/*
				1087	* Poll for completions any queue, including those not dedicated to polling.
				1088	* Can be called from any context.
				1089	*/
				1090	static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
				1091	{
				1092	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
				1093	u16 start, end;
				1094	int found;
				1095
				1096	/*
				1097	* For a poll queue we need to protect against the polling thread
				1098	* using the CQ lock. For normal interrupt driven threads we have
				1099	* to disable the interrupt to avoid racing with it.
				1100	*/
				1101	if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
				1102	spin_lock(&nvmeq->cq_poll_lock);
				1103	found = nvme_process_cq(nvmeq, &start, &end, tag);
				1104	spin_unlock(&nvmeq->cq_poll_lock);
				1105	} else {
				1106	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
				1107	found = nvme_process_cq(nvmeq, &start, &end, tag);
				1108	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
				1109	}
				1110
				1111	nvme_complete_cqes(nvmeq, start, end);
				1112	return found;
				1113	}
				1114
				1115	static int nvme_poll(struct blk_mq_hw_ctx *hctx)
				1116	{
				1117	struct nvme_queue *nvmeq = hctx->driver_data;
				1118	u16 start, end;
				1119	bool found;
				1120
				1121	if (!nvme_cqe_pending(nvmeq))
				1122	return 0;
				1123
				1124	spin_lock(&nvmeq->cq_poll_lock);
				1125	found = nvme_process_cq(nvmeq, &start, &end, -1);
				1126	nvme_complete_cqes(nvmeq, start, end);
				1127	spin_unlock(&nvmeq->cq_poll_lock);
				1128
				1129	return found;
				1130	}
				1131
				1132	static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
				1133	{
				1134	struct nvme_dev *dev = to_nvme_dev(ctrl);
				1135	struct nvme_queue *nvmeq = &dev->queues[0];
				1136	struct nvme_command c;
				1137
				1138	memset(&c, 0, sizeof(c));
				1139	c.common.opcode = nvme_admin_async_event;
				1140	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
				1141	nvme_submit_cmd(nvmeq, &c, true);
				1142	}
				1143
				1144	static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
				1145	{
				1146	struct nvme_command c;
				1147
				1148	memset(&c, 0, sizeof(c));
				1149	c.delete_queue.opcode = opcode;
				1150	c.delete_queue.qid = cpu_to_le16(id);
				1151
				1152	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
				1153	}
				1154
				1155	static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
				1156	struct nvme_queue *nvmeq, s16 vector)
				1157	{
				1158	struct nvme_command c;
				1159	int flags = NVME_QUEUE_PHYS_CONTIG;
				1160
				1161	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
				1162	flags \|= NVME_CQ_IRQ_ENABLED;
				1163
				1164	/*
				1165	* Note: we (ab)use the fact that the prp fields survive if no data
				1166	* is attached to the request.
				1167	*/
				1168	memset(&c, 0, sizeof(c));
				1169	c.create_cq.opcode = nvme_admin_create_cq;
				1170	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
				1171	c.create_cq.cqid = cpu_to_le16(qid);
				1172	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
				1173	c.create_cq.cq_flags = cpu_to_le16(flags);
				1174	c.create_cq.irq_vector = cpu_to_le16(vector);
				1175
				1176	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
				1177	}
				1178
				1179	static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
				1180	struct nvme_queue *nvmeq)
				1181	{
				1182	struct nvme_ctrl *ctrl = &dev->ctrl;
				1183	struct nvme_command c;
				1184	int flags = NVME_QUEUE_PHYS_CONTIG;
				1185
				1186	/*
				1187	* Some drives have a bug that auto-enables WRRU if MEDIUM isn't
				1188	* set. Since URGENT priority is zeroes, it makes all queues
				1189	* URGENT.
				1190	*/
				1191	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
				1192	flags \|= NVME_SQ_PRIO_MEDIUM;
				1193
				1194	/*
				1195	* Note: we (ab)use the fact that the prp fields survive if no data
				1196	* is attached to the request.
				1197	*/
				1198	memset(&c, 0, sizeof(c));
				1199	c.create_sq.opcode = nvme_admin_create_sq;
				1200	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
				1201	c.create_sq.sqid = cpu_to_le16(qid);
				1202	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
				1203	c.create_sq.sq_flags = cpu_to_le16(flags);
				1204	c.create_sq.cqid = cpu_to_le16(qid);
				1205
				1206	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
				1207	}
				1208
				1209	static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
				1210	{
				1211	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
				1212	}
				1213
				1214	static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
				1215	{
				1216	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
				1217	}
				1218
				1219	static void abort_endio(struct request *req, blk_status_t error)
				1220	{
				1221	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				1222	struct nvme_queue *nvmeq = iod->nvmeq;
				1223
				1224	dev_warn(nvmeq->dev->ctrl.device,
				1225	"Abort status: 0x%x", nvme_req(req)->status);
				1226	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
				1227	blk_mq_free_request(req);
				1228	}
				1229
				1230	static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
				1231	{
				1232
				1233	/* If true, indicates loss of adapter communication, possibly by a
				1234	* NVMe Subsystem reset.
				1235	*/
				1236	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
				1237
				1238	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
				1239	switch (dev->ctrl.state) {
				1240	case NVME_CTRL_RESETTING:
				1241	case NVME_CTRL_CONNECTING:
				1242	return false;
				1243	default:
				1244	break;
				1245	}
				1246
				1247	/* We shouldn't reset unless the controller is on fatal error state
				1248	* _or_ if we lost the communication with it.
				1249	*/
				1250	if (!(csts & NVME_CSTS_CFS) && !nssro)
				1251	return false;
				1252
				1253	return true;
				1254	}
				1255
				1256	static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
				1257	{
				1258	/* Read a config register to help see what died. */
				1259	u16 pci_status;
				1260	int result;
				1261
				1262	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				1263	&pci_status);
				1264	if (result == PCIBIOS_SUCCESSFUL)
				1265	dev_warn(dev->ctrl.device,
				1266	"controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
				1267	csts, pci_status);
				1268	else
				1269	dev_warn(dev->ctrl.device,
				1270	"controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
				1271	csts, result);
				1272	}
				1273
				1274	static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
				1275	{
				1276	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
				1277	struct nvme_queue *nvmeq = iod->nvmeq;
				1278	struct nvme_dev *dev = nvmeq->dev;
				1279	struct request *abort_req;
				1280	struct nvme_command cmd;
				1281	u32 csts = readl(dev->bar + NVME_REG_CSTS);
				1282
				1283	/* If PCI error recovery process is happening, we cannot reset or
				1284	* the recovery mechanism will surely fail.
				1285	*/
				1286	mb();
				1287	if (pci_channel_offline(to_pci_dev(dev->dev)))
				1288	return BLK_EH_RESET_TIMER;
				1289
				1290	/*
				1291	* Reset immediately if the controller is failed
				1292	*/
				1293	if (nvme_should_reset(dev, csts)) {
				1294	nvme_warn_reset(dev, csts);
				1295	nvme_dev_disable(dev, false);
				1296	nvme_reset_ctrl(&dev->ctrl);
				1297	return BLK_EH_DONE;
				1298	}
				1299
				1300	/*
				1301	* Did we miss an interrupt?
				1302	*/
				1303	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
				1304	dev_warn(dev->ctrl.device,
				1305	"I/O %d QID %d timeout, completion polled\n",
				1306	req->tag, nvmeq->qid);
				1307	return BLK_EH_DONE;
				1308	}
				1309
				1310	/*
				1311	* Shutdown immediately if controller times out while starting. The
				1312	* reset work will see the pci device disabled when it gets the forced
				1313	* cancellation error. All outstanding requests are completed on
				1314	* shutdown, so we return BLK_EH_DONE.
				1315	*/
				1316	switch (dev->ctrl.state) {
				1317	case NVME_CTRL_CONNECTING:
				1318	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
				1319	/* fall through */
				1320	case NVME_CTRL_DELETING:
				1321	dev_warn_ratelimited(dev->ctrl.device,
				1322	"I/O %d QID %d timeout, disable controller\n",
				1323	req->tag, nvmeq->qid);
				1324	nvme_req(req)->flags \|= NVME_REQ_CANCELLED;
				1325	nvme_dev_disable(dev, true);
				1326	return BLK_EH_DONE;
				1327	case NVME_CTRL_RESETTING:
				1328	return BLK_EH_RESET_TIMER;
				1329	default:
				1330	break;
				1331	}
				1332
				1333	/*
				1334	* Shutdown the controller immediately and schedule a reset if the
				1335	* command was already aborted once before and still hasn't been
				1336	* returned to the driver, or if this is the admin queue.
				1337	*/
				1338	if (!nvmeq->qid \|\| iod->aborted) {
				1339	dev_warn(dev->ctrl.device,
				1340	"I/O %d QID %d timeout, reset controller\n",
				1341	req->tag, nvmeq->qid);
				1342	nvme_req(req)->flags \|= NVME_REQ_CANCELLED;
				1343	nvme_dev_disable(dev, false);
				1344	nvme_reset_ctrl(&dev->ctrl);
				1345
				1346	return BLK_EH_DONE;
				1347	}
				1348
				1349	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
				1350	atomic_inc(&dev->ctrl.abort_limit);
				1351	return BLK_EH_RESET_TIMER;
				1352	}
				1353	iod->aborted = 1;
				1354
				1355	memset(&cmd, 0, sizeof(cmd));
				1356	cmd.abort.opcode = nvme_admin_abort_cmd;
				1357	cmd.abort.cid = req->tag;
				1358	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
				1359
				1360	dev_warn(nvmeq->dev->ctrl.device,
				1361	"I/O %d QID %d timeout, aborting\n",
				1362	req->tag, nvmeq->qid);
				1363
				1364	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
				1365	BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
				1366	if (IS_ERR(abort_req)) {
				1367	atomic_inc(&dev->ctrl.abort_limit);
				1368	return BLK_EH_RESET_TIMER;
				1369	}
				1370
				1371	abort_req->timeout = ADMIN_TIMEOUT;
				1372	abort_req->end_io_data = NULL;
				1373	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
				1374
				1375	/*
				1376	* The aborted req will be completed on receiving the abort req.
				1377	* We enable the timer again. If hit twice, it'll cause a device reset,
				1378	* as the device then is in a faulty state.
				1379	*/
				1380	return BLK_EH_RESET_TIMER;
				1381	}
				1382
				1383	static void nvme_free_queue(struct nvme_queue *nvmeq)
				1384	{
				1385	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
				1386	(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
				1387	if (!nvmeq->sq_cmds)
				1388	return;
				1389
				1390	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
				1391	pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
				1392	nvmeq->sq_cmds, SQ_SIZE(nvmeq));
				1393	} else {
				1394	dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
				1395	nvmeq->sq_cmds, nvmeq->sq_dma_addr);
				1396	}
				1397	}
				1398
				1399	static void nvme_free_queues(struct nvme_dev *dev, int lowest)
				1400	{
				1401	int i;
				1402
				1403	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
				1404	dev->ctrl.queue_count--;
				1405	nvme_free_queue(&dev->queues[i]);
				1406	}
				1407	}
				1408
				1409	/**
				1410	* nvme_suspend_queue - put queue into suspended state
				1411	* @nvmeq: queue to suspend
				1412	*/
				1413	static int nvme_suspend_queue(struct nvme_queue *nvmeq)
				1414	{
				1415	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
				1416	return 1;
				1417
				1418	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
				1419	mb();
				1420
				1421	nvmeq->dev->online_queues--;
				1422	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
				1423	blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
				1424	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
				1425	pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
				1426	return 0;
				1427	}
				1428
				1429	static void nvme_suspend_io_queues(struct nvme_dev *dev)
				1430	{
				1431	int i;
				1432
				1433	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
				1434	nvme_suspend_queue(&dev->queues[i]);
				1435	}
				1436
				1437	static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
				1438	{
				1439	struct nvme_queue *nvmeq = &dev->queues[0];
				1440
				1441	if (shutdown)
				1442	nvme_shutdown_ctrl(&dev->ctrl);
				1443	else
				1444	nvme_disable_ctrl(&dev->ctrl);
				1445
				1446	nvme_poll_irqdisable(nvmeq, -1);
				1447	}
				1448
				1449	/*
				1450	* Called only on a device that has been disabled and after all other threads
				1451	* that can check this device's completion queues have synced. This is the
				1452	* last chance for the driver to see a natural completion before
				1453	* nvme_cancel_request() terminates all incomplete requests.
				1454	*/
				1455	static void nvme_reap_pending_cqes(struct nvme_dev *dev)
				1456	{
				1457	u16 start, end;
				1458	int i;
				1459
				1460	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
				1461	nvme_process_cq(&dev->queues[i], &start, &end, -1);
				1462	nvme_complete_cqes(&dev->queues[i], start, end);
				1463	}
				1464	}
				1465
				1466	static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				1467	int entry_size)
				1468	{
				1469	int q_depth = dev->q_depth;
				1470	unsigned q_size_aligned = roundup(q_depth * entry_size,
				1471	dev->ctrl.page_size);
				1472
				1473	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
				1474	u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
				1475	mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
				1476	q_depth = div_u64(mem_per_q, entry_size);
				1477
				1478	/*
				1479	* Ensure the reduced q_depth is above some threshold where it
				1480	* would be better to map queues in system memory with the
				1481	* original depth
				1482	*/
				1483	if (q_depth < 64)
				1484	return -ENOMEM;
				1485	}
				1486
				1487	return q_depth;
				1488	}
				1489
				1490	static int nvme_alloc_sq_cmds(struct nvme_dev dev, struct nvme_queue nvmeq,
				1491	int qid)
				1492	{
				1493	struct pci_dev *pdev = to_pci_dev(dev->dev);
				1494
				1495	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
				1496	nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
				1497	if (nvmeq->sq_cmds) {
				1498	nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
				1499	nvmeq->sq_cmds);
				1500	if (nvmeq->sq_dma_addr) {
				1501	set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				1502	return 0;
				1503	}
				1504
				1505	pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
				1506	}
				1507	}
				1508
				1509	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
				1510	&nvmeq->sq_dma_addr, GFP_KERNEL);
				1511	if (!nvmeq->sq_cmds)
				1512	return -ENOMEM;
				1513	return 0;
				1514	}
				1515
				1516	static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
				1517	{
				1518	struct nvme_queue *nvmeq = &dev->queues[qid];
				1519
				1520	if (dev->ctrl.queue_count > qid)
				1521	return 0;
				1522
				1523	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
				1524	nvmeq->q_depth = depth;
				1525	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
				1526	&nvmeq->cq_dma_addr, GFP_KERNEL);
				1527	if (!nvmeq->cqes)
				1528	goto free_nvmeq;
				1529
				1530	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
				1531	goto free_cqdma;
				1532
				1533	nvmeq->dev = dev;
				1534	spin_lock_init(&nvmeq->sq_lock);
				1535	spin_lock_init(&nvmeq->cq_poll_lock);
				1536	nvmeq->cq_head = 0;
				1537	nvmeq->cq_phase = 1;
				1538	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
				1539	nvmeq->qid = qid;
				1540	dev->ctrl.queue_count++;
				1541
				1542	return 0;
				1543
				1544	free_cqdma:
				1545	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
				1546	nvmeq->cq_dma_addr);
				1547	free_nvmeq:
				1548	return -ENOMEM;
				1549	}
				1550
				1551	static int queue_request_irq(struct nvme_queue *nvmeq)
				1552	{
				1553	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
				1554	int nr = nvmeq->dev->ctrl.instance;
				1555
				1556	if (use_threaded_interrupts) {
				1557	return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				1558	nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
				1559	} else {
				1560	return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				1561	NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
				1562	}
				1563	}
				1564
				1565	static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
				1566	{
				1567	struct nvme_dev *dev = nvmeq->dev;
				1568
				1569	nvmeq->sq_tail = 0;
				1570	nvmeq->last_sq_tail = 0;
				1571	nvmeq->cq_head = 0;
				1572	nvmeq->cq_phase = 1;
				1573	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
				1574	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
				1575	nvme_dbbuf_init(dev, nvmeq, qid);
				1576	dev->online_queues++;
				1577	wmb(); /* ensure the first interrupt sees the initialization */
				1578	}
				1579
				1580	static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
				1581	{
				1582	struct nvme_dev *dev = nvmeq->dev;
				1583	int result;
				1584	u16 vector = 0;
				1585
				1586	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
				1587
				1588	/*
				1589	* A queue's vector matches the queue identifier unless the controller
				1590	* has only one vector available.
				1591	*/
				1592	if (!polled)
				1593	vector = dev->num_vecs == 1 ? 0 : qid;
				1594	else
				1595	set_bit(NVMEQ_POLLED, &nvmeq->flags);
				1596
				1597	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
				1598	if (result)
				1599	return result;
				1600
				1601	result = adapter_alloc_sq(dev, qid, nvmeq);
				1602	if (result < 0)
				1603	return result;
				1604	else if (result)
				1605	goto release_cq;
				1606
				1607	nvmeq->cq_vector = vector;
				1608	nvme_init_queue(nvmeq, qid);
				1609
				1610	if (!polled) {
				1611	result = queue_request_irq(nvmeq);
				1612	if (result < 0)
				1613	goto release_sq;
				1614	}
				1615
				1616	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
				1617	return result;
				1618
				1619	release_sq:
				1620	dev->online_queues--;
				1621	adapter_delete_sq(dev, qid);
				1622	release_cq:
				1623	adapter_delete_cq(dev, qid);
				1624	return result;
				1625	}
				1626
				1627	static const struct blk_mq_ops nvme_mq_admin_ops = {
				1628	.queue_rq = nvme_queue_rq,
				1629	.complete = nvme_pci_complete_rq,
				1630	.init_hctx = nvme_admin_init_hctx,
				1631	.init_request = nvme_init_request,
				1632	.timeout = nvme_timeout,
				1633	};
				1634
				1635	static const struct blk_mq_ops nvme_mq_ops = {
				1636	.queue_rq = nvme_queue_rq,
				1637	.complete = nvme_pci_complete_rq,
				1638	.commit_rqs = nvme_commit_rqs,
				1639	.init_hctx = nvme_init_hctx,
				1640	.init_request = nvme_init_request,
				1641	.map_queues = nvme_pci_map_queues,
				1642	.timeout = nvme_timeout,
				1643	.poll = nvme_poll,
				1644	};
				1645
				1646	static void nvme_dev_remove_admin(struct nvme_dev *dev)
				1647	{
				1648	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
				1649	/*
				1650	* If the controller was reset during removal, it's possible
				1651	* user requests may be waiting on a stopped queue. Start the
				1652	* queue to flush these to completion.
				1653	*/
				1654	blk_mq_unquiesce_queue(dev->ctrl.admin_q);
				1655	blk_cleanup_queue(dev->ctrl.admin_q);
				1656	blk_mq_free_tag_set(&dev->admin_tagset);
				1657	}
				1658	}
				1659
				1660	static int nvme_alloc_admin_tags(struct nvme_dev *dev)
				1661	{
				1662	if (!dev->ctrl.admin_q) {
				1663	dev->admin_tagset.ops = &nvme_mq_admin_ops;
				1664	dev->admin_tagset.nr_hw_queues = 1;
				1665
				1666	dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
				1667	dev->admin_tagset.timeout = ADMIN_TIMEOUT;
				1668	dev->admin_tagset.numa_node = dev_to_node(dev->dev);
				1669	dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
				1670	dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
				1671	dev->admin_tagset.driver_data = dev;
				1672
				1673	if (blk_mq_alloc_tag_set(&dev->admin_tagset))
				1674	return -ENOMEM;
				1675	dev->ctrl.admin_tagset = &dev->admin_tagset;
				1676
				1677	dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
				1678	if (IS_ERR(dev->ctrl.admin_q)) {
				1679	blk_mq_free_tag_set(&dev->admin_tagset);
				1680	dev->ctrl.admin_q = NULL;
				1681	return -ENOMEM;
				1682	}
				1683	if (!blk_get_queue(dev->ctrl.admin_q)) {
				1684	nvme_dev_remove_admin(dev);
				1685	dev->ctrl.admin_q = NULL;
				1686	return -ENODEV;
				1687	}
				1688	} else
				1689	blk_mq_unquiesce_queue(dev->ctrl.admin_q);
				1690
				1691	return 0;
				1692	}
				1693
				1694	static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
				1695	{
				1696	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
				1697	}
				1698
				1699	static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
				1700	{
				1701	struct pci_dev *pdev = to_pci_dev(dev->dev);
				1702
				1703	if (size <= dev->bar_mapped_size)
				1704	return 0;
				1705	if (size > pci_resource_len(pdev, 0))
				1706	return -ENOMEM;
				1707	if (dev->bar)
				1708	iounmap(dev->bar);
				1709	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
				1710	if (!dev->bar) {
				1711	dev->bar_mapped_size = 0;
				1712	return -ENOMEM;
				1713	}
				1714	dev->bar_mapped_size = size;
				1715	dev->dbs = dev->bar + NVME_REG_DBS;
				1716
				1717	return 0;
				1718	}
				1719
				1720	static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
				1721	{
				1722	int result;
				1723	u32 aqa;
				1724	struct nvme_queue *nvmeq;
				1725
				1726	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
				1727	if (result < 0)
				1728	return result;
				1729
				1730	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				1731	NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
				1732
				1733	if (dev->subsystem &&
				1734	(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
				1735	writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
				1736
				1737	result = nvme_disable_ctrl(&dev->ctrl);
				1738	if (result < 0)
				1739	return result;
				1740
				1741	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
				1742	if (result)
				1743	return result;
				1744
				1745	nvmeq = &dev->queues[0];
				1746	aqa = nvmeq->q_depth - 1;
				1747	aqa \|= aqa << 16;
				1748
				1749	writel(aqa, dev->bar + NVME_REG_AQA);
				1750	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
				1751	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
				1752
				1753	result = nvme_enable_ctrl(&dev->ctrl);
				1754	if (result)
				1755	return result;
				1756
				1757	nvmeq->cq_vector = 0;
				1758	nvme_init_queue(nvmeq, 0);
				1759	result = queue_request_irq(nvmeq);
				1760	if (result) {
				1761	dev->online_queues--;
				1762	return result;
				1763	}
				1764
				1765	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
				1766	return result;
				1767	}
				1768
				1769	static int nvme_create_io_queues(struct nvme_dev *dev)
				1770	{
				1771	unsigned i, max, rw_queues;
				1772	int ret = 0;
				1773
				1774	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
				1775	if (nvme_alloc_queue(dev, i, dev->q_depth)) {
				1776	ret = -ENOMEM;
				1777	break;
				1778	}
				1779	}
				1780
				1781	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
				1782	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
				1783	rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				1784	dev->io_queues[HCTX_TYPE_READ];
				1785	} else {
				1786	rw_queues = max;
				1787	}
				1788
				1789	for (i = dev->online_queues; i <= max; i++) {
				1790	bool polled = i > rw_queues;
				1791
				1792	ret = nvme_create_queue(&dev->queues[i], i, polled);
				1793	if (ret)
				1794	break;
				1795	}
				1796
				1797	/*
				1798	* Ignore failing Create SQ/CQ commands, we can continue with less
				1799	* than the desired amount of queues, and even a controller without
				1800	* I/O queues can still be used to issue admin commands. This might
				1801	* be useful to upgrade a buggy firmware for example.
				1802	*/
				1803	return ret >= 0 ? 0 : ret;
				1804	}
				1805
				1806	static ssize_t nvme_cmb_show(struct device *dev,
				1807	struct device_attribute *attr,
				1808	char *buf)
				1809	{
				1810	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
				1811
				1812	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n",
				1813	ndev->cmbloc, ndev->cmbsz);
				1814	}
				1815	static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
				1816
				1817	static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
				1818	{
				1819	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
				1820
				1821	return 1ULL << (12 + 4 * szu);
				1822	}
				1823
				1824	static u32 nvme_cmb_size(struct nvme_dev *dev)
				1825	{
				1826	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
				1827	}
				1828
				1829	static void nvme_map_cmb(struct nvme_dev *dev)
				1830	{
				1831	u64 size, offset;
				1832	resource_size_t bar_size;
				1833	struct pci_dev *pdev = to_pci_dev(dev->dev);
				1834	int bar;
				1835
				1836	if (dev->cmb_size)
				1837	return;
				1838
				1839	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
				1840	if (!dev->cmbsz)
				1841	return;
				1842	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
				1843
				1844	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
				1845	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
				1846	bar = NVME_CMB_BIR(dev->cmbloc);
				1847	bar_size = pci_resource_len(pdev, bar);
				1848
				1849	if (offset > bar_size)
				1850	return;
				1851
				1852	/*
				1853	* Controllers may support a CMB size larger than their BAR,
				1854	* for example, due to being behind a bridge. Reduce the CMB to
				1855	* the reported size of the BAR
				1856	*/
				1857	if (size > bar_size - offset)
				1858	size = bar_size - offset;
				1859
				1860	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
				1861	dev_warn(dev->ctrl.device,
				1862	"failed to register the CMB\n");
				1863	return;
				1864	}
				1865
				1866	dev->cmb_size = size;
				1867	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
				1868
				1869	if ((dev->cmbsz & (NVME_CMBSZ_WDS \| NVME_CMBSZ_RDS)) ==
				1870	(NVME_CMBSZ_WDS \| NVME_CMBSZ_RDS))
				1871	pci_p2pmem_publish(pdev, true);
				1872
				1873	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				1874	&dev_attr_cmb.attr, NULL))
				1875	dev_warn(dev->ctrl.device,
				1876	"failed to add sysfs attribute for CMB\n");
				1877	}
				1878
				1879	static inline void nvme_release_cmb(struct nvme_dev *dev)
				1880	{
				1881	if (dev->cmb_size) {
				1882	sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
				1883	&dev_attr_cmb.attr, NULL);
				1884	dev->cmb_size = 0;
				1885	}
				1886	}
				1887
				1888	static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
				1889	{
				1890	u64 dma_addr = dev->host_mem_descs_dma;
				1891	struct nvme_command c;
				1892	int ret;
				1893
				1894	memset(&c, 0, sizeof(c));
				1895	c.features.opcode = nvme_admin_set_features;
				1896	c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
				1897	c.features.dword11 = cpu_to_le32(bits);
				1898	c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
				1899	ilog2(dev->ctrl.page_size));
				1900	c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
				1901	c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
				1902	c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
				1903
				1904	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
				1905	if (ret) {
				1906	dev_warn(dev->ctrl.device,
				1907	"failed to set host mem (err %d, flags %#x).\n",
				1908	ret, bits);
				1909	}
				1910	return ret;
				1911	}
				1912
				1913	static void nvme_free_host_mem(struct nvme_dev *dev)
				1914	{
				1915	int i;
				1916
				1917	for (i = 0; i < dev->nr_host_mem_descs; i++) {
				1918	struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
				1919	size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
				1920
				1921	dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
				1922	le64_to_cpu(desc->addr),
				1923	DMA_ATTR_NO_KERNEL_MAPPING \| DMA_ATTR_NO_WARN);
				1924	}
				1925
				1926	kfree(dev->host_mem_desc_bufs);
				1927	dev->host_mem_desc_bufs = NULL;
				1928	dma_free_coherent(dev->dev, dev->host_mem_descs_size,
				1929	dev->host_mem_descs, dev->host_mem_descs_dma);
				1930	dev->host_mem_descs = NULL;
				1931	dev->host_mem_descs_size = 0;
				1932	dev->nr_host_mem_descs = 0;
				1933	}
				1934
				1935	static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
				1936	u32 chunk_size)
				1937	{
				1938	struct nvme_host_mem_buf_desc *descs;
				1939	u32 max_entries, len, descs_size;
				1940	dma_addr_t descs_dma;
				1941	int i = 0;
				1942	void **bufs;
				1943	u64 size, tmp;
				1944
				1945	tmp = (preferred + chunk_size - 1);
				1946	do_div(tmp, chunk_size);
				1947	max_entries = tmp;
				1948
				1949	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
				1950	max_entries = dev->ctrl.hmmaxd;
				1951
				1952	descs_size = max_entries * sizeof(*descs);
				1953	descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
				1954	GFP_KERNEL);
				1955	if (!descs)
				1956	goto out;
				1957
				1958	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
				1959	if (!bufs)
				1960	goto out_free_descs;
				1961
				1962	for (size = 0; size < preferred && i < max_entries; size += len) {
				1963	dma_addr_t dma_addr;
				1964
				1965	len = min_t(u64, chunk_size, preferred - size);
				1966	bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				1967	DMA_ATTR_NO_KERNEL_MAPPING \| DMA_ATTR_NO_WARN);
				1968	if (!bufs[i])
				1969	break;
				1970
				1971	descs[i].addr = cpu_to_le64(dma_addr);
				1972	descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
				1973	i++;
				1974	}
				1975
				1976	if (!size)
				1977	goto out_free_bufs;
				1978
				1979	dev->nr_host_mem_descs = i;
				1980	dev->host_mem_size = size;
				1981	dev->host_mem_descs = descs;
				1982	dev->host_mem_descs_dma = descs_dma;
				1983	dev->host_mem_descs_size = descs_size;
				1984	dev->host_mem_desc_bufs = bufs;
				1985	return 0;
				1986
				1987	out_free_bufs:
				1988	while (--i >= 0) {
				1989	size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
				1990
				1991	dma_free_attrs(dev->dev, size, bufs[i],
				1992	le64_to_cpu(descs[i].addr),
				1993	DMA_ATTR_NO_KERNEL_MAPPING \| DMA_ATTR_NO_WARN);
				1994	}
				1995
				1996	kfree(bufs);
				1997	out_free_descs:
				1998	dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
				1999	out:
				2000	dev->host_mem_descs = NULL;
				2001	return -ENOMEM;
				2002	}
				2003
				2004	static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
				2005	{
				2006	u32 chunk_size;
				2007
				2008	/* start big and work our way down */
				2009	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
				2010	chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
				2011	chunk_size /= 2) {
				2012	if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
				2013	if (!min \|\| dev->host_mem_size >= min)
				2014	return 0;
				2015	nvme_free_host_mem(dev);
				2016	}
				2017	}
				2018
				2019	return -ENOMEM;
				2020	}
				2021
				2022	static int nvme_setup_host_mem(struct nvme_dev *dev)
				2023	{
				2024	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
				2025	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
				2026	u64 min = (u64)dev->ctrl.hmmin * 4096;
				2027	u32 enable_bits = NVME_HOST_MEM_ENABLE;
				2028	int ret;
				2029
				2030	preferred = min(preferred, max);
				2031	if (min > max) {
				2032	dev_warn(dev->ctrl.device,
				2033	"min host memory (%lld MiB) above limit (%d MiB).\n",
				2034	min >> ilog2(SZ_1M), max_host_mem_size_mb);
				2035	nvme_free_host_mem(dev);
				2036	return 0;
				2037	}
				2038
				2039	/*
				2040	* If we already have a buffer allocated check if we can reuse it.
				2041	*/
				2042	if (dev->host_mem_descs) {
				2043	if (dev->host_mem_size >= min)
				2044	enable_bits \|= NVME_HOST_MEM_RETURN;
				2045	else
				2046	nvme_free_host_mem(dev);
				2047	}
				2048
				2049	if (!dev->host_mem_descs) {
				2050	if (nvme_alloc_host_mem(dev, min, preferred)) {
				2051	dev_warn(dev->ctrl.device,
				2052	"failed to allocate host memory buffer.\n");
				2053	return 0; /* controller must work without HMB */
				2054	}
				2055
				2056	dev_info(dev->ctrl.device,
				2057	"allocated %lld MiB host memory buffer.\n",
				2058	dev->host_mem_size >> ilog2(SZ_1M));
				2059	}
				2060
				2061	ret = nvme_set_host_mem(dev, enable_bits);
				2062	if (ret)
				2063	nvme_free_host_mem(dev);
				2064	return ret;
				2065	}
				2066
				2067	/*
				2068	* nirqs is the number of interrupts available for write and read
				2069	* queues. The core already reserved an interrupt for the admin queue.
				2070	*/
				2071	static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
				2072	{
				2073	struct nvme_dev *dev = affd->priv;
				2074	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
				2075
				2076	/*
				2077	* If there is no interupt available for queues, ensure that
				2078	* the default queue is set to 1. The affinity set size is
				2079	* also set to one, but the irq core ignores it for this case.
				2080	*
				2081	* If only one interrupt is available or 'write_queue' == 0, combine
				2082	* write and read queues.
				2083	*
				2084	* If 'write_queues' > 0, ensure it leaves room for at least one read
				2085	* queue.
				2086	*/
				2087	if (!nrirqs) {
				2088	nrirqs = 1;
				2089	nr_read_queues = 0;
				2090	} else if (nrirqs == 1 \|\| !nr_write_queues) {
				2091	nr_read_queues = 0;
				2092	} else if (nr_write_queues >= nrirqs) {
				2093	nr_read_queues = 1;
				2094	} else {
				2095	nr_read_queues = nrirqs - nr_write_queues;
				2096	}
				2097
				2098	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
				2099	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
				2100	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
				2101	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
				2102	affd->nr_sets = nr_read_queues ? 2 : 1;
				2103	}
				2104
				2105	static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
				2106	{
				2107	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2108	struct irq_affinity affd = {
				2109	.pre_vectors = 1,
				2110	.calc_sets = nvme_calc_irq_sets,
				2111	.priv = dev,
				2112	};
				2113	unsigned int irq_queues, this_p_queues;
				2114
				2115	/*
				2116	* Poll queues don't need interrupts, but we need at least one IO
				2117	* queue left over for non-polled IO.
				2118	*/
				2119	this_p_queues = dev->nr_poll_queues;
				2120	if (this_p_queues >= nr_io_queues) {
				2121	this_p_queues = nr_io_queues - 1;
				2122	irq_queues = 1;
				2123	} else {
				2124	irq_queues = nr_io_queues - this_p_queues + 1;
				2125	}
				2126	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
				2127
				2128	/* Initialize for the single interrupt case */
				2129	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
				2130	dev->io_queues[HCTX_TYPE_READ] = 0;
				2131
				2132	/*
				2133	* Some Apple controllers require all queues to use the
				2134	* first vector.
				2135	*/
				2136	if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
				2137	irq_queues = 1;
				2138
				2139	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
				2140	PCI_IRQ_ALL_TYPES \| PCI_IRQ_AFFINITY, &affd);
				2141	}
				2142
				2143	static void nvme_disable_io_queues(struct nvme_dev *dev)
				2144	{
				2145	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
				2146	__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
				2147	}
				2148
				2149	static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
				2150	{
				2151	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
				2152	}
				2153
				/*
				 * Negotiate the I/O queue count with the controller, size the doorbell
				 * BAR mapping, allocate interrupt vectors and create the I/O queues.
				 *
				 * Returns 0 on success (including the case where the controller grants
				 * zero I/O queues), a negative errno from nvme_set_queue_count(),
				 * -ENOMEM when the BAR cannot be remapped for even one queue, -EIO when
				 * no interrupt vector could be allocated, or the result of
				 * queue_request_irq() / nvme_create_io_queues().
				 *
				 * If fewer queues come online than max_qid, the queues are torn down and
				 * the vector setup is retried with the lower count.
				 */
				2154	static int nvme_setup_io_queues(struct nvme_dev *dev)
				2155	{
				2156	struct nvme_queue *adminq = &dev->queues[0];
				2157	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2158	unsigned int nr_io_queues;
				2159	unsigned long size;
				2160	int result;
				2161
				2162	/*
				2163	* Sample the module parameters once at reset time so that we have
				2164	* stable values to work with.
				2165	*/
				2166	dev->nr_write_queues = write_queues;
				2167	dev->nr_poll_queues = poll_queues;
				2168
				2169	/*
				2170	* If tags are shared with admin queue (Apple bug), then
				2171	* make sure we only use one IO queue.
				2172	*/
				2173	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
				2174	nr_io_queues = 1;
				2175	else
				2176	nr_io_queues = min(nvme_max_io_queues(dev),
				2177	dev->nr_allocated_queues - 1);
				2178
				2179	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
				2180	if (result < 0)
				2181	return result;
				2182
				/* Nothing more to set up when no I/O queue was granted. */
				2183	if (nr_io_queues == 0)
				2184	return 0;
				2185
				2186	clear_bit(NVMEQ_ENABLED, &adminq->flags);
				2187
				/* A non-positive CMB queue depth disables the use of the CMB for SQEs. */
				2188	if (dev->cmb_use_sqes) {
				2189	result = nvme_cmb_qdepth(dev, nr_io_queues,
				2190	sizeof(struct nvme_command));
				2191	if (result > 0)
				2192	dev->q_depth = result;
				2193	else
				2194	dev->cmb_use_sqes = false;
				2195	}
				2196
				/* Shrink the queue count until the doorbell region can be mapped. */
				2197	do {
				2198	size = db_bar_size(dev, nr_io_queues);
				2199	result = nvme_remap_bar(dev, size);
				2200	if (!result)
				2201	break;
				2202	if (!--nr_io_queues)
				2203	return -ENOMEM;
				2204	} while (1);
				2205	adminq->q_db = dev->dbs;
				2206
				2207	retry:
				2208	/* Deregister the admin queue's interrupt */
				2209	pci_free_irq(pdev, 0, adminq);
				2210
				2211	/*
				2212	* If we enable msix early due to not intx, disable it again before
				2213	* setting up the full range we need.
				2214	*/
				2215	pci_free_irq_vectors(pdev);
				2216
				2217	result = nvme_setup_irqs(dev, nr_io_queues);
				2218	if (result <= 0)
				2219	return -EIO;
				2220
				/* max_qid: vector count minus one (but at least one) plus the poll queues. */
				2221	dev->num_vecs = result;
				2222	result = max(result - 1, 1);
				2223	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
				2224
				2225	/*
				2226	* Should investigate if there's a performance win from allocating
				2227	* more queues than interrupt vectors; it might allow the submission
				2228	* path to scale better, even if the receive path is limited by the
				2229	* number of interrupts.
				2230	*/
				2231	result = queue_request_irq(adminq);
				2232	if (result)
				2233	return result;
				2234	set_bit(NVMEQ_ENABLED, &adminq->flags);
				2235
				2236	result = nvme_create_io_queues(dev);
				2237	if (result \|\| dev->online_queues < 2)
				2238	return result;
				2239
				/* Fewer queues online than planned: tear down and retry with that count. */
				2240	if (dev->online_queues - 1 < dev->max_qid) {
				2241	nr_io_queues = dev->online_queues - 1;
				2242	nvme_disable_io_queues(dev);
				2243	nvme_suspend_io_queues(dev);
				2244	goto retry;
				2245	}
				2246	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
				2247	dev->io_queues[HCTX_TYPE_DEFAULT],
				2248	dev->io_queues[HCTX_TYPE_READ],
				2249	dev->io_queues[HCTX_TYPE_POLL]);
				2250	return 0;
				2251	}
				2252
				2253	static void nvme_del_queue_end(struct request *req, blk_status_t error)
				2254	{
				2255	struct nvme_queue *nvmeq = req->end_io_data;
				2256
				2257	blk_mq_free_request(req);
				2258	complete(&nvmeq->delete_done);
				2259	}
				2260
				2261	static void nvme_del_cq_end(struct request *req, blk_status_t error)
				2262	{
				2263	struct nvme_queue *nvmeq = req->end_io_data;
				2264
				2265	if (error)
				2266	set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
				2267
				2268	nvme_del_queue_end(req, error);
				2269	}
				2270
				2271	static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
				2272	{
				2273	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
				2274	struct request *req;
				2275	struct nvme_command cmd;
				2276
				2277	memset(&cmd, 0, sizeof(cmd));
				2278	cmd.delete_queue.opcode = opcode;
				2279	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
				2280
				2281	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
				2282	if (IS_ERR(req))
				2283	return PTR_ERR(req);
				2284
				2285	req->timeout = ADMIN_TIMEOUT;
				2286	req->end_io_data = nvmeq;
				2287
				2288	init_completion(&nvmeq->delete_done);
				2289	blk_execute_rq_nowait(q, NULL, req, false,
				2290	opcode == nvme_admin_delete_cq ?
				2291	nvme_del_cq_end : nvme_del_queue_end);
				2292	return 0;
				2293	}
				2294
				/*
				 * Send the delete command @opcode for every online I/O queue and wait for
				 * the completions.
				 *
				 * Queues are walked from index online_queues - 1 down to 1.  Submission
				 * stops at the first nvme_delete_queue() failure; the function then waits
				 * for one outstanding completion and, if queues are still unsent, jumps
				 * back to submit more (which also resets the timeout to ADMIN_TIMEOUT).
				 *
				 * Returns true when every submitted command completed, false as soon as
				 * one wait times out.
				 */
				2295	static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
				2296	{
				2297	int nr_queues = dev->online_queues - 1, sent = 0;
				2298	unsigned long timeout;
				2299
				2300	retry:
				2301	timeout = ADMIN_TIMEOUT;
				/* Submit as many delete commands as the admin queue accepts right now. */
				2302	while (nr_queues > 0) {
				2303	if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
				2304	break;
				2305	nr_queues--;
				2306	sent++;
				2307	}
				2308	while (sent) {
				2309	struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
				2310
				/* The remaining time is carried over to the next wait. */
				2311	timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				2312	timeout);
				2313	if (timeout == 0)
				2314	return false;
				2315
				2316	sent--;
				/* Some queues were not submitted yet: go back and try them again. */
				2317	if (nr_queues)
				2318	goto retry;
				2319	}
				2320	return true;
				2321	}
				2322
				2323	static void nvme_dev_add(struct nvme_dev *dev)
				2324	{
				2325	int ret;
				2326
				2327	if (!dev->ctrl.tagset) {
				2328	dev->tagset.ops = &nvme_mq_ops;
				2329	dev->tagset.nr_hw_queues = dev->online_queues - 1;
				2330	dev->tagset.nr_maps = 2; /* default + read */
				2331	if (dev->io_queues[HCTX_TYPE_POLL])
				2332	dev->tagset.nr_maps++;
				2333	dev->tagset.timeout = NVME_IO_TIMEOUT;
				2334	dev->tagset.numa_node = dev_to_node(dev->dev);
				2335	dev->tagset.queue_depth =
				2336	min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
				2337	dev->tagset.cmd_size = sizeof(struct nvme_iod);
				2338	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
				2339	dev->tagset.driver_data = dev;
				2340
				2341	/*
				2342	* Some Apple controllers requires tags to be unique
				2343	* across admin and IO queue, so reserve the first 32
				2344	* tags of the IO queue.
				2345	*/
				2346	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
				2347	dev->tagset.reserved_tags = NVME_AQ_DEPTH;
				2348
				2349	ret = blk_mq_alloc_tag_set(&dev->tagset);
				2350	if (ret) {
				2351	dev_warn(dev->ctrl.device,
				2352	"IO queues tagset allocation failed %d\n", ret);
				2353	return;
				2354	}
				2355	dev->ctrl.tagset = &dev->tagset;
				2356	} else {
				2357	blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
				2358
				2359	/* Free previously allocated queues that are no longer usable */
				2360	nvme_free_queues(dev, dev->online_queues);
				2361	}
				2362
				2363	nvme_dbbuf_set(dev);
				2364	}
				2365
				2366	static int nvme_pci_enable(struct nvme_dev *dev)
				2367	{
				2368	int result = -ENOMEM;
				2369	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2370
				2371	if (pci_enable_device_mem(pdev))
				2372	return result;
				2373
				2374	pci_set_master(pdev);
				2375
				2376	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
				2377	goto disable;
				2378
				2379	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
				2380	result = -ENODEV;
				2381	goto disable;
				2382	}
				2383
				2384	/*
				2385	* Some devices and/or platforms don't advertise or work with INTx
				2386	* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
				2387	* adjust this later.
				2388	*/
				2389	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
				2390	if (result < 0)
				2391	return result;
				2392
				2393	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
				2394
				2395	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				2396	io_queue_depth);
				2397	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
				2398	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
				2399	dev->dbs = dev->bar + 4096;
				2400
				2401	/*
				2402	* Some Apple controllers require a non-standard SQE size.
				2403	* Interestingly they also seem to ignore the CC:IOSQES register
				2404	* so we don't bother updating it here.
				2405	*/
				2406	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
				2407	dev->io_sqes = 7;
				2408	else
				2409	dev->io_sqes = NVME_NVM_IOSQES;
				2410
				2411	/*
				2412	* Temporary fix for the Apple controller found in the MacBook8,1 and
				2413	* some MacBook7,1 to avoid controller resets and data loss.
				2414	*/
				2415	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
				2416	dev->q_depth = 2;
				2417	dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
				2418	"set queue depth=%u to work around controller resets\n",
				2419	dev->q_depth);
				2420	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
				2421	(pdev->device == 0xa821 \|\| pdev->device == 0xa822) &&
				2422	NVME_CAP_MQES(dev->ctrl.cap) == 0) {
				2423	dev->q_depth = 64;
				2424	dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
				2425	"set queue depth=%u\n", dev->q_depth);
				2426	}
				2427
				2428	/*
				2429	* Controllers with the shared tags quirk need the IO queue to be
				2430	* big enough so that we get 32 tags for the admin queue
				2431	*/
				2432	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
				2433	(dev->q_depth < (NVME_AQ_DEPTH + 2))) {
				2434	dev->q_depth = NVME_AQ_DEPTH + 2;
				2435	dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
				2436	dev->q_depth);
				2437	}
				2438
				2439
				2440	nvme_map_cmb(dev);
				2441
				2442	pci_enable_pcie_error_reporting(pdev);
				2443	pci_save_state(pdev);
				2444	return 0;
				2445
				2446	disable:
				2447	pci_disable_device(pdev);
				2448	return result;
				2449	}
				2450
				2451	static void nvme_dev_unmap(struct nvme_dev *dev)
				2452	{
				2453	if (dev->bar)
				2454	iounmap(dev->bar);
				2455	pci_release_mem_regions(to_pci_dev(dev->dev));
				2456	}
				2457
				2458	static void nvme_pci_disable(struct nvme_dev *dev)
				2459	{
				2460	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2461
				2462	pci_free_irq_vectors(pdev);
				2463
				2464	if (pci_is_enabled(pdev)) {
				2465	pci_disable_pcie_error_reporting(pdev);
				2466	pci_disable_device(pdev);
				2467	}
				2468	}
				2469
				/*
				 * Quiesce and tear down the controller's queues and PCI resources.
				 *
				 * Runs entirely under dev->shutdown_lock.  @shutdown selects the safe
				 * shutdown behaviour: waiting (with a timeout) for a started freeze,
				 * passing the flag on to nvme_disable_admin_queue(), and restarting the
				 * queues at the end so that entered requests fail instead of hanging.
				 *
				 * The controller is treated as "dead" when the PCI device is not
				 * enabled, or when CSTS shows a fatal status or a cleared ready bit, or
				 * when the PCI channel is not in the normal state; in that case no
				 * delete-queue commands are sent to it.
				 */
				2470	static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
				2471	{
				2472	bool dead = true, freeze = false;
				2473	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2474
				2475	mutex_lock(&dev->shutdown_lock);
				2476	if (pci_is_enabled(pdev)) {
				2477	u32 csts = readl(dev->bar + NVME_REG_CSTS);
				2478
				/* Only a live or resetting controller has its queues frozen here. */
				2479	if (dev->ctrl.state == NVME_CTRL_LIVE \|\|
				2480	dev->ctrl.state == NVME_CTRL_RESETTING) {
				2481	freeze = true;
				2482	nvme_start_freeze(&dev->ctrl);
				2483	}
				2484	dead = !!((csts & NVME_CSTS_CFS) \|\| !(csts & NVME_CSTS_RDY) \|\|
				2485	pdev->error_state != pci_channel_io_normal);
				2486	}
				2487
				2488	/*
				2489	* Give the controller a chance to complete all entered requests if
				2490	* doing a safe shutdown.
				2491	*/
				2492	if (!dead && shutdown && freeze)
				2493	nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
				2494
				2495	nvme_stop_queues(&dev->ctrl);
				2496
				/* Only talk to the controller when it is not considered dead. */
				2497	if (!dead && dev->ctrl.queue_count > 0) {
				2498	nvme_disable_io_queues(dev);
				2499	nvme_disable_admin_queue(dev, shutdown);
				2500	}
				2501	nvme_suspend_io_queues(dev);
				2502	nvme_suspend_queue(&dev->queues[0]);
				2503	nvme_pci_disable(dev);
				2504	nvme_reap_pending_cqes(dev);
				2505
				/* Cancel whatever is still outstanding on both tag sets and wait for it. */
				2506	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
				2507	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
				2508	blk_mq_tagset_wait_completed_request(&dev->tagset);
				2509	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
				2510
				2511	/*
				2512	* The driver will not be starting up queues again if shutting down so
				2513	* must flush all entered requests to their failed completion to avoid
				2514	* deadlocking blk-mq hot-cpu notifier.
				2515	*/
				2516	if (shutdown) {
				2517	nvme_start_queues(&dev->ctrl);
				2518	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
				2519	blk_mq_unquiesce_queue(dev->ctrl.admin_q);
				2520	}
				2521	mutex_unlock(&dev->shutdown_lock);
				2522	}
				2523
				2524	static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
				2525	{
				2526	if (!nvme_wait_reset(&dev->ctrl))
				2527	return -EBUSY;
				2528	nvme_dev_disable(dev, shutdown);
				2529	return 0;
				2530	}
				2531
				2532	static int nvme_setup_prp_pools(struct nvme_dev *dev)
				2533	{
				2534	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
				2535	PAGE_SIZE, PAGE_SIZE, 0);
				2536	if (!dev->prp_page_pool)
				2537	return -ENOMEM;
				2538
				2539	/* Optimisation for I/Os between 4k and 128k */
				2540	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
				2541	256, 256, 0);
				2542	if (!dev->prp_small_pool) {
				2543	dma_pool_destroy(dev->prp_page_pool);
				2544	return -ENOMEM;
				2545	}
				2546	return 0;
				2547	}
				2548
				2549	static void nvme_release_prp_pools(struct nvme_dev *dev)
				2550	{
				2551	dma_pool_destroy(dev->prp_page_pool);
				2552	dma_pool_destroy(dev->prp_small_pool);
				2553	}
				2554
				2555	static void nvme_free_tagset(struct nvme_dev *dev)
				2556	{
				2557	if (dev->tagset.tags)
				2558	blk_mq_free_tag_set(&dev->tagset);
				2559	dev->ctrl.tagset = NULL;
				2560	}
				2561
				2562	static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
				2563	{
				2564	struct nvme_dev *dev = to_nvme_dev(ctrl);
				2565
				2566	nvme_dbbuf_dma_free(dev);
				2567	put_device(dev->dev);
				2568	nvme_free_tagset(dev);
				2569	if (dev->ctrl.admin_q)
				2570	blk_put_queue(dev->ctrl.admin_q);
				2571	kfree(dev->queues);
				2572	free_opal_dev(dev->ctrl.opal_dev);
				2573	mempool_destroy(dev->iod_mempool);
				2574	kfree(dev);
				2575	}
				2576
				2577	static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
				2578	{
				2579	/*
				2580	* Set state to deleting now to avoid blocking nvme_wait_reset(), which
				2581	* may be holding this pci_dev's device lock.
				2582	*/
				2583	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
				2584	nvme_get_ctrl(&dev->ctrl);
				2585	nvme_dev_disable(dev, false);
				2586	nvme_kill_queues(&dev->ctrl);
				2587	if (!queue_work(nvme_wq, &dev->remove_work))
				2588	nvme_put_ctrl(&dev->ctrl);
				2589	}
				2590
				/*
				 * Work handler for ctrl.reset_work: bring the controller (back) up.
				 *
				 * Sequence: verify the controller is in the RESETTING state, disable it
				 * first if it is currently enabled, then under shutdown_lock enable the
				 * PCI device, configure the admin queue and allocate the admin tags.
				 * After moving to CONNECTING the controller is identified, optional
				 * features (OPAL, doorbell buffers, host memory buffer) are set up, the
				 * I/O queues are created and the controller is marked LIVE.
				 *
				 * Any failure ends in nvme_remove_dead_ctrl(); the "out_unlock" label is
				 * used for failures that happen while shutdown_lock is held.
				 */
				2591	static void nvme_reset_work(struct work_struct *work)
				2592	{
				2593	struct nvme_dev *dev =
				2594	container_of(work, struct nvme_dev, ctrl.reset_work);
				/* Sampled before the reset path below gets a chance to change ctrl_config. */
				2595	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
				2596	int result;
				2597
				2598	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
				2599	dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
				2600	dev->ctrl.state);
				2601	result = -ENODEV;
				2602	goto out;
				2603	}
				2604
				2605	/*
				2606	* If we're called to reset a live controller first shut it down before
				2607	* moving on.
				2608	*/
				2609	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
				2610	nvme_dev_disable(dev, false);
				2611	nvme_sync_queues(&dev->ctrl);
				2612
				2613	mutex_lock(&dev->shutdown_lock);
				2614	result = nvme_pci_enable(dev);
				2615	if (result)
				2616	goto out_unlock;
				2617
				2618	result = nvme_pci_configure_admin_queue(dev);
				2619	if (result)
				2620	goto out_unlock;
				2621
				2622	result = nvme_alloc_admin_tags(dev);
				2623	if (result)
				2624	goto out_unlock;
				2625
				2626	/*
				2627	* Limit the max command size to prevent iod->sg allocations going
				2628	* over a single page.
				2629	*/
				2630	dev->ctrl.max_hw_sectors = min_t(u32,
				2631	NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
				2632	dev->ctrl.max_segments = NVME_MAX_SEGS;
				2633
				2634	/*
				2635	* Don't limit the IOMMU merged segment size.
				2636	*/
				2637	dma_set_max_seg_size(dev->dev, 0xffffffff);
				2638
				2639	mutex_unlock(&dev->shutdown_lock);
				2640
				2641	/*
				2642	* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
				2643	* initializing procedure here.
				2644	*/
				2645	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
				2646	dev_warn(dev->ctrl.device,
				2647	"failed to mark controller CONNECTING\n");
				2648	result = -EBUSY;
				2649	goto out;
				2650	}
				2651
				2652	result = nvme_init_identify(&dev->ctrl);
				2653	if (result)
				2654	goto out;
				2655
				/*
				 * Security commands supported: create the OPAL device on first use, or
				 * unlock it when was_suspend is set.  Not supported: drop any OPAL device.
				 */
				2656	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
				2657	if (!dev->ctrl.opal_dev)
				2658	dev->ctrl.opal_dev =
				2659	init_opal_dev(&dev->ctrl, &nvme_sec_submit);
				2660	else if (was_suspend)
				2661	opal_unlock_from_suspend(dev->ctrl.opal_dev);
				2662	} else {
				2663	free_opal_dev(dev->ctrl.opal_dev);
				2664	dev->ctrl.opal_dev = NULL;
				2665	}
				2666
				/* A doorbell buffer allocation failure is only warned about. */
				2667	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
				2668	result = nvme_dbbuf_dma_alloc(dev);
				2669	if (result)
				2670	dev_warn(dev->dev,
				2671	"unable to allocate dma for dbbuf\n");
				2672	}
				2673
				/* The host memory buffer is only set up when hmpre is non-zero. */
				2674	if (dev->ctrl.hmpre) {
				2675	result = nvme_setup_host_mem(dev);
				2676	if (result < 0)
				2677	goto out;
				2678	}
				2679
				2680	result = nvme_setup_io_queues(dev);
				2681	if (result)
				2682	goto out;
				2683
				2684	/*
				2685	* Keep the controller around but remove all namespaces if we don't have
				2686	* any working I/O queue.
				2687	*/
				2688	if (dev->online_queues < 2) {
				2689	dev_warn(dev->ctrl.device, "IO queues not created\n");
				2690	nvme_kill_queues(&dev->ctrl);
				2691	nvme_remove_namespaces(&dev->ctrl);
				2692	nvme_free_tagset(dev);
				2693	} else {
				2694	nvme_start_queues(&dev->ctrl);
				2695	nvme_wait_freeze(&dev->ctrl);
				2696	nvme_dev_add(dev);
				2697	nvme_unfreeze(&dev->ctrl);
				2698	}
				2699
				2700	/*
				2701	* If only admin queue live, keep it to do further investigation or
				2702	* recovery.
				2703	*/
				2704	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
				2705	dev_warn(dev->ctrl.device,
				2706	"failed to mark controller live state\n");
				2707	result = -ENODEV;
				2708	goto out;
				2709	}
				2710
				2711	nvme_start_ctrl(&dev->ctrl);
				2712	return;
				2713
				2714	out_unlock:
				2715	mutex_unlock(&dev->shutdown_lock);
				2716	out:
				/* The warning is skipped when result is 0; the removal is not. */
				2717	if (result)
				2718	dev_warn(dev->ctrl.device,
				2719	"Removing after probe failure status: %d\n", result);
				2720	nvme_remove_dead_ctrl(dev);
				2721	}
				2722
				2723	static void nvme_remove_dead_ctrl_work(struct work_struct *work)
				2724	{
				2725	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
				2726	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2727
				2728	if (pci_get_drvdata(pdev))
				2729	device_release_driver(&pdev->dev);
				2730	nvme_put_ctrl(&dev->ctrl);
				2731	}
				2732
				2733	static int nvme_pci_reg_read32(struct nvme_ctrl ctrl, u32 off, u32 val)
				2734	{
				2735	*val = readl(to_nvme_dev(ctrl)->bar + off);
				2736	return 0;
				2737	}
				2738
				2739	static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
				2740	{
				2741	writel(val, to_nvme_dev(ctrl)->bar + off);
				2742	return 0;
				2743	}
				2744
				2745	static int nvme_pci_reg_read64(struct nvme_ctrl ctrl, u32 off, u64 val)
				2746	{
				2747	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
				2748	return 0;
				2749	}
				2750
				2751	static int nvme_pci_get_address(struct nvme_ctrl ctrl, char buf, int size)
				2752	{
				2753	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
				2754
				2755	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
				2756	}
				2757
				2758	static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
				2759	.name = "pcie",
				2760	.module = THIS_MODULE,
				2761	.flags = NVME_F_METADATA_SUPPORTED \|
				2762	NVME_F_PCI_P2PDMA,
				2763	.reg_read32 = nvme_pci_reg_read32,
				2764	.reg_write32 = nvme_pci_reg_write32,
				2765	.reg_read64 = nvme_pci_reg_read64,
				2766	.free_ctrl = nvme_pci_free_ctrl,
				2767	.submit_async_event = nvme_pci_submit_async_event,
				2768	.get_address = nvme_pci_get_address,
				2769	};
				2770
				2771	static int nvme_dev_map(struct nvme_dev *dev)
				2772	{
				2773	struct pci_dev *pdev = to_pci_dev(dev->dev);
				2774
				2775	if (pci_request_mem_regions(pdev, "nvme"))
				2776	return -ENODEV;
				2777
				2778	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
				2779	goto release;
				2780
				2781	return 0;
				2782	release:
				2783	pci_release_mem_regions(pdev);
				2784	return -ENODEV;
				2785	}
				2786
				2787	static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
				2788	{
				2789	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
				2790	/*
				2791	* Several Samsung devices seem to drop off the PCIe bus
				2792	* randomly when APST is on and uses the deepest sleep state.
				2793	* This has been observed on a Samsung "SM951 NVMe SAMSUNG
				2794	* 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
				2795	* 950 PRO 256GB", but it seems to be restricted to two Dell
				2796	* laptops.
				2797	*/
				2798	if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
				2799	(dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") \|\|
				2800	dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
				2801	return NVME_QUIRK_NO_DEEPEST_PS;
				2802	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
				2803	/*
				2804	* Samsung SSD 960 EVO drops off the PCIe bus after system
				2805	* suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
				2806	* within few minutes after bootup on a Coffee Lake board -
				2807	* ASUS PRIME Z370-A
				2808	*/
				2809	if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
				2810	(dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") \|\|
				2811	dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
				2812	return NVME_QUIRK_NO_APST;
				2813	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 \|\|
				2814	pdev->device == 0xa808 \|\| pdev->device == 0xa809)) \|\|
				2815	(pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
				2816	/*
				2817	* Forcing to use host managed nvme power settings for
				2818	* lowest idle power with quick resume latency on
				2819	* Samsung and Toshiba SSDs based on suspend behavior
				2820	* on Coffee Lake board for LENOVO C640
				2821	*/
				2822	if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
				2823	dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
				2824	return NVME_QUIRK_SIMPLE_SUSPEND;
				2825	}
				2826
				2827	/*
				2828	* NVMe SSD drops off the PCIe bus after system idle
				2829	* for 10 hours on a Lenovo N60z board.
				2830	*/
				2831	if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6"))
				2832	return NVME_QUIRK_NO_APST;
				2833
				2834	return 0;
				2835	}
				2836
				2837	static void nvme_async_probe(void *data, async_cookie_t cookie)
				2838	{
				2839	struct nvme_dev *dev = data;
				2840
				2841	flush_work(&dev->ctrl.reset_work);
				2842	flush_work(&dev->ctrl.scan_work);
				2843	nvme_put_ctrl(&dev->ctrl);
				2844	}
				2845
				2846	static int nvme_probe(struct pci_dev pdev, const struct pci_device_id id)
				2847	{
				2848	int node, result = -ENOMEM;
				2849	struct nvme_dev *dev;
				2850	unsigned long quirks = id->driver_data;
				2851	size_t alloc_size;
				2852
				2853	node = dev_to_node(&pdev->dev);
				2854
				2855	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
				2856	if (!dev)
				2857	return -ENOMEM;
				2858
				2859	dev->nr_write_queues = write_queues;
				2860	dev->nr_poll_queues = poll_queues;
				2861	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
				2862	dev->queues = kcalloc_node(dev->nr_allocated_queues,
				2863	sizeof(struct nvme_queue), GFP_KERNEL, node);
				2864	if (!dev->queues)
				2865	goto free;
				2866
				2867	dev->dev = get_device(&pdev->dev);
				2868	pci_set_drvdata(pdev, dev);
				2869
				2870	result = nvme_dev_map(dev);
				2871	if (result)
				2872	goto put_pci;
				2873
				2874	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
				2875	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
				2876	mutex_init(&dev->shutdown_lock);
				2877
				2878	result = nvme_setup_prp_pools(dev);
				2879	if (result)
				2880	goto unmap;
				2881
				2882	quirks \|= check_vendor_combination_bug(pdev);
				2883
				2884	/*
				2885	* Double check that our mempool alloc size will cover the biggest
				2886	* command we support.
				2887	*/
				2888	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
				2889	NVME_MAX_SEGS, true);
				2890	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
				2891
				2892	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
				2893	mempool_kfree,
				2894	(void *) alloc_size,
				2895	GFP_KERNEL, node);
				2896	if (!dev->iod_mempool) {
				2897	result = -ENOMEM;
				2898	goto release_pools;
				2899	}
				2900
				2901	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
				2902	quirks);
				2903	if (result)
				2904	goto release_mempool;
				2905
				2906	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
				2907
				2908	nvme_reset_ctrl(&dev->ctrl);
				2909	async_schedule(nvme_async_probe, dev);
				2910
				2911	return 0;
				2912
				2913	release_mempool:
				2914	mempool_destroy(dev->iod_mempool);
				2915	release_pools:
				2916	nvme_release_prp_pools(dev);
				2917	unmap:
				2918	nvme_dev_unmap(dev);
				2919	put_pci:
				2920	put_device(dev->dev);
				2921	free:
				2922	kfree(dev->queues);
				2923	kfree(dev);
				2924	return result;
				2925	}
				2926
				2927	static void nvme_reset_prepare(struct pci_dev *pdev)
				2928	{
				2929	struct nvme_dev *dev = pci_get_drvdata(pdev);
				2930
				2931	/*
				2932	* We don't need to check the return value from waiting for the reset
				2933	* state as pci_dev device lock is held, making it impossible to race
				2934	* with ->remove().
				2935	*/
				2936	nvme_disable_prepare_reset(dev, false);
				2937	nvme_sync_queues(&dev->ctrl);
				2938	}
				2939
				2940	static void nvme_reset_done(struct pci_dev *pdev)
				2941	{
				2942	struct nvme_dev *dev = pci_get_drvdata(pdev);
				2943
				2944	if (!nvme_try_sched_reset(&dev->ctrl))
				2945	flush_work(&dev->ctrl.reset_work);
				2946	}
				2947
				2948	static void nvme_shutdown(struct pci_dev *pdev)
				2949	{
				2950	struct nvme_dev *dev = pci_get_drvdata(pdev);
				2951	nvme_disable_prepare_reset(dev, true);
				2952	}
				2953
				2954	/*
				2955	* The driver's remove may be called on a device in a partially initialized
				2956	* state. This function must not have any dependencies on the device state in
				2957	* order to proceed.
				2958	*/
				2959	static void nvme_remove(struct pci_dev *pdev)
				2960	{
				2961	struct nvme_dev *dev = pci_get_drvdata(pdev);
				2962
				2963	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
				2964	pci_set_drvdata(pdev, NULL);
				2965
				2966	if (!pci_device_is_present(pdev)) {
				2967	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
				2968	nvme_dev_disable(dev, true);
				2969	}
				2970
				2971	flush_work(&dev->ctrl.reset_work);
				2972	nvme_stop_ctrl(&dev->ctrl);
				2973	nvme_remove_namespaces(&dev->ctrl);
				2974	nvme_dev_disable(dev, true);
				2975	nvme_release_cmb(dev);
				2976	nvme_free_host_mem(dev);
				2977	nvme_dev_remove_admin(dev);
				2978	nvme_free_queues(dev, 0);
				2979	nvme_uninit_ctrl(&dev->ctrl);
				2980	nvme_release_prp_pools(dev);
				2981	nvme_dev_unmap(dev);
				2982	nvme_put_ctrl(&dev->ctrl);
				2983	}
				2984
				2985	#ifdef CONFIG_PM_SLEEP
				2986	static int nvme_get_power_state(struct nvme_ctrl ctrl, u32 ps)
				2987	{
				2988	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
				2989	}
				2990
				2991	static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
				2992	{
				2993	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
				2994	}
				2995
				2996	static int nvme_resume(struct device *dev)
				2997	{
				2998	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
				2999	struct nvme_ctrl *ctrl = &ndev->ctrl;
				3000
				3001	if (ndev->last_ps == U32_MAX \|\|
				3002	nvme_set_power_state(ctrl, ndev->last_ps) != 0)
				3003	return nvme_try_sched_reset(&ndev->ctrl);
				3004	return 0;
				3005	}
				3006
				3007	static int nvme_suspend(struct device *dev)
				3008	{
				3009	struct pci_dev *pdev = to_pci_dev(dev);
				3010	struct nvme_dev *ndev = pci_get_drvdata(pdev);
				3011	struct nvme_ctrl *ctrl = &ndev->ctrl;
				3012	int ret = -EBUSY;
				3013
				3014	ndev->last_ps = U32_MAX;
				3015
				3016	/*
				3017	* The platform does not remove power for a kernel managed suspend so
				3018	* use host managed nvme power settings for lowest idle power if
				3019	* possible. This should have quicker resume latency than a full device
				3020	* shutdown. But if the firmware is involved after the suspend or the
				3021	* device does not support any non-default power states, shut down the
				3022	* device fully.
				3023	*
				3024	* If ASPM is not enabled for the device, shut down the device and allow
				3025	* the PCI bus layer to put it into D3 in order to take the PCIe link
				3026	* down, so as to allow the platform to achieve its minimum low-power
				3027	* state (which may not be possible if the link is up).
				3028	*
				3029	* If a host memory buffer is enabled, shut down the device as the NVMe
				3030	* specification allows the device to access the host memory buffer in
				3031	* host DRAM from all power states, but hosts will fail access to DRAM
				3032	* during S3.
				3033	*/
				3034	if (pm_suspend_via_firmware() \|\| !ctrl->npss \|\|
				3035	!pcie_aspm_enabled(pdev) \|\|
				3036	ndev->nr_host_mem_descs \|\|
				3037	(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
				3038	return nvme_disable_prepare_reset(ndev, true);
				3039
				3040	nvme_start_freeze(ctrl);
				3041	nvme_wait_freeze(ctrl);
				3042	nvme_sync_queues(ctrl);
				3043
				3044	if (ctrl->state != NVME_CTRL_LIVE)
				3045	goto unfreeze;
				3046
				3047	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
				3048	if (ret < 0)
				3049	goto unfreeze;
				3050
				3051	/*
				3052	* A saved state prevents pci pm from generically controlling the
				3053	* device's power. If we're using protocol specific settings, we don't
				3054	* want pci interfering.
				3055	*/
				3056	pci_save_state(pdev);
				3057
				3058	ret = nvme_set_power_state(ctrl, ctrl->npss);
				3059	if (ret < 0)
				3060	goto unfreeze;
				3061
				3062	if (ret) {
				3063	/* discard the saved state */
				3064	pci_load_saved_state(pdev, NULL);
				3065
				3066	/*
				3067	* Clearing npss forces a controller reset on resume. The
				3068	* correct value will be resdicovered then.
				3069	*/
				3070	ret = nvme_disable_prepare_reset(ndev, true);
				3071	ctrl->npss = 0;
				3072	}
				3073	unfreeze:
				3074	nvme_unfreeze(ctrl);
				3075	return ret;
				3076	}
				3077
				3078	static int nvme_simple_suspend(struct device *dev)
				3079	{
				3080	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
				3081	return nvme_disable_prepare_reset(ndev, true);
				3082	}
				3083
				3084	static int nvme_simple_resume(struct device *dev)
				3085	{
				3086	struct pci_dev *pdev = to_pci_dev(dev);
				3087	struct nvme_dev *ndev = pci_get_drvdata(pdev);
				3088
				3089	return nvme_try_sched_reset(&ndev->ctrl);
				3090	}
				3091
				3092	static const struct dev_pm_ops nvme_dev_pm_ops = {
				3093	.suspend = nvme_suspend,
				3094	.resume = nvme_resume,
				3095	.freeze = nvme_simple_suspend,
				3096	.thaw = nvme_simple_resume,
				3097	.poweroff = nvme_simple_suspend,
				3098	.restore = nvme_simple_resume,
				3099	};
				3100	#endif /* CONFIG_PM_SLEEP */
				3101
				3102	static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
				3103	pci_channel_state_t state)
				3104	{
				3105	struct nvme_dev *dev = pci_get_drvdata(pdev);
				3106
				3107	/*
				3108	* A frozen channel requires a reset. When detected, this method will
				3109	* shutdown the controller to quiesce. The controller will be restarted
				3110	* after the slot reset through driver's slot_reset callback.
				3111	*/
				3112	switch (state) {
				3113	case pci_channel_io_normal:
				3114	return PCI_ERS_RESULT_CAN_RECOVER;
				3115	case pci_channel_io_frozen:
				3116	dev_warn(dev->ctrl.device,
				3117	"frozen state error detected, reset controller\n");
				3118	nvme_dev_disable(dev, false);
				3119	return PCI_ERS_RESULT_NEED_RESET;
				3120	case pci_channel_io_perm_failure:
				3121	dev_warn(dev->ctrl.device,
				3122	"failure state error detected, request disconnect\n");
				3123	return PCI_ERS_RESULT_DISCONNECT;
				3124	}
				3125	return PCI_ERS_RESULT_NEED_RESET;
				3126	}
				3127
				3128	static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
				3129	{
				3130	struct nvme_dev *dev = pci_get_drvdata(pdev);
				3131
				3132	dev_info(dev->ctrl.device, "restart after slot reset\n");
				3133	pci_restore_state(pdev);
				3134	nvme_reset_ctrl(&dev->ctrl);
				3135	return PCI_ERS_RESULT_RECOVERED;
				3136	}
				3137
				3138	static void nvme_error_resume(struct pci_dev *pdev)
				3139	{
				3140	struct nvme_dev *dev = pci_get_drvdata(pdev);
				3141
				3142	flush_work(&dev->ctrl.reset_work);
				3143	}
				3144
				3145	static const struct pci_error_handlers nvme_err_handler = {
				3146	.error_detected = nvme_error_detected,
				3147	.slot_reset = nvme_slot_reset,
				3148	.resume = nvme_error_resume,
				3149	.reset_prepare = nvme_reset_prepare,
				3150	.reset_done = nvme_reset_done,
				3151	};
				3152
				3153	static const struct pci_device_id nvme_id_table[] = {
				3154	{ PCI_VDEVICE(INTEL, 0x0953),
				3155	.driver_data = NVME_QUIRK_STRIPE_SIZE \|
				3156	NVME_QUIRK_DEALLOCATE_ZEROES, },
				3157	{ PCI_VDEVICE(INTEL, 0x0a53),
				3158	.driver_data = NVME_QUIRK_STRIPE_SIZE \|
				3159	NVME_QUIRK_DEALLOCATE_ZEROES, },
				3160	{ PCI_VDEVICE(INTEL, 0x0a54),
				3161	.driver_data = NVME_QUIRK_STRIPE_SIZE \|
				3162	NVME_QUIRK_DEALLOCATE_ZEROES, },
				3163	{ PCI_VDEVICE(INTEL, 0x0a55),
				3164	.driver_data = NVME_QUIRK_STRIPE_SIZE \|
				3165	NVME_QUIRK_DEALLOCATE_ZEROES, },
				3166	{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
				3167	.driver_data = NVME_QUIRK_NO_DEEPEST_PS \|
				3168	NVME_QUIRK_MEDIUM_PRIO_SQ \|
				3169	NVME_QUIRK_DISABLE_WRITE_ZEROES, },
				3170	{ PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */
				3171	.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3172	{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
				3173	.driver_data = NVME_QUIRK_IDENTIFY_CNS \|
				3174	NVME_QUIRK_DISABLE_WRITE_ZEROES, },
				3175	{ PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
				3176	.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
				3177	{ PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
				3178	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY \|
				3179	NVME_QUIRK_NO_NS_DESC_LIST, },
				3180	{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
				3181	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
				3182	{ PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */
				3183	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
				3184	{ PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */
				3185	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
				3186	{ PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */
				3187	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
				3188	{ PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */
				3189	.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY \|
				3190	NVME_QUIRK_DISABLE_WRITE_ZEROES\|
				3191	NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3192	{ PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */
				3193	.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3194	{ PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */
				3195	.driver_data = NVME_QUIRK_NO_NS_DESC_LIST \|
				3196	NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3197	{ PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */
				3198	.driver_data = NVME_QUIRK_LIGHTNVM, },
				3199	{ PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */
				3200	.driver_data = NVME_QUIRK_LIGHTNVM, },
				3201	{ PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
				3202	.driver_data = NVME_QUIRK_LIGHTNVM, },
				3203	{ PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
				3204	.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3205	{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
				3206	.driver_data = NVME_QUIRK_NO_DEEPEST_PS \|
				3207	NVME_QUIRK_IGNORE_DEV_SUBNQN, },
				3208	{ PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
				3209	.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
				3210	{ PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */
				3211	.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
				3212	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
				3213	.driver_data = NVME_QUIRK_SINGLE_VECTOR },
				3214	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
				3215	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
				3216	.driver_data = NVME_QUIRK_SINGLE_VECTOR \|
				3217	NVME_QUIRK_128_BYTES_SQES \|
				3218	NVME_QUIRK_SHARED_TAGS },
				3219
				3220	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
				3221	{ 0, }
				3222	};
				3223	MODULE_DEVICE_TABLE(pci, nvme_id_table);
				3224
				3225	static struct pci_driver nvme_driver = {
				3226	.name = "nvme",
				3227	.id_table = nvme_id_table,
				3228	.probe = nvme_probe,
				3229	.remove = nvme_remove,
				3230	.shutdown = nvme_shutdown,
				3231	#ifdef CONFIG_PM_SLEEP
				3232	.driver = {
				3233	.pm = &nvme_dev_pm_ops,
				3234	},
				3235	#endif
				3236	.sriov_configure = pci_sriov_configure_simple,
				3237	.err_handler = &nvme_err_handler,
				3238	};
				3239
				3240	static int __init nvme_init(void)
				3241	{
				3242	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
				3243	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
				3244	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
				3245	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
				3246	return pci_register_driver(&nvme_driver);
				3247	}
				3248
				3249	static void __exit nvme_exit(void)
				3250	{
				3251	pci_unregister_driver(&nvme_driver);
				3252	flush_workqueue(nvme_wq);
				3253	}
				3254
				3255	MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
				3256	MODULE_LICENSE("GPL");
				3257	MODULE_VERSION("1.0");
				3258	module_init(nvme_init);
				3259	module_exit(nvme_exit);