Blame - src/kernel/linux/v4.19/block/blk-mq.c - T800

blob: 684acaa96db7e11893b8ddd56b05de75e05ffaef [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Block multiqueue core code
				3	*
				4	* Copyright (C) 2013-2014 Jens Axboe
				5	* Copyright (C) 2013-2014 Christoph Hellwig
				6	*/
				7	#include <linux/kernel.h>
				8	#include <linux/module.h>
				9	#include <linux/backing-dev.h>
				10	#include <linux/bio.h>
				11	#include <linux/blkdev.h>
				12	#include <linux/kmemleak.h>
				13	#include <linux/mm.h>
				14	#include <linux/init.h>
				15	#include <linux/slab.h>
				16	#include <linux/workqueue.h>
				17	#include <linux/smp.h>
				18	#include <linux/llist.h>
				19	#include <linux/list_sort.h>
				20	#include <linux/cpu.h>
				21	#include <linux/cache.h>
				22	#include <linux/sched/sysctl.h>
				23	#include <linux/sched/topology.h>
				24	#include <linux/sched/signal.h>
				25	#include <linux/delay.h>
				26	#include <linux/crash_dump.h>
				27	#include <linux/prefetch.h>
				28
				29	#include <trace/events/block.h>
				30
				31	#include <linux/blk-mq.h>
				32	#include "blk.h"
				33	#include "blk-mq.h"
				34	#include "blk-mq-debugfs.h"
				35	#include "blk-mq-tag.h"
				36	#include "blk-stat.h"
				37	#include "blk-mq-sched.h"
				38	#include "blk-rq-qos.h"
				39
				40	static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
				41	static void blk_mq_poll_stats_start(struct request_queue *q);
				42	static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
				43
				44	static int blk_mq_poll_stats_bkt(const struct request *rq)
				45	{
				46	int ddir, bytes, bucket;
				47
				48	ddir = rq_data_dir(rq);
				49	bytes = blk_rq_bytes(rq);
				50
				51	bucket = ddir + 2*(ilog2(bytes) - 9);
				52
				53	if (bucket < 0)
				54	return -1;
				55	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
				56	return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
				57
				58	return bucket;
				59	}
				60
				61	/*
				62	* Check if any of the ctx's have pending work in this hardware queue
				63	*/
				64	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				65	{
				66	return !list_empty_careful(&hctx->dispatch) \|\|
				67	sbitmap_any_bit_set(&hctx->ctx_map) \|\|
				68	blk_mq_sched_has_work(hctx);
				69	}
				70
				71	/*
				72	* Mark this ctx as having pending work in this hardware queue
				73	*/
				74	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				75	struct blk_mq_ctx *ctx)
				76	{
				77	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
				78	sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
				79	}
				80
				81	static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				82	struct blk_mq_ctx *ctx)
				83	{
				84	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
				85	}
				86
				87	struct mq_inflight {
				88	struct hd_struct *part;
				89	unsigned int *inflight;
				90	};
				91
				92	static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				93	struct request rq, void priv,
				94	bool reserved)
				95	{
				96	struct mq_inflight *mi = priv;
				97
				98	/*
				99	* index[0] counts the specific partition that was asked for. index[1]
				100	* counts the ones that are active on the whole device, so increment
				101	* that if mi->part is indeed a partition, and not a whole device.
				102	*/
				103	if (rq->part == mi->part)
				104	mi->inflight[0]++;
				105	if (mi->part->partno)
				106	mi->inflight[1]++;
				107	}
				108
				109	void blk_mq_in_flight(struct request_queue q, struct hd_struct part,
				110	unsigned int inflight[2])
				111	{
				112	struct mq_inflight mi = { .part = part, .inflight = inflight, };
				113
				114	inflight[0] = inflight[1] = 0;
				115	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
				116	}
				117
				118	static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				119	struct request rq, void priv,
				120	bool reserved)
				121	{
				122	struct mq_inflight *mi = priv;
				123
				124	if (rq->part == mi->part)
				125	mi->inflight[rq_data_dir(rq)]++;
				126	}
				127
				128	void blk_mq_in_flight_rw(struct request_queue q, struct hd_struct part,
				129	unsigned int inflight[2])
				130	{
				131	struct mq_inflight mi = { .part = part, .inflight = inflight, };
				132
				133	inflight[0] = inflight[1] = 0;
				134	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
				135	}
				136
				137	void blk_freeze_queue_start(struct request_queue *q)
				138	{
				139	int freeze_depth;
				140
				141	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
				142	if (freeze_depth == 1) {
				143	percpu_ref_kill(&q->q_usage_counter);
				144	if (q->mq_ops)
				145	blk_mq_run_hw_queues(q, false);
				146	}
				147	}
				148	EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
				149
				150	void blk_mq_freeze_queue_wait(struct request_queue *q)
				151	{
				152	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
				153	}
				154	EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
				155
				156	int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				157	unsigned long timeout)
				158	{
				159	return wait_event_timeout(q->mq_freeze_wq,
				160	percpu_ref_is_zero(&q->q_usage_counter),
				161	timeout);
				162	}
				163	EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
				164
				165	/*
				166	* Guarantee no request is in use, so we can change any data structure of
				167	* the queue afterward.
				168	*/
				169	void blk_freeze_queue(struct request_queue *q)
				170	{
				171	/*
				172	* In the !blk_mq case we are only calling this to kill the
				173	* q_usage_counter, otherwise this increases the freeze depth
				174	* and waits for it to return to zero. For this reason there is
				175	* no blk_unfreeze_queue(), and blk_freeze_queue() is not
				176	* exported to drivers as the only user for unfreeze is blk_mq.
				177	*/
				178	blk_freeze_queue_start(q);
				179	if (!q->mq_ops)
				180	blk_drain_queue(q);
				181	blk_mq_freeze_queue_wait(q);
				182	}
				183
				184	void blk_mq_freeze_queue(struct request_queue *q)
				185	{
				186	/*
				187	* ...just an alias to keep freeze and unfreeze actions balanced
				188	* in the blk_mq_* namespace
				189	*/
				190	blk_freeze_queue(q);
				191	}
				192	EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
				193
				194	void blk_mq_unfreeze_queue(struct request_queue *q)
				195	{
				196	int freeze_depth;
				197
				198	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
				199	WARN_ON_ONCE(freeze_depth < 0);
				200	if (!freeze_depth) {
				201	percpu_ref_reinit(&q->q_usage_counter);
				202	wake_up_all(&q->mq_freeze_wq);
				203	}
				204	}
				205	EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
				206
				207	/*
				208	* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
				209	* mpt3sas driver such that this function can be removed.
				210	*/
				211	void blk_mq_quiesce_queue_nowait(struct request_queue *q)
				212	{
				213	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
				214	}
				215	EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
				216
				217	/**
				218	* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
				219	* @q: request queue.
				220	*
				221	* Note: this function does not prevent that the struct request end_io()
				222	* callback function is invoked. Once this function is returned, we make
				223	* sure no dispatch can happen until the queue is unquiesced via
				224	* blk_mq_unquiesce_queue().
				225	*/
				226	void blk_mq_quiesce_queue(struct request_queue *q)
				227	{
				228	struct blk_mq_hw_ctx *hctx;
				229	unsigned int i;
				230	bool rcu = false;
				231
				232	blk_mq_quiesce_queue_nowait(q);
				233
				234	queue_for_each_hw_ctx(q, hctx, i) {
				235	if (hctx->flags & BLK_MQ_F_BLOCKING)
				236	synchronize_srcu(hctx->srcu);
				237	else
				238	rcu = true;
				239	}
				240	if (rcu)
				241	synchronize_rcu();
				242	}
				243	EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
				244
				245	/*
				246	* blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
				247	* @q: request queue.
				248	*
				249	* This function recovers queue into the state before quiescing
				250	* which is done by blk_mq_quiesce_queue.
				251	*/
				252	void blk_mq_unquiesce_queue(struct request_queue *q)
				253	{
				254	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
				255
				256	/* dispatch requests which are inserted during quiescing */
				257	blk_mq_run_hw_queues(q, true);
				258	}
				259	EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
				260
				261	void blk_mq_wake_waiters(struct request_queue *q)
				262	{
				263	struct blk_mq_hw_ctx *hctx;
				264	unsigned int i;
				265
				266	queue_for_each_hw_ctx(q, hctx, i)
				267	if (blk_mq_hw_queue_mapped(hctx))
				268	blk_mq_tag_wakeup_all(hctx->tags, true);
				269	}
				270
				271	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				272	{
				273	return blk_mq_has_free_tags(hctx->tags);
				274	}
				275	EXPORT_SYMBOL(blk_mq_can_queue);
				276
				277	static struct request blk_mq_rq_ctx_init(struct blk_mq_alloc_data data,
				278	unsigned int tag, unsigned int op)
				279	{
				280	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
				281	struct request *rq = tags->static_rqs[tag];
				282	req_flags_t rq_flags = 0;
				283
				284	if (data->flags & BLK_MQ_REQ_INTERNAL) {
				285	rq->tag = -1;
				286	rq->internal_tag = tag;
				287	} else {
				288	if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
				289	rq_flags = RQF_MQ_INFLIGHT;
				290	atomic_inc(&data->hctx->nr_active);
				291	}
				292	rq->tag = tag;
				293	rq->internal_tag = -1;
				294	data->hctx->tags->rqs[rq->tag] = rq;
				295	}
				296
				297	/* csd/requeue_work/fifo_time is initialized before use */
				298	rq->q = data->q;
				299	rq->mq_ctx = data->ctx;
				300	rq->rq_flags = rq_flags;
				301	rq->cpu = -1;
				302	rq->cmd_flags = op;
				303	if (data->flags & BLK_MQ_REQ_PREEMPT)
				304	rq->rq_flags \|= RQF_PREEMPT;
				305	if (blk_queue_io_stat(data->q))
				306	rq->rq_flags \|= RQF_IO_STAT;
				307	INIT_LIST_HEAD(&rq->queuelist);
				308	INIT_HLIST_NODE(&rq->hash);
				309	RB_CLEAR_NODE(&rq->rb_node);
				310	rq->rq_disk = NULL;
				311	rq->part = NULL;
				312	rq->start_time_ns = ktime_get_ns();
				313	rq->io_start_time_ns = 0;
				314	rq->nr_phys_segments = 0;
				315	#if defined(CONFIG_BLK_DEV_INTEGRITY)
				316	rq->nr_integrity_segments = 0;
				317	#endif
				318	rq->special = NULL;
				319	/* tag was already set */
				320	rq->extra_len = 0;
				321	rq->__deadline = 0;
				322
				323	INIT_LIST_HEAD(&rq->timeout_list);
				324	rq->timeout = 0;
				325
				326	rq->end_io = NULL;
				327	rq->end_io_data = NULL;
				328	rq->next_rq = NULL;
				329
				330	#ifdef CONFIG_BLK_CGROUP
				331	rq->rl = NULL;
				332	#endif
				333
				334	data->ctx->rq_dispatched[op_is_sync(op)]++;
				335	refcount_set(&rq->ref, 1);
				336	return rq;
				337	}
				338
				339	static struct request blk_mq_get_request(struct request_queue q,
				340	struct bio *bio, unsigned int op,
				341	struct blk_mq_alloc_data *data)
				342	{
				343	struct elevator_queue *e = q->elevator;
				344	struct request *rq;
				345	unsigned int tag;
				346	bool put_ctx_on_error = false;
				347
				348	blk_queue_enter_live(q);
				349	data->q = q;
				350	if (likely(!data->ctx)) {
				351	data->ctx = blk_mq_get_ctx(q);
				352	put_ctx_on_error = true;
				353	}
				354	if (likely(!data->hctx))
				355	data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
				356	if (op & REQ_NOWAIT)
				357	data->flags \|= BLK_MQ_REQ_NOWAIT;
				358
				359	if (e) {
				360	data->flags \|= BLK_MQ_REQ_INTERNAL;
				361
				362	/*
				363	* Flush requests are special and go directly to the
				364	* dispatch list. Don't include reserved tags in the
				365	* limiting, as it isn't useful.
				366	*/
				367	if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
				368	!(data->flags & BLK_MQ_REQ_RESERVED))
				369	e->type->ops.mq.limit_depth(op, data);
				370	} else {
				371	blk_mq_tag_busy(data->hctx);
				372	}
				373
				374	tag = blk_mq_get_tag(data);
				375	if (tag == BLK_MQ_TAG_FAIL) {
				376	if (put_ctx_on_error) {
				377	blk_mq_put_ctx(data->ctx);
				378	data->ctx = NULL;
				379	}
				380	blk_queue_exit(q);
				381	return NULL;
				382	}
				383
				384	rq = blk_mq_rq_ctx_init(data, tag, op);
				385	if (!op_is_flush(op)) {
				386	rq->elv.icq = NULL;
				387	if (e && e->type->ops.mq.prepare_request) {
				388	if (e->type->icq_cache && rq_ioc(bio))
				389	blk_mq_sched_assign_ioc(rq, bio);
				390
				391	e->type->ops.mq.prepare_request(rq, bio);
				392	rq->rq_flags \|= RQF_ELVPRIV;
				393	}
				394	}
				395	data->hctx->queued++;
				396	return rq;
				397	}
				398
				399	struct request blk_mq_alloc_request(struct request_queue q, unsigned int op,
				400	blk_mq_req_flags_t flags)
				401	{
				402	struct blk_mq_alloc_data alloc_data = { .flags = flags };
				403	struct request *rq;
				404	int ret;
				405
				406	ret = blk_queue_enter(q, flags);
				407	if (ret)
				408	return ERR_PTR(ret);
				409
				410	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
				411	blk_queue_exit(q);
				412
				413	if (!rq)
				414	return ERR_PTR(-EWOULDBLOCK);
				415
				416	blk_mq_put_ctx(alloc_data.ctx);
				417
				418	rq->__data_len = 0;
				419	rq->__sector = (sector_t) -1;
				420	rq->bio = rq->biotail = NULL;
				421	return rq;
				422	}
				423	EXPORT_SYMBOL(blk_mq_alloc_request);
				424
				425	struct request blk_mq_alloc_request_hctx(struct request_queue q,
				426	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
				427	{
				428	struct blk_mq_alloc_data alloc_data = { .flags = flags };
				429	struct request *rq;
				430	unsigned int cpu;
				431	int ret;
				432
				433	/*
				434	* If the tag allocator sleeps we could get an allocation for a
				435	* different hardware context. No need to complicate the low level
				436	* allocator for this for the rare use case of a command tied to
				437	* a specific queue.
				438	*/
				439	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
				440	return ERR_PTR(-EINVAL);
				441
				442	if (hctx_idx >= q->nr_hw_queues)
				443	return ERR_PTR(-EIO);
				444
				445	ret = blk_queue_enter(q, flags);
				446	if (ret)
				447	return ERR_PTR(ret);
				448
				449	/*
				450	* Check if the hardware context is actually mapped to anything.
				451	* If not tell the caller that it should skip this queue.
				452	*/
				453	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
				454	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
				455	blk_queue_exit(q);
				456	return ERR_PTR(-EXDEV);
				457	}
				458	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
				459	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
				460
				461	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
				462	blk_queue_exit(q);
				463
				464	if (!rq)
				465	return ERR_PTR(-EWOULDBLOCK);
				466
				467	return rq;
				468	}
				469	EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
				470
				471	static void __blk_mq_free_request(struct request *rq)
				472	{
				473	struct request_queue *q = rq->q;
				474	struct blk_mq_ctx *ctx = rq->mq_ctx;
				475	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
				476	const int sched_tag = rq->internal_tag;
				477
				478	if (rq->tag != -1)
				479	blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
				480	if (sched_tag != -1)
				481	blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
				482	blk_mq_sched_restart(hctx);
				483	blk_queue_exit(q);
				484	}
				485
				486	void blk_mq_free_request(struct request *rq)
				487	{
				488	struct request_queue *q = rq->q;
				489	struct elevator_queue *e = q->elevator;
				490	struct blk_mq_ctx *ctx = rq->mq_ctx;
				491	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
				492
				493	if (rq->rq_flags & RQF_ELVPRIV) {
				494	if (e && e->type->ops.mq.finish_request)
				495	e->type->ops.mq.finish_request(rq);
				496	if (rq->elv.icq) {
				497	put_io_context(rq->elv.icq->ioc);
				498	rq->elv.icq = NULL;
				499	}
				500	}
				501
				502	ctx->rq_completed[rq_is_sync(rq)]++;
				503	if (rq->rq_flags & RQF_MQ_INFLIGHT)
				504	atomic_dec(&hctx->nr_active);
				505
				506	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
				507	laptop_io_completion(q->backing_dev_info);
				508
				509	rq_qos_done(q, rq);
				510
				511	if (blk_rq_rl(rq))
				512	blk_put_rl(blk_rq_rl(rq));
				513
				514	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
				515	if (refcount_dec_and_test(&rq->ref))
				516	__blk_mq_free_request(rq);
				517	}
				518	EXPORT_SYMBOL_GPL(blk_mq_free_request);
				519
				520	inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
				521	{
				522	u64 now = ktime_get_ns();
				523
				524	if (rq->rq_flags & RQF_STATS) {
				525	blk_mq_poll_stats_start(rq->q);
				526	blk_stat_add(rq, now);
				527	}
				528
				529	blk_account_io_done(rq, now);
				530
				531	if (rq->end_io) {
				532	rq_qos_done(rq->q, rq);
				533	rq->end_io(rq, error);
				534	} else {
				535	if (unlikely(blk_bidi_rq(rq)))
				536	blk_mq_free_request(rq->next_rq);
				537	blk_mq_free_request(rq);
				538	}
				539	}
				540	EXPORT_SYMBOL(__blk_mq_end_request);
				541
				542	void blk_mq_end_request(struct request *rq, blk_status_t error)
				543	{
				544	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
				545	BUG();
				546	__blk_mq_end_request(rq, error);
				547	}
				548	EXPORT_SYMBOL(blk_mq_end_request);
				549
				550	static void __blk_mq_complete_request_remote(void *data)
				551	{
				552	struct request *rq = data;
				553
				554	rq->q->softirq_done_fn(rq);
				555	}
				556
				557	static void __blk_mq_complete_request(struct request *rq)
				558	{
				559	struct blk_mq_ctx *ctx = rq->mq_ctx;
				560	bool shared = false;
				561	int cpu;
				562
				563	if (!blk_mq_mark_complete(rq))
				564	return;
				565	if (rq->internal_tag != -1)
				566	blk_mq_sched_completed_request(rq);
				567
				568	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
				569	rq->q->softirq_done_fn(rq);
				570	return;
				571	}
				572
				573	cpu = get_cpu();
				574	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
				575	shared = cpus_share_cache(cpu, ctx->cpu);
				576
				577	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
				578	rq->csd.func = __blk_mq_complete_request_remote;
				579	rq->csd.info = rq;
				580	rq->csd.flags = 0;
				581	smp_call_function_single_async(ctx->cpu, &rq->csd);
				582	} else {
				583	rq->q->softirq_done_fn(rq);
				584	}
				585	put_cpu();
				586	}
				587
				588	static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
				589	__releases(hctx->srcu)
				590	{
				591	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
				592	rcu_read_unlock();
				593	else
				594	srcu_read_unlock(hctx->srcu, srcu_idx);
				595	}
				596
				597	static void hctx_lock(struct blk_mq_hw_ctx hctx, int srcu_idx)
				598	__acquires(hctx->srcu)
				599	{
				600	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
				601	/* shut up gcc false positive */
				602	*srcu_idx = 0;
				603	rcu_read_lock();
				604	} else
				605	*srcu_idx = srcu_read_lock(hctx->srcu);
				606	}
				607
				608	/**
				609	* blk_mq_complete_request - end I/O on a request
				610	* @rq: the request being processed
				611	*
				612	* Description:
				613	* Ends all I/O on a request. It does not handle partial completions.
				614	* The actual completion happens out-of-order, through a IPI handler.
				615	**/
				616	void blk_mq_complete_request(struct request *rq)
				617	{
				618	if (unlikely(blk_should_fake_timeout(rq->q)))
				619	return;
				620	__blk_mq_complete_request(rq);
				621	}
				622	EXPORT_SYMBOL(blk_mq_complete_request);
				623
				624	int blk_mq_request_started(struct request *rq)
				625	{
				626	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
				627	}
				628	EXPORT_SYMBOL_GPL(blk_mq_request_started);
				629
				630	void blk_mq_start_request(struct request *rq)
				631	{
				632	struct request_queue *q = rq->q;
				633
				634	blk_mq_sched_started_request(rq);
				635
				636	trace_block_rq_issue(q, rq);
				637
				638	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
				639	rq->io_start_time_ns = ktime_get_ns();
				640	#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
				641	rq->throtl_size = blk_rq_sectors(rq);
				642	#endif
				643	rq->rq_flags \|= RQF_STATS;
				644	rq_qos_issue(q, rq);
				645	}
				646
				647	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
				648
				649	blk_add_timer(rq);
				650	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
				651
				652	if (q->dma_drain_size && blk_rq_bytes(rq)) {
				653	/*
				654	* Make sure space for the drain appears. We know we can do
				655	* this because max_hw_segments has been adjusted to be one
				656	* fewer than the device can handle.
				657	*/
				658	rq->nr_phys_segments++;
				659	}
				660	}
				661	EXPORT_SYMBOL(blk_mq_start_request);
				662
				663	static void __blk_mq_requeue_request(struct request *rq)
				664	{
				665	struct request_queue *q = rq->q;
				666
				667	blk_mq_put_driver_tag(rq);
				668
				669	trace_block_rq_requeue(q, rq);
				670	rq_qos_requeue(q, rq);
				671
				672	if (blk_mq_request_started(rq)) {
				673	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
				674	rq->rq_flags &= ~RQF_TIMED_OUT;
				675	if (q->dma_drain_size && blk_rq_bytes(rq))
				676	rq->nr_phys_segments--;
				677	}
				678	}
				679
				680	void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
				681	{
				682	__blk_mq_requeue_request(rq);
				683
				684	/* this request will be re-inserted to io scheduler queue */
				685	blk_mq_sched_requeue_request(rq);
				686
				687	BUG_ON(blk_queued_rq(rq));
				688	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
				689	}
				690	EXPORT_SYMBOL(blk_mq_requeue_request);
				691
				692	static void blk_mq_requeue_work(struct work_struct *work)
				693	{
				694	struct request_queue *q =
				695	container_of(work, struct request_queue, requeue_work.work);
				696	LIST_HEAD(rq_list);
				697	struct request rq, next;
				698
				699	spin_lock_irq(&q->requeue_lock);
				700	list_splice_init(&q->requeue_list, &rq_list);
				701	spin_unlock_irq(&q->requeue_lock);
				702
				703	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
				704	if (!(rq->rq_flags & (RQF_SOFTBARRIER \| RQF_DONTPREP)))
				705	continue;
				706
				707	rq->rq_flags &= ~RQF_SOFTBARRIER;
				708	list_del_init(&rq->queuelist);
				709	/*
				710	* If RQF_DONTPREP, rq has contained some driver specific
				711	* data, so insert it to hctx dispatch list to avoid any
				712	* merge.
				713	*/
				714	if (rq->rq_flags & RQF_DONTPREP)
				715	blk_mq_request_bypass_insert(rq, false);
				716	else
				717	blk_mq_sched_insert_request(rq, true, false, false);
				718	}
				719
				720	while (!list_empty(&rq_list)) {
				721	rq = list_entry(rq_list.next, struct request, queuelist);
				722	list_del_init(&rq->queuelist);
				723	blk_mq_sched_insert_request(rq, false, false, false);
				724	}
				725
				726	blk_mq_run_hw_queues(q, false);
				727	}
				728
				729	void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				730	bool kick_requeue_list)
				731	{
				732	struct request_queue *q = rq->q;
				733	unsigned long flags;
				734
				735	/*
				736	* We abuse this flag that is otherwise used by the I/O scheduler to
				737	* request head insertion from the workqueue.
				738	*/
				739	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
				740
				741	spin_lock_irqsave(&q->requeue_lock, flags);
				742	if (at_head) {
				743	rq->rq_flags \|= RQF_SOFTBARRIER;
				744	list_add(&rq->queuelist, &q->requeue_list);
				745	} else {
				746	list_add_tail(&rq->queuelist, &q->requeue_list);
				747	}
				748	spin_unlock_irqrestore(&q->requeue_lock, flags);
				749
				750	if (kick_requeue_list)
				751	blk_mq_kick_requeue_list(q);
				752	}
				753	EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
				754
				755	void blk_mq_kick_requeue_list(struct request_queue *q)
				756	{
				757	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
				758	}
				759	EXPORT_SYMBOL(blk_mq_kick_requeue_list);
				760
				761	void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				762	unsigned long msecs)
				763	{
				764	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				765	msecs_to_jiffies(msecs));
				766	}
				767	EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
				768
				769	struct request blk_mq_tag_to_rq(struct blk_mq_tags tags, unsigned int tag)
				770	{
				771	if (tag < tags->nr_tags) {
				772	prefetch(tags->rqs[tag]);
				773	return tags->rqs[tag];
				774	}
				775
				776	return NULL;
				777	}
				778	EXPORT_SYMBOL(blk_mq_tag_to_rq);
				779
				780	static void blk_mq_rq_timed_out(struct request *req, bool reserved)
				781	{
				782	req->rq_flags \|= RQF_TIMED_OUT;
				783	if (req->q->mq_ops->timeout) {
				784	enum blk_eh_timer_return ret;
				785
				786	ret = req->q->mq_ops->timeout(req, reserved);
				787	if (ret == BLK_EH_DONE)
				788	return;
				789	WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
				790	}
				791
				792	blk_add_timer(req);
				793	}
				794
				795	static bool blk_mq_req_expired(struct request rq, unsigned long next)
				796	{
				797	unsigned long deadline;
				798
				799	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
				800	return false;
				801	if (rq->rq_flags & RQF_TIMED_OUT)
				802	return false;
				803
				804	deadline = blk_rq_deadline(rq);
				805	if (time_after_eq(jiffies, deadline))
				806	return true;
				807
				808	if (*next == 0)
				809	*next = deadline;
				810	else if (time_after(*next, deadline))
				811	*next = deadline;
				812	return false;
				813	}
				814
				815	static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
				816	struct request rq, void priv, bool reserved)
				817	{
				818	unsigned long *next = priv;
				819
				820	/*
				821	* Just do a quick check if it is expired before locking the request in
				822	* so we're not unnecessarilly synchronizing across CPUs.
				823	*/
				824	if (!blk_mq_req_expired(rq, next))
				825	return;
				826
				827	/*
				828	* We have reason to believe the request may be expired. Take a
				829	* reference on the request to lock this request lifetime into its
				830	* currently allocated context to prevent it from being reallocated in
				831	* the event the completion by-passes this timeout handler.
				832	*
				833	* If the reference was already released, then the driver beat the
				834	* timeout handler to posting a natural completion.
				835	*/
				836	if (!refcount_inc_not_zero(&rq->ref))
				837	return;
				838
				839	/*
				840	* The request is now locked and cannot be reallocated underneath the
				841	* timeout handler's processing. Re-verify this exact request is truly
				842	* expired; if it is not expired, then the request was completed and
				843	* reallocated as a new request.
				844	*/
				845	if (blk_mq_req_expired(rq, next))
				846	blk_mq_rq_timed_out(rq, reserved);
				847
				848	if (is_flush_rq(rq, hctx))
				849	rq->end_io(rq, 0);
				850	else if (refcount_dec_and_test(&rq->ref))
				851	__blk_mq_free_request(rq);
				852	}
				853
				854	static void blk_mq_timeout_work(struct work_struct *work)
				855	{
				856	struct request_queue *q =
				857	container_of(work, struct request_queue, timeout_work);
				858	unsigned long next = 0;
				859	struct blk_mq_hw_ctx *hctx;
				860	int i;
				861
				862	/* A deadlock might occur if a request is stuck requiring a
				863	* timeout at the same time a queue freeze is waiting
				864	* completion, since the timeout code would not be able to
				865	* acquire the queue reference here.
				866	*
				867	* That's why we don't use blk_queue_enter here; instead, we use
				868	* percpu_ref_tryget directly, because we need to be able to
				869	* obtain a reference even in the short window between the queue
				870	* starting to freeze, by dropping the first reference in
				871	* blk_freeze_queue_start, and the moment the last request is
				872	* consumed, marked by the instant q_usage_counter reaches
				873	* zero.
				874	*/
				875	if (!percpu_ref_tryget(&q->q_usage_counter))
				876	return;
				877
				878	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
				879
				880	if (next != 0) {
				881	mod_timer(&q->timeout, next);
				882	} else {
				883	/*
				884	* Request timeouts are handled as a forward rolling timer. If
				885	* we end up here it means that no requests are pending and
				886	* also that no request has been pending for a while. Mark
				887	* each hctx as idle.
				888	*/
				889	queue_for_each_hw_ctx(q, hctx, i) {
				890	/* the hctx may be unmapped, so check it here */
				891	if (blk_mq_hw_queue_mapped(hctx))
				892	blk_mq_tag_idle(hctx);
				893	}
				894	}
				895	blk_queue_exit(q);
				896	}
				897
				898	struct flush_busy_ctx_data {
				899	struct blk_mq_hw_ctx *hctx;
				900	struct list_head *list;
				901	};
				902
				903	static bool flush_busy_ctx(struct sbitmap sb, unsigned int bitnr, void data)
				904	{
				905	struct flush_busy_ctx_data *flush_data = data;
				906	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
				907	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
				908
				909	spin_lock(&ctx->lock);
				910	list_splice_tail_init(&ctx->rq_list, flush_data->list);
				911	sbitmap_clear_bit(sb, bitnr);
				912	spin_unlock(&ctx->lock);
				913	return true;
				914	}
				915
				916	/*
				917	* Process software queues that have been marked busy, splicing them
				918	* to the for-dispatch
				919	*/
				920	void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx hctx, struct list_head list)
				921	{
				922	struct flush_busy_ctx_data data = {
				923	.hctx = hctx,
				924	.list = list,
				925	};
				926
				927	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
				928	}
				929	EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
				930
				931	struct dispatch_rq_data {
				932	struct blk_mq_hw_ctx *hctx;
				933	struct request *rq;
				934	};
				935
				936	static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
				937	void *data)
				938	{
				939	struct dispatch_rq_data *dispatch_data = data;
				940	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
				941	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
				942
				943	spin_lock(&ctx->lock);
				944	if (!list_empty(&ctx->rq_list)) {
				945	dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
				946	list_del_init(&dispatch_data->rq->queuelist);
				947	if (list_empty(&ctx->rq_list))
				948	sbitmap_clear_bit(sb, bitnr);
				949	}
				950	spin_unlock(&ctx->lock);
				951
				952	return !dispatch_data->rq;
				953	}
				954
				955	struct request blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx hctx,
				956	struct blk_mq_ctx *start)
				957	{
				958	unsigned off = start ? start->index_hw : 0;
				959	struct dispatch_rq_data data = {
				960	.hctx = hctx,
				961	.rq = NULL,
				962	};
				963
				964	__sbitmap_for_each_set(&hctx->ctx_map, off,
				965	dispatch_rq_from_ctx, &data);
				966
				967	return data.rq;
				968	}
				969
				970	static inline unsigned int queued_to_index(unsigned int queued)
				971	{
				972	if (!queued)
				973	return 0;
				974
				975	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
				976	}
				977
				978	bool blk_mq_get_driver_tag(struct request *rq)
				979	{
				980	struct blk_mq_alloc_data data = {
				981	.q = rq->q,
				982	.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
				983	.flags = BLK_MQ_REQ_NOWAIT,
				984	};
				985	bool shared;
				986
				987	if (rq->tag != -1)
				988	goto done;
				989
				990	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
				991	data.flags \|= BLK_MQ_REQ_RESERVED;
				992
				993	shared = blk_mq_tag_busy(data.hctx);
				994	rq->tag = blk_mq_get_tag(&data);
				995	if (rq->tag >= 0) {
				996	if (shared) {
				997	rq->rq_flags \|= RQF_MQ_INFLIGHT;
				998	atomic_inc(&data.hctx->nr_active);
				999	}
				1000	data.hctx->tags->rqs[rq->tag] = rq;
				1001	}
				1002
				1003	done:
				1004	return rq->tag != -1;
				1005	}
				1006
				1007	static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
				1008	int flags, void *key)
				1009	{
				1010	struct blk_mq_hw_ctx *hctx;
				1011
				1012	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
				1013
				1014	spin_lock(&hctx->dispatch_wait_lock);
				1015	list_del_init(&wait->entry);
				1016	spin_unlock(&hctx->dispatch_wait_lock);
				1017
				1018	blk_mq_run_hw_queue(hctx, true);
				1019	return 1;
				1020	}
				1021
				1022	/*
				1023	* Mark us waiting for a tag. For shared tags, this involves hooking us into
				1024	* the tag wakeups. For non-shared tags, we can simply mark us needing a
				1025	* restart. For both cases, take care to check the condition again after
				1026	* marking us as waiting.
				1027	*/
				1028	static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
				1029	struct request *rq)
				1030	{
				1031	struct wait_queue_head *wq;
				1032	wait_queue_entry_t *wait;
				1033	bool ret;
				1034
				1035	if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
				1036	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
				1037	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
				1038
				1039	/*
				1040	* It's possible that a tag was freed in the window between the
				1041	* allocation failure and adding the hardware queue to the wait
				1042	* queue.
				1043	*
				1044	* Don't clear RESTART here, someone else could have set it.
				1045	* At most this will cost an extra queue run.
				1046	*/
				1047	return blk_mq_get_driver_tag(rq);
				1048	}
				1049
				1050	wait = &hctx->dispatch_wait;
				1051	if (!list_empty_careful(&wait->entry))
				1052	return false;
				1053
				1054	wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
				1055
				1056	spin_lock_irq(&wq->lock);
				1057	spin_lock(&hctx->dispatch_wait_lock);
				1058	if (!list_empty(&wait->entry)) {
				1059	spin_unlock(&hctx->dispatch_wait_lock);
				1060	spin_unlock_irq(&wq->lock);
				1061	return false;
				1062	}
				1063
				1064	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
				1065	__add_wait_queue(wq, wait);
				1066
				1067	/*
				1068	* It's possible that a tag was freed in the window between the
				1069	* allocation failure and adding the hardware queue to the wait
				1070	* queue.
				1071	*/
				1072	ret = blk_mq_get_driver_tag(rq);
				1073	if (!ret) {
				1074	spin_unlock(&hctx->dispatch_wait_lock);
				1075	spin_unlock_irq(&wq->lock);
				1076	return false;
				1077	}
				1078
				1079	/*
				1080	* We got a tag, remove ourselves from the wait queue to ensure
				1081	* someone else gets the wakeup.
				1082	*/
				1083	list_del_init(&wait->entry);
				1084	spin_unlock(&hctx->dispatch_wait_lock);
				1085	spin_unlock_irq(&wq->lock);
				1086
				1087	return true;
				1088	}
				1089
				1090	#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
				1091	#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
				1092	/*
				1093	* Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
				1094	* - EWMA is one simple way to compute running average value
				1095	* - weight(7/8 and 1/8) is applied so that it can decrease exponentially
				1096	* - take 4 as factor for avoiding to get too small(0) result, and this
				1097	* factor doesn't matter because EWMA decreases exponentially
				1098	*/
				1099	static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
				1100	{
				1101	unsigned int ewma;
				1102
				1103	if (hctx->queue->elevator)
				1104	return;
				1105
				1106	ewma = hctx->dispatch_busy;
				1107
				1108	if (!ewma && !busy)
				1109	return;
				1110
				1111	ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
				1112	if (busy)
				1113	ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
				1114	ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
				1115
				1116	hctx->dispatch_busy = ewma;
				1117	}
				1118
				1119	#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
				1120
				1121	/*
				1122	* Returns true if we did some work AND can potentially do more.
				1123	*/
				1124	bool blk_mq_dispatch_rq_list(struct request_queue q, struct list_head list,
				1125	bool got_budget)
				1126	{
				1127	struct blk_mq_hw_ctx *hctx;
				1128	struct request rq, nxt;
				1129	bool no_tag = false;
				1130	int errors, queued;
				1131	blk_status_t ret = BLK_STS_OK;
				1132
				1133	if (list_empty(list))
				1134	return false;
				1135
				1136	WARN_ON(!list_is_singular(list) && got_budget);
				1137
				1138	/*
				1139	* Now process all the entries, sending them to the driver.
				1140	*/
				1141	errors = queued = 0;
				1142	do {
				1143	struct blk_mq_queue_data bd;
				1144
				1145	rq = list_first_entry(list, struct request, queuelist);
				1146
				1147	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
				1148	if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
				1149	break;
				1150
				1151	if (!blk_mq_get_driver_tag(rq)) {
				1152	/*
				1153	* The initial allocation attempt failed, so we need to
				1154	* rerun the hardware queue when a tag is freed. The
				1155	* waitqueue takes care of that. If the queue is run
				1156	* before we add this entry back on the dispatch list,
				1157	* we'll re-run it below.
				1158	*/
				1159	if (!blk_mq_mark_tag_wait(hctx, rq)) {
				1160	blk_mq_put_dispatch_budget(hctx);
				1161	/*
				1162	* For non-shared tags, the RESTART check
				1163	* will suffice.
				1164	*/
				1165	if (hctx->flags & BLK_MQ_F_TAG_SHARED)
				1166	no_tag = true;
				1167	break;
				1168	}
				1169	}
				1170
				1171	list_del_init(&rq->queuelist);
				1172
				1173	bd.rq = rq;
				1174
				1175	/*
				1176	* Flag last if we have no more requests, or if we have more
				1177	* but can't assign a driver tag to it.
				1178	*/
				1179	if (list_empty(list))
				1180	bd.last = true;
				1181	else {
				1182	nxt = list_first_entry(list, struct request, queuelist);
				1183	bd.last = !blk_mq_get_driver_tag(nxt);
				1184	}
				1185
				1186	ret = q->mq_ops->queue_rq(hctx, &bd);
				1187	if (ret == BLK_STS_RESOURCE \|\| ret == BLK_STS_DEV_RESOURCE) {
				1188	/*
				1189	* If an I/O scheduler has been configured and we got a
				1190	* driver tag for the next request already, free it
				1191	* again.
				1192	*/
				1193	if (!list_empty(list)) {
				1194	nxt = list_first_entry(list, struct request, queuelist);
				1195	blk_mq_put_driver_tag(nxt);
				1196	}
				1197	list_add(&rq->queuelist, list);
				1198	__blk_mq_requeue_request(rq);
				1199	break;
				1200	}
				1201
				1202	if (unlikely(ret != BLK_STS_OK)) {
				1203	errors++;
				1204	blk_mq_end_request(rq, BLK_STS_IOERR);
				1205	continue;
				1206	}
				1207
				1208	queued++;
				1209	} while (!list_empty(list));
				1210
				1211	hctx->dispatched[queued_to_index(queued)]++;
				1212
				1213	/*
				1214	* Any items that need requeuing? Stuff them into hctx->dispatch,
				1215	* that is where we will continue on next queue run.
				1216	*/
				1217	if (!list_empty(list)) {
				1218	bool needs_restart;
				1219
				1220	spin_lock(&hctx->lock);
				1221	list_splice_init(list, &hctx->dispatch);
				1222	spin_unlock(&hctx->lock);
				1223
				1224	/*
				1225	* If SCHED_RESTART was set by the caller of this function and
				1226	* it is no longer set that means that it was cleared by another
				1227	* thread and hence that a queue rerun is needed.
				1228	*
				1229	* If 'no_tag' is set, that means that we failed getting
				1230	* a driver tag with an I/O scheduler attached. If our dispatch
				1231	* waitqueue is no longer active, ensure that we run the queue
				1232	* AFTER adding our entries back to the list.
				1233	*
				1234	* If no I/O scheduler has been configured it is possible that
				1235	* the hardware queue got stopped and restarted before requests
				1236	* were pushed back onto the dispatch list. Rerun the queue to
				1237	* avoid starvation. Notes:
				1238	* - blk_mq_run_hw_queue() checks whether or not a queue has
				1239	* been stopped before rerunning a queue.
				1240	* - Some but not all block drivers stop a queue before
				1241	* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
				1242	* and dm-rq.
				1243	*
				1244	* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
				1245	* bit is set, run queue after a delay to avoid IO stalls
				1246	* that could otherwise occur if the queue is idle.
				1247	*/
				1248	needs_restart = blk_mq_sched_needs_restart(hctx);
				1249	if (!needs_restart \|\|
				1250	(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
				1251	blk_mq_run_hw_queue(hctx, true);
				1252	else if (needs_restart && (ret == BLK_STS_RESOURCE))
				1253	blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
				1254
				1255	blk_mq_update_dispatch_busy(hctx, true);
				1256	return false;
				1257	} else
				1258	blk_mq_update_dispatch_busy(hctx, false);
				1259
				1260	/*
				1261	* If the host/device is unable to accept more work, inform the
				1262	* caller of that.
				1263	*/
				1264	if (ret == BLK_STS_RESOURCE \|\| ret == BLK_STS_DEV_RESOURCE)
				1265	return false;
				1266
				1267	return (queued + errors) != 0;
				1268	}
				1269
				1270	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				1271	{
				1272	int srcu_idx;
				1273
				1274	/*
				1275	* We should be running this queue from one of the CPUs that
				1276	* are mapped to it.
				1277	*
				1278	* There are at least two related races now between setting
				1279	* hctx->next_cpu from blk_mq_hctx_next_cpu() and running
				1280	* __blk_mq_run_hw_queue():
				1281	*
				1282	* - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
				1283	* but later it becomes online, then this warning is harmless
				1284	* at all
				1285	*
				1286	* - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
				1287	* but later it becomes offline, then the warning can't be
				1288	* triggered, and we depend on blk-mq timeout handler to
				1289	* handle dispatched requests to this hctx
				1290	*/
				1291	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
				1292	cpu_online(hctx->next_cpu)) {
				1293	printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
				1294	raw_smp_processor_id(),
				1295	cpumask_empty(hctx->cpumask) ? "inactive": "active");
				1296	dump_stack();
				1297	}
				1298
				1299	/*
				1300	* We can't run the queue inline with ints disabled. Ensure that
				1301	* we catch bad users of this early.
				1302	*/
				1303	WARN_ON_ONCE(in_interrupt());
				1304
				1305	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
				1306
				1307	hctx_lock(hctx, &srcu_idx);
				1308	blk_mq_sched_dispatch_requests(hctx);
				1309	hctx_unlock(hctx, srcu_idx);
				1310	}
				1311
				1312	static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
				1313	{
				1314	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
				1315
				1316	if (cpu >= nr_cpu_ids)
				1317	cpu = cpumask_first(hctx->cpumask);
				1318	return cpu;
				1319	}
				1320
				1321	/*
				1322	* It'd be great if the workqueue API had a way to pass
				1323	* in a mask and had some smarts for more clever placement.
				1324	* For now we just round-robin here, switching for every
				1325	* BLK_MQ_CPU_WORK_BATCH queued items.
				1326	*/
				1327	static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
				1328	{
				1329	bool tried = false;
				1330	int next_cpu = hctx->next_cpu;
				1331
				1332	if (hctx->queue->nr_hw_queues == 1)
				1333	return WORK_CPU_UNBOUND;
				1334
				1335	if (--hctx->next_cpu_batch <= 0) {
				1336	select_cpu:
				1337	next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				1338	cpu_online_mask);
				1339	if (next_cpu >= nr_cpu_ids)
				1340	next_cpu = blk_mq_first_mapped_cpu(hctx);
				1341	hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
				1342	}
				1343
				1344	/*
				1345	* Do unbound schedule if we can't find a online CPU for this hctx,
				1346	* and it should only happen in the path of handling CPU DEAD.
				1347	*/
				1348	if (!cpu_online(next_cpu)) {
				1349	if (!tried) {
				1350	tried = true;
				1351	goto select_cpu;
				1352	}
				1353
				1354	/*
				1355	* Make sure to re-select CPU next time once after CPUs
				1356	* in hctx->cpumask become online again.
				1357	*/
				1358	hctx->next_cpu = next_cpu;
				1359	hctx->next_cpu_batch = 1;
				1360	return WORK_CPU_UNBOUND;
				1361	}
				1362
				1363	hctx->next_cpu = next_cpu;
				1364	return next_cpu;
				1365	}
				1366
				1367	static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
				1368	unsigned long msecs)
				1369	{
				1370	if (unlikely(blk_mq_hctx_stopped(hctx)))
				1371	return;
				1372
				1373	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
				1374	int cpu = get_cpu();
				1375	if (cpumask_test_cpu(cpu, hctx->cpumask)) {
				1376	__blk_mq_run_hw_queue(hctx);
				1377	put_cpu();
				1378	return;
				1379	}
				1380
				1381	put_cpu();
				1382	}
				1383
				1384	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
				1385	msecs_to_jiffies(msecs));
				1386	}
				1387
				1388	void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
				1389	{
				1390	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
				1391	}
				1392	EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
				1393
				1394	bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				1395	{
				1396	int srcu_idx;
				1397	bool need_run;
				1398
				1399	/*
				1400	* When queue is quiesced, we may be switching io scheduler, or
				1401	* updating nr_hw_queues, or other things, and we can't run queue
				1402	* any more, even __blk_mq_hctx_has_pending() can't be called safely.
				1403	*
				1404	* And queue will be rerun in blk_mq_unquiesce_queue() if it is
				1405	* quiesced.
				1406	*/
				1407	hctx_lock(hctx, &srcu_idx);
				1408	need_run = !blk_queue_quiesced(hctx->queue) &&
				1409	blk_mq_hctx_has_pending(hctx);
				1410	hctx_unlock(hctx, srcu_idx);
				1411
				1412	if (need_run) {
				1413	__blk_mq_delay_run_hw_queue(hctx, async, 0);
				1414	return true;
				1415	}
				1416
				1417	return false;
				1418	}
				1419	EXPORT_SYMBOL(blk_mq_run_hw_queue);
				1420
				1421	void blk_mq_run_hw_queues(struct request_queue *q, bool async)
				1422	{
				1423	struct blk_mq_hw_ctx *hctx;
				1424	int i;
				1425
				1426	queue_for_each_hw_ctx(q, hctx, i) {
				1427	if (blk_mq_hctx_stopped(hctx))
				1428	continue;
				1429
				1430	blk_mq_run_hw_queue(hctx, async);
				1431	}
				1432	}
				1433	EXPORT_SYMBOL(blk_mq_run_hw_queues);
				1434
				1435	/**
				1436	* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
				1437	* @q: request queue.
				1438	*
				1439	* The caller is responsible for serializing this function against
				1440	* blk_mq_{start,stop}_hw_queue().
				1441	*/
				1442	bool blk_mq_queue_stopped(struct request_queue *q)
				1443	{
				1444	struct blk_mq_hw_ctx *hctx;
				1445	int i;
				1446
				1447	queue_for_each_hw_ctx(q, hctx, i)
				1448	if (blk_mq_hctx_stopped(hctx))
				1449	return true;
				1450
				1451	return false;
				1452	}
				1453	EXPORT_SYMBOL(blk_mq_queue_stopped);
				1454
				1455	/*
				1456	* This function is often used for pausing .queue_rq() by driver when
				1457	* there isn't enough resource or some conditions aren't satisfied, and
				1458	* BLK_STS_RESOURCE is usually returned.
				1459	*
				1460	* We do not guarantee that dispatch can be drained or blocked
				1461	* after blk_mq_stop_hw_queue() returns. Please use
				1462	* blk_mq_quiesce_queue() for that requirement.
				1463	*/
				1464	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				1465	{
				1466	cancel_delayed_work(&hctx->run_work);
				1467
				1468	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				1469	}
				1470	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				1471
				1472	/*
				1473	* This function is often used for pausing .queue_rq() by driver when
				1474	* there isn't enough resource or some conditions aren't satisfied, and
				1475	* BLK_STS_RESOURCE is usually returned.
				1476	*
				1477	* We do not guarantee that dispatch can be drained or blocked
				1478	* after blk_mq_stop_hw_queues() returns. Please use
				1479	* blk_mq_quiesce_queue() for that requirement.
				1480	*/
				1481	void blk_mq_stop_hw_queues(struct request_queue *q)
				1482	{
				1483	struct blk_mq_hw_ctx *hctx;
				1484	int i;
				1485
				1486	queue_for_each_hw_ctx(q, hctx, i)
				1487	blk_mq_stop_hw_queue(hctx);
				1488	}
				1489	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				1490
				1491	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				1492	{
				1493	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				1494
				1495	blk_mq_run_hw_queue(hctx, false);
				1496	}
				1497	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				1498
				1499	void blk_mq_start_hw_queues(struct request_queue *q)
				1500	{
				1501	struct blk_mq_hw_ctx *hctx;
				1502	int i;
				1503
				1504	queue_for_each_hw_ctx(q, hctx, i)
				1505	blk_mq_start_hw_queue(hctx);
				1506	}
				1507	EXPORT_SYMBOL(blk_mq_start_hw_queues);
				1508
				1509	void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				1510	{
				1511	if (!blk_mq_hctx_stopped(hctx))
				1512	return;
				1513
				1514	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				1515	blk_mq_run_hw_queue(hctx, async);
				1516	}
				1517	EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
				1518
				1519	void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
				1520	{
				1521	struct blk_mq_hw_ctx *hctx;
				1522	int i;
				1523
				1524	queue_for_each_hw_ctx(q, hctx, i)
				1525	blk_mq_start_stopped_hw_queue(hctx, async);
				1526	}
				1527	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				1528
				1529	static void blk_mq_run_work_fn(struct work_struct *work)
				1530	{
				1531	struct blk_mq_hw_ctx *hctx;
				1532
				1533	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
				1534
				1535	/*
				1536	* If we are stopped, don't run the queue.
				1537	*/
				1538	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				1539	return;
				1540
				1541	__blk_mq_run_hw_queue(hctx);
				1542	}
				1543
				1544	static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
				1545	struct request *rq,
				1546	bool at_head)
				1547	{
				1548	struct blk_mq_ctx *ctx = rq->mq_ctx;
				1549
				1550	lockdep_assert_held(&ctx->lock);
				1551
				1552	trace_block_rq_insert(hctx->queue, rq);
				1553
				1554	if (at_head)
				1555	list_add(&rq->queuelist, &ctx->rq_list);
				1556	else
				1557	list_add_tail(&rq->queuelist, &ctx->rq_list);
				1558	}
				1559
				1560	void __blk_mq_insert_request(struct blk_mq_hw_ctx hctx, struct request rq,
				1561	bool at_head)
				1562	{
				1563	struct blk_mq_ctx *ctx = rq->mq_ctx;
				1564
				1565	lockdep_assert_held(&ctx->lock);
				1566
				1567	__blk_mq_insert_req_list(hctx, rq, at_head);
				1568	blk_mq_hctx_mark_pending(hctx, ctx);
				1569	}
				1570
				1571	/*
				1572	* Should only be used carefully, when the caller knows we want to
				1573	* bypass a potential IO scheduler on the target device.
				1574	*/
				1575	void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
				1576	{
				1577	struct blk_mq_ctx *ctx = rq->mq_ctx;
				1578	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
				1579
				1580	spin_lock(&hctx->lock);
				1581	list_add_tail(&rq->queuelist, &hctx->dispatch);
				1582	spin_unlock(&hctx->lock);
				1583
				1584	if (run_queue)
				1585	blk_mq_run_hw_queue(hctx, false);
				1586	}
				1587
				1588	void blk_mq_insert_requests(struct blk_mq_hw_ctx hctx, struct blk_mq_ctx ctx,
				1589	struct list_head *list)
				1590
				1591	{
				1592	struct request *rq;
				1593
				1594	/*
				1595	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				1596	* offline now
				1597	*/
				1598	list_for_each_entry(rq, list, queuelist) {
				1599	BUG_ON(rq->mq_ctx != ctx);
				1600	trace_block_rq_insert(hctx->queue, rq);
				1601	}
				1602
				1603	spin_lock(&ctx->lock);
				1604	list_splice_tail_init(list, &ctx->rq_list);
				1605	blk_mq_hctx_mark_pending(hctx, ctx);
				1606	spin_unlock(&ctx->lock);
				1607	}
				1608
				1609	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				1610	{
				1611	struct request *rqa = container_of(a, struct request, queuelist);
				1612	struct request *rqb = container_of(b, struct request, queuelist);
				1613
				1614	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				1615	(rqa->mq_ctx == rqb->mq_ctx &&
				1616	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				1617	}
				1618
				1619	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				1620	{
				1621	struct blk_mq_ctx *this_ctx;
				1622	struct request_queue *this_q;
				1623	struct request *rq;
				1624	LIST_HEAD(list);
				1625	LIST_HEAD(ctx_list);
				1626	unsigned int depth;
				1627
				1628	list_splice_init(&plug->mq_list, &list);
				1629
				1630	list_sort(NULL, &list, plug_ctx_cmp);
				1631
				1632	this_q = NULL;
				1633	this_ctx = NULL;
				1634	depth = 0;
				1635
				1636	while (!list_empty(&list)) {
				1637	rq = list_entry_rq(list.next);
				1638	list_del_init(&rq->queuelist);
				1639	BUG_ON(!rq->q);
				1640	if (rq->mq_ctx != this_ctx) {
				1641	if (this_ctx) {
				1642	trace_block_unplug(this_q, depth, !from_schedule);
				1643	blk_mq_sched_insert_requests(this_q, this_ctx,
				1644	&ctx_list,
				1645	from_schedule);
				1646	}
				1647
				1648	this_ctx = rq->mq_ctx;
				1649	this_q = rq->q;
				1650	depth = 0;
				1651	}
				1652
				1653	depth++;
				1654	list_add_tail(&rq->queuelist, &ctx_list);
				1655	}
				1656
				1657	/*
				1658	* If 'this_ctx' is set, we know we have entries to complete
				1659	* on 'ctx_list'. Do those.
				1660	*/
				1661	if (this_ctx) {
				1662	trace_block_unplug(this_q, depth, !from_schedule);
				1663	blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
				1664	from_schedule);
				1665	}
				1666	}
				1667
				1668	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				1669	{
				1670	blk_init_request_from_bio(rq, bio);
				1671
				1672	blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
				1673
				1674	blk_account_io_start(rq, true);
				1675	}
				1676
				1677	static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx hctx, struct request rq)
				1678	{
				1679	if (rq->tag != -1)
				1680	return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
				1681
				1682	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
				1683	}
				1684
				1685	static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
				1686	struct request *rq,
				1687	blk_qc_t *cookie)
				1688	{
				1689	struct request_queue *q = rq->q;
				1690	struct blk_mq_queue_data bd = {
				1691	.rq = rq,
				1692	.last = true,
				1693	};
				1694	blk_qc_t new_cookie;
				1695	blk_status_t ret;
				1696
				1697	new_cookie = request_to_qc_t(hctx, rq);
				1698
				1699	/*
				1700	* For OK queue, we are done. For error, caller may kill it.
				1701	* Any other error (busy), just add it to our list as we
				1702	* previously would have done.
				1703	*/
				1704	ret = q->mq_ops->queue_rq(hctx, &bd);
				1705	switch (ret) {
				1706	case BLK_STS_OK:
				1707	blk_mq_update_dispatch_busy(hctx, false);
				1708	*cookie = new_cookie;
				1709	break;
				1710	case BLK_STS_RESOURCE:
				1711	case BLK_STS_DEV_RESOURCE:
				1712	blk_mq_update_dispatch_busy(hctx, true);
				1713	__blk_mq_requeue_request(rq);
				1714	break;
				1715	default:
				1716	blk_mq_update_dispatch_busy(hctx, false);
				1717	*cookie = BLK_QC_T_NONE;
				1718	break;
				1719	}
				1720
				1721	return ret;
				1722	}
				1723
				1724	static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
				1725	struct request *rq,
				1726	blk_qc_t *cookie,
				1727	bool bypass_insert)
				1728	{
				1729	struct request_queue *q = rq->q;
				1730	bool run_queue = true;
				1731
				1732	/*
				1733	* RCU or SRCU read lock is needed before checking quiesced flag.
				1734	*
				1735	* When queue is stopped or quiesced, ignore 'bypass_insert' from
				1736	* blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
				1737	* and avoid driver to try to dispatch again.
				1738	*/
				1739	if (blk_mq_hctx_stopped(hctx) \|\| blk_queue_quiesced(q)) {
				1740	run_queue = false;
				1741	bypass_insert = false;
				1742	goto insert;
				1743	}
				1744
				1745	if (q->elevator && !bypass_insert)
				1746	goto insert;
				1747
				1748	if (!blk_mq_get_dispatch_budget(hctx))
				1749	goto insert;
				1750
				1751	if (!blk_mq_get_driver_tag(rq)) {
				1752	blk_mq_put_dispatch_budget(hctx);
				1753	goto insert;
				1754	}
				1755
				1756	return __blk_mq_issue_directly(hctx, rq, cookie);
				1757	insert:
				1758	if (bypass_insert)
				1759	return BLK_STS_RESOURCE;
				1760
				1761	blk_mq_request_bypass_insert(rq, run_queue);
				1762	return BLK_STS_OK;
				1763	}
				1764
				1765	static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
				1766	struct request rq, blk_qc_t cookie)
				1767	{
				1768	blk_status_t ret;
				1769	int srcu_idx;
				1770
				1771	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
				1772
				1773	hctx_lock(hctx, &srcu_idx);
				1774
				1775	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
				1776	if (ret == BLK_STS_RESOURCE \|\| ret == BLK_STS_DEV_RESOURCE)
				1777	blk_mq_request_bypass_insert(rq, true);
				1778	else if (ret != BLK_STS_OK)
				1779	blk_mq_end_request(rq, ret);
				1780
				1781	hctx_unlock(hctx, srcu_idx);
				1782	}
				1783
				1784	blk_status_t blk_mq_request_issue_directly(struct request *rq)
				1785	{
				1786	blk_status_t ret;
				1787	int srcu_idx;
				1788	blk_qc_t unused_cookie;
				1789	struct blk_mq_ctx *ctx = rq->mq_ctx;
				1790	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
				1791
				1792	hctx_lock(hctx, &srcu_idx);
				1793	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
				1794	hctx_unlock(hctx, srcu_idx);
				1795
				1796	return ret;
				1797	}
				1798
				1799	void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
				1800	struct list_head *list)
				1801	{
				1802	while (!list_empty(list)) {
				1803	blk_status_t ret;
				1804	struct request *rq = list_first_entry(list, struct request,
				1805	queuelist);
				1806
				1807	list_del_init(&rq->queuelist);
				1808	ret = blk_mq_request_issue_directly(rq);
				1809	if (ret != BLK_STS_OK) {
				1810	if (ret == BLK_STS_RESOURCE \|\|
				1811	ret == BLK_STS_DEV_RESOURCE) {
				1812	blk_mq_request_bypass_insert(rq,
				1813	list_empty(list));
				1814	break;
				1815	}
				1816	blk_mq_end_request(rq, ret);
				1817	}
				1818	}
				1819	}
				1820
				1821	static blk_qc_t blk_mq_make_request(struct request_queue q, struct bio bio)
				1822	{
				1823	const int is_sync = op_is_sync(bio->bi_opf);
				1824	const int is_flush_fua = op_is_flush(bio->bi_opf);
				1825	struct blk_mq_alloc_data data = { .flags = 0 };
				1826	struct request *rq;
				1827	unsigned int request_count = 0;
				1828	struct blk_plug *plug;
				1829	struct request *same_queue_rq = NULL;
				1830	blk_qc_t cookie;
				1831
				1832	blk_queue_bounce(q, &bio);
				1833
				1834	blk_queue_split(q, &bio);
				1835
				1836	if (!bio_integrity_prep(bio))
				1837	return BLK_QC_T_NONE;
				1838
				1839	if (!is_flush_fua && !blk_queue_nomerges(q) &&
				1840	blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
				1841	return BLK_QC_T_NONE;
				1842
				1843	if (blk_mq_sched_bio_merge(q, bio))
				1844	return BLK_QC_T_NONE;
				1845
				1846	rq_qos_throttle(q, bio, NULL);
				1847
				1848	trace_block_getrq(q, bio, bio->bi_opf);
				1849
				1850	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
				1851	if (unlikely(!rq)) {
				1852	rq_qos_cleanup(q, bio);
				1853	if (bio->bi_opf & REQ_NOWAIT)
				1854	bio_wouldblock_error(bio);
				1855	return BLK_QC_T_NONE;
				1856	}
				1857
				1858	rq_qos_track(q, rq, bio);
				1859
				1860	cookie = request_to_qc_t(data.hctx, rq);
				1861
				1862	plug = current->plug;
				1863	if (unlikely(is_flush_fua)) {
				1864	blk_mq_put_ctx(data.ctx);
				1865	blk_mq_bio_to_request(rq, bio);
				1866
				1867	/* bypass scheduler for flush rq */
				1868	blk_insert_flush(rq);
				1869	blk_mq_run_hw_queue(data.hctx, true);
				1870	} else if (plug && q->nr_hw_queues == 1) {
				1871	struct request *last = NULL;
				1872
				1873	blk_mq_put_ctx(data.ctx);
				1874	blk_mq_bio_to_request(rq, bio);
				1875
				1876	/*
				1877	* @request_count may become stale because of schedule
				1878	* out, so check the list again.
				1879	*/
				1880	if (list_empty(&plug->mq_list))
				1881	request_count = 0;
				1882	else if (blk_queue_nomerges(q))
				1883	request_count = blk_plug_queued_count(q);
				1884
				1885	if (!request_count)
				1886	trace_block_plug(q);
				1887	else
				1888	last = list_entry_rq(plug->mq_list.prev);
				1889
				1890	if (request_count >= BLK_MAX_REQUEST_COUNT \|\| (last &&
				1891	blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
				1892	blk_flush_plug_list(plug, false);
				1893	trace_block_plug(q);
				1894	}
				1895
				1896	list_add_tail(&rq->queuelist, &plug->mq_list);
				1897	} else if (plug && !blk_queue_nomerges(q)) {
				1898	blk_mq_bio_to_request(rq, bio);
				1899
				1900	/*
				1901	* We do limited plugging. If the bio can be merged, do that.
				1902	* Otherwise the existing request in the plug list will be
				1903	* issued. So the plug list will have one request at most
				1904	* The plug list might get flushed before this. If that happens,
				1905	* the plug list is empty, and same_queue_rq is invalid.
				1906	*/
				1907	if (list_empty(&plug->mq_list))
				1908	same_queue_rq = NULL;
				1909	if (same_queue_rq)
				1910	list_del_init(&same_queue_rq->queuelist);
				1911	list_add_tail(&rq->queuelist, &plug->mq_list);
				1912
				1913	blk_mq_put_ctx(data.ctx);
				1914
				1915	if (same_queue_rq) {
				1916	data.hctx = blk_mq_map_queue(q,
				1917	same_queue_rq->mq_ctx->cpu);
				1918	blk_mq_try_issue_directly(data.hctx, same_queue_rq,
				1919	&cookie);
				1920	}
				1921	} else if ((q->nr_hw_queues > 1 && is_sync) \|\| (!q->elevator &&
				1922	!data.hctx->dispatch_busy)) {
				1923	blk_mq_put_ctx(data.ctx);
				1924	blk_mq_bio_to_request(rq, bio);
				1925	blk_mq_try_issue_directly(data.hctx, rq, &cookie);
				1926	} else {
				1927	blk_mq_put_ctx(data.ctx);
				1928	blk_mq_bio_to_request(rq, bio);
				1929	blk_mq_sched_insert_request(rq, false, true, true);
				1930	}
				1931
				1932	return cookie;
				1933	}
				1934
				1935	void blk_mq_free_rqs(struct blk_mq_tag_set set, struct blk_mq_tags tags,
				1936	unsigned int hctx_idx)
				1937	{
				1938	struct page *page;
				1939
				1940	if (tags->rqs && set->ops->exit_request) {
				1941	int i;
				1942
				1943	for (i = 0; i < tags->nr_tags; i++) {
				1944	struct request *rq = tags->static_rqs[i];
				1945
				1946	if (!rq)
				1947	continue;
				1948	set->ops->exit_request(set, rq, hctx_idx);
				1949	tags->static_rqs[i] = NULL;
				1950	}
				1951	}
				1952
				1953	while (!list_empty(&tags->page_list)) {
				1954	page = list_first_entry(&tags->page_list, struct page, lru);
				1955	list_del_init(&page->lru);
				1956	/*
				1957	* Remove kmemleak object previously allocated in
				1958	* blk_mq_init_rq_map().
				1959	*/
				1960	kmemleak_free(page_address(page));
				1961	__free_pages(page, page->private);
				1962	}
				1963	}
				1964
				1965	void blk_mq_free_rq_map(struct blk_mq_tags *tags)
				1966	{
				1967	kfree(tags->rqs);
				1968	tags->rqs = NULL;
				1969	kfree(tags->static_rqs);
				1970	tags->static_rqs = NULL;
				1971
				1972	blk_mq_free_tags(tags);
				1973	}
				1974
				1975	struct blk_mq_tags blk_mq_alloc_rq_map(struct blk_mq_tag_set set,
				1976	unsigned int hctx_idx,
				1977	unsigned int nr_tags,
				1978	unsigned int reserved_tags)
				1979	{
				1980	struct blk_mq_tags *tags;
				1981	int node;
				1982
				1983	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
				1984	if (node == NUMA_NO_NODE)
				1985	node = set->numa_node;
				1986
				1987	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
				1988	BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
				1989	if (!tags)
				1990	return NULL;
				1991
				1992	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
				1993	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY,
				1994	node);
				1995	if (!tags->rqs) {
				1996	blk_mq_free_tags(tags);
				1997	return NULL;
				1998	}
				1999
				2000	tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
				2001	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY,
				2002	node);
				2003	if (!tags->static_rqs) {
				2004	kfree(tags->rqs);
				2005	blk_mq_free_tags(tags);
				2006	return NULL;
				2007	}
				2008
				2009	return tags;
				2010	}
				2011
				2012	static size_t order_to_size(unsigned int order)
				2013	{
				2014	return (size_t)PAGE_SIZE << order;
				2015	}
				2016
				2017	static int blk_mq_init_request(struct blk_mq_tag_set set, struct request rq,
				2018	unsigned int hctx_idx, int node)
				2019	{
				2020	int ret;
				2021
				2022	if (set->ops->init_request) {
				2023	ret = set->ops->init_request(set, rq, hctx_idx, node);
				2024	if (ret)
				2025	return ret;
				2026	}
				2027
				2028	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
				2029	return 0;
				2030	}
				2031
				2032	int blk_mq_alloc_rqs(struct blk_mq_tag_set set, struct blk_mq_tags tags,
				2033	unsigned int hctx_idx, unsigned int depth)
				2034	{
				2035	unsigned int i, j, entries_per_page, max_order = 4;
				2036	size_t rq_size, left;
				2037	int node;
				2038
				2039	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
				2040	if (node == NUMA_NO_NODE)
				2041	node = set->numa_node;
				2042
				2043	INIT_LIST_HEAD(&tags->page_list);
				2044
				2045	/*
				2046	* rq_size is the size of the request plus driver payload, rounded
				2047	* to the cacheline size
				2048	*/
				2049	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				2050	cache_line_size());
				2051	left = rq_size * depth;
				2052
				2053	for (i = 0; i < depth; ) {
				2054	int this_order = max_order;
				2055	struct page *page;
				2056	int to_do;
				2057	void *p;
				2058
				2059	while (this_order && left < order_to_size(this_order - 1))
				2060	this_order--;
				2061
				2062	do {
				2063	page = alloc_pages_node(node,
				2064	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY \| __GFP_ZERO,
				2065	this_order);
				2066	if (page)
				2067	break;
				2068	if (!this_order--)
				2069	break;
				2070	if (order_to_size(this_order) < rq_size)
				2071	break;
				2072	} while (1);
				2073
				2074	if (!page)
				2075	goto fail;
				2076
				2077	page->private = this_order;
				2078	list_add_tail(&page->lru, &tags->page_list);
				2079
				2080	p = page_address(page);
				2081	/*
				2082	* Allow kmemleak to scan these pages as they contain pointers
				2083	* to additional allocations like via ops->init_request().
				2084	*/
				2085	kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
				2086	entries_per_page = order_to_size(this_order) / rq_size;
				2087	to_do = min(entries_per_page, depth - i);
				2088	left -= to_do * rq_size;
				2089	for (j = 0; j < to_do; j++) {
				2090	struct request *rq = p;
				2091
				2092	tags->static_rqs[i] = rq;
				2093	if (blk_mq_init_request(set, rq, hctx_idx, node)) {
				2094	tags->static_rqs[i] = NULL;
				2095	goto fail;
				2096	}
				2097
				2098	p += rq_size;
				2099	i++;
				2100	}
				2101	}
				2102	return 0;
				2103
				2104	fail:
				2105	blk_mq_free_rqs(set, tags, hctx_idx);
				2106	return -ENOMEM;
				2107	}
				2108
				2109	/*
				2110	* 'cpu' is going away. splice any existing rq_list entries from this
				2111	* software queue to the hw queue dispatch list, and ensure that it
				2112	* gets run.
				2113	*/
				2114	static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
				2115	{
				2116	struct blk_mq_hw_ctx *hctx;
				2117	struct blk_mq_ctx *ctx;
				2118	LIST_HEAD(tmp);
				2119
				2120	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
				2121	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
				2122
				2123	spin_lock(&ctx->lock);
				2124	if (!list_empty(&ctx->rq_list)) {
				2125	list_splice_init(&ctx->rq_list, &tmp);
				2126	blk_mq_hctx_clear_pending(hctx, ctx);
				2127	}
				2128	spin_unlock(&ctx->lock);
				2129
				2130	if (list_empty(&tmp))
				2131	return 0;
				2132
				2133	spin_lock(&hctx->lock);
				2134	list_splice_tail_init(&tmp, &hctx->dispatch);
				2135	spin_unlock(&hctx->lock);
				2136
				2137	blk_mq_run_hw_queue(hctx, true);
				2138	return 0;
				2139	}
				2140
				2141	static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
				2142	{
				2143	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
				2144	&hctx->cpuhp_dead);
				2145	}
				2146
				2147	/* hctx->ctxs will be freed in queue's release handler */
				2148	static void blk_mq_exit_hctx(struct request_queue *q,
				2149	struct blk_mq_tag_set *set,
				2150	struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
				2151	{
				2152	blk_mq_debugfs_unregister_hctx(hctx);
				2153
				2154	if (blk_mq_hw_queue_mapped(hctx))
				2155	blk_mq_tag_idle(hctx);
				2156
				2157	if (set->ops->exit_request)
				2158	set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
				2159
				2160	if (set->ops->exit_hctx)
				2161	set->ops->exit_hctx(hctx, hctx_idx);
				2162
				2163	blk_mq_remove_cpuhp(hctx);
				2164	}
				2165
				2166	static void blk_mq_exit_hw_queues(struct request_queue *q,
				2167	struct blk_mq_tag_set *set, int nr_queue)
				2168	{
				2169	struct blk_mq_hw_ctx *hctx;
				2170	unsigned int i;
				2171
				2172	queue_for_each_hw_ctx(q, hctx, i) {
				2173	if (i == nr_queue)
				2174	break;
				2175	blk_mq_exit_hctx(q, set, hctx, i);
				2176	}
				2177	}
				2178
				2179	static int blk_mq_init_hctx(struct request_queue *q,
				2180	struct blk_mq_tag_set *set,
				2181	struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
				2182	{
				2183	int node;
				2184
				2185	node = hctx->numa_node;
				2186	if (node == NUMA_NO_NODE)
				2187	node = hctx->numa_node = set->numa_node;
				2188
				2189	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
				2190	spin_lock_init(&hctx->lock);
				2191	INIT_LIST_HEAD(&hctx->dispatch);
				2192	hctx->queue = q;
				2193	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
				2194
				2195	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
				2196
				2197	hctx->tags = set->tags[hctx_idx];
				2198
				2199	/*
				2200	* Allocate space for all possible cpus to avoid allocation at
				2201	* runtime
				2202	*/
				2203	hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
				2204	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY, node);
				2205	if (!hctx->ctxs)
				2206	goto unregister_cpu_notifier;
				2207
				2208	if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
				2209	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY, node))
				2210	goto free_ctxs;
				2211
				2212	hctx->nr_ctx = 0;
				2213
				2214	spin_lock_init(&hctx->dispatch_wait_lock);
				2215	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
				2216	INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
				2217
				2218	if (set->ops->init_hctx &&
				2219	set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
				2220	goto free_bitmap;
				2221
				2222	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
				2223	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY);
				2224	if (!hctx->fq)
				2225	goto exit_hctx;
				2226
				2227	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
				2228	goto free_fq;
				2229
				2230	if (hctx->flags & BLK_MQ_F_BLOCKING)
				2231	init_srcu_struct(hctx->srcu);
				2232
				2233	blk_mq_debugfs_register_hctx(q, hctx);
				2234
				2235	return 0;
				2236
				2237	free_fq:
				2238	blk_free_flush_queue(hctx->fq);
				2239	exit_hctx:
				2240	if (set->ops->exit_hctx)
				2241	set->ops->exit_hctx(hctx, hctx_idx);
				2242	free_bitmap:
				2243	sbitmap_free(&hctx->ctx_map);
				2244	free_ctxs:
				2245	kfree(hctx->ctxs);
				2246	unregister_cpu_notifier:
				2247	blk_mq_remove_cpuhp(hctx);
				2248	return -1;
				2249	}
				2250
				2251	static void blk_mq_init_cpu_queues(struct request_queue *q,
				2252	unsigned int nr_hw_queues)
				2253	{
				2254	unsigned int i;
				2255
				2256	for_each_possible_cpu(i) {
				2257	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				2258	struct blk_mq_hw_ctx *hctx;
				2259
				2260	__ctx->cpu = i;
				2261	spin_lock_init(&__ctx->lock);
				2262	INIT_LIST_HEAD(&__ctx->rq_list);
				2263	__ctx->queue = q;
				2264
				2265	/*
				2266	* Set local node, IFF we have more than one hw queue. If
				2267	* not, we remain on the home node of the device
				2268	*/
				2269	hctx = blk_mq_map_queue(q, i);
				2270	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				2271	hctx->numa_node = local_memory_node(cpu_to_node(i));
				2272	}
				2273	}
				2274
				2275	static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
				2276	{
				2277	int ret = 0;
				2278
				2279	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
				2280	set->queue_depth, set->reserved_tags);
				2281	if (!set->tags[hctx_idx])
				2282	return false;
				2283
				2284	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
				2285	set->queue_depth);
				2286	if (!ret)
				2287	return true;
				2288
				2289	blk_mq_free_rq_map(set->tags[hctx_idx]);
				2290	set->tags[hctx_idx] = NULL;
				2291	return false;
				2292	}
				2293
				2294	static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
				2295	unsigned int hctx_idx)
				2296	{
				2297	if (set->tags[hctx_idx]) {
				2298	blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
				2299	blk_mq_free_rq_map(set->tags[hctx_idx]);
				2300	set->tags[hctx_idx] = NULL;
				2301	}
				2302	}
				2303
				2304	static void blk_mq_map_swqueue(struct request_queue *q)
				2305	{
				2306	unsigned int i, hctx_idx;
				2307	struct blk_mq_hw_ctx *hctx;
				2308	struct blk_mq_ctx *ctx;
				2309	struct blk_mq_tag_set *set = q->tag_set;
				2310
				2311	/*
				2312	* Avoid others reading imcomplete hctx->cpumask through sysfs
				2313	*/
				2314	mutex_lock(&q->sysfs_lock);
				2315
				2316	queue_for_each_hw_ctx(q, hctx, i) {
				2317	cpumask_clear(hctx->cpumask);
				2318	hctx->nr_ctx = 0;
				2319	hctx->dispatch_from = NULL;
				2320	}
				2321
				2322	/*
				2323	* Map software to hardware queues.
				2324	*
				2325	* If the cpu isn't present, the cpu is mapped to first hctx.
				2326	*/
				2327	for_each_possible_cpu(i) {
				2328	hctx_idx = q->mq_map[i];
				2329	/* unmapped hw queue can be remapped after CPU topo changed */
				2330	if (!set->tags[hctx_idx] &&
				2331	!__blk_mq_alloc_rq_map(set, hctx_idx)) {
				2332	/*
				2333	* If tags initialization fail for some hctx,
				2334	* that hctx won't be brought online. In this
				2335	* case, remap the current ctx to hctx[0] which
				2336	* is guaranteed to always have tags allocated
				2337	*/
				2338	q->mq_map[i] = 0;
				2339	}
				2340
				2341	ctx = per_cpu_ptr(q->queue_ctx, i);
				2342	hctx = blk_mq_map_queue(q, i);
				2343
				2344	cpumask_set_cpu(i, hctx->cpumask);
				2345	ctx->index_hw = hctx->nr_ctx;
				2346	hctx->ctxs[hctx->nr_ctx++] = ctx;
				2347	}
				2348
				2349	mutex_unlock(&q->sysfs_lock);
				2350
				2351	queue_for_each_hw_ctx(q, hctx, i) {
				2352	/*
				2353	* If no software queues are mapped to this hardware queue,
				2354	* disable it and free the request entries.
				2355	*/
				2356	if (!hctx->nr_ctx) {
				2357	/* Never unmap queue 0. We need it as a
				2358	* fallback in case of a new remap fails
				2359	* allocation
				2360	*/
				2361	if (i && set->tags[i])
				2362	blk_mq_free_map_and_requests(set, i);
				2363
				2364	hctx->tags = NULL;
				2365	continue;
				2366	}
				2367
				2368	hctx->tags = set->tags[i];
				2369	WARN_ON(!hctx->tags);
				2370
				2371	/*
				2372	* Set the map size to the number of mapped software queues.
				2373	* This is more accurate and more efficient than looping
				2374	* over all possibly mapped software queues.
				2375	*/
				2376	sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
				2377
				2378	/*
				2379	* Initialize batch roundrobin counts
				2380	*/
				2381	hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
				2382	hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
				2383	}
				2384	}
				2385
				2386	/*
				2387	* Caller needs to ensure that we're either frozen/quiesced, or that
				2388	* the queue isn't live yet.
				2389	*/
				2390	static void queue_set_hctx_shared(struct request_queue *q, bool shared)
				2391	{
				2392	struct blk_mq_hw_ctx *hctx;
				2393	int i;
				2394
				2395	queue_for_each_hw_ctx(q, hctx, i) {
				2396	if (shared)
				2397	hctx->flags \|= BLK_MQ_F_TAG_SHARED;
				2398	else
				2399	hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
				2400	}
				2401	}
				2402
				2403	static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
				2404	bool shared)
				2405	{
				2406	struct request_queue *q;
				2407
				2408	lockdep_assert_held(&set->tag_list_lock);
				2409
				2410	list_for_each_entry(q, &set->tag_list, tag_set_list) {
				2411	blk_mq_freeze_queue(q);
				2412	queue_set_hctx_shared(q, shared);
				2413	blk_mq_unfreeze_queue(q);
				2414	}
				2415	}
				2416
				2417	static void blk_mq_del_queue_tag_set(struct request_queue *q)
				2418	{
				2419	struct blk_mq_tag_set *set = q->tag_set;
				2420
				2421	mutex_lock(&set->tag_list_lock);
				2422	list_del_rcu(&q->tag_set_list);
				2423	if (list_is_singular(&set->tag_list)) {
				2424	/* just transitioned to unshared */
				2425	set->flags &= ~BLK_MQ_F_TAG_SHARED;
				2426	/* update existing queue */
				2427	blk_mq_update_tag_set_depth(set, false);
				2428	}
				2429	mutex_unlock(&set->tag_list_lock);
				2430	INIT_LIST_HEAD(&q->tag_set_list);
				2431	}
				2432
				2433	static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
				2434	struct request_queue *q)
				2435	{
				2436	q->tag_set = set;
				2437
				2438	mutex_lock(&set->tag_list_lock);
				2439
				2440	/*
				2441	* Check to see if we're transitioning to shared (from 1 to 2 queues).
				2442	*/
				2443	if (!list_empty(&set->tag_list) &&
				2444	!(set->flags & BLK_MQ_F_TAG_SHARED)) {
				2445	set->flags \|= BLK_MQ_F_TAG_SHARED;
				2446	/* update existing queue */
				2447	blk_mq_update_tag_set_depth(set, true);
				2448	}
				2449	if (set->flags & BLK_MQ_F_TAG_SHARED)
				2450	queue_set_hctx_shared(q, true);
				2451	list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
				2452
				2453	mutex_unlock(&set->tag_list_lock);
				2454	}
				2455
				2456	/*
				2457	* It is the actual release handler for mq, but we do it from
				2458	* request queue's release handler for avoiding use-after-free
				2459	* and headache because q->mq_kobj shouldn't have been introduced,
				2460	* but we can't group ctx/kctx kobj without it.
				2461	*/
				2462	void blk_mq_release(struct request_queue *q)
				2463	{
				2464	struct blk_mq_hw_ctx *hctx;
				2465	unsigned int i;
				2466
				2467	/* hctx kobj stays in hctx */
				2468	queue_for_each_hw_ctx(q, hctx, i) {
				2469	if (!hctx)
				2470	continue;
				2471	kobject_put(&hctx->kobj);
				2472	}
				2473
				2474	q->mq_map = NULL;
				2475
				2476	kfree(q->queue_hw_ctx);
				2477
				2478	/*
				2479	* release .mq_kobj and sw queue's kobject now because
				2480	* both share lifetime with request queue.
				2481	*/
				2482	blk_mq_sysfs_deinit(q);
				2483
				2484	free_percpu(q->queue_ctx);
				2485	}
				2486
				2487	struct request_queue blk_mq_init_queue(struct blk_mq_tag_set set)
				2488	{
				2489	struct request_queue uninit_q, q;
				2490
				2491	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
				2492	if (!uninit_q)
				2493	return ERR_PTR(-ENOMEM);
				2494
				2495	q = blk_mq_init_allocated_queue(set, uninit_q);
				2496	if (IS_ERR(q))
				2497	blk_cleanup_queue(uninit_q);
				2498
				2499	return q;
				2500	}
				2501	EXPORT_SYMBOL(blk_mq_init_queue);
				2502
				2503	static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
				2504	{
				2505	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
				2506
				2507	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
				2508	__alignof__(struct blk_mq_hw_ctx)) !=
				2509	sizeof(struct blk_mq_hw_ctx));
				2510
				2511	if (tag_set->flags & BLK_MQ_F_BLOCKING)
				2512	hw_ctx_size += sizeof(struct srcu_struct);
				2513
				2514	return hw_ctx_size;
				2515	}
				2516
				2517	static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
				2518	struct request_queue *q)
				2519	{
				2520	int i, j;
				2521	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
				2522
				2523	blk_mq_sysfs_unregister(q);
				2524
				2525	/* protect against switching io scheduler */
				2526	mutex_lock(&q->sysfs_lock);
				2527	for (i = 0; i < set->nr_hw_queues; i++) {
				2528	int node;
				2529
				2530	if (hctxs[i])
				2531	continue;
				2532
				2533	node = blk_mq_hw_queue_to_node(q->mq_map, i);
				2534	hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
				2535	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY,
				2536	node);
				2537	if (!hctxs[i])
				2538	break;
				2539
				2540	if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
				2541	GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY,
				2542	node)) {
				2543	kfree(hctxs[i]);
				2544	hctxs[i] = NULL;
				2545	break;
				2546	}
				2547
				2548	atomic_set(&hctxs[i]->nr_active, 0);
				2549	hctxs[i]->numa_node = node;
				2550	hctxs[i]->queue_num = i;
				2551
				2552	if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
				2553	free_cpumask_var(hctxs[i]->cpumask);
				2554	kfree(hctxs[i]);
				2555	hctxs[i] = NULL;
				2556	break;
				2557	}
				2558	blk_mq_hctx_kobj_init(hctxs[i]);
				2559	}
				2560	for (j = i; j < q->nr_hw_queues; j++) {
				2561	struct blk_mq_hw_ctx *hctx = hctxs[j];
				2562
				2563	if (hctx) {
				2564	if (hctx->tags)
				2565	blk_mq_free_map_and_requests(set, j);
				2566	blk_mq_exit_hctx(q, set, hctx, j);
				2567	kobject_put(&hctx->kobj);
				2568	hctxs[j] = NULL;
				2569
				2570	}
				2571	}
				2572	q->nr_hw_queues = i;
				2573	mutex_unlock(&q->sysfs_lock);
				2574	blk_mq_sysfs_register(q);
				2575	}
				2576
				2577	struct request_queue blk_mq_init_allocated_queue(struct blk_mq_tag_set set,
				2578	struct request_queue *q)
				2579	{
				2580	/* mark the queue as mq asap */
				2581	q->mq_ops = set->ops;
				2582
				2583	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
				2584	blk_mq_poll_stats_bkt,
				2585	BLK_MQ_POLL_STATS_BKTS, q);
				2586	if (!q->poll_cb)
				2587	goto err_exit;
				2588
				2589	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
				2590	if (!q->queue_ctx)
				2591	goto err_exit;
				2592
				2593	/* init q->mq_kobj and sw queues' kobjects */
				2594	blk_mq_sysfs_init(q);
				2595
				2596	q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
				2597	GFP_KERNEL, set->numa_node);
				2598	if (!q->queue_hw_ctx)
				2599	goto err_percpu;
				2600
				2601	q->mq_map = set->mq_map;
				2602
				2603	blk_mq_realloc_hw_ctxs(set, q);
				2604	if (!q->nr_hw_queues)
				2605	goto err_hctxs;
				2606
				2607	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
				2608	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
				2609
				2610	q->nr_queues = nr_cpu_ids;
				2611
				2612	q->queue_flags \|= QUEUE_FLAG_MQ_DEFAULT;
				2613
				2614	if (!(set->flags & BLK_MQ_F_SG_MERGE))
				2615	queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
				2616
				2617	q->sg_reserved_size = INT_MAX;
				2618
				2619	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
				2620	INIT_LIST_HEAD(&q->requeue_list);
				2621	spin_lock_init(&q->requeue_lock);
				2622
				2623	blk_queue_make_request(q, blk_mq_make_request);
				2624	if (q->mq_ops->poll)
				2625	q->poll_fn = blk_mq_poll;
				2626
				2627	/*
				2628	* Do this after blk_queue_make_request() overrides it...
				2629	*/
				2630	q->nr_requests = set->queue_depth;
				2631
				2632	/*
				2633	* Default to classic polling
				2634	*/
				2635	q->poll_nsec = -1;
				2636
				2637	if (set->ops->complete)
				2638	blk_queue_softirq_done(q, set->ops->complete);
				2639
				2640	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
				2641	blk_mq_add_queue_tag_set(set, q);
				2642	blk_mq_map_swqueue(q);
				2643
				2644	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
				2645	int ret;
				2646
				2647	ret = elevator_init_mq(q);
				2648	if (ret)
				2649	return ERR_PTR(ret);
				2650	}
				2651
				2652	return q;
				2653
				2654	err_hctxs:
				2655	kfree(q->queue_hw_ctx);
				2656	err_percpu:
				2657	free_percpu(q->queue_ctx);
				2658	err_exit:
				2659	q->mq_ops = NULL;
				2660	return ERR_PTR(-ENOMEM);
				2661	}
				2662	EXPORT_SYMBOL(blk_mq_init_allocated_queue);
				2663
				2664	/* tags can _not_ be used after returning from blk_mq_exit_queue */
				2665	void blk_mq_exit_queue(struct request_queue *q)
				2666	{
				2667	struct blk_mq_tag_set *set = q->tag_set;
				2668
				2669	blk_mq_del_queue_tag_set(q);
				2670	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
				2671	}
				2672
				2673	/* Basically redo blk_mq_init_queue with queue frozen */
				2674	static void blk_mq_queue_reinit(struct request_queue *q)
				2675	{
				2676	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
				2677
				2678	blk_mq_debugfs_unregister_hctxs(q);
				2679	blk_mq_sysfs_unregister(q);
				2680
				2681	/*
				2682	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				2683	* we should change hctx numa_node according to the new topology (this
				2684	* involves freeing and re-allocating memory, worth doing?)
				2685	*/
				2686	blk_mq_map_swqueue(q);
				2687
				2688	blk_mq_sysfs_register(q);
				2689	blk_mq_debugfs_register_hctxs(q);
				2690	}
				2691
				2692	static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
				2693	{
				2694	int i;
				2695
				2696	for (i = 0; i < set->nr_hw_queues; i++)
				2697	if (!__blk_mq_alloc_rq_map(set, i))
				2698	goto out_unwind;
				2699
				2700	return 0;
				2701
				2702	out_unwind:
				2703	while (--i >= 0)
				2704	blk_mq_free_rq_map(set->tags[i]);
				2705
				2706	return -ENOMEM;
				2707	}
				2708
				2709	/*
				2710	* Allocate the request maps associated with this tag_set. Note that this
				2711	* may reduce the depth asked for, if memory is tight. set->queue_depth
				2712	* will be updated to reflect the allocated depth.
				2713	*/
				2714	static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
				2715	{
				2716	unsigned int depth;
				2717	int err;
				2718
				2719	depth = set->queue_depth;
				2720	do {
				2721	err = __blk_mq_alloc_rq_maps(set);
				2722	if (!err)
				2723	break;
				2724
				2725	set->queue_depth >>= 1;
				2726	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
				2727	err = -ENOMEM;
				2728	break;
				2729	}
				2730	} while (set->queue_depth);
				2731
				2732	if (!set->queue_depth \|\| err) {
				2733	pr_err("blk-mq: failed to allocate request map\n");
				2734	return -ENOMEM;
				2735	}
				2736
				2737	if (depth != set->queue_depth)
				2738	pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
				2739	depth, set->queue_depth);
				2740
				2741	return 0;
				2742	}
				2743
				2744	static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
				2745	{
				2746	if (set->ops->map_queues) {
				2747	/*
				2748	* transport .map_queues is usually done in the following
				2749	* way:
				2750	*
				2751	* for (queue = 0; queue < set->nr_hw_queues; queue++) {
				2752	* mask = get_cpu_mask(queue)
				2753	* for_each_cpu(cpu, mask)
				2754	* set->mq_map[cpu] = queue;
				2755	* }
				2756	*
				2757	* When we need to remap, the table has to be cleared for
				2758	* killing stale mapping since one CPU may not be mapped
				2759	* to any hw queue.
				2760	*/
				2761	blk_mq_clear_mq_map(set);
				2762
				2763	return set->ops->map_queues(set);
				2764	} else
				2765	return blk_mq_map_queues(set);
				2766	}
				2767
				2768	/*
				2769	* Alloc a tag set to be associated with one or more request queues.
				2770	* May fail with EINVAL for various error conditions. May adjust the
				2771	* requested depth down, if it's too large. In that case, the set
				2772	* value will be stored in set->queue_depth.
				2773	*/
				2774	int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
				2775	{
				2776	int ret;
				2777
				2778	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
				2779
				2780	if (!set->nr_hw_queues)
				2781	return -EINVAL;
				2782	if (!set->queue_depth)
				2783	return -EINVAL;
				2784	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
				2785	return -EINVAL;
				2786
				2787	if (!set->ops->queue_rq)
				2788	return -EINVAL;
				2789
				2790	if (!set->ops->get_budget ^ !set->ops->put_budget)
				2791	return -EINVAL;
				2792
				2793	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
				2794	pr_info("blk-mq: reduced tag depth to %u\n",
				2795	BLK_MQ_MAX_DEPTH);
				2796	set->queue_depth = BLK_MQ_MAX_DEPTH;
				2797	}
				2798
				2799	/*
				2800	* If a crashdump is active, then we are potentially in a very
				2801	* memory constrained environment. Limit us to 1 queue and
				2802	* 64 tags to prevent using too much memory.
				2803	*/
				2804	if (is_kdump_kernel()) {
				2805	set->nr_hw_queues = 1;
				2806	set->queue_depth = min(64U, set->queue_depth);
				2807	}
				2808	/*
				2809	* There is no use for more h/w queues than cpus.
				2810	*/
				2811	if (set->nr_hw_queues > nr_cpu_ids)
				2812	set->nr_hw_queues = nr_cpu_ids;
				2813
				2814	set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
				2815	GFP_KERNEL, set->numa_node);
				2816	if (!set->tags)
				2817	return -ENOMEM;
				2818
				2819	ret = -ENOMEM;
				2820	set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
				2821	GFP_KERNEL, set->numa_node);
				2822	if (!set->mq_map)
				2823	goto out_free_tags;
				2824
				2825	ret = blk_mq_update_queue_map(set);
				2826	if (ret)
				2827	goto out_free_mq_map;
				2828
				2829	ret = blk_mq_alloc_rq_maps(set);
				2830	if (ret)
				2831	goto out_free_mq_map;
				2832
				2833	mutex_init(&set->tag_list_lock);
				2834	INIT_LIST_HEAD(&set->tag_list);
				2835
				2836	return 0;
				2837
				2838	out_free_mq_map:
				2839	kfree(set->mq_map);
				2840	set->mq_map = NULL;
				2841	out_free_tags:
				2842	kfree(set->tags);
				2843	set->tags = NULL;
				2844	return ret;
				2845	}
				2846	EXPORT_SYMBOL(blk_mq_alloc_tag_set);
				2847
				2848	void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
				2849	{
				2850	int i;
				2851
				2852	for (i = 0; i < nr_cpu_ids; i++)
				2853	blk_mq_free_map_and_requests(set, i);
				2854
				2855	kfree(set->mq_map);
				2856	set->mq_map = NULL;
				2857
				2858	kfree(set->tags);
				2859	set->tags = NULL;
				2860	}
				2861	EXPORT_SYMBOL(blk_mq_free_tag_set);
				2862
				2863	int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
				2864	{
				2865	struct blk_mq_tag_set *set = q->tag_set;
				2866	struct blk_mq_hw_ctx *hctx;
				2867	int i, ret;
				2868
				2869	if (!set)
				2870	return -EINVAL;
				2871
				2872	blk_mq_freeze_queue(q);
				2873	blk_mq_quiesce_queue(q);
				2874
				2875	ret = 0;
				2876	queue_for_each_hw_ctx(q, hctx, i) {
				2877	if (!hctx->tags)
				2878	continue;
				2879	/*
				2880	* If we're using an MQ scheduler, just update the scheduler
				2881	* queue depth. This is similar to what the old code would do.
				2882	*/
				2883	if (!hctx->sched_tags) {
				2884	ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
				2885	false);
				2886	} else {
				2887	ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
				2888	nr, true);
				2889	}
				2890	if (ret)
				2891	break;
				2892	if (q->elevator && q->elevator->type->ops.mq.depth_updated)
				2893	q->elevator->type->ops.mq.depth_updated(hctx);
				2894	}
				2895
				2896	if (!ret)
				2897	q->nr_requests = nr;
				2898
				2899	blk_mq_unquiesce_queue(q);
				2900	blk_mq_unfreeze_queue(q);
				2901
				2902	return ret;
				2903	}
				2904
				2905	/*
				2906	* request_queue and elevator_type pair.
				2907	* It is just used by __blk_mq_update_nr_hw_queues to cache
				2908	* the elevator_type associated with a request_queue.
				2909	*/
				2910	struct blk_mq_qe_pair {
				2911	struct list_head node;
				2912	struct request_queue *q;
				2913	struct elevator_type *type;
				2914	};
				2915
				2916	/*
				2917	* Cache the elevator_type in qe pair list and switch the
				2918	* io scheduler to 'none'
				2919	*/
				2920	static bool blk_mq_elv_switch_none(struct list_head *head,
				2921	struct request_queue *q)
				2922	{
				2923	struct blk_mq_qe_pair *qe;
				2924
				2925	if (!q->elevator)
				2926	return true;
				2927
				2928	qe = kmalloc(sizeof(*qe), GFP_NOIO \| __GFP_NOWARN \| __GFP_NORETRY);
				2929	if (!qe)
				2930	return false;
				2931
				2932	INIT_LIST_HEAD(&qe->node);
				2933	qe->q = q;
				2934	qe->type = q->elevator->type;
				2935	list_add(&qe->node, head);
				2936
				2937	mutex_lock(&q->sysfs_lock);
				2938	/*
				2939	* After elevator_switch_mq, the previous elevator_queue will be
				2940	* released by elevator_release. The reference of the io scheduler
				2941	* module get by elevator_get will also be put. So we need to get
				2942	* a reference of the io scheduler module here to prevent it to be
				2943	* removed.
				2944	*/
				2945	__module_get(qe->type->elevator_owner);
				2946	elevator_switch_mq(q, NULL);
				2947	mutex_unlock(&q->sysfs_lock);
				2948
				2949	return true;
				2950	}
				2951
				2952	static void blk_mq_elv_switch_back(struct list_head *head,
				2953	struct request_queue *q)
				2954	{
				2955	struct blk_mq_qe_pair *qe;
				2956	struct elevator_type *t = NULL;
				2957
				2958	list_for_each_entry(qe, head, node)
				2959	if (qe->q == q) {
				2960	t = qe->type;
				2961	break;
				2962	}
				2963
				2964	if (!t)
				2965	return;
				2966
				2967	list_del(&qe->node);
				2968	kfree(qe);
				2969
				2970	mutex_lock(&q->sysfs_lock);
				2971	elevator_switch_mq(q, t);
				2972	mutex_unlock(&q->sysfs_lock);
				2973	}
				2974
				2975	static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
				2976	int nr_hw_queues)
				2977	{
				2978	struct request_queue *q;
				2979	LIST_HEAD(head);
				2980
				2981	lockdep_assert_held(&set->tag_list_lock);
				2982
				2983	if (nr_hw_queues > nr_cpu_ids)
				2984	nr_hw_queues = nr_cpu_ids;
				2985	if (nr_hw_queues < 1 \|\| nr_hw_queues == set->nr_hw_queues)
				2986	return;
				2987
				2988	list_for_each_entry(q, &set->tag_list, tag_set_list)
				2989	blk_mq_freeze_queue(q);
				2990	/*
				2991	* Sync with blk_mq_queue_tag_busy_iter.
				2992	*/
				2993	synchronize_rcu();
				2994	/*
				2995	* Switch IO scheduler to 'none', cleaning up the data associated
				2996	* with the previous scheduler. We will switch back once we are done
				2997	* updating the new sw to hw queue mappings.
				2998	*/
				2999	list_for_each_entry(q, &set->tag_list, tag_set_list)
				3000	if (!blk_mq_elv_switch_none(&head, q))
				3001	goto switch_back;
				3002
				3003	set->nr_hw_queues = nr_hw_queues;
				3004	blk_mq_update_queue_map(set);
				3005	list_for_each_entry(q, &set->tag_list, tag_set_list) {
				3006	blk_mq_realloc_hw_ctxs(set, q);
				3007	blk_mq_queue_reinit(q);
				3008	}
				3009
				3010	switch_back:
				3011	list_for_each_entry(q, &set->tag_list, tag_set_list)
				3012	blk_mq_elv_switch_back(&head, q);
				3013
				3014	list_for_each_entry(q, &set->tag_list, tag_set_list)
				3015	blk_mq_unfreeze_queue(q);
				3016	}
				3017
				3018	void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
				3019	{
				3020	mutex_lock(&set->tag_list_lock);
				3021	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
				3022	mutex_unlock(&set->tag_list_lock);
				3023	}
				3024	EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
				3025
				3026	/* Enable polling stats and return whether they were already enabled. */
				3027	static bool blk_poll_stats_enable(struct request_queue *q)
				3028	{
				3029	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) \|\|
				3030	blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
				3031	return true;
				3032	blk_stat_add_callback(q, q->poll_cb);
				3033	return false;
				3034	}
				3035
				3036	static void blk_mq_poll_stats_start(struct request_queue *q)
				3037	{
				3038	/*
				3039	* We don't arm the callback if polling stats are not enabled or the
				3040	* callback is already active.
				3041	*/
				3042	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) \|\|
				3043	blk_stat_is_active(q->poll_cb))
				3044	return;
				3045
				3046	blk_stat_activate_msecs(q->poll_cb, 100);
				3047	}
				3048
				3049	static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
				3050	{
				3051	struct request_queue *q = cb->data;
				3052	int bucket;
				3053
				3054	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
				3055	if (cb->stat[bucket].nr_samples)
				3056	q->poll_stat[bucket] = cb->stat[bucket];
				3057	}
				3058	}
				3059
				3060	static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
				3061	struct blk_mq_hw_ctx *hctx,
				3062	struct request *rq)
				3063	{
				3064	unsigned long ret = 0;
				3065	int bucket;
				3066
				3067	/*
				3068	* If stats collection isn't on, don't sleep but turn it on for
				3069	* future users
				3070	*/
				3071	if (!blk_poll_stats_enable(q))
				3072	return 0;
				3073
				3074	/*
				3075	* As an optimistic guess, use half of the mean service time
				3076	* for this type of request. We can (and should) make this smarter.
				3077	* For instance, if the completion latencies are tight, we can
				3078	* get closer than just half the mean. This is especially
				3079	* important on devices where the completion latencies are longer
				3080	* than ~10 usec. We do use the stats for the relevant IO size
				3081	* if available which does lead to better estimates.
				3082	*/
				3083	bucket = blk_mq_poll_stats_bkt(rq);
				3084	if (bucket < 0)
				3085	return ret;
				3086
				3087	if (q->poll_stat[bucket].nr_samples)
				3088	ret = (q->poll_stat[bucket].mean + 1) / 2;
				3089
				3090	return ret;
				3091	}
				3092
				3093	static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
				3094	struct blk_mq_hw_ctx *hctx,
				3095	struct request *rq)
				3096	{
				3097	struct hrtimer_sleeper hs;
				3098	enum hrtimer_mode mode;
				3099	unsigned int nsecs;
				3100	ktime_t kt;
				3101
				3102	if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
				3103	return false;
				3104
				3105	/*
				3106	* poll_nsec can be:
				3107	*
				3108	* -1: don't ever hybrid sleep
				3109	* 0: use half of prev avg
				3110	* >0: use this specific value
				3111	*/
				3112	if (q->poll_nsec == -1)
				3113	return false;
				3114	else if (q->poll_nsec > 0)
				3115	nsecs = q->poll_nsec;
				3116	else
				3117	nsecs = blk_mq_poll_nsecs(q, hctx, rq);
				3118
				3119	if (!nsecs)
				3120	return false;
				3121
				3122	rq->rq_flags \|= RQF_MQ_POLL_SLEPT;
				3123
				3124	/*
				3125	* This will be replaced with the stats tracking code, using
				3126	* 'avg_completion_time / 2' as the pre-sleep target.
				3127	*/
				3128	kt = nsecs;
				3129
				3130	mode = HRTIMER_MODE_REL;
				3131	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
				3132	hrtimer_set_expires(&hs.timer, kt);
				3133
				3134	hrtimer_init_sleeper(&hs, current);
				3135	do {
				3136	if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
				3137	break;
				3138	set_current_state(TASK_UNINTERRUPTIBLE);
				3139	hrtimer_start_expires(&hs.timer, mode);
				3140	if (hs.task)
				3141	io_schedule();
				3142	hrtimer_cancel(&hs.timer);
				3143	mode = HRTIMER_MODE_ABS;
				3144	} while (hs.task && !signal_pending(current));
				3145
				3146	__set_current_state(TASK_RUNNING);
				3147	destroy_hrtimer_on_stack(&hs.timer);
				3148	return true;
				3149	}
				3150
				3151	static bool __blk_mq_poll(struct blk_mq_hw_ctx hctx, struct request rq)
				3152	{
				3153	struct request_queue *q = hctx->queue;
				3154	long state;
				3155
				3156	/*
				3157	* If we sleep, have the caller restart the poll loop to reset
				3158	* the state. Like for the other success return cases, the
				3159	* caller is responsible for checking if the IO completed. If
				3160	* the IO isn't complete, we'll get called again and will go
				3161	* straight to the busy poll loop.
				3162	*/
				3163	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
				3164	return true;
				3165
				3166	hctx->poll_considered++;
				3167
				3168	state = current->state;
				3169	while (!need_resched()) {
				3170	int ret;
				3171
				3172	hctx->poll_invoked++;
				3173
				3174	ret = q->mq_ops->poll(hctx, rq->tag);
				3175	if (ret > 0) {
				3176	hctx->poll_success++;
				3177	set_current_state(TASK_RUNNING);
				3178	return true;
				3179	}
				3180
				3181	if (signal_pending_state(state, current))
				3182	set_current_state(TASK_RUNNING);
				3183
				3184	if (current->state == TASK_RUNNING)
				3185	return true;
				3186	if (ret < 0)
				3187	break;
				3188	cpu_relax();
				3189	}
				3190
				3191	__set_current_state(TASK_RUNNING);
				3192	return false;
				3193	}
				3194
				3195	static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
				3196	{
				3197	struct blk_mq_hw_ctx *hctx;
				3198	struct request *rq;
				3199
				3200	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
				3201	return false;
				3202
				3203	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
				3204	if (!blk_qc_t_is_internal(cookie))
				3205	rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
				3206	else {
				3207	rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
				3208	/*
				3209	* With scheduling, if the request has completed, we'll
				3210	* get a NULL return here, as we clear the sched tag when
				3211	* that happens. The request still remains valid, like always,
				3212	* so we should be safe with just the NULL check.
				3213	*/
				3214	if (!rq)
				3215	return false;
				3216	}
				3217
				3218	return __blk_mq_poll(hctx, rq);
				3219	}
				3220
				3221	static int __init blk_mq_init(void)
				3222	{
				3223	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
				3224	blk_mq_hctx_notify_dead);
				3225	return 0;
				3226	}
				3227	subsys_initcall(blk_mq_init);