Blame - src/kernel/linux/v4.19/block/cfq-iosched.c - T800

blob: 9ad521195376d123ea26f0de3c1f1f26cf4895e6 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* CFQ, or complete fairness queueing, disk scheduler.
				3	*
				4	* Based on ideas from a previously unfinished io
				5	* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
				6	*
				7	* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
				8	*/
				9	#include <linux/module.h>
				10	#include <linux/slab.h>
				11	#include <linux/sched/clock.h>
				12	#include <linux/blkdev.h>
				13	#include <linux/elevator.h>
				14	#include <linux/ktime.h>
				15	#include <linux/rbtree.h>
				16	#include <linux/ioprio.h>
				17	#include <linux/blktrace_api.h>
				18	#include <linux/blk-cgroup.h>
				19	#include "blk.h"
				20	#include "blk-wbt.h"
				21
				22	/*
				23	* tunables
				24	*/
				25	/* max queue in one round of service */
				26	static const int cfq_quantum = 8;
				27	static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
				28	/* maximum backwards seek, in KiB */
				29	static const int cfq_back_max = 16 * 1024;
				30	/* penalty of a backwards seek */
				31	static const int cfq_back_penalty = 2;
				32	static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
				33	static u64 cfq_slice_async = NSEC_PER_SEC / 25;
				34	static const int cfq_slice_async_rq = 2;
				35	static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
				36	static u64 cfq_group_idle = NSEC_PER_SEC / 125;
				37	static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
				38	static const int cfq_hist_divisor = 4;
				39
				40	/*
				41	* offset from end of queue service tree for idle class
				42	*/
				43	#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
				44	/* offset from end of group service tree under time slice mode */
				45	#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
				46	/* offset from end of group service under IOPS mode */
				47	#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
				48
				49	/*
				50	* below this threshold, we consider thinktime immediate
				51	*/
				52	#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
				53
				54	#define CFQ_SLICE_SCALE (5)
				55	#define CFQ_HW_QUEUE_MIN (5)
				56	#define CFQ_SERVICE_SHIFT 12
				57
				58	#define CFQQ_SEEK_THR (sector_t)(8 * 100)
				59	#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
				60	#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
				61	#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
				62
				63	#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
				64	#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
				65	#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
				66
				67	static struct kmem_cache *cfq_pool;
				68
				69	#define CFQ_PRIO_LISTS IOPRIO_BE_NR
				70	#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
				71	#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
				72
				73	#define sample_valid(samples) ((samples) > 80)
				74	#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
				75
				76	/* blkio-related constants */
				77	#define CFQ_WEIGHT_LEGACY_MIN 10
				78	#define CFQ_WEIGHT_LEGACY_DFL 500
				79	#define CFQ_WEIGHT_LEGACY_MAX 1000
				80
				81	struct cfq_ttime {
				82	u64 last_end_request;
				83
				84	u64 ttime_total;
				85	u64 ttime_mean;
				86	unsigned long ttime_samples;
				87	};
				88
				89	/*
				90	* Most of our rbtree usage is for sorting with min extraction, so
				91	* if we cache the leftmost node we don't have to walk down the tree
				92	* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
				93	* move this into the elevator for the rq sorting as well.
				94	*/
				95	struct cfq_rb_root {
				96	struct rb_root_cached rb;
				97	struct rb_node *rb_rightmost;
				98	unsigned count;
				99	u64 min_vdisktime;
				100	struct cfq_ttime ttime;
				101	};
				102	#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
				103	.rb_rightmost = NULL, \
				104	.ttime = {.last_end_request = ktime_get_ns(),},}
				105
				106	/*
				107	* Per process-grouping structure
				108	*/
				109	struct cfq_queue {
				110	/* reference count */
				111	int ref;
				112	/* various state flags, see below */
				113	unsigned int flags;
				114	/* parent cfq_data */
				115	struct cfq_data *cfqd;
				116	/* service_tree member */
				117	struct rb_node rb_node;
				118	/* service_tree key */
				119	u64 rb_key;
				120	/* prio tree member */
				121	struct rb_node p_node;
				122	/* prio tree root we belong to, if any */
				123	struct rb_root *p_root;
				124	/* sorted list of pending requests */
				125	struct rb_root sort_list;
				126	/* if fifo isn't expired, next request to serve */
				127	struct request *next_rq;
				128	/* requests queued in sort_list */
				129	int queued[2];
				130	/* currently allocated requests */
				131	int allocated[2];
				132	/* fifo list of requests in sort_list */
				133	struct list_head fifo;
				134
				135	/* time when queue got scheduled in to dispatch first request. */
				136	u64 dispatch_start;
				137	u64 allocated_slice;
				138	u64 slice_dispatch;
				139	/* time when first request from queue completed and slice started. */
				140	u64 slice_start;
				141	u64 slice_end;
				142	s64 slice_resid;
				143
				144	/* pending priority requests */
				145	int prio_pending;
				146	/* number of requests that are on the dispatch list or inside driver */
				147	int dispatched;
				148
				149	/* io prio of this group */
				150	unsigned short ioprio, org_ioprio;
				151	unsigned short ioprio_class, org_ioprio_class;
				152
				153	pid_t pid;
				154
				155	u32 seek_history;
				156	sector_t last_request_pos;
				157
				158	struct cfq_rb_root *service_tree;
				159	struct cfq_queue *new_cfqq;
				160	struct cfq_group *cfqg;
				161	/* Number of sectors dispatched from queue in single dispatch round */
				162	unsigned long nr_sectors;
				163	};
				164
				165	/*
				166	* First index in the service_trees.
				167	* IDLE is handled separately, so it has negative index
				168	*/
				169	enum wl_class_t {
				170	BE_WORKLOAD = 0,
				171	RT_WORKLOAD = 1,
				172	IDLE_WORKLOAD = 2,
				173	CFQ_PRIO_NR,
				174	};
				175
				176	/*
				177	* Second index in the service_trees.
				178	*/
				179	enum wl_type_t {
				180	ASYNC_WORKLOAD = 0,
				181	SYNC_NOIDLE_WORKLOAD = 1,
				182	SYNC_WORKLOAD = 2
				183	};
				184
				185	struct cfqg_stats {
				186	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				187	/* number of ios merged */
				188	struct blkg_rwstat merged;
				189	/* total time spent on device in ns, may not be accurate w/ queueing */
				190	struct blkg_rwstat service_time;
				191	/* total time spent waiting in scheduler queue in ns */
				192	struct blkg_rwstat wait_time;
				193	/* number of IOs queued up */
				194	struct blkg_rwstat queued;
				195	/* total disk time and nr sectors dispatched by this group */
				196	struct blkg_stat time;
				197	#ifdef CONFIG_DEBUG_BLK_CGROUP
				198	/* time not charged to this cgroup */
				199	struct blkg_stat unaccounted_time;
				200	/* sum of number of ios queued across all samples */
				201	struct blkg_stat avg_queue_size_sum;
				202	/* count of samples taken for average */
				203	struct blkg_stat avg_queue_size_samples;
				204	/* how many times this group has been removed from service tree */
				205	struct blkg_stat dequeue;
				206	/* total time spent waiting for it to be assigned a timeslice. */
				207	struct blkg_stat group_wait_time;
				208	/* time spent idling for this blkcg_gq */
				209	struct blkg_stat idle_time;
				210	/* total time with empty current active q with other requests queued */
				211	struct blkg_stat empty_time;
				212	/* fields after this shouldn't be cleared on stat reset */
				213	u64 start_group_wait_time;
				214	u64 start_idle_time;
				215	u64 start_empty_time;
				216	uint16_t flags;
				217	#endif /* CONFIG_DEBUG_BLK_CGROUP */
				218	#endif /* CONFIG_CFQ_GROUP_IOSCHED */
				219	};
				220
				221	/* Per-cgroup data */
				222	struct cfq_group_data {
				223	/* must be the first member */
				224	struct blkcg_policy_data cpd;
				225
				226	unsigned int weight;
				227	unsigned int leaf_weight;
				228	u64 group_idle;
				229	};
				230
				231	/* This is per cgroup per device grouping structure */
				232	struct cfq_group {
				233	/* must be the first member */
				234	struct blkg_policy_data pd;
				235
				236	/* group service_tree member */
				237	struct rb_node rb_node;
				238
				239	/* group service_tree key */
				240	u64 vdisktime;
				241
				242	/*
				243	* The number of active cfqgs and sum of their weights under this
				244	* cfqg. This covers this cfqg's leaf_weight and all children's
				245	* weights, but does not cover weights of further descendants.
				246	*
				247	* If a cfqg is on the service tree, it's active. An active cfqg
				248	* also activates its parent and contributes to the children_weight
				249	* of the parent.
				250	*/
				251	int nr_active;
				252	unsigned int children_weight;
				253
				254	/*
				255	* vfraction is the fraction of vdisktime that the tasks in this
				256	* cfqg are entitled to. This is determined by compounding the
				257	* ratios walking up from this cfqg to the root.
				258	*
				259	* It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
				260	* vfractions on a service tree is approximately 1. The sum may
				261	* deviate a bit due to rounding errors and fluctuations caused by
				262	* cfqgs entering and leaving the service tree.
				263	*/
				264	unsigned int vfraction;
				265
				266	/*
				267	* There are two weights - (internal) weight is the weight of this
				268	* cfqg against the sibling cfqgs. leaf_weight is the wight of
				269	* this cfqg against the child cfqgs. For the root cfqg, both
				270	* weights are kept in sync for backward compatibility.
				271	*/
				272	unsigned int weight;
				273	unsigned int new_weight;
				274	unsigned int dev_weight;
				275
				276	unsigned int leaf_weight;
				277	unsigned int new_leaf_weight;
				278	unsigned int dev_leaf_weight;
				279
				280	/* number of cfqq currently on this group */
				281	int nr_cfqq;
				282
				283	/*
				284	* Per group busy queues average. Useful for workload slice calc. We
				285	* create the array for each prio class but at run time it is used
				286	* only for RT and BE class and slot for IDLE class remains unused.
				287	* This is primarily done to avoid confusion and a gcc warning.
				288	*/
				289	unsigned int busy_queues_avg[CFQ_PRIO_NR];
				290	/*
				291	* rr lists of queues with requests. We maintain service trees for
				292	* RT and BE classes. These trees are subdivided in subclasses
				293	* of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
				294	* class there is no subclassification and all the cfq queues go on
				295	* a single tree service_tree_idle.
				296	* Counts are embedded in the cfq_rb_root
				297	*/
				298	struct cfq_rb_root service_trees[2][3];
				299	struct cfq_rb_root service_tree_idle;
				300
				301	u64 saved_wl_slice;
				302	enum wl_type_t saved_wl_type;
				303	enum wl_class_t saved_wl_class;
				304
				305	/* number of requests that are on the dispatch list or inside driver */
				306	int dispatched;
				307	struct cfq_ttime ttime;
				308	struct cfqg_stats stats; /* stats for this cfqg */
				309
				310	/* async queue for each priority case */
				311	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
				312	struct cfq_queue *async_idle_cfqq;
				313
				314	u64 group_idle;
				315	};
				316
				317	struct cfq_io_cq {
				318	struct io_cq icq; /* must be the first member */
				319	struct cfq_queue *cfqq[2];
				320	struct cfq_ttime ttime;
				321	int ioprio; /* the current ioprio */
				322	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				323	uint64_t blkcg_serial_nr; /* the current blkcg serial */
				324	#endif
				325	};
				326
				327	/*
				328	* Per block device queue structure
				329	*/
				330	struct cfq_data {
				331	struct request_queue *queue;
				332	/* Root service tree for cfq_groups */
				333	struct cfq_rb_root grp_service_tree;
				334	struct cfq_group *root_group;
				335
				336	/*
				337	* The priority currently being served
				338	*/
				339	enum wl_class_t serving_wl_class;
				340	enum wl_type_t serving_wl_type;
				341	u64 workload_expires;
				342	struct cfq_group *serving_group;
				343
				344	/*
				345	* Each priority tree is sorted by next_request position. These
				346	* trees are used when determining if two or more queues are
				347	* interleaving requests (see cfq_close_cooperator).
				348	*/
				349	struct rb_root prio_trees[CFQ_PRIO_LISTS];
				350
				351	unsigned int busy_queues;
				352	unsigned int busy_sync_queues;
				353
				354	int rq_in_driver;
				355	int rq_in_flight[2];
				356
				357	/*
				358	* queue-depth detection
				359	*/
				360	int rq_queued;
				361	int hw_tag;
				362	/*
				363	* hw_tag can be
				364	* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
				365	* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
				366	* 0 => no NCQ
				367	*/
				368	int hw_tag_est_depth;
				369	unsigned int hw_tag_samples;
				370
				371	/*
				372	* idle window management
				373	*/
				374	struct hrtimer idle_slice_timer;
				375	struct work_struct unplug_work;
				376
				377	struct cfq_queue *active_queue;
				378	struct cfq_io_cq *active_cic;
				379
				380	sector_t last_position;
				381
				382	/*
				383	* tunables, see top of file
				384	*/
				385	unsigned int cfq_quantum;
				386	unsigned int cfq_back_penalty;
				387	unsigned int cfq_back_max;
				388	unsigned int cfq_slice_async_rq;
				389	unsigned int cfq_latency;
				390	u64 cfq_fifo_expire[2];
				391	u64 cfq_slice[2];
				392	u64 cfq_slice_idle;
				393	u64 cfq_group_idle;
				394	u64 cfq_target_latency;
				395
				396	/*
				397	* Fallback dummy cfqq for extreme OOM conditions
				398	*/
				399	struct cfq_queue oom_cfqq;
				400
				401	u64 last_delayed_sync;
				402	};
				403
				404	static struct cfq_group cfq_get_next_cfqg(struct cfq_data cfqd);
				405	static void cfq_put_queue(struct cfq_queue *cfqq);
				406
				407	static struct cfq_rb_root st_for(struct cfq_group cfqg,
				408	enum wl_class_t class,
				409	enum wl_type_t type)
				410	{
				411	if (!cfqg)
				412	return NULL;
				413
				414	if (class == IDLE_WORKLOAD)
				415	return &cfqg->service_tree_idle;
				416
				417	return &cfqg->service_trees[class][type];
				418	}
				419
				420	enum cfqq_state_flags {
				421	CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
				422	CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
				423	CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
				424	CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
				425	CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
				426	CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
				427	CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
				428	CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
				429	CFQ_CFQQ_FLAG_sync, /* synchronous queue */
				430	CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
				431	CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
				432	CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
				433	CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
				434	};
				435
				436	#define CFQ_CFQQ_FNS(name) \
				437	static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
				438	{ \
				439	(cfqq)->flags \|= (1 << CFQ_CFQQ_FLAG_##name); \
				440	} \
				441	static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
				442	{ \
				443	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
				444	} \
				445	static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
				446	{ \
				447	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
				448	}
				449
				450	CFQ_CFQQ_FNS(on_rr);
				451	CFQ_CFQQ_FNS(wait_request);
				452	CFQ_CFQQ_FNS(must_dispatch);
				453	CFQ_CFQQ_FNS(must_alloc_slice);
				454	CFQ_CFQQ_FNS(fifo_expire);
				455	CFQ_CFQQ_FNS(idle_window);
				456	CFQ_CFQQ_FNS(prio_changed);
				457	CFQ_CFQQ_FNS(slice_new);
				458	CFQ_CFQQ_FNS(sync);
				459	CFQ_CFQQ_FNS(coop);
				460	CFQ_CFQQ_FNS(split_coop);
				461	CFQ_CFQQ_FNS(deep);
				462	CFQ_CFQQ_FNS(wait_busy);
				463	#undef CFQ_CFQQ_FNS
				464
				465	#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
				466
				467	/* cfqg stats flags */
				468	enum cfqg_stats_flags {
				469	CFQG_stats_waiting = 0,
				470	CFQG_stats_idling,
				471	CFQG_stats_empty,
				472	};
				473
				474	#define CFQG_FLAG_FNS(name) \
				475	static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
				476	{ \
				477	stats->flags \|= (1 << CFQG_stats_##name); \
				478	} \
				479	static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
				480	{ \
				481	stats->flags &= ~(1 << CFQG_stats_##name); \
				482	} \
				483	static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
				484	{ \
				485	return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
				486	} \
				487
				488	CFQG_FLAG_FNS(waiting)
				489	CFQG_FLAG_FNS(idling)
				490	CFQG_FLAG_FNS(empty)
				491	#undef CFQG_FLAG_FNS
				492
				493	/* This should be called with the queue_lock held. */
				494	static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
				495	{
				496	u64 now;
				497
				498	if (!cfqg_stats_waiting(stats))
				499	return;
				500
				501	now = ktime_get_ns();
				502	if (now > stats->start_group_wait_time)
				503	blkg_stat_add(&stats->group_wait_time,
				504	now - stats->start_group_wait_time);
				505	cfqg_stats_clear_waiting(stats);
				506	}
				507
				508	/* This should be called with the queue_lock held. */
				509	static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
				510	struct cfq_group *curr_cfqg)
				511	{
				512	struct cfqg_stats *stats = &cfqg->stats;
				513
				514	if (cfqg_stats_waiting(stats))
				515	return;
				516	if (cfqg == curr_cfqg)
				517	return;
				518	stats->start_group_wait_time = ktime_get_ns();
				519	cfqg_stats_mark_waiting(stats);
				520	}
				521
				522	/* This should be called with the queue_lock held. */
				523	static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
				524	{
				525	u64 now;
				526
				527	if (!cfqg_stats_empty(stats))
				528	return;
				529
				530	now = ktime_get_ns();
				531	if (now > stats->start_empty_time)
				532	blkg_stat_add(&stats->empty_time,
				533	now - stats->start_empty_time);
				534	cfqg_stats_clear_empty(stats);
				535	}
				536
				537	static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
				538	{
				539	blkg_stat_add(&cfqg->stats.dequeue, 1);
				540	}
				541
				542	static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
				543	{
				544	struct cfqg_stats *stats = &cfqg->stats;
				545
				546	if (blkg_rwstat_total(&stats->queued))
				547	return;
				548
				549	/*
				550	* group is already marked empty. This can happen if cfqq got new
				551	* request in parent group and moved to this group while being added
				552	* to service tree. Just ignore the event and move on.
				553	*/
				554	if (cfqg_stats_empty(stats))
				555	return;
				556
				557	stats->start_empty_time = ktime_get_ns();
				558	cfqg_stats_mark_empty(stats);
				559	}
				560
				561	static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
				562	{
				563	struct cfqg_stats *stats = &cfqg->stats;
				564
				565	if (cfqg_stats_idling(stats)) {
				566	u64 now = ktime_get_ns();
				567
				568	if (now > stats->start_idle_time)
				569	blkg_stat_add(&stats->idle_time,
				570	now - stats->start_idle_time);
				571	cfqg_stats_clear_idling(stats);
				572	}
				573	}
				574
				575	static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
				576	{
				577	struct cfqg_stats *stats = &cfqg->stats;
				578
				579	BUG_ON(cfqg_stats_idling(stats));
				580
				581	stats->start_idle_time = ktime_get_ns();
				582	cfqg_stats_mark_idling(stats);
				583	}
				584
				585	static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
				586	{
				587	struct cfqg_stats *stats = &cfqg->stats;
				588
				589	blkg_stat_add(&stats->avg_queue_size_sum,
				590	blkg_rwstat_total(&stats->queued));
				591	blkg_stat_add(&stats->avg_queue_size_samples, 1);
				592	cfqg_stats_update_group_wait_time(stats);
				593	}
				594
				595	#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
				596
				597	static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group cfqg, struct cfq_group curr_cfqg) { }
				598	static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
				599	static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
				600	static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
				601	static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
				602	static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
				603	static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
				604
				605	#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
				606
				607	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				608
				609	static inline struct cfq_group pd_to_cfqg(struct blkg_policy_data pd)
				610	{
				611	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
				612	}
				613
				614	static struct cfq_group_data
				615	cpd_to_cfqgd(struct blkcg_policy_data cpd)
				616	{
				617	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
				618	}
				619
				620	static inline struct blkcg_gq cfqg_to_blkg(struct cfq_group cfqg)
				621	{
				622	return pd_to_blkg(&cfqg->pd);
				623	}
				624
				625	static struct blkcg_policy blkcg_policy_cfq;
				626
				627	static inline struct cfq_group blkg_to_cfqg(struct blkcg_gq blkg)
				628	{
				629	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
				630	}
				631
				632	static struct cfq_group_data blkcg_to_cfqgd(struct blkcg blkcg)
				633	{
				634	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
				635	}
				636
				637	static inline struct cfq_group cfqg_parent(struct cfq_group cfqg)
				638	{
				639	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
				640
				641	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
				642	}
				643
				644	static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				645	struct cfq_group *ancestor)
				646	{
				647	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
				648	cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
				649	}
				650
				651	static inline void cfqg_get(struct cfq_group *cfqg)
				652	{
				653	return blkg_get(cfqg_to_blkg(cfqg));
				654	}
				655
				656	static inline void cfqg_put(struct cfq_group *cfqg)
				657	{
				658	return blkg_put(cfqg_to_blkg(cfqg));
				659	}
				660
				661	#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
				662	blk_add_cgroup_trace_msg((cfqd)->queue, \
				663	cfqg_to_blkg((cfqq)->cfqg)->blkcg, \
				664	"cfq%d%c%c " fmt, (cfqq)->pid, \
				665	cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
				666	cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				667	##args); \
				668	} while (0)
				669
				670	#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
				671	blk_add_cgroup_trace_msg((cfqd)->queue, \
				672	cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \
				673	} while (0)
				674
				675	static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
				676	struct cfq_group *curr_cfqg,
				677	unsigned int op)
				678	{
				679	blkg_rwstat_add(&cfqg->stats.queued, op, 1);
				680	cfqg_stats_end_empty_time(&cfqg->stats);
				681	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
				682	}
				683
				684	static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
				685	uint64_t time, unsigned long unaccounted_time)
				686	{
				687	blkg_stat_add(&cfqg->stats.time, time);
				688	#ifdef CONFIG_DEBUG_BLK_CGROUP
				689	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
				690	#endif
				691	}
				692
				693	static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
				694	unsigned int op)
				695	{
				696	blkg_rwstat_add(&cfqg->stats.queued, op, -1);
				697	}
				698
				699	static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
				700	unsigned int op)
				701	{
				702	blkg_rwstat_add(&cfqg->stats.merged, op, 1);
				703	}
				704
				705	static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
				706	u64 start_time_ns,
				707	u64 io_start_time_ns,
				708	unsigned int op)
				709	{
				710	struct cfqg_stats *stats = &cfqg->stats;
				711	u64 now = ktime_get_ns();
				712
				713	if (now > io_start_time_ns)
				714	blkg_rwstat_add(&stats->service_time, op,
				715	now - io_start_time_ns);
				716	if (io_start_time_ns > start_time_ns)
				717	blkg_rwstat_add(&stats->wait_time, op,
				718	io_start_time_ns - start_time_ns);
				719	}
				720
				721	/* @stats = 0 */
				722	static void cfqg_stats_reset(struct cfqg_stats *stats)
				723	{
				724	/* queued stats shouldn't be cleared */
				725	blkg_rwstat_reset(&stats->merged);
				726	blkg_rwstat_reset(&stats->service_time);
				727	blkg_rwstat_reset(&stats->wait_time);
				728	blkg_stat_reset(&stats->time);
				729	#ifdef CONFIG_DEBUG_BLK_CGROUP
				730	blkg_stat_reset(&stats->unaccounted_time);
				731	blkg_stat_reset(&stats->avg_queue_size_sum);
				732	blkg_stat_reset(&stats->avg_queue_size_samples);
				733	blkg_stat_reset(&stats->dequeue);
				734	blkg_stat_reset(&stats->group_wait_time);
				735	blkg_stat_reset(&stats->idle_time);
				736	blkg_stat_reset(&stats->empty_time);
				737	#endif
				738	}
				739
				740	/* @to += @from */
				741	static void cfqg_stats_add_aux(struct cfqg_stats to, struct cfqg_stats from)
				742	{
				743	/* queued stats shouldn't be cleared */
				744	blkg_rwstat_add_aux(&to->merged, &from->merged);
				745	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
				746	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
				747	blkg_stat_add_aux(&from->time, &from->time);
				748	#ifdef CONFIG_DEBUG_BLK_CGROUP
				749	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
				750	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
				751	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
				752	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
				753	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
				754	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
				755	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
				756	#endif
				757	}
				758
				759	/*
				760	* Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
				761	* recursive stats can still account for the amount used by this cfqg after
				762	* it's gone.
				763	*/
				764	static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
				765	{
				766	struct cfq_group *parent = cfqg_parent(cfqg);
				767
				768	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
				769
				770	if (unlikely(!parent))
				771	return;
				772
				773	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
				774	cfqg_stats_reset(&cfqg->stats);
				775	}
				776
				777	#else /* CONFIG_CFQ_GROUP_IOSCHED */
				778
				779	static inline struct cfq_group cfqg_parent(struct cfq_group cfqg) { return NULL; }
				780	static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
				781	struct cfq_group *ancestor)
				782	{
				783	return true;
				784	}
				785	static inline void cfqg_get(struct cfq_group *cfqg) { }
				786	static inline void cfqg_put(struct cfq_group *cfqg) { }
				787
				788	#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
				789	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
				790	cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
				791	cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
				792	##args)
				793	#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
				794
				795	static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
				796	struct cfq_group *curr_cfqg, unsigned int op) { }
				797	static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
				798	uint64_t time, unsigned long unaccounted_time) { }
				799	static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
				800	unsigned int op) { }
				801	static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
				802	unsigned int op) { }
				803	static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
				804	u64 start_time_ns,
				805	u64 io_start_time_ns,
				806	unsigned int op) { }
				807
				808	#endif /* CONFIG_CFQ_GROUP_IOSCHED */
				809
				810	static inline u64 get_group_idle(struct cfq_data *cfqd)
				811	{
				812	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				813	struct cfq_queue *cfqq = cfqd->active_queue;
				814
				815	if (cfqq && cfqq->cfqg)
				816	return cfqq->cfqg->group_idle;
				817	#endif
				818	return cfqd->cfq_group_idle;
				819	}
				820
				821	#define cfq_log(cfqd, fmt, args...) \
				822	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
				823
				824	/* Traverses through cfq group service trees */
				825	#define for_each_cfqg_st(cfqg, i, j, st) \
				826	for (i = 0; i <= IDLE_WORKLOAD; i++) \
				827	for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
				828	: &cfqg->service_tree_idle; \
				829	(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) \|\| \
				830	(i == IDLE_WORKLOAD && j == 0); \
				831	j++, st = i < IDLE_WORKLOAD ? \
				832	&cfqg->service_trees[i][j]: NULL) \
				833
				834	static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
				835	struct cfq_ttime *ttime, bool group_idle)
				836	{
				837	u64 slice;
				838	if (!sample_valid(ttime->ttime_samples))
				839	return false;
				840	if (group_idle)
				841	slice = get_group_idle(cfqd);
				842	else
				843	slice = cfqd->cfq_slice_idle;
				844	return ttime->ttime_mean > slice;
				845	}
				846
				847	static inline bool iops_mode(struct cfq_data *cfqd)
				848	{
				849	/*
				850	* If we are not idling on queues and it is a NCQ drive, parallel
				851	* execution of requests is on and measuring time is not possible
				852	* in most of the cases until and unless we drive shallower queue
				853	* depths and that becomes a performance bottleneck. In such cases
				854	* switch to start providing fairness in terms of number of IOs.
				855	*/
				856	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
				857	return true;
				858	else
				859	return false;
				860	}
				861
				862	static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
				863	{
				864	if (cfq_class_idle(cfqq))
				865	return IDLE_WORKLOAD;
				866	if (cfq_class_rt(cfqq))
				867	return RT_WORKLOAD;
				868	return BE_WORKLOAD;
				869	}
				870
				871
				872	static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
				873	{
				874	if (!cfq_cfqq_sync(cfqq))
				875	return ASYNC_WORKLOAD;
				876	if (!cfq_cfqq_idle_window(cfqq))
				877	return SYNC_NOIDLE_WORKLOAD;
				878	return SYNC_WORKLOAD;
				879	}
				880
				881	static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
				882	struct cfq_data *cfqd,
				883	struct cfq_group *cfqg)
				884	{
				885	if (wl_class == IDLE_WORKLOAD)
				886	return cfqg->service_tree_idle.count;
				887
				888	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
				889	cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
				890	cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
				891	}
				892
				893	static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
				894	struct cfq_group *cfqg)
				895	{
				896	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
				897	cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
				898	}
				899
				900	static void cfq_dispatch_insert(struct request_queue , struct request );
				901	static struct cfq_queue cfq_get_queue(struct cfq_data cfqd, bool is_sync,
				902	struct cfq_io_cq cic, struct bio bio);
				903
				904	static inline struct cfq_io_cq icq_to_cic(struct io_cq icq)
				905	{
				906	/* cic->icq is the first member, %NULL will convert to %NULL */
				907	return container_of(icq, struct cfq_io_cq, icq);
				908	}
				909
				910	static inline struct cfq_io_cq cfq_cic_lookup(struct cfq_data cfqd,
				911	struct io_context *ioc)
				912	{
				913	if (ioc)
				914	return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
				915	return NULL;
				916	}
				917
				918	static inline struct cfq_queue cic_to_cfqq(struct cfq_io_cq cic, bool is_sync)
				919	{
				920	return cic->cfqq[is_sync];
				921	}
				922
				923	static inline void cic_set_cfqq(struct cfq_io_cq cic, struct cfq_queue cfqq,
				924	bool is_sync)
				925	{
				926	cic->cfqq[is_sync] = cfqq;
				927	}
				928
				929	static inline struct cfq_data cic_to_cfqd(struct cfq_io_cq cic)
				930	{
				931	return cic->icq.q->elevator->elevator_data;
				932	}
				933
				934	/*
				935	* scheduler run of queue, if there are requests pending and no one in the
				936	* driver that will restart queueing
				937	*/
				938	static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
				939	{
				940	if (cfqd->busy_queues) {
				941	cfq_log(cfqd, "schedule dispatch");
				942	kblockd_schedule_work(&cfqd->unplug_work);
				943	}
				944	}
				945
				946	/*
				947	* Scale schedule slice based on io priority. Use the sync time slice only
				948	* if a queue is marked sync and has sync io queued. A sync queue with async
				949	* io only, should not get full sync slice length.
				950	*/
				951	static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				952	unsigned short prio)
				953	{
				954	u64 base_slice = cfqd->cfq_slice[sync];
				955	u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
				956
				957	WARN_ON(prio >= IOPRIO_BE_NR);
				958
				959	return base_slice + (slice * (4 - prio));
				960	}
				961
				962	static inline u64
				963	cfq_prio_to_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
				964	{
				965	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
				966	}
				967
				968	/**
				969	* cfqg_scale_charge - scale disk time charge according to cfqg weight
				970	* @charge: disk time being charged
				971	* @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
				972	*
				973	* Scale @charge according to @vfraction, which is in range (0, 1]. The
				974	* scaling is inversely proportional.
				975	*
				976	* scaled = charge / vfraction
				977	*
				978	* The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
				979	*/
				980	static inline u64 cfqg_scale_charge(u64 charge,
				981	unsigned int vfraction)
				982	{
				983	u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
				984
				985	/* charge / vfraction */
				986	c <<= CFQ_SERVICE_SHIFT;
				987	return div_u64(c, vfraction);
				988	}
				989
				990	static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
				991	{
				992	s64 delta = (s64)(vdisktime - min_vdisktime);
				993	if (delta > 0)
				994	min_vdisktime = vdisktime;
				995
				996	return min_vdisktime;
				997	}
				998
				999	static void update_min_vdisktime(struct cfq_rb_root *st)
				1000	{
				1001	if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
				1002	struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
				1003
				1004	st->min_vdisktime = max_vdisktime(st->min_vdisktime,
				1005	cfqg->vdisktime);
				1006	}
				1007	}
				1008
				1009	/*
				1010	* get averaged number of queues of RT/BE priority.
				1011	* average is updated, with a formula that gives more weight to higher numbers,
				1012	* to quickly follows sudden increases and decrease slowly
				1013	*/
				1014
				1015	static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
				1016	struct cfq_group *cfqg, bool rt)
				1017	{
				1018	unsigned min_q, max_q;
				1019	unsigned mult = cfq_hist_divisor - 1;
				1020	unsigned round = cfq_hist_divisor / 2;
				1021	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
				1022
				1023	min_q = min(cfqg->busy_queues_avg[rt], busy);
				1024	max_q = max(cfqg->busy_queues_avg[rt], busy);
				1025	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
				1026	cfq_hist_divisor;
				1027	return cfqg->busy_queues_avg[rt];
				1028	}
				1029
				1030	static inline u64
				1031	cfq_group_slice(struct cfq_data cfqd, struct cfq_group cfqg)
				1032	{
				1033	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
				1034	}
				1035
				1036	static inline u64
				1037	cfq_scaled_cfqq_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
				1038	{
				1039	u64 slice = cfq_prio_to_slice(cfqd, cfqq);
				1040	if (cfqd->cfq_latency) {
				1041	/*
				1042	* interested queues (we consider only the ones with the same
				1043	* priority class in the cfq group)
				1044	*/
				1045	unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
				1046	cfq_class_rt(cfqq));
				1047	u64 sync_slice = cfqd->cfq_slice[1];
				1048	u64 expect_latency = sync_slice * iq;
				1049	u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
				1050
				1051	if (expect_latency > group_slice) {
				1052	u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
				1053	u64 low_slice;
				1054
				1055	/* scale low_slice according to IO priority
				1056	* and sync vs async */
				1057	low_slice = div64_u64(base_low_slice*slice, sync_slice);
				1058	low_slice = min(slice, low_slice);
				1059	/* the adapted slice value is scaled to fit all iqs
				1060	* into the target latency */
				1061	slice = div64_u64(slice*group_slice, expect_latency);
				1062	slice = max(slice, low_slice);
				1063	}
				1064	}
				1065	return slice;
				1066	}
				1067
				1068	static inline void
				1069	cfq_set_prio_slice(struct cfq_data cfqd, struct cfq_queue cfqq)
				1070	{
				1071	u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
				1072	u64 now = ktime_get_ns();
				1073
				1074	cfqq->slice_start = now;
				1075	cfqq->slice_end = now + slice;
				1076	cfqq->allocated_slice = slice;
				1077	cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
				1078	}
				1079
				1080	/*
				1081	* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
				1082	* isn't valid until the first request from the dispatch is activated
				1083	* and the slice time set.
				1084	*/
				1085	static inline bool cfq_slice_used(struct cfq_queue *cfqq)
				1086	{
				1087	if (cfq_cfqq_slice_new(cfqq))
				1088	return false;
				1089	if (ktime_get_ns() < cfqq->slice_end)
				1090	return false;
				1091
				1092	return true;
				1093	}
				1094
				1095	/*
				1096	* Lifted from AS - choose which of rq1 and rq2 that is best served now.
				1097	* We choose the request that is closest to the head right now. Distance
				1098	* behind the head is penalized and only allowed to a certain extent.
				1099	*/
				1100	static struct request *
				1101	cfq_choose_req(struct cfq_data cfqd, struct request rq1, struct request *rq2, sector_t last)
				1102	{
				1103	sector_t s1, s2, d1 = 0, d2 = 0;
				1104	unsigned long back_max;
				1105	#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
				1106	#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
				1107	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
				1108
				1109	if (rq1 == NULL \|\| rq1 == rq2)
				1110	return rq2;
				1111	if (rq2 == NULL)
				1112	return rq1;
				1113
				1114	if (rq_is_sync(rq1) != rq_is_sync(rq2))
				1115	return rq_is_sync(rq1) ? rq1 : rq2;
				1116
				1117	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
				1118	return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
				1119
				1120	s1 = blk_rq_pos(rq1);
				1121	s2 = blk_rq_pos(rq2);
				1122
				1123	/*
				1124	* by definition, 1KiB is 2 sectors
				1125	*/
				1126	back_max = cfqd->cfq_back_max * 2;
				1127
				1128	/*
				1129	* Strict one way elevator _except_ in the case where we allow
				1130	* short backward seeks which are biased as twice the cost of a
				1131	* similar forward seek.
				1132	*/
				1133	if (s1 >= last)
				1134	d1 = s1 - last;
				1135	else if (s1 + back_max >= last)
				1136	d1 = (last - s1) * cfqd->cfq_back_penalty;
				1137	else
				1138	wrap \|= CFQ_RQ1_WRAP;
				1139
				1140	if (s2 >= last)
				1141	d2 = s2 - last;
				1142	else if (s2 + back_max >= last)
				1143	d2 = (last - s2) * cfqd->cfq_back_penalty;
				1144	else
				1145	wrap \|= CFQ_RQ2_WRAP;
				1146
				1147	/* Found required data */
				1148
				1149	/*
				1150	* By doing switch() on the bit mask "wrap" we avoid having to
				1151	* check two variables for all permutations: --> faster!
				1152	*/
				1153	switch (wrap) {
				1154	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
				1155	if (d1 < d2)
				1156	return rq1;
				1157	else if (d2 < d1)
				1158	return rq2;
				1159	else {
				1160	if (s1 >= s2)
				1161	return rq1;
				1162	else
				1163	return rq2;
				1164	}
				1165
				1166	case CFQ_RQ2_WRAP:
				1167	return rq1;
				1168	case CFQ_RQ1_WRAP:
				1169	return rq2;
				1170	case (CFQ_RQ1_WRAP\|CFQ_RQ2_WRAP): /* both rqs wrapped */
				1171	default:
				1172	/*
				1173	* Since both rqs are wrapped,
				1174	* start with the one that's further behind head
				1175	* (--> only one back seek required),
				1176	* since back seek takes more time than forward.
				1177	*/
				1178	if (s1 <= s2)
				1179	return rq1;
				1180	else
				1181	return rq2;
				1182	}
				1183	}
				1184
				1185	static struct cfq_queue cfq_rb_first(struct cfq_rb_root root)
				1186	{
				1187	/* Service tree is empty */
				1188	if (!root->count)
				1189	return NULL;
				1190
				1191	return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
				1192	}
				1193
				1194	static struct cfq_group cfq_rb_first_group(struct cfq_rb_root root)
				1195	{
				1196	return rb_entry_cfqg(rb_first_cached(&root->rb));
				1197	}
				1198
				1199	static void cfq_rb_erase(struct rb_node n, struct cfq_rb_root root)
				1200	{
				1201	if (root->rb_rightmost == n)
				1202	root->rb_rightmost = rb_prev(n);
				1203
				1204	rb_erase_cached(n, &root->rb);
				1205	RB_CLEAR_NODE(n);
				1206
				1207	--root->count;
				1208	}
				1209
				1210	/*
				1211	* would be nice to take fifo expire time into account as well
				1212	*/
				1213	static struct request *
				1214	cfq_find_next_rq(struct cfq_data cfqd, struct cfq_queue cfqq,
				1215	struct request *last)
				1216	{
				1217	struct rb_node *rbnext = rb_next(&last->rb_node);
				1218	struct rb_node *rbprev = rb_prev(&last->rb_node);
				1219	struct request next = NULL, prev = NULL;
				1220
				1221	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
				1222
				1223	if (rbprev)
				1224	prev = rb_entry_rq(rbprev);
				1225
				1226	if (rbnext)
				1227	next = rb_entry_rq(rbnext);
				1228	else {
				1229	rbnext = rb_first(&cfqq->sort_list);
				1230	if (rbnext && rbnext != &last->rb_node)
				1231	next = rb_entry_rq(rbnext);
				1232	}
				1233
				1234	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
				1235	}
				1236
				1237	static u64 cfq_slice_offset(struct cfq_data *cfqd,
				1238	struct cfq_queue *cfqq)
				1239	{
				1240	/*
				1241	* just an approximation, should be ok.
				1242	*/
				1243	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
				1244	cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
				1245	}
				1246
				1247	static inline s64
				1248	cfqg_key(struct cfq_rb_root st, struct cfq_group cfqg)
				1249	{
				1250	return cfqg->vdisktime - st->min_vdisktime;
				1251	}
				1252
				1253	static void
				1254	__cfq_group_service_tree_add(struct cfq_rb_root st, struct cfq_group cfqg)
				1255	{
				1256	struct rb_node **node = &st->rb.rb_root.rb_node;
				1257	struct rb_node *parent = NULL;
				1258	struct cfq_group *__cfqg;
				1259	s64 key = cfqg_key(st, cfqg);
				1260	bool leftmost = true, rightmost = true;
				1261
				1262	while (*node != NULL) {
				1263	parent = *node;
				1264	__cfqg = rb_entry_cfqg(parent);
				1265
				1266	if (key < cfqg_key(st, __cfqg)) {
				1267	node = &parent->rb_left;
				1268	rightmost = false;
				1269	} else {
				1270	node = &parent->rb_right;
				1271	leftmost = false;
				1272	}
				1273	}
				1274
				1275	if (rightmost)
				1276	st->rb_rightmost = &cfqg->rb_node;
				1277
				1278	rb_link_node(&cfqg->rb_node, parent, node);
				1279	rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
				1280	}
				1281
				1282	/*
				1283	* This has to be called only on activation of cfqg
				1284	*/
				1285	static void
				1286	cfq_update_group_weight(struct cfq_group *cfqg)
				1287	{
				1288	if (cfqg->new_weight) {
				1289	cfqg->weight = cfqg->new_weight;
				1290	cfqg->new_weight = 0;
				1291	}
				1292	}
				1293
				1294	static void
				1295	cfq_update_group_leaf_weight(struct cfq_group *cfqg)
				1296	{
				1297	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
				1298
				1299	if (cfqg->new_leaf_weight) {
				1300	cfqg->leaf_weight = cfqg->new_leaf_weight;
				1301	cfqg->new_leaf_weight = 0;
				1302	}
				1303	}
				1304
				1305	static void
				1306	cfq_group_service_tree_add(struct cfq_rb_root st, struct cfq_group cfqg)
				1307	{
				1308	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
				1309	struct cfq_group *pos = cfqg;
				1310	struct cfq_group *parent;
				1311	bool propagate;
				1312
				1313	/* add to the service tree */
				1314	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
				1315
				1316	/*
				1317	* Update leaf_weight. We cannot update weight at this point
				1318	* because cfqg might already have been activated and is
				1319	* contributing its current weight to the parent's child_weight.
				1320	*/
				1321	cfq_update_group_leaf_weight(cfqg);
				1322	__cfq_group_service_tree_add(st, cfqg);
				1323
				1324	/*
				1325	* Activate @cfqg and calculate the portion of vfraction @cfqg is
				1326	* entitled to. vfraction is calculated by walking the tree
				1327	* towards the root calculating the fraction it has at each level.
				1328	* The compounded ratio is how much vfraction @cfqg owns.
				1329	*
				1330	* Start with the proportion tasks in this cfqg has against active
				1331	* children cfqgs - its leaf_weight against children_weight.
				1332	*/
				1333	propagate = !pos->nr_active++;
				1334	pos->children_weight += pos->leaf_weight;
				1335	vfr = vfr * pos->leaf_weight / pos->children_weight;
				1336
				1337	/*
				1338	* Compound ->weight walking up the tree. Both activation and
				1339	* vfraction calculation are done in the same loop. Propagation
				1340	* stops once an already activated node is met. vfraction
				1341	* calculation should always continue to the root.
				1342	*/
				1343	while ((parent = cfqg_parent(pos))) {
				1344	if (propagate) {
				1345	cfq_update_group_weight(pos);
				1346	propagate = !parent->nr_active++;
				1347	parent->children_weight += pos->weight;
				1348	}
				1349	vfr = vfr * pos->weight / parent->children_weight;
				1350	pos = parent;
				1351	}
				1352
				1353	cfqg->vfraction = max_t(unsigned, vfr, 1);
				1354	}
				1355
				1356	static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
				1357	{
				1358	if (!iops_mode(cfqd))
				1359	return CFQ_SLICE_MODE_GROUP_DELAY;
				1360	else
				1361	return CFQ_IOPS_MODE_GROUP_DELAY;
				1362	}
				1363
				1364	static void
				1365	cfq_group_notify_queue_add(struct cfq_data cfqd, struct cfq_group cfqg)
				1366	{
				1367	struct cfq_rb_root *st = &cfqd->grp_service_tree;
				1368	struct cfq_group *__cfqg;
				1369	struct rb_node *n;
				1370
				1371	cfqg->nr_cfqq++;
				1372	if (!RB_EMPTY_NODE(&cfqg->rb_node))
				1373	return;
				1374
				1375	/*
				1376	* Currently put the group at the end. Later implement something
				1377	* so that groups get lesser vtime based on their weights, so that
				1378	* if group does not loose all if it was not continuously backlogged.
				1379	*/
				1380	n = st->rb_rightmost;
				1381	if (n) {
				1382	__cfqg = rb_entry_cfqg(n);
				1383	cfqg->vdisktime = __cfqg->vdisktime +
				1384	cfq_get_cfqg_vdisktime_delay(cfqd);
				1385	} else
				1386	cfqg->vdisktime = st->min_vdisktime;
				1387	cfq_group_service_tree_add(st, cfqg);
				1388	}
				1389
				1390	static void
				1391	cfq_group_service_tree_del(struct cfq_rb_root st, struct cfq_group cfqg)
				1392	{
				1393	struct cfq_group *pos = cfqg;
				1394	bool propagate;
				1395
				1396	/*
				1397	* Undo activation from cfq_group_service_tree_add(). Deactivate
				1398	* @cfqg and propagate deactivation upwards.
				1399	*/
				1400	propagate = !--pos->nr_active;
				1401	pos->children_weight -= pos->leaf_weight;
				1402
				1403	while (propagate) {
				1404	struct cfq_group *parent = cfqg_parent(pos);
				1405
				1406	/* @pos has 0 nr_active at this point */
				1407	WARN_ON_ONCE(pos->children_weight);
				1408	pos->vfraction = 0;
				1409
				1410	if (!parent)
				1411	break;
				1412
				1413	propagate = !--parent->nr_active;
				1414	parent->children_weight -= pos->weight;
				1415	pos = parent;
				1416	}
				1417
				1418	/* remove from the service tree */
				1419	if (!RB_EMPTY_NODE(&cfqg->rb_node))
				1420	cfq_rb_erase(&cfqg->rb_node, st);
				1421	}
				1422
				1423	static void
				1424	cfq_group_notify_queue_del(struct cfq_data cfqd, struct cfq_group cfqg)
				1425	{
				1426	struct cfq_rb_root *st = &cfqd->grp_service_tree;
				1427
				1428	BUG_ON(cfqg->nr_cfqq < 1);
				1429	cfqg->nr_cfqq--;
				1430
				1431	/* If there are other cfq queues under this group, don't delete it */
				1432	if (cfqg->nr_cfqq)
				1433	return;
				1434
				1435	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
				1436	cfq_group_service_tree_del(st, cfqg);
				1437	cfqg->saved_wl_slice = 0;
				1438	cfqg_stats_update_dequeue(cfqg);
				1439	}
				1440
				1441	static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
				1442	u64 *unaccounted_time)
				1443	{
				1444	u64 slice_used;
				1445	u64 now = ktime_get_ns();
				1446
				1447	/*
				1448	* Queue got expired before even a single request completed or
				1449	* got expired immediately after first request completion.
				1450	*/
				1451	if (!cfqq->slice_start \|\| cfqq->slice_start == now) {
				1452	/*
				1453	* Also charge the seek time incurred to the group, otherwise
				1454	* if there are mutiple queues in the group, each can dispatch
				1455	* a single request on seeky media and cause lots of seek time
				1456	* and group will never know it.
				1457	*/
				1458	slice_used = max_t(u64, (now - cfqq->dispatch_start),
				1459	jiffies_to_nsecs(1));
				1460	} else {
				1461	slice_used = now - cfqq->slice_start;
				1462	if (slice_used > cfqq->allocated_slice) {
				1463	*unaccounted_time = slice_used - cfqq->allocated_slice;
				1464	slice_used = cfqq->allocated_slice;
				1465	}
				1466	if (cfqq->slice_start > cfqq->dispatch_start)
				1467	*unaccounted_time += cfqq->slice_start -
				1468	cfqq->dispatch_start;
				1469	}
				1470
				1471	return slice_used;
				1472	}
				1473
				1474	static void cfq_group_served(struct cfq_data cfqd, struct cfq_group cfqg,
				1475	struct cfq_queue *cfqq)
				1476	{
				1477	struct cfq_rb_root *st = &cfqd->grp_service_tree;
				1478	u64 used_sl, charge, unaccounted_sl = 0;
				1479	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
				1480	- cfqg->service_tree_idle.count;
				1481	unsigned int vfr;
				1482	u64 now = ktime_get_ns();
				1483
				1484	BUG_ON(nr_sync < 0);
				1485	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
				1486
				1487	if (iops_mode(cfqd))
				1488	charge = cfqq->slice_dispatch;
				1489	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
				1490	charge = cfqq->allocated_slice;
				1491
				1492	/*
				1493	* Can't update vdisktime while on service tree and cfqg->vfraction
				1494	* is valid only while on it. Cache vfr, leave the service tree,
				1495	* update vdisktime and go back on. The re-addition to the tree
				1496	* will also update the weights as necessary.
				1497	*/
				1498	vfr = cfqg->vfraction;
				1499	cfq_group_service_tree_del(st, cfqg);
				1500	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
				1501	cfq_group_service_tree_add(st, cfqg);
				1502
				1503	/* This group is being expired. Save the context */
				1504	if (cfqd->workload_expires > now) {
				1505	cfqg->saved_wl_slice = cfqd->workload_expires - now;
				1506	cfqg->saved_wl_type = cfqd->serving_wl_type;
				1507	cfqg->saved_wl_class = cfqd->serving_wl_class;
				1508	} else
				1509	cfqg->saved_wl_slice = 0;
				1510
				1511	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
				1512	st->min_vdisktime);
				1513	cfq_log_cfqq(cfqq->cfqd, cfqq,
				1514	"sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
				1515	used_sl, cfqq->slice_dispatch, charge,
				1516	iops_mode(cfqd), cfqq->nr_sectors);
				1517	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
				1518	cfqg_stats_set_start_empty_time(cfqg);
				1519	}
				1520
				1521	/**
				1522	* cfq_init_cfqg_base - initialize base part of a cfq_group
				1523	* @cfqg: cfq_group to initialize
				1524	*
				1525	* Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
				1526	* is enabled or not.
				1527	*/
				1528	static void cfq_init_cfqg_base(struct cfq_group *cfqg)
				1529	{
				1530	struct cfq_rb_root *st;
				1531	int i, j;
				1532
				1533	for_each_cfqg_st(cfqg, i, j, st)
				1534	*st = CFQ_RB_ROOT;
				1535	RB_CLEAR_NODE(&cfqg->rb_node);
				1536
				1537	cfqg->ttime.last_end_request = ktime_get_ns();
				1538	}
				1539
				1540	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				1541	static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
				1542	bool on_dfl, bool reset_dev, bool is_leaf_weight);
				1543
				1544	static void cfqg_stats_exit(struct cfqg_stats *stats)
				1545	{
				1546	blkg_rwstat_exit(&stats->merged);
				1547	blkg_rwstat_exit(&stats->service_time);
				1548	blkg_rwstat_exit(&stats->wait_time);
				1549	blkg_rwstat_exit(&stats->queued);
				1550	blkg_stat_exit(&stats->time);
				1551	#ifdef CONFIG_DEBUG_BLK_CGROUP
				1552	blkg_stat_exit(&stats->unaccounted_time);
				1553	blkg_stat_exit(&stats->avg_queue_size_sum);
				1554	blkg_stat_exit(&stats->avg_queue_size_samples);
				1555	blkg_stat_exit(&stats->dequeue);
				1556	blkg_stat_exit(&stats->group_wait_time);
				1557	blkg_stat_exit(&stats->idle_time);
				1558	blkg_stat_exit(&stats->empty_time);
				1559	#endif
				1560	}
				1561
				1562	static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
				1563	{
				1564	if (blkg_rwstat_init(&stats->merged, gfp) \|\|
				1565	blkg_rwstat_init(&stats->service_time, gfp) \|\|
				1566	blkg_rwstat_init(&stats->wait_time, gfp) \|\|
				1567	blkg_rwstat_init(&stats->queued, gfp) \|\|
				1568	blkg_stat_init(&stats->time, gfp))
				1569	goto err;
				1570
				1571	#ifdef CONFIG_DEBUG_BLK_CGROUP
				1572	if (blkg_stat_init(&stats->unaccounted_time, gfp) \|\|
				1573	blkg_stat_init(&stats->avg_queue_size_sum, gfp) \|\|
				1574	blkg_stat_init(&stats->avg_queue_size_samples, gfp) \|\|
				1575	blkg_stat_init(&stats->dequeue, gfp) \|\|
				1576	blkg_stat_init(&stats->group_wait_time, gfp) \|\|
				1577	blkg_stat_init(&stats->idle_time, gfp) \|\|
				1578	blkg_stat_init(&stats->empty_time, gfp))
				1579	goto err;
				1580	#endif
				1581	return 0;
				1582	err:
				1583	cfqg_stats_exit(stats);
				1584	return -ENOMEM;
				1585	}
				1586
				1587	static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
				1588	{
				1589	struct cfq_group_data *cgd;
				1590
				1591	cgd = kzalloc(sizeof(*cgd), gfp);
				1592	if (!cgd)
				1593	return NULL;
				1594	return &cgd->cpd;
				1595	}
				1596
				1597	static void cfq_cpd_init(struct blkcg_policy_data *cpd)
				1598	{
				1599	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
				1600	unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
				1601	CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
				1602
				1603	if (cpd_to_blkcg(cpd) == &blkcg_root)
				1604	weight *= 2;
				1605
				1606	cgd->weight = weight;
				1607	cgd->leaf_weight = weight;
				1608	cgd->group_idle = cfq_group_idle;
				1609	}
				1610
				1611	static void cfq_cpd_free(struct blkcg_policy_data *cpd)
				1612	{
				1613	kfree(cpd_to_cfqgd(cpd));
				1614	}
				1615
				1616	static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
				1617	{
				1618	struct blkcg *blkcg = cpd_to_blkcg(cpd);
				1619	bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
				1620	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
				1621
				1622	if (blkcg == &blkcg_root)
				1623	weight *= 2;
				1624
				1625	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
				1626	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
				1627	}
				1628
				1629	static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
				1630	{
				1631	struct cfq_group *cfqg;
				1632
				1633	cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
				1634	if (!cfqg)
				1635	return NULL;
				1636
				1637	cfq_init_cfqg_base(cfqg);
				1638	if (cfqg_stats_init(&cfqg->stats, gfp)) {
				1639	kfree(cfqg);
				1640	return NULL;
				1641	}
				1642
				1643	return &cfqg->pd;
				1644	}
				1645
				1646	static void cfq_pd_init(struct blkg_policy_data *pd)
				1647	{
				1648	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1649	struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
				1650
				1651	cfqg->weight = cgd->weight;
				1652	cfqg->leaf_weight = cgd->leaf_weight;
				1653	cfqg->group_idle = cgd->group_idle;
				1654	}
				1655
				1656	static void cfq_pd_offline(struct blkg_policy_data *pd)
				1657	{
				1658	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1659	int i;
				1660
				1661	for (i = 0; i < IOPRIO_BE_NR; i++) {
				1662	if (cfqg->async_cfqq[0][i])
				1663	cfq_put_queue(cfqg->async_cfqq[0][i]);
				1664	if (cfqg->async_cfqq[1][i])
				1665	cfq_put_queue(cfqg->async_cfqq[1][i]);
				1666	}
				1667
				1668	if (cfqg->async_idle_cfqq)
				1669	cfq_put_queue(cfqg->async_idle_cfqq);
				1670
				1671	/*
				1672	* @blkg is going offline and will be ignored by
				1673	* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
				1674	* that they don't get lost. If IOs complete after this point, the
				1675	* stats for them will be lost. Oh well...
				1676	*/
				1677	cfqg_stats_xfer_dead(cfqg);
				1678	}
				1679
				1680	static void cfq_pd_free(struct blkg_policy_data *pd)
				1681	{
				1682	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1683
				1684	cfqg_stats_exit(&cfqg->stats);
				1685	return kfree(cfqg);
				1686	}
				1687
				1688	static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
				1689	{
				1690	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1691
				1692	cfqg_stats_reset(&cfqg->stats);
				1693	}
				1694
				1695	static struct cfq_group cfq_lookup_cfqg(struct cfq_data cfqd,
				1696	struct blkcg *blkcg)
				1697	{
				1698	struct blkcg_gq *blkg;
				1699
				1700	blkg = blkg_lookup(blkcg, cfqd->queue);
				1701	if (likely(blkg))
				1702	return blkg_to_cfqg(blkg);
				1703	return NULL;
				1704	}
				1705
				1706	static void cfq_link_cfqq_cfqg(struct cfq_queue cfqq, struct cfq_group cfqg)
				1707	{
				1708	cfqq->cfqg = cfqg;
				1709	/* cfqq reference on cfqg */
				1710	cfqg_get(cfqg);
				1711	}
				1712
				1713	static u64 cfqg_prfill_weight_device(struct seq_file *sf,
				1714	struct blkg_policy_data *pd, int off)
				1715	{
				1716	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1717
				1718	if (!cfqg->dev_weight)
				1719	return 0;
				1720	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
				1721	}
				1722
				1723	static int cfqg_print_weight_device(struct seq_file sf, void v)
				1724	{
				1725	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				1726	cfqg_prfill_weight_device, &blkcg_policy_cfq,
				1727	0, false);
				1728	return 0;
				1729	}
				1730
				1731	static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
				1732	struct blkg_policy_data *pd, int off)
				1733	{
				1734	struct cfq_group *cfqg = pd_to_cfqg(pd);
				1735
				1736	if (!cfqg->dev_leaf_weight)
				1737	return 0;
				1738	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
				1739	}
				1740
				1741	static int cfqg_print_leaf_weight_device(struct seq_file sf, void v)
				1742	{
				1743	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				1744	cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
				1745	0, false);
				1746	return 0;
				1747	}
				1748
				1749	static int cfq_print_weight(struct seq_file sf, void v)
				1750	{
				1751	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				1752	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
				1753	unsigned int val = 0;
				1754
				1755	if (cgd)
				1756	val = cgd->weight;
				1757
				1758	seq_printf(sf, "%u\n", val);
				1759	return 0;
				1760	}
				1761
				1762	static int cfq_print_leaf_weight(struct seq_file sf, void v)
				1763	{
				1764	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				1765	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
				1766	unsigned int val = 0;
				1767
				1768	if (cgd)
				1769	val = cgd->leaf_weight;
				1770
				1771	seq_printf(sf, "%u\n", val);
				1772	return 0;
				1773	}
				1774
				1775	static int cfq_print_group_idle(struct seq_file sf, void v)
				1776	{
				1777	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				1778	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
				1779	u64 val = 0;
				1780
				1781	if (cgd)
				1782	val = cgd->group_idle;
				1783
				1784	seq_printf(sf, "%llu\n", div_u64(val, NSEC_PER_USEC));
				1785	return 0;
				1786	}
				1787
				1788	static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
				1789	char *buf, size_t nbytes, loff_t off,
				1790	bool on_dfl, bool is_leaf_weight)
				1791	{
				1792	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
				1793	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
				1794	struct blkcg *blkcg = css_to_blkcg(of_css(of));
				1795	struct blkg_conf_ctx ctx;
				1796	struct cfq_group *cfqg;
				1797	struct cfq_group_data *cfqgd;
				1798	int ret;
				1799	u64 v;
				1800
				1801	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
				1802	if (ret)
				1803	return ret;
				1804
				1805	if (sscanf(ctx.body, "%llu", &v) == 1) {
				1806	/* require "default" on dfl */
				1807	ret = -ERANGE;
				1808	if (!v && on_dfl)
				1809	goto out_finish;
				1810	} else if (!strcmp(strim(ctx.body), "default")) {
				1811	v = 0;
				1812	} else {
				1813	ret = -EINVAL;
				1814	goto out_finish;
				1815	}
				1816
				1817	cfqg = blkg_to_cfqg(ctx.blkg);
				1818	cfqgd = blkcg_to_cfqgd(blkcg);
				1819
				1820	ret = -ERANGE;
				1821	if (!v \|\| (v >= min && v <= max)) {
				1822	if (!is_leaf_weight) {
				1823	cfqg->dev_weight = v;
				1824	cfqg->new_weight = v ?: cfqgd->weight;
				1825	} else {
				1826	cfqg->dev_leaf_weight = v;
				1827	cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
				1828	}
				1829	ret = 0;
				1830	}
				1831	out_finish:
				1832	blkg_conf_finish(&ctx);
				1833	return ret ?: nbytes;
				1834	}
				1835
				1836	static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
				1837	char *buf, size_t nbytes, loff_t off)
				1838	{
				1839	return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
				1840	}
				1841
				1842	static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
				1843	char *buf, size_t nbytes, loff_t off)
				1844	{
				1845	return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
				1846	}
				1847
				1848	static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
				1849	bool on_dfl, bool reset_dev, bool is_leaf_weight)
				1850	{
				1851	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
				1852	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
				1853	struct blkcg *blkcg = css_to_blkcg(css);
				1854	struct blkcg_gq *blkg;
				1855	struct cfq_group_data *cfqgd;
				1856	int ret = 0;
				1857
				1858	if (val < min \|\| val > max)
				1859	return -ERANGE;
				1860
				1861	spin_lock_irq(&blkcg->lock);
				1862	cfqgd = blkcg_to_cfqgd(blkcg);
				1863	if (!cfqgd) {
				1864	ret = -EINVAL;
				1865	goto out;
				1866	}
				1867
				1868	if (!is_leaf_weight)
				1869	cfqgd->weight = val;
				1870	else
				1871	cfqgd->leaf_weight = val;
				1872
				1873	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				1874	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
				1875
				1876	if (!cfqg)
				1877	continue;
				1878
				1879	if (!is_leaf_weight) {
				1880	if (reset_dev)
				1881	cfqg->dev_weight = 0;
				1882	if (!cfqg->dev_weight)
				1883	cfqg->new_weight = cfqgd->weight;
				1884	} else {
				1885	if (reset_dev)
				1886	cfqg->dev_leaf_weight = 0;
				1887	if (!cfqg->dev_leaf_weight)
				1888	cfqg->new_leaf_weight = cfqgd->leaf_weight;
				1889	}
				1890	}
				1891
				1892	out:
				1893	spin_unlock_irq(&blkcg->lock);
				1894	return ret;
				1895	}
				1896
				1897	static int cfq_set_weight(struct cgroup_subsys_state css, struct cftype cft,
				1898	u64 val)
				1899	{
				1900	return __cfq_set_weight(css, val, false, false, false);
				1901	}
				1902
				1903	static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
				1904	struct cftype *cft, u64 val)
				1905	{
				1906	return __cfq_set_weight(css, val, false, false, true);
				1907	}
				1908
				1909	static int cfq_set_group_idle(struct cgroup_subsys_state *css,
				1910	struct cftype *cft, u64 val)
				1911	{
				1912	struct blkcg *blkcg = css_to_blkcg(css);
				1913	struct cfq_group_data *cfqgd;
				1914	struct blkcg_gq *blkg;
				1915	int ret = 0;
				1916
				1917	spin_lock_irq(&blkcg->lock);
				1918	cfqgd = blkcg_to_cfqgd(blkcg);
				1919	if (!cfqgd) {
				1920	ret = -EINVAL;
				1921	goto out;
				1922	}
				1923
				1924	cfqgd->group_idle = val * NSEC_PER_USEC;
				1925
				1926	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				1927	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
				1928
				1929	if (!cfqg)
				1930	continue;
				1931
				1932	cfqg->group_idle = cfqgd->group_idle;
				1933	}
				1934
				1935	out:
				1936	spin_unlock_irq(&blkcg->lock);
				1937	return ret;
				1938	}
				1939
				1940	static int cfqg_print_stat(struct seq_file sf, void v)
				1941	{
				1942	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
				1943	&blkcg_policy_cfq, seq_cft(sf)->private, false);
				1944	return 0;
				1945	}
				1946
				1947	static int cfqg_print_rwstat(struct seq_file sf, void v)
				1948	{
				1949	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
				1950	&blkcg_policy_cfq, seq_cft(sf)->private, true);
				1951	return 0;
				1952	}
				1953
				1954	static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
				1955	struct blkg_policy_data *pd, int off)
				1956	{
				1957	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
				1958	&blkcg_policy_cfq, off);
				1959	return __blkg_prfill_u64(sf, pd, sum);
				1960	}
				1961
				1962	static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
				1963	struct blkg_policy_data *pd, int off)
				1964	{
				1965	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
				1966	&blkcg_policy_cfq, off);
				1967	return __blkg_prfill_rwstat(sf, pd, &sum);
				1968	}
				1969
				1970	static int cfqg_print_stat_recursive(struct seq_file sf, void v)
				1971	{
				1972	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				1973	cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
				1974	seq_cft(sf)->private, false);
				1975	return 0;
				1976	}
				1977
				1978	static int cfqg_print_rwstat_recursive(struct seq_file sf, void v)
				1979	{
				1980	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				1981	cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
				1982	seq_cft(sf)->private, true);
				1983	return 0;
				1984	}
				1985
				1986	static u64 cfqg_prfill_sectors(struct seq_file sf, struct blkg_policy_data pd,
				1987	int off)
				1988	{
				1989	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
				1990
				1991	return __blkg_prfill_u64(sf, pd, sum >> 9);
				1992	}
				1993
				1994	static int cfqg_print_stat_sectors(struct seq_file sf, void v)
				1995	{
				1996	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				1997	cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
				1998	return 0;
				1999	}
				2000
				2001	static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
				2002	struct blkg_policy_data *pd, int off)
				2003	{
				2004	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
				2005	offsetof(struct blkcg_gq, stat_bytes));
				2006	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
				2007	atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
				2008
				2009	return __blkg_prfill_u64(sf, pd, sum >> 9);
				2010	}
				2011
				2012	static int cfqg_print_stat_sectors_recursive(struct seq_file sf, void v)
				2013	{
				2014	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				2015	cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
				2016	false);
				2017	return 0;
				2018	}
				2019
				2020	#ifdef CONFIG_DEBUG_BLK_CGROUP
				2021	static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
				2022	struct blkg_policy_data *pd, int off)
				2023	{
				2024	struct cfq_group *cfqg = pd_to_cfqg(pd);
				2025	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
				2026	u64 v = 0;
				2027
				2028	if (samples) {
				2029	v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
				2030	v = div64_u64(v, samples);
				2031	}
				2032	__blkg_prfill_u64(sf, pd, v);
				2033	return 0;
				2034	}
				2035
				2036	/* print avg_queue_size */
				2037	static int cfqg_print_avg_queue_size(struct seq_file sf, void v)
				2038	{
				2039	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				2040	cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
				2041	0, false);
				2042	return 0;
				2043	}
				2044	#endif /* CONFIG_DEBUG_BLK_CGROUP */
				2045
				2046	static struct cftype cfq_blkcg_legacy_files[] = {
				2047	/* on root, weight is mapped to leaf_weight */
				2048	{
				2049	.name = "weight_device",
				2050	.flags = CFTYPE_ONLY_ON_ROOT,
				2051	.seq_show = cfqg_print_leaf_weight_device,
				2052	.write = cfqg_set_leaf_weight_device,
				2053	},
				2054	{
				2055	.name = "weight",
				2056	.flags = CFTYPE_ONLY_ON_ROOT,
				2057	.seq_show = cfq_print_leaf_weight,
				2058	.write_u64 = cfq_set_leaf_weight,
				2059	},
				2060
				2061	/* no such mapping necessary for !roots */
				2062	{
				2063	.name = "weight_device",
				2064	.flags = CFTYPE_NOT_ON_ROOT,
				2065	.seq_show = cfqg_print_weight_device,
				2066	.write = cfqg_set_weight_device,
				2067	},
				2068	{
				2069	.name = "weight",
				2070	.flags = CFTYPE_NOT_ON_ROOT,
				2071	.seq_show = cfq_print_weight,
				2072	.write_u64 = cfq_set_weight,
				2073	},
				2074
				2075	{
				2076	.name = "leaf_weight_device",
				2077	.seq_show = cfqg_print_leaf_weight_device,
				2078	.write = cfqg_set_leaf_weight_device,
				2079	},
				2080	{
				2081	.name = "leaf_weight",
				2082	.seq_show = cfq_print_leaf_weight,
				2083	.write_u64 = cfq_set_leaf_weight,
				2084	},
				2085	{
				2086	.name = "group_idle",
				2087	.seq_show = cfq_print_group_idle,
				2088	.write_u64 = cfq_set_group_idle,
				2089	},
				2090
				2091	/* statistics, covers only the tasks in the cfqg */
				2092	{
				2093	.name = "time",
				2094	.private = offsetof(struct cfq_group, stats.time),
				2095	.seq_show = cfqg_print_stat,
				2096	},
				2097	{
				2098	.name = "sectors",
				2099	.seq_show = cfqg_print_stat_sectors,
				2100	},
				2101	{
				2102	.name = "io_service_bytes",
				2103	.private = (unsigned long)&blkcg_policy_cfq,
				2104	.seq_show = blkg_print_stat_bytes,
				2105	},
				2106	{
				2107	.name = "io_serviced",
				2108	.private = (unsigned long)&blkcg_policy_cfq,
				2109	.seq_show = blkg_print_stat_ios,
				2110	},
				2111	{
				2112	.name = "io_service_time",
				2113	.private = offsetof(struct cfq_group, stats.service_time),
				2114	.seq_show = cfqg_print_rwstat,
				2115	},
				2116	{
				2117	.name = "io_wait_time",
				2118	.private = offsetof(struct cfq_group, stats.wait_time),
				2119	.seq_show = cfqg_print_rwstat,
				2120	},
				2121	{
				2122	.name = "io_merged",
				2123	.private = offsetof(struct cfq_group, stats.merged),
				2124	.seq_show = cfqg_print_rwstat,
				2125	},
				2126	{
				2127	.name = "io_queued",
				2128	.private = offsetof(struct cfq_group, stats.queued),
				2129	.seq_show = cfqg_print_rwstat,
				2130	},
				2131
				2132	/* the same statictics which cover the cfqg and its descendants */
				2133	{
				2134	.name = "time_recursive",
				2135	.private = offsetof(struct cfq_group, stats.time),
				2136	.seq_show = cfqg_print_stat_recursive,
				2137	},
				2138	{
				2139	.name = "sectors_recursive",
				2140	.seq_show = cfqg_print_stat_sectors_recursive,
				2141	},
				2142	{
				2143	.name = "io_service_bytes_recursive",
				2144	.private = (unsigned long)&blkcg_policy_cfq,
				2145	.seq_show = blkg_print_stat_bytes_recursive,
				2146	},
				2147	{
				2148	.name = "io_serviced_recursive",
				2149	.private = (unsigned long)&blkcg_policy_cfq,
				2150	.seq_show = blkg_print_stat_ios_recursive,
				2151	},
				2152	{
				2153	.name = "io_service_time_recursive",
				2154	.private = offsetof(struct cfq_group, stats.service_time),
				2155	.seq_show = cfqg_print_rwstat_recursive,
				2156	},
				2157	{
				2158	.name = "io_wait_time_recursive",
				2159	.private = offsetof(struct cfq_group, stats.wait_time),
				2160	.seq_show = cfqg_print_rwstat_recursive,
				2161	},
				2162	{
				2163	.name = "io_merged_recursive",
				2164	.private = offsetof(struct cfq_group, stats.merged),
				2165	.seq_show = cfqg_print_rwstat_recursive,
				2166	},
				2167	{
				2168	.name = "io_queued_recursive",
				2169	.private = offsetof(struct cfq_group, stats.queued),
				2170	.seq_show = cfqg_print_rwstat_recursive,
				2171	},
				2172	#ifdef CONFIG_DEBUG_BLK_CGROUP
				2173	{
				2174	.name = "avg_queue_size",
				2175	.seq_show = cfqg_print_avg_queue_size,
				2176	},
				2177	{
				2178	.name = "group_wait_time",
				2179	.private = offsetof(struct cfq_group, stats.group_wait_time),
				2180	.seq_show = cfqg_print_stat,
				2181	},
				2182	{
				2183	.name = "idle_time",
				2184	.private = offsetof(struct cfq_group, stats.idle_time),
				2185	.seq_show = cfqg_print_stat,
				2186	},
				2187	{
				2188	.name = "empty_time",
				2189	.private = offsetof(struct cfq_group, stats.empty_time),
				2190	.seq_show = cfqg_print_stat,
				2191	},
				2192	{
				2193	.name = "dequeue",
				2194	.private = offsetof(struct cfq_group, stats.dequeue),
				2195	.seq_show = cfqg_print_stat,
				2196	},
				2197	{
				2198	.name = "unaccounted_time",
				2199	.private = offsetof(struct cfq_group, stats.unaccounted_time),
				2200	.seq_show = cfqg_print_stat,
				2201	},
				2202	#endif /* CONFIG_DEBUG_BLK_CGROUP */
				2203	{ } /* terminate */
				2204	};
				2205
				2206	static int cfq_print_weight_on_dfl(struct seq_file sf, void v)
				2207	{
				2208	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2209	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
				2210
				2211	seq_printf(sf, "default %u\n", cgd->weight);
				2212	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
				2213	&blkcg_policy_cfq, 0, false);
				2214	return 0;
				2215	}
				2216
				2217	static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
				2218	char *buf, size_t nbytes, loff_t off)
				2219	{
				2220	char *endp;
				2221	int ret;
				2222	u64 v;
				2223
				2224	buf = strim(buf);
				2225
				2226	/* "WEIGHT" or "default WEIGHT" sets the default weight */
				2227	v = simple_strtoull(buf, &endp, 0);
				2228	if (*endp == '\0' \|\| sscanf(buf, "default %llu", &v) == 1) {
				2229	ret = __cfq_set_weight(of_css(of), v, true, false, false);
				2230	return ret ?: nbytes;
				2231	}
				2232
				2233	/* "MAJ:MIN WEIGHT" */
				2234	return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
				2235	}
				2236
				2237	static struct cftype cfq_blkcg_files[] = {
				2238	{
				2239	.name = "weight",
				2240	.flags = CFTYPE_NOT_ON_ROOT,
				2241	.seq_show = cfq_print_weight_on_dfl,
				2242	.write = cfq_set_weight_on_dfl,
				2243	},
				2244	{ } /* terminate */
				2245	};
				2246
				2247	#else /* GROUP_IOSCHED */
				2248	static struct cfq_group cfq_lookup_cfqg(struct cfq_data cfqd,
				2249	struct blkcg *blkcg)
				2250	{
				2251	return cfqd->root_group;
				2252	}
				2253
				2254	static inline void
				2255	cfq_link_cfqq_cfqg(struct cfq_queue cfqq, struct cfq_group cfqg) {
				2256	cfqq->cfqg = cfqg;
				2257	}
				2258
				2259	#endif /* GROUP_IOSCHED */
				2260
				2261	/*
				2262	* The cfqd->service_trees holds all pending cfq_queue's that have
				2263	* requests waiting to be processed. It is sorted in the order that
				2264	* we will service the queues.
				2265	*/
				2266	static void cfq_service_tree_add(struct cfq_data cfqd, struct cfq_queue cfqq,
				2267	bool add_front)
				2268	{
				2269	struct rb_node *p, parent;
				2270	struct cfq_queue *__cfqq;
				2271	u64 rb_key;
				2272	struct cfq_rb_root *st;
				2273	bool leftmost = true;
				2274	int new_cfqq = 1;
				2275	u64 now = ktime_get_ns();
				2276
				2277	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
				2278	if (cfq_class_idle(cfqq)) {
				2279	rb_key = CFQ_IDLE_DELAY;
				2280	parent = st->rb_rightmost;
				2281	if (parent && parent != &cfqq->rb_node) {
				2282	__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
				2283	rb_key += __cfqq->rb_key;
				2284	} else
				2285	rb_key += now;
				2286	} else if (!add_front) {
				2287	/*
				2288	* Get our rb key offset. Subtract any residual slice
				2289	* value carried from last service. A negative resid
				2290	* count indicates slice overrun, and this should position
				2291	* the next service time further away in the tree.
				2292	*/
				2293	rb_key = cfq_slice_offset(cfqd, cfqq) + now;
				2294	rb_key -= cfqq->slice_resid;
				2295	cfqq->slice_resid = 0;
				2296	} else {
				2297	rb_key = -NSEC_PER_SEC;
				2298	__cfqq = cfq_rb_first(st);
				2299	rb_key += __cfqq ? __cfqq->rb_key : now;
				2300	}
				2301
				2302	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
				2303	new_cfqq = 0;
				2304	/*
				2305	* same position, nothing more to do
				2306	*/
				2307	if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
				2308	return;
				2309
				2310	cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
				2311	cfqq->service_tree = NULL;
				2312	}
				2313
				2314	parent = NULL;
				2315	cfqq->service_tree = st;
				2316	p = &st->rb.rb_root.rb_node;
				2317	while (*p) {
				2318	parent = *p;
				2319	__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
				2320
				2321	/*
				2322	* sort by key, that represents service time.
				2323	*/
				2324	if (rb_key < __cfqq->rb_key)
				2325	p = &parent->rb_left;
				2326	else {
				2327	p = &parent->rb_right;
				2328	leftmost = false;
				2329	}
				2330	}
				2331
				2332	cfqq->rb_key = rb_key;
				2333	rb_link_node(&cfqq->rb_node, parent, p);
				2334	rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
				2335	st->count++;
				2336	if (add_front \|\| !new_cfqq)
				2337	return;
				2338	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
				2339	}
				2340
				2341	static struct cfq_queue *
				2342	cfq_prio_tree_lookup(struct cfq_data cfqd, struct rb_root root,
				2343	sector_t sector, struct rb_node **ret_parent,
				2344	struct rb_node ***rb_link)
				2345	{
				2346	struct rb_node *p, parent;
				2347	struct cfq_queue *cfqq = NULL;
				2348
				2349	parent = NULL;
				2350	p = &root->rb_node;
				2351	while (*p) {
				2352	struct rb_node **n;
				2353
				2354	parent = *p;
				2355	cfqq = rb_entry(parent, struct cfq_queue, p_node);
				2356
				2357	/*
				2358	* Sort strictly based on sector. Smallest to the left,
				2359	* largest to the right.
				2360	*/
				2361	if (sector > blk_rq_pos(cfqq->next_rq))
				2362	n = &(*p)->rb_right;
				2363	else if (sector < blk_rq_pos(cfqq->next_rq))
				2364	n = &(*p)->rb_left;
				2365	else
				2366	break;
				2367	p = n;
				2368	cfqq = NULL;
				2369	}
				2370
				2371	*ret_parent = parent;
				2372	if (rb_link)
				2373	*rb_link = p;
				2374	return cfqq;
				2375	}
				2376
				2377	static void cfq_prio_tree_add(struct cfq_data cfqd, struct cfq_queue cfqq)
				2378	{
				2379	struct rb_node *p, parent;
				2380	struct cfq_queue *__cfqq;
				2381
				2382	if (cfqq->p_root) {
				2383	rb_erase(&cfqq->p_node, cfqq->p_root);
				2384	cfqq->p_root = NULL;
				2385	}
				2386
				2387	if (cfq_class_idle(cfqq))
				2388	return;
				2389	if (!cfqq->next_rq)
				2390	return;
				2391
				2392	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
				2393	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				2394	blk_rq_pos(cfqq->next_rq), &parent, &p);
				2395	if (!__cfqq) {
				2396	rb_link_node(&cfqq->p_node, parent, p);
				2397	rb_insert_color(&cfqq->p_node, cfqq->p_root);
				2398	} else
				2399	cfqq->p_root = NULL;
				2400	}
				2401
				/*
				 * Update cfqq's position in the service tree (and in the priority tree).
				 * Does nothing unless the queue is marked as being on the RR list.
				 */
				static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
				{
					/*
					 * Resorting requires the cfqq to be on the RR list already.
					 */
					if (cfq_cfqq_on_rr(cfqq)) {
						cfq_service_tree_add(cfqd, cfqq, 0);
						cfq_prio_tree_add(cfqd, cfqq);
					}
				}
				2415
				2416	/*
				2417	* add to busy list of queues for service, trying to be fair in ordering
				2418	* the pending list according to last request service
				2419	*/
				2420	static void cfq_add_cfqq_rr(struct cfq_data cfqd, struct cfq_queue cfqq)
				2421	{
				2422	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
				2423	BUG_ON(cfq_cfqq_on_rr(cfqq));
				2424	cfq_mark_cfqq_on_rr(cfqq);
				2425	cfqd->busy_queues++;
				2426	if (cfq_cfqq_sync(cfqq))
				2427	cfqd->busy_sync_queues++;
				2428
				2429	cfq_resort_rr_list(cfqd, cfqq);
				2430	}
				2431
				2432	/*
				2433	* Called when the cfqq no longer has requests pending, remove it from
				2434	* the service tree.
				2435	*/
				2436	static void cfq_del_cfqq_rr(struct cfq_data cfqd, struct cfq_queue cfqq)
				2437	{
				2438	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
				2439	BUG_ON(!cfq_cfqq_on_rr(cfqq));
				2440	cfq_clear_cfqq_on_rr(cfqq);
				2441
				2442	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
				2443	cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
				2444	cfqq->service_tree = NULL;
				2445	}
				2446	if (cfqq->p_root) {
				2447	rb_erase(&cfqq->p_node, cfqq->p_root);
				2448	cfqq->p_root = NULL;
				2449	}
				2450
				2451	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
				2452	BUG_ON(!cfqd->busy_queues);
				2453	cfqd->busy_queues--;
				2454	if (cfq_cfqq_sync(cfqq))
				2455	cfqd->busy_sync_queues--;
				2456	}
				2457
				2458	/*
				2459	* rb tree support functions
				2460	*/
				2461	static void cfq_del_rq_rb(struct request *rq)
				2462	{
				2463	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				2464	const int sync = rq_is_sync(rq);
				2465
				2466	BUG_ON(!cfqq->queued[sync]);
				2467	cfqq->queued[sync]--;
				2468
				2469	elv_rb_del(&cfqq->sort_list, rq);
				2470
				2471	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
				2472	/*
				2473	* Queue will be deleted from service tree when we actually
				2474	* expire it later. Right now just remove it from prio tree
				2475	* as it is empty.
				2476	*/
				2477	if (cfqq->p_root) {
				2478	rb_erase(&cfqq->p_node, cfqq->p_root);
				2479	cfqq->p_root = NULL;
				2480	}
				2481	}
				2482	}
				2483
				2484	static void cfq_add_rq_rb(struct request *rq)
				2485	{
				2486	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				2487	struct cfq_data *cfqd = cfqq->cfqd;
				2488	struct request *prev;
				2489
				2490	cfqq->queued[rq_is_sync(rq)]++;
				2491
				2492	elv_rb_add(&cfqq->sort_list, rq);
				2493
				2494	if (!cfq_cfqq_on_rr(cfqq))
				2495	cfq_add_cfqq_rr(cfqd, cfqq);
				2496
				2497	/*
				2498	* check if this request is a better next-serve candidate
				2499	*/
				2500	prev = cfqq->next_rq;
				2501	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
				2502
				2503	/*
				2504	* adjust priority tree position, if ->next_rq changes
				2505	*/
				2506	if (prev != cfqq->next_rq)
				2507	cfq_prio_tree_add(cfqd, cfqq);
				2508
				2509	BUG_ON(!cfqq->next_rq);
				2510	}
				2511
				2512	static void cfq_reposition_rq_rb(struct cfq_queue cfqq, struct request rq)
				2513	{
				2514	elv_rb_del(&cfqq->sort_list, rq);
				2515	cfqq->queued[rq_is_sync(rq)]--;
				2516	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
				2517	cfq_add_rq_rb(rq);
				2518	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
				2519	rq->cmd_flags);
				2520	}
				2521
				2522	static struct request *
				2523	cfq_find_rq_fmerge(struct cfq_data cfqd, struct bio bio)
				2524	{
				2525	struct task_struct *tsk = current;
				2526	struct cfq_io_cq *cic;
				2527	struct cfq_queue *cfqq;
				2528
				2529	cic = cfq_cic_lookup(cfqd, tsk->io_context);
				2530	if (!cic)
				2531	return NULL;
				2532
				2533	cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
				2534	if (cfqq)
				2535	return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
				2536
				2537	return NULL;
				2538	}
				2539
				2540	static void cfq_activate_request(struct request_queue q, struct request rq)
				2541	{
				2542	struct cfq_data *cfqd = q->elevator->elevator_data;
				2543
				2544	cfqd->rq_in_driver++;
				2545	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
				2546	cfqd->rq_in_driver);
				2547
				2548	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
				2549	}
				2550
				2551	static void cfq_deactivate_request(struct request_queue q, struct request rq)
				2552	{
				2553	struct cfq_data *cfqd = q->elevator->elevator_data;
				2554
				2555	WARN_ON(!cfqd->rq_in_driver);
				2556	cfqd->rq_in_driver--;
				2557	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
				2558	cfqd->rq_in_driver);
				2559	}
				2560
				2561	static void cfq_remove_request(struct request *rq)
				2562	{
				2563	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				2564
				2565	if (cfqq->next_rq == rq)
				2566	cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
				2567
				2568	list_del_init(&rq->queuelist);
				2569	cfq_del_rq_rb(rq);
				2570
				2571	cfqq->cfqd->rq_queued--;
				2572	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
				2573	if (rq->cmd_flags & REQ_PRIO) {
				2574	WARN_ON(!cfqq->prio_pending);
				2575	cfqq->prio_pending--;
				2576	}
				2577	}
				2578
				2579	static enum elv_merge cfq_merge(struct request_queue q, struct request *req,
				2580	struct bio *bio)
				2581	{
				2582	struct cfq_data *cfqd = q->elevator->elevator_data;
				2583	struct request *__rq;
				2584
				2585	__rq = cfq_find_rq_fmerge(cfqd, bio);
				2586	if (__rq && elv_bio_merge_ok(__rq, bio)) {
				2587	*req = __rq;
				2588	return ELEVATOR_FRONT_MERGE;
				2589	}
				2590
				2591	return ELEVATOR_NO_MERGE;
				2592	}
				2593
				2594	static void cfq_merged_request(struct request_queue q, struct request req,
				2595	enum elv_merge type)
				2596	{
				2597	if (type == ELEVATOR_FRONT_MERGE) {
				2598	struct cfq_queue *cfqq = RQ_CFQQ(req);
				2599
				2600	cfq_reposition_rq_rb(cfqq, req);
				2601	}
				2602	}
				2603
				2604	static void cfq_bio_merged(struct request_queue q, struct request req,
				2605	struct bio *bio)
				2606	{
				2607	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
				2608	}
				2609
				2610	static void
				2611	cfq_merged_requests(struct request_queue q, struct request rq,
				2612	struct request *next)
				2613	{
				2614	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				2615	struct cfq_data *cfqd = q->elevator->elevator_data;
				2616
				2617	/*
				2618	* reposition in fifo if next is older than rq
				2619	*/
				2620	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
				2621	next->fifo_time < rq->fifo_time &&
				2622	cfqq == RQ_CFQQ(next)) {
				2623	list_move(&rq->queuelist, &next->queuelist);
				2624	rq->fifo_time = next->fifo_time;
				2625	}
				2626
				2627	if (cfqq->next_rq == next)
				2628	cfqq->next_rq = rq;
				2629	cfq_remove_request(next);
				2630	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
				2631
				2632	cfqq = RQ_CFQQ(next);
				2633	/*
				2634	* all requests of this queue are merged to other queues, delete it
				2635	* from the service tree. If it's the active_queue,
				2636	* cfq_dispatch_requests() will choose to expire it or do idle
				2637	*/
				2638	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
				2639	cfqq != cfqd->active_queue)
				2640	cfq_del_cfqq_rr(cfqd, cfqq);
				2641	}
				2642
				2643	static int cfq_allow_bio_merge(struct request_queue q, struct request rq,
				2644	struct bio *bio)
				2645	{
				2646	struct cfq_data *cfqd = q->elevator->elevator_data;
				2647	bool is_sync = op_is_sync(bio->bi_opf);
				2648	struct cfq_io_cq *cic;
				2649	struct cfq_queue *cfqq;
				2650
				2651	/*
				2652	* Disallow merge of a sync bio into an async request.
				2653	*/
				2654	if (is_sync && !rq_is_sync(rq))
				2655	return false;
				2656
				2657	/*
				2658	* Lookup the cfqq that this bio will be queued with and allow
				2659	* merge only if rq is queued there.
				2660	*/
				2661	cic = cfq_cic_lookup(cfqd, current->io_context);
				2662	if (!cic)
				2663	return false;
				2664
				2665	cfqq = cic_to_cfqq(cic, is_sync);
				2666	return cfqq == RQ_CFQQ(rq);
				2667	}
				2668
				/*
				 * Two requests may be merged only when they belong to the same cfq
				 * queue.  @q is unused.
				 */
				static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
						struct request *next)
				{
					return RQ_CFQQ(rq) == RQ_CFQQ(next);
				}
				2674
				2675	static inline void cfq_del_timer(struct cfq_data cfqd, struct cfq_queue cfqq)
				2676	{
				2677	hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
				2678	cfqg_stats_update_idle_time(cfqq->cfqg);
				2679	}
				2680
				2681	static void __cfq_set_active_queue(struct cfq_data *cfqd,
				2682	struct cfq_queue *cfqq)
				2683	{
				2684	if (cfqq) {
				2685	cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
				2686	cfqd->serving_wl_class, cfqd->serving_wl_type);
				2687	cfqg_stats_update_avg_queue_size(cfqq->cfqg);
				2688	cfqq->slice_start = 0;
				2689	cfqq->dispatch_start = ktime_get_ns();
				2690	cfqq->allocated_slice = 0;
				2691	cfqq->slice_end = 0;
				2692	cfqq->slice_dispatch = 0;
				2693	cfqq->nr_sectors = 0;
				2694
				2695	cfq_clear_cfqq_wait_request(cfqq);
				2696	cfq_clear_cfqq_must_dispatch(cfqq);
				2697	cfq_clear_cfqq_must_alloc_slice(cfqq);
				2698	cfq_clear_cfqq_fifo_expire(cfqq);
				2699	cfq_mark_cfqq_slice_new(cfqq);
				2700
				2701	cfq_del_timer(cfqd, cfqq);
				2702	}
				2703
				2704	cfqd->active_queue = cfqq;
				2705	}
				2706
				2707	/*
				2708	* current cfqq expired its slice (or was too idle), select new one
				2709	*/
				2710	static void
				2711	__cfq_slice_expired(struct cfq_data cfqd, struct cfq_queue cfqq,
				2712	bool timed_out)
				2713	{
				2714	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
				2715
				2716	if (cfq_cfqq_wait_request(cfqq))
				2717	cfq_del_timer(cfqd, cfqq);
				2718
				2719	cfq_clear_cfqq_wait_request(cfqq);
				2720	cfq_clear_cfqq_wait_busy(cfqq);
				2721
				2722	/*
				2723	* If this cfqq is shared between multiple processes, check to
				2724	* make sure that those processes are still issuing I/Os within
				2725	* the mean seek distance. If not, it may be time to break the
				2726	* queues apart again.
				2727	*/
				2728	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
				2729	cfq_mark_cfqq_split_coop(cfqq);
				2730
				2731	/*
				2732	* store what was left of this slice, if the queue idled/timed out
				2733	*/
				2734	if (timed_out) {
				2735	if (cfq_cfqq_slice_new(cfqq))
				2736	cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
				2737	else
				2738	cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
				2739	cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
				2740	}
				2741
				2742	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
				2743
				2744	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
				2745	cfq_del_cfqq_rr(cfqd, cfqq);
				2746
				2747	cfq_resort_rr_list(cfqd, cfqq);
				2748
				2749	if (cfqq == cfqd->active_queue)
				2750	cfqd->active_queue = NULL;
				2751
				2752	if (cfqd->active_cic) {
				2753	put_io_context(cfqd->active_cic->icq.ioc);
				2754	cfqd->active_cic = NULL;
				2755	}
				2756	}
				2757
				2758	static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
				2759	{
				2760	struct cfq_queue *cfqq = cfqd->active_queue;
				2761
				2762	if (cfqq)
				2763	__cfq_slice_expired(cfqd, cfqq, timed_out);
				2764	}
				2765
				2766	/*
				2767	* Get next queue for service. Unless we have a queue preemption,
				2768	* we'll simply select the first cfqq in the service tree.
				2769	*/
				2770	static struct cfq_queue cfq_get_next_queue(struct cfq_data cfqd)
				2771	{
				2772	struct cfq_rb_root *st = st_for(cfqd->serving_group,
				2773	cfqd->serving_wl_class, cfqd->serving_wl_type);
				2774
				2775	if (!cfqd->rq_queued)
				2776	return NULL;
				2777
				2778	/* There is nothing to dispatch */
				2779	if (!st)
				2780	return NULL;
				2781	if (RB_EMPTY_ROOT(&st->rb.rb_root))
				2782	return NULL;
				2783	return cfq_rb_first(st);
				2784	}
				2785
				2786	static struct cfq_queue cfq_get_next_queue_forced(struct cfq_data cfqd)
				2787	{
				2788	struct cfq_group *cfqg;
				2789	struct cfq_queue *cfqq;
				2790	int i, j;
				2791	struct cfq_rb_root *st;
				2792
				2793	if (!cfqd->rq_queued)
				2794	return NULL;
				2795
				2796	cfqg = cfq_get_next_cfqg(cfqd);
				2797	if (!cfqg)
				2798	return NULL;
				2799
				2800	for_each_cfqg_st(cfqg, i, j, st) {
				2801	cfqq = cfq_rb_first(st);
				2802	if (cfqq)
				2803	return cfqq;
				2804	}
				2805	return NULL;
				2806	}
				2807
				/*
				 * Get and set a new active queue for service.
				 *
				 * @cfqq: queue to activate, or NULL to let cfq_get_next_queue() choose.
				 *
				 * Returns the queue that was made active; this may be NULL, in which
				 * case cfqd->active_queue has been cleared.
				 */
				static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
						struct cfq_queue *cfqq)
				{
					if (!cfqq)
						cfqq = cfq_get_next_queue(cfqd);

					__cfq_set_active_queue(cfqd, cfqq);
					return cfqq;
				}
				2820
				2821	static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
				2822	struct request *rq)
				2823	{
				2824	if (blk_rq_pos(rq) >= cfqd->last_position)
				2825	return blk_rq_pos(rq) - cfqd->last_position;
				2826	else
				2827	return cfqd->last_position - blk_rq_pos(rq);
				2828	}
				2829
				2830	static inline int cfq_rq_close(struct cfq_data cfqd, struct cfq_queue cfqq,
				2831	struct request *rq)
				2832	{
				2833	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
				2834	}
				2835
				2836	static struct cfq_queue cfqq_close(struct cfq_data cfqd,
				2837	struct cfq_queue *cur_cfqq)
				2838	{
				2839	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
				2840	struct rb_node parent, node;
				2841	struct cfq_queue *__cfqq;
				2842	sector_t sector = cfqd->last_position;
				2843
				2844	if (RB_EMPTY_ROOT(root))
				2845	return NULL;
				2846
				2847	/*
				2848	* First, if we find a request starting at the end of the last
				2849	* request, choose it.
				2850	*/
				2851	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
				2852	if (__cfqq)
				2853	return __cfqq;
				2854
				2855	/*
				2856	* If the exact sector wasn't found, the parent of the NULL leaf
				2857	* will contain the closest sector.
				2858	*/
				2859	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
				2860	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
				2861	return __cfqq;
				2862
				2863	if (blk_rq_pos(__cfqq->next_rq) < sector)
				2864	node = rb_next(&__cfqq->p_node);
				2865	else
				2866	node = rb_prev(&__cfqq->p_node);
				2867	if (!node)
				2868	return NULL;
				2869
				2870	__cfqq = rb_entry(node, struct cfq_queue, p_node);
				2871	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
				2872	return __cfqq;
				2873
				2874	return NULL;
				2875	}
				2876
				2877	/*
				2878	* cfqd - obvious
				2879	* cur_cfqq - passed in so that we don't decide that the current queue is
				2880	* closely cooperating with itself.
				2881	*
				2882	* So, basically we're assuming that that cur_cfqq has dispatched at least
				2883	* one request, and that cfqd->last_position reflects a position on the disk
				2884	* associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
				2885	* assumption.
				2886	*/
				2887	static struct cfq_queue cfq_close_cooperator(struct cfq_data cfqd,
				2888	struct cfq_queue *cur_cfqq)
				2889	{
				2890	struct cfq_queue *cfqq;
				2891
				2892	if (cfq_class_idle(cur_cfqq))
				2893	return NULL;
				2894	if (!cfq_cfqq_sync(cur_cfqq))
				2895	return NULL;
				2896	if (CFQQ_SEEKY(cur_cfqq))
				2897	return NULL;
				2898
				2899	/*
				2900	* Don't search priority tree if it's the only queue in the group.
				2901	*/
				2902	if (cur_cfqq->cfqg->nr_cfqq == 1)
				2903	return NULL;
				2904
				2905	/*
				2906	* We should notice if some of the queues are cooperating, eg
				2907	* working closely on the same area of the disk. In that case,
				2908	* we can group them together and don't waste time idling.
				2909	*/
				2910	cfqq = cfqq_close(cfqd, cur_cfqq);
				2911	if (!cfqq)
				2912	return NULL;
				2913
				2914	/* If new queue belongs to different cfq_group, don't choose it */
				2915	if (cur_cfqq->cfqg != cfqq->cfqg)
				2916	return NULL;
				2917
				2918	/*
				2919	* It only makes sense to merge sync queues.
				2920	*/
				2921	if (!cfq_cfqq_sync(cfqq))
				2922	return NULL;
				2923	if (CFQQ_SEEKY(cfqq))
				2924	return NULL;
				2925
				2926	/*
				2927	* Do not merge queues of different priority classes
				2928	*/
				2929	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
				2930	return NULL;
				2931
				2932	return cfqq;
				2933	}
				2934
				2935	/*
				2936	* Determine whether we should enforce idle window for this queue.
				2937	*/
				2938
				2939	static bool cfq_should_idle(struct cfq_data cfqd, struct cfq_queue cfqq)
				2940	{
				2941	enum wl_class_t wl_class = cfqq_class(cfqq);
				2942	struct cfq_rb_root *st = cfqq->service_tree;
				2943
				2944	BUG_ON(!st);
				2945	BUG_ON(!st->count);
				2946
				2947	if (!cfqd->cfq_slice_idle)
				2948	return false;
				2949
				2950	/* We never do for idle class queues. */
				2951	if (wl_class == IDLE_WORKLOAD)
				2952	return false;
				2953
				2954	/* We do for queues that were marked with idle window flag. */
				2955	if (cfq_cfqq_idle_window(cfqq) &&
				2956	!(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
				2957	return true;
				2958
				2959	/*
				2960	* Otherwise, we do only if they are the last ones
				2961	* in their service tree.
				2962	*/
				2963	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
				2964	!cfq_io_thinktime_big(cfqd, &st->ttime, false))
				2965	return true;
				2966	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
				2967	return false;
				2968	}
				2969
				2970	static void cfq_arm_slice_timer(struct cfq_data *cfqd)
				2971	{
				2972	struct cfq_queue *cfqq = cfqd->active_queue;
				2973	struct cfq_rb_root *st = cfqq->service_tree;
				2974	struct cfq_io_cq *cic;
				2975	u64 sl, group_idle = 0;
				2976	u64 now = ktime_get_ns();
				2977
				2978	/*
				2979	* SSD device without seek penalty, disable idling. But only do so
				2980	* for devices that support queuing, otherwise we still have a problem
				2981	* with sync vs async workloads.
				2982	*/
				2983	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
				2984	!get_group_idle(cfqd))
				2985	return;
				2986
				2987	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
				2988	WARN_ON(cfq_cfqq_slice_new(cfqq));
				2989
				2990	/*
				2991	* idle is disabled, either manually or by past process history
				2992	*/
				2993	if (!cfq_should_idle(cfqd, cfqq)) {
				2994	/* no queue idling. Check for group idling */
				2995	group_idle = get_group_idle(cfqd);
				2996	if (!group_idle)
				2997	return;
				2998	}
				2999
				3000	/*
				3001	* still active requests from this queue, don't idle
				3002	*/
				3003	if (cfqq->dispatched)
				3004	return;
				3005
				3006	/*
				3007	* task has exited, don't wait
				3008	*/
				3009	cic = cfqd->active_cic;
				3010	if (!cic \|\| !atomic_read(&cic->icq.ioc->active_ref))
				3011	return;
				3012
				3013	/*
				3014	* If our average think time is larger than the remaining time
				3015	* slice, then don't idle. This avoids overrunning the allotted
				3016	* time slice.
				3017	*/
				3018	if (sample_valid(cic->ttime.ttime_samples) &&
				3019	(cfqq->slice_end - now < cic->ttime.ttime_mean)) {
				3020	cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
				3021	cic->ttime.ttime_mean);
				3022	return;
				3023	}
				3024
				3025	/*
				3026	* There are other queues in the group or this is the only group and
				3027	* it has too big thinktime, don't do group idle.
				3028	*/
				3029	if (group_idle &&
				3030	(cfqq->cfqg->nr_cfqq > 1 \|\|
				3031	cfq_io_thinktime_big(cfqd, &st->ttime, true)))
				3032	return;
				3033
				3034	cfq_mark_cfqq_wait_request(cfqq);
				3035
				3036	if (group_idle)
				3037	sl = group_idle;
				3038	else
				3039	sl = cfqd->cfq_slice_idle;
				3040
				3041	hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
				3042	HRTIMER_MODE_REL);
				3043	cfqg_stats_set_start_idle_time(cfqq->cfqg);
				3044	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
				3045	group_idle ? 1 : 0);
				3046	}
				3047
				3048	/*
				3049	* Move request from internal lists to the request queue dispatch list.
				3050	*/
				3051	static void cfq_dispatch_insert(struct request_queue q, struct request rq)
				3052	{
				3053	struct cfq_data *cfqd = q->elevator->elevator_data;
				3054	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				3055
				3056	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
				3057
				3058	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
				3059	cfq_remove_request(rq);
				3060	cfqq->dispatched++;
				3061	(RQ_CFQG(rq))->dispatched++;
				3062	elv_dispatch_sort(q, rq);
				3063
				3064	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
				3065	cfqq->nr_sectors += blk_rq_sectors(rq);
				3066	}
				3067
				3068	/*
				3069	* return expired entry, or NULL to just start from scratch in rbtree
				3070	*/
				3071	static struct request cfq_check_fifo(struct cfq_queue cfqq)
				3072	{
				3073	struct request *rq = NULL;
				3074
				3075	if (cfq_cfqq_fifo_expire(cfqq))
				3076	return NULL;
				3077
				3078	cfq_mark_cfqq_fifo_expire(cfqq);
				3079
				3080	if (list_empty(&cfqq->fifo))
				3081	return NULL;
				3082
				3083	rq = rq_entry_fifo(cfqq->fifo.next);
				3084	if (ktime_get_ns() < rq->fifo_time)
				3085	rq = NULL;
				3086
				3087	return rq;
				3088	}
				3089
				3090	static inline int
				3091	cfq_prio_to_maxrq(struct cfq_data cfqd, struct cfq_queue cfqq)
				3092	{
				3093	const int base_rq = cfqd->cfq_slice_async_rq;
				3094
				3095	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
				3096
				3097	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
				3098	}
				3099
				3100	/*
				3101	* Must be called with the queue_lock held.
				3102	*/
				3103	static int cfqq_process_refs(struct cfq_queue *cfqq)
				3104	{
				3105	int process_refs, io_refs;
				3106
				3107	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
				3108	process_refs = cfqq->ref - io_refs;
				3109	BUG_ON(process_refs < 0);
				3110	return process_refs;
				3111	}
				3112
				3113	static void cfq_setup_merge(struct cfq_queue cfqq, struct cfq_queue new_cfqq)
				3114	{
				3115	int process_refs, new_process_refs;
				3116	struct cfq_queue *__cfqq;
				3117
				3118	/*
				3119	* If there are no process references on the new_cfqq, then it is
				3120	* unsafe to follow the ->new_cfqq chain as other cfqq's in the
				3121	* chain may have dropped their last reference (not just their
				3122	* last process reference).
				3123	*/
				3124	if (!cfqq_process_refs(new_cfqq))
				3125	return;
				3126
				3127	/* Avoid a circular list and skip interim queue merges */
				3128	while ((__cfqq = new_cfqq->new_cfqq)) {
				3129	if (__cfqq == cfqq)
				3130	return;
				3131	new_cfqq = __cfqq;
				3132	}
				3133
				3134	process_refs = cfqq_process_refs(cfqq);
				3135	new_process_refs = cfqq_process_refs(new_cfqq);
				3136	/*
				3137	* If the process for the cfqq has gone away, there is no
				3138	* sense in merging the queues.
				3139	*/
				3140	if (process_refs == 0 \|\| new_process_refs == 0)
				3141	return;
				3142
				3143	/*
				3144	* Merge in the direction of the lesser amount of work.
				3145	*/
				3146	if (new_process_refs >= process_refs) {
				3147	cfqq->new_cfqq = new_cfqq;
				3148	new_cfqq->ref += process_refs;
				3149	} else {
				3150	new_cfqq->new_cfqq = cfqq;
				3151	cfqq->ref += new_process_refs;
				3152	}
				3153	}
				3154
				3155	static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
				3156	struct cfq_group *cfqg, enum wl_class_t wl_class)
				3157	{
				3158	struct cfq_queue *queue;
				3159	int i;
				3160	bool key_valid = false;
				3161	u64 lowest_key = 0;
				3162	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
				3163
				3164	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
				3165	/* select the one with lowest rb_key */
				3166	queue = cfq_rb_first(st_for(cfqg, wl_class, i));
				3167	if (queue &&
				3168	(!key_valid \|\| queue->rb_key < lowest_key)) {
				3169	lowest_key = queue->rb_key;
				3170	cur_best = i;
				3171	key_valid = true;
				3172	}
				3173	}
				3174
				3175	return cur_best;
				3176	}
				3177
				3178	static void
				3179	choose_wl_class_and_type(struct cfq_data cfqd, struct cfq_group cfqg)
				3180	{
				3181	u64 slice;
				3182	unsigned count;
				3183	struct cfq_rb_root *st;
				3184	u64 group_slice;
				3185	enum wl_class_t original_class = cfqd->serving_wl_class;
				3186	u64 now = ktime_get_ns();
				3187
				3188	/* Choose next priority. RT > BE > IDLE */
				3189	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
				3190	cfqd->serving_wl_class = RT_WORKLOAD;
				3191	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
				3192	cfqd->serving_wl_class = BE_WORKLOAD;
				3193	else {
				3194	cfqd->serving_wl_class = IDLE_WORKLOAD;
				3195	cfqd->workload_expires = now + jiffies_to_nsecs(1);
				3196	return;
				3197	}
				3198
				3199	if (original_class != cfqd->serving_wl_class)
				3200	goto new_workload;
				3201
				3202	/*
				3203	* For RT and BE, we have to choose also the type
				3204	* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
				3205	* expiration time
				3206	*/
				3207	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
				3208	count = st->count;
				3209
				3210	/*
				3211	* check workload expiration, and that we still have other queues ready
				3212	*/
				3213	if (count && !(now > cfqd->workload_expires))
				3214	return;
				3215
				3216	new_workload:
				3217	/* otherwise select new workload type */
				3218	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
				3219	cfqd->serving_wl_class);
				3220	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
				3221	count = st->count;
				3222
				3223	/*
				3224	* the workload slice is computed as a fraction of target latency
				3225	* proportional to the number of queues in that workload, over
				3226	* all the queues in the same priority class
				3227	*/
				3228	group_slice = cfq_group_slice(cfqd, cfqg);
				3229
				3230	slice = div_u64(group_slice * count,
				3231	max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
				3232	cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
				3233	cfqg)));
				3234
				3235	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
				3236	u64 tmp;
				3237
				3238	/*
				3239	* Async queues are currently system wide. Just taking
				3240	* proportion of queues with-in same group will lead to higher
				3241	* async ratio system wide as generally root group is going
				3242	* to have higher weight. A more accurate thing would be to
				3243	* calculate system wide asnc/sync ratio.
				3244	*/
				3245	tmp = cfqd->cfq_target_latency *
				3246	cfqg_busy_async_queues(cfqd, cfqg);
				3247	tmp = div_u64(tmp, cfqd->busy_queues);
				3248	slice = min_t(u64, slice, tmp);
				3249
				3250	/* async workload slice is scaled down according to
				3251	* the sync/async slice ratio. */
				3252	slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
				3253	} else
				3254	/* sync workload slice is at least 2 * cfq_slice_idle */
				3255	slice = max(slice, 2 * cfqd->cfq_slice_idle);
				3256
				3257	slice = max_t(u64, slice, CFQ_MIN_TT);
				3258	cfq_log(cfqd, "workload slice:%llu", slice);
				3259	cfqd->workload_expires = now + slice;
				3260	}
				3261
				3262	static struct cfq_group cfq_get_next_cfqg(struct cfq_data cfqd)
				3263	{
				3264	struct cfq_rb_root *st = &cfqd->grp_service_tree;
				3265	struct cfq_group *cfqg;
				3266
				3267	if (RB_EMPTY_ROOT(&st->rb.rb_root))
				3268	return NULL;
				3269	cfqg = cfq_rb_first_group(st);
				3270	update_min_vdisktime(st);
				3271	return cfqg;
				3272	}
				3273
				3274	static void cfq_choose_cfqg(struct cfq_data *cfqd)
				3275	{
				3276	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
				3277	u64 now = ktime_get_ns();
				3278
				3279	cfqd->serving_group = cfqg;
				3280
				3281	/* Restore the workload type data */
				3282	if (cfqg->saved_wl_slice) {
				3283	cfqd->workload_expires = now + cfqg->saved_wl_slice;
				3284	cfqd->serving_wl_type = cfqg->saved_wl_type;
				3285	cfqd->serving_wl_class = cfqg->saved_wl_class;
				3286	} else
				3287	cfqd->workload_expires = now - 1;
				3288
				3289	choose_wl_class_and_type(cfqd, cfqg);
				3290	}
				3291
				3292	/*
				3293	* Select a queue for service. If we have a current active queue,
				3294	* check whether to continue servicing it, or retrieve and set a new one.
				3295	*/
				3296	static struct cfq_queue cfq_select_queue(struct cfq_data cfqd)
				3297	{
				3298	struct cfq_queue cfqq, new_cfqq = NULL;
				3299	u64 now = ktime_get_ns();
				3300
				3301	cfqq = cfqd->active_queue;
				3302	if (!cfqq)
				3303	goto new_queue;
				3304
				3305	if (!cfqd->rq_queued)
				3306	return NULL;
				3307
				3308	/*
				3309	* We were waiting for group to get backlogged. Expire the queue
				3310	*/
				3311	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
				3312	goto expire;
				3313
				3314	/*
				3315	* The active queue has run out of time, expire it and select new.
				3316	*/
				3317	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
				3318	/*
				3319	* If slice had not expired at the completion of last request
				3320	* we might not have turned on wait_busy flag. Don't expire
				3321	* the queue yet. Allow the group to get backlogged.
				3322	*
				3323	* The very fact that we have used the slice, that means we
				3324	* have been idling all along on this queue and it should be
				3325	* ok to wait for this request to complete.
				3326	*/
				3327	if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
				3328	&& cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
				3329	cfqq = NULL;
				3330	goto keep_queue;
				3331	} else
				3332	goto check_group_idle;
				3333	}
				3334
				3335	/*
				3336	* The active queue has requests and isn't expired, allow it to
				3337	* dispatch.
				3338	*/
				3339	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
				3340	goto keep_queue;
				3341
				3342	/*
				3343	* If another queue has a request waiting within our mean seek
				3344	* distance, let it run. The expire code will check for close
				3345	* cooperators and put the close queue at the front of the service
				3346	* tree. If possible, merge the expiring queue with the new cfqq.
				3347	*/
				3348	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
				3349	if (new_cfqq) {
				3350	if (!cfqq->new_cfqq)
				3351	cfq_setup_merge(cfqq, new_cfqq);
				3352	goto expire;
				3353	}
				3354
				3355	/*
				3356	* No requests pending. If the active queue still has requests in
				3357	* flight or is idling for a new request, allow either of these
				3358	* conditions to happen (or time out) before selecting a new queue.
				3359	*/
				3360	if (hrtimer_active(&cfqd->idle_slice_timer)) {
				3361	cfqq = NULL;
				3362	goto keep_queue;
				3363	}
				3364
				3365	/*
				3366	* This is a deep seek queue, but the device is much faster than
				3367	* the queue can deliver, don't idle
				3368	**/
				3369	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
				3370	(cfq_cfqq_slice_new(cfqq) \|\|
				3371	(cfqq->slice_end - now > now - cfqq->slice_start))) {
				3372	cfq_clear_cfqq_deep(cfqq);
				3373	cfq_clear_cfqq_idle_window(cfqq);
				3374	}
				3375
				3376	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
				3377	cfqq = NULL;
				3378	goto keep_queue;
				3379	}
				3380
				3381	/*
				3382	* If group idle is enabled and there are requests dispatched from
				3383	* this group, wait for requests to complete.
				3384	*/
				3385	check_group_idle:
				3386	if (get_group_idle(cfqd) && cfqq->cfqg->nr_cfqq == 1 &&
				3387	cfqq->cfqg->dispatched &&
				3388	!cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
				3389	cfqq = NULL;
				3390	goto keep_queue;
				3391	}
				3392
				3393	expire:
				3394	cfq_slice_expired(cfqd, 0);
				3395	new_queue:
				3396	/*
				3397	* Current queue expired. Check if we have to switch to a new
				3398	* service tree
				3399	*/
				3400	if (!new_cfqq)
				3401	cfq_choose_cfqg(cfqd);
				3402
				3403	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
				3404	keep_queue:
				3405	return cfqq;
				3406	}
				3407
				3408	static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
				3409	{
				3410	int dispatched = 0;
				3411
				3412	while (cfqq->next_rq) {
				3413	cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
				3414	dispatched++;
				3415	}
				3416
				3417	BUG_ON(!list_empty(&cfqq->fifo));
				3418
				3419	/* By default cfqq is not expired if it is empty. Do it explicitly */
				3420	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
				3421	return dispatched;
				3422	}
				3423
				3424	/*
				3425	* Drain our current requests. Used for barriers and when switching
				3426	* io schedulers on-the-fly.
				3427	*/
				3428	static int cfq_forced_dispatch(struct cfq_data *cfqd)
				3429	{
				3430	struct cfq_queue *cfqq;
				3431	int dispatched = 0;
				3432
				3433	/* Expire the timeslice of the current active queue first */
				3434	cfq_slice_expired(cfqd, 0);
				3435	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
				3436	__cfq_set_active_queue(cfqd, cfqq);
				3437	dispatched += __cfq_forced_dispatch_cfqq(cfqq);
				3438	}
				3439
				3440	BUG_ON(cfqd->busy_queues);
				3441
				3442	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
				3443	return dispatched;
				3444	}
				3445
				3446	static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
				3447	struct cfq_queue *cfqq)
				3448	{
				3449	u64 now = ktime_get_ns();
				3450
				3451	/* the queue hasn't finished any request, can't estimate */
				3452	if (cfq_cfqq_slice_new(cfqq))
				3453	return true;
				3454	if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
				3455	return true;
				3456
				3457	return false;
				3458	}
				3459
				3460	static bool cfq_may_dispatch(struct cfq_data cfqd, struct cfq_queue cfqq)
				3461	{
				3462	unsigned int max_dispatch;
				3463
				3464	if (cfq_cfqq_must_dispatch(cfqq))
				3465	return true;
				3466
				3467	/*
				3468	* Drain async requests before we start sync IO
				3469	*/
				3470	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
				3471	return false;
				3472
				3473	/*
				3474	* If this is an async queue and we have sync IO in flight, let it wait
				3475	*/
				3476	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
				3477	return false;
				3478
				3479	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
				3480	if (cfq_class_idle(cfqq))
				3481	max_dispatch = 1;
				3482
				3483	/*
				3484	* Does this cfqq already have too much IO in flight?
				3485	*/
				3486	if (cfqq->dispatched >= max_dispatch) {
				3487	bool promote_sync = false;
				3488	/*
				3489	* idle queue must always only have a single IO in flight
				3490	*/
				3491	if (cfq_class_idle(cfqq))
				3492	return false;
				3493
				3494	/*
				3495	* If there is only one sync queue
				3496	* we can ignore async queue here and give the sync
				3497	* queue no dispatch limit. The reason is a sync queue can
				3498	* preempt async queue, limiting the sync queue doesn't make
				3499	* sense. This is useful for aiostress test.
				3500	*/
				3501	if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
				3502	promote_sync = true;
				3503
				3504	/*
				3505	* We have other queues, don't allow more IO from this one
				3506	*/
				3507	if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
				3508	!promote_sync)
				3509	return false;
				3510
				3511	/*
				3512	* Sole queue user, no limit
				3513	*/
				3514	if (cfqd->busy_queues == 1 \|\| promote_sync)
				3515	max_dispatch = -1;
				3516	else
				3517	/*
				3518	* Normally we start throttling cfqq when cfq_quantum/2
				3519	* requests have been dispatched. But we can drive
				3520	* deeper queue depths at the beginning of slice
				3521	* subjected to upper limit of cfq_quantum.
				3522	* */
				3523	max_dispatch = cfqd->cfq_quantum;
				3524	}
				3525
				3526	/*
				3527	* Async queues must wait a bit before being allowed dispatch.
				3528	* We also ramp up the dispatch depth gradually for async IO,
				3529	* based on the last sync IO we serviced
				3530	*/
				3531	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
				3532	u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
				3533	unsigned int depth;
				3534
				3535	depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
				3536	if (!depth && !cfqq->dispatched)
				3537	depth = 1;
				3538	if (depth < max_dispatch)
				3539	max_dispatch = depth;
				3540	}
				3541
				3542	/*
				3543	* If we're below the current max, allow a dispatch
				3544	*/
				3545	return cfqq->dispatched < max_dispatch;
				3546	}
				3547
				3548	/*
				3549	* Dispatch a request from cfqq, moving them to the request queue
				3550	* dispatch list.
				3551	*/
				3552	static bool cfq_dispatch_request(struct cfq_data cfqd, struct cfq_queue cfqq)
				3553	{
				3554	struct request *rq;
				3555
				3556	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
				3557
				3558	rq = cfq_check_fifo(cfqq);
				3559	if (rq)
				3560	cfq_mark_cfqq_must_dispatch(cfqq);
				3561
				3562	if (!cfq_may_dispatch(cfqd, cfqq))
				3563	return false;
				3564
				3565	/*
				3566	* follow expired path, else get first next available
				3567	*/
				3568	if (!rq)
				3569	rq = cfqq->next_rq;
				3570	else
				3571	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
				3572
				3573	/*
				3574	* insert request into driver dispatch list
				3575	*/
				3576	cfq_dispatch_insert(cfqd->queue, rq);
				3577
				3578	if (!cfqd->active_cic) {
				3579	struct cfq_io_cq *cic = RQ_CIC(rq);
				3580
				3581	atomic_long_inc(&cic->icq.ioc->refcount);
				3582	cfqd->active_cic = cic;
				3583	}
				3584
				3585	return true;
				3586	}
				3587
				3588	/*
				3589	* Find the cfqq that we need to service and move a request from that to the
				3590	* dispatch list
				3591	*/
				3592	static int cfq_dispatch_requests(struct request_queue *q, int force)
				3593	{
				3594	struct cfq_data *cfqd = q->elevator->elevator_data;
				3595	struct cfq_queue *cfqq;
				3596
				3597	if (!cfqd->busy_queues)
				3598	return 0;
				3599
				3600	if (unlikely(force))
				3601	return cfq_forced_dispatch(cfqd);
				3602
				3603	cfqq = cfq_select_queue(cfqd);
				3604	if (!cfqq)
				3605	return 0;
				3606
				3607	/*
				3608	* Dispatch a request from this cfqq, if it is allowed
				3609	*/
				3610	if (!cfq_dispatch_request(cfqd, cfqq))
				3611	return 0;
				3612
				3613	cfqq->slice_dispatch++;
				3614	cfq_clear_cfqq_must_dispatch(cfqq);
				3615
				3616	/*
				3617	* expire an async queue immediately if it has used up its slice. idle
				3618	* queue always expire after 1 dispatch round.
				3619	*/
				3620	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
				3621	cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) \|\|
				3622	cfq_class_idle(cfqq))) {
				3623	cfqq->slice_end = ktime_get_ns() + 1;
				3624	cfq_slice_expired(cfqd, 0);
				3625	}
				3626
				3627	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
				3628	return 1;
				3629	}
				3630
				3631	/*
				3632	* task holds one reference to the queue, dropped when task exits. each rq
				3633	* in-flight on this queue also holds a reference, dropped when rq is freed.
				3634	*
				3635	* Each cfq queue took a reference on the parent group. Drop it now.
				3636	* queue lock must be held here.
				3637	*/
				3638	static void cfq_put_queue(struct cfq_queue *cfqq)
				3639	{
				3640	struct cfq_data *cfqd = cfqq->cfqd;
				3641	struct cfq_group *cfqg;
				3642
				3643	BUG_ON(cfqq->ref <= 0);
				3644
				3645	cfqq->ref--;
				3646	if (cfqq->ref)
				3647	return;
				3648
				3649	cfq_log_cfqq(cfqd, cfqq, "put_queue");
				3650	BUG_ON(rb_first(&cfqq->sort_list));
				3651	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
				3652	cfqg = cfqq->cfqg;
				3653
				3654	if (unlikely(cfqd->active_queue == cfqq)) {
				3655	__cfq_slice_expired(cfqd, cfqq, 0);
				3656	cfq_schedule_dispatch(cfqd);
				3657	}
				3658
				3659	BUG_ON(cfq_cfqq_on_rr(cfqq));
				3660	kmem_cache_free(cfq_pool, cfqq);
				3661	cfqg_put(cfqg);
				3662	}
				3663
				3664	static void cfq_put_cooperator(struct cfq_queue *cfqq)
				3665	{
				3666	struct cfq_queue __cfqq, next;
				3667
				3668	/*
				3669	* If this queue was scheduled to merge with another queue, be
				3670	* sure to drop the reference taken on that queue (and others in
				3671	* the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
				3672	*/
				3673	__cfqq = cfqq->new_cfqq;
				3674	while (__cfqq) {
				3675	if (__cfqq == cfqq) {
				3676	WARN(1, "cfqq->new_cfqq loop detected\n");
				3677	break;
				3678	}
				3679	next = __cfqq->new_cfqq;
				3680	cfq_put_queue(__cfqq);
				3681	__cfqq = next;
				3682	}
				3683	}
				3684
				3685	static void cfq_exit_cfqq(struct cfq_data cfqd, struct cfq_queue cfqq)
				3686	{
				3687	if (unlikely(cfqq == cfqd->active_queue)) {
				3688	__cfq_slice_expired(cfqd, cfqq, 0);
				3689	cfq_schedule_dispatch(cfqd);
				3690	}
				3691
				3692	cfq_put_cooperator(cfqq);
				3693
				3694	cfq_put_queue(cfqq);
				3695	}
				3696
				3697	static void cfq_init_icq(struct io_cq *icq)
				3698	{
				3699	struct cfq_io_cq *cic = icq_to_cic(icq);
				3700
				3701	cic->ttime.last_end_request = ktime_get_ns();
				3702	}
				3703
				3704	static void cfq_exit_icq(struct io_cq *icq)
				3705	{
				3706	struct cfq_io_cq *cic = icq_to_cic(icq);
				3707	struct cfq_data *cfqd = cic_to_cfqd(cic);
				3708
				3709	if (cic_to_cfqq(cic, false)) {
				3710	cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
				3711	cic_set_cfqq(cic, NULL, false);
				3712	}
				3713
				3714	if (cic_to_cfqq(cic, true)) {
				3715	cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
				3716	cic_set_cfqq(cic, NULL, true);
				3717	}
				3718	}
				3719
				3720	static void cfq_init_prio_data(struct cfq_queue cfqq, struct cfq_io_cq cic)
				3721	{
				3722	struct task_struct *tsk = current;
				3723	int ioprio_class;
				3724
				3725	if (!cfq_cfqq_prio_changed(cfqq))
				3726	return;
				3727
				3728	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
				3729	switch (ioprio_class) {
				3730	default:
				3731	printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
				3732	/* fall through */
				3733	case IOPRIO_CLASS_NONE:
				3734	/*
				3735	* no prio set, inherit CPU scheduling settings
				3736	*/
				3737	cfqq->ioprio = task_nice_ioprio(tsk);
				3738	cfqq->ioprio_class = task_nice_ioclass(tsk);
				3739	break;
				3740	case IOPRIO_CLASS_RT:
				3741	cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
				3742	cfqq->ioprio_class = IOPRIO_CLASS_RT;
				3743	break;
				3744	case IOPRIO_CLASS_BE:
				3745	cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
				3746	cfqq->ioprio_class = IOPRIO_CLASS_BE;
				3747	break;
				3748	case IOPRIO_CLASS_IDLE:
				3749	cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
				3750	cfqq->ioprio = 7;
				3751	cfq_clear_cfqq_idle_window(cfqq);
				3752	break;
				3753	}
				3754
				3755	/*
				3756	* keep track of original prio settings in case we have to temporarily
				3757	* elevate the priority of this queue
				3758	*/
				3759	cfqq->org_ioprio = cfqq->ioprio;
				3760	cfqq->org_ioprio_class = cfqq->ioprio_class;
				3761	cfq_clear_cfqq_prio_changed(cfqq);
				3762	}
				3763
				3764	static void check_ioprio_changed(struct cfq_io_cq cic, struct bio bio)
				3765	{
				3766	int ioprio = cic->icq.ioc->ioprio;
				3767	struct cfq_data *cfqd = cic_to_cfqd(cic);
				3768	struct cfq_queue *cfqq;
				3769
				3770	/*
				3771	* Check whether ioprio has changed. The condition may trigger
				3772	* spuriously on a newly created cic but there's no harm.
				3773	*/
				3774	if (unlikely(!cfqd) \|\| likely(cic->ioprio == ioprio))
				3775	return;
				3776
				3777	cfqq = cic_to_cfqq(cic, false);
				3778	if (cfqq) {
				3779	cfq_put_queue(cfqq);
				3780	cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
				3781	cic_set_cfqq(cic, cfqq, false);
				3782	}
				3783
				3784	cfqq = cic_to_cfqq(cic, true);
				3785	if (cfqq)
				3786	cfq_mark_cfqq_prio_changed(cfqq);
				3787
				3788	cic->ioprio = ioprio;
				3789	}
				3790
				3791	static void cfq_init_cfqq(struct cfq_data cfqd, struct cfq_queue cfqq,
				3792	pid_t pid, bool is_sync)
				3793	{
				3794	RB_CLEAR_NODE(&cfqq->rb_node);
				3795	RB_CLEAR_NODE(&cfqq->p_node);
				3796	INIT_LIST_HEAD(&cfqq->fifo);
				3797
				3798	cfqq->ref = 0;
				3799	cfqq->cfqd = cfqd;
				3800
				3801	cfq_mark_cfqq_prio_changed(cfqq);
				3802
				3803	if (is_sync) {
				3804	if (!cfq_class_idle(cfqq))
				3805	cfq_mark_cfqq_idle_window(cfqq);
				3806	cfq_mark_cfqq_sync(cfqq);
				3807	}
				3808	cfqq->pid = pid;
				3809	}
				3810
				#ifdef CONFIG_CFQ_GROUP_IOSCHED
				/*
				 * React to the bio's blkcg differing from the one recorded in @cic
				 * (compared by css serial number): detach and release both the async and
				 * the sync queue, then record the new serial number.
				 */
				static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
				{
					struct cfq_data *cfqd = cic_to_cfqd(cic);
					struct cfq_queue *cfqq;
					uint64_t serial_nr;

					rcu_read_lock();
					serial_nr = bio_blkcg(bio)->css.serial_nr;
					rcu_read_unlock();

					/*
					 * Check whether blkcg has changed.  The condition may trigger
					 * spuriously on a newly created cic but there's no harm.
					 */
					if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
						return;

					/*
					 * Drop reference to queues.  New queues will be assigned in new
					 * group upon arrival of fresh requests.
					 */
					cfqq = cic_to_cfqq(cic, false);
					if (cfqq) {
						cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
						cic_set_cfqq(cic, NULL, false);
						cfq_put_queue(cfqq);
					}

					cfqq = cic_to_cfqq(cic, true);
					if (cfqq) {
						cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
						cic_set_cfqq(cic, NULL, true);
						cfq_put_queue(cfqq);
					}

					cic->blkcg_serial_nr = serial_nr;
				}
				#else
				/* Without CONFIG_CFQ_GROUP_IOSCHED this check is a no-op. */
				static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
				{
				}
				#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
				3854
				3855	static struct cfq_queue **
				3856	cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
				3857	{
				3858	switch (ioprio_class) {
				3859	case IOPRIO_CLASS_RT:
				3860	return &cfqg->async_cfqq[0][ioprio];
				3861	case IOPRIO_CLASS_NONE:
				3862	ioprio = IOPRIO_NORM;
				3863	/* fall through */
				3864	case IOPRIO_CLASS_BE:
				3865	return &cfqg->async_cfqq[1][ioprio];
				3866	case IOPRIO_CLASS_IDLE:
				3867	return &cfqg->async_idle_cfqq;
				3868	default:
				3869	BUG();
				3870	}
				3871	}
				3872
				3873	static struct cfq_queue *
				3874	cfq_get_queue(struct cfq_data cfqd, bool is_sync, struct cfq_io_cq cic,
				3875	struct bio *bio)
				3876	{
				3877	int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
				3878	int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
				3879	struct cfq_queue **async_cfqq = NULL;
				3880	struct cfq_queue *cfqq;
				3881	struct cfq_group *cfqg;
				3882
				3883	rcu_read_lock();
				3884	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
				3885	if (!cfqg) {
				3886	cfqq = &cfqd->oom_cfqq;
				3887	goto out;
				3888	}
				3889
				3890	if (!is_sync) {
				3891	if (!ioprio_valid(cic->ioprio)) {
				3892	struct task_struct *tsk = current;
				3893	ioprio = task_nice_ioprio(tsk);
				3894	ioprio_class = task_nice_ioclass(tsk);
				3895	}
				3896	async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
				3897	cfqq = *async_cfqq;
				3898	if (cfqq)
				3899	goto out;
				3900	}
				3901
				3902	cfqq = kmem_cache_alloc_node(cfq_pool,
				3903	GFP_NOWAIT \| __GFP_ZERO \| __GFP_NOWARN,
				3904	cfqd->queue->node);
				3905	if (!cfqq) {
				3906	cfqq = &cfqd->oom_cfqq;
				3907	goto out;
				3908	}
				3909
				3910	/* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
				3911	cfqq->ioprio_class = IOPRIO_CLASS_NONE;
				3912	cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
				3913	cfq_init_prio_data(cfqq, cic);
				3914	cfq_link_cfqq_cfqg(cfqq, cfqg);
				3915	cfq_log_cfqq(cfqd, cfqq, "alloced");
				3916
				3917	if (async_cfqq) {
				3918	/* a new async queue is created, pin and remember */
				3919	cfqq->ref++;
				3920	*async_cfqq = cfqq;
				3921	}
				3922	out:
				3923	cfqq->ref++;
				3924	rcu_read_unlock();
				3925	return cfqq;
				3926	}
				3927
				3928	static void
				3929	__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
				3930	{
				3931	u64 elapsed = ktime_get_ns() - ttime->last_end_request;
				3932	elapsed = min(elapsed, 2UL * slice_idle);
				3933
				3934	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
				3935	ttime->ttime_total = div_u64(7ttime->ttime_total + 256elapsed, 8);
				3936	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				3937	ttime->ttime_samples);
				3938	}
				3939
				3940	static void
				3941	cfq_update_io_thinktime(struct cfq_data cfqd, struct cfq_queue cfqq,
				3942	struct cfq_io_cq *cic)
				3943	{
				3944	if (cfq_cfqq_sync(cfqq)) {
				3945	__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
				3946	__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
				3947	cfqd->cfq_slice_idle);
				3948	}
				3949	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				3950	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, get_group_idle(cfqd));
				3951	#endif
				3952	}
				3953
				3954	static void
				3955	cfq_update_io_seektime(struct cfq_data cfqd, struct cfq_queue cfqq,
				3956	struct request *rq)
				3957	{
				3958	sector_t sdist = 0;
				3959	sector_t n_sec = blk_rq_sectors(rq);
				3960	if (cfqq->last_request_pos) {
				3961	if (cfqq->last_request_pos < blk_rq_pos(rq))
				3962	sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
				3963	else
				3964	sdist = cfqq->last_request_pos - blk_rq_pos(rq);
				3965	}
				3966
				3967	cfqq->seek_history <<= 1;
				3968	if (blk_queue_nonrot(cfqd->queue))
				3969	cfqq->seek_history \|= (n_sec < CFQQ_SECT_THR_NONROT);
				3970	else
				3971	cfqq->seek_history \|= (sdist > CFQQ_SEEK_THR);
				3972	}
				3973
				3974	static inline bool req_noidle(struct request *req)
				3975	{
				3976	return req_op(req) == REQ_OP_WRITE &&
				3977	(req->cmd_flags & (REQ_SYNC \| REQ_IDLE)) == REQ_SYNC;
				3978	}
				3979
				3980	/*
				3981	* Disable idle window if the process thinks too long or seeks so much that
				3982	* it doesn't matter
				3983	*/
				3984	static void
				3985	cfq_update_idle_window(struct cfq_data cfqd, struct cfq_queue cfqq,
				3986	struct cfq_io_cq *cic)
				3987	{
				3988	int old_idle, enable_idle;
				3989
				3990	/*
				3991	* Don't idle for async or idle io prio class
				3992	*/
				3993	if (!cfq_cfqq_sync(cfqq) \|\| cfq_class_idle(cfqq))
				3994	return;
				3995
				3996	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
				3997
				3998	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
				3999	cfq_mark_cfqq_deep(cfqq);
				4000
				4001	if (cfqq->next_rq && req_noidle(cfqq->next_rq))
				4002	enable_idle = 0;
				4003	else if (!atomic_read(&cic->icq.ioc->active_ref) \|\|
				4004	!cfqd->cfq_slice_idle \|\|
				4005	(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
				4006	enable_idle = 0;
				4007	else if (sample_valid(cic->ttime.ttime_samples)) {
				4008	if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
				4009	enable_idle = 0;
				4010	else
				4011	enable_idle = 1;
				4012	}
				4013
				4014	if (old_idle != enable_idle) {
				4015	cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
				4016	if (enable_idle)
				4017	cfq_mark_cfqq_idle_window(cfqq);
				4018	else
				4019	cfq_clear_cfqq_idle_window(cfqq);
				4020	}
				4021	}
				4022
				4023	/*
				4024	* Check if new_cfqq should preempt the currently active queue. Return 0 for
				4025	* no or if we aren't sure, a 1 will cause a preempt.
				4026	*/
				4027	static bool
				4028	cfq_should_preempt(struct cfq_data cfqd, struct cfq_queue new_cfqq,
				4029	struct request *rq)
				4030	{
				4031	struct cfq_queue *cfqq;
				4032
				4033	cfqq = cfqd->active_queue;
				4034	if (!cfqq)
				4035	return false;
				4036
				4037	if (cfq_class_idle(new_cfqq))
				4038	return false;
				4039
				4040	if (cfq_class_idle(cfqq))
				4041	return true;
				4042
				4043	/*
				4044	* Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
				4045	*/
				4046	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
				4047	return false;
				4048
				4049	/*
				4050	* if the new request is sync, but the currently running queue is
				4051	* not, let the sync request have priority.
				4052	*/
				4053	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
				4054	return true;
				4055
				4056	/*
				4057	* Treat ancestors of current cgroup the same way as current cgroup.
				4058	* For anybody else we disallow preemption to guarantee service
				4059	* fairness among cgroups.
				4060	*/
				4061	if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
				4062	return false;
				4063
				4064	if (cfq_slice_used(cfqq))
				4065	return true;
				4066
				4067	/*
				4068	* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
				4069	*/
				4070	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
				4071	return true;
				4072
				4073	WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
				4074	/* Allow preemption only if we are idling on sync-noidle tree */
				4075	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
				4076	cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
				4077	RB_EMPTY_ROOT(&cfqq->sort_list))
				4078	return true;
				4079
				4080	/*
				4081	* So both queues are sync. Let the new request get disk time if
				4082	* it's a metadata request and the current queue is doing regular IO.
				4083	*/
				4084	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
				4085	return true;
				4086
				4087	/* An idle queue should not be idle now for some reason */
				4088	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
				4089	return true;
				4090
				4091	if (!cfqd->active_cic \|\| !cfq_cfqq_wait_request(cfqq))
				4092	return false;
				4093
				4094	/*
				4095	* if this request is as-good as one we would expect from the
				4096	* current cfqq, let it preempt
				4097	*/
				4098	if (cfq_rq_close(cfqd, cfqq, rq))
				4099	return true;
				4100
				4101	return false;
				4102	}
				4103
				4104	/*
				4105	* cfqq preempts the active queue. if we allowed preempt with no slice left,
				4106	* let it have half of its nominal slice.
				4107	*/
				4108	static void cfq_preempt_queue(struct cfq_data cfqd, struct cfq_queue cfqq)
				4109	{
				4110	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
				4111
				4112	cfq_log_cfqq(cfqd, cfqq, "preempt");
				4113	cfq_slice_expired(cfqd, 1);
				4114
				4115	/*
				4116	* workload type is changed, don't save slice, otherwise preempt
				4117	* doesn't happen
				4118	*/
				4119	if (old_type != cfqq_type(cfqq))
				4120	cfqq->cfqg->saved_wl_slice = 0;
				4121
				4122	/*
				4123	* Put the new queue at the front of the of the current list,
				4124	* so we know that it will be selected next.
				4125	*/
				4126	BUG_ON(!cfq_cfqq_on_rr(cfqq));
				4127
				4128	cfq_service_tree_add(cfqd, cfqq, 1);
				4129
				4130	cfqq->slice_end = 0;
				4131	cfq_mark_cfqq_slice_new(cfqq);
				4132	}
				4133
				4134	/*
				4135	* Called when a new fs request (rq) is added (to cfqq). Check if there's
				4136	* something we should do about it
				4137	*/
				4138	static void
				4139	cfq_rq_enqueued(struct cfq_data cfqd, struct cfq_queue cfqq,
				4140	struct request *rq)
				4141	{
				4142	struct cfq_io_cq *cic = RQ_CIC(rq);
				4143
				4144	cfqd->rq_queued++;
				4145	if (rq->cmd_flags & REQ_PRIO)
				4146	cfqq->prio_pending++;
				4147
				4148	cfq_update_io_thinktime(cfqd, cfqq, cic);
				4149	cfq_update_io_seektime(cfqd, cfqq, rq);
				4150	cfq_update_idle_window(cfqd, cfqq, cic);
				4151
				4152	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
				4153
				4154	if (cfqq == cfqd->active_queue) {
				4155	/*
				4156	* Remember that we saw a request from this process, but
				4157	* don't start queuing just yet. Otherwise we risk seeing lots
				4158	* of tiny requests, because we disrupt the normal plugging
				4159	* and merging. If the request is already larger than a single
				4160	* page, let it rip immediately. For that case we assume that
				4161	* merging is already done. Ditto for a busy system that
				4162	* has other work pending, don't risk delaying until the
				4163	* idle timer unplug to continue working.
				4164	*/
				4165	if (cfq_cfqq_wait_request(cfqq)) {
				4166	if (blk_rq_bytes(rq) > PAGE_SIZE \|\|
				4167	cfqd->busy_queues > 1) {
				4168	cfq_del_timer(cfqd, cfqq);
				4169	cfq_clear_cfqq_wait_request(cfqq);
				4170	__blk_run_queue(cfqd->queue);
				4171	} else {
				4172	cfqg_stats_update_idle_time(cfqq->cfqg);
				4173	cfq_mark_cfqq_must_dispatch(cfqq);
				4174	}
				4175	}
				4176	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
				4177	/*
				4178	* not the active queue - expire current slice if it is
				4179	* idle and has expired it's mean thinktime or this new queue
				4180	* has some old slice time left and is of higher priority or
				4181	* this new queue is RT and the current one is BE
				4182	*/
				4183	cfq_preempt_queue(cfqd, cfqq);
				4184	__blk_run_queue(cfqd->queue);
				4185	}
				4186	}
				4187
				4188	static void cfq_insert_request(struct request_queue q, struct request rq)
				4189	{
				4190	struct cfq_data *cfqd = q->elevator->elevator_data;
				4191	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				4192
				4193	cfq_log_cfqq(cfqd, cfqq, "insert_request");
				4194	cfq_init_prio_data(cfqq, RQ_CIC(rq));
				4195
				4196	rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
				4197	list_add_tail(&rq->queuelist, &cfqq->fifo);
				4198	cfq_add_rq_rb(rq);
				4199	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
				4200	rq->cmd_flags);
				4201	cfq_rq_enqueued(cfqd, cfqq, rq);
				4202	}
				4203
				4204	/*
				4205	* Update hw_tag based on peak queue depth over 50 samples under
				4206	* sufficient load.
				4207	*/
				4208	static void cfq_update_hw_tag(struct cfq_data *cfqd)
				4209	{
				4210	struct cfq_queue *cfqq = cfqd->active_queue;
				4211
				4212	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
				4213	cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
				4214
				4215	if (cfqd->hw_tag == 1)
				4216	return;
				4217
				4218	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
				4219	cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
				4220	return;
				4221
				4222	/*
				4223	* If active queue hasn't enough requests and can idle, cfq might not
				4224	* dispatch sufficient requests to hardware. Don't zero hw_tag in this
				4225	* case
				4226	*/
				4227	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
				4228	cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
				4229	CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
				4230	return;
				4231
				4232	if (cfqd->hw_tag_samples++ < 50)
				4233	return;
				4234
				4235	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
				4236	cfqd->hw_tag = 1;
				4237	else
				4238	cfqd->hw_tag = 0;
				4239	}
				4240
				4241	static bool cfq_should_wait_busy(struct cfq_data cfqd, struct cfq_queue cfqq)
				4242	{
				4243	struct cfq_io_cq *cic = cfqd->active_cic;
				4244	u64 now = ktime_get_ns();
				4245
				4246	/* If the queue already has requests, don't wait */
				4247	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
				4248	return false;
				4249
				4250	/* If there are other queues in the group, don't wait */
				4251	if (cfqq->cfqg->nr_cfqq > 1)
				4252	return false;
				4253
				4254	/* the only queue in the group, but think time is big */
				4255	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
				4256	return false;
				4257
				4258	if (cfq_slice_used(cfqq))
				4259	return true;
				4260
				4261	/* if slice left is less than think time, wait busy */
				4262	if (cic && sample_valid(cic->ttime.ttime_samples)
				4263	&& (cfqq->slice_end - now < cic->ttime.ttime_mean))
				4264	return true;
				4265
				4266	/*
				4267	* If think times is less than a jiffy than ttime_mean=0 and above
				4268	* will not be true. It might happen that slice has not expired yet
				4269	* but will expire soon (4-5 ns) during select_queue(). To cover the
				4270	* case where think time is less than a jiffy, mark the queue wait
				4271	* busy if only 1 jiffy is left in the slice.
				4272	*/
				4273	if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
				4274	return true;
				4275
				4276	return false;
				4277	}
				4278
				4279	static void cfq_completed_request(struct request_queue q, struct request rq)
				4280	{
				4281	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				4282	struct cfq_data *cfqd = cfqq->cfqd;
				4283	const int sync = rq_is_sync(rq);
				4284	u64 now = ktime_get_ns();
				4285
				4286	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
				4287
				4288	cfq_update_hw_tag(cfqd);
				4289
				4290	WARN_ON(!cfqd->rq_in_driver);
				4291	WARN_ON(!cfqq->dispatched);
				4292	cfqd->rq_in_driver--;
				4293	cfqq->dispatched--;
				4294	(RQ_CFQG(rq))->dispatched--;
				4295	cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
				4296	rq->io_start_time_ns, rq->cmd_flags);
				4297
				4298	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
				4299
				4300	if (sync) {
				4301	struct cfq_rb_root *st;
				4302
				4303	RQ_CIC(rq)->ttime.last_end_request = now;
				4304
				4305	if (cfq_cfqq_on_rr(cfqq))
				4306	st = cfqq->service_tree;
				4307	else
				4308	st = st_for(cfqq->cfqg, cfqq_class(cfqq),
				4309	cfqq_type(cfqq));
				4310
				4311	st->ttime.last_end_request = now;
				4312	if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
				4313	cfqd->last_delayed_sync = now;
				4314	}
				4315
				4316	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4317	cfqq->cfqg->ttime.last_end_request = now;
				4318	#endif
				4319
				4320	/*
				4321	* If this is the active queue, check if it needs to be expired,
				4322	* or if we want to idle in case it has no pending requests.
				4323	*/
				4324	if (cfqd->active_queue == cfqq) {
				4325	const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
				4326
				4327	if (cfq_cfqq_slice_new(cfqq)) {
				4328	cfq_set_prio_slice(cfqd, cfqq);
				4329	cfq_clear_cfqq_slice_new(cfqq);
				4330	}
				4331
				4332	/*
				4333	* Should we wait for next request to come in before we expire
				4334	* the queue.
				4335	*/
				4336	if (cfq_should_wait_busy(cfqd, cfqq)) {
				4337	u64 extend_sl = cfqd->cfq_slice_idle;
				4338	if (!cfqd->cfq_slice_idle)
				4339	extend_sl = get_group_idle(cfqd);
				4340	cfqq->slice_end = now + extend_sl;
				4341	cfq_mark_cfqq_wait_busy(cfqq);
				4342	cfq_log_cfqq(cfqd, cfqq, "will busy wait");
				4343	}
				4344
				4345	/*
				4346	* Idling is not enabled on:
				4347	* - expired queues
				4348	* - idle-priority queues
				4349	* - async queues
				4350	* - queues with still some requests queued
				4351	* - when there is a close cooperator
				4352	*/
				4353	if (cfq_slice_used(cfqq) \|\| cfq_class_idle(cfqq))
				4354	cfq_slice_expired(cfqd, 1);
				4355	else if (sync && cfqq_empty &&
				4356	!cfq_close_cooperator(cfqd, cfqq)) {
				4357	cfq_arm_slice_timer(cfqd);
				4358	}
				4359	}
				4360
				4361	if (!cfqd->rq_in_driver)
				4362	cfq_schedule_dispatch(cfqd);
				4363	}
				4364
				4365	static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
				4366	{
				4367	/*
				4368	* If REQ_PRIO is set, boost class and prio level, if it's below
				4369	* BE/NORM. If prio is not set, restore the potentially boosted
				4370	* class/prio level.
				4371	*/
				4372	if (!(op & REQ_PRIO)) {
				4373	cfqq->ioprio_class = cfqq->org_ioprio_class;
				4374	cfqq->ioprio = cfqq->org_ioprio;
				4375	} else {
				4376	if (cfq_class_idle(cfqq))
				4377	cfqq->ioprio_class = IOPRIO_CLASS_BE;
				4378	if (cfqq->ioprio > IOPRIO_NORM)
				4379	cfqq->ioprio = IOPRIO_NORM;
				4380	}
				4381	}
				4382
				4383	static inline int __cfq_may_queue(struct cfq_queue *cfqq)
				4384	{
				4385	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
				4386	cfq_mark_cfqq_must_alloc_slice(cfqq);
				4387	return ELV_MQUEUE_MUST;
				4388	}
				4389
				4390	return ELV_MQUEUE_MAY;
				4391	}
				4392
				4393	static int cfq_may_queue(struct request_queue *q, unsigned int op)
				4394	{
				4395	struct cfq_data *cfqd = q->elevator->elevator_data;
				4396	struct task_struct *tsk = current;
				4397	struct cfq_io_cq *cic;
				4398	struct cfq_queue *cfqq;
				4399
				4400	/*
				4401	* don't force setup of a queue from here, as a call to may_queue
				4402	* does not necessarily imply that a request actually will be queued.
				4403	* so just lookup a possibly existing queue, or return 'may queue'
				4404	* if that fails
				4405	*/
				4406	cic = cfq_cic_lookup(cfqd, tsk->io_context);
				4407	if (!cic)
				4408	return ELV_MQUEUE_MAY;
				4409
				4410	cfqq = cic_to_cfqq(cic, op_is_sync(op));
				4411	if (cfqq) {
				4412	cfq_init_prio_data(cfqq, cic);
				4413	cfqq_boost_on_prio(cfqq, op);
				4414
				4415	return __cfq_may_queue(cfqq);
				4416	}
				4417
				4418	return ELV_MQUEUE_MAY;
				4419	}
				4420
				4421	/*
				4422	* queue lock held here
				4423	*/
				4424	static void cfq_put_request(struct request *rq)
				4425	{
				4426	struct cfq_queue *cfqq = RQ_CFQQ(rq);
				4427
				4428	if (cfqq) {
				4429	const int rw = rq_data_dir(rq);
				4430
				4431	BUG_ON(!cfqq->allocated[rw]);
				4432	cfqq->allocated[rw]--;
				4433
				4434	/* Put down rq reference on cfqg */
				4435	cfqg_put(RQ_CFQG(rq));
				4436	rq->elv.priv[0] = NULL;
				4437	rq->elv.priv[1] = NULL;
				4438
				4439	cfq_put_queue(cfqq);
				4440	}
				4441	}
				4442
				4443	static struct cfq_queue *
				4444	cfq_merge_cfqqs(struct cfq_data cfqd, struct cfq_io_cq cic,
				4445	struct cfq_queue *cfqq)
				4446	{
				4447	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
				4448	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
				4449	cfq_mark_cfqq_coop(cfqq->new_cfqq);
				4450	cfq_put_queue(cfqq);
				4451	return cic_to_cfqq(cic, 1);
				4452	}
				4453
				4454	/*
				4455	* Returns NULL if a new cfqq should be allocated, or the old cfqq if this
				4456	* was the last process referring to said cfqq.
				4457	*/
				4458	static struct cfq_queue *
				4459	split_cfqq(struct cfq_io_cq cic, struct cfq_queue cfqq)
				4460	{
				4461	if (cfqq_process_refs(cfqq) == 1) {
				4462	cfqq->pid = current->pid;
				4463	cfq_clear_cfqq_coop(cfqq);
				4464	cfq_clear_cfqq_split_coop(cfqq);
				4465	return cfqq;
				4466	}
				4467
				4468	cic_set_cfqq(cic, NULL, 1);
				4469
				4470	cfq_put_cooperator(cfqq);
				4471
				4472	cfq_put_queue(cfqq);
				4473	return NULL;
				4474	}
				4475	/*
				4476	* Allocate cfq data structures associated with this request.
				4477	*/
				4478	static int
				4479	cfq_set_request(struct request_queue q, struct request rq, struct bio *bio,
				4480	gfp_t gfp_mask)
				4481	{
				4482	struct cfq_data *cfqd = q->elevator->elevator_data;
				4483	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
				4484	const int rw = rq_data_dir(rq);
				4485	const bool is_sync = rq_is_sync(rq);
				4486	struct cfq_queue *cfqq;
				4487
				4488	spin_lock_irq(q->queue_lock);
				4489
				4490	check_ioprio_changed(cic, bio);
				4491	check_blkcg_changed(cic, bio);
				4492	new_queue:
				4493	cfqq = cic_to_cfqq(cic, is_sync);
				4494	if (!cfqq \|\| cfqq == &cfqd->oom_cfqq) {
				4495	if (cfqq)
				4496	cfq_put_queue(cfqq);
				4497	cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
				4498	cic_set_cfqq(cic, cfqq, is_sync);
				4499	} else {
				4500	/*
				4501	* If the queue was seeky for too long, break it apart.
				4502	*/
				4503	if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
				4504	cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
				4505	cfqq = split_cfqq(cic, cfqq);
				4506	if (!cfqq)
				4507	goto new_queue;
				4508	}
				4509
				4510	/*
				4511	* Check to see if this queue is scheduled to merge with
				4512	* another, closely cooperating queue. The merging of
				4513	* queues happens here as it must be done in process context.
				4514	* The reference on new_cfqq was taken in merge_cfqqs.
				4515	*/
				4516	if (cfqq->new_cfqq)
				4517	cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
				4518	}
				4519
				4520	cfqq->allocated[rw]++;
				4521
				4522	cfqq->ref++;
				4523	cfqg_get(cfqq->cfqg);
				4524	rq->elv.priv[0] = cfqq;
				4525	rq->elv.priv[1] = cfqq->cfqg;
				4526	spin_unlock_irq(q->queue_lock);
				4527
				4528	return 0;
				4529	}
				4530
				4531	static void cfq_kick_queue(struct work_struct *work)
				4532	{
				4533	struct cfq_data *cfqd =
				4534	container_of(work, struct cfq_data, unplug_work);
				4535	struct request_queue *q = cfqd->queue;
				4536
				4537	spin_lock_irq(q->queue_lock);
				4538	__blk_run_queue(cfqd->queue);
				4539	spin_unlock_irq(q->queue_lock);
				4540	}
				4541
				4542	/*
				4543	* Timer running if the active_queue is currently idling inside its time slice
				4544	*/
				4545	static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
				4546	{
				4547	struct cfq_data *cfqd = container_of(timer, struct cfq_data,
				4548	idle_slice_timer);
				4549	struct cfq_queue *cfqq;
				4550	unsigned long flags;
				4551	int timed_out = 1;
				4552
				4553	cfq_log(cfqd, "idle timer fired");
				4554
				4555	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
				4556
				4557	cfqq = cfqd->active_queue;
				4558	if (cfqq) {
				4559	timed_out = 0;
				4560
				4561	/*
				4562	* We saw a request before the queue expired, let it through
				4563	*/
				4564	if (cfq_cfqq_must_dispatch(cfqq))
				4565	goto out_kick;
				4566
				4567	/*
				4568	* expired
				4569	*/
				4570	if (cfq_slice_used(cfqq))
				4571	goto expire;
				4572
				4573	/*
				4574	* only expire and reinvoke request handler, if there are
				4575	* other queues with pending requests
				4576	*/
				4577	if (!cfqd->busy_queues)
				4578	goto out_cont;
				4579
				4580	/*
				4581	* not expired and it has a request pending, let it dispatch
				4582	*/
				4583	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
				4584	goto out_kick;
				4585
				4586	/*
				4587	* Queue depth flag is reset only when the idle didn't succeed
				4588	*/
				4589	cfq_clear_cfqq_deep(cfqq);
				4590	}
				4591	expire:
				4592	cfq_slice_expired(cfqd, timed_out);
				4593	out_kick:
				4594	cfq_schedule_dispatch(cfqd);
				4595	out_cont:
				4596	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
				4597	return HRTIMER_NORESTART;
				4598	}
				4599
				4600	static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
				4601	{
				4602	hrtimer_cancel(&cfqd->idle_slice_timer);
				4603	cancel_work_sync(&cfqd->unplug_work);
				4604	}
				4605
				4606	static void cfq_exit_queue(struct elevator_queue *e)
				4607	{
				4608	struct cfq_data *cfqd = e->elevator_data;
				4609	struct request_queue *q = cfqd->queue;
				4610
				4611	cfq_shutdown_timer_wq(cfqd);
				4612
				4613	spin_lock_irq(q->queue_lock);
				4614
				4615	if (cfqd->active_queue)
				4616	__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
				4617
				4618	spin_unlock_irq(q->queue_lock);
				4619
				4620	cfq_shutdown_timer_wq(cfqd);
				4621
				4622	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4623	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
				4624	#else
				4625	kfree(cfqd->root_group);
				4626	#endif
				4627	kfree(cfqd);
				4628	}
				4629
				4630	static int cfq_init_queue(struct request_queue q, struct elevator_type e)
				4631	{
				4632	struct cfq_data *cfqd;
				4633	struct blkcg_gq *blkg __maybe_unused;
				4634	int i, ret;
				4635	struct elevator_queue *eq;
				4636
				4637	eq = elevator_alloc(q, e);
				4638	if (!eq)
				4639	return -ENOMEM;
				4640
				4641	cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
				4642	if (!cfqd) {
				4643	kobject_put(&eq->kobj);
				4644	return -ENOMEM;
				4645	}
				4646	eq->elevator_data = cfqd;
				4647
				4648	cfqd->queue = q;
				4649	spin_lock_irq(q->queue_lock);
				4650	q->elevator = eq;
				4651	spin_unlock_irq(q->queue_lock);
				4652
				4653	/* Init root service tree */
				4654	cfqd->grp_service_tree = CFQ_RB_ROOT;
				4655
				4656	/* Init root group and prefer root group over other groups by default */
				4657	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4658	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
				4659	if (ret)
				4660	goto out_free;
				4661
				4662	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
				4663	#else
				4664	ret = -ENOMEM;
				4665	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
				4666	GFP_KERNEL, cfqd->queue->node);
				4667	if (!cfqd->root_group)
				4668	goto out_free;
				4669
				4670	cfq_init_cfqg_base(cfqd->root_group);
				4671	cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
				4672	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
				4673	#endif
				4674
				4675	/*
				4676	* Not strictly needed (since RB_ROOT just clears the node and we
				4677	* zeroed cfqd on alloc), but better be safe in case someone decides
				4678	* to add magic to the rb code
				4679	*/
				4680	for (i = 0; i < CFQ_PRIO_LISTS; i++)
				4681	cfqd->prio_trees[i] = RB_ROOT;
				4682
				4683	/*
				4684	* Our fallback cfqq if cfq_get_queue() runs into OOM issues.
				4685	* Grab a permanent reference to it, so that the normal code flow
				4686	* will not attempt to free it. oom_cfqq is linked to root_group
				4687	* but shouldn't hold a reference as it'll never be unlinked. Lose
				4688	* the reference from linking right away.
				4689	*/
				4690	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
				4691	cfqd->oom_cfqq.ref++;
				4692
				4693	spin_lock_irq(q->queue_lock);
				4694	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
				4695	cfqg_put(cfqd->root_group);
				4696	spin_unlock_irq(q->queue_lock);
				4697
				4698	hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
				4699	HRTIMER_MODE_REL);
				4700	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
				4701
				4702	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
				4703
				4704	cfqd->cfq_quantum = cfq_quantum;
				4705	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
				4706	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
				4707	cfqd->cfq_back_max = cfq_back_max;
				4708	cfqd->cfq_back_penalty = cfq_back_penalty;
				4709	cfqd->cfq_slice[0] = cfq_slice_async;
				4710	cfqd->cfq_slice[1] = cfq_slice_sync;
				4711	cfqd->cfq_target_latency = cfq_target_latency;
				4712	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
				4713	cfqd->cfq_slice_idle = cfq_slice_idle;
				4714	cfqd->cfq_group_idle = cfq_group_idle;
				4715	cfqd->cfq_latency = 1;
				4716	cfqd->hw_tag = -1;
				4717	/*
				4718	* we optimistically start assuming sync ops weren't delayed in last
				4719	* second, in order to have larger depth for async operations.
				4720	*/
				4721	cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
				4722	return 0;
				4723
				4724	out_free:
				4725	kfree(cfqd);
				4726	kobject_put(&eq->kobj);
				4727	return ret;
				4728	}
				4729
				4730	static void cfq_registered_queue(struct request_queue *q)
				4731	{
				4732	struct elevator_queue *e = q->elevator;
				4733	struct cfq_data *cfqd = e->elevator_data;
				4734
				4735	/*
				4736	* Default to IOPS mode with no idling for SSDs
				4737	*/
				4738	if (blk_queue_nonrot(q))
				4739	cfqd->cfq_slice_idle = 0;
				4740	wbt_disable_default(q);
				4741	}
				4742
				/*
				 * sysfs parts below -->
				 */

				/*
				 * cfq_var_show - format an unsigned value for a sysfs read
				 * @var: value to print
				 * @page: output buffer supplied by the caller
				 *
				 * Writes @var in decimal followed by a newline and returns the number of
				 * characters written, as reported by sprintf().
				 */
				static ssize_t
				cfq_var_show(unsigned int var, char *page)
				{
					return sprintf(page, "%u\n", var);
				}
				4751
				/*
				 * cfq_var_store - parse a decimal value from a sysfs write
				 * @var: where the parsed value is stored
				 * @page: input text
				 *
				 * The result of simple_strtoul() is stored unconditionally; no error is
				 * reported to the caller.
				 */
				static void
				cfq_var_store(unsigned int *var, const char *page)
				{
					/* simple_strtoul() takes a non-const end pointer, hence the cast */
					char *p = (char *) page;

					*var = simple_strtoul(p, &p, 10);
				}
				4759
				4760	#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
				4761	static ssize_t __FUNC(struct elevator_queue e, char page) \
				4762	{ \
				4763	struct cfq_data *cfqd = e->elevator_data; \
				4764	u64 __data = __VAR; \
				4765	if (__CONV) \
				4766	__data = div_u64(__data, NSEC_PER_MSEC); \
				4767	return cfq_var_show(__data, (page)); \
				4768	}
				4769	SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
				4770	SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
				4771	SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
				4772	SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
				4773	SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
				4774	SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
				4775	SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
				4776	SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
				4777	SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
				4778	SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
				4779	SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
				4780	SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
				4781	#undef SHOW_FUNCTION
				4782
				4783	#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
				4784	static ssize_t __FUNC(struct elevator_queue e, char page) \
				4785	{ \
				4786	struct cfq_data *cfqd = e->elevator_data; \
				4787	u64 __data = __VAR; \
				4788	__data = div_u64(__data, NSEC_PER_USEC); \
				4789	return cfq_var_show(__data, (page)); \
				4790	}
				4791	USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
				4792	USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
				4793	USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
				4794	USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
				4795	USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
				4796	#undef USEC_SHOW_FUNCTION
				4797
				4798	#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
				4799	static ssize_t __FUNC(struct elevator_queue e, const char page, size_t count) \
				4800	{ \
				4801	struct cfq_data *cfqd = e->elevator_data; \
				4802	unsigned int __data, __min = (MIN), __max = (MAX); \
				4803	\
				4804	cfq_var_store(&__data, (page)); \
				4805	if (__data < __min) \
				4806	__data = __min; \
				4807	else if (__data > __max) \
				4808	__data = __max; \
				4809	if (__CONV) \
				4810	(__PTR) = (u64)__data NSEC_PER_MSEC; \
				4811	else \
				4812	*(__PTR) = __data; \
				4813	return count; \
				4814	}
				4815	STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
				4816	STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
				4817	UINT_MAX, 1);
				4818	STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
				4819	UINT_MAX, 1);
				4820	STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
				4821	STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
				4822	UINT_MAX, 0);
				4823	STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
				4824	STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
				4825	STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
				4826	STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
				4827	STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
				4828	UINT_MAX, 0);
				4829	STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
				4830	STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
				4831	#undef STORE_FUNCTION
				4832
				4833	#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
				4834	static ssize_t __FUNC(struct elevator_queue e, const char page, size_t count) \
				4835	{ \
				4836	struct cfq_data *cfqd = e->elevator_data; \
				4837	unsigned int __data, __min = (MIN), __max = (MAX); \
				4838	\
				4839	cfq_var_store(&__data, (page)); \
				4840	if (__data < __min) \
				4841	__data = __min; \
				4842	else if (__data > __max) \
				4843	__data = __max; \
				4844	(__PTR) = (u64)__data NSEC_PER_USEC; \
				4845	return count; \
				4846	}
				4847	USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
				4848	USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
				4849	USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
				4850	USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
				4851	USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
				4852	#undef USEC_STORE_FUNCTION
				4853
				4854	#define CFQ_ATTR(name) \
				4855	__ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
				4856
				4857	static struct elv_fs_entry cfq_attrs[] = {
				4858	CFQ_ATTR(quantum),
				4859	CFQ_ATTR(fifo_expire_sync),
				4860	CFQ_ATTR(fifo_expire_async),
				4861	CFQ_ATTR(back_seek_max),
				4862	CFQ_ATTR(back_seek_penalty),
				4863	CFQ_ATTR(slice_sync),
				4864	CFQ_ATTR(slice_sync_us),
				4865	CFQ_ATTR(slice_async),
				4866	CFQ_ATTR(slice_async_us),
				4867	CFQ_ATTR(slice_async_rq),
				4868	CFQ_ATTR(slice_idle),
				4869	CFQ_ATTR(slice_idle_us),
				4870	CFQ_ATTR(group_idle),
				4871	CFQ_ATTR(group_idle_us),
				4872	CFQ_ATTR(low_latency),
				4873	CFQ_ATTR(target_latency),
				4874	CFQ_ATTR(target_latency_us),
				4875	__ATTR_NULL
				4876	};
				4877
				4878	static struct elevator_type iosched_cfq = {
				4879	.ops.sq = {
				4880	.elevator_merge_fn = cfq_merge,
				4881	.elevator_merged_fn = cfq_merged_request,
				4882	.elevator_merge_req_fn = cfq_merged_requests,
				4883	.elevator_allow_bio_merge_fn = cfq_allow_bio_merge,
				4884	.elevator_allow_rq_merge_fn = cfq_allow_rq_merge,
				4885	.elevator_bio_merged_fn = cfq_bio_merged,
				4886	.elevator_dispatch_fn = cfq_dispatch_requests,
				4887	.elevator_add_req_fn = cfq_insert_request,
				4888	.elevator_activate_req_fn = cfq_activate_request,
				4889	.elevator_deactivate_req_fn = cfq_deactivate_request,
				4890	.elevator_completed_req_fn = cfq_completed_request,
				4891	.elevator_former_req_fn = elv_rb_former_request,
				4892	.elevator_latter_req_fn = elv_rb_latter_request,
				4893	.elevator_init_icq_fn = cfq_init_icq,
				4894	.elevator_exit_icq_fn = cfq_exit_icq,
				4895	.elevator_set_req_fn = cfq_set_request,
				4896	.elevator_put_req_fn = cfq_put_request,
				4897	.elevator_may_queue_fn = cfq_may_queue,
				4898	.elevator_init_fn = cfq_init_queue,
				4899	.elevator_exit_fn = cfq_exit_queue,
				4900	.elevator_registered_fn = cfq_registered_queue,
				4901	},
				4902	.icq_size = sizeof(struct cfq_io_cq),
				4903	.icq_align = __alignof__(struct cfq_io_cq),
				4904	.elevator_attrs = cfq_attrs,
				4905	.elevator_name = "cfq",
				4906	.elevator_owner = THIS_MODULE,
				4907	};
				4908
				4909	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4910	static struct blkcg_policy blkcg_policy_cfq = {
				4911	.dfl_cftypes = cfq_blkcg_files,
				4912	.legacy_cftypes = cfq_blkcg_legacy_files,
				4913
				4914	.cpd_alloc_fn = cfq_cpd_alloc,
				4915	.cpd_init_fn = cfq_cpd_init,
				4916	.cpd_free_fn = cfq_cpd_free,
				4917	.cpd_bind_fn = cfq_cpd_bind,
				4918
				4919	.pd_alloc_fn = cfq_pd_alloc,
				4920	.pd_init_fn = cfq_pd_init,
				4921	.pd_offline_fn = cfq_pd_offline,
				4922	.pd_free_fn = cfq_pd_free,
				4923	.pd_reset_stats_fn = cfq_pd_reset_stats,
				4924	};
				4925	#endif
				4926
				4927	static int __init cfq_init(void)
				4928	{
				4929	int ret;
				4930
				4931	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4932	ret = blkcg_policy_register(&blkcg_policy_cfq);
				4933	if (ret)
				4934	return ret;
				4935	#else
				4936	cfq_group_idle = 0;
				4937	#endif
				4938
				4939	ret = -ENOMEM;
				4940	cfq_pool = KMEM_CACHE(cfq_queue, 0);
				4941	if (!cfq_pool)
				4942	goto err_pol_unreg;
				4943
				4944	ret = elv_register(&iosched_cfq);
				4945	if (ret)
				4946	goto err_free_pool;
				4947
				4948	return 0;
				4949
				4950	err_free_pool:
				4951	kmem_cache_destroy(cfq_pool);
				4952	err_pol_unreg:
				4953	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4954	blkcg_policy_unregister(&blkcg_policy_cfq);
				4955	#endif
				4956	return ret;
				4957	}
				4958
				4959	static void __exit cfq_exit(void)
				4960	{
				4961	#ifdef CONFIG_CFQ_GROUP_IOSCHED
				4962	blkcg_policy_unregister(&blkcg_policy_cfq);
				4963	#endif
				4964	elv_unregister(&iosched_cfq);
				4965	kmem_cache_destroy(cfq_pool);
				4966	}
				4967
				4968	module_init(cfq_init);
				4969	module_exit(cfq_exit);
				4970
				4971	MODULE_AUTHOR("Jens Axboe");
				4972	MODULE_LICENSE("GPL");
				4973	MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");