Blame - src/kernel/linux/v4.14/net/sched/sch_qfq.c - T103

blob: 6ddfd4991108ad9de057a22175d87f667c24f370 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame]	1	/*
				2	* net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler.
				3	*
				4	* Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
				5	* Copyright (c) 2012 Paolo Valente.
				6	*
				7	* This program is free software; you can redistribute it and/or
				8	* modify it under the terms of the GNU General Public License
				9	* version 2 as published by the Free Software Foundation.
				10	*/
				11
				12	#include <linux/module.h>
				13	#include <linux/init.h>
				14	#include <linux/bitops.h>
				15	#include <linux/errno.h>
				16	#include <linux/netdevice.h>
				17	#include <linux/pkt_sched.h>
				18	#include <net/sch_generic.h>
				19	#include <net/pkt_sched.h>
				20	#include <net/pkt_cls.h>
				21
				22
				23	/* Quick Fair Queueing Plus
				24	========================
				25
				26	Sources:
				27
				28	[1] Paolo Valente,
				29	"Reducing the Execution Time of Fair-Queueing Schedulers."
				30	http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf
				31
				32	Sources for QFQ:
				33
				34	[2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient
				35	Packet Scheduling with Tight Bandwidth Distribution Guarantees."
				36
				37	See also:
				38	http://retis.sssup.it/~fabio/linux/qfq/
				39	*/
				40
				41	/*
				42
				43	QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES
				44	classes. Each aggregate is timestamped with a virtual start time S
				45	and a virtual finish time F, and scheduled according to its
				46	timestamps. S and F are computed as a function of a system virtual
				47	time function V. The classes within each aggregate are instead
				48	scheduled with DRR.
				49
				50	To speed up operations, QFQ+ divides also aggregates into a limited
				51	number of groups. Which group a class belongs to depends on the
				52	ratio between the maximum packet length for the class and the weight
				53	of the class. Groups have their own S and F. In the end, QFQ+
				54	schedules groups, then aggregates within groups, then classes within
				55	aggregates. See [1] and [2] for a full description.
				56
				57	Virtual time computations.
				58
				59	S, F and V are all computed in fixed point arithmetic with
				60	FRAC_BITS decimal bits.
				61
				62	QFQ_MAX_INDEX is the maximum index allowed for a group. We need
				63	one bit per index.
				64	QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
				65
				66	The layout of the bits is as below:
				67
				68	[ MTU_SHIFT ][ FRAC_BITS ]
				69	[ MAX_INDEX ][ MIN_SLOT_SHIFT ]
				70	^.__grp->index = 0
				71	*.__grp->slot_shift
				72
				73	where MIN_SLOT_SHIFT is derived by difference from the others.
				74
				75	The max group index corresponds to Lmax/w_min, where
				76	Lmax=1<<MTU_SHIFT, w_min = 1 .
				77	From this, and knowing how many groups (MAX_INDEX) we want,
				78	we can derive the shift corresponding to each group.
				79
				80	Because we often need to compute
				81	F = S + len/w_i and V = V + len/wsum
				82	instead of storing w_i store the value
				83	inv_w = (1<<FRAC_BITS)/w_i
				84	so we can do F = S + len * inv_w * wsum.
				85	We use W_TOT in the formulas so we can easily move between
				86	static and adaptive weight sum.
				87
				88	The per-scheduler-instance data contain all the data structures
				89	for the scheduler: bitmaps and bucket lists.
				90
				91	*/
				92
/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.
 */
#define QFQ_MAX_SLOTS 32

/*
 * Shifts used for aggregate<->group mapping. We allow class weights that are
 * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the
 * group with the smallest index that can support the L_i / r_i configured
 * for the classes in the aggregate.
 *
 * grp->index is the index of the group; and grp->slot_shift
 * is the shift for the corresponding (scaled) sigma_i.
 */
/* Highest group index; struct qfq_sched holds QFQ_MAX_INDEX + 1 groups. */
#define QFQ_MAX_INDEX 24
/* log2 of the largest weight accepted for a single class. */
#define QFQ_MAX_WSHIFT 10

#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */
/* Upper bound on the sum of class weights, enforced in qfq_change_class(). */
#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT)

#define FRAC_BITS 30 /* fixed point arithmetic */
/* The value 1.0 in fixed point; inverse weights are ONE_FP / weight. */
#define ONE_FP (1UL << FRAC_BITS)

/*
 * A configured lmax must lie in [QFQ_MIN_LMAX, 1 << QFQ_MTU_SHIFT]
 * (checked in qfq_change_class()).
 */
#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */
#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */

#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */
				121
				122	/*
				123	* Possible group states. These values are used as indexes for the bitmaps
				124	* array of struct qfq_queue.
				125	*/
				126	enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
				127
				128	struct qfq_group;
				129
				130	struct qfq_aggregate;
				131
				132	struct qfq_class {
				133	struct Qdisc_class_common common;
				134
				135	unsigned int filter_cnt;
				136
				137	struct gnet_stats_basic_packed bstats;
				138	struct gnet_stats_queue qstats;
				139	struct net_rate_estimator __rcu *rate_est;
				140	struct Qdisc *qdisc;
				141	struct list_head alist; /* Link for active-classes list. */
				142	struct qfq_aggregate agg; / Parent aggregate. */
				143	int deficit; /* DRR deficit counter. */
				144	};
				145
				146	struct qfq_aggregate {
				147	struct hlist_node next; /* Link for the slot list. */
				148	u64 S, F; /* flow timestamps (exact) */
				149
				150	/* group we belong to. In principle we would need the index,
				151	* which is log_2(lmax/weight), but we never reference it
				152	* directly, only the group.
				153	*/
				154	struct qfq_group *grp;
				155
				156	/* these are copied from the flowset. */
				157	u32 class_weight; /* Weight of each class in this aggregate. */
				158	/* Max pkt size for the classes in this aggregate, DRR quantum. */
				159	int lmax;
				160
				161	u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */
				162	u32 budgetmax; /* Max budget for this aggregate. */
				163	u32 initial_budget, budget; /* Initial and current budget. */
				164
				165	int num_classes; /* Number of classes in this aggr. */
				166	struct list_head active; /* DRR queue of active classes. */
				167
				168	struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */
				169	};
				170
				171	struct qfq_group {
				172	u64 S, F; /* group timestamps (approx). */
				173	unsigned int slot_shift; /* Slot shift. */
				174	unsigned int index; /* Group index. */
				175	unsigned int front; /* Index of the front slot. */
				176	unsigned long full_slots; /* non-empty slots */
				177
				178	/* Array of RR lists of active aggregates. */
				179	struct hlist_head slots[QFQ_MAX_SLOTS];
				180	};
				181
				182	struct qfq_sched {
				183	struct tcf_proto __rcu *filter_list;
				184	struct tcf_block *block;
				185	struct Qdisc_class_hash clhash;
				186
				187	u64 oldV, V; /* Precise virtual times. */
				188	struct qfq_aggregate in_serv_agg; / Aggregate being served. */
				189	u32 wsum; /* weight sum */
				190	u32 iwsum; /* inverse weight sum */
				191
				192	unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
				193	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
				194	u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */
				195
				196	u32 max_agg_classes; /* Max number of classes per aggr. */
				197	struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */
				198	};
				199
				200	/*
				201	* Possible reasons why the timestamps of an aggregate are updated
				202	* enqueue: the aggregate switches from idle to active and must scheduled
				203	* for service
				204	* requeue: the aggregate finishes its budget, so it stops being served and
				205	* must be rescheduled for service
				206	*/
				207	enum update_reason {enqueue, requeue};
				208
				209	static struct qfq_class qfq_find_class(struct Qdisc sch, u32 classid)
				210	{
				211	struct qfq_sched *q = qdisc_priv(sch);
				212	struct Qdisc_class_common *clc;
				213
				214	clc = qdisc_class_find(&q->clhash, classid);
				215	if (clc == NULL)
				216	return NULL;
				217	return container_of(clc, struct qfq_class, common);
				218	}
				219
				220	static void qfq_purge_queue(struct qfq_class *cl)
				221	{
				222	unsigned int len = cl->qdisc->q.qlen;
				223	unsigned int backlog = cl->qdisc->qstats.backlog;
				224
				225	qdisc_reset(cl->qdisc);
				226	qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
				227	}
				228
				229	static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
				230	[TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
				231	[TCA_QFQ_LMAX] = { .type = NLA_U32 },
				232	};
				233
				234	/*
				235	* Calculate a flow index, given its weight and maximum packet length.
				236	* index = log_2(maxlen/weight) but we need to apply the scaling.
				237	* This is used only once at flow creation.
				238	*/
				239	static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift)
				240	{
				241	u64 slot_size = (u64)maxlen * inv_w;
				242	unsigned long size_map;
				243	int index = 0;
				244
				245	size_map = slot_size >> min_slot_shift;
				246	if (!size_map)
				247	goto out;
				248
				249	index = __fls(size_map) + 1; /* basically a log_2 */
				250	index -= !(slot_size - (1ULL << (index + min_slot_shift - 1)));
				251
				252	if (index < 0)
				253	index = 0;
				254	out:
				255	pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
				256	(unsigned long) ONE_FP/inv_w, maxlen, index);
				257
				258	return index;
				259	}
				260
				261	static void qfq_deactivate_agg(struct qfq_sched , struct qfq_aggregate );
				262	static void qfq_activate_agg(struct qfq_sched , struct qfq_aggregate ,
				263	enum update_reason);
				264
				265	static void qfq_init_agg(struct qfq_sched q, struct qfq_aggregate agg,
				266	u32 lmax, u32 weight)
				267	{
				268	INIT_LIST_HEAD(&agg->active);
				269	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
				270
				271	agg->lmax = lmax;
				272	agg->class_weight = weight;
				273	}
				274
				275	static struct qfq_aggregate qfq_find_agg(struct qfq_sched q,
				276	u32 lmax, u32 weight)
				277	{
				278	struct qfq_aggregate *agg;
				279
				280	hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next)
				281	if (agg->lmax == lmax && agg->class_weight == weight)
				282	return agg;
				283
				284	return NULL;
				285	}
				286
				287
				288	/* Update aggregate as a function of the new number of classes. */
				289	static void qfq_update_agg(struct qfq_sched q, struct qfq_aggregate agg,
				290	int new_num_classes)
				291	{
				292	u32 new_agg_weight;
				293
				294	if (new_num_classes == q->max_agg_classes)
				295	hlist_del_init(&agg->nonfull_next);
				296
				297	if (agg->num_classes > new_num_classes &&
				298	new_num_classes == q->max_agg_classes - 1) /* agg no more full */
				299	hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs);
				300
				301	/* The next assignment may let
				302	* agg->initial_budget > agg->budgetmax
				303	* hold, we will take it into account in charge_actual_service().
				304	*/
				305	agg->budgetmax = new_num_classes * agg->lmax;
				306	new_agg_weight = agg->class_weight * new_num_classes;
				307	agg->inv_w = ONE_FP/new_agg_weight;
				308
				309	if (agg->grp == NULL) {
				310	int i = qfq_calc_index(agg->inv_w, agg->budgetmax,
				311	q->min_slot_shift);
				312	agg->grp = &q->groups[i];
				313	}
				314
				315	q->wsum +=
				316	(int) agg->class_weight * (new_num_classes - agg->num_classes);
				317	q->iwsum = ONE_FP / q->wsum;
				318
				319	agg->num_classes = new_num_classes;
				320	}
				321
				322	/* Add class to aggregate. */
				323	static void qfq_add_to_agg(struct qfq_sched *q,
				324	struct qfq_aggregate *agg,
				325	struct qfq_class *cl)
				326	{
				327	cl->agg = agg;
				328
				329	qfq_update_agg(q, agg, agg->num_classes+1);
				330	if (cl->qdisc->q.qlen > 0) { /* adding an active class */
				331	list_add_tail(&cl->alist, &agg->active);
				332	if (list_first_entry(&agg->active, struct qfq_class, alist) ==
				333	cl && q->in_serv_agg != agg) /* agg was inactive */
				334	qfq_activate_agg(q, agg, enqueue); /* schedule agg */
				335	}
				336	}
				337
				338	static struct qfq_aggregate qfq_choose_next_agg(struct qfq_sched );
				339
				340	static void qfq_destroy_agg(struct qfq_sched q, struct qfq_aggregate agg)
				341	{
				342	hlist_del_init(&agg->nonfull_next);
				343	q->wsum -= agg->class_weight;
				344	if (q->wsum != 0)
				345	q->iwsum = ONE_FP / q->wsum;
				346
				347	if (q->in_serv_agg == agg)
				348	q->in_serv_agg = qfq_choose_next_agg(q);
				349	kfree(agg);
				350	}
				351
				352	/* Deschedule class from within its parent aggregate. */
				353	static void qfq_deactivate_class(struct qfq_sched q, struct qfq_class cl)
				354	{
				355	struct qfq_aggregate *agg = cl->agg;
				356
				357
				358	list_del(&cl->alist); /* remove from RR queue of the aggregate */
				359	if (list_empty(&agg->active)) /* agg is now inactive */
				360	qfq_deactivate_agg(q, agg);
				361	}
				362
				363	/* Remove class from its parent aggregate. */
				364	static void qfq_rm_from_agg(struct qfq_sched q, struct qfq_class cl)
				365	{
				366	struct qfq_aggregate *agg = cl->agg;
				367
				368	cl->agg = NULL;
				369	if (agg->num_classes == 1) { /* agg being emptied, destroy it */
				370	qfq_destroy_agg(q, agg);
				371	return;
				372	}
				373	qfq_update_agg(q, agg, agg->num_classes-1);
				374	}
				375
				376	/* Deschedule class and remove it from its parent aggregate. */
				377	static void qfq_deact_rm_from_agg(struct qfq_sched q, struct qfq_class cl)
				378	{
				379	if (cl->qdisc->q.qlen > 0) /* class is active */
				380	qfq_deactivate_class(q, cl);
				381
				382	qfq_rm_from_agg(q, cl);
				383	}
				384
				385	/* Move class to a new aggregate, matching the new class weight and/or lmax */
				386	static int qfq_change_agg(struct Qdisc sch, struct qfq_class cl, u32 weight,
				387	u32 lmax)
				388	{
				389	struct qfq_sched *q = qdisc_priv(sch);
				390	struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight);
				391
				392	if (new_agg == NULL) { /* create new aggregate */
				393	new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC);
				394	if (new_agg == NULL)
				395	return -ENOBUFS;
				396	qfq_init_agg(q, new_agg, lmax, weight);
				397	}
				398	qfq_deact_rm_from_agg(q, cl);
				399	qfq_add_to_agg(q, new_agg, cl);
				400
				401	return 0;
				402	}
				403
				404	static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
				405	struct nlattr *tca, unsigned long arg)
				406	{
				407	struct qfq_sched *q = qdisc_priv(sch);
				408	struct qfq_class cl = (struct qfq_class )*arg;
				409	bool existing = false;
				410	struct nlattr *tb[TCA_QFQ_MAX + 1];
				411	struct qfq_aggregate *new_agg = NULL;
				412	u32 weight, lmax, inv_w;
				413	int err;
				414	int delta_w;
				415
				416	if (tca[TCA_OPTIONS] == NULL) {
				417	pr_notice("qfq: no options\n");
				418	return -EINVAL;
				419	}
				420
				421	err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy,
				422	NULL);
				423	if (err < 0)
				424	return err;
				425
				426	if (tb[TCA_QFQ_WEIGHT]) {
				427	weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
				428	if (!weight \|\| weight > (1UL << QFQ_MAX_WSHIFT)) {
				429	pr_notice("qfq: invalid weight %u\n", weight);
				430	return -EINVAL;
				431	}
				432	} else
				433	weight = 1;
				434
				435	if (tb[TCA_QFQ_LMAX]) {
				436	lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
				437	if (lmax < QFQ_MIN_LMAX \|\| lmax > (1UL << QFQ_MTU_SHIFT)) {
				438	pr_notice("qfq: invalid max length %u\n", lmax);
				439	return -EINVAL;
				440	}
				441	} else
				442	lmax = psched_mtu(qdisc_dev(sch));
				443
				444	inv_w = ONE_FP / weight;
				445	weight = ONE_FP / inv_w;
				446
				447	if (cl != NULL &&
				448	lmax == cl->agg->lmax &&
				449	weight == cl->agg->class_weight)
				450	return 0; /* nothing to change */
				451
				452	delta_w = weight - (cl ? cl->agg->class_weight : 0);
				453
				454	if (q->wsum + delta_w > QFQ_MAX_WSUM) {
				455	pr_notice("qfq: total weight out of range (%d + %u)\n",
				456	delta_w, q->wsum);
				457	return -EINVAL;
				458	}
				459
				460	if (cl != NULL) { /* modify existing class */
				461	if (tca[TCA_RATE]) {
				462	err = gen_replace_estimator(&cl->bstats, NULL,
				463	&cl->rate_est,
				464	NULL,
				465	qdisc_root_sleeping_running(sch),
				466	tca[TCA_RATE]);
				467	if (err)
				468	return err;
				469	}
				470	existing = true;
				471	goto set_change_agg;
				472	}
				473
				474	/* create and init new class */
				475	cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
				476	if (cl == NULL)
				477	return -ENOBUFS;
				478
				479	cl->common.classid = classid;
				480	cl->deficit = lmax;
				481
				482	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
				483	&pfifo_qdisc_ops, classid);
				484	if (cl->qdisc == NULL)
				485	cl->qdisc = &noop_qdisc;
				486
				487	if (tca[TCA_RATE]) {
				488	err = gen_new_estimator(&cl->bstats, NULL,
				489	&cl->rate_est,
				490	NULL,
				491	qdisc_root_sleeping_running(sch),
				492	tca[TCA_RATE]);
				493	if (err)
				494	goto destroy_class;
				495	}
				496
				497	if (cl->qdisc != &noop_qdisc)
				498	qdisc_hash_add(cl->qdisc, true);
				499	sch_tree_lock(sch);
				500	qdisc_class_hash_insert(&q->clhash, &cl->common);
				501	sch_tree_unlock(sch);
				502
				503	qdisc_class_hash_grow(sch, &q->clhash);
				504
				505	set_change_agg:
				506	sch_tree_lock(sch);
				507	new_agg = qfq_find_agg(q, lmax, weight);
				508	if (new_agg == NULL) { /* create new aggregate */
				509	sch_tree_unlock(sch);
				510	new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
				511	if (new_agg == NULL) {
				512	err = -ENOBUFS;
				513	gen_kill_estimator(&cl->rate_est);
				514	goto destroy_class;
				515	}
				516	sch_tree_lock(sch);
				517	qfq_init_agg(q, new_agg, lmax, weight);
				518	}
				519	if (existing)
				520	qfq_deact_rm_from_agg(q, cl);
				521	qfq_add_to_agg(q, new_agg, cl);
				522	sch_tree_unlock(sch);
				523
				524	*arg = (unsigned long)cl;
				525	return 0;
				526
				527	destroy_class:
				528	qdisc_destroy(cl->qdisc);
				529	kfree(cl);
				530	return err;
				531	}
				532
				533	static void qfq_destroy_class(struct Qdisc sch, struct qfq_class cl)
				534	{
				535	struct qfq_sched *q = qdisc_priv(sch);
				536
				537	qfq_rm_from_agg(q, cl);
				538	gen_kill_estimator(&cl->rate_est);
				539	qdisc_destroy(cl->qdisc);
				540	kfree(cl);
				541	}
				542
				543	static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
				544	{
				545	struct qfq_sched *q = qdisc_priv(sch);
				546	struct qfq_class cl = (struct qfq_class )arg;
				547
				548	if (cl->filter_cnt > 0)
				549	return -EBUSY;
				550
				551	sch_tree_lock(sch);
				552
				553	qfq_purge_queue(cl);
				554	qdisc_class_hash_remove(&q->clhash, &cl->common);
				555
				556	sch_tree_unlock(sch);
				557
				558	qfq_destroy_class(sch, cl);
				559	return 0;
				560	}
				561
				562	static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid)
				563	{
				564	return (unsigned long)qfq_find_class(sch, classid);
				565	}
				566
				567	static struct tcf_block qfq_tcf_block(struct Qdisc sch, unsigned long cl)
				568	{
				569	struct qfq_sched *q = qdisc_priv(sch);
				570
				571	if (cl)
				572	return NULL;
				573
				574	return q->block;
				575	}
				576
				577	static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
				578	u32 classid)
				579	{
				580	struct qfq_class *cl = qfq_find_class(sch, classid);
				581
				582	if (cl != NULL)
				583	cl->filter_cnt++;
				584
				585	return (unsigned long)cl;
				586	}
				587
				588	static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
				589	{
				590	struct qfq_class cl = (struct qfq_class )arg;
				591
				592	cl->filter_cnt--;
				593	}
				594
				595	static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
				596	struct Qdisc new, struct Qdisc *old)
				597	{
				598	struct qfq_class cl = (struct qfq_class )arg;
				599
				600	if (new == NULL) {
				601	new = qdisc_create_dflt(sch->dev_queue,
				602	&pfifo_qdisc_ops, cl->common.classid);
				603	if (new == NULL)
				604	new = &noop_qdisc;
				605	}
				606
				607	*old = qdisc_replace(sch, new, &cl->qdisc);
				608	return 0;
				609	}
				610
				611	static struct Qdisc qfq_class_leaf(struct Qdisc sch, unsigned long arg)
				612	{
				613	struct qfq_class cl = (struct qfq_class )arg;
				614
				615	return cl->qdisc;
				616	}
				617
				618	static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
				619	struct sk_buff skb, struct tcmsg tcm)
				620	{
				621	struct qfq_class cl = (struct qfq_class )arg;
				622	struct nlattr *nest;
				623
				624	tcm->tcm_parent = TC_H_ROOT;
				625	tcm->tcm_handle = cl->common.classid;
				626	tcm->tcm_info = cl->qdisc->handle;
				627
				628	nest = nla_nest_start(skb, TCA_OPTIONS);
				629	if (nest == NULL)
				630	goto nla_put_failure;
				631	if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) \|\|
				632	nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax))
				633	goto nla_put_failure;
				634	return nla_nest_end(skb, nest);
				635
				636	nla_put_failure:
				637	nla_nest_cancel(skb, nest);
				638	return -EMSGSIZE;
				639	}
				640
				641	static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				642	struct gnet_dump *d)
				643	{
				644	struct qfq_class cl = (struct qfq_class )arg;
				645	struct tc_qfq_stats xstats;
				646
				647	memset(&xstats, 0, sizeof(xstats));
				648
				649	xstats.weight = cl->agg->class_weight;
				650	xstats.lmax = cl->agg->lmax;
				651
				652	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				653	d, NULL, &cl->bstats) < 0 \|\|
				654	gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 \|\|
				655	gnet_stats_copy_queue(d, NULL,
				656	&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
				657	return -1;
				658
				659	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
				660	}
				661
				662	static void qfq_walk(struct Qdisc sch, struct qdisc_walker arg)
				663	{
				664	struct qfq_sched *q = qdisc_priv(sch);
				665	struct qfq_class *cl;
				666	unsigned int i;
				667
				668	if (arg->stop)
				669	return;
				670
				671	for (i = 0; i < q->clhash.hashsize; i++) {
				672	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
				673	if (arg->count < arg->skip) {
				674	arg->count++;
				675	continue;
				676	}
				677	if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				678	arg->stop = 1;
				679	return;
				680	}
				681	arg->count++;
				682	}
				683	}
				684	}
				685
				686	static struct qfq_class qfq_classify(struct sk_buff skb, struct Qdisc *sch,
				687	int *qerr)
				688	{
				689	struct qfq_sched *q = qdisc_priv(sch);
				690	struct qfq_class *cl;
				691	struct tcf_result res;
				692	struct tcf_proto *fl;
				693	int result;
				694
				695	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
				696	pr_debug("qfq_classify: found %d\n", skb->priority);
				697	cl = qfq_find_class(sch, skb->priority);
				698	if (cl != NULL)
				699	return cl;
				700	}
				701
				702	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				703	fl = rcu_dereference_bh(q->filter_list);
				704	result = tcf_classify(skb, fl, &res, false);
				705	if (result >= 0) {
				706	#ifdef CONFIG_NET_CLS_ACT
				707	switch (result) {
				708	case TC_ACT_QUEUED:
				709	case TC_ACT_STOLEN:
				710	case TC_ACT_TRAP:
				711	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
				712	case TC_ACT_SHOT:
				713	return NULL;
				714	}
				715	#endif
				716	cl = (struct qfq_class *)res.class;
				717	if (cl == NULL)
				718	cl = qfq_find_class(sch, res.classid);
				719	return cl;
				720	}
				721
				722	return NULL;
				723	}
				724
				725	/* Generic comparison function, handling wraparound. */
				726	static inline int qfq_gt(u64 a, u64 b)
				727	{
				728	return (s64)(a - b) > 0;
				729	}
				730
				731	/* Round a precise timestamp to its slotted value. */
				732	static inline u64 qfq_round_down(u64 ts, unsigned int shift)
				733	{
				734	return ts & ~((1ULL << shift) - 1);
				735	}
				736
				737	/* return the pointer to the group with lowest index in the bitmap */
				738	static inline struct qfq_group qfq_ffs(struct qfq_sched q,
				739	unsigned long bitmap)
				740	{
				741	int index = __ffs(bitmap);
				742	return &q->groups[index];
				743	}
				744	/* Calculate a mask to mimic what would be ffs_from(). */
				745	static inline unsigned long mask_from(unsigned long bitmap, int from)
				746	{
				747	return bitmap & ~((1UL << from) - 1);
				748	}
				749
				750	/*
				751	* The state computation relies on ER=0, IR=1, EB=2, IB=3
				752	* First compute eligibility comparing grp->S, q->V,
				753	* then check if someone is blocking us and possibly add EB
				754	*/
				755	static int qfq_calc_state(struct qfq_sched q, const struct qfq_group grp)
				756	{
				757	/* if S > V we are not eligible */
				758	unsigned int state = qfq_gt(grp->S, q->V);
				759	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
				760	struct qfq_group *next;
				761
				762	if (mask) {
				763	next = qfq_ffs(q, mask);
				764	if (qfq_gt(grp->F, next->F))
				765	state \|= EB;
				766	}
				767
				768	return state;
				769	}
				770
				771
				772	/*
				773	* In principle
				774	* q->bitmaps[dst] \|= q->bitmaps[src] & mask;
				775	* q->bitmaps[src] &= ~mask;
				776	* but we should make sure that src != dst
				777	*/
				778	static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
				779	int src, int dst)
				780	{
				781	q->bitmaps[dst] \|= q->bitmaps[src] & mask;
				782	q->bitmaps[src] &= ~mask;
				783	}
				784
				785	static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
				786	{
				787	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
				788	struct qfq_group *next;
				789
				790	if (mask) {
				791	next = qfq_ffs(q, mask);
				792	if (!qfq_gt(next->F, old_F))
				793	return;
				794	}
				795
				796	mask = (1UL << index) - 1;
				797	qfq_move_groups(q, mask, EB, ER);
				798	qfq_move_groups(q, mask, IB, IR);
				799	}
				800
				801	/*
				802	* perhaps
				803	*
				804	old_V ^= q->V;
				805	old_V >>= q->min_slot_shift;
				806	if (old_V) {
				807	...
				808	}
				809	*
				810	*/
				811	static void qfq_make_eligible(struct qfq_sched *q)
				812	{
				813	unsigned long vslot = q->V >> q->min_slot_shift;
				814	unsigned long old_vslot = q->oldV >> q->min_slot_shift;
				815
				816	if (vslot != old_vslot) {
				817	unsigned long mask;
				818	int last_flip_pos = fls(vslot ^ old_vslot);
				819
				820	if (last_flip_pos > 31) /* higher than the number of groups */
				821	mask = ~0UL; /* make all groups eligible */
				822	else
				823	mask = (1UL << last_flip_pos) - 1;
				824
				825	qfq_move_groups(q, mask, IR, ER);
				826	qfq_move_groups(q, mask, IB, EB);
				827	}
				828	}
				829
				830	/*
				831	* The index of the slot in which the input aggregate agg is to be
				832	* inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2'
				833	* and not a '-1' because the start time of the group may be moved
				834	* backward by one slot after the aggregate has been inserted, and
				835	* this would cause non-empty slots to be right-shifted by one
				836	* position.
				837	*
				838	* QFQ+ fully satisfies this bound to the slot index if the parameters
				839	* of the classes are not changed dynamically, and if QFQ+ never
				840	* happens to postpone the service of agg unjustly, i.e., it never
				841	* happens that the aggregate becomes backlogged and eligible, or just
				842	* eligible, while an aggregate with a higher approximated finish time
				843	* is being served. In particular, in this case QFQ+ guarantees that
				844	* the timestamps of agg are low enough that the slot index is never
				845	* higher than 2. Unfortunately, QFQ+ cannot provide the same
				846	* guarantee if it happens to unjustly postpone the service of agg, or
				847	* if the parameters of some class are changed.
				848	*
				849	* As for the first event, i.e., an out-of-order service, the
				850	* upper bound to the slot index guaranteed by QFQ+ grows to
				851	* 2 +
				852	* QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) *
				853	* (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1.
				854	*
				855	* The following function deals with this problem by backward-shifting
				856	* the timestamps of agg, if needed, so as to guarantee that the slot
				857	* index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may
				858	* cause the service of other aggregates to be postponed, yet the
				859	* worst-case guarantees of these aggregates are not violated. In
				860	* fact, in case of no out-of-order service, the timestamps of agg
				861	* would have been even lower than they are after the backward shift,
				862	* because QFQ+ would have guaranteed a maximum value equal to 2 for
				863	* the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose
				864	* service is postponed because of the backward-shift would have
				865	* however waited for the service of agg before being served.
				866	*
				867	* The other event that may cause the slot index to be higher than 2
				868	* for agg is a recent change of the parameters of some class. If the
				869	* weight of a class is increased or the lmax (max_pkt_size) of the
				870	* class is decreased, then a new aggregate with smaller slot size
				871	* than the original parent aggregate of the class may happen to be
				872	* activated. The activation of this aggregate should be properly
				873	* delayed to when the service of the class has finished in the ideal
				874	* system tracked by QFQ+. If the activation of the aggregate is not
				875	* delayed to this reference time instant, then this aggregate may be
				876	* unjustly served before other aggregates waiting for service. This
				877	* may cause the above bound to the slot index to be violated for some
				878	* of these unlucky aggregates.
				879	*
				880	* Instead of delaying the activation of the new aggregate, which is
				881	* quite complex, the above-discussed capping of the slot index is
				882	* used to handle also the consequences of a change of the parameters
				883	* of a class.
				884	*/
				885	static void qfq_slot_insert(struct qfq_group grp, struct qfq_aggregate agg,
				886	u64 roundedS)
				887	{
				888	u64 slot = (roundedS - grp->S) >> grp->slot_shift;
				889	unsigned int i; /* slot index in the bucket list */
				890
				891	if (unlikely(slot > QFQ_MAX_SLOTS - 2)) {
				892	u64 deltaS = roundedS - grp->S -
				893	((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift);
				894	agg->S -= deltaS;
				895	agg->F -= deltaS;
				896	slot = QFQ_MAX_SLOTS - 2;
				897	}
				898
				899	i = (grp->front + slot) % QFQ_MAX_SLOTS;
				900
				901	hlist_add_head(&agg->next, &grp->slots[i]);
				902	__set_bit(slot, &grp->full_slots);
				903	}
				904
				905	/* Maybe introduce hlist_first_entry?? */
				906	static struct qfq_aggregate qfq_slot_head(struct qfq_group grp)
				907	{
				908	return hlist_entry(grp->slots[grp->front].first,
				909	struct qfq_aggregate, next);
				910	}
				911
				912	/*
				913	* remove the entry from the slot
				914	*/
				915	static void qfq_front_slot_remove(struct qfq_group *grp)
				916	{
				917	struct qfq_aggregate *agg = qfq_slot_head(grp);
				918
				919	BUG_ON(!agg);
				920	hlist_del(&agg->next);
				921	if (hlist_empty(&grp->slots[grp->front]))
				922	__clear_bit(0, &grp->full_slots);
				923	}
				924
				925	/*
				926	* Returns the first aggregate in the first non-empty bucket of the
				927	* group. As a side effect, adjusts the bucket list so the first
				928	* non-empty bucket is at position 0 in full_slots.
				929	*/
				930	static struct qfq_aggregate qfq_slot_scan(struct qfq_group grp)
				931	{
				932	unsigned int i;
				933
				934	pr_debug("qfq slot_scan: grp %u full %#lx\n",
				935	grp->index, grp->full_slots);
				936
				937	if (grp->full_slots == 0)
				938	return NULL;
				939
				940	i = __ffs(grp->full_slots); /* zero based */
				941	if (i > 0) {
				942	grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
				943	grp->full_slots >>= i;
				944	}
				945
				946	return qfq_slot_head(grp);
				947	}
				948
				949	/*
				950	* adjust the bucket list. When the start time of a group decreases,
				951	* we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
				952	* move the objects. The mask of occupied slots must be shifted
				953	* because we use ffs() to find the first non-empty slot.
				954	* This covers decreases in the group's start time, but what about
				955	* increases of the start time ?
				956	* Here too we should make sure that i is less than 32
				957	*/
				958	static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
				959	{
				960	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
				961
				962	grp->full_slots <<= i;
				963	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
				964	}
				965
				966	static void qfq_update_eligible(struct qfq_sched *q)
				967	{
				968	struct qfq_group *grp;
				969	unsigned long ineligible;
				970
				971	ineligible = q->bitmaps[IR] \| q->bitmaps[IB];
				972	if (ineligible) {
				973	if (!q->bitmaps[ER]) {
				974	grp = qfq_ffs(q, ineligible);
				975	if (qfq_gt(grp->S, q->V))
				976	q->V = grp->S;
				977	}
				978	qfq_make_eligible(q);
				979	}
				980	}
				981
				982	/* Dequeue head packet of the head class in the DRR queue of the aggregate. */
				983	static void agg_dequeue(struct qfq_aggregate *agg,
				984	struct qfq_class *cl, unsigned int len)
				985	{
				986	qdisc_dequeue_peeked(cl->qdisc);
				987
				988	cl->deficit -= (int) len;
				989
				990	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
				991	list_del(&cl->alist);
				992	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
				993	cl->deficit += agg->lmax;
				994	list_move_tail(&cl->alist, &agg->active);
				995	}
				996	}
				997
				998	static inline struct sk_buff qfq_peek_skb(struct qfq_aggregate agg,
				999	struct qfq_class **cl,
				1000	unsigned int *len)
				1001	{
				1002	struct sk_buff *skb;
				1003
				1004	*cl = list_first_entry(&agg->active, struct qfq_class, alist);
				1005	skb = (cl)->qdisc->ops->peek((cl)->qdisc);
				1006	if (skb == NULL)
				1007	WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
				1008	else
				1009	*len = qdisc_pkt_len(skb);
				1010
				1011	return skb;
				1012	}
				1013
				1014	/* Update F according to the actual service received by the aggregate. */
				1015	static inline void charge_actual_service(struct qfq_aggregate *agg)
				1016	{
				1017	/* Compute the service received by the aggregate, taking into
				1018	* account that, after decreasing the number of classes in
				1019	* agg, it may happen that
				1020	* agg->initial_budget - agg->budget > agg->bugdetmax
				1021	*/
				1022	u32 service_received = min(agg->budgetmax,
				1023	agg->initial_budget - agg->budget);
				1024
				1025	agg->F = agg->S + (u64)service_received * agg->inv_w;
				1026	}
				1027
				1028	/* Assign a reasonable start time for a new aggregate in group i.
				1029	* Admissible values for \hat(F) are multiples of \sigma_i
				1030	* no greater than V+\sigma_i . Larger values mean that
				1031	* we had a wraparound so we consider the timestamp to be stale.
				1032	*
				1033	* If F is not stale and F >= V then we set S = F.
				1034	* Otherwise we should assign S = V, but this may violate
				1035	* the ordering in EB (see [2]). So, if we have groups in ER,
				1036	* set S to the F_j of the first group j which would be blocking us.
				1037	* We are guaranteed not to move S backward because
				1038	* otherwise our group i would still be blocked.
				1039	*/
				1040	static void qfq_update_start(struct qfq_sched q, struct qfq_aggregate agg)
				1041	{
				1042	unsigned long mask;
				1043	u64 limit, roundedF;
				1044	int slot_shift = agg->grp->slot_shift;
				1045
				1046	roundedF = qfq_round_down(agg->F, slot_shift);
				1047	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
				1048
				1049	if (!qfq_gt(agg->F, q->V) \|\| qfq_gt(roundedF, limit)) {
				1050	/* timestamp was stale */
				1051	mask = mask_from(q->bitmaps[ER], agg->grp->index);
				1052	if (mask) {
				1053	struct qfq_group *next = qfq_ffs(q, mask);
				1054	if (qfq_gt(roundedF, next->F)) {
				1055	if (qfq_gt(limit, next->F))
				1056	agg->S = next->F;
				1057	else /* preserve timestamp correctness */
				1058	agg->S = limit;
				1059	return;
				1060	}
				1061	}
				1062	agg->S = q->V;
				1063	} else /* timestamp is not stale */
				1064	agg->S = agg->F;
				1065	}
				1066
				1067	/* Update the timestamps of agg before scheduling/rescheduling it for
				1068	* service. In particular, assign to agg->F its maximum possible
				1069	* value, i.e., the virtual finish time with which the aggregate
				1070	* should be labeled if it used all its budget once in service.
				1071	*/
				1072	static inline void
				1073	qfq_update_agg_ts(struct qfq_sched *q,
				1074	struct qfq_aggregate *agg, enum update_reason reason)
				1075	{
				1076	if (reason != requeue)
				1077	qfq_update_start(q, agg);
				1078	else /* just charge agg for the service received */
				1079	agg->S = agg->F;
				1080
				1081	agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w;
				1082	}
				1083
				/* Forward declaration: qfq_dequeue() calls it before its definition. */
				static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg);
				1085
				1086	static struct sk_buff qfq_dequeue(struct Qdisc sch)
				1087	{
				1088	struct qfq_sched *q = qdisc_priv(sch);
				1089	struct qfq_aggregate *in_serv_agg = q->in_serv_agg;
				1090	struct qfq_class *cl;
				1091	struct sk_buff *skb = NULL;
				1092	/* next-packet len, 0 means no more active classes in in-service agg */
				1093	unsigned int len = 0;
				1094
				1095	if (in_serv_agg == NULL)
				1096	return NULL;
				1097
				1098	if (!list_empty(&in_serv_agg->active))
				1099	skb = qfq_peek_skb(in_serv_agg, &cl, &len);
				1100
				1101	/*
				1102	* If there are no active classes in the in-service aggregate,
				1103	* or if the aggregate has not enough budget to serve its next
				1104	* class, then choose the next aggregate to serve.
				1105	*/
				1106	if (len == 0 \|\| in_serv_agg->budget < len) {
				1107	charge_actual_service(in_serv_agg);
				1108
				1109	/* recharge the budget of the aggregate */
				1110	in_serv_agg->initial_budget = in_serv_agg->budget =
				1111	in_serv_agg->budgetmax;
				1112
				1113	if (!list_empty(&in_serv_agg->active)) {
				1114	/*
				1115	* Still active: reschedule for
				1116	* service. Possible optimization: if no other
				1117	* aggregate is active, then there is no point
				1118	* in rescheduling this aggregate, and we can
				1119	* just keep it as the in-service one. This
				1120	* should be however a corner case, and to
				1121	* handle it, we would need to maintain an
				1122	* extra num_active_aggs field.
				1123	*/
				1124	qfq_update_agg_ts(q, in_serv_agg, requeue);
				1125	qfq_schedule_agg(q, in_serv_agg);
				1126	} else if (sch->q.qlen == 0) { /* no aggregate to serve */
				1127	q->in_serv_agg = NULL;
				1128	return NULL;
				1129	}
				1130
				1131	/*
				1132	* If we get here, there are other aggregates queued:
				1133	* choose the new aggregate to serve.
				1134	*/
				1135	in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q);
				1136	skb = qfq_peek_skb(in_serv_agg, &cl, &len);
				1137	}
				1138	if (!skb)
				1139	return NULL;
				1140
				1141	qdisc_qstats_backlog_dec(sch, skb);
				1142	sch->q.qlen--;
				1143	qdisc_bstats_update(sch, skb);
				1144
				1145	agg_dequeue(in_serv_agg, cl, len);
				1146	/* If lmax is lowered, through qfq_change_class, for a class
				1147	* owning pending packets with larger size than the new value
				1148	* of lmax, then the following condition may hold.
				1149	*/
				1150	if (unlikely(in_serv_agg->budget < len))
				1151	in_serv_agg->budget = 0;
				1152	else
				1153	in_serv_agg->budget -= len;
				1154
				1155	q->V += (u64)len * q->iwsum;
				1156	pr_debug("qfq dequeue: len %u F %lld now %lld\n",
				1157	len, (unsigned long long) in_serv_agg->F,
				1158	(unsigned long long) q->V);
				1159
				1160	return skb;
				1161	}
				1162
				1163	static struct qfq_aggregate qfq_choose_next_agg(struct qfq_sched q)
				1164	{
				1165	struct qfq_group *grp;
				1166	struct qfq_aggregate agg, new_front_agg;
				1167	u64 old_F;
				1168
				1169	qfq_update_eligible(q);
				1170	q->oldV = q->V;
				1171
				1172	if (!q->bitmaps[ER])
				1173	return NULL;
				1174
				1175	grp = qfq_ffs(q, q->bitmaps[ER]);
				1176	old_F = grp->F;
				1177
				1178	agg = qfq_slot_head(grp);
				1179
				1180	/* agg starts to be served, remove it from schedule */
				1181	qfq_front_slot_remove(grp);
				1182
				1183	new_front_agg = qfq_slot_scan(grp);
				1184
				1185	if (new_front_agg == NULL) /* group is now inactive, remove from ER */
				1186	__clear_bit(grp->index, &q->bitmaps[ER]);
				1187	else {
				1188	u64 roundedS = qfq_round_down(new_front_agg->S,
				1189	grp->slot_shift);
				1190	unsigned int s;
				1191
				1192	if (grp->S == roundedS)
				1193	return agg;
				1194	grp->S = roundedS;
				1195	grp->F = roundedS + (2ULL << grp->slot_shift);
				1196	__clear_bit(grp->index, &q->bitmaps[ER]);
				1197	s = qfq_calc_state(q, grp);
				1198	__set_bit(grp->index, &q->bitmaps[s]);
				1199	}
				1200
				1201	qfq_unblock_groups(q, grp->index, old_F);
				1202
				1203	return agg;
				1204	}
				1205
				1206	static int qfq_enqueue(struct sk_buff skb, struct Qdisc sch,
				1207	struct sk_buff **to_free)
				1208	{
				1209	struct qfq_sched *q = qdisc_priv(sch);
				1210	struct qfq_class *cl;
				1211	struct qfq_aggregate *agg;
				1212	int err = 0;
				1213
				1214	cl = qfq_classify(skb, sch, &err);
				1215	if (cl == NULL) {
				1216	if (err & __NET_XMIT_BYPASS)
				1217	qdisc_qstats_drop(sch);
				1218	__qdisc_drop(skb, to_free);
				1219	return err;
				1220	}
				1221	pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid);
				1222
				1223	if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) {
				1224	pr_debug("qfq: increasing maxpkt from %u to %u for class %u",
				1225	cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid);
				1226	err = qfq_change_agg(sch, cl, cl->agg->class_weight,
				1227	qdisc_pkt_len(skb));
				1228	if (err) {
				1229	cl->qstats.drops++;
				1230	return qdisc_drop(skb, sch, to_free);
				1231	}
				1232	}
				1233
				1234	err = qdisc_enqueue(skb, cl->qdisc, to_free);
				1235	if (unlikely(err != NET_XMIT_SUCCESS)) {
				1236	pr_debug("qfq_enqueue: enqueue failed %d\n", err);
				1237	if (net_xmit_drop_count(err)) {
				1238	cl->qstats.drops++;
				1239	qdisc_qstats_drop(sch);
				1240	}
				1241	return err;
				1242	}
				1243
				1244	bstats_update(&cl->bstats, skb);
				1245	qdisc_qstats_backlog_inc(sch, skb);
				1246	++sch->q.qlen;
				1247
				1248	agg = cl->agg;
				1249	/* if the queue was not empty, then done here */
				1250	if (cl->qdisc->q.qlen != 1) {
				1251	if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) &&
				1252	list_first_entry(&agg->active, struct qfq_class, alist)
				1253	== cl && cl->deficit < qdisc_pkt_len(skb))
				1254	list_move_tail(&cl->alist, &agg->active);
				1255
				1256	return err;
				1257	}
				1258
				1259	/* schedule class for service within the aggregate */
				1260	cl->deficit = agg->lmax;
				1261	list_add_tail(&cl->alist, &agg->active);
				1262
				1263	if (list_first_entry(&agg->active, struct qfq_class, alist) != cl \|\|
				1264	q->in_serv_agg == agg)
				1265	return err; /* non-empty or in service, nothing else to do */
				1266
				1267	qfq_activate_agg(q, agg, enqueue);
				1268
				1269	return err;
				1270	}
				1271
				1272	/*
				1273	* Schedule aggregate according to its timestamps.
				1274	*/
				1275	static void qfq_schedule_agg(struct qfq_sched q, struct qfq_aggregate agg)
				1276	{
				1277	struct qfq_group *grp = agg->grp;
				1278	u64 roundedS;
				1279	int s;
				1280
				1281	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1282
				1283	/*
				1284	* Insert agg in the correct bucket.
				1285	* If agg->S >= grp->S we don't need to adjust the
				1286	* bucket list and simply go to the insertion phase.
				1287	* Otherwise grp->S is decreasing, we must make room
				1288	* in the bucket list, and also recompute the group state.
				1289	* Finally, if there were no flows in this group and nobody
				1290	* was in ER make sure to adjust V.
				1291	*/
				1292	if (grp->full_slots) {
				1293	if (!qfq_gt(grp->S, agg->S))
				1294	goto skip_update;
				1295
				1296	/* create a slot for this agg->S */
				1297	qfq_slot_rotate(grp, roundedS);
				1298	/* group was surely ineligible, remove */
				1299	__clear_bit(grp->index, &q->bitmaps[IR]);
				1300	__clear_bit(grp->index, &q->bitmaps[IB]);
				1301	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) &&
				1302	q->in_serv_agg == NULL)
				1303	q->V = roundedS;
				1304
				1305	grp->S = roundedS;
				1306	grp->F = roundedS + (2ULL << grp->slot_shift);
				1307	s = qfq_calc_state(q, grp);
				1308	__set_bit(grp->index, &q->bitmaps[s]);
				1309
				1310	pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
				1311	s, q->bitmaps[s],
				1312	(unsigned long long) agg->S,
				1313	(unsigned long long) agg->F,
				1314	(unsigned long long) q->V);
				1315
				1316	skip_update:
				1317	qfq_slot_insert(grp, agg, roundedS);
				1318	}
				1319
				1320
				1321	/* Update agg ts and schedule agg for service */
				1322	static void qfq_activate_agg(struct qfq_sched q, struct qfq_aggregate agg,
				1323	enum update_reason reason)
				1324	{
				1325	agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */
				1326
				1327	qfq_update_agg_ts(q, agg, reason);
				1328	if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */
				1329	q->in_serv_agg = agg; /* start serving this aggregate */
				1330	/* update V: to be in service, agg must be eligible */
				1331	q->oldV = q->V = agg->S;
				1332	} else if (agg != q->in_serv_agg)
				1333	qfq_schedule_agg(q, agg);
				1334	}
				1335
				1336	static void qfq_slot_remove(struct qfq_sched q, struct qfq_group grp,
				1337	struct qfq_aggregate *agg)
				1338	{
				1339	unsigned int i, offset;
				1340	u64 roundedS;
				1341
				1342	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1343	offset = (roundedS - grp->S) >> grp->slot_shift;
				1344
				1345	i = (grp->front + offset) % QFQ_MAX_SLOTS;
				1346
				1347	hlist_del(&agg->next);
				1348	if (hlist_empty(&grp->slots[i]))
				1349	__clear_bit(offset, &grp->full_slots);
				1350	}
				1351
				1352	/*
				1353	* Called to forcibly deschedule an aggregate. If the aggregate is
				1354	* not in the front bucket, or if the latter has other aggregates in
				1355	* the front bucket, we can simply remove the aggregate with no other
				1356	* side effects.
				1357	* Otherwise we must propagate the event up.
				1358	*/
				1359	static void qfq_deactivate_agg(struct qfq_sched q, struct qfq_aggregate agg)
				1360	{
				1361	struct qfq_group *grp = agg->grp;
				1362	unsigned long mask;
				1363	u64 roundedS;
				1364	int s;
				1365
				1366	if (agg == q->in_serv_agg) {
				1367	charge_actual_service(agg);
				1368	q->in_serv_agg = qfq_choose_next_agg(q);
				1369	return;
				1370	}
				1371
				1372	agg->F = agg->S;
				1373	qfq_slot_remove(q, grp, agg);
				1374
				1375	if (!grp->full_slots) {
				1376	__clear_bit(grp->index, &q->bitmaps[IR]);
				1377	__clear_bit(grp->index, &q->bitmaps[EB]);
				1378	__clear_bit(grp->index, &q->bitmaps[IB]);
				1379
				1380	if (test_bit(grp->index, &q->bitmaps[ER]) &&
				1381	!(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
				1382	mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
				1383	if (mask)
				1384	mask = ~((1UL << __fls(mask)) - 1);
				1385	else
				1386	mask = ~0UL;
				1387	qfq_move_groups(q, mask, EB, ER);
				1388	qfq_move_groups(q, mask, IB, IR);
				1389	}
				1390	__clear_bit(grp->index, &q->bitmaps[ER]);
				1391	} else if (hlist_empty(&grp->slots[grp->front])) {
				1392	agg = qfq_slot_scan(grp);
				1393	roundedS = qfq_round_down(agg->S, grp->slot_shift);
				1394	if (grp->S != roundedS) {
				1395	__clear_bit(grp->index, &q->bitmaps[ER]);
				1396	__clear_bit(grp->index, &q->bitmaps[IR]);
				1397	__clear_bit(grp->index, &q->bitmaps[EB]);
				1398	__clear_bit(grp->index, &q->bitmaps[IB]);
				1399	grp->S = roundedS;
				1400	grp->F = roundedS + (2ULL << grp->slot_shift);
				1401	s = qfq_calc_state(q, grp);
				1402	__set_bit(grp->index, &q->bitmaps[s]);
				1403	}
				1404	}
				1405	}
				1406
				/*
				 * qlen_notify class operation: deactivate the class passed, as an
				 * opaque unsigned long, in arg.
				 */
				static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
				{
					struct qfq_sched *q = qdisc_priv(sch);
					struct qfq_class *cl = (struct qfq_class *)arg;

					qfq_deactivate_class(q, cl);
				}
				1414
				1415	static int qfq_init_qdisc(struct Qdisc sch, struct nlattr opt)
				1416	{
				1417	struct qfq_sched *q = qdisc_priv(sch);
				1418	struct qfq_group *grp;
				1419	int i, j, err;
				1420	u32 max_cl_shift, maxbudg_shift, max_classes;
				1421
				1422	err = tcf_block_get(&q->block, &q->filter_list);
				1423	if (err)
				1424	return err;
				1425
				1426	err = qdisc_class_hash_init(&q->clhash);
				1427	if (err < 0)
				1428	return err;
				1429
				1430	if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES)
				1431	max_classes = QFQ_MAX_AGG_CLASSES;
				1432	else
				1433	max_classes = qdisc_dev(sch)->tx_queue_len + 1;
				1434	/* max_cl_shift = floor(log_2(max_classes)) */
				1435	max_cl_shift = __fls(max_classes);
				1436	q->max_agg_classes = 1<<max_cl_shift;
				1437
				1438	/* maxbudg_shift = log2(max_len * max_classes_per_agg) */
				1439	maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift;
				1440	q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX;
				1441
				1442	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
				1443	grp = &q->groups[i];
				1444	grp->index = i;
				1445	grp->slot_shift = q->min_slot_shift + i;
				1446	for (j = 0; j < QFQ_MAX_SLOTS; j++)
				1447	INIT_HLIST_HEAD(&grp->slots[j]);
				1448	}
				1449
				1450	INIT_HLIST_HEAD(&q->nonfull_aggs);
				1451
				1452	return 0;
				1453	}
				1454
				1455	static void qfq_reset_qdisc(struct Qdisc *sch)
				1456	{
				1457	struct qfq_sched *q = qdisc_priv(sch);
				1458	struct qfq_class *cl;
				1459	unsigned int i;
				1460
				1461	for (i = 0; i < q->clhash.hashsize; i++) {
				1462	hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
				1463	if (cl->qdisc->q.qlen > 0)
				1464	qfq_deactivate_class(q, cl);
				1465
				1466	qdisc_reset(cl->qdisc);
				1467	}
				1468	}
				1469	sch->qstats.backlog = 0;
				1470	sch->q.qlen = 0;
				1471	}
				1472
				1473	static void qfq_destroy_qdisc(struct Qdisc *sch)
				1474	{
				1475	struct qfq_sched *q = qdisc_priv(sch);
				1476	struct qfq_class *cl;
				1477	struct hlist_node *next;
				1478	unsigned int i;
				1479
				1480	tcf_block_put(q->block);
				1481
				1482	for (i = 0; i < q->clhash.hashsize; i++) {
				1483	hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
				1484	common.hnode) {
				1485	qfq_destroy_class(sch, cl);
				1486	}
				1487	}
				1488	qdisc_class_hash_destroy(&q->clhash);
				1489	}
				1490
				1491	static const struct Qdisc_class_ops qfq_class_ops = {
				1492	.change = qfq_change_class,
				1493	.delete = qfq_delete_class,
				1494	.find = qfq_search_class,
				1495	.tcf_block = qfq_tcf_block,
				1496	.bind_tcf = qfq_bind_tcf,
				1497	.unbind_tcf = qfq_unbind_tcf,
				1498	.graft = qfq_graft_class,
				1499	.leaf = qfq_class_leaf,
				1500	.qlen_notify = qfq_qlen_notify,
				1501	.dump = qfq_dump_class,
				1502	.dump_stats = qfq_dump_class_stats,
				1503	.walk = qfq_walk,
				1504	};
				1505
				1506	static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
				1507	.cl_ops = &qfq_class_ops,
				1508	.id = "qfq",
				1509	.priv_size = sizeof(struct qfq_sched),
				1510	.enqueue = qfq_enqueue,
				1511	.dequeue = qfq_dequeue,
				1512	.peek = qdisc_peek_dequeued,
				1513	.init = qfq_init_qdisc,
				1514	.reset = qfq_reset_qdisc,
				1515	.destroy = qfq_destroy_qdisc,
				1516	.owner = THIS_MODULE,
				1517	};
				1518
				1519	static int __init qfq_init(void)
				1520	{
				1521	return register_qdisc(&qfq_qdisc_ops);
				1522	}
				1523
				1524	static void __exit qfq_exit(void)
				1525	{
				1526	unregister_qdisc(&qfq_qdisc_ops);
				1527	}
				1528
				1529	module_init(qfq_init);
				1530	module_exit(qfq_exit);
				1531	MODULE_LICENSE("GPL");