/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	they should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
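
/* A numeric check of the bound above (a sketch with assumed values, not
   from the original sources): take MTU = 1500 bytes, B = 12.5 Mbyte/s
   (100 Mbit/s), k = 2 classes with r_1 = r_2 = 0.25 (so r = 0.5) and a
   WRR round of W = 12000 bytes. Then the per-round allotment is
   W*r_1 = 3000 bytes and

	delay_1 <= (ceil(1500/3000)*12000*0.5 + 12000*0.5 + 2*1500)/B
		 = (6000 + 6000 + 3000)/12500000 s ~= 1.2 ms

   so the bound stays finite; shrinking W*r_i below MTU only grows the
   ceil(MTU/(W*r_i)) factor, it does not invalidate the estimate. */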
72
73struct cbq_sched_data;
74
75
76struct cbq_class {
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79
80/* Parameters */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
84
85 u32 defmap;
86
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
89 long offtime;
90 long minidle;
91 u32 avpkt;
92 struct qdisc_rate_table *R_tab;
93
94 /* General scheduler (WRR) parameters */
95 long allot;
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
98
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
103 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
104 parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
107
108 struct Qdisc *q; /* Elementary queueing discipline */
109
110
111/* Variables */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
117 */
118
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
121 long avgidle;
122 long deficit; /* Saved deficit for WRR */
123 psched_time_t penalized;
124 struct gnet_stats_basic_packed bstats;
125 struct gnet_stats_queue qstats;
126 struct net_rate_estimator __rcu *rate_est;
127 struct tc_cbq_xstats xstats;
128
129 struct tcf_proto __rcu *filter_list;
130 struct tcf_block *block;
131
132 int filters;
133
134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
135};
136
137struct cbq_sched_data {
138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
141
142 struct cbq_class link;
143
144 unsigned int activemask;
145 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
146 with backlog */
147
148#ifdef CONFIG_NET_CLS_ACT
149 struct cbq_class *rx_class;
150#endif
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
153 int tx_len;
154 psched_time_t now; /* Cached timestamp */
155 unsigned int pmask;
156
157 struct hrtimer delay_timer;
158 struct qdisc_watchdog watchdog; /* Watchdog timer,
159 started when CBQ has
160 backlog, but cannot
161 transmit just now */
162 psched_tdiff_t wd_expires;
163 int toplevel;
164 u32 hgenerator;
165};
166
167
168#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
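/* L2T ("length to time") converts a packet length into transmission time
 * at the class's configured rate, using the rate table supplied by
 * userspace. Illustrative values (assumed, not from this file): at
 * 1 Mbyte/s, a 1000-byte packet maps to ~1 ms worth of psched ticks,
 * which lets the idle/undertime arithmetic below stay in time units.
 */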

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}
	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
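/* A worked example of Step 1 below (handles assumed for illustration):
 * with sch->handle == 0x00010000 (qdisc "1:") and skb->priority ==
 * 0x00010002 (classid "1:2"), TC_H_MAJ(prio ^ sch->handle) is 0, so the
 * packet is steered straight into class 1:2 without running any filter.
 */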

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_proto *fl;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		fl = rcu_dereference_bh(head->filter_list);
		/*
		 * Step 2+n. Apply classifier.
		 */
		result = tcf_classify(skb, fl, &res, true);
		if (!fl || result < 0)
			goto fallback;

		cl = (void *)res.class;
		if (!cl) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL)
				goto fallback;
		}
		if (cl->level >= head->level)
			goto fallback;
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If the classifier selected a link sharing class,
		 * apply the agency-specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */

static inline void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}
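
/* A note on the list manipulated above (inferred from the code, not an
 * upstream comment): q->active[prio] always points at the TAIL of a
 * circular singly linked ring, so tail->next_alive is the head. With
 * classes A and B already active (ring A -> B -> A, active == B),
 * activating C yields A -> B -> C -> A with active == C; the dequeue
 * path can therefore rotate the ring just by moving q->active[prio].
 */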

/*
 * Unlink class from active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level) {
		psched_time_t now = psched_get_time();

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	    struct sk_buff **to_free)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}

	ret = qdisc_enqueue(skb, cl->q, to_free);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		qdisc_qstats_drop(sch);
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}

/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * Class goes to sleep, so that it will have no
		 * chance to work off avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug in this
		 * place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = 0;
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
	}

	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" toplevel cutoff, if the transmitting class
 * has backlog and it is not regulated. It is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;
	psched_time_t now;

	q->tx_class = NULL;
	/* Time integrator. We calculate EOS time
	 * by adding expected packet transmission time.
	 */
	now = q->now + L2T(&q->link, len);

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
		 */

		idle = now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

			/* true_avgidle := (1-W)*true_avgidle + W*idle,
			 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
			 * cl->avgidle == true_avgidle/W,
			 * hence:
			 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}
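		/* Numeric sketch of the scaled update above (assumed
		 * values, illustration only): with ewma_log = 2, W = 1/4,
		 * the stored value is avgidle == true_avgidle/W. For
		 * stored avgidle = 800 (true 200) and a new sample
		 * idle = 100:
		 *
		 *	avgidle += 100 - (800 >> 2) = 800 - 100 = 700
		 *
		 * and 700 == 4*175 matches the unscaled EWMA
		 * 0.75*200 + 0.25*100 = 175 exactly, with no division.
		 */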

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * It will occur when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * necessary to complete the transmitted packet.
			 * (len/phys_bandwidth has already passed
			 * by the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		if ((s64)(now - cl->last) > 0)
			cl->last = now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Now the overlimit
		 * action is generated for unbounded classes
		 * only if the link is completely congested.
		 * Though it agrees with the ancestor-only paradigm,
		 * it looks very stupid. Particularly,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous and silly, but no other
		 * solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			cbq_overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}

static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
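
/* Deficit accounting in the round-robin above, under assumed numbers (a
 * sketch, not from the original comments): a class with quantum 1500
 * starts a round with deficit 1500, sends a 1000-byte packet (deficit
 * 500), then a 900-byte one (deficit -400). On the next visit
 * deficit <= 0, so the class is skipped for this round and topped up to
 * 1100 for the following one; over many rounds each class's share thus
 * converges to quantum bytes per round.
 */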

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int activemask;

	activemask = q->activemask & 0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;

	now = psched_get_time();

	if (q->tx_class)
		cbq_update(q);

	q->now = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * It is possible, if:
		 *
		 * 1. Scheduler is empty.
		 * 2. Toplevel cutoff inhibited borrowing.
		 * 3. Root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy: peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; this is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
	 */

	if (sch->q.qlen) {
		qdisc_qstats_overlimit(sch);
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflows!
			 */
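			/* Illustration of the overflow risk (assumed
			 * values, not from the original comment): with
			 * 32-bit long, weight = 10000000 (a 10 Mbyte/s
			 * rate used as weight) times allot = 1514 is
			 * already ~1.5e10 > LONG_MAX (~2.1e9), before
			 * the division by q->quanta[prio] can pull the
			 * result back down.
			 */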
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct cbq_class *c;

			hlist_for_each_entry(c, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
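
/* Defmap semantics in one example (a sketch; the tc invocation uses
 * assumed iproute2 cbq syntax, it is not quoted from this file): bit i of
 * cl->defmap set means "cl is the default destination at its split node
 * for packets with logical priority i". Something like
 *
 *	tc class change dev eth0 classid 1:3 cbq split 1:0 defmap c0/c0
 *
 * sets bits 6 and 7 (mask 0xc0), so traffic with logical priorities 6-7
 * that no filter matched at split 1:0 falls through to class 1:3; among
 * several candidates with the same bit set, the loop above prefers the
 * one with the lowest level.
 */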

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}

static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority - 1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], struct nlattr *opt)
{
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_WRROPT]) {
		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);

		if (wrr->priority > TC_CBQ_MAXPRIO)
			err = -EINVAL;
	}
	return err;
}

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	q->delay_timer.function = cbq_undelay;

	err = cbq_opt_parse(tb, opt);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
		return -EINVAL;

	r = nla_data(tb[TCA_CBQ_RATE]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
		return -EINVAL;

	err = tcf_block_get(&q->link.block, &q->link.filter_list);
	if (err)
		goto put_rtab;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_block;

	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle);
	if (!q->link.q)
		q->link.q = &noop_qdisc;
	else
		qdisc_hash_add(q->link.q, true);

	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_block:
	tcf_block_put(q->link.block);

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
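
/* For orientation, a typical userspace setup served by this init path (a
 * sketch with assumed iproute2 cbq syntax and example device/rates, not
 * taken from this file):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *		rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The qdisc line lands in cbq_init() above (tc builds TCA_CBQ_RATE and
 * TCA_CBQ_RTAB from "bandwidth"); the class line lands in
 * cbq_change_class() below, where "bounded" clears the borrow pointer
 * via TCF_CBQ_LSS_BOUNDED in cbq_set_lss().
 */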

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	memset(&opt, 0, sizeof(opt));
	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
	opt.weight = cl->weight;
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				  d, NULL, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			return -ENOBUFS;
	}

	*old = qdisc_replace(sch, new, &cl->q);
	return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cbq_deactivate_class(cl);
}

static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	return (unsigned long)cbq_class_lookup(q, classid);
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_block_put(cl->block);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			tcf_block_put(cl->block);
			cl->block = NULL;
		}
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	err = cbq_opt_parse(tb, opt);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
		return -EOPNOTSUPP;

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    qdisc_root_sleeping_running(sch),
						    tca[TCA_RATE]);
			if (err) {
				qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle, 0x8000);

		for (i = 0; i < 0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;

	err = tcf_block_get(&cl->block, &cl->filter_list);
	if (err) {
		kfree(cl);
		return err;
	}

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
					NULL,
					qdisc_root_sleeping_running(sch),
					tca[TCA_RATE]);
		if (err) {
			tcf_block_put(cl->block);
			kfree(cl);
			goto failure;
		}
	}

	cl->R_tab = rtab;
	rtab = NULL;
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
	if (!cl->q)
		cl->q = &noop_qdisc;
	else
		qdisc_hash_add(cl->q, true);

	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log == 0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle == 0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt == 0)
		cl->avpkt = q->link.avpkt;
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	unsigned int qlen, backlog;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	backlog = cl->q->qstats.backlog;
	qdisc_reset(cl->q);
	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	cbq_destroy_class(sch, cl);
	return 0;
}

static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return cl->block;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.find		=	cbq_find,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_block	=	cbq_tcf_block,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");