/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	they should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
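
/* A numeric check of the bound above (a sketch with assumed values, not
   from the original sources): take MTU = 1500 bytes, B = 12.5 Mbyte/s
   (100 Mbit/s), k = 2 classes with r_1 = r_2 = 0.25 (so r = 0.5) and a
   WRR round of W = 12000 bytes. Then the per-round allotment is
   W*r_1 = 3000 bytes and

	delay_1 <= (ceil(1500/3000)*12000*0.5 + 12000*0.5 + 2*1500)/B
		 = (6000 + 6000 + 3000)/12500000 s ~= 1.2 ms

   so the bound stays finite; shrinking W*r_i below MTU only grows the
   ceil(MTU/(W*r_i)) factor, it does not invalidate the estimate. */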
72
73struct cbq_sched_data;
74
75
76struct cbq_class {
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79
80/* Parameters */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
84
85 u32 defmap;
86
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
89 long offtime;
90 long minidle;
91 u32 avpkt;
92 struct qdisc_rate_table *R_tab;
93
94 /* General scheduler (WRR) parameters */
95 long allot;
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
98
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
103 struct cbq_class *borrow; /* NULL if class is bandwidth limited;
104 parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
107
108 struct Qdisc *q; /* Elementary queueing discipline */
109
110
111/* Variables */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
117 */
118
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
121 long avgidle;
122 long deficit; /* Saved deficit for WRR */
123 psched_time_t penalized;
124 struct gnet_stats_basic_packed bstats;
125 struct gnet_stats_queue qstats;
126 struct net_rate_estimator __rcu *rate_est;
127 struct tc_cbq_xstats xstats;
128
129 struct tcf_proto __rcu *filter_list;
130 struct tcf_block *block;
131
132 int filters;
133
134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
135};
136
137struct cbq_sched_data {
138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
141
142 struct cbq_class link;
143
144 unsigned int activemask;
145 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
146 with backlog */
147
148#ifdef CONFIG_NET_CLS_ACT
149 struct cbq_class *rx_class;
150#endif
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
153 int tx_len;
154 psched_time_t now; /* Cached timestamp */
155 unsigned int pmask;
156
157 struct hrtimer delay_timer;
158 struct qdisc_watchdog watchdog; /* Watchdog timer,
159 started when CBQ has
160 backlog, but cannot
161 transmit just now */
162 psched_tdiff_t wd_expires;
163 int toplevel;
164 u32 hgenerator;
165};
166
167
168#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
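/* L2T ("length to time") converts a packet length into transmission time
 * at the class's configured rate, using the rate table supplied by
 * userspace. Illustrative values (assumed, not from this file): at
 * 1 Mbyte/s, a 1000-byte packet maps to ~1 ms worth of psched ticks,
 * which lets the idle/undertime arithmetic below stay in time units.
 */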

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}
	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
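/* A worked example of Step 1 below (handles assumed for illustration):
 * with sch->handle == 0x00010000 (qdisc "1:") and skb->priority ==
 * 0x00010002 (classid "1:2"), TC_H_MAJ(prio ^ sch->handle) is 0, so the
 * packet is steered straight into class 1:2 without running any filter.
 */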

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_proto *fl;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		fl = rcu_dereference_bh(head->filter_list);
		/*
		 * Step 2+n. Apply classifier.
		 */
		result = tcf_classify(skb, fl, &res, true);
		if (!fl || result < 0)
			goto fallback;

		cl = (void *)res.class;
		if (!cl) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL)
				goto fallback;
		}
		if (cl->level >= head->level)
			goto fallback;
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If the classifier selected a link sharing class,
		 * apply the agency-specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */

static inline void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}
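
/* A note on the list manipulated above (inferred from the code, not an
 * upstream comment): q->active[prio] always points at the TAIL of a
 * circular singly linked ring, so tail->next_alive is the head. With
 * classes A and B already active (ring A -> B -> A, active == B),
 * activating C yields A -> B -> C -> A with active == C; the dequeue
 * path can therefore rotate the ring just by moving q->active[prio].
 */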

/*
 * Unlink class from active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level) {
		psched_time_t now = psched_get_time();

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	    struct sk_buff **to_free)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int uninitialized_var(ret);
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}

	ret = qdisc_enqueue(skb, cl->q, to_free);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		qdisc_qstats_drop(sch);
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}

/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * Class goes to sleep, so that it will have no
		 * chance to work off avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug in this
		 * place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}

static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = 0;
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
	}

	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}

/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" toplevel cutoff, if the transmitting class
 * has backlog and it is not regulated. It is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;
	psched_time_t now;

	q->tx_class = NULL;
	/* Time integrator. We calculate EOS time
	 * by adding expected packet transmission time.
	 */
	now = q->now + L2T(&q->link, len);

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
		 */

		idle = now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

			/* true_avgidle := (1-W)*true_avgidle + W*idle,
			 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
			 * cl->avgidle == true_avgidle/W,
			 * hence:
			 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}
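		/* Numeric sketch of the scaled update above (assumed
		 * values, illustration only): with ewma_log = 2, W = 1/4,
		 * the stored value is avgidle == true_avgidle/W. For
		 * stored avgidle = 800 (true 200) and a new sample
		 * idle = 100:
		 *
		 *	avgidle += 100 - (800 >> 2) = 800 - 100 = 700
		 *
		 * and 700 == 4*175 matches the unscaled EWMA
		 * 0.75*200 + 0.25*100 = 175 exactly, with no division.
		 */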

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * It will occur when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * necessary to complete the transmitted packet.
			 * (len/phys_bandwidth has already passed
			 * by the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		if ((s64)(now - cl->last) > 0)
			cl->last = now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}

static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Now the overlimit
		 * action is generated for unbounded classes
		 * only if the link is completely congested.
		 * Though it agrees with the ancestor-only paradigm,
		 * it looks very stupid. Particularly,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous and silly, but no other
		 * solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			cbq_overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}

static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
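
/* Deficit accounting in the round-robin above, under assumed numbers (a
 * sketch, not from the original comments): a class with quantum 1500
 * starts a round with deficit 1500, sends a 1000-byte packet (deficit
 * 500), then a 900-byte one (deficit -400). On the next visit
 * deficit <= 0, so the class is skipped for this round and topped up to
 * 1100 for the following one; over many rounds each class's share thus
 * converges to quantum bytes per round.
 */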

static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int activemask;

	activemask = q->activemask & 0xFF;
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;

	now = psched_get_time();

	if (q->tx_class)
		cbq_update(q);

	q->now = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * It is possible, if:
		 *
		 * 1. Scheduler is empty.
		 * 2. Toplevel cutoff inhibited borrowing.
		 * 3. Root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy: peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; this is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
	 */

	if (sch->q.qlen) {
		qdisc_qstats_overlimit(sch);
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}

/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflows!
			 */
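			/* Illustration of the overflow risk (assumed
			 * values, not from the original comment): with
			 * 32-bit long, weight = 10000000 (a 10 Mbyte/s
			 * rate used as weight) times allot = 1514 is
			 * already ~1.5e10 > LONG_MAX (~2.1e9), before
			 * the division by q->quanta[prio] can pull the
			 * result back down.
			 */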
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct cbq_class *c;

			hlist_for_each_entry(c, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
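
/* Defmap semantics in one example (a sketch; the tc invocation uses
 * assumed iproute2 cbq syntax, it is not quoted from this file): bit i of
 * cl->defmap set means "cl is the default destination at its split node
 * for packets with logical priority i". Something like
 *
 *	tc class change dev eth0 classid 1:3 cbq split 1:0 defmap c0/c0
 *
 * sets bits 6 and 7 (mask 0xc0), so traffic with logical priorities 6-7
 * that no filter matched at split 1:0 falls through to class 1:3; among
 * several candidates with the same bit set, the loop above prefers the
 * one with the lowest level.
 */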

static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}

static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}

static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority - 1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
	}

	cbq_addprio(q, cl);
	return 0;
}

static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], struct nlattr *opt)
{
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_WRROPT]) {
		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);

		if (wrr->priority > TC_CBQ_MAXPRIO)
			err = -EINVAL;
	}
	return err;
}

static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	q->delay_timer.function = cbq_undelay;

	err = cbq_opt_parse(tb, opt);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
		return -EINVAL;

	r = nla_data(tb[TCA_CBQ_RATE]);

	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
		return -EINVAL;

	err = tcf_block_get(&q->link.block, &q->link.filter_list);
	if (err)
		goto put_rtab;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_block;

	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle);
	if (!q->link.q)
		q->link.q = &noop_qdisc;
	else
		qdisc_hash_add(q->link.q, true);

	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_block:
	tcf_block_put(q->link.block);

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
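
/* For orientation, a typical userspace setup served by this init path (a
 * sketch with assumed iproute2 cbq syntax and example device/rates, not
 * taken from this file):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
 *		rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The qdisc line lands in cbq_init() above (tc builds TCA_CBQ_RATE and
 * TCA_CBQ_RTAB from "bandwidth"); the class line lands in
 * cbq_change_class() below, where "bounded" clears the borrow pointer
 * via TCF_CBQ_LSS_BOUNDED in cbq_set_lss().
 */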

static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	memset(&opt, 0, sizeof(opt));
	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
	opt.weight = cl->weight;
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				  d, NULL, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			return -ENOBUFS;
	}

	*old = qdisc_replace(sch, new, &cl->q);
	return 0;
}

static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cbq_deactivate_class(cl);
}

static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	return (unsigned long)cbq_class_lookup(q, classid);
}

static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_block_put(cl->block);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			tcf_block_put(cl->block);
			cl->block = NULL;
		}
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	err = cbq_opt_parse(tb, opt);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
		return -EOPNOTSUPP;

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB]);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    qdisc_root_sleeping_running(sch),
						    tca[TCA_RATE]);
			if (err) {
				qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
		return -EINVAL;

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid))
			goto failure;
	} else {
		int i;
		classid = TC_H_MAKE(sch->handle, 0x8000);

		for (i = 0; i < 0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;

	err = tcf_block_get(&cl->block, &cl->filter_list);
	if (err) {
		kfree(cl);
		return err;
	}

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
					NULL,
					qdisc_root_sleeping_running(sch),
					tca[TCA_RATE]);
		if (err) {
			tcf_block_put(cl->block);
			kfree(cl);
			goto failure;
		}
	}

	cl->R_tab = rtab;
	rtab = NULL;
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
	if (!cl->q)
		cl->q = &noop_qdisc;
	else
		qdisc_hash_add(cl->q, true);

	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log == 0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle == 0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt == 0)
		cl->avpkt = q->link.avpkt;
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	unsigned int qlen, backlog;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qlen = cl->q->q.qlen;
	backlog = cl->q->qstats.backlog;
	qdisc_reset(cl->q);
	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	cbq_destroy_class(sch, cl);
	return 0;
}

static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return cl->block;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.find		=	cbq_find,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_block	=	cbq_tcf_block,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");