// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *		original idea by Martin Devera
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f - t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i <= k:

	s_i + ... + s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size; the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max((L-B)/R, (L-M)/P)

	(a worked numeric example follows this comment block)

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10 Mbit Ethernet and HZ=100 the minimal allowed B is ~10 Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500
	P_crit = 150 Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
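
/* Worked example (illustrative numbers, not taken from the original
 * authors): with rate R = 1 Mbit/s (125,000 bytes/s), bucket depth
 * B = 10,000 bytes, backlog limit L = 50,000 bytes and no peakrate
 * configured (so the (L-M)/P term vanishes), the reshaping latency
 * bound from the formula above is
 *
 *	lat = (L - B) / R = 40,000 / 125,000 s = 320 ms
 *
 * With HZ=100 the same bucket sustains at most R_crit = B*HZ =
 * 1,000,000 bytes/s (roughly 8 Mbit/s) before timer resolution becomes
 * the limiting factor.  A matching user-space setup might look like
 * (hypothetical device name):
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 320ms
 */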

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
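
/* Example of the conversion above (illustrative numbers): with
 * rate_bytes_ps = 125,000 (1 Mbit/s) and time_in_ns = 8,000,000 (8 ms),
 * len = 8,000,000 * 125,000 / NSEC_PER_SEC = 1,000 bytes.  On an ATM
 * linklayer the result is additionally scaled by 48/53, since each
 * 53-byte cell carries only 48 payload bytes (1,000 / 53 * 48 = 864 in
 * integer arithmetic), and the configured overhead is subtracted last.
 */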

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len;
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		skb_mark_not_on_list(segs);
		seg_len = segs->len;
		qdisc_skb_cb(segs)->pkt_len = seg_len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
			len += seg_len;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	sch->qstats.backlog += len;
	if (nb > 0) {
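		/* The ancestors will account this enqueue as one skb of
		 * prev_len bytes, but the qdisc now holds nb segments
		 * totalling len bytes, so propagate the difference up
		 * the qdisc tree.
		 */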
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
		consume_skb(skb);
		return NET_XMIT_SUCCESS;
	}

	kfree_skb(skb);
	return NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = qdisc_pkt_len(skb);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->qstats.backlog += len;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
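		/* Tokens are kept as nanoseconds of transmission time at the
		 * configured rate: q->buffer and q->mtu are the bucket depths
		 * in ns, and psched_l2t_ns() converts a packet length into
		 * the ns it needs.  Wall-clock time elapsed since t_c thus
		 * refills the bucket directly, capped at the bucket depth.
		 */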
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct Qdisc *old = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested_deprecated(tb, TCA_TBF_MAX, opt, tbf_policy,
					  NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

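	/* qopt->buffer and qopt->mtu arrive from user space in psched
	 * ticks; convert them to nanoseconds and clamp to 32 bits (~0U).
	 */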
	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_flush_backlog(q->qdisc);
		old = q->qdisc;
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	qdisc_put(old);
	err = 0;
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_put(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");