// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * net/sched/sch_api.c Packet scheduler API.
4 *
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6 *
7 * Fixes:
8 *
9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12 */
13
14#include <linux/module.h>
15#include <linux/types.h>
16#include <linux/kernel.h>
17#include <linux/string.h>
18#include <linux/errno.h>
19#include <linux/skbuff.h>
20#include <linux/init.h>
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/kmod.h>
24#include <linux/list.h>
25#include <linux/hrtimer.h>
26#include <linux/slab.h>
27#include <linux/hashtable.h>
28
29#include <net/net_namespace.h>
30#include <net/sock.h>
31#include <net/netlink.h>
32#include <net/pkt_sched.h>
33#include <net/pkt_cls.h>
34
35/*
36
37 Short review.
38 -------------
39
40 This file consists of two interrelated parts:
41
42 1. queueing disciplines manager frontend.
43 2. traffic classes manager frontend.
44
45 Generally, queueing discipline ("qdisc") is a black box,
46 which is able to enqueue packets and to dequeue them (when
47 device is ready to send something) in order and at times
48 determined by algorithm hidden in it.
49
   qdiscs are divided into two categories:
51 - "queues", which have no internal structure visible from outside.
52 - "schedulers", which split all the packets to "traffic classes",
53 using "packet classifiers" (look at cls_api.c)
54
55 In turn, classes may have child qdiscs (as rule, queues)
56 attached to them etc. etc. etc.
57
58 The goal of the routines in this file is to translate
59 information supplied by user in the form of handles
60 to more intelligible for kernel form, to make some sanity
61 checks and part of work, which is common to all qdiscs
62 and to provide rtnetlink notifications.
63
64 All real intelligent work is done inside qdisc modules.
65
66
67
68 Every discipline has two major routines: enqueue and dequeue.
69
70 ---dequeue
71
72 dequeue usually returns a skb to send. It is allowed to return NULL,
73 but it does not mean that queue is empty, it just means that
74 discipline does not want to send anything this time.
75 Queue is really empty if q->q.qlen == 0.
76 For complicated disciplines with multiple queues q->q is not
77 real packet queue, but however q->q.qlen must be valid.
78
79 ---enqueue
80
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
84 NET_XMIT_DROP - this packet dropped
85 Expected action: do not backoff, but wait until queue will clear.
86 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
87 Expected action: backoff or ignore
88
89 Auxiliary routines:
90
91 ---peek
92
93 like dequeue but without removing a packet from the queue
94
95 ---reset
96
97 returns qdisc to initial state: purge all buffers, clear all
98 timers, counters (except for statistics) etc.
99
100 ---init
101
102 initializes newly created qdisc.
103
104 ---destroy
105
106 destroys resources allocated by init and during lifetime of qdisc.
107
108 ---change
109
110 changes qdisc parameters.
111 */
112
113/* Protects list of registered TC modules. It is pure SMP lock. */
114static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117/************************************************
118 * Queueing disciplines manipulation. *
119 ************************************************/
120
121
122/* The list of all installed queueing disciplines. */
123
124static struct Qdisc_ops *qdisc_base;
125
126/* Register/unregister queueing discipline */
127
/*
 * register_qdisc - add a scheduler's ops table to the global registry
 * @qops: ops table to register; filled in with default no-op hooks where
 *        optional callbacks are missing
 *
 * Returns 0 on success, -EEXIST if an ops with the same id is already
 * registered, or -EINVAL if the ops table is internally inconsistent.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Walk to the list tail, rejecting a duplicate id on the way. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	/* Default missing datapath hooks to the no-op implementations. */
	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		/* A real dequeue without a peek cannot be defaulted sanely. */
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must provide the minimal class hooks. */
		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		/* Filter-block users must also support tcf (un)binding. */
		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	/* Link the new ops at the tail of the list. */
	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
171
172int unregister_qdisc(struct Qdisc_ops *qops)
173{
174 struct Qdisc_ops *q, **qp;
175 int err = -ENOENT;
176
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179 if (q == qops)
180 break;
181 if (q) {
182 *qp = q->next;
183 q->next = NULL;
184 err = 0;
185 }
186 write_unlock(&qdisc_mod_lock);
187 return err;
188}
189EXPORT_SYMBOL(unregister_qdisc);
190
/* Copy the id string of the current default qdisc into @name,
 * truncating to at most @len bytes (including the NUL terminator).
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
198
199static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200{
201 struct Qdisc_ops *q = NULL;
202
203 for (q = qdisc_base; q; q = q->next) {
204 if (!strcmp(name, q->id)) {
205 if (!try_module_get(q->owner))
206 q = NULL;
207 break;
208 }
209 }
210
211 return q;
212}
213
/* Set new default qdisc to use.
 * Returns 0 on success, -EPERM without CAP_NET_ADMIN, or -ENOENT if no
 * scheduler with that id could be found or loaded.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		/* Re-check: the module load may have registered it. */
		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default: release the old default's module
		 * reference; the new one was taken by the lookup above.
		 */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
242
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config (CONFIG_DEFAULT_NET_SCH). */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
/* late_initcall: runs after earlier initcall levels -- presumably so the
 * built-in schedulers have registered by then; confirm against init order.
 */
late_initcall(sch_default_qdisc);
#endif
251
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* No backing device: only the root itself can match. */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	/* Built-in qdiscs are excluded from matching by handle here. */
	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Children are reachable via the per-device handle hash table
	 * maintained by qdisc_hash_add()/qdisc_hash_del().
	 */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
274
275void qdisc_hash_add(struct Qdisc *q, bool invisible)
276{
277 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278 ASSERT_RTNL();
279 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280 if (invisible)
281 q->flags |= TCQ_F_INVISIBLE;
282 }
283}
284EXPORT_SYMBOL(qdisc_hash_add);
285
286void qdisc_hash_del(struct Qdisc *q)
287{
288 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289 ASSERT_RTNL();
290 hash_del_rcu(&q->hash);
291 }
292}
293EXPORT_SYMBOL(qdisc_hash_del);
294
295struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296{
297 struct Qdisc *q;
298
299 if (!handle)
300 return NULL;
301 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
302 if (q)
303 goto out;
304
305 if (dev_ingress_queue(dev))
306 q = qdisc_match_from_root(
307 dev_ingress_queue(dev)->qdisc_sleeping,
308 handle);
309out:
310 return q;
311}
312
313struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314{
315 struct netdev_queue *nq;
316 struct Qdisc *q;
317
318 if (!handle)
319 return NULL;
320 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
321 if (q)
322 goto out;
323
324 nq = dev_ingress_queue_rcu(dev);
325 if (nq)
326 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327out:
328 return q;
329}
330
331static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332{
333 unsigned long cl;
334 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336 if (cops == NULL)
337 return NULL;
338 cl = cops->find(p, classid);
339
340 if (cl == 0)
341 return NULL;
342 return cops->leaf(p, cl);
343}
344
345/* Find queueing discipline by name */
346
347static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348{
349 struct Qdisc_ops *q = NULL;
350
351 if (kind) {
352 read_lock(&qdisc_mod_lock);
353 for (q = qdisc_base; q; q = q->next) {
354 if (nla_strcmp(kind, q->id) == 0) {
355 if (!try_module_get(q->owner))
356 q = NULL;
357 break;
358 }
359 }
360 read_unlock(&qdisc_mod_lock);
361 }
362 return q;
363}
364
/* The linklayer setting were not transferred from iproute2, in older
 * versions, and the rate tables lookup systems have been dropped in
 * the kernel. To keep backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by detecting if the rate
 * table were modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell. If these entries contain the same value, when
 * the rate tables have been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
 * and then roundup to the next cell, calc the table entry one below,
 * and compare.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	/* Two adjacent 48-byte ATM cells, expressed as rtab indices. */
	int low = roundup(r->mpu, 48);
	int high = roundup(low+1, 48);
	int cell_low = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	/* Equal entries across one cell boundary indicate ATM alignment. */
	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
403
404static struct qdisc_rate_table *qdisc_rtab_list;
405
406struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407 struct nlattr *tab,
408 struct netlink_ext_ack *extack)
409{
410 struct qdisc_rate_table *rtab;
411
412 if (tab == NULL || r->rate == 0 ||
413 r->cell_log == 0 || r->cell_log >= 32 ||
414 nla_len(tab) != TC_RTAB_SIZE) {
415 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
416 return NULL;
417 }
418
419 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
420 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
421 !memcmp(&rtab->data, nla_data(tab), 1024)) {
422 rtab->refcnt++;
423 return rtab;
424 }
425 }
426
427 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
428 if (rtab) {
429 rtab->rate = *r;
430 rtab->refcnt = 1;
431 memcpy(rtab->data, nla_data(tab), 1024);
432 if (r->linklayer == TC_LINKLAYER_UNAWARE)
433 r->linklayer = __detect_linklayer(r, rtab->data);
434 rtab->next = qdisc_rtab_list;
435 qdisc_rtab_list = rtab;
436 } else {
437 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
438 }
439 return rtab;
440}
441EXPORT_SYMBOL(qdisc_get_rtab);
442
443void qdisc_put_rtab(struct qdisc_rate_table *tab)
444{
445 struct qdisc_rate_table *rtab, **rtabp;
446
447 if (!tab || --tab->refcnt)
448 return;
449
450 for (rtabp = &qdisc_rtab_list;
451 (rtab = *rtabp) != NULL;
452 rtabp = &rtab->next) {
453 if (rtab == tab) {
454 *rtabp = rtab->next;
455 kfree(rtab);
456 return;
457 }
458 }
459}
460EXPORT_SYMBOL(qdisc_put_rtab);
461
/* All size tables in the system, shared by refcount (see qdisc_get_stab). */
static LIST_HEAD(qdisc_stab_list);

/* Netlink validation policy for TCA_STAB_* attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
468
/* Parse a TCA_STAB attribute into a size table, sharing an existing one
 * when spec and data match.  Returns an ERR_PTR on failure; release with
 * qdisc_put_stab().
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	/* A non-zero tsize promises a data attribute of matching length. */
	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Share an identical, already-registered table when possible. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	/* Bound the shift amounts used by __qdisc_calculate_pkt_len(). */
	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
532
533void qdisc_put_stab(struct qdisc_size_table *tab)
534{
535 if (!tab)
536 return;
537
538 if (--tab->refcnt == 0) {
539 list_del(&tab->list);
540 kfree_rcu(tab, rcu);
541 }
542}
543EXPORT_SYMBOL(qdisc_put_stab);
544
545static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
546{
547 struct nlattr *nest;
548
549 nest = nla_nest_start_noflag(skb, TCA_STAB);
550 if (nest == NULL)
551 goto nla_put_failure;
552 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
553 goto nla_put_failure;
554 nla_nest_end(skb, nest);
555
556 return skb->len;
557
558nla_put_failure:
559 return -1;
560}
561
/* Compute qdisc_skb_cb(skb)->pkt_len from skb->len using the size table:
 * add overhead, map the result through stab->data, scale by size_log, and
 * clamp to a minimum of 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* Empty table: only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* Convert the adjusted length into a table index. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Beyond the table: extrapolate from the last entry plus
		 * the wrapped remainder.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
589
590void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
591{
592 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
593 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
594 txt, qdisc->ops->id, qdisc->handle >> 16);
595 qdisc->flags |= TCQ_F_WARN_NONWC;
596 }
597}
598EXPORT_SYMBOL(qdisc_warn_nonwc);
599
/* hrtimer callback: reschedule the root qdisc so dequeue runs again. */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	/* One-shot timer; re-armed via qdisc_watchdog_schedule_ns(). */
	return HRTIMER_NORESTART;
}
611
/* Initialize a qdisc watchdog on the given clock; the timer is pinned
 * to the CPU it is armed on (HRTIMER_MODE_ABS_PINNED).
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
620
/* Convenience wrapper: initialize the watchdog on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
626
/* Arm the watchdog to fire at absolute time @expires (nanoseconds).
 * No-op if the root qdisc is deactivated or the timer is already set
 * for exactly the same expiry.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	/* Avoid re-arming the hrtimer for an identical expiry. */
	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
642
/* Cancel a pending watchdog timer, waiting for a running callback. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
648
649static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
650{
651 struct hlist_head *h;
652 unsigned int i;
653
654 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
655
656 if (h != NULL) {
657 for (i = 0; i < n; i++)
658 INIT_HLIST_HEAD(&h[i]);
659 }
660 return h;
661}
662
/* Double the class hash table when its load factor exceeds 0.75.
 * The rehash itself happens under sch_tree_lock(); allocation failure
 * silently keeps the old (still correct) table.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket under the new mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
698
699int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
700{
701 unsigned int size = 4;
702
703 clhash->hash = qdisc_class_hash_alloc(size);
704 if (!clhash->hash)
705 return -ENOMEM;
706 clhash->hashsize = size;
707 clhash->hashmask = size - 1;
708 clhash->hashelems = 0;
709 return 0;
710}
711EXPORT_SYMBOL(qdisc_class_hash_init);
712
/* Free the bucket array; the caller must have emptied the hash first. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
718
719void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
720 struct Qdisc_class_common *cl)
721{
722 unsigned int h;
723
724 INIT_HLIST_NODE(&cl->hnode);
725 h = qdisc_class_hash(cl->classid, clhash->hashmask);
726 hlist_add_head(&cl->hnode, &clhash->hash[h]);
727 clhash->hashelems++;
728}
729EXPORT_SYMBOL(qdisc_class_hash_insert);
730
/* Remove @cl from @clhash and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
738
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	/* Try at most the full 0x8000-entry range before giving up. */
	int i = 0x8000;
	/* Last handle handed out; shared across devices, advances on
	 * every call (protected by the callers holding RTNL).
	 */
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrap back to the start of the range past TC_H_ROOT. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}
758
/* Propagate a decrease of @n packets / @len bytes from @sch up through
 * every ancestor qdisc, notifying parent classes that became empty so
 * they can deactivate.  Negative @n is permitted (an increase).
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	/* Only positive deltas count as drops for the stats. */
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (parentid == TC_H_ROOT)
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
807
808int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
809 void *type_data)
810{
811 struct net_device *dev = qdisc_dev(sch);
812 int err;
813
814 sch->flags &= ~TCQ_F_OFFLOADED;
815 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
816 return 0;
817
818 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
819 if (err == -EOPNOTSUPP)
820 return 0;
821
822 if (!err)
823 sch->flags |= TCQ_F_OFFLOADED;
824
825 return err;
826}
827EXPORT_SYMBOL(qdisc_offload_dump_helper);
828
/* Tell the driver about a graft of @new replacing @old under @sch, and
 * surface a failure through @extack only when it actually matters (i.e.
 * at least one of the qdiscs involved is offloaded).
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
857
/* Notify the driver that the device's root qdisc is being replaced. */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		/* Either side being ingress makes this an ingress graft. */
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
872
/* Fill a netlink message describing qdisc @q (kind, options, size table
 * and statistics).  Returns skb->len on success; on failure trims the
 * partially-built message off @skb and returns -1.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	/* Remember the tail so a failed fill can be trimmed away. */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the qdisc refcount for dumps. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
951
952static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
953{
954 if (q->flags & TCQ_F_BUILTIN)
955 return true;
956 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
957 return true;
958
959 return false;
960}
961
/* Send an rtnetlink notification for a qdisc change: a RTM_DELQDISC
 * record for @old and/or a RTM_NEWQDISC record for @new (flagged as a
 * replace when both are present).
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	/* Echo back to the requesting socket when one exists. */
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* Nothing dumped (both qdiscs ignorable) counts as failure. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
992
/* Emit the qdisc change notification, then drop the reference on the
 * old qdisc (which typically destroys it).
 */
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_put(old);
}
1003
1004static void qdisc_clear_nolock(struct Qdisc *sch)
1005{
1006 sch->flags &= ~TCQ_F_NOLOCK;
1007 if (!(sch->flags & TCQ_F_CPUSTATS))
1008 return;
1009
1010 free_percpu(sch->cpu_bstats);
1011 free_percpu(sch->cpu_qstats);
1012 sch->cpu_bstats = NULL;
1013 sch->cpu_qstats = NULL;
1014 sch->flags &= ~TCQ_F_CPUSTATS;
1015}
1016
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Grafting at the device root (egress or ingress). */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			/* Ingress has a single queue. */
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with an attach hook (e.g. mq-style) distribute
		 * themselves to the tx queues in ->attach() below.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per tx queue beyond the first. */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new);

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing parent qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		if (new &&
		    !(parent->flags & TCQ_F_MQROOT) &&
		    rcu_access_pointer(new->stab)) {
			NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
			return -EINVAL;
		}
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1125
/* Apply TCA_INGRESS_BLOCK / TCA_EGRESS_BLOCK attributes to @sch.
 * A block index of 0 is invalid, and each direction requires the qdisc
 * to implement the corresponding *_block_set op.
 */
static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}
1159
1160/*
1161 Allocate and initialize new qdisc.
1162
1163 Parameters are passed via opt.
1164 */
1165
/* Allocate and initialize a new qdisc of the kind named in tca[TCA_KIND]
 * and bind it to @dev_queue.
 *
 * @dev:	device the qdisc will be attached to
 * @dev_queue:	tx (or ingress) queue the qdisc services
 * @p:		parent qdisc, or NULL for a root
 * @parent:	parent classid recorded in sch->parent
 * @handle:	requested handle; 0 means "allocate one", TC_H_INGRESS is
 *		reserved for ingress/clsact
 * @tca:	parsed TCA_* attributes from the netlink request
 * @errp:	out-parameter for the error code on failure
 * @extack:	extended ack for error messages
 *
 * Returns the new qdisc, or NULL with *errp set.  -EAGAIN in *errp means
 * the RTNL lock was dropped for a module load and the caller must replay
 * the whole request.  Called under RTNL.
 */
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	/* Unknown kind: try to autoload the sch_<kind> module. */
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	/* qdisc_lookup_ops() took a module reference; qdisc_alloc() is
	 * expected to take a device reference on success (dropped via
	 * dev_put() in err_out3 below) -- TODO confirm against qdisc_alloc.
	 */
	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* Only qdiscs that flag themselves TCQ_F_INGRESS (ingress,
		 * clsact) may sit on the reserved ingress handle.
		 */
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to stay backward compatible with a userspace
	 * loophole that allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	/* Shared-block indexes must be set before ->init() runs. */
	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Pick the seqcount protecting the byte/packet counters:
		 * the root's for ordinary children, our own otherwise.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

	/* Error unwinding.  The labels below are ordered so that later
	 * failures fall through the teardown needed by earlier stages.
	 */
err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1328
/* Change the parameters of an existing qdisc in place.
 *
 * Order matters: ->change() runs first so a rejected configuration leaves
 * the old size table untouched; only then is the new stab swapped in and
 * the rate estimator replaced.  Returns 0 or a negative errno.  Called
 * under RTNL.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		/* Block indexes can only be set at creation time. */
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Swap in the new size table (or NULL if none was given) and drop
	 * the reference on the old one.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1374
/* Walker state for check_loop(): detects whether grafting would create a
 * cycle in the qdisc hierarchy.
 */
struct check_loop_arg {
	struct qdisc_walker w;	/* embedded walker; w.stop set on loop */
	struct Qdisc *p;	/* qdisc being grafted; finding it again = loop */
	int depth;		/* current recursion depth (capped at 7) */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1383
1384static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1385{
1386 struct check_loop_arg arg;
1387
1388 if (q->ops->cl_ops == NULL)
1389 return 0;
1390
1391 arg.w.stop = arg.w.skip = arg.w.count = 0;
1392 arg.w.fn = check_loop_fn;
1393 arg.depth = depth;
1394 arg.p = p;
1395 q->ops->cl_ops->walk(q, &arg.w);
1396 return arg.w.stop ? -ELOOP : 0;
1397}
1398
1399static int
1400check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1401{
1402 struct Qdisc *leaf;
1403 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1404 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1405
1406 leaf = cops->leaf(q, cl);
1407 if (leaf) {
1408 if (leaf == arg->p || arg->depth > 7)
1409 return -ELOOP;
1410 return check_loop(leaf, arg->p, arg->depth + 1);
1411 }
1412 return 0;
1413}
1414
/* Netlink attribute policy for RTM_*QDISC / RTM_*TCLASS requests.
 * Attributes not listed here (e.g. TCA_OPTIONS, TCA_XSTATS) are validated
 * by the individual qdisc/class implementations.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1425
1426/*
1427 * Delete/get qdisc.
1428 */
1429
/* Handle RTM_DELQDISC and RTM_GETQDISC netlink requests.
 *
 * The qdisc is located either via tcm_parent (classid of the parent, or
 * TC_H_ROOT / ingress major) or directly via tcm_handle when no parent is
 * given.  DEL ungrafts and destroys it; GET merely notifies the requester.
 * Returns 0 or a negative errno.  Called under RTNL.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deletion needs CAP_NET_ADMIN in the owning netns; GET does not. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Ordinary parent: find it, then its leaf. */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* A handle, if supplied, must agree with what we found. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Graft NULL in place of q: ungraft + notify + destroy. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1510
1511static bool req_create_or_replace(struct nlmsghdr *n)
1512{
1513 return (n->nlmsg_flags & NLM_F_CREATE &&
1514 n->nlmsg_flags & NLM_F_REPLACE);
1515}
1516
1517static bool req_create_exclusive(struct nlmsghdr *n)
1518{
1519 return (n->nlmsg_flags & NLM_F_CREATE &&
1520 n->nlmsg_flags & NLM_F_EXCL);
1521}
1522
1523static bool req_change(struct nlmsghdr *n)
1524{
1525 return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1526 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1527 !(n->nlmsg_flags & NLM_F_EXCL));
1528}
1529
1530/*
1531 * Create/change qdisc.
1532 */
/* Handle RTM_NEWQDISC: create, replace, or change a qdisc.
 *
 * Resolves the target from tcm_parent/tcm_handle, then decides between
 * three outcomes: graft an existing qdisc somewhere new, change the found
 * qdisc in place, or create a new one and graft it.  On -EAGAIN from
 * qdisc_create() (module autoload dropped RTNL) the whole request is
 * replayed from scratch.  Returns 0 or a negative errno.  Called under
 * RTNL.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				/* p = parent qdisc, q = its current leaf. */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* Caller named a specific qdisc to graft at
				 * this parent.
				 */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				/* Refuse grafts that would make q its own
				 * ancestor.
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * 1) change it or 2) create/graft new one.
				 * If the requested qdisc kind is different
				 * than the existing one, then we choose graft.
				 * If they are the same then this is "change"
				 * operation - just let it fallthrough..
				 *
				 * 1. We are allowed to create/graft only
				 * if the request is explicitly stating
				 * "please create if it doesn't exist".
				 *
				 * 2. If the request is to exclusive create
				 * then the qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * This will happen when for example tc
				 * utility issues a "change" command.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft.
				 */
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
create_n_graft2:
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1731
/* Dump @root and, when @recur, every qdisc in the device's qdisc hash,
 * appending RTM_NEWQDISC messages to @skb.
 *
 * *q_idx_p / s_q_idx implement netlink dump resumption: entries below
 * s_q_idx were sent in a previous pass and are skipped.  Returns 0 when
 * done, -1 when @skb filled up (the updated index lets the next call
 * resume).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	/* skb is full; report -1 so the caller stops and resumes later. */
	ret = -1;
	goto out;
}
1785
/* Netlink dump handler for RTM_GETQDISC: iterate every device in the
 * netns and dump its tx-root and ingress qdisc trees.
 *
 * cb->args[0] is the device index and cb->args[1] the per-device qdisc
 * index reached so far, so interrupted dumps resume where they stopped.
 * Returns skb->len (netlink dump convention) or a parse error.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume point: restart qdisc indexing per device. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1838
1839
1840
1841/************************************************
1842 * Traffic classes manipulation. *
1843 ************************************************/
1844
/* Build one traffic-class netlink message (@event, e.g. RTM_NEWTCLASS)
 * for class @cl of qdisc @q into @skb.
 *
 * Delegates the class body to the qdisc's ->dump()/->dump_stats().  On any
 * failure the partially-written message is trimmed off and -1 is returned;
 * on success returns skb->len.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Large class dumps can hog the CPU; be kind to the scheduler. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	/* tcm_handle is filled in by cl_ops->dump() below; seed it with the
	 * qdisc handle meanwhile.
	 */
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1890
1891static int tclass_notify(struct net *net, struct sk_buff *oskb,
1892 struct nlmsghdr *n, struct Qdisc *q,
1893 unsigned long cl, int event)
1894{
1895 struct sk_buff *skb;
1896 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1897 int err = 0;
1898
1899 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1900 if (!skb)
1901 return -ENOBUFS;
1902
1903 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1904 kfree_skb(skb);
1905 return -EINVAL;
1906 }
1907
1908 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1909 n->nlmsg_flags & NLM_F_ECHO);
1910 if (err > 0)
1911 err = 0;
1912 return err;
1913}
1914
1915static int tclass_del_notify(struct net *net,
1916 const struct Qdisc_class_ops *cops,
1917 struct sk_buff *oskb, struct nlmsghdr *n,
1918 struct Qdisc *q, unsigned long cl)
1919{
1920 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1921 struct sk_buff *skb;
1922 int err = 0;
1923
1924 if (!cops->delete)
1925 return -EOPNOTSUPP;
1926
1927 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1928 if (!skb)
1929 return -ENOBUFS;
1930
1931 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1932 RTM_DELTCLASS) < 0) {
1933 kfree_skb(skb);
1934 return -EINVAL;
1935 }
1936
1937 err = cops->delete(q, cl);
1938 if (err) {
1939 kfree_skb(skb);
1940 return err;
1941 }
1942
1943 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1944 n->nlmsg_flags & NLM_F_ECHO);
1945 if (err > 0)
1946 err = 0;
1947 return err;
1948}
1949
1950#ifdef CONFIG_NET_CLS
1951
/* Walker state for tcf_node_bind(): rebinds every filter that pointed at
 * a class to a new class id.
 */
struct tcf_bind_args {
	struct tcf_walker w;	/* embedded filter walker */
	unsigned long base;	/* class being walked (bind base) */
	unsigned long cl;	/* new class to bind to (0 = unbind) */
	u32 classid;		/* classid the filters reference */
};
1958
1959static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1960{
1961 struct tcf_bind_args *a = (void *)arg;
1962
1963 if (tp->ops->bind_class) {
1964 struct Qdisc *q = tcf_block_q(tp->chain->block);
1965
1966 sch_tree_lock(q);
1967 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1968 sch_tree_unlock(q);
1969 }
1970 return 0;
1971}
1972
/* Walker state for tc_bind_class_walker(): carries the rebinding target
 * across the per-class walk of a qdisc.
 */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* embedded class walker */
	unsigned long new_cl;	/* class to rebind filters to (0 = unbind) */
	u32 portid;		/* requesting socket's portid */
	u32 clid;		/* classid whose filters are rebound */
};
1979
/* Per-class callback for tc_bind_tclass(): walk every filter chain and
 * proto attached to class @cl and rebind matching filters via
 * tcf_node_bind().  Always returns 0 so the walk visits all classes.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	/* Classes without a filter block have nothing to rebind. */
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL, true);
		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}
2010
2011static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2012 unsigned long new_cl)
2013{
2014 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2015 struct tc_bind_class_args args = {};
2016
2017 if (!cops->tcf_block)
2018 return;
2019 args.portid = portid;
2020 args.clid = clid;
2021 args.new_cl = new_cl;
2022 args.w.fn = tc_bind_class_walker;
2023 q->ops->cl_ops->walk(q, &args.w);
2024}
2025
2026#else
2027
/* !CONFIG_NET_CLS: no classifiers exist, so rebinding is a no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
2032
2033#endif
2034
/* Handle RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS.
 *
 * Resolves the owning qdisc from the (possibly partial) tcm_parent and
 * tcm_handle majors, locates the class, then creates/changes, deletes, or
 * dumps it.  Returns 0 or a negative errno.  Called under RTNL.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class does not exist: only NEW+CREATE may proceed. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class's filters (rebind them to class 0) */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2171
/* Walker state for qdisc_class_dump(): target skb and dump callback. */
struct qdisc_dump_args {
	struct qdisc_walker w;		/* embedded class walker */
	struct sk_buff *skb;		/* dump message under construction */
	struct netlink_callback *cb;	/* provides portid/seq for the msg */
};
2177
2178static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2179 struct qdisc_walker *arg)
2180{
2181 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2182
2183 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2184 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2185 RTM_NEWTCLASS);
2186}
2187
/* Dump all classes of one qdisc @q into @skb.
 *
 * *t_p / s_t count qdiscs for netlink dump resumption; cb->args[1] holds
 * the per-qdisc class skip count.  Returns 0 on completion, -1 when @skb
 * filled up mid-walk.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip invisible/classless qdiscs, qdiscs already dumped in a
	 * previous pass, and qdiscs not matching a requested parent filter.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Fresh qdisc (past the resume point): reset per-qdisc walk state. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Remember how far we got in case the skb filled up. */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2216
/* Dump the classes of @root and, when @recur, of every qdisc in the
 * device's qdisc hash (or just the one matching tcm_parent when a parent
 * filter was given).  Returns 0 on completion, -1 when the skb filled up.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singletons have no device; !recur means the hash was (or will be)
	 * covered by another call.
	 */
	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		/* Parent filter: only the named qdisc needs dumping. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2247
/* Netlink dump handler for RTM_GETTCLASS: dump the classes of the tx-root
 * and ingress qdisc trees of the device named by tcm_ifindex.
 *
 * cb->args[0] stores the qdisc count reached so far for resumption.
 * Returns skb->len (netlink dump convention).
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a device reference; released via dev_put() below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2281
2282#ifdef CONFIG_PROC_FS
2283static int psched_show(struct seq_file *seq, void *v)
2284{
2285 seq_printf(seq, "%08x %08x %08x %08x\n",
2286 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2287 1000000,
2288 (u32)NSEC_PER_SEC / hrtimer_resolution);
2289
2290 return 0;
2291}
2292
2293static int __net_init psched_net_init(struct net *net)
2294{
2295 struct proc_dir_entry *e;
2296
2297 e = proc_create_single("psched", 0, net->proc_net, psched_show);
2298 if (e == NULL)
2299 return -ENOMEM;
2300
2301 return 0;
2302}
2303
/* Per-netns exit: remove /proc/net/psched. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2308#else
/* !CONFIG_PROC_FS: no /proc/net/psched, so the netns hooks are no-ops. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
2317#endif
2318
/* Per-netns lifecycle hooks for the /proc/net/psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2323
/* Packet scheduler subsystem init: register the per-netns proc hooks,
 * the built-in qdiscs, and the rtnetlink message handlers for qdisc and
 * class operations.  The register_qdisc() calls are for built-in qdiscs
 * and their return values are deliberately ignored.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&fq_codel_qdisc_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2353
2354subsys_initcall(pktsched_init);