Blame - src/kernel/linux/v4.14/net/sched/sch_generic.c - T103

blob: 091a9746627fa92cf26256cd0abaaa433cb81a5c [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* net/sched/sch_generic.c Generic packet scheduler routines.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License, or (at your option) any later version.
				8	*
				9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
				10	* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
				11	* - Ingress support
				12	*/
				13
				14	#include <linux/bitops.h>
				15	#include <linux/module.h>
				16	#include <linux/types.h>
				17	#include <linux/kernel.h>
				18	#include <linux/sched.h>
				19	#include <linux/string.h>
				20	#include <linux/errno.h>
				21	#include <linux/netdevice.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/rtnetlink.h>
				24	#include <linux/init.h>
				25	#include <linux/rcupdate.h>
				26	#include <linux/list.h>
				27	#include <linux/slab.h>
				28	#include <linux/if_vlan.h>
				29	#include <net/sch_generic.h>
				30	#include <net/pkt_sched.h>
				31	#include <net/dst.h>
				32	#include <trace/events/qdisc.h>
				33
				34	/* Qdisc to use by default */
				35	const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
				36	EXPORT_SYMBOL(default_qdisc_ops);
				37
				38	/* Main transmission queue. */
				39
				40	/* Modifications to data participating in scheduling must be protected with
				41	* qdisc_lock(qdisc) spinlock.
				42	*
				43	* The idea is the following:
				44	* - enqueue, dequeue are serialized via qdisc root lock
				45	* - ingress filtering is also serialized via qdisc root lock
				46	* - updates to tree and tree walking are only done under the rtnl mutex.
				47	*/
				48
				49	static inline int dev_requeue_skb(struct sk_buff skb, struct Qdisc q)
				50	{
				51	q->gso_skb = skb;
				52	q->qstats.requeues++;
				53	qdisc_qstats_backlog_inc(q, skb);
				54	q->q.qlen++; /* it's still part of the queue */
				55	__netif_schedule(q);
				56
				57	return 0;
				58	}
				59
				60	static void try_bulk_dequeue_skb(struct Qdisc *q,
				61	struct sk_buff *skb,
				62	const struct netdev_queue *txq,
				63	int *packets)
				64	{
				65	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
				66
				67	while (bytelimit > 0) {
				68	struct sk_buff *nskb = q->dequeue(q);
				69
				70	if (!nskb)
				71	break;
				72
				73	bytelimit -= nskb->len; /* covers GSO len */
				74	skb->next = nskb;
				75	skb = nskb;
				76	(packets)++; / GSO counts as one pkt */
				77	}
				78	skb->next = NULL;
				79	}
				80
				81	/* This variant of try_bulk_dequeue_skb() makes sure
				82	* all skbs in the chain are for the same txq
				83	*/
				84	static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				85	struct sk_buff *skb,
				86	int *packets)
				87	{
				88	int mapping = skb_get_queue_mapping(skb);
				89	struct sk_buff *nskb;
				90	int cnt = 0;
				91
				92	do {
				93	nskb = q->dequeue(q);
				94	if (!nskb)
				95	break;
				96	if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
				97	q->skb_bad_txq = nskb;
				98	qdisc_qstats_backlog_inc(q, nskb);
				99	q->q.qlen++;
				100	break;
				101	}
				102	skb->next = nskb;
				103	skb = nskb;
				104	} while (++cnt < 8);
				105	(*packets) += cnt;
				106	skb->next = NULL;
				107	}
				108
				109	/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
				110	* A requeued skb (via q->gso_skb) can also be a SKB list.
				111	*/
				112	static struct sk_buff dequeue_skb(struct Qdisc q, bool *validate,
				113	int *packets)
				114	{
				115	struct sk_buff *skb = q->gso_skb;
				116	const struct netdev_queue *txq = q->dev_queue;
				117
				118	*packets = 1;
				119	if (unlikely(skb)) {
				120	/* skb in gso_skb were already validated */
				121	*validate = false;
				122	/* check the reason of requeuing without tx lock first */
				123	txq = skb_get_tx_queue(txq->dev, skb);
				124	if (!netif_xmit_frozen_or_stopped(txq)) {
				125	q->gso_skb = NULL;
				126	qdisc_qstats_backlog_dec(q, skb);
				127	q->q.qlen--;
				128	} else
				129	skb = NULL;
				130	goto trace;
				131	}
				132	*validate = true;
				133	skb = q->skb_bad_txq;
				134	if (unlikely(skb)) {
				135	/* check the reason of requeuing without tx lock first */
				136	txq = skb_get_tx_queue(txq->dev, skb);
				137	if (!netif_xmit_frozen_or_stopped(txq)) {
				138	q->skb_bad_txq = NULL;
				139	qdisc_qstats_backlog_dec(q, skb);
				140	q->q.qlen--;
				141	goto bulk;
				142	}
				143	skb = NULL;
				144	goto trace;
				145	}
				146	if (!(q->flags & TCQ_F_ONETXQUEUE) \|\|
				147	!netif_xmit_frozen_or_stopped(txq))
				148	skb = q->dequeue(q);
				149	if (skb) {
				150	bulk:
				151	if (qdisc_may_bulk(q))
				152	try_bulk_dequeue_skb(q, skb, txq, packets);
				153	else
				154	try_bulk_dequeue_skb_slow(q, skb, packets);
				155	}
				156	trace:
				157	trace_qdisc_dequeue(q, txq, *packets, skb);
				158	return skb;
				159	}
				160
				161	/*
				162	* Transmit possibly several skbs, and handle the return status as
				163	* required. Owning running seqcount bit guarantees that
				164	* only one CPU can execute this function.
				165	*
				166	* Returns to the caller:
				167	* 0 - queue is empty or throttled.
				168	* >0 - queue is not empty.
				169	*/
				170	int sch_direct_xmit(struct sk_buff skb, struct Qdisc q,
				171	struct net_device dev, struct netdev_queue txq,
				172	spinlock_t *root_lock, bool validate)
				173	{
				174	int ret = NETDEV_TX_BUSY;
				175
				176	/* And release qdisc */
				177	spin_unlock(root_lock);
				178
				179	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
				180	if (validate)
				181	skb = validate_xmit_skb_list(skb, dev);
				182
				183	if (likely(skb)) {
				184	HARD_TX_LOCK(dev, txq, smp_processor_id());
				185	if (!netif_xmit_frozen_or_stopped(txq))
				186	skb = dev_hard_start_xmit(skb, dev, txq, &ret);
				187
				188	HARD_TX_UNLOCK(dev, txq);
				189	} else {
				190	spin_lock(root_lock);
				191	return qdisc_qlen(q);
				192	}
				193	spin_lock(root_lock);
				194
				195	if (dev_xmit_complete(ret)) {
				196	/* Driver sent out skb successfully or skb was consumed */
				197	ret = qdisc_qlen(q);
				198	} else {
				199	/* Driver returned NETDEV_TX_BUSY - requeue skb */
				200	if (unlikely(ret != NETDEV_TX_BUSY))
				201	net_warn_ratelimited("BUG %s code %d qlen %d\n",
				202	dev->name, ret, q->q.qlen);
				203
				204	ret = dev_requeue_skb(skb, q);
				205	}
				206
				207	if (ret && netif_xmit_frozen_or_stopped(txq))
				208	ret = 0;
				209
				210	return ret;
				211	}
				212
				213	/*
				214	* NOTE: Called under qdisc_lock(q) with locally disabled BH.
				215	*
				216	* running seqcount guarantees only one CPU can process
				217	* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
				218	* this queue.
				219	*
				220	* netif_tx_lock serializes accesses to device driver.
				221	*
				222	* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
				223	* if one is grabbed, another must be free.
				224	*
				225	* Note, that this procedure can be called by a watchdog timer
				226	*
				227	* Returns to the caller:
				228	* 0 - queue is empty or throttled.
				229	* >0 - queue is not empty.
				230	*
				231	*/
				232	static inline int qdisc_restart(struct Qdisc q, int packets)
				233	{
				234	struct netdev_queue *txq;
				235	struct net_device *dev;
				236	spinlock_t *root_lock;
				237	struct sk_buff *skb;
				238	bool validate;
				239
				240	/* Dequeue packet */
				241	skb = dequeue_skb(q, &validate, packets);
				242	if (unlikely(!skb))
				243	return 0;
				244
				245	root_lock = qdisc_lock(q);
				246	dev = qdisc_dev(q);
				247	txq = skb_get_tx_queue(dev, skb);
				248
				249	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
				250	}
				251
				252	void __qdisc_run(struct Qdisc *q)
				253	{
				254	int quota = dev_tx_weight;
				255	int packets;
				256
				257	while (qdisc_restart(q, &packets)) {
				258	/*
				259	* Ordered by possible occurrence: Postpone processing if
				260	* 1. we've exceeded packet quota
				261	* 2. another process needs the CPU;
				262	*/
				263	quota -= packets;
				264	if (quota <= 0 \|\| need_resched()) {
				265	__netif_schedule(q);
				266	break;
				267	}
				268	}
				269
				270	qdisc_run_end(q);
				271	}
				272
				273	unsigned long dev_trans_start(struct net_device *dev)
				274	{
				275	unsigned long val, res;
				276	unsigned int i;
				277
				278	if (is_vlan_dev(dev))
				279	dev = vlan_dev_real_dev(dev);
				280	res = netdev_get_tx_queue(dev, 0)->trans_start;
				281	for (i = 1; i < dev->num_tx_queues; i++) {
				282	val = netdev_get_tx_queue(dev, i)->trans_start;
				283	if (val && time_after(val, res))
				284	res = val;
				285	}
				286
				287	return res;
				288	}
				289	EXPORT_SYMBOL(dev_trans_start);
				290
				291	static void dev_watchdog(unsigned long arg)
				292	{
				293	struct net_device dev = (struct net_device )arg;
				294
				295	netif_tx_lock(dev);
				296	if (!qdisc_tx_is_noop(dev)) {
				297	if (netif_device_present(dev) &&
				298	netif_running(dev) &&
				299	netif_carrier_ok(dev)) {
				300	int some_queue_timedout = 0;
				301	unsigned int i;
				302	unsigned long trans_start;
				303
				304	for (i = 0; i < dev->num_tx_queues; i++) {
				305	struct netdev_queue *txq;
				306
				307	txq = netdev_get_tx_queue(dev, i);
				308	trans_start = txq->trans_start;
				309	if (netif_xmit_stopped(txq) &&
				310	time_after(jiffies, (trans_start +
				311	dev->watchdog_timeo))) {
				312	some_queue_timedout = 1;
				313	txq->trans_timeout++;
				314	break;
				315	}
				316	}
				317
				318	if (some_queue_timedout) {
				319	WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				320	dev->name, netdev_drivername(dev), i);
				321	dev->netdev_ops->ndo_tx_timeout(dev);
				322	}
				323	if (!mod_timer(&dev->watchdog_timer,
				324	round_jiffies(jiffies +
				325	dev->watchdog_timeo)))
				326	dev_hold(dev);
				327	}
				328	}
				329	netif_tx_unlock(dev);
				330
				331	dev_put(dev);
				332	}
				333
				334	void __netdev_watchdog_up(struct net_device *dev)
				335	{
				336	if (dev->netdev_ops->ndo_tx_timeout) {
				337	if (dev->watchdog_timeo <= 0)
				338	dev->watchdog_timeo = 5*HZ;
				339	if (!mod_timer(&dev->watchdog_timer,
				340	round_jiffies(jiffies + dev->watchdog_timeo)))
				341	dev_hold(dev);
				342	}
				343	}
				344	EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
				345
				/* File-local alias: starts the tx watchdog via __netdev_watchdog_up(). */
				static void dev_watchdog_up(struct net_device *dev)
				{
					__netdev_watchdog_up(dev);
				}
				350
				351	static void dev_watchdog_down(struct net_device *dev)
				352	{
				353	netif_tx_lock_bh(dev);
				354	if (del_timer(&dev->watchdog_timer))
				355	dev_put(dev);
				356	netif_tx_unlock_bh(dev);
				357	}
				358
				359	/**
				360	* netif_carrier_on - set carrier
				361	* @dev: network device
				362	*
				363	* Device has detected that carrier.
				364	*/
				365	void netif_carrier_on(struct net_device *dev)
				366	{
				367	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
				368	if (dev->reg_state == NETREG_UNINITIALIZED)
				369	return;
				370	atomic_inc(&dev->carrier_changes);
				371	linkwatch_fire_event(dev);
				372	if (netif_running(dev))
				373	__netdev_watchdog_up(dev);
				374	}
				375	}
				376	EXPORT_SYMBOL(netif_carrier_on);
				377
				378	/**
				379	* netif_carrier_off - clear carrier
				380	* @dev: network device
				381	*
				382	* Device has detected loss of carrier.
				383	*/
				384	void netif_carrier_off(struct net_device *dev)
				385	{
				386	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
				387	if (dev->reg_state == NETREG_UNINITIALIZED)
				388	return;
				389	atomic_inc(&dev->carrier_changes);
				390	linkwatch_fire_event(dev);
				391	}
				392	}
				393	EXPORT_SYMBOL(netif_carrier_off);
				394
				395	/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
				396	under all circumstances. It is difficult to invent anything faster or
				397	cheaper.
				398	*/
				399
				400	static int noop_enqueue(struct sk_buff skb, struct Qdisc qdisc,
				401	struct sk_buff **to_free)
				402	{
				403	__qdisc_drop(skb, to_free);
				404	return NET_XMIT_CN;
				405	}
				406
				407	static struct sk_buff noop_dequeue(struct Qdisc qdisc)
				408	{
				409	return NULL;
				410	}
				411
				412	struct Qdisc_ops noop_qdisc_ops __read_mostly = {
				413	.id = "noop",
				414	.priv_size = 0,
				415	.enqueue = noop_enqueue,
				416	.dequeue = noop_dequeue,
				417	.peek = noop_dequeue,
				418	.owner = THIS_MODULE,
				419	};
				420
				421	static struct netdev_queue noop_netdev_queue = {
				422	.qdisc = &noop_qdisc,
				423	.qdisc_sleeping = &noop_qdisc,
				424	};
				425
				426	struct Qdisc noop_qdisc = {
				427	.enqueue = noop_enqueue,
				428	.dequeue = noop_dequeue,
				429	.flags = TCQ_F_BUILTIN,
				430	.ops = &noop_qdisc_ops,
				431	.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
				432	.dev_queue = &noop_netdev_queue,
				433	.running = SEQCNT_ZERO(noop_qdisc.running),
				434	.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
				435	};
				436	EXPORT_SYMBOL(noop_qdisc);
				437
				438	static int noqueue_init(struct Qdisc qdisc, struct nlattr opt)
				439	{
				440	/* register_qdisc() assigns a default of noop_enqueue if unset,
				441	* but __dev_queue_xmit() treats noqueue only as such
				442	* if this is NULL - so clear it here. */
				443	qdisc->enqueue = NULL;
				444	return 0;
				445	}
				446
				447	struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
				448	.id = "noqueue",
				449	.priv_size = 0,
				450	.init = noqueue_init,
				451	.enqueue = noop_enqueue,
				452	.dequeue = noop_dequeue,
				453	.peek = noop_dequeue,
				454	.owner = THIS_MODULE,
				455	};
				456
				457	static const u8 prio2band[TC_PRIO_MAX + 1] = {
				458	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
				459	};
				460
				461	/* 3-band FIFO queue: old style, but should be a bit faster than
				462	generic prio+fifo combination.
				463	*/
				464
				465	#define PFIFO_FAST_BANDS 3
				466
				467	/*
				468	* Private data for a pfifo_fast scheduler containing:
				469	* - queues for the three band
				470	* - bitmap indicating which of the bands contain skbs
				471	*/
				472	struct pfifo_fast_priv {
				473	u32 bitmap;
				474	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
				475	};
				476
				/*
				 * Lookup table from the 3-bit band bitmap to the lowest-numbered band
				 * that has an skb queued:
				 *	bitmap=0 -> -1 (no skbs on any band)
				 *	bitmap=1 -> band 0
				 *	bitmap=7 -> band 0 (all three bands populated), etc.
				 */
				static const int bitmap2band[] = {
					-1,	/* 000 */
					0,	/* 001 */
					1,	/* 010 */
					0,	/* 011 */
					2,	/* 100 */
					0,	/* 101 */
					1,	/* 110 */
					0,	/* 111 */
				};
				484
				485	static inline struct qdisc_skb_head band2list(struct pfifo_fast_priv priv,
				486	int band)
				487	{
				488	return priv->q + band;
				489	}
				490
				491	static int pfifo_fast_enqueue(struct sk_buff skb, struct Qdisc qdisc,
				492	struct sk_buff **to_free)
				493	{
				494	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
				495	int band = prio2band[skb->priority & TC_PRIO_MAX];
				496	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
				497	struct qdisc_skb_head *list = band2list(priv, band);
				498
				499	priv->bitmap \|= (1 << band);
				500	qdisc->q.qlen++;
				501	return __qdisc_enqueue_tail(skb, qdisc, list);
				502	}
				503
				504	return qdisc_drop(skb, qdisc, to_free);
				505	}
				506
				507	static struct sk_buff pfifo_fast_dequeue(struct Qdisc qdisc)
				508	{
				509	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
				510	int band = bitmap2band[priv->bitmap];
				511
				512	if (likely(band >= 0)) {
				513	struct qdisc_skb_head *qh = band2list(priv, band);
				514	struct sk_buff *skb = __qdisc_dequeue_head(qh);
				515
				516	if (likely(skb != NULL)) {
				517	qdisc_qstats_backlog_dec(qdisc, skb);
				518	qdisc_bstats_update(qdisc, skb);
				519	}
				520
				521	qdisc->q.qlen--;
				522	if (qh->qlen == 0)
				523	priv->bitmap &= ~(1 << band);
				524
				525	return skb;
				526	}
				527
				528	return NULL;
				529	}
				530
				531	static struct sk_buff pfifo_fast_peek(struct Qdisc qdisc)
				532	{
				533	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
				534	int band = bitmap2band[priv->bitmap];
				535
				536	if (band >= 0) {
				537	struct qdisc_skb_head *qh = band2list(priv, band);
				538
				539	return qh->head;
				540	}
				541
				542	return NULL;
				543	}
				544
				545	static void pfifo_fast_reset(struct Qdisc *qdisc)
				546	{
				547	int prio;
				548	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
				549
				550	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
				551	__qdisc_reset_queue(band2list(priv, prio));
				552
				553	priv->bitmap = 0;
				554	qdisc->qstats.backlog = 0;
				555	qdisc->q.qlen = 0;
				556	}
				557
				558	static int pfifo_fast_dump(struct Qdisc qdisc, struct sk_buff skb)
				559	{
				560	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
				561
				562	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
				563	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
				564	goto nla_put_failure;
				565	return skb->len;
				566
				567	nla_put_failure:
				568	return -1;
				569	}
				570
				571	static int pfifo_fast_init(struct Qdisc qdisc, struct nlattr opt)
				572	{
				573	int prio;
				574	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
				575
				576	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
				577	qdisc_skb_head_init(band2list(priv, prio));
				578
				579	/* Can by-pass the queue discipline */
				580	qdisc->flags \|= TCQ_F_CAN_BYPASS;
				581	return 0;
				582	}
				583
				584	struct Qdisc_ops pfifo_fast_ops __read_mostly = {
				585	.id = "pfifo_fast",
				586	.priv_size = sizeof(struct pfifo_fast_priv),
				587	.enqueue = pfifo_fast_enqueue,
				588	.dequeue = pfifo_fast_dequeue,
				589	.peek = pfifo_fast_peek,
				590	.init = pfifo_fast_init,
				591	.reset = pfifo_fast_reset,
				592	.dump = pfifo_fast_dump,
				593	.owner = THIS_MODULE,
				594	};
				595	EXPORT_SYMBOL(pfifo_fast_ops);
				596
				597	static struct lock_class_key qdisc_tx_busylock;
				598	static struct lock_class_key qdisc_running_key;
				599
				600	struct Qdisc qdisc_alloc(struct netdev_queue dev_queue,
				601	const struct Qdisc_ops *ops)
				602	{
				603	void *p;
				604	struct Qdisc *sch;
				605	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
				606	int err = -ENOBUFS;
				607	struct net_device *dev = dev_queue->dev;
				608
				609	p = kzalloc_node(size, GFP_KERNEL,
				610	netdev_queue_numa_node_read(dev_queue));
				611
				612	if (!p)
				613	goto errout;
				614	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
				615	/* if we got non aligned memory, ask more and do alignment ourself */
				616	if (sch != p) {
				617	kfree(p);
				618	p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				619	netdev_queue_numa_node_read(dev_queue));
				620	if (!p)
				621	goto errout;
				622	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
				623	sch->padded = (char ) sch - (char ) p;
				624	}
				625	qdisc_skb_head_init(&sch->q);
				626	spin_lock_init(&sch->q.lock);
				627
				628	spin_lock_init(&sch->busylock);
				629	lockdep_set_class(&sch->busylock,
				630	dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
				631
				632	seqcount_init(&sch->running);
				633	lockdep_set_class(&sch->running,
				634	dev->qdisc_running_key ?: &qdisc_running_key);
				635
				636	sch->ops = ops;
				637	sch->enqueue = ops->enqueue;
				638	sch->dequeue = ops->dequeue;
				639	sch->dev_queue = dev_queue;
				640	dev_hold(dev);
				641	refcount_set(&sch->refcnt, 1);
				642
				643	return sch;
				644	errout:
				645	return ERR_PTR(err);
				646	}
				647
				648	struct Qdisc qdisc_create_dflt(struct netdev_queue dev_queue,
				649	const struct Qdisc_ops *ops,
				650	unsigned int parentid)
				651	{
				652	struct Qdisc *sch;
				653
				654	if (!try_module_get(ops->owner))
				655	return NULL;
				656
				657	sch = qdisc_alloc(dev_queue, ops);
				658	if (IS_ERR(sch)) {
				659	module_put(ops->owner);
				660	return NULL;
				661	}
				662	sch->parent = parentid;
				663
				664	if (!ops->init \|\| ops->init(sch, NULL) == 0)
				665	return sch;
				666
				667	qdisc_destroy(sch);
				668	return NULL;
				669	}
				670	EXPORT_SYMBOL(qdisc_create_dflt);
				671
				672	/* Under qdisc_lock(qdisc) and BH! */
				673
				674	void qdisc_reset(struct Qdisc *qdisc)
				675	{
				676	const struct Qdisc_ops *ops = qdisc->ops;
				677
				678	if (ops->reset)
				679	ops->reset(qdisc);
				680
				681	kfree_skb(qdisc->skb_bad_txq);
				682	qdisc->skb_bad_txq = NULL;
				683
				684	if (qdisc->gso_skb) {
				685	kfree_skb_list(qdisc->gso_skb);
				686	qdisc->gso_skb = NULL;
				687	}
				688	qdisc->q.qlen = 0;
				689	qdisc->qstats.backlog = 0;
				690	}
				691	EXPORT_SYMBOL(qdisc_reset);
				692
				693	static void qdisc_rcu_free(struct rcu_head *head)
				694	{
				695	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
				696
				697	if (qdisc_is_percpu_stats(qdisc)) {
				698	free_percpu(qdisc->cpu_bstats);
				699	free_percpu(qdisc->cpu_qstats);
				700	}
				701
				702	kfree((char *) qdisc - qdisc->padded);
				703	}
				704
				705	void qdisc_destroy(struct Qdisc *qdisc)
				706	{
				707	const struct Qdisc_ops *ops;
				708
				709	if (!qdisc)
				710	return;
				711	ops = qdisc->ops;
				712
				713	if (qdisc->flags & TCQ_F_BUILTIN \|\|
				714	!refcount_dec_and_test(&qdisc->refcnt))
				715	return;
				716
				717	#ifdef CONFIG_NET_SCHED
				718	qdisc_hash_del(qdisc);
				719
				720	qdisc_put_stab(rtnl_dereference(qdisc->stab));
				721	#endif
				722	gen_kill_estimator(&qdisc->rate_est);
				723	if (ops->reset)
				724	ops->reset(qdisc);
				725	if (ops->destroy)
				726	ops->destroy(qdisc);
				727
				728	module_put(ops->owner);
				729	dev_put(qdisc_dev(qdisc));
				730
				731	kfree_skb_list(qdisc->gso_skb);
				732	kfree_skb(qdisc->skb_bad_txq);
				733	/*
				734	* gen_estimator est_timer() might access qdisc->q.lock,
				735	* wait a RCU grace period before freeing qdisc.
				736	*/
				737	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
				738	}
				739	EXPORT_SYMBOL(qdisc_destroy);
				740
				741	/* Attach toplevel qdisc to device queue. */
				742	struct Qdisc dev_graft_qdisc(struct netdev_queue dev_queue,
				743	struct Qdisc *qdisc)
				744	{
				745	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
				746	spinlock_t *root_lock;
				747
				748	root_lock = qdisc_lock(oqdisc);
				749	spin_lock_bh(root_lock);
				750
				751	/* ... and graft new one */
				752	if (qdisc == NULL)
				753	qdisc = &noop_qdisc;
				754	dev_queue->qdisc_sleeping = qdisc;
				755	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
				756
				757	spin_unlock_bh(root_lock);
				758
				759	return oqdisc;
				760	}
				761	EXPORT_SYMBOL(dev_graft_qdisc);
				762
				763	static void attach_one_default_qdisc(struct net_device *dev,
				764	struct netdev_queue *dev_queue,
				765	void *_unused)
				766	{
				767	struct Qdisc *qdisc;
				768	const struct Qdisc_ops *ops = default_qdisc_ops;
				769
				770	if (dev->priv_flags & IFF_NO_QUEUE)
				771	ops = &noqueue_qdisc_ops;
				772
				773	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
				774	if (!qdisc) {
				775	netdev_info(dev, "activation failed\n");
				776	return;
				777	}
				778	if (!netif_is_multiqueue(dev))
				779	qdisc->flags \|= TCQ_F_ONETXQUEUE \| TCQ_F_NOPARENT;
				780	dev_queue->qdisc_sleeping = qdisc;
				781	}
				782
				783	static void attach_default_qdiscs(struct net_device *dev)
				784	{
				785	struct netdev_queue *txq;
				786	struct Qdisc *qdisc;
				787
				788	txq = netdev_get_tx_queue(dev, 0);
				789
				790	if (!netif_is_multiqueue(dev) \|\|
				791	dev->priv_flags & IFF_NO_QUEUE) {
				792	netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
				793	dev->qdisc = txq->qdisc_sleeping;
				794	qdisc_refcount_inc(dev->qdisc);
				795	} else {
				796	qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
				797	if (qdisc) {
				798	dev->qdisc = qdisc;
				799	qdisc->ops->attach(qdisc);
				800	}
				801	}
				802	#ifdef CONFIG_NET_SCHED
				803	if (dev->qdisc != &noop_qdisc)
				804	qdisc_hash_add(dev->qdisc, false);
				805	#endif
				806	}
				807
				808	static void transition_one_qdisc(struct net_device *dev,
				809	struct netdev_queue *dev_queue,
				810	void *_need_watchdog)
				811	{
				812	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
				813	int *need_watchdog_p = _need_watchdog;
				814
				815	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
				816	clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
				817
				818	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
				819	if (need_watchdog_p) {
				820	dev_queue->trans_start = 0;
				821	*need_watchdog_p = 1;
				822	}
				823	}
				824
				825	void dev_activate(struct net_device *dev)
				826	{
				827	int need_watchdog;
				828
				829	/* No queueing discipline is attached to device;
				830	* create default one for devices, which need queueing
				831	* and noqueue_qdisc for virtual interfaces
				832	*/
				833
				834	if (dev->qdisc == &noop_qdisc)
				835	attach_default_qdiscs(dev);
				836
				837	if (!netif_carrier_ok(dev))
				838	/* Delay activation until next carrier-on event */
				839	return;
				840
				841	need_watchdog = 0;
				842	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
				843	if (dev_ingress_queue(dev))
				844	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
				845
				846	if (need_watchdog) {
				847	netif_trans_update(dev);
				848	dev_watchdog_up(dev);
				849	}
				850	}
				851	EXPORT_SYMBOL(dev_activate);
				852
				853	static void dev_deactivate_queue(struct net_device *dev,
				854	struct netdev_queue *dev_queue,
				855	void *_qdisc_default)
				856	{
				857	struct Qdisc *qdisc_default = _qdisc_default;
				858	struct Qdisc *qdisc;
				859
				860	qdisc = rtnl_dereference(dev_queue->qdisc);
				861	if (qdisc) {
				862	spin_lock_bh(qdisc_lock(qdisc));
				863
				864	if (!(qdisc->flags & TCQ_F_BUILTIN))
				865	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
				866
				867	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
				868	qdisc_reset(qdisc);
				869
				870	spin_unlock_bh(qdisc_lock(qdisc));
				871	}
				872	}
				873
				874	static bool some_qdisc_is_busy(struct net_device *dev)
				875	{
				876	unsigned int i;
				877
				878	for (i = 0; i < dev->num_tx_queues; i++) {
				879	struct netdev_queue *dev_queue;
				880	spinlock_t *root_lock;
				881	struct Qdisc *q;
				882	int val;
				883
				884	dev_queue = netdev_get_tx_queue(dev, i);
				885	q = dev_queue->qdisc_sleeping;
				886	root_lock = qdisc_lock(q);
				887
				888	spin_lock_bh(root_lock);
				889
				890	val = (qdisc_is_running(q) \|\|
				891	test_bit(__QDISC_STATE_SCHED, &q->state));
				892
				893	spin_unlock_bh(root_lock);
				894
				895	if (val)
				896	return true;
				897	}
				898	return false;
				899	}
				900
				901	static void dev_qdisc_reset(struct net_device *dev,
				902	struct netdev_queue *dev_queue,
				903	void *none)
				904	{
				905	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
				906
				907	if (qdisc)
				908	qdisc_reset(qdisc);
				909	}
				910
				911	/**
				912	* dev_deactivate_many - deactivate transmissions on several devices
				913	* @head: list of devices to deactivate
				914	*
				915	* This function returns only when all outstanding transmissions
				916	* have completed, unless all devices are in dismantle phase.
				917	*/
				918	void dev_deactivate_many(struct list_head *head)
				919	{
				920	struct net_device *dev;
				921
				922	list_for_each_entry(dev, head, close_list) {
				923	netdev_for_each_tx_queue(dev, dev_deactivate_queue,
				924	&noop_qdisc);
				925	if (dev_ingress_queue(dev))
				926	dev_deactivate_queue(dev, dev_ingress_queue(dev),
				927	&noop_qdisc);
				928
				929	dev_watchdog_down(dev);
				930	}
				931
				932	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
				933	* This is avoided if all devices are in dismantle phase :
				934	* Caller will call synchronize_net() for us
				935	*/
				936	synchronize_net();
				937
				938	/* Wait for outstanding qdisc_run calls. */
				939	list_for_each_entry(dev, head, close_list) {
				940	while (some_qdisc_is_busy(dev))
				941	yield();
				942	/* The new qdisc is assigned at this point so we can safely
				943	* unwind stale skb lists and qdisc statistics
				944	*/
				945	netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
				946	if (dev_ingress_queue(dev))
				947	dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
				948	}
				949	}
				950
				951	void dev_deactivate(struct net_device *dev)
				952	{
				953	LIST_HEAD(single);
				954
				955	list_add(&dev->close_list, &single);
				956	dev_deactivate_many(&single);
				957	list_del(&single);
				958	}
				959	EXPORT_SYMBOL(dev_deactivate);
				960
				961	static void dev_init_scheduler_queue(struct net_device *dev,
				962	struct netdev_queue *dev_queue,
				963	void *_qdisc)
				964	{
				965	struct Qdisc *qdisc = _qdisc;
				966
				967	rcu_assign_pointer(dev_queue->qdisc, qdisc);
				968	dev_queue->qdisc_sleeping = qdisc;
				969	}
				970
				971	void dev_init_scheduler(struct net_device *dev)
				972	{
				973	dev->qdisc = &noop_qdisc;
				974	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
				975	if (dev_ingress_queue(dev))
				976	dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
				977
				978	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
				979	}
				980
				981	static void shutdown_scheduler_queue(struct net_device *dev,
				982	struct netdev_queue *dev_queue,
				983	void *_qdisc_default)
				984	{
				985	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
				986	struct Qdisc *qdisc_default = _qdisc_default;
				987
				988	if (qdisc) {
				989	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
				990	dev_queue->qdisc_sleeping = qdisc_default;
				991
				992	qdisc_destroy(qdisc);
				993	}
				994	}
				995
				996	void dev_shutdown(struct net_device *dev)
				997	{
				998	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
				999	if (dev_ingress_queue(dev))
				1000	shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
				1001	qdisc_destroy(dev->qdisc);
				1002	dev->qdisc = &noop_qdisc;
				1003
				1004	WARN_ON(timer_pending(&dev->watchdog_timer));
				1005	}
				1006
				1007	void psched_ratecfg_precompute(struct psched_ratecfg *r,
				1008	const struct tc_ratespec *conf,
				1009	u64 rate64)
				1010	{
				1011	memset(r, 0, sizeof(*r));
				1012	r->overhead = conf->overhead;
				1013	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
				1014	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
				1015	r->mult = 1;
				1016	/*
				1017	* The deal here is to replace a divide by a reciprocal one
				1018	* in fast path (a reciprocal divide is a multiply and a shift)
				1019	*
				1020	* Normal formula would be :
				1021	* time_in_ns = (NSEC_PER_SEC * len) / rate_bps
				1022	*
				1023	* We compute mult/shift to use instead :
				1024	* time_in_ns = (len * mult) >> shift;
				1025	*
				1026	* We try to get the highest possible mult value for accuracy,
				1027	* but have to make sure no overflows will ever happen.
				1028	*/
				1029	if (r->rate_bytes_ps > 0) {
				1030	u64 factor = NSEC_PER_SEC;
				1031
				1032	for (;;) {
				1033	r->mult = div64_u64(factor, r->rate_bytes_ps);
				1034	if (r->mult & (1U << 31) \|\| factor & (1ULL << 63))
				1035	break;
				1036	factor <<= 1;
				1037	r->shift++;
				1038	}
				1039	}
				1040	}
				1041	EXPORT_SYMBOL(psched_ratecfg_precompute);