Blame - src/kernel/linux/v4.19/net/sched/sch_generic.c - T800

blob: 488e4fcd34a6fc05d71bdecda70bab78574b57ba [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* net/sched/sch_generic.c Generic packet scheduler routines.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License, or (at your option) any later version.
				8	*
				9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
				10	* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
				11	* - Ingress support
				12	*/
				13
				14	#include <linux/bitops.h>
				15	#include <linux/module.h>
				16	#include <linux/types.h>
				17	#include <linux/kernel.h>
				18	#include <linux/sched.h>
				19	#include <linux/string.h>
				20	#include <linux/errno.h>
				21	#include <linux/netdevice.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/rtnetlink.h>
				24	#include <linux/init.h>
				25	#include <linux/rcupdate.h>
				26	#include <linux/list.h>
				27	#include <linux/slab.h>
				28	#include <linux/if_vlan.h>
				29	#include <linux/skb_array.h>
				30	#include <linux/if_macvlan.h>
				31	#include <net/sch_generic.h>
				32	#include <net/pkt_sched.h>
				33	#include <net/dst.h>
				34	#include <trace/events/qdisc.h>
				35	#include <net/xfrm.h>
				36
				37	/* Qdisc to use by default */
				38	const struct Qdisc_ops *default_qdisc_ops = &fq_codel_qdisc_ops;
				39	EXPORT_SYMBOL(default_qdisc_ops);
				40
				41	/* Main transmission queue. */
				42
				43	/* Modifications to data participating in scheduling must be protected with
				44	* qdisc_lock(qdisc) spinlock.
				45	*
				46	* The idea is the following:
				47	* - enqueue, dequeue are serialized via qdisc root lock
				48	* - ingress filtering is also serialized via qdisc root lock
				49	* - updates to tree and tree walking are only done under the rtnl mutex.
				50	*/
				51
				52	#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
				53
				54	static inline struct sk_buff __skb_dequeue_bad_txq(struct Qdisc q)
				55	{
				56	const struct netdev_queue *txq = q->dev_queue;
				57	spinlock_t *lock = NULL;
				58	struct sk_buff *skb;
				59
				60	if (q->flags & TCQ_F_NOLOCK) {
				61	lock = qdisc_lock(q);
				62	spin_lock(lock);
				63	}
				64
				65	skb = skb_peek(&q->skb_bad_txq);
				66	if (skb) {
				67	/* check the reason of requeuing without tx lock first */
				68	txq = skb_get_tx_queue(txq->dev, skb);
				69	if (!netif_xmit_frozen_or_stopped(txq)) {
				70	skb = __skb_dequeue(&q->skb_bad_txq);
				71	if (qdisc_is_percpu_stats(q)) {
				72	qdisc_qstats_cpu_backlog_dec(q, skb);
				73	qdisc_qstats_atomic_qlen_dec(q);
				74	} else {
				75	qdisc_qstats_backlog_dec(q, skb);
				76	q->q.qlen--;
				77	}
				78	} else {
				79	skb = SKB_XOFF_MAGIC;
				80	}
				81	}
				82
				83	if (lock)
				84	spin_unlock(lock);
				85
				86	return skb;
				87	}
				88
				89	static inline struct sk_buff qdisc_dequeue_skb_bad_txq(struct Qdisc q)
				90	{
				91	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
				92
				93	if (unlikely(skb))
				94	skb = __skb_dequeue_bad_txq(q);
				95
				96	return skb;
				97	}
				98
				99	static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
				100	struct sk_buff *skb)
				101	{
				102	spinlock_t *lock = NULL;
				103
				104	if (q->flags & TCQ_F_NOLOCK) {
				105	lock = qdisc_lock(q);
				106	spin_lock(lock);
				107	}
				108
				109	__skb_queue_tail(&q->skb_bad_txq, skb);
				110
				111	if (qdisc_is_percpu_stats(q)) {
				112	qdisc_qstats_cpu_backlog_inc(q, skb);
				113	qdisc_qstats_atomic_qlen_inc(q);
				114	} else {
				115	qdisc_qstats_backlog_inc(q, skb);
				116	q->q.qlen++;
				117	}
				118
				119	if (lock)
				120	spin_unlock(lock);
				121	}
				122
				123	static inline int __dev_requeue_skb(struct sk_buff skb, struct Qdisc q)
				124	{
				125	while (skb) {
				126	struct sk_buff *next = skb->next;
				127
				128	__skb_queue_tail(&q->gso_skb, skb);
				129	q->qstats.requeues++;
				130	qdisc_qstats_backlog_inc(q, skb);
				131	q->q.qlen++; /* it's still part of the queue */
				132
				133	skb = next;
				134	}
				135	__netif_schedule(q);
				136
				137	return 0;
				138	}
				139
				140	static inline int dev_requeue_skb_locked(struct sk_buff skb, struct Qdisc q)
				141	{
				142	spinlock_t *lock = qdisc_lock(q);
				143
				144	spin_lock(lock);
				145	while (skb) {
				146	struct sk_buff *next = skb->next;
				147
				148	__skb_queue_tail(&q->gso_skb, skb);
				149
				150	qdisc_qstats_cpu_requeues_inc(q);
				151	qdisc_qstats_cpu_backlog_inc(q, skb);
				152	qdisc_qstats_atomic_qlen_inc(q);
				153
				154	skb = next;
				155	}
				156	spin_unlock(lock);
				157
				158	__netif_schedule(q);
				159
				160	return 0;
				161	}
				162
				163	static inline int dev_requeue_skb(struct sk_buff skb, struct Qdisc q)
				164	{
				165	if (q->flags & TCQ_F_NOLOCK)
				166	return dev_requeue_skb_locked(skb, q);
				167	else
				168	return __dev_requeue_skb(skb, q);
				169	}
				170
				171	static void try_bulk_dequeue_skb(struct Qdisc *q,
				172	struct sk_buff *skb,
				173	const struct netdev_queue *txq,
				174	int *packets)
				175	{
				176	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
				177
				178	while (bytelimit > 0) {
				179	struct sk_buff *nskb = q->dequeue(q);
				180
				181	if (!nskb)
				182	break;
				183
				184	bytelimit -= nskb->len; /* covers GSO len */
				185	skb->next = nskb;
				186	skb = nskb;
				187	(packets)++; / GSO counts as one pkt */
				188	}
				189	skb->next = NULL;
				190	}
				191
				192	/* This variant of try_bulk_dequeue_skb() makes sure
				193	* all skbs in the chain are for the same txq
				194	*/
				195	static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				196	struct sk_buff *skb,
				197	int *packets)
				198	{
				199	int mapping = skb_get_queue_mapping(skb);
				200	struct sk_buff *nskb;
				201	int cnt = 0;
				202
				203	do {
				204	nskb = q->dequeue(q);
				205	if (!nskb)
				206	break;
				207	if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
				208	qdisc_enqueue_skb_bad_txq(q, nskb);
				209	break;
				210	}
				211	skb->next = nskb;
				212	skb = nskb;
				213	} while (++cnt < 8);
				214	(*packets) += cnt;
				215	skb->next = NULL;
				216	}
				217
				218	/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
				219	* A requeued skb (via q->gso_skb) can also be a SKB list.
				220	*/
				221	static struct sk_buff dequeue_skb(struct Qdisc q, bool *validate,
				222	int *packets)
				223	{
				224	const struct netdev_queue *txq = q->dev_queue;
				225	struct sk_buff *skb = NULL;
				226
				227	*packets = 1;
				228	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
				229	spinlock_t *lock = NULL;
				230
				231	if (q->flags & TCQ_F_NOLOCK) {
				232	lock = qdisc_lock(q);
				233	spin_lock(lock);
				234	}
				235
				236	skb = skb_peek(&q->gso_skb);
				237
				238	/* skb may be null if another cpu pulls gso_skb off in between
				239	* empty check and lock.
				240	*/
				241	if (!skb) {
				242	if (lock)
				243	spin_unlock(lock);
				244	goto validate;
				245	}
				246
				247	/* skb in gso_skb were already validated */
				248	*validate = false;
				249	if (xfrm_offload(skb))
				250	*validate = true;
				251	/* check the reason of requeuing without tx lock first */
				252	txq = skb_get_tx_queue(txq->dev, skb);
				253	if (!netif_xmit_frozen_or_stopped(txq)) {
				254	skb = __skb_dequeue(&q->gso_skb);
				255	if (qdisc_is_percpu_stats(q)) {
				256	qdisc_qstats_cpu_backlog_dec(q, skb);
				257	qdisc_qstats_atomic_qlen_dec(q);
				258	} else {
				259	qdisc_qstats_backlog_dec(q, skb);
				260	q->q.qlen--;
				261	}
				262	} else {
				263	skb = NULL;
				264	}
				265	if (lock)
				266	spin_unlock(lock);
				267	goto trace;
				268	}
				269	validate:
				270	*validate = true;
				271
				272	if ((q->flags & TCQ_F_ONETXQUEUE) &&
				273	netif_xmit_frozen_or_stopped(txq))
				274	return skb;
				275
				276	skb = qdisc_dequeue_skb_bad_txq(q);
				277	if (unlikely(skb)) {
				278	if (skb == SKB_XOFF_MAGIC)
				279	return NULL;
				280	goto bulk;
				281	}
				282	skb = q->dequeue(q);
				283	if (skb) {
				284	bulk:
				285	if (qdisc_may_bulk(q))
				286	try_bulk_dequeue_skb(q, skb, txq, packets);
				287	else
				288	try_bulk_dequeue_skb_slow(q, skb, packets);
				289	}
				290	trace:
				291	trace_qdisc_dequeue(q, txq, *packets, skb);
				292	return skb;
				293	}
				294
				295	/*
				296	* Transmit possibly several skbs, and handle the return status as
				297	* required. Owning running seqcount bit guarantees that
				298	* only one CPU can execute this function.
				299	*
				300	* Returns to the caller:
				301	* false - hardware queue frozen backoff
				302	* true - feel free to send more pkts
				303	*/
				304	bool sch_direct_xmit(struct sk_buff skb, struct Qdisc q,
				305	struct net_device dev, struct netdev_queue txq,
				306	spinlock_t *root_lock, bool validate)
				307	{
				308	int ret = NETDEV_TX_BUSY;
				309	bool again = false;
				310
				311	/* And release qdisc */
				312	if (root_lock)
				313	spin_unlock(root_lock);
				314
				315	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
				316	if (validate)
				317	skb = validate_xmit_skb_list(skb, dev, &again);
				318
				319	#ifdef CONFIG_XFRM_OFFLOAD
				320	if (unlikely(again)) {
				321	if (root_lock)
				322	spin_lock(root_lock);
				323
				324	dev_requeue_skb(skb, q);
				325	return false;
				326	}
				327	#endif
				328
				329	if (likely(skb)) {
				330	HARD_TX_LOCK(dev, txq, smp_processor_id());
				331	if (!netif_xmit_frozen_or_stopped(txq))
				332	skb = dev_hard_start_xmit(skb, dev, txq, &ret);
				333
				334	HARD_TX_UNLOCK(dev, txq);
				335	} else {
				336	if (root_lock)
				337	spin_lock(root_lock);
				338	return true;
				339	}
				340
				341	if (root_lock)
				342	spin_lock(root_lock);
				343
				344	if (!dev_xmit_complete(ret)) {
				345	/* Driver returned NETDEV_TX_BUSY - requeue skb */
				346	if (unlikely(ret != NETDEV_TX_BUSY))
				347	net_warn_ratelimited("BUG %s code %d qlen %d\n",
				348	dev->name, ret, q->q.qlen);
				349
				350	dev_requeue_skb(skb, q);
				351	return false;
				352	}
				353
				354	return true;
				355	}
				356
				357	/*
				358	* NOTE: Called under qdisc_lock(q) with locally disabled BH.
				359	*
				360	* running seqcount guarantees only one CPU can process
				361	* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
				362	* this queue.
				363	*
				364	* netif_tx_lock serializes accesses to device driver.
				365	*
				366	* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
				367	* if one is grabbed, another must be free.
				368	*
				369	* Note, that this procedure can be called by a watchdog timer
				370	*
				371	* Returns to the caller:
				372	* 0 - queue is empty or throttled.
				373	* >0 - queue is not empty.
				374	*
				375	*/
				376	static inline bool qdisc_restart(struct Qdisc q, int packets)
				377	{
				378	spinlock_t *root_lock = NULL;
				379	struct netdev_queue *txq;
				380	struct net_device *dev;
				381	struct sk_buff *skb;
				382	bool validate;
				383
				384	/* Dequeue packet */
				385	skb = dequeue_skb(q, &validate, packets);
				386	if (unlikely(!skb))
				387	return false;
				388
				389	if (!(q->flags & TCQ_F_NOLOCK))
				390	root_lock = qdisc_lock(q);
				391
				392	dev = qdisc_dev(q);
				393	txq = skb_get_tx_queue(dev, skb);
				394
				395	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
				396	}
				397
				398	void __qdisc_run(struct Qdisc *q)
				399	{
				400	int quota = dev_tx_weight;
				401	int packets;
				402
				403	while (qdisc_restart(q, &packets)) {
				404	/*
				405	* Ordered by possible occurrence: Postpone processing if
				406	* 1. we've exceeded packet quota
				407	* 2. another process needs the CPU;
				408	*/
				409	quota -= packets;
				410	if (quota <= 0 \|\| need_resched()) {
				411	__netif_schedule(q);
				412	break;
				413	}
				414	}
				415	}
				416
				417	unsigned long dev_trans_start(struct net_device *dev)
				418	{
				419	unsigned long val, res;
				420	unsigned int i;
				421
				422	if (is_vlan_dev(dev))
				423	dev = vlan_dev_real_dev(dev);
				424	else if (netif_is_macvlan(dev))
				425	dev = macvlan_dev_real_dev(dev);
				426	res = netdev_get_tx_queue(dev, 0)->trans_start;
				427	for (i = 1; i < dev->num_tx_queues; i++) {
				428	val = netdev_get_tx_queue(dev, i)->trans_start;
				429	if (val && time_after(val, res))
				430	res = val;
				431	}
				432
				433	return res;
				434	}
				435	EXPORT_SYMBOL(dev_trans_start);
				436
				437	static void dev_watchdog(struct timer_list *t)
				438	{
				439	struct net_device *dev = from_timer(dev, t, watchdog_timer);
				440
				441	netif_tx_lock(dev);
				442	if (!qdisc_tx_is_noop(dev)) {
				443	if (netif_device_present(dev) &&
				444	netif_running(dev) &&
				445	netif_carrier_ok(dev)) {
				446	int some_queue_timedout = 0;
				447	unsigned int i;
				448	unsigned long trans_start;
				449
				450	for (i = 0; i < dev->num_tx_queues; i++) {
				451	struct netdev_queue *txq;
				452
				453	txq = netdev_get_tx_queue(dev, i);
				454	trans_start = txq->trans_start;
				455	if (netif_xmit_stopped(txq) &&
				456	time_after(jiffies, (trans_start +
				457	dev->watchdog_timeo))) {
				458	some_queue_timedout = 1;
				459	txq->trans_timeout++;
				460	break;
				461	}
				462	}
				463
				464	if (some_queue_timedout) {
				465	WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				466	dev->name, netdev_drivername(dev), i);
				467	dev->netdev_ops->ndo_tx_timeout(dev);
				468	}
				469	if (!mod_timer(&dev->watchdog_timer,
				470	round_jiffies(jiffies +
				471	dev->watchdog_timeo)))
				472	dev_hold(dev);
				473	}
				474	}
				475	netif_tx_unlock(dev);
				476
				477	dev_put(dev);
				478	}
				479
				480	void __netdev_watchdog_up(struct net_device *dev)
				481	{
				482	if (dev->netdev_ops->ndo_tx_timeout) {
				483	if (dev->watchdog_timeo <= 0)
				484	dev->watchdog_timeo = 5*HZ;
				485	if (!mod_timer(&dev->watchdog_timer,
				486	round_jiffies(jiffies + dev->watchdog_timeo)))
				487	dev_hold(dev);
				488	}
				489	}
				490
				491	static void dev_watchdog_up(struct net_device *dev)
				492	{
				493	__netdev_watchdog_up(dev);
				494	}
				495
				496	static void dev_watchdog_down(struct net_device *dev)
				497	{
				498	netif_tx_lock_bh(dev);
				499	if (del_timer(&dev->watchdog_timer))
				500	dev_put(dev);
				501	netif_tx_unlock_bh(dev);
				502	}
				503
				504	/**
				505	* netif_carrier_on - set carrier
				506	* @dev: network device
				507	*
				508	* Device has detected that carrier.
				509	*/
				510	void netif_carrier_on(struct net_device *dev)
				511	{
				512	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
				513	if (dev->reg_state == NETREG_UNINITIALIZED)
				514	return;
				515	atomic_inc(&dev->carrier_up_count);
				516	linkwatch_fire_event(dev);
				517	if (netif_running(dev))
				518	__netdev_watchdog_up(dev);
				519	}
				520	}
				521	EXPORT_SYMBOL(netif_carrier_on);
				522
				523	/**
				524	* netif_carrier_off - clear carrier
				525	* @dev: network device
				526	*
				527	* Device has detected loss of carrier.
				528	*/
				529	void netif_carrier_off(struct net_device *dev)
				530	{
				531	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
				532	if (dev->reg_state == NETREG_UNINITIALIZED)
				533	return;
				534	atomic_inc(&dev->carrier_down_count);
				535	linkwatch_fire_event(dev);
				536	}
				537	}
				538	EXPORT_SYMBOL(netif_carrier_off);
				539
				540	/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
				541	under all circumstances. It is difficult to invent anything faster or
				542	cheaper.
				543	*/
				544
				545	static int noop_enqueue(struct sk_buff skb, struct Qdisc qdisc,
				546	struct sk_buff **to_free)
				547	{
				548	__qdisc_drop(skb, to_free);
				549	return NET_XMIT_CN;
				550	}
				551
				552	static struct sk_buff noop_dequeue(struct Qdisc qdisc)
				553	{
				554	return NULL;
				555	}
				556
				557	struct Qdisc_ops noop_qdisc_ops __read_mostly = {
				558	.id = "noop",
				559	.priv_size = 0,
				560	.enqueue = noop_enqueue,
				561	.dequeue = noop_dequeue,
				562	.peek = noop_dequeue,
				563	.owner = THIS_MODULE,
				564	};
				565
				566	static struct netdev_queue noop_netdev_queue = {
				567	.qdisc = &noop_qdisc,
				568	.qdisc_sleeping = &noop_qdisc,
				569	};
				570
				571	struct Qdisc noop_qdisc = {
				572	.enqueue = noop_enqueue,
				573	.dequeue = noop_dequeue,
				574	.flags = TCQ_F_BUILTIN,
				575	.ops = &noop_qdisc_ops,
				576	.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
				577	.dev_queue = &noop_netdev_queue,
				578	.running = SEQCNT_ZERO(noop_qdisc.running),
				579	.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
				580	.gso_skb = {
				581	.next = (struct sk_buff *)&noop_qdisc.gso_skb,
				582	.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
				583	.qlen = 0,
				584	.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
				585	},
				586	.skb_bad_txq = {
				587	.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
				588	.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
				589	.qlen = 0,
				590	.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
				591	},
				592	};
				593	EXPORT_SYMBOL(noop_qdisc);
				594
				595	static int noqueue_init(struct Qdisc qdisc, struct nlattr opt,
				596	struct netlink_ext_ack *extack)
				597	{
				598	/* register_qdisc() assigns a default of noop_enqueue if unset,
				599	* but __dev_queue_xmit() treats noqueue only as such
				600	* if this is NULL - so clear it here. */
				601	qdisc->enqueue = NULL;
				602	return 0;
				603	}
				604
				605	struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
				606	.id = "noqueue",
				607	.priv_size = 0,
				608	.init = noqueue_init,
				609	.enqueue = noop_enqueue,
				610	.dequeue = noop_dequeue,
				611	.peek = noop_dequeue,
				612	.owner = THIS_MODULE,
				613	};
				614
				615	static struct lock_class_key qdisc_tx_busylock;
				616	static struct lock_class_key qdisc_running_key;
				617
				618	struct Qdisc qdisc_alloc(struct netdev_queue dev_queue,
				619	const struct Qdisc_ops *ops,
				620	struct netlink_ext_ack *extack)
				621	{
				622	void *p;
				623	struct Qdisc *sch;
				624	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
				625	int err = -ENOBUFS;
				626	struct net_device *dev;
				627
				628	if (!dev_queue) {
				629	NL_SET_ERR_MSG(extack, "No device queue given");
				630	err = -EINVAL;
				631	goto errout;
				632	}
				633
				634	dev = dev_queue->dev;
				635	p = kzalloc_node(size, GFP_KERNEL,
				636	netdev_queue_numa_node_read(dev_queue));
				637
				638	if (!p)
				639	goto errout;
				640	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
				641	/* if we got non aligned memory, ask more and do alignment ourself */
				642	if (sch != p) {
				643	kfree(p);
				644	p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				645	netdev_queue_numa_node_read(dev_queue));
				646	if (!p)
				647	goto errout;
				648	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
				649	sch->padded = (char ) sch - (char ) p;
				650	}
				651	__skb_queue_head_init(&sch->gso_skb);
				652	__skb_queue_head_init(&sch->skb_bad_txq);
				653	qdisc_skb_head_init(&sch->q);
				654	spin_lock_init(&sch->q.lock);
				655
				656	if (ops->static_flags & TCQ_F_CPUSTATS) {
				657	sch->cpu_bstats =
				658	netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
				659	if (!sch->cpu_bstats)
				660	goto errout1;
				661
				662	sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
				663	if (!sch->cpu_qstats) {
				664	free_percpu(sch->cpu_bstats);
				665	goto errout1;
				666	}
				667	}
				668
				669	spin_lock_init(&sch->busylock);
				670	lockdep_set_class(&sch->busylock,
				671	dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
				672
				673	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
				674	spin_lock_init(&sch->seqlock);
				675	lockdep_set_class(&sch->busylock,
				676	dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
				677
				678	seqcount_init(&sch->running);
				679	lockdep_set_class(&sch->running,
				680	dev->qdisc_running_key ?: &qdisc_running_key);
				681
				682	sch->ops = ops;
				683	sch->flags = ops->static_flags;
				684	sch->enqueue = ops->enqueue;
				685	sch->dequeue = ops->dequeue;
				686	sch->dev_queue = dev_queue;
				687	dev_hold(dev);
				688	refcount_set(&sch->refcnt, 1);
				689
				690	return sch;
				691	errout1:
				692	kfree(p);
				693	errout:
				694	return ERR_PTR(err);
				695	}
				696
				697	struct Qdisc qdisc_create_dflt(struct netdev_queue dev_queue,
				698	const struct Qdisc_ops *ops,
				699	unsigned int parentid,
				700	struct netlink_ext_ack *extack)
				701	{
				702	struct Qdisc *sch;
				703
				704	if (!try_module_get(ops->owner)) {
				705	NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
				706	return NULL;
				707	}
				708
				709	sch = qdisc_alloc(dev_queue, ops, extack);
				710	if (IS_ERR(sch)) {
				711	module_put(ops->owner);
				712	return NULL;
				713	}
				714	sch->parent = parentid;
				715
				716	if (!ops->init \|\| ops->init(sch, NULL, extack) == 0)
				717	return sch;
				718
				719	qdisc_destroy(sch);
				720	return NULL;
				721	}
				722	EXPORT_SYMBOL(qdisc_create_dflt);
				723
				724	/* Under qdisc_lock(qdisc) and BH! */
				725
				726	void qdisc_reset(struct Qdisc *qdisc)
				727	{
				728	const struct Qdisc_ops *ops = qdisc->ops;
				729	struct sk_buff skb, tmp;
				730
				731	if (ops->reset)
				732	ops->reset(qdisc);
				733
				734	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
				735	__skb_unlink(skb, &qdisc->gso_skb);
				736	kfree_skb_list(skb);
				737	}
				738
				739	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
				740	__skb_unlink(skb, &qdisc->skb_bad_txq);
				741	kfree_skb_list(skb);
				742	}
				743
				744	qdisc->q.qlen = 0;
				745	qdisc->qstats.backlog = 0;
				746	}
				747	EXPORT_SYMBOL(qdisc_reset);
				748
				749	void qdisc_free(struct Qdisc *qdisc)
				750	{
				751	if (qdisc_is_percpu_stats(qdisc)) {
				752	free_percpu(qdisc->cpu_bstats);
				753	free_percpu(qdisc->cpu_qstats);
				754	}
				755
				756	kfree((char *) qdisc - qdisc->padded);
				757	}
				758
				759	void qdisc_destroy(struct Qdisc *qdisc)
				760	{
				761	const struct Qdisc_ops *ops;
				762	struct sk_buff skb, tmp;
				763
				764	if (!qdisc)
				765	return;
				766	ops = qdisc->ops;
				767
				768	if (qdisc->flags & TCQ_F_BUILTIN \|\|
				769	!refcount_dec_and_test(&qdisc->refcnt))
				770	return;
				771
				772	#ifdef CONFIG_NET_SCHED
				773	qdisc_hash_del(qdisc);
				774
				775	qdisc_put_stab(rtnl_dereference(qdisc->stab));
				776	#endif
				777	gen_kill_estimator(&qdisc->rate_est);
				778	if (ops->reset)
				779	ops->reset(qdisc);
				780	if (ops->destroy)
				781	ops->destroy(qdisc);
				782
				783	module_put(ops->owner);
				784	dev_put(qdisc_dev(qdisc));
				785
				786	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
				787	__skb_unlink(skb, &qdisc->gso_skb);
				788	kfree_skb_list(skb);
				789	}
				790
				791	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
				792	__skb_unlink(skb, &qdisc->skb_bad_txq);
				793	kfree_skb_list(skb);
				794	}
				795
				796	qdisc_free(qdisc);
				797	}
				798	EXPORT_SYMBOL(qdisc_destroy);
				799
				800	/* Attach toplevel qdisc to device queue. */
				801	struct Qdisc dev_graft_qdisc(struct netdev_queue dev_queue,
				802	struct Qdisc *qdisc)
				803	{
				804	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
				805	spinlock_t *root_lock;
				806
				807	root_lock = qdisc_lock(oqdisc);
				808	spin_lock_bh(root_lock);
				809
				810	/* ... and graft new one */
				811	if (qdisc == NULL)
				812	qdisc = &noop_qdisc;
				813	dev_queue->qdisc_sleeping = qdisc;
				814	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
				815
				816	spin_unlock_bh(root_lock);
				817
				818	return oqdisc;
				819	}
				820	EXPORT_SYMBOL(dev_graft_qdisc);
				821
				822	static void attach_one_default_qdisc(struct net_device *dev,
				823	struct netdev_queue *dev_queue,
				824	void *_unused)
				825	{
				826	struct Qdisc *qdisc;
				827	const struct Qdisc_ops *ops = &fq_codel_qdisc_ops;
				828
				829	if (dev->priv_flags & IFF_NO_QUEUE)
				830	ops = &noqueue_qdisc_ops;
				831
				832	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
				833	if (!qdisc) {
				834	netdev_info(dev, "activation failed\n");
				835	return;
				836	}
				837	if (!netif_is_multiqueue(dev))
				838	qdisc->flags \|= TCQ_F_ONETXQUEUE \| TCQ_F_NOPARENT;
				839	dev_queue->qdisc_sleeping = qdisc;
				840	}
				841
				842	static void attach_default_qdiscs(struct net_device *dev)
				843	{
				844	struct netdev_queue *txq;
				845	struct Qdisc *qdisc;
				846
				847	txq = netdev_get_tx_queue(dev, 0);
				848
				849	if (!netif_is_multiqueue(dev) \|\|
				850	dev->priv_flags & IFF_NO_QUEUE) {
				851	netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
				852	dev->qdisc = txq->qdisc_sleeping;
				853	qdisc_refcount_inc(dev->qdisc);
				854	} else {
				855	qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
				856	if (qdisc) {
				857	dev->qdisc = qdisc;
				858	qdisc->ops->attach(qdisc);
				859	}
				860	}
				861	#ifdef CONFIG_NET_SCHED
				862	if (dev->qdisc != &noop_qdisc)
				863	qdisc_hash_add(dev->qdisc, false);
				864	#endif
				865	}
				866
				867	static void transition_one_qdisc(struct net_device *dev,
				868	struct netdev_queue *dev_queue,
				869	void *_need_watchdog)
				870	{
				871	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
				872	int *need_watchdog_p = _need_watchdog;
				873
				874	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
				875	clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
				876
				877	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
				878	if (need_watchdog_p) {
				879	dev_queue->trans_start = 0;
				880	*need_watchdog_p = 1;
				881	}
				882	}
				883
				884	void dev_activate(struct net_device *dev)
				885	{
				886	int need_watchdog;
				887
				888	/* No queueing discipline is attached to device;
				889	* create default one for devices, which need queueing
				890	* and noqueue_qdisc for virtual interfaces
				891	*/
				892
				893	if (dev->qdisc == &noop_qdisc)
				894	attach_default_qdiscs(dev);
				895
				896	if (!netif_carrier_ok(dev))
				897	/* Delay activation until next carrier-on event */
				898	return;
				899
				900	need_watchdog = 0;
				901	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
				902	if (dev_ingress_queue(dev))
				903	transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
				904
				905	if (need_watchdog) {
				906	netif_trans_update(dev);
				907	dev_watchdog_up(dev);
				908	}
				909	}
				910	EXPORT_SYMBOL(dev_activate);
				911
				912	static void dev_deactivate_queue(struct net_device *dev,
				913	struct netdev_queue *dev_queue,
				914	void *_qdisc_default)
				915	{
				916	struct Qdisc *qdisc_default = _qdisc_default;
				917	struct Qdisc *qdisc;
				918
				919	qdisc = rtnl_dereference(dev_queue->qdisc);
				920	if (qdisc) {
				921	bool nolock = qdisc->flags & TCQ_F_NOLOCK;
				922
				923	if (nolock)
				924	spin_lock_bh(&qdisc->seqlock);
				925	spin_lock_bh(qdisc_lock(qdisc));
				926
				927	if (!(qdisc->flags & TCQ_F_BUILTIN))
				928	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
				929
				930	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
				931	qdisc_reset(qdisc);
				932
				933	spin_unlock_bh(qdisc_lock(qdisc));
				934	if (nolock)
				935	spin_unlock_bh(&qdisc->seqlock);
				936	}
				937	}
				938
				939	static bool some_qdisc_is_busy(struct net_device *dev)
				940	{
				941	unsigned int i;
				942
				943	for (i = 0; i < dev->num_tx_queues; i++) {
				944	struct netdev_queue *dev_queue;
				945	spinlock_t *root_lock;
				946	struct Qdisc *q;
				947	int val;
				948
				949	dev_queue = netdev_get_tx_queue(dev, i);
				950	q = dev_queue->qdisc_sleeping;
				951
				952	root_lock = qdisc_lock(q);
				953	spin_lock_bh(root_lock);
				954
				955	val = (qdisc_is_running(q) \|\|
				956	test_bit(__QDISC_STATE_SCHED, &q->state));
				957
				958	spin_unlock_bh(root_lock);
				959
				960	if (val)
				961	return true;
				962	}
				963	return false;
				964	}
				965
				966	static void dev_qdisc_reset(struct net_device *dev,
				967	struct netdev_queue *dev_queue,
				968	void *none)
				969	{
				970	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
				971
				972	if (qdisc)
				973	qdisc_reset(qdisc);
				974	}
				975
				976	/**
				977	* dev_deactivate_many - deactivate transmissions on several devices
				978	* @head: list of devices to deactivate
				979	*
				980	* This function returns only when all outstanding transmissions
				981	* have completed, unless all devices are in dismantle phase.
				982	*/
				983	void dev_deactivate_many(struct list_head *head)
				984	{
				985	struct net_device *dev;
				986
				987	list_for_each_entry(dev, head, close_list) {
				988	netdev_for_each_tx_queue(dev, dev_deactivate_queue,
				989	&noop_qdisc);
				990	if (dev_ingress_queue(dev))
				991	dev_deactivate_queue(dev, dev_ingress_queue(dev),
				992	&noop_qdisc);
				993
				994	dev_watchdog_down(dev);
				995	}
				996
				997	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
				998	* This is avoided if all devices are in dismantle phase :
				999	* Caller will call synchronize_net() for us
				1000	*/
				1001	synchronize_net();
				1002
				1003	/* Wait for outstanding qdisc_run calls. */
				1004	list_for_each_entry(dev, head, close_list) {
				1005	while (some_qdisc_is_busy(dev))
				1006	yield();
				1007	/* The new qdisc is assigned at this point so we can safely
				1008	* unwind stale skb lists and qdisc statistics
				1009	*/
				1010	netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
				1011	if (dev_ingress_queue(dev))
				1012	dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
				1013	}
				1014	}
				1015
				1016	void dev_deactivate(struct net_device *dev)
				1017	{
				1018	LIST_HEAD(single);
				1019
				1020	list_add(&dev->close_list, &single);
				1021	dev_deactivate_many(&single);
				1022	list_del(&single);
				1023	}
				1024	EXPORT_SYMBOL(dev_deactivate);
				1025
				1026	static int qdisc_change_tx_queue_len(struct net_device *dev,
				1027	struct netdev_queue *dev_queue)
				1028	{
				1029	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
				1030	const struct Qdisc_ops *ops = qdisc->ops;
				1031
				1032	if (ops->change_tx_queue_len)
				1033	return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
				1034	return 0;
				1035	}
				1036
				1037	int dev_qdisc_change_tx_queue_len(struct net_device *dev)
				1038	{
				1039	bool up = dev->flags & IFF_UP;
				1040	unsigned int i;
				1041	int ret = 0;
				1042
				1043	if (up)
				1044	dev_deactivate(dev);
				1045
				1046	for (i = 0; i < dev->num_tx_queues; i++) {
				1047	ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);
				1048
				1049	/* TODO: revert changes on a partial failure */
				1050	if (ret)
				1051	break;
				1052	}
				1053
				1054	if (up)
				1055	dev_activate(dev);
				1056	return ret;
				1057	}
				1058
				1059	static void dev_init_scheduler_queue(struct net_device *dev,
				1060	struct netdev_queue *dev_queue,
				1061	void *_qdisc)
				1062	{
				1063	struct Qdisc *qdisc = _qdisc;
				1064
				1065	rcu_assign_pointer(dev_queue->qdisc, qdisc);
				1066	dev_queue->qdisc_sleeping = qdisc;
				1067	}
				1068
				1069	void dev_init_scheduler(struct net_device *dev)
				1070	{
				1071	dev->qdisc = &noop_qdisc;
				1072	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
				1073	if (dev_ingress_queue(dev))
				1074	dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
				1075
				1076	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
				1077	}
				1078
				1079	static void shutdown_scheduler_queue(struct net_device *dev,
				1080	struct netdev_queue *dev_queue,
				1081	void *_qdisc_default)
				1082	{
				1083	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
				1084	struct Qdisc *qdisc_default = _qdisc_default;
				1085
				1086	if (qdisc) {
				1087	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
				1088	dev_queue->qdisc_sleeping = qdisc_default;
				1089
				1090	qdisc_destroy(qdisc);
				1091	}
				1092	}
				1093
				1094	void dev_shutdown(struct net_device *dev)
				1095	{
				1096	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
				1097	if (dev_ingress_queue(dev))
				1098	shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
				1099	qdisc_destroy(dev->qdisc);
				1100	dev->qdisc = &noop_qdisc;
				1101
				1102	WARN_ON(timer_pending(&dev->watchdog_timer));
				1103	}
				1104
				1105	void psched_ratecfg_precompute(struct psched_ratecfg *r,
				1106	const struct tc_ratespec *conf,
				1107	u64 rate64)
				1108	{
				1109	memset(r, 0, sizeof(*r));
				1110	r->overhead = conf->overhead;
				1111	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
				1112	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
				1113	r->mult = 1;
				1114	/*
				1115	* The deal here is to replace a divide by a reciprocal one
				1116	* in fast path (a reciprocal divide is a multiply and a shift)
				1117	*
				1118	* Normal formula would be :
				1119	* time_in_ns = (NSEC_PER_SEC * len) / rate_bps
				1120	*
				1121	* We compute mult/shift to use instead :
				1122	* time_in_ns = (len * mult) >> shift;
				1123	*
				1124	* We try to get the highest possible mult value for accuracy,
				1125	* but have to make sure no overflows will ever happen.
				1126	*/
				1127	if (r->rate_bytes_ps > 0) {
				1128	u64 factor = NSEC_PER_SEC;
				1129
				1130	for (;;) {
				1131	r->mult = div64_u64(factor, r->rate_bytes_ps);
				1132	if (r->mult & (1U << 31) \|\| factor & (1ULL << 63))
				1133	break;
				1134	factor <<= 1;
				1135	r->shift++;
				1136	}
				1137	}
				1138	}
				1139	EXPORT_SYMBOL(psched_ratecfg_precompute);
				1140
				1141	static void mini_qdisc_rcu_func(struct rcu_head *head)
				1142	{
				1143	}
				1144
				1145	void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
				1146	struct tcf_proto *tp_head)
				1147	{
				1148	struct mini_Qdisc miniq_old = rtnl_dereference(miniqp->p_miniq);
				1149	struct mini_Qdisc *miniq;
				1150
				1151	if (!tp_head) {
				1152	RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
				1153	/* Wait for flying RCU callback before it is freed. */
				1154	rcu_barrier_bh();
				1155	return;
				1156	}
				1157
				1158	miniq = !miniq_old \|\| miniq_old == &miniqp->miniq2 ?
				1159	&miniqp->miniq1 : &miniqp->miniq2;
				1160
				1161	/* We need to make sure that readers won't see the miniq
				1162	* we are about to modify. So wait until previous call_rcu_bh callback
				1163	* is done.
				1164	*/
				1165	rcu_barrier_bh();
				1166	miniq->filter_list = tp_head;
				1167	rcu_assign_pointer(*miniqp->p_miniq, miniq);
				1168
				1169	if (miniq_old)
				1170	/* This is counterpart of the rcu barriers above. We need to
				1171	* block potential new user of miniq_old until all readers
				1172	* are not seeing it.
				1173	*/
				1174	call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
				1175	}
				1176	EXPORT_SYMBOL(mini_qdisc_pair_swap);
				1177
				1178	void mini_qdisc_pair_init(struct mini_Qdisc_pair miniqp, struct Qdisc qdisc,
				1179	struct mini_Qdisc __rcu **p_miniq)
				1180	{
				1181	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
				1182	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
				1183	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
				1184	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
				1185	miniqp->p_miniq = p_miniq;
				1186	}
				1187	EXPORT_SYMBOL(mini_qdisc_pair_init);