Blame - marvell/linux/net/sched/sch_fq.c - T108

blob: f757ea90aba6db9a5b92be29fe688578b551da33 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-or-later
				2	/*
				3	* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
				4	*
				5	* Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
				6	*
				7	* Meant to be mostly used for locally generated traffic :
				8	* Fast classification depends on skb->sk being set before reaching us.
				9	* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
				10	* All packets belonging to a socket are considered as a 'flow'.
				11	*
				12	* Flows are dynamically allocated and stored in a hash table of RB trees
				13	* They are also part of one Round Robin 'queues' (new or old flows)
				14	*
				15	* Burst avoidance (aka pacing) capability :
				16	*
				17	* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
				18	* bunch of packets, and this packet scheduler adds delay between
				19	* packets to respect rate limitation.
				20	*
				21	* enqueue() :
				22	* - lookup one RB tree (out of 1024 or more) to find the flow.
				23	* If non existent flow, create it, add it to the tree.
				24	* Add skb to the per flow list of skb (fifo).
				25	* - Use a special fifo for high prio packets
				26	*
				27	* dequeue() : serves flows in Round Robin
				28	* Note : When a flow becomes empty, we do not immediately remove it from
				29	* rb trees, for performance reasons (its expected to send additional packets,
				30	* or SLAB cache will reuse socket for another flow)
				31	*/
				32
				33	#include <linux/module.h>
				34	#include <linux/types.h>
				35	#include <linux/kernel.h>
				36	#include <linux/jiffies.h>
				37	#include <linux/string.h>
				38	#include <linux/in.h>
				39	#include <linux/errno.h>
				40	#include <linux/init.h>
				41	#include <linux/skbuff.h>
				42	#include <linux/slab.h>
				43	#include <linux/rbtree.h>
				44	#include <linux/hash.h>
				45	#include <linux/prefetch.h>
				46	#include <linux/vmalloc.h>
				47	#include <net/netlink.h>
				48	#include <net/pkt_sched.h>
				49	#include <net/sock.h>
				50	#include <net/tcp_states.h>
				51	#include <net/tcp.h>
				52
				53	struct fq_skb_cb {
				54	u64 time_to_send;
				55	};
				56
				57	static inline struct fq_skb_cb fq_skb_cb(struct sk_buff skb)
				58	{
				59	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
				60	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
				61	}
				62
				63	/*
				64	* Per flow structure, dynamically allocated.
				65	* If packets have monotically increasing time_to_send, they are placed in O(1)
				66	* in linear list (head,tail), otherwise are placed in a rbtree (t_root).
				67	*/
				68	struct fq_flow {
				69	struct rb_root t_root;
				70	struct sk_buff head; / list of skbs for this flow : first skb */
				71	union {
				72	struct sk_buff tail; / last skb in the list */
				73	unsigned long age; /* jiffies when flow was emptied, for gc */
				74	};
				75	struct rb_node fq_node; /* anchor in fq_root[] trees */
				76	struct sock *sk;
				77	int qlen; /* number of packets in flow queue */
				78	int credit;
				79	u32 socket_hash; /* sk_hash */
				80	struct fq_flow next; / next pointer in RR lists, or &detached */
				81
				82	struct rb_node rate_node; /* anchor in q->delayed tree */
				83	u64 time_next_packet;
				84	};
				85
				86	struct fq_flow_head {
				87	struct fq_flow *first;
				88	struct fq_flow *last;
				89	};
				90
				91	struct fq_sched_data {
				92	struct fq_flow_head new_flows;
				93
				94	struct fq_flow_head old_flows;
				95
				96	struct rb_root delayed; /* for rate limited flows */
				97	u64 time_next_delayed_flow;
				98	unsigned long unthrottle_latency_ns;
				99
				100	struct fq_flow internal; /* for non classified or high prio packets */
				101	u32 quantum;
				102	u32 initial_quantum;
				103	u32 flow_refill_delay;
				104	u32 flow_plimit; /* max packets per flow */
				105	unsigned long flow_max_rate; /* optional max rate per flow */
				106	u64 ce_threshold;
				107	u32 orphan_mask; /* mask for orphaned skb */
				108	u32 low_rate_threshold;
				109	struct rb_root *fq_root;
				110	u8 rate_enable;
				111	u8 fq_trees_log;
				112
				113	u32 flows;
				114	u32 inactive_flows;
				115	u32 throttled_flows;
				116
				117	u64 stat_gc_flows;
				118	u64 stat_internal_packets;
				119	u64 stat_throttled;
				120	u64 stat_ce_mark;
				121	u64 stat_flows_plimit;
				122	u64 stat_pkts_too_long;
				123	u64 stat_allocation_errors;
				124	struct qdisc_watchdog watchdog;
				125	};
				126
				127	/* special value to mark a detached flow (not on old/new list) */
				128	static struct fq_flow detached, throttled;
				129
				130	static void fq_flow_set_detached(struct fq_flow *f)
				131	{
				132	f->next = &detached;
				133	f->age = jiffies;
				134	}
				135
				136	static bool fq_flow_is_detached(const struct fq_flow *f)
				137	{
				138	return f->next == &detached;
				139	}
				140
				141	static bool fq_flow_is_throttled(const struct fq_flow *f)
				142	{
				143	return f->next == &throttled;
				144	}
				145
				146	static void fq_flow_add_tail(struct fq_flow_head head, struct fq_flow flow)
				147	{
				148	if (head->first)
				149	head->last->next = flow;
				150	else
				151	head->first = flow;
				152	head->last = flow;
				153	flow->next = NULL;
				154	}
				155
				156	static void fq_flow_unset_throttled(struct fq_sched_data q, struct fq_flow f)
				157	{
				158	rb_erase(&f->rate_node, &q->delayed);
				159	q->throttled_flows--;
				160	fq_flow_add_tail(&q->old_flows, f);
				161	}
				162
				163	static void fq_flow_set_throttled(struct fq_sched_data q, struct fq_flow f)
				164	{
				165	struct rb_node *p = &q->delayed.rb_node, parent = NULL;
				166
				167	while (*p) {
				168	struct fq_flow *aux;
				169
				170	parent = *p;
				171	aux = rb_entry(parent, struct fq_flow, rate_node);
				172	if (f->time_next_packet >= aux->time_next_packet)
				173	p = &parent->rb_right;
				174	else
				175	p = &parent->rb_left;
				176	}
				177	rb_link_node(&f->rate_node, parent, p);
				178	rb_insert_color(&f->rate_node, &q->delayed);
				179	q->throttled_flows++;
				180	q->stat_throttled++;
				181
				182	f->next = &throttled;
				183	if (q->time_next_delayed_flow > f->time_next_packet)
				184	q->time_next_delayed_flow = f->time_next_packet;
				185	}
				186
				187
				188	static struct kmem_cache *fq_flow_cachep __read_mostly;
				189
				190
				191	/* limit number of collected flows per round */
				192	#define FQ_GC_MAX 8
				193	#define FQ_GC_AGE (3*HZ)
				194
				195	static bool fq_gc_candidate(const struct fq_flow *f)
				196	{
				197	return fq_flow_is_detached(f) &&
				198	time_after(jiffies, f->age + FQ_GC_AGE);
				199	}
				200
				201	static void fq_gc(struct fq_sched_data *q,
				202	struct rb_root *root,
				203	struct sock *sk)
				204	{
				205	struct fq_flow f, tofree[FQ_GC_MAX];
				206	struct rb_node *p, parent;
				207	int fcnt = 0;
				208
				209	p = &root->rb_node;
				210	parent = NULL;
				211	while (*p) {
				212	parent = *p;
				213
				214	f = rb_entry(parent, struct fq_flow, fq_node);
				215	if (f->sk == sk)
				216	break;
				217
				218	if (fq_gc_candidate(f)) {
				219	tofree[fcnt++] = f;
				220	if (fcnt == FQ_GC_MAX)
				221	break;
				222	}
				223
				224	if (f->sk > sk)
				225	p = &parent->rb_right;
				226	else
				227	p = &parent->rb_left;
				228	}
				229
				230	q->flows -= fcnt;
				231	q->inactive_flows -= fcnt;
				232	q->stat_gc_flows += fcnt;
				233	while (fcnt) {
				234	struct fq_flow *f = tofree[--fcnt];
				235
				236	rb_erase(&f->fq_node, root);
				237	kmem_cache_free(fq_flow_cachep, f);
				238	}
				239	}
				240
				241	static struct fq_flow fq_classify(struct sk_buff skb, struct fq_sched_data *q)
				242	{
				243	struct rb_node *p, parent;
				244	struct sock *sk = skb->sk;
				245	struct rb_root *root;
				246	struct fq_flow *f;
				247
				248	/* warning: no starvation prevention... */
				249	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
				250	return &q->internal;
				251
				252	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
				253	* or a listener (SYNCOOKIE mode)
				254	* 1) request sockets are not full blown,
				255	* they do not contain sk_pacing_rate
				256	* 2) They are not part of a 'flow' yet
				257	* 3) We do not want to rate limit them (eg SYNFLOOD attack),
				258	* especially if the listener set SO_MAX_PACING_RATE
				259	* 4) We pretend they are orphaned
				260	*/
				261	if (!sk \|\| sk_listener(sk)) {
				262	unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
				263
				264	/* By forcing low order bit to 1, we make sure to not
				265	* collide with a local flow (socket pointers are word aligned)
				266	*/
				267	sk = (struct sock *)((hash << 1) \| 1UL);
				268	skb_orphan(skb);
				269	} else if (sk->sk_state == TCP_CLOSE) {
				270	unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
				271	/*
				272	* Sockets in TCP_CLOSE are non connected.
				273	* Typical use case is UDP sockets, they can send packets
				274	* with sendto() to many different destinations.
				275	* We probably could use a generic bit advertising
				276	* non connected sockets, instead of sk_state == TCP_CLOSE,
				277	* if we care enough.
				278	*/
				279	sk = (struct sock *)((hash << 1) \| 1UL);
				280	}
				281
				282	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
				283
				284	if (q->flows >= (2U << q->fq_trees_log) &&
				285	q->inactive_flows > q->flows/2)
				286	fq_gc(q, root, sk);
				287
				288	p = &root->rb_node;
				289	parent = NULL;
				290	while (*p) {
				291	parent = *p;
				292
				293	f = rb_entry(parent, struct fq_flow, fq_node);
				294	if (f->sk == sk) {
				295	/* socket might have been reallocated, so check
				296	* if its sk_hash is the same.
				297	* It not, we need to refill credit with
				298	* initial quantum
				299	*/
				300	if (unlikely(skb->sk == sk &&
				301	f->socket_hash != sk->sk_hash)) {
				302	f->credit = q->initial_quantum;
				303	f->socket_hash = sk->sk_hash;
				304	if (q->rate_enable)
				305	smp_store_release(&sk->sk_pacing_status,
				306	SK_PACING_FQ);
				307	if (fq_flow_is_throttled(f))
				308	fq_flow_unset_throttled(q, f);
				309	f->time_next_packet = 0ULL;
				310	}
				311	return f;
				312	}
				313	if (f->sk > sk)
				314	p = &parent->rb_right;
				315	else
				316	p = &parent->rb_left;
				317	}
				318
				319	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC \| __GFP_NOWARN);
				320	if (unlikely(!f)) {
				321	q->stat_allocation_errors++;
				322	return &q->internal;
				323	}
				324	/* f->t_root is already zeroed after kmem_cache_zalloc() */
				325
				326	fq_flow_set_detached(f);
				327	f->sk = sk;
				328	if (skb->sk == sk) {
				329	f->socket_hash = sk->sk_hash;
				330	if (q->rate_enable)
				331	smp_store_release(&sk->sk_pacing_status,
				332	SK_PACING_FQ);
				333	}
				334	f->credit = q->initial_quantum;
				335
				336	rb_link_node(&f->fq_node, parent, p);
				337	rb_insert_color(&f->fq_node, root);
				338
				339	q->flows++;
				340	q->inactive_flows++;
				341	return f;
				342	}
				343
				344	static struct sk_buff fq_peek(struct fq_flow flow)
				345	{
				346	struct sk_buff *skb = skb_rb_first(&flow->t_root);
				347	struct sk_buff *head = flow->head;
				348
				349	if (!skb)
				350	return head;
				351
				352	if (!head)
				353	return skb;
				354
				355	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
				356	return skb;
				357	return head;
				358	}
				359
				360	static void fq_erase_head(struct Qdisc sch, struct fq_flow flow,
				361	struct sk_buff *skb)
				362	{
				363	if (skb == flow->head) {
				364	flow->head = skb->next;
				365	} else {
				366	rb_erase(&skb->rbnode, &flow->t_root);
				367	skb->dev = qdisc_dev(sch);
				368	}
				369	}
				370
				371	/* remove one skb from head of flow queue */
				372	static struct sk_buff fq_dequeue_head(struct Qdisc sch, struct fq_flow *flow)
				373	{
				374	struct sk_buff *skb = fq_peek(flow);
				375
				376	if (skb) {
				377	fq_erase_head(sch, flow, skb);
				378	skb_mark_not_on_list(skb);
				379	flow->qlen--;
				380	qdisc_qstats_backlog_dec(sch, skb);
				381	sch->q.qlen--;
				382	}
				383	return skb;
				384	}
				385
				386	static void flow_queue_add(struct fq_flow flow, struct sk_buff skb)
				387	{
				388	struct rb_node *p, parent;
				389	struct sk_buff head, aux;
				390
				391	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
				392
				393	head = flow->head;
				394	if (!head \|\|
				395	fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
				396	if (!head)
				397	flow->head = skb;
				398	else
				399	flow->tail->next = skb;
				400	flow->tail = skb;
				401	skb->next = NULL;
				402	return;
				403	}
				404
				405	p = &flow->t_root.rb_node;
				406	parent = NULL;
				407
				408	while (*p) {
				409	parent = *p;
				410	aux = rb_to_skb(parent);
				411	if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
				412	p = &parent->rb_right;
				413	else
				414	p = &parent->rb_left;
				415	}
				416	rb_link_node(&skb->rbnode, parent, p);
				417	rb_insert_color(&skb->rbnode, &flow->t_root);
				418	}
				419
				420	static int fq_enqueue(struct sk_buff skb, struct Qdisc sch,
				421	struct sk_buff **to_free)
				422	{
				423	struct fq_sched_data *q = qdisc_priv(sch);
				424	struct fq_flow *f;
				425
				426	if (unlikely(sch->q.qlen >= sch->limit))
				427	return qdisc_drop(skb, sch, to_free);
				428
				429	f = fq_classify(skb, q);
				430	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
				431	q->stat_flows_plimit++;
				432	return qdisc_drop(skb, sch, to_free);
				433	}
				434
				435	f->qlen++;
				436	qdisc_qstats_backlog_inc(sch, skb);
				437	if (fq_flow_is_detached(f)) {
				438	fq_flow_add_tail(&q->new_flows, f);
				439	if (time_after(jiffies, f->age + q->flow_refill_delay))
				440	f->credit = max_t(u32, f->credit, q->quantum);
				441	q->inactive_flows--;
				442	}
				443
				444	/* Note: this overwrites f->age */
				445	flow_queue_add(f, skb);
				446
				447	if (unlikely(f == &q->internal)) {
				448	q->stat_internal_packets++;
				449	}
				450	sch->q.qlen++;
				451
				452	return NET_XMIT_SUCCESS;
				453	}
				454
				455	static void fq_check_throttled(struct fq_sched_data *q, u64 now)
				456	{
				457	unsigned long sample;
				458	struct rb_node *p;
				459
				460	if (q->time_next_delayed_flow > now)
				461	return;
				462
				463	/* Update unthrottle latency EWMA.
				464	* This is cheap and can help diagnosing timer/latency problems.
				465	*/
				466	sample = (unsigned long)(now - q->time_next_delayed_flow);
				467	q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
				468	q->unthrottle_latency_ns += sample >> 3;
				469
				470	q->time_next_delayed_flow = ~0ULL;
				471	while ((p = rb_first(&q->delayed)) != NULL) {
				472	struct fq_flow *f = rb_entry(p, struct fq_flow, rate_node);
				473
				474	if (f->time_next_packet > now) {
				475	q->time_next_delayed_flow = f->time_next_packet;
				476	break;
				477	}
				478	fq_flow_unset_throttled(q, f);
				479	}
				480	}
				481
				482	static struct sk_buff fq_dequeue(struct Qdisc sch)
				483	{
				484	struct fq_sched_data *q = qdisc_priv(sch);
				485	struct fq_flow_head *head;
				486	struct sk_buff *skb;
				487	struct fq_flow *f;
				488	unsigned long rate;
				489	u32 plen;
				490	u64 now;
				491
				492	if (!sch->q.qlen)
				493	return NULL;
				494
				495	skb = fq_dequeue_head(sch, &q->internal);
				496	if (skb)
				497	goto out;
				498
				499	now = ktime_get_ns();
				500	fq_check_throttled(q, now);
				501	begin:
				502	head = &q->new_flows;
				503	if (!head->first) {
				504	head = &q->old_flows;
				505	if (!head->first) {
				506	if (q->time_next_delayed_flow != ~0ULL)
				507	qdisc_watchdog_schedule_ns(&q->watchdog,
				508	q->time_next_delayed_flow);
				509	return NULL;
				510	}
				511	}
				512	f = head->first;
				513
				514	if (f->credit <= 0) {
				515	f->credit += q->quantum;
				516	head->first = f->next;
				517	fq_flow_add_tail(&q->old_flows, f);
				518	goto begin;
				519	}
				520
				521	skb = fq_peek(f);
				522	if (skb) {
				523	u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
				524	f->time_next_packet);
				525
				526	if (now < time_next_packet) {
				527	head->first = f->next;
				528	f->time_next_packet = time_next_packet;
				529	fq_flow_set_throttled(q, f);
				530	goto begin;
				531	}
				532	if (time_next_packet &&
				533	(s64)(now - time_next_packet - q->ce_threshold) > 0) {
				534	INET_ECN_set_ce(skb);
				535	q->stat_ce_mark++;
				536	}
				537	}
				538
				539	skb = fq_dequeue_head(sch, f);
				540	if (!skb) {
				541	head->first = f->next;
				542	/* force a pass through old_flows to prevent starvation */
				543	if ((head == &q->new_flows) && q->old_flows.first) {
				544	fq_flow_add_tail(&q->old_flows, f);
				545	} else {
				546	fq_flow_set_detached(f);
				547	q->inactive_flows++;
				548	}
				549	goto begin;
				550	}
				551	prefetch(&skb->end);
				552	plen = qdisc_pkt_len(skb);
				553	f->credit -= plen;
				554
				555	if (!q->rate_enable)
				556	goto out;
				557
				558	rate = q->flow_max_rate;
				559
				560	/* If EDT time was provided for this skb, we need to
				561	* update f->time_next_packet only if this qdisc enforces
				562	* a flow max rate.
				563	*/
				564	if (!skb->tstamp) {
				565	if (skb->sk)
				566	rate = min(skb->sk->sk_pacing_rate, rate);
				567
				568	if (rate <= q->low_rate_threshold) {
				569	f->credit = 0;
				570	} else {
				571	plen = max(plen, q->quantum);
				572	if (f->credit > 0)
				573	goto out;
				574	}
				575	}
				576	if (rate != ~0UL) {
				577	u64 len = (u64)plen * NSEC_PER_SEC;
				578
				579	if (likely(rate))
				580	len = div64_ul(len, rate);
				581	/* Since socket rate can change later,
				582	* clamp the delay to 1 second.
				583	* Really, providers of too big packets should be fixed !
				584	*/
				585	if (unlikely(len > NSEC_PER_SEC)) {
				586	len = NSEC_PER_SEC;
				587	q->stat_pkts_too_long++;
				588	}
				589	/* Account for schedule/timers drifts.
				590	* f->time_next_packet was set when prior packet was sent,
				591	* and current time (@now) can be too late by tens of us.
				592	*/
				593	if (f->time_next_packet)
				594	len -= min(len/2, now - f->time_next_packet);
				595	f->time_next_packet = now + len;
				596	}
				597	out:
				598	qdisc_bstats_update(sch, skb);
				599	return skb;
				600	}
				601
				602	static void fq_flow_purge(struct fq_flow *flow)
				603	{
				604	struct rb_node *p = rb_first(&flow->t_root);
				605
				606	while (p) {
				607	struct sk_buff *skb = rb_to_skb(p);
				608
				609	p = rb_next(p);
				610	rb_erase(&skb->rbnode, &flow->t_root);
				611	rtnl_kfree_skbs(skb, skb);
				612	}
				613	rtnl_kfree_skbs(flow->head, flow->tail);
				614	flow->head = NULL;
				615	flow->qlen = 0;
				616	}
				617
				618	static void fq_reset(struct Qdisc *sch)
				619	{
				620	struct fq_sched_data *q = qdisc_priv(sch);
				621	struct rb_root *root;
				622	struct rb_node *p;
				623	struct fq_flow *f;
				624	unsigned int idx;
				625
				626	sch->q.qlen = 0;
				627	sch->qstats.backlog = 0;
				628
				629	fq_flow_purge(&q->internal);
				630
				631	if (!q->fq_root)
				632	return;
				633
				634	for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
				635	root = &q->fq_root[idx];
				636	while ((p = rb_first(root)) != NULL) {
				637	f = rb_entry(p, struct fq_flow, fq_node);
				638	rb_erase(p, root);
				639
				640	fq_flow_purge(f);
				641
				642	kmem_cache_free(fq_flow_cachep, f);
				643	}
				644	}
				645	q->new_flows.first = NULL;
				646	q->old_flows.first = NULL;
				647	q->delayed = RB_ROOT;
				648	q->flows = 0;
				649	q->inactive_flows = 0;
				650	q->throttled_flows = 0;
				651	}
				652
				653	static void fq_rehash(struct fq_sched_data *q,
				654	struct rb_root *old_array, u32 old_log,
				655	struct rb_root *new_array, u32 new_log)
				656	{
				657	struct rb_node op, np, parent;
				658	struct rb_root oroot, nroot;
				659	struct fq_flow of, nf;
				660	int fcnt = 0;
				661	u32 idx;
				662
				663	for (idx = 0; idx < (1U << old_log); idx++) {
				664	oroot = &old_array[idx];
				665	while ((op = rb_first(oroot)) != NULL) {
				666	rb_erase(op, oroot);
				667	of = rb_entry(op, struct fq_flow, fq_node);
				668	if (fq_gc_candidate(of)) {
				669	fcnt++;
				670	kmem_cache_free(fq_flow_cachep, of);
				671	continue;
				672	}
				673	nroot = &new_array[hash_ptr(of->sk, new_log)];
				674
				675	np = &nroot->rb_node;
				676	parent = NULL;
				677	while (*np) {
				678	parent = *np;
				679
				680	nf = rb_entry(parent, struct fq_flow, fq_node);
				681	BUG_ON(nf->sk == of->sk);
				682
				683	if (nf->sk > of->sk)
				684	np = &parent->rb_right;
				685	else
				686	np = &parent->rb_left;
				687	}
				688
				689	rb_link_node(&of->fq_node, parent, np);
				690	rb_insert_color(&of->fq_node, nroot);
				691	}
				692	}
				693	q->flows -= fcnt;
				694	q->inactive_flows -= fcnt;
				695	q->stat_gc_flows += fcnt;
				696	}
				697
				698	static void fq_free(void *addr)
				699	{
				700	kvfree(addr);
				701	}
				702
				703	static int fq_resize(struct Qdisc *sch, u32 log)
				704	{
				705	struct fq_sched_data *q = qdisc_priv(sch);
				706	struct rb_root *array;
				707	void *old_fq_root;
				708	u32 idx;
				709
				710	if (q->fq_root && log == q->fq_trees_log)
				711	return 0;
				712
				713	/* If XPS was setup, we can allocate memory on right NUMA node */
				714	array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL \| __GFP_RETRY_MAYFAIL,
				715	netdev_queue_numa_node_read(sch->dev_queue));
				716	if (!array)
				717	return -ENOMEM;
				718
				719	for (idx = 0; idx < (1U << log); idx++)
				720	array[idx] = RB_ROOT;
				721
				722	sch_tree_lock(sch);
				723
				724	old_fq_root = q->fq_root;
				725	if (old_fq_root)
				726	fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
				727
				728	q->fq_root = array;
				729	q->fq_trees_log = log;
				730
				731	sch_tree_unlock(sch);
				732
				733	fq_free(old_fq_root);
				734
				735	return 0;
				736	}
				737
				738	static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
				739	[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
				740	[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
				741	[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
				742	[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
				743	[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
				744	[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
				745	[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
				746	[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
				747	[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
				748	[TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
				749	[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
				750	[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
				751	};
				752
				753	static int fq_change(struct Qdisc sch, struct nlattr opt,
				754	struct netlink_ext_ack *extack)
				755	{
				756	struct fq_sched_data *q = qdisc_priv(sch);
				757	struct nlattr *tb[TCA_FQ_MAX + 1];
				758	int err, drop_count = 0;
				759	unsigned drop_len = 0;
				760	u32 fq_log;
				761
				762	if (!opt)
				763	return -EINVAL;
				764
				765	err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy,
				766	NULL);
				767	if (err < 0)
				768	return err;
				769
				770	sch_tree_lock(sch);
				771
				772	fq_log = q->fq_trees_log;
				773
				774	if (tb[TCA_FQ_BUCKETS_LOG]) {
				775	u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
				776
				777	if (nval >= 1 && nval <= ilog2(256*1024))
				778	fq_log = nval;
				779	else
				780	err = -EINVAL;
				781	}
				782	if (tb[TCA_FQ_PLIMIT])
				783	sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
				784
				785	if (tb[TCA_FQ_FLOW_PLIMIT])
				786	q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
				787
				788	if (tb[TCA_FQ_QUANTUM]) {
				789	u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
				790
				791	if (quantum > 0 && quantum <= (1 << 20)) {
				792	q->quantum = quantum;
				793	} else {
				794	NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
				795	err = -EINVAL;
				796	}
				797	}
				798
				799	if (tb[TCA_FQ_INITIAL_QUANTUM])
				800	q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
				801
				802	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
				803	pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
				804	nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
				805
				806	if (tb[TCA_FQ_FLOW_MAX_RATE]) {
				807	u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
				808
				809	q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
				810	}
				811	if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
				812	q->low_rate_threshold =
				813	nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
				814
				815	if (tb[TCA_FQ_RATE_ENABLE]) {
				816	u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
				817
				818	if (enable <= 1)
				819	q->rate_enable = enable;
				820	else
				821	err = -EINVAL;
				822	}
				823
				824	if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
				825	u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
				826
				827	q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
				828	}
				829
				830	if (tb[TCA_FQ_ORPHAN_MASK])
				831	q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
				832
				833	if (tb[TCA_FQ_CE_THRESHOLD])
				834	q->ce_threshold = (u64)NSEC_PER_USEC *
				835	nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
				836
				837	if (!err) {
				838	sch_tree_unlock(sch);
				839	err = fq_resize(sch, fq_log);
				840	sch_tree_lock(sch);
				841	}
				842	while (sch->q.qlen > sch->limit) {
				843	struct sk_buff *skb = fq_dequeue(sch);
				844
				845	if (!skb)
				846	break;
				847	drop_len += qdisc_pkt_len(skb);
				848	rtnl_kfree_skbs(skb, skb);
				849	drop_count++;
				850	}
				851	qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
				852
				853	sch_tree_unlock(sch);
				854	return err;
				855	}
				856
				857	static void fq_destroy(struct Qdisc *sch)
				858	{
				859	struct fq_sched_data *q = qdisc_priv(sch);
				860
				861	fq_reset(sch);
				862	fq_free(q->fq_root);
				863	qdisc_watchdog_cancel(&q->watchdog);
				864	}
				865
				866	static int fq_init(struct Qdisc sch, struct nlattr opt,
				867	struct netlink_ext_ack *extack)
				868	{
				869	struct fq_sched_data *q = qdisc_priv(sch);
				870	int err;
				871
				872	sch->limit = 10000;
				873	q->flow_plimit = 100;
				874	q->quantum = 2 * psched_mtu(qdisc_dev(sch));
				875	q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
				876	q->flow_refill_delay = msecs_to_jiffies(40);
				877	q->flow_max_rate = ~0UL;
				878	q->time_next_delayed_flow = ~0ULL;
				879	q->rate_enable = 1;
				880	q->new_flows.first = NULL;
				881	q->old_flows.first = NULL;
				882	q->delayed = RB_ROOT;
				883	q->fq_root = NULL;
				884	q->fq_trees_log = ilog2(1024);
				885	q->orphan_mask = 1024 - 1;
				886	q->low_rate_threshold = 550000 / 8;
				887
				888	/* Default ce_threshold of 4294 seconds */
				889	q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
				890
				891	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
				892
				893	if (opt)
				894	err = fq_change(sch, opt, extack);
				895	else
				896	err = fq_resize(sch, q->fq_trees_log);
				897
				898	return err;
				899	}
				900
				901	static int fq_dump(struct Qdisc sch, struct sk_buff skb)
				902	{
				903	struct fq_sched_data *q = qdisc_priv(sch);
				904	u64 ce_threshold = q->ce_threshold;
				905	struct nlattr *opts;
				906
				907	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
				908	if (opts == NULL)
				909	goto nla_put_failure;
				910
				911	/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
				912
				913	do_div(ce_threshold, NSEC_PER_USEC);
				914
				915	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) \|\|
				916	nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) \|\|
				917	nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) \|\|
				918	nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) \|\|
				919	nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) \|\|
				920	nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
				921	min_t(unsigned long, q->flow_max_rate, ~0U)) \|\|
				922	nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
				923	jiffies_to_usecs(q->flow_refill_delay)) \|\|
				924	nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) \|\|
				925	nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
				926	q->low_rate_threshold) \|\|
				927	nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) \|\|
				928	nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
				929	goto nla_put_failure;
				930
				931	return nla_nest_end(skb, opts);
				932
				933	nla_put_failure:
				934	return -1;
				935	}
				936
				937	static int fq_dump_stats(struct Qdisc sch, struct gnet_dump d)
				938	{
				939	struct fq_sched_data *q = qdisc_priv(sch);
				940	struct tc_fq_qd_stats st;
				941
				942	sch_tree_lock(sch);
				943
				944	st.gc_flows = q->stat_gc_flows;
				945	st.highprio_packets = q->stat_internal_packets;
				946	st.tcp_retrans = 0;
				947	st.throttled = q->stat_throttled;
				948	st.flows_plimit = q->stat_flows_plimit;
				949	st.pkts_too_long = q->stat_pkts_too_long;
				950	st.allocation_errors = q->stat_allocation_errors;
				951	st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
				952	st.flows = q->flows;
				953	st.inactive_flows = q->inactive_flows;
				954	st.throttled_flows = q->throttled_flows;
				955	st.unthrottle_latency_ns = min_t(unsigned long,
				956	q->unthrottle_latency_ns, ~0U);
				957	st.ce_mark = q->stat_ce_mark;
				958	sch_tree_unlock(sch);
				959
				960	return gnet_stats_copy_app(d, &st, sizeof(st));
				961	}
				962
				963	static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
				964	.id = "fq",
				965	.priv_size = sizeof(struct fq_sched_data),
				966
				967	.enqueue = fq_enqueue,
				968	.dequeue = fq_dequeue,
				969	.peek = qdisc_peek_dequeued,
				970	.init = fq_init,
				971	.reset = fq_reset,
				972	.destroy = fq_destroy,
				973	.change = fq_change,
				974	.dump = fq_dump,
				975	.dump_stats = fq_dump_stats,
				976	.owner = THIS_MODULE,
				977	};
				978
				979	static int __init fq_module_init(void)
				980	{
				981	int ret;
				982
				983	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
				984	sizeof(struct fq_flow),
				985	0, 0, NULL);
				986	if (!fq_flow_cachep)
				987	return -ENOMEM;
				988
				989	ret = register_qdisc(&fq_qdisc_ops);
				990	if (ret)
				991	kmem_cache_destroy(fq_flow_cachep);
				992	return ret;
				993	}
				994
				995	static void __exit fq_module_exit(void)
				996	{
				997	unregister_qdisc(&fq_qdisc_ops);
				998	kmem_cache_destroy(fq_flow_cachep);
				999	}
				1000
				1001	module_init(fq_module_init)
				1002	module_exit(fq_module_exit)
				1003	MODULE_AUTHOR("Eric Dumazet");
				1004	MODULE_LICENSE("GPL");