Blame - src/kernel/linux/v4.19/net/sched/sch_netem.c - T800

blob: 15f8f24c190d4600a61f07a6acda86e29c433275 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* net/sched/sch_netem.c Network emulator
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License.
				8	*
				9	* Many of the algorithms and ideas for this came from
				10	* NIST Net which is not copyrighted.
				11	*
				12	* Authors: Stephen Hemminger <shemminger@osdl.org>
				13	* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
				14	*/
				15
				16	#include <linux/mm.h>
				17	#include <linux/module.h>
				18	#include <linux/slab.h>
				19	#include <linux/types.h>
				20	#include <linux/kernel.h>
				21	#include <linux/errno.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/vmalloc.h>
				24	#include <linux/rtnetlink.h>
				25	#include <linux/reciprocal_div.h>
				26	#include <linux/rbtree.h>
				27
				28	#include <net/netlink.h>
				29	#include <net/pkt_sched.h>
				30	#include <net/inet_ecn.h>
				31
				32	#define VERSION "1.3"
				33
				34	/* Network Emulation Queuing algorithm.
				35	====================================
				36
				37	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
				38	Network Emulation Tool"
				39	[2] Luigi Rizzo, DummyNet for FreeBSD
				40
				41	----------------------------------------------------------------
				42
				43	This started out as a simple way to delay outgoing packets to
				44	test TCP but has grown to include most of the functionality
				45	of a full blown network emulator like NISTnet. It can delay
				46	packets and add random jitter (and correlation). The random
				47	distribution can be loaded from a table as well to provide
				48	normal, Pareto, or experimental curves. Packet loss,
				49	duplication, and reordering can also be emulated.
				50
				51	This qdisc does not do classification that can be handled in
				52	layering other disciplines. It does not need to do bandwidth
				53	control either since that can be handled by using token
				54	bucket or other rate control.
				55
				56	Correlated Loss Generator models
				57
				58	Added generation of correlated loss according to the
				59	"Gilbert-Elliot" model, a 4-state markov model.
				60
				61	References:
				62	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
				63	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
				64	and intuitive loss model for packet networks and its implementation
				65	in the Netem module in the Linux kernel", available in [1]
				66
				67	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
				68	Fabio Ludovici <fabio.ludovici at yahoo.it>
				69	*/
				70
				71	struct disttable {
				72	u32 size;
				73	s16 table[0];
				74	};
				75
				76	struct netem_sched_data {
				77	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
				78	struct rb_root t_root;
				79
				80	/* optional qdisc for classful handling (NULL at netem init) */
				81	struct Qdisc *qdisc;
				82
				83	struct qdisc_watchdog watchdog;
				84
				85	s64 latency;
				86	s64 jitter;
				87
				88	u32 loss;
				89	u32 ecn;
				90	u32 limit;
				91	u32 counter;
				92	u32 gap;
				93	u32 duplicate;
				94	u32 reorder;
				95	u32 corrupt;
				96	u64 rate;
				97	s32 packet_overhead;
				98	u32 cell_size;
				99	struct reciprocal_value cell_size_reciprocal;
				100	s32 cell_overhead;
				101
				102	struct crndstate {
				103	u32 last;
				104	u32 rho;
				105	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
				106
				107	struct disttable *delay_dist;
				108
				109	enum {
				110	CLG_RANDOM,
				111	CLG_4_STATES,
				112	CLG_GILB_ELL,
				113	} loss_model;
				114
				115	enum {
				116	TX_IN_GAP_PERIOD = 1,
				117	TX_IN_BURST_PERIOD,
				118	LOST_IN_GAP_PERIOD,
				119	LOST_IN_BURST_PERIOD,
				120	} _4_state_model;
				121
				122	enum {
				123	GOOD_STATE = 1,
				124	BAD_STATE,
				125	} GE_state_model;
				126
				127	/* Correlated Loss Generation models */
				128	struct clgstate {
				129	/* state of the Markov chain */
				130	u8 state;
				131
				132	/* 4-states and Gilbert-Elliot models */
				133	u32 a1; /* p13 for 4-states or p for GE */
				134	u32 a2; /* p31 for 4-states or r for GE */
				135	u32 a3; /* p32 for 4-states or h for GE */
				136	u32 a4; /* p14 for 4-states or 1-k for GE */
				137	u32 a5; /* p23 used only in 4-states */
				138	} clg;
				139
				140	struct tc_netem_slot slot_config;
				141	struct slotstate {
				142	u64 slot_next;
				143	s32 packets_left;
				144	s32 bytes_left;
				145	} slot;
				146
				147	struct disttable *slot_dist;
				148	};
				149
				150	/* Time stamp put into socket buffer control block
				151	* Only valid when skbs are in our internal t(ime)fifo queue.
				152	*
				153	* As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
				154	* and skb->next & skb->prev are scratch space for a qdisc,
				155	* we save skb->tstamp value in skb->cb[] before destroying it.
				156	*/
				157	struct netem_skb_cb {
				158	u64 time_to_send;
				159	};
				160
				161	static inline struct netem_skb_cb netem_skb_cb(struct sk_buff skb)
				162	{
				163	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
				164	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
				165	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
				166	}
				167
				168	/* init_crandom - initialize correlated random number generator
				169	* Use entropy source for initial seed.
				170	*/
				171	static void init_crandom(struct crndstate *state, unsigned long rho)
				172	{
				173	state->rho = rho;
				174	state->last = prandom_u32();
				175	}
				176
				177	/* get_crandom - correlated random number generator
				178	* Next number depends on last value.
				179	* rho is scaled to avoid floating point.
				180	*/
				181	static u32 get_crandom(struct crndstate *state)
				182	{
				183	u64 value, rho;
				184	unsigned long answer;
				185
				186	if (!state \|\| state->rho == 0) /* no correlation */
				187	return prandom_u32();
				188
				189	value = prandom_u32();
				190	rho = (u64)state->rho + 1;
				191	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
				192	state->last = answer;
				193	return answer;
				194	}
				195
				196	/* loss_4state - 4-state model loss generator
				197	* Generates losses according to the 4-state Markov chain adopted in
				198	* the GI (General and Intuitive) loss model.
				199	*/
				200	static bool loss_4state(struct netem_sched_data *q)
				201	{
				202	struct clgstate *clg = &q->clg;
				203	u32 rnd = prandom_u32();
				204
				205	/*
				206	* Makes a comparison between rnd and the transition
				207	* probabilities outgoing from the current state, then decides the
				208	* next state and if the next packet has to be transmitted or lost.
				209	* The four states correspond to:
				210	* TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
				211	* LOST_IN_BURST_PERIOD => isolated losses within a gap period
				212	* LOST_IN_GAP_PERIOD => lost packets within a burst period
				213	* TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
				214	*/
				215	switch (clg->state) {
				216	case TX_IN_GAP_PERIOD:
				217	if (rnd < clg->a4) {
				218	clg->state = LOST_IN_BURST_PERIOD;
				219	return true;
				220	} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
				221	clg->state = LOST_IN_GAP_PERIOD;
				222	return true;
				223	} else if (clg->a1 + clg->a4 < rnd) {
				224	clg->state = TX_IN_GAP_PERIOD;
				225	}
				226
				227	break;
				228	case TX_IN_BURST_PERIOD:
				229	if (rnd < clg->a5) {
				230	clg->state = LOST_IN_GAP_PERIOD;
				231	return true;
				232	} else {
				233	clg->state = TX_IN_BURST_PERIOD;
				234	}
				235
				236	break;
				237	case LOST_IN_GAP_PERIOD:
				238	if (rnd < clg->a3)
				239	clg->state = TX_IN_BURST_PERIOD;
				240	else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
				241	clg->state = TX_IN_GAP_PERIOD;
				242	} else if (clg->a2 + clg->a3 < rnd) {
				243	clg->state = LOST_IN_GAP_PERIOD;
				244	return true;
				245	}
				246	break;
				247	case LOST_IN_BURST_PERIOD:
				248	clg->state = TX_IN_GAP_PERIOD;
				249	break;
				250	}
				251
				252	return false;
				253	}
				254
				255	/* loss_gilb_ell - Gilbert-Elliot model loss generator
				256	* Generates losses according to the Gilbert-Elliot loss model or
				257	* its special cases (Gilbert or Simple Gilbert)
				258	*
				259	* Makes a comparison between random number and the transition
				260	* probabilities outgoing from the current state, then decides the
				261	* next state. A second random number is extracted and the comparison
				262	* with the loss probability of the current state decides if the next
				263	* packet will be transmitted or lost.
				264	*/
				265	static bool loss_gilb_ell(struct netem_sched_data *q)
				266	{
				267	struct clgstate *clg = &q->clg;
				268
				269	switch (clg->state) {
				270	case GOOD_STATE:
				271	if (prandom_u32() < clg->a1)
				272	clg->state = BAD_STATE;
				273	if (prandom_u32() < clg->a4)
				274	return true;
				275	break;
				276	case BAD_STATE:
				277	if (prandom_u32() < clg->a2)
				278	clg->state = GOOD_STATE;
				279	if (prandom_u32() > clg->a3)
				280	return true;
				281	}
				282
				283	return false;
				284	}
				285
				286	static bool loss_event(struct netem_sched_data *q)
				287	{
				288	switch (q->loss_model) {
				289	case CLG_RANDOM:
				290	/* Random packet drop 0 => none, ~0 => all */
				291	return q->loss && q->loss >= get_crandom(&q->loss_cor);
				292
				293	case CLG_4_STATES:
				294	/* 4state loss model algorithm (used also for GI model)
				295	* Extracts a value from the markov 4 state loss generator,
				296	* if it is 1 drops a packet and if needed writes the event in
				297	* the kernel logs
				298	*/
				299	return loss_4state(q);
				300
				301	case CLG_GILB_ELL:
				302	/* Gilbert-Elliot loss model algorithm
				303	* Extracts a value from the Gilbert-Elliot loss generator,
				304	* if it is 1 drops a packet and if needed writes the event in
				305	* the kernel logs
				306	*/
				307	return loss_gilb_ell(q);
				308	}
				309
				310	return false; /* not reached */
				311	}
				312
				313
				314	/* tabledist - return a pseudo-randomly distributed value with mean mu and
				315	* std deviation sigma. Uses table lookup to approximate the desired
				316	* distribution, and a uniformly-distributed pseudo-random source.
				317	*/
				318	static s64 tabledist(s64 mu, s32 sigma,
				319	struct crndstate *state,
				320	const struct disttable *dist)
				321	{
				322	s64 x;
				323	long t;
				324	u32 rnd;
				325
				326	if (sigma == 0)
				327	return mu;
				328
				329	rnd = get_crandom(state);
				330
				331	/* default uniform distribution */
				332	if (dist == NULL)
				333	return ((rnd % (2 * sigma)) + mu) - sigma;
				334
				335	t = dist->table[rnd % dist->size];
				336	x = (sigma % NETEM_DIST_SCALE) * t;
				337	if (x >= 0)
				338	x += NETEM_DIST_SCALE/2;
				339	else
				340	x -= NETEM_DIST_SCALE/2;
				341
				342	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
				343	}
				344
				345	static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
				346	{
				347	len += q->packet_overhead;
				348
				349	if (q->cell_size) {
				350	u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
				351
				352	if (len > cells * q->cell_size) /* extra cell needed for remainder */
				353	cells++;
				354	len = cells * (q->cell_size + q->cell_overhead);
				355	}
				356
				357	return div64_u64(len * NSEC_PER_SEC, q->rate);
				358	}
				359
				360	static void tfifo_reset(struct Qdisc *sch)
				361	{
				362	struct netem_sched_data *q = qdisc_priv(sch);
				363	struct rb_node *p = rb_first(&q->t_root);
				364
				365	while (p) {
				366	struct sk_buff *skb = rb_to_skb(p);
				367
				368	p = rb_next(p);
				369	rb_erase(&skb->rbnode, &q->t_root);
				370	rtnl_kfree_skbs(skb, skb);
				371	}
				372	}
				373
				374	static void tfifo_enqueue(struct sk_buff nskb, struct Qdisc sch)
				375	{
				376	struct netem_sched_data *q = qdisc_priv(sch);
				377	u64 tnext = netem_skb_cb(nskb)->time_to_send;
				378	struct rb_node *p = &q->t_root.rb_node, parent = NULL;
				379
				380	while (*p) {
				381	struct sk_buff *skb;
				382
				383	parent = *p;
				384	skb = rb_to_skb(parent);
				385	if (tnext >= netem_skb_cb(skb)->time_to_send)
				386	p = &parent->rb_right;
				387	else
				388	p = &parent->rb_left;
				389	}
				390	rb_link_node(&nskb->rbnode, parent, p);
				391	rb_insert_color(&nskb->rbnode, &q->t_root);
				392	sch->q.qlen++;
				393	}
				394
				395	/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
				396	* when we statistically choose to corrupt one, we instead segment it, returning
				397	* the first packet to be corrupted, and re-enqueue the remaining frames
				398	*/
				399	static struct sk_buff netem_segment(struct sk_buff skb, struct Qdisc *sch,
				400	struct sk_buff **to_free)
				401	{
				402	struct sk_buff *segs;
				403	netdev_features_t features = netif_skb_features(skb);
				404
				405	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
				406
				407	if (IS_ERR_OR_NULL(segs)) {
				408	qdisc_drop(skb, sch, to_free);
				409	return NULL;
				410	}
				411	consume_skb(skb);
				412	return segs;
				413	}
				414
				415	static void netem_enqueue_skb_head(struct qdisc_skb_head qh, struct sk_buff skb)
				416	{
				417	skb->next = qh->head;
				418
				419	if (!qh->head)
				420	qh->tail = skb;
				421	qh->head = skb;
				422	qh->qlen++;
				423	}
				424
				425	/*
				426	* Insert one skb into qdisc.
				427	* Note: parent depends on return value to account for queue length.
				428	* NET_XMIT_DROP: queue length didn't change.
				429	* NET_XMIT_SUCCESS: one skb was queued.
				430	*/
				431	static int netem_enqueue(struct sk_buff skb, struct Qdisc sch,
				432	struct sk_buff **to_free)
				433	{
				434	struct netem_sched_data *q = qdisc_priv(sch);
				435	/* We don't fill cb now as skb_unshare() may invalidate it */
				436	struct netem_skb_cb *cb;
				437	struct sk_buff *skb2;
				438	struct sk_buff *segs = NULL;
				439	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
				440	int nb = 0;
				441	int count = 1;
				442	int rc = NET_XMIT_SUCCESS;
				443	int rc_drop = NET_XMIT_DROP;
				444
				445	/* Do not fool qdisc_drop_all() */
				446	skb->prev = NULL;
				447
				448	/* Random duplication */
				449	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
				450	++count;
				451
				452	/* Drop packet? */
				453	if (loss_event(q)) {
				454	if (q->ecn && INET_ECN_set_ce(skb))
				455	qdisc_qstats_drop(sch); /* mark packet */
				456	else
				457	--count;
				458	}
				459	if (count == 0) {
				460	qdisc_qstats_drop(sch);
				461	__qdisc_drop(skb, to_free);
				462	return NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				463	}
				464
				465	/* If a delay is expected, orphan the skb. (orphaning usually takes
				466	* place at TX completion time, so _before_ the link transit delay)
				467	*/
				468	if (q->latency \|\| q->jitter \|\| q->rate)
				469	skb_orphan_partial(skb);
				470
				471	/*
				472	* If we need to duplicate packet, then re-insert at top of the
				473	* qdisc tree, since parent queuer expects that only one
				474	* skb will be queued.
				475	*/
				476	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
				477	struct Qdisc *rootq = qdisc_root_bh(sch);
				478	u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
				479
				480	q->duplicate = 0;
				481	rootq->enqueue(skb2, rootq, to_free);
				482	q->duplicate = dupsave;
				483	rc_drop = NET_XMIT_SUCCESS;
				484	}
				485
				486	/*
				487	* Randomized packet corruption.
				488	* Make copy if needed since we are modifying
				489	* If packet is going to be hardware checksummed, then
				490	* do it now in software before we mangle it.
				491	*/
				492	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
				493	if (skb_is_gso(skb)) {
				494	segs = netem_segment(skb, sch, to_free);
				495	if (!segs)
				496	return rc_drop;
				497	} else {
				498	segs = skb;
				499	}
				500
				501	skb = segs;
				502	segs = segs->next;
				503
				504	skb = skb_unshare(skb, GFP_ATOMIC);
				505	if (unlikely(!skb)) {
				506	qdisc_qstats_drop(sch);
				507	goto finish_segs;
				508	}
				509	if (skb->ip_summed == CHECKSUM_PARTIAL &&
				510	skb_checksum_help(skb)) {
				511	qdisc_drop(skb, sch, to_free);
				512	goto finish_segs;
				513	}
				514
				515	skb->data[prandom_u32() % skb_headlen(skb)] ^=
				516	1<<(prandom_u32() % 8);
				517	}
				518
				519	if (unlikely(sch->q.qlen >= sch->limit)) {
				520	qdisc_drop_all(skb, sch, to_free);
				521	return rc_drop;
				522	}
				523
				524	qdisc_qstats_backlog_inc(sch, skb);
				525
				526	cb = netem_skb_cb(skb);
				527	if (q->gap == 0 \|\| /* not doing reordering */
				528	q->counter < q->gap - 1 \|\| /* inside last reordering gap */
				529	q->reorder < get_crandom(&q->reorder_cor)) {
				530	u64 now;
				531	s64 delay;
				532
				533	delay = tabledist(q->latency, q->jitter,
				534	&q->delay_cor, q->delay_dist);
				535
				536	now = ktime_get_ns();
				537
				538	if (q->rate) {
				539	struct netem_skb_cb *last = NULL;
				540
				541	if (sch->q.tail)
				542	last = netem_skb_cb(sch->q.tail);
				543	if (q->t_root.rb_node) {
				544	struct sk_buff *t_skb;
				545	struct netem_skb_cb *t_last;
				546
				547	t_skb = skb_rb_last(&q->t_root);
				548	t_last = netem_skb_cb(t_skb);
				549	if (!last \|\|
				550	t_last->time_to_send > last->time_to_send) {
				551	last = t_last;
				552	}
				553	}
				554
				555	if (last) {
				556	/*
				557	* Last packet in queue is reference point (now),
				558	* calculate this time bonus and subtract
				559	* from delay.
				560	*/
				561	delay -= last->time_to_send - now;
				562	delay = max_t(s64, 0, delay);
				563	now = last->time_to_send;
				564	}
				565
				566	delay += packet_time_ns(qdisc_pkt_len(skb), q);
				567	}
				568
				569	cb->time_to_send = now + delay;
				570	++q->counter;
				571	tfifo_enqueue(skb, sch);
				572	} else {
				573	/*
				574	* Do re-ordering by putting one out of N packets at the front
				575	* of the queue.
				576	*/
				577	cb->time_to_send = ktime_get_ns();
				578	q->counter = 0;
				579
				580	netem_enqueue_skb_head(&sch->q, skb);
				581	sch->qstats.requeues++;
				582	}
				583
				584	finish_segs:
				585	if (segs) {
				586	while (segs) {
				587	skb2 = segs->next;
				588	segs->next = NULL;
				589	qdisc_skb_cb(segs)->pkt_len = segs->len;
				590	last_len = segs->len;
				591	rc = qdisc_enqueue(segs, sch, to_free);
				592	if (rc != NET_XMIT_SUCCESS) {
				593	if (net_xmit_drop_count(rc))
				594	qdisc_qstats_drop(sch);
				595	} else {
				596	nb++;
				597	len += last_len;
				598	}
				599	segs = skb2;
				600	}
				601	sch->q.qlen += nb;
				602	if (nb > 1)
				603	qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
				604	}
				605	return NET_XMIT_SUCCESS;
				606	}
				607
				608	/* Delay the next round with a new future slot with a
				609	* correct number of bytes and packets.
				610	*/
				611
				612	static void get_slot_next(struct netem_sched_data *q, u64 now)
				613	{
				614	s64 next_delay;
				615
				616	if (!q->slot_dist)
				617	next_delay = q->slot_config.min_delay +
				618	(prandom_u32() *
				619	(q->slot_config.max_delay -
				620	q->slot_config.min_delay) >> 32);
				621	else
				622	next_delay = tabledist(q->slot_config.dist_delay,
				623	(s32)(q->slot_config.dist_jitter),
				624	NULL, q->slot_dist);
				625
				626	q->slot.slot_next = now + next_delay;
				627	q->slot.packets_left = q->slot_config.max_packets;
				628	q->slot.bytes_left = q->slot_config.max_bytes;
				629	}
				630
				631	static struct sk_buff netem_dequeue(struct Qdisc sch)
				632	{
				633	struct netem_sched_data *q = qdisc_priv(sch);
				634	struct sk_buff *skb;
				635	struct rb_node *p;
				636
				637	tfifo_dequeue:
				638	skb = __qdisc_dequeue_head(&sch->q);
				639	if (skb) {
				640	qdisc_qstats_backlog_dec(sch, skb);
				641	deliver:
				642	qdisc_bstats_update(sch, skb);
				643	return skb;
				644	}
				645	p = rb_first(&q->t_root);
				646	if (p) {
				647	u64 time_to_send;
				648	u64 now = ktime_get_ns();
				649
				650	skb = rb_to_skb(p);
				651
				652	/* if more time remaining? */
				653	time_to_send = netem_skb_cb(skb)->time_to_send;
				654	if (q->slot.slot_next && q->slot.slot_next < time_to_send)
				655	get_slot_next(q, now);
				656
				657	if (time_to_send <= now && q->slot.slot_next <= now) {
				658	rb_erase(p, &q->t_root);
				659	sch->q.qlen--;
				660	qdisc_qstats_backlog_dec(sch, skb);
				661	skb->next = NULL;
				662	skb->prev = NULL;
				663	/* skb->dev shares skb->rbnode area,
				664	* we need to restore its value.
				665	*/
				666	skb->dev = qdisc_dev(sch);
				667
				668	#ifdef CONFIG_NET_CLS_ACT
				669	/*
				670	* If it's at ingress let's pretend the delay is
				671	* from the network (tstamp will be updated).
				672	*/
				673	if (skb->tc_redirected && skb->tc_from_ingress)
				674	skb->tstamp = 0;
				675	#endif
				676
				677	if (q->slot.slot_next) {
				678	q->slot.packets_left--;
				679	q->slot.bytes_left -= qdisc_pkt_len(skb);
				680	if (q->slot.packets_left <= 0 \|\|
				681	q->slot.bytes_left <= 0)
				682	get_slot_next(q, now);
				683	}
				684
				685	if (q->qdisc) {
				686	unsigned int pkt_len = qdisc_pkt_len(skb);
				687	struct sk_buff *to_free = NULL;
				688	int err;
				689
				690	err = qdisc_enqueue(skb, q->qdisc, &to_free);
				691	kfree_skb_list(to_free);
				692	if (err != NET_XMIT_SUCCESS &&
				693	net_xmit_drop_count(err)) {
				694	qdisc_qstats_drop(sch);
				695	qdisc_tree_reduce_backlog(sch, 1,
				696	pkt_len);
				697	}
				698	goto tfifo_dequeue;
				699	}
				700	goto deliver;
				701	}
				702
				703	if (q->qdisc) {
				704	skb = q->qdisc->ops->dequeue(q->qdisc);
				705	if (skb)
				706	goto deliver;
				707	}
				708
				709	qdisc_watchdog_schedule_ns(&q->watchdog,
				710	max(time_to_send,
				711	q->slot.slot_next));
				712	}
				713
				714	if (q->qdisc) {
				715	skb = q->qdisc->ops->dequeue(q->qdisc);
				716	if (skb)
				717	goto deliver;
				718	}
				719	return NULL;
				720	}
				721
				722	static void netem_reset(struct Qdisc *sch)
				723	{
				724	struct netem_sched_data *q = qdisc_priv(sch);
				725
				726	qdisc_reset_queue(sch);
				727	tfifo_reset(sch);
				728	if (q->qdisc)
				729	qdisc_reset(q->qdisc);
				730	qdisc_watchdog_cancel(&q->watchdog);
				731	}
				732
				733	static void dist_free(struct disttable *d)
				734	{
				735	kvfree(d);
				736	}
				737
				738	/*
				739	* Distribution data is a variable size payload containing
				740	* signed 16 bit values.
				741	*/
				742
				743	static int get_dist_table(struct Qdisc sch, struct disttable *tbl,
				744	const struct nlattr *attr)
				745	{
				746	size_t n = nla_len(attr)/sizeof(__s16);
				747	const __s16 *data = nla_data(attr);
				748	spinlock_t *root_lock;
				749	struct disttable *d;
				750	int i;
				751
				752	if (!n \|\| n > NETEM_DIST_MAX)
				753	return -EINVAL;
				754
				755	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
				756	if (!d)
				757	return -ENOMEM;
				758
				759	d->size = n;
				760	for (i = 0; i < n; i++)
				761	d->table[i] = data[i];
				762
				763	root_lock = qdisc_root_sleeping_lock(sch);
				764
				765	spin_lock_bh(root_lock);
				766	swap(*tbl, d);
				767	spin_unlock_bh(root_lock);
				768
				769	dist_free(d);
				770	return 0;
				771	}
				772
				773	static void get_slot(struct netem_sched_data q, const struct nlattr attr)
				774	{
				775	const struct tc_netem_slot *c = nla_data(attr);
				776
				777	q->slot_config = *c;
				778	if (q->slot_config.max_packets == 0)
				779	q->slot_config.max_packets = INT_MAX;
				780	if (q->slot_config.max_bytes == 0)
				781	q->slot_config.max_bytes = INT_MAX;
				782	q->slot.packets_left = q->slot_config.max_packets;
				783	q->slot.bytes_left = q->slot_config.max_bytes;
				784	if (q->slot_config.min_delay \| q->slot_config.max_delay \|
				785	q->slot_config.dist_jitter)
				786	q->slot.slot_next = ktime_get_ns();
				787	else
				788	q->slot.slot_next = 0;
				789	}
				790
				791	static void get_correlation(struct netem_sched_data q, const struct nlattr attr)
				792	{
				793	const struct tc_netem_corr *c = nla_data(attr);
				794
				795	init_crandom(&q->delay_cor, c->delay_corr);
				796	init_crandom(&q->loss_cor, c->loss_corr);
				797	init_crandom(&q->dup_cor, c->dup_corr);
				798	}
				799
				800	static void get_reorder(struct netem_sched_data q, const struct nlattr attr)
				801	{
				802	const struct tc_netem_reorder *r = nla_data(attr);
				803
				804	q->reorder = r->probability;
				805	init_crandom(&q->reorder_cor, r->correlation);
				806	}
				807
				808	static void get_corrupt(struct netem_sched_data q, const struct nlattr attr)
				809	{
				810	const struct tc_netem_corrupt *r = nla_data(attr);
				811
				812	q->corrupt = r->probability;
				813	init_crandom(&q->corrupt_cor, r->correlation);
				814	}
				815
				816	static void get_rate(struct netem_sched_data q, const struct nlattr attr)
				817	{
				818	const struct tc_netem_rate *r = nla_data(attr);
				819
				820	q->rate = r->rate;
				821	q->packet_overhead = r->packet_overhead;
				822	q->cell_size = r->cell_size;
				823	q->cell_overhead = r->cell_overhead;
				824	if (q->cell_size)
				825	q->cell_size_reciprocal = reciprocal_value(q->cell_size);
				826	else
				827	q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
				828	}
				829
				830	static int get_loss_clg(struct netem_sched_data q, const struct nlattr attr)
				831	{
				832	const struct nlattr *la;
				833	int rem;
				834
				835	nla_for_each_nested(la, attr, rem) {
				836	u16 type = nla_type(la);
				837
				838	switch (type) {
				839	case NETEM_LOSS_GI: {
				840	const struct tc_netem_gimodel *gi = nla_data(la);
				841
				842	if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				843	pr_info("netem: incorrect gi model size\n");
				844	return -EINVAL;
				845	}
				846
				847	q->loss_model = CLG_4_STATES;
				848
				849	q->clg.state = TX_IN_GAP_PERIOD;
				850	q->clg.a1 = gi->p13;
				851	q->clg.a2 = gi->p31;
				852	q->clg.a3 = gi->p32;
				853	q->clg.a4 = gi->p14;
				854	q->clg.a5 = gi->p23;
				855	break;
				856	}
				857
				858	case NETEM_LOSS_GE: {
				859	const struct tc_netem_gemodel *ge = nla_data(la);
				860
				861	if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				862	pr_info("netem: incorrect ge model size\n");
				863	return -EINVAL;
				864	}
				865
				866	q->loss_model = CLG_GILB_ELL;
				867	q->clg.state = GOOD_STATE;
				868	q->clg.a1 = ge->p;
				869	q->clg.a2 = ge->r;
				870	q->clg.a3 = ge->h;
				871	q->clg.a4 = ge->k1;
				872	break;
				873	}
				874
				875	default:
				876	pr_info("netem: unknown loss type %u\n", type);
				877	return -EINVAL;
				878	}
				879	}
				880
				881	return 0;
				882	}
				883
				884	static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
				885	[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
				886	[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
				887	[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
				888	[TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
				889	[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
				890	[TCA_NETEM_ECN] = { .type = NLA_U32 },
				891	[TCA_NETEM_RATE64] = { .type = NLA_U64 },
				892	[TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
				893	[TCA_NETEM_JITTER64] = { .type = NLA_S64 },
				894	[TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
				895	};
				896
				897	static int parse_attr(struct nlattr tb[], int maxtype, struct nlattr nla,
				898	const struct nla_policy *policy, int len)
				899	{
				900	int nested_len = nla_len(nla) - NLA_ALIGN(len);
				901
				902	if (nested_len < 0) {
				903	pr_info("netem: invalid attributes len %d\n", nested_len);
				904	return -EINVAL;
				905	}
				906
				907	if (nested_len >= nla_attr_size(0))
				908	return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				909	nested_len, policy, NULL);
				910
				911	memset(tb, 0, sizeof(struct nlattr ) (maxtype + 1));
				912	return 0;
				913	}
				914
				915	/* Parse netlink message to set options */
				916	static int netem_change(struct Qdisc sch, struct nlattr opt,
				917	struct netlink_ext_ack *extack)
				918	{
				919	struct netem_sched_data *q = qdisc_priv(sch);
				920	struct nlattr *tb[TCA_NETEM_MAX + 1];
				921	struct tc_netem_qopt *qopt;
				922	struct clgstate old_clg;
				923	int old_loss_model = CLG_RANDOM;
				924	int ret;
				925
				926	if (opt == NULL)
				927	return -EINVAL;
				928
				929	qopt = nla_data(opt);
				930	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
				931	if (ret < 0)
				932	return ret;
				933
				934	/* backup q->clg and q->loss_model */
				935	old_clg = q->clg;
				936	old_loss_model = q->loss_model;
				937
				938	if (tb[TCA_NETEM_LOSS]) {
				939	ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
				940	if (ret) {
				941	q->loss_model = old_loss_model;
				942	return ret;
				943	}
				944	} else {
				945	q->loss_model = CLG_RANDOM;
				946	}
				947
				948	if (tb[TCA_NETEM_DELAY_DIST]) {
				949	ret = get_dist_table(sch, &q->delay_dist,
				950	tb[TCA_NETEM_DELAY_DIST]);
				951	if (ret)
				952	goto get_table_failure;
				953	}
				954
				955	if (tb[TCA_NETEM_SLOT_DIST]) {
				956	ret = get_dist_table(sch, &q->slot_dist,
				957	tb[TCA_NETEM_SLOT_DIST]);
				958	if (ret)
				959	goto get_table_failure;
				960	}
				961
				962	sch->limit = qopt->limit;
				963
				964	q->latency = PSCHED_TICKS2NS(qopt->latency);
				965	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
				966	q->limit = qopt->limit;
				967	q->gap = qopt->gap;
				968	q->counter = 0;
				969	q->loss = qopt->loss;
				970	q->duplicate = qopt->duplicate;
				971
				972	/* for compatibility with earlier versions.
				973	* if gap is set, need to assume 100% probability
				974	*/
				975	if (q->gap)
				976	q->reorder = ~0;
				977
				978	if (tb[TCA_NETEM_CORR])
				979	get_correlation(q, tb[TCA_NETEM_CORR]);
				980
				981	if (tb[TCA_NETEM_REORDER])
				982	get_reorder(q, tb[TCA_NETEM_REORDER]);
				983
				984	if (tb[TCA_NETEM_CORRUPT])
				985	get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
				986
				987	if (tb[TCA_NETEM_RATE])
				988	get_rate(q, tb[TCA_NETEM_RATE]);
				989
				990	if (tb[TCA_NETEM_RATE64])
				991	q->rate = max_t(u64, q->rate,
				992	nla_get_u64(tb[TCA_NETEM_RATE64]));
				993
				994	if (tb[TCA_NETEM_LATENCY64])
				995	q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
				996
				997	if (tb[TCA_NETEM_JITTER64])
				998	q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
				999
				1000	if (tb[TCA_NETEM_ECN])
				1001	q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
				1002
				1003	if (tb[TCA_NETEM_SLOT])
				1004	get_slot(q, tb[TCA_NETEM_SLOT]);
				1005
				1006	return ret;
				1007
				1008	get_table_failure:
				1009	/* recover clg and loss_model, in case of
				1010	* q->clg and q->loss_model were modified
				1011	* in get_loss_clg()
				1012	*/
				1013	q->clg = old_clg;
				1014	q->loss_model = old_loss_model;
				1015	return ret;
				1016	}
				1017
				1018	static int netem_init(struct Qdisc sch, struct nlattr opt,
				1019	struct netlink_ext_ack *extack)
				1020	{
				1021	struct netem_sched_data *q = qdisc_priv(sch);
				1022	int ret;
				1023
				1024	qdisc_watchdog_init(&q->watchdog, sch);
				1025
				1026	if (!opt)
				1027	return -EINVAL;
				1028
				1029	q->loss_model = CLG_RANDOM;
				1030	ret = netem_change(sch, opt, extack);
				1031	if (ret)
				1032	pr_info("netem: change failed\n");
				1033	return ret;
				1034	}
				1035
				1036	static void netem_destroy(struct Qdisc *sch)
				1037	{
				1038	struct netem_sched_data *q = qdisc_priv(sch);
				1039
				1040	qdisc_watchdog_cancel(&q->watchdog);
				1041	if (q->qdisc)
				1042	qdisc_destroy(q->qdisc);
				1043	dist_free(q->delay_dist);
				1044	dist_free(q->slot_dist);
				1045	}
				1046
				1047	static int dump_loss_model(const struct netem_sched_data *q,
				1048	struct sk_buff *skb)
				1049	{
				1050	struct nlattr *nest;
				1051
				1052	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
				1053	if (nest == NULL)
				1054	goto nla_put_failure;
				1055
				1056	switch (q->loss_model) {
				1057	case CLG_RANDOM:
				1058	/* legacy loss model */
				1059	nla_nest_cancel(skb, nest);
				1060	return 0; /* no data */
				1061
				1062	case CLG_4_STATES: {
				1063	struct tc_netem_gimodel gi = {
				1064	.p13 = q->clg.a1,
				1065	.p31 = q->clg.a2,
				1066	.p32 = q->clg.a3,
				1067	.p14 = q->clg.a4,
				1068	.p23 = q->clg.a5,
				1069	};
				1070
				1071	if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
				1072	goto nla_put_failure;
				1073	break;
				1074	}
				1075	case CLG_GILB_ELL: {
				1076	struct tc_netem_gemodel ge = {
				1077	.p = q->clg.a1,
				1078	.r = q->clg.a2,
				1079	.h = q->clg.a3,
				1080	.k1 = q->clg.a4,
				1081	};
				1082
				1083	if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
				1084	goto nla_put_failure;
				1085	break;
				1086	}
				1087	}
				1088
				1089	nla_nest_end(skb, nest);
				1090	return 0;
				1091
				1092	nla_put_failure:
				1093	nla_nest_cancel(skb, nest);
				1094	return -1;
				1095	}
				1096
				1097	static int netem_dump(struct Qdisc sch, struct sk_buff skb)
				1098	{
				1099	const struct netem_sched_data *q = qdisc_priv(sch);
				1100	struct nlattr nla = (struct nlattr ) skb_tail_pointer(skb);
				1101	struct tc_netem_qopt qopt;
				1102	struct tc_netem_corr cor;
				1103	struct tc_netem_reorder reorder;
				1104	struct tc_netem_corrupt corrupt;
				1105	struct tc_netem_rate rate;
				1106	struct tc_netem_slot slot;
				1107
				1108	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
				1109	UINT_MAX);
				1110	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
				1111	UINT_MAX);
				1112	qopt.limit = q->limit;
				1113	qopt.loss = q->loss;
				1114	qopt.gap = q->gap;
				1115	qopt.duplicate = q->duplicate;
				1116	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
				1117	goto nla_put_failure;
				1118
				1119	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
				1120	goto nla_put_failure;
				1121
				1122	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
				1123	goto nla_put_failure;
				1124
				1125	cor.delay_corr = q->delay_cor.rho;
				1126	cor.loss_corr = q->loss_cor.rho;
				1127	cor.dup_corr = q->dup_cor.rho;
				1128	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
				1129	goto nla_put_failure;
				1130
				1131	reorder.probability = q->reorder;
				1132	reorder.correlation = q->reorder_cor.rho;
				1133	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
				1134	goto nla_put_failure;
				1135
				1136	corrupt.probability = q->corrupt;
				1137	corrupt.correlation = q->corrupt_cor.rho;
				1138	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
				1139	goto nla_put_failure;
				1140
				1141	if (q->rate >= (1ULL << 32)) {
				1142	if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				1143	TCA_NETEM_PAD))
				1144	goto nla_put_failure;
				1145	rate.rate = ~0U;
				1146	} else {
				1147	rate.rate = q->rate;
				1148	}
				1149	rate.packet_overhead = q->packet_overhead;
				1150	rate.cell_size = q->cell_size;
				1151	rate.cell_overhead = q->cell_overhead;
				1152	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
				1153	goto nla_put_failure;
				1154
				1155	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
				1156	goto nla_put_failure;
				1157
				1158	if (dump_loss_model(q, skb) != 0)
				1159	goto nla_put_failure;
				1160
				1161	if (q->slot_config.min_delay \| q->slot_config.max_delay \|
				1162	q->slot_config.dist_jitter) {
				1163	slot = q->slot_config;
				1164	if (slot.max_packets == INT_MAX)
				1165	slot.max_packets = 0;
				1166	if (slot.max_bytes == INT_MAX)
				1167	slot.max_bytes = 0;
				1168	if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
				1169	goto nla_put_failure;
				1170	}
				1171
				1172	return nla_nest_end(skb, nla);
				1173
				1174	nla_put_failure:
				1175	nlmsg_trim(skb, nla);
				1176	return -1;
				1177	}
				1178
				1179	static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
				1180	struct sk_buff skb, struct tcmsg tcm)
				1181	{
				1182	struct netem_sched_data *q = qdisc_priv(sch);
				1183
				1184	if (cl != 1 \|\| !q->qdisc) /* only one class */
				1185	return -ENOENT;
				1186
				1187	tcm->tcm_handle \|= TC_H_MIN(1);
				1188	tcm->tcm_info = q->qdisc->handle;
				1189
				1190	return 0;
				1191	}
				1192
				1193	static int netem_graft(struct Qdisc sch, unsigned long arg, struct Qdisc new,
				1194	struct Qdisc *old, struct netlink_ext_ack extack)
				1195	{
				1196	struct netem_sched_data *q = qdisc_priv(sch);
				1197
				1198	*old = qdisc_replace(sch, new, &q->qdisc);
				1199	return 0;
				1200	}
				1201
				1202	static struct Qdisc netem_leaf(struct Qdisc sch, unsigned long arg)
				1203	{
				1204	struct netem_sched_data *q = qdisc_priv(sch);
				1205	return q->qdisc;
				1206	}
				1207
				1208	static unsigned long netem_find(struct Qdisc *sch, u32 classid)
				1209	{
				1210	return 1;
				1211	}
				1212
				1213	static void netem_walk(struct Qdisc sch, struct qdisc_walker walker)
				1214	{
				1215	if (!walker->stop) {
				1216	if (walker->count >= walker->skip)
				1217	if (walker->fn(sch, 1, walker) < 0) {
				1218	walker->stop = 1;
				1219	return;
				1220	}
				1221	walker->count++;
				1222	}
				1223	}
				1224
				1225	static const struct Qdisc_class_ops netem_class_ops = {
				1226	.graft = netem_graft,
				1227	.leaf = netem_leaf,
				1228	.find = netem_find,
				1229	.walk = netem_walk,
				1230	.dump = netem_dump_class,
				1231	};
				1232
				1233	static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
				1234	.id = "netem",
				1235	.cl_ops = &netem_class_ops,
				1236	.priv_size = sizeof(struct netem_sched_data),
				1237	.enqueue = netem_enqueue,
				1238	.dequeue = netem_dequeue,
				1239	.peek = qdisc_peek_dequeued,
				1240	.init = netem_init,
				1241	.reset = netem_reset,
				1242	.destroy = netem_destroy,
				1243	.change = netem_change,
				1244	.dump = netem_dump,
				1245	.owner = THIS_MODULE,
				1246	};
				1247
				1248
				1249	static int __init netem_module_init(void)
				1250	{
				1251	pr_info("netem: version " VERSION "\n");
				1252	return register_qdisc(&netem_qdisc_ops);
				1253	}
				1254	static void __exit netem_module_exit(void)
				1255	{
				1256	unregister_qdisc(&netem_qdisc_ops);
				1257	}
				1258	module_init(netem_module_init)
				1259	module_exit(netem_module_exit)
				1260	MODULE_LICENSE("GPL");