Blame - ap/os/linux/linux-3.4.x/net/sched/sch_netem.c - R306

blob: 992acaac5de64301efa5a8a5f3b8813bf624a3fd [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* net/sched/sch_netem.c Network emulator
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License.
				8	*
				9	* Many of the algorithms and ideas for this came from
				10	* NIST Net which is not copyrighted.
				11	*
				12	* Authors: Stephen Hemminger <shemminger@osdl.org>
				13	* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
				14	*/
				15
				16	#include <linux/mm.h>
				17	#include <linux/module.h>
				18	#include <linux/slab.h>
				19	#include <linux/types.h>
				20	#include <linux/kernel.h>
				21	#include <linux/errno.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/vmalloc.h>
				24	#include <linux/rtnetlink.h>
				25	#include <linux/reciprocal_div.h>
				26
				27	#include <net/netlink.h>
				28	#include <net/pkt_sched.h>
				29
				30	#define VERSION "1.3"
				31
				32	/* Network Emulation Queuing algorithm.
				33	====================================
				34
				35	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
				36	Network Emulation Tool"
				37	[2] Luigi Rizzo, DummyNet for FreeBSD
				38
				39	----------------------------------------------------------------
				40
				41	This started out as a simple way to delay outgoing packets to
				42	test TCP but has grown to include most of the functionality
				43	of a full blown network emulator like NISTnet. It can delay
				44	packets and add random jitter (and correlation). The random
				45	distribution can be loaded from a table as well to provide
				46	normal, Pareto, or experimental curves. Packet loss,
				47	duplication, and reordering can also be emulated.
				48
				49	This qdisc does not do classification that can be handled in
				50	layering other disciplines. It does not need to do bandwidth
				51	control either since that can be handled by using token
				52	bucket or other rate control.
				53
				54	Correlated Loss Generator models
				55
				56	Added generation of correlated loss according to the
				57	"Gilbert-Elliot" model, a 4-state markov model.
				58
				59	References:
				60	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
				61	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
				62	and intuitive loss model for packet networks and its implementation
				63	in the Netem module in the Linux kernel", available in [1]
				64
				65	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
				66	Fabio Ludovici <fabio.ludovici at yahoo.it>
				67	*/
				68
				69	struct netem_sched_data {
				70	/* internal t(ime)fifo qdisc uses sch->q and sch->limit */
				71
				72	/* optional qdisc for classful handling (NULL at netem init) */
				73	struct Qdisc *qdisc;
				74
				75	struct qdisc_watchdog watchdog;
				76
				77	psched_tdiff_t latency;
				78	psched_tdiff_t jitter;
				79
				80	u32 loss;
				81	u32 limit;
				82	u32 counter;
				83	u32 gap;
				84	u32 duplicate;
				85	u32 reorder;
				86	u32 corrupt;
				87	u32 rate;
				88	s32 packet_overhead;
				89	u32 cell_size;
				90	u32 cell_size_reciprocal;
				91	s32 cell_overhead;
				92
				93	struct crndstate {
				94	u32 last;
				95	u32 rho;
				96	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
				97
				98	struct disttable {
				99	u32 size;
				100	s16 table[0];
				101	} *delay_dist;
				102
				103	enum {
				104	CLG_RANDOM,
				105	CLG_4_STATES,
				106	CLG_GILB_ELL,
				107	} loss_model;
				108
				109	/* Correlated Loss Generation models */
				110	struct clgstate {
				111	/* state of the Markov chain */
				112	u8 state;
				113
				114	/* 4-states and Gilbert-Elliot models */
				115	u32 a1; /* p13 for 4-states or p for GE */
				116	u32 a2; /* p31 for 4-states or r for GE */
				117	u32 a3; /* p32 for 4-states or h for GE */
				118	u32 a4; /* p14 for 4-states or 1-k for GE */
				119	u32 a5; /* p23 used only in 4-states */
				120	} clg;
				121
				122	};
				123
				124	/* Time stamp put into socket buffer control block
				125	* Only valid when skbs are in our internal t(ime)fifo queue.
				126	*/
				127	struct netem_skb_cb {
				128	psched_time_t time_to_send;
				129	};
				130
				131	static inline struct netem_skb_cb netem_skb_cb(struct sk_buff skb)
				132	{
				133	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
				134	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
				135	}
				136
				137	/* init_crandom - initialize correlated random number generator
				138	* Use entropy source for initial seed.
				139	*/
				140	static void init_crandom(struct crndstate *state, unsigned long rho)
				141	{
				142	state->rho = rho;
				143	state->last = net_random();
				144	}
				145
				146	/* get_crandom - correlated random number generator
				147	* Next number depends on last value.
				148	* rho is scaled to avoid floating point.
				149	*/
				150	static u32 get_crandom(struct crndstate *state)
				151	{
				152	u64 value, rho;
				153	unsigned long answer;
				154
				155	if (state->rho == 0) /* no correlation */
				156	return net_random();
				157
				158	value = net_random();
				159	rho = (u64)state->rho + 1;
				160	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
				161	state->last = answer;
				162	return answer;
				163	}
				164
				165	/* loss_4state - 4-state model loss generator
				166	* Generates losses according to the 4-state Markov chain adopted in
				167	* the GI (General and Intuitive) loss model.
				168	*/
				169	static bool loss_4state(struct netem_sched_data *q)
				170	{
				171	struct clgstate *clg = &q->clg;
				172	u32 rnd = net_random();
				173
				174	/*
				175	* Makes a comparison between rnd and the transition
				176	* probabilities outgoing from the current state, then decides the
				177	* next state and if the next packet has to be transmitted or lost.
				178	* The four states correspond to:
				179	* 1 => successfully transmitted packets within a gap period
				180	* 4 => isolated losses within a gap period
				181	* 3 => lost packets within a burst period
				182	* 2 => successfully transmitted packets within a burst period
				183	*/
				184	switch (clg->state) {
				185	case 1:
				186	if (rnd < clg->a4) {
				187	clg->state = 4;
				188	return true;
				189	} else if (clg->a4 < rnd && rnd < clg->a1) {
				190	clg->state = 3;
				191	return true;
				192	} else if (clg->a1 < rnd)
				193	clg->state = 1;
				194
				195	break;
				196	case 2:
				197	if (rnd < clg->a5) {
				198	clg->state = 3;
				199	return true;
				200	} else
				201	clg->state = 2;
				202
				203	break;
				204	case 3:
				205	if (rnd < clg->a3)
				206	clg->state = 2;
				207	else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
				208	clg->state = 1;
				209	return true;
				210	} else if (clg->a2 + clg->a3 < rnd) {
				211	clg->state = 3;
				212	return true;
				213	}
				214	break;
				215	case 4:
				216	clg->state = 1;
				217	break;
				218	}
				219
				220	return false;
				221	}
				222
				223	/* loss_gilb_ell - Gilbert-Elliot model loss generator
				224	* Generates losses according to the Gilbert-Elliot loss model or
				225	* its special cases (Gilbert or Simple Gilbert)
				226	*
				227	* Makes a comparison between random number and the transition
				228	* probabilities outgoing from the current state, then decides the
				229	* next state. A second random number is extracted and the comparison
				230	* with the loss probability of the current state decides if the next
				231	* packet will be transmitted or lost.
				232	*/
				233	static bool loss_gilb_ell(struct netem_sched_data *q)
				234	{
				235	struct clgstate *clg = &q->clg;
				236
				237	switch (clg->state) {
				238	case 1:
				239	if (net_random() < clg->a1)
				240	clg->state = 2;
				241	if (net_random() < clg->a4)
				242	return true;
				243	case 2:
				244	if (net_random() < clg->a2)
				245	clg->state = 1;
				246	if (clg->a3 > net_random())
				247	return true;
				248	}
				249
				250	return false;
				251	}
				252
				253	static bool loss_event(struct netem_sched_data *q)
				254	{
				255	switch (q->loss_model) {
				256	case CLG_RANDOM:
				257	/* Random packet drop 0 => none, ~0 => all */
				258	return q->loss && q->loss >= get_crandom(&q->loss_cor);
				259
				260	case CLG_4_STATES:
				261	/* 4state loss model algorithm (used also for GI model)
				262	* Extracts a value from the markov 4 state loss generator,
				263	* if it is 1 drops a packet and if needed writes the event in
				264	* the kernel logs
				265	*/
				266	return loss_4state(q);
				267
				268	case CLG_GILB_ELL:
				269	/* Gilbert-Elliot loss model algorithm
				270	* Extracts a value from the Gilbert-Elliot loss generator,
				271	* if it is 1 drops a packet and if needed writes the event in
				272	* the kernel logs
				273	*/
				274	return loss_gilb_ell(q);
				275	}
				276
				277	return false; /* not reached */
				278	}
				279
				280
				281	/* tabledist - return a pseudo-randomly distributed value with mean mu and
				282	* std deviation sigma. Uses table lookup to approximate the desired
				283	* distribution, and a uniformly-distributed pseudo-random source.
				284	*/
				285	static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				286	struct crndstate *state,
				287	const struct disttable *dist)
				288	{
				289	psched_tdiff_t x;
				290	long t;
				291	u32 rnd;
				292
				293	if (sigma == 0)
				294	return mu;
				295
				296	rnd = get_crandom(state);
				297
				298	/* default uniform distribution */
				299	if (dist == NULL)
				300	return (rnd % (2*sigma)) - sigma + mu;
				301
				302	t = dist->table[rnd % dist->size];
				303	x = (sigma % NETEM_DIST_SCALE) * t;
				304	if (x >= 0)
				305	x += NETEM_DIST_SCALE/2;
				306	else
				307	x -= NETEM_DIST_SCALE/2;
				308
				309	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
				310	}
				311
				312	static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
				313	{
				314	u64 ticks;
				315
				316	len += q->packet_overhead;
				317
				318	if (q->cell_size) {
				319	u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
				320
				321	if (len > cells * q->cell_size) /* extra cell needed for remainder */
				322	cells++;
				323	len = cells * (q->cell_size + q->cell_overhead);
				324	}
				325
				326	ticks = (u64)len * NSEC_PER_SEC;
				327
				328	do_div(ticks, q->rate);
				329	return PSCHED_NS2TICKS(ticks);
				330	}
				331
				332	static void tfifo_enqueue(struct sk_buff nskb, struct Qdisc sch)
				333	{
				334	struct sk_buff_head *list = &sch->q;
				335	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
				336	struct sk_buff *skb = skb_peek_tail(list);
				337
				338	/* Optimize for add at tail */
				339	if (likely(!skb \|\| tnext >= netem_skb_cb(skb)->time_to_send))
				340	return __skb_queue_tail(list, nskb);
				341
				342	skb_queue_reverse_walk(list, skb) {
				343	if (tnext >= netem_skb_cb(skb)->time_to_send)
				344	break;
				345	}
				346
				347	__skb_queue_after(list, skb, nskb);
				348	}
				349
				350	/*
				351	* Insert one skb into qdisc.
				352	* Note: parent depends on return value to account for queue length.
				353	* NET_XMIT_DROP: queue length didn't change.
				354	* NET_XMIT_SUCCESS: one skb was queued.
				355	*/
				356	static int netem_enqueue(struct sk_buff skb, struct Qdisc sch)
				357	{
				358	struct netem_sched_data *q = qdisc_priv(sch);
				359	/* We don't fill cb now as skb_unshare() may invalidate it */
				360	struct netem_skb_cb *cb;
				361	struct sk_buff *skb2;
				362	int count = 1;
				363
				364	/* Random duplication */
				365	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
				366	++count;
				367
				368	/* Drop packet? */
				369	if (loss_event(q))
				370	--count;
				371
				372	if (count == 0) {
				373	sch->qstats.drops++;
				374	kfree_skb(skb);
				375	return NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				376	}
				377
				378	skb_orphan(skb);
				379
				380	/*
				381	* If we need to duplicate packet, then re-insert at top of the
				382	* qdisc tree, since parent queuer expects that only one
				383	* skb will be queued.
				384	*/
				385	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
				386	struct Qdisc *rootq = qdisc_root(sch);
				387	u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
				388	q->duplicate = 0;
				389
				390	qdisc_enqueue_root(skb2, rootq);
				391	q->duplicate = dupsave;
				392	}
				393
				394	/*
				395	* Randomized packet corruption.
				396	* Make copy if needed since we are modifying
				397	* If packet is going to be hardware checksummed, then
				398	* do it now in software before we mangle it.
				399	*/
				400	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
				401	if (!(skb = skb_unshare(skb, GFP_ATOMIC)) \|\|
				402	(skb->ip_summed == CHECKSUM_PARTIAL &&
				403	skb_checksum_help(skb)))
				404	return qdisc_drop(skb, sch);
				405
				406	skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
				407	}
				408
				409	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
				410	return qdisc_reshape_fail(skb, sch);
				411
				412	sch->qstats.backlog += qdisc_pkt_len(skb);
				413
				414	cb = netem_skb_cb(skb);
				415	if (q->gap == 0 \|\| /* not doing reordering */
				416	q->counter < q->gap - 1 \|\| /* inside last reordering gap */
				417	q->reorder < get_crandom(&q->reorder_cor)) {
				418	psched_time_t now;
				419	psched_tdiff_t delay;
				420
				421	delay = tabledist(q->latency, q->jitter,
				422	&q->delay_cor, q->delay_dist);
				423
				424	now = psched_get_time();
				425
				426	if (q->rate) {
				427	struct sk_buff_head *list = &sch->q;
				428
				429	delay += packet_len_2_sched_time(skb->len, q);
				430
				431	if (!skb_queue_empty(list)) {
				432	/*
				433	* Last packet in queue is reference point (now).
				434	* First packet in queue is already in flight,
				435	* calculate this time bonus and substract
				436	* from delay.
				437	*/
				438	delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
				439	now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
				440	}
				441	}
				442
				443	cb->time_to_send = now + delay;
				444	++q->counter;
				445	tfifo_enqueue(skb, sch);
				446	} else {
				447	/*
				448	* Do re-ordering by putting one out of N packets at the front
				449	* of the queue.
				450	*/
				451	cb->time_to_send = psched_get_time();
				452	q->counter = 0;
				453
				454	__skb_queue_head(&sch->q, skb);
				455	sch->qstats.requeues++;
				456	}
				457
				458	return NET_XMIT_SUCCESS;
				459	}
				460
				461	static unsigned int netem_drop(struct Qdisc *sch)
				462	{
				463	struct netem_sched_data *q = qdisc_priv(sch);
				464	unsigned int len;
				465
				466	len = qdisc_queue_drop(sch);
				467	if (!len && q->qdisc && q->qdisc->ops->drop)
				468	len = q->qdisc->ops->drop(q->qdisc);
				469	if (len)
				470	sch->qstats.drops++;
				471
				472	return len;
				473	}
				474
				475	static struct sk_buff netem_dequeue(struct Qdisc sch)
				476	{
				477	struct netem_sched_data *q = qdisc_priv(sch);
				478	struct sk_buff *skb;
				479
				480	if (qdisc_is_throttled(sch))
				481	return NULL;
				482
				483	tfifo_dequeue:
				484	skb = qdisc_peek_head(sch);
				485	if (skb) {
				486	const struct netem_skb_cb *cb = netem_skb_cb(skb);
				487
				488	/* if more time remaining? */
				489	if (cb->time_to_send <= psched_get_time()) {
				490	__skb_unlink(skb, &sch->q);
				491	sch->qstats.backlog -= qdisc_pkt_len(skb);
				492
				493	#ifdef CONFIG_NET_CLS_ACT
				494	/*
				495	* If it's at ingress let's pretend the delay is
				496	* from the network (tstamp will be updated).
				497	*/
				498	if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				499	skb->tstamp.tv64 = 0;
				500	#endif
				501
				502	if (q->qdisc) {
				503	int err = qdisc_enqueue(skb, q->qdisc);
				504
				505	if (unlikely(err != NET_XMIT_SUCCESS)) {
				506	if (net_xmit_drop_count(err)) {
				507	sch->qstats.drops++;
				508	qdisc_tree_decrease_qlen(sch, 1);
				509	}
				510	}
				511	goto tfifo_dequeue;
				512	}
				513	deliver:
				514	qdisc_unthrottled(sch);
				515	qdisc_bstats_update(sch, skb);
				516	return skb;
				517	}
				518
				519	if (q->qdisc) {
				520	skb = q->qdisc->ops->dequeue(q->qdisc);
				521	if (skb)
				522	goto deliver;
				523	}
				524	qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
				525	}
				526
				527	if (q->qdisc) {
				528	skb = q->qdisc->ops->dequeue(q->qdisc);
				529	if (skb)
				530	goto deliver;
				531	}
				532	return NULL;
				533	}
				534
				535	static void netem_reset(struct Qdisc *sch)
				536	{
				537	struct netem_sched_data *q = qdisc_priv(sch);
				538
				539	qdisc_reset_queue(sch);
				540	if (q->qdisc)
				541	qdisc_reset(q->qdisc);
				542	qdisc_watchdog_cancel(&q->watchdog);
				543	}
				544
				545	static void dist_free(struct disttable *d)
				546	{
				547	if (d) {
				548	if (is_vmalloc_addr(d))
				549	vfree(d);
				550	else
				551	kfree(d);
				552	}
				553	}
				554
				555	/*
				556	* Distribution data is a variable size payload containing
				557	* signed 16 bit values.
				558	*/
				559	static int get_dist_table(struct Qdisc sch, const struct nlattr attr)
				560	{
				561	struct netem_sched_data *q = qdisc_priv(sch);
				562	size_t n = nla_len(attr)/sizeof(__s16);
				563	const __s16 *data = nla_data(attr);
				564	spinlock_t *root_lock;
				565	struct disttable *d;
				566	int i;
				567	size_t s;
				568
				569	if (n > NETEM_DIST_MAX)
				570	return -EINVAL;
				571
				572	s = sizeof(struct disttable) + n * sizeof(s16);
				573	d = kmalloc(s, GFP_KERNEL \| __GFP_NOWARN);
				574	if (!d)
				575	d = vmalloc(s);
				576	if (!d)
				577	return -ENOMEM;
				578
				579	d->size = n;
				580	for (i = 0; i < n; i++)
				581	d->table[i] = data[i];
				582
				583	root_lock = qdisc_root_sleeping_lock(sch);
				584
				585	spin_lock_bh(root_lock);
				586	swap(q->delay_dist, d);
				587	spin_unlock_bh(root_lock);
				588
				589	dist_free(d);
				590	return 0;
				591	}
				592
				593	static void get_correlation(struct Qdisc sch, const struct nlattr attr)
				594	{
				595	struct netem_sched_data *q = qdisc_priv(sch);
				596	const struct tc_netem_corr *c = nla_data(attr);
				597
				598	init_crandom(&q->delay_cor, c->delay_corr);
				599	init_crandom(&q->loss_cor, c->loss_corr);
				600	init_crandom(&q->dup_cor, c->dup_corr);
				601	}
				602
				603	static void get_reorder(struct Qdisc sch, const struct nlattr attr)
				604	{
				605	struct netem_sched_data *q = qdisc_priv(sch);
				606	const struct tc_netem_reorder *r = nla_data(attr);
				607
				608	q->reorder = r->probability;
				609	init_crandom(&q->reorder_cor, r->correlation);
				610	}
				611
				612	static void get_corrupt(struct Qdisc sch, const struct nlattr attr)
				613	{
				614	struct netem_sched_data *q = qdisc_priv(sch);
				615	const struct tc_netem_corrupt *r = nla_data(attr);
				616
				617	q->corrupt = r->probability;
				618	init_crandom(&q->corrupt_cor, r->correlation);
				619	}
				620
				621	static void get_rate(struct Qdisc sch, const struct nlattr attr)
				622	{
				623	struct netem_sched_data *q = qdisc_priv(sch);
				624	const struct tc_netem_rate *r = nla_data(attr);
				625
				626	q->rate = r->rate;
				627	q->packet_overhead = r->packet_overhead;
				628	q->cell_size = r->cell_size;
				629	if (q->cell_size)
				630	q->cell_size_reciprocal = reciprocal_value(q->cell_size);
				631	q->cell_overhead = r->cell_overhead;
				632	}
				633
				634	static int get_loss_clg(struct Qdisc sch, const struct nlattr attr)
				635	{
				636	struct netem_sched_data *q = qdisc_priv(sch);
				637	const struct nlattr *la;
				638	int rem;
				639
				640	nla_for_each_nested(la, attr, rem) {
				641	u16 type = nla_type(la);
				642
				643	switch(type) {
				644	case NETEM_LOSS_GI: {
				645	const struct tc_netem_gimodel *gi = nla_data(la);
				646
				647	if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				648	pr_info("netem: incorrect gi model size\n");
				649	return -EINVAL;
				650	}
				651
				652	q->loss_model = CLG_4_STATES;
				653
				654	q->clg.state = 1;
				655	q->clg.a1 = gi->p13;
				656	q->clg.a2 = gi->p31;
				657	q->clg.a3 = gi->p32;
				658	q->clg.a4 = gi->p14;
				659	q->clg.a5 = gi->p23;
				660	break;
				661	}
				662
				663	case NETEM_LOSS_GE: {
				664	const struct tc_netem_gemodel *ge = nla_data(la);
				665
				666	if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				667	pr_info("netem: incorrect ge model size\n");
				668	return -EINVAL;
				669	}
				670
				671	q->loss_model = CLG_GILB_ELL;
				672	q->clg.state = 1;
				673	q->clg.a1 = ge->p;
				674	q->clg.a2 = ge->r;
				675	q->clg.a3 = ge->h;
				676	q->clg.a4 = ge->k1;
				677	break;
				678	}
				679
				680	default:
				681	pr_info("netem: unknown loss type %u\n", type);
				682	return -EINVAL;
				683	}
				684	}
				685
				686	return 0;
				687	}
				688
				689	static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
				690	[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
				691	[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
				692	[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
				693	[TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
				694	[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
				695	};
				696
				697	static int parse_attr(struct nlattr tb[], int maxtype, struct nlattr nla,
				698	const struct nla_policy *policy, int len)
				699	{
				700	int nested_len = nla_len(nla) - NLA_ALIGN(len);
				701
				702	if (nested_len < 0) {
				703	pr_info("netem: invalid attributes len %d\n", nested_len);
				704	return -EINVAL;
				705	}
				706
				707	if (nested_len >= nla_attr_size(0))
				708	return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				709	nested_len, policy);
				710
				711	memset(tb, 0, sizeof(struct nlattr ) (maxtype + 1));
				712	return 0;
				713	}
				714
				715	/* Parse netlink message to set options */
				716	static int netem_change(struct Qdisc sch, struct nlattr opt)
				717	{
				718	struct netem_sched_data *q = qdisc_priv(sch);
				719	struct nlattr *tb[TCA_NETEM_MAX + 1];
				720	struct tc_netem_qopt *qopt;
				721	int ret;
				722
				723	if (opt == NULL)
				724	return -EINVAL;
				725
				726	qopt = nla_data(opt);
				727	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
				728	if (ret < 0)
				729	return ret;
				730
				731	sch->limit = qopt->limit;
				732
				733	q->latency = qopt->latency;
				734	q->jitter = qopt->jitter;
				735	q->limit = qopt->limit;
				736	q->gap = qopt->gap;
				737	q->counter = 0;
				738	q->loss = qopt->loss;
				739	q->duplicate = qopt->duplicate;
				740
				741	/* for compatibility with earlier versions.
				742	* if gap is set, need to assume 100% probability
				743	*/
				744	if (q->gap)
				745	q->reorder = ~0;
				746
				747	if (tb[TCA_NETEM_CORR])
				748	get_correlation(sch, tb[TCA_NETEM_CORR]);
				749
				750	if (tb[TCA_NETEM_DELAY_DIST]) {
				751	ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
				752	if (ret)
				753	return ret;
				754	}
				755
				756	if (tb[TCA_NETEM_REORDER])
				757	get_reorder(sch, tb[TCA_NETEM_REORDER]);
				758
				759	if (tb[TCA_NETEM_CORRUPT])
				760	get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
				761
				762	if (tb[TCA_NETEM_RATE])
				763	get_rate(sch, tb[TCA_NETEM_RATE]);
				764
				765	q->loss_model = CLG_RANDOM;
				766	if (tb[TCA_NETEM_LOSS])
				767	ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
				768
				769	return ret;
				770	}
				771
				772	static int netem_init(struct Qdisc sch, struct nlattr opt)
				773	{
				774	struct netem_sched_data *q = qdisc_priv(sch);
				775	int ret;
				776
				777	if (!opt)
				778	return -EINVAL;
				779
				780	qdisc_watchdog_init(&q->watchdog, sch);
				781
				782	q->loss_model = CLG_RANDOM;
				783	ret = netem_change(sch, opt);
				784	if (ret)
				785	pr_info("netem: change failed\n");
				786	return ret;
				787	}
				788
				789	static void netem_destroy(struct Qdisc *sch)
				790	{
				791	struct netem_sched_data *q = qdisc_priv(sch);
				792
				793	qdisc_watchdog_cancel(&q->watchdog);
				794	if (q->qdisc)
				795	qdisc_destroy(q->qdisc);
				796	dist_free(q->delay_dist);
				797	}
				798
				799	static int dump_loss_model(const struct netem_sched_data *q,
				800	struct sk_buff *skb)
				801	{
				802	struct nlattr *nest;
				803
				804	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
				805	if (nest == NULL)
				806	goto nla_put_failure;
				807
				808	switch (q->loss_model) {
				809	case CLG_RANDOM:
				810	/* legacy loss model */
				811	nla_nest_cancel(skb, nest);
				812	return 0; /* no data */
				813
				814	case CLG_4_STATES: {
				815	struct tc_netem_gimodel gi = {
				816	.p13 = q->clg.a1,
				817	.p31 = q->clg.a2,
				818	.p32 = q->clg.a3,
				819	.p14 = q->clg.a4,
				820	.p23 = q->clg.a5,
				821	};
				822
				823	NLA_PUT(skb, NETEM_LOSS_GI, sizeof(gi), &gi);
				824	break;
				825	}
				826	case CLG_GILB_ELL: {
				827	struct tc_netem_gemodel ge = {
				828	.p = q->clg.a1,
				829	.r = q->clg.a2,
				830	.h = q->clg.a3,
				831	.k1 = q->clg.a4,
				832	};
				833
				834	NLA_PUT(skb, NETEM_LOSS_GE, sizeof(ge), &ge);
				835	break;
				836	}
				837	}
				838
				839	nla_nest_end(skb, nest);
				840	return 0;
				841
				842	nla_put_failure:
				843	nla_nest_cancel(skb, nest);
				844	return -1;
				845	}
				846
				847	static int netem_dump(struct Qdisc sch, struct sk_buff skb)
				848	{
				849	const struct netem_sched_data *q = qdisc_priv(sch);
				850	struct nlattr nla = (struct nlattr ) skb_tail_pointer(skb);
				851	struct tc_netem_qopt qopt;
				852	struct tc_netem_corr cor;
				853	struct tc_netem_reorder reorder;
				854	struct tc_netem_corrupt corrupt;
				855	struct tc_netem_rate rate;
				856
				857	qopt.latency = q->latency;
				858	qopt.jitter = q->jitter;
				859	qopt.limit = q->limit;
				860	qopt.loss = q->loss;
				861	qopt.gap = q->gap;
				862	qopt.duplicate = q->duplicate;
				863	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
				864
				865	cor.delay_corr = q->delay_cor.rho;
				866	cor.loss_corr = q->loss_cor.rho;
				867	cor.dup_corr = q->dup_cor.rho;
				868	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
				869
				870	reorder.probability = q->reorder;
				871	reorder.correlation = q->reorder_cor.rho;
				872	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
				873
				874	corrupt.probability = q->corrupt;
				875	corrupt.correlation = q->corrupt_cor.rho;
				876	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
				877
				878	rate.rate = q->rate;
				879	rate.packet_overhead = q->packet_overhead;
				880	rate.cell_size = q->cell_size;
				881	rate.cell_overhead = q->cell_overhead;
				882	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
				883
				884	if (dump_loss_model(q, skb) != 0)
				885	goto nla_put_failure;
				886
				887	return nla_nest_end(skb, nla);
				888
				889	nla_put_failure:
				890	nlmsg_trim(skb, nla);
				891	return -1;
				892	}
				893
				894	static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
				895	struct sk_buff skb, struct tcmsg tcm)
				896	{
				897	struct netem_sched_data *q = qdisc_priv(sch);
				898
				899	if (cl != 1 \|\| !q->qdisc) /* only one class */
				900	return -ENOENT;
				901
				902	tcm->tcm_handle \|= TC_H_MIN(1);
				903	tcm->tcm_info = q->qdisc->handle;
				904
				905	return 0;
				906	}
				907
				908	static int netem_graft(struct Qdisc sch, unsigned long arg, struct Qdisc new,
				909	struct Qdisc **old)
				910	{
				911	struct netem_sched_data *q = qdisc_priv(sch);
				912
				913	sch_tree_lock(sch);
				914	*old = q->qdisc;
				915	q->qdisc = new;
				916	if (*old) {
				917	qdisc_tree_decrease_qlen(old, (old)->q.qlen);
				918	qdisc_reset(*old);
				919	}
				920	sch_tree_unlock(sch);
				921
				922	return 0;
				923	}
				924
				925	static struct Qdisc netem_leaf(struct Qdisc sch, unsigned long arg)
				926	{
				927	struct netem_sched_data *q = qdisc_priv(sch);
				928	return q->qdisc;
				929	}
				930
				931	static unsigned long netem_get(struct Qdisc *sch, u32 classid)
				932	{
				933	return 1;
				934	}
				935
				936	static void netem_put(struct Qdisc *sch, unsigned long arg)
				937	{
				938	}
				939
				940	static void netem_walk(struct Qdisc sch, struct qdisc_walker walker)
				941	{
				942	if (!walker->stop) {
				943	if (walker->count >= walker->skip)
				944	if (walker->fn(sch, 1, walker) < 0) {
				945	walker->stop = 1;
				946	return;
				947	}
				948	walker->count++;
				949	}
				950	}
				951
				952	static const struct Qdisc_class_ops netem_class_ops = {
				953	.graft = netem_graft,
				954	.leaf = netem_leaf,
				955	.get = netem_get,
				956	.put = netem_put,
				957	.walk = netem_walk,
				958	.dump = netem_dump_class,
				959	};
				960
				961	static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
				962	.id = "netem",
				963	.cl_ops = &netem_class_ops,
				964	.priv_size = sizeof(struct netem_sched_data),
				965	.enqueue = netem_enqueue,
				966	.dequeue = netem_dequeue,
				967	.peek = qdisc_peek_dequeued,
				968	.drop = netem_drop,
				969	.init = netem_init,
				970	.reset = netem_reset,
				971	.destroy = netem_destroy,
				972	.change = netem_change,
				973	.dump = netem_dump,
				974	.owner = THIS_MODULE,
				975	};
				976
				977
				978	static int __init netem_module_init(void)
				979	{
				980	pr_info("netem: version " VERSION "\n");
				981	return register_qdisc(&netem_qdisc_ops);
				982	}
				983	static void __exit netem_module_exit(void)
				984	{
				985	unregister_qdisc(&netem_qdisc_ops);
				986	}
				987	module_init(netem_module_init)
				988	module_exit(netem_module_exit)
				989	MODULE_LICENSE("GPL");