/*
 * net/sched/sch_netem.c	Network emulator
 *
 * 		This program is free software; you can redistribute it and/or
 * 		modify it under the terms of the GNU General Public License
 * 		as published by the Free Software Foundation; either version
 * 		2 of the License.
 *
 *  		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>

#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>

#define VERSION "1.3"

/*	Network Emulation Queuing algorithm.
	====================================

	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		 Network Emulation Tool"
		 [2] Luigi Rizzo, DummyNet for FreeBSD

	 ----------------------------------------------------------------

	 This started out as a simple way to delay outgoing packets to
	 test TCP but has grown to include most of the functionality
	 of a full-blown network emulator like NISTnet. It can delay
	 packets and add random jitter (and correlation). The random
	 distribution can also be loaded from a table to provide
	 normal, Pareto, or experimental curves. Packet loss,
	 duplication, and reordering can also be emulated.

	 This qdisc does not do classification; that can be handled by
	 layering other disciplines.  It does not need to do bandwidth
	 control either, since that can be handled by using a token
	 bucket or other rate control.

     Correlated Loss Generator models

	Added generation of correlated loss according to a general
	4-state Markov model (the GI model) and to the 2-state
	"Gilbert-Elliot" model and its special cases.

	References:
	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
	and intuitive loss model for packet networks and its implementation
	in the Netem module in the Linux kernel", available in [1]

	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
		 Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
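
/* Example configuration from user space (illustrative only; the exact
 * syntax is documented in iproute2's tc-netem(8) and varies between
 * versions):
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem loss 0.3% 25%
 *
 * The first command delays packets by 100ms +/- 10ms with 25% correlation
 * between successive delays; the second replaces that setup with 0.3%
 * correlated packet loss.
 */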

struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	psched_tdiff_t latency;
	psched_tdiff_t jitter;

	u32 loss;
	u32 ecn;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	u64 rate;
	s32 packet_overhead;
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	s32 cell_overhead;

	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	struct disttable {
		u32 size;
		s16 table[0];
	} *delay_dist;

	enum {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

};

/* Time stamp put into socket buffer control block
 * Only valid when skbs are in our internal t(ime)fifo queue.
 *
 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
 * and skb->next & skb->prev are scratch space for a qdisc,
 * we save the skb->tstamp value in skb->cb[] before destroying it.
 */
struct netem_skb_cb {
	psched_time_t	time_to_send;
	ktime_t		tstamp_save;
};


static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
{
	return container_of(rb, struct sk_buff, rbnode);
}

static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = prandom_u32();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return prandom_u32();

	value = prandom_u32();
	rho = (u64)state->rho + 1;
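	/* Fixed-point blend of the fresh random value with the previous one:
	 * answer = ((2^32 - rho) * value + rho * last) >> 32
	 */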
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}

/* loss_4state - 4-state model loss generator
 * Generates losses according to the 4-state Markov chain adopted in
 * the GI (General and Intuitive) loss model.
 */
static bool loss_4state(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;
	u32 rnd = prandom_u32();

	/*
	 * Makes a comparison between rnd and the transition
	 * probabilities outgoing from the current state, then decides the
	 * next state and whether the next packet has to be transmitted or lost.
	 * The four states correspond to:
	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
	 */
	switch (clg->state) {
	case TX_IN_GAP_PERIOD:
		if (rnd < clg->a4) {
			clg->state = LOST_IN_BURST_PERIOD;
			return true;
		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else if (clg->a1 + clg->a4 < rnd) {
			clg->state = TX_IN_GAP_PERIOD;
		}

		break;
	case TX_IN_BURST_PERIOD:
		if (rnd < clg->a5) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		} else {
			clg->state = TX_IN_BURST_PERIOD;
		}

		break;
	case LOST_IN_GAP_PERIOD:
		if (rnd < clg->a3)
			clg->state = TX_IN_BURST_PERIOD;
		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
			clg->state = TX_IN_GAP_PERIOD;
		} else if (clg->a2 + clg->a3 < rnd) {
			clg->state = LOST_IN_GAP_PERIOD;
			return true;
		}
		break;
	case LOST_IN_BURST_PERIOD:
		clg->state = TX_IN_GAP_PERIOD;
		break;
	}

	return false;
}

/* loss_gilb_ell - Gilbert-Elliot model loss generator
 * Generates losses according to the Gilbert-Elliot loss model or
 * its special cases (Gilbert or Simple Gilbert)
 *
 * Makes a comparison between random number and the transition
 * probabilities outgoing from the current state, then decides the
 * next state. A second random number is extracted and the comparison
 * with the loss probability of the current state decides if the next
 * packet will be transmitted or lost.
 */
static bool loss_gilb_ell(struct netem_sched_data *q)
{
	struct clgstate *clg = &q->clg;

	switch (clg->state) {
	case GOOD_STATE:
		if (prandom_u32() < clg->a1)
			clg->state = BAD_STATE;
		if (prandom_u32() < clg->a4)
			return true;
		break;
	case BAD_STATE:
		if (prandom_u32() < clg->a2)
			clg->state = GOOD_STATE;
		if (prandom_u32() > clg->a3)
			return true;
	}

	return false;
}

static bool loss_event(struct netem_sched_data *q)
{
	switch (q->loss_model) {
	case CLG_RANDOM:
		/* Random packet drop 0 => none, ~0 => all */
		return q->loss && q->loss >= get_crandom(&q->loss_cor);

	case CLG_4_STATES:
		/* 4state loss model algorithm (used also for GI model)
		 * Extracts a value from the markov 4 state loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_4state(q);

	case CLG_GILB_ELL:
		/* Gilbert-Elliot loss model algorithm
		 * Extracts a value from the Gilbert-Elliot loss generator,
		 * if it is 1 drops a packet and if needed writes the event in
		 * the kernel logs
		 */
		return loss_gilb_ell(q);
	}

	return false;	/* not reached */
}


/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t x;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

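	/* Table entries are scaled by NETEM_DIST_SCALE; the result below is
	 * roughly mu + sigma * t / NETEM_DIST_SCALE, computed in two parts
	 * (sigma / SCALE and sigma % SCALE) to avoid overflow, with the
	 * remainder part rounded rather than truncated.
	 */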
	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}

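/* Transmission time of a packet at the configured rate, including the
 * per-packet overhead and, if cell_size is set, rounding the length up to
 * a whole number of (cell_size + cell_overhead) cells (e.g. to approximate
 * ATM-style framing). Result is returned in psched ticks.
 */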
static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
{
	u64 ticks;

	len += q->packet_overhead;

	if (q->cell_size) {
		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);

		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
			cells++;
		len = cells * (q->cell_size + q->cell_overhead);
	}

	ticks = (u64)len * NSEC_PER_SEC;

	do_div(ticks, q->rate);
	return PSCHED_NS2TICKS(ticks);
}

static void tfifo_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct rb_node *p;

	while ((p = rb_first(&q->t_root))) {
		struct sk_buff *skb = netem_rb_to_skb(p);

		rb_erase(p, &q->t_root);
		skb->next = NULL;
		skb->prev = NULL;
		kfree_skb(skb);
	}
}

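/* Insert skb into the time-ordered rbtree, keyed by time_to_send.
 * Packets with equal send times go to the right of existing ones,
 * so they keep their FIFO order.
 */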
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

	while (*p) {
		struct sk_buff *skb;

		parent = *p;
		skb = netem_rb_to_skb(parent);
		if (tnext >= netem_skb_cb(skb)->time_to_send)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&nskb->rbnode, parent, p);
	rb_insert_color(&nskb->rbnode, &q->t_root);
	sch->q.qlen++;
}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int count = 1;

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Drop packet? */
	if (loss_event(q)) {
		if (q->ecn && INET_ECN_set_ce(skb))
			qdisc_qstats_drop(sch); /* mark packet */
		else
			--count;
	}
	if (count == 0) {
		qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	/* If a delay is expected, orphan the skb. (orphaning usually takes
	 * place at TX completion time, so _before_ the link transit delay)
	 */
	if (q->latency || q->jitter)
		skb_orphan_partial(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make a copy if needed since we are modifying the data.
	 * If the packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
		    (skb->ip_summed == CHECKSUM_PARTIAL &&
		     skb_checksum_help(skb)))
			return qdisc_drop(skb, sch);

		skb->data[prandom_u32() % skb_headlen(skb)] ^=
			1<<(prandom_u32() % 8);
	}

	if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
		return qdisc_reshape_fail(skb, sch);

	qdisc_qstats_backlog_inc(sch, skb);

	cb = netem_skb_cb(skb);
	if (q->gap == 0 ||		/* not doing reordering */
	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
	    q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();

		if (q->rate) {
			struct sk_buff *last;

			if (!skb_queue_empty(&sch->q))
				last = skb_peek_tail(&sch->q);
			else
				last = netem_rb_to_skb(rb_last(&q->t_root));
			if (last) {
				/*
				 * Last packet in queue is reference point (now),
				 * calculate this time bonus and subtract
				 * from delay.
				 */
				delay -= netem_skb_cb(last)->time_to_send - now;
				delay = max_t(psched_tdiff_t, 0, delay);
				now = netem_skb_cb(last)->time_to_send;
			}

			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
		}

		cb->time_to_send = now + delay;
		cb->tstamp_save = skb->tstamp;
		++q->counter;
		tfifo_enqueue(skb, sch);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&sch->q, skb);
		sch->qstats.requeues++;
	}

	return NET_XMIT_SUCCESS;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);

	if (!len) {
		struct rb_node *p = rb_first(&q->t_root);

		if (p) {
			struct sk_buff *skb = netem_rb_to_skb(p);

			rb_erase(p, &q->t_root);
			sch->q.qlen--;
			skb->next = NULL;
			skb->prev = NULL;
			qdisc_qstats_backlog_dec(sch, skb);
			kfree_skb(skb);
		}
	}
	if (!len && q->qdisc && q->qdisc->ops->drop)
		len = q->qdisc->ops->drop(q->qdisc);
	if (len)
		qdisc_qstats_drop(sch);

	return len;
}

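/* Dequeue order: packets that were reordered to the head of sch->q go out
 * first; then packets from the tfifo rbtree whose time_to_send has passed
 * are released (into the child qdisc if one is attached); otherwise the
 * watchdog is armed for the earliest pending time_to_send.
 */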
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	struct rb_node *p;

	if (qdisc_is_throttled(sch))
		return NULL;

tfifo_dequeue:
	skb = __skb_dequeue(&sch->q);
	if (skb) {
		qdisc_qstats_backlog_dec(sch, skb);
deliver:
		qdisc_unthrottled(sch);
		qdisc_bstats_update(sch, skb);
		return skb;
	}
	p = rb_first(&q->t_root);
	if (p) {
		psched_time_t time_to_send;

		skb = netem_rb_to_skb(p);

		/* is it time to send this packet yet? */
		time_to_send = netem_skb_cb(skb)->time_to_send;
		if (time_to_send <= psched_get_time()) {
			rb_erase(p, &q->t_root);

			sch->q.qlen--;
			qdisc_qstats_backlog_dec(sch, skb);
			skb->next = NULL;
			skb->prev = NULL;
			skb->tstamp = netem_skb_cb(skb)->tstamp_save;

#ifdef CONFIG_NET_CLS_ACT
			/*
			 * If it's at ingress let's pretend the delay is
			 * from the network (tstamp will be updated).
			 */
			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
				skb->tstamp.tv64 = 0;
#endif

			if (q->qdisc) {
				int err = qdisc_enqueue(skb, q->qdisc);

				if (unlikely(err != NET_XMIT_SUCCESS)) {
					if (net_xmit_drop_count(err)) {
						qdisc_qstats_drop(sch);
						qdisc_tree_decrease_qlen(sch, 1);
					}
				}
				goto tfifo_dequeue;
			}
			goto deliver;
		}

		if (q->qdisc) {
			skb = q->qdisc->ops->dequeue(q->qdisc);
			if (skb)
				goto deliver;
		}
		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
	}

	if (q->qdisc) {
		skb = q->qdisc->ops->dequeue(q->qdisc);
		if (skb)
			goto deliver;
	}
	return NULL;
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	tfifo_reset(sch);
	if (q->qdisc)
		qdisc_reset(q->qdisc);
	qdisc_watchdog_cancel(&q->watchdog);
}

static void dist_free(struct disttable *d)
{
	kvfree(d);
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	size_t n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	int i;
	size_t s;

	if (n > NETEM_DIST_MAX)
		return -EINVAL;

	s = sizeof(struct disttable) + n * sizeof(s16);
	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
	if (!d)
		d = vmalloc(s);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	swap(q->delay_dist, d);
	spin_unlock_bh(root_lock);

	dist_free(d);
	return 0;
}

static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corr *c = nla_data(attr);

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
}

static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_reorder *r = nla_data(attr);

	q->reorder = r->probability;
	init_crandom(&q->reorder_cor, r->correlation);
}

static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *r = nla_data(attr);

	q->corrupt = r->probability;
	init_crandom(&q->corrupt_cor, r->correlation);
}

static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct tc_netem_rate *r = nla_data(attr);

	q->rate = r->rate;
	q->packet_overhead = r->packet_overhead;
	q->cell_size = r->cell_size;
	q->cell_overhead = r->cell_overhead;
	if (q->cell_size)
		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
	else
		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
}

static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
	const struct nlattr *la;
	int rem;

	nla_for_each_nested(la, attr, rem) {
		u16 type = nla_type(la);

		switch (type) {
		case NETEM_LOSS_GI: {
			const struct tc_netem_gimodel *gi = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				pr_info("netem: incorrect gi model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_4_STATES;

			q->clg.state = TX_IN_GAP_PERIOD;
			q->clg.a1 = gi->p13;
			q->clg.a2 = gi->p31;
			q->clg.a3 = gi->p32;
			q->clg.a4 = gi->p14;
			q->clg.a5 = gi->p23;
			break;
		}

		case NETEM_LOSS_GE: {
			const struct tc_netem_gemodel *ge = nla_data(la);

			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				pr_info("netem: incorrect ge model size\n");
				return -EINVAL;
			}

			q->loss_model = CLG_GILB_ELL;
			q->clg.state = GOOD_STATE;
			q->clg.a1 = ge->p;
			q->clg.a2 = ge->r;
			q->clg.a3 = ge->h;
			q->clg.a4 = ge->k1;
			break;
		}

		default:
			pr_info("netem: unknown loss type %u\n", type);
			return -EINVAL;
		}
	}

	return 0;
}

static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
};

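/* netem's TCA_OPTIONS carries a fixed struct tc_netem_qopt optionally
 * followed by nested attributes, so skip over the fixed part before
 * handing the remainder to nla_parse().
 */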
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int nested_len = nla_len(nla) - NLA_ALIGN(len);

	if (nested_len < 0) {
		pr_info("netem: invalid attributes len %d\n", nested_len);
		return -EINVAL;
	}

	if (nested_len >= nla_attr_size(0))
		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				 nested_len, policy);

	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
	return 0;
}

/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	struct clgstate old_clg;
	int old_loss_model = CLG_RANDOM;
	int ret;

	if (opt == NULL)
		return -EINVAL;

	qopt = nla_data(opt);
	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
	if (ret < 0)
		return ret;

	/* backup q->clg and q->loss_model */
	old_clg = q->clg;
	old_loss_model = q->loss_model;

	if (tb[TCA_NETEM_LOSS]) {
		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
		if (ret) {
			q->loss_model = old_loss_model;
			return ret;
		}
	} else {
		q->loss_model = CLG_RANDOM;
	}

	if (tb[TCA_NETEM_DELAY_DIST]) {
		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
		if (ret) {
			/* recover clg and loss_model, in case
			 * q->clg and q->loss_model were modified
			 * in get_loss_clg()
			 */
			q->clg = old_clg;
			q->loss_model = old_loss_model;
			return ret;
		}
	}

	sch->limit = qopt->limit;

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap)
		q->reorder = ~0;

	if (tb[TCA_NETEM_CORR])
		get_correlation(q, tb[TCA_NETEM_CORR]);

	if (tb[TCA_NETEM_REORDER])
		get_reorder(q, tb[TCA_NETEM_REORDER]);

	if (tb[TCA_NETEM_CORRUPT])
		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);

	if (tb[TCA_NETEM_RATE])
		get_rate(q, tb[TCA_NETEM_RATE]);

	if (tb[TCA_NETEM_RATE64])
		q->rate = max_t(u64, q->rate,
				nla_get_u64(tb[TCA_NETEM_RATE64]));

	if (tb[TCA_NETEM_ECN])
		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);

	return ret;
}

static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	qdisc_watchdog_init(&q->watchdog, sch);

	q->loss_model = CLG_RANDOM;
	ret = netem_change(sch, opt);
	if (ret)
		pr_info("netem: change failed\n");
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	if (q->qdisc)
		qdisc_destroy(q->qdisc);
	dist_free(q->delay_dist);
}

static int dump_loss_model(const struct netem_sched_data *q,
			   struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
	if (nest == NULL)
		goto nla_put_failure;

	switch (q->loss_model) {
	case CLG_RANDOM:
		/* legacy loss model */
		nla_nest_cancel(skb, nest);
		return 0;	/* no data */

	case CLG_4_STATES: {
		struct tc_netem_gimodel gi = {
			.p13 = q->clg.a1,
			.p31 = q->clg.a2,
			.p32 = q->clg.a3,
			.p14 = q->clg.a4,
			.p23 = q->clg.a5,
		};

		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
			goto nla_put_failure;
		break;
	}
	case CLG_GILB_ELL: {
		struct tc_netem_gemodel ge = {
			.p = q->clg.a1,
			.r = q->clg.a2,
			.h = q->clg.a3,
			.k1 = q->clg.a4,
		};

		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
			goto nla_put_failure;
		break;
	}
	}

	nla_nest_end(skb, nest);
	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

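/* Dump mirrors the layout parse_attr() expects: the fixed struct
 * tc_netem_qopt goes out first as TCA_OPTIONS, the remaining attributes
 * are appended, and nla_nest_end() then fixes up the TCA_OPTIONS length
 * to cover them all.
 */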
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;
	struct tc_netem_rate rate;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
		goto nla_put_failure;

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
		goto nla_put_failure;

	reorder.probability = q->reorder;
	reorder.correlation = q->reorder_cor.rho;
	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
		goto nla_put_failure;

	corrupt.probability = q->corrupt;
	corrupt.correlation = q->corrupt_cor.rho;
	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
		goto nla_put_failure;

	if (q->rate >= (1ULL << 32)) {
		if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate))
			goto nla_put_failure;
		rate.rate = ~0U;
	} else {
		rate.rate = q->rate;
	}
	rate.packet_overhead = q->packet_overhead;
	rate.cell_size = q->cell_size;
	rate.cell_overhead = q->cell_overhead;
	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
		goto nla_put_failure;

	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
		goto nla_put_failure;

	if (dump_loss_model(q, skb) != 0)
		goto nla_put_failure;

	return nla_nest_end(skb, nla);

nla_put_failure:
	nlmsg_trim(skb, nla);
	return -1;
}

static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	if (cl != 1 || !q->qdisc)	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	if (*old) {
		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
		qdisc_reset(*old);
	}
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}

static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};

static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};


static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");
	return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");