/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
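
	Worked example (illustrative numbers, not defaults): with
	R = 125000 bytes/s and B = 1250 bytes, N(t) saturates at
	B/R = 10 ms.  A 625-byte packet needs S/R = 5 ms of credit;
	if N = 10 ms when it reaches the head of the queue, it is sent
	immediately and N drops to 5 ms, so one more 625-byte packet may
	follow at once, after which the flow is paced at rate R.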

	In practice, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts on a smaller time scale.

	It is easy to see that P > R and B > M. If P is infinite, this
	double TBF is equivalent to a single one.

	When TBF works in reshaping mode, the latency for a backlog of
	L bytes is estimated as:

	lat = max((L-B)/R, (L-M)/P)

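	For example (illustrative numbers only): with a backlog limit of
	L = 100000 bytes, B = 10000 bytes, R = 1000000 bytes/s,
	M = 1500 bytes and P = 10000000 bytes/s, the two terms are
	(L-B)/R = 90 ms and (L-M)/P ~= 9.85 ms, so lat ~= 90 ms; the
	steady-state bucket, not the peak bucket, dominates the worst-case
	queueing delay here.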

	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much more demanding: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
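
/*
	Illustrative sketch (not part of the qdisc): a self-contained
	userspace model of the dual token bucket decision that
	tbf_dequeue() makes below.  As in struct tbf_sched_data, credit is
	kept in nanoseconds.  The names (tb_state, tb_l2t_ns, tb_may_send)
	are invented for this example and it assumes rate_bps > 0; it only
	mirrors the arithmetic, not the queueing or watchdog handling.

	#include <stdbool.h>
	#include <stdint.h>

	struct tb_state {
		uint64_t rate_bps;	// steady rate R, bytes per second
		uint64_t peak_bps;	// peak rate P, bytes per second (0 = none)
		int64_t  buffer_ns;	// bucket depth B, as ns of credit
		int64_t  mtu_ns;	// peak bucket depth M, as ns of credit
		int64_t  tokens;	// current R credit, ns
		int64_t  ptokens;	// current P credit, ns
		uint64_t t_c;		// time of the last update, ns
	};

	// ns of credit consumed by sending len bytes at the given rate
	static int64_t tb_l2t_ns(uint64_t bytes_per_sec, uint32_t len)
	{
		return (int64_t)(len * 1000000000ULL / bytes_per_sec);
	}

	// Refill both buckets from the elapsed time, charge the packet,
	// and allow transmission only if neither bucket went negative.
	static bool tb_may_send(struct tb_state *q, uint64_t now, uint32_t len)
	{
		int64_t toks = (int64_t)(now - q->t_c);
		int64_t ptoks = 0;

		if (toks > q->buffer_ns)
			toks = q->buffer_ns;

		if (q->peak_bps) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu_ns)
				ptoks = q->mtu_ns;
			ptoks -= tb_l2t_ns(q->peak_bps, len);
		}
		toks += q->tokens;
		if (toks > q->buffer_ns)
			toks = q->buffer_ns;
		toks -= tb_l2t_ns(q->rate_bps, len);

		if ((toks | ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			return true;		// send now
		}
		return false;			// wait max(-toks, -ptoks) ns
	}
*/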

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};


/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		/* only 48 of every 53 bytes in an ATM cell carry payload */
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
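
/* Worked example (illustrative numbers): with r->rate_bytes_ps = 125000000
 * (1 Gbit/s) and time_in_ns = 1000000 (1 ms), len is 125000 bytes before
 * the adjustments; with TC_LINKLAYER_ATM it becomes (125000 / 53) * 48 =
 * 113184 bytes (integer division first), and any configured r->overhead
 * is then subtracted.
 */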

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
	return hdr_len + skb_gso_transport_seglen(skb);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	/* the one original skb became nb segments; let the parent qdiscs
	 * adjust their qlen accordingly
	 */
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
	}
	return len;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

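/* Dequeue path.  The buckets are refilled lazily from the time elapsed
 * since q->t_c rather than by a periodic timer.  Because credit is stored
 * as nanoseconds (q->tokens, q->ptokens), "enough tokens" simply means
 * that both values remain non-negative after charging the packet at the
 * head of the inner qdisc.
 */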
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks),
					   true);

		/* Maybe there is a shorter packet in the queue
		   which could be sent now.  It sounds tempting,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   If the flow were split into independent
		   subflows, sending the shorter packet would be fine.
		   That is the main idea behind all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC).
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

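/* For orientation (illustrative command, not the only way to configure
 * this qdisc): a typical setup such as
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * is translated by tc into the TCA_TBF_PARMS attribute parsed below;
 * depending on the iproute2 version and the requested rate, the 64-bit
 * TCA_TBF_RATE64/TCA_TBF_PRATE64 and TCA_TBF_BURST/TCA_TBF_PBURST
 * attributes may be supplied as well.
 */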
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB]));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB]));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu !\n",
					peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);

			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_get_ns();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

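/* Dump the current configuration.  The legacy tc_tbf_qopt only carries
 * 32-bit rates, so the 64-bit TCA_TBF_RATE64/TCA_TBF_PRATE64 attributes
 * are emitted only when a configured rate does not fit in 32 bits.
 */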
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

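/* TBF is classful only in a minimal sense: it exposes a single class
 * (minor 1) whose leaf is the inner qdisc, so a different child qdisc
 * can be grafted in place of the default bfifo.
 */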
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");