/*
 * net/core/gen_estimator.c	Simple rate estimator.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 *              Jamal Hadi Salim - moved it to net/core and reshuffled
 *              names to make it usable in general net subsystem.
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/gen_stats.h>

/*
   This code is NOT intended to be used for statistics collection,
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user level daemon which
   periodically reads byte counters.

   Unfortunately, rate estimation is not a very easy task.
   For example, I did not find a simple way to estimate the current peak
   rate and even failed to formulate the problem 8)8)

   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be run from kernel thread(s), but for now it runs
   from timers, which puts an apparent upper bound on the number of rated
   flows, has minimal overhead for a small number of flows, but is enough
   to handle controlled load service and sets of aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as a negative power of 2: W = 2^(-ewma_log)

   The resulting time constant is:

   T = A/(-ln(1-W))


   NOTES.

   * avbps and avpps are scaled by 2^5.
   * Both values are reported as 32 bit unsigned values. bps can
     overflow for fast links: max speed being 34360 Mbit/sec.
   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
     for HZ=100 and HZ=1024 8)), maximal interval
     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
     are too expensive, longer ones can be implemented
     at user level painlessly.
 */
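
/*
 * Worked example (illustrative, not normative): with interval = 0 the
 * measurement period is A = 2^0 = 1 sec, and with ewma_log = 3 the
 * weight is W = 2^-3 = 1/8.  The time constant is then
 *
 *	T = A/(-ln(1-W)) = 1/(-ln(7/8)) ~= 7.5 sec
 *
 * i.e. roughly 7-8 measurement periods for the EWMA to absorb a step
 * change in rate.  Internally avbps keeps bps scaled by 2^5, so a
 * steady 1 Mbit/sec flow (125000 bytes/sec) converges to
 * avbps ~= 125000 << 5 = 4000000, reported as (avbps + 0xF) >> 5.
 */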

#define EST_MAX_INTERVAL	5

struct gen_estimator
{
	struct list_head	list;
	struct gnet_stats_basic_packed	*bstats;
	struct gnet_stats_rate_est64	*rate_est;
	spinlock_t		*stats_lock;
	int			ewma_log;
	u32			last_packets;
	unsigned long		avpps;
	u64			last_bytes;
	u64			avbps;
	struct rcu_head		e_rcu;
	struct rb_node		node;
	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
	struct rcu_head		head;
};

struct gen_estimator_head
{
	struct timer_list	timer;
	struct list_head	list;
};

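/* One timer and one list of estimators per interval index; the timer
 * for slot idx fires every (HZ/4) << idx jiffies (250 msec .. 8 sec).
 */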
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];

/* Protects against NULL dereference */
static DEFINE_RWLOCK(est_lock);

/* Protects against soft lockup during large deletion */
static struct rb_root est_root = RB_ROOT;
static DEFINE_SPINLOCK(est_tree_lock);

static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct gen_estimator *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &elist[idx].list, list) {
		struct gnet_stats_basic_packed b = {0};
		unsigned long rate;
		u64 brate;

		spin_lock(e->stats_lock);
		read_lock(&est_lock);
		if (e->bstats == NULL)
			goto skip;

		__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);

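		/* The deltas below cover (1 << idx)/4 seconds, so
		 * delta << (2 - idx) would be the per-second rate; the
		 * extra << 5 keeps the 2^5 fixed-point scaling, hence
		 * the << (7 - idx).
		 */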
		brate = (b.bytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = b.bytes;
		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
		e->rate_est->bps = (e->avbps+0xF)>>5;

		rate = b.packets - e->last_packets;
		rate <<= (7 - idx);
		e->last_packets = b.packets;
		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
		e->rate_est->pps = (e->avpps + 0xF) >> 5;
skip:
		read_unlock(&est_lock);
		spin_unlock(e->stats_lock);
	}

	if (!list_empty(&elist[idx].list))
		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
	rcu_read_unlock();
}

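/* est_root is an rbtree keyed by the address of the bstats block, so an
 * estimator can be found again from its (bstats, rate_est) pair alone.
 * Duplicate keys are allowed; equal keys always descend to the left,
 * and gen_kill_estimator() loops until every match is gone.
 */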
static void gen_add_node(struct gen_estimator *est)
{
	struct rb_node **p = &est_root.rb_node, *parent = NULL;

	while (*p) {
		struct gen_estimator *e;

		parent = *p;
		e = rb_entry(parent, struct gen_estimator, node);

		if (est->bstats > e->bstats)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&est->node, parent, p);
	rb_insert_color(&est->node, &est_root);
}

static
struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
				    const struct gnet_stats_rate_est64 *rate_est)
{
	struct rb_node *p = est_root.rb_node;

	while (p) {
		struct gen_estimator *e;

		e = rb_entry(p, struct gen_estimator, node);

		if (bstats > e->bstats)
			p = p->rb_right;
		else if (bstats < e->bstats || rate_est != e->rate_est)
			p = p->rb_left;
		else
			return e;
	}
	return NULL;
}

/**
 * gen_new_estimator - create a new rate estimator
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @opt: rate estimator configuration TLV
 *
 * Creates a new rate estimator with &bstats as source and &rate_est
 * as destination. A new timer with the interval specified in the
 * configuration TLV is created. Upon each interval, the latest statistics
 * will be read from &bstats and the estimated rate will be stored in
 * &rate_est with the statistics lock grabbed during this period.
 *
 * Returns 0 on success or a negative error code.
 *
 */
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
		      struct gnet_stats_rate_est64 *rate_est,
		      spinlock_t *stats_lock,
		      struct nlattr *opt)
{
	struct gen_estimator *est;
	struct gnet_estimator *parm = nla_data(opt);
	struct gnet_stats_basic_packed b = {0};
	int idx;

	if (nla_len(opt) < sizeof(*parm))
		return -EINVAL;

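	/* interval selects the averaging period A = 2^interval seconds;
	 * idx = interval + 2 maps it to a timer slot of (HZ/4) << idx
	 * jiffies, i.e. 250 msec (interval == -2) up to 8 sec (interval == 3).
	 */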
	if (parm->interval < -2 || parm->interval > 3)
		return -EINVAL;

	est = kzalloc(sizeof(*est), GFP_KERNEL);
	if (est == NULL)
		return -ENOBUFS;

	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);

	idx = parm->interval + 2;
	est->bstats = bstats;
	est->rate_est = rate_est;
	est->stats_lock = stats_lock;
	est->ewma_log = parm->ewma_log;
	est->last_bytes = b.bytes;
	est->avbps = rate_est->bps<<5;
	est->last_packets = b.packets;
	est->avpps = rate_est->pps<<5;
	est->cpu_bstats = cpu_bstats;

	spin_lock_bh(&est_tree_lock);
	if (!elist[idx].timer.function) {
		INIT_LIST_HEAD(&elist[idx].list);
		setup_timer(&elist[idx].timer, est_timer, idx);
	}

	if (list_empty(&elist[idx].list))
		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));

	list_add_rcu(&est->list, &elist[idx].list);
	gen_add_node(est);
	spin_unlock_bh(&est_tree_lock);

	return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
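
/*
 * Usage sketch (illustrative only, not a caller from this file): a
 * qdisc-like caller keeping counters in sch->bstats and wanting rate
 * estimates in sch->rate_est could attach an estimator from a netlink
 * attribute opt carrying a struct gnet_estimator, e.g. interval = 0
 * (A = 1 sec) and ewma_log = 3 (W = 1/8):
 *
 *	err = gen_new_estimator(&sch->bstats, sch->cpu_bstats,
 *				&sch->rate_est, qdisc_lock(sch), opt);
 *	if (err)
 *		return err;
 *
 * and detach it on teardown with:
 *
 *	gen_kill_estimator(&sch->bstats, &sch->rate_est);
 */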

/**
 * gen_kill_estimator - remove a rate estimator
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Removes the rate estimator specified by &bstats and &rate_est.
 *
 * Note : Caller should respect an RCU grace period before freeing stats_lock
 */
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
			struct gnet_stats_rate_est64 *rate_est)
{
	struct gen_estimator *e;

	spin_lock_bh(&est_tree_lock);
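	/* The same (bstats, rate_est) pair may have been registered more
	 * than once; keep erasing until no match is left.
	 */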
	while ((e = gen_find_node(bstats, rate_est))) {
		rb_erase(&e->node, &est_root);

		write_lock(&est_lock);
		e->bstats = NULL;
		write_unlock(&est_lock);

		list_del_rcu(&e->list);
		kfree_rcu(e, e_rcu);
	}
	spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);

/**
 * gen_replace_estimator - replace rate estimator configuration
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
 * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
 * gen_kill_estimator() and gen_new_estimator().
 *
 * Returns 0 on success or a negative error code.
 */
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
			  struct gnet_stats_rate_est64 *rate_est,
			  spinlock_t *stats_lock, struct nlattr *opt)
{
	gen_kill_estimator(bstats, rate_est);
	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);

/**
 * gen_estimator_active - test if estimator is currently in use
 * @bstats: basic statistics
 * @rate_est: rate estimator statistics
 *
 * Returns true if estimator is active, and false if not.
 */
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
			  const struct gnet_stats_rate_est64 *rate_est)
{
	bool res;

	ASSERT_RTNL();

	spin_lock_bh(&est_tree_lock);
	res = gen_find_node(bstats, rate_est) != NULL;
	spin_unlock_bh(&est_tree_lock);

	return res;
}
EXPORT_SYMBOL(gen_estimator_active);