/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something) in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   the information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform
   sanity checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   the real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
     Expected action: backoff or ignore
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: backoff or error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers and counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
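
/* For illustration only: a minimal sketch of how a queue-style discipline
 * plugs into the enqueue/dequeue contract described above.  The names
 * example_enqueue, example_dequeue and example_qdisc_ops are hypothetical
 * and not part of this file.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(sch->q.qlen < qdisc_dev(sch)->tx_queue_len))
 *			return __qdisc_enqueue_tail(skb, sch, &sch->q);
 *		return qdisc_drop(skb, sch);
 *	}
 *	(qdisc_drop() accounts the drop and returns NET_XMIT_DROP)
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return __qdisc_dequeue_head(sch, &sch->q);
 *	}
 *	(returning NULL means nothing to send right now)
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * A module would then call register_qdisc(&example_qdisc_ops) from its
 * init path and unregister_qdisc() from its exit path.
 */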

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry_rcu(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		struct Qdisc *root = qdisc_dev(q)->qdisc;

		WARN_ON_ONCE(root == &noop_qdisc);
		ASSERT_RTNL();
		list_add_tail_rcu(&q->list, &root->list);
	}
}
EXPORT_SYMBOL(qdisc_list_add);

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		list_del_rcu(&q->list);
	}
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by checking whether the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding the mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, computing the table entry one below,
 * and comparing.
 */
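/* Worked example with illustrative numbers: for mpu = 0 and cell_log = 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, giving
 * cell_low = 0 and cell_high = (48 >> 3) - 1 = 5.  On an ATM-aligned
 * table, rtab[0]..rtab[5] all describe sizes within the same 48-byte
 * cell and therefore hold the same value; on a plain Ethernet table
 * they differ, which is exactly what __detect_linklayer() tests.
 */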
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool throttle)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (throttle)
		qdisc_throttled(wd->qdisc);

	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
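/* For example, the first handle this allocator hands out is 0x80010000,
 * i.e. "8001:" in tc notation; the minor half is always zero because
 * qdisc handles only use the major half.
 */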
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */
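
/* For orientation only, two illustrative tc(8) invocations and the paths
 * they take through qdisc_graft() (assuming a device eth0):
 *
 *	tc qdisc add dev eth0 root handle 1: htb
 *		-> the parent == NULL branch (root attach)
 *	tc qdisc add dev eth0 parent 1:1 handle 10: sfq
 *		-> the cops->graft() branch (attach under a class)
 */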

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try qdisc_lookup_ops again,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (qdisc_is_percpu_stats(sch)) {
			sch->cpu_bstats =
				netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
			if (!sch->cpu_bstats)
				goto err_out4;

			sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
			if (!sch->cpu_qstats)
				goto err_out4;
		}

		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats,
						sch->cpu_bstats,
						&sch->rate_est,
						root_lock,
						tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a
				 *   choice: either to change it or to
				 *   create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags
				 *   are set.
				 *
				 *   2. If EXCL is set, the requestor wanted
				 *   to say that qdisc tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND which does not match
				 *   the existing one.
				 */
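				/* For orientation only (iproute2 behaviour):
				 * "tc qdisc replace" sends
				 * NLM_F_CREATE|NLM_F_REPLACE, "tc qdisc add"
				 * sends NLM_F_CREATE|NLM_F_EXCL, and
				 * "tc qdisc change" sends no flags at all.
				 */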
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();
	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is the root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */
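
	/* For example, "tc class add dev eth0 parent 1: classid 1:10 ..."
	 * arrives here with tcm_parent == 1:0 and tcm_handle == 1:10, so
	 * the qdisc handle X:0 determined below resolves to 1:0.
	 */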

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		 * both parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
 * to this qdisc, (optionally) tests for protocol and asks the
 * specific classifiers.
 */
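/* Illustrative note: the tp chain is kept in priority order by cls_api.c,
 * so e.g. a u32 filter added at prio 1 is consulted before a fw filter at
 * prio 2, and the first classifier returning a non-negative verdict
 * terminates the scan.
 */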
int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = tc_skb_protocol(skb);
	int err;

	for (; tp; tp = rcu_dereference_bh(tp->next)) {
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;
		err = tp->classify(skb, tp, res);

		if (err >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *otp = tp;
reclassify:
#endif

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

bool tcf_destroy(struct tcf_proto *tp, bool force)
{
	if (tp->ops->destroy(tp, force)) {
		module_put(tp->ops->owner);
		kfree_rcu(tp, rcu);
		return true;
	}

	return false;
}

void tcf_destroy_chain(struct tcf_proto __rcu **fl)
{
	struct tcf_proto *tp;

	while ((tp = rtnl_dereference(*fl)) != NULL) {
		RCU_INIT_POINTER(*fl, tp->next);
		tcf_destroy(tp, true);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);

	return 0;
}

subsys_initcall(pktsched_init);