/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize away the dummy loop */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
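
/*
 * Usage sketch (mirrors how these iterators are used throughout this
 * file): the opening brace in for_nexthops()/change_nexthops() starts
 * a scope that endfor_nexthops() closes, so nhsel, nh and nexthop_nh
 * stay local to the loop.
 *
 *	change_nexthops(fi) {
 *		if (nexthop_nh->nh_dev == dev)
 *			nexthop_nh->nh_flags |= RTNH_F_DEAD;
 *	} endfor_nexthops(fi)
 */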


const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};
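
/*
 * Illustrative use of the table above (a sketch, not code from this
 * file): a blackhole route reports -EINVAL to senders and carries
 * universal scope, so
 *
 *	fib_props[RTN_BLACKHOLE].error == -EINVAL
 *	fib_props[RTN_BLACKHOLE].scope == RT_SCOPE_UNIVERSE
 *
 * fib_create_info() below relies on the scope column for its
 * "fib_props[cfg->fc_type].scope > cfg->fc_scope" sanity check.
 */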

static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed: RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_free(&rt->dst);
}

static void free_nh_exceptions(struct fib_nh *nh)
{
	struct fnhe_hash_bucket *hash;
	int i;

	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!hash)
		return;
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			next = rcu_dereference_protected(fnhe->fnhe_next, 1);

			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);

			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt)
			dst_free(&rt->dst);
	}
	free_percpu(rtp);
}
/* Release a fib_info record and the nexthop state hanging off it */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		free_nh_exceptions(nexthop_nh);
		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
	} endfor_nexthops(fi);

	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
	kfree(fi);
}

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}
	fib_info_cnt--;
#ifdef CONFIG_IP_ROUTE_CLASSID
	change_nexthops(fi) {
		if (nexthop_nh->nh_tclassid)
			fi->fib_net->ipv4.fib_num_tclassid_users--;
	} endfor_nexthops(fi);
#endif
	call_rcu(&fi->rcu, free_fib_info_rcu);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
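
/*
 * Worked example (illustrative arithmetic, not code from this file):
 * ifindex 0x12345 folds to
 *
 *	(0x12345 ^ 0x123 ^ 0x1) & 0xff == 0x67
 *
 * i.e. one of the DEVINDEX_HASHSIZE (256) buckets of fib_info_devhash.
 */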

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    nfi->fib_type == fi->fib_type &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check that the gateway is already configured.
 * Used only by the redirect acceptance routine.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags & RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

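/* Probe the ARP table to guess whether the first nexthop's gateway is
 * still alive (explanatory summary of the logic below): NUD_REACHABLE
 * means the neighbour was confirmed recently, and any other NUD_VALID
 * state is still trusted unless this route is the current default.
 * Returns 0 if the route looks usable, 1 if it looks dead, remembering
 * the best fallback in *last_resort / *last_idx.
 */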
static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
			if (nexthop_nh->nh_tclassid)
				fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (!cfg->fc_mp)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_in_addr(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
 * Picture
 * -------
 *
 * The semantics of nexthops are very messy for historical reasons.
 * We have to take into account that:
 * a) the gateway can actually be a local interface address,
 *    so that a gatewayed route is really direct.
 * b) the gateway must be an on-link address, possibly
 *    described not by an ifaddr but by a direct route.
 * c) if both gateway and interface are specified, they must not
 *    contradict each other.
 * d) if we use tunnel routes, the gateway may not be on-link.
 *
 * Attempting to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the amount of code
 * barely grows but the result is much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
 * which means that gw is forced to be on-link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes to
 * coexist in peace.
 *
 * Normally it looks like this:
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
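/* A concrete illustration (hypothetical iproute2 commands, not taken
 * from this file): the universe-scope route below resolves its gateway
 * through the link-scope prefix, and "onlink" cuts the recursion short
 * by forcing the gateway to be treated as on-link:
 *
 *	ip route add 192.168.0.0/24 dev eth0		(scope link)
 *	ip route add 10.1.0.0/16 via 192.168.0.1	(scope universe)
 *	ip route add 10.2.0.0/16 via 172.16.0.1 dev eth0 onlink
 */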
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;
	struct net_device *dev;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags & RTNH_F_ONLINK) {

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
				return -ENODEV;
			if (!(dev->flags & IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		rcu_read_lock();
		{
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
				.flowi4_iif = LOOPBACK_IFINDEX,
			};

			/* The clamp below is not strictly necessary, but it takes a bit of thinking */
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
			err = fib_lookup(net, &fl4, &res);
			if (err) {
				rcu_read_unlock();
				return err;
			}
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
			goto out;
		dev_hold(dev);
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
			return -EINVAL;

		rcu_read_lock();
		err = -ENODEV;
		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (!in_dev)
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		err = 0;
	}
out:
	rcu_read_unlock();
	return err;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_info_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_info_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
}

static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_info_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}

__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->nh_dev,
					nh->nh_gw,
					nh->nh_parent->fib_scope);
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the weirdest cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 16;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_info_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh), GFP_KERNEL);
	if (!fi)
		goto failure;
	fib_info_cnt++;
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;

	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
		if (!nexthop_nh->nh_pcpu_rth_output)
			goto failure;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				u32 val;

				if (type > RTAX_MAX)
					goto err_inval;
				if (type == RTAX_CC_ALGO) {
					char tmp[TCP_CA_NAME_MAX];

					nla_strlcpy(tmp, nla, sizeof(tmp));
					val = tcp_ca_get_key_by_name(tmp);
					if (val == TCP_CA_UNSPEC)
						goto err_inval;
				} else {
					val = nla_get_u32(nla);
				}
				if (type == RTAX_ADVMSS && val > 65535 - 40)
					val = 65535 - 40;
				if (type == RTAX_MTU && val > 65535 - 15)
					val = 65535 - 15;
				fi->fib_metrics[type - 1] = val;
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh->nh_tclassid = cfg->fc_flow;
		if (nh->nh_tclassid)
			fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (!nh->nh_dev)
			goto failure;
	} else {
		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
	} endfor_nexthops(fi)

link_it:
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len &&
	    nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;
	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw &&
		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
			goto nla_put_failure;
		if (fi->fib_nh->nh_oif &&
		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
			goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
			goto nla_put_failure;
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw &&
			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
				goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 */
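/* A note on fib_sync_down_dev() below (explanatory, derived from the
 * code): the default scope of RT_SCOPE_NOWHERE spares local nexthops,
 * whose nh_scope is set to RT_SCOPE_NOWHERE in fib_create_info();
 * force drops scope to -1 so those die too, and force > 1 kills a
 * whole multipath fib_info as soon as one nexthop uses the device.
 */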
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct fib_info *fi;

	if (!fib_info_laddrhash || local == 0)
		return 0;

	hlist_for_each_entry(fi, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/* Must be invoked inside of an RCU protected region.  */
void fib_select_default(struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	int order = -1, last_idx = -1;
	struct fib_alias *fa;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || !fi) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
				tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	return;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
 * A dead device goes up. We wake up dead nexthops.
 * This makes sense only on multipath routes.
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (!nexthop_nh->nh_dev ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
 */
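/* Worked example (illustrative, derived from the code below): two
 * alive nexthops with weights 3 and 1 recharge fib_power to 4; each
 * pick decrements the chosen nexthop's nh_power and fi->fib_power,
 * so within one recharge cycle of 4 picks the first nexthop is
 * selected exactly 3 times and the second exactly once.
 */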
void fib_select_multipath(struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	 * jiffies is a pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			w -= nexthop_nh->nh_power;
			if (w <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif
