1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #define pr_fmt(fmt) "IPv4: " fmt
66 
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #include <linux/kmemleak.h>
110 #endif
111 #include <net/secure_seq.h>
112 
/* Mask a flow's flowi4_tos with (IPTOS_RT_MASK | RTO_ONLINK). */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Initial value of ip_rt_gc_timeout below: 300 * HZ jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/* NOTE(review): not referenced in this part of the file. */
static int ip_rt_max_size;
/* ip_rt_send_redirect() stops sending once a peer's rate_tokens reaches this. */
static int ip_rt_redirect_number __read_mostly	= 9;
/* Base redirect back-off in jiffies; shifted left by the peer's rate_tokens. */
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Quiet time after which ip_rt_send_redirect() resets a peer's rate_tokens. */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
/* Tokens ip_error() deducts from a peer for each ICMP error it sends. */
static int ip_rt_error_cost __read_mostly	= HZ;
/* Upper bound ip_error() applies to a peer's accumulated tokens. */
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* Lifetime in jiffies given to PMTU exceptions by __ip_rt_update_pmtu(). */
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
/* Smallest path MTU __ip_rt_update_pmtu() will record. */
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
/* NOTE(review): not referenced in this part of the file. */
static int ip_rt_min_advmss __read_mostly	= 256;

/* Added to jiffies as the expiry of redirect exceptions in __ip_do_redirect(). */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
129 /*
130  *	Interface to generic destination cache.
131  */
132 
133 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
134 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
135 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
136 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
137 static void		 ipv4_link_failure(struct sk_buff *skb);
138 static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
139 					   struct sk_buff *skb, u32 mtu);
140 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
141 					struct sk_buff *skb);
142 static void		ipv4_dst_destroy(struct dst_entry *dst);
143 
ipv4_cow_metrics(struct dst_entry * dst,unsigned long old)144 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
145 {
146 	WARN_ON(1);
147 	return NULL;
148 }
149 
150 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
151 					   struct sk_buff *skb,
152 					   const void *daddr);
153 
/*
 * struct dst_ops instance for AF_INET.  Each member names the handler
 * used for the corresponding dst operation; see the individual handlers
 * for their behaviour.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
168 
/* Expands to TC_PRIO_<class>, i.e. the same value as the plain entry before it. */
#define ECN_OR_COST(class)	TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.  Entries come in groups of four
 * identical priorities (BESTEFFORT, BULK, INTERACTIVE, INTERACTIVE_BULK).
 * NOTE(review): the expression used to index this table lives outside
 * this part of the file.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
190 
/* Per-CPU routing statistics, reported by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one field of the rt_cache_stat per-CPU variable via raw_cpu_inc(). */
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
193 
194 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start for the rt_cache file.
 *
 * Returns SEQ_START_TOKEN at position 0 so the header gets shown, and
 * NULL for every other position.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}
201 
/*
 * seq_file ->next for the rt_cache file: advance the position and report
 * that there are no further records.
 */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return NULL;
}
207 
/* seq_file ->stop for the rt_cache file; intentionally does nothing. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* nothing was acquired in ->start, so nothing to undo */
}
211 
rt_cache_seq_show(struct seq_file * seq,void * v)212 static int rt_cache_seq_show(struct seq_file *seq, void *v)
213 {
214 	if (v == SEQ_START_TOKEN)
215 		seq_printf(seq, "%-127s\n",
216 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
217 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
218 			   "HHUptod\tSpecDst");
219 	return 0;
220 }
221 
/* seq_file iterator for the rt_cache file opened by rt_cache_seq_open(). */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
228 
rt_cache_seq_open(struct inode * inode,struct file * file)229 static int rt_cache_seq_open(struct inode *inode, struct file *file)
230 {
231 	return seq_open(file, &rt_cache_seq_ops);
232 }
233 
/* File operations registered for "rt_cache" under net->proc_net. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
241 
242 
/*
 * seq_file ->start for the per-CPU statistics file.
 *
 * Position 0 is the header (SEQ_START_TOKEN).  Position p > 0 refers to
 * CPU p - 1; impossible CPUs are skipped, and *pos is moved to one past
 * the CPU whose statistics are returned.  Returns NULL once no possible
 * CPU remains.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int idx;

	if (!*pos)
		return SEQ_START_TOKEN;

	idx = *pos - 1;
	while (idx < nr_cpu_ids) {
		if (cpu_possible(idx)) {
			*pos = idx + 1;
			return &per_cpu(rt_cache_stat, idx);
		}
		++idx;
	}
	return NULL;
}
258 
/*
 * seq_file ->next for the per-CPU statistics file.
 *
 * Scans from CPU index *pos for the next possible CPU, stores that index
 * plus one in *pos and returns its rt_cache_stat.  Returns NULL when the
 * scan reaches nr_cpu_ids.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int idx = *pos;

	for (; idx < nr_cpu_ids; ++idx) {
		if (cpu_possible(idx)) {
			*pos = idx + 1;
			return &per_cpu(rt_cache_stat, idx);
		}
	}
	return NULL;
}
272 
/* seq_file ->stop for the per-CPU statistics file; intentionally empty. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* ->start takes no lock or reference, so there is nothing to drop */
}
277 
rt_cpu_seq_show(struct seq_file * seq,void * v)278 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
279 {
280 	struct rt_cache_stat *st = v;
281 
282 	if (v == SEQ_START_TOKEN) {
283 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
284 		return 0;
285 	}
286 
287 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
288 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
289 		   dst_entries_get_slow(&ipv4_dst_ops),
290 		   0, /* st->in_hit */
291 		   st->in_slow_tot,
292 		   st->in_slow_mc,
293 		   st->in_no_route,
294 		   st->in_brd,
295 		   st->in_martian_dst,
296 		   st->in_martian_src,
297 
298 		   0, /* st->out_hit */
299 		   st->out_slow_tot,
300 		   st->out_slow_mc,
301 
302 		   0, /* st->gc_total */
303 		   0, /* st->gc_ignored */
304 		   0, /* st->gc_goal_miss */
305 		   0, /* st->gc_dst_overflow */
306 		   0, /* st->in_hlist_search */
307 		   0  /* st->out_hlist_search */
308 		);
309 	return 0;
310 }
311 
/* seq_file iterator for the per-CPU statistics file opened by rt_cpu_seq_open(). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
318 
319 
rt_cpu_seq_open(struct inode * inode,struct file * file)320 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
321 {
322 	return seq_open(file, &rt_cpu_seq_ops);
323 }
324 
/* File operations registered for "rt_cache" under net->proc_net_stat. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
332 
333 #ifdef CONFIG_IP_ROUTE_CLASSID
rt_acct_proc_show(struct seq_file * m,void * v)334 static int rt_acct_proc_show(struct seq_file *m, void *v)
335 {
336 	struct ip_rt_acct *dst, *src;
337 	unsigned int i, j;
338 
339 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
340 	if (!dst)
341 		return -ENOMEM;
342 
343 	for_each_possible_cpu(i) {
344 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
345 		for (j = 0; j < 256; j++) {
346 			dst[j].o_bytes   += src[j].o_bytes;
347 			dst[j].o_packets += src[j].o_packets;
348 			dst[j].i_bytes   += src[j].i_bytes;
349 			dst[j].i_packets += src[j].i_packets;
350 		}
351 	}
352 
353 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
354 	kfree(dst);
355 	return 0;
356 }
357 
rt_acct_proc_open(struct inode * inode,struct file * file)358 static int rt_acct_proc_open(struct inode *inode, struct file *file)
359 {
360 	return single_open(file, rt_acct_proc_show, NULL);
361 }
362 
/* File operations registered for "rt_acct" under net->proc_net. */
static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
370 #endif
371 
ip_rt_do_proc_init(struct net * net)372 static int __net_init ip_rt_do_proc_init(struct net *net)
373 {
374 	struct proc_dir_entry *pde;
375 
376 	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
377 			  &rt_cache_seq_fops);
378 	if (!pde)
379 		goto err1;
380 
381 	pde = proc_create("rt_cache", S_IRUGO,
382 			  net->proc_net_stat, &rt_cpu_seq_fops);
383 	if (!pde)
384 		goto err2;
385 
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
388 	if (!pde)
389 		goto err3;
390 #endif
391 	return 0;
392 
393 #ifdef CONFIG_IP_ROUTE_CLASSID
394 err3:
395 	remove_proc_entry("rt_cache", net->proc_net_stat);
396 #endif
397 err2:
398 	remove_proc_entry("rt_cache", net->proc_net);
399 err1:
400 	return -ENOMEM;
401 }
402 
ip_rt_do_proc_exit(struct net * net)403 static void __net_exit ip_rt_do_proc_exit(struct net *net)
404 {
405 	remove_proc_entry("rt_cache", net->proc_net_stat);
406 	remove_proc_entry("rt_cache", net->proc_net);
407 #ifdef CONFIG_IP_ROUTE_CLASSID
408 	remove_proc_entry("rt_acct", net->proc_net);
409 #endif
410 }
411 
/* Per-namespace hooks registered by ip_rt_proc_init(). */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};
416 
ip_rt_proc_init(void)417 static int __init ip_rt_proc_init(void)
418 {
419 	return register_pernet_subsys(&ip_rt_proc_ops);
420 }
421 
422 #else
/* Variant used without CONFIG_PROC_FS: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
427 #endif /* CONFIG_PROC_FS */
428 
rt_is_expired(const struct rtable * rth)429 static inline bool rt_is_expired(const struct rtable *rth)
430 {
431 	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
432 }
433 
/*
 * Bump the IPv4 route generation id of @net.  rt_is_expired() compares
 * each route's rt_genid against this value.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
438 
ipv4_neigh_lookup(const struct dst_entry * dst,struct sk_buff * skb,const void * daddr)439 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
440 					   struct sk_buff *skb,
441 					   const void *daddr)
442 {
443 	struct net_device *dev = dst->dev;
444 	const __be32 *pkey = daddr;
445 	const struct rtable *rt;
446 	struct neighbour *n;
447 
448 	rt = (const struct rtable *) dst;
449 	if (rt->rt_gateway)
450 		pkey = (const __be32 *) &rt->rt_gateway;
451 	else if (skb)
452 		pkey = &ip_hdr(skb)->daddr;
453 
454 	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
455 	if (n)
456 		return n;
457 	return neigh_create(&arp_tbl, pkey, dev);
458 }
459 
/* Modulus ip_idents_reserve() applies to the hash when picking a bucket. */
#define IP_IDENTS_SZ 2048u
/* One IP identifier generator. */
struct ip_ident_bucket {
	atomic_t	id;		/* running identifier counter */
	u32		stamp32;	/* (u32)jiffies stored by ip_idents_reserve() */
};

/*
 * Bucket array indexed by hash % IP_IDENTS_SZ.
 * NOTE(review): allocated outside this part of the file; presumably
 * IP_IDENTS_SZ entries -- confirm at the allocation site.
 */
static struct ip_ident_bucket *ip_idents __read_mostly;
467 
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
/*
 * Reserve @segs consecutive identifiers from the bucket selected by
 * @hash % IP_IDENTS_SZ and return the first one.
 *
 * If the bucket's stored timestamp differs from the current jiffies and
 * this caller's cmpxchg() installs the new timestamp, a random skip below
 * the elapsed jiffies is added to the counter as well.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *slot = &ip_idents[hash % IP_IDENTS_SZ];
	u32 seen = ACCESS_ONCE(slot->stamp32);
	u32 now = (u32)jiffies;
	u32 skip = 0;

	/* Only the caller whose cmpxchg succeeds applies the perturbation. */
	if (seen != now && cmpxchg(&slot->stamp32, seen, now) == seen)
		skip = prandom_u32_max(now - seen);

	return atomic_add_return(segs + skip, &slot->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
485 
__ip_select_ident(struct net * net,struct iphdr * iph,int segs)486 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
487 {
488 	static u32 ip_idents_hashrnd __read_mostly;
489 	u32 hash, id;
490 
491 	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
492 
493 	hash = jhash_3words((__force u32)iph->daddr,
494 			    (__force u32)iph->saddr,
495 			    iph->protocol ^ net_hash_mix(net),
496 			    ip_idents_hashrnd);
497 	id = ip_idents_reserve(hash, segs);
498 	iph->id = htons(id);
499 }
500 EXPORT_SYMBOL(__ip_select_ident);
501 
/*
 * Initialise @fl4 as an output flow towards iph->daddr from iph->saddr
 * with scope RT_SCOPE_UNIVERSE and the last two flowi4_init_output()
 * arguments set to zero.
 *
 * When @sk is non-NULL the caller-supplied @oif, @mark, @tos and @prot
 * are ignored and taken from the socket instead (bound device, mark,
 * RT_CONN_FLAGS(), and IPPROTO_RAW for hdrincl sockets or sk_protocol
 * otherwise).
 */
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
		tos = RT_CONN_FLAGS(sk);
		mark = sk->sk_mark;
		oif = sk->sk_bound_dev_if;
	}

	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}
520 
build_skb_flow_key(struct flowi4 * fl4,const struct sk_buff * skb,const struct sock * sk)521 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
522 			       const struct sock *sk)
523 {
524 	const struct iphdr *iph = ip_hdr(skb);
525 	int oif = skb->dev->ifindex;
526 	u8 tos = RT_TOS(iph->tos);
527 	u8 prot = iph->protocol;
528 	u32 mark = skb->mark;
529 
530 	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
531 }
532 
build_sk_flow_key(struct flowi4 * fl4,const struct sock * sk)533 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
534 {
535 	const struct inet_sock *inet = inet_sk(sk);
536 	const struct ip_options_rcu *inet_opt;
537 	__be32 daddr = inet->inet_daddr;
538 
539 	rcu_read_lock();
540 	inet_opt = rcu_dereference(inet->inet_opt);
541 	if (inet_opt && inet_opt->opt.srr)
542 		daddr = inet_opt->opt.faddr;
543 	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
544 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
545 			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
546 			   inet_sk_flowi_flags(sk),
547 			   daddr, inet->inet_saddr, 0, 0);
548 	rcu_read_unlock();
549 }
550 
/*
 * Build @fl4 from @skb when one is supplied, otherwise from @sk alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
559 
rt_free(struct rtable * rt)560 static inline void rt_free(struct rtable *rt)
561 {
562 	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
563 }
564 
/* Held (with BHs disabled) by update_or_create_fnhe() while it modifies exception tables. */
static DEFINE_SPINLOCK(fnhe_lock);
566 
fnhe_flush_routes(struct fib_nh_exception * fnhe)567 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
568 {
569 	struct rtable *rt;
570 
571 	rt = rcu_dereference(fnhe->fnhe_rth_input);
572 	if (rt) {
573 		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
574 		rt_free(rt);
575 	}
576 	rt = rcu_dereference(fnhe->fnhe_rth_output);
577 	if (rt) {
578 		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
579 		rt_free(rt);
580 	}
581 }
582 
fnhe_oldest(struct fnhe_hash_bucket * hash)583 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
584 {
585 	struct fib_nh_exception *fnhe, *oldest;
586 
587 	oldest = rcu_dereference(hash->chain);
588 	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
589 	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
590 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
591 			oldest = fnhe;
592 	}
593 	fnhe_flush_routes(oldest);
594 	return oldest;
595 }
596 
fnhe_hashfun(__be32 daddr)597 static inline u32 fnhe_hashfun(__be32 daddr)
598 {
599 	static u32 fnhe_hashrnd __read_mostly;
600 	u32 hval;
601 
602 	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
603 	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
604 	return hash_32(hval, FNHE_HASH_SHIFT);
605 }
606 
fill_route_from_fnhe(struct rtable * rt,struct fib_nh_exception * fnhe)607 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
608 {
609 	rt->rt_pmtu = fnhe->fnhe_pmtu;
610 	rt->dst.expires = fnhe->fnhe_expires;
611 
612 	if (fnhe->fnhe_gw) {
613 		rt->rt_flags |= RTCF_REDIRECTED;
614 		rt->rt_gateway = fnhe->fnhe_gw;
615 		rt->rt_uses_gateway = 1;
616 	}
617 }
618 
/*
 * Record a nexthop exception (redirect gateway and/or path MTU) for
 * @daddr on nexthop @nh.
 *
 * @gw:      new gateway, or 0 to leave an existing gateway untouched
 * @pmtu:    new path MTU, or 0 to leave an existing PMTU/expiry untouched
 * @expires: jiffies value at which the exception expires
 *
 * Runs under fnhe_lock with BHs disabled.  The per-nexthop hash table is
 * allocated on first use.  If an entry for @daddr exists it is updated
 * and its cached routes are refreshed; otherwise a new entry is created,
 * or, once the bucket chain is longer than FNHE_RECLAIM_DEPTH, the oldest
 * entry of the chain is recycled.  Creating an entry marks the nexthop's
 * cached input and per-CPU output routes DST_OBSOLETE_KILL.
 *
 * Allocation failures (GFP_ATOMIC) are silent: nothing is recorded.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	/* Look for an existing entry, counting the chain length on the way. */
	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		/* Same clamp as the update path: a jiffies sum that wraps to
		 * exactly 0 must not be stored, since fnhe_expires is copied
		 * to dst.expires, where zero is tested as "not set".
		 */
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	/* Used by fnhe_oldest() to pick a victim. */
	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
702 
/*
 * Process an ICMP redirect carried by @skb for route @rt.
 *
 * The redirect is ignored silently when its code (low three bits) is not
 * one of the four ICMP_REDIR_* values, when it does not come from the
 * route's current gateway, or when the receiving device has no in_device.
 * It is rejected (and, with CONFIG_IP_ROUTE_VERBOSE and martian logging
 * enabled, logged) when the new gateway fails the sanity checks below.
 *
 * Otherwise a neighbour for the new gateway is looked up.  If it is not
 * yet NUD_VALID, resolution is kicked off and nothing else happens.  If
 * it is valid, a nexthop exception towards the new gateway is recorded
 * for fl4->daddr (when the FIB lookup succeeds), @rt is marked
 * DST_OBSOLETE_KILL if @kill_route, and NETEVENT_NEIGH_UPDATE is raised.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct neighbour *neigh;
	struct fib_result res;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway we are currently using may redirect us. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (IN_DEV_SHARED_MEDIA(in_dev)) {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	} else {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	}

	neigh = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (IS_ERR(neigh))
		return;

	if (neigh->nud_state & NUD_VALID) {
		if (fib_lookup(net, fl4, &res) == 0) {
			struct fib_nh *nh = &FIB_RES_NH(res);

			update_or_create_fnhe(nh, fl4->daddr, new_gw,
					0, jiffies + ip_rt_gc_timeout);
		}
		if (kill_route)
			rt->dst.obsolete = DST_OBSOLETE_KILL;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
	} else {
		/* Not resolved yet: start resolution, apply nothing. */
		neigh_event_send(neigh, NULL);
	}
	neigh_release(neigh);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
782 
ip_do_redirect(struct dst_entry * dst,struct sock * sk,struct sk_buff * skb)783 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
784 {
785 	struct rtable *rt;
786 	struct flowi4 fl4;
787 	const struct iphdr *iph = (const struct iphdr *) skb->data;
788 	int oif = skb->dev->ifindex;
789 	u8 tos = RT_TOS(iph->tos);
790 	u8 prot = iph->protocol;
791 	u32 mark = skb->mark;
792 
793 	rt = (struct rtable *) dst;
794 
795 	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
796 	__ip_do_redirect(rt, skb, &fl4, true);
797 }
798 
ipv4_negative_advice(struct dst_entry * dst)799 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
800 {
801 	struct rtable *rt = (struct rtable *)dst;
802 	struct dst_entry *ret = dst;
803 
804 	if (rt) {
805 		if (dst->obsolete > 0) {
806 			ip_rt_put(rt);
807 			ret = NULL;
808 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
809 			   rt->dst.expires) {
810 			ip_rt_put(rt);
811 			ret = NULL;
812 		}
813 	}
814 	return ret;
815 }
816 
817 /*
818  * Algorithm:
819  *	1. The first ip_rt_redirect_number redirects are sent
820  *	   with exponential backoff, then we stop sending them at all,
821  *	   assuming that the host ignores our redirects.
822  *	2. If we did not see packets requiring redirects
823  *	   during ip_rt_redirect_silence, we assume that the host
824  *	   forgot redirected route and start to send redirects again.
825  *
826  * This algorithm is much cheaper and more intelligent than dumb load limiting
827  * in icmp.c.
828  *
829  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
830  * and "frag. need" (breaks PMTU discovery) in icmp.c.
831  */
832 
ip_rt_send_redirect(struct sk_buff * skb)833 void ip_rt_send_redirect(struct sk_buff *skb)
834 {
835 	struct rtable *rt = skb_rtable(skb);
836 	struct in_device *in_dev;
837 	struct inet_peer *peer;
838 	struct net *net;
839 	int log_martians;
840 
841 	rcu_read_lock();
842 	in_dev = __in_dev_get_rcu(rt->dst.dev);
843 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
844 		rcu_read_unlock();
845 		return;
846 	}
847 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
848 	rcu_read_unlock();
849 
850 	net = dev_net(rt->dst.dev);
851 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
852 	if (!peer) {
853 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
854 			  rt_nexthop(rt, ip_hdr(skb)->daddr));
855 		return;
856 	}
857 
858 	/* No redirected packets during ip_rt_redirect_silence;
859 	 * reset the algorithm.
860 	 */
861 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
862 		peer->rate_tokens = 0;
863 
864 	/* Too many ignored redirects; do not send anything
865 	 * set dst.rate_last to the last seen redirected packet.
866 	 */
867 	if (peer->rate_tokens >= ip_rt_redirect_number) {
868 		peer->rate_last = jiffies;
869 		goto out_put_peer;
870 	}
871 
872 	/* Check for load limit; set rate_last to the latest sent
873 	 * redirect.
874 	 */
875 	if (peer->rate_tokens == 0 ||
876 	    time_after(jiffies,
877 		       (peer->rate_last +
878 			(ip_rt_redirect_load << peer->rate_tokens)))) {
879 		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
880 
881 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
882 		peer->rate_last = jiffies;
883 		++peer->rate_tokens;
884 #ifdef CONFIG_IP_ROUTE_VERBOSE
885 		if (log_martians &&
886 		    peer->rate_tokens == ip_rt_redirect_number)
887 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
888 					     &ip_hdr(skb)->saddr, inet_iif(skb),
889 					     &ip_hdr(skb)->daddr, &gw);
890 #endif
891 	}
892 out_put_peer:
893 	inet_putpeer(peer);
894 }
895 
ip_error(struct sk_buff * skb)896 static int ip_error(struct sk_buff *skb)
897 {
898 	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
899 	struct rtable *rt = skb_rtable(skb);
900 	struct inet_peer *peer;
901 	unsigned long now;
902 	struct net *net;
903 	bool send;
904 	int code;
905 
906 	/* IP on this device is disabled. */
907 	if (!in_dev)
908 		goto out;
909 
910 	net = dev_net(rt->dst.dev);
911 	if (!IN_DEV_FORWARD(in_dev)) {
912 		switch (rt->dst.error) {
913 		case EHOSTUNREACH:
914 			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
915 			break;
916 
917 		case ENETUNREACH:
918 			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
919 			break;
920 		}
921 		goto out;
922 	}
923 
924 	switch (rt->dst.error) {
925 	case EINVAL:
926 	default:
927 		goto out;
928 	case EHOSTUNREACH:
929 		code = ICMP_HOST_UNREACH;
930 		break;
931 	case ENETUNREACH:
932 		code = ICMP_NET_UNREACH;
933 		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
934 		break;
935 	case EACCES:
936 		code = ICMP_PKT_FILTERED;
937 		break;
938 	}
939 
940 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
941 
942 	send = true;
943 	if (peer) {
944 		now = jiffies;
945 		peer->rate_tokens += now - peer->rate_last;
946 		if (peer->rate_tokens > ip_rt_error_burst)
947 			peer->rate_tokens = ip_rt_error_burst;
948 		peer->rate_last = now;
949 		if (peer->rate_tokens >= ip_rt_error_cost)
950 			peer->rate_tokens -= ip_rt_error_cost;
951 		else
952 			send = false;
953 		inet_putpeer(peer);
954 	}
955 	if (send)
956 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
957 
958 out:	kfree_skb(skb);
959 	return 0;
960 }
961 
/*
 * Record a learned path MTU of @mtu for the flow @fl4 routed via @rt.
 *
 * Nothing is done when the route's MTU metric is locked, when @mtu is
 * larger than the route's current MTU, or when the route already has
 * this PMTU and more than half of ip_rt_mtu_expires remains before it
 * expires.  Values below ip_rt_min_pmtu are raised to that minimum.
 * Otherwise a nexthop exception expiring ip_rt_mtu_expires from now is
 * created or updated for fl4->daddr, provided the FIB lookup succeeds.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU) || ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	/* Already known and still fresh: skip the exception update. */
	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
989 
/*
 * dst_ops->update_pmtu hook for IPv4: build the flow key from @skb (or
 * from @sk alone when @skb is NULL) and record @mtu for @dst.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu((struct rtable *) dst, &fl4, mtu);
}
999 
/*
 * Record a path MTU of @mtu for the flow described by the IP header at
 * skb->data, without socket context.
 *
 * A zero @mark is replaced by IP4_REPLY_MARK(net, skb->mark).  The flow
 * is routed through __ip_route_output_key(); if that fails nothing is
 * recorded.  The route reference taken for the lookup is dropped before
 * returning.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
	ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1019 
/*
 * Socket flavour of the PMTU update that does not touch the socket's own
 * cached route: the flow key is built from @sk and the IPv4 header at
 * skb->data, a fresh output route is looked up, updated and released.
 * When the key carries no mark, IP4_REPLY_MARK() supplies one.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct net *net = sock_net(sk);
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	if (fl4.flowi4_mark == 0)
		fl4.flowi4_mark = IP4_REPLY_MARK(net, skb->mark);

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
	ip_rt_put(rt);
}
1037 
/*
 * Apply a path MTU on behalf of socket @sk and, where needed, replace the
 * route cached in the socket.
 *
 * @skb: buffer whose data pointer is read as the IPv4 header of the flow
 * @sk:  socket the update belongs to
 * @mtu: MTU to record
 *
 * Runs under bh_lock_sock(). Nothing happens when ip_sk_accept_pmtu()
 * rejects the socket. When the socket is owned by user context, or has no
 * cached dst, the update is applied through __ipv4_sk_update_pmtu() and
 * the socket's cached route is left alone.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	/* true once rt holds a reference obtained by a lookup in this function */
	bool new = false;

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	/* Reference on the socket's current dst; dropped at "out". */
	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	/* Cached dst no longer validates: work on a freshly looked-up route. */
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

	/* If the route fails validation after the update, look it up again.
	 * A reference taken by the earlier lookup is dropped first; odst's
	 * reference is not touched here.
	 */
	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	/* Hand the freshly looked-up route to the socket. */
	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	/* odst is still NULL here when PMTU updates were not accepted. */
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1090 
/*
 * Process a redirect for the flow described by the IPv4 header found at
 * skb->data: build the flow key from the header and the supplied
 * @oif/@mark/@protocol/@flow_flags, look up the output route in @net and
 * pass it to __ip_do_redirect(). A failed lookup is silently ignored.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif, RT_TOS(iph->tos), protocol,
			 mark, flow_flags);

	rt = __ip_route_output_key(net, &fl4);
	if (IS_ERR(rt))
		return;

	__ip_do_redirect(rt, skb, &fl4, false);
	ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
1107 
ipv4_sk_redirect(struct sk_buff * skb,struct sock * sk)1108 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1109 {
1110 	const struct iphdr *iph = (const struct iphdr *) skb->data;
1111 	struct flowi4 fl4;
1112 	struct rtable *rt;
1113 
1114 	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1115 	rt = __ip_route_output_key(sock_net(sk), &fl4);
1116 	if (!IS_ERR(rt)) {
1117 		__ip_do_redirect(rt, skb, &fl4, false);
1118 		ip_rt_put(rt);
1119 	}
1120 }
1121 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1122 
/*
 * Validate a cached IPv4 dst.
 *
 * Returns @dst when it is still usable and NULL otherwise. @cookie is not
 * consulted.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *)dst;

	/* Every IPv4 dst is created with ->obsolete set to
	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
	 * into this function every time.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * ->obsolete is changed to DST_OBSOLETE_KILL, or to
	 * DST_OBSOLETE_DEAD by dst_free(). Any value other than
	 * DST_OBSOLETE_FORCE_CHK therefore means the entry is stale.
	 */
	if (dst->obsolete == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt))
		return dst;

	return NULL;
}
1139 
ipv4_link_failure(struct sk_buff * skb)1140 static void ipv4_link_failure(struct sk_buff *skb)
1141 {
1142 	struct rtable *rt;
1143 
1144 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1145 
1146 	rt = skb_rtable(skb);
1147 	if (rt)
1148 		dst_set_expires(&rt->dst, 0);
1149 }
1150 
ip_rt_bug(struct sock * sk,struct sk_buff * skb)1151 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1152 {
1153 	pr_debug("%s: %pI4 -> %pI4, %s\n",
1154 		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1155 		 skb->dev ? skb->dev->name : "?");
1156 	kfree_skb(skb);
1157 	WARN_ON(1);
1158 	return 0;
1159 }
1160 
1161 /*
1162    We do not cache source address of outgoing interface,
1163    because it is used only by IP RR, TS and SRR options,
1164    so that it out of fast path.
1165 
1166    BTW remember: "addr" is allowed to be not aligned
1167    in IP options!
1168  */
1169 
ip_rt_get_source(u8 * addr,struct sk_buff * skb,struct rtable * rt)1170 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1171 {
1172 	__be32 src;
1173 
1174 	if (rt_is_output_route(rt))
1175 		src = ip_hdr(skb)->saddr;
1176 	else {
1177 		struct fib_result res;
1178 		struct flowi4 fl4;
1179 		struct iphdr *iph;
1180 
1181 		iph = ip_hdr(skb);
1182 
1183 		memset(&fl4, 0, sizeof(fl4));
1184 		fl4.daddr = iph->daddr;
1185 		fl4.saddr = iph->saddr;
1186 		fl4.flowi4_tos = RT_TOS(iph->tos);
1187 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1188 		fl4.flowi4_iif = skb->dev->ifindex;
1189 		fl4.flowi4_mark = skb->mark;
1190 
1191 		rcu_read_lock();
1192 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1193 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1194 		else
1195 			src = inet_select_addr(rt->dst.dev,
1196 					       rt_nexthop(rt, iph->daddr),
1197 					       RT_SCOPE_UNIVERSE);
1198 		rcu_read_unlock();
1199 	}
1200 	memcpy(addr, &src, 4);
1201 }
1202 
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, treating the low and the high
 * 16 bits as two independent fields: a field is filled from @tag only
 * while it is still zero, so a value set earlier is never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;
	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1212 
ipv4_default_advmss(const struct dst_entry * dst)1213 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1214 {
1215 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1216 
1217 	if (advmss == 0) {
1218 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1219 			       ip_rt_min_advmss);
1220 		if (advmss > 65535 - 40)
1221 			advmss = 65535 - 40;
1222 	}
1223 	return advmss;
1224 }
1225 
ipv4_mtu(const struct dst_entry * dst)1226 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1227 {
1228 	const struct rtable *rt = (const struct rtable *) dst;
1229 	unsigned int mtu = rt->rt_pmtu;
1230 
1231 	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1232 		mtu = dst_metric_raw(dst, RTAX_MTU);
1233 
1234 	if (mtu)
1235 		return mtu;
1236 
1237 	mtu = dst->dev->mtu;
1238 
1239 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1240 		if (rt->rt_uses_gateway && mtu > 576)
1241 			mtu = 576;
1242 	}
1243 
1244 	return min_t(unsigned int, mtu, IP_MAX_MTU);
1245 }
1246 
/*
 * Look up the nexthop exception recorded for @daddr on @nh.
 *
 * The exception table and its chains are read with rcu_dereference().
 * Returns the matching entry, or NULL when @nh has no exception table or
 * the hash chain holds no entry for @daddr.
 */
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *buckets = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *entry;

	if (!buckets)
		return NULL;

	entry = rcu_dereference(buckets[fnhe_hashfun(daddr)].chain);
	while (entry && entry->fnhe_daddr != daddr)
		entry = rcu_dereference(entry->fnhe_next);

	return entry;
}
1265 
rt_bind_exception(struct rtable * rt,struct fib_nh_exception * fnhe,__be32 daddr)1266 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1267 			      __be32 daddr)
1268 {
1269 	bool ret = false;
1270 
1271 	spin_lock_bh(&fnhe_lock);
1272 
1273 	if (daddr == fnhe->fnhe_daddr) {
1274 		struct rtable __rcu **porig;
1275 		struct rtable *orig;
1276 		int genid = fnhe_genid(dev_net(rt->dst.dev));
1277 
1278 		if (rt_is_input_route(rt))
1279 			porig = &fnhe->fnhe_rth_input;
1280 		else
1281 			porig = &fnhe->fnhe_rth_output;
1282 		orig = rcu_dereference(*porig);
1283 
1284 		if (fnhe->fnhe_genid != genid) {
1285 			fnhe->fnhe_genid = genid;
1286 			fnhe->fnhe_gw = 0;
1287 			fnhe->fnhe_pmtu = 0;
1288 			fnhe->fnhe_expires = 0;
1289 			fnhe_flush_routes(fnhe);
1290 			orig = NULL;
1291 		}
1292 		fill_route_from_fnhe(rt, fnhe);
1293 		if (!rt->rt_gateway)
1294 			rt->rt_gateway = daddr;
1295 
1296 		if (!(rt->dst.flags & DST_NOCACHE)) {
1297 			rcu_assign_pointer(*porig, rt);
1298 			if (orig)
1299 				rt_free(orig);
1300 			ret = true;
1301 		}
1302 
1303 		fnhe->fnhe_stamp = jiffies;
1304 	}
1305 	spin_unlock_bh(&fnhe_lock);
1306 
1307 	return ret;
1308 }
1309 
rt_cache_route(struct fib_nh * nh,struct rtable * rt)1310 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1311 {
1312 	struct rtable *orig, *prev, **p;
1313 	bool ret = true;
1314 
1315 	if (rt_is_input_route(rt)) {
1316 		p = (struct rtable **)&nh->nh_rth_input;
1317 	} else {
1318 		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1319 	}
1320 	orig = *p;
1321 
1322 	prev = cmpxchg(p, orig, rt);
1323 	if (prev == orig) {
1324 		if (orig)
1325 			rt_free(orig);
1326 	} else
1327 		ret = false;
1328 
1329 	return ret;
1330 }
1331 
/*
 * A list of routes together with the lock that guards it. Routes are
 * added by rt_add_uncached_list(), removed by ipv4_dst_destroy() and
 * walked by rt_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;	/* held (with BHs disabled) around every list operation */
	struct list_head	head;	/* routes, linked through rtable->rt_uncached */
};

/* One list per CPU; a route remembers its list in rt->rt_uncached_list. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1338 
rt_add_uncached_list(struct rtable * rt)1339 static void rt_add_uncached_list(struct rtable *rt)
1340 {
1341 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1342 
1343 	rt->rt_uncached_list = ul;
1344 
1345 	spin_lock_bh(&ul->lock);
1346 	list_add_tail(&rt->rt_uncached, &ul->head);
1347 	spin_unlock_bh(&ul->lock);
1348 }
1349 
ipv4_dst_destroy(struct dst_entry * dst)1350 static void ipv4_dst_destroy(struct dst_entry *dst)
1351 {
1352 	struct rtable *rt = (struct rtable *) dst;
1353 
1354 	if (!list_empty(&rt->rt_uncached)) {
1355 		struct uncached_list *ul = rt->rt_uncached_list;
1356 
1357 		spin_lock_bh(&ul->lock);
1358 		list_del(&rt->rt_uncached);
1359 		spin_unlock_bh(&ul->lock);
1360 	}
1361 }
1362 
rt_flush_dev(struct net_device * dev)1363 void rt_flush_dev(struct net_device *dev)
1364 {
1365 	struct net *net = dev_net(dev);
1366 	struct rtable *rt;
1367 	int cpu;
1368 
1369 	for_each_possible_cpu(cpu) {
1370 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1371 
1372 		spin_lock_bh(&ul->lock);
1373 		list_for_each_entry(rt, &ul->head, rt_uncached) {
1374 			if (rt->dst.dev != dev)
1375 				continue;
1376 			rt->dst.dev = net->loopback_dev;
1377 			dev_hold(rt->dst.dev);
1378 			dev_put(dev);
1379 		}
1380 		spin_unlock_bh(&ul->lock);
1381 	}
1382 }
1383 
rt_cache_valid(const struct rtable * rt)1384 static bool rt_cache_valid(const struct rtable *rt)
1385 {
1386 	return	rt &&
1387 		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1388 		!rt_is_expired(rt);
1389 }
1390 
/*
 * Fill in the nexthop-derived parts of a freshly allocated route and
 * place it in a cache slot or on the uncached list.
 *
 * @rt:    route being set up
 * @daddr: destination address of the flow
 * @res:   FIB lookup result; supplies the nexthop when @fi is set
 * @fnhe:  nexthop exception to bind to, or NULL
 * @fi:    fib_info to take gateway and metrics from, or NULL
 * @type:  route type (not used in this function body)
 * @itag:  class tag merged into the route with CONFIG_IP_ROUTE_CLASSID
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Only a link-scope nexthop with a gateway address makes
		 * this a gatewayed route.
		 */
		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		/* An exception takes precedence over the plain nexthop
		 * cache; a DST_NOCACHE route without an exception is not
		 * offered to either.
		 */
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			rt->dst.flags |= DST_NOCACHE;
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		/* No fib_info: the route always goes on the uncached list. */
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1434 
/*
 * Allocate an IPv4 route on @dev through dst_alloc(), with
 * DST_OBSOLETE_FORCE_CHK as the initial obsolete value.
 *
 * @nopolicy:   set DST_NOPOLICY
 * @noxfrm:     set DST_NOXFRM
 * @will_cache: when false, the route is flagged DST_HOST | DST_NOCACHE
 *
 * Returns whatever dst_alloc() returns; callers in this file check the
 * result for NULL.
 */
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	int dst_flags = 0;

	if (!will_cache)
		dst_flags |= DST_HOST | DST_NOCACHE;
	if (nopolicy)
		dst_flags |= DST_NOPOLICY;
	if (noxfrm)
		dst_flags |= DST_NOXFRM;

	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 dst_flags);
}
1443 
1444 /* called in rcu_read_lock() section */
ip_route_input_mc(struct sk_buff * skb,__be32 daddr,__be32 saddr,u8 tos,struct net_device * dev,int our)1445 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1446 				u8 tos, struct net_device *dev, int our)
1447 {
1448 	struct rtable *rth;
1449 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1450 	u32 itag = 0;
1451 	int err;
1452 
1453 	/* Primary sanity checks. */
1454 
1455 	if (!in_dev)
1456 		return -EINVAL;
1457 
1458 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1459 	    skb->protocol != htons(ETH_P_IP))
1460 		goto e_inval;
1461 
1462 	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1463 		if (ipv4_is_loopback(saddr))
1464 			goto e_inval;
1465 
1466 	if (ipv4_is_zeronet(saddr)) {
1467 		if (!ipv4_is_local_multicast(daddr))
1468 			goto e_inval;
1469 	} else {
1470 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1471 					  in_dev, &itag);
1472 		if (err < 0)
1473 			goto e_err;
1474 	}
1475 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1476 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1477 	if (!rth)
1478 		goto e_nobufs;
1479 
1480 #ifdef CONFIG_IP_ROUTE_CLASSID
1481 	rth->dst.tclassid = itag;
1482 #endif
1483 	rth->dst.output = ip_rt_bug;
1484 
1485 	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
1486 	rth->rt_flags	= RTCF_MULTICAST;
1487 	rth->rt_type	= RTN_MULTICAST;
1488 	rth->rt_is_input= 1;
1489 	rth->rt_iif	= 0;
1490 	rth->rt_pmtu	= 0;
1491 	rth->rt_gateway	= 0;
1492 	rth->rt_uses_gateway = 0;
1493 	INIT_LIST_HEAD(&rth->rt_uncached);
1494 	if (our) {
1495 		rth->dst.input= ip_local_deliver;
1496 		rth->rt_flags |= RTCF_LOCAL;
1497 	}
1498 
1499 #ifdef CONFIG_IP_MROUTE
1500 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1501 		rth->dst.input = ip_mr_input;
1502 #endif
1503 	RT_CACHE_STAT_INC(in_slow_mc);
1504 
1505 	skb_dst_set(skb, &rth->dst);
1506 	return 0;
1507 
1508 e_nobufs:
1509 	return -ENOBUFS;
1510 e_inval:
1511 	return -EINVAL;
1512 e_err:
1513 	return err;
1514 }
1515 
1516 
/*
 * Account for a packet with a martian source address and, with
 * CONFIG_IP_ROUTE_VERBOSE, log it.
 *
 * The statistics counter is always incremented. Logging additionally
 * requires martian logging to be enabled on @in_dev and is rate limited.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (!IN_DEV_LOG_MARTIANS(in_dev) || !net_ratelimit())
		return;

	pr_warn("martian source %pI4 from %pI4, on dev %s\n",
		&daddr, &saddr, dev->name);

	/*
	 *	RFC1812 recommendation: if the source is martian, the only
	 *	hint is the MAC header, so dump it when there is one.
	 */
	if (!dev->hard_header_len || !skb_mac_header_was_set(skb))
		return;

	print_hex_dump(KERN_WARNING, "ll header: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       skb_mac_header(skb),
		       dev->hard_header_len, true);
#endif
}
1541 
/*
 * Remove the nexthop exception recorded for @daddr on @nh, if there is
 * one.
 *
 * The hash chain is walked under fnhe_lock. A matching entry is unlinked,
 * its cached routes are flushed and the entry itself is freed after an
 * RCU grace period. Both callers in this file invoke this only after
 * find_exception() returned an entry for the same nexthop, so the
 * exception table is expected to exist.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* An RCU reader that obtained this entry from
			 * find_exception() before the unlink may still pass
			 * it to rt_bind_exception(). That function only
			 * binds while fnhe_daddr matches, and checks it
			 * under fnhe_lock, so clearing the address here
			 * keeps a new route from being cached in an entry
			 * that is about to be freed.
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1571 
/*
 * Attach an input route for a packet that is to be forwarded.
 * Called in an rcu_read_lock() section.
 *
 * @skb:    packet being routed; receives the dst
 * @res:    FIB lookup result selecting the nexthop
 * @in_dev: IPv4 state of the receiving device
 * @daddr, @saddr, @tos: addressing of the packet
 *
 * A valid cached route (from a nexthop exception or from the nexthop's
 * input slot) is reused without taking a reference; otherwise a new route
 * is allocated and set up. Returns 0 on success, -EINVAL when the output
 * device has no IPv4 state or a non-IP packet fails the same-interface
 * check below, -ENOBUFS when allocation fails, or the negative result of
 * fib_validate_source().
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	/* never modified in this function; rt_flags is therefore set to 0 */
	unsigned int flags = 0;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Caching needs a fib_info and no class tag from source validation. */
	do_cache = res->fi && !itag;
	/* Mark the packet for an ICMP redirect when it would leave through
	 * the interface it arrived on, source validation returned non-zero,
	 * and the redirect/shared-media/on-link conditions hold.
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy ARP. DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			/* An exception whose cached route has expired is
			 * deleted; the lookup then continues with the plain
			 * nexthop cache.
			 */
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		/* rth may be NULL here; rt_cache_valid() rejects that. */
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	/* Binds to the exception or the nexthop cache, or falls back to the
	 * uncached list.
	 */
	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1675 
/*
 * Create (or reuse) the input route for a forwarded packet.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a result whose fib_info has more than
 * one nexthop first has one of them selected by fib_select_multipath().
 * The route itself is produced by __mkroute_input(), whose return value
 * is passed through. @fl4 is not used here.
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	int err;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi != NULL && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
	return err;
}
1690 
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must already have the
 *	correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Called with rcu_read_lock() held.
 *
 *	Resolves the input route for a non-multicast destination and
 *	attaches it to @skb. Returns 0 on success, or a negative errno:
 *	-EINVAL (IP disabled on @dev, martian addresses, non-IP broadcast),
 *	-ENOBUFS (route allocation failed), or the error produced by
 *	fib_validate_source() / ip_mkroute_input(). A failed FIB lookup
 *	does not fail this function: an RTN_UNREACHABLE route that delivers
 *	to ip_error is attached instead.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	/* The broadcast path below is entered without a FIB lookup, so
	 * res.fi must be defined before the jump.
	 */
	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it once if daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		/* With forwarding disabled the lookup error is replaced by
		 * -EHOSTUNREACH; err is later stored (negated) in dst.error.
		 */
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	/* Everything below this point is forwarding. */
	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	/* Broadcasts are accepted for IP packets only. */
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	/* Shared tail for local delivery, broadcast and "no route". The
	 * nexthop's cached input route is reused only when there is a
	 * fib_info and source validation produced no class tag.
	 */
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid_ipv4(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	RT_CACHE_STAT_INC(in_slow_tot);
	if (res.type == RTN_UNREACHABLE) {
		/* Reached via no_route: err still holds the negative lookup
		 * error, so dst.error receives its positive counterpart.
		 */
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	if (do_cache) {
		/* Losing the race for the cache slot demotes the route to
		 * the uncached list.
		 */
		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
			rth->dst.flags |= DST_NOCACHE;
			rt_add_uncached_list(rth);
		}
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	res.fi = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

	/* martian_destination falls through to here. */
e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
	/* Entered directly when err already holds the validation error. */
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
1884 
/*
 * Resolve the input route for @skb and attach it to the buffer. The whole
 * lookup runs inside an rcu_read_lock() section.
 *
 * Returns 0 on success or a negative errno. A multicast destination that
 * is neither joined locally nor (with CONFIG_IP_MROUTE) subject to
 * multicast forwarding yields -EINVAL, as does a multicast destination
 * on a device without IPv4 state.
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	if (!ipv4_is_multicast(daddr)) {
		res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	} else {
		/* Multicast recognition logic lives here rather than in the
		   route cache. Too many Ethernet cards have broken or
		   missing hardware multicast filters :-( so a host on a
		   multicast network used to acquire lots of useless route
		   cache entries, e.g. SDR messages from all over the world.
		   Provided the software IP multicast filter is organized
		   reasonably (at least hashed), checking here is no slower
		   than route cache reject entries were.
		   Multicast routers are not affected, because a route
		   cache entry is created eventually.
		 */
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		res = -EINVAL;
		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);

			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   )
				res = ip_route_input_mc(skb, daddr, saddr,
							tos, dev, our);
		}
	}

	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
1930 
/*
 * Produce the output route for flow @fl4 on device @dev_out.
 * Called with rcu_read_lock() held.
 *
 * @res:      FIB lookup result (nexthop, type, prefix length)
 * @fl4:      flow being routed
 * @orig_oif: output interface originally requested; stored in rt_iif
 * @dev_out:  device the route leaves through
 * @flags:    initial RTCF_* flags; more are added depending on the type
 *
 * Returns a cached route with an extra reference when a valid one exists,
 * otherwise a newly allocated route. Errors are returned as ERR_PTR():
 * -EINVAL when @dev_out has no IPv4 state, when a loopback source would
 * leave through a non-loopback device (unless route_localnet is enabled),
 * or when the destination is in the zero network; -ENOBUFS when
 * allocation fails.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	/* The destination address overrides the type from the FIB result. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Groups this host has joined keep RTCF_LOCAL and are not
		 * cached; for all others the local flag is dropped again.
		 */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not use a gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	/* Without a fib_info there is no nexthop to cache the route in. */
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			/* An exception whose cached route has expired is
			 * deleted; the lookup then continues with the plain
			 * per-CPU nexthop cache.
			 */
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		/* With FLOWI_FLAG_KNOWN_NH set and a nexthop that is not a
		 * link-scope gateway, skip the cache and build an uncached
		 * route.
		 */
		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		/* rth may be NULL here; rt_cache_valid() rejects that. */
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}

add:
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	rth->rt_uses_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		/* Locally relevant broadcast/multicast leaving through a
		 * real device is sent via ip_mc_output.
		 */
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	/* Binds to the exception or the nexthop cache, or falls back to the
	 * uncached list.
	 */
	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}
2062 
2063 /*
2064  * Major route resolver routine.
2065  */
2066 
__ip_route_output_key(struct net * net,struct flowi4 * fl4)2067 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2068 {
2069 	struct net_device *dev_out = NULL;
2070 	__u8 tos = RT_FL_TOS(fl4);
2071 	unsigned int flags = 0;
2072 	struct fib_result res;
2073 	struct rtable *rth;
2074 	int orig_oif;
2075 
2076 	res.tclassid	= 0;
2077 	res.fi		= NULL;
2078 	res.table	= NULL;
2079 
2080 	orig_oif = fl4->flowi4_oif;
2081 
2082 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2083 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2084 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2085 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2086 
2087 	rcu_read_lock();
2088 	if (fl4->saddr) {
2089 		rth = ERR_PTR(-EINVAL);
2090 		if (ipv4_is_multicast(fl4->saddr) ||
2091 		    ipv4_is_lbcast(fl4->saddr) ||
2092 		    ipv4_is_zeronet(fl4->saddr))
2093 			goto out;
2094 
2095 		/* I removed check for oif == dev_out->oif here.
2096 		   It was wrong for two reasons:
2097 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2098 		      is assigned to multiple interfaces.
2099 		   2. Moreover, we are allowed to send packets with saddr
2100 		      of another iface. --ANK
2101 		 */
2102 
2103 		if (fl4->flowi4_oif == 0 &&
2104 		    (ipv4_is_multicast(fl4->daddr) ||
2105 		     ipv4_is_lbcast(fl4->daddr))) {
2106 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2107 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2108 			if (!dev_out)
2109 				goto out;
2110 
2111 			/* Special hack: user can direct multicasts
2112 			   and limited broadcast via necessary interface
2113 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2114 			   This hack is not just for fun, it allows
2115 			   vic,vat and friends to work.
2116 			   They bind socket to loopback, set ttl to zero
2117 			   and expect that it will work.
2118 			   From the viewpoint of routing cache they are broken,
2119 			   because we are not allowed to build multicast path
2120 			   with loopback source addr (look, routing cache
2121 			   cannot know, that ttl is zero, so that packet
2122 			   will not leave this host and route is valid).
2123 			   Luckily, this hack is good workaround.
2124 			 */
2125 
2126 			fl4->flowi4_oif = dev_out->ifindex;
2127 			goto make_route;
2128 		}
2129 
2130 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2131 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2132 			if (!__ip_dev_find(net, fl4->saddr, false))
2133 				goto out;
2134 		}
2135 	}
2136 
2137 
2138 	if (fl4->flowi4_oif) {
2139 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2140 		rth = ERR_PTR(-ENODEV);
2141 		if (!dev_out)
2142 			goto out;
2143 
2144 		/* RACE: Check return value of inet_select_addr instead. */
2145 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2146 			rth = ERR_PTR(-ENETUNREACH);
2147 			goto out;
2148 		}
2149 		if (ipv4_is_local_multicast(fl4->daddr) ||
2150 		    ipv4_is_lbcast(fl4->daddr)) {
2151 			if (!fl4->saddr)
2152 				fl4->saddr = inet_select_addr(dev_out, 0,
2153 							      RT_SCOPE_LINK);
2154 			goto make_route;
2155 		}
2156 		if (!fl4->saddr) {
2157 			if (ipv4_is_multicast(fl4->daddr))
2158 				fl4->saddr = inet_select_addr(dev_out, 0,
2159 							      fl4->flowi4_scope);
2160 			else if (!fl4->daddr)
2161 				fl4->saddr = inet_select_addr(dev_out, 0,
2162 							      RT_SCOPE_HOST);
2163 		}
2164 	}
2165 
2166 	if (!fl4->daddr) {
2167 		fl4->daddr = fl4->saddr;
2168 		if (!fl4->daddr)
2169 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2170 		dev_out = net->loopback_dev;
2171 		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2172 		res.type = RTN_LOCAL;
2173 		flags |= RTCF_LOCAL;
2174 		goto make_route;
2175 	}
2176 
2177 	if (fib_lookup(net, fl4, &res)) {
2178 		res.fi = NULL;
2179 		res.table = NULL;
2180 		if (fl4->flowi4_oif) {
2181 			/* Apparently, routing tables are wrong. Assume,
2182 			   that the destination is on link.
2183 
2184 			   WHY? DW.
2185 			   Because we are allowed to send to iface
2186 			   even if it has NO routes and NO assigned
2187 			   addresses. When oif is specified, routing
2188 			   tables are looked up with only one purpose:
2189 			   to catch if destination is gatewayed, rather than
2190 			   direct. Moreover, if MSG_DONTROUTE is set,
2191 			   we send packet, ignoring both routing tables
2192 			   and ifaddr state. --ANK
2193 
2194 
2195 			   We could make it even if oif is unknown,
2196 			   likely IPv6, but we do not.
2197 			 */
2198 
2199 			if (fl4->saddr == 0)
2200 				fl4->saddr = inet_select_addr(dev_out, 0,
2201 							      RT_SCOPE_LINK);
2202 			res.type = RTN_UNICAST;
2203 			goto make_route;
2204 		}
2205 		rth = ERR_PTR(-ENETUNREACH);
2206 		goto out;
2207 	}
2208 
2209 	if (res.type == RTN_LOCAL) {
2210 		if (!fl4->saddr) {
2211 			if (res.fi->fib_prefsrc)
2212 				fl4->saddr = res.fi->fib_prefsrc;
2213 			else
2214 				fl4->saddr = fl4->daddr;
2215 		}
2216 		dev_out = net->loopback_dev;
2217 		fl4->flowi4_oif = dev_out->ifindex;
2218 		flags |= RTCF_LOCAL;
2219 		goto make_route;
2220 	}
2221 
2222 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2223 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2224 		fib_select_multipath(&res);
2225 	else
2226 #endif
2227 	if (!res.prefixlen &&
2228 	    res.table->tb_num_default > 1 &&
2229 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2230 		fib_select_default(&res);
2231 
2232 	if (!fl4->saddr)
2233 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2234 
2235 	dev_out = FIB_RES_DEV(res);
2236 	fl4->flowi4_oif = dev_out->ifindex;
2237 
2238 
2239 make_route:
2240 	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2241 
2242 out:
2243 	rcu_read_unlock();
2244 	return rth;
2245 }
2246 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2247 
/* dst_ops ->check hook for blackhole routes: returns NULL unconditionally,
 * whatever @dst and @cookie are.
 */
static struct dst_entry *
ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2252 
ipv4_blackhole_mtu(const struct dst_entry * dst)2253 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2254 {
2255 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2256 
2257 	return mtu ? : dst->dev->mtu;
2258 }
2259 
/* dst_ops ->update_pmtu hook for blackhole routes: intentionally empty. */
static void
ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
}
2264 
/* dst_ops ->redirect hook for blackhole routes: intentionally empty. */
static void
ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb)
{
}
2269 
ipv4_rt_blackhole_cow_metrics(struct dst_entry * dst,unsigned long old)2270 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2271 					  unsigned long old)
2272 {
2273 	return NULL;
2274 }
2275 
2276 static struct dst_ops ipv4_dst_blackhole_ops = {
2277 	.family			=	AF_INET,
2278 	.check			=	ipv4_blackhole_dst_check,
2279 	.mtu			=	ipv4_blackhole_mtu,
2280 	.default_advmss		=	ipv4_default_advmss,
2281 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2282 	.redirect		=	ipv4_rt_blackhole_redirect,
2283 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2284 	.neigh_lookup		=	ipv4_neigh_lookup,
2285 };
2286 
ipv4_blackhole_route(struct net * net,struct dst_entry * dst_orig)2287 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2288 {
2289 	struct rtable *ort = (struct rtable *) dst_orig;
2290 	struct rtable *rt;
2291 
2292 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2293 	if (rt) {
2294 		struct dst_entry *new = &rt->dst;
2295 
2296 		new->__use = 1;
2297 		new->input = dst_discard;
2298 		new->output = dst_discard_sk;
2299 
2300 		new->dev = ort->dst.dev;
2301 		if (new->dev)
2302 			dev_hold(new->dev);
2303 
2304 		rt->rt_is_input = ort->rt_is_input;
2305 		rt->rt_iif = ort->rt_iif;
2306 		rt->rt_pmtu = ort->rt_pmtu;
2307 
2308 		rt->rt_genid = rt_genid_ipv4(net);
2309 		rt->rt_flags = ort->rt_flags;
2310 		rt->rt_type = ort->rt_type;
2311 		rt->rt_gateway = ort->rt_gateway;
2312 		rt->rt_uses_gateway = ort->rt_uses_gateway;
2313 
2314 		INIT_LIST_HEAD(&rt->rt_uncached);
2315 
2316 		dst_free(new);
2317 	}
2318 
2319 	dst_release(dst_orig);
2320 
2321 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2322 }
2323 
ip_route_output_flow(struct net * net,struct flowi4 * flp4,struct sock * sk)2324 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2325 				    struct sock *sk)
2326 {
2327 	struct rtable *rt = __ip_route_output_key(net, flp4);
2328 
2329 	if (IS_ERR(rt))
2330 		return rt;
2331 
2332 	if (flp4->flowi4_proto)
2333 		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2334 							flowi4_to_flowi(flp4),
2335 							sk, 0);
2336 
2337 	return rt;
2338 }
2339 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2340 
/*
 * Append a route netlink message describing the route attached to @skb
 * (skb_rtable(skb)) to @skb itself.
 *
 * @dst/@src are the addresses reported as RTA_DST/RTA_SRC, @fl4 supplies
 * tos, mark and the preferred source, and @portid, @seq, @event and @flags
 * go into the netlink header.  @nowait is forwarded to ipmr_get_route().
 *
 * Returns 0 on success.  On any attribute failure the partially built
 * message is cancelled and -EMSGSIZE is returned.
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	unsigned long remaining;
	u32 metrics[RTAX_MAX];
	u32 error;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Fixed header first; the attributes follow in a fixed order. */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = 32;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = fl4->flowi4_tos;
	rtm->rtm_table = RT_TABLE_MAIN;
	rtm->rtm_type = rt->rt_type;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = RTPROT_UNSPEC;
	rtm->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		rtm->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		rtm->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;

	if (src) {
		rtm->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}

	if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif

	/* Report the chosen source only for output routes, and only when it
	 * differs from the source that was asked for.
	 */
	if (!rt_is_input_route(rt) && fl4->saddr != src &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
		goto nla_put_failure;

	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Turn the absolute expiry into "jiffies left", clamped at zero. */
	remaining = rt->dst.expires;
	if (remaining) {
		unsigned long now = jiffies;

		remaining = time_before(now, remaining) ? remaining - now : 0;
	}

	/* Work on a private copy of the metrics so a still-valid learned
	 * PMTU can override the MTU slot.
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && remaining)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
		/* RTA_IIF is emitted unless the multicast-routing branch
		 * below takes over.
		 */
		bool put_iif = true;

#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 rtm, nowait);

			put_iif = false;
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
				if (err == -EMSGSIZE)
					goto nla_put_failure;
				error = err;
			}
		}
#endif
		if (put_iif && nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, remaining, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2453 
inet_rtm_getroute(struct sk_buff * in_skb,struct nlmsghdr * nlh)2454 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2455 {
2456 	struct net *net = sock_net(in_skb->sk);
2457 	struct rtmsg *rtm;
2458 	struct nlattr *tb[RTA_MAX+1];
2459 	struct rtable *rt = NULL;
2460 	struct flowi4 fl4;
2461 	__be32 dst = 0;
2462 	__be32 src = 0;
2463 	u32 iif;
2464 	int err;
2465 	int mark;
2466 	struct sk_buff *skb;
2467 
2468 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2469 	if (err < 0)
2470 		goto errout;
2471 
2472 	rtm = nlmsg_data(nlh);
2473 
2474 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2475 	if (!skb) {
2476 		err = -ENOBUFS;
2477 		goto errout;
2478 	}
2479 
2480 	/* Reserve room for dummy headers, this skb can pass
2481 	   through good chunk of routing engine.
2482 	 */
2483 	skb_reset_mac_header(skb);
2484 	skb_reset_network_header(skb);
2485 
2486 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2487 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2488 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2489 
2490 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2491 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2492 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2493 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2494 
2495 	memset(&fl4, 0, sizeof(fl4));
2496 	fl4.daddr = dst;
2497 	fl4.saddr = src;
2498 	fl4.flowi4_tos = rtm->rtm_tos;
2499 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2500 	fl4.flowi4_mark = mark;
2501 
2502 	if (iif) {
2503 		struct net_device *dev;
2504 
2505 		dev = __dev_get_by_index(net, iif);
2506 		if (!dev) {
2507 			err = -ENODEV;
2508 			goto errout_free;
2509 		}
2510 
2511 		skb->protocol	= htons(ETH_P_IP);
2512 		skb->dev	= dev;
2513 		skb->mark	= mark;
2514 		local_bh_disable();
2515 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2516 		local_bh_enable();
2517 
2518 		rt = skb_rtable(skb);
2519 		if (err == 0 && rt->dst.error)
2520 			err = -rt->dst.error;
2521 	} else {
2522 		rt = ip_route_output_key(net, &fl4);
2523 
2524 		err = 0;
2525 		if (IS_ERR(rt))
2526 			err = PTR_ERR(rt);
2527 	}
2528 
2529 	if (err)
2530 		goto errout_free;
2531 
2532 	skb_dst_set(skb, &rt->dst);
2533 	if (rtm->rtm_flags & RTM_F_NOTIFY)
2534 		rt->rt_flags |= RTCF_NOTIFY;
2535 
2536 	err = rt_fill_info(net, dst, src, &fl4, skb,
2537 			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2538 			   RTM_NEWROUTE, 0, 0);
2539 	if (err < 0)
2540 		goto errout_free;
2541 
2542 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2543 errout:
2544 	return err;
2545 
2546 errout_free:
2547 	kfree_skb(skb);
2548 	goto errout;
2549 }
2550 
ip_rt_multicast_event(struct in_device * in_dev)2551 void ip_rt_multicast_event(struct in_device *in_dev)
2552 {
2553 	rt_cache_flush(dev_net(in_dev->dev));
2554 }
2555 
2556 #ifdef CONFIG_SYSCTL
2557 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2558 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2559 static int ip_rt_gc_elasticity __read_mostly	= 8;
2560 
/*
 * proc handler for the "flush" entry of ipv4_route_flush_table.
 *
 * Any write flushes the route cache and bumps the fnhe generation id of
 * the namespace stored in @__ctl->extra1, then returns 0.  Reads are
 * rejected with -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net;

	if (!write)
		return -EINVAL;

	net = (struct net *)__ctl->extra1;
	rt_cache_flush(net);
	fnhe_genid_bump(net);

	return 0;
}
2575 
2576 static struct ctl_table ipv4_route_table[] = {
2577 	{
2578 		.procname	= "gc_thresh",
2579 		.data		= &ipv4_dst_ops.gc_thresh,
2580 		.maxlen		= sizeof(int),
2581 		.mode		= 0644,
2582 		.proc_handler	= proc_dointvec,
2583 	},
2584 	{
2585 		.procname	= "max_size",
2586 		.data		= &ip_rt_max_size,
2587 		.maxlen		= sizeof(int),
2588 		.mode		= 0644,
2589 		.proc_handler	= proc_dointvec,
2590 	},
2591 	{
2592 		/*  Deprecated. Use gc_min_interval_ms */
2593 
2594 		.procname	= "gc_min_interval",
2595 		.data		= &ip_rt_gc_min_interval,
2596 		.maxlen		= sizeof(int),
2597 		.mode		= 0644,
2598 		.proc_handler	= proc_dointvec_jiffies,
2599 	},
2600 	{
2601 		.procname	= "gc_min_interval_ms",
2602 		.data		= &ip_rt_gc_min_interval,
2603 		.maxlen		= sizeof(int),
2604 		.mode		= 0644,
2605 		.proc_handler	= proc_dointvec_ms_jiffies,
2606 	},
2607 	{
2608 		.procname	= "gc_timeout",
2609 		.data		= &ip_rt_gc_timeout,
2610 		.maxlen		= sizeof(int),
2611 		.mode		= 0644,
2612 		.proc_handler	= proc_dointvec_jiffies,
2613 	},
2614 	{
2615 		.procname	= "gc_interval",
2616 		.data		= &ip_rt_gc_interval,
2617 		.maxlen		= sizeof(int),
2618 		.mode		= 0644,
2619 		.proc_handler	= proc_dointvec_jiffies,
2620 	},
2621 	{
2622 		.procname	= "redirect_load",
2623 		.data		= &ip_rt_redirect_load,
2624 		.maxlen		= sizeof(int),
2625 		.mode		= 0644,
2626 		.proc_handler	= proc_dointvec,
2627 	},
2628 	{
2629 		.procname	= "redirect_number",
2630 		.data		= &ip_rt_redirect_number,
2631 		.maxlen		= sizeof(int),
2632 		.mode		= 0644,
2633 		.proc_handler	= proc_dointvec,
2634 	},
2635 	{
2636 		.procname	= "redirect_silence",
2637 		.data		= &ip_rt_redirect_silence,
2638 		.maxlen		= sizeof(int),
2639 		.mode		= 0644,
2640 		.proc_handler	= proc_dointvec,
2641 	},
2642 	{
2643 		.procname	= "error_cost",
2644 		.data		= &ip_rt_error_cost,
2645 		.maxlen		= sizeof(int),
2646 		.mode		= 0644,
2647 		.proc_handler	= proc_dointvec,
2648 	},
2649 	{
2650 		.procname	= "error_burst",
2651 		.data		= &ip_rt_error_burst,
2652 		.maxlen		= sizeof(int),
2653 		.mode		= 0644,
2654 		.proc_handler	= proc_dointvec,
2655 	},
2656 	{
2657 		.procname	= "gc_elasticity",
2658 		.data		= &ip_rt_gc_elasticity,
2659 		.maxlen		= sizeof(int),
2660 		.mode		= 0644,
2661 		.proc_handler	= proc_dointvec,
2662 	},
2663 	{
2664 		.procname	= "mtu_expires",
2665 		.data		= &ip_rt_mtu_expires,
2666 		.maxlen		= sizeof(int),
2667 		.mode		= 0644,
2668 		.proc_handler	= proc_dointvec_jiffies,
2669 	},
2670 	{
2671 		.procname	= "min_pmtu",
2672 		.data		= &ip_rt_min_pmtu,
2673 		.maxlen		= sizeof(int),
2674 		.mode		= 0644,
2675 		.proc_handler	= proc_dointvec,
2676 	},
2677 	{
2678 		.procname	= "min_adv_mss",
2679 		.data		= &ip_rt_min_advmss,
2680 		.maxlen		= sizeof(int),
2681 		.mode		= 0644,
2682 		.proc_handler	= proc_dointvec,
2683 	},
2684 	{ }
2685 };
2686 
2687 static struct ctl_table ipv4_route_flush_table[] = {
2688 	{
2689 		.procname	= "flush",
2690 		.maxlen		= sizeof(int),
2691 		.mode		= 0200,
2692 		.proc_handler	= ipv4_sysctl_rtcache_flush,
2693 	},
2694 	{ },
2695 };
2696 
sysctl_route_net_init(struct net * net)2697 static __net_init int sysctl_route_net_init(struct net *net)
2698 {
2699 	struct ctl_table *tbl;
2700 
2701 	tbl = ipv4_route_flush_table;
2702 	if (!net_eq(net, &init_net)) {
2703 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2704 		if (!tbl)
2705 			goto err_dup;
2706 
2707 		/* Don't export sysctls to unprivileged users */
2708 		if (net->user_ns != &init_user_ns)
2709 			tbl[0].procname = NULL;
2710 	}
2711 	tbl[0].extra1 = net;
2712 
2713 	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2714 	if (!net->ipv4.route_hdr)
2715 		goto err_reg;
2716 	return 0;
2717 
2718 err_reg:
2719 	if (tbl != ipv4_route_flush_table)
2720 		kfree(tbl);
2721 err_dup:
2722 	return -ENOMEM;
2723 }
2724 
sysctl_route_net_exit(struct net * net)2725 static __net_exit void sysctl_route_net_exit(struct net *net)
2726 {
2727 	struct ctl_table *tbl;
2728 
2729 	tbl = net->ipv4.route_hdr->ctl_table_arg;
2730 	unregister_net_sysctl_table(net->ipv4.route_hdr);
2731 	BUG_ON(tbl == ipv4_route_flush_table);
2732 	kfree(tbl);
2733 }
2734 
2735 static __net_initdata struct pernet_operations sysctl_route_ops = {
2736 	.init = sysctl_route_net_init,
2737 	.exit = sysctl_route_net_exit,
2738 };
2739 #endif
2740 
rt_genid_init(struct net * net)2741 static __net_init int rt_genid_init(struct net *net)
2742 {
2743 	atomic_set(&net->ipv4.rt_genid, 0);
2744 	atomic_set(&net->fnhe_genid, 0);
2745 	get_random_bytes(&net->ipv4.dev_addr_genid,
2746 			 sizeof(net->ipv4.dev_addr_genid));
2747 	return 0;
2748 }
2749 
2750 static __net_initdata struct pernet_operations rt_genid_ops = {
2751 	.init = rt_genid_init,
2752 };
2753 
ipv4_inetpeer_init(struct net * net)2754 static int __net_init ipv4_inetpeer_init(struct net *net)
2755 {
2756 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2757 
2758 	if (!bp)
2759 		return -ENOMEM;
2760 	inet_peer_base_init(bp);
2761 	net->ipv4.peers = bp;
2762 	return 0;
2763 }
2764 
ipv4_inetpeer_exit(struct net * net)2765 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2766 {
2767 	struct inet_peer_base *bp = net->ipv4.peers;
2768 
2769 	net->ipv4.peers = NULL;
2770 	inetpeer_invalidate_tree(bp);
2771 	kfree(bp);
2772 }
2773 
2774 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2775 	.init	=	ipv4_inetpeer_init,
2776 	.exit	=	ipv4_inetpeer_exit,
2777 };
2778 
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area, allocated in ip_rt_init() as 256 entries. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2782 
ip_rt_init(void)2783 int __init ip_rt_init(void)
2784 {
2785 	int rc = 0;
2786 	int cpu;
2787 
2788 	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2789 	if (!ip_idents)
2790 		panic("IP: failed to allocate ip_idents\n");
2791 
2792 	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2793 
2794 	for_each_possible_cpu(cpu) {
2795 		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2796 
2797 		INIT_LIST_HEAD(&ul->head);
2798 		spin_lock_init(&ul->lock);
2799 	}
2800 #ifdef CONFIG_IP_ROUTE_CLASSID
2801 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2802 	if (!ip_rt_acct)
2803 		panic("IP: failed to allocate ip_rt_acct\n");
2804 #endif
2805 
2806 	ipv4_dst_ops.kmem_cachep =
2807 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2808 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2809 
2810 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2811 
2812 	if (dst_entries_init(&ipv4_dst_ops) < 0)
2813 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2814 
2815 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2816 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2817 
2818 	ipv4_dst_ops.gc_thresh = ~0;
2819 	ip_rt_max_size = INT_MAX;
2820 
2821 	devinet_init();
2822 	ip_fib_init();
2823 
2824 	if (ip_rt_proc_init())
2825 		pr_err("Unable to create route proc files\n");
2826 #ifdef CONFIG_XFRM
2827 	xfrm_init();
2828 	xfrm4_init();
2829 #endif
2830 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2831 
2832 #ifdef CONFIG_SYSCTL
2833 	register_pernet_subsys(&sysctl_route_ops);
2834 #endif
2835 	register_pernet_subsys(&rt_genid_ops);
2836 	register_pernet_subsys(&ipv4_inetpeer_ops);
2837 	return rc;
2838 }
2839 
2840 #ifdef CONFIG_SYSCTL
2841 /*
2842  * We really need to sanitize the damn ipv4 init order, then all
2843  * this nonsense will go away.
2844  */
ip_static_sysctl_init(void)2845 void __init ip_static_sysctl_init(void)
2846 {
2847 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2848 }
2849 #endif
2850