/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

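/* Each tunnel keeps a per-CPU cache of its output route. A new dst is
 * published with xchg() so concurrent readers never see a torn update,
 * and the reference on the old dst is dropped afterwards.
 */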
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

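/* Fetch this CPU's cached route, revalidating it with the dst's
 * ->check() hook; a stale entry is reset so the caller falls back to
 * a fresh route lookup.
 */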
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

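/* A tunnel configured with TUNNEL_KEY matches only packets carrying
 * the same key; a keyless tunnel matches only keyless packets.
 */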
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

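	/* Pass 1: fully specified tunnels, exact match on both
	 * source and destination address.
	 */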
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

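	/* Pass 2: destination address matches, tunnel has no local
	 * (source) address configured.
	 */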
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

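	/* Passes 3 and 4 use the key-only hash computed above.
	 * Pass 3: tunnels bound only to a local address, or multicast
	 * tunnels whose group address matches.
	 */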
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

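	/* Pass 4: wildcard tunnels with neither address set, matched
	 * on the key alone.
	 */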
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

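/* Hash bucket for a tunnel's own configuration. Unlike lookup, this
 * hashes the tunnel's remote (unicast) address; the input key is
 * ignored for VTI tunnels that do not use TUNNEL_KEY.
 */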
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

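/* Allocate and register a tunnel net_device under RTNL. The interface
 * name comes from parms, or from ops->kind plus a "%d" template that
 * the core expands to the first free index (e.g. "gre0").
 */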
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

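/* Route towards the tunnel destination to guess the underlying device,
 * then derive needed_headroom and a usable MTU from it. Returns the
 * MTU the tunnel device should advertise.
 */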
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

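/* Common receive path for already-decapsulated packets: check the
 * csum/seq flags against the tunnel configuration, undo ECN mangling,
 * bump per-CPU stats and hand the skb to the device's GRO cell.
 */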
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

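/* Registry of optional outer encapsulations (e.g. FOU/GUE) that wrap
 * the tunnel packet in an extra UDP header. Slots are claimed and
 * released locklessly with cmpxchg().
 */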
const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

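/* Enforce path MTU across the tunnel: propagate the reduced MTU to the
 * inner dst, and for oversized non-GSO packets send ICMP fragmentation
 * needed (IPv4 with DF set) or ICMPv6 packet-too-big back to the sender.
 */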
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

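/* Generic transmit path: resolve the outer destination (for NBMA
 * tunnels it is derived from the inner headers), find or reuse a
 * cached route, apply TOS/TTL/DF inheritance and PMTU handling, then
 * hand the packet to iptunnel_xmit() for encapsulation.
 */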
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

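/* Apply new parameters to an existing tunnel under RTNL. The tunnel is
 * unhashed and rehashed because the addresses and key determine its
 * bucket.
 */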
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

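/* 68 is the minimum IPv4 MTU (RFC 791); the upper bound keeps the
 * tunnel headers plus payload within the 16-bit IP total length.
 */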
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

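/* ndo_init callback: allocate the per-CPU stats, the per-CPU dst cache
 * and the GRO cells, and prefill the outer IPv4 header template.
 */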
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the ip_tunnel_init callback */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");
