/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

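/* Hash a tunnel's (key, remote address) pair into one of the
 * 1 << IP_TNL_HASH_BITS buckets; keyless tunnels simply hash with key == 0.
 */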
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

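/* Publish a new cached route in one per-cpu slot: dst_clone() takes a
 * reference before xchg() swaps the pointer in, and the old entry's
 * reference is dropped afterwards, so a concurrent reader always sees
 * either the old or the new dst, never a freed one.
 */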
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
				    struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

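/* Fetch this CPU's cached route, taking a reference only if the dst is
 * still live (atomic_inc_not_zero) and still valid according to
 * dst->ops->check().  A stale entry is dropped and the cache reset, so
 * the caller falls back to a fresh route lookup.
 */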
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

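/* A keyed tunnel matches only packets that carry the same key; a
 * keyless tunnel matches only packets that carry no key at all.
 */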
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.

   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
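/* Candidates are tried in order of decreasing specificity: exact
 * (saddr, daddr) match first, then daddr-only, then local/multicast
 * address, then key-only wildcard tunnels.  Within each pass an exact
 * parms.link match wins immediately; otherwise the first hit is kept
 * as a candidate in case no later pass does better.  If nothing
 * matches, the collect_md tunnel or the fallback device (if up) is
 * returned.
 */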
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

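/* Pick the hash bucket for a tunnel's parms.  Multicast and wildcard
 * destinations hash as remote == 0, and VTI tunnels without TUNNEL_KEY
 * ignore i_key, mirroring the lookup passes above.
 */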
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

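/* Allocate and register a tunnel netdevice.  When no explicit name is
 * given, a "<kind>%d" template (e.g. "gre%d") lets the core pick the
 * next free index; ops->kind must leave room for "%d" plus the
 * terminating NUL within IFNAMSIZ.
 */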
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

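/* Guess the underlying output device of a point-to-point tunnel by
 * routing toward its fixed destination (falling back to parms.link),
 * then derive needed_headroom and an MTU from that device.  The
 * returned MTU is never below the 68-byte IPv4 minimum.
 */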
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

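/* Common receive path once the encapsulation header has been parsed
 * into @tpi: validate TUNNEL_CSUM/TUNNEL_SEQ expectations against the
 * tunnel's flags, decapsulate ECN, bump per-cpu stats, scrub the skb
 * on a netns crossing and hand it to the GRO cell.  Consumes the skb
 * and returns 0 in all cases.
 */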
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

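/* Optional outer encapsulations (e.g. FOU/GUE) plug into the
 * iptun_encaps[] array below via ip_tunnel_encap_add_ops() and
 * ip_tunnel_encap_del_ops(); the array is consulted under RCU on both
 * the header-size (ip_encap_hlen) and header-build (ip_tunnel_encap)
 * paths.
 */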
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

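/* Check the inner packet against the path MTU of the outer route and,
 * when it does not fit, feed the reduced MTU back to the sender: an
 * ICMP "fragmentation needed" for IPv4 with DF set, or an ICMPv6
 * "packet too big" (never below IPV6_MIN_MTU) for IPv6.
 */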
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

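/* Common transmit path.  For an NBMA tunnel (daddr == 0) the outer
 * destination is recovered from the inner route or neighbour entry;
 * otherwise the per-cpu cached route is tried before a full
 * ip_route_output_key().  After PMTU handling, TOS/TTL/DF inheritance
 * and headroom expansion, the packet is handed to iptunnel_xmit().
 */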
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

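/* Legacy ioctl-based configuration (SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL, SIOCDELTUNNEL).  Add/change/delete require
 * CAP_NET_ADMIN in the tunnel's user namespace.  A rough userspace
 * sketch, illustrative only (error handling omitted, issued on an
 * AF_INET socket fd against the fallback device):
 *
 *	struct ip_tunnel_parm p = { ... };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);
 */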
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

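/* 0xFFF8 is the largest 8-byte-aligned value that fits the 16-bit IPv4
 * total-length field (fragment offsets count in 8-byte units);
 * subtracting the link-layer and tunnel headers gives the biggest MTU
 * this tunnel can carry.  Non-strict callers are clamped to that bound
 * instead of being rejected.
 */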
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

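/* Netlink (rtnl) counterpart of SIOCADDTUNNEL: reject duplicates (only
 * one collect_md tunnel is allowed per netns), register the device,
 * then compute a default MTU from the underlay device, applied unless
 * IFLA_MTU was given explicitly.
 */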
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);


int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");