/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
                             struct dst_entry *dst, __be32 saddr)
{
        struct dst_entry *old_dst;

        dst_clone(dst);
        old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
        dst_release(old_dst);
        idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
                                    struct dst_entry *dst, __be32 saddr)
{
        __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
        tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
        int i;

        for_each_possible_cpu(i)
                __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
                                        u32 cookie, __be32 *saddr)
{
        struct ip_tunnel_dst *idst;
        struct dst_entry *dst;

        rcu_read_lock();
        idst = raw_cpu_ptr(t->dst_cache);
        dst = rcu_dereference(idst->dst);
        if (dst && !atomic_inc_not_zero(&dst->__refcnt))
                dst = NULL;
        if (dst) {
                if (!dst->obsolete || dst->ops->check(dst, cookie)) {
                        *saddr = idst->saddr;
                } else {
                        tunnel_dst_reset(t);
                        dst_release(dst);
                        dst = NULL;
                }
        }
        rcu_read_unlock();
        return (struct rtable *)dst;
}
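
/* Usage sketch (illustrative, not extra compiled code): the per-CPU dst
 * cache above is consumed on the transmit path roughly the way
 * ip_tunnel_xmit() does further down in this file:
 *
 *      rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
 *      if (!rt) {
 *              rt = ip_route_output_key(tunnel->net, &fl4);
 *              if (!IS_ERR(rt) && connected)
 *                      tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 *      }
 *
 * tunnel_rtable_get() only hands back a cached entry if it can take a
 * reference and dst->ops->check() still validates it; a stale entry
 * resets the cache and forces a fresh route lookup.
 */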
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options.

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        if (cand)
                return cand;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
        hlist_del_init_rcu(&t->hash_node);
}
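
/* ip_tunnel_find - control-path lookup by exact parameters.
 *
 * Unlike ip_tunnel_lookup() above, which matches an incoming packet
 * against progressively looser criteria (exact src/dst, dst-only,
 * local/multicast, key-only, then the fallback device), this helper
 * requires saddr, daddr, link, device type and key semantics to all
 * match. It is used by the ioctl and netlink paths below to detect
 * duplicates before creating or updating a tunnel.
 */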
275{ 276 __be32 remote = parms->iph.daddr; 277 __be32 local = parms->iph.saddr; 278 __be32 key = parms->i_key; 279 __be16 flags = parms->i_flags; 280 int link = parms->link; 281 struct ip_tunnel *t = NULL; 282 struct hlist_head *head = ip_bucket(itn, parms); 283 284 hlist_for_each_entry_rcu(t, head, hash_node) { 285 if (local == t->parms.iph.saddr && 286 remote == t->parms.iph.daddr && 287 link == t->parms.link && 288 type == t->dev->type && 289 ip_tunnel_key_match(&t->parms, flags, key)) 290 break; 291 } 292 return t; 293} 294 295static struct net_device *__ip_tunnel_create(struct net *net, 296 const struct rtnl_link_ops *ops, 297 struct ip_tunnel_parm *parms) 298{ 299 int err; 300 struct ip_tunnel *tunnel; 301 struct net_device *dev; 302 char name[IFNAMSIZ]; 303 304 if (parms->name[0]) 305 strlcpy(name, parms->name, IFNAMSIZ); 306 else { 307 if (strlen(ops->kind) > (IFNAMSIZ - 3)) { 308 err = -E2BIG; 309 goto failed; 310 } 311 strlcpy(name, ops->kind, IFNAMSIZ); 312 strncat(name, "%d", 2); 313 } 314 315 ASSERT_RTNL(); 316 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); 317 if (!dev) { 318 err = -ENOMEM; 319 goto failed; 320 } 321 dev_net_set(dev, net); 322 323 dev->rtnl_link_ops = ops; 324 325 tunnel = netdev_priv(dev); 326 tunnel->parms = *parms; 327 tunnel->net = net; 328 329 err = register_netdevice(dev); 330 if (err) 331 goto failed_free; 332 333 return dev; 334 335failed_free: 336 free_netdev(dev); 337failed: 338 return ERR_PTR(err); 339} 340 341static inline void init_tunnel_flow(struct flowi4 *fl4, 342 int proto, 343 __be32 daddr, __be32 saddr, 344 __be32 key, __u8 tos, int oif) 345{ 346 memset(fl4, 0, sizeof(*fl4)); 347 fl4->flowi4_oif = oif; 348 fl4->daddr = daddr; 349 fl4->saddr = saddr; 350 fl4->flowi4_tos = tos; 351 fl4->flowi4_proto = proto; 352 fl4->fl4_gre_key = key; 353} 354 355static int ip_tunnel_bind_dev(struct net_device *dev) 356{ 357 struct net_device *tdev = NULL; 358 struct ip_tunnel *tunnel = netdev_priv(dev); 359 const struct iphdr *iph; 360 int hlen = LL_MAX_HEADER; 361 int mtu = ETH_DATA_LEN; 362 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 363 364 iph = &tunnel->parms.iph; 365 366 /* Guess output device to choose reasonable mtu and needed_headroom */ 367 if (iph->daddr) { 368 struct flowi4 fl4; 369 struct rtable *rt; 370 371 init_tunnel_flow(&fl4, iph->protocol, iph->daddr, 372 iph->saddr, tunnel->parms.o_key, 373 RT_TOS(iph->tos), tunnel->parms.link); 374 rt = ip_route_output_key(tunnel->net, &fl4); 375 376 if (!IS_ERR(rt)) { 377 tdev = rt->dst.dev; 378 tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); 379 ip_rt_put(rt); 380 } 381 if (dev->type != ARPHRD_ETHER) 382 dev->flags |= IFF_POINTOPOINT; 383 } 384 385 if (!tdev && tunnel->parms.link) 386 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); 387 388 if (tdev) { 389 hlen = tdev->hard_header_len + tdev->needed_headroom; 390 mtu = tdev->mtu; 391 } 392 393 dev->needed_headroom = t_hlen + hlen; 394 mtu -= (dev->hard_header_len + t_hlen); 395 396 if (mtu < 68) 397 mtu = 68; 398 399 return mtu; 400} 401 402static struct ip_tunnel *ip_tunnel_create(struct net *net, 403 struct ip_tunnel_net *itn, 404 struct ip_tunnel_parm *parms) 405{ 406 struct ip_tunnel *nt; 407 struct net_device *dev; 408 409 BUG_ON(!itn->fb_tunnel_dev); 410 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); 411 if (IS_ERR(dev)) 412 return ERR_CAST(dev); 413 414 dev->mtu = ip_tunnel_bind_dev(dev); 415 416 nt = netdev_priv(dev); 417 ip_tunnel_add(itn, nt); 418 return nt; 419} 420 421int 
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
            ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                             &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
        const struct ip_tunnel_encap_ops *ops;
        int hlen = -EINVAL;

        if (e->type == TUNNEL_ENCAP_NONE)
                return 0;

        if (e->type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[e->type]);
        if (likely(ops && ops->encap_hlen))
                hlen = ops->encap_hlen(e);
        rcu_read_unlock();

        return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
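
/* Registration sketch (illustrative): a UDP encapsulation such as FOU
 * claims its slot once at module init. The ops structure and callback
 * names below are assumed from net/ipv4/fou.c and are not defined in
 * this file:
 *
 *      static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *              .encap_hlen     = fou_encap_hlen,
 *              .build_header   = fou_build_header,
 *      };
 *
 *      err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *
 * The cmpxchg() in ip_tunnel_encap_add_ops() claims a slot at most
 * once; -1 is returned if the slot was already taken.
 */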
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
                    u8 *protocol, struct flowi4 *fl4)
{
        const struct ip_tunnel_encap_ops *ops;
        int ret = -EINVAL;

        if (t->encap.type == TUNNEL_ENCAP_NONE)
                return 0;

        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[t->encap.type]);
        if (likely(ops && ops->build_header))
                ret = ops->build_header(skb, &t->encap, protocol, fl4);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                           struct rtable *rt, __be16 df,
                           const struct iphdr *inner_iph)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                    mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                    mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
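
/* ip_tunnel_xmit - transmit a packet through an IP tunnel.
 *
 * Resolves the outer destination (including the NBMA case, where it
 * is derived from the inner IPv4 route or IPv6 neighbour), applies
 * tos/ttl/df inheritance, enforces path MTU via tnl_update_pmtu()
 * above, and finally prepends the outer IP header through
 * iptunnel_xmit().
 */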
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
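
/* Re-keying or re-addressing a tunnel may move it to a different hash
 * bucket, so the device is unhashed, updated and rehashed. The cached
 * routes are reset since they may no longer match the new parameters.
 * Called under RTNL from the ioctl and changelink paths below.
 */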
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        ip_tunnel_del(t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        ip_tunnel_dst_reset_all(t);
        netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        if (new_mtu < 68 ||
            new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
                return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        free_percpu(tunnel->dst_cache);
        free_percpu(dev->tstats);
        free_netdev(dev);
}
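
/* ip_tunnel_dellink - queue a tunnel device for unregistration.
 *
 * The fallback device is deliberately skipped: it exists exactly once
 * per namespace and is only torn down when the namespace exits (see
 * ip_tunnel_delete_net() below).
 */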
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing to move it to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
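
/* Usage sketch (illustrative): drivers instantiate the per-netns state
 * above from their pernet operations. The names here follow
 * net/ipv4/ipip.c and are assumptions of the sketch, not part of this
 * file:
 *
 *      static int __net_init ipip_init_net(struct net *net)
 *      {
 *              return ip_tunnel_init_net(net, ipip_net_id,
 *                                        &ipip_link_ops, "tunl0");
 *      }
 */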
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (ip_tunnel_find(itn, p, dev->type))
                return -EEXIST;

        nt->net = net;
        nt->parms = *p;
        err = register_netdevice(dev);
        if (err)
                goto out;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        ip_tunnel_add(itn, nt);

out:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
        if (!tunnel->dst_cache) {
                free_percpu(dev->tstats);
                return -ENOMEM;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                free_percpu(tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version = 4;
        iph->ihl = 5;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
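
/* Wiring sketch (illustrative): a tunnel driver hooks these helpers
 * into its net_device_ops. The names follow net/ipv4/ipip.c and are
 * shown only as an example of how this file is consumed:
 *
 *      static const struct net_device_ops ipip_netdev_ops = {
 *              .ndo_init        = ipip_tunnel_init,  (calls ip_tunnel_init())
 *              .ndo_uninit      = ip_tunnel_uninit,
 *              .ndo_start_xmit  = ipip_tunnel_xmit,  (calls ip_tunnel_xmit())
 *              .ndo_do_ioctl    = ipip_tunnel_ioctl,
 *              .ndo_change_mtu  = ip_tunnel_change_mtu,
 *              .ndo_get_stats64 = ip_tunnel_get_stats64,
 *              .ndo_get_iflink  = ip_tunnel_get_iflink,
 *      };
 */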
void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(netdev_priv(dev));

        ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");
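
/* Setup sketch (illustrative): rtnl_link_ops .setup callbacks record
 * the per-netns id via ip_tunnel_setup(); e.g., based on
 * net/ipv4/ipip.c:
 *
 *      static void ipip_tunnel_setup(struct net_device *dev)
 *      {
 *              ...
 *              ip_tunnel_setup(dev, ipip_net_id);
 *      }
 */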