root/net/ipv4/fib_frontend.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fib4_rules_init
  2. fib4_has_custom_rules
  3. fib_new_table
  4. fib_get_table
  5. fib4_has_custom_rules
  6. fib_replace_table
  7. fib_unmerge
  8. fib_flush
  9. __inet_dev_addr_type
  10. inet_addr_type_table
  11. inet_addr_type
  12. inet_dev_addr_type
  13. inet_addr_type_dev_table
  14. fib_compute_spec_dst
  15. fib_info_nh_uses_dev
  16. __fib_validate_source
  17. fib_validate_source
  18. sk_extract_addr
  19. put_rtax
  20. rtentry_to_fib_config
  21. ip_rt_ioctl
  22. fib_gw_from_via
  23. rtm_to_fib_config
  24. inet_rtm_delroute
  25. inet_rtm_newroute
  26. ip_valid_fib_dump_req
  27. inet_dump_fib
  28. fib_magic
  29. fib_add_ifaddr
  30. fib_modify_prefix_metric
  31. fib_del_ifaddr
  32. nl_fib_lookup
  33. nl_fib_input
  34. nl_fib_lookup_init
  35. nl_fib_lookup_exit
  36. fib_disable_ip
  37. fib_inetaddr_event
  38. fib_netdev_event
  39. ip_fib_net_init
  40. ip_fib_net_exit
  41. fib_net_init
  42. fib_net_exit
  43. ip_fib_init

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              IPv4 Forwarding Information Base: FIB frontend.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11 
  12 #include <linux/module.h>
  13 #include <linux/uaccess.h>
  14 #include <linux/bitops.h>
  15 #include <linux/capability.h>
  16 #include <linux/types.h>
  17 #include <linux/kernel.h>
  18 #include <linux/mm.h>
  19 #include <linux/string.h>
  20 #include <linux/socket.h>
  21 #include <linux/sockios.h>
  22 #include <linux/errno.h>
  23 #include <linux/in.h>
  24 #include <linux/inet.h>
  25 #include <linux/inetdevice.h>
  26 #include <linux/netdevice.h>
  27 #include <linux/if_addr.h>
  28 #include <linux/if_arp.h>
  29 #include <linux/skbuff.h>
  30 #include <linux/cache.h>
  31 #include <linux/init.h>
  32 #include <linux/list.h>
  33 #include <linux/slab.h>
  34 
  35 #include <net/ip.h>
  36 #include <net/protocol.h>
  37 #include <net/route.h>
  38 #include <net/tcp.h>
  39 #include <net/sock.h>
  40 #include <net/arp.h>
  41 #include <net/ip_fib.h>
  42 #include <net/nexthop.h>
  43 #include <net/rtnetlink.h>
  44 #include <net/xfrm.h>
  45 #include <net/l3mdev.h>
  46 #include <net/lwtunnel.h>
  47 #include <trace/events/fib.h>
  48 
  49 #ifndef CONFIG_IP_MULTIPLE_TABLES
  50 
  51 static int __net_init fib4_rules_init(struct net *net)
  52 {
  53         struct fib_table *local_table, *main_table;
  54 
  55         main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
  56         if (!main_table)
  57                 return -ENOMEM;
  58 
  59         local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
  60         if (!local_table)
  61                 goto fail;
  62 
  63         hlist_add_head_rcu(&local_table->tb_hlist,
  64                                 &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
  65         hlist_add_head_rcu(&main_table->tb_hlist,
  66                                 &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
  67         return 0;
  68 
  69 fail:
  70         fib_free_table(main_table);
  71         return -ENOMEM;
  72 }
  73 
  74 static bool fib4_has_custom_rules(struct net *net)
  75 {
  76         return false;
  77 }
  78 #else
  79 
  80 struct fib_table *fib_new_table(struct net *net, u32 id)
  81 {
  82         struct fib_table *tb, *alias = NULL;
  83         unsigned int h;
  84 
  85         if (id == 0)
  86                 id = RT_TABLE_MAIN;
  87         tb = fib_get_table(net, id);
  88         if (tb)
  89                 return tb;
  90 
  91         if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
  92                 alias = fib_new_table(net, RT_TABLE_MAIN);
  93 
  94         tb = fib_trie_table(id, alias);
  95         if (!tb)
  96                 return NULL;
  97 
  98         switch (id) {
  99         case RT_TABLE_MAIN:
 100                 rcu_assign_pointer(net->ipv4.fib_main, tb);
 101                 break;
 102         case RT_TABLE_DEFAULT:
 103                 rcu_assign_pointer(net->ipv4.fib_default, tb);
 104                 break;
 105         default:
 106                 break;
 107         }
 108 
 109         h = id & (FIB_TABLE_HASHSZ - 1);
 110         hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
 111         return tb;
 112 }
 113 EXPORT_SYMBOL_GPL(fib_new_table);
 114 
 115 /* caller must hold either rtnl or rcu read lock */
 116 struct fib_table *fib_get_table(struct net *net, u32 id)
 117 {
 118         struct fib_table *tb;
 119         struct hlist_head *head;
 120         unsigned int h;
 121 
 122         if (id == 0)
 123                 id = RT_TABLE_MAIN;
 124         h = id & (FIB_TABLE_HASHSZ - 1);
 125 
 126         head = &net->ipv4.fib_table_hash[h];
 127         hlist_for_each_entry_rcu(tb, head, tb_hlist,
 128                                  lockdep_rtnl_is_held()) {
 129                 if (tb->tb_id == id)
 130                         return tb;
 131         }
 132         return NULL;
 133 }
 134 
 135 static bool fib4_has_custom_rules(struct net *net)
 136 {
 137         return net->ipv4.fib_has_custom_rules;
 138 }
 139 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 140 
 141 static void fib_replace_table(struct net *net, struct fib_table *old,
 142                               struct fib_table *new)
 143 {
 144 #ifdef CONFIG_IP_MULTIPLE_TABLES
 145         switch (new->tb_id) {
 146         case RT_TABLE_MAIN:
 147                 rcu_assign_pointer(net->ipv4.fib_main, new);
 148                 break;
 149         case RT_TABLE_DEFAULT:
 150                 rcu_assign_pointer(net->ipv4.fib_default, new);
 151                 break;
 152         default:
 153                 break;
 154         }
 155 
 156 #endif
 157         /* replace the old table in the hlist */
 158         hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
 159 }
 160 
 161 int fib_unmerge(struct net *net)
 162 {
 163         struct fib_table *old, *new, *main_table;
 164 
 165         /* attempt to fetch local table if it has been allocated */
 166         old = fib_get_table(net, RT_TABLE_LOCAL);
 167         if (!old)
 168                 return 0;
 169 
 170         new = fib_trie_unmerge(old);
 171         if (!new)
 172                 return -ENOMEM;
 173 
 174         /* table is already unmerged */
 175         if (new == old)
 176                 return 0;
 177 
 178         /* replace merged table with clean table */
 179         fib_replace_table(net, old, new);
 180         fib_free_table(old);
 181 
 182         /* attempt to fetch main table if it has been allocated */
 183         main_table = fib_get_table(net, RT_TABLE_MAIN);
 184         if (!main_table)
 185                 return 0;
 186 
 187         /* flush local entries from main table */
 188         fib_table_flush_external(main_table);
 189 
 190         return 0;
 191 }
 192 
 193 void fib_flush(struct net *net)
 194 {
 195         int flushed = 0;
 196         unsigned int h;
 197 
 198         for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 199                 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 200                 struct hlist_node *tmp;
 201                 struct fib_table *tb;
 202 
 203                 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
 204                         flushed += fib_table_flush(net, tb, false);
 205         }
 206 
 207         if (flushed)
 208                 rt_cache_flush(net);
 209 }
 210 
 211 /*
 212  * Find address type as if only "dev" was present in the system. If
 213  * on_dev is NULL then all interfaces are taken into consideration.
 214  */
 215 static inline unsigned int __inet_dev_addr_type(struct net *net,
 216                                                 const struct net_device *dev,
 217                                                 __be32 addr, u32 tb_id)
 218 {
 219         struct flowi4           fl4 = { .daddr = addr };
 220         struct fib_result       res;
 221         unsigned int ret = RTN_BROADCAST;
 222         struct fib_table *table;
 223 
 224         if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
 225                 return RTN_BROADCAST;
 226         if (ipv4_is_multicast(addr))
 227                 return RTN_MULTICAST;
 228 
 229         rcu_read_lock();
 230 
 231         table = fib_get_table(net, tb_id);
 232         if (table) {
 233                 ret = RTN_UNICAST;
 234                 if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 235                         struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
 236 
 237                         if (!dev || dev == nhc->nhc_dev)
 238                                 ret = res.type;
 239                 }
 240         }
 241 
 242         rcu_read_unlock();
 243         return ret;
 244 }
 245 
 246 unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
 247 {
 248         return __inet_dev_addr_type(net, NULL, addr, tb_id);
 249 }
 250 EXPORT_SYMBOL(inet_addr_type_table);
 251 
 252 unsigned int inet_addr_type(struct net *net, __be32 addr)
 253 {
 254         return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 255 }
 256 EXPORT_SYMBOL(inet_addr_type);
 257 
 258 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 259                                 __be32 addr)
 260 {
 261         u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 262 
 263         return __inet_dev_addr_type(net, dev, addr, rt_table);
 264 }
 265 EXPORT_SYMBOL(inet_dev_addr_type);
 266 
 267 /* inet_addr_type with dev == NULL but using the table from a dev
 268  * if one is associated
 269  */
 270 unsigned int inet_addr_type_dev_table(struct net *net,
 271                                       const struct net_device *dev,
 272                                       __be32 addr)
 273 {
 274         u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
 275 
 276         return __inet_dev_addr_type(net, NULL, addr, rt_table);
 277 }
 278 EXPORT_SYMBOL(inet_addr_type_dev_table);
 279 
 280 __be32 fib_compute_spec_dst(struct sk_buff *skb)
 281 {
 282         struct net_device *dev = skb->dev;
 283         struct in_device *in_dev;
 284         struct fib_result res;
 285         struct rtable *rt;
 286         struct net *net;
 287         int scope;
 288 
 289         rt = skb_rtable(skb);
 290         if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
 291             RTCF_LOCAL)
 292                 return ip_hdr(skb)->daddr;
 293 
 294         in_dev = __in_dev_get_rcu(dev);
 295 
 296         net = dev_net(dev);
 297 
 298         scope = RT_SCOPE_UNIVERSE;
 299         if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
 300                 bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
 301                 struct flowi4 fl4 = {
 302                         .flowi4_iif = LOOPBACK_IFINDEX,
 303                         .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
 304                         .daddr = ip_hdr(skb)->saddr,
 305                         .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
 306                         .flowi4_scope = scope,
 307                         .flowi4_mark = vmark ? skb->mark : 0,
 308                 };
 309                 if (!fib_lookup(net, &fl4, &res, 0))
 310                         return fib_result_prefsrc(net, &res);
 311         } else {
 312                 scope = RT_SCOPE_LINK;
 313         }
 314 
 315         return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
 316 }
 317 
 318 bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
 319 {
 320         bool dev_match = false;
 321 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 322         if (unlikely(fi->nh)) {
 323                 dev_match = nexthop_uses_dev(fi->nh, dev);
 324         } else {
 325                 int ret;
 326 
 327                 for (ret = 0; ret < fib_info_num_path(fi); ret++) {
 328                         const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
 329 
 330                         if (nhc_l3mdev_matches_dev(nhc, dev)) {
 331                                 dev_match = true;
 332                                 break;
 333                         }
 334                 }
 335         }
 336 #else
 337         if (fib_info_nhc(fi, 0)->nhc_dev == dev)
 338                 dev_match = true;
 339 #endif
 340 
 341         return dev_match;
 342 }
 343 EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
 344 
 345 /* Given (packet source, input interface) and optional (dst, oif, tos):
 346  * - (main) check, that source is valid i.e. not broadcast or our local
 347  *   address.
 348  * - figure out what "logical" interface this packet arrived
 349  *   and calculate "specific destination" address.
 350  * - check, that packet arrived from expected physical interface.
 351  * called with rcu_read_lock()
 352  */
 353 static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 354                                  u8 tos, int oif, struct net_device *dev,
 355                                  int rpf, struct in_device *idev, u32 *itag)
 356 {
 357         struct net *net = dev_net(dev);
 358         struct flow_keys flkeys;
 359         int ret, no_addr;
 360         struct fib_result res;
 361         struct flowi4 fl4;
 362         bool dev_match;
 363 
 364         fl4.flowi4_oif = 0;
 365         fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
 366         if (!fl4.flowi4_iif)
 367                 fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
 368         fl4.daddr = src;
 369         fl4.saddr = dst;
 370         fl4.flowi4_tos = tos;
 371         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 372         fl4.flowi4_tun_key.tun_id = 0;
 373         fl4.flowi4_flags = 0;
 374         fl4.flowi4_uid = sock_net_uid(net, NULL);
 375 
 376         no_addr = idev->ifa_list == NULL;
 377 
 378         fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 379         if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
 380                 fl4.flowi4_proto = 0;
 381                 fl4.fl4_sport = 0;
 382                 fl4.fl4_dport = 0;
 383         }
 384 
 385         if (fib_lookup(net, &fl4, &res, 0))
 386                 goto last_resort;
 387         if (res.type != RTN_UNICAST &&
 388             (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
 389                 goto e_inval;
 390         fib_combine_itag(itag, &res);
 391 
 392         dev_match = fib_info_nh_uses_dev(res.fi, dev);
 393         /* This is not common, loopback packets retain skb_dst so normally they
 394          * would not even hit this slow path.
 395          */
 396         dev_match = dev_match || (res.type == RTN_LOCAL &&
 397                                   dev == net->loopback_dev);
 398         if (dev_match) {
 399                 ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 400                 return ret;
 401         }
 402         if (no_addr)
 403                 goto last_resort;
 404         if (rpf == 1)
 405                 goto e_rpf;
 406         fl4.flowi4_oif = dev->ifindex;
 407 
 408         ret = 0;
 409         if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 410                 if (res.type == RTN_UNICAST)
 411                         ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
 412         }
 413         return ret;
 414 
 415 last_resort:
 416         if (rpf)
 417                 goto e_rpf;
 418         *itag = 0;
 419         return 0;
 420 
 421 e_inval:
 422         return -EINVAL;
 423 e_rpf:
 424         return -EXDEV;
 425 }
 426 
 427 /* Ignore rp_filter for packets protected by IPsec. */
 428 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 429                         u8 tos, int oif, struct net_device *dev,
 430                         struct in_device *idev, u32 *itag)
 431 {
 432         int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 433         struct net *net = dev_net(dev);
 434 
 435         if (!r && !fib_num_tclassid_users(net) &&
 436             (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 437                 if (IN_DEV_ACCEPT_LOCAL(idev))
 438                         goto ok;
 439                 /* with custom local routes in place, checking local addresses
 440                  * only will be too optimistic, with custom rules, checking
 441                  * local addresses only can be too strict, e.g. due to vrf
 442                  */
 443                 if (net->ipv4.fib_has_custom_local_routes ||
 444                     fib4_has_custom_rules(net))
 445                         goto full_check;
 446                 if (inet_lookup_ifaddr_rcu(net, src))
 447                         return -EINVAL;
 448 
 449 ok:
 450                 *itag = 0;
 451                 return 0;
 452         }
 453 
 454 full_check:
 455         return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
 456 }
 457 
 458 static inline __be32 sk_extract_addr(struct sockaddr *addr)
 459 {
 460         return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
 461 }
 462 
 463 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
 464 {
 465         struct nlattr *nla;
 466 
 467         nla = (struct nlattr *) ((char *) mx + len);
 468         nla->nla_type = type;
 469         nla->nla_len = nla_attr_size(4);
 470         *(u32 *) nla_data(nla) = value;
 471 
 472         return len + nla_total_size(4);
 473 }
 474 
 475 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 476                                  struct fib_config *cfg)
 477 {
 478         __be32 addr;
 479         int plen;
 480 
 481         memset(cfg, 0, sizeof(*cfg));
 482         cfg->fc_nlinfo.nl_net = net;
 483 
 484         if (rt->rt_dst.sa_family != AF_INET)
 485                 return -EAFNOSUPPORT;
 486 
 487         /*
 488          * Check mask for validity:
 489          * a) it must be contiguous.
 490          * b) destination must have all host bits clear.
 491          * c) if application forgot to set correct family (AF_INET),
 492          *    reject request unless it is absolutely clear i.e.
 493          *    both family and mask are zero.
 494          */
 495         plen = 32;
 496         addr = sk_extract_addr(&rt->rt_dst);
 497         if (!(rt->rt_flags & RTF_HOST)) {
 498                 __be32 mask = sk_extract_addr(&rt->rt_genmask);
 499 
 500                 if (rt->rt_genmask.sa_family != AF_INET) {
 501                         if (mask || rt->rt_genmask.sa_family)
 502                                 return -EAFNOSUPPORT;
 503                 }
 504 
 505                 if (bad_mask(mask, addr))
 506                         return -EINVAL;
 507 
 508                 plen = inet_mask_len(mask);
 509         }
 510 
 511         cfg->fc_dst_len = plen;
 512         cfg->fc_dst = addr;
 513 
 514         if (cmd != SIOCDELRT) {
 515                 cfg->fc_nlflags = NLM_F_CREATE;
 516                 cfg->fc_protocol = RTPROT_BOOT;
 517         }
 518 
 519         if (rt->rt_metric)
 520                 cfg->fc_priority = rt->rt_metric - 1;
 521 
 522         if (rt->rt_flags & RTF_REJECT) {
 523                 cfg->fc_scope = RT_SCOPE_HOST;
 524                 cfg->fc_type = RTN_UNREACHABLE;
 525                 return 0;
 526         }
 527 
 528         cfg->fc_scope = RT_SCOPE_NOWHERE;
 529         cfg->fc_type = RTN_UNICAST;
 530 
 531         if (rt->rt_dev) {
 532                 char *colon;
 533                 struct net_device *dev;
 534                 char devname[IFNAMSIZ];
 535 
 536                 if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
 537                         return -EFAULT;
 538 
 539                 devname[IFNAMSIZ-1] = 0;
 540                 colon = strchr(devname, ':');
 541                 if (colon)
 542                         *colon = 0;
 543                 dev = __dev_get_by_name(net, devname);
 544                 if (!dev)
 545                         return -ENODEV;
 546                 cfg->fc_oif = dev->ifindex;
 547                 cfg->fc_table = l3mdev_fib_table(dev);
 548                 if (colon) {
 549                         const struct in_ifaddr *ifa;
 550                         struct in_device *in_dev;
 551 
 552                         in_dev = __in_dev_get_rtnl(dev);
 553                         if (!in_dev)
 554                                 return -ENODEV;
 555 
 556                         *colon = ':';
 557 
 558                         rcu_read_lock();
 559                         in_dev_for_each_ifa_rcu(ifa, in_dev) {
 560                                 if (strcmp(ifa->ifa_label, devname) == 0)
 561                                         break;
 562                         }
 563                         rcu_read_unlock();
 564 
 565                         if (!ifa)
 566                                 return -ENODEV;
 567                         cfg->fc_prefsrc = ifa->ifa_local;
 568                 }
 569         }
 570 
 571         addr = sk_extract_addr(&rt->rt_gateway);
 572         if (rt->rt_gateway.sa_family == AF_INET && addr) {
 573                 unsigned int addr_type;
 574 
 575                 cfg->fc_gw4 = addr;
 576                 cfg->fc_gw_family = AF_INET;
 577                 addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
 578                 if (rt->rt_flags & RTF_GATEWAY &&
 579                     addr_type == RTN_UNICAST)
 580                         cfg->fc_scope = RT_SCOPE_UNIVERSE;
 581         }
 582 
 583         if (cmd == SIOCDELRT)
 584                 return 0;
 585 
 586         if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
 587                 return -EINVAL;
 588 
 589         if (cfg->fc_scope == RT_SCOPE_NOWHERE)
 590                 cfg->fc_scope = RT_SCOPE_LINK;
 591 
 592         if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
 593                 struct nlattr *mx;
 594                 int len = 0;
 595 
 596                 mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
 597                 if (!mx)
 598                         return -ENOMEM;
 599 
 600                 if (rt->rt_flags & RTF_MTU)
 601                         len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
 602 
 603                 if (rt->rt_flags & RTF_WINDOW)
 604                         len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
 605 
 606                 if (rt->rt_flags & RTF_IRTT)
 607                         len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
 608 
 609                 cfg->fc_mx = mx;
 610                 cfg->fc_mx_len = len;
 611         }
 612 
 613         return 0;
 614 }
 615 
 616 /*
 617  * Handle IP routing ioctl calls.
 618  * These are used to manipulate the routing tables
 619  */
 620 int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
 621 {
 622         struct fib_config cfg;
 623         int err;
 624 
 625         switch (cmd) {
 626         case SIOCADDRT:         /* Add a route */
 627         case SIOCDELRT:         /* Delete a route */
 628                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 629                         return -EPERM;
 630 
 631                 rtnl_lock();
 632                 err = rtentry_to_fib_config(net, cmd, rt, &cfg);
 633                 if (err == 0) {
 634                         struct fib_table *tb;
 635 
 636                         if (cmd == SIOCDELRT) {
 637                                 tb = fib_get_table(net, cfg.fc_table);
 638                                 if (tb)
 639                                         err = fib_table_delete(net, tb, &cfg,
 640                                                                NULL);
 641                                 else
 642                                         err = -ESRCH;
 643                         } else {
 644                                 tb = fib_new_table(net, cfg.fc_table);
 645                                 if (tb)
 646                                         err = fib_table_insert(net, tb,
 647                                                                &cfg, NULL);
 648                                 else
 649                                         err = -ENOBUFS;
 650                         }
 651 
 652                         /* allocated by rtentry_to_fib_config() */
 653                         kfree(cfg.fc_mx);
 654                 }
 655                 rtnl_unlock();
 656                 return err;
 657         }
 658         return -EINVAL;
 659 }
 660 
 661 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 662         [RTA_UNSPEC]            = { .strict_start_type = RTA_DPORT + 1 },
 663         [RTA_DST]               = { .type = NLA_U32 },
 664         [RTA_SRC]               = { .type = NLA_U32 },
 665         [RTA_IIF]               = { .type = NLA_U32 },
 666         [RTA_OIF]               = { .type = NLA_U32 },
 667         [RTA_GATEWAY]           = { .type = NLA_U32 },
 668         [RTA_PRIORITY]          = { .type = NLA_U32 },
 669         [RTA_PREFSRC]           = { .type = NLA_U32 },
 670         [RTA_METRICS]           = { .type = NLA_NESTED },
 671         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 672         [RTA_FLOW]              = { .type = NLA_U32 },
 673         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
 674         [RTA_ENCAP]             = { .type = NLA_NESTED },
 675         [RTA_UID]               = { .type = NLA_U32 },
 676         [RTA_MARK]              = { .type = NLA_U32 },
 677         [RTA_TABLE]             = { .type = NLA_U32 },
 678         [RTA_IP_PROTO]          = { .type = NLA_U8 },
 679         [RTA_SPORT]             = { .type = NLA_U16 },
 680         [RTA_DPORT]             = { .type = NLA_U16 },
 681         [RTA_NH_ID]             = { .type = NLA_U32 },
 682 };
 683 
 684 int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
 685                     struct netlink_ext_ack *extack)
 686 {
 687         struct rtvia *via;
 688         int alen;
 689 
 690         if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
 691                 NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
 692                 return -EINVAL;
 693         }
 694 
 695         via = nla_data(nla);
 696         alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
 697 
 698         switch (via->rtvia_family) {
 699         case AF_INET:
 700                 if (alen != sizeof(__be32)) {
 701                         NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
 702                         return -EINVAL;
 703                 }
 704                 cfg->fc_gw_family = AF_INET;
 705                 cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
 706                 break;
 707         case AF_INET6:
 708 #ifdef CONFIG_IPV6
 709                 if (alen != sizeof(struct in6_addr)) {
 710                         NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
 711                         return -EINVAL;
 712                 }
 713                 cfg->fc_gw_family = AF_INET6;
 714                 cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
 715 #else
 716                 NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
 717                 return -EINVAL;
 718 #endif
 719                 break;
 720         default:
 721                 NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
 722                 return -EINVAL;
 723         }
 724 
 725         return 0;
 726 }
 727 
 728 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 729                              struct nlmsghdr *nlh, struct fib_config *cfg,
 730                              struct netlink_ext_ack *extack)
 731 {
 732         bool has_gw = false, has_via = false;
 733         struct nlattr *attr;
 734         int err, remaining;
 735         struct rtmsg *rtm;
 736 
 737         err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
 738                                         rtm_ipv4_policy, extack);
 739         if (err < 0)
 740                 goto errout;
 741 
 742         memset(cfg, 0, sizeof(*cfg));
 743 
 744         rtm = nlmsg_data(nlh);
 745         cfg->fc_dst_len = rtm->rtm_dst_len;
 746         cfg->fc_tos = rtm->rtm_tos;
 747         cfg->fc_table = rtm->rtm_table;
 748         cfg->fc_protocol = rtm->rtm_protocol;
 749         cfg->fc_scope = rtm->rtm_scope;
 750         cfg->fc_type = rtm->rtm_type;
 751         cfg->fc_flags = rtm->rtm_flags;
 752         cfg->fc_nlflags = nlh->nlmsg_flags;
 753 
 754         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
 755         cfg->fc_nlinfo.nlh = nlh;
 756         cfg->fc_nlinfo.nl_net = net;
 757 
 758         if (cfg->fc_type > RTN_MAX) {
 759                 NL_SET_ERR_MSG(extack, "Invalid route type");
 760                 err = -EINVAL;
 761                 goto errout;
 762         }
 763 
 764         nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
 765                 switch (nla_type(attr)) {
 766                 case RTA_DST:
 767                         cfg->fc_dst = nla_get_be32(attr);
 768                         break;
 769                 case RTA_OIF:
 770                         cfg->fc_oif = nla_get_u32(attr);
 771                         break;
 772                 case RTA_GATEWAY:
 773                         has_gw = true;
 774                         cfg->fc_gw4 = nla_get_be32(attr);
 775                         if (cfg->fc_gw4)
 776                                 cfg->fc_gw_family = AF_INET;
 777                         break;
 778                 case RTA_VIA:
 779                         has_via = true;
 780                         err = fib_gw_from_via(cfg, attr, extack);
 781                         if (err)
 782                                 goto errout;
 783                         break;
 784                 case RTA_PRIORITY:
 785                         cfg->fc_priority = nla_get_u32(attr);
 786                         break;
 787                 case RTA_PREFSRC:
 788                         cfg->fc_prefsrc = nla_get_be32(attr);
 789                         break;
 790                 case RTA_METRICS:
 791                         cfg->fc_mx = nla_data(attr);
 792                         cfg->fc_mx_len = nla_len(attr);
 793                         break;
 794                 case RTA_MULTIPATH:
 795                         err = lwtunnel_valid_encap_type_attr(nla_data(attr),
 796                                                              nla_len(attr),
 797                                                              extack);
 798                         if (err < 0)
 799                                 goto errout;
 800                         cfg->fc_mp = nla_data(attr);
 801                         cfg->fc_mp_len = nla_len(attr);
 802                         break;
 803                 case RTA_FLOW:
 804                         cfg->fc_flow = nla_get_u32(attr);
 805                         break;
 806                 case RTA_TABLE:
 807                         cfg->fc_table = nla_get_u32(attr);
 808                         break;
 809                 case RTA_ENCAP:
 810                         cfg->fc_encap = attr;
 811                         break;
 812                 case RTA_ENCAP_TYPE:
 813                         cfg->fc_encap_type = nla_get_u16(attr);
 814                         err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
 815                                                         extack);
 816                         if (err < 0)
 817                                 goto errout;
 818                         break;
 819                 case RTA_NH_ID:
 820                         cfg->fc_nh_id = nla_get_u32(attr);
 821                         break;
 822                 }
 823         }
 824 
 825         if (cfg->fc_nh_id) {
 826                 if (cfg->fc_oif || cfg->fc_gw_family ||
 827                     cfg->fc_encap || cfg->fc_mp) {
 828                         NL_SET_ERR_MSG(extack,
 829                                        "Nexthop specification and nexthop id are mutually exclusive");
 830                         return -EINVAL;
 831                 }
 832         }
 833 
 834         if (has_gw && has_via) {
 835                 NL_SET_ERR_MSG(extack,
 836                                "Nexthop configuration can not contain both GATEWAY and VIA");
 837                 goto errout;
 838         }
 839 
 840         return 0;
 841 errout:
 842         return err;
 843 }
 844 
 845 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 846                              struct netlink_ext_ack *extack)
 847 {
 848         struct net *net = sock_net(skb->sk);
 849         struct fib_config cfg;
 850         struct fib_table *tb;
 851         int err;
 852 
 853         err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 854         if (err < 0)
 855                 goto errout;
 856 
 857         if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
 858                 NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
 859                 err = -EINVAL;
 860                 goto errout;
 861         }
 862 
 863         tb = fib_get_table(net, cfg.fc_table);
 864         if (!tb) {
 865                 NL_SET_ERR_MSG(extack, "FIB table does not exist");
 866                 err = -ESRCH;
 867                 goto errout;
 868         }
 869 
 870         err = fib_table_delete(net, tb, &cfg, extack);
 871 errout:
 872         return err;
 873 }
 874 
 875 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 876                              struct netlink_ext_ack *extack)
 877 {
 878         struct net *net = sock_net(skb->sk);
 879         struct fib_config cfg;
 880         struct fib_table *tb;
 881         int err;
 882 
 883         err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
 884         if (err < 0)
 885                 goto errout;
 886 
 887         tb = fib_new_table(net, cfg.fc_table);
 888         if (!tb) {
 889                 err = -ENOBUFS;
 890                 goto errout;
 891         }
 892 
 893         err = fib_table_insert(net, tb, &cfg, extack);
 894         if (!err && cfg.fc_type == RTN_LOCAL)
 895                 net->ipv4.fib_has_custom_local_routes = true;
 896 errout:
 897         return err;
 898 }
 899 
 900 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 901                           struct fib_dump_filter *filter,
 902                           struct netlink_callback *cb)
 903 {
 904         struct netlink_ext_ack *extack = cb->extack;
 905         struct nlattr *tb[RTA_MAX + 1];
 906         struct rtmsg *rtm;
 907         int err, i;
 908 
 909         ASSERT_RTNL();
 910 
 911         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 912                 NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
 913                 return -EINVAL;
 914         }
 915 
 916         rtm = nlmsg_data(nlh);
 917         if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
 918             rtm->rtm_scope) {
 919                 NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
 920                 return -EINVAL;
 921         }
 922 
 923         if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
 924                 NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
 925                 return -EINVAL;
 926         }
 927         if (rtm->rtm_flags & RTM_F_CLONED)
 928                 filter->dump_routes = false;
 929         else
 930                 filter->dump_exceptions = false;
 931 
 932         filter->flags    = rtm->rtm_flags;
 933         filter->protocol = rtm->rtm_protocol;
 934         filter->rt_type  = rtm->rtm_type;
 935         filter->table_id = rtm->rtm_table;
 936 
 937         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
 938                                             rtm_ipv4_policy, extack);
 939         if (err < 0)
 940                 return err;
 941 
 942         for (i = 0; i <= RTA_MAX; ++i) {
 943                 int ifindex;
 944 
 945                 if (!tb[i])
 946                         continue;
 947 
 948                 switch (i) {
 949                 case RTA_TABLE:
 950                         filter->table_id = nla_get_u32(tb[i]);
 951                         break;
 952                 case RTA_OIF:
 953                         ifindex = nla_get_u32(tb[i]);
 954                         filter->dev = __dev_get_by_index(net, ifindex);
 955                         if (!filter->dev)
 956                                 return -ENODEV;
 957                         break;
 958                 default:
 959                         NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
 960                         return -EINVAL;
 961                 }
 962         }
 963 
 964         if (filter->flags || filter->protocol || filter->rt_type ||
 965             filter->table_id || filter->dev) {
 966                 filter->filter_set = 1;
 967                 cb->answer_flags = NLM_F_DUMP_FILTERED;
 968         }
 969 
 970         return 0;
 971 }
 972 EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
 973 
/* rtnetlink RTM_GETROUTE dump callback for PF_INET.
 *
 * Dumps either the single table selected by the request filter or every
 * table in the namespace's table hash.  For the full walk the resume
 * position is kept in cb->args[0] (hash bucket) and cb->args[1] (index of
 * the table within that bucket).
 *
 * Returns skb->len when data was added (or when there is nothing to dump),
 * otherwise a negative error.
 */
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        /* With no filter applied, dump both routes and exceptions. */
        struct fib_dump_filter filter = { .dump_routes = true,
                                          .dump_exceptions = true };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int h, s_h;
        unsigned int e = 0, s_e;
        struct fib_table *tb;
        struct hlist_head *head;
        int dumped = 0, err;

        if (cb->strict_check) {
                /* Strict requests are fully validated and may set filters. */
                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
                if (err < 0)
                        return err;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                /* Legacy requests: only honour the PREFIX/CLONED flags,
                 * and only if the payload is large enough to hold a rtmsg.
                 */
                struct rtmsg *rtm = nlmsg_data(nlh);

                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
        }

        /* ipv4 does not use prefix flag */
        if (filter.flags & RTM_F_PREFIX)
                return skb->len;

        if (filter.table_id) {
                /* A specific table was requested: dump just that one. */
                tb = fib_get_table(net, filter.table_id);
                if (!tb) {
                        /* A missing table is only reported as an error
                         * when the request's family is PF_INET; for any
                         * other family return without adding anything.
                         */
                        if (rtnl_msg_family(cb->nlh) != PF_INET)
                                return skb->len;

                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
                        return -ENOENT;
                }

                rcu_read_lock();
                err = fib_table_dump(tb, skb, cb, &filter);
                rcu_read_unlock();
                /* Prefer reporting progress; fall back to err if the skb
                 * is empty.
                 */
                return skb->len ? : err;
        }

        /* Resume position saved by the previous invocation. */
        s_h = cb->args[0];
        s_e = cb->args[1];

        rcu_read_lock();

        /* s_e only applies to the first bucket visited; it is reset to 0
         * for every following bucket.
         */
        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv4.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
                        /* Skip tables finished in an earlier invocation. */
                        if (e < s_e)
                                goto next;
                        /* Once a table has been dumped in this call, clear
                         * cb->args[2..] before starting the next one --
                         * presumably the per-table resume state used by
                         * fib_table_dump(); confirm against fib_trie.c.
                         */
                        if (dumped)
                                memset(&cb->args[2], 0, sizeof(cb->args) -
                                                 2 * sizeof(cb->args[0]));
                        err = fib_table_dump(tb, skb, cb, &filter);
                        if (err < 0) {
                                /* Something was already queued: return the
                                 * length so the dump continues later.
                                 */
                                if (likely(skb->len))
                                        goto out;

                                /* Nothing queued at all: report the error. */
                                goto out_err;
                        }
                        dumped = 1;
next:
                        e++;
                }
        }
out:
        err = skb->len;
out_err:
        rcu_read_unlock();

        /* Save where to resume on the next invocation. */
        cb->args[1] = e;
        cb->args[0] = h;

        return err;
}
1052 
1053 /* Prepare and feed intra-kernel routing request.
1054  * Really, it should be netlink message, but :-( netlink
1055  * can be not configured, so that we feed it directly
1056  * to fib engine. It is legal, because all events occur
1057  * only when netlink is already locked.
1058  */
1059 static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1060                       struct in_ifaddr *ifa, u32 rt_priority)
1061 {
1062         struct net *net = dev_net(ifa->ifa_dev->dev);
1063         u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1064         struct fib_table *tb;
1065         struct fib_config cfg = {
1066                 .fc_protocol = RTPROT_KERNEL,
1067                 .fc_type = type,
1068                 .fc_dst = dst,
1069                 .fc_dst_len = dst_len,
1070                 .fc_priority = rt_priority,
1071                 .fc_prefsrc = ifa->ifa_local,
1072                 .fc_oif = ifa->ifa_dev->dev->ifindex,
1073                 .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1074                 .fc_nlinfo = {
1075                         .nl_net = net,
1076                 },
1077         };
1078 
1079         if (!tb_id)
1080                 tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1081 
1082         tb = fib_new_table(net, tb_id);
1083         if (!tb)
1084                 return;
1085 
1086         cfg.fc_table = tb->tb_id;
1087 
1088         if (type != RTN_LOCAL)
1089                 cfg.fc_scope = RT_SCOPE_LINK;
1090         else
1091                 cfg.fc_scope = RT_SCOPE_HOST;
1092 
1093         if (cmd == RTM_NEWROUTE)
1094                 fib_table_insert(net, tb, &cfg, NULL);
1095         else
1096                 fib_table_delete(net, tb, &cfg, NULL);
1097 }
1098 
1099 void fib_add_ifaddr(struct in_ifaddr *ifa)
1100 {
1101         struct in_device *in_dev = ifa->ifa_dev;
1102         struct net_device *dev = in_dev->dev;
1103         struct in_ifaddr *prim = ifa;
1104         __be32 mask = ifa->ifa_mask;
1105         __be32 addr = ifa->ifa_local;
1106         __be32 prefix = ifa->ifa_address & mask;
1107 
1108         if (ifa->ifa_flags & IFA_F_SECONDARY) {
1109                 prim = inet_ifa_byprefix(in_dev, prefix, mask);
1110                 if (!prim) {
1111                         pr_warn("%s: bug: prim == NULL\n", __func__);
1112                         return;
1113                 }
1114         }
1115 
1116         fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1117 
1118         if (!(dev->flags & IFF_UP))
1119                 return;
1120 
1121         /* Add broadcast address, if it is explicitly assigned. */
1122         if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
1123                 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1124                           prim, 0);
1125 
1126         if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1127             (prefix != addr || ifa->ifa_prefixlen < 32)) {
1128                 if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1129                         fib_magic(RTM_NEWROUTE,
1130                                   dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1131                                   prefix, ifa->ifa_prefixlen, prim,
1132                                   ifa->ifa_rt_priority);
1133 
1134                 /* Add network specific broadcasts, when it takes a sense */
1135                 if (ifa->ifa_prefixlen < 31) {
1136                         fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
1137                                   prim, 0);
1138                         fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1139                                   32, prim, 0);
1140                 }
1141         }
1142 }
1143 
1144 void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1145 {
1146         __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1147         struct in_device *in_dev = ifa->ifa_dev;
1148         struct net_device *dev = in_dev->dev;
1149 
1150         if (!(dev->flags & IFF_UP) ||
1151             ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1152             ipv4_is_zeronet(prefix) ||
1153             (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1154                 return;
1155 
1156         /* add the new */
1157         fib_magic(RTM_NEWROUTE,
1158                   dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1159                   prefix, ifa->ifa_prefixlen, ifa, new_metric);
1160 
1161         /* delete the old */
1162         fib_magic(RTM_DELROUTE,
1163                   dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1164                   prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1165 }
1166 
/* Delete primary or secondary address.
 * Optionally, on secondary address promotion consider the addresses
 * from subnet iprim as deleted, even if they are in device list.
 * In this case the secondary ifa can be in device list.
 *
 * Removes the kernel routes that were installed for @ifa (prefix route,
 * broadcast routes, local route), but keeps every route that some other
 * address remaining on the device still accounts for.  @iprim may be NULL.
 */
void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
        struct in_device *in_dev = ifa->ifa_dev;
        struct net_device *dev = in_dev->dev;
        struct in_ifaddr *ifa1;
        struct in_ifaddr *prim = ifa, *prim1 = NULL;
        /* Subnet broadcast (host bits all ones) and network address
         * (host bits all zero) derived from @ifa.
         */
        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
        __be32 any = ifa->ifa_address & ifa->ifa_mask;
/* Bits collected in 'ok' during the scan below: a set bit means another
 * address still uses that route, so it must be kept.
 *   LOCAL_OK - the local route for ifa->ifa_local
 *   BRD_OK   - the broadcast route for ifa->ifa_broadcast
 *   BRD0_OK  - the broadcast route for 'any'
 *   BRD1_OK  - the broadcast route for 'brd'
 */
#define LOCAL_OK        1
#define BRD_OK          2
#define BRD0_OK         4
#define BRD1_OK         8
        unsigned int ok = 0;
        int subnet = 0;         /* Primary network */
        int gone = 1;           /* Address is missing */
        int same_prefsrc = 0;   /* Another primary with same IP */

        if (ifa->ifa_flags & IFA_F_SECONDARY) {
                /* Routes of a secondary were installed with the primary
                 * of its subnet as preferred source; find that primary.
                 */
                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
                if (!prim) {
                        /* if the device has been deleted, we don't perform
                         * address promotion
                         */
                        if (!in_dev->dead)
                                pr_warn("%s: bug: prim == NULL\n", __func__);
                        return;
                }
                if (iprim && iprim != prim) {
                        pr_warn("%s: bug: iprim != prim\n", __func__);
                        return;
                }
        } else if (!ipv4_is_zeronet(any) &&
                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
                /* Primary address with a real subnet: drop its prefix
                 * route, unless none was requested for it.
                 */
                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
                        fib_magic(RTM_DELROUTE,
                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
                                  any, ifa->ifa_prefixlen, prim, 0);
                subnet = 1;
        }

        /* On a dead in_device skip the scan entirely: 'ok' stays 0 and
         * 'gone' stays 1, so every route below is deleted.
         */
        if (in_dev->dead)
                goto no_promotions;

        /* Deletion is more complicated than add.
         * We should take care of not to delete too much :-)
         *
         * Scan address list to be sure that addresses are really gone.
         */
        rcu_read_lock();
        in_dev_for_each_ifa_rcu(ifa1, in_dev) {
                if (ifa1 == ifa) {
                        /* promotion, keep the IP */
                        gone = 0;
                        continue;
                }
                /* Ignore IFAs from our subnet */
                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
                    inet_ifa_match(ifa1->ifa_address, iprim))
                        continue;

                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
                        /* Another address from our subnet? */
                        if (ifa1->ifa_mask == prim->ifa_mask &&
                            inet_ifa_match(ifa1->ifa_address, prim))
                                prim1 = prim;
                        else {
                                /* We reached the secondaries, so
                                 * same_prefsrc should be determined.
                                 */
                                if (!same_prefsrc)
                                        continue;
                                /* Search new prim1 if ifa1 is not
                                 * using the current prim1
                                 */
                                if (!prim1 ||
                                    ifa1->ifa_mask != prim1->ifa_mask ||
                                    !inet_ifa_match(ifa1->ifa_address, prim1))
                                        prim1 = inet_ifa_byprefix(in_dev,
                                                        ifa1->ifa_address,
                                                        ifa1->ifa_mask);
                                if (!prim1)
                                        continue;
                                if (prim1->ifa_local != prim->ifa_local)
                                        continue;
                        }
                } else {
                        /* Another primary only matters if it has the same
                         * local IP as our primary.
                         */
                        if (prim->ifa_local != ifa1->ifa_local)
                                continue;
                        prim1 = ifa1;
                        if (prim != prim1)
                                same_prefsrc = 1;
                }
                /* ifa1 shares our preferred source: record which of our
                 * routes it also accounts for.
                 */
                if (ifa->ifa_local == ifa1->ifa_local)
                        ok |= LOCAL_OK;
                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
                        ok |= BRD_OK;
                if (brd == ifa1->ifa_broadcast)
                        ok |= BRD1_OK;
                if (any == ifa1->ifa_broadcast)
                        ok |= BRD0_OK;
                /* primary has network specific broadcasts */
                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;

                        if (!ipv4_is_zeronet(any1)) {
                                if (ifa->ifa_broadcast == brd1 ||
                                    ifa->ifa_broadcast == any1)
                                        ok |= BRD_OK;
                                if (brd == brd1 || brd == any1)
                                        ok |= BRD1_OK;
                                if (any == brd1 || any == any1)
                                        ok |= BRD0_OK;
                        }
                }
        }
        rcu_read_unlock();

no_promotions:
        /* Delete each route for which no remaining user was found. */
        if (!(ok & BRD_OK))
                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
                          prim, 0);
        if (subnet && ifa->ifa_prefixlen < 31) {
                if (!(ok & BRD1_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
                                  prim, 0);
                if (!(ok & BRD0_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
                                  prim, 0);
        }
        if (!(ok & LOCAL_OK)) {
                unsigned int addr_type;

                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);

                /* Check, that this local address finally disappeared. */
                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
                                                     ifa->ifa_local);
                if (gone && addr_type != RTN_LOCAL) {
                        /* And the last, but not the least thing.
                         * We must flush stray FIB entries.
                         *
                         * First of all, we scan fib_info list searching
                         * for stray nexthop entries, then ignite fib_flush.
                         */
                        if (fib_sync_down_addr(dev, ifa->ifa_local))
                                fib_flush(dev_net(dev));
                }
        }
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}
1327 
1328 static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1329 {
1330 
1331         struct fib_result       res;
1332         struct flowi4           fl4 = {
1333                 .flowi4_mark = frn->fl_mark,
1334                 .daddr = frn->fl_addr,
1335                 .flowi4_tos = frn->fl_tos,
1336                 .flowi4_scope = frn->fl_scope,
1337         };
1338         struct fib_table *tb;
1339 
1340         rcu_read_lock();
1341 
1342         tb = fib_get_table(net, frn->tb_id_in);
1343 
1344         frn->err = -ENOENT;
1345         if (tb) {
1346                 local_bh_disable();
1347 
1348                 frn->tb_id = tb->tb_id;
1349                 frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1350 
1351                 if (!frn->err) {
1352                         frn->prefixlen = res.prefixlen;
1353                         frn->nh_sel = res.nh_sel;
1354                         frn->type = res.type;
1355                         frn->scope = res.scope;
1356                 }
1357                 local_bh_enable();
1358         }
1359 
1360         rcu_read_unlock();
1361 }
1362 
1363 static void nl_fib_input(struct sk_buff *skb)
1364 {
1365         struct net *net;
1366         struct fib_result_nl *frn;
1367         struct nlmsghdr *nlh;
1368         u32 portid;
1369 
1370         net = sock_net(skb->sk);
1371         nlh = nlmsg_hdr(skb);
1372         if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1373             skb->len < nlh->nlmsg_len ||
1374             nlmsg_len(nlh) < sizeof(*frn))
1375                 return;
1376 
1377         skb = netlink_skb_clone(skb, GFP_KERNEL);
1378         if (!skb)
1379                 return;
1380         nlh = nlmsg_hdr(skb);
1381 
1382         frn = (struct fib_result_nl *) nlmsg_data(nlh);
1383         nl_fib_lookup(net, frn);
1384 
1385         portid = NETLINK_CB(skb).portid;      /* netlink portid */
1386         NETLINK_CB(skb).portid = 0;        /* from kernel */
1387         NETLINK_CB(skb).dst_group = 0;  /* unicast */
1388         netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
1389 }
1390 
1391 static int __net_init nl_fib_lookup_init(struct net *net)
1392 {
1393         struct sock *sk;
1394         struct netlink_kernel_cfg cfg = {
1395                 .input  = nl_fib_input,
1396         };
1397 
1398         sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1399         if (!sk)
1400                 return -EAFNOSUPPORT;
1401         net->ipv4.fibnl = sk;
1402         return 0;
1403 }
1404 
1405 static void nl_fib_lookup_exit(struct net *net)
1406 {
1407         netlink_kernel_release(net->ipv4.fibnl);
1408         net->ipv4.fibnl = NULL;
1409 }
1410 
1411 static void fib_disable_ip(struct net_device *dev, unsigned long event,
1412                            bool force)
1413 {
1414         if (fib_sync_down_dev(dev, event, force))
1415                 fib_flush(dev_net(dev));
1416         else
1417                 rt_cache_flush(dev_net(dev));
1418         arp_ifdown(dev);
1419 }
1420 
1421 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1422 {
1423         struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1424         struct net_device *dev = ifa->ifa_dev->dev;
1425         struct net *net = dev_net(dev);
1426 
1427         switch (event) {
1428         case NETDEV_UP:
1429                 fib_add_ifaddr(ifa);
1430 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1431                 fib_sync_up(dev, RTNH_F_DEAD);
1432 #endif
1433                 atomic_inc(&net->ipv4.dev_addr_genid);
1434                 rt_cache_flush(dev_net(dev));
1435                 break;
1436         case NETDEV_DOWN:
1437                 fib_del_ifaddr(ifa, NULL);
1438                 atomic_inc(&net->ipv4.dev_addr_genid);
1439                 if (!ifa->ifa_dev->ifa_list) {
1440                         /* Last address was deleted from this interface.
1441                          * Disable IP.
1442                          */
1443                         fib_disable_ip(dev, event, true);
1444                 } else {
1445                         rt_cache_flush(dev_net(dev));
1446                 }
1447                 break;
1448         }
1449         return NOTIFY_DONE;
1450 }
1451 
1452 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1453 {
1454         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1455         struct netdev_notifier_changeupper_info *upper_info = ptr;
1456         struct netdev_notifier_info_ext *info_ext = ptr;
1457         struct in_device *in_dev;
1458         struct net *net = dev_net(dev);
1459         struct in_ifaddr *ifa;
1460         unsigned int flags;
1461 
1462         if (event == NETDEV_UNREGISTER) {
1463                 fib_disable_ip(dev, event, true);
1464                 rt_flush_dev(dev);
1465                 return NOTIFY_DONE;
1466         }
1467 
1468         in_dev = __in_dev_get_rtnl(dev);
1469         if (!in_dev)
1470                 return NOTIFY_DONE;
1471 
1472         switch (event) {
1473         case NETDEV_UP:
1474                 in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1475                         fib_add_ifaddr(ifa);
1476                 }
1477 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1478                 fib_sync_up(dev, RTNH_F_DEAD);
1479 #endif
1480                 atomic_inc(&net->ipv4.dev_addr_genid);
1481                 rt_cache_flush(net);
1482                 break;
1483         case NETDEV_DOWN:
1484                 fib_disable_ip(dev, event, false);
1485                 break;
1486         case NETDEV_CHANGE:
1487                 flags = dev_get_flags(dev);
1488                 if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1489                         fib_sync_up(dev, RTNH_F_LINKDOWN);
1490                 else
1491                         fib_sync_down_dev(dev, event, false);
1492                 rt_cache_flush(net);
1493                 break;
1494         case NETDEV_CHANGEMTU:
1495                 fib_sync_mtu(dev, info_ext->ext.mtu);
1496                 rt_cache_flush(net);
1497                 break;
1498         case NETDEV_CHANGEUPPER:
1499                 upper_info = ptr;
1500                 /* flush all routes if dev is linked to or unlinked from
1501                  * an L3 master device (e.g., VRF)
1502                  */
1503                 if (upper_info->upper_dev &&
1504                     netif_is_l3_master(upper_info->upper_dev))
1505                         fib_disable_ip(dev, NETDEV_DOWN, true);
1506                 break;
1507         }
1508         return NOTIFY_DONE;
1509 }
1510 
/* Notifier block for IPv4 address events; registered with
 * register_inetaddr_notifier() in ip_fib_init().
 */
static struct notifier_block fib_inetaddr_notifier = {
        .notifier_call = fib_inetaddr_event,
};
1514 
/* Notifier block for network device events; registered with
 * register_netdevice_notifier() in ip_fib_init().
 */
static struct notifier_block fib_netdev_notifier = {
        .notifier_call = fib_netdev_event,
};
1518 
1519 static int __net_init ip_fib_net_init(struct net *net)
1520 {
1521         int err;
1522         size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1523 
1524         err = fib4_notifier_init(net);
1525         if (err)
1526                 return err;
1527 
1528         /* Avoid false sharing : Use at least a full cache line */
1529         size = max_t(size_t, size, L1_CACHE_BYTES);
1530 
1531         net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1532         if (!net->ipv4.fib_table_hash) {
1533                 err = -ENOMEM;
1534                 goto err_table_hash_alloc;
1535         }
1536 
1537         err = fib4_rules_init(net);
1538         if (err < 0)
1539                 goto err_rules_init;
1540         return 0;
1541 
1542 err_rules_init:
1543         kfree(net->ipv4.fib_table_hash);
1544 err_table_hash_alloc:
1545         fib4_notifier_exit(net);
1546         return err;
1547 }
1548 
1549 static void ip_fib_net_exit(struct net *net)
1550 {
1551         int i;
1552 
1553         rtnl_lock();
1554 #ifdef CONFIG_IP_MULTIPLE_TABLES
1555         RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1556         RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1557 #endif
1558         /* Destroy the tables in reverse order to guarantee that the
1559          * local table, ID 255, is destroyed before the main table, ID
1560          * 254. This is necessary as the local table may contain
1561          * references to data contained in the main table.
1562          */
1563         for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1564                 struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1565                 struct hlist_node *tmp;
1566                 struct fib_table *tb;
1567 
1568                 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1569                         hlist_del(&tb->tb_hlist);
1570                         fib_table_flush(net, tb, true);
1571                         fib_free_table(tb);
1572                 }
1573         }
1574 
1575 #ifdef CONFIG_IP_MULTIPLE_TABLES
1576         fib4_rules_exit(net);
1577 #endif
1578         rtnl_unlock();
1579         kfree(net->ipv4.fib_table_hash);
1580         fib4_notifier_exit(net);
1581 }
1582 
1583 static int __net_init fib_net_init(struct net *net)
1584 {
1585         int error;
1586 
1587 #ifdef CONFIG_IP_ROUTE_CLASSID
1588         net->ipv4.fib_num_tclassid_users = 0;
1589 #endif
1590         error = ip_fib_net_init(net);
1591         if (error < 0)
1592                 goto out;
1593         error = nl_fib_lookup_init(net);
1594         if (error < 0)
1595                 goto out_nlfl;
1596         error = fib_proc_init(net);
1597         if (error < 0)
1598                 goto out_proc;
1599 out:
1600         return error;
1601 
1602 out_proc:
1603         nl_fib_lookup_exit(net);
1604 out_nlfl:
1605         ip_fib_net_exit(net);
1606         goto out;
1607 }
1608 
/* pernet exit: undo fib_net_init() in reverse order -- proc interface,
 * FIB lookup netlink socket, then the FIB core state.
 */
static void __net_exit fib_net_exit(struct net *net)
{
        fib_proc_exit(net);
        nl_fib_lookup_exit(net);
        ip_fib_net_exit(net);
}
1615 
/* Per-network-namespace init/exit hooks for the IPv4 FIB; registered with
 * register_pernet_subsys() in ip_fib_init().
 */
static struct pernet_operations fib_net_ops = {
        .init = fib_net_init,
        .exit = fib_net_exit,
};
1620 
/* Boot-time initialisation of the IPv4 FIB frontend.
 *
 * Initialises the FIB trie, registers the pernet operations, hooks the
 * netdevice and inetaddr notifiers, and registers the PF_INET rtnetlink
 * handlers for RTM_NEWROUTE, RTM_DELROUTE and RTM_GETROUTE (dump only).
 *
 * NOTE(review): none of the registration return values are checked here.
 */
void __init ip_fib_init(void)
{
        fib_trie_init();

        register_pernet_subsys(&fib_net_ops);

        register_netdevice_notifier(&fib_netdev_notifier);
        register_inetaddr_notifier(&fib_inetaddr_notifier);

        /* RTM_GETROUTE has no doit handler here, only the dump callback. */
        rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
        rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
        rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
}

/* [<][>][^][v][top][bottom][index][help] */