root/net/ipv4/nexthop.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. nh_dev_hashfn
  2. nexthop_devhash_add
  3. nexthop_free_mpath
  4. nexthop_free_single
  5. nexthop_free_rcu
  6. nexthop_alloc
  7. nexthop_grp_alloc
  8. nh_base_seq_inc
  9. nexthop_find_by_id
  10. nh_find_unused_id
  11. nla_put_nh_group
  12. nh_fill_node
  13. nh_nlmsg_size_grp
  14. nh_nlmsg_size_single
  15. nh_nlmsg_size
  16. nexthop_notify
  17. valid_group_nh
  18. nh_check_attr_group
  19. ipv6_good_nh
  20. ipv4_good_nh
  21. nexthop_select_path
  22. nexthop_for_each_fib6_nh
  23. check_src_addr
  24. fib6_check_nexthop
  25. fib6_check_nh_list
  26. nexthop_check_scope
  27. fib_check_nexthop
  28. fib_check_nh_list
  29. nh_group_rebalance
  30. remove_nh_grp_entry
  31. remove_nexthop_from_groups
  32. remove_nexthop_group
  33. __remove_nexthop_fib
  34. __remove_nexthop
  35. remove_nexthop
  36. nh_rt_cache_flush
  37. replace_nexthop_grp
  38. replace_nexthop_single
  39. __nexthop_replace_notify
  40. nexthop_replace_notify
  41. replace_nexthop
  42. insert_nexthop
  43. nexthop_flush_dev
  44. flush_all_nexthops
  45. nexthop_create_group
  46. nh_create_ipv4
  47. nh_create_ipv6
  48. nexthop_create
  49. nexthop_add
  50. rtm_to_nh_config
  51. rtm_new_nexthop
  52. nh_valid_get_del_req
  53. rtm_del_nexthop
  54. rtm_get_nexthop
  55. nh_dump_filtered
  56. nh_valid_dump_req
  57. rtm_dump_nexthop
  58. nexthop_sync_mtu
  59. nh_netdev_event
  60. nexthop_net_exit
  61. nexthop_net_init
  62. nexthop_init

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* Generic nexthop implementation
   3  *
   4  * Copyright (c) 2017-19 Cumulus Networks
   5  * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
   6  */
   7 
   8 #include <linux/nexthop.h>
   9 #include <linux/rtnetlink.h>
  10 #include <linux/slab.h>
  11 #include <net/arp.h>
  12 #include <net/ipv6_stubs.h>
  13 #include <net/lwtunnel.h>
  14 #include <net/ndisc.h>
  15 #include <net/nexthop.h>
  16 #include <net/route.h>
  17 #include <net/sock.h>
  18 
  19 static void remove_nexthop(struct net *net, struct nexthop *nh,
  20                            struct nl_info *nlinfo);
  21 
  22 #define NH_DEV_HASHBITS  8
  23 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
  24 
  25 static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
  26         [NHA_UNSPEC]            = { .strict_start_type = NHA_UNSPEC + 1 },
  27         [NHA_ID]                = { .type = NLA_U32 },
  28         [NHA_GROUP]             = { .type = NLA_BINARY },
  29         [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
  30         [NHA_BLACKHOLE]         = { .type = NLA_FLAG },
  31         [NHA_OIF]               = { .type = NLA_U32 },
  32         [NHA_GATEWAY]           = { .type = NLA_BINARY },
  33         [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
  34         [NHA_ENCAP]             = { .type = NLA_NESTED },
  35         [NHA_GROUPS]            = { .type = NLA_FLAG },
  36         [NHA_MASTER]            = { .type = NLA_U32 },
  37 };
  38 
  39 static unsigned int nh_dev_hashfn(unsigned int val)
  40 {
  41         unsigned int mask = NH_DEV_HASHSIZE - 1;
  42 
  43         return (val ^
  44                 (val >> NH_DEV_HASHBITS) ^
  45                 (val >> (NH_DEV_HASHBITS * 2))) & mask;
  46 }
  47 
  48 static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
  49 {
  50         struct net_device *dev = nhi->fib_nhc.nhc_dev;
  51         struct hlist_head *head;
  52         unsigned int hash;
  53 
  54         WARN_ON(!dev);
  55 
  56         hash = nh_dev_hashfn(dev->ifindex);
  57         head = &net->nexthop.devhash[hash];
  58         hlist_add_head(&nhi->dev_hash, head);
  59 }
  60 
  61 static void nexthop_free_mpath(struct nexthop *nh)
  62 {
  63         struct nh_group *nhg;
  64         int i;
  65 
  66         nhg = rcu_dereference_raw(nh->nh_grp);
  67         for (i = 0; i < nhg->num_nh; ++i) {
  68                 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
  69 
  70                 WARN_ON(!list_empty(&nhge->nh_list));
  71                 nexthop_put(nhge->nh);
  72         }
  73 
  74         WARN_ON(nhg->spare == nhg);
  75 
  76         kfree(nhg->spare);
  77         kfree(nhg);
  78 }
  79 
  80 static void nexthop_free_single(struct nexthop *nh)
  81 {
  82         struct nh_info *nhi;
  83 
  84         nhi = rcu_dereference_raw(nh->nh_info);
  85         switch (nhi->family) {
  86         case AF_INET:
  87                 fib_nh_release(nh->net, &nhi->fib_nh);
  88                 break;
  89         case AF_INET6:
  90                 ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
  91                 break;
  92         }
  93         kfree(nhi);
  94 }
  95 
  96 void nexthop_free_rcu(struct rcu_head *head)
  97 {
  98         struct nexthop *nh = container_of(head, struct nexthop, rcu);
  99 
 100         if (nh->is_group)
 101                 nexthop_free_mpath(nh);
 102         else
 103                 nexthop_free_single(nh);
 104 
 105         kfree(nh);
 106 }
 107 EXPORT_SYMBOL_GPL(nexthop_free_rcu);
 108 
 109 static struct nexthop *nexthop_alloc(void)
 110 {
 111         struct nexthop *nh;
 112 
 113         nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 114         if (nh) {
 115                 INIT_LIST_HEAD(&nh->fi_list);
 116                 INIT_LIST_HEAD(&nh->f6i_list);
 117                 INIT_LIST_HEAD(&nh->grp_list);
 118         }
 119         return nh;
 120 }
 121 
 122 static struct nh_group *nexthop_grp_alloc(u16 num_nh)
 123 {
 124         size_t sz = offsetof(struct nexthop, nh_grp)
 125                     + sizeof(struct nh_group)
 126                     + sizeof(struct nh_grp_entry) * num_nh;
 127         struct nh_group *nhg;
 128 
 129         nhg = kzalloc(sz, GFP_KERNEL);
 130         if (nhg)
 131                 nhg->num_nh = num_nh;
 132 
 133         return nhg;
 134 }
 135 
 136 static void nh_base_seq_inc(struct net *net)
 137 {
 138         while (++net->nexthop.seq == 0)
 139                 ;
 140 }
 141 
 142 /* no reference taken; rcu lock or rtnl must be held */
 143 struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
 144 {
 145         struct rb_node **pp, *parent = NULL, *next;
 146 
 147         pp = &net->nexthop.rb_root.rb_node;
 148         while (1) {
 149                 struct nexthop *nh;
 150 
 151                 next = rcu_dereference_raw(*pp);
 152                 if (!next)
 153                         break;
 154                 parent = next;
 155 
 156                 nh = rb_entry(parent, struct nexthop, rb_node);
 157                 if (id < nh->id)
 158                         pp = &next->rb_left;
 159                 else if (id > nh->id)
 160                         pp = &next->rb_right;
 161                 else
 162                         return nh;
 163         }
 164         return NULL;
 165 }
 166 EXPORT_SYMBOL_GPL(nexthop_find_by_id);
 167 
 168 /* used for auto id allocation; called with rtnl held */
 169 static u32 nh_find_unused_id(struct net *net)
 170 {
 171         u32 id_start = net->nexthop.last_id_allocated;
 172 
 173         while (1) {
 174                 net->nexthop.last_id_allocated++;
 175                 if (net->nexthop.last_id_allocated == id_start)
 176                         break;
 177 
 178                 if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
 179                         return net->nexthop.last_id_allocated;
 180         }
 181         return 0;
 182 }
 183 
 184 static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg)
 185 {
 186         struct nexthop_grp *p;
 187         size_t len = nhg->num_nh * sizeof(*p);
 188         struct nlattr *nla;
 189         u16 group_type = 0;
 190         int i;
 191 
 192         if (nhg->mpath)
 193                 group_type = NEXTHOP_GRP_TYPE_MPATH;
 194 
 195         if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
 196                 goto nla_put_failure;
 197 
 198         nla = nla_reserve(skb, NHA_GROUP, len);
 199         if (!nla)
 200                 goto nla_put_failure;
 201 
 202         p = nla_data(nla);
 203         for (i = 0; i < nhg->num_nh; ++i) {
 204                 p->id = nhg->nh_entries[i].nh->id;
 205                 p->weight = nhg->nh_entries[i].weight - 1;
 206                 p += 1;
 207         }
 208 
 209         return 0;
 210 
 211 nla_put_failure:
 212         return -EMSGSIZE;
 213 }
 214 
 215 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 216                         int event, u32 portid, u32 seq, unsigned int nlflags)
 217 {
 218         struct fib6_nh *fib6_nh;
 219         struct fib_nh *fib_nh;
 220         struct nlmsghdr *nlh;
 221         struct nh_info *nhi;
 222         struct nhmsg *nhm;
 223 
 224         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
 225         if (!nlh)
 226                 return -EMSGSIZE;
 227 
 228         nhm = nlmsg_data(nlh);
 229         nhm->nh_family = AF_UNSPEC;
 230         nhm->nh_flags = nh->nh_flags;
 231         nhm->nh_protocol = nh->protocol;
 232         nhm->nh_scope = 0;
 233         nhm->resvd = 0;
 234 
 235         if (nla_put_u32(skb, NHA_ID, nh->id))
 236                 goto nla_put_failure;
 237 
 238         if (nh->is_group) {
 239                 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 240 
 241                 if (nla_put_nh_group(skb, nhg))
 242                         goto nla_put_failure;
 243                 goto out;
 244         }
 245 
 246         nhi = rtnl_dereference(nh->nh_info);
 247         nhm->nh_family = nhi->family;
 248         if (nhi->reject_nh) {
 249                 if (nla_put_flag(skb, NHA_BLACKHOLE))
 250                         goto nla_put_failure;
 251                 goto out;
 252         } else {
 253                 const struct net_device *dev;
 254 
 255                 dev = nhi->fib_nhc.nhc_dev;
 256                 if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
 257                         goto nla_put_failure;
 258         }
 259 
 260         nhm->nh_scope = nhi->fib_nhc.nhc_scope;
 261         switch (nhi->family) {
 262         case AF_INET:
 263                 fib_nh = &nhi->fib_nh;
 264                 if (fib_nh->fib_nh_gw_family &&
 265                     nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
 266                         goto nla_put_failure;
 267                 break;
 268 
 269         case AF_INET6:
 270                 fib6_nh = &nhi->fib6_nh;
 271                 if (fib6_nh->fib_nh_gw_family &&
 272                     nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
 273                         goto nla_put_failure;
 274                 break;
 275         }
 276 
 277         if (nhi->fib_nhc.nhc_lwtstate &&
 278             lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
 279                                 NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
 280                 goto nla_put_failure;
 281 
 282 out:
 283         nlmsg_end(skb, nlh);
 284         return 0;
 285 
 286 nla_put_failure:
 287         nlmsg_cancel(skb, nlh);
 288         return -EMSGSIZE;
 289 }
 290 
 291 static size_t nh_nlmsg_size_grp(struct nexthop *nh)
 292 {
 293         struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 294         size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
 295 
 296         return nla_total_size(sz) +
 297                nla_total_size(2);  /* NHA_GROUP_TYPE */
 298 }
 299 
 300 static size_t nh_nlmsg_size_single(struct nexthop *nh)
 301 {
 302         struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 303         size_t sz;
 304 
 305         /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
 306          * are mutually exclusive
 307          */
 308         sz = nla_total_size(4);  /* NHA_OIF */
 309 
 310         switch (nhi->family) {
 311         case AF_INET:
 312                 if (nhi->fib_nh.fib_nh_gw_family)
 313                         sz += nla_total_size(4);  /* NHA_GATEWAY */
 314                 break;
 315 
 316         case AF_INET6:
 317                 /* NHA_GATEWAY */
 318                 if (nhi->fib6_nh.fib_nh_gw_family)
 319                         sz += nla_total_size(sizeof(const struct in6_addr));
 320                 break;
 321         }
 322 
 323         if (nhi->fib_nhc.nhc_lwtstate) {
 324                 sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
 325                 sz += nla_total_size(2);  /* NHA_ENCAP_TYPE */
 326         }
 327 
 328         return sz;
 329 }
 330 
 331 static size_t nh_nlmsg_size(struct nexthop *nh)
 332 {
 333         size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
 334 
 335         sz += nla_total_size(4); /* NHA_ID */
 336 
 337         if (nh->is_group)
 338                 sz += nh_nlmsg_size_grp(nh);
 339         else
 340                 sz += nh_nlmsg_size_single(nh);
 341 
 342         return sz;
 343 }
 344 
 345 static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 346 {
 347         unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0;
 348         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 349         struct sk_buff *skb;
 350         int err = -ENOBUFS;
 351 
 352         skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
 353         if (!skb)
 354                 goto errout;
 355 
 356         err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags);
 357         if (err < 0) {
 358                 /* -EMSGSIZE implies BUG in nh_nlmsg_size() */
 359                 WARN_ON(err == -EMSGSIZE);
 360                 kfree_skb(skb);
 361                 goto errout;
 362         }
 363 
 364         rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
 365                     info->nlh, gfp_any());
 366         return;
 367 errout:
 368         if (err < 0)
 369                 rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
 370 }
 371 
 372 static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
 373                            struct netlink_ext_ack *extack)
 374 {
 375         if (nh->is_group) {
 376                 struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 377 
 378                 /* nested multipath (group within a group) is not
 379                  * supported
 380                  */
 381                 if (nhg->mpath) {
 382                         NL_SET_ERR_MSG(extack,
 383                                        "Multipath group can not be a nexthop within a group");
 384                         return false;
 385                 }
 386         } else {
 387                 struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 388 
 389                 if (nhi->reject_nh && npaths > 1) {
 390                         NL_SET_ERR_MSG(extack,
 391                                        "Blackhole nexthop can not be used in a group with more than 1 path");
 392                         return false;
 393                 }
 394         }
 395 
 396         return true;
 397 }
 398 
 399 static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 400                                struct netlink_ext_ack *extack)
 401 {
 402         unsigned int len = nla_len(tb[NHA_GROUP]);
 403         struct nexthop_grp *nhg;
 404         unsigned int i, j;
 405 
 406         if (len & (sizeof(struct nexthop_grp) - 1)) {
 407                 NL_SET_ERR_MSG(extack,
 408                                "Invalid length for nexthop group attribute");
 409                 return -EINVAL;
 410         }
 411 
 412         /* convert len to number of nexthop ids */
 413         len /= sizeof(*nhg);
 414 
 415         nhg = nla_data(tb[NHA_GROUP]);
 416         for (i = 0; i < len; ++i) {
 417                 if (nhg[i].resvd1 || nhg[i].resvd2) {
 418                         NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
 419                         return -EINVAL;
 420                 }
 421                 if (nhg[i].weight > 254) {
 422                         NL_SET_ERR_MSG(extack, "Invalid value for weight");
 423                         return -EINVAL;
 424                 }
 425                 for (j = i + 1; j < len; ++j) {
 426                         if (nhg[i].id == nhg[j].id) {
 427                                 NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
 428                                 return -EINVAL;
 429                         }
 430                 }
 431         }
 432 
 433         nhg = nla_data(tb[NHA_GROUP]);
 434         for (i = 0; i < len; ++i) {
 435                 struct nexthop *nh;
 436 
 437                 nh = nexthop_find_by_id(net, nhg[i].id);
 438                 if (!nh) {
 439                         NL_SET_ERR_MSG(extack, "Invalid nexthop id");
 440                         return -EINVAL;
 441                 }
 442                 if (!valid_group_nh(nh, len, extack))
 443                         return -EINVAL;
 444         }
 445         for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
 446                 if (!tb[i])
 447                         continue;
 448 
 449                 NL_SET_ERR_MSG(extack,
 450                                "No other attributes can be set in nexthop groups");
 451                 return -EINVAL;
 452         }
 453 
 454         return 0;
 455 }
 456 
 457 static bool ipv6_good_nh(const struct fib6_nh *nh)
 458 {
 459         int state = NUD_REACHABLE;
 460         struct neighbour *n;
 461 
 462         rcu_read_lock_bh();
 463 
 464         n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 465         if (n)
 466                 state = n->nud_state;
 467 
 468         rcu_read_unlock_bh();
 469 
 470         return !!(state & NUD_VALID);
 471 }
 472 
 473 static bool ipv4_good_nh(const struct fib_nh *nh)
 474 {
 475         int state = NUD_REACHABLE;
 476         struct neighbour *n;
 477 
 478         rcu_read_lock_bh();
 479 
 480         n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 481                                       (__force u32)nh->fib_nh_gw4);
 482         if (n)
 483                 state = n->nud_state;
 484 
 485         rcu_read_unlock_bh();
 486 
 487         return !!(state & NUD_VALID);
 488 }
 489 
 490 struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
 491 {
 492         struct nexthop *rc = NULL;
 493         struct nh_group *nhg;
 494         int i;
 495 
 496         if (!nh->is_group)
 497                 return nh;
 498 
 499         nhg = rcu_dereference(nh->nh_grp);
 500         for (i = 0; i < nhg->num_nh; ++i) {
 501                 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 502                 struct nh_info *nhi;
 503 
 504                 if (hash > atomic_read(&nhge->upper_bound))
 505                         continue;
 506 
 507                 /* nexthops always check if it is good and does
 508                  * not rely on a sysctl for this behavior
 509                  */
 510                 nhi = rcu_dereference(nhge->nh->nh_info);
 511                 switch (nhi->family) {
 512                 case AF_INET:
 513                         if (ipv4_good_nh(&nhi->fib_nh))
 514                                 return nhge->nh;
 515                         break;
 516                 case AF_INET6:
 517                         if (ipv6_good_nh(&nhi->fib6_nh))
 518                                 return nhge->nh;
 519                         break;
 520                 }
 521 
 522                 if (!rc)
 523                         rc = nhge->nh;
 524         }
 525 
 526         return rc;
 527 }
 528 EXPORT_SYMBOL_GPL(nexthop_select_path);
 529 
 530 int nexthop_for_each_fib6_nh(struct nexthop *nh,
 531                              int (*cb)(struct fib6_nh *nh, void *arg),
 532                              void *arg)
 533 {
 534         struct nh_info *nhi;
 535         int err;
 536 
 537         if (nh->is_group) {
 538                 struct nh_group *nhg;
 539                 int i;
 540 
 541                 nhg = rcu_dereference_rtnl(nh->nh_grp);
 542                 for (i = 0; i < nhg->num_nh; i++) {
 543                         struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 544 
 545                         nhi = rcu_dereference_rtnl(nhge->nh->nh_info);
 546                         err = cb(&nhi->fib6_nh, arg);
 547                         if (err)
 548                                 return err;
 549                 }
 550         } else {
 551                 nhi = rcu_dereference_rtnl(nh->nh_info);
 552                 err = cb(&nhi->fib6_nh, arg);
 553                 if (err)
 554                         return err;
 555         }
 556 
 557         return 0;
 558 }
 559 EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
 560 
 561 static int check_src_addr(const struct in6_addr *saddr,
 562                           struct netlink_ext_ack *extack)
 563 {
 564         if (!ipv6_addr_any(saddr)) {
 565                 NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
 566                 return -EINVAL;
 567         }
 568         return 0;
 569 }
 570 
 571 int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
 572                        struct netlink_ext_ack *extack)
 573 {
 574         struct nh_info *nhi;
 575 
 576         /* fib6_src is unique to a fib6_info and limits the ability to cache
 577          * routes in fib6_nh within a nexthop that is potentially shared
 578          * across multiple fib entries. If the config wants to use source
 579          * routing it can not use nexthop objects. mlxsw also does not allow
 580          * fib6_src on routes.
 581          */
 582         if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
 583                 return -EINVAL;
 584 
 585         if (nh->is_group) {
 586                 struct nh_group *nhg;
 587 
 588                 nhg = rtnl_dereference(nh->nh_grp);
 589                 if (nhg->has_v4)
 590                         goto no_v4_nh;
 591         } else {
 592                 nhi = rtnl_dereference(nh->nh_info);
 593                 if (nhi->family == AF_INET)
 594                         goto no_v4_nh;
 595         }
 596 
 597         return 0;
 598 no_v4_nh:
 599         NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
 600         return -EINVAL;
 601 }
 602 EXPORT_SYMBOL_GPL(fib6_check_nexthop);
 603 
 604 /* if existing nexthop has ipv6 routes linked to it, need
 605  * to verify this new spec works with ipv6
 606  */
 607 static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
 608                               struct netlink_ext_ack *extack)
 609 {
 610         struct fib6_info *f6i;
 611 
 612         if (list_empty(&old->f6i_list))
 613                 return 0;
 614 
 615         list_for_each_entry(f6i, &old->f6i_list, nh_list) {
 616                 if (check_src_addr(&f6i->fib6_src.addr, extack) < 0)
 617                         return -EINVAL;
 618         }
 619 
 620         return fib6_check_nexthop(new, NULL, extack);
 621 }
 622 
 623 static int nexthop_check_scope(struct nexthop *nh, u8 scope,
 624                                struct netlink_ext_ack *extack)
 625 {
 626         struct nh_info *nhi;
 627 
 628         nhi = rtnl_dereference(nh->nh_info);
 629         if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) {
 630                 NL_SET_ERR_MSG(extack,
 631                                "Route with host scope can not have a gateway");
 632                 return -EINVAL;
 633         }
 634 
 635         if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) {
 636                 NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
 637                 return -EINVAL;
 638         }
 639 
 640         return 0;
 641 }
 642 
 643 /* Invoked by fib add code to verify nexthop by id is ok with
 644  * config for prefix; parts of fib_check_nh not done when nexthop
 645  * object is used.
 646  */
 647 int fib_check_nexthop(struct nexthop *nh, u8 scope,
 648                       struct netlink_ext_ack *extack)
 649 {
 650         int err = 0;
 651 
 652         if (nh->is_group) {
 653                 struct nh_group *nhg;
 654 
 655                 if (scope == RT_SCOPE_HOST) {
 656                         NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
 657                         err = -EINVAL;
 658                         goto out;
 659                 }
 660 
 661                 nhg = rtnl_dereference(nh->nh_grp);
 662                 /* all nexthops in a group have the same scope */
 663                 err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack);
 664         } else {
 665                 err = nexthop_check_scope(nh, scope, extack);
 666         }
 667 out:
 668         return err;
 669 }
 670 
 671 static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
 672                              struct netlink_ext_ack *extack)
 673 {
 674         struct fib_info *fi;
 675 
 676         list_for_each_entry(fi, &old->fi_list, nh_list) {
 677                 int err;
 678 
 679                 err = fib_check_nexthop(new, fi->fib_scope, extack);
 680                 if (err)
 681                         return err;
 682         }
 683         return 0;
 684 }
 685 
 686 static void nh_group_rebalance(struct nh_group *nhg)
 687 {
 688         int total = 0;
 689         int w = 0;
 690         int i;
 691 
 692         for (i = 0; i < nhg->num_nh; ++i)
 693                 total += nhg->nh_entries[i].weight;
 694 
 695         for (i = 0; i < nhg->num_nh; ++i) {
 696                 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 697                 int upper_bound;
 698 
 699                 w += nhge->weight;
 700                 upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
 701                 atomic_set(&nhge->upper_bound, upper_bound);
 702         }
 703 }
 704 
 705 static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 706                                 struct nl_info *nlinfo)
 707 {
 708         struct nh_grp_entry *nhges, *new_nhges;
 709         struct nexthop *nhp = nhge->nh_parent;
 710         struct nexthop *nh = nhge->nh;
 711         struct nh_group *nhg, *newg;
 712         int i, j;
 713 
 714         WARN_ON(!nh);
 715 
 716         nhg = rtnl_dereference(nhp->nh_grp);
 717         newg = nhg->spare;
 718 
 719         /* last entry, keep it visible and remove the parent */
 720         if (nhg->num_nh == 1) {
 721                 remove_nexthop(net, nhp, nlinfo);
 722                 return;
 723         }
 724 
 725         newg->has_v4 = nhg->has_v4;
 726         newg->mpath = nhg->mpath;
 727         newg->num_nh = nhg->num_nh;
 728 
 729         /* copy old entries to new except the one getting removed */
 730         nhges = nhg->nh_entries;
 731         new_nhges = newg->nh_entries;
 732         for (i = 0, j = 0; i < nhg->num_nh; ++i) {
 733                 /* current nexthop getting removed */
 734                 if (nhg->nh_entries[i].nh == nh) {
 735                         newg->num_nh--;
 736                         continue;
 737                 }
 738 
 739                 list_del(&nhges[i].nh_list);
 740                 new_nhges[j].nh_parent = nhges[i].nh_parent;
 741                 new_nhges[j].nh = nhges[i].nh;
 742                 new_nhges[j].weight = nhges[i].weight;
 743                 list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
 744                 j++;
 745         }
 746 
 747         nh_group_rebalance(newg);
 748         rcu_assign_pointer(nhp->nh_grp, newg);
 749 
 750         list_del(&nhge->nh_list);
 751         nexthop_put(nhge->nh);
 752 
 753         if (nlinfo)
 754                 nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
 755 }
 756 
 757 static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 758                                        struct nl_info *nlinfo)
 759 {
 760         struct nh_grp_entry *nhge, *tmp;
 761 
 762         list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
 763                 remove_nh_grp_entry(net, nhge, nlinfo);
 764 
 765         /* make sure all see the newly published array before releasing rtnl */
 766         synchronize_rcu();
 767 }
 768 
 769 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 770 {
 771         struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
 772         int i, num_nh = nhg->num_nh;
 773 
 774         for (i = 0; i < num_nh; ++i) {
 775                 struct nh_grp_entry *nhge = &nhg->nh_entries[i];
 776 
 777                 if (WARN_ON(!nhge->nh))
 778                         continue;
 779 
 780                 list_del_init(&nhge->nh_list);
 781         }
 782 }
 783 
 784 /* not called for nexthop replace */
 785 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 786 {
 787         struct fib6_info *f6i, *tmp;
 788         bool do_flush = false;
 789         struct fib_info *fi;
 790 
 791         list_for_each_entry(fi, &nh->fi_list, nh_list) {
 792                 fi->fib_flags |= RTNH_F_DEAD;
 793                 do_flush = true;
 794         }
 795         if (do_flush)
 796                 fib_flush(net);
 797 
 798         /* ip6_del_rt removes the entry from this list hence the _safe */
 799         list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
 800                 /* __ip6_del_rt does a release, so do a hold here */
 801                 fib6_info_hold(f6i);
 802                 ipv6_stub->ip6_del_rt(net, f6i);
 803         }
 804 }
 805 
 806 static void __remove_nexthop(struct net *net, struct nexthop *nh,
 807                              struct nl_info *nlinfo)
 808 {
 809         __remove_nexthop_fib(net, nh);
 810 
 811         if (nh->is_group) {
 812                 remove_nexthop_group(nh, nlinfo);
 813         } else {
 814                 struct nh_info *nhi;
 815 
 816                 nhi = rtnl_dereference(nh->nh_info);
 817                 if (nhi->fib_nhc.nhc_dev)
 818                         hlist_del(&nhi->dev_hash);
 819 
 820                 remove_nexthop_from_groups(net, nh, nlinfo);
 821         }
 822 }
 823 
 824 static void remove_nexthop(struct net *net, struct nexthop *nh,
 825                            struct nl_info *nlinfo)
 826 {
 827         /* remove from the tree */
 828         rb_erase(&nh->rb_node, &net->nexthop.rb_root);
 829 
 830         if (nlinfo)
 831                 nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 832 
 833         __remove_nexthop(net, nh, nlinfo);
 834         nh_base_seq_inc(net);
 835 
 836         nexthop_put(nh);
 837 }
 838 
 839 /* if any FIB entries reference this nexthop, any dst entries
 840  * need to be regenerated
 841  */
 842 static void nh_rt_cache_flush(struct net *net, struct nexthop *nh)
 843 {
 844         struct fib6_info *f6i;
 845 
 846         if (!list_empty(&nh->fi_list))
 847                 rt_cache_flush(net);
 848 
 849         list_for_each_entry(f6i, &nh->f6i_list, nh_list)
 850                 ipv6_stub->fib6_update_sernum(net, f6i);
 851 }
 852 
 853 static int replace_nexthop_grp(struct net *net, struct nexthop *old,
 854                                struct nexthop *new,
 855                                struct netlink_ext_ack *extack)
 856 {
 857         struct nh_group *oldg, *newg;
 858         int i;
 859 
 860         if (!new->is_group) {
 861                 NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
 862                 return -EINVAL;
 863         }
 864 
 865         oldg = rtnl_dereference(old->nh_grp);
 866         newg = rtnl_dereference(new->nh_grp);
 867 
 868         /* update parents - used by nexthop code for cleanup */
 869         for (i = 0; i < newg->num_nh; i++)
 870                 newg->nh_entries[i].nh_parent = old;
 871 
 872         rcu_assign_pointer(old->nh_grp, newg);
 873 
 874         for (i = 0; i < oldg->num_nh; i++)
 875                 oldg->nh_entries[i].nh_parent = new;
 876 
 877         rcu_assign_pointer(new->nh_grp, oldg);
 878 
 879         return 0;
 880 }
 881 
 882 static int replace_nexthop_single(struct net *net, struct nexthop *old,
 883                                   struct nexthop *new,
 884                                   struct netlink_ext_ack *extack)
 885 {
 886         struct nh_info *oldi, *newi;
 887 
 888         if (new->is_group) {
 889                 NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
 890                 return -EINVAL;
 891         }
 892 
 893         oldi = rtnl_dereference(old->nh_info);
 894         newi = rtnl_dereference(new->nh_info);
 895 
 896         newi->nh_parent = old;
 897         oldi->nh_parent = new;
 898 
 899         old->protocol = new->protocol;
 900         old->nh_flags = new->nh_flags;
 901 
 902         rcu_assign_pointer(old->nh_info, newi);
 903         rcu_assign_pointer(new->nh_info, oldi);
 904 
 905         return 0;
 906 }
 907 
 908 static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
 909                                      struct nl_info *info)
 910 {
 911         struct fib6_info *f6i;
 912 
 913         if (!list_empty(&nh->fi_list)) {
 914                 struct fib_info *fi;
 915 
 916                 /* expectation is a few fib_info per nexthop and then
 917                  * a lot of routes per fib_info. So mark the fib_info
 918                  * and then walk the fib tables once
 919                  */
 920                 list_for_each_entry(fi, &nh->fi_list, nh_list)
 921                         fi->nh_updated = true;
 922 
 923                 fib_info_notify_update(net, info);
 924 
 925                 list_for_each_entry(fi, &nh->fi_list, nh_list)
 926                         fi->nh_updated = false;
 927         }
 928 
 929         list_for_each_entry(f6i, &nh->f6i_list, nh_list)
 930                 ipv6_stub->fib6_rt_update(net, f6i, info);
 931 }
 932 
 933 /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 934  * linked to this nexthop and for all groups that the nexthop
 935  * is a member of
 936  */
 937 static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
 938                                    struct nl_info *info)
 939 {
 940         struct nh_grp_entry *nhge;
 941 
 942         __nexthop_replace_notify(net, nh, info);
 943 
 944         list_for_each_entry(nhge, &nh->grp_list, nh_list)
 945                 __nexthop_replace_notify(net, nhge->nh_parent, info);
 946 }
 947 
 948 static int replace_nexthop(struct net *net, struct nexthop *old,
 949                            struct nexthop *new, struct netlink_ext_ack *extack)
 950 {
 951         bool new_is_reject = false;
 952         struct nh_grp_entry *nhge;
 953         int err;
 954 
 955         /* check that existing FIB entries are ok with the
 956          * new nexthop definition
 957          */
 958         err = fib_check_nh_list(old, new, extack);
 959         if (err)
 960                 return err;
 961 
 962         err = fib6_check_nh_list(old, new, extack);
 963         if (err)
 964                 return err;
 965 
 966         if (!new->is_group) {
 967                 struct nh_info *nhi = rtnl_dereference(new->nh_info);
 968 
 969                 new_is_reject = nhi->reject_nh;
 970         }
 971 
 972         list_for_each_entry(nhge, &old->grp_list, nh_list) {
 973                 /* if new nexthop is a blackhole, any groups using this
 974                  * nexthop cannot have more than 1 path
 975                  */
 976                 if (new_is_reject &&
 977                     nexthop_num_path(nhge->nh_parent) > 1) {
 978                         NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
 979                         return -EINVAL;
 980                 }
 981 
 982                 err = fib_check_nh_list(nhge->nh_parent, new, extack);
 983                 if (err)
 984                         return err;
 985 
 986                 err = fib6_check_nh_list(nhge->nh_parent, new, extack);
 987                 if (err)
 988                         return err;
 989         }
 990 
 991         if (old->is_group)
 992                 err = replace_nexthop_grp(net, old, new, extack);
 993         else
 994                 err = replace_nexthop_single(net, old, new, extack);
 995 
 996         if (!err) {
 997                 nh_rt_cache_flush(net, old);
 998 
 999                 __remove_nexthop(net, new, NULL);
1000                 nexthop_put(new);
1001         }
1002 
1003         return err;
1004 }
1005 
1006 /* called with rtnl_lock held */
1007 static int insert_nexthop(struct net *net, struct nexthop *new_nh,
1008                           struct nh_config *cfg, struct netlink_ext_ack *extack)
1009 {
1010         struct rb_node **pp, *parent = NULL, *next;
1011         struct rb_root *root = &net->nexthop.rb_root;
1012         bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
1013         bool create = !!(cfg->nlflags & NLM_F_CREATE);
1014         u32 new_id = new_nh->id;
1015         int replace_notify = 0;
1016         int rc = -EEXIST;
1017 
1018         pp = &root->rb_node;
1019         while (1) {
1020                 struct nexthop *nh;
1021 
1022                 next = rtnl_dereference(*pp);
1023                 if (!next)
1024                         break;
1025 
1026                 parent = next;
1027 
1028                 nh = rb_entry(parent, struct nexthop, rb_node);
1029                 if (new_id < nh->id) {
1030                         pp = &next->rb_left;
1031                 } else if (new_id > nh->id) {
1032                         pp = &next->rb_right;
1033                 } else if (replace) {
1034                         rc = replace_nexthop(net, nh, new_nh, extack);
1035                         if (!rc) {
1036                                 new_nh = nh; /* send notification with old nh */
1037                                 replace_notify = 1;
1038                         }
1039                         goto out;
1040                 } else {
1041                         /* id already exists and not a replace */
1042                         goto out;
1043                 }
1044         }
1045 
1046         if (replace && !create) {
1047                 NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
1048                 rc = -ENOENT;
1049                 goto out;
1050         }
1051 
1052         rb_link_node_rcu(&new_nh->rb_node, parent, pp);
1053         rb_insert_color(&new_nh->rb_node, root);
1054         rc = 0;
1055 out:
1056         if (!rc) {
1057                 nh_base_seq_inc(net);
1058                 nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
1059                 if (replace_notify)
1060                         nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
1061         }
1062 
1063         return rc;
1064 }
1065 
1066 /* rtnl */
1067 /* remove all nexthops tied to a device being deleted */
1068 static void nexthop_flush_dev(struct net_device *dev)
1069 {
1070         unsigned int hash = nh_dev_hashfn(dev->ifindex);
1071         struct net *net = dev_net(dev);
1072         struct hlist_head *head = &net->nexthop.devhash[hash];
1073         struct hlist_node *n;
1074         struct nh_info *nhi;
1075 
1076         hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1077                 if (nhi->fib_nhc.nhc_dev != dev)
1078                         continue;
1079 
1080                 remove_nexthop(net, nhi->nh_parent, NULL);
1081         }
1082 }
1083 
1084 /* rtnl; called when net namespace is deleted */
1085 static void flush_all_nexthops(struct net *net)
1086 {
1087         struct rb_root *root = &net->nexthop.rb_root;
1088         struct rb_node *node;
1089         struct nexthop *nh;
1090 
1091         while ((node = rb_first(root))) {
1092                 nh = rb_entry(node, struct nexthop, rb_node);
1093                 remove_nexthop(net, nh, NULL);
1094                 cond_resched();
1095         }
1096 }
1097 
1098 static struct nexthop *nexthop_create_group(struct net *net,
1099                                             struct nh_config *cfg)
1100 {
1101         struct nlattr *grps_attr = cfg->nh_grp;
1102         struct nexthop_grp *entry = nla_data(grps_attr);
1103         u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
1104         struct nh_group *nhg;
1105         struct nexthop *nh;
1106         int i;
1107 
1108         nh = nexthop_alloc();
1109         if (!nh)
1110                 return ERR_PTR(-ENOMEM);
1111 
1112         nh->is_group = 1;
1113 
1114         nhg = nexthop_grp_alloc(num_nh);
1115         if (!nhg) {
1116                 kfree(nh);
1117                 return ERR_PTR(-ENOMEM);
1118         }
1119 
1120         /* spare group used for removals */
1121         nhg->spare = nexthop_grp_alloc(num_nh);
1122         if (!nhg) {
1123                 kfree(nhg);
1124                 kfree(nh);
1125                 return NULL;
1126         }
1127         nhg->spare->spare = nhg;
1128 
1129         for (i = 0; i < nhg->num_nh; ++i) {
1130                 struct nexthop *nhe;
1131                 struct nh_info *nhi;
1132 
1133                 nhe = nexthop_find_by_id(net, entry[i].id);
1134                 if (!nexthop_get(nhe))
1135                         goto out_no_nh;
1136 
1137                 nhi = rtnl_dereference(nhe->nh_info);
1138                 if (nhi->family == AF_INET)
1139                         nhg->has_v4 = true;
1140 
1141                 nhg->nh_entries[i].nh = nhe;
1142                 nhg->nh_entries[i].weight = entry[i].weight + 1;
1143                 list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
1144                 nhg->nh_entries[i].nh_parent = nh;
1145         }
1146 
1147         if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
1148                 nhg->mpath = 1;
1149                 nh_group_rebalance(nhg);
1150         }
1151 
1152         rcu_assign_pointer(nh->nh_grp, nhg);
1153 
1154         return nh;
1155 
1156 out_no_nh:
1157         for (; i >= 0; --i)
1158                 nexthop_put(nhg->nh_entries[i].nh);
1159 
1160         kfree(nhg->spare);
1161         kfree(nhg);
1162         kfree(nh);
1163 
1164         return ERR_PTR(-ENOENT);
1165 }
1166 
1167 static int nh_create_ipv4(struct net *net, struct nexthop *nh,
1168                           struct nh_info *nhi, struct nh_config *cfg,
1169                           struct netlink_ext_ack *extack)
1170 {
1171         struct fib_nh *fib_nh = &nhi->fib_nh;
1172         struct fib_config fib_cfg = {
1173                 .fc_oif   = cfg->nh_ifindex,
1174                 .fc_gw4   = cfg->gw.ipv4,
1175                 .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
1176                 .fc_flags = cfg->nh_flags,
1177                 .fc_encap = cfg->nh_encap,
1178                 .fc_encap_type = cfg->nh_encap_type,
1179         };
1180         u32 tb_id = l3mdev_fib_table(cfg->dev);
1181         int err;
1182 
1183         err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
1184         if (err) {
1185                 fib_nh_release(net, fib_nh);
1186                 goto out;
1187         }
1188 
1189         /* sets nh_dev if successful */
1190         err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
1191         if (!err) {
1192                 nh->nh_flags = fib_nh->fib_nh_flags;
1193                 fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
1194                                           fib_nh->fib_nh_scope);
1195         } else {
1196                 fib_nh_release(net, fib_nh);
1197         }
1198 out:
1199         return err;
1200 }
1201 
1202 static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
1203                           struct nh_info *nhi, struct nh_config *cfg,
1204                           struct netlink_ext_ack *extack)
1205 {
1206         struct fib6_nh *fib6_nh = &nhi->fib6_nh;
1207         struct fib6_config fib6_cfg = {
1208                 .fc_table = l3mdev_fib_table(cfg->dev),
1209                 .fc_ifindex = cfg->nh_ifindex,
1210                 .fc_gateway = cfg->gw.ipv6,
1211                 .fc_flags = cfg->nh_flags,
1212                 .fc_encap = cfg->nh_encap,
1213                 .fc_encap_type = cfg->nh_encap_type,
1214         };
1215         int err;
1216 
1217         if (!ipv6_addr_any(&cfg->gw.ipv6))
1218                 fib6_cfg.fc_flags |= RTF_GATEWAY;
1219 
1220         /* sets nh_dev if successful */
1221         err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
1222                                       extack);
1223         if (err)
1224                 ipv6_stub->fib6_nh_release(fib6_nh);
1225         else
1226                 nh->nh_flags = fib6_nh->fib_nh_flags;
1227 
1228         return err;
1229 }
1230 
1231 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
1232                                       struct netlink_ext_ack *extack)
1233 {
1234         struct nh_info *nhi;
1235         struct nexthop *nh;
1236         int err = 0;
1237 
1238         nh = nexthop_alloc();
1239         if (!nh)
1240                 return ERR_PTR(-ENOMEM);
1241 
1242         nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
1243         if (!nhi) {
1244                 kfree(nh);
1245                 return ERR_PTR(-ENOMEM);
1246         }
1247 
1248         nh->nh_flags = cfg->nh_flags;
1249         nh->net = net;
1250 
1251         nhi->nh_parent = nh;
1252         nhi->family = cfg->nh_family;
1253         nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
1254 
1255         if (cfg->nh_blackhole) {
1256                 nhi->reject_nh = 1;
1257                 cfg->nh_ifindex = net->loopback_dev->ifindex;
1258         }
1259 
1260         switch (cfg->nh_family) {
1261         case AF_INET:
1262                 err = nh_create_ipv4(net, nh, nhi, cfg, extack);
1263                 break;
1264         case AF_INET6:
1265                 err = nh_create_ipv6(net, nh, nhi, cfg, extack);
1266                 break;
1267         }
1268 
1269         if (err) {
1270                 kfree(nhi);
1271                 kfree(nh);
1272                 return ERR_PTR(err);
1273         }
1274 
1275         /* add the entry to the device based hash */
1276         nexthop_devhash_add(net, nhi);
1277 
1278         rcu_assign_pointer(nh->nh_info, nhi);
1279 
1280         return nh;
1281 }
1282 
1283 /* called with rtnl lock held */
1284 static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
1285                                    struct netlink_ext_ack *extack)
1286 {
1287         struct nexthop *nh;
1288         int err;
1289 
1290         if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) {
1291                 NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
1292                 return ERR_PTR(-EINVAL);
1293         }
1294 
1295         if (!cfg->nh_id) {
1296                 cfg->nh_id = nh_find_unused_id(net);
1297                 if (!cfg->nh_id) {
1298                         NL_SET_ERR_MSG(extack, "No unused id");
1299                         return ERR_PTR(-EINVAL);
1300                 }
1301         }
1302 
1303         if (cfg->nh_grp)
1304                 nh = nexthop_create_group(net, cfg);
1305         else
1306                 nh = nexthop_create(net, cfg, extack);
1307 
1308         if (IS_ERR(nh))
1309                 return nh;
1310 
1311         refcount_set(&nh->refcnt, 1);
1312         nh->id = cfg->nh_id;
1313         nh->protocol = cfg->nh_protocol;
1314         nh->net = net;
1315 
1316         err = insert_nexthop(net, nh, cfg, extack);
1317         if (err) {
1318                 __remove_nexthop(net, nh, NULL);
1319                 nexthop_put(nh);
1320                 nh = ERR_PTR(err);
1321         }
1322 
1323         return nh;
1324 }
1325 
1326 static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
1327                             struct nlmsghdr *nlh, struct nh_config *cfg,
1328                             struct netlink_ext_ack *extack)
1329 {
1330         struct nhmsg *nhm = nlmsg_data(nlh);
1331         struct nlattr *tb[NHA_MAX + 1];
1332         int err;
1333 
1334         err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1335                           extack);
1336         if (err < 0)
1337                 return err;
1338 
1339         err = -EINVAL;
1340         if (nhm->resvd || nhm->nh_scope) {
1341                 NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
1342                 goto out;
1343         }
1344         if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
1345                 NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
1346                 goto out;
1347         }
1348 
1349         switch (nhm->nh_family) {
1350         case AF_INET:
1351         case AF_INET6:
1352                 break;
1353         case AF_UNSPEC:
1354                 if (tb[NHA_GROUP])
1355                         break;
1356                 /* fallthrough */
1357         default:
1358                 NL_SET_ERR_MSG(extack, "Invalid address family");
1359                 goto out;
1360         }
1361 
1362         if (tb[NHA_GROUPS] || tb[NHA_MASTER]) {
1363                 NL_SET_ERR_MSG(extack, "Invalid attributes in request");
1364                 goto out;
1365         }
1366 
1367         memset(cfg, 0, sizeof(*cfg));
1368         cfg->nlflags = nlh->nlmsg_flags;
1369         cfg->nlinfo.portid = NETLINK_CB(skb).portid;
1370         cfg->nlinfo.nlh = nlh;
1371         cfg->nlinfo.nl_net = net;
1372 
1373         cfg->nh_family = nhm->nh_family;
1374         cfg->nh_protocol = nhm->nh_protocol;
1375         cfg->nh_flags = nhm->nh_flags;
1376 
1377         if (tb[NHA_ID])
1378                 cfg->nh_id = nla_get_u32(tb[NHA_ID]);
1379 
1380         if (tb[NHA_GROUP]) {
1381                 if (nhm->nh_family != AF_UNSPEC) {
1382                         NL_SET_ERR_MSG(extack, "Invalid family for group");
1383                         goto out;
1384                 }
1385                 cfg->nh_grp = tb[NHA_GROUP];
1386 
1387                 cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
1388                 if (tb[NHA_GROUP_TYPE])
1389                         cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
1390 
1391                 if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
1392                         NL_SET_ERR_MSG(extack, "Invalid group type");
1393                         goto out;
1394                 }
1395                 err = nh_check_attr_group(net, tb, extack);
1396 
1397                 /* no other attributes should be set */
1398                 goto out;
1399         }
1400 
1401         if (tb[NHA_BLACKHOLE]) {
1402                 if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
1403                     tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
1404                         NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
1405                         goto out;
1406                 }
1407 
1408                 cfg->nh_blackhole = 1;
1409                 err = 0;
1410                 goto out;
1411         }
1412 
1413         if (!tb[NHA_OIF]) {
1414                 NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
1415                 goto out;
1416         }
1417 
1418         cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
1419         if (cfg->nh_ifindex)
1420                 cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
1421 
1422         if (!cfg->dev) {
1423                 NL_SET_ERR_MSG(extack, "Invalid device index");
1424                 goto out;
1425         } else if (!(cfg->dev->flags & IFF_UP)) {
1426                 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
1427                 err = -ENETDOWN;
1428                 goto out;
1429         } else if (!netif_carrier_ok(cfg->dev)) {
1430                 NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
1431                 err = -ENETDOWN;
1432                 goto out;
1433         }
1434 
1435         err = -EINVAL;
1436         if (tb[NHA_GATEWAY]) {
1437                 struct nlattr *gwa = tb[NHA_GATEWAY];
1438 
1439                 switch (cfg->nh_family) {
1440                 case AF_INET:
1441                         if (nla_len(gwa) != sizeof(u32)) {
1442                                 NL_SET_ERR_MSG(extack, "Invalid gateway");
1443                                 goto out;
1444                         }
1445                         cfg->gw.ipv4 = nla_get_be32(gwa);
1446                         break;
1447                 case AF_INET6:
1448                         if (nla_len(gwa) != sizeof(struct in6_addr)) {
1449                                 NL_SET_ERR_MSG(extack, "Invalid gateway");
1450                                 goto out;
1451                         }
1452                         cfg->gw.ipv6 = nla_get_in6_addr(gwa);
1453                         break;
1454                 default:
1455                         NL_SET_ERR_MSG(extack,
1456                                        "Unknown address family for gateway");
1457                         goto out;
1458                 }
1459         } else {
1460                 /* device only nexthop (no gateway) */
1461                 if (cfg->nh_flags & RTNH_F_ONLINK) {
1462                         NL_SET_ERR_MSG(extack,
1463                                        "ONLINK flag can not be set for nexthop without a gateway");
1464                         goto out;
1465                 }
1466         }
1467 
1468         if (tb[NHA_ENCAP]) {
1469                 cfg->nh_encap = tb[NHA_ENCAP];
1470 
1471                 if (!tb[NHA_ENCAP_TYPE]) {
1472                         NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
1473                         goto out;
1474                 }
1475 
1476                 cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
1477                 err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
1478                 if (err < 0)
1479                         goto out;
1480 
1481         } else if (tb[NHA_ENCAP_TYPE]) {
1482                 NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
1483                 goto out;
1484         }
1485 
1486 
1487         err = 0;
1488 out:
1489         return err;
1490 }
1491 
1492 /* rtnl */
1493 static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1494                            struct netlink_ext_ack *extack)
1495 {
1496         struct net *net = sock_net(skb->sk);
1497         struct nh_config cfg;
1498         struct nexthop *nh;
1499         int err;
1500 
1501         err = rtm_to_nh_config(net, skb, nlh, &cfg, extack);
1502         if (!err) {
1503                 nh = nexthop_add(net, &cfg, extack);
1504                 if (IS_ERR(nh))
1505                         err = PTR_ERR(nh);
1506         }
1507 
1508         return err;
1509 }
1510 
1511 static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id,
1512                                 struct netlink_ext_ack *extack)
1513 {
1514         struct nhmsg *nhm = nlmsg_data(nlh);
1515         struct nlattr *tb[NHA_MAX + 1];
1516         int err, i;
1517 
1518         err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1519                           extack);
1520         if (err < 0)
1521                 return err;
1522 
1523         err = -EINVAL;
1524         for (i = 0; i < __NHA_MAX; ++i) {
1525                 if (!tb[i])
1526                         continue;
1527 
1528                 switch (i) {
1529                 case NHA_ID:
1530                         break;
1531                 default:
1532                         NL_SET_ERR_MSG_ATTR(extack, tb[i],
1533                                             "Unexpected attribute in request");
1534                         goto out;
1535                 }
1536         }
1537         if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1538                 NL_SET_ERR_MSG(extack, "Invalid values in header");
1539                 goto out;
1540         }
1541 
1542         if (!tb[NHA_ID]) {
1543                 NL_SET_ERR_MSG(extack, "Nexthop id is missing");
1544                 goto out;
1545         }
1546 
1547         *id = nla_get_u32(tb[NHA_ID]);
1548         if (!(*id))
1549                 NL_SET_ERR_MSG(extack, "Invalid nexthop id");
1550         else
1551                 err = 0;
1552 out:
1553         return err;
1554 }
1555 
1556 /* rtnl */
1557 static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
1558                            struct netlink_ext_ack *extack)
1559 {
1560         struct net *net = sock_net(skb->sk);
1561         struct nl_info nlinfo = {
1562                 .nlh = nlh,
1563                 .nl_net = net,
1564                 .portid = NETLINK_CB(skb).portid,
1565         };
1566         struct nexthop *nh;
1567         int err;
1568         u32 id;
1569 
1570         err = nh_valid_get_del_req(nlh, &id, extack);
1571         if (err)
1572                 return err;
1573 
1574         nh = nexthop_find_by_id(net, id);
1575         if (!nh)
1576                 return -ENOENT;
1577 
1578         remove_nexthop(net, nh, &nlinfo);
1579 
1580         return 0;
1581 }
1582 
1583 /* rtnl */
1584 static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
1585                            struct netlink_ext_ack *extack)
1586 {
1587         struct net *net = sock_net(in_skb->sk);
1588         struct sk_buff *skb = NULL;
1589         struct nexthop *nh;
1590         int err;
1591         u32 id;
1592 
1593         err = nh_valid_get_del_req(nlh, &id, extack);
1594         if (err)
1595                 return err;
1596 
1597         err = -ENOBUFS;
1598         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1599         if (!skb)
1600                 goto out;
1601 
1602         err = -ENOENT;
1603         nh = nexthop_find_by_id(net, id);
1604         if (!nh)
1605                 goto errout_free;
1606 
1607         err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
1608                            nlh->nlmsg_seq, 0);
1609         if (err < 0) {
1610                 WARN_ON(err == -EMSGSIZE);
1611                 goto errout_free;
1612         }
1613 
1614         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1615 out:
1616         return err;
1617 errout_free:
1618         kfree_skb(skb);
1619         goto out;
1620 }
1621 
1622 static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
1623                              bool group_filter, u8 family)
1624 {
1625         const struct net_device *dev;
1626         const struct nh_info *nhi;
1627 
1628         if (group_filter && !nh->is_group)
1629                 return true;
1630 
1631         if (!dev_idx && !master_idx && !family)
1632                 return false;
1633 
1634         if (nh->is_group)
1635                 return true;
1636 
1637         nhi = rtnl_dereference(nh->nh_info);
1638         if (family && nhi->family != family)
1639                 return true;
1640 
1641         dev = nhi->fib_nhc.nhc_dev;
1642         if (dev_idx && (!dev || dev->ifindex != dev_idx))
1643                 return true;
1644 
1645         if (master_idx) {
1646                 struct net_device *master;
1647 
1648                 if (!dev)
1649                         return true;
1650 
1651                 master = netdev_master_upper_dev_get((struct net_device *)dev);
1652                 if (!master || master->ifindex != master_idx)
1653                         return true;
1654         }
1655 
1656         return false;
1657 }
1658 
1659 static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
1660                              int *master_idx, bool *group_filter,
1661                              struct netlink_callback *cb)
1662 {
1663         struct netlink_ext_ack *extack = cb->extack;
1664         struct nlattr *tb[NHA_MAX + 1];
1665         struct nhmsg *nhm;
1666         int err, i;
1667         u32 idx;
1668 
1669         err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy,
1670                           NULL);
1671         if (err < 0)
1672                 return err;
1673 
1674         for (i = 0; i <= NHA_MAX; ++i) {
1675                 if (!tb[i])
1676                         continue;
1677 
1678                 switch (i) {
1679                 case NHA_OIF:
1680                         idx = nla_get_u32(tb[i]);
1681                         if (idx > INT_MAX) {
1682                                 NL_SET_ERR_MSG(extack, "Invalid device index");
1683                                 return -EINVAL;
1684                         }
1685                         *dev_idx = idx;
1686                         break;
1687                 case NHA_MASTER:
1688                         idx = nla_get_u32(tb[i]);
1689                         if (idx > INT_MAX) {
1690                                 NL_SET_ERR_MSG(extack, "Invalid master device index");
1691                                 return -EINVAL;
1692                         }
1693                         *master_idx = idx;
1694                         break;
1695                 case NHA_GROUPS:
1696                         *group_filter = true;
1697                         break;
1698                 default:
1699                         NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
1700                         return -EINVAL;
1701                 }
1702         }
1703 
1704         nhm = nlmsg_data(nlh);
1705         if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
1706                 NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
1707                 return -EINVAL;
1708         }
1709 
1710         return 0;
1711 }
1712 
1713 /* rtnl */
1714 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
1715 {
1716         struct nhmsg *nhm = nlmsg_data(cb->nlh);
1717         int dev_filter_idx = 0, master_idx = 0;
1718         struct net *net = sock_net(skb->sk);
1719         struct rb_root *root = &net->nexthop.rb_root;
1720         bool group_filter = false;
1721         struct rb_node *node;
1722         int idx = 0, s_idx;
1723         int err;
1724 
1725         err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
1726                                 &group_filter, cb);
1727         if (err < 0)
1728                 return err;
1729 
1730         s_idx = cb->args[0];
1731         for (node = rb_first(root); node; node = rb_next(node)) {
1732                 struct nexthop *nh;
1733 
1734                 if (idx < s_idx)
1735                         goto cont;
1736 
1737                 nh = rb_entry(node, struct nexthop, rb_node);
1738                 if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
1739                                      group_filter, nhm->nh_family))
1740                         goto cont;
1741 
1742                 err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
1743                                    NETLINK_CB(cb->skb).portid,
1744                                    cb->nlh->nlmsg_seq, NLM_F_MULTI);
1745                 if (err < 0) {
1746                         if (likely(skb->len))
1747                                 goto out;
1748 
1749                         goto out_err;
1750                 }
1751 cont:
1752                 idx++;
1753         }
1754 
1755 out:
1756         err = skb->len;
1757 out_err:
1758         cb->args[0] = idx;
1759         cb->seq = net->nexthop.seq;
1760         nl_dump_check_consistent(cb, nlmsg_hdr(skb));
1761 
1762         return err;
1763 }
1764 
1765 static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
1766 {
1767         unsigned int hash = nh_dev_hashfn(dev->ifindex);
1768         struct net *net = dev_net(dev);
1769         struct hlist_head *head = &net->nexthop.devhash[hash];
1770         struct hlist_node *n;
1771         struct nh_info *nhi;
1772 
1773         hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
1774                 if (nhi->fib_nhc.nhc_dev == dev) {
1775                         if (nhi->family == AF_INET)
1776                                 fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu,
1777                                                    orig_mtu);
1778                 }
1779         }
1780 }
1781 
1782 /* rtnl */
1783 static int nh_netdev_event(struct notifier_block *this,
1784                            unsigned long event, void *ptr)
1785 {
1786         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1787         struct netdev_notifier_info_ext *info_ext;
1788 
1789         switch (event) {
1790         case NETDEV_DOWN:
1791         case NETDEV_UNREGISTER:
1792                 nexthop_flush_dev(dev);
1793                 break;
1794         case NETDEV_CHANGE:
1795                 if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
1796                         nexthop_flush_dev(dev);
1797                 break;
1798         case NETDEV_CHANGEMTU:
1799                 info_ext = ptr;
1800                 nexthop_sync_mtu(dev, info_ext->ext.mtu);
1801                 rt_cache_flush(dev_net(dev));
1802                 break;
1803         }
1804         return NOTIFY_DONE;
1805 }
1806 
1807 static struct notifier_block nh_netdev_notifier = {
1808         .notifier_call = nh_netdev_event,
1809 };
1810 
1811 static void __net_exit nexthop_net_exit(struct net *net)
1812 {
1813         rtnl_lock();
1814         flush_all_nexthops(net);
1815         rtnl_unlock();
1816         kfree(net->nexthop.devhash);
1817 }
1818 
1819 static int __net_init nexthop_net_init(struct net *net)
1820 {
1821         size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE;
1822 
1823         net->nexthop.rb_root = RB_ROOT;
1824         net->nexthop.devhash = kzalloc(sz, GFP_KERNEL);
1825         if (!net->nexthop.devhash)
1826                 return -ENOMEM;
1827 
1828         return 0;
1829 }
1830 
1831 static struct pernet_operations nexthop_net_ops = {
1832         .init = nexthop_net_init,
1833         .exit = nexthop_net_exit,
1834 };
1835 
1836 static int __init nexthop_init(void)
1837 {
1838         register_pernet_subsys(&nexthop_net_ops);
1839 
1840         register_netdevice_notifier(&nh_netdev_notifier);
1841 
1842         rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1843         rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0);
1844         rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop,
1845                       rtm_dump_nexthop, 0);
1846 
1847         rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1848         rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
1849 
1850         rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0);
1851         rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0);
1852 
1853         return 0;
1854 }
1855 subsys_initcall(nexthop_init);

/* [<][>][^][v][top][bottom][index][help] */