root/net/netfilter/ipvs/ip_vs_mh.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. generate_hash_secret
  2. is_unavailable
  3. ip_vs_mh_hashkey
  4. ip_vs_mh_reset
  5. ip_vs_mh_permutate
  6. ip_vs_mh_populate
  7. ip_vs_mh_get
  8. ip_vs_mh_get_fallback
  9. ip_vs_mh_reassign
  10. ip_vs_mh_gcd_weight
  11. ip_vs_mh_shift_weight
  12. ip_vs_mh_state_free
  13. ip_vs_mh_init_svc
  14. ip_vs_mh_done_svc
  15. ip_vs_mh_dest_changed
  16. ip_vs_mh_get_port
  17. ip_vs_mh_schedule
  18. ip_vs_mh_init
  19. ip_vs_mh_cleanup

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* IPVS:        Maglev Hashing scheduling module
   3  *
   4  * Authors:     Inju Song <inju.song@navercorp.com>
   5  *
   6  */
   7 
   8 /* The mh algorithm is to assign a preference list of all the lookup
   9  * table positions to each destination and populate the table with
  10  * the most-preferred position of destinations. Then it is to select
  11  * destination with the hash key of source IP address through looking
   12  * up the lookup table.
  13  *
  14  * The algorithm is detailed in:
   15  * [3.4 Consistent Hashing]
  16 https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
  17  *
  18  */
  19 
  20 #define KMSG_COMPONENT "IPVS"
  21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  22 
  23 #include <linux/ip.h>
  24 #include <linux/slab.h>
  25 #include <linux/module.h>
  26 #include <linux/kernel.h>
  27 #include <linux/skbuff.h>
  28 
  29 #include <net/ip_vs.h>
  30 
  31 #include <linux/siphash.h>
  32 #include <linux/bitops.h>
  33 #include <linux/gcd.h>
  34 
#define IP_VS_SVC_F_SCHED_MH_FALLBACK   IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT       IP_VS_SVC_F_SCHED2 /* MH use port */

/* One slot of the Maglev lookup table */
struct ip_vs_mh_lookup {
        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
};

/* Per-destination permutation state, used only while (re)building
 * the lookup table in ip_vs_mh_permutate()/ip_vs_mh_populate().
 */
struct ip_vs_mh_dest_setup {
        unsigned int    offset; /* starting offset */
        unsigned int    skip;   /* skip */
        unsigned int    perm;   /* next_offset */
        int             turns;  /* weight / gcd() and rshift */
};

/* Available prime numbers for MH table */
static int primes[] = {251, 509, 1021, 2039, 4093,
                       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX       12
#endif
/* With the default index 12: TAB_BITS = 6, TAB_INDEX = 4,
 * so the table has primes[4] = 4093 slots.
 */
#define IP_VS_MH_TAB_BITS               (CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX              (CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]

/* Per-service scheduler state (svc->sched_data) */
struct ip_vs_mh_state {
        struct rcu_head                 rcu_head;       /* for deferred free */
        struct ip_vs_mh_lookup          *lookup;        /* the lookup table */
        struct ip_vs_mh_dest_setup      *dest_setup;    /* live only in reassign */
        hsiphash_key_t                  hash1, hash2;   /* offset / skip keys */
        int                             gcd;            /* gcd of last_weights */
        int                             rshift;         /* weight right-shift */
};
  69 
  70 static inline void generate_hash_secret(hsiphash_key_t *hash1,
  71                                         hsiphash_key_t *hash2)
  72 {
  73         hash1->key[0] = 2654435761UL;
  74         hash1->key[1] = 2654435761UL;
  75 
  76         hash2->key[0] = 2654446892UL;
  77         hash2->key[1] = 2654446892UL;
  78 }
  79 
  80 /* Helper function to determine if server is unavailable */
  81 static inline bool is_unavailable(struct ip_vs_dest *dest)
  82 {
  83         return atomic_read(&dest->weight) <= 0 ||
  84                dest->flags & IP_VS_DEST_F_OVERLOAD;
  85 }
  86 
  87 /* Returns hash value for IPVS MH entry */
  88 static inline unsigned int
  89 ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
  90                  __be16 port, hsiphash_key_t *key, unsigned int offset)
  91 {
  92         unsigned int v;
  93         __be32 addr_fold = addr->ip;
  94 
  95 #ifdef CONFIG_IP_VS_IPV6
  96         if (af == AF_INET6)
  97                 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
  98                             addr->ip6[2] ^ addr->ip6[3];
  99 #endif
 100         v = (offset + ntohs(port) + ntohl(addr_fold));
 101         return hsiphash(&v, sizeof(v), key);
 102 }
 103 
 104 /* Reset all the hash buckets of the specified table. */
 105 static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
 106 {
 107         int i;
 108         struct ip_vs_mh_lookup *l;
 109         struct ip_vs_dest *dest;
 110 
 111         l = &s->lookup[0];
 112         for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
 113                 dest = rcu_dereference_protected(l->dest, 1);
 114                 if (dest) {
 115                         ip_vs_dest_put(dest);
 116                         RCU_INIT_POINTER(l->dest, NULL);
 117                 }
 118                 l++;
 119         }
 120 }
 121 
 122 static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
 123                               struct ip_vs_service *svc)
 124 {
 125         struct list_head *p;
 126         struct ip_vs_mh_dest_setup *ds;
 127         struct ip_vs_dest *dest;
 128         int lw;
 129 
 130         /* If gcd is smaller then 1, number of dests or
 131          * all last_weight of dests are zero. So, skip
 132          * permutation for the dests.
 133          */
 134         if (s->gcd < 1)
 135                 return 0;
 136 
 137         /* Set dest_setup for the dests permutation */
 138         p = &svc->destinations;
 139         ds = &s->dest_setup[0];
 140         while ((p = p->next) != &svc->destinations) {
 141                 dest = list_entry(p, struct ip_vs_dest, n_list);
 142 
 143                 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
 144                                               dest->port, &s->hash1, 0) %
 145                                               IP_VS_MH_TAB_SIZE;
 146                 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
 147                                             dest->port, &s->hash2, 0) %
 148                                             (IP_VS_MH_TAB_SIZE - 1) + 1;
 149                 ds->perm = ds->offset;
 150 
 151                 lw = atomic_read(&dest->last_weight);
 152                 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
 153                 ds++;
 154         }
 155 
 156         return 0;
 157 }
 158 
/* Populate the lookup table from the dest_setup permutations.
 *
 * Round-robin over the destinations: each dest with turns >= 1 claims
 * its next most-preferred free slot (offset + k*skip mod TAB_SIZE) for
 * ds->turns consecutive passes, until every slot is filled.
 * Returns 0 on success, -ENOMEM if the temporary bitmap cannot be
 * allocated.
 */
static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
                             struct ip_vs_service *svc)
{
        int n, c, dt_count;
        unsigned long *table;
        struct list_head *p;
        struct ip_vs_mh_dest_setup *ds;
        struct ip_vs_dest *dest, *new_dest;

        /* If gcd is smaller than 1, number of dests or
         * all last_weight of dests are zero. So, skip
         * the population for the dests and reset lookup table.
         */
        if (s->gcd < 1) {
                ip_vs_mh_reset(s);
                return 0;
        }

        /* Bitmap marking which lookup slots are already claimed */
        table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
                        sizeof(unsigned long), GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        p = &svc->destinations;
        n = 0;          /* slots filled so far */
        dt_count = 0;   /* turns consumed by the current dest */
        while (n < IP_VS_MH_TAB_SIZE) {
                /* Restart at the first dest after a full pass */
                if (p == &svc->destinations)
                        p = p->next;

                ds = &s->dest_setup[0];
                while (p != &svc->destinations) {
                        /* Ignore added server with zero weight */
                        if (ds->turns < 1) {
                                p = p->next;
                                ds++;
                                continue;
                        }

                        /* Advance this dest's permutation sequence to
                         * its next unclaimed slot
                         */
                        c = ds->perm;
                        while (test_bit(c, table)) {
                                /* Add skip, mod IP_VS_MH_TAB_SIZE */
                                ds->perm += ds->skip;
                                if (ds->perm >= IP_VS_MH_TAB_SIZE)
                                        ds->perm -= IP_VS_MH_TAB_SIZE;
                                c = ds->perm;
                        }

                        __set_bit(c, table);

                        /* Install the new dest in slot c, swapping the
                         * held reference if the occupant changes
                         */
                        dest = rcu_dereference_protected(s->lookup[c].dest, 1);
                        new_dest = list_entry(p, struct ip_vs_dest, n_list);
                        if (dest != new_dest) {
                                if (dest)
                                        ip_vs_dest_put(dest);
                                ip_vs_dest_hold(new_dest);
                                RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
                        }

                        if (++n == IP_VS_MH_TAB_SIZE)
                                goto out;

                        /* Move to the next dest once this one has used
                         * up its weighted share of turns
                         */
                        if (++dt_count >= ds->turns) {
                                dt_count = 0;
                                p = p->next;
                                ds++;
                        }
                }
        }

out:
        kfree(table);
        return 0;
}
 233 
 234 /* Get ip_vs_dest associated with supplied parameters. */
 235 static inline struct ip_vs_dest *
 236 ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
 237              const union nf_inet_addr *addr, __be16 port)
 238 {
 239         unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
 240                                              % IP_VS_MH_TAB_SIZE;
 241         struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
 242 
 243         return (!dest || is_unavailable(dest)) ? NULL : dest;
 244 }
 245 
/* As ip_vs_mh_get, but with fallback if selected server is unavailable.
 * Probes alternative table slots by folding an increasing offset into
 * the hash, so different flows fall back to different servers.
 */
static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
                      const union nf_inet_addr *addr, __be16 port)
{
        unsigned int offset, roffset;
        unsigned int hash, ihash;
        struct ip_vs_dest *dest;

        /* First try the dest it's supposed to go to */
        ihash = ip_vs_mh_hashkey(svc->af, addr, port,
                                 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
        dest = rcu_dereference(s->lookup[ihash].dest);
        if (!dest)
                return NULL;
        if (!is_unavailable(dest))
                return dest;

        IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
                      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

        /* If the original dest is unavailable, loop around the table
         * starting from ihash to find a new dest
         */
        for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
                roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
                /* Re-hash with roffset mixed in to pick another slot */
                hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
                                        roffset) % IP_VS_MH_TAB_SIZE;
                dest = rcu_dereference(s->lookup[hash].dest);
                /* An empty slot means the whole table was reset
                 * (no active dests) -- give up
                 */
                if (!dest)
                        break;
                if (!is_unavailable(dest))
                        return dest;
                IP_VS_DBG_BUF(6,
                              "MH: selected unavailable server %s:%u (offset %u), reselecting",
                              IP_VS_DBG_ADDR(dest->af, &dest->addr),
                              ntohs(dest->port), roffset);
        }

        /* No available server found after probing the whole table */
        return NULL;
}
 287 
 288 /* Assign all the hash buckets of the specified table with the service. */
 289 static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
 290                              struct ip_vs_service *svc)
 291 {
 292         int ret;
 293 
 294         if (svc->num_dests > IP_VS_MH_TAB_SIZE)
 295                 return -EINVAL;
 296 
 297         if (svc->num_dests >= 1) {
 298                 s->dest_setup = kcalloc(svc->num_dests,
 299                                         sizeof(struct ip_vs_mh_dest_setup),
 300                                         GFP_KERNEL);
 301                 if (!s->dest_setup)
 302                         return -ENOMEM;
 303         }
 304 
 305         ip_vs_mh_permutate(s, svc);
 306 
 307         ret = ip_vs_mh_populate(s, svc);
 308         if (ret < 0)
 309                 goto out;
 310 
 311         IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
 312                       IP_VS_DBG_ADDR(svc->af, &svc->addr),
 313                       ntohs(svc->port));
 314 
 315 out:
 316         if (svc->num_dests >= 1) {
 317                 kfree(s->dest_setup);
 318                 s->dest_setup = NULL;
 319         }
 320         return ret;
 321 }
 322 
 323 static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
 324 {
 325         struct ip_vs_dest *dest;
 326         int weight;
 327         int g = 0;
 328 
 329         list_for_each_entry(dest, &svc->destinations, n_list) {
 330                 weight = atomic_read(&dest->last_weight);
 331                 if (weight > 0) {
 332                         if (g > 0)
 333                                 g = gcd(weight, g);
 334                         else
 335                                 g = weight;
 336                 }
 337         }
 338         return g;
 339 }
 340 
 341 /* To avoid assigning huge weight for the MH table,
 342  * calculate shift value with gcd.
 343  */
 344 static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
 345 {
 346         struct ip_vs_dest *dest;
 347         int new_weight, weight = 0;
 348         int mw, shift;
 349 
 350         /* If gcd is smaller then 1, number of dests or
 351          * all last_weight of dests are zero. So, return
 352          * shift value as zero.
 353          */
 354         if (gcd < 1)
 355                 return 0;
 356 
 357         list_for_each_entry(dest, &svc->destinations, n_list) {
 358                 new_weight = atomic_read(&dest->last_weight);
 359                 if (new_weight > weight)
 360                         weight = new_weight;
 361         }
 362 
 363         /* Because gcd is greater than zero,
 364          * the maximum weight and gcd are always greater than zero
 365          */
 366         mw = weight / gcd;
 367 
 368         /* shift = occupied bits of weight/gcd - MH highest bits */
 369         shift = fls(mw) - IP_VS_MH_TAB_BITS;
 370         return (shift >= 0) ? shift : 0;
 371 }
 372 
 373 static void ip_vs_mh_state_free(struct rcu_head *head)
 374 {
 375         struct ip_vs_mh_state *s;
 376 
 377         s = container_of(head, struct ip_vs_mh_state, rcu_head);
 378         kfree(s->lookup);
 379         kfree(s);
 380 }
 381 
 382 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
 383 {
 384         int ret;
 385         struct ip_vs_mh_state *s;
 386 
 387         /* Allocate the MH table for this service */
 388         s = kzalloc(sizeof(*s), GFP_KERNEL);
 389         if (!s)
 390                 return -ENOMEM;
 391 
 392         s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
 393                             GFP_KERNEL);
 394         if (!s->lookup) {
 395                 kfree(s);
 396                 return -ENOMEM;
 397         }
 398 
 399         generate_hash_secret(&s->hash1, &s->hash2);
 400         s->gcd = ip_vs_mh_gcd_weight(svc);
 401         s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
 402 
 403         IP_VS_DBG(6,
 404                   "MH lookup table (memory=%zdbytes) allocated for current service\n",
 405                   sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
 406 
 407         /* Assign the lookup table with current dests */
 408         ret = ip_vs_mh_reassign(s, svc);
 409         if (ret < 0) {
 410                 ip_vs_mh_reset(s);
 411                 ip_vs_mh_state_free(&s->rcu_head);
 412                 return ret;
 413         }
 414 
 415         /* No more failures, attach state */
 416         svc->sched_data = s;
 417         return 0;
 418 }
 419 
/* done_service callback: release all dest references now, then free
 * the state itself after an RCU grace period so concurrent schedulers
 * finishing a lookup never touch freed memory.
 */
static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
{
        struct ip_vs_mh_state *s = svc->sched_data;

        /* Got to clean up lookup entry here */
        ip_vs_mh_reset(s);

        /* Deferred free of s and s->lookup via RCU */
        call_rcu(&s->rcu_head, ip_vs_mh_state_free);
        IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
                  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
}
 431 
/* add_dest/del_dest/upd_dest callback: recompute the weight gcd and
 * shift, then rebuild the whole lookup table for the new dest set.
 */
static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
                                 struct ip_vs_dest *dest)
{
        struct ip_vs_mh_state *s = svc->sched_data;

        s->gcd = ip_vs_mh_gcd_weight(svc);
        s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);

        /* Assign the lookup table with the updated service */
        return ip_vs_mh_reassign(s, svc);
}
 443 
 444 /* Helper function to get port number */
 445 static inline __be16
 446 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 447 {
 448         __be16 _ports[2], *ports;
 449 
 450         /* At this point we know that we have a valid packet of some kind.
 451          * Because ICMP packets are only guaranteed to have the first 8
 452          * bytes, let's just grab the ports.  Fortunately they're in the
 453          * same position for all three of the protocols we care about.
 454          */
 455         switch (iph->protocol) {
 456         case IPPROTO_TCP:
 457         case IPPROTO_UDP:
 458         case IPPROTO_SCTP:
 459                 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
 460                                            &_ports);
 461                 if (unlikely(!ports))
 462                         return 0;
 463 
 464                 if (likely(!ip_vs_iph_inverse(iph)))
 465                         return ports[0];
 466                 else
 467                         return ports[1];
 468         default:
 469                 return 0;
 470         }
 471 }
 472 
 473 /* Maglev Hashing scheduling */
 474 static struct ip_vs_dest *
 475 ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 476                   struct ip_vs_iphdr *iph)
 477 {
 478         struct ip_vs_dest *dest;
 479         struct ip_vs_mh_state *s;
 480         __be16 port = 0;
 481         const union nf_inet_addr *hash_addr;
 482 
 483         hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
 484 
 485         IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
 486 
 487         if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
 488                 port = ip_vs_mh_get_port(skb, iph);
 489 
 490         s = (struct ip_vs_mh_state *)svc->sched_data;
 491 
 492         if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
 493                 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
 494         else
 495                 dest = ip_vs_mh_get(svc, s, hash_addr, port);
 496 
 497         if (!dest) {
 498                 ip_vs_scheduler_err(svc, "no destination available");
 499                 return NULL;
 500         }
 501 
 502         IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
 503                       IP_VS_DBG_ADDR(svc->af, hash_addr),
 504                       ntohs(port),
 505                       IP_VS_DBG_ADDR(dest->af, &dest->addr),
 506                       ntohs(dest->port));
 507 
 508         return dest;
 509 }
 510 
/* IPVS MH Scheduler structure */
static struct ip_vs_scheduler ip_vs_mh_scheduler = {
        .name =                 "mh",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .n_list  =              LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
        .init_service =         ip_vs_mh_init_svc,
        .done_service =         ip_vs_mh_done_svc,
        /* any dest change forces a full table rebuild */
        .add_dest =             ip_vs_mh_dest_changed,
        .del_dest =             ip_vs_mh_dest_changed,
        .upd_dest =             ip_vs_mh_dest_changed,
        .schedule =             ip_vs_mh_schedule,
};
 524 
/* Module init: register the "mh" scheduler with IPVS */
static int __init ip_vs_mh_init(void)
{
        return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
}
 529 
/* Module exit: unregister the scheduler and wait for all pending
 * call_rcu() state frees to complete before the module text goes away.
 */
static void __exit ip_vs_mh_cleanup(void)
{
        unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
        rcu_barrier();
}
 535 
/* Module registration and metadata */
module_init(ip_vs_mh_init);
module_exit(ip_vs_mh_cleanup);
MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");

/* [<][>][^][v][top][bottom][index][help] */