root/net/netfilter/ipvs/ip_vs_sh.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. is_unavailable
  2. ip_vs_sh_hashkey
  3. ip_vs_sh_get
  4. ip_vs_sh_get_fallback
  5. ip_vs_sh_reassign
  6. ip_vs_sh_flush
  7. ip_vs_sh_init_svc
  8. ip_vs_sh_done_svc
  9. ip_vs_sh_dest_changed
  10. ip_vs_sh_get_port
  11. ip_vs_sh_schedule
  12. ip_vs_sh_init
  13. ip_vs_sh_cleanup

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * IPVS:        Source Hashing scheduling module
   4  *
   5  * Authors:     Wensong Zhang <wensong@gnuchina.org>
   6  *
   7  * Changes:
   8  */
   9 
  10 /*
  11  * The sh algorithm is to select server by the hash key of source IP
  12  * address. The pseudo code is as follows:
  13  *
  14  *       n <- servernode[src_ip];
  15  *       if (n is dead) OR
  16  *          (n is overloaded) or (n.weight <= 0) then
  17  *                 return NULL;
  18  *
  19  *       return n;
  20  *
  21  * Note that servernode is a 256-bucket hash table that maps the hash
  22  * index derived from packet source IP address to the current server
  23  * array. If the sh scheduler is used in cache cluster, it is good to
  24  * combine it with cache_bypass feature. When the statically assigned
  25  * server is dead or overloaded, the load balancer can bypass the cache
  26  * server and send requests to the original server directly.
  27  *
  28  * The weight destination attribute can be used to control the
  29  * distribution of connections to the destinations in servernode. The
  30  * greater the weight, the more connections the destination
  31  * will receive.
  32  *
  33  */
  34 
/* Component tag placed in front of this file's log messages (see pr_fmt). */
#define KMSG_COMPONENT "IPVS"
/* Expands a format string to "IPVS: " followed by the caller's format. */
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  37 
  38 #include <linux/ip.h>
  39 #include <linux/slab.h>
  40 #include <linux/module.h>
  41 #include <linux/kernel.h>
  42 #include <linux/skbuff.h>
  43 
  44 #include <net/ip_vs.h>
  45 
  46 #include <net/tcp.h>
  47 #include <linux/udp.h>
  48 #include <linux/sctp.h>
  49 
  50 
/*
 *      IPVS SH bucket
 *
 *      One slot of the source-hash table.  It holds an RCU-annotated
 *      pointer to the destination assigned to the slot; the pointer is
 *      NULL while no destination is assigned.
 */
struct ip_vs_sh_bucket {
        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
};
  57 
/*
 *     for IPVS SH entry hash table
 *
 *     The table has (1 << IP_VS_SH_TAB_BITS) buckets.  The bit count is
 *     taken from CONFIG_IP_VS_SH_TAB_BITS and falls back to 8 (256
 *     buckets) when that symbol is not defined.  IP_VS_SH_TAB_MASK is
 *     the matching index mask.
 */
#ifndef CONFIG_IP_VS_SH_TAB_BITS
#define CONFIG_IP_VS_SH_TAB_BITS        8
#endif
#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
  67 
/*
 *      Per-service scheduler state stored in svc->sched_data: the bucket
 *      table plus the rcu_head that ip_vs_sh_done_svc() passes to
 *      kfree_rcu() when the state is released.
 */
struct ip_vs_sh_state {
        struct rcu_head                 rcu_head;
        struct ip_vs_sh_bucket          buckets[IP_VS_SH_TAB_SIZE];
};
  72 
  73 /* Helper function to determine if server is unavailable */
  74 static inline bool is_unavailable(struct ip_vs_dest *dest)
  75 {
  76         return atomic_read(&dest->weight) <= 0 ||
  77                dest->flags & IP_VS_DEST_F_OVERLOAD;
  78 }
  79 
  80 /*
  81  *      Returns hash value for IPVS SH entry
  82  */
  83 static inline unsigned int
  84 ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
  85                  __be16 port, unsigned int offset)
  86 {
  87         __be32 addr_fold = addr->ip;
  88 
  89 #ifdef CONFIG_IP_VS_IPV6
  90         if (af == AF_INET6)
  91                 addr_fold = addr->ip6[0]^addr->ip6[1]^
  92                             addr->ip6[2]^addr->ip6[3];
  93 #endif
  94         return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
  95                                  IP_VS_SH_TAB_BITS)) &
  96                 IP_VS_SH_TAB_MASK;
  97 }
  98 
  99 
 100 /*
 101  *      Get ip_vs_dest associated with supplied parameters.
 102  */
 103 static inline struct ip_vs_dest *
 104 ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
 105              const union nf_inet_addr *addr, __be16 port)
 106 {
 107         unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
 108         struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
 109 
 110         return (!dest || is_unavailable(dest)) ? NULL : dest;
 111 }
 112 
 113 
 114 /* As ip_vs_sh_get, but with fallback if selected server is unavailable
 115  *
 116  * The fallback strategy loops around the table starting from a "random"
 117  * point (in fact, it is chosen to be the original hash value to make the
 118  * algorithm deterministic) to find a new server.
 119  */
 120 static inline struct ip_vs_dest *
 121 ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
 122                       const union nf_inet_addr *addr, __be16 port)
 123 {
 124         unsigned int offset, roffset;
 125         unsigned int hash, ihash;
 126         struct ip_vs_dest *dest;
 127 
 128         /* first try the dest it's supposed to go to */
 129         ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
 130         dest = rcu_dereference(s->buckets[ihash].dest);
 131         if (!dest)
 132                 return NULL;
 133         if (!is_unavailable(dest))
 134                 return dest;
 135 
 136         IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
 137                       IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
 138 
 139         /* if the original dest is unavailable, loop around the table
 140          * starting from ihash to find a new dest
 141          */
 142         for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
 143                 roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
 144                 hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
 145                 dest = rcu_dereference(s->buckets[hash].dest);
 146                 if (!dest)
 147                         break;
 148                 if (!is_unavailable(dest))
 149                         return dest;
 150                 IP_VS_DBG_BUF(6, "SH: selected unavailable "
 151                               "server %s:%d (offset %d), reselecting",
 152                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
 153                               ntohs(dest->port), roffset);
 154         }
 155 
 156         return NULL;
 157 }
 158 
 159 /*
 160  *      Assign all the hash buckets of the specified table with the service.
 161  */
 162 static int
 163 ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
 164 {
 165         int i;
 166         struct ip_vs_sh_bucket *b;
 167         struct list_head *p;
 168         struct ip_vs_dest *dest;
 169         int d_count;
 170         bool empty;
 171 
 172         b = &s->buckets[0];
 173         p = &svc->destinations;
 174         empty = list_empty(p);
 175         d_count = 0;
 176         for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
 177                 dest = rcu_dereference_protected(b->dest, 1);
 178                 if (dest)
 179                         ip_vs_dest_put(dest);
 180                 if (empty)
 181                         RCU_INIT_POINTER(b->dest, NULL);
 182                 else {
 183                         if (p == &svc->destinations)
 184                                 p = p->next;
 185 
 186                         dest = list_entry(p, struct ip_vs_dest, n_list);
 187                         ip_vs_dest_hold(dest);
 188                         RCU_INIT_POINTER(b->dest, dest);
 189 
 190                         IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
 191                                       i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
 192                                       atomic_read(&dest->weight));
 193 
 194                         /* Don't move to next dest until filling weight */
 195                         if (++d_count >= atomic_read(&dest->weight)) {
 196                                 p = p->next;
 197                                 d_count = 0;
 198                         }
 199 
 200                 }
 201                 b++;
 202         }
 203         return 0;
 204 }
 205 
 206 
 207 /*
 208  *      Flush all the hash buckets of the specified table.
 209  */
 210 static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
 211 {
 212         int i;
 213         struct ip_vs_sh_bucket *b;
 214         struct ip_vs_dest *dest;
 215 
 216         b = &s->buckets[0];
 217         for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
 218                 dest = rcu_dereference_protected(b->dest, 1);
 219                 if (dest) {
 220                         ip_vs_dest_put(dest);
 221                         RCU_INIT_POINTER(b->dest, NULL);
 222                 }
 223                 b++;
 224         }
 225 }
 226 
 227 
 228 static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
 229 {
 230         struct ip_vs_sh_state *s;
 231 
 232         /* allocate the SH table for this service */
 233         s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
 234         if (s == NULL)
 235                 return -ENOMEM;
 236 
 237         svc->sched_data = s;
 238         IP_VS_DBG(6, "SH hash table (memory=%zdbytes) allocated for "
 239                   "current service\n",
 240                   sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 241 
 242         /* assign the hash buckets with current dests */
 243         ip_vs_sh_reassign(s, svc);
 244 
 245         return 0;
 246 }
 247 
 248 
 249 static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
 250 {
 251         struct ip_vs_sh_state *s = svc->sched_data;
 252 
 253         /* got to clean up hash buckets here */
 254         ip_vs_sh_flush(s);
 255 
 256         /* release the table itself */
 257         kfree_rcu(s, rcu_head);
 258         IP_VS_DBG(6, "SH hash table (memory=%zdbytes) released\n",
 259                   sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
 260 }
 261 
 262 
 263 static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
 264                                  struct ip_vs_dest *dest)
 265 {
 266         struct ip_vs_sh_state *s = svc->sched_data;
 267 
 268         /* assign the hash buckets with the updated service */
 269         ip_vs_sh_reassign(s, svc);
 270 
 271         return 0;
 272 }
 273 
 274 
 275 /* Helper function to get port number */
 276 static inline __be16
 277 ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
 278 {
 279         __be16 _ports[2], *ports;
 280 
 281         /* At this point we know that we have a valid packet of some kind.
 282          * Because ICMP packets are only guaranteed to have the first 8
 283          * bytes, let's just grab the ports.  Fortunately they're in the
 284          * same position for all three of the protocols we care about.
 285          */
 286         switch (iph->protocol) {
 287         case IPPROTO_TCP:
 288         case IPPROTO_UDP:
 289         case IPPROTO_SCTP:
 290                 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
 291                                            &_ports);
 292                 if (unlikely(!ports))
 293                         return 0;
 294 
 295                 if (likely(!ip_vs_iph_inverse(iph)))
 296                         return ports[0];
 297                 else
 298                         return ports[1];
 299         default:
 300                 return 0;
 301         }
 302 }
 303 
 304 
 305 /*
 306  *      Source Hashing scheduling
 307  */
 308 static struct ip_vs_dest *
 309 ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
 310                   struct ip_vs_iphdr *iph)
 311 {
 312         struct ip_vs_dest *dest;
 313         struct ip_vs_sh_state *s;
 314         __be16 port = 0;
 315         const union nf_inet_addr *hash_addr;
 316 
 317         hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
 318 
 319         IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
 320 
 321         if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
 322                 port = ip_vs_sh_get_port(skb, iph);
 323 
 324         s = (struct ip_vs_sh_state *) svc->sched_data;
 325 
 326         if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
 327                 dest = ip_vs_sh_get_fallback(svc, s, hash_addr, port);
 328         else
 329                 dest = ip_vs_sh_get(svc, s, hash_addr, port);
 330 
 331         if (!dest) {
 332                 ip_vs_scheduler_err(svc, "no destination available");
 333                 return NULL;
 334         }
 335 
 336         IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
 337                       IP_VS_DBG_ADDR(svc->af, hash_addr),
 338                       IP_VS_DBG_ADDR(dest->af, &dest->addr),
 339                       ntohs(dest->port));
 340 
 341         return dest;
 342 }
 343 
 344 
/*
 *      IPVS SH Scheduler structure
 *
 *      Describes the scheduler named "sh".  Destination add, delete and
 *      update all map to ip_vs_sh_dest_changed, which rebuilds the bucket
 *      table; packets are scheduled through ip_vs_sh_schedule.
 */
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
        .name =                 "sh",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .n_list  =              LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
        .init_service =         ip_vs_sh_init_svc,
        .done_service =         ip_vs_sh_done_svc,
        .add_dest =             ip_vs_sh_dest_changed,
        .del_dest =             ip_vs_sh_dest_changed,
        .upd_dest =             ip_vs_sh_dest_changed,
        .schedule =             ip_vs_sh_schedule,
};
 361 
 362 
 363 static int __init ip_vs_sh_init(void)
 364 {
 365         return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
 366 }
 367 
 368 
/* Module exit: unregister ip_vs_sh_scheduler, then call synchronize_rcu().
 * NOTE(review): the synchronize_rcu() presumably lets RCU readers that may
 * still be using this scheduler finish before the module goes away; confirm.
 */
static void __exit ip_vs_sh_cleanup(void)
{
        unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
        synchronize_rcu();
}
 374 
 375 
/* Module entry/exit hooks and license declaration. */
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");

/* [<][>][^][v][top][bottom][index][help] */