root/net/netfilter/nf_conncount.c

DEFINITIONS

This source file includes the following definitions:
  1. already_closed
  2. key_diff
  3. conn_free
  4. find_or_evict
  5. __nf_conncount_add
  6. nf_conncount_add
  7. nf_conncount_list_init
  8. nf_conncount_gc_list
  9. __tree_nodes_free
  10. tree_nodes_free
  11. schedule_gc_worker
  12. insert_tree
  13. count_tree
  14. tree_gc_worker
  15. nf_conncount_count
  16. nf_conncount_init
  17. nf_conncount_cache_free
  18. destroy_tree
  19. nf_conncount_destroy
  20. nf_conncount_modinit
  21. nf_conncount_modexit

// SPDX-License-Identifier: GPL-2.0-only
/*
 * count the number of connections matching an arbitrary key.
 *
 * (C) 2017 Red Hat GmbH
 * Author: Florian Westphal <fw@strlen.de>
 *
 * split from xt_connlimit.c:
 *   (c) 2000 Gerd Knorr <kraxel@bytesex.org>
 *   Nov 2002: Martin Bene <martin.bene@icomedias.com>:
 *              only ignore TIME_WAIT or gone connections
 *   (C) CC Computer Consultants GmbH, 2007
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_count.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

#define CONNCOUNT_SLOTS         256U

#define CONNCOUNT_GC_MAX_NODES  8
#define MAX_KEYLEN              5

/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
        struct list_head                node;
        struct nf_conntrack_tuple      tuple;
        struct nf_conntrack_zone       zone;
        int                             cpu;
        u32                             jiffies32;
};

struct nf_conncount_rb {
        struct rb_node node;
        struct nf_conncount_list list;
        u32 key[MAX_KEYLEN];
        struct rcu_head rcu_head;
};

static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;

struct nf_conncount_data {
        unsigned int keylen;
        struct rb_root root[CONNCOUNT_SLOTS];
        struct net *net;
        struct work_struct gc_work;
        unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
        unsigned int gc_tree;
};

static u_int32_t conncount_rnd __read_mostly;
static struct kmem_cache *conncount_rb_cachep __read_mostly;
static struct kmem_cache *conncount_conn_cachep __read_mostly;

static inline bool already_closed(const struct nf_conn *conn)
{
        if (nf_ct_protonum(conn) == IPPROTO_TCP)
                return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT ||
                       conn->proto.tcp.state == TCP_CONNTRACK_CLOSE;
        else
                return false;
}

static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
{
        return memcmp(a, b, klen * sizeof(u32));
}

static void conn_free(struct nf_conncount_list *list,
                      struct nf_conncount_tuple *conn)
{
        lockdep_assert_held(&list->list_lock);

        list->count--;
        list_del(&conn->node);

        kmem_cache_free(conncount_conn_cachep, conn);
}

static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,
              struct nf_conncount_tuple *conn)
{
        const struct nf_conntrack_tuple_hash *found;
        unsigned long a, b;
        int cpu = raw_smp_processor_id();
        u32 age;

        found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);
        if (found)
                return found;
        b = conn->jiffies32;
        a = (u32)jiffies;

        /* conn might have been added just before by another cpu and
         * might still be unconfirmed.  In this case, nf_conntrack_find()
         * returns no result.  Thus only evict if this cpu added the
         * stale entry or if the entry is older than two jiffies.
         */
        age = a - b;
        if (conn->cpu == cpu || age >= 2) {
                conn_free(list, conn);
                return ERR_PTR(-ENOENT);
        }

        return ERR_PTR(-EAGAIN);
}
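
/*
 * Editor's note, not part of the original file: the age test above is
 * wrap-safe because the difference is taken modulo 2^32.  A worked example:
 * if the entry was created just before a jiffies wrap, say
 * b == 0xfffffffeU, and the counter has since wrapped to a == 0x00000001U,
 * then age = (u32)(a - b) == 3, i.e. still the real number of elapsed
 * ticks, so the "age >= 2" eviction threshold keeps working across the
 * wrap.
 */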

static int __nf_conncount_add(struct net *net,
                              struct nf_conncount_list *list,
                              const struct nf_conntrack_tuple *tuple,
                              const struct nf_conntrack_zone *zone)
{
        const struct nf_conntrack_tuple_hash *found;
        struct nf_conncount_tuple *conn, *conn_n;
        struct nf_conn *found_ct;
        unsigned int collect = 0;

        /* check the saved connections */
        list_for_each_entry_safe(conn, conn_n, &list->head, node) {
                if (collect > CONNCOUNT_GC_MAX_NODES)
                        break;

                found = find_or_evict(net, list, conn);
                if (IS_ERR(found)) {
                        /* Not found, but might be about to be confirmed */
                        if (PTR_ERR(found) == -EAGAIN) {
                                if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
                                    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
                                    nf_ct_zone_id(zone, zone->dir))
                                        return 0; /* already exists */
                        } else {
                                collect++;
                        }
                        continue;
                }

                found_ct = nf_ct_tuplehash_to_ctrack(found);

                if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
                    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
                        /*
                         * We should not see tuples twice unless someone hooks
                         * this into a table without "-p tcp --syn".
                         *
                         * Attempt to avoid a re-add in this case.
                         */
                        nf_ct_put(found_ct);
                        return 0;
                } else if (already_closed(found_ct)) {
                        /*
                         * we do not care about connections which are
                         * closed already -> ditch it
                         */
                        nf_ct_put(found_ct);
                        conn_free(list, conn);
                        collect++;
                        continue;
                }

                nf_ct_put(found_ct);
        }

        if (WARN_ON_ONCE(list->count > INT_MAX))
                return -EOVERFLOW;

        conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
        if (conn == NULL)
                return -ENOMEM;

        conn->tuple = *tuple;
        conn->zone = *zone;
        conn->cpu = raw_smp_processor_id();
        conn->jiffies32 = (u32)jiffies;
        list_add_tail(&conn->node, &list->head);
        list->count++;
        return 0;
}

int nf_conncount_add(struct net *net,
                     struct nf_conncount_list *list,
                     const struct nf_conntrack_tuple *tuple,
                     const struct nf_conntrack_zone *zone)
{
        int ret;

        /* check the saved connections */
        spin_lock_bh(&list->list_lock);
        ret = __nf_conncount_add(net, list, tuple, zone);
        spin_unlock_bh(&list->list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);

void nf_conncount_list_init(struct nf_conncount_list *list)
{
        spin_lock_init(&list->list_lock);
        INIT_LIST_HEAD(&list->head);
        list->count = 0;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);

/* Return true if the list is empty. Must be called with BH disabled. */
bool nf_conncount_gc_list(struct net *net,
                          struct nf_conncount_list *list)
{
        const struct nf_conntrack_tuple_hash *found;
        struct nf_conncount_tuple *conn, *conn_n;
        struct nf_conn *found_ct;
        unsigned int collected = 0;
        bool ret = false;

        /* don't bother if other cpu is already doing GC */
        if (!spin_trylock(&list->list_lock))
                return false;

        list_for_each_entry_safe(conn, conn_n, &list->head, node) {
                found = find_or_evict(net, list, conn);
                if (IS_ERR(found)) {
                        if (PTR_ERR(found) == -ENOENT)
                                collected++;
                        continue;
                }

                found_ct = nf_ct_tuplehash_to_ctrack(found);
                if (already_closed(found_ct)) {
                        /*
                         * we do not care about connections which are
                         * closed already -> ditch it
                         */
                        nf_ct_put(found_ct);
                        conn_free(list, conn);
                        collected++;
                        continue;
                }

                nf_ct_put(found_ct);
                if (collected > CONNCOUNT_GC_MAX_NODES)
                        break;
        }

        if (!list->count)
                ret = true;
        spin_unlock(&list->list_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
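
/*
 * Editor's sketch, not part of the original file: the list API above
 * (nf_conncount_list_init(), nf_conncount_add(), nf_conncount_gc_list())
 * can be used on its own when the caller already owns a per-object list,
 * which is how nft_connlimit uses it.  A minimal, hypothetical per-packet
 * helper could look like the following; the list is assumed to have been
 * set up once with nf_conncount_list_init(), and nf_conncount_gc_list()
 * must run with BH disabled.  Kept out of the build.
 */
#if 0
static unsigned int example_count_on_list(struct net *net,
                                          struct nf_conncount_list *list,
                                          const struct nf_conntrack_tuple *tuple,
                                          const struct nf_conntrack_zone *zone)
{
        if (nf_conncount_add(net, list, tuple, zone))
                return 0;       /* allocation failure or overflow: hotdrop */

        return list->count;
}
#endif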

static void __tree_nodes_free(struct rcu_head *h)
{
        struct nf_conncount_rb *rbconn;

        rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
        kmem_cache_free(conncount_rb_cachep, rbconn);
}

/* caller must hold tree nf_conncount_locks[] lock */
static void tree_nodes_free(struct rb_root *root,
                            struct nf_conncount_rb *gc_nodes[],
                            unsigned int gc_count)
{
        struct nf_conncount_rb *rbconn;

        while (gc_count) {
                rbconn = gc_nodes[--gc_count];
                spin_lock(&rbconn->list.list_lock);
                if (!rbconn->list.count) {
                        rb_erase(&rbconn->node, root);
                        call_rcu(&rbconn->rcu_head, __tree_nodes_free);
                }
                spin_unlock(&rbconn->list.list_lock);
        }
}

static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
{
        set_bit(tree, data->pending_trees);
        schedule_work(&data->gc_work);
}

static unsigned int
insert_tree(struct net *net,
            struct nf_conncount_data *data,
            struct rb_root *root,
            unsigned int hash,
            const u32 *key,
            const struct nf_conntrack_tuple *tuple,
            const struct nf_conntrack_zone *zone)
{
        struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
        struct rb_node **rbnode, *parent;
        struct nf_conncount_rb *rbconn;
        struct nf_conncount_tuple *conn;
        unsigned int count = 0, gc_count = 0;
        u8 keylen = data->keylen;
        bool do_gc = true;

        spin_lock_bh(&nf_conncount_locks[hash]);
restart:
        parent = NULL;
        rbnode = &(root->rb_node);
        while (*rbnode) {
                int diff;
                rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);

                parent = *rbnode;
                diff = key_diff(key, rbconn->key, keylen);
                if (diff < 0) {
                        rbnode = &((*rbnode)->rb_left);
                } else if (diff > 0) {
                        rbnode = &((*rbnode)->rb_right);
                } else {
                        int ret;

                        ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
                        if (ret)
                                count = 0; /* hotdrop */
                        else
                                count = rbconn->list.count;
                        tree_nodes_free(root, gc_nodes, gc_count);
                        goto out_unlock;
                }

                if (gc_count >= ARRAY_SIZE(gc_nodes))
                        continue;

                if (do_gc && nf_conncount_gc_list(net, &rbconn->list))
                        gc_nodes[gc_count++] = rbconn;
        }

        if (gc_count) {
                tree_nodes_free(root, gc_nodes, gc_count);
                schedule_gc_worker(data, hash);
                gc_count = 0;
                do_gc = false;
                goto restart;
        }

        /* expected case: match, insert new node */
        rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
        if (rbconn == NULL)
                goto out_unlock;

        conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
        if (conn == NULL) {
                kmem_cache_free(conncount_rb_cachep, rbconn);
                goto out_unlock;
        }

        conn->tuple = *tuple;
        conn->zone = *zone;
        memcpy(rbconn->key, key, sizeof(u32) * keylen);

        nf_conncount_list_init(&rbconn->list);
        list_add(&conn->node, &rbconn->list.head);
        count = 1;
        rbconn->list.count = count;

        rb_link_node_rcu(&rbconn->node, parent, rbnode);
        rb_insert_color(&rbconn->node, root);
out_unlock:
        spin_unlock_bh(&nf_conncount_locks[hash]);
        return count;
}

static unsigned int
count_tree(struct net *net,
           struct nf_conncount_data *data,
           const u32 *key,
           const struct nf_conntrack_tuple *tuple,
           const struct nf_conntrack_zone *zone)
{
        struct rb_root *root;
        struct rb_node *parent;
        struct nf_conncount_rb *rbconn;
        unsigned int hash;
        u8 keylen = data->keylen;

        hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
        root = &data->root[hash];

        parent = rcu_dereference_raw(root->rb_node);
        while (parent) {
                int diff;

                rbconn = rb_entry(parent, struct nf_conncount_rb, node);

                diff = key_diff(key, rbconn->key, keylen);
                if (diff < 0) {
                        parent = rcu_dereference_raw(parent->rb_left);
                } else if (diff > 0) {
                        parent = rcu_dereference_raw(parent->rb_right);
                } else {
                        int ret;

                        if (!tuple) {
                                nf_conncount_gc_list(net, &rbconn->list);
                                return rbconn->list.count;
                        }

                        spin_lock_bh(&rbconn->list.list_lock);
                        /* Node might be about to be free'd.
                         * We need to defer to insert_tree() in this case.
                         */
                        if (rbconn->list.count == 0) {
                                spin_unlock_bh(&rbconn->list.list_lock);
                                break;
                        }

                        /* same source network -> be counted! */
                        ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
                        spin_unlock_bh(&rbconn->list.list_lock);
                        if (ret)
                                return 0; /* hotdrop */
                        else
                                return rbconn->list.count;
                }
        }

        if (!tuple)
                return 0;

        return insert_tree(net, data, root, hash, key, tuple, zone);
}

static void tree_gc_worker(struct work_struct *work)
{
        struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
        struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
        struct rb_root *root;
        struct rb_node *node;
        unsigned int tree, next_tree, gc_count = 0;

        tree = data->gc_tree % CONNCOUNT_SLOTS;
        root = &data->root[tree];

        local_bh_disable();
        rcu_read_lock();
        for (node = rb_first(root); node != NULL; node = rb_next(node)) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);
                if (nf_conncount_gc_list(data->net, &rbconn->list))
                        gc_count++;
        }
        rcu_read_unlock();
        local_bh_enable();

        cond_resched();

        spin_lock_bh(&nf_conncount_locks[tree]);
        if (gc_count < ARRAY_SIZE(gc_nodes))
                goto next; /* do not bother */

        gc_count = 0;
        node = rb_first(root);
        while (node != NULL) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);
                node = rb_next(node);

                if (rbconn->list.count > 0)
                        continue;

                gc_nodes[gc_count++] = rbconn;
                if (gc_count >= ARRAY_SIZE(gc_nodes)) {
                        tree_nodes_free(root, gc_nodes, gc_count);
                        gc_count = 0;
                }
        }

        tree_nodes_free(root, gc_nodes, gc_count);
next:
        clear_bit(tree, data->pending_trees);

        next_tree = (tree + 1) % CONNCOUNT_SLOTS;
        next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);

        if (next_tree < CONNCOUNT_SLOTS) {
                data->gc_tree = next_tree;
                schedule_work(work);
        }

        spin_unlock_bh(&nf_conncount_locks[tree]);
}

/* Count and return number of conntrack entries in 'net' with particular 'key'.
 * If 'tuple' is not null, insert it into the accounting data structure.
 * Call with RCU read lock.
 */
unsigned int nf_conncount_count(struct net *net,
                                struct nf_conncount_data *data,
                                const u32 *key,
                                const struct nf_conntrack_tuple *tuple,
                                const struct nf_conntrack_zone *zone)
{
        return count_tree(net, data, key, tuple, zone);
}
EXPORT_SYMBOL_GPL(nf_conncount_count);
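
/*
 * Editor's sketch, not part of the original file: typical per-packet use of
 * nf_conncount_count(), roughly as xt_connlimit does.  The caller runs from
 * a netfilter hook (RCU read side held), passes the current connection's
 * tuple so it gets accounted, and treats a return value of 0 as an internal
 * failure that should drop the packet.  Names and the limit parameter are
 * hypothetical.  Kept out of the build.
 */
#if 0
static bool example_over_limit(struct net *net,
                               struct nf_conncount_data *data,
                               const u32 *key,
                               const struct nf_conntrack_tuple *tuple,
                               const struct nf_conntrack_zone *zone,
                               unsigned int limit)
{
        unsigned int connections;

        connections = nf_conncount_count(net, data, key, tuple, zone);
        if (connections == 0)
                return true;    /* memory pressure: err towards dropping */

        return connections > limit;
}
#endif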

struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
                                            unsigned int keylen)
{
        struct nf_conncount_data *data;
        int ret, i;

        if (keylen % sizeof(u32) ||
            keylen / sizeof(u32) > MAX_KEYLEN ||
            keylen == 0)
                return ERR_PTR(-EINVAL);

        net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));

        data = kmalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return ERR_PTR(-ENOMEM);

        ret = nf_ct_netns_get(net, family);
        if (ret < 0) {
                kfree(data);
                return ERR_PTR(ret);
        }

        for (i = 0; i < ARRAY_SIZE(data->root); ++i)
                data->root[i] = RB_ROOT;

        data->keylen = keylen / sizeof(u32);
        data->net = net;
        INIT_WORK(&data->gc_work, tree_gc_worker);

        return data;
}
EXPORT_SYMBOL_GPL(nf_conncount_init);

void nf_conncount_cache_free(struct nf_conncount_list *list)
{
        struct nf_conncount_tuple *conn, *conn_n;

        list_for_each_entry_safe(conn, conn_n, &list->head, node)
                kmem_cache_free(conncount_conn_cachep, conn);
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);

static void destroy_tree(struct rb_root *r)
{
        struct nf_conncount_rb *rbconn;
        struct rb_node *node;

        while ((node = rb_first(r)) != NULL) {
                rbconn = rb_entry(node, struct nf_conncount_rb, node);

                rb_erase(node, r);

                nf_conncount_cache_free(&rbconn->list);

                kmem_cache_free(conncount_rb_cachep, rbconn);
        }
}

void nf_conncount_destroy(struct net *net, unsigned int family,
                          struct nf_conncount_data *data)
{
        unsigned int i;

        cancel_work_sync(&data->gc_work);
        nf_ct_netns_put(net, family);

        for (i = 0; i < ARRAY_SIZE(data->root); ++i)
                destroy_tree(&data->root[i]);

        kfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);
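
/*
 * Editor's sketch, not part of the original file: set-up and tear-down of
 * the accounting state.  keylen is passed to nf_conncount_init() in bytes
 * and must be a non-zero multiple of sizeof(u32), at most
 * MAX_KEYLEN * sizeof(u32) == 20; for instance sizeof(struct in6_addr) == 16
 * stores a full IPv6 address in four u32 words.  Callers check the result
 * with IS_ERR().  Names are hypothetical and the block is kept out of the
 * build.
 */
#if 0
static struct nf_conncount_data *example_setup(struct net *net)
{
        /* count connections per source IPv6 address */
        return nf_conncount_init(net, NFPROTO_IPV6, sizeof(struct in6_addr));
}

static void example_teardown(struct net *net, struct nf_conncount_data *data)
{
        nf_conncount_destroy(net, NFPROTO_IPV6, data);
}
#endif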

static int __init nf_conncount_modinit(void)
{
        int i;

        for (i = 0; i < CONNCOUNT_SLOTS; ++i)
                spin_lock_init(&nf_conncount_locks[i]);

        conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
                                           sizeof(struct nf_conncount_tuple),
                                           0, 0, NULL);
        if (!conncount_conn_cachep)
                return -ENOMEM;

        conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
                                           sizeof(struct nf_conncount_rb),
                                           0, 0, NULL);
        if (!conncount_rb_cachep) {
                kmem_cache_destroy(conncount_conn_cachep);
                return -ENOMEM;
        }

        return 0;
}

static void __exit nf_conncount_modexit(void)
{
        kmem_cache_destroy(conncount_conn_cachep);
        kmem_cache_destroy(conncount_rb_cachep);
}

module_init(nf_conncount_modinit);
module_exit(nf_conncount_modexit);
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_DESCRIPTION("netfilter: count number of connections matching a key");
MODULE_LICENSE("GPL");
