/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>

struct flow_cache_entry {
        union {
                struct hlist_node hlist;
                struct list_head gc_list;
        } u;
        struct net *net;
        u16 family;
        u8 dir;
        u32 genid;
        struct flowi key;
        struct flow_cache_object *object;
};

struct flow_flush_info {
        struct flow_cache *cache;
        atomic_t cpuleft;
        struct completion completion;
};

static struct kmem_cache *flow_cachep __read_mostly;

#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)

static void flow_cache_new_hashrnd(unsigned long arg)
{
        struct flow_cache *fc = (void *) arg;
        int i;

        for_each_possible_cpu(i)
                per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);
}

static int flow_entry_valid(struct flow_cache_entry *fle,
                            struct netns_xfrm *xfrm)
{
        if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
                return 0;
        if (fle->object && !fle->object->ops->check(fle->object))
                return 0;
        return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle,
                            struct netns_xfrm *xfrm)
{
        if (fle->object)
                fle->object->ops->delete(fle->object);
        kmem_cache_free(flow_cachep, fle);
}

static void flow_cache_gc_task(struct work_struct *work)
{
        struct list_head gc_list;
        struct flow_cache_entry *fce, *n;
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                               flow_cache_gc_work);

        INIT_LIST_HEAD(&gc_list);
        spin_lock_bh(&xfrm->flow_cache_gc_lock);
        list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
        spin_unlock_bh(&xfrm->flow_cache_gc_lock);

        list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
                flow_entry_kill(fce, xfrm);
}

static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
                                     int deleted, struct list_head *gc_list,
                                     struct netns_xfrm *xfrm)
{
        if (deleted) {
                fcp->hash_count -= deleted;
                spin_lock_bh(&xfrm->flow_cache_gc_lock);
                list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
                spin_unlock_bh(&xfrm->flow_cache_gc_lock);
                schedule_work(&xfrm->flow_cache_gc_work);
        }
}

static void __flow_cache_shrink(struct flow_cache *fc,
                                struct flow_cache_percpu *fcp,
                                int shrink_to)
{
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                               flow_cache_global);

        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                int saved = 0;

                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (saved < shrink_to &&
                            flow_entry_valid(fle, xfrm)) {
                                saved++;
                        } else {
                                deleted++;
                                hlist_del(&fle->u.hlist);
                                list_add_tail(&fle->u.gc_list, &gc_list);
                        }
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}

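/* Trim this cpu's cache back towards the low watermark: keep at most
 * low_watermark / hash_size still-valid entries per bucket and queue
 * everything else for the garbage-collection work.
 */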
static void flow_cache_shrink(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);

        __flow_cache_shrink(fc, fcp, shrink_to);
}

static void flow_new_hash_rnd(struct flow_cache *fc,
                              struct flow_cache_percpu *fcp)
{
        get_random_bytes(&fcp->hash_rnd, sizeof(u32));
        fcp->hash_rnd_recalc = 0;
        __flow_cache_shrink(fc, fcp, 0);
}

static u32 flow_hash_code(struct flow_cache *fc,
                          struct flow_cache_percpu *fcp,
                          const struct flowi *key,
                          size_t keysize)
{
        const u32 *k = (const u32 *) key;
        const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);

        return jhash2(k, length, fcp->hash_rnd)
                & (flow_cache_hash_size(fc) - 1);
}

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment.
 */
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
                            size_t keysize)
{
        const flow_compare_t *k1, *k1_lim, *k2;

        k1 = (const flow_compare_t *) key1;
        k1_lim = k1 + keysize;

        k2 = (const flow_compare_t *) key2;

        do {
                if (*k1++ != *k2++)
                        return 1;
        } while (k1 < k1_lim);

        return 0;
}

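/* Look up @key in this cpu's flow cache.  On a miss, or when the cached
 * entry's genid is stale, fall back to @resolver to rebuild the object and
 * cache the result.  The body runs with BHs disabled; an ERR_PTR returned
 * by the resolver is handed back to the caller without being cached.
 */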
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
                  flow_resolve_t resolver, void *ctx)
{
        struct flow_cache *fc = &net->xfrm.flow_cache_global;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle, *tfle;
        struct flow_cache_object *flo;
        size_t keysize;
        unsigned int hash;

        local_bh_disable();
        fcp = this_cpu_ptr(fc->percpu);

        fle = NULL;
        flo = NULL;

        keysize = flow_key_size(family);
        if (!keysize)
                goto nocache;

        /* Packet really early in init?  Making flow_cache_init a
         * pre-smp initcall would solve this.  --RR */
        if (!fcp->hash_table)
                goto nocache;

        if (fcp->hash_rnd_recalc)
                flow_new_hash_rnd(fc, fcp);

        hash = flow_hash_code(fc, fcp, key, keysize);
        hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
                if (tfle->net == net &&
                    tfle->family == family &&
                    tfle->dir == dir &&
                    flow_key_compare(key, &tfle->key, keysize) == 0) {
                        fle = tfle;
                        break;
                }
        }

        if (unlikely(!fle)) {
                if (fcp->hash_count > fc->high_watermark)
                        flow_cache_shrink(fc, fcp);

                fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
                if (fle) {
                        fle->net = net;
                        fle->family = family;
                        fle->dir = dir;
                        memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
                        fle->object = NULL;
                        hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
                        fcp->hash_count++;
                }
        } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
                flo = fle->object;
                if (!flo)
                        goto ret_object;
                flo = flo->ops->get(flo);
                if (flo)
                        goto ret_object;
        } else if (fle->object) {
                flo = fle->object;
                flo->ops->delete(flo);
                fle->object = NULL;
        }

nocache:
        flo = NULL;
        if (fle) {
                flo = fle->object;
                fle->object = NULL;
        }
        flo = resolver(net, key, family, dir, flo, ctx);
        if (fle) {
                fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
                if (!IS_ERR(flo))
                        fle->object = flo;
                else
                        fle->genid--;
        } else {
                if (!IS_ERR_OR_NULL(flo))
                        flo->ops->delete(flo);
        }
ret_object:
        local_bh_enable();
        return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);

static void flow_cache_flush_tasklet(unsigned long data)
{
        struct flow_flush_info *info = (void *)data;
        struct flow_cache *fc = info->cache;
        struct flow_cache_percpu *fcp;
        struct flow_cache_entry *fle;
        struct hlist_node *tmp;
        LIST_HEAD(gc_list);
        int i, deleted = 0;
        struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
                                               flow_cache_global);

        fcp = this_cpu_ptr(fc->percpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++) {
                hlist_for_each_entry_safe(fle, tmp,
                                          &fcp->hash_table[i], u.hlist) {
                        if (flow_entry_valid(fle, xfrm))
                                continue;

                        deleted++;
                        hlist_del(&fle->u.hlist);
                        list_add_tail(&fle->u.gc_list, &gc_list);
                }
        }

        flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);

        if (atomic_dec_and_test(&info->cpuleft))
                complete(&info->completion);
}

/*
 * Return whether a cpu needs flushing.  Conservatively, we assume
 * the presence of any entries means the core may require flushing,
 * since the flow_cache_ops.check() function may assume it's running
 * on the same core as the per-cpu cache component.
 */
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp;
        int i;

        fcp = per_cpu_ptr(fc->percpu, cpu);
        for (i = 0; i < flow_cache_hash_size(fc); i++)
                if (!hlist_empty(&fcp->hash_table[i]))
                        return 0;
        return 1;
}

static void flow_cache_flush_per_cpu(void *data)
{
        struct flow_flush_info *info = data;
        struct tasklet_struct *tasklet;

        tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
        tasklet->data = (unsigned long)info;
        tasklet_schedule(tasklet);
}

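/* Synchronously purge invalid entries (stale genid or failed ->check())
 * from every cpu whose cache is non-empty.  Each such cpu runs the flush
 * tasklet locally; we wait here until all of them have completed.
 */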
void flow_cache_flush(struct net *net)
{
        struct flow_flush_info info;
        cpumask_var_t mask;
        int i, self;

        /* Track which cpus need flushing to avoid disturbing all cores. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return;
        cpumask_clear(mask);

        /* Don't want cpus going down or up during this. */
        get_online_cpus();
        mutex_lock(&net->xfrm.flow_flush_sem);
        info.cache = &net->xfrm.flow_cache_global;
        for_each_online_cpu(i)
                if (!flow_cache_percpu_empty(info.cache, i))
                        cpumask_set_cpu(i, mask);
        atomic_set(&info.cpuleft, cpumask_weight(mask));
        if (atomic_read(&info.cpuleft) == 0)
                goto done;

        init_completion(&info.completion);

        local_bh_disable();
        self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
        on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
        if (self)
                flow_cache_flush_tasklet((unsigned long)&info);
        local_bh_enable();

        wait_for_completion(&info.completion);

done:
        mutex_unlock(&net->xfrm.flow_flush_sem);
        put_online_cpus();
        free_cpumask_var(mask);
}

static void flow_cache_flush_task(struct work_struct *work)
{
        struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
                                               flow_cache_flush_work);
        struct net *net = container_of(xfrm, struct net, xfrm);

        flow_cache_flush(net);
}

void flow_cache_flush_deferred(struct net *net)
{
        schedule_work(&net->xfrm.flow_cache_flush_work);
}

static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
        size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);

        if (!fcp->hash_table) {
                fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
                if (!fcp->hash_table) {
                        pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
                        return -ENOMEM;
                }
                fcp->hash_rnd_recalc = 1;
                fcp->hash_count = 0;
                tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
        }
        return 0;
}

static int flow_cache_cpu(struct notifier_block *nfb,
                          unsigned long action,
                          void *hcpu)
{
        struct flow_cache *fc = container_of(nfb, struct flow_cache,
                                             hotcpu_notifier);
        int res, cpu = (unsigned long) hcpu;
        struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                res = flow_cache_cpu_prepare(fc, cpu);
                if (res)
                        return notifier_from_errno(res);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                __flow_cache_shrink(fc, fcp, 0);
                break;
        }
        return NOTIFY_OK;
}

int flow_cache_init(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        if (!flow_cachep)
                flow_cachep = kmem_cache_create("flow_cache",
                                                sizeof(struct flow_cache_entry),
                                                0, SLAB_PANIC, NULL);
        spin_lock_init(&net->xfrm.flow_cache_gc_lock);
        INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
        INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
        INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
        mutex_init(&net->xfrm.flow_flush_sem);

        fc->hash_shift = 10;
        fc->low_watermark = 2 * flow_cache_hash_size(fc);
        fc->high_watermark = 4 * flow_cache_hash_size(fc);

        fc->percpu = alloc_percpu(struct flow_cache_percpu);
        if (!fc->percpu)
                return -ENOMEM;

        cpu_notifier_register_begin();

        for_each_online_cpu(i) {
                if (flow_cache_cpu_prepare(fc, i))
                        goto err;
        }
        fc->hotcpu_notifier = (struct notifier_block){
                .notifier_call = flow_cache_cpu,
        };
        __register_hotcpu_notifier(&fc->hotcpu_notifier);

        cpu_notifier_register_done();

        setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
                    (unsigned long) fc);
        fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
        add_timer(&fc->rnd_timer);

        return 0;

err:
        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        cpu_notifier_register_done();

        free_percpu(fc->percpu);
        fc->percpu = NULL;

        return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);

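/* Per-netns teardown: stop the rehash timer, unregister the hotplug
 * notifier and release every cpu's hash table along with the percpu
 * flow cache state.
 */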
void flow_cache_fini(struct net *net)
{
        int i;
        struct flow_cache *fc = &net->xfrm.flow_cache_global;

        del_timer_sync(&fc->rnd_timer);
        unregister_hotcpu_notifier(&fc->hotcpu_notifier);

        for_each_possible_cpu(i) {
                struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
                kfree(fcp->hash_table);
                fcp->hash_table = NULL;
        }

        free_percpu(fc->percpu);
        fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);