/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache-policy-mq"

static struct kmem_cache *mq_entry_cache;

/*----------------------------------------------------------------*/

static unsigned next_power(unsigned n, unsigned min)
{
	return roundup_pow_of_two(max(n, min));
}

/*----------------------------------------------------------------*/

/*
 * Large, sequential ios are probably better left on the origin device since
 * spindles tend to have good bandwidth.
 *
 * The io_tracker tries to spot when the io is in one of these sequential
 * modes.
 *
 * The two thresholds used to switch between random and sequential io mode
 * default as follows and can be adjusted via the constructor and message
 * interfaces.
 */
#define RANDOM_THRESHOLD_DEFAULT 4
#define SEQUENTIAL_THRESHOLD_DEFAULT 512

enum io_pattern {
	PATTERN_SEQUENTIAL,
	PATTERN_RANDOM
};

struct io_tracker {
	enum io_pattern pattern;

	unsigned nr_seq_samples;
	unsigned nr_rand_samples;
	unsigned thresholds[2];

	dm_oblock_t last_end_oblock;
};

static void iot_init(struct io_tracker *t,
		     int sequential_threshold, int random_threshold)
{
	t->pattern = PATTERN_RANDOM;
	t->nr_seq_samples = 0;
	t->nr_rand_samples = 0;
	t->last_end_oblock = 0;
	t->thresholds[PATTERN_RANDOM] = random_threshold;
	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
}

static enum io_pattern iot_pattern(struct io_tracker *t)
{
	return t->pattern;
}

static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
		t->nr_seq_samples++;
	else {
		/*
		 * Just one non-sequential IO is enough to reset the
		 * counters.
		 */
		if (t->nr_seq_samples) {
			t->nr_seq_samples = 0;
			t->nr_rand_samples = 0;
		}

		t->nr_rand_samples++;
	}

	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
}

static void iot_check_for_pattern_switch(struct io_tracker *t)
{
	switch (t->pattern) {
	case PATTERN_SEQUENTIAL:
		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
			t->pattern = PATTERN_RANDOM;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;

	case PATTERN_RANDOM:
		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
			t->pattern = PATTERN_SEQUENTIAL;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;
	}
}

static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
{
	iot_update_stats(t, bio);
	iot_check_for_pattern_switch(t);
}

/*----------------------------------------------------------------*/


/*
 * This queue is divided up into different levels, allowing us to push
 * entries to the back of any of the levels.  Think of it as a partially
 * sorted queue.
 */
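
/*
 * Each entry lives on one of NR_QUEUE_LEVELS level lists, chosen from the
 * log2 of its hit count (see queue_level() below), and is pushed to the
 * tail of its level whenever it is requeued.  Scanning from level 0
 * upwards therefore yields entries in roughly "least hit, least recently
 * used" order.  For illustration:
 *
 *	hit_count 1       -> level 0
 *	hit_count 2..3    -> level 1
 *	hit_count 4..7    -> level 2
 *	...
 *	hit_count >= 2^15 -> level 15
 */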
#define NR_QUEUE_LEVELS 16u
#define NR_SENTINELS (NR_QUEUE_LEVELS * 3)

#define WRITEBACK_PERIOD HZ

struct queue {
	unsigned nr_elts;
	bool current_writeback_sentinels;
	unsigned long next_writeback;
	struct list_head qs[NR_QUEUE_LEVELS];
	struct list_head sentinels[NR_SENTINELS];
};

static void queue_init(struct queue *q)
{
	unsigned i;

	q->nr_elts = 0;
	q->current_writeback_sentinels = false;
	q->next_writeback = 0;
	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		INIT_LIST_HEAD(q->qs + i);
		INIT_LIST_HEAD(q->sentinels + i);
		INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
		INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
	}
}

static unsigned queue_size(struct queue *q)
{
	return q->nr_elts;
}

static bool queue_empty(struct queue *q)
{
	return q->nr_elts == 0;
}

/*
 * Insert an entry to the back of the given level.
 */
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
	q->nr_elts++;
	list_add_tail(elt, q->qs + level);
}

static void queue_remove(struct queue *q, struct list_head *elt)
{
	q->nr_elts--;
	list_del(elt);
}

static bool is_sentinel(struct queue *q, struct list_head *h)
{
	return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
}

/*
 * Gives us the oldest entry of the lowest populated level.  If the first
 * level is emptied then we shift down one level.
 */
static struct list_head *queue_peek(struct queue *q)
{
	unsigned level;
	struct list_head *h;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level)
			if (!is_sentinel(q, h))
				return h;

	return NULL;
}

static struct list_head *queue_pop(struct queue *q)
{
	struct list_head *r = queue_peek(q);

	if (r) {
		q->nr_elts--;
		list_del(r);
	}

	return r;
}

/*
 * Pops an entry from a level that is not past a sentinel.
 */
static struct list_head *queue_pop_old(struct queue *q)
{
	unsigned level;
	struct list_head *h;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level) {
			if (is_sentinel(q, h))
				break;

			q->nr_elts--;
			list_del(h);
			return h;
		}

	return NULL;
}

static struct list_head *list_pop(struct list_head *lh)
{
	struct list_head *r = lh->next;

	BUG_ON(!r);
	list_del_init(r);

	return r;
}

static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
{
	if (q->current_writeback_sentinels)
		return q->sentinels + NR_QUEUE_LEVELS + level;
	else
		return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
}

static void queue_update_writeback_sentinels(struct queue *q)
{
	unsigned i;
	struct list_head *h;

	if (time_after(jiffies, q->next_writeback)) {
		for (i = 0; i < NR_QUEUE_LEVELS; i++) {
			h = writeback_sentinel(q, i);
			list_del(h);
			list_add_tail(h, q->qs + i);
		}

		q->next_writeback = jiffies + WRITEBACK_PERIOD;
		q->current_writeback_sentinels = !q->current_writeback_sentinels;
	}
}

/*
 * Sometimes we want to iterate through entries that have been pushed since
 * a certain event.  We use sentinel entries on the queues to delimit these
 * 'tick' events.
 */
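
/*
 * Each level has three sentinels: a 'tick' sentinel that queue_tick()
 * re-adds to the tail of the level, and two sets of writeback sentinels
 * that queue_update_writeback_sentinels() alternately re-adds every
 * WRITEBACK_PERIOD (only used on the dirty queue).  Because
 * queue_pop_old() stops at the first sentinel it meets, it only returns
 * entries that were pushed before the oldest sentinel still sitting in
 * the level - i.e. entries that have not been requeued for at least a
 * full tick or writeback period.
 */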
static void queue_tick(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		list_del(q->sentinels + i);
		list_add_tail(q->sentinels + i, q->qs + i);
	}
}

typedef void (*iter_fn)(struct list_head *, void *);
static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
{
	unsigned i;
	struct list_head *h;

	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		list_for_each_prev(h, q->qs + i) {
			if (is_sentinel(q, h))
				break;

			fn(h, context);
		}
	}
}

/*----------------------------------------------------------------*/

/*
 * Describes a cache entry.  Used in both the cache and the pre_cache.
 */
struct entry {
	struct hlist_node hlist;
	struct list_head list;
	dm_oblock_t oblock;

	/*
	 * FIXME: pack these better
	 */
	bool dirty:1;
	unsigned hit_count;
};

/*
 * Rather than storing the cblock in an entry, we allocate all entries in
 * an array, and infer the cblock from the entry position.
 *
 * Free entries are linked together into a list.
 */
struct entry_pool {
	struct entry *entries, *entries_end;
	struct list_head free;
	unsigned nr_allocated;
};

static int epool_init(struct entry_pool *ep, unsigned nr_entries)
{
	unsigned i;

	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
	if (!ep->entries)
		return -ENOMEM;

	ep->entries_end = ep->entries + nr_entries;

	INIT_LIST_HEAD(&ep->free);
	for (i = 0; i < nr_entries; i++)
		list_add(&ep->entries[i].list, &ep->free);

	ep->nr_allocated = 0;

	return 0;
}

static void epool_exit(struct entry_pool *ep)
{
	vfree(ep->entries);
}

static struct entry *alloc_entry(struct entry_pool *ep)
{
	struct entry *e;

	if (list_empty(&ep->free))
		return NULL;

	e = list_entry(list_pop(&ep->free), struct entry, list);
	INIT_LIST_HEAD(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);

	list_del_init(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}

static void free_entry(struct entry_pool *ep, struct entry *e)
{
	BUG_ON(!ep->nr_allocated);
	ep->nr_allocated--;
	INIT_HLIST_NODE(&e->hlist);
	list_add(&e->list, &ep->free);
}

/*
 * Returns NULL if the entry is free.
 */
static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);
	return !hlist_unhashed(&e->hlist) ? e : NULL;
}

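/*
 * epool_find() can get away with testing hlist_unhashed() because an
 * allocated entry is always inserted into the policy's hash table by
 * push(), while alloc_entry(), alloc_particular_entry() and free_entry()
 * all re-init the hlist node.  An unhashed node therefore means the
 * cblock is currently free.
 */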
static bool epool_empty(struct entry_pool *ep)
{
	return list_empty(&ep->free);
}

static bool in_pool(struct entry_pool *ep, struct entry *e)
{
	return e >= ep->entries && e < ep->entries_end;
}

static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
{
	return to_cblock(e - ep->entries);
}

/*----------------------------------------------------------------*/

struct mq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	struct mutex lock;
	dm_cblock_t cache_size;
	struct io_tracker tracker;

	/*
	 * Entries come from two pools, one of pre-cache entries, and one
	 * for the cache proper.
	 */
	struct entry_pool pre_cache_pool;
	struct entry_pool cache_pool;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and dirty queue, contains the currently
	 * active mappings, whereas the pre_cache tracks blocks that are
	 * being hit frequently and are potential candidates for promotion
	 * to the cache.
	 */
	struct queue pre_cache;
	struct queue cache_clean;
	struct queue cache_dirty;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 *
	 * Access to tick_protected should be done with the spin lock held.
	 * It's copied to tick at the start of the map function (within the
	 * mutex).
	 */
	spinlock_t tick_lock;
	unsigned tick_protected;
	unsigned tick;

	/*
	 * A count of the number of times the map function has been called
	 * and found an entry in the pre_cache or cache.  Currently used to
	 * calculate the generation.
	 */
	unsigned hit_count;

	/*
	 * A generation is a longish period that is used to trigger some
	 * bookkeeping effects, e.g. decrementing hit counts on entries.
	 * This is needed to allow the cache to evolve as io patterns
	 * change.
	 */
	unsigned generation;
	unsigned generation_period; /* in lookups (will probably change) */

	unsigned discard_promote_adjustment;
	unsigned read_promote_adjustment;
	unsigned write_promote_adjustment;

	/*
	 * The hash table allows us to quickly find an entry by origin
	 * block.  Both pre_cache and cache entries are in here.
	 */
	unsigned nr_buckets;
	dm_block_t hash_bits;
	struct hlist_head *table;
};

#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128

/*----------------------------------------------------------------*/

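/*
 * mq_create() sizes the table at next_power(cache_size / 2, 16) buckets
 * and derives hash_bits from that, so hash_64() below spreads origin
 * blocks across the buckets.  hash_lookup() moves a matched entry to the
 * head of its bucket, which keeps frequently looked-up blocks cheap to
 * find.
 */
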
/*
 * Simple hash table implementation.  Should replace with the standard hash
 * table that's making its way upstream.
 */
static void hash_insert(struct mq_policy *mq, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);

	hlist_add_head(&e->hlist, mq->table + h);
}

static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
{
	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
	struct hlist_head *bucket = mq->table + h;
	struct entry *e;

	hlist_for_each_entry(e, bucket, hlist)
		if (e->oblock == oblock) {
			hlist_del(&e->hlist);
			hlist_add_head(&e->hlist, bucket);
			return e;
		}

	return NULL;
}

static void hash_remove(struct entry *e)
{
	hlist_del(&e->hlist);
}

/*----------------------------------------------------------------*/

static bool any_free_cblocks(struct mq_policy *mq)
{
	return !epool_empty(&mq->cache_pool);
}

static bool any_clean_cblocks(struct mq_policy *mq)
{
	return !queue_empty(&mq->cache_clean);
}

/*----------------------------------------------------------------*/

/*
 * Now we get to the meat of the policy.  This section deals with deciding
 * when to add entries to the pre_cache and cache, and move between
 * them.
 */

/*
 * The queue level is based on the log2 of the hit count.
 */
static unsigned queue_level(struct entry *e)
{
	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}

static bool in_cache(struct mq_policy *mq, struct entry *e)
{
	return in_pool(&mq->cache_pool, e);
}

/*
 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
 * block is marked as allocated if necessary.  Inserts into the hash table.
 * Sets the tick which records when the entry was last moved about.
 */
static void push(struct mq_policy *mq, struct entry *e)
{
	hash_insert(mq, e);

	if (in_cache(mq, e))
		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
			   queue_level(e), &e->list);
	else
		queue_push(&mq->pre_cache, queue_level(e), &e->list);
}

/*
 * Removes an entry from pre_cache or cache.  Removes from the hash table.
 */
static void del(struct mq_policy *mq, struct entry *e)
{
	if (in_cache(mq, e))
		queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
	else
		queue_remove(&mq->pre_cache, &e->list);

	hash_remove(e);
}

/*
 * Like del, except it removes the first entry in the queue (i.e. the least
 * recently used).
 */
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);

	return e;
}

static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop_old(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);

	return e;
}

static struct entry *peek(struct queue *q)
{
	struct list_head *h = queue_peek(q);
	return h ? container_of(h, struct entry, list) : NULL;
}

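/*
 * Note that peek() and pop() return the entry at the head of the lowest
 * populated level - i.e. the entry with the fewest hits that has gone the
 * longest without being requeued.  This is what demote_cblock() below
 * relies on to pick a victim, and pop_old() restricts the same choice to
 * entries queued before the most recent sentinel was placed.
 */
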
/*
 * The promotion threshold is adjusted every generation.  As are the counts
 * of the entries.
 *
 * At the moment the threshold is taken by averaging the hit counts of some
 * of the entries in the cache (the first 20 entries across all levels in
 * ascending order, giving preference to the clean entries at each level).
 *
 * We can be much cleverer than this though.  For example, each promotion
 * could bump up the threshold helping to prevent churn.  Much more to do
 * here.
 */

#define MAX_TO_AVERAGE 20

static void check_generation(struct mq_policy *mq)
{
	unsigned total = 0, nr = 0, count = 0, level;
	struct list_head *head;
	struct entry *e;

	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
		mq->hit_count = 0;
		mq->generation++;

		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
			head = mq->cache_clean.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}

			head = mq->cache_dirty.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}
		}
	}
}

/*
 * Whenever we use an entry we bump up its hit counter, and push it to the
 * back of its current level.
 */
static void requeue(struct mq_policy *mq, struct entry *e)
{
	check_generation(mq);
	del(mq, e);
	push(mq, e);
}

/*
 * Demote the least recently used entry from the cache to the pre_cache.
 * Returns the new cache entry to use, and the old origin block it was
 * mapped to.
 *
 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
 * straight back into the cache if it's subsequently hit.  There are
 * various options here, and more experimentation would be good:
 *
 * - just forget about the demoted entry completely (i.e. don't insert it
 *   into the pre_cache).
 * - divide the hit count rather than setting it to some hard coded value.
 * - set the hit count to a hard coded value other than 1, e.g. is it better
 *   if it goes in at level 2?
 */
static int demote_cblock(struct mq_policy *mq,
			 struct policy_locker *locker, dm_oblock_t *oblock)
{
	struct entry *demoted = peek(&mq->cache_clean);

	if (!demoted)
		/*
		 * We could get a block from mq->cache_dirty, but that
		 * would add extra latency to the triggering bio as it
		 * waits for the writeback.  Better to not promote this
		 * time and hope there's a clean block next time this block
		 * is hit.
		 */
		return -ENOSPC;

	if (locker->fn(locker, demoted->oblock))
		/*
		 * We couldn't lock the demoted block.
		 */
		return -EBUSY;

	del(mq, demoted);
	*oblock = demoted->oblock;
	free_entry(&mq->cache_pool, demoted);

	/*
	 * We used to put the demoted block into the pre-cache, but I think
	 * it's simpler to just let it work its way up from zero again.
	 * Stops blocks flickering in and out of the cache.
	 */

	return 0;
}

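/*
 * For illustration, with the default adjustments (discard +1, read +4,
 * write +8): once the cache is full, promote_threshold() below returns
 * the hit count of the coldest clean cache entry (or the coldest dirty
 * entry plus DISCOURAGE_DEMOTING_DIRTY_THRESHOLD if nothing is clean).
 * A read is then promoted only once its pre_cache hit count reaches that
 * value + 4, while a write to a discarded block with clean or free cache
 * space gets a threshold of just 1 and so is promoted almost immediately.
 */
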
/*
 * Entries in the pre_cache whose hit count passes the promotion
 * threshold move to the cache proper.  Working out the correct
 * value for the promotion_threshold is crucial to this policy.
 */
static unsigned promote_threshold(struct mq_policy *mq)
{
	struct entry *e;

	if (any_free_cblocks(mq))
		return 0;

	e = peek(&mq->cache_clean);
	if (e)
		return e->hit_count;

	e = peek(&mq->cache_dirty);
	if (e)
		return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;

	/* This should never happen */
	return 0;
}

/*
 * We modify the basic promotion_threshold depending on the specific io.
 *
 * If the origin block has been discarded then there's no cost to copy it
 * to the cache.
 *
 * We bias towards reads, since they can be demoted at no cost if they
 * haven't been dirtied.
 */
static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return promote_threshold(mq) + mq->read_promote_adjustment;

	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
		/*
		 * We don't need to do any copying at all, so give this a
		 * very low threshold.
		 */
		return mq->discard_promote_adjustment;
	}

	return promote_threshold(mq) + mq->write_promote_adjustment;
}

static bool should_promote(struct mq_policy *mq, struct entry *e,
			   bool discarded_oblock, int data_dir)
{
	return e->hit_count >=
		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
}

static int cache_entry_found(struct mq_policy *mq,
			     struct entry *e,
			     struct policy_result *result)
{
	requeue(mq, e);

	if (in_cache(mq, e)) {
		result->op = POLICY_HIT;
		result->cblock = infer_cblock(&mq->cache_pool, e);
	}

	return 0;
}

/*
 * Moves an entry from the pre_cache to the cache.  The main work is
 * finding which cache block to use.
 */
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
			      struct policy_locker *locker,
			      struct policy_result *result)
{
	int r;
	struct entry *new_e;

	/* Ensure there's a free cblock in the cache */
	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (r) {
			result->op = POLICY_MISS;
			return 0;
		}

	} else
		result->op = POLICY_NEW;

	new_e = alloc_entry(&mq->cache_pool);
	BUG_ON(!new_e);

	new_e->oblock = e->oblock;
	new_e->dirty = false;
	new_e->hit_count = e->hit_count;

	del(mq, e);
	free_entry(&mq->pre_cache_pool, e);
	push(mq, new_e);

	result->cblock = infer_cblock(&mq->cache_pool, new_e);

	return 0;
}

static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
				 bool can_migrate, bool discarded_oblock,
				 int data_dir, struct policy_locker *locker,
				 struct policy_result *result)
{
	int r = 0;

	if (!should_promote(mq, e, discarded_oblock, data_dir)) {
		requeue(mq, e);
		result->op = POLICY_MISS;

	} else if (!can_migrate)
		r = -EWOULDBLOCK;

	else {
		requeue(mq, e);
		r = pre_cache_to_cache(mq, e, locker, result);
	}

	return r;
}

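/*
 * Whether a promotion (pre_cache_to_cache() above, insert_in_cache()
 * below) reports POLICY_NEW or POLICY_REPLACE depends on the cache pool:
 * if a free cblock exists we simply allocate it (POLICY_NEW); otherwise
 * demote_cblock() evicts the coldest clean entry and we hand its old
 * origin block back to the core via result->old_oblock (POLICY_REPLACE),
 * or give up with POLICY_MISS if nothing clean can be demoted right now.
 */
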
static void insert_in_pre_cache(struct mq_policy *mq,
				dm_oblock_t oblock)
{
	struct entry *e = alloc_entry(&mq->pre_cache_pool);

	if (!e)
		/*
		 * There's no spare entry structure, so we grab the least
		 * used one from the pre_cache.
		 */
		e = pop(mq, &mq->pre_cache);

	if (unlikely(!e)) {
		DMWARN("couldn't pop from pre cache");
		return;
	}

	e->dirty = false;
	e->oblock = oblock;
	e->hit_count = 1;
	push(mq, e);
}

static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
			    struct policy_locker *locker,
			    struct policy_result *result)
{
	int r;
	struct entry *e;

	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (unlikely(r)) {
			result->op = POLICY_MISS;
			insert_in_pre_cache(mq, oblock);
			return;
		}

		/*
		 * This will always succeed, since we've just demoted.
		 */
		e = alloc_entry(&mq->cache_pool);
		BUG_ON(!e);

	} else {
		e = alloc_entry(&mq->cache_pool);
		result->op = POLICY_NEW;
	}

	e->oblock = oblock;
	e->dirty = false;
	e->hit_count = 1;
	push(mq, e);

	result->cblock = infer_cblock(&mq->cache_pool, e);
}

static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
			  bool can_migrate, bool discarded_oblock,
			  int data_dir, struct policy_locker *locker,
			  struct policy_result *result)
{
	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
		if (can_migrate)
			insert_in_cache(mq, oblock, locker, result);
		else
			return -EWOULDBLOCK;
	} else {
		insert_in_pre_cache(mq, oblock);
		result->op = POLICY_MISS;
	}

	return 0;
}

/*
 * Looks the oblock up in the hash table, then decides whether to put it
 * in the pre_cache, the cache, etc.
 */
static int map(struct mq_policy *mq, dm_oblock_t oblock,
	       bool can_migrate, bool discarded_oblock,
	       int data_dir, struct policy_locker *locker,
	       struct policy_result *result)
{
	int r = 0;
	struct entry *e = hash_lookup(mq, oblock);

	if (e && in_cache(mq, e))
		r = cache_entry_found(mq, e, result);

	else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
		 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
		result->op = POLICY_MISS;

	else if (e)
		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
					  data_dir, locker, result);

	else
		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
				   data_dir, locker, result);

	if (r == -EWOULDBLOCK)
		result->op = POLICY_MISS;

	return r;
}

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */
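
/*
 * All of the mq_* entry points below serialise on mq->lock.  mq_map() is
 * special in that it may be called from a context that cannot block, in
 * which case it uses mutex_trylock() and returns -EWOULDBLOCK rather than
 * sleeping; mq_lookup() does the same.
 */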

static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct mq_policy, policy);
}

static void mq_destroy(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);

	vfree(mq->table);
	epool_exit(&mq->cache_pool);
	epool_exit(&mq->pre_cache_pool);
	kfree(mq);
}

static void update_pre_cache_hits(struct list_head *h, void *context)
{
	struct entry *e = container_of(h, struct entry, list);
	e->hit_count++;
}

static void update_cache_hits(struct list_head *h, void *context)
{
	struct mq_policy *mq = context;
	struct entry *e = container_of(h, struct entry, list);
	e->hit_count++;
	mq->hit_count++;
}

static void copy_tick(struct mq_policy *mq)
{
	unsigned long flags, tick;

	spin_lock_irqsave(&mq->tick_lock, flags);
	tick = mq->tick_protected;
	if (tick != mq->tick) {
		queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
		queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
		queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
		mq->tick = tick;
	}

	queue_tick(&mq->pre_cache);
	queue_tick(&mq->cache_dirty);
	queue_tick(&mq->cache_clean);
	queue_update_writeback_sentinels(&mq->cache_dirty);
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		  bool can_block, bool can_migrate, bool discarded_oblock,
		  struct bio *bio, struct policy_locker *locker,
		  struct policy_result *result)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	result->op = POLICY_MISS;

	if (can_block)
		mutex_lock(&mq->lock);
	else if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	copy_tick(mq);

	iot_examine_bio(&mq->tracker, bio);
	r = map(mq, oblock, can_migrate, discarded_oblock,
		bio_data_dir(bio), locker, result);

	mutex_unlock(&mq->lock);

	return r;
}

static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	e = hash_lookup(mq, oblock);
	if (e && in_cache(mq, e)) {
		*cblock = infer_cblock(&mq->cache_pool, e);
		r = 0;
	} else
		r = -ENOENT;

	mutex_unlock(&mq->lock);

	return r;
}

static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	e->dirty = set;
	push(mq, e);
}

static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, true);
	mutex_unlock(&mq->lock);
}

static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, false);
	mutex_unlock(&mq->lock);
}

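/*
 * Hints are the per-block metadata this policy persists (hint_size is 4
 * bytes, see the policy type below): mq_walk_mappings()/mq_save_hints()
 * hand each entry's hit count to the core for saving, and
 * mq_load_mapping() restores it on reload, so the queue level can be
 * re-derived after a restart.
 */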
static int mq_load_mapping(struct dm_cache_policy *p,
			   dm_oblock_t oblock, dm_cblock_t cblock,
			   uint32_t hint, bool hint_valid)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_pool, cblock);
	e->oblock = oblock;
	e->dirty = false;	/* this gets corrected in a minute */
	e->hit_count = hint_valid ? hint : 1;
	push(mq, e);

	return 0;
}

static int mq_save_hints(struct mq_policy *mq, struct queue *q,
			 policy_walk_fn fn, void *context)
{
	int r;
	unsigned level;
	struct list_head *h;
	struct entry *e;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level) {
			if (is_sentinel(q, h))
				continue;

			e = container_of(h, struct entry, list);
			r = fn(context, infer_cblock(&mq->cache_pool, e),
			       e->oblock, e->hit_count);
			if (r)
				return r;
		}

	return 0;
}

static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			    void *context)
{
	struct mq_policy *mq = to_mq_policy(p);
	int r = 0;

	mutex_lock(&mq->lock);

	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
	if (!r)
		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);

	mutex_unlock(&mq->lock);

	return r;
}

static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	free_entry(&mq->cache_pool, e);
}

static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__remove_mapping(mq, oblock);
	mutex_unlock(&mq->lock);
}

static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
	struct entry *e = epool_find(&mq->cache_pool, cblock);

	if (!e)
		return -ENODATA;

	del(mq, e);
	free_entry(&mq->cache_pool, e);

	return 0;
}

static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __remove_cblock(mq, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

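/*
 * Writeback strategy: __mq_writeback_work() below first offers up "old"
 * dirty blocks - ones that have sat in the dirty queue for roughly a
 * WRITEBACK_PERIOD (see the writeback sentinels) - and only starts
 * writing back younger dirty blocks when less than
 * CLEAN_TARGET_PERCENTAGE of the cache is clean.
 */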
#define CLEAN_TARGET_PERCENTAGE 25

static bool clean_target_met(struct mq_policy *mq)
{
	/*
	 * Cache entries may not be populated.  So we cannot rely on the
	 * size of the clean queue.
	 */
	unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
	unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;

	return nr_clean >= target;
}

static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
			       dm_cblock_t *cblock)
{
	struct entry *e = pop_old(mq, &mq->cache_dirty);

	if (!e && !clean_target_met(mq))
		e = pop(mq, &mq->cache_dirty);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = infer_cblock(&mq->cache_pool, e);
	e->dirty = false;
	push(mq, e);

	return 0;
}

static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			     dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __mq_writeback_work(mq, oblock, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

static void __force_mapping(struct mq_policy *mq,
			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = hash_lookup(mq, current_oblock);

	if (e && in_cache(mq, e)) {
		del(mq, e);
		e->oblock = new_oblock;
		e->dirty = true;
		push(mq, e);
	}
}

static void mq_force_mapping(struct dm_cache_policy *p,
			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__force_mapping(mq, current_oblock, new_oblock);
	mutex_unlock(&mq->lock);
}

static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = to_cblock(mq->cache_pool.nr_allocated);
	mutex_unlock(&mq->lock);

	return r;
}

static void mq_tick(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick_protected++;
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold")) {
		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;

	} else if (!strcasecmp(key, "sequential_threshold")) {
		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;

	} else if (!strcasecmp(key, "discard_promote_adjustment"))
		mq->discard_promote_adjustment = tmp;

	else if (!strcasecmp(key, "read_promote_adjustment"))
		mq->read_promote_adjustment = tmp;

	else if (!strcasecmp(key, "write_promote_adjustment"))
		mq->write_promote_adjustment = tmp;

	else
		return -EINVAL;

	return 0;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
{
	ssize_t sz = 0;
	struct mq_policy *mq = to_mq_policy(p);

	DMEMIT("10 random_threshold %u "
	       "sequential_threshold %u "
	       "discard_promote_adjustment %u "
	       "read_promote_adjustment %u "
	       "write_promote_adjustment %u",
	       mq->tracker.thresholds[PATTERN_RANDOM],
	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
	       mq->discard_promote_adjustment,
	       mq->read_promote_adjustment,
	       mq->write_promote_adjustment);

	return 0;
}

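/*
 * The tunables above (random_threshold, sequential_threshold and the
 * three promote adjustments) can be set at table load time or via the
 * message interface; mq_emit_config_values() reports them back through
 * the status interface, with the leading "10" being the number of
 * key/value words that follow.
 */
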
/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct mq_policy *mq)
{
	mq->policy.destroy = mq_destroy;
	mq->policy.map = mq_map;
	mq->policy.lookup = mq_lookup;
	mq->policy.set_dirty = mq_set_dirty;
	mq->policy.clear_dirty = mq_clear_dirty;
	mq->policy.load_mapping = mq_load_mapping;
	mq->policy.walk_mappings = mq_walk_mappings;
	mq->policy.remove_mapping = mq_remove_mapping;
	mq->policy.remove_cblock = mq_remove_cblock;
	mq->policy.writeback_work = mq_writeback_work;
	mq->policy.force_mapping = mq_force_mapping;
	mq->policy.residency = mq_residency;
	mq->policy.tick = mq_tick;
	mq->policy.emit_config_values = mq_emit_config_values;
	mq->policy.set_config_value = mq_set_config_value;
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
	mq->cache_size = cache_size;

	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of pre-cache entries");
		goto bad_pre_cache_init;
	}

	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of cache entries");
		goto bad_cache_init;
	}

	mq->tick_protected = 0;
	mq->tick = 0;
	mq->hit_count = 0;
	mq->generation = 0;
	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
	mutex_init(&mq->lock);
	spin_lock_init(&mq->tick_lock);

	queue_init(&mq->pre_cache);
	queue_init(&mq->cache_clean);
	queue_init(&mq->cache_dirty);

	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);

	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
	mq->hash_bits = ffs(mq->nr_buckets) - 1;
	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
	if (!mq->table)
		goto bad_alloc_table;

	return &mq->policy;

bad_alloc_table:
	epool_exit(&mq->cache_pool);
bad_cache_init:
	epool_exit(&mq->pre_cache_pool);
bad_pre_cache_init:
	kfree(mq);

	return NULL;
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {1, 3, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {1, 3, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create,
	.real = &mq_policy_type
};

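/*
 * The same create function is registered twice: once as "mq" and once as
 * "default" (with .real pointing back at the mq type), so a cache table
 * that asks for the default policy gets mq.  MODULE_ALIAS("dm-cache-default")
 * at the bottom of the file lets that request autoload this module.
 */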
DMINFO("version %u.%u.%u loaded", 1461 mq_policy_type.version[0], 1462 mq_policy_type.version[1], 1463 mq_policy_type.version[2]); 1464 return 0; 1465 } 1466 1467 DMERR("register failed (as default) %d", r); 1468 1469 dm_cache_policy_unregister(&mq_policy_type); 1470bad_register_mq: 1471 kmem_cache_destroy(mq_entry_cache); 1472bad: 1473 return -ENOMEM; 1474} 1475 1476static void __exit mq_exit(void) 1477{ 1478 dm_cache_policy_unregister(&mq_policy_type); 1479 dm_cache_policy_unregister(&default_policy_type); 1480 1481 kmem_cache_destroy(mq_entry_cache); 1482} 1483 1484module_init(mq_init); 1485module_exit(mq_exit); 1486 1487MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 1488MODULE_LICENSE("GPL"); 1489MODULE_DESCRIPTION("mq cache policy"); 1490 1491MODULE_ALIAS("dm-cache-default"); 1492