root/drivers/md/bcache/btree.h


DEFINITIONS

This source file includes the following definitions.
  1. btree_current_write
  2. btree_prev_write
  3. btree_bset_first
  4. btree_bset_last
  5. bset_block_offset
  6. set_gc_sectors
  7. bch_btree_op_init
  8. rw_lock
  9. rw_unlock
  10. wake_up_gc
  11. force_wake_up_gc
  12. bch_btree_map_nodes
  13. bch_btree_map_leaf_nodes

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys
 * and pointers are in the leaves; interior nodes only have pointers to the
 * child nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is re-sort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or on reserve memory.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * btree() macro is used to get a btree node, call some function on it, and
 * unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embedded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function - bch_btree_write() really serves two
 * different purposes and should be broken up into two different functions. When
 * passing now = false, it merely indicates that the node is now dirty - calling
 * it ensures that the dirty keys will be written at some point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the write
 * to happen as soon as the previous write completes). It returns immediately
 * though - but it takes a refcount on the closure in struct btree_op you passed
 * to it, so a closure_sync() later can be used to wait for the write to
 * complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the btree_root() macro to
 * loop.
 *
 * Handling cache misses requires a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment - to
 * insert the check key we unlock the btree node, take a write lock, and fail
 * if the sequence number doesn't match what we saw under the read lock.
 */
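
/*
 * An illustrative sketch only (not the actual btree_root() macro, which lives
 * in btree.c): the restart-on--EINTR behaviour described above roughly
 * amounts to
 *
 *        int ret;
 *
 *        do {
 *                ret = traverse_and_insert(c->root, op);
 *        } while (ret == -EINTR);
 *
 * where traverse_and_insert() is a hypothetical helper standing in for the
 * real traversal: it raises op->lock and returns -EINTR when it discovers it
 * needs write locks higher up, so the next pass re-traverses from the root
 * holding the stronger locks.
 */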

#include "bset.h"
#include "debug.h"

struct btree_write {
        atomic_t                *journal;

        /* If btree_split() frees a btree node, it writes a new pointer to that
         * btree node indicating it was freed; it takes a refcount on
         * c->prio_blocked because we can't write the gens until the new
         * pointer is on disk. This allows btree_write_endio() to release the
         * refcount that btree_split() took.
         */
        int                     prio_blocked;
};

struct btree {
        /* Hottest entries first */
        struct hlist_node       hash;

        /* Key/pointer for this btree node */
        BKEY_PADDED(key);

        /* Single bit - set when accessed, cleared by shrinker */
        unsigned long           accessed;
        unsigned long           seq;
        struct rw_semaphore     lock;
        struct cache_set        *c;
        struct btree            *parent;

        struct mutex            write_lock;

        unsigned long           flags;
        uint16_t                written;        /* would be nice to kill */
        uint8_t                 level;

        struct btree_keys       keys;

        /* For outstanding btree writes, used as a lock - protects write_idx */
        struct closure          io;
        struct semaphore        io_mutex;

        struct list_head        list;
        struct delayed_work     work;

        struct btree_write      writes[2];
        struct bio              *bio;
};

#define BTREE_FLAG(flag)                                                \
static inline bool btree_node_ ## flag(struct btree *b)                 \
{       return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
                                                                        \
static inline void set_btree_node_ ## flag(struct btree *b)             \
{       set_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
        BTREE_NODE_io_error,
        BTREE_NODE_dirty,
        BTREE_NODE_write_idx,
        BTREE_NODE_journal_flush,
};

BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
BTREE_FLAG(journal_flush);
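
/*
 * For reference, BTREE_FLAG(dirty) above expands to roughly:
 *
 *        static inline bool btree_node_dirty(struct btree *b)
 *        {       return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *        static inline void set_btree_node_dirty(struct btree *b)
 *        {       set_bit(BTREE_NODE_dirty, &b->flags); }
 *
 * i.e. one test_bit() accessor and one set_bit() setter per flag in
 * enum btree_flags.
 */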

static inline struct btree_write *btree_current_write(struct btree *b)
{
        return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
        return b->writes + (btree_node_write_idx(b) ^ 1);
}
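
/*
 * The BTREE_NODE_write_idx bit selects which element of b->writes[] is the
 * one currently being set up; per the BTREE IO comment above, the bit is
 * toggled when we switch between the two, so btree_prev_write() returns the
 * slot whose write is, or most recently was, in flight.
 */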

static inline struct bset *btree_bset_first(struct btree *b)
{
        return b->keys.set->data;
}

static inline struct bset *btree_bset_last(struct btree *b)
{
        return bset_tree_last(&b->keys)->data;
}

static inline unsigned int bset_block_offset(struct btree *b, struct bset *i)
{
        return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
}

static inline void set_gc_sectors(struct cache_set *c)
{
        atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
}

void bkey_put(struct cache_set *c, struct bkey *k);

/* Looping macros */

#define for_each_cached_btree(b, c, iter)                               \
        for (iter = 0;                                                  \
             iter < ARRAY_SIZE((c)->bucket_hash);                       \
             iter++)                                                    \
                hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
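
/*
 * A minimal usage sketch: the macro walks every hash chain of
 * (c)->bucket_hash with hlist_for_each_entry_rcu(), so the caller is assumed
 * to hold rcu_read_lock() (or otherwise exclude removal) around the walk;
 * counting dirty nodes here is purely illustrative:
 *
 *        struct btree *b;
 *        unsigned int i, dirty = 0;
 *
 *        rcu_read_lock();
 *        for_each_cached_btree(b, c, i)
 *                if (btree_node_dirty(b))
 *                        dirty++;
 *        rcu_read_unlock();
 */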

/* Recursing down the btree */

struct btree_op {
        /* for waiting on btree reserve in btree_split() */
        wait_queue_entry_t              wait;

        /* Btree level at which we start taking write locks */
        short                   lock;

        unsigned int            insert_collision:1;
};

static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
{
        memset(op, 0, sizeof(struct btree_op));
        init_wait(&op->wait);
        op->lock = write_lock_level;
}
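
/*
 * A minimal sketch of setting up an op on the stack. Per the LOCKING comment
 * at the top of this file, write locks are taken at levels <= op->lock, so
 *
 *        struct btree_op op;
 *
 *        bch_btree_op_init(&op, 0);
 *
 * takes a write lock on leaf nodes only, while
 *
 *        bch_btree_op_init(&op, -1);
 *
 * never takes write locks (a read-only traversal) - the -1 convention is an
 * inference from that rule, not a value spelled out in this header.
 */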

static inline void rw_lock(bool w, struct btree *b, int level)
{
        w ? down_write_nested(&b->lock, level + 1)
          : down_read_nested(&b->lock, level + 1);
        if (w)
                b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
        if (w)
                b->seq++;
        (w ? up_write : up_read)(&b->lock);
}
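
/*
 * A hedged sketch of the sequence-number check described in the LOCKING
 * comment above (the real upgrade logic is in btree.c, around
 * bch_btree_insert_check_key()): remember b->seq under the read lock, drop
 * it, retake the node with a write lock, and treat anything other than a
 * single increment - the one our own rw_lock(true, ...) just made - as a
 * race with another writer:
 *
 *        unsigned long seq = b->seq;
 *
 *        rw_unlock(false, b);
 *        rw_lock(true, b, b->level);
 *
 *        if (b->seq != seq + 1)
 *                goto retry;
 */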

void bch_btree_node_read_done(struct btree *b);
void __bch_btree_node_write(struct btree *b, struct closure *parent);
void bch_btree_node_write(struct btree *b, struct closure *parent);

void bch_btree_set_root(struct btree *b);
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
                                     int level, bool wait,
                                     struct btree *parent);
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
                                 struct bkey *k, int level, bool write,
                                 struct btree *parent);

int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
                               struct bkey *check_key);
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
                     atomic_t *journal_ref, struct bkey *replace_key);

int bch_gc_thread_start(struct cache_set *c);
void bch_initial_gc_finish(struct cache_set *c);
void bch_moving_gc(struct cache_set *c);
int bch_btree_check(struct cache_set *c);
void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);

static inline void wake_up_gc(struct cache_set *c)
{
        wake_up(&c->gc_wait);
}

static inline void force_wake_up_gc(struct cache_set *c)
{
        /*
         * The garbage collection thread only runs while sectors_to_gc < 0,
         * so calling wake_up_gc() alone won't start the gc thread if
         * sectors_to_gc is not negative.
         * Therefore sectors_to_gc is set to -1 here before waking the gc
         * thread with wake_up_gc(), giving gc_should_run() a chance to let
         * the gc thread run. "A chance" because, before gc_should_run() is
         * evaluated, c->sectors_to_gc may still be set to another positive
         * value, so this routine cannot guarantee that the gc thread will
         * actually be woken up and run.
         */
        atomic_set(&c->sectors_to_gc, -1);
        wake_up_gc(c);
}

#define MAP_DONE        0
#define MAP_CONTINUE    1

#define MAP_ALL_NODES   0
#define MAP_LEAF_NODES  1

#define MAP_END_KEY     1

typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b);
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
                          struct bkey *from, btree_map_nodes_fn *fn, int flags);

static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
                                      struct bkey *from, btree_map_nodes_fn *fn)
{
        return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
}

static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
                                           struct cache_set *c,
                                           struct bkey *from,
                                           btree_map_nodes_fn *fn)
{
        return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
}
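
/*
 * A minimal sketch of a node-mapping callback. fn is called once per node
 * (leaf nodes only with MAP_LEAF_NODES) and, as the names suggest, returns
 * MAP_CONTINUE to keep walking or MAP_DONE to stop early. count_dirty_fn()
 * and nr_dirty are hypothetical, purely for illustration:
 *
 *        static atomic_long_t nr_dirty;
 *
 *        static int count_dirty_fn(struct btree_op *op, struct btree *b)
 *        {
 *                if (btree_node_dirty(b))
 *                        atomic_long_inc(&nr_dirty);
 *                return MAP_CONTINUE;
 *        }
 *
 *        struct btree_op op;
 *
 *        bch_btree_op_init(&op, -1);
 *        bch_btree_map_leaf_nodes(&op, c, &ZERO_KEY, count_dirty_fn);
 *
 * ZERO_KEY here is assumed to be the all-zeroes start key from bcache.h.
 */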

typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b,
                                struct bkey *k);
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
                       struct bkey *from, btree_map_keys_fn *fn, int flags);

typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k);

void bch_keybuf_init(struct keybuf *buf);
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
                       struct bkey *end, keybuf_pred_fn *pred);
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
                                  struct bkey *end);
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w);
struct keybuf_key *bch_keybuf_next(struct keybuf *buf);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
                                          struct keybuf *buf,
                                          struct bkey *end,
                                          keybuf_pred_fn *pred);
void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
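
/*
 * A hedged sketch of how the keybuf interface fits together, modeled loosely
 * on its writeback/moving-gc users; dirty_pred() and the loop below are
 * illustrative, not copied from them:
 *
 *        static bool dirty_pred(struct keybuf *buf, struct bkey *k)
 *        {
 *                return KEY_DIRTY(k);
 *        }
 *
 *        struct keybuf buf;
 *        struct keybuf_key *w;
 *
 *        bch_keybuf_init(&buf);
 *        bch_refill_keybuf(c, &buf, &MAX_KEY, dirty_pred);
 *
 *        while ((w = bch_keybuf_next(&buf))) {
 *                ...                       (operate on w->key)
 *                bch_keybuf_del(&buf, w);
 *        }
 */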
#endif
