root/drivers/md/dm-clone-target.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. get_clone_mode
  2. clone_device_name
  3. __set_clone_mode
  4. __abort_transaction
  5. __reload_in_core_bitset
  6. __metadata_operation_failed
  7. wakeup_hydration_waiters
  8. wake_worker
  9. remap_to_source
  10. remap_to_dest
  11. bio_triggers_commit
  12. region_to_sector
  13. bio_to_region
  14. bio_region_range
  15. is_overwrite_bio
  16. fail_bios
  17. submit_bios
  18. issue_bio
  19. remap_and_issue
  20. issue_deferred_bios
  21. complete_overwrite_bio
  22. trim_bio
  23. complete_discard_bio
  24. process_discard_bio
  25. hash_table_init
  26. hash_table_exit
  27. get_hash_table_bucket
  28. __hash_find
  29. __insert_region_hydration
  30. __find_or_insert_region_hydration
  31. alloc_hydration
  32. free_hydration
  33. hydration_init
  34. hydration_update_metadata
  35. hydration_complete
  36. hydration_kcopyd_callback
  37. hydration_copy
  38. overwrite_endio
  39. hydration_overwrite
  40. hydrate_bio_region
  41. __batch_hydration
  42. __start_next_hydration
  43. do_hydration
  44. need_commit_due_to_time
  45. commit_metadata
  46. process_deferred_discards
  47. process_deferred_bios
  48. process_deferred_flush_bios
  49. do_worker
  50. do_waker
  51. clone_map
  52. clone_endio
  53. emit_flags
  54. emit_core_args
  55. clone_status
  56. clone_is_congested
  57. get_dev_size
  58. parse_feature_args
  59. parse_core_args
  60. parse_region_size
  61. validate_nr_regions
  62. parse_metadata_dev
  63. parse_dest_dev
  64. parse_source_dev
  65. copy_ctr_args
  66. clone_ctr
  67. clone_dtr
  68. clone_postsuspend
  69. clone_resume
  70. bdev_supports_discards
  71. disable_passdown_if_not_supported
  72. set_discard_limits
  73. clone_io_hints
  74. clone_iterate_devices
  75. set_hydration_threshold
  76. set_hydration_batch_size
  77. enable_hydration
  78. disable_hydration
  79. clone_message
  80. dm_clone_init
  81. dm_clone_exit

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
   4  */
   5 
   6 #include <linux/mm.h>
   7 #include <linux/bio.h>
   8 #include <linux/err.h>
   9 #include <linux/hash.h>
  10 #include <linux/list.h>
  11 #include <linux/log2.h>
  12 #include <linux/init.h>
  13 #include <linux/slab.h>
  14 #include <linux/wait.h>
  15 #include <linux/dm-io.h>
  16 #include <linux/mutex.h>
  17 #include <linux/atomic.h>
  18 #include <linux/bitops.h>
  19 #include <linux/blkdev.h>
  20 #include <linux/kdev_t.h>
  21 #include <linux/kernel.h>
  22 #include <linux/module.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/mempool.h>
  25 #include <linux/spinlock.h>
  26 #include <linux/blk_types.h>
  27 #include <linux/dm-kcopyd.h>
  28 #include <linux/workqueue.h>
  29 #include <linux/backing-dev.h>
  30 #include <linux/device-mapper.h>
  31 
  32 #include "dm.h"
  33 #include "dm-clone-metadata.h"
  34 
  35 #define DM_MSG_PREFIX "clone"
  36 
  37 /*
  38  * Minimum and maximum allowed region sizes, in 512-byte sectors
  39  */
  40 #define MIN_REGION_SIZE (1 << 3)  /* 4KB */
  41 #define MAX_REGION_SIZE (1 << 21) /* 1GB */
  42 
  43 #define MIN_HYDRATIONS 256 /* Size of hydration mempool */
  44 #define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
  45 #define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
  46 
  47 #define COMMIT_PERIOD HZ /* 1 sec, expressed in jiffies */
  48 
  49 /*
  50  * Hydration hash table size: 1 << HASH_TABLE_BITS (i.e. 32768 buckets)
  51  */
  52 #define HASH_TABLE_BITS 15
  53 
  54 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
  55         "A percentage of time allocated for hydrating regions");
  56 
  57 /* Slab cache for struct dm_clone_region_hydration */
  58 static struct kmem_cache *_hydration_cache;
  59 
  60 /* dm-clone metadata modes; order matters, callers test `mode >= CM_READ_ONLY` */
  61 enum clone_metadata_mode {
  62         CM_WRITE,               /* metadata may be changed */
  63         CM_READ_ONLY,           /* metadata may not be changed */
  64         CM_FAIL,                /* all metadata I/O fails; never left once entered */
  65 };
  66 
  67 struct hash_table_bucket;
  68 
  69 struct clone {
  70         struct dm_target *ti; /* Target this clone belongs to (table, length) */
  71         struct dm_target_callbacks callbacks;
  72 
  73         struct dm_dev *metadata_dev;
  74         struct dm_dev *dest_dev; /* Bios and hydration copies are directed here */
  75         struct dm_dev *source_dev; /* Regions are copied from here during hydration */
  76 
  77         unsigned long nr_regions; /* Total number of regions */
  78         sector_t region_size; /* Region size, in sectors */
  79         unsigned int region_shift; /* Shift converting between regions and sectors */
  80 
  81         /*
  82          * A metadata commit and the actions taken in case it fails should run
  83          * as a single atomic step.
  84          */
  85         struct mutex commit_lock;
  86 
  87         struct dm_clone_metadata *cmd;
  88 
  89         /*
  90          * bio used to flush the destination device, before committing the
  91          * metadata.
  92          */
  93         struct bio flush_bio;
  94 
  95         /* Region hydration hash table: array of 1 << HASH_TABLE_BITS buckets */
  96         struct hash_table_bucket *ht;
  97 
  98         atomic_t ios_in_flight; /* Background hydration is only continued while this is zero */
  99 
 100         wait_queue_head_t hydration_stopped; /* Woken when hydrations_in_flight drops to zero */
 101 
 102         mempool_t hydration_pool; /* Pool of struct dm_clone_region_hydration */
 103 
 104         unsigned long last_commit_jiffies;
 105 
 106         /*
 107          * We defer incoming WRITE bios for regions that are not hydrated,
 108          * until after these regions have been hydrated.
 109          *
 110          * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
 111          * metadata have been committed.
 112          */
 113         spinlock_t lock; /* Protects the deferred bio lists below */
 114         struct bio_list deferred_bios;
 115         struct bio_list deferred_discard_bios;
 116         struct bio_list deferred_flush_bios;
 117         struct bio_list deferred_flush_completions;
 118 
 119         /* Maximum number of regions being copied during background hydration. */
 120         unsigned int hydration_threshold;
 121 
 122         /* Number of regions to batch together during background hydration. */
 123         unsigned int hydration_batch_size;
 124 
 125         /* Which region to hydrate next */
 126         unsigned long hydration_offset;
 127 
 128         atomic_t hydrations_in_flight; /* Hydrations (copies and overwrites) started but not yet completed */
 129 
 130         /*
 131          * Save a copy of the table line rather than reconstructing it for the
 132          * status.
 133          */
 134         unsigned int nr_ctr_args;
 135         const char **ctr_args;
 136 
 137         struct workqueue_struct *wq;
 138         struct work_struct worker; /* Queued on wq by wake_worker() */
 139         struct delayed_work waker;
 140 
 141         struct dm_kcopyd_client *kcopyd_client;
 142 
 143         enum clone_metadata_mode mode; /* Accessed with READ_ONCE()/WRITE_ONCE() */
 144         unsigned long flags; /* Bitmap indexed by the DM_CLONE_* bit numbers */
 145 };
 146 
 147 /*
 148  * dm-clone flags: bit numbers within clone->flags
 149  */
 150 #define DM_CLONE_DISCARD_PASSDOWN 0 /* Pass discards down to the destination device */
 151 #define DM_CLONE_HYDRATION_ENABLED 1 /* Background hydration is enabled */
 152 #define DM_CLONE_HYDRATION_SUSPENDED 2
 153 
 154 /*---------------------------------------------------------------------------*/
 155 
 156 /*
 157  * Metadata failure handling.
 158  */
 159 static enum clone_metadata_mode get_clone_mode(struct clone *clone)
 160 {
 161         return READ_ONCE(clone->mode);
 162 }
 163 
 164 static const char *clone_device_name(struct clone *clone)
 165 {
 166         return dm_table_device_name(clone->ti->table);
 167 }
 168 
 169 static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
 170 {
 171         const char *descs[] = {
 172                 "read-write",
 173                 "read-only",
 174                 "fail"
 175         };
 176 
 177         enum clone_metadata_mode old_mode = get_clone_mode(clone);
 178 
 179         /* Never move out of fail mode */
 180         if (old_mode == CM_FAIL)
 181                 new_mode = CM_FAIL;
 182 
 183         switch (new_mode) {
 184         case CM_FAIL:
 185         case CM_READ_ONLY:
 186                 dm_clone_metadata_set_read_only(clone->cmd);
 187                 break;
 188 
 189         case CM_WRITE:
 190                 dm_clone_metadata_set_read_write(clone->cmd);
 191                 break;
 192         }
 193 
 194         WRITE_ONCE(clone->mode, new_mode);
 195 
 196         if (new_mode != old_mode) {
 197                 dm_table_event(clone->ti->table);
 198                 DMINFO("%s: Switching to %s mode", clone_device_name(clone),
 199                        descs[(int)new_mode]);
 200         }
 201 }
 202 
 203 static void __abort_transaction(struct clone *clone)
 204 {
 205         const char *dev_name = clone_device_name(clone);
 206 
 207         if (get_clone_mode(clone) >= CM_READ_ONLY)
 208                 return;
 209 
 210         DMERR("%s: Aborting current metadata transaction", dev_name);
 211         if (dm_clone_metadata_abort(clone->cmd)) {
 212                 DMERR("%s: Failed to abort metadata transaction", dev_name);
 213                 __set_clone_mode(clone, CM_FAIL);
 214         }
 215 }
 216 
 217 static void __reload_in_core_bitset(struct clone *clone)
 218 {
 219         const char *dev_name = clone_device_name(clone);
 220 
 221         if (get_clone_mode(clone) == CM_FAIL)
 222                 return;
 223 
 224         /* Reload the on-disk bitset */
 225         DMINFO("%s: Reloading on-disk bitmap", dev_name);
 226         if (dm_clone_reload_in_core_bitset(clone->cmd)) {
 227                 DMERR("%s: Failed to reload on-disk bitmap", dev_name);
 228                 __set_clone_mode(clone, CM_FAIL);
 229         }
 230 }
 231 
 232 static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
 233 {
 234         DMERR("%s: Metadata operation `%s' failed: error = %d",
 235               clone_device_name(clone), op, r);
 236 
 237         __abort_transaction(clone);
 238         __set_clone_mode(clone, CM_READ_ONLY);
 239 
 240         /*
 241          * dm_clone_reload_in_core_bitset() may run concurrently with either
 242          * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
 243          * it's safe as we have already set the metadata to read-only mode.
 244          */
 245         __reload_in_core_bitset(clone);
 246 }
 247 
 248 /*---------------------------------------------------------------------------*/
 249 
 250 /* Wake up anyone waiting for region hydrations to stop */
 251 static inline void wakeup_hydration_waiters(struct clone *clone)
 252 {
 253         wake_up_all(&clone->hydration_stopped);
 254 }
 255 
 256 static inline void wake_worker(struct clone *clone)
 257 {
 258         queue_work(clone->wq, &clone->worker);
 259 }
 260 
 261 /*---------------------------------------------------------------------------*/
 262 
 263 /*
 264  * bio helper functions.
 265  */
 266 static inline void remap_to_source(struct clone *clone, struct bio *bio)
 267 {
 268         bio_set_dev(bio, clone->source_dev->bdev);
 269 }
 270 
 271 static inline void remap_to_dest(struct clone *clone, struct bio *bio)
 272 {
 273         bio_set_dev(bio, clone->dest_dev->bdev);
 274 }
 275 
 276 static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
 277 {
 278         return op_is_flush(bio->bi_opf) &&
 279                 dm_clone_changed_this_transaction(clone->cmd);
 280 }
 281 
 282 /* Get the address of the region in sectors */
 283 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
 284 {
 285         return ((sector_t)region_nr << clone->region_shift);
 286 }
 287 
 288 /* Get the region number of the bio */
 289 static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
 290 {
 291         return (bio->bi_iter.bi_sector >> clone->region_shift);
 292 }
 293 
 294 /* Get the region range covered by the bio */
 295 static void bio_region_range(struct clone *clone, struct bio *bio,
 296                              unsigned long *rs, unsigned long *nr_regions)
 297 {
 298         unsigned long end;
 299 
 300         *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
 301         end = bio_end_sector(bio) >> clone->region_shift;
 302 
 303         if (*rs >= end)
 304                 *nr_regions = 0;
 305         else
 306                 *nr_regions = end - *rs;
 307 }
 308 
 309 /* Check whether a bio overwrites a region */
 310 static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
 311 {
 312         return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
 313 }
 314 
 315 static void fail_bios(struct bio_list *bios, blk_status_t status)
 316 {
 317         struct bio *bio;
 318 
 319         while ((bio = bio_list_pop(bios))) {
 320                 bio->bi_status = status;
 321                 bio_endio(bio);
 322         }
 323 }
 324 
 325 static void submit_bios(struct bio_list *bios)
 326 {
 327         struct bio *bio;
 328         struct blk_plug plug;
 329 
 330         blk_start_plug(&plug);
 331 
 332         while ((bio = bio_list_pop(bios)))
 333                 generic_make_request(bio);
 334 
 335         blk_finish_plug(&plug);
 336 }
 337 
 338 /*
 339  * Submit bio to the underlying device.
 340  *
 341  * If the bio triggers a commit, delay it, until after the metadata have been
 342  * committed.
 343  *
 344  * NOTE: The bio remapping must be performed by the caller.
 345  */
 346 static void issue_bio(struct clone *clone, struct bio *bio)
 347 {
 348         if (!bio_triggers_commit(clone, bio)) {
 349                 generic_make_request(bio);
 350                 return;
 351         }
 352 
 353         /*
 354          * If the metadata mode is RO or FAIL we won't be able to commit the
 355          * metadata, so we complete the bio with an error.
 356          */
 357         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
 358                 bio_io_error(bio);
 359                 return;
 360         }
 361 
 362         /*
 363          * Batch together any bios that trigger commits and then issue a single
 364          * commit for them in process_deferred_flush_bios().
 365          */
 366         spin_lock_irq(&clone->lock);
 367         bio_list_add(&clone->deferred_flush_bios, bio);
 368         spin_unlock_irq(&clone->lock);
 369 
 370         wake_worker(clone);
 371 }
 372 
 373 /*
 374  * Remap bio to the destination device and submit it.
 375  *
 376  * If the bio triggers a commit, delay it, until after the metadata have been
 377  * committed.
 378  */
 379 static void remap_and_issue(struct clone *clone, struct bio *bio)
 380 {
 381         remap_to_dest(clone, bio);
 382         issue_bio(clone, bio);
 383 }
 384 
 385 /*
 386  * Issue bios that have been deferred until after their region has finished
 387  * hydrating.
 388  *
 389  * We delegate the bio submission to the worker thread, so this is safe to call
 390  * from interrupt context.
 391  */
 392 static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
 393 {
 394         struct bio *bio;
 395         unsigned long flags;
 396         struct bio_list flush_bios = BIO_EMPTY_LIST;
 397         struct bio_list normal_bios = BIO_EMPTY_LIST;
 398 
 399         if (bio_list_empty(bios))
 400                 return;
 401 
 402         while ((bio = bio_list_pop(bios))) {
 403                 if (bio_triggers_commit(clone, bio))
 404                         bio_list_add(&flush_bios, bio);
 405                 else
 406                         bio_list_add(&normal_bios, bio);
 407         }
 408 
 409         spin_lock_irqsave(&clone->lock, flags);
 410         bio_list_merge(&clone->deferred_bios, &normal_bios);
 411         bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
 412         spin_unlock_irqrestore(&clone->lock, flags);
 413 
 414         wake_worker(clone);
 415 }
 416 
 417 static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
 418 {
 419         unsigned long flags;
 420 
 421         /*
 422          * If the bio has the REQ_FUA flag set we must commit the metadata
 423          * before signaling its completion.
 424          *
 425          * complete_overwrite_bio() is only called by hydration_complete(),
 426          * after having successfully updated the metadata. This means we don't
 427          * need to call dm_clone_changed_this_transaction() to check if the
 428          * metadata has changed and thus we can avoid taking the metadata spin
 429          * lock.
 430          */
 431         if (!(bio->bi_opf & REQ_FUA)) {
 432                 bio_endio(bio);
 433                 return;
 434         }
 435 
 436         /*
 437          * If the metadata mode is RO or FAIL we won't be able to commit the
 438          * metadata, so we complete the bio with an error.
 439          */
 440         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
 441                 bio_io_error(bio);
 442                 return;
 443         }
 444 
 445         /*
 446          * Batch together any bios that trigger commits and then issue a single
 447          * commit for them in process_deferred_flush_bios().
 448          */
 449         spin_lock_irqsave(&clone->lock, flags);
 450         bio_list_add(&clone->deferred_flush_completions, bio);
 451         spin_unlock_irqrestore(&clone->lock, flags);
 452 
 453         wake_worker(clone);
 454 }
 455 
 456 static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
 457 {
 458         bio->bi_iter.bi_sector = sector;
 459         bio->bi_iter.bi_size = to_bytes(len);
 460 }
 461 
 462 static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
 463 {
 464         unsigned long rs, nr_regions;
 465 
 466         /*
 467          * If the destination device supports discards, remap and trim the
 468          * discard bio and pass it down. Otherwise complete the bio
 469          * immediately.
 470          */
 471         if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
 472                 remap_to_dest(clone, bio);
 473                 bio_region_range(clone, bio, &rs, &nr_regions);
 474                 trim_bio(bio, region_to_sector(clone, rs),
 475                          nr_regions << clone->region_shift);
 476                 generic_make_request(bio);
 477         } else
 478                 bio_endio(bio);
 479 }
 480 
 481 static void process_discard_bio(struct clone *clone, struct bio *bio)
 482 {
 483         unsigned long rs, nr_regions;
 484 
 485         bio_region_range(clone, bio, &rs, &nr_regions);
 486         if (!nr_regions) {
 487                 bio_endio(bio);
 488                 return;
 489         }
 490 
 491         if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
 492                     (rs + nr_regions) > clone->nr_regions)) {
 493                 DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
 494                       clone_device_name(clone), rs, nr_regions,
 495                       clone->nr_regions,
 496                       (unsigned long long)bio->bi_iter.bi_sector,
 497                       bio_sectors(bio));
 498                 bio_endio(bio);
 499                 return;
 500         }
 501 
 502         /*
 503          * The covered regions are already hydrated so we just need to pass
 504          * down the discard.
 505          */
 506         if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
 507                 complete_discard_bio(clone, bio, true);
 508                 return;
 509         }
 510 
 511         /*
 512          * If the metadata mode is RO or FAIL we won't be able to update the
 513          * metadata for the regions covered by the discard so we just ignore
 514          * it.
 515          */
 516         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
 517                 bio_endio(bio);
 518                 return;
 519         }
 520 
 521         /*
 522          * Defer discard processing.
 523          */
 524         spin_lock_irq(&clone->lock);
 525         bio_list_add(&clone->deferred_discard_bios, bio);
 526         spin_unlock_irq(&clone->lock);
 527 
 528         wake_worker(clone);
 529 }
 530 
 531 /*---------------------------------------------------------------------------*/
 532 
 533 /*
 534  * dm-clone region hydrations.
 535  */
 536 struct dm_clone_region_hydration {
 537         struct clone *clone; /* Owning clone; set by alloc_hydration() */
 538         unsigned long region_nr; /* Region being hydrated; key in the hydration hash table */
 539 
 540         struct bio *overwrite_bio; /* Bio overwriting the region, or NULL */
 541         bio_end_io_t *overwrite_bio_end_io; /* overwrite_bio's saved bi_end_io, restored on completion */
 542 
 543         struct bio_list deferred_bios; /* Bios deferred until this hydration completes */
 544 
 545         blk_status_t status; /* Outcome of the copy or of the overwrite bio */
 546 
 547         /* Used by hydration batching */
 548         struct list_head list;
 549 
 550         /* Used by hydration hash table */
 551         struct hlist_node h;
 552 };
 553 
 554 /*
 555  * Hydration hash table implementation.
 556  *
 557  * Ideally we would like to use list_bl, which uses bit spin locks and employs
 558  * the least significant bit of the list head to lock the corresponding bucket,
 559  * reducing the memory overhead for the locks. But, currently, list_bl and bit
 560  * spin locks don't support IRQ safe versions. Since we have to take the lock
 561  * in both process and interrupt context, we must fall back to using regular
 562  * spin locks; one per hash table bucket.
 563  */
 564 struct hash_table_bucket {
 565         struct hlist_head head; /* Hydrations whose region_nr hashes to this bucket */
 566 
 567         /* Spinlock protecting the bucket */
 568         spinlock_t lock;
 569 };
 570 
 571 #define bucket_lock_irqsave(bucket, flags) \
 572         spin_lock_irqsave(&(bucket)->lock, flags) /* take the bucket lock, saving IRQ state in flags */
 573 
 574 #define bucket_unlock_irqrestore(bucket, flags) \
 575         spin_unlock_irqrestore(&(bucket)->lock, flags) /* drop the bucket lock, restoring IRQ state from flags */
 576 
 577 static int hash_table_init(struct clone *clone)
 578 {
 579         unsigned int i, sz;
 580         struct hash_table_bucket *bucket;
 581 
 582         sz = 1 << HASH_TABLE_BITS;
 583 
 584         clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
 585         if (!clone->ht)
 586                 return -ENOMEM;
 587 
 588         for (i = 0; i < sz; i++) {
 589                 bucket = clone->ht + i;
 590 
 591                 INIT_HLIST_HEAD(&bucket->head);
 592                 spin_lock_init(&bucket->lock);
 593         }
 594 
 595         return 0;
 596 }
 597 
 598 static void hash_table_exit(struct clone *clone)
 599 {
 600         kvfree(clone->ht);
 601 }
 602 
 603 static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
 604                                                        unsigned long region_nr)
 605 {
 606         return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
 607 }
 608 
 609 /*
 610  * Search hash table for a hydration with hd->region_nr == region_nr
 611  *
 612  * NOTE: Must be called with the bucket lock held
 613  */
 614 static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
 615                                                      unsigned long region_nr)
 616 {
 617         struct dm_clone_region_hydration *hd;
 618 
 619         hlist_for_each_entry(hd, &bucket->head, h) {
 620                 if (hd->region_nr == region_nr)
 621                         return hd;
 622         }
 623 
 624         return NULL;
 625 }
 626 
 627 /*
 628  * Insert a hydration into the hash table.
 629  *
 630  * NOTE: Must be called with the bucket lock held.
 631  */
 632 static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
 633                                              struct dm_clone_region_hydration *hd)
 634 {
 635         hlist_add_head(&hd->h, &bucket->head);
 636 }
 637 
 638 /*
 639  * This function inserts a hydration into the hash table, unless someone else
 640  * managed to insert a hydration for the same region first. In the latter case
 641  * it returns the existing hydration descriptor for this region.
 642  *
 643  * NOTE: Must be called with the hydration hash table lock held.
 644  */
 645 static struct dm_clone_region_hydration *
 646 __find_or_insert_region_hydration(struct hash_table_bucket *bucket,
 647                                   struct dm_clone_region_hydration *hd)
 648 {
 649         struct dm_clone_region_hydration *hd2;
 650 
 651         hd2 = __hash_find(bucket, hd->region_nr);
 652         if (hd2)
 653                 return hd2;
 654 
 655         __insert_region_hydration(bucket, hd);
 656 
 657         return hd;
 658 }
 659 
 660 /*---------------------------------------------------------------------------*/
 661 
 662 /* Allocate a hydration */
 663 static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
 664 {
 665         struct dm_clone_region_hydration *hd;
 666 
 667         /*
 668          * Allocate a hydration from the hydration mempool.
 669          * This might block but it can't fail.
 670          */
 671         hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
 672         hd->clone = clone;
 673 
 674         return hd;
 675 }
 676 
 677 static inline void free_hydration(struct dm_clone_region_hydration *hd)
 678 {
 679         mempool_free(hd, &hd->clone->hydration_pool);
 680 }
 681 
 682 /* Initialize a hydration */
 683 static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
 684 {
 685         hd->region_nr = region_nr;
 686         hd->overwrite_bio = NULL;
 687         bio_list_init(&hd->deferred_bios);
 688         hd->status = 0;
 689 
 690         INIT_LIST_HEAD(&hd->list);
 691         INIT_HLIST_NODE(&hd->h);
 692 }
 693 
 694 /*---------------------------------------------------------------------------*/
 695 
 696 /*
 697  * Update dm-clone's metadata after a region has finished hydrating and remove
 698  * hydration from the hash table.
 699  */
 700 static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
 701 {
 702         int r = 0;
 703         unsigned long flags;
 704         struct hash_table_bucket *bucket;
 705         struct clone *clone = hd->clone;
 706 
 707         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
 708                 r = -EPERM;
 709 
 710         /* Update the metadata */
 711         if (likely(!r) && hd->status == BLK_STS_OK)
 712                 r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
 713 
 714         bucket = get_hash_table_bucket(clone, hd->region_nr);
 715 
 716         /* Remove hydration from hash table */
 717         bucket_lock_irqsave(bucket, flags);
 718         hlist_del(&hd->h);
 719         bucket_unlock_irqrestore(bucket, flags);
 720 
 721         return r;
 722 }
 723 
 724 /*
 725  * Complete a region's hydration:
 726  *
 727  *      1. Update dm-clone's metadata.
 728  *      2. Remove hydration from hash table.
 729  *      3. Complete overwrite bio.
 730  *      4. Issue deferred bios.
 731  *      5. If this was the last hydration, wake up anyone waiting for
 732  *         hydrations to finish.
 733  */
 734 static void hydration_complete(struct dm_clone_region_hydration *hd)
 735 {
 736         int r;
 737         blk_status_t status;
 738         struct clone *clone = hd->clone;
 739 
 740         r = hydration_update_metadata(hd);
 741 
 742         if (hd->status == BLK_STS_OK && likely(!r)) {
 743                 if (hd->overwrite_bio)
 744                         complete_overwrite_bio(clone, hd->overwrite_bio);
 745 
 746                 issue_deferred_bios(clone, &hd->deferred_bios);
 747         } else {
 748                 status = r ? BLK_STS_IOERR : hd->status;
 749 
 750                 if (hd->overwrite_bio)
 751                         bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
 752 
 753                 fail_bios(&hd->deferred_bios, status);
 754         }
 755 
 756         free_hydration(hd);
 757 
 758         if (atomic_dec_and_test(&clone->hydrations_in_flight))
 759                 wakeup_hydration_waiters(clone);
 760 }
 761 
 762 static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
 763 {
 764         blk_status_t status;
 765 
 766         struct dm_clone_region_hydration *tmp, *hd = context;
 767         struct clone *clone = hd->clone;
 768 
 769         LIST_HEAD(batched_hydrations);
 770 
 771         if (read_err || write_err) {
 772                 DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
 773                 status = BLK_STS_IOERR;
 774         } else {
 775                 status = BLK_STS_OK;
 776         }
 777         list_splice_tail(&hd->list, &batched_hydrations);
 778 
 779         hd->status = status;
 780         hydration_complete(hd);
 781 
 782         /* Complete batched hydrations */
 783         list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
 784                 hd->status = status;
 785                 hydration_complete(hd);
 786         }
 787 
 788         /* Continue background hydration, if there is no I/O in-flight */
 789         if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
 790             !atomic_read(&clone->ios_in_flight))
 791                 wake_worker(clone);
 792 }
 793 
 794 static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
 795 {
 796         unsigned long region_start, region_end;
 797         sector_t tail_size, region_size, total_size;
 798         struct dm_io_region from, to;
 799         struct clone *clone = hd->clone;
 800 
 801         if (WARN_ON(!nr_regions))
 802                 return;
 803 
 804         region_size = clone->region_size;
 805         region_start = hd->region_nr;
 806         region_end = region_start + nr_regions - 1;
 807 
 808         total_size = region_to_sector(clone, nr_regions - 1);
 809 
 810         if (region_end == clone->nr_regions - 1) {
 811                 /*
 812                  * The last region of the target might be smaller than
 813                  * region_size.
 814                  */
 815                 tail_size = clone->ti->len & (region_size - 1);
 816                 if (!tail_size)
 817                         tail_size = region_size;
 818         } else {
 819                 tail_size = region_size;
 820         }
 821 
 822         total_size += tail_size;
 823 
 824         from.bdev = clone->source_dev->bdev;
 825         from.sector = region_to_sector(clone, region_start);
 826         from.count = total_size;
 827 
 828         to.bdev = clone->dest_dev->bdev;
 829         to.sector = from.sector;
 830         to.count = from.count;
 831 
 832         /* Issue copy */
 833         atomic_add(nr_regions, &clone->hydrations_in_flight);
 834         dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
 835                        hydration_kcopyd_callback, hd);
 836 }
 837 
 838 static void overwrite_endio(struct bio *bio)
 839 {
 840         struct dm_clone_region_hydration *hd = bio->bi_private;
 841 
 842         bio->bi_end_io = hd->overwrite_bio_end_io;
 843         hd->status = bio->bi_status;
 844 
 845         hydration_complete(hd);
 846 }
 847 
 848 static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
 849 {
 850         /*
 851          * We don't need to save and restore bio->bi_private because device
 852          * mapper core generates a new bio for us to use, with clean
 853          * bi_private.
 854          */
 855         hd->overwrite_bio = bio;
 856         hd->overwrite_bio_end_io = bio->bi_end_io;
 857 
 858         bio->bi_end_io = overwrite_endio;
 859         bio->bi_private = hd;
 860 
 861         atomic_inc(&hd->clone->hydrations_in_flight);
 862         generic_make_request(bio);
 863 }
 864 
 865 /*
 866  * Hydrate bio's region.
 867  *
 868  * This function starts the hydration of the bio's region and puts the bio in
 869  * the list of deferred bios for this region. In case, by the time this
 870  * function is called, the region has finished hydrating it's submitted to the
 871  * destination device.
 872  *
 873  * NOTE: The bio remapping must be performed by the caller.
 874  */
 875 static void hydrate_bio_region(struct clone *clone, struct bio *bio)
 876 {
 877         unsigned long flags;
 878         unsigned long region_nr;
 879         struct hash_table_bucket *bucket;
 880         struct dm_clone_region_hydration *hd, *hd2;
 881 
 882         region_nr = bio_to_region(clone, bio);
 883         bucket = get_hash_table_bucket(clone, region_nr);
 884 
 885         bucket_lock_irqsave(bucket, flags);
 886 
 887         hd = __hash_find(bucket, region_nr);
 888         if (hd) {
 889                 /* Someone else is hydrating the region */
 890                 bio_list_add(&hd->deferred_bios, bio);
 891                 bucket_unlock_irqrestore(bucket, flags);
 892                 return;
 893         }
 894 
 895         if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
 896                 /* The region has been hydrated */
 897                 bucket_unlock_irqrestore(bucket, flags);
 898                 issue_bio(clone, bio);
 899                 return;
 900         }
 901 
 902         /*
 903          * We must allocate a hydration descriptor and start the hydration of
 904          * the corresponding region.
 905          */
 906         bucket_unlock_irqrestore(bucket, flags);
 907 
 908         hd = alloc_hydration(clone);
 909         hydration_init(hd, region_nr);
 910 
 911         bucket_lock_irqsave(bucket, flags);
 912 
 913         /* Check if the region has been hydrated in the meantime. */
 914         if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
 915                 bucket_unlock_irqrestore(bucket, flags);
 916                 free_hydration(hd);
 917                 issue_bio(clone, bio);
 918                 return;
 919         }
 920 
 921         hd2 = __find_or_insert_region_hydration(bucket, hd);
 922         if (hd2 != hd) {
 923                 /* Someone else started the region's hydration. */
 924                 bio_list_add(&hd2->deferred_bios, bio);
 925                 bucket_unlock_irqrestore(bucket, flags);
 926                 free_hydration(hd);
 927                 return;
 928         }
 929 
 930         /*
 931          * If the metadata mode is RO or FAIL then there is no point starting a
 932          * hydration, since we will not be able to update the metadata when the
 933          * hydration finishes.
 934          */
 935         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
 936                 hlist_del(&hd->h);
 937                 bucket_unlock_irqrestore(bucket, flags);
 938                 free_hydration(hd);
 939                 bio_io_error(bio);
 940                 return;
 941         }
 942 
 943         /*
 944          * Start region hydration.
 945          *
 946          * If a bio overwrites a region, i.e., its size is equal to the
 947          * region's size, then we don't need to copy the region from the source
 948          * to the destination device.
 949          */
 950         if (is_overwrite_bio(clone, bio)) {
 951                 bucket_unlock_irqrestore(bucket, flags);
 952                 hydration_overwrite(hd, bio);
 953         } else {
 954                 bio_list_add(&hd->deferred_bios, bio);
 955                 bucket_unlock_irqrestore(bucket, flags);
 956                 hydration_copy(hd, 1);
 957         }
 958 }
 959 
 960 /*---------------------------------------------------------------------------*/
 961 
 962 /*
 963  * Background hydrations.
 964  */
 965 
 966 /*
 967  * Batch region hydrations.
 968  *
 969  * To better utilize device bandwidth we batch together the hydration of
 970  * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
 971  * is good for small, random write performance (because of the overwriting of
 972  * un-hydrated regions) and at the same time issue big copy requests to kcopyd
 973  * to achieve high hydration bandwidth.
 974  */
 975 struct batch_info {
 976         struct dm_clone_region_hydration *head;
 977         unsigned int nr_batched_regions;
 978 };
 979 
 980 static void __batch_hydration(struct batch_info *batch,
 981                               struct dm_clone_region_hydration *hd)
 982 {
 983         struct clone *clone = hd->clone;
 984         unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
 985 
 986         if (batch->head) {
 987                 /* Try to extend the current batch */
 988                 if (batch->nr_batched_regions < max_batch_size &&
 989                     (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
 990                         list_add_tail(&hd->list, &batch->head->list);
 991                         batch->nr_batched_regions++;
 992                         hd = NULL;
 993                 }
 994 
 995                 /* Check if we should issue the current batch */
 996                 if (batch->nr_batched_regions >= max_batch_size || hd) {
 997                         hydration_copy(batch->head, batch->nr_batched_regions);
 998                         batch->head = NULL;
 999                         batch->nr_batched_regions = 0;
1000                 }
1001         }
1002 
1003         if (!hd)
1004                 return;
1005 
1006         /* We treat max batch sizes of zero and one equivalently */
1007         if (max_batch_size <= 1) {
1008                 hydration_copy(hd, 1);
1009                 return;
1010         }
1011 
1012         /* Start a new batch */
1013         BUG_ON(!list_empty(&hd->list));
1014         batch->head = hd;
1015         batch->nr_batched_regions = 1;
1016 }
1017 
1018 static unsigned long __start_next_hydration(struct clone *clone,
1019                                             unsigned long offset,
1020                                             struct batch_info *batch)
1021 {
1022         unsigned long flags;
1023         struct hash_table_bucket *bucket;
1024         struct dm_clone_region_hydration *hd;
1025         unsigned long nr_regions = clone->nr_regions;
1026 
1027         hd = alloc_hydration(clone);
1028 
1029         /* Try to find a region to hydrate. */
1030         do {
1031                 offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
1032                 if (offset == nr_regions)
1033                         break;
1034 
1035                 bucket = get_hash_table_bucket(clone, offset);
1036                 bucket_lock_irqsave(bucket, flags);
1037 
1038                 if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
1039                     !__hash_find(bucket, offset)) {
1040                         hydration_init(hd, offset);
1041                         __insert_region_hydration(bucket, hd);
1042                         bucket_unlock_irqrestore(bucket, flags);
1043 
1044                         /* Batch hydration */
1045                         __batch_hydration(batch, hd);
1046 
1047                         return (offset + 1);
1048                 }
1049 
1050                 bucket_unlock_irqrestore(bucket, flags);
1051 
1052         } while (++offset < nr_regions);
1053 
1054         if (hd)
1055                 free_hydration(hd);
1056 
1057         return offset;
1058 }
1059 
1060 /*
1061  * This function searches for regions that still reside in the source device
1062  * and starts their hydration.
1063  */
1064 static void do_hydration(struct clone *clone)
1065 {
1066         unsigned int current_volume;
1067         unsigned long offset, nr_regions = clone->nr_regions;
1068 
1069         struct batch_info batch = {
1070                 .head = NULL,
1071                 .nr_batched_regions = 0,
1072         };
1073 
1074         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1075                 return;
1076 
1077         if (dm_clone_is_hydration_done(clone->cmd))
1078                 return;
1079 
1080         /*
1081          * Avoid race with device suspension.
1082          */
1083         atomic_inc(&clone->hydrations_in_flight);
1084 
1085         /*
1086          * Make sure atomic_inc() is ordered before test_bit(), otherwise we
1087          * might race with clone_postsuspend() and start a region hydration
1088          * after the target has been suspended.
1089          *
1090          * This is paired with the smp_mb__after_atomic() in
1091          * clone_postsuspend().
1092          */
1093         smp_mb__after_atomic();
1094 
1095         offset = clone->hydration_offset;
1096         while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
1097                !atomic_read(&clone->ios_in_flight) &&
1098                test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
1099                offset < nr_regions) {
1100                 current_volume = atomic_read(&clone->hydrations_in_flight);
1101                 current_volume += batch.nr_batched_regions;
1102 
1103                 if (current_volume > READ_ONCE(clone->hydration_threshold))
1104                         break;
1105 
1106                 offset = __start_next_hydration(clone, offset, &batch);
1107         }
1108 
1109         if (batch.head)
1110                 hydration_copy(batch.head, batch.nr_batched_regions);
1111 
1112         if (offset >= nr_regions)
1113                 offset = 0;
1114 
1115         clone->hydration_offset = offset;
1116 
1117         if (atomic_dec_and_test(&clone->hydrations_in_flight))
1118                 wakeup_hydration_waiters(clone);
1119 }
1120 
1121 /*---------------------------------------------------------------------------*/
1122 
1123 static bool need_commit_due_to_time(struct clone *clone)
1124 {
1125         return !time_in_range(jiffies, clone->last_commit_jiffies,
1126                               clone->last_commit_jiffies + COMMIT_PERIOD);
1127 }
1128 
1129 /*
1130  * A non-zero return indicates read-only or fail mode.
1131  */
1132 static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
1133 {
1134         int r = 0;
1135 
1136         if (dest_dev_flushed)
1137                 *dest_dev_flushed = false;
1138 
1139         mutex_lock(&clone->commit_lock);
1140 
1141         if (!dm_clone_changed_this_transaction(clone->cmd))
1142                 goto out;
1143 
1144         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
1145                 r = -EPERM;
1146                 goto out;
1147         }
1148 
1149         r = dm_clone_metadata_pre_commit(clone->cmd);
1150         if (unlikely(r)) {
1151                 __metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
1152                 goto out;
1153         }
1154 
1155         bio_reset(&clone->flush_bio);
1156         bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
1157         clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1158 
1159         r = submit_bio_wait(&clone->flush_bio);
1160         if (unlikely(r)) {
1161                 __metadata_operation_failed(clone, "flush destination device", r);
1162                 goto out;
1163         }
1164 
1165         if (dest_dev_flushed)
1166                 *dest_dev_flushed = true;
1167 
1168         r = dm_clone_metadata_commit(clone->cmd);
1169         if (unlikely(r)) {
1170                 __metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
1171                 goto out;
1172         }
1173 
1174         if (dm_clone_is_hydration_done(clone->cmd))
1175                 dm_table_event(clone->ti->table);
1176 out:
1177         mutex_unlock(&clone->commit_lock);
1178 
1179         return r;
1180 }
1181 
1182 static void process_deferred_discards(struct clone *clone)
1183 {
1184         int r = -EPERM;
1185         struct bio *bio;
1186         struct blk_plug plug;
1187         unsigned long rs, nr_regions;
1188         struct bio_list discards = BIO_EMPTY_LIST;
1189 
1190         spin_lock_irq(&clone->lock);
1191         bio_list_merge(&discards, &clone->deferred_discard_bios);
1192         bio_list_init(&clone->deferred_discard_bios);
1193         spin_unlock_irq(&clone->lock);
1194 
1195         if (bio_list_empty(&discards))
1196                 return;
1197 
1198         if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1199                 goto out;
1200 
1201         /* Update the metadata */
1202         bio_list_for_each(bio, &discards) {
1203                 bio_region_range(clone, bio, &rs, &nr_regions);
1204                 /*
1205                  * A discard request might cover regions that have been already
1206                  * hydrated. There is no need to update the metadata for these
1207                  * regions.
1208                  */
1209                 r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
1210                 if (unlikely(r))
1211                         break;
1212         }
1213 out:
1214         blk_start_plug(&plug);
1215         while ((bio = bio_list_pop(&discards)))
1216                 complete_discard_bio(clone, bio, r == 0);
1217         blk_finish_plug(&plug);
1218 }
1219 
1220 static void process_deferred_bios(struct clone *clone)
1221 {
1222         struct bio_list bios = BIO_EMPTY_LIST;
1223 
1224         spin_lock_irq(&clone->lock);
1225         bio_list_merge(&bios, &clone->deferred_bios);
1226         bio_list_init(&clone->deferred_bios);
1227         spin_unlock_irq(&clone->lock);
1228 
1229         if (bio_list_empty(&bios))
1230                 return;
1231 
1232         submit_bios(&bios);
1233 }
1234 
1235 static void process_deferred_flush_bios(struct clone *clone)
1236 {
1237         struct bio *bio;
1238         bool dest_dev_flushed;
1239         struct bio_list bios = BIO_EMPTY_LIST;
1240         struct bio_list bio_completions = BIO_EMPTY_LIST;
1241 
1242         /*
1243          * If there are any deferred flush bios, we must commit the metadata
1244          * before issuing them or signaling their completion.
1245          */
1246         spin_lock_irq(&clone->lock);
1247         bio_list_merge(&bios, &clone->deferred_flush_bios);
1248         bio_list_init(&clone->deferred_flush_bios);
1249 
1250         bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
1251         bio_list_init(&clone->deferred_flush_completions);
1252         spin_unlock_irq(&clone->lock);
1253 
1254         if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
1255             !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
1256                 return;
1257 
1258         if (commit_metadata(clone, &dest_dev_flushed)) {
1259                 bio_list_merge(&bios, &bio_completions);
1260 
1261                 while ((bio = bio_list_pop(&bios)))
1262                         bio_io_error(bio);
1263 
1264                 return;
1265         }
1266 
1267         clone->last_commit_jiffies = jiffies;
1268 
1269         while ((bio = bio_list_pop(&bio_completions)))
1270                 bio_endio(bio);
1271 
1272         while ((bio = bio_list_pop(&bios))) {
1273                 if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
1274                         /* We just flushed the destination device as part of
1275                          * the metadata commit, so there is no reason to send
1276                          * another flush.
1277                          */
1278                         bio_endio(bio);
1279                 } else {
1280                         generic_make_request(bio);
1281                 }
1282         }
1283 }
1284 
1285 static void do_worker(struct work_struct *work)
1286 {
1287         struct clone *clone = container_of(work, typeof(*clone), worker);
1288 
1289         process_deferred_bios(clone);
1290         process_deferred_discards(clone);
1291 
1292         /*
1293          * process_deferred_flush_bios():
1294          *
1295          *   - Commit metadata
1296          *
1297          *   - Process deferred REQ_FUA completions
1298          *
1299          *   - Process deferred REQ_PREFLUSH bios
1300          */
1301         process_deferred_flush_bios(clone);
1302 
1303         /* Background hydration */
1304         do_hydration(clone);
1305 }
1306 
1307 /*
1308  * Commit periodically so that not too much unwritten data builds up.
1309  *
1310  * Also, restart background hydration, if it has been stopped by in-flight I/O.
1311  */
1312 static void do_waker(struct work_struct *work)
1313 {
1314         struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
1315 
1316         wake_worker(clone);
1317         queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
1318 }
1319 
1320 /*---------------------------------------------------------------------------*/
1321 
1322 /*
1323  * Target methods
1324  */
1325 static int clone_map(struct dm_target *ti, struct bio *bio)
1326 {
1327         struct clone *clone = ti->private;
1328         unsigned long region_nr;
1329 
1330         atomic_inc(&clone->ios_in_flight);
1331 
1332         if (unlikely(get_clone_mode(clone) == CM_FAIL))
1333                 return DM_MAPIO_KILL;
1334 
1335         /*
1336          * REQ_PREFLUSH bios carry no data:
1337          *
1338          * - Commit metadata, if changed
1339          *
1340          * - Pass down to destination device
1341          */
1342         if (bio->bi_opf & REQ_PREFLUSH) {
1343                 remap_and_issue(clone, bio);
1344                 return DM_MAPIO_SUBMITTED;
1345         }
1346 
1347         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1348 
1349         /*
1350          * dm-clone interprets discards and performs a fast hydration of the
1351          * discarded regions, i.e., we skip the copy from the source device and
1352          * just mark the regions as hydrated.
1353          */
1354         if (bio_op(bio) == REQ_OP_DISCARD) {
1355                 process_discard_bio(clone, bio);
1356                 return DM_MAPIO_SUBMITTED;
1357         }
1358 
1359         /*
1360          * If the bio's region is hydrated, redirect it to the destination
1361          * device.
1362          *
1363          * If the region is not hydrated and the bio is a READ, redirect it to
1364          * the source device.
1365          *
1366          * Else, defer WRITE bio until after its region has been hydrated and
1367          * start the region's hydration immediately.
1368          */
1369         region_nr = bio_to_region(clone, bio);
1370         if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
1371                 remap_and_issue(clone, bio);
1372                 return DM_MAPIO_SUBMITTED;
1373         } else if (bio_data_dir(bio) == READ) {
1374                 remap_to_source(clone, bio);
1375                 return DM_MAPIO_REMAPPED;
1376         }
1377 
1378         remap_to_dest(clone, bio);
1379         hydrate_bio_region(clone, bio);
1380 
1381         return DM_MAPIO_SUBMITTED;
1382 }
1383 
1384 static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
1385 {
1386         struct clone *clone = ti->private;
1387 
1388         atomic_dec(&clone->ios_in_flight);
1389 
1390         return DM_ENDIO_DONE;
1391 }
1392 
1393 static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
1394                        ssize_t *sz_ptr)
1395 {
1396         ssize_t sz = *sz_ptr;
1397         unsigned int count;
1398 
1399         count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1400         count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1401 
1402         DMEMIT("%u ", count);
1403 
1404         if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
1405                 DMEMIT("no_hydration ");
1406 
1407         if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1408                 DMEMIT("no_discard_passdown ");
1409 
1410         *sz_ptr = sz;
1411 }
1412 
1413 static void emit_core_args(struct clone *clone, char *result,
1414                            unsigned int maxlen, ssize_t *sz_ptr)
1415 {
1416         ssize_t sz = *sz_ptr;
1417         unsigned int count = 4;
1418 
1419         DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
1420                READ_ONCE(clone->hydration_threshold),
1421                READ_ONCE(clone->hydration_batch_size));
1422 
1423         *sz_ptr = sz;
1424 }
1425 
1426 /*
1427  * Status format:
1428  *
1429  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1430  * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
1431  * <#features> <features>* <#core args> <core args>* <clone metadata mode>
1432  */
1433 static void clone_status(struct dm_target *ti, status_type_t type,
1434                          unsigned int status_flags, char *result,
1435                          unsigned int maxlen)
1436 {
1437         int r;
1438         unsigned int i;
1439         ssize_t sz = 0;
1440         dm_block_t nr_free_metadata_blocks = 0;
1441         dm_block_t nr_metadata_blocks = 0;
1442         char buf[BDEVNAME_SIZE];
1443         struct clone *clone = ti->private;
1444 
1445         switch (type) {
1446         case STATUSTYPE_INFO:
1447                 if (get_clone_mode(clone) == CM_FAIL) {
1448                         DMEMIT("Fail");
1449                         break;
1450                 }
1451 
1452                 /* Commit to ensure statistics aren't out-of-date */
1453                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
1454                         (void) commit_metadata(clone, NULL);
1455 
1456                 r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
1457 
1458                 if (r) {
1459                         DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
1460                               clone_device_name(clone), r);
1461                         goto error;
1462                 }
1463 
1464                 r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
1465 
1466                 if (r) {
1467                         DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
1468                               clone_device_name(clone), r);
1469                         goto error;
1470                 }
1471 
1472                 DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
1473                        DM_CLONE_METADATA_BLOCK_SIZE,
1474                        (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
1475                        (unsigned long long)nr_metadata_blocks,
1476                        (unsigned long long)clone->region_size,
1477                        dm_clone_nr_of_hydrated_regions(clone->cmd),
1478                        clone->nr_regions,
1479                        atomic_read(&clone->hydrations_in_flight));
1480 
1481                 emit_flags(clone, result, maxlen, &sz);
1482                 emit_core_args(clone, result, maxlen, &sz);
1483 
1484                 switch (get_clone_mode(clone)) {
1485                 case CM_WRITE:
1486                         DMEMIT("rw");
1487                         break;
1488                 case CM_READ_ONLY:
1489                         DMEMIT("ro");
1490                         break;
1491                 case CM_FAIL:
1492                         DMEMIT("Fail");
1493                 }
1494 
1495                 break;
1496 
1497         case STATUSTYPE_TABLE:
1498                 format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
1499                 DMEMIT("%s ", buf);
1500 
1501                 format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
1502                 DMEMIT("%s ", buf);
1503 
1504                 format_dev_t(buf, clone->source_dev->bdev->bd_dev);
1505                 DMEMIT("%s", buf);
1506 
1507                 for (i = 0; i < clone->nr_ctr_args; i++)
1508                         DMEMIT(" %s", clone->ctr_args[i]);
1509         }
1510 
1511         return;
1512 
1513 error:
1514         DMEMIT("Error");
1515 }
1516 
1517 static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1518 {
1519         struct request_queue *dest_q, *source_q;
1520         struct clone *clone = container_of(cb, struct clone, callbacks);
1521 
1522         source_q = bdev_get_queue(clone->source_dev->bdev);
1523         dest_q = bdev_get_queue(clone->dest_dev->bdev);
1524 
1525         return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
1526                 bdi_congested(source_q->backing_dev_info, bdi_bits));
1527 }
1528 
1529 static sector_t get_dev_size(struct dm_dev *dev)
1530 {
1531         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1532 }
1533 
1534 /*---------------------------------------------------------------------------*/
1535 
1536 /*
1537  * Construct a clone device mapping:
1538  *
1539  * clone <metadata dev> <destination dev> <source dev> <region size>
1540  *      [<#feature args> [<feature arg>]* [<#core args> [key value]*]]
1541  *
1542  * metadata dev: Fast device holding the persistent metadata
1543  * destination dev: The destination device, which will become a clone of the
1544  *                  source device
1545  * source dev: The read-only source device that gets cloned
1546  * region size: dm-clone unit size in sectors
1547  *
1548  * #feature args: Number of feature arguments passed
1549  * feature args: E.g. no_hydration, no_discard_passdown
1550  *
1551  * #core arguments: An even number of core arguments
1552  * core arguments: Key/value pairs for tuning the core
1553  *                 E.g. 'hydration_threshold 256'
1554  */
1555 static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
1556 {
1557         int r;
1558         unsigned int argc;
1559         const char *arg_name;
1560         struct dm_target *ti = clone->ti;
1561 
1562         const struct dm_arg args = {
1563                 .min = 0,
1564                 .max = 2,
1565                 .error = "Invalid number of feature arguments"
1566         };
1567 
1568         /* No feature arguments supplied */
1569         if (!as->argc)
1570                 return 0;
1571 
1572         r = dm_read_arg_group(&args, as, &argc, &ti->error);
1573         if (r)
1574                 return r;
1575 
1576         while (argc) {
1577                 arg_name = dm_shift_arg(as);
1578                 argc--;
1579 
1580                 if (!strcasecmp(arg_name, "no_hydration")) {
1581                         __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1582                 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1583                         __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1584                 } else {
1585                         ti->error = "Invalid feature argument";
1586                         return -EINVAL;
1587                 }
1588         }
1589 
1590         return 0;
1591 }
1592 
1593 static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
1594 {
1595         int r;
1596         unsigned int argc;
1597         unsigned int value;
1598         const char *arg_name;
1599         struct dm_target *ti = clone->ti;
1600 
1601         const struct dm_arg args = {
1602                 .min = 0,
1603                 .max = 4,
1604                 .error = "Invalid number of core arguments"
1605         };
1606 
1607         /* Initialize core arguments */
1608         clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
1609         clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
1610 
1611         /* No core arguments supplied */
1612         if (!as->argc)
1613                 return 0;
1614 
1615         r = dm_read_arg_group(&args, as, &argc, &ti->error);
1616         if (r)
1617                 return r;
1618 
1619         if (argc & 1) {
1620                 ti->error = "Number of core arguments must be even";
1621                 return -EINVAL;
1622         }
1623 
1624         while (argc) {
1625                 arg_name = dm_shift_arg(as);
1626                 argc -= 2;
1627 
1628                 if (!strcasecmp(arg_name, "hydration_threshold")) {
1629                         if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1630                                 ti->error = "Invalid value for argument `hydration_threshold'";
1631                                 return -EINVAL;
1632                         }
1633                         clone->hydration_threshold = value;
1634                 } else if (!strcasecmp(arg_name, "hydration_batch_size")) {
1635                         if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1636                                 ti->error = "Invalid value for argument `hydration_batch_size'";
1637                                 return -EINVAL;
1638                         }
1639                         clone->hydration_batch_size = value;
1640                 } else {
1641                         ti->error = "Invalid core argument";
1642                         return -EINVAL;
1643                 }
1644         }
1645 
1646         return 0;
1647 }
1648 
1649 static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
1650 {
1651         int r;
1652         unsigned int region_size;
1653         struct dm_arg arg;
1654 
1655         arg.min = MIN_REGION_SIZE;
1656         arg.max = MAX_REGION_SIZE;
1657         arg.error = "Invalid region size";
1658 
1659         r = dm_read_arg(&arg, as, &region_size, error);
1660         if (r)
1661                 return r;
1662 
1663         /* Check region size is a power of 2 */
1664         if (!is_power_of_2(region_size)) {
1665                 *error = "Region size is not a power of 2";
1666                 return -EINVAL;
1667         }
1668 
1669         /* Validate the region size against the device logical block size */
1670         if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
1671             region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
1672                 *error = "Region size is not a multiple of device logical block size";
1673                 return -EINVAL;
1674         }
1675 
1676         clone->region_size = region_size;
1677 
1678         return 0;
1679 }
1680 
/*
 * Check that @n regions can be handled.
 *
 * dm_bitset restricts us to 2^32 regions; test_bit & co. restrict us further
 * to 2^31 regions.
 *
 * Returns 0 if @n is acceptable; otherwise sets *error and returns -EINVAL.
 */
static int validate_nr_regions(unsigned long n, char **error)
{
        const unsigned long max_regions = 1UL << 31;

        if (n <= max_regions)
                return 0;

        *error = "Too many regions. Consider increasing the region size";
        return -EINVAL;
}
1694 
1695 static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1696 {
1697         int r;
1698         sector_t metadata_dev_size;
1699         char b[BDEVNAME_SIZE];
1700 
1701         r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1702                           &clone->metadata_dev);
1703         if (r) {
1704                 *error = "Error opening metadata device";
1705                 return r;
1706         }
1707 
1708         metadata_dev_size = get_dev_size(clone->metadata_dev);
1709         if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
1710                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1711                        bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
1712 
1713         return 0;
1714 }
1715 
1716 static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1717 {
1718         int r;
1719         sector_t dest_dev_size;
1720 
1721         r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1722                           &clone->dest_dev);
1723         if (r) {
1724                 *error = "Error opening destination device";
1725                 return r;
1726         }
1727 
1728         dest_dev_size = get_dev_size(clone->dest_dev);
1729         if (dest_dev_size < clone->ti->len) {
1730                 dm_put_device(clone->ti, clone->dest_dev);
1731                 *error = "Device size larger than destination device";
1732                 return -EINVAL;
1733         }
1734 
1735         return 0;
1736 }
1737 
1738 static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1739 {
1740         int r;
1741         sector_t source_dev_size;
1742 
1743         r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
1744                           &clone->source_dev);
1745         if (r) {
1746                 *error = "Error opening source device";
1747                 return r;
1748         }
1749 
1750         source_dev_size = get_dev_size(clone->source_dev);
1751         if (source_dev_size < clone->ti->len) {
1752                 dm_put_device(clone->ti, clone->source_dev);
1753                 *error = "Device size larger than source device";
1754                 return -EINVAL;
1755         }
1756 
1757         return 0;
1758 }
1759 
1760 static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
1761 {
1762         unsigned int i;
1763         const char **copy;
1764 
1765         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1766         if (!copy)
1767                 goto error;
1768 
1769         for (i = 0; i < argc; i++) {
1770                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
1771 
1772                 if (!copy[i]) {
1773                         while (i--)
1774                                 kfree(copy[i]);
1775                         kfree(copy);
1776                         goto error;
1777                 }
1778         }
1779 
1780         clone->nr_ctr_args = argc;
1781         clone->ctr_args = copy;
1782         return 0;
1783 
1784 error:
1785         *error = "Failed to allocate memory for table line";
1786         return -ENOMEM;
1787 }
1788 
1789 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1790 {
1791         int r;
1792         sector_t nr_regions;
1793         struct clone *clone;
1794         struct dm_arg_set as;
1795 
1796         if (argc < 4) {
1797                 ti->error = "Invalid number of arguments";
1798                 return -EINVAL;
1799         }
1800 
1801         as.argc = argc;
1802         as.argv = argv;
1803 
1804         clone = kzalloc(sizeof(*clone), GFP_KERNEL);
1805         if (!clone) {
1806                 ti->error = "Failed to allocate clone structure";
1807                 return -ENOMEM;
1808         }
1809 
1810         clone->ti = ti;
1811 
1812         /* Initialize dm-clone flags */
1813         __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1814         __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1815         __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1816 
1817         r = parse_metadata_dev(clone, &as, &ti->error);
1818         if (r)
1819                 goto out_with_clone;
1820 
1821         r = parse_dest_dev(clone, &as, &ti->error);
1822         if (r)
1823                 goto out_with_meta_dev;
1824 
1825         r = parse_source_dev(clone, &as, &ti->error);
1826         if (r)
1827                 goto out_with_dest_dev;
1828 
1829         r = parse_region_size(clone, &as, &ti->error);
1830         if (r)
1831                 goto out_with_source_dev;
1832 
1833         clone->region_shift = __ffs(clone->region_size);
1834         nr_regions = dm_sector_div_up(ti->len, clone->region_size);
1835 
1836         /* Check for overflow */
1837         if (nr_regions != (unsigned long)nr_regions) {
1838                 ti->error = "Too many regions. Consider increasing the region size";
1839                 r = -EOVERFLOW;
1840                 goto out_with_source_dev;
1841         }
1842 
1843         clone->nr_regions = nr_regions;
1844 
1845         r = validate_nr_regions(clone->nr_regions, &ti->error);
1846         if (r)
1847                 goto out_with_source_dev;
1848 
1849         r = dm_set_target_max_io_len(ti, clone->region_size);
1850         if (r) {
1851                 ti->error = "Failed to set max io len";
1852                 goto out_with_source_dev;
1853         }
1854 
1855         r = parse_feature_args(&as, clone);
1856         if (r)
1857                 goto out_with_source_dev;
1858 
1859         r = parse_core_args(&as, clone);
1860         if (r)
1861                 goto out_with_source_dev;
1862 
1863         /* Load metadata */
1864         clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
1865                                             clone->region_size);
1866         if (IS_ERR(clone->cmd)) {
1867                 ti->error = "Failed to load metadata";
1868                 r = PTR_ERR(clone->cmd);
1869                 goto out_with_source_dev;
1870         }
1871 
1872         __set_clone_mode(clone, CM_WRITE);
1873 
1874         if (get_clone_mode(clone) != CM_WRITE) {
1875                 ti->error = "Unable to get write access to metadata, please check/repair metadata";
1876                 r = -EPERM;
1877                 goto out_with_metadata;
1878         }
1879 
1880         clone->last_commit_jiffies = jiffies;
1881 
1882         /* Allocate hydration hash table */
1883         r = hash_table_init(clone);
1884         if (r) {
1885                 ti->error = "Failed to allocate hydration hash table";
1886                 goto out_with_metadata;
1887         }
1888 
1889         atomic_set(&clone->ios_in_flight, 0);
1890         init_waitqueue_head(&clone->hydration_stopped);
1891         spin_lock_init(&clone->lock);
1892         bio_list_init(&clone->deferred_bios);
1893         bio_list_init(&clone->deferred_discard_bios);
1894         bio_list_init(&clone->deferred_flush_bios);
1895         bio_list_init(&clone->deferred_flush_completions);
1896         clone->hydration_offset = 0;
1897         atomic_set(&clone->hydrations_in_flight, 0);
1898         bio_init(&clone->flush_bio, NULL, 0);
1899 
1900         clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
1901         if (!clone->wq) {
1902                 ti->error = "Failed to allocate workqueue";
1903                 r = -ENOMEM;
1904                 goto out_with_ht;
1905         }
1906 
1907         INIT_WORK(&clone->worker, do_worker);
1908         INIT_DELAYED_WORK(&clone->waker, do_waker);
1909 
1910         clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1911         if (IS_ERR(clone->kcopyd_client)) {
1912                 r = PTR_ERR(clone->kcopyd_client);
1913                 goto out_with_wq;
1914         }
1915 
1916         r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
1917                                    _hydration_cache);
1918         if (r) {
1919                 ti->error = "Failed to create dm_clone_region_hydration memory pool";
1920                 goto out_with_kcopyd;
1921         }
1922 
1923         /* Save a copy of the table line */
1924         r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
1925         if (r)
1926                 goto out_with_mempool;
1927 
1928         mutex_init(&clone->commit_lock);
1929         clone->callbacks.congested_fn = clone_is_congested;
1930         dm_table_add_target_callbacks(ti->table, &clone->callbacks);
1931 
1932         /* Enable flushes */
1933         ti->num_flush_bios = 1;
1934         ti->flush_supported = true;
1935 
1936         /* Enable discards */
1937         ti->discards_supported = true;
1938         ti->num_discard_bios = 1;
1939 
1940         ti->private = clone;
1941 
1942         return 0;
1943 
1944 out_with_mempool:
1945         mempool_exit(&clone->hydration_pool);
1946 out_with_kcopyd:
1947         dm_kcopyd_client_destroy(clone->kcopyd_client);
1948 out_with_wq:
1949         destroy_workqueue(clone->wq);
1950 out_with_ht:
1951         hash_table_exit(clone);
1952 out_with_metadata:
1953         dm_clone_metadata_close(clone->cmd);
1954 out_with_source_dev:
1955         dm_put_device(ti, clone->source_dev);
1956 out_with_dest_dev:
1957         dm_put_device(ti, clone->dest_dev);
1958 out_with_meta_dev:
1959         dm_put_device(ti, clone->metadata_dev);
1960 out_with_clone:
1961         kfree(clone);
1962 
1963         return r;
1964 }
1965 
1966 static void clone_dtr(struct dm_target *ti)
1967 {
1968         unsigned int i;
1969         struct clone *clone = ti->private;
1970 
1971         mutex_destroy(&clone->commit_lock);
1972         bio_uninit(&clone->flush_bio);
1973 
1974         for (i = 0; i < clone->nr_ctr_args; i++)
1975                 kfree(clone->ctr_args[i]);
1976         kfree(clone->ctr_args);
1977 
1978         mempool_exit(&clone->hydration_pool);
1979         dm_kcopyd_client_destroy(clone->kcopyd_client);
1980         destroy_workqueue(clone->wq);
1981         hash_table_exit(clone);
1982         dm_clone_metadata_close(clone->cmd);
1983         dm_put_device(ti, clone->source_dev);
1984         dm_put_device(ti, clone->dest_dev);
1985         dm_put_device(ti, clone->metadata_dev);
1986 
1987         kfree(clone);
1988 }
1989 
1990 /*---------------------------------------------------------------------------*/
1991 
/*
 * Post-suspend hook: quiesce background activity and commit the metadata.
 *
 * The steps below must run in this exact order; see the comments inside.
 * The result of the final commit_metadata() call is deliberately discarded
 * because this hook has no way to report an error.
 */
static void clone_postsuspend(struct dm_target *ti)
{
        struct clone *clone = ti->private;

        /*
         * To successfully suspend the device:
         *
         *      - We cancel the delayed work for periodic commits and wait for
         *        it to finish.
         *
         *      - We stop the background hydration, i.e. we prevent new region
         *        hydrations from starting.
         *
         *      - We wait for any in-flight hydrations to finish.
         *
         *      - We flush the workqueue.
         *
         *      - We commit the metadata.
         */
        cancel_delayed_work_sync(&clone->waker);

        set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);

        /*
         * Make sure set_bit() is ordered before atomic_read(), otherwise we
         * might race with do_hydration() and miss some started region
         * hydrations.
         *
         * This is paired with smp_mb__after_atomic() in do_hydration().
         */
        smp_mb__after_atomic();

        /* Sleep until the in-flight hydration counter drops to zero. */
        wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
        flush_workqueue(clone->wq);

        (void) commit_metadata(clone, NULL);
}
2029 
/*
 * Resume hook: lift the hydration-suspended flag (set by the constructor and
 * by clone_postsuspend()) and then run the waker handler directly.
 */
static void clone_resume(struct dm_target *ti)
{
        struct clone *clone = ti->private;

        /* Clear the flag first so the waker runs with hydration unsuspended. */
        clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
        do_waker(&clone->waker.work);
}
2037 
2038 static bool bdev_supports_discards(struct block_device *bdev)
2039 {
2040         struct request_queue *q = bdev_get_queue(bdev);
2041 
2042         return (q && blk_queue_discard(q));
2043 }
2044 
2045 /*
2046  * If discard_passdown was enabled verify that the destination device supports
2047  * discards. Disable discard_passdown if not.
2048  */
2049 static void disable_passdown_if_not_supported(struct clone *clone)
2050 {
2051         struct block_device *dest_dev = clone->dest_dev->bdev;
2052         struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
2053         const char *reason = NULL;
2054         char buf[BDEVNAME_SIZE];
2055 
2056         if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
2057                 return;
2058 
2059         if (!bdev_supports_discards(dest_dev))
2060                 reason = "discard unsupported";
2061         else if (dest_limits->max_discard_sectors < clone->region_size)
2062                 reason = "max discard sectors smaller than a region";
2063 
2064         if (reason) {
2065                 DMWARN("Destination device (%s) %s: Disabling discard passdown.",
2066                        bdevname(dest_dev, buf), reason);
2067                 clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
2068         }
2069 }
2070 
2071 static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
2072 {
2073         struct block_device *dest_bdev = clone->dest_dev->bdev;
2074         struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
2075 
2076         if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
2077                 /* No passdown is done so we set our own virtual limits */
2078                 limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
2079                 limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
2080                 return;
2081         }
2082 
2083         /*
2084          * clone_iterate_devices() is stacking both the source and destination
2085          * device limits but discards aren't passed to the source device, so
2086          * inherit destination's limits.
2087          */
2088         limits->max_discard_sectors = dest_limits->max_discard_sectors;
2089         limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
2090         limits->discard_granularity = dest_limits->discard_granularity;
2091         limits->discard_alignment = dest_limits->discard_alignment;
2092         limits->discard_misaligned = dest_limits->discard_misaligned;
2093         limits->max_discard_segments = dest_limits->max_discard_segments;
2094 }
2095 
2096 static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
2097 {
2098         struct clone *clone = ti->private;
2099         u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2100 
2101         /*
2102          * If the system-determined stacked limits are compatible with
2103          * dm-clone's region size (io_opt is a factor) do not override them.
2104          */
2105         if (io_opt_sectors < clone->region_size ||
2106             do_div(io_opt_sectors, clone->region_size)) {
2107                 blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
2108                 blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
2109         }
2110 
2111         disable_passdown_if_not_supported(clone);
2112         set_discard_limits(clone, limits);
2113 }
2114 
2115 static int clone_iterate_devices(struct dm_target *ti,
2116                                  iterate_devices_callout_fn fn, void *data)
2117 {
2118         int ret;
2119         struct clone *clone = ti->private;
2120         struct dm_dev *dest_dev = clone->dest_dev;
2121         struct dm_dev *source_dev = clone->source_dev;
2122 
2123         ret = fn(ti, source_dev, 0, ti->len, data);
2124         if (!ret)
2125                 ret = fn(ti, dest_dev, 0, ti->len, data);
2126         return ret;
2127 }
2128 
2129 /*
2130  * dm-clone message functions.
2131  */
/*
 * Store a new hydration threshold (a number of regions) and wake the worker
 * so the new value takes effect.
 */
static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
{
        WRITE_ONCE(clone->hydration_threshold, nr_regions);

        /*
         * If user space sets hydration_threshold to zero then the hydration
         * will stop. If at a later time the hydration_threshold is increased
         * we must restart the hydration process by waking up the worker.
         */
        wake_worker(clone);
}
2143 
/*
 * Store a new hydration batch size (a number of regions). Unlike
 * set_hydration_threshold(), this does not wake the worker.
 */
static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
{
        WRITE_ONCE(clone->hydration_batch_size, nr_regions);
}
2148 
2149 static void enable_hydration(struct clone *clone)
2150 {
2151         if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
2152                 wake_worker(clone);
2153 }
2154 
/*
 * Turn background hydration off by clearing the enabled flag. Nothing is
 * waited for here.
 */
static void disable_hydration(struct clone *clone)
{
        clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
}
2159 
2160 static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
2161                          char *result, unsigned int maxlen)
2162 {
2163         struct clone *clone = ti->private;
2164         unsigned int value;
2165 
2166         if (!argc)
2167                 return -EINVAL;
2168 
2169         if (!strcasecmp(argv[0], "enable_hydration")) {
2170                 enable_hydration(clone);
2171                 return 0;
2172         }
2173 
2174         if (!strcasecmp(argv[0], "disable_hydration")) {
2175                 disable_hydration(clone);
2176                 return 0;
2177         }
2178 
2179         if (argc != 2)
2180                 return -EINVAL;
2181 
2182         if (!strcasecmp(argv[0], "hydration_threshold")) {
2183                 if (kstrtouint(argv[1], 10, &value))
2184                         return -EINVAL;
2185 
2186                 set_hydration_threshold(clone, value);
2187 
2188                 return 0;
2189         }
2190 
2191         if (!strcasecmp(argv[0], "hydration_batch_size")) {
2192                 if (kstrtouint(argv[1], 10, &value))
2193                         return -EINVAL;
2194 
2195                 set_hydration_batch_size(clone, value);
2196 
2197                 return 0;
2198         }
2199 
2200         DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
2201         return -EINVAL;
2202 }
2203 
2204 static struct target_type clone_target = {
2205         .name = "clone",
2206         .version = {1, 0, 0},
2207         .module = THIS_MODULE,
2208         .ctr = clone_ctr,
2209         .dtr =  clone_dtr,
2210         .map = clone_map,
2211         .end_io = clone_endio,
2212         .postsuspend = clone_postsuspend,
2213         .resume = clone_resume,
2214         .status = clone_status,
2215         .message = clone_message,
2216         .io_hints = clone_io_hints,
2217         .iterate_devices = clone_iterate_devices,
2218 };
2219 
2220 /*---------------------------------------------------------------------------*/
2221 
2222 /* Module functions */
2223 static int __init dm_clone_init(void)
2224 {
2225         int r;
2226 
2227         _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
2228         if (!_hydration_cache)
2229                 return -ENOMEM;
2230 
2231         r = dm_register_target(&clone_target);
2232         if (r < 0) {
2233                 DMERR("Failed to register clone target");
2234                 return r;
2235         }
2236 
2237         return 0;
2238 }
2239 
/*
 * Module exit: undo dm_clone_init() in reverse order, i.e. unregister the
 * target first and then destroy the hydration slab cache.
 */
static void __exit dm_clone_exit(void)
{
        dm_unregister_target(&clone_target);

        kmem_cache_destroy(_hydration_cache);
        /* Clear the pointer so no stale reference to the cache remains. */
        _hydration_cache = NULL;
}
2247 
/* Module hooks: entry and exit points run on module load and unload */
module_init(dm_clone_init);
module_exit(dm_clone_exit);

/* Module metadata */
MODULE_DESCRIPTION(DM_NAME " clone target");
MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
MODULE_LICENSE("GPL");

/* [<][>][^][v][top][bottom][index][help] */