drivers/md/dm-log-writes.c

DEFINITIONS

This source file includes the following definitions.
  1. bio_to_dev_sectors
  2. dev_to_bio_sectors
  3. put_pending_block
  4. put_io_block
  5. log_end_io
  6. log_end_super
  7. free_pending_block
  8. write_metadata
  9. write_inline_data
  10. log_one_block
  11. log_super
  12. logdev_last_sector
  13. log_writes_kthread
  14. log_writes_ctr
  15. log_mark
  16. log_writes_dtr
  17. normal_map_bio
  18. log_writes_map
  19. normal_end_io
  20. log_writes_status
  21. log_writes_prepare_ioctl
  22. log_writes_iterate_devices
  23. log_writes_message
  24. log_writes_io_hints
  25. log_dax
  26. log_writes_dax_direct_access
  27. log_writes_dax_copy_from_iter
  28. log_writes_dax_copy_to_iter
  29. dm_log_writes_init
  30. dm_log_writes_exit

/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/uio.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device.  This is helpful for replaying writes to check for fs consistency
 * at all times.  This target provides a mechanism to mark specific events to
 * check data at a later time.  So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache.  So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,b,flush,fuad,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH.  FUA bypasses the cache, so
 * once it completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass the cache so that they are logged
 * in order of completion along with the normal writes.  If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
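
/*
 * An illustrative way to wire this up (the device paths and dm device name
 * here are examples only, not requirements):
 *
 * TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
 * dmsetup create log --table "$TABLE"
 *
 * Writes to the resulting /dev/mapper/log are then passed through to
 * /dev/sdb and recorded on /dev/sdc for later replay.
 */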
#define LOG_FLUSH_FLAG          (1 << 0)
#define LOG_FUA_FLAG            (1 << 1)
#define LOG_DISCARD_FLAG        (1 << 2)
#define LOG_MARK_FLAG           (1 << 3)
#define LOG_METADATA_FLAG       (1 << 4)

#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][  entry->nr_sectors ]
 * [log_write_entry][    data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */
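
/*
 * A minimal sketch of how userspace might walk this format when replaying
 * (illustrative only: error handling is omitted and fd is assumed to be an
 * open file descriptor on the raw log device):
 *
 *      struct log_write_super super;
 *      struct log_write_entry entry;
 *      u64 i, sector = 1;
 *
 *      pread(fd, &super, sizeof(super), 0);
 *      for (i = 0; i < le64_to_cpu(super.nr_entries); i++) {
 *              pread(fd, &entry, sizeof(entry),
 *                    sector * le32_to_cpu(super.sectorsize));
 *              sector++;
 *              if (!(le64_to_cpu(entry.flags) & LOG_DISCARD_FLAG))
 *                      sector += le64_to_cpu(entry.nr_sectors);
 *      }
 *
 * Sectors here are in units of super.sectorsize, mirroring how
 * log_writes_kthread advances lc->next_sector below.  Discards log an entry
 * but no data, so only the entry sector is skipped for them.
 */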

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
        __le64 magic;
        __le64 version;
        __le64 nr_entries;
        __le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry; this is for private log
 * entry data, e.g. the MARK data provided by userspace.
 */
struct log_write_entry {
        __le64 sector;
        __le64 nr_sectors;
        __le64 flags;
        __le64 data_len;
};

struct log_writes_c {
        struct dm_dev *dev;
        struct dm_dev *logdev;
        u64 logged_entries;
        u32 sectorsize;
        u32 sectorshift;
        atomic_t io_blocks;
        atomic_t pending_blocks;
        sector_t next_sector;
        sector_t end_sector;
        bool logging_enabled;
        bool device_supports_discard;
        spinlock_t blocks_lock;
        struct list_head unflushed_blocks;
        struct list_head logging_blocks;
        wait_queue_head_t wait;
        struct task_struct *log_kthread;
        struct completion super_done;
};

struct pending_block {
        int vec_cnt;
        u64 flags;
        sector_t sector;
        sector_t nr_sectors;
        char *data;
        u32 datalen;
        struct list_head list;
        struct bio_vec vecs[];
};

struct per_bio_data {
        struct pending_block *block;
};

static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
                                          sector_t sectors)
{
        return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}

static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
                                          sector_t sectors)
{
        return sectors << (lc->sectorshift - SECTOR_SHIFT);
}
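
/*
 * Example: with 4096-byte device sectors (sectorshift == 12),
 * bio_to_dev_sectors() maps 8 512-byte bio sectors to 1 device sector and
 * dev_to_bio_sectors() maps 1 device sector back to 8 bio sectors.
 */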

static void put_pending_block(struct log_writes_c *lc)
{
        if (atomic_dec_and_test(&lc->pending_blocks)) {
                smp_mb__after_atomic();
                if (waitqueue_active(&lc->wait))
                        wake_up(&lc->wait);
        }
}

static void put_io_block(struct log_writes_c *lc)
{
        if (atomic_dec_and_test(&lc->io_blocks)) {
                smp_mb__after_atomic();
                if (waitqueue_active(&lc->wait))
                        wake_up(&lc->wait);
        }
}

static void log_end_io(struct bio *bio)
{
        struct log_writes_c *lc = bio->bi_private;

        if (bio->bi_status) {
                unsigned long flags;

                DMERR("Error writing log block, error=%d", bio->bi_status);
                spin_lock_irqsave(&lc->blocks_lock, flags);
                lc->logging_enabled = false;
                spin_unlock_irqrestore(&lc->blocks_lock, flags);
        }

        bio_free_pages(bio);
        put_io_block(lc);
        bio_put(bio);
}

static void log_end_super(struct bio *bio)
{
        struct log_writes_c *lc = bio->bi_private;

        complete(&lc->super_done);
        log_end_io(bio);
}

/*
 * Meant to be called if there is an error; it will free all the pages
 * associated with the block.
 */
static void free_pending_block(struct log_writes_c *lc,
                               struct pending_block *block)
{
        int i;

        for (i = 0; i < block->vec_cnt; i++) {
                if (block->vecs[i].bv_page)
                        __free_page(block->vecs[i].bv_page);
        }
        kfree(block->data);
        kfree(block);
        put_pending_block(lc);
}

static int write_metadata(struct log_writes_c *lc, void *entry,
                          size_t entrylen, void *data, size_t datalen,
                          sector_t sector)
{
        struct bio *bio;
        struct page *page;
        void *ptr;
        size_t ret;

        bio = bio_alloc(GFP_KERNEL, 1);
        if (!bio) {
                DMERR("Couldn't alloc log bio");
                goto error;
        }
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = sector;
        bio_set_dev(bio, lc->logdev->bdev);
        bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
                          log_end_super : log_end_io;
        bio->bi_private = lc;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

        page = alloc_page(GFP_KERNEL);
        if (!page) {
                DMERR("Couldn't alloc log page");
                bio_put(bio);
                goto error;
        }

        ptr = kmap_atomic(page);
        memcpy(ptr, entry, entrylen);
        if (datalen)
                memcpy(ptr + entrylen, data, datalen);
        memset(ptr + entrylen + datalen, 0,
               lc->sectorsize - entrylen - datalen);
        kunmap_atomic(ptr);

        ret = bio_add_page(bio, page, lc->sectorsize, 0);
        if (ret != lc->sectorsize) {
                DMERR("Couldn't add page to the log block");
                goto error_bio;
        }
        submit_bio(bio);
        return 0;
error_bio:
        bio_put(bio);
        __free_page(page);
error:
        put_io_block(lc);
        return -1;
}

static int write_inline_data(struct log_writes_c *lc, void *entry,
                             size_t entrylen, void *data, size_t datalen,
                             sector_t sector)
{
        int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
        struct page *page;
        struct bio *bio;
        size_t ret;
        void *ptr;

        while (datalen) {
                num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
                bio_pages = min(num_pages, BIO_MAX_PAGES);

                atomic_inc(&lc->io_blocks);

                bio = bio_alloc(GFP_KERNEL, bio_pages);
                if (!bio) {
                        DMERR("Couldn't alloc inline data bio");
                        goto error;
                }

                bio->bi_iter.bi_size = 0;
                bio->bi_iter.bi_sector = sector;
                bio_set_dev(bio, lc->logdev->bdev);
                bio->bi_end_io = log_end_io;
                bio->bi_private = lc;
                bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

                for (i = 0; i < bio_pages; i++) {
                        pg_datalen = min_t(int, datalen, PAGE_SIZE);
                        pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);

                        page = alloc_page(GFP_KERNEL);
                        if (!page) {
                                DMERR("Couldn't alloc inline data page");
                                goto error_bio;
                        }

                        ptr = kmap_atomic(page);
                        memcpy(ptr, data, pg_datalen);
                        if (pg_sectorlen > pg_datalen)
                                memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
                        kunmap_atomic(ptr);

                        ret = bio_add_page(bio, page, pg_sectorlen, 0);
                        if (ret != pg_sectorlen) {
                                DMERR("Couldn't add page of inline data");
                                __free_page(page);
                                goto error_bio;
                        }

                        datalen -= pg_datalen;
                        data    += pg_datalen;
                }
                submit_bio(bio);

                sector += bio_pages * PAGE_SECTORS;
        }
        return 0;
error_bio:
        bio_free_pages(bio);
        bio_put(bio);
error:
        put_io_block(lc);
        return -1;
}

static int log_one_block(struct log_writes_c *lc,
                         struct pending_block *block, sector_t sector)
{
        struct bio *bio;
        struct log_write_entry entry;
        size_t metadatalen, ret;
        int i;

        entry.sector = cpu_to_le64(block->sector);
        entry.nr_sectors = cpu_to_le64(block->nr_sectors);
        entry.flags = cpu_to_le64(block->flags);
        entry.data_len = cpu_to_le64(block->datalen);

        metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
        if (write_metadata(lc, &entry, sizeof(entry), block->data,
                           metadatalen, sector)) {
                free_pending_block(lc, block);
                return -1;
        }

        sector += dev_to_bio_sectors(lc, 1);

        if (block->datalen && metadatalen == 0) {
                if (write_inline_data(lc, &entry, sizeof(entry), block->data,
                                      block->datalen, sector)) {
                        free_pending_block(lc, block);
                        return -1;
                }
                /* we don't support both inline data & bio data */
                goto out;
        }

        if (!block->vec_cnt)
                goto out;

        atomic_inc(&lc->io_blocks);
        bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
        if (!bio) {
                DMERR("Couldn't alloc log bio");
                goto error;
        }
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = sector;
        bio_set_dev(bio, lc->logdev->bdev);
        bio->bi_end_io = log_end_io;
        bio->bi_private = lc;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

        for (i = 0; i < block->vec_cnt; i++) {
                /*
                 * The page offset is always 0 because we allocate a new page
                 * for every bvec in the original bio, for simplicity's sake.
                 */
                ret = bio_add_page(bio, block->vecs[i].bv_page,
                                   block->vecs[i].bv_len, 0);
                if (ret != block->vecs[i].bv_len) {
                        atomic_inc(&lc->io_blocks);
                        submit_bio(bio);
                        bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
                        if (!bio) {
                                DMERR("Couldn't alloc log bio");
                                goto error;
                        }
                        bio->bi_iter.bi_size = 0;
                        bio->bi_iter.bi_sector = sector;
                        bio_set_dev(bio, lc->logdev->bdev);
                        bio->bi_end_io = log_end_io;
                        bio->bi_private = lc;
                        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

                        ret = bio_add_page(bio, block->vecs[i].bv_page,
                                           block->vecs[i].bv_len, 0);
                        if (ret != block->vecs[i].bv_len) {
                                DMERR("Couldn't add page on new bio?");
                                bio_put(bio);
                                goto error;
                        }
                }
                sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
        }
        submit_bio(bio);
out:
        kfree(block->data);
        kfree(block);
        put_pending_block(lc);
        return 0;
error:
        free_pending_block(lc, block);
        put_io_block(lc);
        return -1;
}

static int log_super(struct log_writes_c *lc)
{
        struct log_write_super super;

        super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
        super.version = cpu_to_le64(WRITE_LOG_VERSION);
        super.nr_entries = cpu_to_le64(lc->logged_entries);
        super.sectorsize = cpu_to_le32(lc->sectorsize);

        if (write_metadata(lc, &super, sizeof(super), NULL, 0,
                           WRITE_LOG_SUPER_SECTOR)) {
                DMERR("Couldn't write super");
                return -1;
        }

        /*
         * The super sector should be written in order; otherwise the
         * nr_entries could be rewritten incorrectly by an old bio.
         */
        wait_for_completion_io(&lc->super_done);

        return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
        return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int log_writes_kthread(void *arg)
{
        struct log_writes_c *lc = (struct log_writes_c *)arg;
        sector_t sector = 0;

        while (!kthread_should_stop()) {
                bool super = false;
                bool logging_enabled;
                struct pending_block *block = NULL;
                int ret;

                spin_lock_irq(&lc->blocks_lock);
                if (!list_empty(&lc->logging_blocks)) {
                        block = list_first_entry(&lc->logging_blocks,
                                                 struct pending_block, list);
                        list_del_init(&block->list);
                        if (!lc->logging_enabled)
                                goto next;

                        sector = lc->next_sector;
                        if (!(block->flags & LOG_DISCARD_FLAG))
                                lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
                        lc->next_sector += dev_to_bio_sectors(lc, 1);

                        /*
                         * Apparently the size of the device may not be known
                         * right away, so handle this properly.
                         */
                        if (!lc->end_sector)
                                lc->end_sector = logdev_last_sector(lc);
                        if (lc->end_sector &&
                            lc->next_sector >= lc->end_sector) {
                                DMERR("Ran out of space on the logdev");
                                lc->logging_enabled = false;
                                goto next;
                        }
                        lc->logged_entries++;
                        atomic_inc(&lc->io_blocks);

                        super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
                        if (super)
                                atomic_inc(&lc->io_blocks);
                }
next:
                logging_enabled = lc->logging_enabled;
                spin_unlock_irq(&lc->blocks_lock);
                if (block) {
                        if (logging_enabled) {
                                ret = log_one_block(lc, block, sector);
                                if (!ret && super)
                                        ret = log_super(lc);
                                if (ret) {
                                        spin_lock_irq(&lc->blocks_lock);
                                        lc->logging_enabled = false;
                                        spin_unlock_irq(&lc->blocks_lock);
                                }
                        } else
                                free_pending_block(lc, block);
                        continue;
                }

                if (!try_to_freeze()) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!kthread_should_stop() &&
                            list_empty(&lc->logging_blocks))
                                schedule();
                        __set_current_state(TASK_RUNNING);
                }
        }
        return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct log_writes_c *lc;
        struct dm_arg_set as;
        const char *devname, *logdevname;
        int ret;

        as.argc = argc;
        as.argv = argv;

        if (argc < 2) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
        if (!lc) {
                ti->error = "Cannot allocate context";
                return -ENOMEM;
        }
        spin_lock_init(&lc->blocks_lock);
        INIT_LIST_HEAD(&lc->unflushed_blocks);
        INIT_LIST_HEAD(&lc->logging_blocks);
        init_waitqueue_head(&lc->wait);
        init_completion(&lc->super_done);
        atomic_set(&lc->io_blocks, 0);
        atomic_set(&lc->pending_blocks, 0);

        devname = dm_shift_arg(&as);
        ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
        if (ret) {
                ti->error = "Device lookup failed";
                goto bad;
        }

        logdevname = dm_shift_arg(&as);
        ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
                            &lc->logdev);
        if (ret) {
                ti->error = "Log device lookup failed";
                dm_put_device(ti, lc->dev);
                goto bad;
        }

        lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
        lc->sectorshift = ilog2(lc->sectorsize);
        lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
        if (IS_ERR(lc->log_kthread)) {
                ret = PTR_ERR(lc->log_kthread);
                ti->error = "Couldn't alloc kthread";
                dm_put_device(ti, lc->dev);
                dm_put_device(ti, lc->logdev);
                goto bad;
        }

        /*
         * next_sector is in 512b sectors to correspond to what bi_sector expects.
         * The super starts at sector 0, and the next_sector is the next logical
         * one based on the sectorsize of the device.
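         * For example, with a 4096-byte sectorsize the first entry starts at
         * 512b sector 4096 >> 9 == 8.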
         */
        lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
        lc->logging_enabled = true;
        lc->end_sector = logdev_last_sector(lc);
        lc->device_supports_discard = true;

        ti->num_flush_bios = 1;
        ti->flush_supported = true;
        ti->num_discard_bios = 1;
        ti->discards_supported = true;
        ti->per_io_data_size = sizeof(struct per_bio_data);
        ti->private = lc;
        return 0;

bad:
        kfree(lc);
        return ret;
}

static int log_mark(struct log_writes_c *lc, char *data)
{
        struct pending_block *block;
        size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

        block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
        if (!block) {
                DMERR("Error allocating pending block");
                return -ENOMEM;
        }

        block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
        if (!block->data) {
                DMERR("Error copying mark data");
                kfree(block);
                return -ENOMEM;
        }
        atomic_inc(&lc->pending_blocks);
        block->datalen = strlen(block->data);
        block->flags |= LOG_MARK_FLAG;
        spin_lock_irq(&lc->blocks_lock);
        list_add_tail(&block->list, &lc->logging_blocks);
        spin_unlock_irq(&lc->blocks_lock);
        wake_up_process(lc->log_kthread);
        return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
        struct log_writes_c *lc = ti->private;

        spin_lock_irq(&lc->blocks_lock);
        list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
        spin_unlock_irq(&lc->blocks_lock);

        /*
         * This is just nice to have since it'll update the super to include
         * the unflushed blocks; if it fails we don't really care.
         */
        log_mark(lc, "dm-log-writes-end");
        wake_up_process(lc->log_kthread);
        wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
                   !atomic_read(&lc->pending_blocks));
        kthread_stop(lc->log_kthread);

        WARN_ON(!list_empty(&lc->logging_blocks));
        WARN_ON(!list_empty(&lc->unflushed_blocks));
        dm_put_device(ti, lc->dev);
        dm_put_device(ti, lc->logdev);
        kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
        struct log_writes_c *lc = ti->private;

        bio_set_dev(bio, lc->dev->bdev);
}

static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
        struct log_writes_c *lc = ti->private;
        struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
        struct pending_block *block;
        struct bvec_iter iter;
        struct bio_vec bv;
        size_t alloc_size;
        int i = 0;
        bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
        bool fua_bio = (bio->bi_opf & REQ_FUA);
        bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
        bool meta_bio = (bio->bi_opf & REQ_META);

        pb->block = NULL;

        /* Don't bother doing anything if logging has been disabled */
        if (!lc->logging_enabled)
                goto map_bio;

        /*
         * Map reads as normal.
         */
        if (bio_data_dir(bio) == READ)
                goto map_bio;

        /* No sectors and not a flush?  Don't care */
        if (!bio_sectors(bio) && !flush_bio)
                goto map_bio;

        /*
         * Discards will have bi_size set but there's no actual data, so just
         * allocate the size of the pending block.
         */
        if (discard_bio)
                alloc_size = sizeof(struct pending_block);
        else
                alloc_size = struct_size(block, vecs, bio_segments(bio));

        block = kzalloc(alloc_size, GFP_NOIO);
        if (!block) {
                DMERR("Error allocating pending block");
                spin_lock_irq(&lc->blocks_lock);
                lc->logging_enabled = false;
                spin_unlock_irq(&lc->blocks_lock);
                return DM_MAPIO_KILL;
        }
        INIT_LIST_HEAD(&block->list);
        pb->block = block;
        atomic_inc(&lc->pending_blocks);

        if (flush_bio)
                block->flags |= LOG_FLUSH_FLAG;
        if (fua_bio)
                block->flags |= LOG_FUA_FLAG;
        if (discard_bio)
                block->flags |= LOG_DISCARD_FLAG;
        if (meta_bio)
                block->flags |= LOG_METADATA_FLAG;

        block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
        block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));

        /* We don't need the data, just submit */
        if (discard_bio) {
                WARN_ON(flush_bio || fua_bio);
                if (lc->device_supports_discard)
                        goto map_bio;
                bio_endio(bio);
                return DM_MAPIO_SUBMITTED;
        }

        /* Flush bio, splice the unflushed blocks onto this list and submit */
        if (flush_bio && !bio_sectors(bio)) {
                spin_lock_irq(&lc->blocks_lock);
                list_splice_init(&lc->unflushed_blocks, &block->list);
                spin_unlock_irq(&lc->blocks_lock);
                goto map_bio;
        }

        /*
         * We will write this bio somewhere else way later, so we need to copy
         * the actual contents into new pages and know the data will always be
         * there.
         *
         * We do this because this could be a bio from O_DIRECT, in which case
         * we can't just hold onto the page until some later point; we have to
         * manually copy the contents.
         */
        bio_for_each_segment(bv, bio, iter) {
                struct page *page;
                void *src, *dst;

                page = alloc_page(GFP_NOIO);
                if (!page) {
                        DMERR("Error allocating page");
                        free_pending_block(lc, block);
                        spin_lock_irq(&lc->blocks_lock);
                        lc->logging_enabled = false;
                        spin_unlock_irq(&lc->blocks_lock);
                        return DM_MAPIO_KILL;
                }

                src = kmap_atomic(bv.bv_page);
                dst = kmap_atomic(page);
                memcpy(dst, src + bv.bv_offset, bv.bv_len);
                kunmap_atomic(dst);
                kunmap_atomic(src);
                block->vecs[i].bv_page = page;
                block->vecs[i].bv_len = bv.bv_len;
                block->vec_cnt++;
                i++;
        }

        /* Had a flush with data in it, weird */
        if (flush_bio) {
                spin_lock_irq(&lc->blocks_lock);
                list_splice_init(&lc->unflushed_blocks, &block->list);
                spin_unlock_irq(&lc->blocks_lock);
        }
map_bio:
        normal_map_bio(ti, bio);
        return DM_MAPIO_REMAPPED;
}

static int normal_end_io(struct dm_target *ti, struct bio *bio,
                blk_status_t *error)
{
        struct log_writes_c *lc = ti->private;
        struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

        if (bio_data_dir(bio) == WRITE && pb->block) {
                struct pending_block *block = pb->block;
                unsigned long flags;

                spin_lock_irqsave(&lc->blocks_lock, flags);
                if (block->flags & LOG_FLUSH_FLAG) {
                        list_splice_tail_init(&block->list, &lc->logging_blocks);
                        list_add_tail(&block->list, &lc->logging_blocks);
                        wake_up_process(lc->log_kthread);
                } else if (block->flags & LOG_FUA_FLAG) {
                        list_add_tail(&block->list, &lc->logging_blocks);
                        wake_up_process(lc->log_kthread);
                } else
                        list_add_tail(&block->list, &lc->unflushed_blocks);
                spin_unlock_irqrestore(&lc->blocks_lock, flags);
        }

        return DM_ENDIO_DONE;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
static void log_writes_status(struct dm_target *ti, status_type_t type,
                              unsigned status_flags, char *result,
                              unsigned maxlen)
{
        unsigned sz = 0;
        struct log_writes_c *lc = ti->private;

        switch (type) {
        case STATUSTYPE_INFO:
                DMEMIT("%llu %llu", lc->logged_entries,
                       (unsigned long long)lc->next_sector - 1);
                if (!lc->logging_enabled)
                        DMEMIT(" logging_disabled");
                break;

        case STATUSTYPE_TABLE:
                DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
                break;
        }
}

static int log_writes_prepare_ioctl(struct dm_target *ti,
                                    struct block_device **bdev)
{
        struct log_writes_c *lc = ti->private;
        struct dm_dev *dev = lc->dev;

        *bdev = dev->bdev;
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
        if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
                return 1;
        return 0;
}

static int log_writes_iterate_devices(struct dm_target *ti,
                                      iterate_devices_callout_fn fn,
                                      void *data)
{
        struct log_writes_c *lc = ti->private;

        return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 *   mark <mark data> - specify the marked data.
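 *
 * Example (the dm device path is illustrative):
 *   dmsetup message /dev/mapper/log 0 mark mymark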
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv,
                              char *result, unsigned maxlen)
{
        int r = -EINVAL;
        struct log_writes_c *lc = ti->private;

        if (argc != 2) {
                DMWARN("Invalid log-writes message arguments, expected 2 arguments, got %d", argc);
                return r;
        }

        if (!strcasecmp(argv[0], "mark"))
                r = log_mark(lc, argv[1]);
        else
                DMWARN("Unrecognised log writes target message received: %s", argv[0]);

        return r;
}

static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
        struct log_writes_c *lc = ti->private;
        struct request_queue *q = bdev_get_queue(lc->dev->bdev);

        if (!q || !blk_queue_discard(q)) {
                lc->device_supports_discard = false;
                limits->discard_granularity = lc->sectorsize;
                limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
        }
        limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
        limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
        limits->io_min = limits->physical_block_size;
}

#if IS_ENABLED(CONFIG_DAX_DRIVER)
static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
                   struct iov_iter *i)
{
        struct pending_block *block;

        if (!bytes)
                return 0;

        block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
        if (!block) {
                DMERR("Error allocating dax pending block");
                return -ENOMEM;
        }

        block->data = kzalloc(bytes, GFP_KERNEL);
        if (!block->data) {
                DMERR("Error allocating dax data space");
                kfree(block);
                return -ENOMEM;
        }

        /* write data provided via the iterator */
        if (!copy_from_iter(block->data, bytes, i)) {
                DMERR("Error copying dax data");
                kfree(block->data);
                kfree(block);
                return -EIO;
        }

        /* rewind the iterator so that the block driver can use it */
        iov_iter_revert(i, bytes);

        block->datalen = bytes;
        block->sector = bio_to_dev_sectors(lc, sector);
        block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;

        atomic_inc(&lc->pending_blocks);
        spin_lock_irq(&lc->blocks_lock);
        list_add_tail(&block->list, &lc->unflushed_blocks);
        spin_unlock_irq(&lc->blocks_lock);
        wake_up_process(lc->log_kthread);

        return 0;
}

static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
                                         long nr_pages, void **kaddr, pfn_t *pfn)
{
        struct log_writes_c *lc = ti->private;
        sector_t sector = pgoff * PAGE_SECTORS;
        int ret;

        ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
        if (ret)
                return ret;
        return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
                                            pgoff_t pgoff, void *addr, size_t bytes,
                                            struct iov_iter *i)
{
        struct log_writes_c *lc = ti->private;
        sector_t sector = pgoff * PAGE_SECTORS;
        int err;

        if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
                return 0;

        /* Don't bother doing anything if logging has been disabled */
        if (!lc->logging_enabled)
                goto dax_copy;

        err = log_dax(lc, sector, bytes, i);
        if (err) {
                DMWARN("Error %d logging DAX write", err);
                return 0;
        }
dax_copy:
        return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
                                          pgoff_t pgoff, void *addr, size_t bytes,
                                          struct iov_iter *i)
{
        struct log_writes_c *lc = ti->private;
        sector_t sector = pgoff * PAGE_SECTORS;

        if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
                return 0;
        return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
#endif

static struct target_type log_writes_target = {
        .name   = "log-writes",
        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr    = log_writes_ctr,
        .dtr    = log_writes_dtr,
        .map    = log_writes_map,
        .end_io = normal_end_io,
        .status = log_writes_status,
        .prepare_ioctl = log_writes_prepare_ioctl,
        .message = log_writes_message,
        .iterate_devices = log_writes_iterate_devices,
        .io_hints = log_writes_io_hints,
        .direct_access = log_writes_dax_direct_access,
        .dax_copy_from_iter = log_writes_dax_copy_from_iter,
        .dax_copy_to_iter = log_writes_dax_copy_to_iter,
};

static int __init dm_log_writes_init(void)
{
        int r = dm_register_target(&log_writes_target);

        if (r < 0)
                DMERR("register failed %d", r);

        return r;
}

static void __exit dm_log_writes_exit(void)
{
        dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");
