1/* 2 * Copyright (C) 2012 Fusion-io All rights reserved. 3 * Copyright (C) 2012 Intel Corp. All rights reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public 7 * License v2 as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public 15 * License along with this program; if not, write to the 16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 * Boston, MA 021110-1307, USA. 18 */ 19#include <linux/sched.h> 20#include <linux/wait.h> 21#include <linux/bio.h> 22#include <linux/slab.h> 23#include <linux/buffer_head.h> 24#include <linux/blkdev.h> 25#include <linux/random.h> 26#include <linux/iocontext.h> 27#include <linux/capability.h> 28#include <linux/ratelimit.h> 29#include <linux/kthread.h> 30#include <linux/raid/pq.h> 31#include <linux/hash.h> 32#include <linux/list_sort.h> 33#include <linux/raid/xor.h> 34#include <linux/vmalloc.h> 35#include <asm/div64.h> 36#include "ctree.h" 37#include "extent_map.h" 38#include "disk-io.h" 39#include "transaction.h" 40#include "print-tree.h" 41#include "volumes.h" 42#include "raid56.h" 43#include "async-thread.h" 44#include "check-integrity.h" 45#include "rcu-string.h" 46 47/* set when additional merges to this rbio are not allowed */ 48#define RBIO_RMW_LOCKED_BIT 1 49 50/* 51 * set when this rbio is sitting in the hash, but it is just a cache 52 * of past RMW 53 */ 54#define RBIO_CACHE_BIT 2 55 56/* 57 * set when it is safe to trust the stripe_pages for caching 58 */ 59#define RBIO_CACHE_READY_BIT 3 60 61#define RBIO_CACHE_SIZE 1024 62 63enum btrfs_rbio_ops { 64 BTRFS_RBIO_WRITE = 0, 65 BTRFS_RBIO_READ_REBUILD = 1, 66 BTRFS_RBIO_PARITY_SCRUB = 2, 67}; 68 69struct btrfs_raid_bio { 70 struct btrfs_fs_info *fs_info; 71 struct btrfs_bio *bbio; 72 73 /* while we're doing rmw on a stripe 74 * we put it into a hash table so we can 75 * lock the stripe and merge more rbios 76 * into it. 77 */ 78 struct list_head hash_list; 79 80 /* 81 * LRU list for the stripe cache 82 */ 83 struct list_head stripe_cache; 84 85 /* 86 * for scheduling work in the helper threads 87 */ 88 struct btrfs_work work; 89 90 /* 91 * bio list and bio_list_lock are used 92 * to add more bios into the stripe 93 * in hopes of avoiding the full rmw 94 */ 95 struct bio_list bio_list; 96 spinlock_t bio_list_lock; 97 98 /* also protected by the bio_list_lock, the 99 * plug list is used by the plugging code 100 * to collect partial bios while plugged. 
The 101 * stripe locking code also uses it to hand off 102 * the stripe lock to the next pending IO 103 */ 104 struct list_head plug_list; 105 106 /* 107 * flags that tell us if it is safe to 108 * merge with this bio 109 */ 110 unsigned long flags; 111 112 /* size of each individual stripe on disk */ 113 int stripe_len; 114 115 /* number of data stripes (no p/q) */ 116 int nr_data; 117 118 int real_stripes; 119 120 int stripe_npages; 121 /* 122 * set if we're doing a parity rebuild 123 * for a read from higher up, which is handled 124 * differently from a parity rebuild as part of 125 * rmw 126 */ 127 enum btrfs_rbio_ops operation; 128 129 /* first bad stripe */ 130 int faila; 131 132 /* second bad stripe (for raid6 use) */ 133 int failb; 134 135 int scrubp; 136 /* 137 * number of pages needed to represent the full 138 * stripe 139 */ 140 int nr_pages; 141 142 /* 143 * size of all the bios in the bio_list. This 144 * helps us decide if the rbio maps to a full 145 * stripe or not 146 */ 147 int bio_list_bytes; 148 149 int generic_bio_cnt; 150 151 atomic_t refs; 152 153 atomic_t stripes_pending; 154 155 atomic_t error; 156 /* 157 * these are two arrays of pointers. We allocate the 158 * rbio big enough to hold them both and setup their 159 * locations when the rbio is allocated 160 */ 161 162 /* pointers to pages that we allocated for 163 * reading/writing stripes directly from the disk (including P/Q) 164 */ 165 struct page **stripe_pages; 166 167 /* 168 * pointers to the pages in the bio_list. Stored 169 * here for faster lookup 170 */ 171 struct page **bio_pages; 172 173 /* 174 * bitmap to record which horizontal stripe has data 175 */ 176 unsigned long *dbitmap; 177}; 178 179static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 180static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 181static void rmw_work(struct btrfs_work *work); 182static void read_rebuild_work(struct btrfs_work *work); 183static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 184static void async_read_rebuild(struct btrfs_raid_bio *rbio); 185static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 186static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 187static void __free_raid_bio(struct btrfs_raid_bio *rbio); 188static void index_rbio_pages(struct btrfs_raid_bio *rbio); 189static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 190 191static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 192 int need_check); 193static void async_scrub_parity(struct btrfs_raid_bio *rbio); 194 195/* 196 * the stripe hash table is used for locking, and to collect 197 * bios in hopes of making a full stripe 198 */ 199int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 200{ 201 struct btrfs_stripe_hash_table *table; 202 struct btrfs_stripe_hash_table *x; 203 struct btrfs_stripe_hash *cur; 204 struct btrfs_stripe_hash *h; 205 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 206 int i; 207 int table_size; 208 209 if (info->stripe_hash_table) 210 return 0; 211 212 /* 213 * The table is large, starting with order 4 and can go as high as 214 * order 7 in case lock debugging is turned on. 215 * 216 * Try harder to allocate and fallback to vmalloc to lower the chance 217 * of a failing mount. 
218 */ 219 table_size = sizeof(*table) + sizeof(*h) * num_entries; 220 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 221 if (!table) { 222 table = vzalloc(table_size); 223 if (!table) 224 return -ENOMEM; 225 } 226 227 spin_lock_init(&table->cache_lock); 228 INIT_LIST_HEAD(&table->stripe_cache); 229 230 h = table->table; 231 232 for (i = 0; i < num_entries; i++) { 233 cur = h + i; 234 INIT_LIST_HEAD(&cur->hash_list); 235 spin_lock_init(&cur->lock); 236 init_waitqueue_head(&cur->wait); 237 } 238 239 x = cmpxchg(&info->stripe_hash_table, NULL, table); 240 if (x) 241 kvfree(x); 242 return 0; 243} 244 245/* 246 * caching an rbio means to copy anything from the 247 * bio_pages array into the stripe_pages array. We 248 * use the page uptodate bit in the stripe cache array 249 * to indicate if it has valid data 250 * 251 * once the caching is done, we set the cache ready 252 * bit. 253 */ 254static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 255{ 256 int i; 257 char *s; 258 char *d; 259 int ret; 260 261 ret = alloc_rbio_pages(rbio); 262 if (ret) 263 return; 264 265 for (i = 0; i < rbio->nr_pages; i++) { 266 if (!rbio->bio_pages[i]) 267 continue; 268 269 s = kmap(rbio->bio_pages[i]); 270 d = kmap(rbio->stripe_pages[i]); 271 272 memcpy(d, s, PAGE_CACHE_SIZE); 273 274 kunmap(rbio->bio_pages[i]); 275 kunmap(rbio->stripe_pages[i]); 276 SetPageUptodate(rbio->stripe_pages[i]); 277 } 278 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 279} 280 281/* 282 * we hash on the first logical address of the stripe 283 */ 284static int rbio_bucket(struct btrfs_raid_bio *rbio) 285{ 286 u64 num = rbio->bbio->raid_map[0]; 287 288 /* 289 * we shift down quite a bit. We're using byte 290 * addressing, and most of the lower bits are zeros. 291 * This tends to upset hash_64, and it consistently 292 * returns just one or two different values. 293 * 294 * shifting off the lower bits fixes things. 295 */ 296 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 297} 298 299/* 300 * stealing an rbio means taking all the uptodate pages from the stripe 301 * array in the source rbio and putting them into the destination rbio 302 */ 303static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 304{ 305 int i; 306 struct page *s; 307 struct page *d; 308 309 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 310 return; 311 312 for (i = 0; i < dest->nr_pages; i++) { 313 s = src->stripe_pages[i]; 314 if (!s || !PageUptodate(s)) { 315 continue; 316 } 317 318 d = dest->stripe_pages[i]; 319 if (d) 320 __free_page(d); 321 322 dest->stripe_pages[i] = s; 323 src->stripe_pages[i] = NULL; 324 } 325} 326 327/* 328 * merging means we take the bio_list from the victim and 329 * splice it into the destination. The victim should 330 * be discarded afterwards. 331 * 332 * must be called with dest->rbio_list_lock held 333 */ 334static void merge_rbio(struct btrfs_raid_bio *dest, 335 struct btrfs_raid_bio *victim) 336{ 337 bio_list_merge(&dest->bio_list, &victim->bio_list); 338 dest->bio_list_bytes += victim->bio_list_bytes; 339 dest->generic_bio_cnt += victim->generic_bio_cnt; 340 bio_list_init(&victim->bio_list); 341} 342 343/* 344 * used to prune items that are in the cache. The caller 345 * must hold the hash table lock. 346 */ 347static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 348{ 349 int bucket = rbio_bucket(rbio); 350 struct btrfs_stripe_hash_table *table; 351 struct btrfs_stripe_hash *h; 352 int freeit = 0; 353 354 /* 355 * check the bit again under the hash table lock. 
356 */ 357 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 358 return; 359 360 table = rbio->fs_info->stripe_hash_table; 361 h = table->table + bucket; 362 363 /* hold the lock for the bucket because we may be 364 * removing it from the hash table 365 */ 366 spin_lock(&h->lock); 367 368 /* 369 * hold the lock for the bio list because we need 370 * to make sure the bio list is empty 371 */ 372 spin_lock(&rbio->bio_list_lock); 373 374 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 375 list_del_init(&rbio->stripe_cache); 376 table->cache_size -= 1; 377 freeit = 1; 378 379 /* if the bio list isn't empty, this rbio is 380 * still involved in an IO. We take it out 381 * of the cache list, and drop the ref that 382 * was held for the list. 383 * 384 * If the bio_list was empty, we also remove 385 * the rbio from the hash_table, and drop 386 * the corresponding ref 387 */ 388 if (bio_list_empty(&rbio->bio_list)) { 389 if (!list_empty(&rbio->hash_list)) { 390 list_del_init(&rbio->hash_list); 391 atomic_dec(&rbio->refs); 392 BUG_ON(!list_empty(&rbio->plug_list)); 393 } 394 } 395 } 396 397 spin_unlock(&rbio->bio_list_lock); 398 spin_unlock(&h->lock); 399 400 if (freeit) 401 __free_raid_bio(rbio); 402} 403 404/* 405 * prune a given rbio from the cache 406 */ 407static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 408{ 409 struct btrfs_stripe_hash_table *table; 410 unsigned long flags; 411 412 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 413 return; 414 415 table = rbio->fs_info->stripe_hash_table; 416 417 spin_lock_irqsave(&table->cache_lock, flags); 418 __remove_rbio_from_cache(rbio); 419 spin_unlock_irqrestore(&table->cache_lock, flags); 420} 421 422/* 423 * remove everything in the cache 424 */ 425static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 426{ 427 struct btrfs_stripe_hash_table *table; 428 unsigned long flags; 429 struct btrfs_raid_bio *rbio; 430 431 table = info->stripe_hash_table; 432 433 spin_lock_irqsave(&table->cache_lock, flags); 434 while (!list_empty(&table->stripe_cache)) { 435 rbio = list_entry(table->stripe_cache.next, 436 struct btrfs_raid_bio, 437 stripe_cache); 438 __remove_rbio_from_cache(rbio); 439 } 440 spin_unlock_irqrestore(&table->cache_lock, flags); 441} 442 443/* 444 * remove all cached entries and free the hash table 445 * used by unmount 446 */ 447void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 448{ 449 if (!info->stripe_hash_table) 450 return; 451 btrfs_clear_rbio_cache(info); 452 kvfree(info->stripe_hash_table); 453 info->stripe_hash_table = NULL; 454} 455 456/* 457 * insert an rbio into the stripe cache. It 458 * must have already been prepared by calling 459 * cache_rbio_pages 460 * 461 * If this rbio was already cached, it gets 462 * moved to the front of the lru. 463 * 464 * If the size of the rbio cache is too big, we 465 * prune an item. 
466 */ 467static void cache_rbio(struct btrfs_raid_bio *rbio) 468{ 469 struct btrfs_stripe_hash_table *table; 470 unsigned long flags; 471 472 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 473 return; 474 475 table = rbio->fs_info->stripe_hash_table; 476 477 spin_lock_irqsave(&table->cache_lock, flags); 478 spin_lock(&rbio->bio_list_lock); 479 480 /* bump our ref if we were not in the list before */ 481 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 482 atomic_inc(&rbio->refs); 483 484 if (!list_empty(&rbio->stripe_cache)){ 485 list_move(&rbio->stripe_cache, &table->stripe_cache); 486 } else { 487 list_add(&rbio->stripe_cache, &table->stripe_cache); 488 table->cache_size += 1; 489 } 490 491 spin_unlock(&rbio->bio_list_lock); 492 493 if (table->cache_size > RBIO_CACHE_SIZE) { 494 struct btrfs_raid_bio *found; 495 496 found = list_entry(table->stripe_cache.prev, 497 struct btrfs_raid_bio, 498 stripe_cache); 499 500 if (found != rbio) 501 __remove_rbio_from_cache(found); 502 } 503 504 spin_unlock_irqrestore(&table->cache_lock, flags); 505 return; 506} 507 508/* 509 * helper function to run the xor_blocks api. It is only 510 * able to do MAX_XOR_BLOCKS at a time, so we need to 511 * loop through. 512 */ 513static void run_xor(void **pages, int src_cnt, ssize_t len) 514{ 515 int src_off = 0; 516 int xor_src_cnt = 0; 517 void *dest = pages[src_cnt]; 518 519 while(src_cnt > 0) { 520 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 521 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 522 523 src_cnt -= xor_src_cnt; 524 src_off += xor_src_cnt; 525 } 526} 527 528/* 529 * returns true if the bio list inside this rbio 530 * covers an entire stripe (no rmw required). 531 * Must be called with the bio list lock held, or 532 * at a time when you know it is impossible to add 533 * new bios into the list 534 */ 535static int __rbio_is_full(struct btrfs_raid_bio *rbio) 536{ 537 unsigned long size = rbio->bio_list_bytes; 538 int ret = 1; 539 540 if (size != rbio->nr_data * rbio->stripe_len) 541 ret = 0; 542 543 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 544 return ret; 545} 546 547static int rbio_is_full(struct btrfs_raid_bio *rbio) 548{ 549 unsigned long flags; 550 int ret; 551 552 spin_lock_irqsave(&rbio->bio_list_lock, flags); 553 ret = __rbio_is_full(rbio); 554 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 555 return ret; 556} 557 558/* 559 * returns 1 if it is safe to merge two rbios together. 560 * The merging is safe if the two rbios correspond to 561 * the same stripe and if they are both going in the same 562 * direction (read vs write), and if neither one is 563 * locked for final IO 564 * 565 * The caller is responsible for locking such that 566 * rmw_locked is safe to test 567 */ 568static int rbio_can_merge(struct btrfs_raid_bio *last, 569 struct btrfs_raid_bio *cur) 570{ 571 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 572 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 573 return 0; 574 575 /* 576 * we can't merge with cached rbios, since the 577 * idea is that when we merge the destination 578 * rbio is going to run our IO for us. We can 579 * steal from cached rbio's though, other functions 580 * handle that. 
581 */ 582 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 583 test_bit(RBIO_CACHE_BIT, &cur->flags)) 584 return 0; 585 586 if (last->bbio->raid_map[0] != 587 cur->bbio->raid_map[0]) 588 return 0; 589 590 /* we can't merge with different operations */ 591 if (last->operation != cur->operation) 592 return 0; 593 /* 594 * We've need read the full stripe from the drive. 595 * check and repair the parity and write the new results. 596 * 597 * We're not allowed to add any new bios to the 598 * bio list here, anyone else that wants to 599 * change this stripe needs to do their own rmw. 600 */ 601 if (last->operation == BTRFS_RBIO_PARITY_SCRUB || 602 cur->operation == BTRFS_RBIO_PARITY_SCRUB) 603 return 0; 604 605 return 1; 606} 607 608/* 609 * helper to index into the pstripe 610 */ 611static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 612{ 613 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 614 return rbio->stripe_pages[index]; 615} 616 617/* 618 * helper to index into the qstripe, returns null 619 * if there is no qstripe 620 */ 621static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 622{ 623 if (rbio->nr_data + 1 == rbio->real_stripes) 624 return NULL; 625 626 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 627 PAGE_CACHE_SHIFT; 628 return rbio->stripe_pages[index]; 629} 630 631/* 632 * The first stripe in the table for a logical address 633 * has the lock. rbios are added in one of three ways: 634 * 635 * 1) Nobody has the stripe locked yet. The rbio is given 636 * the lock and 0 is returned. The caller must start the IO 637 * themselves. 638 * 639 * 2) Someone has the stripe locked, but we're able to merge 640 * with the lock owner. The rbio is freed and the IO will 641 * start automatically along with the existing rbio. 1 is returned. 642 * 643 * 3) Someone has the stripe locked, but we're not able to merge. 644 * The rbio is added to the lock owner's plug list, or merged into 645 * an rbio already on the plug list. When the lock owner unlocks, 646 * the next rbio on the list is run and the IO is started automatically. 647 * 1 is returned 648 * 649 * If we return 0, the caller still owns the rbio and must continue with 650 * IO submission. If we return 1, the caller must assume the rbio has 651 * already been freed. 652 */ 653static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 654{ 655 int bucket = rbio_bucket(rbio); 656 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 657 struct btrfs_raid_bio *cur; 658 struct btrfs_raid_bio *pending; 659 unsigned long flags; 660 DEFINE_WAIT(wait); 661 struct btrfs_raid_bio *freeit = NULL; 662 struct btrfs_raid_bio *cache_drop = NULL; 663 int ret = 0; 664 int walk = 0; 665 666 spin_lock_irqsave(&h->lock, flags); 667 list_for_each_entry(cur, &h->hash_list, hash_list) { 668 walk++; 669 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { 670 spin_lock(&cur->bio_list_lock); 671 672 /* can we steal this cached rbio's pages? */ 673 if (bio_list_empty(&cur->bio_list) && 674 list_empty(&cur->plug_list) && 675 test_bit(RBIO_CACHE_BIT, &cur->flags) && 676 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 677 list_del_init(&cur->hash_list); 678 atomic_dec(&cur->refs); 679 680 steal_rbio(cur, rbio); 681 cache_drop = cur; 682 spin_unlock(&cur->bio_list_lock); 683 684 goto lockit; 685 } 686 687 /* can we merge into the lock owner? 
*/ 688 if (rbio_can_merge(cur, rbio)) { 689 merge_rbio(cur, rbio); 690 spin_unlock(&cur->bio_list_lock); 691 freeit = rbio; 692 ret = 1; 693 goto out; 694 } 695 696 697 /* 698 * we couldn't merge with the running 699 * rbio, see if we can merge with the 700 * pending ones. We don't have to 701 * check for rmw_locked because there 702 * is no way they are inside finish_rmw 703 * right now 704 */ 705 list_for_each_entry(pending, &cur->plug_list, 706 plug_list) { 707 if (rbio_can_merge(pending, rbio)) { 708 merge_rbio(pending, rbio); 709 spin_unlock(&cur->bio_list_lock); 710 freeit = rbio; 711 ret = 1; 712 goto out; 713 } 714 } 715 716 /* no merging, put us on the tail of the plug list, 717 * our rbio will be started with the currently 718 * running rbio unlocks 719 */ 720 list_add_tail(&rbio->plug_list, &cur->plug_list); 721 spin_unlock(&cur->bio_list_lock); 722 ret = 1; 723 goto out; 724 } 725 } 726lockit: 727 atomic_inc(&rbio->refs); 728 list_add(&rbio->hash_list, &h->hash_list); 729out: 730 spin_unlock_irqrestore(&h->lock, flags); 731 if (cache_drop) 732 remove_rbio_from_cache(cache_drop); 733 if (freeit) 734 __free_raid_bio(freeit); 735 return ret; 736} 737 738/* 739 * called as rmw or parity rebuild is completed. If the plug list has more 740 * rbios waiting for this stripe, the next one on the list will be started 741 */ 742static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 743{ 744 int bucket; 745 struct btrfs_stripe_hash *h; 746 unsigned long flags; 747 int keep_cache = 0; 748 749 bucket = rbio_bucket(rbio); 750 h = rbio->fs_info->stripe_hash_table->table + bucket; 751 752 if (list_empty(&rbio->plug_list)) 753 cache_rbio(rbio); 754 755 spin_lock_irqsave(&h->lock, flags); 756 spin_lock(&rbio->bio_list_lock); 757 758 if (!list_empty(&rbio->hash_list)) { 759 /* 760 * if we're still cached and there is no other IO 761 * to perform, just leave this rbio here for others 762 * to steal from later 763 */ 764 if (list_empty(&rbio->plug_list) && 765 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 766 keep_cache = 1; 767 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 768 BUG_ON(!bio_list_empty(&rbio->bio_list)); 769 goto done; 770 } 771 772 list_del_init(&rbio->hash_list); 773 atomic_dec(&rbio->refs); 774 775 /* 776 * we use the plug list to hold all the rbios 777 * waiting for the chance to lock this stripe. 778 * hand the lock over to one of them. 
779 */ 780 if (!list_empty(&rbio->plug_list)) { 781 struct btrfs_raid_bio *next; 782 struct list_head *head = rbio->plug_list.next; 783 784 next = list_entry(head, struct btrfs_raid_bio, 785 plug_list); 786 787 list_del_init(&rbio->plug_list); 788 789 list_add(&next->hash_list, &h->hash_list); 790 atomic_inc(&next->refs); 791 spin_unlock(&rbio->bio_list_lock); 792 spin_unlock_irqrestore(&h->lock, flags); 793 794 if (next->operation == BTRFS_RBIO_READ_REBUILD) 795 async_read_rebuild(next); 796 else if (next->operation == BTRFS_RBIO_WRITE) { 797 steal_rbio(rbio, next); 798 async_rmw_stripe(next); 799 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 800 steal_rbio(rbio, next); 801 async_scrub_parity(next); 802 } 803 804 goto done_nolock; 805 } else if (waitqueue_active(&h->wait)) { 806 spin_unlock(&rbio->bio_list_lock); 807 spin_unlock_irqrestore(&h->lock, flags); 808 wake_up(&h->wait); 809 goto done_nolock; 810 } 811 } 812done: 813 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock_irqrestore(&h->lock, flags); 815 816done_nolock: 817 if (!keep_cache) 818 remove_rbio_from_cache(rbio); 819} 820 821static void __free_raid_bio(struct btrfs_raid_bio *rbio) 822{ 823 int i; 824 825 WARN_ON(atomic_read(&rbio->refs) < 0); 826 if (!atomic_dec_and_test(&rbio->refs)) 827 return; 828 829 WARN_ON(!list_empty(&rbio->stripe_cache)); 830 WARN_ON(!list_empty(&rbio->hash_list)); 831 WARN_ON(!bio_list_empty(&rbio->bio_list)); 832 833 for (i = 0; i < rbio->nr_pages; i++) { 834 if (rbio->stripe_pages[i]) { 835 __free_page(rbio->stripe_pages[i]); 836 rbio->stripe_pages[i] = NULL; 837 } 838 } 839 840 btrfs_put_bbio(rbio->bbio); 841 kfree(rbio); 842} 843 844static void free_raid_bio(struct btrfs_raid_bio *rbio) 845{ 846 unlock_stripe(rbio); 847 __free_raid_bio(rbio); 848} 849 850/* 851 * this frees the rbio and runs through all the bios in the 852 * bio_list and calls end_io on them 853 */ 854static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) 855{ 856 struct bio *cur = bio_list_get(&rbio->bio_list); 857 struct bio *next; 858 859 if (rbio->generic_bio_cnt) 860 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 861 862 free_raid_bio(rbio); 863 864 while (cur) { 865 next = cur->bi_next; 866 cur->bi_next = NULL; 867 if (uptodate) 868 set_bit(BIO_UPTODATE, &cur->bi_flags); 869 bio_endio(cur, err); 870 cur = next; 871 } 872} 873 874/* 875 * end io function used by finish_rmw. When we finally 876 * get here, we've written a full stripe 877 */ 878static void raid_write_end_io(struct bio *bio, int err) 879{ 880 struct btrfs_raid_bio *rbio = bio->bi_private; 881 882 if (err) 883 fail_bio_stripe(rbio, bio); 884 885 bio_put(bio); 886 887 if (!atomic_dec_and_test(&rbio->stripes_pending)) 888 return; 889 890 err = 0; 891 892 /* OK, we have read all the stripes we need to. */ 893 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 894 err = -EIO; 895 896 rbio_orig_end_io(rbio, err, 0); 897 return; 898} 899 900/* 901 * the read/modify/write code wants to use the original bio for 902 * any pages it included, and then use the rbio for everything 903 * else. This function decides if a given index (stripe number) 904 * and page number in that stripe fall inside the original bio 905 * or the rbio. 906 * 907 * if you set bio_list_only, you'll get a NULL back for any ranges 908 * that are outside the bio_list 909 * 910 * This doesn't take any refs on anything, you get a bare page pointer 911 * and the caller must bump refs as required. 
912 * 913 * You must call index_rbio_pages once before you can trust 914 * the answers from this function. 915 */ 916static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 917 int index, int pagenr, int bio_list_only) 918{ 919 int chunk_page; 920 struct page *p = NULL; 921 922 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 923 924 spin_lock_irq(&rbio->bio_list_lock); 925 p = rbio->bio_pages[chunk_page]; 926 spin_unlock_irq(&rbio->bio_list_lock); 927 928 if (p || bio_list_only) 929 return p; 930 931 return rbio->stripe_pages[chunk_page]; 932} 933 934/* 935 * number of pages we need for the entire stripe across all the 936 * drives 937 */ 938static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 939{ 940 unsigned long nr = stripe_len * nr_stripes; 941 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); 942} 943 944/* 945 * allocation and initial setup for the btrfs_raid_bio. Not 946 * this does not allocate any pages for rbio->pages. 947 */ 948static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 949 struct btrfs_bio *bbio, u64 stripe_len) 950{ 951 struct btrfs_raid_bio *rbio; 952 int nr_data = 0; 953 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; 954 int num_pages = rbio_nr_pages(stripe_len, real_stripes); 955 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); 956 void *p; 957 958 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + 959 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), 960 GFP_NOFS); 961 if (!rbio) 962 return ERR_PTR(-ENOMEM); 963 964 bio_list_init(&rbio->bio_list); 965 INIT_LIST_HEAD(&rbio->plug_list); 966 spin_lock_init(&rbio->bio_list_lock); 967 INIT_LIST_HEAD(&rbio->stripe_cache); 968 INIT_LIST_HEAD(&rbio->hash_list); 969 rbio->bbio = bbio; 970 rbio->fs_info = root->fs_info; 971 rbio->stripe_len = stripe_len; 972 rbio->nr_pages = num_pages; 973 rbio->real_stripes = real_stripes; 974 rbio->stripe_npages = stripe_npages; 975 rbio->faila = -1; 976 rbio->failb = -1; 977 atomic_set(&rbio->refs, 1); 978 atomic_set(&rbio->error, 0); 979 atomic_set(&rbio->stripes_pending, 0); 980 981 /* 982 * the stripe_pages and bio_pages array point to the extra 983 * memory we allocated past the end of the rbio 984 */ 985 p = rbio + 1; 986 rbio->stripe_pages = p; 987 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 988 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; 989 990 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 991 nr_data = real_stripes - 1; 992 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 993 nr_data = real_stripes - 2; 994 else 995 BUG(); 996 997 rbio->nr_data = nr_data; 998 return rbio; 999} 1000 1001/* allocate pages for all the stripes in the bio, including parity */ 1002static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1003{ 1004 int i; 1005 struct page *page; 1006 1007 for (i = 0; i < rbio->nr_pages; i++) { 1008 if (rbio->stripe_pages[i]) 1009 continue; 1010 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1011 if (!page) 1012 return -ENOMEM; 1013 rbio->stripe_pages[i] = page; 1014 ClearPageUptodate(page); 1015 } 1016 return 0; 1017} 1018 1019/* allocate pages for just the p/q stripes */ 1020static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1021{ 1022 int i; 1023 struct page *page; 1024 1025 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 1026 1027 for (; i < rbio->nr_pages; i++) { 1028 if (rbio->stripe_pages[i]) 1029 continue; 1030 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1031 if (!page) 1032 return -ENOMEM; 1033 rbio->stripe_pages[i] = 
page; 1034 } 1035 return 0; 1036} 1037 1038/* 1039 * add a single page from a specific stripe into our list of bios for IO 1040 * this will try to merge into existing bios if possible, and returns 1041 * zero if all went well. 1042 */ 1043static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1044 struct bio_list *bio_list, 1045 struct page *page, 1046 int stripe_nr, 1047 unsigned long page_index, 1048 unsigned long bio_max_len) 1049{ 1050 struct bio *last = bio_list->tail; 1051 u64 last_end = 0; 1052 int ret; 1053 struct bio *bio; 1054 struct btrfs_bio_stripe *stripe; 1055 u64 disk_start; 1056 1057 stripe = &rbio->bbio->stripes[stripe_nr]; 1058 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); 1059 1060 /* if the device is missing, just fail this stripe */ 1061 if (!stripe->dev->bdev) 1062 return fail_rbio_index(rbio, stripe_nr); 1063 1064 /* see if we can add this page onto our existing bio */ 1065 if (last) { 1066 last_end = (u64)last->bi_iter.bi_sector << 9; 1067 last_end += last->bi_iter.bi_size; 1068 1069 /* 1070 * we can't merge these if they are from different 1071 * devices or if they are not contiguous 1072 */ 1073 if (last_end == disk_start && stripe->dev->bdev && 1074 test_bit(BIO_UPTODATE, &last->bi_flags) && 1075 last->bi_bdev == stripe->dev->bdev) { 1076 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); 1077 if (ret == PAGE_CACHE_SIZE) 1078 return 0; 1079 } 1080 } 1081 1082 /* put a new bio on the list */ 1083 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); 1084 if (!bio) 1085 return -ENOMEM; 1086 1087 bio->bi_iter.bi_size = 0; 1088 bio->bi_bdev = stripe->dev->bdev; 1089 bio->bi_iter.bi_sector = disk_start >> 9; 1090 set_bit(BIO_UPTODATE, &bio->bi_flags); 1091 1092 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 1093 bio_list_add(bio_list, bio); 1094 return 0; 1095} 1096 1097/* 1098 * while we're doing the read/modify/write cycle, we could 1099 * have errors in reading pages off the disk. This checks 1100 * for errors and if we're not able to read the page it'll 1101 * trigger parity reconstruction. The rmw will be finished 1102 * after we've reconstructed the failed stripes 1103 */ 1104static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1105{ 1106 if (rbio->faila >= 0 || rbio->failb >= 0) { 1107 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1108 __raid56_parity_recover(rbio); 1109 } else { 1110 finish_rmw(rbio); 1111 } 1112} 1113 1114/* 1115 * these are just the pages from the rbio array, not from anything 1116 * the FS sent down to us 1117 */ 1118static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) 1119{ 1120 int index; 1121 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); 1122 index += page; 1123 return rbio->stripe_pages[index]; 1124} 1125 1126/* 1127 * helper function to walk our bio list and populate the bio_pages array with 1128 * the result. This seems expensive, but it is faster than constantly 1129 * searching through the bio list as we setup the IO in finish_rmw or stripe 1130 * reconstruction. 
1131 * 1132 * This must be called before you trust the answers from page_in_rbio 1133 */ 1134static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1135{ 1136 struct bio *bio; 1137 u64 start; 1138 unsigned long stripe_offset; 1139 unsigned long page_index; 1140 struct page *p; 1141 int i; 1142 1143 spin_lock_irq(&rbio->bio_list_lock); 1144 bio_list_for_each(bio, &rbio->bio_list) { 1145 start = (u64)bio->bi_iter.bi_sector << 9; 1146 stripe_offset = start - rbio->bbio->raid_map[0]; 1147 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1148 1149 for (i = 0; i < bio->bi_vcnt; i++) { 1150 p = bio->bi_io_vec[i].bv_page; 1151 rbio->bio_pages[page_index + i] = p; 1152 } 1153 } 1154 spin_unlock_irq(&rbio->bio_list_lock); 1155} 1156 1157/* 1158 * this is called from one of two situations. We either 1159 * have a full stripe from the higher layers, or we've read all 1160 * the missing bits off disk. 1161 * 1162 * This will calculate the parity and then send down any 1163 * changed blocks. 1164 */ 1165static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1166{ 1167 struct btrfs_bio *bbio = rbio->bbio; 1168 void *pointers[rbio->real_stripes]; 1169 int stripe_len = rbio->stripe_len; 1170 int nr_data = rbio->nr_data; 1171 int stripe; 1172 int pagenr; 1173 int p_stripe = -1; 1174 int q_stripe = -1; 1175 struct bio_list bio_list; 1176 struct bio *bio; 1177 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 1178 int ret; 1179 1180 bio_list_init(&bio_list); 1181 1182 if (rbio->real_stripes - rbio->nr_data == 1) { 1183 p_stripe = rbio->real_stripes - 1; 1184 } else if (rbio->real_stripes - rbio->nr_data == 2) { 1185 p_stripe = rbio->real_stripes - 2; 1186 q_stripe = rbio->real_stripes - 1; 1187 } else { 1188 BUG(); 1189 } 1190 1191 /* at this point we either have a full stripe, 1192 * or we've read the full stripe from the drive. 1193 * recalculate the parity and write the new results. 1194 * 1195 * We're not allowed to add any new bios to the 1196 * bio list here, anyone else that wants to 1197 * change this stripe needs to do their own rmw. 1198 */ 1199 spin_lock_irq(&rbio->bio_list_lock); 1200 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1201 spin_unlock_irq(&rbio->bio_list_lock); 1202 1203 atomic_set(&rbio->error, 0); 1204 1205 /* 1206 * now that we've set rmw_locked, run through the 1207 * bio list one last time and map the page pointers 1208 * 1209 * We don't cache full rbios because we're assuming 1210 * the higher layers are unlikely to use this area of 1211 * the disk again soon. If they do use it again, 1212 * hopefully they will send another full bio. 
1213 */ 1214 index_rbio_pages(rbio); 1215 if (!rbio_is_full(rbio)) 1216 cache_rbio_pages(rbio); 1217 else 1218 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1219 1220 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1221 struct page *p; 1222 /* first collect one page from each data stripe */ 1223 for (stripe = 0; stripe < nr_data; stripe++) { 1224 p = page_in_rbio(rbio, stripe, pagenr, 0); 1225 pointers[stripe] = kmap(p); 1226 } 1227 1228 /* then add the parity stripe */ 1229 p = rbio_pstripe_page(rbio, pagenr); 1230 SetPageUptodate(p); 1231 pointers[stripe++] = kmap(p); 1232 1233 if (q_stripe != -1) { 1234 1235 /* 1236 * raid6, add the qstripe and call the 1237 * library function to fill in our p/q 1238 */ 1239 p = rbio_qstripe_page(rbio, pagenr); 1240 SetPageUptodate(p); 1241 pointers[stripe++] = kmap(p); 1242 1243 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1244 pointers); 1245 } else { 1246 /* raid5 */ 1247 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 1248 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 1249 } 1250 1251 1252 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1253 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1254 } 1255 1256 /* 1257 * time to start writing. Make bios for everything from the 1258 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1259 * everything else. 1260 */ 1261 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1262 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1263 struct page *page; 1264 if (stripe < rbio->nr_data) { 1265 page = page_in_rbio(rbio, stripe, pagenr, 1); 1266 if (!page) 1267 continue; 1268 } else { 1269 page = rbio_stripe_page(rbio, stripe, pagenr); 1270 } 1271 1272 ret = rbio_add_io_page(rbio, &bio_list, 1273 page, stripe, pagenr, rbio->stripe_len); 1274 if (ret) 1275 goto cleanup; 1276 } 1277 } 1278 1279 if (likely(!bbio->num_tgtdevs)) 1280 goto write_data; 1281 1282 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1283 if (!bbio->tgtdev_map[stripe]) 1284 continue; 1285 1286 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1287 struct page *page; 1288 if (stripe < rbio->nr_data) { 1289 page = page_in_rbio(rbio, stripe, pagenr, 1); 1290 if (!page) 1291 continue; 1292 } else { 1293 page = rbio_stripe_page(rbio, stripe, pagenr); 1294 } 1295 1296 ret = rbio_add_io_page(rbio, &bio_list, page, 1297 rbio->bbio->tgtdev_map[stripe], 1298 pagenr, rbio->stripe_len); 1299 if (ret) 1300 goto cleanup; 1301 } 1302 } 1303 1304write_data: 1305 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1306 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1307 1308 while (1) { 1309 bio = bio_list_pop(&bio_list); 1310 if (!bio) 1311 break; 1312 1313 bio->bi_private = rbio; 1314 bio->bi_end_io = raid_write_end_io; 1315 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1316 submit_bio(WRITE, bio); 1317 } 1318 return; 1319 1320cleanup: 1321 rbio_orig_end_io(rbio, -EIO, 0); 1322} 1323 1324/* 1325 * helper to find the stripe number for a given bio. Used to figure out which 1326 * stripe has failed. This expects the bio to correspond to a physical disk, 1327 * so it looks up based on physical sector numbers. 
1328 */ 1329static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1330 struct bio *bio) 1331{ 1332 u64 physical = bio->bi_iter.bi_sector; 1333 u64 stripe_start; 1334 int i; 1335 struct btrfs_bio_stripe *stripe; 1336 1337 physical <<= 9; 1338 1339 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1340 stripe = &rbio->bbio->stripes[i]; 1341 stripe_start = stripe->physical; 1342 if (physical >= stripe_start && 1343 physical < stripe_start + rbio->stripe_len && 1344 bio->bi_bdev == stripe->dev->bdev) { 1345 return i; 1346 } 1347 } 1348 return -1; 1349} 1350 1351/* 1352 * helper to find the stripe number for a given 1353 * bio (before mapping). Used to figure out which stripe has 1354 * failed. This looks up based on logical block numbers. 1355 */ 1356static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1357 struct bio *bio) 1358{ 1359 u64 logical = bio->bi_iter.bi_sector; 1360 u64 stripe_start; 1361 int i; 1362 1363 logical <<= 9; 1364 1365 for (i = 0; i < rbio->nr_data; i++) { 1366 stripe_start = rbio->bbio->raid_map[i]; 1367 if (logical >= stripe_start && 1368 logical < stripe_start + rbio->stripe_len) { 1369 return i; 1370 } 1371 } 1372 return -1; 1373} 1374 1375/* 1376 * returns -EIO if we had too many failures 1377 */ 1378static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1379{ 1380 unsigned long flags; 1381 int ret = 0; 1382 1383 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1384 1385 /* we already know this stripe is bad, move on */ 1386 if (rbio->faila == failed || rbio->failb == failed) 1387 goto out; 1388 1389 if (rbio->faila == -1) { 1390 /* first failure on this rbio */ 1391 rbio->faila = failed; 1392 atomic_inc(&rbio->error); 1393 } else if (rbio->failb == -1) { 1394 /* second failure on this rbio */ 1395 rbio->failb = failed; 1396 atomic_inc(&rbio->error); 1397 } else { 1398 ret = -EIO; 1399 } 1400out: 1401 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1402 1403 return ret; 1404} 1405 1406/* 1407 * helper to fail a stripe based on a physical disk 1408 * bio. 1409 */ 1410static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1411 struct bio *bio) 1412{ 1413 int failed = find_bio_stripe(rbio, bio); 1414 1415 if (failed < 0) 1416 return -EIO; 1417 1418 return fail_rbio_index(rbio, failed); 1419} 1420 1421/* 1422 * this sets each page in the bio uptodate. It should only be used on private 1423 * rbio pages, nothing that comes in from the higher layers 1424 */ 1425static void set_bio_pages_uptodate(struct bio *bio) 1426{ 1427 int i; 1428 struct page *p; 1429 1430 for (i = 0; i < bio->bi_vcnt; i++) { 1431 p = bio->bi_io_vec[i].bv_page; 1432 SetPageUptodate(p); 1433 } 1434} 1435 1436/* 1437 * end io for the read phase of the rmw cycle. All the bios here are physical 1438 * stripe bios we've read from the disk so we can recalculate the parity of the 1439 * stripe. 
1440 * 1441 * This will usually kick off finish_rmw once all the bios are read in, but it 1442 * may trigger parity reconstruction if we had any errors along the way 1443 */ 1444static void raid_rmw_end_io(struct bio *bio, int err) 1445{ 1446 struct btrfs_raid_bio *rbio = bio->bi_private; 1447 1448 if (err) 1449 fail_bio_stripe(rbio, bio); 1450 else 1451 set_bio_pages_uptodate(bio); 1452 1453 bio_put(bio); 1454 1455 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1456 return; 1457 1458 err = 0; 1459 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1460 goto cleanup; 1461 1462 /* 1463 * this will normally call finish_rmw to start our write 1464 * but if there are any failed stripes we'll reconstruct 1465 * from parity first 1466 */ 1467 validate_rbio_for_rmw(rbio); 1468 return; 1469 1470cleanup: 1471 1472 rbio_orig_end_io(rbio, -EIO, 0); 1473} 1474 1475static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1476{ 1477 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1478 rmw_work, NULL, NULL); 1479 1480 btrfs_queue_work(rbio->fs_info->rmw_workers, 1481 &rbio->work); 1482} 1483 1484static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1485{ 1486 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1487 read_rebuild_work, NULL, NULL); 1488 1489 btrfs_queue_work(rbio->fs_info->rmw_workers, 1490 &rbio->work); 1491} 1492 1493/* 1494 * the stripe must be locked by the caller. It will 1495 * unlock after all the writes are done 1496 */ 1497static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1498{ 1499 int bios_to_read = 0; 1500 struct bio_list bio_list; 1501 int ret; 1502 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1503 int pagenr; 1504 int stripe; 1505 struct bio *bio; 1506 1507 bio_list_init(&bio_list); 1508 1509 ret = alloc_rbio_pages(rbio); 1510 if (ret) 1511 goto cleanup; 1512 1513 index_rbio_pages(rbio); 1514 1515 atomic_set(&rbio->error, 0); 1516 /* 1517 * build a list of bios to read all the missing parts of this 1518 * stripe 1519 */ 1520 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1521 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1522 struct page *page; 1523 /* 1524 * we want to find all the pages missing from 1525 * the rbio and read them from the disk. If 1526 * page_in_rbio finds a page in the bio list 1527 * we don't need to read it off the stripe. 1528 */ 1529 page = page_in_rbio(rbio, stripe, pagenr, 1); 1530 if (page) 1531 continue; 1532 1533 page = rbio_stripe_page(rbio, stripe, pagenr); 1534 /* 1535 * the bio cache may have handed us an uptodate 1536 * page. If so, be happy and use it 1537 */ 1538 if (PageUptodate(page)) 1539 continue; 1540 1541 ret = rbio_add_io_page(rbio, &bio_list, page, 1542 stripe, pagenr, rbio->stripe_len); 1543 if (ret) 1544 goto cleanup; 1545 } 1546 } 1547 1548 bios_to_read = bio_list_size(&bio_list); 1549 if (!bios_to_read) { 1550 /* 1551 * this can happen if others have merged with 1552 * us, it means there is nothing left to read. 1553 * But if there are missing devices it may not be 1554 * safe to do the full stripe write yet. 1555 */ 1556 goto finish; 1557 } 1558 1559 /* 1560 * the bbio may be freed once we submit the last bio. 
Make sure 1561 * not to touch it after that 1562 */ 1563 atomic_set(&rbio->stripes_pending, bios_to_read); 1564 while (1) { 1565 bio = bio_list_pop(&bio_list); 1566 if (!bio) 1567 break; 1568 1569 bio->bi_private = rbio; 1570 bio->bi_end_io = raid_rmw_end_io; 1571 1572 btrfs_bio_wq_end_io(rbio->fs_info, bio, 1573 BTRFS_WQ_ENDIO_RAID56); 1574 1575 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1576 submit_bio(READ, bio); 1577 } 1578 /* the actual write will happen once the reads are done */ 1579 return 0; 1580 1581cleanup: 1582 rbio_orig_end_io(rbio, -EIO, 0); 1583 return -EIO; 1584 1585finish: 1586 validate_rbio_for_rmw(rbio); 1587 return 0; 1588} 1589 1590/* 1591 * if the upper layers pass in a full stripe, we thank them by only allocating 1592 * enough pages to hold the parity, and sending it all down quickly. 1593 */ 1594static int full_stripe_write(struct btrfs_raid_bio *rbio) 1595{ 1596 int ret; 1597 1598 ret = alloc_rbio_parity_pages(rbio); 1599 if (ret) { 1600 __free_raid_bio(rbio); 1601 return ret; 1602 } 1603 1604 ret = lock_stripe_add(rbio); 1605 if (ret == 0) 1606 finish_rmw(rbio); 1607 return 0; 1608} 1609 1610/* 1611 * partial stripe writes get handed over to async helpers. 1612 * We're really hoping to merge a few more writes into this 1613 * rbio before calculating new parity 1614 */ 1615static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1616{ 1617 int ret; 1618 1619 ret = lock_stripe_add(rbio); 1620 if (ret == 0) 1621 async_rmw_stripe(rbio); 1622 return 0; 1623} 1624 1625/* 1626 * sometimes while we were reading from the drive to 1627 * recalculate parity, enough new bios come into create 1628 * a full stripe. So we do a check here to see if we can 1629 * go directly to finish_rmw 1630 */ 1631static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1632{ 1633 /* head off into rmw land if we don't have a full stripe */ 1634 if (!rbio_is_full(rbio)) 1635 return partial_stripe_write(rbio); 1636 return full_stripe_write(rbio); 1637} 1638 1639/* 1640 * We use plugging call backs to collect full stripes. 1641 * Any time we get a partial stripe write while plugged 1642 * we collect it into a list. When the unplug comes down, 1643 * we sort the list by logical block number and merge 1644 * everything we can into the same rbios 1645 */ 1646struct btrfs_plug_cb { 1647 struct blk_plug_cb cb; 1648 struct btrfs_fs_info *info; 1649 struct list_head rbio_list; 1650 struct btrfs_work work; 1651}; 1652 1653/* 1654 * rbios on the plug list are sorted for easier merging. 1655 */ 1656static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1657{ 1658 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1659 plug_list); 1660 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1661 plug_list); 1662 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1663 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1664 1665 if (a_sector < b_sector) 1666 return -1; 1667 if (a_sector > b_sector) 1668 return 1; 1669 return 0; 1670} 1671 1672static void run_plug(struct btrfs_plug_cb *plug) 1673{ 1674 struct btrfs_raid_bio *cur; 1675 struct btrfs_raid_bio *last = NULL; 1676 1677 /* 1678 * sort our plug list then try to merge 1679 * everything we can in hopes of creating full 1680 * stripes. 
1681 */ 1682 list_sort(NULL, &plug->rbio_list, plug_cmp); 1683 while (!list_empty(&plug->rbio_list)) { 1684 cur = list_entry(plug->rbio_list.next, 1685 struct btrfs_raid_bio, plug_list); 1686 list_del_init(&cur->plug_list); 1687 1688 if (rbio_is_full(cur)) { 1689 /* we have a full stripe, send it down */ 1690 full_stripe_write(cur); 1691 continue; 1692 } 1693 if (last) { 1694 if (rbio_can_merge(last, cur)) { 1695 merge_rbio(last, cur); 1696 __free_raid_bio(cur); 1697 continue; 1698 1699 } 1700 __raid56_parity_write(last); 1701 } 1702 last = cur; 1703 } 1704 if (last) { 1705 __raid56_parity_write(last); 1706 } 1707 kfree(plug); 1708} 1709 1710/* 1711 * if the unplug comes from schedule, we have to push the 1712 * work off to a helper thread 1713 */ 1714static void unplug_work(struct btrfs_work *work) 1715{ 1716 struct btrfs_plug_cb *plug; 1717 plug = container_of(work, struct btrfs_plug_cb, work); 1718 run_plug(plug); 1719} 1720 1721static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1722{ 1723 struct btrfs_plug_cb *plug; 1724 plug = container_of(cb, struct btrfs_plug_cb, cb); 1725 1726 if (from_schedule) { 1727 btrfs_init_work(&plug->work, btrfs_rmw_helper, 1728 unplug_work, NULL, NULL); 1729 btrfs_queue_work(plug->info->rmw_workers, 1730 &plug->work); 1731 return; 1732 } 1733 run_plug(plug); 1734} 1735 1736/* 1737 * our main entry point for writes from the rest of the FS. 1738 */ 1739int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1740 struct btrfs_bio *bbio, u64 stripe_len) 1741{ 1742 struct btrfs_raid_bio *rbio; 1743 struct btrfs_plug_cb *plug = NULL; 1744 struct blk_plug_cb *cb; 1745 int ret; 1746 1747 rbio = alloc_rbio(root, bbio, stripe_len); 1748 if (IS_ERR(rbio)) { 1749 btrfs_put_bbio(bbio); 1750 return PTR_ERR(rbio); 1751 } 1752 bio_list_add(&rbio->bio_list, bio); 1753 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1754 rbio->operation = BTRFS_RBIO_WRITE; 1755 1756 btrfs_bio_counter_inc_noblocked(root->fs_info); 1757 rbio->generic_bio_cnt = 1; 1758 1759 /* 1760 * don't plug on full rbios, just get them out the door 1761 * as quickly as we can 1762 */ 1763 if (rbio_is_full(rbio)) { 1764 ret = full_stripe_write(rbio); 1765 if (ret) 1766 btrfs_bio_counter_dec(root->fs_info); 1767 return ret; 1768 } 1769 1770 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1771 sizeof(*plug)); 1772 if (cb) { 1773 plug = container_of(cb, struct btrfs_plug_cb, cb); 1774 if (!plug->info) { 1775 plug->info = root->fs_info; 1776 INIT_LIST_HEAD(&plug->rbio_list); 1777 } 1778 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1779 ret = 0; 1780 } else { 1781 ret = __raid56_parity_write(rbio); 1782 if (ret) 1783 btrfs_bio_counter_dec(root->fs_info); 1784 } 1785 return ret; 1786} 1787 1788/* 1789 * all parity reconstruction happens here. We've read in everything 1790 * we can find from the drives and this does the heavy lifting of 1791 * sorting the good from the bad. 
1792 */ 1793static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1794{ 1795 int pagenr, stripe; 1796 void **pointers; 1797 int faila = -1, failb = -1; 1798 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1799 struct page *page; 1800 int err; 1801 int i; 1802 1803 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1804 if (!pointers) { 1805 err = -ENOMEM; 1806 goto cleanup_io; 1807 } 1808 1809 faila = rbio->faila; 1810 failb = rbio->failb; 1811 1812 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1813 spin_lock_irq(&rbio->bio_list_lock); 1814 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1815 spin_unlock_irq(&rbio->bio_list_lock); 1816 } 1817 1818 index_rbio_pages(rbio); 1819 1820 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1821 /* 1822 * Now we just use bitmap to mark the horizontal stripes in 1823 * which we have data when doing parity scrub. 1824 */ 1825 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1826 !test_bit(pagenr, rbio->dbitmap)) 1827 continue; 1828 1829 /* setup our array of pointers with pages 1830 * from each stripe 1831 */ 1832 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1833 /* 1834 * if we're rebuilding a read, we have to use 1835 * pages from the bio list 1836 */ 1837 if (rbio->operation == BTRFS_RBIO_READ_REBUILD && 1838 (stripe == faila || stripe == failb)) { 1839 page = page_in_rbio(rbio, stripe, pagenr, 0); 1840 } else { 1841 page = rbio_stripe_page(rbio, stripe, pagenr); 1842 } 1843 pointers[stripe] = kmap(page); 1844 } 1845 1846 /* all raid6 handling here */ 1847 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1848 /* 1849 * single failure, rebuild from parity raid5 1850 * style 1851 */ 1852 if (failb < 0) { 1853 if (faila == rbio->nr_data) { 1854 /* 1855 * Just the P stripe has failed, without 1856 * a bad data or Q stripe. 1857 * TODO, we should redo the xor here. 1858 */ 1859 err = -EIO; 1860 goto cleanup; 1861 } 1862 /* 1863 * a single failure in raid6 is rebuilt 1864 * in the pstripe code below 1865 */ 1866 goto pstripe; 1867 } 1868 1869 /* make sure our ps and qs are in order */ 1870 if (faila > failb) { 1871 int tmp = failb; 1872 failb = faila; 1873 faila = tmp; 1874 } 1875 1876 /* if the q stripe is failed, do a pstripe reconstruction 1877 * from the xors. 1878 * If both the q stripe and the P stripe are failed, we're 1879 * here due to a crc mismatch and we can't give them the 1880 * data they want 1881 */ 1882 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1883 if (rbio->bbio->raid_map[faila] == 1884 RAID5_P_STRIPE) { 1885 err = -EIO; 1886 goto cleanup; 1887 } 1888 /* 1889 * otherwise we have one bad data stripe and 1890 * a good P stripe. raid5! 
1891 */ 1892 goto pstripe; 1893 } 1894 1895 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1896 raid6_datap_recov(rbio->real_stripes, 1897 PAGE_SIZE, faila, pointers); 1898 } else { 1899 raid6_2data_recov(rbio->real_stripes, 1900 PAGE_SIZE, faila, failb, 1901 pointers); 1902 } 1903 } else { 1904 void *p; 1905 1906 /* rebuild from P stripe here (raid5 or raid6) */ 1907 BUG_ON(failb != -1); 1908pstripe: 1909 /* Copy parity block into failed block to start with */ 1910 memcpy(pointers[faila], 1911 pointers[rbio->nr_data], 1912 PAGE_CACHE_SIZE); 1913 1914 /* rearrange the pointer array */ 1915 p = pointers[faila]; 1916 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1917 pointers[stripe] = pointers[stripe + 1]; 1918 pointers[rbio->nr_data - 1] = p; 1919 1920 /* xor in the rest */ 1921 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); 1922 } 1923 /* if we're doing this rebuild as part of an rmw, go through 1924 * and set all of our private rbio pages in the 1925 * failed stripes as uptodate. This way finish_rmw will 1926 * know they can be trusted. If this was a read reconstruction, 1927 * other endio functions will fiddle the uptodate bits 1928 */ 1929 if (rbio->operation == BTRFS_RBIO_WRITE) { 1930 for (i = 0; i < nr_pages; i++) { 1931 if (faila != -1) { 1932 page = rbio_stripe_page(rbio, faila, i); 1933 SetPageUptodate(page); 1934 } 1935 if (failb != -1) { 1936 page = rbio_stripe_page(rbio, failb, i); 1937 SetPageUptodate(page); 1938 } 1939 } 1940 } 1941 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1942 /* 1943 * if we're rebuilding a read, we have to use 1944 * pages from the bio list 1945 */ 1946 if (rbio->operation == BTRFS_RBIO_READ_REBUILD && 1947 (stripe == faila || stripe == failb)) { 1948 page = page_in_rbio(rbio, stripe, pagenr, 0); 1949 } else { 1950 page = rbio_stripe_page(rbio, stripe, pagenr); 1951 } 1952 kunmap(page); 1953 } 1954 } 1955 1956 err = 0; 1957cleanup: 1958 kfree(pointers); 1959 1960cleanup_io: 1961 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1962 if (err == 0) 1963 cache_rbio_pages(rbio); 1964 else 1965 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1966 1967 rbio_orig_end_io(rbio, err, err == 0); 1968 } else if (err == 0) { 1969 rbio->faila = -1; 1970 rbio->failb = -1; 1971 1972 if (rbio->operation == BTRFS_RBIO_WRITE) 1973 finish_rmw(rbio); 1974 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 1975 finish_parity_scrub(rbio, 0); 1976 else 1977 BUG(); 1978 } else { 1979 rbio_orig_end_io(rbio, err, 0); 1980 } 1981} 1982 1983/* 1984 * This is called only for stripes we've read from disk to 1985 * reconstruct the parity. 1986 */ 1987static void raid_recover_end_io(struct bio *bio, int err) 1988{ 1989 struct btrfs_raid_bio *rbio = bio->bi_private; 1990 1991 /* 1992 * we only read stripe pages off the disk, set them 1993 * up to date if there were no errors 1994 */ 1995 if (err) 1996 fail_bio_stripe(rbio, bio); 1997 else 1998 set_bio_pages_uptodate(bio); 1999 bio_put(bio); 2000 2001 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2002 return; 2003 2004 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2005 rbio_orig_end_io(rbio, -EIO, 0); 2006 else 2007 __raid_recover_end_io(rbio); 2008} 2009 2010/* 2011 * reads everything we need off the disk to reconstruct 2012 * the parity. endio handlers trigger final reconstruction 2013 * when the IO is done. 2014 * 2015 * This is used both for reads from the higher layers and for 2016 * parity construction required to finish a rmw cycle. 
2017 */ 2018static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2019{ 2020 int bios_to_read = 0; 2021 struct bio_list bio_list; 2022 int ret; 2023 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2024 int pagenr; 2025 int stripe; 2026 struct bio *bio; 2027 2028 bio_list_init(&bio_list); 2029 2030 ret = alloc_rbio_pages(rbio); 2031 if (ret) 2032 goto cleanup; 2033 2034 atomic_set(&rbio->error, 0); 2035 2036 /* 2037 * read everything that hasn't failed. Thanks to the 2038 * stripe cache, it is possible that some or all of these 2039 * pages are going to be uptodate. 2040 */ 2041 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2042 if (rbio->faila == stripe || rbio->failb == stripe) { 2043 atomic_inc(&rbio->error); 2044 continue; 2045 } 2046 2047 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 2048 struct page *p; 2049 2050 /* 2051 * the rmw code may have already read this 2052 * page in 2053 */ 2054 p = rbio_stripe_page(rbio, stripe, pagenr); 2055 if (PageUptodate(p)) 2056 continue; 2057 2058 ret = rbio_add_io_page(rbio, &bio_list, 2059 rbio_stripe_page(rbio, stripe, pagenr), 2060 stripe, pagenr, rbio->stripe_len); 2061 if (ret < 0) 2062 goto cleanup; 2063 } 2064 } 2065 2066 bios_to_read = bio_list_size(&bio_list); 2067 if (!bios_to_read) { 2068 /* 2069 * we might have no bios to read just because the pages 2070 * were up to date, or we might have no bios to read because 2071 * the devices were gone. 2072 */ 2073 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2074 __raid_recover_end_io(rbio); 2075 goto out; 2076 } else { 2077 goto cleanup; 2078 } 2079 } 2080 2081 /* 2082 * the bbio may be freed once we submit the last bio. Make sure 2083 * not to touch it after that 2084 */ 2085 atomic_set(&rbio->stripes_pending, bios_to_read); 2086 while (1) { 2087 bio = bio_list_pop(&bio_list); 2088 if (!bio) 2089 break; 2090 2091 bio->bi_private = rbio; 2092 bio->bi_end_io = raid_recover_end_io; 2093 2094 btrfs_bio_wq_end_io(rbio->fs_info, bio, 2095 BTRFS_WQ_ENDIO_RAID56); 2096 2097 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 2098 submit_bio(READ, bio); 2099 } 2100out: 2101 return 0; 2102 2103cleanup: 2104 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) 2105 rbio_orig_end_io(rbio, -EIO, 0); 2106 return -EIO; 2107} 2108 2109/* 2110 * the main entry point for reads from the higher layers. This 2111 * is really only called when the normal read path had a failure, 2112 * so we assume the bio they send down corresponds to a failed part 2113 * of the drive. 
/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 stripe_len,
			  int mirror_num, int generic_io)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, stripe_len);
	if (IS_ERR(rbio)) {
		if (generic_io)
			btrfs_put_bbio(bbio);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		if (generic_io)
			btrfs_put_bbio(bbio);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(root->fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		btrfs_get_bbio(bbio);
	}

	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = rbio->real_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * otherwise, our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;
}

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: We need to make sure that all the pages added to the scrub/replace
 * raid bio are correct and will not change during the scrub/replace; that
 * is, those pages only hold metadata or file data covered by a checksum.
 */
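/*
 * Rough call sequence for a parity scrub, for orientation (the real caller
 * lives in the scrub code and is not shown here):
 *
 *	rbio = raid56_parity_alloc_scrub_rbio(root, bio, bbio, stripe_len,
 *					      scrub_dev, dbitmap, nsectors);
 *	// for every data page covered by dbitmap:
 *	raid56_parity_add_scrub_pages(rbio, page, logical);
 *	raid56_parity_submit_scrub_rbio(rbio);
 *
 * The submit path takes the stripe lock, reads whatever pages are missing,
 * recomputes P/Q and rewrites only the parity pages that were wrong.
 */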

struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
			       struct btrfs_bio *bbio, u64 stripe_len,
			       struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(root, bbio, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio look like the other rbio types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	for (i = 0; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}

	/* for now we only support sectorsize equal to PAGE_SIZE */
	ASSERT(root->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	return rbio;
}

void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
				   struct page *page, u64 logical)
{
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bbio->raid_map[0]);
	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
				rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
	index = stripe_offset >> PAGE_CACHE_SHIFT;
	rbio->bio_pages[index] = page;
}

/*
 * We only scrub the parity for horizontal stripes where we have correct
 * data, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int bit;
	int index;
	struct page *page;

	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
		for (i = 0; i < rbio->real_stripes; i++) {
			index = i * rbio->stripe_npages + bit;
			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
			ClearPageUptodate(page);
		}
	}
	return 0;
}

/*
 * end io function used by finish_parity_scrub.
 * When we finally get here, we've written a full stripe
 */
static void raid_write_parity_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	if (atomic_read(&rbio->error))
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (q_stripe != -1) {
		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
	}

	atomic_set(&rbio->error, 0);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		pointers[stripe++] = kmap(p_page);

		if (q_stripe != -1) {
			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			pointers[stripe++] = kmap(q_page);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* check the scrubbed parity and repair it if it is wrong */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
		else
			/* parity is right, no need to write it back */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	__free_page(p_page);
	if (q_page)
		__free_page(q_page);
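/*
 * At this point dbitmap only has bits set for the horizontal stripes whose
 * on-disk parity did not match what we just recomputed.  For example, with
 * 16 pages per stripe and a single miscompare on page 5, only that one
 * parity page is rewritten below.  pbitmap was copied before the check, so
 * a running replace still writes every requested page to the target device.
 */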
writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* every parity was right, nothing to write back */
		rbio_orig_end_io(rbio, 0, 0);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_parity_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use a scrubbing parity to repair the
		 * data, our repair capability is reduced by one.  (In the
		 * case of RAID5 we can not repair anything.)
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only the parity is wrong,
		 * just repair the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6.  If the corrupted parity is the one being
		 * scrubbed, we can luckily use the other parity to repair
		 * the data; otherwise the data stripe can not be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}
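/*
 * Worked example for validate_rbio_for_parity_scrub() above: on RAID6,
 * bbio->max_errors is 2, but the parity stripe being scrubbed can not be
 * used for repair, which effectively spends one of those two tolerated
 * failures, so at most one data stripe (dfail <= 1) may be missing.  On
 * RAID5 max_errors is 1 and any failed data stripe makes the scrub bail
 * out with -EIO.
 */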
/*
 * end io for the read phase of the scrub cycle.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate
 * the parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios are read
 * in, but it may trigger parity reconstruction if we had any errors along
 * the way
 */
static void raid56_parity_scrub_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally call finish_parity_scrub to start our write,
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	bio_list_init(&bio_list);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

static void async_scrub_parity(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			scrub_parity_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		async_scrub_parity(rbio);
}
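/*
 * Overview of the scrub path implemented above, for orientation:
 *
 *	raid56_parity_submit_scrub_rbio()
 *	  lock_stripe_add()        take the stripe lock, or queue the rbio
 *	                           behind the current lock owner
 *	  async_scrub_parity()     punt to the rmw_workers workqueue
 *	    scrub_parity_work() -> raid56_parity_scrub_stripe()
 *	                           read all the missing pages
 *	    raid56_parity_scrub_end_io() -> validate_rbio_for_parity_scrub()
 *	                           then finish_parity_scrub(), or parity
 *	                           reconstruction first if any reads failed
 */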