root/fs/btrfs/relocation.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. mapping_tree_init
  2. backref_cache_init
  3. backref_cache_cleanup
  4. alloc_backref_node
  5. free_backref_node
  6. alloc_backref_edge
  7. free_backref_edge
  8. tree_insert
  9. tree_search
  10. backref_tree_panic
  11. walk_up_backref
  12. walk_down_backref
  13. unlock_node_buffer
  14. drop_node_buffer
  15. drop_backref_node
  16. remove_backref_node
  17. update_backref_node
  18. update_backref_cache
  19. reloc_root_is_dead
  20. have_reloc_root
  21. should_ignore_root
  22. find_reloc_root
  23. is_cowonly_root
  24. read_fs_root
  25. find_inline_backref
  26. build_backref_tree
  27. clone_backref_node
  28. __add_reloc_root
  29. __del_reloc_root
  30. __update_reloc_root
  31. create_reloc_root
  32. btrfs_init_reloc_root
  33. btrfs_update_reloc_root
  34. find_next_inode
  35. in_block_group
  36. get_new_location
  37. replace_file_extents
  38. memcmp_node_keys
  39. replace_path
  40. walk_up_reloc_tree
  41. walk_down_reloc_tree
  42. invalidate_extent_cache
  43. find_next_key
  44. insert_dirty_subvol
  45. clean_dirty_subvols
  46. merge_reloc_root
  47. prepare_to_merge
  48. free_reloc_roots
  49. merge_reloc_roots
  50. free_block_list
  51. record_reloc_root_in_trans
  52. select_reloc_root
  53. select_one_root
  54. calcu_metadata_size
  55. reserve_metadata_space
  56. do_relocation
  57. link_to_upper
  58. finish_pending_nodes
  59. mark_block_processed
  60. __mark_block_processed
  61. update_processed_blocks
  62. tree_block_processed
  63. get_tree_block_key
  64. relocate_tree_block
  65. relocate_tree_blocks
  66. prealloc_file_extent_cluster
  67. setup_extent_mapping
  68. relocate_file_extent_cluster
  69. relocate_data_extent
  70. add_tree_block
  71. __add_tree_block
  72. block_use_full_backref
  73. delete_block_group_cache
  74. find_data_references
  75. add_data_references
  76. find_next_extent
  77. set_reloc_control
  78. unset_reloc_control
  79. check_extent_flags
  80. prepare_to_relocate
  81. relocate_block_group
  82. __insert_orphan_inode
  83. create_reloc_inode
  84. alloc_reloc_control
  85. describe_relocation
  86. btrfs_relocate_block_group
  87. mark_garbage_root
  88. btrfs_recover_relocation
  89. btrfs_reloc_clone_csums
  90. btrfs_reloc_cow_block
  91. btrfs_reloc_pre_snapshot
  92. btrfs_reloc_post_snapshot

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2009 Oracle.  All rights reserved.
   4  */
   5 
   6 #include <linux/sched.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/writeback.h>
   9 #include <linux/blkdev.h>
  10 #include <linux/rbtree.h>
  11 #include <linux/slab.h>
  12 #include "ctree.h"
  13 #include "disk-io.h"
  14 #include "transaction.h"
  15 #include "volumes.h"
  16 #include "locking.h"
  17 #include "btrfs_inode.h"
  18 #include "async-thread.h"
  19 #include "free-space-cache.h"
  20 #include "inode-map.h"
  21 #include "qgroup.h"
  22 #include "print-tree.h"
  23 #include "delalloc-space.h"
  24 #include "block-group.h"
  25 
  26 /*
  27  * backref_node, mapping_node and tree_block start with this
  28  */
  29 struct tree_entry {
  30         struct rb_node rb_node;
  31         u64 bytenr;
  32 };
  33 
  34 /*
  35  * present a tree block in the backref cache
  36  */
  37 struct backref_node {
  38         struct rb_node rb_node;
  39         u64 bytenr;
  40 
  41         u64 new_bytenr;
  42         /* objectid of tree block owner, can be not uptodate */
  43         u64 owner;
  44         /* link to pending, changed or detached list */
  45         struct list_head list;
  46         /* list of upper level blocks reference this block */
  47         struct list_head upper;
  48         /* list of child blocks in the cache */
  49         struct list_head lower;
  50         /* NULL if this node is not tree root */
  51         struct btrfs_root *root;
  52         /* extent buffer got by COW the block */
  53         struct extent_buffer *eb;
  54         /* level of tree block */
  55         unsigned int level:8;
  56         /* is the block in non-reference counted tree */
  57         unsigned int cowonly:1;
  58         /* 1 if no child node in the cache */
  59         unsigned int lowest:1;
  60         /* is the extent buffer locked */
  61         unsigned int locked:1;
  62         /* has the block been processed */
  63         unsigned int processed:1;
  64         /* have backrefs of this block been checked */
  65         unsigned int checked:1;
  66         /*
  67          * 1 if corresponding block has been cowed but some upper
  68          * level block pointers may not point to the new location
  69          */
  70         unsigned int pending:1;
  71         /*
  72          * 1 if the backref node isn't connected to any other
  73          * backref node.
  74          */
  75         unsigned int detached:1;
  76 };
  77 
  78 /*
  79  * present a block pointer in the backref cache
  80  */
  81 struct backref_edge {
  82         struct list_head list[2];
  83         struct backref_node *node[2];
  84 };
  85 
  86 #define LOWER   0
  87 #define UPPER   1
  88 #define RELOCATION_RESERVED_NODES       256
  89 
  90 struct backref_cache {
  91         /* red black tree of all backref nodes in the cache */
  92         struct rb_root rb_root;
  93         /* for passing backref nodes to btrfs_reloc_cow_block */
  94         struct backref_node *path[BTRFS_MAX_LEVEL];
  95         /*
  96          * list of blocks that have been cowed but some block
  97          * pointers in upper level blocks may not reflect the
  98          * new location
  99          */
 100         struct list_head pending[BTRFS_MAX_LEVEL];
 101         /* list of backref nodes with no child node */
 102         struct list_head leaves;
 103         /* list of blocks that have been cowed in current transaction */
 104         struct list_head changed;
 105         /* list of detached backref node. */
 106         struct list_head detached;
 107 
 108         u64 last_trans;
 109 
 110         int nr_nodes;
 111         int nr_edges;
 112 };
 113 
 114 /*
 115  * map address of tree root to tree
 116  */
 117 struct mapping_node {
 118         struct rb_node rb_node;
 119         u64 bytenr;
 120         void *data;
 121 };
 122 
 123 struct mapping_tree {
 124         struct rb_root rb_root;
 125         spinlock_t lock;
 126 };
 127 
 128 /*
 129  * present a tree block to process
 130  */
 131 struct tree_block {
 132         struct rb_node rb_node;
 133         u64 bytenr;
 134         struct btrfs_key key;
 135         unsigned int level:8;
 136         unsigned int key_ready:1;
 137 };
 138 
 139 #define MAX_EXTENTS 128
 140 
 141 struct file_extent_cluster {
 142         u64 start;
 143         u64 end;
 144         u64 boundary[MAX_EXTENTS];
 145         unsigned int nr;
 146 };
 147 
 148 struct reloc_control {
 149         /* block group to relocate */
 150         struct btrfs_block_group_cache *block_group;
 151         /* extent tree */
 152         struct btrfs_root *extent_root;
 153         /* inode for moving data */
 154         struct inode *data_inode;
 155 
 156         struct btrfs_block_rsv *block_rsv;
 157 
 158         struct backref_cache backref_cache;
 159 
 160         struct file_extent_cluster cluster;
 161         /* tree blocks have been processed */
 162         struct extent_io_tree processed_blocks;
 163         /* map start of tree root to corresponding reloc tree */
 164         struct mapping_tree reloc_root_tree;
 165         /* list of reloc trees */
 166         struct list_head reloc_roots;
 167         /* list of subvolume trees that get relocated */
 168         struct list_head dirty_subvol_roots;
 169         /* size of metadata reservation for merging reloc trees */
 170         u64 merging_rsv_size;
 171         /* size of relocated tree nodes */
 172         u64 nodes_relocated;
 173         /* reserved size for block group relocation*/
 174         u64 reserved_bytes;
 175 
 176         u64 search_start;
 177         u64 extents_found;
 178 
 179         unsigned int stage:8;
 180         unsigned int create_reloc_tree:1;
 181         unsigned int merge_reloc_tree:1;
 182         unsigned int found_file_extent:1;
 183 };
 184 
 185 /* stages of data relocation */
 186 #define MOVE_DATA_EXTENTS       0
 187 #define UPDATE_DATA_PTRS        1
 188 
 189 static void remove_backref_node(struct backref_cache *cache,
 190                                 struct backref_node *node);
 191 static void __mark_block_processed(struct reloc_control *rc,
 192                                    struct backref_node *node);
 193 
 194 static void mapping_tree_init(struct mapping_tree *tree)
 195 {
 196         tree->rb_root = RB_ROOT;
 197         spin_lock_init(&tree->lock);
 198 }
 199 
 200 static void backref_cache_init(struct backref_cache *cache)
 201 {
 202         int i;
 203         cache->rb_root = RB_ROOT;
 204         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 205                 INIT_LIST_HEAD(&cache->pending[i]);
 206         INIT_LIST_HEAD(&cache->changed);
 207         INIT_LIST_HEAD(&cache->detached);
 208         INIT_LIST_HEAD(&cache->leaves);
 209 }
 210 
 211 static void backref_cache_cleanup(struct backref_cache *cache)
 212 {
 213         struct backref_node *node;
 214         int i;
 215 
 216         while (!list_empty(&cache->detached)) {
 217                 node = list_entry(cache->detached.next,
 218                                   struct backref_node, list);
 219                 remove_backref_node(cache, node);
 220         }
 221 
 222         while (!list_empty(&cache->leaves)) {
 223                 node = list_entry(cache->leaves.next,
 224                                   struct backref_node, lower);
 225                 remove_backref_node(cache, node);
 226         }
 227 
 228         cache->last_trans = 0;
 229 
 230         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 231                 ASSERT(list_empty(&cache->pending[i]));
 232         ASSERT(list_empty(&cache->changed));
 233         ASSERT(list_empty(&cache->detached));
 234         ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
 235         ASSERT(!cache->nr_nodes);
 236         ASSERT(!cache->nr_edges);
 237 }
 238 
 239 static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 240 {
 241         struct backref_node *node;
 242 
 243         node = kzalloc(sizeof(*node), GFP_NOFS);
 244         if (node) {
 245                 INIT_LIST_HEAD(&node->list);
 246                 INIT_LIST_HEAD(&node->upper);
 247                 INIT_LIST_HEAD(&node->lower);
 248                 RB_CLEAR_NODE(&node->rb_node);
 249                 cache->nr_nodes++;
 250         }
 251         return node;
 252 }
 253 
 254 static void free_backref_node(struct backref_cache *cache,
 255                               struct backref_node *node)
 256 {
 257         if (node) {
 258                 cache->nr_nodes--;
 259                 kfree(node);
 260         }
 261 }
 262 
 263 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 264 {
 265         struct backref_edge *edge;
 266 
 267         edge = kzalloc(sizeof(*edge), GFP_NOFS);
 268         if (edge)
 269                 cache->nr_edges++;
 270         return edge;
 271 }
 272 
 273 static void free_backref_edge(struct backref_cache *cache,
 274                               struct backref_edge *edge)
 275 {
 276         if (edge) {
 277                 cache->nr_edges--;
 278                 kfree(edge);
 279         }
 280 }
 281 
 282 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 283                                    struct rb_node *node)
 284 {
 285         struct rb_node **p = &root->rb_node;
 286         struct rb_node *parent = NULL;
 287         struct tree_entry *entry;
 288 
 289         while (*p) {
 290                 parent = *p;
 291                 entry = rb_entry(parent, struct tree_entry, rb_node);
 292 
 293                 if (bytenr < entry->bytenr)
 294                         p = &(*p)->rb_left;
 295                 else if (bytenr > entry->bytenr)
 296                         p = &(*p)->rb_right;
 297                 else
 298                         return parent;
 299         }
 300 
 301         rb_link_node(node, parent, p);
 302         rb_insert_color(node, root);
 303         return NULL;
 304 }
 305 
 306 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 307 {
 308         struct rb_node *n = root->rb_node;
 309         struct tree_entry *entry;
 310 
 311         while (n) {
 312                 entry = rb_entry(n, struct tree_entry, rb_node);
 313 
 314                 if (bytenr < entry->bytenr)
 315                         n = n->rb_left;
 316                 else if (bytenr > entry->bytenr)
 317                         n = n->rb_right;
 318                 else
 319                         return n;
 320         }
 321         return NULL;
 322 }
 323 
 324 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 325 {
 326 
 327         struct btrfs_fs_info *fs_info = NULL;
 328         struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 329                                               rb_node);
 330         if (bnode->root)
 331                 fs_info = bnode->root->fs_info;
 332         btrfs_panic(fs_info, errno,
 333                     "Inconsistency in backref cache found at offset %llu",
 334                     bytenr);
 335 }
 336 
 337 /*
 338  * walk up backref nodes until reach node presents tree root
 339  */
 340 static struct backref_node *walk_up_backref(struct backref_node *node,
 341                                             struct backref_edge *edges[],
 342                                             int *index)
 343 {
 344         struct backref_edge *edge;
 345         int idx = *index;
 346 
 347         while (!list_empty(&node->upper)) {
 348                 edge = list_entry(node->upper.next,
 349                                   struct backref_edge, list[LOWER]);
 350                 edges[idx++] = edge;
 351                 node = edge->node[UPPER];
 352         }
 353         BUG_ON(node->detached);
 354         *index = idx;
 355         return node;
 356 }
 357 
 358 /*
 359  * walk down backref nodes to find start of next reference path
 360  */
 361 static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 362                                               int *index)
 363 {
 364         struct backref_edge *edge;
 365         struct backref_node *lower;
 366         int idx = *index;
 367 
 368         while (idx > 0) {
 369                 edge = edges[idx - 1];
 370                 lower = edge->node[LOWER];
 371                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 372                         idx--;
 373                         continue;
 374                 }
 375                 edge = list_entry(edge->list[LOWER].next,
 376                                   struct backref_edge, list[LOWER]);
 377                 edges[idx - 1] = edge;
 378                 *index = idx;
 379                 return edge->node[UPPER];
 380         }
 381         *index = 0;
 382         return NULL;
 383 }
 384 
 385 static void unlock_node_buffer(struct backref_node *node)
 386 {
 387         if (node->locked) {
 388                 btrfs_tree_unlock(node->eb);
 389                 node->locked = 0;
 390         }
 391 }
 392 
 393 static void drop_node_buffer(struct backref_node *node)
 394 {
 395         if (node->eb) {
 396                 unlock_node_buffer(node);
 397                 free_extent_buffer(node->eb);
 398                 node->eb = NULL;
 399         }
 400 }
 401 
 402 static void drop_backref_node(struct backref_cache *tree,
 403                               struct backref_node *node)
 404 {
 405         BUG_ON(!list_empty(&node->upper));
 406 
 407         drop_node_buffer(node);
 408         list_del(&node->list);
 409         list_del(&node->lower);
 410         if (!RB_EMPTY_NODE(&node->rb_node))
 411                 rb_erase(&node->rb_node, &tree->rb_root);
 412         free_backref_node(tree, node);
 413 }
 414 
 415 /*
 416  * remove a backref node from the backref cache
 417  */
 418 static void remove_backref_node(struct backref_cache *cache,
 419                                 struct backref_node *node)
 420 {
 421         struct backref_node *upper;
 422         struct backref_edge *edge;
 423 
 424         if (!node)
 425                 return;
 426 
 427         BUG_ON(!node->lowest && !node->detached);
 428         while (!list_empty(&node->upper)) {
 429                 edge = list_entry(node->upper.next, struct backref_edge,
 430                                   list[LOWER]);
 431                 upper = edge->node[UPPER];
 432                 list_del(&edge->list[LOWER]);
 433                 list_del(&edge->list[UPPER]);
 434                 free_backref_edge(cache, edge);
 435 
 436                 if (RB_EMPTY_NODE(&upper->rb_node)) {
 437                         BUG_ON(!list_empty(&node->upper));
 438                         drop_backref_node(cache, node);
 439                         node = upper;
 440                         node->lowest = 1;
 441                         continue;
 442                 }
 443                 /*
 444                  * add the node to leaf node list if no other
 445                  * child block cached.
 446                  */
 447                 if (list_empty(&upper->lower)) {
 448                         list_add_tail(&upper->lower, &cache->leaves);
 449                         upper->lowest = 1;
 450                 }
 451         }
 452 
 453         drop_backref_node(cache, node);
 454 }
 455 
 456 static void update_backref_node(struct backref_cache *cache,
 457                                 struct backref_node *node, u64 bytenr)
 458 {
 459         struct rb_node *rb_node;
 460         rb_erase(&node->rb_node, &cache->rb_root);
 461         node->bytenr = bytenr;
 462         rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 463         if (rb_node)
 464                 backref_tree_panic(rb_node, -EEXIST, bytenr);
 465 }
 466 
 467 /*
 468  * update backref cache after a transaction commit
 469  */
 470 static int update_backref_cache(struct btrfs_trans_handle *trans,
 471                                 struct backref_cache *cache)
 472 {
 473         struct backref_node *node;
 474         int level = 0;
 475 
 476         if (cache->last_trans == 0) {
 477                 cache->last_trans = trans->transid;
 478                 return 0;
 479         }
 480 
 481         if (cache->last_trans == trans->transid)
 482                 return 0;
 483 
 484         /*
 485          * detached nodes are used to avoid unnecessary backref
 486          * lookup. transaction commit changes the extent tree.
 487          * so the detached nodes are no longer useful.
 488          */
 489         while (!list_empty(&cache->detached)) {
 490                 node = list_entry(cache->detached.next,
 491                                   struct backref_node, list);
 492                 remove_backref_node(cache, node);
 493         }
 494 
 495         while (!list_empty(&cache->changed)) {
 496                 node = list_entry(cache->changed.next,
 497                                   struct backref_node, list);
 498                 list_del_init(&node->list);
 499                 BUG_ON(node->pending);
 500                 update_backref_node(cache, node, node->new_bytenr);
 501         }
 502 
 503         /*
 504          * some nodes can be left in the pending list if there were
 505          * errors during processing the pending nodes.
 506          */
 507         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 508                 list_for_each_entry(node, &cache->pending[level], list) {
 509                         BUG_ON(!node->pending);
 510                         if (node->bytenr == node->new_bytenr)
 511                                 continue;
 512                         update_backref_node(cache, node, node->new_bytenr);
 513                 }
 514         }
 515 
 516         cache->last_trans = 0;
 517         return 1;
 518 }
 519 
 520 static bool reloc_root_is_dead(struct btrfs_root *root)
 521 {
 522         /*
 523          * Pair with set_bit/clear_bit in clean_dirty_subvols and
 524          * btrfs_update_reloc_root. We need to see the updated bit before
 525          * trying to access reloc_root
 526          */
 527         smp_rmb();
 528         if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
 529                 return true;
 530         return false;
 531 }
 532 
 533 /*
 534  * Check if this subvolume tree has valid reloc tree.
 535  *
 536  * Reloc tree after swap is considered dead, thus not considered as valid.
 537  * This is enough for most callers, as they don't distinguish dead reloc root
 538  * from no reloc root.  But should_ignore_root() below is a special case.
 539  */
 540 static bool have_reloc_root(struct btrfs_root *root)
 541 {
 542         if (reloc_root_is_dead(root))
 543                 return false;
 544         if (!root->reloc_root)
 545                 return false;
 546         return true;
 547 }
 548 
 549 static int should_ignore_root(struct btrfs_root *root)
 550 {
 551         struct btrfs_root *reloc_root;
 552 
 553         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 554                 return 0;
 555 
 556         /* This root has been merged with its reloc tree, we can ignore it */
 557         if (reloc_root_is_dead(root))
 558                 return 1;
 559 
 560         reloc_root = root->reloc_root;
 561         if (!reloc_root)
 562                 return 0;
 563 
 564         if (btrfs_header_generation(reloc_root->commit_root) ==
 565             root->fs_info->running_transaction->transid)
 566                 return 0;
 567         /*
 568          * if there is reloc tree and it was created in previous
 569          * transaction backref lookup can find the reloc tree,
 570          * so backref node for the fs tree root is useless for
 571          * relocation.
 572          */
 573         return 1;
 574 }
 575 /*
 576  * find reloc tree by address of tree root
 577  */
 578 static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 579                                           u64 bytenr)
 580 {
 581         struct rb_node *rb_node;
 582         struct mapping_node *node;
 583         struct btrfs_root *root = NULL;
 584 
 585         spin_lock(&rc->reloc_root_tree.lock);
 586         rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 587         if (rb_node) {
 588                 node = rb_entry(rb_node, struct mapping_node, rb_node);
 589                 root = (struct btrfs_root *)node->data;
 590         }
 591         spin_unlock(&rc->reloc_root_tree.lock);
 592         return root;
 593 }
 594 
 595 static int is_cowonly_root(u64 root_objectid)
 596 {
 597         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 598             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 599             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 600             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 601             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 602             root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
 603             root_objectid == BTRFS_UUID_TREE_OBJECTID ||
 604             root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
 605             root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 606                 return 1;
 607         return 0;
 608 }
 609 
 610 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 611                                         u64 root_objectid)
 612 {
 613         struct btrfs_key key;
 614 
 615         key.objectid = root_objectid;
 616         key.type = BTRFS_ROOT_ITEM_KEY;
 617         if (is_cowonly_root(root_objectid))
 618                 key.offset = 0;
 619         else
 620                 key.offset = (u64)-1;
 621 
 622         return btrfs_get_fs_root(fs_info, &key, false);
 623 }
 624 
 625 static noinline_for_stack
 626 int find_inline_backref(struct extent_buffer *leaf, int slot,
 627                         unsigned long *ptr, unsigned long *end)
 628 {
 629         struct btrfs_key key;
 630         struct btrfs_extent_item *ei;
 631         struct btrfs_tree_block_info *bi;
 632         u32 item_size;
 633 
 634         btrfs_item_key_to_cpu(leaf, &key, slot);
 635 
 636         item_size = btrfs_item_size_nr(leaf, slot);
 637         if (item_size < sizeof(*ei)) {
 638                 btrfs_print_v0_err(leaf->fs_info);
 639                 btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
 640                 return 1;
 641         }
 642         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 643         WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 644                   BTRFS_EXTENT_FLAG_TREE_BLOCK));
 645 
 646         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 647             item_size <= sizeof(*ei) + sizeof(*bi)) {
 648                 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 649                 return 1;
 650         }
 651         if (key.type == BTRFS_METADATA_ITEM_KEY &&
 652             item_size <= sizeof(*ei)) {
 653                 WARN_ON(item_size < sizeof(*ei));
 654                 return 1;
 655         }
 656 
 657         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 658                 bi = (struct btrfs_tree_block_info *)(ei + 1);
 659                 *ptr = (unsigned long)(bi + 1);
 660         } else {
 661                 *ptr = (unsigned long)(ei + 1);
 662         }
 663         *end = (unsigned long)ei + item_size;
 664         return 0;
 665 }
 666 
 667 /*
 668  * build backref tree for a given tree block. root of the backref tree
 669  * corresponds the tree block, leaves of the backref tree correspond
 670  * roots of b-trees that reference the tree block.
 671  *
 672  * the basic idea of this function is check backrefs of a given block
 673  * to find upper level blocks that reference the block, and then check
 674  * backrefs of these upper level blocks recursively. the recursion stop
 675  * when tree root is reached or backrefs for the block is cached.
 676  *
 677  * NOTE: if we find backrefs for a block are cached, we know backrefs
 678  * for all upper level blocks that directly/indirectly reference the
 679  * block are also cached.
 680  */
 681 static noinline_for_stack
 682 struct backref_node *build_backref_tree(struct reloc_control *rc,
 683                                         struct btrfs_key *node_key,
 684                                         int level, u64 bytenr)
 685 {
 686         struct backref_cache *cache = &rc->backref_cache;
 687         struct btrfs_path *path1; /* For searching extent root */
 688         struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
 689         struct extent_buffer *eb;
 690         struct btrfs_root *root;
 691         struct backref_node *cur;
 692         struct backref_node *upper;
 693         struct backref_node *lower;
 694         struct backref_node *node = NULL;
 695         struct backref_node *exist = NULL;
 696         struct backref_edge *edge;
 697         struct rb_node *rb_node;
 698         struct btrfs_key key;
 699         unsigned long end;
 700         unsigned long ptr;
 701         LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
 702         LIST_HEAD(useless);
 703         int cowonly;
 704         int ret;
 705         int err = 0;
 706         bool need_check = true;
 707 
 708         path1 = btrfs_alloc_path();
 709         path2 = btrfs_alloc_path();
 710         if (!path1 || !path2) {
 711                 err = -ENOMEM;
 712                 goto out;
 713         }
 714         path1->reada = READA_FORWARD;
 715         path2->reada = READA_FORWARD;
 716 
 717         node = alloc_backref_node(cache);
 718         if (!node) {
 719                 err = -ENOMEM;
 720                 goto out;
 721         }
 722 
 723         node->bytenr = bytenr;
 724         node->level = level;
 725         node->lowest = 1;
 726         cur = node;
 727 again:
 728         end = 0;
 729         ptr = 0;
 730         key.objectid = cur->bytenr;
 731         key.type = BTRFS_METADATA_ITEM_KEY;
 732         key.offset = (u64)-1;
 733 
 734         path1->search_commit_root = 1;
 735         path1->skip_locking = 1;
 736         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 737                                 0, 0);
 738         if (ret < 0) {
 739                 err = ret;
 740                 goto out;
 741         }
 742         ASSERT(ret);
 743         ASSERT(path1->slots[0]);
 744 
 745         path1->slots[0]--;
 746 
 747         WARN_ON(cur->checked);
 748         if (!list_empty(&cur->upper)) {
 749                 /*
 750                  * the backref was added previously when processing
 751                  * backref of type BTRFS_TREE_BLOCK_REF_KEY
 752                  */
 753                 ASSERT(list_is_singular(&cur->upper));
 754                 edge = list_entry(cur->upper.next, struct backref_edge,
 755                                   list[LOWER]);
 756                 ASSERT(list_empty(&edge->list[UPPER]));
 757                 exist = edge->node[UPPER];
 758                 /*
 759                  * add the upper level block to pending list if we need
 760                  * check its backrefs
 761                  */
 762                 if (!exist->checked)
 763                         list_add_tail(&edge->list[UPPER], &list);
 764         } else {
 765                 exist = NULL;
 766         }
 767 
 768         while (1) {
 769                 cond_resched();
 770                 eb = path1->nodes[0];
 771 
 772                 if (ptr >= end) {
 773                         if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 774                                 ret = btrfs_next_leaf(rc->extent_root, path1);
 775                                 if (ret < 0) {
 776                                         err = ret;
 777                                         goto out;
 778                                 }
 779                                 if (ret > 0)
 780                                         break;
 781                                 eb = path1->nodes[0];
 782                         }
 783 
 784                         btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 785                         if (key.objectid != cur->bytenr) {
 786                                 WARN_ON(exist);
 787                                 break;
 788                         }
 789 
 790                         if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 791                             key.type == BTRFS_METADATA_ITEM_KEY) {
 792                                 ret = find_inline_backref(eb, path1->slots[0],
 793                                                           &ptr, &end);
 794                                 if (ret)
 795                                         goto next;
 796                         }
 797                 }
 798 
 799                 if (ptr < end) {
 800                         /* update key for inline back ref */
 801                         struct btrfs_extent_inline_ref *iref;
 802                         int type;
 803                         iref = (struct btrfs_extent_inline_ref *)ptr;
 804                         type = btrfs_get_extent_inline_ref_type(eb, iref,
 805                                                         BTRFS_REF_TYPE_BLOCK);
 806                         if (type == BTRFS_REF_TYPE_INVALID) {
 807                                 err = -EUCLEAN;
 808                                 goto out;
 809                         }
 810                         key.type = type;
 811                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 812 
 813                         WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 814                                 key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 815                 }
 816 
 817                 /*
 818                  * Parent node found and matches current inline ref, no need to
 819                  * rebuild this node for this inline ref.
 820                  */
 821                 if (exist &&
 822                     ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 823                       exist->owner == key.offset) ||
 824                      (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 825                       exist->bytenr == key.offset))) {
 826                         exist = NULL;
 827                         goto next;
 828                 }
 829 
 830                 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
 831                 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 832                         if (key.objectid == key.offset) {
 833                                 /*
 834                                  * Only root blocks of reloc trees use backref
 835                                  * pointing to itself.
 836                                  */
 837                                 root = find_reloc_root(rc, cur->bytenr);
 838                                 ASSERT(root);
 839                                 cur->root = root;
 840                                 break;
 841                         }
 842 
 843                         edge = alloc_backref_edge(cache);
 844                         if (!edge) {
 845                                 err = -ENOMEM;
 846                                 goto out;
 847                         }
 848                         rb_node = tree_search(&cache->rb_root, key.offset);
 849                         if (!rb_node) {
 850                                 upper = alloc_backref_node(cache);
 851                                 if (!upper) {
 852                                         free_backref_edge(cache, edge);
 853                                         err = -ENOMEM;
 854                                         goto out;
 855                                 }
 856                                 upper->bytenr = key.offset;
 857                                 upper->level = cur->level + 1;
 858                                 /*
 859                                  *  backrefs for the upper level block isn't
 860                                  *  cached, add the block to pending list
 861                                  */
 862                                 list_add_tail(&edge->list[UPPER], &list);
 863                         } else {
 864                                 upper = rb_entry(rb_node, struct backref_node,
 865                                                  rb_node);
 866                                 ASSERT(upper->checked);
 867                                 INIT_LIST_HEAD(&edge->list[UPPER]);
 868                         }
 869                         list_add_tail(&edge->list[LOWER], &cur->upper);
 870                         edge->node[LOWER] = cur;
 871                         edge->node[UPPER] = upper;
 872 
 873                         goto next;
 874                 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
 875                         err = -EINVAL;
 876                         btrfs_print_v0_err(rc->extent_root->fs_info);
 877                         btrfs_handle_fs_error(rc->extent_root->fs_info, err,
 878                                               NULL);
 879                         goto out;
 880                 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 881                         goto next;
 882                 }
 883 
 884                 /*
 885                  * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
 886                  * means the root objectid. We need to search the tree to get
 887                  * its parent bytenr.
 888                  */
 889                 root = read_fs_root(rc->extent_root->fs_info, key.offset);
 890                 if (IS_ERR(root)) {
 891                         err = PTR_ERR(root);
 892                         goto out;
 893                 }
 894 
 895                 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 896                         cur->cowonly = 1;
 897 
 898                 if (btrfs_root_level(&root->root_item) == cur->level) {
 899                         /* tree root */
 900                         ASSERT(btrfs_root_bytenr(&root->root_item) ==
 901                                cur->bytenr);
 902                         if (should_ignore_root(root))
 903                                 list_add(&cur->list, &useless);
 904                         else
 905                                 cur->root = root;
 906                         break;
 907                 }
 908 
 909                 level = cur->level + 1;
 910 
 911                 /* Search the tree to find parent blocks referring the block. */
 912                 path2->search_commit_root = 1;
 913                 path2->skip_locking = 1;
 914                 path2->lowest_level = level;
 915                 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 916                 path2->lowest_level = 0;
 917                 if (ret < 0) {
 918                         err = ret;
 919                         goto out;
 920                 }
 921                 if (ret > 0 && path2->slots[level] > 0)
 922                         path2->slots[level]--;
 923 
 924                 eb = path2->nodes[level];
 925                 if (btrfs_node_blockptr(eb, path2->slots[level]) !=
 926                     cur->bytenr) {
 927                         btrfs_err(root->fs_info,
 928         "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
 929                                   cur->bytenr, level - 1,
 930                                   root->root_key.objectid,
 931                                   node_key->objectid, node_key->type,
 932                                   node_key->offset);
 933                         err = -ENOENT;
 934                         goto out;
 935                 }
 936                 lower = cur;
 937                 need_check = true;
 938 
 939                 /* Add all nodes and edges in the path */
 940                 for (; level < BTRFS_MAX_LEVEL; level++) {
 941                         if (!path2->nodes[level]) {
 942                                 ASSERT(btrfs_root_bytenr(&root->root_item) ==
 943                                        lower->bytenr);
 944                                 if (should_ignore_root(root))
 945                                         list_add(&lower->list, &useless);
 946                                 else
 947                                         lower->root = root;
 948                                 break;
 949                         }
 950 
 951                         edge = alloc_backref_edge(cache);
 952                         if (!edge) {
 953                                 err = -ENOMEM;
 954                                 goto out;
 955                         }
 956 
 957                         eb = path2->nodes[level];
 958                         rb_node = tree_search(&cache->rb_root, eb->start);
 959                         if (!rb_node) {
 960                                 upper = alloc_backref_node(cache);
 961                                 if (!upper) {
 962                                         free_backref_edge(cache, edge);
 963                                         err = -ENOMEM;
 964                                         goto out;
 965                                 }
 966                                 upper->bytenr = eb->start;
 967                                 upper->owner = btrfs_header_owner(eb);
 968                                 upper->level = lower->level + 1;
 969                                 if (!test_bit(BTRFS_ROOT_REF_COWS,
 970                                               &root->state))
 971                                         upper->cowonly = 1;
 972 
 973                                 /*
 974                                  * if we know the block isn't shared
 975                                  * we can void checking its backrefs.
 976                                  */
 977                                 if (btrfs_block_can_be_shared(root, eb))
 978                                         upper->checked = 0;
 979                                 else
 980                                         upper->checked = 1;
 981 
 982                                 /*
 983                                  * add the block to pending list if we
 984                                  * need check its backrefs, we only do this once
 985                                  * while walking up a tree as we will catch
 986                                  * anything else later on.
 987                                  */
 988                                 if (!upper->checked && need_check) {
 989                                         need_check = false;
 990                                         list_add_tail(&edge->list[UPPER],
 991                                                       &list);
 992                                 } else {
 993                                         if (upper->checked)
 994                                                 need_check = true;
 995                                         INIT_LIST_HEAD(&edge->list[UPPER]);
 996                                 }
 997                         } else {
 998                                 upper = rb_entry(rb_node, struct backref_node,
 999                                                  rb_node);
1000                                 ASSERT(upper->checked);
1001                                 INIT_LIST_HEAD(&edge->list[UPPER]);
1002                                 if (!upper->owner)
1003                                         upper->owner = btrfs_header_owner(eb);
1004                         }
1005                         list_add_tail(&edge->list[LOWER], &lower->upper);
1006                         edge->node[LOWER] = lower;
1007                         edge->node[UPPER] = upper;
1008 
1009                         if (rb_node)
1010                                 break;
1011                         lower = upper;
1012                         upper = NULL;
1013                 }
1014                 btrfs_release_path(path2);
1015 next:
1016                 if (ptr < end) {
1017                         ptr += btrfs_extent_inline_ref_size(key.type);
1018                         if (ptr >= end) {
1019                                 WARN_ON(ptr > end);
1020                                 ptr = 0;
1021                                 end = 0;
1022                         }
1023                 }
1024                 if (ptr >= end)
1025                         path1->slots[0]++;
1026         }
1027         btrfs_release_path(path1);
1028 
1029         cur->checked = 1;
1030         WARN_ON(exist);
1031 
1032         /* the pending list isn't empty, take the first block to process */
1033         if (!list_empty(&list)) {
1034                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1035                 list_del_init(&edge->list[UPPER]);
1036                 cur = edge->node[UPPER];
1037                 goto again;
1038         }
1039 
1040         /*
1041          * everything goes well, connect backref nodes and insert backref nodes
1042          * into the cache.
1043          */
1044         ASSERT(node->checked);
1045         cowonly = node->cowonly;
1046         if (!cowonly) {
1047                 rb_node = tree_insert(&cache->rb_root, node->bytenr,
1048                                       &node->rb_node);
1049                 if (rb_node)
1050                         backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1051                 list_add_tail(&node->lower, &cache->leaves);
1052         }
1053 
1054         list_for_each_entry(edge, &node->upper, list[LOWER])
1055                 list_add_tail(&edge->list[UPPER], &list);
1056 
1057         while (!list_empty(&list)) {
1058                 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1059                 list_del_init(&edge->list[UPPER]);
1060                 upper = edge->node[UPPER];
1061                 if (upper->detached) {
1062                         list_del(&edge->list[LOWER]);
1063                         lower = edge->node[LOWER];
1064                         free_backref_edge(cache, edge);
1065                         if (list_empty(&lower->upper))
1066                                 list_add(&lower->list, &useless);
1067                         continue;
1068                 }
1069 
1070                 if (!RB_EMPTY_NODE(&upper->rb_node)) {
1071                         if (upper->lowest) {
1072                                 list_del_init(&upper->lower);
1073                                 upper->lowest = 0;
1074                         }
1075 
1076                         list_add_tail(&edge->list[UPPER], &upper->lower);
1077                         continue;
1078                 }
1079 
1080                 if (!upper->checked) {
1081                         /*
1082                          * Still want to blow up for developers since this is a
1083                          * logic bug.
1084                          */
1085                         ASSERT(0);
1086                         err = -EINVAL;
1087                         goto out;
1088                 }
1089                 if (cowonly != upper->cowonly) {
1090                         ASSERT(0);
1091                         err = -EINVAL;
1092                         goto out;
1093                 }
1094 
1095                 if (!cowonly) {
1096                         rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1097                                               &upper->rb_node);
1098                         if (rb_node)
1099                                 backref_tree_panic(rb_node, -EEXIST,
1100                                                    upper->bytenr);
1101                 }
1102 
1103                 list_add_tail(&edge->list[UPPER], &upper->lower);
1104 
1105                 list_for_each_entry(edge, &upper->upper, list[LOWER])
1106                         list_add_tail(&edge->list[UPPER], &list);
1107         }
1108         /*
1109          * process useless backref nodes. backref nodes for tree leaves
1110          * are deleted from the cache. backref nodes for upper level
1111          * tree blocks are left in the cache to avoid unnecessary backref
1112          * lookup.
1113          */
1114         while (!list_empty(&useless)) {
1115                 upper = list_entry(useless.next, struct backref_node, list);
1116                 list_del_init(&upper->list);
1117                 ASSERT(list_empty(&upper->upper));
1118                 if (upper == node)
1119                         node = NULL;
1120                 if (upper->lowest) {
1121                         list_del_init(&upper->lower);
1122                         upper->lowest = 0;
1123                 }
1124                 while (!list_empty(&upper->lower)) {
1125                         edge = list_entry(upper->lower.next,
1126                                           struct backref_edge, list[UPPER]);
1127                         list_del(&edge->list[UPPER]);
1128                         list_del(&edge->list[LOWER]);
1129                         lower = edge->node[LOWER];
1130                         free_backref_edge(cache, edge);
1131 
1132                         if (list_empty(&lower->upper))
1133                                 list_add(&lower->list, &useless);
1134                 }
1135                 __mark_block_processed(rc, upper);
1136                 if (upper->level > 0) {
1137                         list_add(&upper->list, &cache->detached);
1138                         upper->detached = 1;
1139                 } else {
1140                         rb_erase(&upper->rb_node, &cache->rb_root);
1141                         free_backref_node(cache, upper);
1142                 }
1143         }
1144 out:
1145         btrfs_free_path(path1);
1146         btrfs_free_path(path2);
1147         if (err) {
1148                 while (!list_empty(&useless)) {
1149                         lower = list_entry(useless.next,
1150                                            struct backref_node, list);
1151                         list_del_init(&lower->list);
1152                 }
1153                 while (!list_empty(&list)) {
1154                         edge = list_first_entry(&list, struct backref_edge,
1155                                                 list[UPPER]);
1156                         list_del(&edge->list[UPPER]);
1157                         list_del(&edge->list[LOWER]);
1158                         lower = edge->node[LOWER];
1159                         upper = edge->node[UPPER];
1160                         free_backref_edge(cache, edge);
1161 
1162                         /*
1163                          * Lower is no longer linked to any upper backref nodes
1164                          * and isn't in the cache, we can free it ourselves.
1165                          */
1166                         if (list_empty(&lower->upper) &&
1167                             RB_EMPTY_NODE(&lower->rb_node))
1168                                 list_add(&lower->list, &useless);
1169 
1170                         if (!RB_EMPTY_NODE(&upper->rb_node))
1171                                 continue;
1172 
1173                         /* Add this guy's upper edges to the list to process */
1174                         list_for_each_entry(edge, &upper->upper, list[LOWER])
1175                                 list_add_tail(&edge->list[UPPER], &list);
1176                         if (list_empty(&upper->upper))
1177                                 list_add(&upper->list, &useless);
1178                 }
1179 
1180                 while (!list_empty(&useless)) {
1181                         lower = list_entry(useless.next,
1182                                            struct backref_node, list);
1183                         list_del_init(&lower->list);
1184                         if (lower == node)
1185                                 node = NULL;
1186                         free_backref_node(cache, lower);
1187                 }
1188 
1189                 remove_backref_node(cache, node);
1190                 return ERR_PTR(err);
1191         }
1192         ASSERT(!node || !node->detached);
1193         return node;
1194 }
1195 
1196 /*
1197  * helper to add backref node for the newly created snapshot.
1198  * the backref node is created by cloning backref node that
1199  * corresponds to root of source tree
1200  */
1201 static int clone_backref_node(struct btrfs_trans_handle *trans,
1202                               struct reloc_control *rc,
1203                               struct btrfs_root *src,
1204                               struct btrfs_root *dest)
1205 {
1206         struct btrfs_root *reloc_root = src->reloc_root;
1207         struct backref_cache *cache = &rc->backref_cache;
1208         struct backref_node *node = NULL;
1209         struct backref_node *new_node;
1210         struct backref_edge *edge;
1211         struct backref_edge *new_edge;
1212         struct rb_node *rb_node;
1213 
1214         if (cache->last_trans > 0)
1215                 update_backref_cache(trans, cache);
1216 
1217         rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1218         if (rb_node) {
1219                 node = rb_entry(rb_node, struct backref_node, rb_node);
1220                 if (node->detached)
1221                         node = NULL;
1222                 else
1223                         BUG_ON(node->new_bytenr != reloc_root->node->start);
1224         }
1225 
1226         if (!node) {
1227                 rb_node = tree_search(&cache->rb_root,
1228                                       reloc_root->commit_root->start);
1229                 if (rb_node) {
1230                         node = rb_entry(rb_node, struct backref_node,
1231                                         rb_node);
1232                         BUG_ON(node->detached);
1233                 }
1234         }
1235 
1236         if (!node)
1237                 return 0;
1238 
1239         new_node = alloc_backref_node(cache);
1240         if (!new_node)
1241                 return -ENOMEM;
1242 
1243         new_node->bytenr = dest->node->start;
1244         new_node->level = node->level;
1245         new_node->lowest = node->lowest;
1246         new_node->checked = 1;
1247         new_node->root = dest;
1248 
1249         if (!node->lowest) {
1250                 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1251                         new_edge = alloc_backref_edge(cache);
1252                         if (!new_edge)
1253                                 goto fail;
1254 
1255                         new_edge->node[UPPER] = new_node;
1256                         new_edge->node[LOWER] = edge->node[LOWER];
1257                         list_add_tail(&new_edge->list[UPPER],
1258                                       &new_node->lower);
1259                 }
1260         } else {
1261                 list_add_tail(&new_node->lower, &cache->leaves);
1262         }
1263 
1264         rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1265                               &new_node->rb_node);
1266         if (rb_node)
1267                 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1268 
1269         if (!new_node->lowest) {
1270                 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1271                         list_add_tail(&new_edge->list[LOWER],
1272                                       &new_edge->node[LOWER]->upper);
1273                 }
1274         }
1275         return 0;
1276 fail:
1277         while (!list_empty(&new_node->lower)) {
1278                 new_edge = list_entry(new_node->lower.next,
1279                                       struct backref_edge, list[UPPER]);
1280                 list_del(&new_edge->list[UPPER]);
1281                 free_backref_edge(cache, new_edge);
1282         }
1283         free_backref_node(cache, new_node);
1284         return -ENOMEM;
1285 }
1286 
1287 /*
1288  * helper to add 'address of tree root -> reloc tree' mapping
1289  */
1290 static int __must_check __add_reloc_root(struct btrfs_root *root)
1291 {
1292         struct btrfs_fs_info *fs_info = root->fs_info;
1293         struct rb_node *rb_node;
1294         struct mapping_node *node;
1295         struct reloc_control *rc = fs_info->reloc_ctl;
1296 
1297         node = kmalloc(sizeof(*node), GFP_NOFS);
1298         if (!node)
1299                 return -ENOMEM;
1300 
1301         node->bytenr = root->commit_root->start;
1302         node->data = root;
1303 
1304         spin_lock(&rc->reloc_root_tree.lock);
1305         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1306                               node->bytenr, &node->rb_node);
1307         spin_unlock(&rc->reloc_root_tree.lock);
1308         if (rb_node) {
1309                 btrfs_panic(fs_info, -EEXIST,
1310                             "Duplicate root found for start=%llu while inserting into relocation tree",
1311                             node->bytenr);
1312         }
1313 
1314         list_add_tail(&root->root_list, &rc->reloc_roots);
1315         return 0;
1316 }
1317 
1318 /*
1319  * helper to delete the 'address of tree root -> reloc tree'
1320  * mapping
1321  */
1322 static void __del_reloc_root(struct btrfs_root *root)
1323 {
1324         struct btrfs_fs_info *fs_info = root->fs_info;
1325         struct rb_node *rb_node;
1326         struct mapping_node *node = NULL;
1327         struct reloc_control *rc = fs_info->reloc_ctl;
1328 
1329         if (rc && root->node) {
1330                 spin_lock(&rc->reloc_root_tree.lock);
1331                 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1332                                       root->commit_root->start);
1333                 if (rb_node) {
1334                         node = rb_entry(rb_node, struct mapping_node, rb_node);
1335                         rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1336                         RB_CLEAR_NODE(&node->rb_node);
1337                 }
1338                 spin_unlock(&rc->reloc_root_tree.lock);
1339                 if (!node)
1340                         return;
1341                 BUG_ON((struct btrfs_root *)node->data != root);
1342         }
1343 
1344         spin_lock(&fs_info->trans_lock);
1345         list_del_init(&root->root_list);
1346         spin_unlock(&fs_info->trans_lock);
1347         kfree(node);
1348 }
1349 
1350 /*
1351  * helper to update the 'address of tree root -> reloc tree'
1352  * mapping
1353  */
1354 static int __update_reloc_root(struct btrfs_root *root)
1355 {
1356         struct btrfs_fs_info *fs_info = root->fs_info;
1357         struct rb_node *rb_node;
1358         struct mapping_node *node = NULL;
1359         struct reloc_control *rc = fs_info->reloc_ctl;
1360 
1361         spin_lock(&rc->reloc_root_tree.lock);
1362         rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1363                               root->commit_root->start);
1364         if (rb_node) {
1365                 node = rb_entry(rb_node, struct mapping_node, rb_node);
1366                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1367         }
1368         spin_unlock(&rc->reloc_root_tree.lock);
1369 
1370         if (!node)
1371                 return 0;
1372         BUG_ON((struct btrfs_root *)node->data != root);
1373 
1374         spin_lock(&rc->reloc_root_tree.lock);
1375         node->bytenr = root->node->start;
1376         rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1377                               node->bytenr, &node->rb_node);
1378         spin_unlock(&rc->reloc_root_tree.lock);
1379         if (rb_node)
1380                 backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1381         return 0;
1382 }
1383 
1384 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1385                                         struct btrfs_root *root, u64 objectid)
1386 {
1387         struct btrfs_fs_info *fs_info = root->fs_info;
1388         struct btrfs_root *reloc_root;
1389         struct extent_buffer *eb;
1390         struct btrfs_root_item *root_item;
1391         struct btrfs_key root_key;
1392         int ret;
1393 
1394         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1395         BUG_ON(!root_item);
1396 
1397         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1398         root_key.type = BTRFS_ROOT_ITEM_KEY;
1399         root_key.offset = objectid;
1400 
1401         if (root->root_key.objectid == objectid) {
1402                 u64 commit_root_gen;
1403 
1404                 /* called by btrfs_init_reloc_root */
1405                 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1406                                       BTRFS_TREE_RELOC_OBJECTID);
1407                 BUG_ON(ret);
1408                 /*
1409                  * Set the last_snapshot field to the generation of the commit
1410                  * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1411                  * correctly (returns true) when the relocation root is created
1412                  * either inside the critical section of a transaction commit
1413                  * (through transaction.c:qgroup_account_snapshot()) and when
1414                  * it's created before the transaction commit is started.
1415                  */
1416                 commit_root_gen = btrfs_header_generation(root->commit_root);
1417                 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1418         } else {
1419                 /*
1420                  * called by btrfs_reloc_post_snapshot_hook.
1421                  * the source tree is a reloc tree, all tree blocks
1422                  * modified after it was created have RELOC flag
1423                  * set in their headers. so it's OK to not update
1424                  * the 'last_snapshot'.
1425                  */
1426                 ret = btrfs_copy_root(trans, root, root->node, &eb,
1427                                       BTRFS_TREE_RELOC_OBJECTID);
1428                 BUG_ON(ret);
1429         }
1430 
1431         memcpy(root_item, &root->root_item, sizeof(*root_item));
1432         btrfs_set_root_bytenr(root_item, eb->start);
1433         btrfs_set_root_level(root_item, btrfs_header_level(eb));
1434         btrfs_set_root_generation(root_item, trans->transid);
1435 
1436         if (root->root_key.objectid == objectid) {
1437                 btrfs_set_root_refs(root_item, 0);
1438                 memset(&root_item->drop_progress, 0,
1439                        sizeof(struct btrfs_disk_key));
1440                 root_item->drop_level = 0;
1441         }
1442 
1443         btrfs_tree_unlock(eb);
1444         free_extent_buffer(eb);
1445 
1446         ret = btrfs_insert_root(trans, fs_info->tree_root,
1447                                 &root_key, root_item);
1448         BUG_ON(ret);
1449         kfree(root_item);
1450 
1451         reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1452         BUG_ON(IS_ERR(reloc_root));
1453         reloc_root->last_trans = trans->transid;
1454         return reloc_root;
1455 }
1456 
1457 /*
1458  * create reloc tree for a given fs tree. reloc tree is just a
1459  * snapshot of the fs tree with special root objectid.
1460  */
1461 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1462                           struct btrfs_root *root)
1463 {
1464         struct btrfs_fs_info *fs_info = root->fs_info;
1465         struct btrfs_root *reloc_root;
1466         struct reloc_control *rc = fs_info->reloc_ctl;
1467         struct btrfs_block_rsv *rsv;
1468         int clear_rsv = 0;
1469         int ret;
1470 
1471         /*
1472          * The subvolume has reloc tree but the swap is finished, no need to
1473          * create/update the dead reloc tree
1474          */
1475         if (reloc_root_is_dead(root))
1476                 return 0;
1477 
1478         if (root->reloc_root) {
1479                 reloc_root = root->reloc_root;
1480                 reloc_root->last_trans = trans->transid;
1481                 return 0;
1482         }
1483 
1484         if (!rc || !rc->create_reloc_tree ||
1485             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1486                 return 0;
1487 
1488         if (!trans->reloc_reserved) {
1489                 rsv = trans->block_rsv;
1490                 trans->block_rsv = rc->block_rsv;
1491                 clear_rsv = 1;
1492         }
1493         reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1494         if (clear_rsv)
1495                 trans->block_rsv = rsv;
1496 
1497         ret = __add_reloc_root(reloc_root);
1498         BUG_ON(ret < 0);
1499         root->reloc_root = reloc_root;
1500         return 0;
1501 }
1502 
1503 /*
1504  * update root item of reloc tree
1505  */
1506 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1507                             struct btrfs_root *root)
1508 {
1509         struct btrfs_fs_info *fs_info = root->fs_info;
1510         struct btrfs_root *reloc_root;
1511         struct btrfs_root_item *root_item;
1512         int ret;
1513 
1514         if (!have_reloc_root(root))
1515                 goto out;
1516 
1517         reloc_root = root->reloc_root;
1518         root_item = &reloc_root->root_item;
1519 
1520         /* root->reloc_root will stay until current relocation finished */
1521         if (fs_info->reloc_ctl->merge_reloc_tree &&
1522             btrfs_root_refs(root_item) == 0) {
1523                 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1524                 /*
1525                  * Mark the tree as dead before we change reloc_root so
1526                  * have_reloc_root will not touch it from now on.
1527                  */
1528                 smp_wmb();
1529                 __del_reloc_root(reloc_root);
1530         }
1531 
1532         if (reloc_root->commit_root != reloc_root->node) {
1533                 __update_reloc_root(reloc_root);
1534                 btrfs_set_root_node(root_item, reloc_root->node);
1535                 free_extent_buffer(reloc_root->commit_root);
1536                 reloc_root->commit_root = btrfs_root_node(reloc_root);
1537         }
1538 
1539         ret = btrfs_update_root(trans, fs_info->tree_root,
1540                                 &reloc_root->root_key, root_item);
1541         BUG_ON(ret);
1542 
1543 out:
1544         return 0;
1545 }
1546 
1547 /*
1548  * helper to find first cached inode with inode number >= objectid
1549  * in a subvolume
1550  */
1551 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1552 {
1553         struct rb_node *node;
1554         struct rb_node *prev;
1555         struct btrfs_inode *entry;
1556         struct inode *inode;
1557 
1558         spin_lock(&root->inode_lock);
1559 again:
1560         node = root->inode_tree.rb_node;
1561         prev = NULL;
1562         while (node) {
1563                 prev = node;
1564                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1565 
1566                 if (objectid < btrfs_ino(entry))
1567                         node = node->rb_left;
1568                 else if (objectid > btrfs_ino(entry))
1569                         node = node->rb_right;
1570                 else
1571                         break;
1572         }
1573         if (!node) {
1574                 while (prev) {
1575                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
1576                         if (objectid <= btrfs_ino(entry)) {
1577                                 node = prev;
1578                                 break;
1579                         }
1580                         prev = rb_next(prev);
1581                 }
1582         }
1583         while (node) {
1584                 entry = rb_entry(node, struct btrfs_inode, rb_node);
1585                 inode = igrab(&entry->vfs_inode);
1586                 if (inode) {
1587                         spin_unlock(&root->inode_lock);
1588                         return inode;
1589                 }
1590 
1591                 objectid = btrfs_ino(entry) + 1;
1592                 if (cond_resched_lock(&root->inode_lock))
1593                         goto again;
1594 
1595                 node = rb_next(node);
1596         }
1597         spin_unlock(&root->inode_lock);
1598         return NULL;
1599 }
1600 
1601 static int in_block_group(u64 bytenr,
1602                           struct btrfs_block_group_cache *block_group)
1603 {
1604         if (bytenr >= block_group->key.objectid &&
1605             bytenr < block_group->key.objectid + block_group->key.offset)
1606                 return 1;
1607         return 0;
1608 }
1609 
1610 /*
1611  * get new location of data
1612  */
1613 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1614                             u64 bytenr, u64 num_bytes)
1615 {
1616         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1617         struct btrfs_path *path;
1618         struct btrfs_file_extent_item *fi;
1619         struct extent_buffer *leaf;
1620         int ret;
1621 
1622         path = btrfs_alloc_path();
1623         if (!path)
1624                 return -ENOMEM;
1625 
1626         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1627         ret = btrfs_lookup_file_extent(NULL, root, path,
1628                         btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1629         if (ret < 0)
1630                 goto out;
1631         if (ret > 0) {
1632                 ret = -ENOENT;
1633                 goto out;
1634         }
1635 
1636         leaf = path->nodes[0];
1637         fi = btrfs_item_ptr(leaf, path->slots[0],
1638                             struct btrfs_file_extent_item);
1639 
1640         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1641                btrfs_file_extent_compression(leaf, fi) ||
1642                btrfs_file_extent_encryption(leaf, fi) ||
1643                btrfs_file_extent_other_encoding(leaf, fi));
1644 
1645         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1646                 ret = -EINVAL;
1647                 goto out;
1648         }
1649 
1650         *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1651         ret = 0;
1652 out:
1653         btrfs_free_path(path);
1654         return ret;
1655 }
1656 
1657 /*
1658  * update file extent items in the tree leaf to point to
1659  * the new locations.
1660  */
1661 static noinline_for_stack
1662 int replace_file_extents(struct btrfs_trans_handle *trans,
1663                          struct reloc_control *rc,
1664                          struct btrfs_root *root,
1665                          struct extent_buffer *leaf)
1666 {
1667         struct btrfs_fs_info *fs_info = root->fs_info;
1668         struct btrfs_key key;
1669         struct btrfs_file_extent_item *fi;
1670         struct inode *inode = NULL;
1671         u64 parent;
1672         u64 bytenr;
1673         u64 new_bytenr = 0;
1674         u64 num_bytes;
1675         u64 end;
1676         u32 nritems;
1677         u32 i;
1678         int ret = 0;
1679         int first = 1;
1680         int dirty = 0;
1681 
1682         if (rc->stage != UPDATE_DATA_PTRS)
1683                 return 0;
1684 
1685         /* reloc trees always use full backref */
1686         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1687                 parent = leaf->start;
1688         else
1689                 parent = 0;
1690 
1691         nritems = btrfs_header_nritems(leaf);
1692         for (i = 0; i < nritems; i++) {
1693                 struct btrfs_ref ref = { 0 };
1694 
1695                 cond_resched();
1696                 btrfs_item_key_to_cpu(leaf, &key, i);
1697                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1698                         continue;
1699                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1700                 if (btrfs_file_extent_type(leaf, fi) ==
1701                     BTRFS_FILE_EXTENT_INLINE)
1702                         continue;
1703                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1704                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1705                 if (bytenr == 0)
1706                         continue;
1707                 if (!in_block_group(bytenr, rc->block_group))
1708                         continue;
1709 
1710                 /*
1711                  * if we are modifying block in fs tree, wait for readpage
1712                  * to complete and drop the extent cache
1713                  */
1714                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1715                         if (first) {
1716                                 inode = find_next_inode(root, key.objectid);
1717                                 first = 0;
1718                         } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1719                                 btrfs_add_delayed_iput(inode);
1720                                 inode = find_next_inode(root, key.objectid);
1721                         }
1722                         if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1723                                 end = key.offset +
1724                                       btrfs_file_extent_num_bytes(leaf, fi);
1725                                 WARN_ON(!IS_ALIGNED(key.offset,
1726                                                     fs_info->sectorsize));
1727                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1728                                 end--;
1729                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1730                                                       key.offset, end);
1731                                 if (!ret)
1732                                         continue;
1733 
1734                                 btrfs_drop_extent_cache(BTRFS_I(inode),
1735                                                 key.offset,     end, 1);
1736                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1737                                               key.offset, end);
1738                         }
1739                 }
1740 
1741                 ret = get_new_location(rc->data_inode, &new_bytenr,
1742                                        bytenr, num_bytes);
1743                 if (ret) {
1744                         /*
1745                          * Don't have to abort since we've not changed anything
1746                          * in the file extent yet.
1747                          */
1748                         break;
1749                 }
1750 
1751                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1752                 dirty = 1;
1753 
1754                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1755                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1756                                        num_bytes, parent);
1757                 ref.real_root = root->root_key.objectid;
1758                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1759                                     key.objectid, key.offset);
1760                 ret = btrfs_inc_extent_ref(trans, &ref);
1761                 if (ret) {
1762                         btrfs_abort_transaction(trans, ret);
1763                         break;
1764                 }
1765 
1766                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1767                                        num_bytes, parent);
1768                 ref.real_root = root->root_key.objectid;
1769                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1770                                     key.objectid, key.offset);
1771                 ret = btrfs_free_extent(trans, &ref);
1772                 if (ret) {
1773                         btrfs_abort_transaction(trans, ret);
1774                         break;
1775                 }
1776         }
1777         if (dirty)
1778                 btrfs_mark_buffer_dirty(leaf);
1779         if (inode)
1780                 btrfs_add_delayed_iput(inode);
1781         return ret;
1782 }
1783 
1784 static noinline_for_stack
1785 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1786                      struct btrfs_path *path, int level)
1787 {
1788         struct btrfs_disk_key key1;
1789         struct btrfs_disk_key key2;
1790         btrfs_node_key(eb, &key1, slot);
1791         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1792         return memcmp(&key1, &key2, sizeof(key1));
1793 }
1794 
1795 /*
1796  * try to replace tree blocks in fs tree with the new blocks
1797  * in reloc tree. tree blocks haven't been modified since the
1798  * reloc tree was create can be replaced.
1799  *
1800  * if a block was replaced, level of the block + 1 is returned.
1801  * if no block got replaced, 0 is returned. if there are other
1802  * errors, a negative error number is returned.
1803  */
1804 static noinline_for_stack
1805 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1806                  struct btrfs_root *dest, struct btrfs_root *src,
1807                  struct btrfs_path *path, struct btrfs_key *next_key,
1808                  int lowest_level, int max_level)
1809 {
1810         struct btrfs_fs_info *fs_info = dest->fs_info;
1811         struct extent_buffer *eb;
1812         struct extent_buffer *parent;
1813         struct btrfs_ref ref = { 0 };
1814         struct btrfs_key key;
1815         u64 old_bytenr;
1816         u64 new_bytenr;
1817         u64 old_ptr_gen;
1818         u64 new_ptr_gen;
1819         u64 last_snapshot;
1820         u32 blocksize;
1821         int cow = 0;
1822         int level;
1823         int ret;
1824         int slot;
1825 
1826         BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1827         BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1828 
1829         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1830 again:
1831         slot = path->slots[lowest_level];
1832         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1833 
1834         eb = btrfs_lock_root_node(dest);
1835         btrfs_set_lock_blocking_write(eb);
1836         level = btrfs_header_level(eb);
1837 
1838         if (level < lowest_level) {
1839                 btrfs_tree_unlock(eb);
1840                 free_extent_buffer(eb);
1841                 return 0;
1842         }
1843 
1844         if (cow) {
1845                 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1846                 BUG_ON(ret);
1847         }
1848         btrfs_set_lock_blocking_write(eb);
1849 
1850         if (next_key) {
1851                 next_key->objectid = (u64)-1;
1852                 next_key->type = (u8)-1;
1853                 next_key->offset = (u64)-1;
1854         }
1855 
1856         parent = eb;
1857         while (1) {
1858                 struct btrfs_key first_key;
1859 
1860                 level = btrfs_header_level(parent);
1861                 BUG_ON(level < lowest_level);
1862 
1863                 ret = btrfs_bin_search(parent, &key, level, &slot);
1864                 if (ret < 0)
1865                         break;
1866                 if (ret && slot > 0)
1867                         slot--;
1868 
1869                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1870                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1871 
1872                 old_bytenr = btrfs_node_blockptr(parent, slot);
1873                 blocksize = fs_info->nodesize;
1874                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1875                 btrfs_node_key_to_cpu(parent, &first_key, slot);
1876 
1877                 if (level <= max_level) {
1878                         eb = path->nodes[level];
1879                         new_bytenr = btrfs_node_blockptr(eb,
1880                                                         path->slots[level]);
1881                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1882                                                         path->slots[level]);
1883                 } else {
1884                         new_bytenr = 0;
1885                         new_ptr_gen = 0;
1886                 }
1887 
1888                 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1889                         ret = level;
1890                         break;
1891                 }
1892 
1893                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1894                     memcmp_node_keys(parent, slot, path, level)) {
1895                         if (level <= lowest_level) {
1896                                 ret = 0;
1897                                 break;
1898                         }
1899 
1900                         eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1901                                              level - 1, &first_key);
1902                         if (IS_ERR(eb)) {
1903                                 ret = PTR_ERR(eb);
1904                                 break;
1905                         } else if (!extent_buffer_uptodate(eb)) {
1906                                 ret = -EIO;
1907                                 free_extent_buffer(eb);
1908                                 break;
1909                         }
1910                         btrfs_tree_lock(eb);
1911                         if (cow) {
1912                                 ret = btrfs_cow_block(trans, dest, eb, parent,
1913                                                       slot, &eb);
1914                                 BUG_ON(ret);
1915                         }
1916                         btrfs_set_lock_blocking_write(eb);
1917 
1918                         btrfs_tree_unlock(parent);
1919                         free_extent_buffer(parent);
1920 
1921                         parent = eb;
1922                         continue;
1923                 }
1924 
1925                 if (!cow) {
1926                         btrfs_tree_unlock(parent);
1927                         free_extent_buffer(parent);
1928                         cow = 1;
1929                         goto again;
1930                 }
1931 
1932                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1933                                       path->slots[level]);
1934                 btrfs_release_path(path);
1935 
1936                 path->lowest_level = level;
1937                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1938                 path->lowest_level = 0;
1939                 BUG_ON(ret);
1940 
1941                 /*
1942                  * Info qgroup to trace both subtrees.
1943                  *
1944                  * We must trace both trees.
1945                  * 1) Tree reloc subtree
1946                  *    If not traced, we will leak data numbers
1947                  * 2) Fs subtree
1948                  *    If not traced, we will double count old data
1949                  *
1950                  * We don't scan the subtree right now, but only record
1951                  * the swapped tree blocks.
1952                  * The real subtree rescan is delayed until we have new
1953                  * CoW on the subtree root node before transaction commit.
1954                  */
1955                 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1956                                 rc->block_group, parent, slot,
1957                                 path->nodes[level], path->slots[level],
1958                                 last_snapshot);
1959                 if (ret < 0)
1960                         break;
1961                 /*
1962                  * swap blocks in fs tree and reloc tree.
1963                  */
1964                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1965                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1966                 btrfs_mark_buffer_dirty(parent);
1967 
1968                 btrfs_set_node_blockptr(path->nodes[level],
1969                                         path->slots[level], old_bytenr);
1970                 btrfs_set_node_ptr_generation(path->nodes[level],
1971                                               path->slots[level], old_ptr_gen);
1972                 btrfs_mark_buffer_dirty(path->nodes[level]);
1973 
1974                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1975                                        blocksize, path->nodes[level]->start);
1976                 ref.skip_qgroup = true;
1977                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1978                 ret = btrfs_inc_extent_ref(trans, &ref);
1979                 BUG_ON(ret);
1980                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1981                                        blocksize, 0);
1982                 ref.skip_qgroup = true;
1983                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1984                 ret = btrfs_inc_extent_ref(trans, &ref);
1985                 BUG_ON(ret);
1986 
1987                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1988                                        blocksize, path->nodes[level]->start);
1989                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1990                 ref.skip_qgroup = true;
1991                 ret = btrfs_free_extent(trans, &ref);
1992                 BUG_ON(ret);
1993 
1994                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1995                                        blocksize, 0);
1996                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1997                 ref.skip_qgroup = true;
1998                 ret = btrfs_free_extent(trans, &ref);
1999                 BUG_ON(ret);
2000 
2001                 btrfs_unlock_up_safe(path, 0);
2002 
2003                 ret = level;
2004                 break;
2005         }
2006         btrfs_tree_unlock(parent);
2007         free_extent_buffer(parent);
2008         return ret;
2009 }
2010 
2011 /*
2012  * helper to find next relocated block in reloc tree
2013  */
2014 static noinline_for_stack
2015 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2016                        int *level)
2017 {
2018         struct extent_buffer *eb;
2019         int i;
2020         u64 last_snapshot;
2021         u32 nritems;
2022 
2023         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2024 
2025         for (i = 0; i < *level; i++) {
2026                 free_extent_buffer(path->nodes[i]);
2027                 path->nodes[i] = NULL;
2028         }
2029 
2030         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
2031                 eb = path->nodes[i];
2032                 nritems = btrfs_header_nritems(eb);
2033                 while (path->slots[i] + 1 < nritems) {
2034                         path->slots[i]++;
2035                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2036                             last_snapshot)
2037                                 continue;
2038 
2039                         *level = i;
2040                         return 0;
2041                 }
2042                 free_extent_buffer(path->nodes[i]);
2043                 path->nodes[i] = NULL;
2044         }
2045         return 1;
2046 }
2047 
2048 /*
2049  * walk down reloc tree to find relocated block of lowest level
2050  */
2051 static noinline_for_stack
2052 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2053                          int *level)
2054 {
2055         struct btrfs_fs_info *fs_info = root->fs_info;
2056         struct extent_buffer *eb = NULL;
2057         int i;
2058         u64 bytenr;
2059         u64 ptr_gen = 0;
2060         u64 last_snapshot;
2061         u32 nritems;
2062 
2063         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2064 
2065         for (i = *level; i > 0; i--) {
2066                 struct btrfs_key first_key;
2067 
2068                 eb = path->nodes[i];
2069                 nritems = btrfs_header_nritems(eb);
2070                 while (path->slots[i] < nritems) {
2071                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2072                         if (ptr_gen > last_snapshot)
2073                                 break;
2074                         path->slots[i]++;
2075                 }
2076                 if (path->slots[i] >= nritems) {
2077                         if (i == *level)
2078                                 break;
2079                         *level = i + 1;
2080                         return 0;
2081                 }
2082                 if (i == 1) {
2083                         *level = i;
2084                         return 0;
2085                 }
2086 
2087                 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2088                 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
2089                 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
2090                                      &first_key);
2091                 if (IS_ERR(eb)) {
2092                         return PTR_ERR(eb);
2093                 } else if (!extent_buffer_uptodate(eb)) {
2094                         free_extent_buffer(eb);
2095                         return -EIO;
2096                 }
2097                 BUG_ON(btrfs_header_level(eb) != i - 1);
2098                 path->nodes[i - 1] = eb;
2099                 path->slots[i - 1] = 0;
2100         }
2101         return 1;
2102 }
2103 
2104 /*
2105  * invalidate extent cache for file extents whose key in range of
2106  * [min_key, max_key)
2107  */
2108 static int invalidate_extent_cache(struct btrfs_root *root,
2109                                    struct btrfs_key *min_key,
2110                                    struct btrfs_key *max_key)
2111 {
2112         struct btrfs_fs_info *fs_info = root->fs_info;
2113         struct inode *inode = NULL;
2114         u64 objectid;
2115         u64 start, end;
2116         u64 ino;
2117 
2118         objectid = min_key->objectid;
2119         while (1) {
2120                 cond_resched();
2121                 iput(inode);
2122 
2123                 if (objectid > max_key->objectid)
2124                         break;
2125 
2126                 inode = find_next_inode(root, objectid);
2127                 if (!inode)
2128                         break;
2129                 ino = btrfs_ino(BTRFS_I(inode));
2130 
2131                 if (ino > max_key->objectid) {
2132                         iput(inode);
2133                         break;
2134                 }
2135 
2136                 objectid = ino + 1;
2137                 if (!S_ISREG(inode->i_mode))
2138                         continue;
2139 
2140                 if (unlikely(min_key->objectid == ino)) {
2141                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2142                                 continue;
2143                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2144                                 start = 0;
2145                         else {
2146                                 start = min_key->offset;
2147                                 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2148                         }
2149                 } else {
2150                         start = 0;
2151                 }
2152 
2153                 if (unlikely(max_key->objectid == ino)) {
2154                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2155                                 continue;
2156                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2157                                 end = (u64)-1;
2158                         } else {
2159                                 if (max_key->offset == 0)
2160                                         continue;
2161                                 end = max_key->offset;
2162                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2163                                 end--;
2164                         }
2165                 } else {
2166                         end = (u64)-1;
2167                 }
2168 
2169                 /* the lock_extent waits for readpage to complete */
2170                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2171                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
2172                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2173         }
2174         return 0;
2175 }
2176 
2177 static int find_next_key(struct btrfs_path *path, int level,
2178                          struct btrfs_key *key)
2179 
2180 {
2181         while (level < BTRFS_MAX_LEVEL) {
2182                 if (!path->nodes[level])
2183                         break;
2184                 if (path->slots[level] + 1 <
2185                     btrfs_header_nritems(path->nodes[level])) {
2186                         btrfs_node_key_to_cpu(path->nodes[level], key,
2187                                               path->slots[level] + 1);
2188                         return 0;
2189                 }
2190                 level++;
2191         }
2192         return 1;
2193 }
2194 
2195 /*
2196  * Insert current subvolume into reloc_control::dirty_subvol_roots
2197  */
2198 static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
2199                                 struct reloc_control *rc,
2200                                 struct btrfs_root *root)
2201 {
2202         struct btrfs_root *reloc_root = root->reloc_root;
2203         struct btrfs_root_item *reloc_root_item;
2204 
2205         /* @root must be a subvolume tree root with a valid reloc tree */
2206         ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
2207         ASSERT(reloc_root);
2208 
2209         reloc_root_item = &reloc_root->root_item;
2210         memset(&reloc_root_item->drop_progress, 0,
2211                 sizeof(reloc_root_item->drop_progress));
2212         reloc_root_item->drop_level = 0;
2213         btrfs_set_root_refs(reloc_root_item, 0);
2214         btrfs_update_reloc_root(trans, root);
2215 
2216         if (list_empty(&root->reloc_dirty_list)) {
2217                 btrfs_grab_fs_root(root);
2218                 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
2219         }
2220 }
2221 
2222 static int clean_dirty_subvols(struct reloc_control *rc)
2223 {
2224         struct btrfs_root *root;
2225         struct btrfs_root *next;
2226         int ret = 0;
2227         int ret2;
2228 
2229         list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
2230                                  reloc_dirty_list) {
2231                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2232                         /* Merged subvolume, cleanup its reloc root */
2233                         struct btrfs_root *reloc_root = root->reloc_root;
2234 
2235                         list_del_init(&root->reloc_dirty_list);
2236                         root->reloc_root = NULL;
2237                         if (reloc_root) {
2238 
2239                                 ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1);
2240                                 if (ret2 < 0 && !ret)
2241                                         ret = ret2;
2242                         }
2243                         /*
2244                          * Need barrier to ensure clear_bit() only happens after
2245                          * root->reloc_root = NULL. Pairs with have_reloc_root.
2246                          */
2247                         smp_wmb();
2248                         clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
2249                         btrfs_put_fs_root(root);
2250                 } else {
2251                         /* Orphan reloc tree, just clean it up */
2252                         ret2 = btrfs_drop_snapshot(root, NULL, 0, 1);
2253                         if (ret2 < 0 && !ret)
2254                                 ret = ret2;
2255                 }
2256         }
2257         return ret;
2258 }
2259 
2260 /*
2261  * merge the relocated tree blocks in reloc tree with corresponding
2262  * fs tree.
2263  */
2264 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2265                                                struct btrfs_root *root)
2266 {
2267         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2268         struct btrfs_key key;
2269         struct btrfs_key next_key;
2270         struct btrfs_trans_handle *trans = NULL;
2271         struct btrfs_root *reloc_root;
2272         struct btrfs_root_item *root_item;
2273         struct btrfs_path *path;
2274         struct extent_buffer *leaf;
2275         int level;
2276         int max_level;
2277         int replaced = 0;
2278         int ret;
2279         int err = 0;
2280         u32 min_reserved;
2281 
2282         path = btrfs_alloc_path();
2283         if (!path)
2284                 return -ENOMEM;
2285         path->reada = READA_FORWARD;
2286 
2287         reloc_root = root->reloc_root;
2288         root_item = &reloc_root->root_item;
2289 
2290         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2291                 level = btrfs_root_level(root_item);
2292                 extent_buffer_get(reloc_root->node);
2293                 path->nodes[level] = reloc_root->node;
2294                 path->slots[level] = 0;
2295         } else {
2296                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2297 
2298                 level = root_item->drop_level;
2299                 BUG_ON(level == 0);
2300                 path->lowest_level = level;
2301                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2302                 path->lowest_level = 0;
2303                 if (ret < 0) {
2304                         btrfs_free_path(path);
2305                         return ret;
2306                 }
2307 
2308                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2309                                       path->slots[level]);
2310                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2311 
2312                 btrfs_unlock_up_safe(path, 0);
2313         }
2314 
2315         min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2316         memset(&next_key, 0, sizeof(next_key));
2317 
2318         while (1) {
2319                 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2320                                              BTRFS_RESERVE_FLUSH_ALL);
2321                 if (ret) {
2322                         err = ret;
2323                         goto out;
2324                 }
2325                 trans = btrfs_start_transaction(root, 0);
2326                 if (IS_ERR(trans)) {
2327                         err = PTR_ERR(trans);
2328                         trans = NULL;
2329                         goto out;
2330                 }
2331                 trans->block_rsv = rc->block_rsv;
2332 
2333                 replaced = 0;
2334                 max_level = level;
2335 
2336                 ret = walk_down_reloc_tree(reloc_root, path, &level);
2337                 if (ret < 0) {
2338                         err = ret;
2339                         goto out;
2340                 }
2341                 if (ret > 0)
2342                         break;
2343 
2344                 if (!find_next_key(path, level, &key) &&
2345                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2346                         ret = 0;
2347                 } else {
2348                         ret = replace_path(trans, rc, root, reloc_root, path,
2349                                            &next_key, level, max_level);
2350                 }
2351                 if (ret < 0) {
2352                         err = ret;
2353                         goto out;
2354                 }
2355 
2356                 if (ret > 0) {
2357                         level = ret;
2358                         btrfs_node_key_to_cpu(path->nodes[level], &key,
2359                                               path->slots[level]);
2360                         replaced = 1;
2361                 }
2362 
2363                 ret = walk_up_reloc_tree(reloc_root, path, &level);
2364                 if (ret > 0)
2365                         break;
2366 
2367                 BUG_ON(level == 0);
2368                 /*
2369                  * save the merging progress in the drop_progress.
2370                  * this is OK since root refs == 1 in this case.
2371                  */
2372                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2373                                path->slots[level]);
2374                 root_item->drop_level = level;
2375 
2376                 btrfs_end_transaction_throttle(trans);
2377                 trans = NULL;
2378 
2379                 btrfs_btree_balance_dirty(fs_info);
2380 
2381                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2382                         invalidate_extent_cache(root, &key, &next_key);
2383         }
2384 
2385         /*
2386          * handle the case only one block in the fs tree need to be
2387          * relocated and the block is tree root.
2388          */
2389         leaf = btrfs_lock_root_node(root);
2390         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2391         btrfs_tree_unlock(leaf);
2392         free_extent_buffer(leaf);
2393         if (ret < 0)
2394                 err = ret;
2395 out:
2396         btrfs_free_path(path);
2397 
2398         if (err == 0)
2399                 insert_dirty_subvol(trans, rc, root);
2400 
2401         if (trans)
2402                 btrfs_end_transaction_throttle(trans);
2403 
2404         btrfs_btree_balance_dirty(fs_info);
2405 
2406         if (replaced && rc->stage == UPDATE_DATA_PTRS)
2407                 invalidate_extent_cache(root, &key, &next_key);
2408 
2409         return err;
2410 }
2411 
2412 static noinline_for_stack
2413 int prepare_to_merge(struct reloc_control *rc, int err)
2414 {
2415         struct btrfs_root *root = rc->extent_root;
2416         struct btrfs_fs_info *fs_info = root->fs_info;
2417         struct btrfs_root *reloc_root;
2418         struct btrfs_trans_handle *trans;
2419         LIST_HEAD(reloc_roots);
2420         u64 num_bytes = 0;
2421         int ret;
2422 
2423         mutex_lock(&fs_info->reloc_mutex);
2424         rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2425         rc->merging_rsv_size += rc->nodes_relocated * 2;
2426         mutex_unlock(&fs_info->reloc_mutex);
2427 
2428 again:
2429         if (!err) {
2430                 num_bytes = rc->merging_rsv_size;
2431                 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2432                                           BTRFS_RESERVE_FLUSH_ALL);
2433                 if (ret)
2434                         err = ret;
2435         }
2436 
2437         trans = btrfs_join_transaction(rc->extent_root);
2438         if (IS_ERR(trans)) {
2439                 if (!err)
2440                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
2441                                                 num_bytes);
2442                 return PTR_ERR(trans);
2443         }
2444 
2445         if (!err) {
2446                 if (num_bytes != rc->merging_rsv_size) {
2447                         btrfs_end_transaction(trans);
2448                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
2449                                                 num_bytes);
2450                         goto again;
2451                 }
2452         }
2453 
2454         rc->merge_reloc_tree = 1;
2455 
2456         while (!list_empty(&rc->reloc_roots)) {
2457                 reloc_root = list_entry(rc->reloc_roots.next,
2458                                         struct btrfs_root, root_list);
2459                 list_del_init(&reloc_root->root_list);
2460 
2461                 root = read_fs_root(fs_info, reloc_root->root_key.offset);
2462                 BUG_ON(IS_ERR(root));
2463                 BUG_ON(root->reloc_root != reloc_root);
2464 
2465                 /*
2466                  * set reference count to 1, so btrfs_recover_relocation
2467                  * knows it should resumes merging
2468                  */
2469                 if (!err)
2470                         btrfs_set_root_refs(&reloc_root->root_item, 1);
2471                 btrfs_update_reloc_root(trans, root);
2472 
2473                 list_add(&reloc_root->root_list, &reloc_roots);
2474         }
2475 
2476         list_splice(&reloc_roots, &rc->reloc_roots);
2477 
2478         if (!err)
2479                 btrfs_commit_transaction(trans);
2480         else
2481                 btrfs_end_transaction(trans);
2482         return err;
2483 }
2484 
2485 static noinline_for_stack
2486 void free_reloc_roots(struct list_head *list)
2487 {
2488         struct btrfs_root *reloc_root;
2489 
2490         while (!list_empty(list)) {
2491                 reloc_root = list_entry(list->next, struct btrfs_root,
2492                                         root_list);
2493                 __del_reloc_root(reloc_root);
2494                 free_extent_buffer(reloc_root->node);
2495                 free_extent_buffer(reloc_root->commit_root);
2496                 reloc_root->node = NULL;
2497                 reloc_root->commit_root = NULL;
2498         }
2499 }
2500 
2501 static noinline_for_stack
2502 void merge_reloc_roots(struct reloc_control *rc)
2503 {
2504         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2505         struct btrfs_root *root;
2506         struct btrfs_root *reloc_root;
2507         LIST_HEAD(reloc_roots);
2508         int found = 0;
2509         int ret = 0;
2510 again:
2511         root = rc->extent_root;
2512 
2513         /*
2514          * this serializes us with btrfs_record_root_in_transaction,
2515          * we have to make sure nobody is in the middle of
2516          * adding their roots to the list while we are
2517          * doing this splice
2518          */
2519         mutex_lock(&fs_info->reloc_mutex);
2520         list_splice_init(&rc->reloc_roots, &reloc_roots);
2521         mutex_unlock(&fs_info->reloc_mutex);
2522 
2523         while (!list_empty(&reloc_roots)) {
2524                 found = 1;
2525                 reloc_root = list_entry(reloc_roots.next,
2526                                         struct btrfs_root, root_list);
2527 
2528                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2529                         root = read_fs_root(fs_info,
2530                                             reloc_root->root_key.offset);
2531                         BUG_ON(IS_ERR(root));
2532                         BUG_ON(root->reloc_root != reloc_root);
2533 
2534                         ret = merge_reloc_root(rc, root);
2535                         if (ret) {
2536                                 if (list_empty(&reloc_root->root_list))
2537                                         list_add_tail(&reloc_root->root_list,
2538                                                       &reloc_roots);
2539                                 goto out;
2540                         }
2541                 } else {
2542                         list_del_init(&reloc_root->root_list);
2543                         /* Don't forget to queue this reloc root for cleanup */
2544                         list_add_tail(&reloc_root->reloc_dirty_list,
2545                                       &rc->dirty_subvol_roots);
2546                 }
2547         }
2548 
2549         if (found) {
2550                 found = 0;
2551                 goto again;
2552         }
2553 out:
2554         if (ret) {
2555                 btrfs_handle_fs_error(fs_info, ret, NULL);
2556                 if (!list_empty(&reloc_roots))
2557                         free_reloc_roots(&reloc_roots);
2558 
2559                 /* new reloc root may be added */
2560                 mutex_lock(&fs_info->reloc_mutex);
2561                 list_splice_init(&rc->reloc_roots, &reloc_roots);
2562                 mutex_unlock(&fs_info->reloc_mutex);
2563                 if (!list_empty(&reloc_roots))
2564                         free_reloc_roots(&reloc_roots);
2565         }
2566 
2567         /*
2568          * We used to have
2569          *
2570          * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2571          *
2572          * here, but it's wrong.  If we fail to start the transaction in
2573          * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2574          * have actually been removed from the reloc_root_tree rb tree.  This is
2575          * fine because we're bailing here, and we hold a reference on the root
2576          * for the list that holds it, so these roots will be cleaned up when we
2577          * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
2578          * will be cleaned up on unmount.
2579          *
2580          * The remaining nodes will be cleaned up by free_reloc_control.
2581          */
2582 }
2583 
2584 static void free_block_list(struct rb_root *blocks)
2585 {
2586         struct tree_block *block;
2587         struct rb_node *rb_node;
2588         while ((rb_node = rb_first(blocks))) {
2589                 block = rb_entry(rb_node, struct tree_block, rb_node);
2590                 rb_erase(rb_node, blocks);
2591                 kfree(block);
2592         }
2593 }
2594 
2595 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2596                                       struct btrfs_root *reloc_root)
2597 {
2598         struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2599         struct btrfs_root *root;
2600 
2601         if (reloc_root->last_trans == trans->transid)
2602                 return 0;
2603 
2604         root = read_fs_root(fs_info, reloc_root->root_key.offset);
2605         BUG_ON(IS_ERR(root));
2606         BUG_ON(root->reloc_root != reloc_root);
2607 
2608         return btrfs_record_root_in_trans(trans, root);
2609 }
2610 
2611 static noinline_for_stack
2612 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2613                                      struct reloc_control *rc,
2614                                      struct backref_node *node,
2615                                      struct backref_edge *edges[])
2616 {
2617         struct backref_node *next;
2618         struct btrfs_root *root;
2619         int index = 0;
2620 
2621         next = node;
2622         while (1) {
2623                 cond_resched();
2624                 next = walk_up_backref(next, edges, &index);
2625                 root = next->root;
2626                 BUG_ON(!root);
2627                 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2628 
2629                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2630                         record_reloc_root_in_trans(trans, root);
2631                         break;
2632                 }
2633 
2634                 btrfs_record_root_in_trans(trans, root);
2635                 root = root->reloc_root;
2636 
2637                 if (next->new_bytenr != root->node->start) {
2638                         BUG_ON(next->new_bytenr);
2639                         BUG_ON(!list_empty(&next->list));
2640                         next->new_bytenr = root->node->start;
2641                         next->root = root;
2642                         list_add_tail(&next->list,
2643                                       &rc->backref_cache.changed);
2644                         __mark_block_processed(rc, next);
2645                         break;
2646                 }
2647 
2648                 WARN_ON(1);
2649                 root = NULL;
2650                 next = walk_down_backref(edges, &index);
2651                 if (!next || next->level <= node->level)
2652                         break;
2653         }
2654         if (!root)
2655                 return NULL;
2656 
2657         next = node;
2658         /* setup backref node path for btrfs_reloc_cow_block */
2659         while (1) {
2660                 rc->backref_cache.path[next->level] = next;
2661                 if (--index < 0)
2662                         break;
2663                 next = edges[index]->node[UPPER];
2664         }
2665         return root;
2666 }
2667 
2668 /*
2669  * select a tree root for relocation. return NULL if the block
2670  * is reference counted. we should use do_relocation() in this
2671  * case. return a tree root pointer if the block isn't reference
2672  * counted. return -ENOENT if the block is root of reloc tree.
2673  */
2674 static noinline_for_stack
2675 struct btrfs_root *select_one_root(struct backref_node *node)
2676 {
2677         struct backref_node *next;
2678         struct btrfs_root *root;
2679         struct btrfs_root *fs_root = NULL;
2680         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2681         int index = 0;
2682 
2683         next = node;
2684         while (1) {
2685                 cond_resched();
2686                 next = walk_up_backref(next, edges, &index);
2687                 root = next->root;
2688                 BUG_ON(!root);
2689 
2690                 /* no other choice for non-references counted tree */
2691                 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2692                         return root;
2693 
2694                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2695                         fs_root = root;
2696 
2697                 if (next != node)
2698                         return NULL;
2699 
2700                 next = walk_down_backref(edges, &index);
2701                 if (!next || next->level <= node->level)
2702                         break;
2703         }
2704 
2705         if (!fs_root)
2706                 return ERR_PTR(-ENOENT);
2707         return fs_root;
2708 }
2709 
2710 static noinline_for_stack
2711 u64 calcu_metadata_size(struct reloc_control *rc,
2712                         struct backref_node *node, int reserve)
2713 {
2714         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2715         struct backref_node *next = node;
2716         struct backref_edge *edge;
2717         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2718         u64 num_bytes = 0;
2719         int index = 0;
2720 
2721         BUG_ON(reserve && node->processed);
2722 
2723         while (next) {
2724                 cond_resched();
2725                 while (1) {
2726                         if (next->processed && (reserve || next != node))
2727                                 break;
2728 
2729                         num_bytes += fs_info->nodesize;
2730 
2731                         if (list_empty(&next->upper))
2732                                 break;
2733 
2734                         edge = list_entry(next->upper.next,
2735                                           struct backref_edge, list[LOWER]);
2736                         edges[index++] = edge;
2737                         next = edge->node[UPPER];
2738                 }
2739                 next = walk_down_backref(edges, &index);
2740         }
2741         return num_bytes;
2742 }
2743 
2744 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2745                                   struct reloc_control *rc,
2746                                   struct backref_node *node)
2747 {
2748         struct btrfs_root *root = rc->extent_root;
2749         struct btrfs_fs_info *fs_info = root->fs_info;
2750         u64 num_bytes;
2751         int ret;
2752         u64 tmp;
2753 
2754         num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2755 
2756         trans->block_rsv = rc->block_rsv;
2757         rc->reserved_bytes += num_bytes;
2758 
2759         /*
2760          * We are under a transaction here so we can only do limited flushing.
2761          * If we get an enospc just kick back -EAGAIN so we know to drop the
2762          * transaction and try to refill when we can flush all the things.
2763          */
2764         ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2765                                 BTRFS_RESERVE_FLUSH_LIMIT);
2766         if (ret) {
2767                 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2768                 while (tmp <= rc->reserved_bytes)
2769                         tmp <<= 1;
2770                 /*
2771                  * only one thread can access block_rsv at this point,
2772                  * so we don't need hold lock to protect block_rsv.
2773                  * we expand more reservation size here to allow enough
2774                  * space for relocation and we will return earlier in
2775                  * enospc case.
2776                  */
2777                 rc->block_rsv->size = tmp + fs_info->nodesize *
2778                                       RELOCATION_RESERVED_NODES;
2779                 return -EAGAIN;
2780         }
2781 
2782         return 0;
2783 }
2784 
2785 /*
2786  * relocate a block tree, and then update pointers in upper level
2787  * blocks that reference the block to point to the new location.
2788  *
2789  * if called by link_to_upper, the block has already been relocated.
2790  * in that case this function just updates pointers.
2791  */
2792 static int do_relocation(struct btrfs_trans_handle *trans,
2793                          struct reloc_control *rc,
2794                          struct backref_node *node,
2795                          struct btrfs_key *key,
2796                          struct btrfs_path *path, int lowest)
2797 {
2798         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2799         struct backref_node *upper;
2800         struct backref_edge *edge;
2801         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2802         struct btrfs_root *root;
2803         struct extent_buffer *eb;
2804         u32 blocksize;
2805         u64 bytenr;
2806         u64 generation;
2807         int slot;
2808         int ret;
2809         int err = 0;
2810 
2811         BUG_ON(lowest && node->eb);
2812 
2813         path->lowest_level = node->level + 1;
2814         rc->backref_cache.path[node->level] = node;
2815         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2816                 struct btrfs_key first_key;
2817                 struct btrfs_ref ref = { 0 };
2818 
2819                 cond_resched();
2820 
2821                 upper = edge->node[UPPER];
2822                 root = select_reloc_root(trans, rc, upper, edges);
2823                 BUG_ON(!root);
2824 
2825                 if (upper->eb && !upper->locked) {
2826                         if (!lowest) {
2827                                 ret = btrfs_bin_search(upper->eb, key,
2828                                                        upper->level, &slot);
2829                                 if (ret < 0) {
2830                                         err = ret;
2831                                         goto next;
2832                                 }
2833                                 BUG_ON(ret);
2834                                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2835                                 if (node->eb->start == bytenr)
2836                                         goto next;
2837                         }
2838                         drop_node_buffer(upper);
2839                 }
2840 
2841                 if (!upper->eb) {
2842                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2843                         if (ret) {
2844                                 if (ret < 0)
2845                                         err = ret;
2846                                 else
2847                                         err = -ENOENT;
2848 
2849                                 btrfs_release_path(path);
2850                                 break;
2851                         }
2852 
2853                         if (!upper->eb) {
2854                                 upper->eb = path->nodes[upper->level];
2855                                 path->nodes[upper->level] = NULL;
2856                         } else {
2857                                 BUG_ON(upper->eb != path->nodes[upper->level]);
2858                         }
2859 
2860                         upper->locked = 1;
2861                         path->locks[upper->level] = 0;
2862 
2863                         slot = path->slots[upper->level];
2864                         btrfs_release_path(path);
2865                 } else {
2866                         ret = btrfs_bin_search(upper->eb, key, upper->level,
2867                                                &slot);
2868                         if (ret < 0) {
2869                                 err = ret;
2870                                 goto next;
2871                         }
2872                         BUG_ON(ret);
2873                 }
2874 
2875                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2876                 if (lowest) {
2877                         if (bytenr != node->bytenr) {
2878                                 btrfs_err(root->fs_info,
2879                 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2880                                           bytenr, node->bytenr, slot,
2881                                           upper->eb->start);
2882                                 err = -EIO;
2883                                 goto next;
2884                         }
2885                 } else {
2886                         if (node->eb->start == bytenr)
2887                                 goto next;
2888                 }
2889 
2890                 blocksize = root->fs_info->nodesize;
2891                 generation = btrfs_node_ptr_generation(upper->eb, slot);
2892                 btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2893                 eb = read_tree_block(fs_info, bytenr, generation,
2894                                      upper->level - 1, &first_key);
2895                 if (IS_ERR(eb)) {
2896                         err = PTR_ERR(eb);
2897                         goto next;
2898                 } else if (!extent_buffer_uptodate(eb)) {
2899                         free_extent_buffer(eb);
2900                         err = -EIO;
2901                         goto next;
2902                 }
2903                 btrfs_tree_lock(eb);
2904                 btrfs_set_lock_blocking_write(eb);
2905 
2906                 if (!node->eb) {
2907                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2908                                               slot, &eb);
2909                         btrfs_tree_unlock(eb);
2910                         free_extent_buffer(eb);
2911                         if (ret < 0) {
2912                                 err = ret;
2913                                 goto next;
2914                         }
2915                         BUG_ON(node->eb != eb);
2916                 } else {
2917                         btrfs_set_node_blockptr(upper->eb, slot,
2918                                                 node->eb->start);
2919                         btrfs_set_node_ptr_generation(upper->eb, slot,
2920                                                       trans->transid);
2921                         btrfs_mark_buffer_dirty(upper->eb);
2922 
2923                         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2924                                                node->eb->start, blocksize,
2925                                                upper->eb->start);
2926                         ref.real_root = root->root_key.objectid;
2927                         btrfs_init_tree_ref(&ref, node->level,
2928                                             btrfs_header_owner(upper->eb));
2929                         ret = btrfs_inc_extent_ref(trans, &ref);
2930                         BUG_ON(ret);
2931 
2932                         ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2933                         BUG_ON(ret);
2934                 }
2935 next:
2936                 if (!upper->pending)
2937                         drop_node_buffer(upper);
2938                 else
2939                         unlock_node_buffer(upper);
2940                 if (err)
2941                         break;
2942         }
2943 
2944         if (!err && node->pending) {
2945                 drop_node_buffer(node);
2946                 list_move_tail(&node->list, &rc->backref_cache.changed);
2947                 node->pending = 0;
2948         }
2949 
2950         path->lowest_level = 0;
2951         BUG_ON(err == -ENOSPC);
2952         return err;
2953 }
2954 
2955 static int link_to_upper(struct btrfs_trans_handle *trans,
2956                          struct reloc_control *rc,
2957                          struct backref_node *node,
2958                          struct btrfs_path *path)
2959 {
2960         struct btrfs_key key;
2961 
2962         btrfs_node_key_to_cpu(node->eb, &key, 0);
2963         return do_relocation(trans, rc, node, &key, path, 0);
2964 }
2965 
2966 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2967                                 struct reloc_control *rc,
2968                                 struct btrfs_path *path, int err)
2969 {
2970         LIST_HEAD(list);
2971         struct backref_cache *cache = &rc->backref_cache;
2972         struct backref_node *node;
2973         int level;
2974         int ret;
2975 
2976         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2977                 while (!list_empty(&cache->pending[level])) {
2978                         node = list_entry(cache->pending[level].next,
2979                                           struct backref_node, list);
2980                         list_move_tail(&node->list, &list);
2981                         BUG_ON(!node->pending);
2982 
2983                         if (!err) {
2984                                 ret = link_to_upper(trans, rc, node, path);
2985                                 if (ret < 0)
2986                                         err = ret;
2987                         }
2988                 }
2989                 list_splice_init(&list, &cache->pending[level]);
2990         }
2991         return err;
2992 }
2993 
2994 static void mark_block_processed(struct reloc_control *rc,
2995                                  u64 bytenr, u32 blocksize)
2996 {
2997         set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2998                         EXTENT_DIRTY);
2999 }
3000 
3001 static void __mark_block_processed(struct reloc_control *rc,
3002                                    struct backref_node *node)
3003 {
3004         u32 blocksize;
3005         if (node->level == 0 ||
3006             in_block_group(node->bytenr, rc->block_group)) {
3007                 blocksize = rc->extent_root->fs_info->nodesize;
3008                 mark_block_processed(rc, node->bytenr, blocksize);
3009         }
3010         node->processed = 1;
3011 }
3012 
3013 /*
3014  * mark a block and all blocks directly/indirectly reference the block
3015  * as processed.
3016  */
3017 static void update_processed_blocks(struct reloc_control *rc,
3018                                     struct backref_node *node)
3019 {
3020         struct backref_node *next = node;
3021         struct backref_edge *edge;
3022         struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
3023         int index = 0;
3024 
3025         while (next) {
3026                 cond_resched();
3027                 while (1) {
3028                         if (next->processed)
3029                                 break;
3030 
3031                         __mark_block_processed(rc, next);
3032 
3033                         if (list_empty(&next->upper))
3034                                 break;
3035 
3036                         edge = list_entry(next->upper.next,
3037                                           struct backref_edge, list[LOWER]);
3038                         edges[index++] = edge;
3039                         next = edge->node[UPPER];
3040                 }
3041                 next = walk_down_backref(edges, &index);
3042         }
3043 }
3044 
3045 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
3046 {
3047         u32 blocksize = rc->extent_root->fs_info->nodesize;
3048 
3049         if (test_range_bit(&rc->processed_blocks, bytenr,
3050                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
3051                 return 1;
3052         return 0;
3053 }
3054 
3055 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
3056                               struct tree_block *block)
3057 {
3058         struct extent_buffer *eb;
3059 
3060         BUG_ON(block->key_ready);
3061         eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
3062                              block->level, NULL);
3063         if (IS_ERR(eb)) {
3064                 return PTR_ERR(eb);
3065         } else if (!extent_buffer_uptodate(eb)) {
3066                 free_extent_buffer(eb);
3067                 return -EIO;
3068         }
3069         if (block->level == 0)
3070                 btrfs_item_key_to_cpu(eb, &block->key, 0);
3071         else
3072                 btrfs_node_key_to_cpu(eb, &block->key, 0);
3073         free_extent_buffer(eb);
3074         block->key_ready = 1;
3075         return 0;
3076 }
3077 
3078 /*
3079  * helper function to relocate a tree block
3080  */
3081 static int relocate_tree_block(struct btrfs_trans_handle *trans,
3082                                 struct reloc_control *rc,
3083                                 struct backref_node *node,
3084                                 struct btrfs_key *key,
3085                                 struct btrfs_path *path)
3086 {
3087         struct btrfs_root *root;
3088         int ret = 0;
3089 
3090         if (!node)
3091                 return 0;
3092 
3093         BUG_ON(node->processed);
3094         root = select_one_root(node);
3095         if (root == ERR_PTR(-ENOENT)) {
3096                 update_processed_blocks(rc, node);
3097                 goto out;
3098         }
3099 
3100         if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3101                 ret = reserve_metadata_space(trans, rc, node);
3102                 if (ret)
3103                         goto out;
3104         }
3105 
3106         if (root) {
3107                 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
3108                         BUG_ON(node->new_bytenr);
3109                         BUG_ON(!list_empty(&node->list));
3110                         btrfs_record_root_in_trans(trans, root);
3111                         root = root->reloc_root;
3112                         node->new_bytenr = root->node->start;
3113                         node->root = root;
3114                         list_add_tail(&node->list, &rc->backref_cache.changed);
3115                 } else {
3116                         path->lowest_level = node->level;
3117                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
3118                         btrfs_release_path(path);
3119                         if (ret > 0)
3120                                 ret = 0;
3121                 }
3122                 if (!ret)
3123                         update_processed_blocks(rc, node);
3124         } else {
3125                 ret = do_relocation(trans, rc, node, key, path, 1);
3126         }
3127 out:
3128         if (ret || node->level == 0 || node->cowonly)
3129                 remove_backref_node(&rc->backref_cache, node);
3130         return ret;
3131 }
3132 
3133 /*
3134  * relocate a list of blocks
3135  */
3136 static noinline_for_stack
3137 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3138                          struct reloc_control *rc, struct rb_root *blocks)
3139 {
3140         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3141         struct backref_node *node;
3142         struct btrfs_path *path;
3143         struct tree_block *block;
3144         struct tree_block *next;
3145         int ret;
3146         int err = 0;
3147 
3148         path = btrfs_alloc_path();
3149         if (!path) {
3150                 err = -ENOMEM;
3151                 goto out_free_blocks;
3152         }
3153 
3154         /* Kick in readahead for tree blocks with missing keys */
3155         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3156                 if (!block->key_ready)
3157                         readahead_tree_block(fs_info, block->bytenr);
3158         }
3159 
3160         /* Get first keys */
3161         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3162                 if (!block->key_ready) {
3163                         err = get_tree_block_key(fs_info, block);
3164                         if (err)
3165                                 goto out_free_path;
3166                 }
3167         }
3168 
3169         /* Do tree relocation */
3170         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
3171                 node = build_backref_tree(rc, &block->key,
3172                                           block->level, block->bytenr);
3173                 if (IS_ERR(node)) {
3174                         err = PTR_ERR(node);
3175                         goto out;
3176                 }
3177 
3178                 ret = relocate_tree_block(trans, rc, node, &block->key,
3179                                           path);
3180                 if (ret < 0) {
3181                         err = ret;
3182                         break;
3183                 }
3184         }
3185 out:
3186         err = finish_pending_nodes(trans, rc, path, err);
3187 
3188 out_free_path:
3189         btrfs_free_path(path);
3190 out_free_blocks:
3191         free_block_list(blocks);
3192         return err;
3193 }
3194 
3195 static noinline_for_stack
3196 int prealloc_file_extent_cluster(struct inode *inode,
3197                                  struct file_extent_cluster *cluster)
3198 {
3199         u64 alloc_hint = 0;
3200         u64 start;
3201         u64 end;
3202         u64 offset = BTRFS_I(inode)->index_cnt;
3203         u64 num_bytes;
3204         int nr = 0;
3205         int ret = 0;
3206         u64 prealloc_start = cluster->start - offset;
3207         u64 prealloc_end = cluster->end - offset;
3208         u64 cur_offset;
3209         struct extent_changeset *data_reserved = NULL;
3210 
3211         BUG_ON(cluster->start != cluster->boundary[0]);
3212         inode_lock(inode);
3213 
3214         ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3215                                           prealloc_end + 1 - prealloc_start);
3216         if (ret)
3217                 goto out;
3218 
3219         cur_offset = prealloc_start;
3220         while (nr < cluster->nr) {
3221                 start = cluster->boundary[nr] - offset;
3222                 if (nr + 1 < cluster->nr)
3223                         end = cluster->boundary[nr + 1] - 1 - offset;
3224                 else
3225                         end = cluster->end - offset;
3226 
3227                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3228                 num_bytes = end + 1 - start;
3229                 if (cur_offset < start)
3230                         btrfs_free_reserved_data_space(inode, data_reserved,
3231                                         cur_offset, start - cur_offset);
3232                 ret = btrfs_prealloc_file_range(inode, 0, start,
3233                                                 num_bytes, num_bytes,
3234                                                 end + 1, &alloc_hint);
3235                 cur_offset = end + 1;
3236                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3237                 if (ret)
3238                         break;
3239                 nr++;
3240         }
3241         if (cur_offset < prealloc_end)
3242                 btrfs_free_reserved_data_space(inode, data_reserved,
3243                                 cur_offset, prealloc_end + 1 - cur_offset);
3244 out:
3245         inode_unlock(inode);
3246         extent_changeset_free(data_reserved);
3247         return ret;
3248 }
3249 
3250 static noinline_for_stack
3251 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3252                          u64 block_start)
3253 {
3254         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3255         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3256         struct extent_map *em;
3257         int ret = 0;
3258 
3259         em = alloc_extent_map();
3260         if (!em)
3261                 return -ENOMEM;
3262 
3263         em->start = start;
3264         em->len = end + 1 - start;
3265         em->block_len = em->len;
3266         em->block_start = block_start;
3267         em->bdev = fs_info->fs_devices->latest_bdev;
3268         set_bit(EXTENT_FLAG_PINNED, &em->flags);
3269 
3270         lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3271         while (1) {
3272                 write_lock(&em_tree->lock);
3273                 ret = add_extent_mapping(em_tree, em, 0);
3274                 write_unlock(&em_tree->lock);
3275                 if (ret != -EEXIST) {
3276                         free_extent_map(em);
3277                         break;
3278                 }
3279                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3280         }
3281         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3282         return ret;
3283 }
3284 
3285 static int relocate_file_extent_cluster(struct inode *inode,
3286                                         struct file_extent_cluster *cluster)
3287 {
3288         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3289         u64 page_start;
3290         u64 page_end;
3291         u64 offset = BTRFS_I(inode)->index_cnt;
3292         unsigned long index;
3293         unsigned long last_index;
3294         struct page *page;
3295         struct file_ra_state *ra;
3296         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3297         int nr = 0;
3298         int ret = 0;
3299 
3300         if (!cluster->nr)
3301                 return 0;
3302 
3303         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3304         if (!ra)
3305                 return -ENOMEM;
3306 
3307         ret = prealloc_file_extent_cluster(inode, cluster);
3308         if (ret)
3309                 goto out;
3310 
3311         file_ra_state_init(ra, inode->i_mapping);
3312 
3313         ret = setup_extent_mapping(inode, cluster->start - offset,
3314                                    cluster->end - offset, cluster->start);
3315         if (ret)
3316                 goto out;
3317 
3318         index = (cluster->start - offset) >> PAGE_SHIFT;
3319         last_index = (cluster->end - offset) >> PAGE_SHIFT;
3320         while (index <= last_index) {
3321                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3322                                 PAGE_SIZE);
3323                 if (ret)
3324                         goto out;
3325 
3326                 page = find_lock_page(inode->i_mapping, index);
3327                 if (!page) {
3328                         page_cache_sync_readahead(inode->i_mapping,
3329                                                   ra, NULL, index,
3330                                                   last_index + 1 - index);
3331                         page = find_or_create_page(inode->i_mapping, index,
3332                                                    mask);
3333                         if (!page) {
3334                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3335                                                         PAGE_SIZE, true);
3336                                 btrfs_delalloc_release_extents(BTRFS_I(inode),
3337                                                         PAGE_SIZE);
3338                                 ret = -ENOMEM;
3339                                 goto out;
3340                         }
3341                 }
3342 
3343                 if (PageReadahead(page)) {
3344                         page_cache_async_readahead(inode->i_mapping,
3345                                                    ra, NULL, page, index,
3346                                                    last_index + 1 - index);
3347                 }
3348 
3349                 if (!PageUptodate(page)) {
3350                         btrfs_readpage(NULL, page);
3351                         lock_page(page);
3352                         if (!PageUptodate(page)) {
3353                                 unlock_page(page);
3354                                 put_page(page);
3355                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
3356                                                         PAGE_SIZE, true);
3357                                 btrfs_delalloc_release_extents(BTRFS_I(inode),
3358                                                                PAGE_SIZE);
3359                                 ret = -EIO;
3360                                 goto out;
3361                         }
3362                 }
3363 
3364                 page_start = page_offset(page);
3365                 page_end = page_start + PAGE_SIZE - 1;
3366 
3367                 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3368 
3369                 set_page_extent_mapped(page);
3370 
3371                 if (nr < cluster->nr &&
3372                     page_start + offset == cluster->boundary[nr]) {
3373                         set_extent_bits(&BTRFS_I(inode)->io_tree,
3374                                         page_start, page_end,
3375                                         EXTENT_BOUNDARY);
3376                         nr++;
3377                 }
3378 
3379                 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
3380                                                 NULL);
3381                 if (ret) {
3382                         unlock_page(page);
3383                         put_page(page);
3384                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
3385                                                          PAGE_SIZE, true);
3386                         btrfs_delalloc_release_extents(BTRFS_I(inode),
3387                                                        PAGE_SIZE);
3388 
3389                         clear_extent_bits(&BTRFS_I(inode)->io_tree,
3390                                           page_start, page_end,
3391                                           EXTENT_LOCKED | EXTENT_BOUNDARY);
3392                         goto out;
3393 
3394                 }
3395                 set_page_dirty(page);
3396 
3397                 unlock_extent(&BTRFS_I(inode)->io_tree,
3398                               page_start, page_end);
3399                 unlock_page(page);
3400                 put_page(page);
3401 
3402                 index++;
3403                 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
3404                 balance_dirty_pages_ratelimited(inode->i_mapping);
3405                 btrfs_throttle(fs_info);
3406         }
3407         WARN_ON(nr != cluster->nr);
3408 out:
3409         kfree(ra);
3410         return ret;
3411 }
3412 
3413 static noinline_for_stack
3414 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3415                          struct file_extent_cluster *cluster)
3416 {
3417         int ret;
3418 
3419         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3420                 ret = relocate_file_extent_cluster(inode, cluster);
3421                 if (ret)
3422                         return ret;
3423                 cluster->nr = 0;
3424         }
3425 
3426         if (!cluster->nr)
3427                 cluster->start = extent_key->objectid;
3428         else
3429                 BUG_ON(cluster->nr >= MAX_EXTENTS);
3430         cluster->end = extent_key->objectid + extent_key->offset - 1;
3431         cluster->boundary[cluster->nr] = extent_key->objectid;
3432         cluster->nr++;
3433 
3434         if (cluster->nr >= MAX_EXTENTS) {
3435                 ret = relocate_file_extent_cluster(inode, cluster);
3436                 if (ret)
3437                         return ret;
3438                 cluster->nr = 0;
3439         }
3440         return 0;
3441 }
3442 
3443 /*
3444  * helper to add a tree block to the list.
3445  * the major work is getting the generation and level of the block
3446  */
3447 static int add_tree_block(struct reloc_control *rc,
3448                           struct btrfs_key *extent_key,
3449                           struct btrfs_path *path,
3450                           struct rb_root *blocks)
3451 {
3452         struct extent_buffer *eb;
3453         struct btrfs_extent_item *ei;
3454         struct btrfs_tree_block_info *bi;
3455         struct tree_block *block;
3456         struct rb_node *rb_node;
3457         u32 item_size;
3458         int level = -1;
3459         u64 generation;
3460 
3461         eb =  path->nodes[0];
3462         item_size = btrfs_item_size_nr(eb, path->slots[0]);
3463 
3464         if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3465             item_size >= sizeof(*ei) + sizeof(*bi)) {
3466                 ei = btrfs_item_ptr(eb, path->slots[0],
3467                                 struct btrfs_extent_item);
3468                 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3469                         bi = (struct btrfs_tree_block_info *)(ei + 1);
3470                         level = btrfs_tree_block_level(eb, bi);
3471                 } else {
3472                         level = (int)extent_key->offset;
3473                 }
3474                 generation = btrfs_extent_generation(eb, ei);
3475         } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3476                 btrfs_print_v0_err(eb->fs_info);
3477                 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3478                 return -EINVAL;
3479         } else {
3480                 BUG();
3481         }
3482 
3483         btrfs_release_path(path);
3484 
3485         BUG_ON(level == -1);
3486 
3487         block = kmalloc(sizeof(*block), GFP_NOFS);
3488         if (!block)
3489                 return -ENOMEM;
3490 
3491         block->bytenr = extent_key->objectid;
3492         block->key.objectid = rc->extent_root->fs_info->nodesize;
3493         block->key.offset = generation;
3494         block->level = level;
3495         block->key_ready = 0;
3496 
3497         rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3498         if (rb_node)
3499                 backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3500 
3501         return 0;
3502 }
3503 
3504 /*
3505  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3506  */
3507 static int __add_tree_block(struct reloc_control *rc,
3508                             u64 bytenr, u32 blocksize,
3509                             struct rb_root *blocks)
3510 {
3511         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3512         struct btrfs_path *path;
3513         struct btrfs_key key;
3514         int ret;
3515         bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3516 
3517         if (tree_block_processed(bytenr, rc))
3518                 return 0;
3519 
3520         if (tree_search(blocks, bytenr))
3521                 return 0;
3522 
3523         path = btrfs_alloc_path();
3524         if (!path)
3525                 return -ENOMEM;
3526 again:
3527         key.objectid = bytenr;
3528         if (skinny) {
3529                 key.type = BTRFS_METADATA_ITEM_KEY;
3530                 key.offset = (u64)-1;
3531         } else {
3532                 key.type = BTRFS_EXTENT_ITEM_KEY;
3533                 key.offset = blocksize;
3534         }
3535 
3536         path->search_commit_root = 1;
3537         path->skip_locking = 1;
3538         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3539         if (ret < 0)
3540                 goto out;
3541 
3542         if (ret > 0 && skinny) {
3543                 if (path->slots[0]) {
3544                         path->slots[0]--;
3545                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3546                                               path->slots[0]);
3547                         if (key.objectid == bytenr &&
3548                             (key.type == BTRFS_METADATA_ITEM_KEY ||
3549                              (key.type == BTRFS_EXTENT_ITEM_KEY &&
3550                               key.offset == blocksize)))
3551                                 ret = 0;
3552                 }
3553 
3554                 if (ret) {
3555                         skinny = false;
3556                         btrfs_release_path(path);
3557                         goto again;
3558                 }
3559         }
3560         if (ret) {
3561                 ASSERT(ret == 1);
3562                 btrfs_print_leaf(path->nodes[0]);
3563                 btrfs_err(fs_info,
3564              "tree block extent item (%llu) is not found in extent tree",
3565                      bytenr);
3566                 WARN_ON(1);
3567                 ret = -EINVAL;
3568                 goto out;
3569         }
3570 
3571         ret = add_tree_block(rc, &key, path, blocks);
3572 out:
3573         btrfs_free_path(path);
3574         return ret;
3575 }
3576 
3577 /*
3578  * helper to check if the block use full backrefs for pointers in it
3579  */
3580 static int block_use_full_backref(struct reloc_control *rc,
3581                                   struct extent_buffer *eb)
3582 {
3583         u64 flags;
3584         int ret;
3585 
3586         if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3587             btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3588                 return 1;
3589 
3590         ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3591                                        eb->start, btrfs_header_level(eb), 1,
3592                                        NULL, &flags);
3593         BUG_ON(ret);
3594 
3595         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3596                 ret = 1;
3597         else
3598                 ret = 0;
3599         return ret;
3600 }
3601 
3602 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3603                                     struct btrfs_block_group_cache *block_group,
3604                                     struct inode *inode,
3605                                     u64 ino)
3606 {
3607         struct btrfs_key key;
3608         struct btrfs_root *root = fs_info->tree_root;
3609         struct btrfs_trans_handle *trans;
3610         int ret = 0;
3611 
3612         if (inode)
3613                 goto truncate;
3614 
3615         key.objectid = ino;
3616         key.type = BTRFS_INODE_ITEM_KEY;
3617         key.offset = 0;
3618 
3619         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3620         if (IS_ERR(inode))
3621                 return -ENOENT;
3622 
3623 truncate:
3624         ret = btrfs_check_trunc_cache_free_space(fs_info,
3625                                                  &fs_info->global_block_rsv);
3626         if (ret)
3627                 goto out;
3628 
3629         trans = btrfs_join_transaction(root);
3630         if (IS_ERR(trans)) {
3631                 ret = PTR_ERR(trans);
3632                 goto out;
3633         }
3634 
3635         ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3636 
3637         btrfs_end_transaction(trans);
3638         btrfs_btree_balance_dirty(fs_info);
3639 out:
3640         iput(inode);
3641         return ret;
3642 }
3643 
3644 /*
3645  * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3646  * this function scans fs tree to find blocks reference the data extent
3647  */
3648 static int find_data_references(struct reloc_control *rc,
3649                                 struct btrfs_key *extent_key,
3650                                 struct extent_buffer *leaf,
3651                                 struct btrfs_extent_data_ref *ref,
3652                                 struct rb_root *blocks)
3653 {
3654         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3655         struct btrfs_path *path;
3656         struct tree_block *block;
3657         struct btrfs_root *root;
3658         struct btrfs_file_extent_item *fi;
3659         struct rb_node *rb_node;
3660         struct btrfs_key key;
3661         u64 ref_root;
3662         u64 ref_objectid;
3663         u64 ref_offset;
3664         u32 ref_count;
3665         u32 nritems;
3666         int err = 0;
3667         int added = 0;
3668         int counted;
3669         int ret;
3670 
3671         ref_root = btrfs_extent_data_ref_root(leaf, ref);
3672         ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3673         ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3674         ref_count = btrfs_extent_data_ref_count(leaf, ref);
3675 
3676         /*
3677          * This is an extent belonging to the free space cache, lets just delete
3678          * it and redo the search.
3679          */
3680         if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3681                 ret = delete_block_group_cache(fs_info, rc->block_group,
3682                                                NULL, ref_objectid);
3683                 if (ret != -ENOENT)
3684                         return ret;
3685                 ret = 0;
3686         }
3687 
3688         path = btrfs_alloc_path();
3689         if (!path)
3690                 return -ENOMEM;
3691         path->reada = READA_FORWARD;
3692 
3693         root = read_fs_root(fs_info, ref_root);
3694         if (IS_ERR(root)) {
3695                 err = PTR_ERR(root);
3696                 goto out;
3697         }
3698 
3699         key.objectid = ref_objectid;
3700         key.type = BTRFS_EXTENT_DATA_KEY;
3701         if (ref_offset > ((u64)-1 << 32))
3702                 key.offset = 0;
3703         else
3704                 key.offset = ref_offset;
3705 
3706         path->search_commit_root = 1;
3707         path->skip_locking = 1;
3708         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3709         if (ret < 0) {
3710                 err = ret;
3711                 goto out;
3712         }
3713 
3714         leaf = path->nodes[0];
3715         nritems = btrfs_header_nritems(leaf);
3716         /*
3717          * the references in tree blocks that use full backrefs
3718          * are not counted in
3719          */
3720         if (block_use_full_backref(rc, leaf))
3721                 counted = 0;
3722         else
3723                 counted = 1;
3724         rb_node = tree_search(blocks, leaf->start);
3725         if (rb_node) {
3726                 if (counted)
3727                         added = 1;
3728                 else
3729                         path->slots[0] = nritems;
3730         }
3731 
3732         while (ref_count > 0) {
3733                 while (path->slots[0] >= nritems) {
3734                         ret = btrfs_next_leaf(root, path);
3735                         if (ret < 0) {
3736                                 err = ret;
3737                                 goto out;
3738                         }
3739                         if (WARN_ON(ret > 0))
3740                                 goto out;
3741 
3742                         leaf = path->nodes[0];
3743                         nritems = btrfs_header_nritems(leaf);
3744                         added = 0;
3745 
3746                         if (block_use_full_backref(rc, leaf))
3747                                 counted = 0;
3748                         else
3749                                 counted = 1;
3750                         rb_node = tree_search(blocks, leaf->start);
3751                         if (rb_node) {
3752                                 if (counted)
3753                                         added = 1;
3754                                 else
3755                                         path->slots[0] = nritems;
3756                         }
3757                 }
3758 
3759                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3760                 if (WARN_ON(key.objectid != ref_objectid ||
3761                     key.type != BTRFS_EXTENT_DATA_KEY))
3762                         break;
3763 
3764                 fi = btrfs_item_ptr(leaf, path->slots[0],
3765                                     struct btrfs_file_extent_item);
3766 
3767                 if (btrfs_file_extent_type(leaf, fi) ==
3768                     BTRFS_FILE_EXTENT_INLINE)
3769                         goto next;
3770 
3771                 if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3772                     extent_key->objectid)
3773                         goto next;
3774 
3775                 key.offset -= btrfs_file_extent_offset(leaf, fi);
3776                 if (key.offset != ref_offset)
3777                         goto next;
3778 
3779                 if (counted)
3780                         ref_count--;
3781                 if (added)
3782                         goto next;
3783 
3784                 if (!tree_block_processed(leaf->start, rc)) {
3785                         block = kmalloc(sizeof(*block), GFP_NOFS);
3786                         if (!block) {
3787                                 err = -ENOMEM;
3788                                 break;
3789                         }
3790                         block->bytenr = leaf->start;
3791                         btrfs_item_key_to_cpu(leaf, &block->key, 0);
3792                         block->level = 0;
3793                         block->key_ready = 1;
3794                         rb_node = tree_insert(blocks, block->bytenr,
3795                                               &block->rb_node);
3796                         if (rb_node)
3797                                 backref_tree_panic(rb_node, -EEXIST,
3798                                                    block->bytenr);
3799                 }
3800                 if (counted)
3801                         added = 1;
3802                 else
3803                         path->slots[0] = nritems;
3804 next:
3805                 path->slots[0]++;
3806 
3807         }
3808 out:
3809         btrfs_free_path(path);
3810         return err;
3811 }
3812 
3813 /*
3814  * helper to find all tree blocks that reference a given data extent
3815  */
3816 static noinline_for_stack
3817 int add_data_references(struct reloc_control *rc,
3818                         struct btrfs_key *extent_key,
3819                         struct btrfs_path *path,
3820                         struct rb_root *blocks)
3821 {
3822         struct btrfs_key key;
3823         struct extent_buffer *eb;
3824         struct btrfs_extent_data_ref *dref;
3825         struct btrfs_extent_inline_ref *iref;
3826         unsigned long ptr;
3827         unsigned long end;
3828         u32 blocksize = rc->extent_root->fs_info->nodesize;
3829         int ret = 0;
3830         int err = 0;
3831 
3832         eb = path->nodes[0];
3833         ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3834         end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3835         ptr += sizeof(struct btrfs_extent_item);
3836 
3837         while (ptr < end) {
3838                 iref = (struct btrfs_extent_inline_ref *)ptr;
3839                 key.type = btrfs_get_extent_inline_ref_type(eb, iref,
3840                                                         BTRFS_REF_TYPE_DATA);
3841                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3842                         key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3843                         ret = __add_tree_block(rc, key.offset, blocksize,
3844                                                blocks);
3845                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3846                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3847                         ret = find_data_references(rc, extent_key,
3848                                                    eb, dref, blocks);
3849                 } else {
3850                         ret = -EUCLEAN;
3851                         btrfs_err(rc->extent_root->fs_info,
3852                      "extent %llu slot %d has an invalid inline ref type",
3853                              eb->start, path->slots[0]);
3854                 }
3855                 if (ret) {
3856                         err = ret;
3857                         goto out;
3858                 }
3859                 ptr += btrfs_extent_inline_ref_size(key.type);
3860         }
3861         WARN_ON(ptr > end);
3862 
3863         while (1) {
3864                 cond_resched();
3865                 eb = path->nodes[0];
3866                 if (path->slots[0] >= btrfs_header_nritems(eb)) {
3867                         ret = btrfs_next_leaf(rc->extent_root, path);
3868                         if (ret < 0) {
3869                                 err = ret;
3870                                 break;
3871                         }
3872                         if (ret > 0)
3873                                 break;
3874                         eb = path->nodes[0];
3875                 }
3876 
3877                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3878                 if (key.objectid != extent_key->objectid)
3879                         break;
3880 
3881                 if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3882                         ret = __add_tree_block(rc, key.offset, blocksize,
3883                                                blocks);
3884                 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3885                         dref = btrfs_item_ptr(eb, path->slots[0],
3886                                               struct btrfs_extent_data_ref);
3887                         ret = find_data_references(rc, extent_key,
3888                                                    eb, dref, blocks);
3889                 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
3890                         btrfs_print_v0_err(eb->fs_info);
3891                         btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
3892                         ret = -EINVAL;
3893                 } else {
3894                         ret = 0;
3895                 }
3896                 if (ret) {
3897                         err = ret;
3898                         break;
3899                 }
3900                 path->slots[0]++;
3901         }
3902 out:
3903         btrfs_release_path(path);
3904         if (err)
3905                 free_block_list(blocks);
3906         return err;
3907 }
3908 
3909 /*
3910  * helper to find next unprocessed extent
3911  */
3912 static noinline_for_stack
3913 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3914                      struct btrfs_key *extent_key)
3915 {
3916         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3917         struct btrfs_key key;
3918         struct extent_buffer *leaf;
3919         u64 start, end, last;
3920         int ret;
3921 
3922         last = rc->block_group->key.objectid + rc->block_group->key.offset;
3923         while (1) {
3924                 cond_resched();
3925                 if (rc->search_start >= last) {
3926                         ret = 1;
3927                         break;
3928                 }
3929 
3930                 key.objectid = rc->search_start;
3931                 key.type = BTRFS_EXTENT_ITEM_KEY;
3932                 key.offset = 0;
3933 
3934                 path->search_commit_root = 1;
3935                 path->skip_locking = 1;
3936                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3937                                         0, 0);
3938                 if (ret < 0)
3939                         break;
3940 next:
3941                 leaf = path->nodes[0];
3942                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3943                         ret = btrfs_next_leaf(rc->extent_root, path);
3944                         if (ret != 0)
3945                                 break;
3946                         leaf = path->nodes[0];
3947                 }
3948 
3949                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3950                 if (key.objectid >= last) {
3951                         ret = 1;
3952                         break;
3953                 }
3954 
3955                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3956                     key.type != BTRFS_METADATA_ITEM_KEY) {
3957                         path->slots[0]++;
3958                         goto next;
3959                 }
3960 
3961                 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3962                     key.objectid + key.offset <= rc->search_start) {
3963                         path->slots[0]++;
3964                         goto next;
3965                 }
3966 
3967                 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3968                     key.objectid + fs_info->nodesize <=
3969                     rc->search_start) {
3970                         path->slots[0]++;
3971                         goto next;
3972                 }
3973 
3974                 ret = find_first_extent_bit(&rc->processed_blocks,
3975                                             key.objectid, &start, &end,
3976                                             EXTENT_DIRTY, NULL);
3977 
3978                 if (ret == 0 && start <= key.objectid) {
3979                         btrfs_release_path(path);
3980                         rc->search_start = end + 1;
3981                 } else {
3982                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
3983                                 rc->search_start = key.objectid + key.offset;
3984                         else
3985                                 rc->search_start = key.objectid +
3986                                         fs_info->nodesize;
3987                         memcpy(extent_key, &key, sizeof(key));
3988                         return 0;
3989                 }
3990         }
3991         btrfs_release_path(path);
3992         return ret;
3993 }
3994 
3995 static void set_reloc_control(struct reloc_control *rc)
3996 {
3997         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3998 
3999         mutex_lock(&fs_info->reloc_mutex);
4000         fs_info->reloc_ctl = rc;
4001         mutex_unlock(&fs_info->reloc_mutex);
4002 }
4003 
4004 static void unset_reloc_control(struct reloc_control *rc)
4005 {
4006         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
4007 
4008         mutex_lock(&fs_info->reloc_mutex);
4009         fs_info->reloc_ctl = NULL;
4010         mutex_unlock(&fs_info->reloc_mutex);
4011 }
4012 
4013 static int check_extent_flags(u64 flags)
4014 {
4015         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
4016             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
4017                 return 1;
4018         if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
4019             !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
4020                 return 1;
4021         if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
4022             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4023                 return 1;
4024         return 0;
4025 }
4026 
4027 static noinline_for_stack
4028 int prepare_to_relocate(struct reloc_control *rc)
4029 {
4030         struct btrfs_trans_handle *trans;
4031         int ret;
4032 
4033         rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
4034                                               BTRFS_BLOCK_RSV_TEMP);
4035         if (!rc->block_rsv)
4036                 return -ENOMEM;
4037 
4038         memset(&rc->cluster, 0, sizeof(rc->cluster));
4039         rc->search_start = rc->block_group->key.objectid;
4040         rc->extents_found = 0;
4041         rc->nodes_relocated = 0;
4042         rc->merging_rsv_size = 0;
4043         rc->reserved_bytes = 0;
4044         rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
4045                               RELOCATION_RESERVED_NODES;
4046         ret = btrfs_block_rsv_refill(rc->extent_root,
4047                                      rc->block_rsv, rc->block_rsv->size,
4048                                      BTRFS_RESERVE_FLUSH_ALL);
4049         if (ret)
4050                 return ret;
4051 
4052         rc->create_reloc_tree = 1;
4053         set_reloc_control(rc);
4054 
4055         trans = btrfs_join_transaction(rc->extent_root);
4056         if (IS_ERR(trans)) {
4057                 unset_reloc_control(rc);
4058                 /*
4059                  * extent tree is not a ref_cow tree and has no reloc_root to
4060                  * cleanup.  And callers are responsible to free the above
4061                  * block rsv.
4062                  */
4063                 return PTR_ERR(trans);
4064         }
4065         btrfs_commit_transaction(trans);
4066         return 0;
4067 }
4068 
4069 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
4070 {
4071         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
4072         struct rb_root blocks = RB_ROOT;
4073         struct btrfs_key key;
4074         struct btrfs_trans_handle *trans = NULL;
4075         struct btrfs_path *path;
4076         struct btrfs_extent_item *ei;
4077         u64 flags;
4078         u32 item_size;
4079         int ret;
4080         int err = 0;
4081         int progress = 0;
4082 
4083         path = btrfs_alloc_path();
4084         if (!path)
4085                 return -ENOMEM;
4086         path->reada = READA_FORWARD;
4087 
4088         ret = prepare_to_relocate(rc);
4089         if (ret) {
4090                 err = ret;
4091                 goto out_free;
4092         }
4093 
4094         while (1) {
4095                 rc->reserved_bytes = 0;
4096                 ret = btrfs_block_rsv_refill(rc->extent_root,
4097                                         rc->block_rsv, rc->block_rsv->size,
4098                                         BTRFS_RESERVE_FLUSH_ALL);
4099                 if (ret) {
4100                         err = ret;
4101                         break;
4102                 }
4103                 progress++;
4104                 trans = btrfs_start_transaction(rc->extent_root, 0);
4105                 if (IS_ERR(trans)) {
4106                         err = PTR_ERR(trans);
4107                         trans = NULL;
4108                         break;
4109                 }
4110 restart:
4111                 if (update_backref_cache(trans, &rc->backref_cache)) {
4112                         btrfs_end_transaction(trans);
4113                         trans = NULL;
4114                         continue;
4115                 }
4116 
4117                 ret = find_next_extent(rc, path, &key);
4118                 if (ret < 0)
4119                         err = ret;
4120                 if (ret != 0)
4121                         break;
4122 
4123                 rc->extents_found++;
4124 
4125                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4126                                     struct btrfs_extent_item);
4127                 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4128                 if (item_size >= sizeof(*ei)) {
4129                         flags = btrfs_extent_flags(path->nodes[0], ei);
4130                         ret = check_extent_flags(flags);
4131                         BUG_ON(ret);
4132                 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
4133                         err = -EINVAL;
4134                         btrfs_print_v0_err(trans->fs_info);
4135                         btrfs_abort_transaction(trans, err);
4136                         break;
4137                 } else {
4138                         BUG();
4139                 }
4140 
4141                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4142                         ret = add_tree_block(rc, &key, path, &blocks);
4143                 } else if (rc->stage == UPDATE_DATA_PTRS &&
4144                            (flags & BTRFS_EXTENT_FLAG_DATA)) {
4145                         ret = add_data_references(rc, &key, path, &blocks);
4146                 } else {
4147                         btrfs_release_path(path);
4148                         ret = 0;
4149                 }
4150                 if (ret < 0) {
4151                         err = ret;
4152                         break;
4153                 }
4154 
4155                 if (!RB_EMPTY_ROOT(&blocks)) {
4156                         ret = relocate_tree_blocks(trans, rc, &blocks);
4157                         if (ret < 0) {
4158                                 if (ret != -EAGAIN) {
4159                                         err = ret;
4160                                         break;
4161                                 }
4162                                 rc->extents_found--;
4163                                 rc->search_start = key.objectid;
4164                         }
4165                 }
4166 
4167                 btrfs_end_transaction_throttle(trans);
4168                 btrfs_btree_balance_dirty(fs_info);
4169                 trans = NULL;
4170 
4171                 if (rc->stage == MOVE_DATA_EXTENTS &&
4172                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
4173                         rc->found_file_extent = 1;
4174                         ret = relocate_data_extent(rc->data_inode,
4175                                                    &key, &rc->cluster);
4176                         if (ret < 0) {
4177                                 err = ret;
4178                                 break;
4179                         }
4180                 }
4181         }
4182         if (trans && progress && err == -ENOSPC) {
4183                 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
4184                 if (ret == 1) {
4185                         err = 0;
4186                         progress = 0;
4187                         goto restart;
4188                 }
4189         }
4190 
4191         btrfs_release_path(path);
4192         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4193 
4194         if (trans) {
4195                 btrfs_end_transaction_throttle(trans);
4196                 btrfs_btree_balance_dirty(fs_info);
4197         }
4198 
4199         if (!err) {
4200                 ret = relocate_file_extent_cluster(rc->data_inode,
4201                                                    &rc->cluster);
4202                 if (ret < 0)
4203                         err = ret;
4204         }
4205 
4206         rc->create_reloc_tree = 0;
4207         set_reloc_control(rc);
4208 
4209         backref_cache_cleanup(&rc->backref_cache);
4210         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4211 
4212         err = prepare_to_merge(rc, err);
4213 
4214         merge_reloc_roots(rc);
4215 
4216         rc->merge_reloc_tree = 0;
4217         unset_reloc_control(rc);
4218         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4219 
4220         /* get rid of pinned extents */
4221         trans = btrfs_join_transaction(rc->extent_root);
4222         if (IS_ERR(trans)) {
4223                 err = PTR_ERR(trans);
4224                 goto out_free;
4225         }
4226         btrfs_commit_transaction(trans);
4227 out_free:
4228         ret = clean_dirty_subvols(rc);
4229         if (ret < 0 && !err)
4230                 err = ret;
4231         btrfs_free_block_rsv(fs_info, rc->block_rsv);
4232         btrfs_free_path(path);
4233         return err;
4234 }
4235 
4236 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4237                                  struct btrfs_root *root, u64 objectid)
4238 {
4239         struct btrfs_path *path;
4240         struct btrfs_inode_item *item;
4241         struct extent_buffer *leaf;
4242         int ret;
4243 
4244         path = btrfs_alloc_path();
4245         if (!path)
4246                 return -ENOMEM;
4247 
4248         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4249         if (ret)
4250                 goto out;
4251 
4252         leaf = path->nodes[0];
4253         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4254         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4255         btrfs_set_inode_generation(leaf, item, 1);
4256         btrfs_set_inode_size(leaf, item, 0);
4257         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4258         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4259                                           BTRFS_INODE_PREALLOC);
4260         btrfs_mark_buffer_dirty(leaf);
4261 out:
4262         btrfs_free_path(path);
4263         return ret;
4264 }
4265 
4266 /*
4267  * helper to create inode for data relocation.
4268  * the inode is in data relocation tree and its link count is 0
4269  */
4270 static noinline_for_stack
4271 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4272                                  struct btrfs_block_group_cache *group)
4273 {
4274         struct inode *inode = NULL;
4275         struct btrfs_trans_handle *trans;
4276         struct btrfs_root *root;
4277         struct btrfs_key key;
4278         u64 objectid;
4279         int err = 0;
4280 
4281         root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4282         if (IS_ERR(root))
4283                 return ERR_CAST(root);
4284 
4285         trans = btrfs_start_transaction(root, 6);
4286         if (IS_ERR(trans))
4287                 return ERR_CAST(trans);
4288 
4289         err = btrfs_find_free_objectid(root, &objectid);
4290         if (err)
4291                 goto out;
4292 
4293         err = __insert_orphan_inode(trans, root, objectid);
4294         BUG_ON(err);
4295 
4296         key.objectid = objectid;
4297         key.type = BTRFS_INODE_ITEM_KEY;
4298         key.offset = 0;
4299         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4300         BUG_ON(IS_ERR(inode));
4301         BTRFS_I(inode)->index_cnt = group->key.objectid;
4302 
4303         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4304 out:
4305         btrfs_end_transaction(trans);
4306         btrfs_btree_balance_dirty(fs_info);
4307         if (err) {
4308                 if (inode)
4309                         iput(inode);
4310                 inode = ERR_PTR(err);
4311         }
4312         return inode;
4313 }
4314 
4315 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4316 {
4317         struct reloc_control *rc;
4318 
4319         rc = kzalloc(sizeof(*rc), GFP_NOFS);
4320         if (!rc)
4321                 return NULL;
4322 
4323         INIT_LIST_HEAD(&rc->reloc_roots);
4324         INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4325         backref_cache_init(&rc->backref_cache);
4326         mapping_tree_init(&rc->reloc_root_tree);
4327         extent_io_tree_init(fs_info, &rc->processed_blocks,
4328                             IO_TREE_RELOC_BLOCKS, NULL);
4329         return rc;
4330 }
4331 
4332 /*
4333  * Print the block group being relocated
4334  */
4335 static void describe_relocation(struct btrfs_fs_info *fs_info,
4336                                 struct btrfs_block_group_cache *block_group)
4337 {
4338         char buf[128] = {'\0'};
4339 
4340         btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4341 
4342         btrfs_info(fs_info,
4343                    "relocating block group %llu flags %s",
4344                    block_group->key.objectid, buf);
4345 }
4346 
4347 /*
4348  * function to relocate all extents in a block group.
4349  */
4350 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4351 {
4352         struct btrfs_block_group_cache *bg;
4353         struct btrfs_root *extent_root = fs_info->extent_root;
4354         struct reloc_control *rc;
4355         struct inode *inode;
4356         struct btrfs_path *path;
4357         int ret;
4358         int rw = 0;
4359         int err = 0;
4360 
4361         bg = btrfs_lookup_block_group(fs_info, group_start);
4362         if (!bg)
4363                 return -ENOENT;
4364 
4365         if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4366                 btrfs_put_block_group(bg);
4367                 return -ETXTBSY;
4368         }
4369 
4370         rc = alloc_reloc_control(fs_info);
4371         if (!rc) {
4372                 btrfs_put_block_group(bg);
4373                 return -ENOMEM;
4374         }
4375 
4376         rc->extent_root = extent_root;
4377         rc->block_group = bg;
4378 
4379         ret = btrfs_inc_block_group_ro(rc->block_group);
4380         if (ret) {
4381                 err = ret;
4382                 goto out;
4383         }
4384         rw = 1;
4385 
4386         path = btrfs_alloc_path();
4387         if (!path) {
4388                 err = -ENOMEM;
4389                 goto out;
4390         }
4391 
4392         inode = lookup_free_space_inode(rc->block_group, path);
4393         btrfs_free_path(path);
4394 
4395         if (!IS_ERR(inode))
4396                 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4397         else
4398                 ret = PTR_ERR(inode);
4399 
4400         if (ret && ret != -ENOENT) {
4401                 err = ret;
4402                 goto out;
4403         }
4404 
4405         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4406         if (IS_ERR(rc->data_inode)) {
4407                 err = PTR_ERR(rc->data_inode);
4408                 rc->data_inode = NULL;
4409                 goto out;
4410         }
4411 
4412         describe_relocation(fs_info, rc->block_group);
4413 
4414         btrfs_wait_block_group_reservations(rc->block_group);
4415         btrfs_wait_nocow_writers(rc->block_group);
4416         btrfs_wait_ordered_roots(fs_info, U64_MAX,
4417                                  rc->block_group->key.objectid,
4418                                  rc->block_group->key.offset);
4419 
4420         while (1) {
4421                 mutex_lock(&fs_info->cleaner_mutex);
4422                 ret = relocate_block_group(rc);
4423                 mutex_unlock(&fs_info->cleaner_mutex);
4424                 if (ret < 0)
4425                         err = ret;
4426 
4427                 /*
4428                  * We may have gotten ENOSPC after we already dirtied some
4429                  * extents.  If writeout happens while we're relocating a
4430                  * different block group we could end up hitting the
4431                  * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4432                  * btrfs_reloc_cow_block.  Make sure we write everything out
4433                  * properly so we don't trip over this problem, and then break
4434                  * out of the loop if we hit an error.
4435                  */
4436                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4437                         ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4438                                                        (u64)-1);
4439                         if (ret)
4440                                 err = ret;
4441                         invalidate_mapping_pages(rc->data_inode->i_mapping,
4442                                                  0, -1);
4443                         rc->stage = UPDATE_DATA_PTRS;
4444                 }
4445 
4446                 if (err < 0)
4447                         goto out;
4448 
4449                 if (rc->extents_found == 0)
4450                         break;
4451 
4452                 btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4453 
4454         }
4455 
4456         WARN_ON(rc->block_group->pinned > 0);
4457         WARN_ON(rc->block_group->reserved > 0);
4458         WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4459 out:
4460         if (err && rw)
4461                 btrfs_dec_block_group_ro(rc->block_group);
4462         iput(rc->data_inode);
4463         btrfs_put_block_group(rc->block_group);
4464         kfree(rc);
4465         return err;
4466 }
4467 
4468 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4469 {
4470         struct btrfs_fs_info *fs_info = root->fs_info;
4471         struct btrfs_trans_handle *trans;
4472         int ret, err;
4473 
4474         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4475         if (IS_ERR(trans))
4476                 return PTR_ERR(trans);
4477 
4478         memset(&root->root_item.drop_progress, 0,
4479                 sizeof(root->root_item.drop_progress));
4480         root->root_item.drop_level = 0;
4481         btrfs_set_root_refs(&root->root_item, 0);
4482         ret = btrfs_update_root(trans, fs_info->tree_root,
4483                                 &root->root_key, &root->root_item);
4484 
4485         err = btrfs_end_transaction(trans);
4486         if (err)
4487                 return err;
4488         return ret;
4489 }
4490 
4491 /*
4492  * recover relocation interrupted by system crash.
4493  *
4494  * this function resumes merging reloc trees with corresponding fs trees.
4495  * this is important for keeping the sharing of tree blocks
4496  */
4497 int btrfs_recover_relocation(struct btrfs_root *root)
4498 {
4499         struct btrfs_fs_info *fs_info = root->fs_info;
4500         LIST_HEAD(reloc_roots);
4501         struct btrfs_key key;
4502         struct btrfs_root *fs_root;
4503         struct btrfs_root *reloc_root;
4504         struct btrfs_path *path;
4505         struct extent_buffer *leaf;
4506         struct reloc_control *rc = NULL;
4507         struct btrfs_trans_handle *trans;
4508         int ret;
4509         int err = 0;
4510 
4511         path = btrfs_alloc_path();
4512         if (!path)
4513                 return -ENOMEM;
4514         path->reada = READA_BACK;
4515 
4516         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4517         key.type = BTRFS_ROOT_ITEM_KEY;
4518         key.offset = (u64)-1;
4519 
4520         while (1) {
4521                 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4522                                         path, 0, 0);
4523                 if (ret < 0) {
4524                         err = ret;
4525                         goto out;
4526                 }
4527                 if (ret > 0) {
4528                         if (path->slots[0] == 0)
4529                                 break;
4530                         path->slots[0]--;
4531                 }
4532                 leaf = path->nodes[0];
4533                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4534                 btrfs_release_path(path);
4535 
4536                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4537                     key.type != BTRFS_ROOT_ITEM_KEY)
4538                         break;
4539 
4540                 reloc_root = btrfs_read_fs_root(root, &key);
4541                 if (IS_ERR(reloc_root)) {
4542                         err = PTR_ERR(reloc_root);
4543                         goto out;
4544                 }
4545 
4546                 list_add(&reloc_root->root_list, &reloc_roots);
4547 
4548                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4549                         fs_root = read_fs_root(fs_info,
4550                                                reloc_root->root_key.offset);
4551                         if (IS_ERR(fs_root)) {
4552                                 ret = PTR_ERR(fs_root);
4553                                 if (ret != -ENOENT) {
4554                                         err = ret;
4555                                         goto out;
4556                                 }
4557                                 ret = mark_garbage_root(reloc_root);
4558                                 if (ret < 0) {
4559                                         err = ret;
4560                                         goto out;
4561                                 }
4562                         }
4563                 }
4564 
4565                 if (key.offset == 0)
4566                         break;
4567 
4568                 key.offset--;
4569         }
4570         btrfs_release_path(path);
4571 
4572         if (list_empty(&reloc_roots))
4573                 goto out;
4574 
4575         rc = alloc_reloc_control(fs_info);
4576         if (!rc) {
4577                 err = -ENOMEM;
4578                 goto out;
4579         }
4580 
4581         rc->extent_root = fs_info->extent_root;
4582 
4583         set_reloc_control(rc);
4584 
4585         trans = btrfs_join_transaction(rc->extent_root);
4586         if (IS_ERR(trans)) {
4587                 err = PTR_ERR(trans);
4588                 goto out_unset;
4589         }
4590 
4591         rc->merge_reloc_tree = 1;
4592 
4593         while (!list_empty(&reloc_roots)) {
4594                 reloc_root = list_entry(reloc_roots.next,
4595                                         struct btrfs_root, root_list);
4596                 list_del(&reloc_root->root_list);
4597 
4598                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4599                         list_add_tail(&reloc_root->root_list,
4600                                       &rc->reloc_roots);
4601                         continue;
4602                 }
4603 
4604                 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4605                 if (IS_ERR(fs_root)) {
4606                         err = PTR_ERR(fs_root);
4607                         list_add_tail(&reloc_root->root_list, &reloc_roots);
4608                         btrfs_end_transaction(trans);
4609                         goto out_unset;
4610                 }
4611 
4612                 err = __add_reloc_root(reloc_root);
4613                 BUG_ON(err < 0); /* -ENOMEM or logic error */
4614                 fs_root->reloc_root = reloc_root;
4615         }
4616 
4617         err = btrfs_commit_transaction(trans);
4618         if (err)
4619                 goto out_unset;
4620 
4621         merge_reloc_roots(rc);
4622 
4623         unset_reloc_control(rc);
4624 
4625         trans = btrfs_join_transaction(rc->extent_root);
4626         if (IS_ERR(trans)) {
4627                 err = PTR_ERR(trans);
4628                 goto out_clean;
4629         }
4630         err = btrfs_commit_transaction(trans);
4631 out_clean:
4632         ret = clean_dirty_subvols(rc);
4633         if (ret < 0 && !err)
4634                 err = ret;
4635 out_unset:
4636         unset_reloc_control(rc);
4637         kfree(rc);
4638 out:
4639         if (!list_empty(&reloc_roots))
4640                 free_reloc_roots(&reloc_roots);
4641 
4642         btrfs_free_path(path);
4643 
4644         if (err == 0) {
4645                 /* cleanup orphan inode in data relocation tree */
4646                 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4647                 if (IS_ERR(fs_root))
4648                         err = PTR_ERR(fs_root);
4649                 else
4650                         err = btrfs_orphan_cleanup(fs_root);
4651         }
4652         return err;
4653 }
4654 
4655 /*
4656  * helper to add ordered checksum for data relocation.
4657  *
4658  * cloning checksum properly handles the nodatasum extents.
4659  * it also saves CPU time to re-calculate the checksum.
4660  */
4661 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4662 {
4663         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4664         struct btrfs_ordered_sum *sums;
4665         struct btrfs_ordered_extent *ordered;
4666         int ret;
4667         u64 disk_bytenr;
4668         u64 new_bytenr;
4669         LIST_HEAD(list);
4670 
4671         ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4672         BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4673 
4674         disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4675         ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4676                                        disk_bytenr + len - 1, &list, 0);
4677         if (ret)
4678                 goto out;
4679 
4680         while (!list_empty(&list)) {
4681                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4682                 list_del_init(&sums->list);
4683 
4684                 /*
4685                  * We need to offset the new_bytenr based on where the csum is.
4686                  * We need to do this because we will read in entire prealloc
4687                  * extents but we may have written to say the middle of the
4688                  * prealloc extent, so we need to make sure the csum goes with
4689                  * the right disk offset.
4690                  *
4691                  * We can do this because the data reloc inode refers strictly
4692                  * to the on disk bytes, so we don't have to worry about
4693                  * disk_len vs real len like with real inodes since it's all
4694                  * disk length.
4695                  */
4696                 new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4697                 sums->bytenr = new_bytenr;
4698 
4699                 btrfs_add_ordered_sum(ordered, sums);
4700         }
4701 out:
4702         btrfs_put_ordered_extent(ordered);
4703         return ret;
4704 }
4705 
4706 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4707                           struct btrfs_root *root, struct extent_buffer *buf,
4708                           struct extent_buffer *cow)
4709 {
4710         struct btrfs_fs_info *fs_info = root->fs_info;
4711         struct reloc_control *rc;
4712         struct backref_node *node;
4713         int first_cow = 0;
4714         int level;
4715         int ret = 0;
4716 
4717         rc = fs_info->reloc_ctl;
4718         if (!rc)
4719                 return 0;
4720 
4721         BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4722                root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4723 
4724         level = btrfs_header_level(buf);
4725         if (btrfs_header_generation(buf) <=
4726             btrfs_root_last_snapshot(&root->root_item))
4727                 first_cow = 1;
4728 
4729         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4730             rc->create_reloc_tree) {
4731                 WARN_ON(!first_cow && level == 0);
4732 
4733                 node = rc->backref_cache.path[level];
4734                 BUG_ON(node->bytenr != buf->start &&
4735                        node->new_bytenr != buf->start);
4736 
4737                 drop_node_buffer(node);
4738                 extent_buffer_get(cow);
4739                 node->eb = cow;
4740                 node->new_bytenr = cow->start;
4741 
4742                 if (!node->pending) {
4743                         list_move_tail(&node->list,
4744                                        &rc->backref_cache.pending[level]);
4745                         node->pending = 1;
4746                 }
4747 
4748                 if (first_cow)
4749                         __mark_block_processed(rc, node);
4750 
4751                 if (first_cow && level > 0)
4752                         rc->nodes_relocated += buf->len;
4753         }
4754 
4755         if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4756                 ret = replace_file_extents(trans, rc, root, cow);
4757         return ret;
4758 }
4759 
4760 /*
4761  * called before creating snapshot. it calculates metadata reservation
4762  * required for relocating tree blocks in the snapshot
4763  */
4764 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4765                               u64 *bytes_to_reserve)
4766 {
4767         struct btrfs_root *root = pending->root;
4768         struct reloc_control *rc = root->fs_info->reloc_ctl;
4769 
4770         if (!rc || !have_reloc_root(root))
4771                 return;
4772 
4773         if (!rc->merge_reloc_tree)
4774                 return;
4775 
4776         root = root->reloc_root;
4777         BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4778         /*
4779          * relocation is in the stage of merging trees. the space
4780          * used by merging a reloc tree is twice the size of
4781          * relocated tree nodes in the worst case. half for cowing
4782          * the reloc tree, half for cowing the fs tree. the space
4783          * used by cowing the reloc tree will be freed after the
4784          * tree is dropped. if we create snapshot, cowing the fs
4785          * tree may use more space than it frees. so we need
4786          * reserve extra space.
4787          */
4788         *bytes_to_reserve += rc->nodes_relocated;
4789 }
4790 
4791 /*
4792  * called after snapshot is created. migrate block reservation
4793  * and create reloc root for the newly created snapshot
4794  */
4795 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4796                                struct btrfs_pending_snapshot *pending)
4797 {
4798         struct btrfs_root *root = pending->root;
4799         struct btrfs_root *reloc_root;
4800         struct btrfs_root *new_root;
4801         struct reloc_control *rc = root->fs_info->reloc_ctl;
4802         int ret;
4803 
4804         if (!rc || !have_reloc_root(root))
4805                 return 0;
4806 
4807         rc = root->fs_info->reloc_ctl;
4808         rc->merging_rsv_size += rc->nodes_relocated;
4809 
4810         if (rc->merge_reloc_tree) {
4811                 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4812                                               rc->block_rsv,
4813                                               rc->nodes_relocated, true);
4814                 if (ret)
4815                         return ret;
4816         }
4817 
4818         new_root = pending->snap;
4819         reloc_root = create_reloc_root(trans, root->reloc_root,
4820                                        new_root->root_key.objectid);
4821         if (IS_ERR(reloc_root))
4822                 return PTR_ERR(reloc_root);
4823 
4824         ret = __add_reloc_root(reloc_root);
4825         BUG_ON(ret < 0);
4826         new_root->reloc_root = reloc_root;
4827 
4828         if (rc->create_reloc_tree)
4829                 ret = clone_backref_node(trans, rc, root, reloc_root);
4830         return ret;
4831 }

/* [<][>][^][v][top][bottom][index][help] */