1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/pagemap.h>
6#include <linux/page-flags.h>
7#include <linux/spinlock.h>
8#include <linux/blkdev.h>
9#include <linux/swap.h>
10#include <linux/writeback.h>
11#include <linux/pagevec.h>
12#include <linux/prefetch.h>
13#include <linux/cleancache.h>
14#include "extent_io.h"
15#include "extent_map.h"
16#include "ctree.h"
17#include "btrfs_inode.h"
18#include "volumes.h"
19#include "check-integrity.h"
20#include "locking.h"
21#include "rcu-string.h"
22#include "backref.h"
23
24static struct kmem_cache *extent_state_cache;
25static struct kmem_cache *extent_buffer_cache;
26static struct bio_set *btrfs_bioset;
27
28static inline bool extent_state_in_tree(const struct extent_state *state)
29{
30	return !RB_EMPTY_NODE(&state->rb_node);
31}
32
33#ifdef CONFIG_BTRFS_DEBUG
34static LIST_HEAD(buffers);
35static LIST_HEAD(states);
36
37static DEFINE_SPINLOCK(leak_lock);
38
39static inline
40void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
41{
42	unsigned long flags;
43
44	spin_lock_irqsave(&leak_lock, flags);
45	list_add(new, head);
46	spin_unlock_irqrestore(&leak_lock, flags);
47}
48
49static inline
50void btrfs_leak_debug_del(struct list_head *entry)
51{
52	unsigned long flags;
53
54	spin_lock_irqsave(&leak_lock, flags);
55	list_del(entry);
56	spin_unlock_irqrestore(&leak_lock, flags);
57}
58
59static inline
60void btrfs_leak_debug_check(void)
61{
62	struct extent_state *state;
63	struct extent_buffer *eb;
64
65	while (!list_empty(&states)) {
66		state = list_entry(states.next, struct extent_state, leak_list);
67		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
68		       state->start, state->end, state->state,
69		       extent_state_in_tree(state),
70		       atomic_read(&state->refs));
71		list_del(&state->leak_list);
72		kmem_cache_free(extent_state_cache, state);
73	}
74
75	while (!list_empty(&buffers)) {
76		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
80		list_del(&eb->leak_list);
81		kmem_cache_free(extent_buffer_cache, eb);
82	}
83}
84
85#define btrfs_debug_check_extent_io_range(tree, start, end)		\
86	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
87static inline void __btrfs_debug_check_extent_io_range(const char *caller,
88		struct extent_io_tree *tree, u64 start, u64 end)
89{
90	struct inode *inode;
91	u64 isize;
92
93	if (!tree->mapping)
94		return;
95
96	inode = tree->mapping->host;
97	isize = i_size_read(inode);
98	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
99		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
100		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
101				caller, btrfs_ino(inode), isize, start, end);
102	}
103}
104#else
105#define btrfs_leak_debug_add(new, head)	do {} while (0)
106#define btrfs_leak_debug_del(entry)	do {} while (0)
107#define btrfs_leak_debug_check()	do {} while (0)
108#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
109#endif
110
111#define BUFFER_LRU_MAX 64
112
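/*
 * tree_entry mirrors the first members of struct extent_state (start, end,
 * rb_node), which lets the rb-tree search helpers below treat any node in
 * the io tree as a plain [start, end] range.
 */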
113struct tree_entry {
114	u64 start;
115	u64 end;
116	struct rb_node rb_node;
117};
118
119struct extent_page_data {
120	struct bio *bio;
121	struct extent_io_tree *tree;
122	get_extent_t *get_extent;
123	unsigned long bio_flags;
124
125	/* tells writepage not to lock the state bits for this range
126	 * it still does the unlocking
127	 */
128	unsigned int extent_locked:1;
129
130	/* tells the submit_bio code to use a WRITE_SYNC */
131	unsigned int sync_io:1;
132};
133
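/*
 * Record, in the optional 'changeset', a range whose bits are actually
 * changing.  Ranges that already have (or already lack) the requested bits
 * are skipped so that only real transitions are accounted for.
 */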
134static void add_extent_changeset(struct extent_state *state, unsigned bits,
135				 struct extent_changeset *changeset,
136				 int set)
137{
138	int ret;
139
140	if (!changeset)
141		return;
142	if (set && (state->state & bits) == bits)
143		return;
144	if (!set && (state->state & bits) == 0)
145		return;
146	changeset->bytes_changed += state->end - state->start + 1;
147	ret = ulist_add(changeset->range_changed, state->start, state->end,
148			GFP_ATOMIC);
149	/* ENOMEM */
150	BUG_ON(ret < 0);
151}
152
153static noinline void flush_write_bio(void *data);
154static inline struct btrfs_fs_info *
155tree_fs_info(struct extent_io_tree *tree)
156{
157	if (!tree->mapping)
158		return NULL;
159	return btrfs_sb(tree->mapping->host->i_sb);
160}
161
162int __init extent_io_init(void)
163{
164	extent_state_cache = kmem_cache_create("btrfs_extent_state",
165			sizeof(struct extent_state), 0,
166			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
167	if (!extent_state_cache)
168		return -ENOMEM;
169
170	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
171			sizeof(struct extent_buffer), 0,
172			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
173	if (!extent_buffer_cache)
174		goto free_state_cache;
175
176	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
177				     offsetof(struct btrfs_io_bio, bio));
178	if (!btrfs_bioset)
179		goto free_buffer_cache;
180
181	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
182		goto free_bioset;
183
184	return 0;
185
186free_bioset:
187	bioset_free(btrfs_bioset);
188	btrfs_bioset = NULL;
189
190free_buffer_cache:
191	kmem_cache_destroy(extent_buffer_cache);
192	extent_buffer_cache = NULL;
193
194free_state_cache:
195	kmem_cache_destroy(extent_state_cache);
196	extent_state_cache = NULL;
197	return -ENOMEM;
198}
199
200void extent_io_exit(void)
201{
202	btrfs_leak_debug_check();
203
	/*
	 * Make sure all delayed RCU frees are flushed before we
	 * destroy the caches.
	 */
208	rcu_barrier();
209	if (extent_state_cache)
210		kmem_cache_destroy(extent_state_cache);
211	if (extent_buffer_cache)
212		kmem_cache_destroy(extent_buffer_cache);
213	if (btrfs_bioset)
214		bioset_free(btrfs_bioset);
215}
216
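/*
 * Initialize an extent io tree: an empty rb-tree with no dirty bytes,
 * optionally bound to the given address_space.
 */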
217void extent_io_tree_init(struct extent_io_tree *tree,
218			 struct address_space *mapping)
219{
220	tree->state = RB_ROOT;
221	tree->ops = NULL;
222	tree->dirty_bytes = 0;
223	spin_lock_init(&tree->lock);
224	tree->mapping = mapping;
225}
226
227static struct extent_state *alloc_extent_state(gfp_t mask)
228{
229	struct extent_state *state;
230
231	state = kmem_cache_alloc(extent_state_cache, mask);
232	if (!state)
233		return state;
234	state->state = 0;
235	state->private = 0;
236	RB_CLEAR_NODE(&state->rb_node);
237	btrfs_leak_debug_add(&state->leak_list, &states);
238	atomic_set(&state->refs, 1);
239	init_waitqueue_head(&state->wq);
240	trace_alloc_extent_state(state, mask, _RET_IP_);
241	return state;
242}
243
244void free_extent_state(struct extent_state *state)
245{
246	if (!state)
247		return;
248	if (atomic_dec_and_test(&state->refs)) {
249		WARN_ON(extent_state_in_tree(state));
250		btrfs_leak_debug_del(&state->leak_list);
251		trace_free_extent_state(state, _RET_IP_);
252		kmem_cache_free(extent_state_cache, state);
253	}
254}
255
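/*
 * Insert 'node' into the rb-tree keyed by [start, end].  If the caller has a
 * precomputed insertion point (p_in/parent_in from an earlier search) it is
 * used directly; otherwise the tree is walked from 'search_start' (or from
 * the root).  Returns NULL on success, or the existing node that already
 * covers 'offset' so the caller can report -EEXIST.
 */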
256static struct rb_node *tree_insert(struct rb_root *root,
257				   struct rb_node *search_start,
258				   u64 offset,
259				   struct rb_node *node,
260				   struct rb_node ***p_in,
261				   struct rb_node **parent_in)
262{
263	struct rb_node **p;
264	struct rb_node *parent = NULL;
265	struct tree_entry *entry;
266
267	if (p_in && parent_in) {
268		p = *p_in;
269		parent = *parent_in;
270		goto do_insert;
271	}
272
273	p = search_start ? &search_start : &root->rb_node;
274	while (*p) {
275		parent = *p;
276		entry = rb_entry(parent, struct tree_entry, rb_node);
277
278		if (offset < entry->start)
279			p = &(*p)->rb_left;
280		else if (offset > entry->end)
281			p = &(*p)->rb_right;
282		else
283			return parent;
284	}
285
286do_insert:
287	rb_link_node(node, parent, p);
288	rb_insert_color(node, root);
289	return NULL;
290}
291
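/*
 * Search for an extent_state containing 'offset'.  On a hit the matching
 * rb_node is returned.  On a miss NULL is returned and, when the pointers
 * are supplied, the rb insertion point is stored in p_ret/parent_ret,
 * *prev_ret is set to the first entry that ends at or after 'offset', and
 * *next_ret to the last entry that starts at or before it.
 */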
292static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
293				      struct rb_node **prev_ret,
294				      struct rb_node **next_ret,
295				      struct rb_node ***p_ret,
296				      struct rb_node **parent_ret)
297{
298	struct rb_root *root = &tree->state;
299	struct rb_node **n = &root->rb_node;
300	struct rb_node *prev = NULL;
301	struct rb_node *orig_prev = NULL;
302	struct tree_entry *entry;
303	struct tree_entry *prev_entry = NULL;
304
305	while (*n) {
306		prev = *n;
307		entry = rb_entry(prev, struct tree_entry, rb_node);
308		prev_entry = entry;
309
310		if (offset < entry->start)
311			n = &(*n)->rb_left;
312		else if (offset > entry->end)
313			n = &(*n)->rb_right;
314		else
315			return *n;
316	}
317
318	if (p_ret)
319		*p_ret = n;
320	if (parent_ret)
321		*parent_ret = prev;
322
323	if (prev_ret) {
324		orig_prev = prev;
325		while (prev && offset > prev_entry->end) {
326			prev = rb_next(prev);
327			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
328		}
329		*prev_ret = prev;
330		prev = orig_prev;
331	}
332
333	if (next_ret) {
334		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
335		while (prev && offset < prev_entry->start) {
336			prev = rb_prev(prev);
337			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
338		}
339		*next_ret = prev;
340	}
341	return NULL;
342}
343
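/*
 * Like __etree_search(), but on a miss return the first extent_state that
 * ends at or after 'offset' instead of NULL, which is what callers walking
 * the tree forward from 'offset' want.
 */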
344static inline struct rb_node *
345tree_search_for_insert(struct extent_io_tree *tree,
346		       u64 offset,
347		       struct rb_node ***p_ret,
348		       struct rb_node **parent_ret)
349{
350	struct rb_node *prev = NULL;
351	struct rb_node *ret;
352
353	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
354	if (!ret)
355		return prev;
356	return ret;
357}
358
359static inline struct rb_node *tree_search(struct extent_io_tree *tree,
360					  u64 offset)
361{
362	return tree_search_for_insert(tree, offset, NULL, NULL);
363}
364
365static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
366		     struct extent_state *other)
367{
368	if (tree->ops && tree->ops->merge_extent_hook)
369		tree->ops->merge_extent_hook(tree->mapping->host, new,
370					     other);
371}
372
/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
382static void merge_state(struct extent_io_tree *tree,
383		        struct extent_state *state)
384{
385	struct extent_state *other;
386	struct rb_node *other_node;
387
388	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
389		return;
390
391	other_node = rb_prev(&state->rb_node);
392	if (other_node) {
393		other = rb_entry(other_node, struct extent_state, rb_node);
394		if (other->end == state->start - 1 &&
395		    other->state == state->state) {
396			merge_cb(tree, state, other);
397			state->start = other->start;
398			rb_erase(&other->rb_node, &tree->state);
399			RB_CLEAR_NODE(&other->rb_node);
400			free_extent_state(other);
401		}
402	}
403	other_node = rb_next(&state->rb_node);
404	if (other_node) {
405		other = rb_entry(other_node, struct extent_state, rb_node);
406		if (other->start == state->end + 1 &&
407		    other->state == state->state) {
408			merge_cb(tree, state, other);
409			state->end = other->end;
410			rb_erase(&other->rb_node, &tree->state);
411			RB_CLEAR_NODE(&other->rb_node);
412			free_extent_state(other);
413		}
414	}
415}
416
417static void set_state_cb(struct extent_io_tree *tree,
418			 struct extent_state *state, unsigned *bits)
419{
420	if (tree->ops && tree->ops->set_bit_hook)
421		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
422}
423
424static void clear_state_cb(struct extent_io_tree *tree,
425			   struct extent_state *state, unsigned *bits)
426{
427	if (tree->ops && tree->ops->clear_bit_hook)
428		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
429}
430
431static void set_state_bits(struct extent_io_tree *tree,
432			   struct extent_state *state, unsigned *bits,
433			   struct extent_changeset *changeset);
434
435/*
436 * insert an extent_state struct into the tree.  'bits' are set on the
437 * struct before it is inserted.
438 *
439 * This may return -EEXIST if the extent is already there, in which case the
440 * state struct is freed.
441 *
442 * The tree lock is not taken internally.  This is a utility function and
443 * probably isn't what you want to call (see set/clear_extent_bit).
444 */
445static int insert_state(struct extent_io_tree *tree,
446			struct extent_state *state, u64 start, u64 end,
447			struct rb_node ***p,
448			struct rb_node **parent,
449			unsigned *bits, struct extent_changeset *changeset)
450{
451	struct rb_node *node;
452
453	if (end < start)
454		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
455		       end, start);
456	state->start = start;
457	state->end = end;
458
459	set_state_bits(tree, state, bits, changeset);
460
461	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
462	if (node) {
463		struct extent_state *found;
464		found = rb_entry(node, struct extent_state, rb_node);
		pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
		       found->start, found->end, start, end);
468		return -EEXIST;
469	}
470	merge_state(tree, state);
471	return 0;
472}
473
474static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
475		     u64 split)
476{
477	if (tree->ops && tree->ops->split_extent_hook)
478		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
479}
480
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig:     [split, orig->end]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
495static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
496		       struct extent_state *prealloc, u64 split)
497{
498	struct rb_node *node;
499
500	split_cb(tree, orig, split);
501
502	prealloc->start = orig->start;
503	prealloc->end = split - 1;
504	prealloc->state = orig->state;
505	orig->start = split;
506
507	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
508			   &prealloc->rb_node, NULL, NULL);
509	if (node) {
510		free_extent_state(prealloc);
511		return -EEXIST;
512	}
513	return 0;
514}
515
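/*
 * return the extent_state that follows 'state' in the tree, or NULL if
 * 'state' is the last one.
 */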
516static struct extent_state *next_state(struct extent_state *state)
517{
518	struct rb_node *next = rb_next(&state->rb_node);
519	if (next)
520		return rb_entry(next, struct extent_state, rb_node);
521	else
522		return NULL;
523}
524
/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree.
 */
532static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
533					    struct extent_state *state,
534					    unsigned *bits, int wake,
535					    struct extent_changeset *changeset)
536{
537	struct extent_state *next;
538	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
539
540	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
541		u64 range = state->end - state->start + 1;
542		WARN_ON(range > tree->dirty_bytes);
543		tree->dirty_bytes -= range;
544	}
545	clear_state_cb(tree, state, bits);
546	add_extent_changeset(state, bits_to_clear, changeset, 0);
547	state->state &= ~bits_to_clear;
548	if (wake)
549		wake_up(&state->wq);
550	if (state->state == 0) {
551		next = next_state(state);
552		if (extent_state_in_tree(state)) {
553			rb_erase(&state->rb_node, &tree->state);
554			RB_CLEAR_NODE(&state->rb_node);
555			free_extent_state(state);
556		} else {
557			WARN_ON(1);
558		}
559	} else {
560		merge_state(tree, state);
561		next = next_state(state);
562	}
563	return next;
564}
565
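/*
 * Make sure a preallocated extent_state is available, falling back to a
 * GFP_ATOMIC allocation for callers that could not sleep to preallocate one
 * up front.
 */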
566static struct extent_state *
567alloc_extent_state_atomic(struct extent_state *prealloc)
568{
569	if (!prealloc)
570		prealloc = alloc_extent_state(GFP_ATOMIC);
571
572	return prealloc;
573}
574
575static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
576{
577	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
578		    "Extent tree was modified by another "
579		    "thread while locked.");
580}
581
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (i.e. for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock.  It always returns 0; failures to split or
 * insert states are treated as fatal via extent_io_tree_panic().
 */
594static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
595			      unsigned bits, int wake, int delete,
596			      struct extent_state **cached_state,
597			      gfp_t mask, struct extent_changeset *changeset)
598{
599	struct extent_state *state;
600	struct extent_state *cached;
601	struct extent_state *prealloc = NULL;
602	struct rb_node *node;
603	u64 last_end;
604	int err;
605	int clear = 0;
606
607	btrfs_debug_check_extent_io_range(tree, start, end);
608
609	if (bits & EXTENT_DELALLOC)
610		bits |= EXTENT_NORESERVE;
611
612	if (delete)
613		bits |= ~EXTENT_CTLBITS;
614	bits |= EXTENT_FIRST_DELALLOC;
615
616	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
617		clear = 1;
618again:
619	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't worry about allocation failure here, because we might
		 * end up not needing the pre-allocated extent state at all.
		 * That is the case when the extent states in the tree that
		 * overlap our input range are fully contained within it, so
		 * no splits are needed.  If we do end up needing a new extent
		 * state we allocate it later.
		 */
627		prealloc = alloc_extent_state(mask);
628	}
629
630	spin_lock(&tree->lock);
631	if (cached_state) {
632		cached = *cached_state;
633
634		if (clear) {
635			*cached_state = NULL;
636			cached_state = NULL;
637		}
638
639		if (cached && extent_state_in_tree(cached) &&
640		    cached->start <= start && cached->end > start) {
641			if (clear)
642				atomic_dec(&cached->refs);
643			state = cached;
644			goto hit_next;
645		}
646		if (clear)
647			free_extent_state(cached);
648	}
649	/*
650	 * this search will find the extents that end after
651	 * our range starts
652	 */
653	node = tree_search(tree, start);
654	if (!node)
655		goto out;
656	state = rb_entry(node, struct extent_state, rb_node);
657hit_next:
658	if (state->start > end)
659		goto out;
660	WARN_ON(state->end < start);
661	last_end = state->end;
662
663	/* the state doesn't have the wanted bits, go ahead */
664	if (!(state->state & bits)) {
665		state = next_state(state);
666		goto next;
667	}
668
669	/*
670	 *     | ---- desired range ---- |
671	 *  | state | or
672	 *  | ------------- state -------------- |
673	 *
674	 * We need to split the extent we found, and may flip
675	 * bits on second half.
676	 *
677	 * If the extent we found extends past our range, we
678	 * just split and search again.  It'll get split again
679	 * the next time though.
680	 *
681	 * If the extent we found is inside our range, we clear
682	 * the desired bit on it.
683	 */
684
685	if (state->start < start) {
686		prealloc = alloc_extent_state_atomic(prealloc);
687		BUG_ON(!prealloc);
688		err = split_state(tree, state, prealloc, start);
689		if (err)
690			extent_io_tree_panic(tree, err);
691
692		prealloc = NULL;
693		if (err)
694			goto out;
695		if (state->end <= end) {
696			state = clear_state_bit(tree, state, &bits, wake,
697						changeset);
698			goto next;
699		}
700		goto search_again;
701	}
702	/*
703	 * | ---- desired range ---- |
704	 *                        | state |
705	 * We need to split the extent, and clear the bit
706	 * on the first half
707	 */
708	if (state->start <= end && state->end > end) {
709		prealloc = alloc_extent_state_atomic(prealloc);
710		BUG_ON(!prealloc);
711		err = split_state(tree, state, prealloc, end + 1);
712		if (err)
713			extent_io_tree_panic(tree, err);
714
715		if (wake)
716			wake_up(&state->wq);
717
718		clear_state_bit(tree, prealloc, &bits, wake, changeset);
719
720		prealloc = NULL;
721		goto out;
722	}
723
724	state = clear_state_bit(tree, state, &bits, wake, changeset);
725next:
726	if (last_end == (u64)-1)
727		goto out;
728	start = last_end + 1;
729	if (start <= end && state && !need_resched())
730		goto hit_next;
731	goto search_again;
732
733out:
734	spin_unlock(&tree->lock);
735	if (prealloc)
736		free_extent_state(prealloc);
737
738	return 0;
739
740search_again:
741	if (start > end)
742		goto out;
743	spin_unlock(&tree->lock);
744	if (gfpflags_allow_blocking(mask))
745		cond_resched();
746	goto again;
747}
748
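/*
 * Drop the tree lock and sleep until whoever clears bits on 'state' wakes us
 * up, then re-take the lock.  The caller must hold a reference on 'state'.
 */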
749static void wait_on_state(struct extent_io_tree *tree,
750			  struct extent_state *state)
751		__releases(tree->lock)
752		__acquires(tree->lock)
753{
754	DEFINE_WAIT(wait);
755	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
756	spin_unlock(&tree->lock);
757	schedule();
758	spin_lock(&tree->lock);
759	finish_wait(&state->wq, &wait);
760}
761
762/*
763 * waits for one or more bits to clear on a range in the state tree.
764 * The range [start, end] is inclusive.
765 * The tree lock is taken by this function
766 */
767static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
768			    unsigned long bits)
769{
770	struct extent_state *state;
771	struct rb_node *node;
772
773	btrfs_debug_check_extent_io_range(tree, start, end);
774
775	spin_lock(&tree->lock);
776again:
777	while (1) {
778		/*
779		 * this search will find all the extents that end after
780		 * our range starts
781		 */
782		node = tree_search(tree, start);
783process_node:
784		if (!node)
785			break;
786
787		state = rb_entry(node, struct extent_state, rb_node);
788
789		if (state->start > end)
790			goto out;
791
792		if (state->state & bits) {
793			start = state->start;
794			atomic_inc(&state->refs);
795			wait_on_state(tree, state);
796			free_extent_state(state);
797			goto again;
798		}
799		start = state->end + 1;
800
801		if (start > end)
802			break;
803
804		if (!cond_resched_lock(&tree->lock)) {
805			node = rb_next(node);
806			goto process_node;
807		}
808	}
809out:
810	spin_unlock(&tree->lock);
811}
812
813static void set_state_bits(struct extent_io_tree *tree,
814			   struct extent_state *state,
815			   unsigned *bits, struct extent_changeset *changeset)
816{
817	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
818
819	set_state_cb(tree, state, bits);
820	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
821		u64 range = state->end - state->start + 1;
822		tree->dirty_bytes += range;
823	}
824	add_extent_changeset(state, bits_to_set, changeset, 1);
825	state->state |= bits_to_set;
826}
827
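/*
 * Remember 'state' in *cached_ptr (taking an extra reference) so that later
 * operations on the same range can skip the rb-tree search.  cache_state()
 * only caches states holding EXTENT_IOBITS or EXTENT_BOUNDARY, since any
 * other state may be merged away or freed at any moment.
 */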
828static void cache_state_if_flags(struct extent_state *state,
829				 struct extent_state **cached_ptr,
830				 unsigned flags)
831{
832	if (cached_ptr && !(*cached_ptr)) {
833		if (!flags || (state->state & flags)) {
834			*cached_ptr = state;
835			atomic_inc(&state->refs);
836		}
837	}
838}
839
840static void cache_state(struct extent_state *state,
841			struct extent_state **cached_ptr)
842{
843	return cache_state_if_flags(state, cached_ptr,
844				    EXTENT_IOBITS | EXTENT_BOUNDARY);
845}
846
/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */
857
858static int __must_check
859__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
860		 unsigned bits, unsigned exclusive_bits,
861		 u64 *failed_start, struct extent_state **cached_state,
862		 gfp_t mask, struct extent_changeset *changeset)
863{
864	struct extent_state *state;
865	struct extent_state *prealloc = NULL;
866	struct rb_node *node;
867	struct rb_node **p;
868	struct rb_node *parent;
869	int err = 0;
870	u64 last_start;
871	u64 last_end;
872
873	btrfs_debug_check_extent_io_range(tree, start, end);
874
875	bits |= EXTENT_FIRST_DELALLOC;
876again:
877	if (!prealloc && gfpflags_allow_blocking(mask)) {
878		prealloc = alloc_extent_state(mask);
879		BUG_ON(!prealloc);
880	}
881
882	spin_lock(&tree->lock);
883	if (cached_state && *cached_state) {
884		state = *cached_state;
885		if (state->start <= start && state->end > start &&
886		    extent_state_in_tree(state)) {
887			node = &state->rb_node;
888			goto hit_next;
889		}
890	}
891	/*
892	 * this search will find all the extents that end after
893	 * our range starts.
894	 */
895	node = tree_search_for_insert(tree, start, &p, &parent);
896	if (!node) {
897		prealloc = alloc_extent_state_atomic(prealloc);
898		BUG_ON(!prealloc);
899		err = insert_state(tree, prealloc, start, end,
900				   &p, &parent, &bits, changeset);
901		if (err)
902			extent_io_tree_panic(tree, err);
903
904		cache_state(prealloc, cached_state);
905		prealloc = NULL;
906		goto out;
907	}
908	state = rb_entry(node, struct extent_state, rb_node);
909hit_next:
910	last_start = state->start;
911	last_end = state->end;
912
913	/*
914	 * | ---- desired range ---- |
915	 * | state |
916	 *
917	 * Just lock what we found and keep going
918	 */
919	if (state->start == start && state->end <= end) {
920		if (state->state & exclusive_bits) {
921			*failed_start = state->start;
922			err = -EEXIST;
923			goto out;
924		}
925
926		set_state_bits(tree, state, &bits, changeset);
927		cache_state(state, cached_state);
928		merge_state(tree, state);
929		if (last_end == (u64)-1)
930			goto out;
931		start = last_end + 1;
932		state = next_state(state);
933		if (start < end && state && state->start == start &&
934		    !need_resched())
935			goto hit_next;
936		goto search_again;
937	}
938
939	/*
940	 *     | ---- desired range ---- |
941	 * | state |
942	 *   or
943	 * | ------------- state -------------- |
944	 *
945	 * We need to split the extent we found, and may flip bits on
946	 * second half.
947	 *
948	 * If the extent we found extends past our
949	 * range, we just split and search again.  It'll get split
950	 * again the next time though.
951	 *
952	 * If the extent we found is inside our range, we set the
953	 * desired bit on it.
954	 */
955	if (state->start < start) {
956		if (state->state & exclusive_bits) {
957			*failed_start = start;
958			err = -EEXIST;
959			goto out;
960		}
961
962		prealloc = alloc_extent_state_atomic(prealloc);
963		BUG_ON(!prealloc);
964		err = split_state(tree, state, prealloc, start);
965		if (err)
966			extent_io_tree_panic(tree, err);
967
968		prealloc = NULL;
969		if (err)
970			goto out;
971		if (state->end <= end) {
972			set_state_bits(tree, state, &bits, changeset);
973			cache_state(state, cached_state);
974			merge_state(tree, state);
975			if (last_end == (u64)-1)
976				goto out;
977			start = last_end + 1;
978			state = next_state(state);
979			if (start < end && state && state->start == start &&
980			    !need_resched())
981				goto hit_next;
982		}
983		goto search_again;
984	}
985	/*
986	 * | ---- desired range ---- |
987	 *     | state | or               | state |
988	 *
989	 * There's a hole, we need to insert something in it and
990	 * ignore the extent we found.
991	 */
992	if (state->start > start) {
993		u64 this_end;
994		if (end < last_start)
995			this_end = end;
996		else
997			this_end = last_start - 1;
998
999		prealloc = alloc_extent_state_atomic(prealloc);
1000		BUG_ON(!prealloc);
1001
1002		/*
1003		 * Avoid to free 'prealloc' if it can be merged with
1004		 * the later extent.
1005		 */
1006		err = insert_state(tree, prealloc, start, this_end,
1007				   NULL, NULL, &bits, changeset);
1008		if (err)
1009			extent_io_tree_panic(tree, err);
1010
1011		cache_state(prealloc, cached_state);
1012		prealloc = NULL;
1013		start = this_end + 1;
1014		goto search_again;
1015	}
1016	/*
1017	 * | ---- desired range ---- |
1018	 *                        | state |
1019	 * We need to split the extent, and set the bit
1020	 * on the first half
1021	 */
1022	if (state->start <= end && state->end > end) {
1023		if (state->state & exclusive_bits) {
1024			*failed_start = start;
1025			err = -EEXIST;
1026			goto out;
1027		}
1028
1029		prealloc = alloc_extent_state_atomic(prealloc);
1030		BUG_ON(!prealloc);
1031		err = split_state(tree, state, prealloc, end + 1);
1032		if (err)
1033			extent_io_tree_panic(tree, err);
1034
1035		set_state_bits(tree, prealloc, &bits, changeset);
1036		cache_state(prealloc, cached_state);
1037		merge_state(tree, prealloc);
1038		prealloc = NULL;
1039		goto out;
1040	}
1041
1042	goto search_again;
1043
1044out:
1045	spin_unlock(&tree->lock);
1046	if (prealloc)
1047		free_extent_state(prealloc);
1048
1049	return err;
1050
1051search_again:
1052	if (start > end)
1053		goto out;
1054	spin_unlock(&tree->lock);
1055	if (gfpflags_allow_blocking(mask))
1056		cond_resched();
1057	goto again;
1058}
1059
1060int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1061		   unsigned bits, u64 * failed_start,
1062		   struct extent_state **cached_state, gfp_t mask)
1063{
1064	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1065				cached_state, mask, NULL);
1066}
1067
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable,
 * i.e. converting from say EXTENT_DELALLOC to EXTENT_DIRTY.  This is not
 * meant to be used with boundary bits like EXTENT_LOCKED.
 */
1086int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1087		       unsigned bits, unsigned clear_bits,
1088		       struct extent_state **cached_state, gfp_t mask)
1089{
1090	struct extent_state *state;
1091	struct extent_state *prealloc = NULL;
1092	struct rb_node *node;
1093	struct rb_node **p;
1094	struct rb_node *parent;
1095	int err = 0;
1096	u64 last_start;
1097	u64 last_end;
1098	bool first_iteration = true;
1099
1100	btrfs_debug_check_extent_io_range(tree, start, end);
1101
1102again:
1103	if (!prealloc && gfpflags_allow_blocking(mask)) {
1104		/*
1105		 * Best effort, don't worry if extent state allocation fails
1106		 * here for the first iteration. We might have a cached state
1107		 * that matches exactly the target range, in which case no
1108		 * extent state allocations are needed. We'll only know this
1109		 * after locking the tree.
1110		 */
1111		prealloc = alloc_extent_state(mask);
1112		if (!prealloc && !first_iteration)
1113			return -ENOMEM;
1114	}
1115
1116	spin_lock(&tree->lock);
1117	if (cached_state && *cached_state) {
1118		state = *cached_state;
1119		if (state->start <= start && state->end > start &&
1120		    extent_state_in_tree(state)) {
1121			node = &state->rb_node;
1122			goto hit_next;
1123		}
1124	}
1125
1126	/*
1127	 * this search will find all the extents that end after
1128	 * our range starts.
1129	 */
1130	node = tree_search_for_insert(tree, start, &p, &parent);
1131	if (!node) {
1132		prealloc = alloc_extent_state_atomic(prealloc);
1133		if (!prealloc) {
1134			err = -ENOMEM;
1135			goto out;
1136		}
1137		err = insert_state(tree, prealloc, start, end,
1138				   &p, &parent, &bits, NULL);
1139		if (err)
1140			extent_io_tree_panic(tree, err);
1141		cache_state(prealloc, cached_state);
1142		prealloc = NULL;
1143		goto out;
1144	}
1145	state = rb_entry(node, struct extent_state, rb_node);
1146hit_next:
1147	last_start = state->start;
1148	last_end = state->end;
1149
1150	/*
1151	 * | ---- desired range ---- |
1152	 * | state |
1153	 *
1154	 * Just lock what we found and keep going
1155	 */
1156	if (state->start == start && state->end <= end) {
1157		set_state_bits(tree, state, &bits, NULL);
1158		cache_state(state, cached_state);
1159		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1160		if (last_end == (u64)-1)
1161			goto out;
1162		start = last_end + 1;
1163		if (start < end && state && state->start == start &&
1164		    !need_resched())
1165			goto hit_next;
1166		goto search_again;
1167	}
1168
1169	/*
1170	 *     | ---- desired range ---- |
1171	 * | state |
1172	 *   or
1173	 * | ------------- state -------------- |
1174	 *
1175	 * We need to split the extent we found, and may flip bits on
1176	 * second half.
1177	 *
1178	 * If the extent we found extends past our
1179	 * range, we just split and search again.  It'll get split
1180	 * again the next time though.
1181	 *
1182	 * If the extent we found is inside our range, we set the
1183	 * desired bit on it.
1184	 */
1185	if (state->start < start) {
1186		prealloc = alloc_extent_state_atomic(prealloc);
1187		if (!prealloc) {
1188			err = -ENOMEM;
1189			goto out;
1190		}
1191		err = split_state(tree, state, prealloc, start);
1192		if (err)
1193			extent_io_tree_panic(tree, err);
1194		prealloc = NULL;
1195		if (err)
1196			goto out;
1197		if (state->end <= end) {
1198			set_state_bits(tree, state, &bits, NULL);
1199			cache_state(state, cached_state);
1200			state = clear_state_bit(tree, state, &clear_bits, 0,
1201						NULL);
1202			if (last_end == (u64)-1)
1203				goto out;
1204			start = last_end + 1;
1205			if (start < end && state && state->start == start &&
1206			    !need_resched())
1207				goto hit_next;
1208		}
1209		goto search_again;
1210	}
1211	/*
1212	 * | ---- desired range ---- |
1213	 *     | state | or               | state |
1214	 *
1215	 * There's a hole, we need to insert something in it and
1216	 * ignore the extent we found.
1217	 */
1218	if (state->start > start) {
1219		u64 this_end;
1220		if (end < last_start)
1221			this_end = end;
1222		else
1223			this_end = last_start - 1;
1224
1225		prealloc = alloc_extent_state_atomic(prealloc);
1226		if (!prealloc) {
1227			err = -ENOMEM;
1228			goto out;
1229		}
1230
1231		/*
1232		 * Avoid to free 'prealloc' if it can be merged with
1233		 * the later extent.
1234		 */
1235		err = insert_state(tree, prealloc, start, this_end,
1236				   NULL, NULL, &bits, NULL);
1237		if (err)
1238			extent_io_tree_panic(tree, err);
1239		cache_state(prealloc, cached_state);
1240		prealloc = NULL;
1241		start = this_end + 1;
1242		goto search_again;
1243	}
1244	/*
1245	 * | ---- desired range ---- |
1246	 *                        | state |
1247	 * We need to split the extent, and set the bit
1248	 * on the first half
1249	 */
1250	if (state->start <= end && state->end > end) {
1251		prealloc = alloc_extent_state_atomic(prealloc);
1252		if (!prealloc) {
1253			err = -ENOMEM;
1254			goto out;
1255		}
1256
1257		err = split_state(tree, state, prealloc, end + 1);
1258		if (err)
1259			extent_io_tree_panic(tree, err);
1260
1261		set_state_bits(tree, prealloc, &bits, NULL);
1262		cache_state(prealloc, cached_state);
1263		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1264		prealloc = NULL;
1265		goto out;
1266	}
1267
1268	goto search_again;
1269
1270out:
1271	spin_unlock(&tree->lock);
1272	if (prealloc)
1273		free_extent_state(prealloc);
1274
1275	return err;
1276
1277search_again:
1278	if (start > end)
1279		goto out;
1280	spin_unlock(&tree->lock);
1281	if (gfpflags_allow_blocking(mask))
1282		cond_resched();
1283	first_iteration = false;
1284	goto again;
1285}
1286
1287/* wrappers around set/clear extent bit */
1288int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1289		     gfp_t mask)
1290{
1291	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1292			      NULL, mask);
1293}
1294
1295int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1296		    unsigned bits, gfp_t mask)
1297{
1298	return set_extent_bit(tree, start, end, bits, NULL,
1299			      NULL, mask);
1300}
1301
1302int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1303			   unsigned bits, gfp_t mask,
1304			   struct extent_changeset *changeset)
1305{
1306	/*
1307	 * We don't support EXTENT_LOCKED yet, as current changeset will
1308	 * record any bits changed, so for EXTENT_LOCKED case, it will
1309	 * either fail with -EEXIST or changeset will record the whole
1310	 * range.
1311	 */
1312	BUG_ON(bits & EXTENT_LOCKED);
1313
1314	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
1315				changeset);
1316}
1317
1318int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1319		     unsigned bits, int wake, int delete,
1320		     struct extent_state **cached, gfp_t mask)
1321{
1322	return __clear_extent_bit(tree, start, end, bits, wake, delete,
1323				  cached, mask, NULL);
1324}
1325
1326int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1327		      unsigned bits, gfp_t mask)
1328{
1329	int wake = 0;
1330
1331	if (bits & EXTENT_LOCKED)
1332		wake = 1;
1333
1334	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
1335}
1336
1337int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1338			     unsigned bits, gfp_t mask,
1339			     struct extent_changeset *changeset)
1340{
1341	/*
1342	 * Don't support EXTENT_LOCKED case, same reason as
1343	 * set_record_extent_bits().
1344	 */
1345	BUG_ON(bits & EXTENT_LOCKED);
1346
1347	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
1348				  changeset);
1349}
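/*
 * Sketch of how the *_record_* variants above are meant to be used.  This is
 * illustrative only (the range is hypothetical); the qgroup code is the real
 * caller and passes EXTENT_QGROUP_RESERVED:
 *
 *	struct extent_changeset changeset;
 *
 *	changeset.bytes_changed = 0;
 *	changeset.range_changed = ulist_alloc(GFP_NOFS);
 *	if (!changeset.range_changed)
 *		return -ENOMEM;
 *	ret = set_record_extent_bits(tree, start, start + len - 1,
 *				     EXTENT_QGROUP_RESERVED, GFP_NOFS,
 *				     &changeset);
 *	... inspect changeset.bytes_changed / changeset.range_changed ...
 *	ulist_free(changeset.range_changed);
 */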
1350
1351int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1352			struct extent_state **cached_state, gfp_t mask)
1353{
1354	return set_extent_bit(tree, start, end,
1355			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1356			      NULL, cached_state, mask);
1357}
1358
1359int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1360		      struct extent_state **cached_state, gfp_t mask)
1361{
1362	return set_extent_bit(tree, start, end,
1363			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1364			      NULL, cached_state, mask);
1365}
1366
1367int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1368		       gfp_t mask)
1369{
1370	return clear_extent_bit(tree, start, end,
1371				EXTENT_DIRTY | EXTENT_DELALLOC |
1372				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1373}
1374
1375int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1376		     gfp_t mask)
1377{
1378	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1379			      NULL, mask);
1380}
1381
1382int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1383			struct extent_state **cached_state, gfp_t mask)
1384{
1385	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
1386			      cached_state, mask);
1387}
1388
1389int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1390			  struct extent_state **cached_state, gfp_t mask)
1391{
1392	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1393				cached_state, mask);
1394}
1395
/*
 * either insert or lock the state struct between start and end.  Sleeps
 * (under GFP_NOFS) until any conflicting EXTENT_LOCKED range has been
 * cleared.
 */
1400int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1401		     unsigned bits, struct extent_state **cached_state)
1402{
1403	int err;
1404	u64 failed_start;
1405
1406	while (1) {
1407		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1408				       EXTENT_LOCKED, &failed_start,
1409				       cached_state, GFP_NOFS, NULL);
1410		if (err == -EEXIST) {
1411			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1412			start = failed_start;
1413		} else
1414			break;
1415		WARN_ON(start > end);
1416	}
1417	return err;
1418}
1419
1420int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1421{
1422	return lock_extent_bits(tree, start, end, 0, NULL);
1423}
1424
1425int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1426{
1427	int err;
1428	u64 failed_start;
1429
1430	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1431			       &failed_start, NULL, GFP_NOFS, NULL);
1432	if (err == -EEXIST) {
1433		if (failed_start > start)
1434			clear_extent_bit(tree, start, failed_start - 1,
1435					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1436		return 0;
1437	}
1438	return 1;
1439}
1440
1441int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1442			 struct extent_state **cached, gfp_t mask)
1443{
1444	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1445				mask);
1446}
1447
1448int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1449{
1450	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1451				GFP_NOFS);
1452}
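/*
 * Typical locking pattern around the helpers above (an illustrative sketch,
 * not taken from a real caller; 'io_tree', 'start' and 'end' are
 * placeholders):
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(io_tree, start, end, 0, &cached);
 *	... work on the now-exclusive byte range [start, end] ...
 *	unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
 */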
1453
1454int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1455{
1456	unsigned long index = start >> PAGE_CACHE_SHIFT;
1457	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1458	struct page *page;
1459
1460	while (index <= end_index) {
1461		page = find_get_page(inode->i_mapping, index);
1462		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1463		clear_page_dirty_for_io(page);
1464		page_cache_release(page);
1465		index++;
1466	}
1467	return 0;
1468}
1469
1470int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1471{
1472	unsigned long index = start >> PAGE_CACHE_SHIFT;
1473	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1474	struct page *page;
1475
1476	while (index <= end_index) {
1477		page = find_get_page(inode->i_mapping, index);
1478		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1479		__set_page_dirty_nobuffers(page);
1480		account_page_redirty(page);
1481		page_cache_release(page);
1482		index++;
1483	}
1484	return 0;
1485}
1486
1487/*
1488 * helper function to set both pages and extents in the tree writeback
1489 */
1490static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1491{
1492	unsigned long index = start >> PAGE_CACHE_SHIFT;
1493	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1494	struct page *page;
1495
1496	while (index <= end_index) {
1497		page = find_get_page(tree->mapping, index);
1498		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1499		set_page_writeback(page);
1500		page_cache_release(page);
1501		index++;
1502	}
1503	return 0;
1504}
1505
/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'.
 */
1510static struct extent_state *
1511find_first_extent_bit_state(struct extent_io_tree *tree,
1512			    u64 start, unsigned bits)
1513{
1514	struct rb_node *node;
1515	struct extent_state *state;
1516
1517	/*
1518	 * this search will find all the extents that end after
1519	 * our range starts.
1520	 */
1521	node = tree_search(tree, start);
1522	if (!node)
1523		goto out;
1524
1525	while (1) {
1526		state = rb_entry(node, struct extent_state, rb_node);
1527		if (state->end >= start && (state->state & bits))
1528			return state;
1529
1530		node = rb_next(node);
1531		if (!node)
1532			break;
1533	}
1534out:
1535	return NULL;
1536}
1537
/*
 * find the first offset in the io tree with 'bits' set.  If something is
 * found, 0 is returned and *start_ret and *end_ret are set to reflect the
 * state struct that was found; if nothing was found, 1 is returned.
 */
1545int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1546			  u64 *start_ret, u64 *end_ret, unsigned bits,
1547			  struct extent_state **cached_state)
1548{
1549	struct extent_state *state;
1550	struct rb_node *n;
1551	int ret = 1;
1552
1553	spin_lock(&tree->lock);
1554	if (cached_state && *cached_state) {
1555		state = *cached_state;
1556		if (state->end == start - 1 && extent_state_in_tree(state)) {
1557			n = rb_next(&state->rb_node);
1558			while (n) {
1559				state = rb_entry(n, struct extent_state,
1560						 rb_node);
1561				if (state->state & bits)
1562					goto got_it;
1563				n = rb_next(n);
1564			}
1565			free_extent_state(*cached_state);
1566			*cached_state = NULL;
1567			goto out;
1568		}
1569		free_extent_state(*cached_state);
1570		*cached_state = NULL;
1571	}
1572
1573	state = find_first_extent_bit_state(tree, start, bits);
1574got_it:
1575	if (state) {
1576		cache_state_if_flags(state, cached_state, 0);
1577		*start_ret = state->start;
1578		*end_ret = state->end;
1579		ret = 0;
1580	}
1581out:
1582	spin_unlock(&tree->lock);
1583	return ret;
1584}
1585
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree.
 */
1592static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1593					u64 *start, u64 *end, u64 max_bytes,
1594					struct extent_state **cached_state)
1595{
1596	struct rb_node *node;
1597	struct extent_state *state;
1598	u64 cur_start = *start;
1599	u64 found = 0;
1600	u64 total_bytes = 0;
1601
1602	spin_lock(&tree->lock);
1603
1604	/*
1605	 * this search will find all the extents that end after
1606	 * our range starts.
1607	 */
1608	node = tree_search(tree, cur_start);
1609	if (!node) {
1610		if (!found)
1611			*end = (u64)-1;
1612		goto out;
1613	}
1614
1615	while (1) {
1616		state = rb_entry(node, struct extent_state, rb_node);
1617		if (found && (state->start != cur_start ||
1618			      (state->state & EXTENT_BOUNDARY))) {
1619			goto out;
1620		}
1621		if (!(state->state & EXTENT_DELALLOC)) {
1622			if (!found)
1623				*end = state->end;
1624			goto out;
1625		}
1626		if (!found) {
1627			*start = state->start;
1628			*cached_state = state;
1629			atomic_inc(&state->refs);
1630		}
1631		found++;
1632		*end = state->end;
1633		cur_start = state->end + 1;
1634		node = rb_next(node);
1635		total_bytes += state->end - state->start + 1;
1636		if (total_bytes >= max_bytes)
1637			break;
1638		if (!node)
1639			break;
1640	}
1641out:
1642	spin_unlock(&tree->lock);
1643	return found;
1644}
1645
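/*
 * Unlock and release every page in [start, end] except 'locked_page', e.g.
 * to back out after lock_delalloc_pages() failed part way through a range.
 */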
1646static noinline void __unlock_for_delalloc(struct inode *inode,
1647					   struct page *locked_page,
1648					   u64 start, u64 end)
1649{
1650	int ret;
1651	struct page *pages[16];
1652	unsigned long index = start >> PAGE_CACHE_SHIFT;
1653	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1654	unsigned long nr_pages = end_index - index + 1;
1655	int i;
1656
1657	if (index == locked_page->index && end_index == index)
1658		return;
1659
1660	while (nr_pages > 0) {
1661		ret = find_get_pages_contig(inode->i_mapping, index,
1662				     min_t(unsigned long, nr_pages,
1663				     ARRAY_SIZE(pages)), pages);
1664		for (i = 0; i < ret; i++) {
1665			if (pages[i] != locked_page)
1666				unlock_page(pages[i]);
1667			page_cache_release(pages[i]);
1668		}
1669		nr_pages -= ret;
1670		index += ret;
1671		cond_resched();
1672	}
1673}
1674
1675static noinline int lock_delalloc_pages(struct inode *inode,
1676					struct page *locked_page,
1677					u64 delalloc_start,
1678					u64 delalloc_end)
1679{
1680	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1681	unsigned long start_index = index;
1682	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1683	unsigned long pages_locked = 0;
1684	struct page *pages[16];
1685	unsigned long nrpages;
1686	int ret;
1687	int i;
1688
1689	/* the caller is responsible for locking the start index */
1690	if (index == locked_page->index && index == end_index)
1691		return 0;
1692
1693	/* skip the page at the start index */
1694	nrpages = end_index - index + 1;
1695	while (nrpages > 0) {
1696		ret = find_get_pages_contig(inode->i_mapping, index,
1697				     min_t(unsigned long,
1698				     nrpages, ARRAY_SIZE(pages)), pages);
1699		if (ret == 0) {
1700			ret = -EAGAIN;
1701			goto done;
1702		}
1703		/* now we have an array of pages, lock them all */
1704		for (i = 0; i < ret; i++) {
1705			/*
1706			 * the caller is taking responsibility for
1707			 * locked_page
1708			 */
1709			if (pages[i] != locked_page) {
1710				lock_page(pages[i]);
1711				if (!PageDirty(pages[i]) ||
1712				    pages[i]->mapping != inode->i_mapping) {
1713					ret = -EAGAIN;
1714					unlock_page(pages[i]);
1715					page_cache_release(pages[i]);
1716					goto done;
1717				}
1718			}
1719			page_cache_release(pages[i]);
1720			pages_locked++;
1721		}
1722		nrpages -= ret;
1723		index += ret;
1724		cond_resched();
1725	}
1726	ret = 0;
1727done:
1728	if (ret && pages_locked) {
1729		__unlock_for_delalloc(inode, locked_page,
1730			      delalloc_start,
1731			      ((u64)(start_index + pages_locked - 1)) <<
1732			      PAGE_CACHE_SHIFT);
1733	}
1734	return ret;
1735}
1736
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range, and
 * both the pages and the extent state bits in that range are locked on
 * return.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree.
 */
1743STATIC u64 find_lock_delalloc_range(struct inode *inode,
1744				    struct extent_io_tree *tree,
1745				    struct page *locked_page, u64 *start,
1746				    u64 *end, u64 max_bytes)
1747{
1748	u64 delalloc_start;
1749	u64 delalloc_end;
1750	u64 found;
1751	struct extent_state *cached_state = NULL;
1752	int ret;
1753	int loops = 0;
1754
1755again:
1756	/* step one, find a bunch of delalloc bytes starting at start */
1757	delalloc_start = *start;
1758	delalloc_end = 0;
1759	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1760				    max_bytes, &cached_state);
1761	if (!found || delalloc_end <= *start) {
1762		*start = delalloc_start;
1763		*end = delalloc_end;
1764		free_extent_state(cached_state);
1765		return 0;
1766	}
1767
1768	/*
1769	 * start comes from the offset of locked_page.  We have to lock
1770	 * pages in order, so we can't process delalloc bytes before
1771	 * locked_page
1772	 */
1773	if (delalloc_start < *start)
1774		delalloc_start = *start;
1775
1776	/*
1777	 * make sure to limit the number of pages we try to lock down
1778	 */
1779	if (delalloc_end + 1 - delalloc_start > max_bytes)
1780		delalloc_end = delalloc_start + max_bytes - 1;
1781
1782	/* step two, lock all the pages after the page that has start */
1783	ret = lock_delalloc_pages(inode, locked_page,
1784				  delalloc_start, delalloc_end);
1785	if (ret == -EAGAIN) {
1786		/* some of the pages are gone, lets avoid looping by
1787		 * shortening the size of the delalloc range we're searching
1788		 */
1789		free_extent_state(cached_state);
1790		cached_state = NULL;
1791		if (!loops) {
1792			max_bytes = PAGE_CACHE_SIZE;
1793			loops = 1;
1794			goto again;
1795		} else {
1796			found = 0;
1797			goto out_failed;
1798		}
1799	}
1800	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1801
1802	/* step three, lock the state bits for the whole range */
1803	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1804
1805	/* then test to make sure it is all still delalloc */
1806	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1807			     EXTENT_DELALLOC, 1, cached_state);
1808	if (!ret) {
1809		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1810				     &cached_state, GFP_NOFS);
1811		__unlock_for_delalloc(inode, locked_page,
1812			      delalloc_start, delalloc_end);
1813		cond_resched();
1814		goto again;
1815	}
1816	free_extent_state(cached_state);
1817	*start = delalloc_start;
1818	*end = delalloc_end;
1819out_failed:
1820	return found;
1821}
1822
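/*
 * Clear the given bits on the io tree for [start, end] and then apply the
 * requested page_ops (clear dirty, set/end writeback, set error, unlock) to
 * every page in the range.  'locked_page' only has PagePrivate2 set when
 * that is requested; everything else about it is left to the caller.
 */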
1823int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1824				 struct page *locked_page,
1825				 unsigned clear_bits,
1826				 unsigned long page_ops)
1827{
1828	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1829	int ret;
1830	struct page *pages[16];
1831	unsigned long index = start >> PAGE_CACHE_SHIFT;
1832	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1833	unsigned long nr_pages = end_index - index + 1;
1834	int i;
1835
1836	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1837	if (page_ops == 0)
1838		return 0;
1839
1840	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1841		mapping_set_error(inode->i_mapping, -EIO);
1842
1843	while (nr_pages > 0) {
1844		ret = find_get_pages_contig(inode->i_mapping, index,
1845				     min_t(unsigned long,
1846				     nr_pages, ARRAY_SIZE(pages)), pages);
1847		for (i = 0; i < ret; i++) {
1848
1849			if (page_ops & PAGE_SET_PRIVATE2)
1850				SetPagePrivate2(pages[i]);
1851
1852			if (pages[i] == locked_page) {
1853				page_cache_release(pages[i]);
1854				continue;
1855			}
1856			if (page_ops & PAGE_CLEAR_DIRTY)
1857				clear_page_dirty_for_io(pages[i]);
1858			if (page_ops & PAGE_SET_WRITEBACK)
1859				set_page_writeback(pages[i]);
1860			if (page_ops & PAGE_SET_ERROR)
1861				SetPageError(pages[i]);
1862			if (page_ops & PAGE_END_WRITEBACK)
1863				end_page_writeback(pages[i]);
1864			if (page_ops & PAGE_UNLOCK)
1865				unlock_page(pages[i]);
1866			page_cache_release(pages[i]);
1867		}
1868		nr_pages -= ret;
1869		index += ret;
1870		cond_resched();
1871	}
1872	return 0;
1873}
1874
1875/*
1876 * count the number of bytes in the tree that have a given bit(s)
1877 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1878 * cached.  The total number found is returned.
1879 */
1880u64 count_range_bits(struct extent_io_tree *tree,
1881		     u64 *start, u64 search_end, u64 max_bytes,
1882		     unsigned bits, int contig)
1883{
1884	struct rb_node *node;
1885	struct extent_state *state;
1886	u64 cur_start = *start;
1887	u64 total_bytes = 0;
1888	u64 last = 0;
1889	int found = 0;
1890
1891	if (WARN_ON(search_end <= cur_start))
1892		return 0;
1893
1894	spin_lock(&tree->lock);
1895	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1896		total_bytes = tree->dirty_bytes;
1897		goto out;
1898	}
1899	/*
1900	 * this search will find all the extents that end after
1901	 * our range starts.
1902	 */
1903	node = tree_search(tree, cur_start);
1904	if (!node)
1905		goto out;
1906
1907	while (1) {
1908		state = rb_entry(node, struct extent_state, rb_node);
1909		if (state->start > search_end)
1910			break;
1911		if (contig && found && state->start > last + 1)
1912			break;
1913		if (state->end >= cur_start && (state->state & bits) == bits) {
1914			total_bytes += min(search_end, state->end) + 1 -
1915				       max(cur_start, state->start);
1916			if (total_bytes >= max_bytes)
1917				break;
1918			if (!found) {
1919				*start = max(cur_start, state->start);
1920				found = 1;
1921			}
1922			last = state->end;
1923		} else if (contig && found) {
1924			break;
1925		}
1926		node = rb_next(node);
1927		if (!node)
1928			break;
1929	}
1930out:
1931	spin_unlock(&tree->lock);
1932	return total_bytes;
1933}
1934
/*
 * set the private field for the extent_state that starts at the given byte
 * offset.  If no extent_state starts exactly there, -ENOENT is returned and
 * nothing is changed.
 */
1939static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1940{
1941	struct rb_node *node;
1942	struct extent_state *state;
1943	int ret = 0;
1944
1945	spin_lock(&tree->lock);
1946	/*
1947	 * this search will find all the extents that end after
1948	 * our range starts.
1949	 */
1950	node = tree_search(tree, start);
1951	if (!node) {
1952		ret = -ENOENT;
1953		goto out;
1954	}
1955	state = rb_entry(node, struct extent_state, rb_node);
1956	if (state->start != start) {
1957		ret = -ENOENT;
1958		goto out;
1959	}
1960	state->private = private;
1961out:
1962	spin_unlock(&tree->lock);
1963	return ret;
1964}
1965
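/*
 * Fetch the private value stored for the extent_state that starts exactly at
 * 'start'.  Returns -ENOENT if no such state exists.
 */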
1966int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1967{
1968	struct rb_node *node;
1969	struct extent_state *state;
1970	int ret = 0;
1971
1972	spin_lock(&tree->lock);
1973	/*
1974	 * this search will find all the extents that end after
1975	 * our range starts.
1976	 */
1977	node = tree_search(tree, start);
1978	if (!node) {
1979		ret = -ENOENT;
1980		goto out;
1981	}
1982	state = rb_entry(node, struct extent_state, rb_node);
1983	if (state->start != start) {
1984		ret = -ENOENT;
1985		goto out;
1986	}
1987	*private = state->private;
1988out:
1989	spin_unlock(&tree->lock);
1990	return ret;
1991}
1992
1993/*
1994 * searches a range in the state tree for a given mask.
1995 * If 'filled' == 1, this returns 1 only if the whole range is covered by
1996 * extents that have the bits set.  Otherwise, 1 is returned if any bit in the
1997 * range is found set.
1998 */
1999int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2000		   unsigned bits, int filled, struct extent_state *cached)
2001{
2002	struct extent_state *state = NULL;
2003	struct rb_node *node;
2004	int bitset = 0;
2005
2006	spin_lock(&tree->lock);
2007	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2008	    cached->end > start)
2009		node = &cached->rb_node;
2010	else
2011		node = tree_search(tree, start);
2012	while (node && start <= end) {
2013		state = rb_entry(node, struct extent_state, rb_node);
2014
2015		if (filled && state->start > start) {
2016			bitset = 0;
2017			break;
2018		}
2019
2020		if (state->start > end)
2021			break;
2022
2023		if (state->state & bits) {
2024			bitset = 1;
2025			if (!filled)
2026				break;
2027		} else if (filled) {
2028			bitset = 0;
2029			break;
2030		}
2031
2032		if (state->end == (u64)-1)
2033			break;
2034
2035		start = state->end + 1;
2036		if (start > end)
2037			break;
2038		node = rb_next(node);
2039		if (!node) {
2040			if (filled)
2041				bitset = 0;
2042			break;
2043		}
2044	}
2045	spin_unlock(&tree->lock);
2046	return bitset;
2047}
2048
2049/*
2050 * helper function to set a given page up to date if all the
2051 * extents in the tree for that page are up to date
2052 */
2053static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
2054{
2055	u64 start = page_offset(page);
2056	u64 end = start + PAGE_CACHE_SIZE - 1;
2057	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
2058		SetPageUptodate(page);
2059}
2060
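/*
 * tear down an io_failure_record: clear its bits and private data from the
 * failure tree, drop EXTENT_DAMAGED from the inode's io tree and free the
 * record.  Returns the first error hit while clearing, if any.
 */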
2061int free_io_failure(struct inode *inode, struct io_failure_record *rec)
2062{
2063	int ret;
2064	int err = 0;
2065	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2066
2067	set_state_private(failure_tree, rec->start, 0);
2068	ret = clear_extent_bits(failure_tree, rec->start,
2069				rec->start + rec->len - 1,
2070				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2071	if (ret)
2072		err = ret;
2073
2074	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
2075				rec->start + rec->len - 1,
2076				EXTENT_DAMAGED, GFP_NOFS);
2077	if (ret && !err)
2078		err = ret;
2079
2080	kfree(rec);
2081	return err;
2082}
2083
2084/*
2085 * this bypasses the standard btrfs submit functions deliberately, as
2086 * the standard behavior is to write all copies in a raid setup. here we only
2087 * want to write the one bad copy. so we do the mapping for ourselves and issue
2088 * submit_bio directly.
2089 * to avoid any synchronization issues, wait for the data after writing, which
2090 * actually prevents the read that triggered the error from finishing.
2091 * currently, there can be no more than two copies of every data bit. thus,
2092 * exactly one rewrite is required.
2093 */
2094int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2095		      struct page *page, unsigned int pg_offset, int mirror_num)
2096{
2097	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2098	struct bio *bio;
2099	struct btrfs_device *dev;
2100	u64 map_length = 0;
2101	u64 sector;
2102	struct btrfs_bio *bbio = NULL;
2103	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
2104	int ret;
2105
2106	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
2107	BUG_ON(!mirror_num);
2108
2109	/* we can't repair anything in raid56 yet */
2110	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
2111		return 0;
2112
2113	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2114	if (!bio)
2115		return -EIO;
2116	bio->bi_iter.bi_size = 0;
2117	map_length = length;
2118
2119	ret = btrfs_map_block(fs_info, WRITE, logical,
2120			      &map_length, &bbio, mirror_num);
2121	if (ret) {
2122		bio_put(bio);
2123		return -EIO;
2124	}
2125	BUG_ON(mirror_num != bbio->mirror_num);
2126	sector = bbio->stripes[mirror_num-1].physical >> 9;
2127	bio->bi_iter.bi_sector = sector;
2128	dev = bbio->stripes[mirror_num-1].dev;
2129	btrfs_put_bbio(bbio);
2130	if (!dev || !dev->bdev || !dev->writeable) {
2131		bio_put(bio);
2132		return -EIO;
2133	}
2134	bio->bi_bdev = dev->bdev;
2135	bio_add_page(bio, page, length, pg_offset);
2136
2137	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
2138		/* try to remap that extent elsewhere? */
2139		bio_put(bio);
2140		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2141		return -EIO;
2142	}
2143
2144	btrfs_info_rl_in_rcu(fs_info,
2145		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
2146				  btrfs_ino(inode), start,
2147				  rcu_str_deref(dev->name), sector);
2148	bio_put(bio);
2149	return 0;
2150}
2151
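/*
 * rewrite every page of a metadata extent buffer from the in-memory copy,
 * one page at a time, via repair_io_failure().  Refuses to run on a
 * read-only filesystem.
 */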
2152int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2153			 int mirror_num)
2154{
2155	u64 start = eb->start;
2156	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2157	int ret = 0;
2158
2159	if (root->fs_info->sb->s_flags & MS_RDONLY)
2160		return -EROFS;
2161
2162	for (i = 0; i < num_pages; i++) {
2163		struct page *p = eb->pages[i];
2164
2165		ret = repair_io_failure(root->fs_info->btree_inode, start,
2166					PAGE_CACHE_SIZE, start, p,
2167					start - page_offset(p), mirror_num);
2168		if (ret)
2169			break;
2170		start += PAGE_CACHE_SIZE;
2171	}
2172
2173	return ret;
2174}
2175
2176/*
2177 * each time an IO finishes, we do a fast check in the IO failure tree
2178 * to see if we need to process or clean up an io_failure_record
2179 */
2180int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2181		     unsigned int pg_offset)
2182{
2183	u64 private;
2184	u64 private_failure;
2185	struct io_failure_record *failrec;
2186	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2187	struct extent_state *state;
2188	int num_copies;
2189	int ret;
2190
2191	private = 0;
2192	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2193				(u64)-1, 1, EXTENT_DIRTY, 0);
2194	if (!ret)
2195		return 0;
2196
2197	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2198				&private_failure);
2199	if (ret)
2200		return 0;
2201
2202	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2203	BUG_ON(!failrec->this_mirror);
2204
2205	if (failrec->in_validation) {
2206		/* there was no real error, just free the record */
2207		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2208			 failrec->start);
2209		goto out;
2210	}
2211	if (fs_info->sb->s_flags & MS_RDONLY)
2212		goto out;
2213
2214	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2215	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2216					    failrec->start,
2217					    EXTENT_LOCKED);
2218	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2219
2220	if (state && state->start <= failrec->start &&
2221	    state->end >= failrec->start + failrec->len - 1) {
2222		num_copies = btrfs_num_copies(fs_info, failrec->logical,
2223					      failrec->len);
2224		if (num_copies > 1)  {
2225			repair_io_failure(inode, start, failrec->len,
2226					  failrec->logical, page,
2227					  pg_offset, failrec->failed_mirror);
2228		}
2229	}
2230
2231out:
2232	free_io_failure(inode, failrec);
2233
2234	return 0;
2235}
2236
2237/*
2238 * Can be called when
2239 * - holding the extent lock
2240 * - under an ordered extent
2241 * - the inode is being freed
2242 */
2243void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2244{
2245	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2246	struct io_failure_record *failrec;
2247	struct extent_state *state, *next;
2248
2249	if (RB_EMPTY_ROOT(&failure_tree->state))
2250		return;
2251
2252	spin_lock(&failure_tree->lock);
2253	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2254	while (state) {
2255		if (state->start > end)
2256			break;
2257
2258		ASSERT(state->end <= end);
2259
2260		next = next_state(state);
2261
2262		failrec = (struct io_failure_record *)(unsigned long)state->private;
2263		free_extent_state(state);
2264		kfree(failrec);
2265
2266		state = next;
2267	}
2268	spin_unlock(&failure_tree->lock);
2269}
2270
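/*
 * look up the io_failure_record for a range, or create one on the first
 * failure: the new record is stored via the failure tree's private field
 * and the range is tagged EXTENT_LOCKED | EXTENT_DIRTY there and
 * EXTENT_DAMAGED in the inode's io tree.
 */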
2271int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2272				struct io_failure_record **failrec_ret)
2273{
2274	struct io_failure_record *failrec;
2275	u64 private;
2276	struct extent_map *em;
2277	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2278	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2279	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2280	int ret;
2281	u64 logical;
2282
2283	ret = get_state_private(failure_tree, start, &private);
2284	if (ret) {
2285		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2286		if (!failrec)
2287			return -ENOMEM;
2288
2289		failrec->start = start;
2290		failrec->len = end - start + 1;
2291		failrec->this_mirror = 0;
2292		failrec->bio_flags = 0;
2293		failrec->in_validation = 0;
2294
2295		read_lock(&em_tree->lock);
2296		em = lookup_extent_mapping(em_tree, start, failrec->len);
2297		if (!em) {
2298			read_unlock(&em_tree->lock);
2299			kfree(failrec);
2300			return -EIO;
2301		}
2302
2303		if (em->start > start || em->start + em->len <= start) {
2304			free_extent_map(em);
2305			em = NULL;
2306		}
2307		read_unlock(&em_tree->lock);
2308		if (!em) {
2309			kfree(failrec);
2310			return -EIO;
2311		}
2312
2313		logical = start - em->start;
2314		logical = em->block_start + logical;
2315		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2316			logical = em->block_start;
2317			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2318			extent_set_compress_type(&failrec->bio_flags,
2319						 em->compress_type);
2320		}
2321
2322		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
2323			 logical, start, failrec->len);
2324
2325		failrec->logical = logical;
2326		free_extent_map(em);
2327
2328		/* set the bits in the private failure tree */
2329		ret = set_extent_bits(failure_tree, start, end,
2330					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2331		if (ret >= 0)
2332			ret = set_state_private(failure_tree, start,
2333						(u64)(unsigned long)failrec);
2334		/* set the bits in the inode's tree */
2335		if (ret >= 0)
2336			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2337						GFP_NOFS);
2338		if (ret < 0) {
2339			kfree(failrec);
2340			return ret;
2341		}
2342	} else {
2343		failrec = (struct io_failure_record *)(unsigned long)private;
2344		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2345			 failrec->logical, failrec->start, failrec->len,
2346			 failrec->in_validation);
2347		/*
2348		 * when data can be on disk more than twice, add to failrec here
2349		 * (e.g. with a list for failed_mirror) to make
2350		 * clean_io_failure() clean all those errors at once.
2351		 */
2352	}
2353
2354	*failrec_ret = failrec;
2355
2356	return 0;
2357}
2358
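/*
 * decide whether a failed read is worth retrying from another copy and, if
 * so, pick the next mirror to read from.  Returns 1 when a retry should be
 * submitted and 0 when the error cannot be repaired.
 */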
2359int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
2360			   struct io_failure_record *failrec, int failed_mirror)
2361{
2362	int num_copies;
2363
2364	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2365				      failrec->logical, failrec->len);
2366	if (num_copies == 1) {
2367		/*
2368		 * we only have a single copy of the data, so don't bother with
2369		 * all the retry and error correction code that follows. no
2370		 * matter what the error is, it is very likely to persist.
2371		 */
2372		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
2373			 num_copies, failrec->this_mirror, failed_mirror);
2374		return 0;
2375	}
2376
2377	/*
2378	 * there are two premises:
2379	 *	a) deliver good data to the caller
2380	 *	b) correct the bad sectors on disk
2381	 */
2382	if (failed_bio->bi_vcnt > 1) {
2383		/*
2384		 * to fulfill b), we need to know the exact failing sectors, as
2385		 * we don't want to rewrite any more than the failed ones. thus,
2386		 * we need separate read requests for the failed bio
2387		 *
2388		 * if the following BUG_ON triggers, our validation request got
2389		 * merged. we need separate requests for our algorithm to work.
2390		 */
2391		BUG_ON(failrec->in_validation);
2392		failrec->in_validation = 1;
2393		failrec->this_mirror = failed_mirror;
2394	} else {
2395		/*
2396		 * we're ready to fulfill a) and b) at the same time. get a good copy
2397		 * of the failed sector and if we succeed, we have setup
2398		 * everything for repair_io_failure to do the rest for us.
2399		 */
2400		if (failrec->in_validation) {
2401			BUG_ON(failrec->this_mirror != failed_mirror);
2402			failrec->in_validation = 0;
2403			failrec->this_mirror = 0;
2404		}
2405		failrec->failed_mirror = failed_mirror;
2406		failrec->this_mirror++;
2407		if (failrec->this_mirror == failed_mirror)
2408			failrec->this_mirror++;
2409	}
2410
2411	if (failrec->this_mirror > num_copies) {
2412		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
2413			 num_copies, failrec->this_mirror, failed_mirror);
2414		return 0;
2415	}
2416
2417	return 1;
2418}
2419
2420
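/*
 * build a single-vec bio aimed at the failed range described by failrec,
 * copying over the checksum of the one block being retried (when the
 * original bio carried checksums) so the end_io hook can verify it.
 */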
2421struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2422				    struct io_failure_record *failrec,
2423				    struct page *page, int pg_offset, int icsum,
2424				    bio_end_io_t *endio_func, void *data)
2425{
2426	struct bio *bio;
2427	struct btrfs_io_bio *btrfs_failed_bio;
2428	struct btrfs_io_bio *btrfs_bio;
2429
2430	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2431	if (!bio)
2432		return NULL;
2433
2434	bio->bi_end_io = endio_func;
2435	bio->bi_iter.bi_sector = failrec->logical >> 9;
2436	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2437	bio->bi_iter.bi_size = 0;
2438	bio->bi_private = data;
2439
2440	btrfs_failed_bio = btrfs_io_bio(failed_bio);
2441	if (btrfs_failed_bio->csum) {
2442		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2443		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2444
2445		btrfs_bio = btrfs_io_bio(bio);
2446		btrfs_bio->csum = btrfs_bio->csum_inline;
2447		icsum *= csum_size;
2448		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2449		       csum_size);
2450	}
2451
2452	bio_add_page(bio, page, failrec->len, pg_offset);
2453
2454	return bio;
2455}
2456
2457/*
2458 * this is a generic handler for readpage errors (default
2459 * readpage_io_failed_hook). if other copies exist, read those and write back
2460 * good data to the failed position. it makes no attempt to remap the
2461 * failed extent elsewhere, hoping the device will be smart enough to do this as
2462 * needed
2463 */
2464
2465static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2466			      struct page *page, u64 start, u64 end,
2467			      int failed_mirror)
2468{
2469	struct io_failure_record *failrec;
2470	struct inode *inode = page->mapping->host;
2471	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2472	struct bio *bio;
2473	int read_mode;
2474	int ret;
2475
2476	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2477
2478	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2479	if (ret)
2480		return ret;
2481
2482	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
2483	if (!ret) {
2484		free_io_failure(inode, failrec);
2485		return -EIO;
2486	}
2487
2488	if (failed_bio->bi_vcnt > 1)
2489		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2490	else
2491		read_mode = READ_SYNC;
2492
2493	phy_offset >>= inode->i_sb->s_blocksize_bits;
2494	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2495				      start - page_offset(page),
2496				      (int)phy_offset, failed_bio->bi_end_io,
2497				      NULL);
2498	if (!bio) {
2499		free_io_failure(inode, failrec);
2500		return -EIO;
2501	}
2502
2503	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
2504		 read_mode, failrec->this_mirror, failrec->in_validation);
2505
2506	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2507					 failrec->this_mirror,
2508					 failrec->bio_flags, 0);
2509	if (ret) {
2510		free_io_failure(inode, failrec);
2511		bio_put(bio);
2512	}
2513
2514	return ret;
2515}
2516
2517/* lots and lots of room for performance fixes in the end_bio funcs */
2518
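/*
 * run the writepage_end_io_hook for a range and, on failure, mark the page
 * and its mapping with the error.  Always returns 0 so the caller still
 * ends writeback on the page.
 */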
2519int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2520{
2521	int uptodate = (err == 0);
2522	struct extent_io_tree *tree;
2523	int ret = 0;
2524
2525	tree = &BTRFS_I(page->mapping->host)->io_tree;
2526
2527	if (tree->ops && tree->ops->writepage_end_io_hook) {
2528		ret = tree->ops->writepage_end_io_hook(page, start,
2529					       end, NULL, uptodate);
2530		if (ret)
2531			uptodate = 0;
2532	}
2533
2534	if (!uptodate) {
2535		ClearPageUptodate(page);
2536		SetPageError(page);
2537		ret = ret < 0 ? ret : -EIO;
2538		mapping_set_error(page->mapping, ret);
2539	}
2540	return 0;
2541}
2542
2543/*
2544 * after a writepage IO is done, we need to:
2545 * clear the uptodate bits on error
2546 * clear the writeback bits in the extent tree for this IO
2547 * end_page_writeback if the page has no more pending IO
2548 *
2549 * Scheduling is not allowed, so the extent state tree is expected
2550 * to have one and only one object corresponding to this IO.
2551 */
2552static void end_bio_extent_writepage(struct bio *bio)
2553{
2554	struct bio_vec *bvec;
2555	u64 start;
2556	u64 end;
2557	int i;
2558
2559	bio_for_each_segment_all(bvec, bio, i) {
2560		struct page *page = bvec->bv_page;
2561
2562		/* We always issue full-page writes, but if some block
2563		 * in a page fails to be written, blk_update_request() will
2564		 * advance bv_offset and adjust bv_len to compensate.
2565		 * Print a warning for nonzero offsets, and an error
2566		 * if they don't add up to a full page.  */
2567		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2568			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2569				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2570				   "partial page write in btrfs with offset %u and length %u",
2571					bvec->bv_offset, bvec->bv_len);
2572			else
2573				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2574				   "incomplete page write in btrfs with offset %u and "
2575				   "length %u",
2576					bvec->bv_offset, bvec->bv_len);
2577		}
2578
2579		start = page_offset(page);
2580		end = start + bvec->bv_offset + bvec->bv_len - 1;
2581
2582		if (end_extent_writepage(page, bio->bi_error, start, end))
2583			continue;
2584
2585		end_page_writeback(page);
2586	}
2587
2588	bio_put(bio);
2589}
2590
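/*
 * mark a contiguous range uptodate in the io tree (when the read succeeded
 * and the tree tracks uptodate state) and unlock it.
 */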
2591static void
2592endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2593			      int uptodate)
2594{
2595	struct extent_state *cached = NULL;
2596	u64 end = start + len - 1;
2597
2598	if (uptodate && tree->track_uptodate)
2599		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2600	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2601}
2602
2603/*
2604 * after a readpage IO is done, we need to:
2605 * clear the uptodate bits on error
2606 * set the uptodate bits if things worked
2607 * set the page up to date if all extents in the tree are uptodate
2608 * clear the lock bit in the extent tree
2609 * unlock the page if there are no other extents locked for it
2610 *
2611 * Scheduling is not allowed, so the extent state tree is expected
2612 * to have one and only one object corresponding to this IO.
2613 */
2614static void end_bio_extent_readpage(struct bio *bio)
2615{
2616	struct bio_vec *bvec;
2617	int uptodate = !bio->bi_error;
2618	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2619	struct extent_io_tree *tree;
2620	u64 offset = 0;
2621	u64 start;
2622	u64 end;
2623	u64 len;
2624	u64 extent_start = 0;
2625	u64 extent_len = 0;
2626	int mirror;
2627	int ret;
2628	int i;
2629
2630	bio_for_each_segment_all(bvec, bio, i) {
2631		struct page *page = bvec->bv_page;
2632		struct inode *inode = page->mapping->host;
2633
2634		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2635			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
2636			 bio->bi_error, io_bio->mirror_num);
2637		tree = &BTRFS_I(inode)->io_tree;
2638
2639		/* We always issue full-page reads, but if some block
2640		 * in a page fails to read, blk_update_request() will
2641		 * advance bv_offset and adjust bv_len to compensate.
2642		 * Print a warning for nonzero offsets, and an error
2643		 * if they don't add up to a full page.  */
2644		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2645			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2646				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2647				   "partial page read in btrfs with offset %u and length %u",
2648					bvec->bv_offset, bvec->bv_len);
2649			else
2650				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2651				   "incomplete page read in btrfs with offset %u and "
2652				   "length %u",
2653					bvec->bv_offset, bvec->bv_len);
2654		}
2655
2656		start = page_offset(page);
2657		end = start + bvec->bv_offset + bvec->bv_len - 1;
2658		len = bvec->bv_len;
2659
2660		mirror = io_bio->mirror_num;
2661		if (likely(uptodate && tree->ops &&
2662			   tree->ops->readpage_end_io_hook)) {
2663			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2664							      page, start, end,
2665							      mirror);
2666			if (ret)
2667				uptodate = 0;
2668			else
2669				clean_io_failure(inode, start, page, 0);
2670		}
2671
2672		if (likely(uptodate))
2673			goto readpage_ok;
2674
2675		if (tree->ops && tree->ops->readpage_io_failed_hook) {
2676			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2677			if (!ret && !bio->bi_error)
2678				uptodate = 1;
2679		} else {
2680			/*
2681			 * The generic bio_readpage_error handles errors the
2682			 * following way: If possible, new read requests are
2683			 * created and submitted and will end up in
2684			 * end_bio_extent_readpage as well (if we're lucky, not
2685			 * in the !uptodate case). In that case it returns 0 and
2686			 * we just go on with the next page in our bio. If it
2687			 * can't handle the error it will return -EIO and we
2688			 * remain responsible for that page.
2689			 */
2690			ret = bio_readpage_error(bio, offset, page, start, end,
2691						 mirror);
2692			if (ret == 0) {
2693				uptodate = !bio->bi_error;
2694				offset += len;
2695				continue;
2696			}
2697		}
2698readpage_ok:
2699		if (likely(uptodate)) {
2700			loff_t i_size = i_size_read(inode);
2701			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2702			unsigned off;
2703
2704			/* Zero out the end if this page straddles i_size */
2705			off = i_size & (PAGE_CACHE_SIZE-1);
2706			if (page->index == end_index && off)
2707				zero_user_segment(page, off, PAGE_CACHE_SIZE);
2708			SetPageUptodate(page);
2709		} else {
2710			ClearPageUptodate(page);
2711			SetPageError(page);
2712		}
2713		unlock_page(page);
2714		offset += len;
2715
2716		if (unlikely(!uptodate)) {
2717			if (extent_len) {
2718				endio_readpage_release_extent(tree,
2719							      extent_start,
2720							      extent_len, 1);
2721				extent_start = 0;
2722				extent_len = 0;
2723			}
2724			endio_readpage_release_extent(tree, start,
2725						      end - start + 1, 0);
2726		} else if (!extent_len) {
2727			extent_start = start;
2728			extent_len = end + 1 - start;
2729		} else if (extent_start + extent_len == start) {
2730			extent_len += end + 1 - start;
2731		} else {
2732			endio_readpage_release_extent(tree, extent_start,
2733						      extent_len, uptodate);
2734			extent_start = start;
2735			extent_len = end + 1 - start;
2736		}
2737	}
2738
2739	if (extent_len)
2740		endio_readpage_release_extent(tree, extent_start, extent_len,
2741					      uptodate);
2742	if (io_bio->end_io)
2743		io_bio->end_io(io_bio, bio->bi_error);
2744	bio_put(bio);
2745}
2746
2747/*
2748 * this allocates from the btrfs_bioset.  We're returning a bio right now
2749 * but you can call btrfs_io_bio for the appropriate container_of magic
2750 */
2751struct bio *
2752btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2753		gfp_t gfp_flags)
2754{
2755	struct btrfs_io_bio *btrfs_bio;
2756	struct bio *bio;
2757
2758	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
2759
2760	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2761		while (!bio && (nr_vecs /= 2)) {
2762			bio = bio_alloc_bioset(gfp_flags,
2763					       nr_vecs, btrfs_bioset);
2764		}
2765	}
2766
2767	if (bio) {
2768		bio->bi_bdev = bdev;
2769		bio->bi_iter.bi_sector = first_sector;
2770		btrfs_bio = btrfs_io_bio(bio);
2771		btrfs_bio->csum = NULL;
2772		btrfs_bio->csum_allocated = NULL;
2773		btrfs_bio->end_io = NULL;
2774	}
2775	return bio;
2776}
2777
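/* clone a bio from the btrfs_bioset and reset its btrfs_io_bio fields */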
2778struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
2779{
2780	struct btrfs_io_bio *btrfs_bio;
2781	struct bio *new;
2782
2783	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
2784	if (new) {
2785		btrfs_bio = btrfs_io_bio(new);
2786		btrfs_bio->csum = NULL;
2787		btrfs_bio->csum_allocated = NULL;
2788		btrfs_bio->end_io = NULL;
2789
2790#ifdef CONFIG_BLK_CGROUP
2791		/* FIXME, put this into bio_clone_bioset */
2792		if (bio->bi_css)
2793			bio_associate_blkcg(new, bio->bi_css);
2794#endif
2795	}
2796	return new;
2797}
2798
2799/* this also allocates from the btrfs_bioset */
2800struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
2801{
2802	struct btrfs_io_bio *btrfs_bio;
2803	struct bio *bio;
2804
2805	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
2806	if (bio) {
2807		btrfs_bio = btrfs_io_bio(bio);
2808		btrfs_bio->csum = NULL;
2809		btrfs_bio->csum_allocated = NULL;
2810		btrfs_bio->end_io = NULL;
2811	}
2812	return bio;
2813}
2814
2815
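/*
 * hand a fully built bio to the tree's submit_bio_hook, or straight to
 * btrfsic_submit_bio() if no hook is registered.
 */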
2816static int __must_check submit_one_bio(int rw, struct bio *bio,
2817				       int mirror_num, unsigned long bio_flags)
2818{
2819	int ret = 0;
2820	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2821	struct page *page = bvec->bv_page;
2822	struct extent_io_tree *tree = bio->bi_private;
2823	u64 start;
2824
2825	start = page_offset(page) + bvec->bv_offset;
2826
2827	bio->bi_private = NULL;
2828
2829	bio_get(bio);
2830
2831	if (tree->ops && tree->ops->submit_bio_hook)
2832		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2833					   mirror_num, bio_flags, start);
2834	else
2835		btrfsic_submit_bio(rw, bio);
2836
2837	bio_put(bio);
2838	return ret;
2839}
2840
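/*
 * ask the fs whether this page may be added to the bio being built.  A
 * non-zero return means the caller must submit the current bio and start
 * a new one.
 */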
2841static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2842		     unsigned long offset, size_t size, struct bio *bio,
2843		     unsigned long bio_flags)
2844{
2845	int ret = 0;
2846	if (tree->ops && tree->ops->merge_bio_hook)
2847		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2848						bio_flags);
2849	BUG_ON(ret < 0);
2850	return ret;
2851
2852}
2853
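/*
 * add a page to the bio being built in *bio_ret.  If the page is not
 * contiguous with that bio, the bio flags differ, the merge hook refuses
 * it, bio_add_page() fails or force_bio_submit is set, the old bio is
 * submitted first and a fresh one is started for this page.
 */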
2854static int submit_extent_page(int rw, struct extent_io_tree *tree,
2855			      struct writeback_control *wbc,
2856			      struct page *page, sector_t sector,
2857			      size_t size, unsigned long offset,
2858			      struct block_device *bdev,
2859			      struct bio **bio_ret,
2860			      unsigned long max_pages,
2861			      bio_end_io_t end_io_func,
2862			      int mirror_num,
2863			      unsigned long prev_bio_flags,
2864			      unsigned long bio_flags,
2865			      bool force_bio_submit)
2866{
2867	int ret = 0;
2868	struct bio *bio;
2869	int contig = 0;
2870	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2871	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2872
2873	if (bio_ret && *bio_ret) {
2874		bio = *bio_ret;
2875		if (old_compressed)
2876			contig = bio->bi_iter.bi_sector == sector;
2877		else
2878			contig = bio_end_sector(bio) == sector;
2879
2880		if (prev_bio_flags != bio_flags || !contig ||
2881		    force_bio_submit ||
2882		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2883		    bio_add_page(bio, page, page_size, offset) < page_size) {
2884			ret = submit_one_bio(rw, bio, mirror_num,
2885					     prev_bio_flags);
2886			if (ret < 0) {
2887				*bio_ret = NULL;
2888				return ret;
2889			}
2890			bio = NULL;
2891		} else {
2892			if (wbc)
2893				wbc_account_io(wbc, page, page_size);
2894			return 0;
2895		}
2896	}
2897
2898	bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
2899			GFP_NOFS | __GFP_HIGH);
2900	if (!bio)
2901		return -ENOMEM;
2902
2903	bio_add_page(bio, page, page_size, offset);
2904	bio->bi_end_io = end_io_func;
2905	bio->bi_private = tree;
2906	if (wbc) {
2907		wbc_init_bio(wbc, bio);
2908		wbc_account_io(wbc, page, page_size);
2909	}
2910
2911	if (bio_ret)
2912		*bio_ret = bio;
2913	else
2914		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2915
2916	return ret;
2917}
2918
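/*
 * tie an extent buffer to a page via page->private, taking an extra page
 * reference the first time around.
 */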
2919static void attach_extent_buffer_page(struct extent_buffer *eb,
2920				      struct page *page)
2921{
2922	if (!PagePrivate(page)) {
2923		SetPagePrivate(page);
2924		page_cache_get(page);
2925		set_page_private(page, (unsigned long)eb);
2926	} else {
2927		WARN_ON(page->private != (unsigned long)eb);
2928	}
2929}
2930
2931void set_page_extent_mapped(struct page *page)
2932{
2933	if (!PagePrivate(page)) {
2934		SetPagePrivate(page);
2935		page_cache_get(page);
2936		set_page_private(page, EXTENT_PAGE_PRIVATE);
2937	}
2938}
2939
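/*
 * return the cached extent map if it still covers 'start', otherwise drop
 * it and ask get_extent() for a fresh mapping, caching the result.
 */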
2940static struct extent_map *
2941__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2942		 u64 start, u64 len, get_extent_t *get_extent,
2943		 struct extent_map **em_cached)
2944{
2945	struct extent_map *em;
2946
2947	if (em_cached && *em_cached) {
2948		em = *em_cached;
2949		if (extent_map_in_tree(em) && start >= em->start &&
2950		    start < extent_map_end(em)) {
2951			atomic_inc(&em->refs);
2952			return em;
2953		}
2954
2955		free_extent_map(em);
2956		*em_cached = NULL;
2957	}
2958
2959	em = get_extent(inode, page, pg_offset, start, len, 0);
2960	if (em_cached && !IS_ERR_OR_NULL(em)) {
2961		BUG_ON(*em_cached);
2962		atomic_inc(&em->refs);
2963		*em_cached = em;
2964	}
2965	return em;
2966}
2967/*
2968 * basic readpage implementation.  Locked extent state structs are inserted
2969 * into the tree and are removed when the IO is done (by the end_io
2970 * handlers)
2971 * XXX JDM: This needs looking at to ensure proper page locking
2972 */
2973static int __do_readpage(struct extent_io_tree *tree,
2974			 struct page *page,
2975			 get_extent_t *get_extent,
2976			 struct extent_map **em_cached,
2977			 struct bio **bio, int mirror_num,
2978			 unsigned long *bio_flags, int rw,
2979			 u64 *prev_em_start)
2980{
2981	struct inode *inode = page->mapping->host;
2982	u64 start = page_offset(page);
2983	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2984	u64 end;
2985	u64 cur = start;
2986	u64 extent_offset;
2987	u64 last_byte = i_size_read(inode);
2988	u64 block_start;
2989	u64 cur_end;
2990	sector_t sector;
2991	struct extent_map *em;
2992	struct block_device *bdev;
2993	int ret;
2994	int nr = 0;
2995	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
2996	size_t pg_offset = 0;
2997	size_t iosize;
2998	size_t disk_io_size;
2999	size_t blocksize = inode->i_sb->s_blocksize;
3000	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
3001
3002	set_page_extent_mapped(page);
3003
3004	end = page_end;
3005	if (!PageUptodate(page)) {
3006		if (cleancache_get_page(page) == 0) {
3007			BUG_ON(blocksize != PAGE_SIZE);
3008			unlock_extent(tree, start, end);
3009			goto out;
3010		}
3011	}
3012
3013	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
3014		char *userpage;
3015		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
3016
3017		if (zero_offset) {
3018			iosize = PAGE_CACHE_SIZE - zero_offset;
3019			userpage = kmap_atomic(page);
3020			memset(userpage + zero_offset, 0, iosize);
3021			flush_dcache_page(page);
3022			kunmap_atomic(userpage);
3023		}
3024	}
3025	while (cur <= end) {
3026		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
3027		bool force_bio_submit = false;
3028
3029		if (cur >= last_byte) {
3030			char *userpage;
3031			struct extent_state *cached = NULL;
3032
3033			iosize = PAGE_CACHE_SIZE - pg_offset;
3034			userpage = kmap_atomic(page);
3035			memset(userpage + pg_offset, 0, iosize);
3036			flush_dcache_page(page);
3037			kunmap_atomic(userpage);
3038			set_extent_uptodate(tree, cur, cur + iosize - 1,
3039					    &cached, GFP_NOFS);
3040			if (!parent_locked)
3041				unlock_extent_cached(tree, cur,
3042						     cur + iosize - 1,
3043						     &cached, GFP_NOFS);
3044			break;
3045		}
3046		em = __get_extent_map(inode, page, pg_offset, cur,
3047				      end - cur + 1, get_extent, em_cached);
3048		if (IS_ERR_OR_NULL(em)) {
3049			SetPageError(page);
3050			if (!parent_locked)
3051				unlock_extent(tree, cur, end);
3052			break;
3053		}
3054		extent_offset = cur - em->start;
3055		BUG_ON(extent_map_end(em) <= cur);
3056		BUG_ON(end < cur);
3057
3058		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3059			this_bio_flag |= EXTENT_BIO_COMPRESSED;
3060			extent_set_compress_type(&this_bio_flag,
3061						 em->compress_type);
3062		}
3063
3064		iosize = min(extent_map_end(em) - cur, end - cur + 1);
3065		cur_end = min(extent_map_end(em) - 1, end);
3066		iosize = ALIGN(iosize, blocksize);
3067		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3068			disk_io_size = em->block_len;
3069			sector = em->block_start >> 9;
3070		} else {
3071			sector = (em->block_start + extent_offset) >> 9;
3072			disk_io_size = iosize;
3073		}
3074		bdev = em->bdev;
3075		block_start = em->block_start;
3076		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3077			block_start = EXTENT_MAP_HOLE;
3078
3079		/*
3080		 * If we have a file range that points to a compressed extent
3081		 * and it's followed by a consecutive file range that points
3082		 * to the same compressed extent (possibly with a different
3083		 * offset and/or length, so it either points to the whole extent
3084		 * or only part of it), we must make sure we do not submit a
3085		 * single bio to populate the pages for the 2 ranges because
3086		 * this makes the compressed extent read zero out the pages
3087		 * belonging to the 2nd range. Imagine the following scenario:
3088		 *
3089		 *  File layout
3090		 *  [0 - 8K]                     [8K - 24K]
3091		 *    |                               |
3092		 *    |                               |
3093		 * points to extent X,         points to extent X,
3094		 * offset 4K, length of 8K     offset 0, length 16K
3095		 *
3096		 * [extent X, compressed length = 4K uncompressed length = 16K]
3097		 *
3098		 * If the bio to read the compressed extent covers both ranges,
3099		 * it will decompress extent X into the pages belonging to the
3100		 * first range and then it will stop, zeroing out the remaining
3101		 * pages that belong to the other range that points to extent X.
3102		 * So here we make sure we submit 2 bios, one for the first
3103		 * range and another one for the second range. Both will target
3104		 * the same physical extent from disk, but we can't currently
3105		 * make the compressed bio endio callback populate the pages
3106		 * for both ranges because each compressed bio is tightly
3107		 * coupled with a single extent map, and each range can have
3108		 * an extent map with a different offset value relative to the
3109		 * uncompressed data of our extent and different lengths. This
3110		 * is a corner case so we prioritize correctness over
3111		 * non-optimal behavior (submitting 2 bios for the same extent).
3112		 */
3113		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3114		    prev_em_start && *prev_em_start != (u64)-1 &&
3115		    *prev_em_start != em->orig_start)
3116			force_bio_submit = true;
3117
3118		if (prev_em_start)
3119			*prev_em_start = em->orig_start;
3120
3121		free_extent_map(em);
3122		em = NULL;
3123
3124		/* we've found a hole, just zero and go on */
3125		if (block_start == EXTENT_MAP_HOLE) {
3126			char *userpage;
3127			struct extent_state *cached = NULL;
3128
3129			userpage = kmap_atomic(page);
3130			memset(userpage + pg_offset, 0, iosize);
3131			flush_dcache_page(page);
3132			kunmap_atomic(userpage);
3133
3134			set_extent_uptodate(tree, cur, cur + iosize - 1,
3135					    &cached, GFP_NOFS);
3136			if (parent_locked)
3137				free_extent_state(cached);
3138			else
3139				unlock_extent_cached(tree, cur,
3140						     cur + iosize - 1,
3141						     &cached, GFP_NOFS);
3142			cur = cur + iosize;
3143			pg_offset += iosize;
3144			continue;
3145		}
3146		/* the get_extent function already copied into the page */
3147		if (test_range_bit(tree, cur, cur_end,
3148				   EXTENT_UPTODATE, 1, NULL)) {
3149			check_page_uptodate(tree, page);
3150			if (!parent_locked)
3151				unlock_extent(tree, cur, cur + iosize - 1);
3152			cur = cur + iosize;
3153			pg_offset += iosize;
3154			continue;
3155		}
3156		/* we have an inline extent but it didn't get marked up
3157		 * to date.  Error out
3158		 */
3159		if (block_start == EXTENT_MAP_INLINE) {
3160			SetPageError(page);
3161			if (!parent_locked)
3162				unlock_extent(tree, cur, cur + iosize - 1);
3163			cur = cur + iosize;
3164			pg_offset += iosize;
3165			continue;
3166		}
3167
3168		pnr -= page->index;
3169		ret = submit_extent_page(rw, tree, NULL, page,
3170					 sector, disk_io_size, pg_offset,
3171					 bdev, bio, pnr,
3172					 end_bio_extent_readpage, mirror_num,
3173					 *bio_flags,
3174					 this_bio_flag,
3175					 force_bio_submit);
3176		if (!ret) {
3177			nr++;
3178			*bio_flags = this_bio_flag;
3179		} else {
3180			SetPageError(page);
3181			if (!parent_locked)
3182				unlock_extent(tree, cur, cur + iosize - 1);
3183		}
3184		cur = cur + iosize;
3185		pg_offset += iosize;
3186	}
3187out:
3188	if (!nr) {
3189		if (!PageError(page))
3190			SetPageUptodate(page);
3191		unlock_page(page);
3192	}
3193	return 0;
3194}
3195
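/*
 * lock a contiguous range of pages in the io tree, waiting out any ordered
 * extents that cover it, then read each page in turn.
 */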
3196static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
3197					     struct page *pages[], int nr_pages,
3198					     u64 start, u64 end,
3199					     get_extent_t *get_extent,
3200					     struct extent_map **em_cached,
3201					     struct bio **bio, int mirror_num,
3202					     unsigned long *bio_flags, int rw,
3203					     u64 *prev_em_start)
3204{
3205	struct inode *inode;
3206	struct btrfs_ordered_extent *ordered;
3207	int index;
3208
3209	inode = pages[0]->mapping->host;
3210	while (1) {
3211		lock_extent(tree, start, end);
3212		ordered = btrfs_lookup_ordered_range(inode, start,
3213						     end - start + 1);
3214		if (!ordered)
3215			break;
3216		unlock_extent(tree, start, end);
3217		btrfs_start_ordered_extent(inode, ordered, 1);
3218		btrfs_put_ordered_extent(ordered);
3219	}
3220
3221	for (index = 0; index < nr_pages; index++) {
3222		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
3223			      mirror_num, bio_flags, rw, prev_em_start);
3224		page_cache_release(pages[index]);
3225	}
3226}
3227
3228static void __extent_readpages(struct extent_io_tree *tree,
3229			       struct page *pages[],
3230			       int nr_pages, get_extent_t *get_extent,
3231			       struct extent_map **em_cached,
3232			       struct bio **bio, int mirror_num,
3233			       unsigned long *bio_flags, int rw,
3234			       u64 *prev_em_start)
3235{
3236	u64 start = 0;
3237	u64 end = 0;
3238	u64 page_start;
3239	int index;
3240	int first_index = 0;
3241
3242	for (index = 0; index < nr_pages; index++) {
3243		page_start = page_offset(pages[index]);
3244		if (!end) {
3245			start = page_start;
3246			end = start + PAGE_CACHE_SIZE - 1;
3247			first_index = index;
3248		} else if (end + 1 == page_start) {
3249			end += PAGE_CACHE_SIZE;
3250		} else {
3251			__do_contiguous_readpages(tree, &pages[first_index],
3252						  index - first_index, start,
3253						  end, get_extent, em_cached,
3254						  bio, mirror_num, bio_flags,
3255						  rw, prev_em_start);
3256			start = page_start;
3257			end = start + PAGE_CACHE_SIZE - 1;
3258			first_index = index;
3259		}
3260	}
3261
3262	if (end)
3263		__do_contiguous_readpages(tree, &pages[first_index],
3264					  index - first_index, start,
3265					  end, get_extent, em_cached, bio,
3266					  mirror_num, bio_flags, rw,
3267					  prev_em_start);
3268}
3269
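/*
 * lock the page's range in the io tree, waiting out any ordered extent
 * that covers it, then read the page with __do_readpage().
 */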
3270static int __extent_read_full_page(struct extent_io_tree *tree,
3271				   struct page *page,
3272				   get_extent_t *get_extent,
3273				   struct bio **bio, int mirror_num,
3274				   unsigned long *bio_flags, int rw)
3275{
3276	struct inode *inode = page->mapping->host;
3277	struct btrfs_ordered_extent *ordered;
3278	u64 start = page_offset(page);
3279	u64 end = start + PAGE_CACHE_SIZE - 1;
3280	int ret;
3281
3282	while (1) {
3283		lock_extent(tree, start, end);
3284		ordered = btrfs_lookup_ordered_extent(inode, start);
3285		if (!ordered)
3286			break;
3287		unlock_extent(tree, start, end);
3288		btrfs_start_ordered_extent(inode, ordered, 1);
3289		btrfs_put_ordered_extent(ordered);
3290	}
3291
3292	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3293			    bio_flags, rw, NULL);
3294	return ret;
3295}
3296
3297int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3298			    get_extent_t *get_extent, int mirror_num)
3299{
3300	struct bio *bio = NULL;
3301	unsigned long bio_flags = 0;
3302	int ret;
3303
3304	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3305				      &bio_flags, READ);
3306	if (bio)
3307		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3308	return ret;
3309}
3310
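/*
 * same as extent_read_full_page(), but the caller already holds the extent
 * range locked, so only the read is issued and the range stays locked.
 */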
3311int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
3312				 get_extent_t *get_extent, int mirror_num)
3313{
3314	struct bio *bio = NULL;
3315	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
3316	int ret;
3317
3318	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
3319			    &bio_flags, READ, NULL);
3320	if (bio)
3321		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3322	return ret;
3323}
3324
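/*
 * account pages we just handed off against wbc->nr_to_write and, for
 * cyclic or whole-file writeback, advance the mapping's writeback_index.
 */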
3325static noinline void update_nr_written(struct page *page,
3326				      struct writeback_control *wbc,
3327				      unsigned long nr_written)
3328{
3329	wbc->nr_to_write -= nr_written;
3330	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
3331	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
3332		page->mapping->writeback_index = page->index + nr_written;
3333}
3334
3335/*
3336 * helper for __extent_writepage, doing all of the delayed allocation setup.
3337 *
3338 * This returns 1 if our fill_delalloc function did all the work required
3339 * to write the page (copy into inline extent).  In this case the IO has
3340 * been started and the page is already unlocked.
3341 *
3342 * This returns 0 if all went well (page still locked)
3343 * This returns < 0 if there were errors (page still locked)
3344 */
3345static noinline_for_stack int writepage_delalloc(struct inode *inode,
3346			      struct page *page, struct writeback_control *wbc,
3347			      struct extent_page_data *epd,
3348			      u64 delalloc_start,
3349			      unsigned long *nr_written)
3350{
3351	struct extent_io_tree *tree = epd->tree;
3352	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
3353	u64 nr_delalloc;
3354	u64 delalloc_to_write = 0;
3355	u64 delalloc_end = 0;
3356	int ret;
3357	int page_started = 0;
3358
3359	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3360		return 0;
3361
3362	while (delalloc_end < page_end) {
3363		nr_delalloc = find_lock_delalloc_range(inode, tree,
3364					       page,
3365					       &delalloc_start,
3366					       &delalloc_end,
3367					       BTRFS_MAX_EXTENT_SIZE);
3368		if (nr_delalloc == 0) {
3369			delalloc_start = delalloc_end + 1;
3370			continue;
3371		}
3372		ret = tree->ops->fill_delalloc(inode, page,
3373					       delalloc_start,
3374					       delalloc_end,
3375					       &page_started,
3376					       nr_written);
3377		/* fill_delalloc failed, e.g. the file system has been set read-only */
3378		if (ret) {
3379			SetPageError(page);
3380			/* fill_delalloc should return < 0 for error
3381			 * but just in case, we use > 0 here meaning the
3382			 * IO is started, so we don't want to return > 0
3383			 * unless things are going well.
3384			 */
3385			ret = ret < 0 ? ret : -EIO;
3386			goto done;
3387		}
3388		/*
3389		 * delalloc_end is already one less than the total
3390		 * length, so we don't subtract one from
3391		 * PAGE_CACHE_SIZE
3392		 */
3393		delalloc_to_write += (delalloc_end - delalloc_start +
3394				      PAGE_CACHE_SIZE) >>
3395				      PAGE_CACHE_SHIFT;
3396		delalloc_start = delalloc_end + 1;
3397	}
3398	if (wbc->nr_to_write < delalloc_to_write) {
3399		int thresh = 8192;
3400
3401		if (delalloc_to_write < thresh * 2)
3402			thresh = delalloc_to_write;
3403		wbc->nr_to_write = min_t(u64, delalloc_to_write,
3404					 thresh);
3405	}
3406
3407	/* did the fill delalloc function already unlock and start
3408	 * the IO?
3409	 */
3410	if (page_started) {
3411		/*
3412		 * we've unlocked the page, so we can't update
3413		 * the mapping's writeback index, just update
3414		 * nr_to_write.
3415		 */
3416		wbc->nr_to_write -= *nr_written;
3417		return 1;
3418	}
3419
3420	ret = 0;
3421
3422done:
3423	return ret;
3424}
3425
3426/*
3427 * helper for __extent_writepage.  This calls the writepage start hooks,
3428 * and does the loop to map the page into extents and bios.
3429 *
3430 * We return 1 if the IO is started and the page is unlocked,
3431 * 0 if all went well (page still locked)
3432 * < 0 if there were errors (page still locked)
3433 */
3434static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3435				 struct page *page,
3436				 struct writeback_control *wbc,
3437				 struct extent_page_data *epd,
3438				 loff_t i_size,
3439				 unsigned long nr_written,
3440				 int write_flags, int *nr_ret)
3441{
3442	struct extent_io_tree *tree = epd->tree;
3443	u64 start = page_offset(page);
3444	u64 page_end = start + PAGE_CACHE_SIZE - 1;
3445	u64 end;
3446	u64 cur = start;
3447	u64 extent_offset;
3448	u64 block_start;
3449	u64 iosize;
3450	sector_t sector;
3451	struct extent_state *cached_state = NULL;
3452	struct extent_map *em;
3453	struct block_device *bdev;
3454	size_t pg_offset = 0;
3455	size_t blocksize;
3456	int ret = 0;
3457	int nr = 0;
3458	bool compressed;
3459
3460	if (tree->ops && tree->ops->writepage_start_hook) {
3461		ret = tree->ops->writepage_start_hook(page, start,
3462						      page_end);
3463		if (ret) {
3464			/* Fixup worker will requeue */
3465			if (ret == -EBUSY)
3466				wbc->pages_skipped++;
3467			else
3468				redirty_page_for_writepage(wbc, page);
3469
3470			update_nr_written(page, wbc, nr_written);
3471			unlock_page(page);
3472			ret = 1;
3473			goto done_unlocked;
3474		}
3475	}
3476
3477	/*
3478	 * we don't want to touch the inode after unlocking the page,
3479	 * so we update the mapping writeback index now
3480	 */
3481	update_nr_written(page, wbc, nr_written + 1);
3482
3483	end = page_end;
3484	if (i_size <= start) {
3485		if (tree->ops && tree->ops->writepage_end_io_hook)
3486			tree->ops->writepage_end_io_hook(page, start,
3487							 page_end, NULL, 1);
3488		goto done;
3489	}
3490
3491	blocksize = inode->i_sb->s_blocksize;
3492
3493	while (cur <= end) {
3494		u64 em_end;
3495		if (cur >= i_size) {
3496			if (tree->ops && tree->ops->writepage_end_io_hook)
3497				tree->ops->writepage_end_io_hook(page, cur,
3498							 page_end, NULL, 1);
3499			break;
3500		}
3501		em = epd->get_extent(inode, page, pg_offset, cur,
3502				     end - cur + 1, 1);
3503		if (IS_ERR_OR_NULL(em)) {
3504			SetPageError(page);
3505			ret = PTR_ERR_OR_ZERO(em);
3506			break;
3507		}
3508
3509		extent_offset = cur - em->start;
3510		em_end = extent_map_end(em);
3511		BUG_ON(em_end <= cur);
3512		BUG_ON(end < cur);
3513		iosize = min(em_end - cur, end - cur + 1);
3514		iosize = ALIGN(iosize, blocksize);
3515		sector = (em->block_start + extent_offset) >> 9;
3516		bdev = em->bdev;
3517		block_start = em->block_start;
3518		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3519		free_extent_map(em);
3520		em = NULL;
3521
3522		/*
3523		 * compressed and inline extents are written through other
3524		 * paths in the FS
3525		 */
3526		if (compressed || block_start == EXTENT_MAP_HOLE ||
3527		    block_start == EXTENT_MAP_INLINE) {
3528			/*
3529			 * end_io notification does not happen here for
3530			 * compressed extents
3531			 */
3532			if (!compressed && tree->ops &&
3533			    tree->ops->writepage_end_io_hook)
3534				tree->ops->writepage_end_io_hook(page, cur,
3535							 cur + iosize - 1,
3536							 NULL, 1);
3537			else if (compressed) {
3538				/* we don't want to end_page_writeback on
3539				 * a compressed extent.  this happens
3540				 * elsewhere
3541				 */
3542				nr++;
3543			}
3544
3545			cur += iosize;
3546			pg_offset += iosize;
3547			continue;
3548		}
3549
3550		if (tree->ops && tree->ops->writepage_io_hook) {
3551			ret = tree->ops->writepage_io_hook(page, cur,
3552						cur + iosize - 1);
3553		} else {
3554			ret = 0;
3555		}
3556		if (ret) {
3557			SetPageError(page);
3558		} else {
3559			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
3560
3561			set_range_writeback(tree, cur, cur + iosize - 1);
3562			if (!PageWriteback(page)) {
3563				btrfs_err(BTRFS_I(inode)->root->fs_info,
3564					   "page %lu not writeback, cur %llu end %llu",
3565				       page->index, cur, end);
3566			}
3567
3568			ret = submit_extent_page(write_flags, tree, wbc, page,
3569						 sector, iosize, pg_offset,
3570						 bdev, &epd->bio, max_nr,
3571						 end_bio_extent_writepage,
3572						 0, 0, 0, false);
3573			if (ret)
3574				SetPageError(page);
3575		}
3576		cur = cur + iosize;
3577		pg_offset += iosize;
3578		nr++;
3579	}
3580done:
3581	*nr_ret = nr;
3582
3583done_unlocked:
3584
3585	/* drop our reference on any cached states */
3586	free_extent_state(cached_state);
3587	return ret;
3588}
3589
3590/*
3591 * the writepage semantics are similar to regular writepage.  extent
3592 * records are inserted to lock ranges in the tree, and as dirty areas
3593 * are found, they are marked writeback.  Then the lock bits are removed
3594 * and the end_io handler clears the writeback ranges
3595 */
3596static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3597			      void *data)
3598{
3599	struct inode *inode = page->mapping->host;
3600	struct extent_page_data *epd = data;
3601	u64 start = page_offset(page);
3602	u64 page_end = start + PAGE_CACHE_SIZE - 1;
3603	int ret;
3604	int nr = 0;
3605	size_t pg_offset = 0;
3606	loff_t i_size = i_size_read(inode);
3607	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
3608	int write_flags;
3609	unsigned long nr_written = 0;
3610
3611	if (wbc->sync_mode == WB_SYNC_ALL)
3612		write_flags = WRITE_SYNC;
3613	else
3614		write_flags = WRITE;
3615
3616	trace___extent_writepage(page, inode, wbc);
3617
3618	WARN_ON(!PageLocked(page));
3619
3620	ClearPageError(page);
3621
3622	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3623	if (page->index > end_index ||
3624	   (page->index == end_index && !pg_offset)) {
3625		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3626		unlock_page(page);
3627		return 0;
3628	}
3629
3630	if (page->index == end_index) {
3631		char *userpage;
3632
3633		userpage = kmap_atomic(page);
3634		memset(userpage + pg_offset, 0,
3635		       PAGE_CACHE_SIZE - pg_offset);
3636		kunmap_atomic(userpage);
3637		flush_dcache_page(page);
3638	}
3639
3640	pg_offset = 0;
3641
3642	set_page_extent_mapped(page);
3643
3644	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3645	if (ret == 1)
3646		goto done_unlocked;
3647	if (ret)
3648		goto done;
3649
3650	ret = __extent_writepage_io(inode, page, wbc, epd,
3651				    i_size, nr_written, write_flags, &nr);
3652	if (ret == 1)
3653		goto done_unlocked;
3654
3655done:
3656	if (nr == 0) {
3657		/* make sure the mapping tag for page dirty gets cleared */
3658		set_page_writeback(page);
3659		end_page_writeback(page);
3660	}
3661	if (PageError(page)) {
3662		ret = ret < 0 ? ret : -EIO;
3663		end_extent_writepage(page, ret, start, page_end);
3664	}
3665	unlock_page(page);
3666	return ret;
3667
3668done_unlocked:
3669	return 0;
3670}
3671
3672void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3673{
3674	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3675		       TASK_UNINTERRUPTIBLE);
3676}
3677
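/*
 * take the tree write lock on an extent buffer, wait for any writeback
 * already in flight (flushing our pending bio when needed) and move the
 * buffer from dirty to writeback.  Returns 1 with all pages locked if the
 * buffer must be written out, 0 if there is nothing for us to write.
 */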
3678static noinline_for_stack int
3679lock_extent_buffer_for_io(struct extent_buffer *eb,
3680			  struct btrfs_fs_info *fs_info,
3681			  struct extent_page_data *epd)
3682{
3683	unsigned long i, num_pages;
3684	int flush = 0;
3685	int ret = 0;
3686
3687	if (!btrfs_try_tree_write_lock(eb)) {
3688		flush = 1;
3689		flush_write_bio(epd);
3690		btrfs_tree_lock(eb);
3691	}
3692
3693	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3694		btrfs_tree_unlock(eb);
3695		if (!epd->sync_io)
3696			return 0;
3697		if (!flush) {
3698			flush_write_bio(epd);
3699			flush = 1;
3700		}
3701		while (1) {
3702			wait_on_extent_buffer_writeback(eb);
3703			btrfs_tree_lock(eb);
3704			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3705				break;
3706			btrfs_tree_unlock(eb);
3707		}
3708	}
3709
3710	/*
3711	 * We need to do this to prevent races with anyone checking whether the eb
3712	 * is under IO, since we can end up having no IO bits set for a short period
3713	 * of time.
3714	 */
3715	spin_lock(&eb->refs_lock);
3716	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3717		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3718		spin_unlock(&eb->refs_lock);
3719		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3720		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
3721				     -eb->len,
3722				     fs_info->dirty_metadata_batch);
3723		ret = 1;
3724	} else {
3725		spin_unlock(&eb->refs_lock);
3726	}
3727
3728	btrfs_tree_unlock(eb);
3729
3730	if (!ret)
3731		return ret;
3732
3733	num_pages = num_extent_pages(eb->start, eb->len);
3734	for (i = 0; i < num_pages; i++) {
3735		struct page *p = eb->pages[i];
3736
3737		if (!trylock_page(p)) {
3738			if (!flush) {
3739				flush_write_bio(epd);
3740				flush = 1;
3741			}
3742			lock_page(p);
3743		}
3744	}
3745
3746	return ret;
3747}
3748
3749static void end_extent_buffer_writeback(struct extent_buffer *eb)
3750{
3751	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3752	smp_mb__after_atomic();
3753	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3754}
3755
3756static void set_btree_ioerr(struct page *page)
3757{
3758	struct extent_buffer *eb = (struct extent_buffer *)page->private;
3759	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
3760
3761	SetPageError(page);
3762	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3763		return;
3764
3765	/*
3766	 * If writeback for a btree extent that doesn't belong to a log tree
3767	 * failed, increment the counter transaction->eb_write_errors.
3768	 * We do this because while the transaction is running and before it's
3769	 * committing (when we call filemap_fdata[write|wait]_range against
3770	 * the btree inode), we might have
3771	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3772	 * returns an error or an error happens during writeback, when we're
3773	 * committing the transaction we wouldn't know about it, since the pages
3774	 * may no longer be dirty nor marked for writeback (if a
3775	 * subsequent modification to the extent buffer didn't happen before the
3776	 * transaction commit), which makes filemap_fdata[write|wait]_range not
3777	 * able to find the pages tagged with SetPageError at transaction
3778	 * commit time. So if this happens we must abort the transaction,
3779	 * otherwise we commit a super block with btree roots that point to
3780	 * btree nodes/leafs whose content on disk is invalid - either garbage
3781	 * or the content of some node/leaf from a past generation that got
3782	 * cowed or deleted and is no longer valid.
3783	 *
3784	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3785	 * not be enough - we need to distinguish between log tree extents vs
3786	 * non-log tree extents, and the next filemap_fdatawait_range() call
3787	 * will catch and clear such errors in the mapping - and that call might
3788	 * be from a log sync and not from a transaction commit. Also, checking
3789	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3790	 * not done and would not be reliable - the eb might have been released
3791	 * from memory and reading it back again means that flag would not be
3792	 * set (since it's a runtime flag, not persisted on disk).
3793	 *
3794	 * Using the flags below in the btree inode also covers the case that
3795	 * AS_EIO/AS_ENOSPC was meant to cover: writepages() returns success
3796	 * and starts writeback for all dirty pages, but that writeback then
3797	 * finishes with errors before filemap_fdatawait_range() is called.
3798	 * Because we were not using AS_EIO/AS_ENOSPC,
3799	 * filemap_fdatawait_range() would return success in that case, as it
3800	 * could not know that writeback errors happened (the pages were no
3801	 * longer tagged for writeback).
3802	 */
3803	switch (eb->log_index) {
3804	case -1:
3805		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
3806		break;
3807	case 0:
3808		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
3809		break;
3810	case 1:
3811		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
3812		break;
3813	default:
3814		BUG(); /* unexpected, logic error */
3815	}
3816}
3817
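/*
 * Bio completion callback for extent buffer writeback.  Each page in the
 * bio belongs to an extent buffer: drop that buffer's io_pages count,
 * record any error via set_btree_ioerr(), end writeback on the page and,
 * once the last page of a buffer completes, clear the buffer's writeback
 * bit.
 */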
3818static void end_bio_extent_buffer_writepage(struct bio *bio)
3819{
3820	struct bio_vec *bvec;
3821	struct extent_buffer *eb;
3822	int i, done;
3823
3824	bio_for_each_segment_all(bvec, bio, i) {
3825		struct page *page = bvec->bv_page;
3826
3827		eb = (struct extent_buffer *)page->private;
3828		BUG_ON(!eb);
3829		done = atomic_dec_and_test(&eb->io_pages);
3830
3831		if (bio->bi_error ||
3832		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3833			ClearPageUptodate(page);
3834			set_btree_ioerr(page);
3835		}
3836
3837		end_page_writeback(page);
3838
3839		if (!done)
3840			continue;
3841
3842		end_extent_buffer_writeback(eb);
3843	}
3844
3845	bio_put(bio);
3846}
3847
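/*
 * Submit all pages of a single extent buffer for writeback.  The pages are
 * already locked and the buffer is marked EXTENT_BUFFER_WRITEBACK by
 * lock_extent_buffer_for_io().  If submitting a page fails, the remaining
 * io_pages references are dropped and the pages that were not submitted
 * are cleaned up and unlocked before returning -EIO.
 */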
3848static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3849			struct btrfs_fs_info *fs_info,
3850			struct writeback_control *wbc,
3851			struct extent_page_data *epd)
3852{
3853	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3854	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3855	u64 offset = eb->start;
3856	unsigned long i, num_pages;
3857	unsigned long bio_flags = 0;
3858	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
3859	int ret = 0;
3860
3861	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3862	num_pages = num_extent_pages(eb->start, eb->len);
3863	atomic_set(&eb->io_pages, num_pages);
3864	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3865		bio_flags = EXTENT_BIO_TREE_LOG;
3866
3867	for (i = 0; i < num_pages; i++) {
3868		struct page *p = eb->pages[i];
3869
3870		clear_page_dirty_for_io(p);
3871		set_page_writeback(p);
3872		ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
3873					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3874					 -1, end_bio_extent_buffer_writepage,
3875					 0, epd->bio_flags, bio_flags, false);
3876		epd->bio_flags = bio_flags;
3877		if (ret) {
3878			set_btree_ioerr(p);
3879			end_page_writeback(p);
3880			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3881				end_extent_buffer_writeback(eb);
3882			ret = -EIO;
3883			break;
3884		}
3885		offset += PAGE_CACHE_SIZE;
3886		update_nr_written(p, wbc, 1);
3887		unlock_page(p);
3888	}
3889
3890	if (unlikely(ret)) {
3891		for (; i < num_pages; i++) {
3892			struct page *p = eb->pages[i];
3893			clear_page_dirty_for_io(p);
3894			unlock_page(p);
3895		}
3896	}
3897
3898	return ret;
3899}
3900
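/*
 * Write back dirty extent buffers (btree metadata) for the btree inode.
 * Dirty pages are found in batches via the pagecache tags; each page's
 * private pointer is the owning extent buffer, which gets locked with
 * lock_extent_buffer_for_io() and submitted with write_one_eb().  The
 * prev_eb check makes sure a buffer spanning several pages is only
 * written once.
 */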
3901int btree_write_cache_pages(struct address_space *mapping,
3902				   struct writeback_control *wbc)
3903{
3904	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3905	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3906	struct extent_buffer *eb, *prev_eb = NULL;
3907	struct extent_page_data epd = {
3908		.bio = NULL,
3909		.tree = tree,
3910		.extent_locked = 0,
3911		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3912		.bio_flags = 0,
3913	};
3914	int ret = 0;
3915	int done = 0;
3916	int nr_to_write_done = 0;
3917	struct pagevec pvec;
3918	int nr_pages;
3919	pgoff_t index;
3920	pgoff_t end;		/* Inclusive */
3921	int scanned = 0;
3922	int tag;
3923
3924	pagevec_init(&pvec, 0);
3925	if (wbc->range_cyclic) {
3926		index = mapping->writeback_index; /* Start from prev offset */
3927		end = -1;
3928	} else {
3929		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3930		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3931		scanned = 1;
3932	}
3933	if (wbc->sync_mode == WB_SYNC_ALL)
3934		tag = PAGECACHE_TAG_TOWRITE;
3935	else
3936		tag = PAGECACHE_TAG_DIRTY;
3937retry:
3938	if (wbc->sync_mode == WB_SYNC_ALL)
3939		tag_pages_for_writeback(mapping, index, end);
3940	while (!done && !nr_to_write_done && (index <= end) &&
3941	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3942			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3943		unsigned i;
3944
3945		scanned = 1;
3946		for (i = 0; i < nr_pages; i++) {
3947			struct page *page = pvec.pages[i];
3948
3949			if (!PagePrivate(page))
3950				continue;
3951
3952			if (!wbc->range_cyclic && page->index > end) {
3953				done = 1;
3954				break;
3955			}
3956
3957			spin_lock(&mapping->private_lock);
3958			if (!PagePrivate(page)) {
3959				spin_unlock(&mapping->private_lock);
3960				continue;
3961			}
3962
3963			eb = (struct extent_buffer *)page->private;
3964
3965			/*
3966			 * Shouldn't happen and normally this would be a BUG_ON
3967			 * but no sense in crashing the user's box for something
3968			 * we can survive anyway.
3969			 */
3970			if (WARN_ON(!eb)) {
3971				spin_unlock(&mapping->private_lock);
3972				continue;
3973			}
3974
3975			if (eb == prev_eb) {
3976				spin_unlock(&mapping->private_lock);
3977				continue;
3978			}
3979
3980			ret = atomic_inc_not_zero(&eb->refs);
3981			spin_unlock(&mapping->private_lock);
3982			if (!ret)
3983				continue;
3984
3985			prev_eb = eb;
3986			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3987			if (!ret) {
3988				free_extent_buffer(eb);
3989				continue;
3990			}
3991
3992			ret = write_one_eb(eb, fs_info, wbc, &epd);
3993			if (ret) {
3994				done = 1;
3995				free_extent_buffer(eb);
3996				break;
3997			}
3998			free_extent_buffer(eb);
3999
4000			/*
4001			 * the filesystem may choose to bump up nr_to_write.
4002			 * We have to make sure to honor the new nr_to_write
4003			 * at any time
4004			 */
4005			nr_to_write_done = wbc->nr_to_write <= 0;
4006		}
4007		pagevec_release(&pvec);
4008		cond_resched();
4009	}
4010	if (!scanned && !done) {
4011		/*
4012		 * We hit the last page and there is more work to be done: wrap
4013		 * back to the start of the file
4014		 */
4015		scanned = 1;
4016		index = 0;
4017		goto retry;
4018	}
4019	flush_write_bio(&epd);
4020	return ret;
4021}
4022
4023/**
4024 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
4025 * @mapping: address space structure to write
4026 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4027 * @writepage: function called for each page
4028 * @data: data passed to writepage function
4029 *
4030 * If a page is already under I/O, write_cache_pages() skips it, even
4031 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4032 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4033 * and msync() need to guarantee that all the data which was dirty at the time
4034 * the call was made get new I/O started against them.  If wbc->sync_mode is
4035 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4036 * existing IO to complete.
4037 */
4038static int extent_write_cache_pages(struct extent_io_tree *tree,
4039			     struct address_space *mapping,
4040			     struct writeback_control *wbc,
4041			     writepage_t writepage, void *data,
4042			     void (*flush_fn)(void *))
4043{
4044	struct inode *inode = mapping->host;
4045	int ret = 0;
4046	int done = 0;
4047	int err = 0;
4048	int nr_to_write_done = 0;
4049	struct pagevec pvec;
4050	int nr_pages;
4051	pgoff_t index;
4052	pgoff_t end;		/* Inclusive */
4053	int scanned = 0;
4054	int tag;
4055
4056	/*
4057	 * We have to hold onto the inode so that ordered extents can do their
4058	 * work when the IO finishes.  The alternative to this is failing to add
4059	 * an ordered extent if the igrab() fails there and that is a huge pain
4060	 * to deal with, so instead just hold onto the inode throughout the
4061	 * writepages operation.  If it fails here we are freeing up the inode
4062	 * anyway and we'd rather not waste our time writing out stuff that is
4063	 * going to be truncated anyway.
4064	 */
4065	if (!igrab(inode))
4066		return 0;
4067
4068	pagevec_init(&pvec, 0);
4069	if (wbc->range_cyclic) {
4070		index = mapping->writeback_index; /* Start from prev offset */
4071		end = -1;
4072	} else {
4073		index = wbc->range_start >> PAGE_CACHE_SHIFT;
4074		end = wbc->range_end >> PAGE_CACHE_SHIFT;
4075		scanned = 1;
4076	}
4077	if (wbc->sync_mode == WB_SYNC_ALL)
4078		tag = PAGECACHE_TAG_TOWRITE;
4079	else
4080		tag = PAGECACHE_TAG_DIRTY;
4081retry:
4082	if (wbc->sync_mode == WB_SYNC_ALL)
4083		tag_pages_for_writeback(mapping, index, end);
4084	while (!done && !nr_to_write_done && (index <= end) &&
4085	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
4086			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
4087		unsigned i;
4088
4089		scanned = 1;
4090		for (i = 0; i < nr_pages; i++) {
4091			struct page *page = pvec.pages[i];
4092
4093			/*
4094			 * At this point we hold neither mapping->tree_lock nor
4095			 * lock on the page itself: the page may be truncated or
4096			 * invalidated (changing page->mapping to NULL), or even
4097			 * swizzled back from swapper_space to tmpfs file
4098			 * mapping
4099			 */
4100			if (!trylock_page(page)) {
4101				flush_fn(data);
4102				lock_page(page);
4103			}
4104
4105			if (unlikely(page->mapping != mapping)) {
4106				unlock_page(page);
4107				continue;
4108			}
4109
4110			if (!wbc->range_cyclic && page->index > end) {
4111				done = 1;
4112				unlock_page(page);
4113				continue;
4114			}
4115
4116			if (wbc->sync_mode != WB_SYNC_NONE) {
4117				if (PageWriteback(page))
4118					flush_fn(data);
4119				wait_on_page_writeback(page);
4120			}
4121
4122			if (PageWriteback(page) ||
4123			    !clear_page_dirty_for_io(page)) {
4124				unlock_page(page);
4125				continue;
4126			}
4127
4128			ret = (*writepage)(page, wbc, data);
4129
4130			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
4131				unlock_page(page);
4132				ret = 0;
4133			}
4134			if (!err && ret < 0)
4135				err = ret;
4136
4137			/*
4138			 * the filesystem may choose to bump up nr_to_write.
4139			 * We have to make sure to honor the new nr_to_write
4140			 * at any time
4141			 */
4142			nr_to_write_done = wbc->nr_to_write <= 0;
4143		}
4144		pagevec_release(&pvec);
4145		cond_resched();
4146	}
4147	if (!scanned && !done && !err) {
4148		/*
4149		 * We hit the last page and there is more work to be done: wrap
4150		 * back to the start of the file
4151		 */
4152		scanned = 1;
4153		index = 0;
4154		goto retry;
4155	}
4156	btrfs_add_delayed_iput(inode);
4157	return err;
4158}
4159
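/*
 * Submit any bio still pending in the extent_page_data, using WRITE_SYNC
 * when the caller asked for synchronous writeback (epd->sync_io).
 */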
4160static void flush_epd_write_bio(struct extent_page_data *epd)
4161{
4162	if (epd->bio) {
4163		int rw = WRITE;
4164		int ret;
4165
4166		if (epd->sync_io)
4167			rw = WRITE_SYNC;
4168
4169		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
4170		BUG_ON(ret < 0); /* -ENOMEM */
4171		epd->bio = NULL;
4172	}
4173}
4174
4175static noinline void flush_write_bio(void *data)
4176{
4177	struct extent_page_data *epd = data;
4178	flush_epd_write_bio(epd);
4179}
4180
4181int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
4182			  get_extent_t *get_extent,
4183			  struct writeback_control *wbc)
4184{
4185	int ret;
4186	struct extent_page_data epd = {
4187		.bio = NULL,
4188		.tree = tree,
4189		.get_extent = get_extent,
4190		.extent_locked = 0,
4191		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
4192		.bio_flags = 0,
4193	};
4194
4195	ret = __extent_writepage(page, wbc, &epd);
4196
4197	flush_epd_write_bio(&epd);
4198	return ret;
4199}
4200
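/*
 * Write out the pages covering the byte range [start, end].  The caller
 * has already locked the range (epd.extent_locked is set); pages that are
 * no longer dirty are completed through the writepage_end_io_hook as if
 * they had been written.
 */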
4201int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
4202			      u64 start, u64 end, get_extent_t *get_extent,
4203			      int mode)
4204{
4205	int ret = 0;
4206	struct address_space *mapping = inode->i_mapping;
4207	struct page *page;
4208	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
4209		PAGE_CACHE_SHIFT;
4210
4211	struct extent_page_data epd = {
4212		.bio = NULL,
4213		.tree = tree,
4214		.get_extent = get_extent,
4215		.extent_locked = 1,
4216		.sync_io = mode == WB_SYNC_ALL,
4217		.bio_flags = 0,
4218	};
4219	struct writeback_control wbc_writepages = {
4220		.sync_mode	= mode,
4221		.nr_to_write	= nr_pages * 2,
4222		.range_start	= start,
4223		.range_end	= end + 1,
4224	};
4225
4226	while (start <= end) {
4227		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
4228		if (clear_page_dirty_for_io(page))
4229			ret = __extent_writepage(page, &wbc_writepages, &epd);
4230		else {
4231			if (tree->ops && tree->ops->writepage_end_io_hook)
4232				tree->ops->writepage_end_io_hook(page, start,
4233						 start + PAGE_CACHE_SIZE - 1,
4234						 NULL, 1);
4235			unlock_page(page);
4236		}
4237		page_cache_release(page);
4238		start += PAGE_CACHE_SIZE;
4239	}
4240
4241	flush_epd_write_bio(&epd);
4242	return ret;
4243}
4244
4245int extent_writepages(struct extent_io_tree *tree,
4246		      struct address_space *mapping,
4247		      get_extent_t *get_extent,
4248		      struct writeback_control *wbc)
4249{
4250	int ret = 0;
4251	struct extent_page_data epd = {
4252		.bio = NULL,
4253		.tree = tree,
4254		.get_extent = get_extent,
4255		.extent_locked = 0,
4256		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
4257		.bio_flags = 0,
4258	};
4259
4260	ret = extent_write_cache_pages(tree, mapping, wbc,
4261				       __extent_writepage, &epd,
4262				       flush_write_bio);
4263	flush_epd_write_bio(&epd);
4264	return ret;
4265}
4266
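/*
 * Readpages entry point: add the readahead pages to the page cache and
 * read them in batches of up to 16, reusing a cached extent map and a
 * single bio across the batches where possible.
 */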
4267int extent_readpages(struct extent_io_tree *tree,
4268		     struct address_space *mapping,
4269		     struct list_head *pages, unsigned nr_pages,
4270		     get_extent_t get_extent)
4271{
4272	struct bio *bio = NULL;
4273	unsigned page_idx;
4274	unsigned long bio_flags = 0;
4275	struct page *pagepool[16];
4276	struct page *page;
4277	struct extent_map *em_cached = NULL;
4278	int nr = 0;
4279	u64 prev_em_start = (u64)-1;
4280
4281	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
4282		page = list_entry(pages->prev, struct page, lru);
4283
4284		prefetchw(&page->flags);
4285		list_del(&page->lru);
4286		if (add_to_page_cache_lru(page, mapping,
4287					page->index, GFP_NOFS)) {
4288			page_cache_release(page);
4289			continue;
4290		}
4291
4292		pagepool[nr++] = page;
4293		if (nr < ARRAY_SIZE(pagepool))
4294			continue;
4295		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
4296				   &bio, 0, &bio_flags, READ, &prev_em_start);
4297		nr = 0;
4298	}
4299	if (nr)
4300		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
4301				   &bio, 0, &bio_flags, READ, &prev_em_start);
4302
4303	if (em_cached)
4304		free_extent_map(em_cached);
4305
4306	BUG_ON(!list_empty(pages));
4307	if (bio)
4308		return submit_one_bio(READ, bio, 0, bio_flags);
4309	return 0;
4310}
4311
4312/*
4313 * basic invalidatepage code, this waits on any locked or writeback
4314 * ranges corresponding to the page, and then deletes any extent state
4315 * records from the tree
4316 */
4317int extent_invalidatepage(struct extent_io_tree *tree,
4318			  struct page *page, unsigned long offset)
4319{
4320	struct extent_state *cached_state = NULL;
4321	u64 start = page_offset(page);
4322	u64 end = start + PAGE_CACHE_SIZE - 1;
4323	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4324
4325	start += ALIGN(offset, blocksize);
4326	if (start > end)
4327		return 0;
4328
4329	lock_extent_bits(tree, start, end, 0, &cached_state);
4330	wait_on_page_writeback(page);
4331	clear_extent_bit(tree, start, end,
4332			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4333			 EXTENT_DO_ACCOUNTING,
4334			 1, 1, &cached_state, GFP_NOFS);
4335	return 0;
4336}
4337
4338/*
4339 * a helper for releasepage, this tests for areas of the page that
4340 * are locked or under IO and drops the related state bits if it is safe
4341 * to drop the page.
4342 */
4343static int try_release_extent_state(struct extent_map_tree *map,
4344				    struct extent_io_tree *tree,
4345				    struct page *page, gfp_t mask)
4346{
4347	u64 start = page_offset(page);
4348	u64 end = start + PAGE_CACHE_SIZE - 1;
4349	int ret = 1;
4350
4351	if (test_range_bit(tree, start, end,
4352			   EXTENT_IOBITS, 0, NULL))
4353		ret = 0;
4354	else {
4355		if ((mask & GFP_NOFS) == GFP_NOFS)
4356			mask = GFP_NOFS;
4357		/*
4358		 * at this point we can safely clear everything except the
4359		 * locked bit and the nodatasum bit
4360		 */
4361		ret = clear_extent_bit(tree, start, end,
4362				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4363				 0, 0, NULL, mask);
4364
4365		/* if clear_extent_bit failed for ENOMEM reasons,
4366		 * we can't allow the release to continue.
4367		 */
4368		if (ret < 0)
4369			ret = 0;
4370		else
4371			ret = 1;
4372	}
4373	return ret;
4374}
4375
4376/*
4377 * a helper for releasepage.  As long as there are no locked extents
4378 * in the range corresponding to the page, both state records and extent
4379 * map records are removed
4380 */
4381int try_release_extent_mapping(struct extent_map_tree *map,
4382			       struct extent_io_tree *tree, struct page *page,
4383			       gfp_t mask)
4384{
4385	struct extent_map *em;
4386	u64 start = page_offset(page);
4387	u64 end = start + PAGE_CACHE_SIZE - 1;
4388
4389	if (gfpflags_allow_blocking(mask) &&
4390	    page->mapping->host->i_size > 16 * 1024 * 1024) {
4391		u64 len;
4392		while (start <= end) {
4393			len = end - start + 1;
4394			write_lock(&map->lock);
4395			em = lookup_extent_mapping(map, start, len);
4396			if (!em) {
4397				write_unlock(&map->lock);
4398				break;
4399			}
4400			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4401			    em->start != start) {
4402				write_unlock(&map->lock);
4403				free_extent_map(em);
4404				break;
4405			}
4406			if (!test_range_bit(tree, em->start,
4407					    extent_map_end(em) - 1,
4408					    EXTENT_LOCKED | EXTENT_WRITEBACK,
4409					    0, NULL)) {
4410				remove_extent_mapping(map, em);
4411				/* once for the rb tree */
4412				free_extent_map(em);
4413			}
4414			start = extent_map_end(em);
4415			write_unlock(&map->lock);
4416
4417			/* once for us */
4418			free_extent_map(em);
4419		}
4420	}
4421	return try_release_extent_state(map, tree, page, mask);
4422}
4423
4424/*
4425 * helper function for fiemap, which doesn't want to see any holes.
4426 * This maps until we find something past 'last'
4427 */
4428static struct extent_map *get_extent_skip_holes(struct inode *inode,
4429						u64 offset,
4430						u64 last,
4431						get_extent_t *get_extent)
4432{
4433	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
4434	struct extent_map *em;
4435	u64 len;
4436
4437	if (offset >= last)
4438		return NULL;
4439
4440	while (1) {
4441		len = last - offset;
4442		if (len == 0)
4443			break;
4444		len = ALIGN(len, sectorsize);
4445		em = get_extent(inode, NULL, 0, offset, len, 0);
4446		if (IS_ERR_OR_NULL(em))
4447			return em;
4448
4449		/* if this isn't a hole return it */
4450		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
4451		    em->block_start != EXTENT_MAP_HOLE) {
4452			return em;
4453		}
4454
4455		/* this is a hole, advance to the next extent */
4456		offset = extent_map_end(em);
4457		free_extent_map(em);
4458		if (offset >= last)
4459			break;
4460	}
4461	return NULL;
4462}
4463
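/*
 * fiemap implementation: walk the extent maps for the requested range via
 * get_extent_skip_holes() and report each mapped extent to the caller,
 * translating btrfs extent state (inline, delalloc, compressed, prealloc,
 * shared) into FIEMAP_EXTENT_* flags.
 */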
4464int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4465		__u64 start, __u64 len, get_extent_t *get_extent)
4466{
4467	int ret = 0;
4468	u64 off = start;
4469	u64 max = start + len;
4470	u32 flags = 0;
4471	u32 found_type;
4472	u64 last;
4473	u64 last_for_get_extent = 0;
4474	u64 disko = 0;
4475	u64 isize = i_size_read(inode);
4476	struct btrfs_key found_key;
4477	struct extent_map *em = NULL;
4478	struct extent_state *cached_state = NULL;
4479	struct btrfs_path *path;
4480	struct btrfs_root *root = BTRFS_I(inode)->root;
4481	int end = 0;
4482	u64 em_start = 0;
4483	u64 em_len = 0;
4484	u64 em_end = 0;
4485
4486	if (len == 0)
4487		return -EINVAL;
4488
4489	path = btrfs_alloc_path();
4490	if (!path)
4491		return -ENOMEM;
4492	path->leave_spinning = 1;
4493
4494	start = round_down(start, BTRFS_I(inode)->root->sectorsize);
4495	len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
4496
4497	/*
4498	 * lookup the last file extent.  We're not using i_size here
4499	 * because there might be preallocation past i_size
4500	 */
4501	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4502				       0);
4503	if (ret < 0) {
4504		btrfs_free_path(path);
4505		return ret;
4506	}
4507	WARN_ON(!ret);
4508	path->slots[0]--;
4509	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4510	found_type = found_key.type;
4511
4512	/* No extents, but there might be delalloc bits */
4513	if (found_key.objectid != btrfs_ino(inode) ||
4514	    found_type != BTRFS_EXTENT_DATA_KEY) {
4515		/* have to trust i_size as the end */
4516		last = (u64)-1;
4517		last_for_get_extent = isize;
4518	} else {
4519		/*
4520		 * remember the start of the last extent.  There are a
4521		 * bunch of different factors that go into the length of the
4522		 * extent, so it's much less complex to remember where it started
4523		 */
4524		last = found_key.offset;
4525		last_for_get_extent = last + 1;
4526	}
4527	btrfs_release_path(path);
4528
4529	/*
4530	 * we might have some extents allocated but more delalloc past those
4531	 * extents.  So, we trust isize unless the start of the last extent is
4532	 * beyond isize
4533	 */
4534	if (last < isize) {
4535		last = (u64)-1;
4536		last_for_get_extent = isize;
4537	}
4538
4539	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
4540			 &cached_state);
4541
4542	em = get_extent_skip_holes(inode, start, last_for_get_extent,
4543				   get_extent);
4544	if (!em)
4545		goto out;
4546	if (IS_ERR(em)) {
4547		ret = PTR_ERR(em);
4548		goto out;
4549	}
4550
4551	while (!end) {
4552		u64 offset_in_extent = 0;
4553
4554		/* break if the extent we found is outside the range */
4555		if (em->start >= max || extent_map_end(em) < off)
4556			break;
4557
4558		/*
4559		 * get_extent may return an extent that starts before our
4560		 * requested range.  We have to make sure the ranges
4561		 * we return to fiemap always move forward and don't
4562		 * overlap, so adjust the offsets here
4563		 */
4564		em_start = max(em->start, off);
4565
4566		/*
4567		 * record the offset from the start of the extent
4568		 * for adjusting the disk offset below.  Only do this if the
4569		 * extent isn't compressed since our in ram offset may be past
4570		 * what we have actually allocated on disk.
4571		 */
4572		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4573			offset_in_extent = em_start - em->start;
4574		em_end = extent_map_end(em);
4575		em_len = em_end - em_start;
4576		disko = 0;
4577		flags = 0;
4578
4579		/*
4580		 * bump off for our next call to get_extent
4581		 */
4582		off = extent_map_end(em);
4583		if (off >= max)
4584			end = 1;
4585
4586		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4587			end = 1;
4588			flags |= FIEMAP_EXTENT_LAST;
4589		} else if (em->block_start == EXTENT_MAP_INLINE) {
4590			flags |= (FIEMAP_EXTENT_DATA_INLINE |
4591				  FIEMAP_EXTENT_NOT_ALIGNED);
4592		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
4593			flags |= (FIEMAP_EXTENT_DELALLOC |
4594				  FIEMAP_EXTENT_UNKNOWN);
4595		} else if (fieinfo->fi_extents_max) {
4596			u64 bytenr = em->block_start -
4597				(em->start - em->orig_start);
4598
4599			disko = em->block_start + offset_in_extent;
4600
4601			/*
4602			 * As btrfs supports shared space, this information
4603			 * can be exported to userspace tools via
4604			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
4605			 * then we're just getting a count and we can skip the
4606			 * lookup stuff.
4607			 */
4608			ret = btrfs_check_shared(NULL, root->fs_info,
4609						 root->objectid,
4610						 btrfs_ino(inode), bytenr);
4611			if (ret < 0)
4612				goto out_free;
4613			if (ret)
4614				flags |= FIEMAP_EXTENT_SHARED;
4615			ret = 0;
4616		}
4617		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4618			flags |= FIEMAP_EXTENT_ENCODED;
4619		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4620			flags |= FIEMAP_EXTENT_UNWRITTEN;
4621
4622		free_extent_map(em);
4623		em = NULL;
4624		if ((em_start >= last) || em_len == (u64)-1 ||
4625		   (last == (u64)-1 && isize <= em_end)) {
4626			flags |= FIEMAP_EXTENT_LAST;
4627			end = 1;
4628		}
4629
4630		/* now scan forward to see if this is really the last extent. */
4631		em = get_extent_skip_holes(inode, off, last_for_get_extent,
4632					   get_extent);
4633		if (IS_ERR(em)) {
4634			ret = PTR_ERR(em);
4635			goto out;
4636		}
4637		if (!em) {
4638			flags |= FIEMAP_EXTENT_LAST;
4639			end = 1;
4640		}
4641		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4642					      em_len, flags);
4643		if (ret) {
4644			if (ret == 1)
4645				ret = 0;
4646			goto out_free;
4647		}
4648	}
4649out_free:
4650	free_extent_map(em);
4651out:
4652	btrfs_free_path(path);
4653	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4654			     &cached_state, GFP_NOFS);
4655	return ret;
4656}
4657
4658static void __free_extent_buffer(struct extent_buffer *eb)
4659{
4660	btrfs_leak_debug_del(&eb->leak_list);
4661	kmem_cache_free(extent_buffer_cache, eb);
4662}
4663
4664int extent_buffer_under_io(struct extent_buffer *eb)
4665{
4666	return (atomic_read(&eb->io_pages) ||
4667		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4668		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4669}
4670
4671/*
4672 * Helper for releasing extent buffer page.
4673 */
4674static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
4675{
4676	unsigned long index;
4677	struct page *page;
4678	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4679
4680	BUG_ON(extent_buffer_under_io(eb));
4681
4682	index = num_extent_pages(eb->start, eb->len);
4683	if (index == 0)
4684		return;
4685
4686	do {
4687		index--;
4688		page = eb->pages[index];
4689		if (!page)
4690			continue;
4691		if (mapped)
4692			spin_lock(&page->mapping->private_lock);
4693		/*
4694		 * We do this since we'll remove the pages after we've
4695		 * removed the eb from the radix tree, so we could race
4696		 * and have this page now attached to the new eb.  So
4697		 * only clear page_private if it's still connected to
4698		 * this eb.
4699		 */
4700		if (PagePrivate(page) &&
4701		    page->private == (unsigned long)eb) {
4702			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4703			BUG_ON(PageDirty(page));
4704			BUG_ON(PageWriteback(page));
4705			/*
4706			 * We need to make sure we haven't been attached
4707			 * to a new eb.
4708			 */
4709			ClearPagePrivate(page);
4710			set_page_private(page, 0);
4711			/* One for the page private */
4712			page_cache_release(page);
4713		}
4714
4715		if (mapped)
4716			spin_unlock(&page->mapping->private_lock);
4717
4718		/* One for when we allocated the page */
4719		page_cache_release(page);
4720	} while (index != 0);
4721}
4722
4723/*
4724 * Helper for releasing the extent buffer.
4725 */
4726static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4727{
4728	btrfs_release_extent_buffer_page(eb);
4729	__free_extent_buffer(eb);
4730}
4731
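/*
 * Allocate and initialize a bare struct extent_buffer with no pages
 * attached.  The allocation itself uses __GFP_NOFAIL and will not fail.
 */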
4732static struct extent_buffer *
4733__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4734		      unsigned long len)
4735{
4736	struct extent_buffer *eb = NULL;
4737
4738	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
4739	eb->start = start;
4740	eb->len = len;
4741	eb->fs_info = fs_info;
4742	eb->bflags = 0;
4743	rwlock_init(&eb->lock);
4744	atomic_set(&eb->write_locks, 0);
4745	atomic_set(&eb->read_locks, 0);
4746	atomic_set(&eb->blocking_readers, 0);
4747	atomic_set(&eb->blocking_writers, 0);
4748	atomic_set(&eb->spinning_readers, 0);
4749	atomic_set(&eb->spinning_writers, 0);
4750	eb->lock_nested = 0;
4751	init_waitqueue_head(&eb->write_lock_wq);
4752	init_waitqueue_head(&eb->read_lock_wq);
4753
4754	btrfs_leak_debug_add(&eb->leak_list, &buffers);
4755
4756	spin_lock_init(&eb->refs_lock);
4757	atomic_set(&eb->refs, 1);
4758	atomic_set(&eb->io_pages, 0);
4759
4760	/*
4761	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4762	 */
4763	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4764		> MAX_INLINE_EXTENT_BUFFER_SIZE);
4765	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4766
4767	return eb;
4768}
4769
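/*
 * Make a private, page-by-page copy of an extent buffer.  The clone is
 * marked EXTENT_BUFFER_DUMMY because it is backed by freshly allocated
 * pages and is not inserted into the buffer radix tree.
 */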
4770struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4771{
4772	unsigned long i;
4773	struct page *p;
4774	struct extent_buffer *new;
4775	unsigned long num_pages = num_extent_pages(src->start, src->len);
4776
4777	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4778	if (new == NULL)
4779		return NULL;
4780
4781	for (i = 0; i < num_pages; i++) {
4782		p = alloc_page(GFP_NOFS);
4783		if (!p) {
4784			btrfs_release_extent_buffer(new);
4785			return NULL;
4786		}
4787		attach_extent_buffer_page(new, p);
4788		WARN_ON(PageDirty(p));
4789		SetPageUptodate(p);
4790		new->pages[i] = p;
4791	}
4792
4793	copy_extent_buffer(new, src, 0, 0, src->len);
4794	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4795	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4796
4797	return new;
4798}
4799
4800struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4801						u64 start)
4802{
4803	struct extent_buffer *eb;
4804	unsigned long len;
4805	unsigned long num_pages;
4806	unsigned long i;
4807
4808	if (!fs_info) {
4809		/*
4810		 * Called only from tests that don't always have a fs_info
4811		 * available, but we know that nodesize is 4096
4812		 */
4813		len = 4096;
4814	} else {
4815		len = fs_info->tree_root->nodesize;
4816	}
4817	num_pages = num_extent_pages(0, len);
4818
4819	eb = __alloc_extent_buffer(fs_info, start, len);
4820	if (!eb)
4821		return NULL;
4822
4823	for (i = 0; i < num_pages; i++) {
4824		eb->pages[i] = alloc_page(GFP_NOFS);
4825		if (!eb->pages[i])
4826			goto err;
4827	}
4828	set_extent_buffer_uptodate(eb);
4829	btrfs_set_header_nritems(eb, 0);
4830	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4831
4832	return eb;
4833err:
4834	for (; i > 0; i--)
4835		__free_page(eb->pages[i - 1]);
4836	__free_extent_buffer(eb);
4837	return NULL;
4838}
4839
4840static void check_buffer_tree_ref(struct extent_buffer *eb)
4841{
4842	int refs;
4843	/* the ref bit is tricky.  We have to make sure it is set
4844	 * if we have the buffer dirty.   Otherwise the
4845	 * code to free a buffer can end up dropping a dirty
4846	 * page
4847	 *
4848	 * Once the ref bit is set, it won't go away while the
4849	 * buffer is dirty or in writeback, and it also won't
4850	 * go away while we have the reference count on the
4851	 * eb bumped.
4852	 *
4853	 * We can't just set the ref bit without bumping the
4854	 * ref on the eb because free_extent_buffer might
4855	 * see the ref bit and try to clear it.  If this happens
4856	 * free_extent_buffer might end up dropping our original
4857	 * ref by mistake and freeing the page before we are able
4858	 * to add one more ref.
4859	 *
4860	 * So bump the ref count first, then set the bit.  If someone
4861	 * beat us to it, drop the ref we added.
4862	 */
4863	refs = atomic_read(&eb->refs);
4864	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4865		return;
4866
4867	spin_lock(&eb->refs_lock);
4868	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4869		atomic_inc(&eb->refs);
4870	spin_unlock(&eb->refs_lock);
4871}
4872
4873static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4874		struct page *accessed)
4875{
4876	unsigned long num_pages, i;
4877
4878	check_buffer_tree_ref(eb);
4879
4880	num_pages = num_extent_pages(eb->start, eb->len);
4881	for (i = 0; i < num_pages; i++) {
4882		struct page *p = eb->pages[i];
4883
4884		if (p != accessed)
4885			mark_page_accessed(p);
4886	}
4887}
4888
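/*
 * Look up an extent buffer in fs_info->buffer_radix and return it with an
 * extra reference, or NULL if it is not present or already on its way to
 * being freed.
 */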
4889struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4890					 u64 start)
4891{
4892	struct extent_buffer *eb;
4893
4894	rcu_read_lock();
4895	eb = radix_tree_lookup(&fs_info->buffer_radix,
4896			       start >> PAGE_CACHE_SHIFT);
4897	if (eb && atomic_inc_not_zero(&eb->refs)) {
4898		rcu_read_unlock();
4899		/*
4900		 * Lock our eb's refs_lock to avoid races with
4901		 * free_extent_buffer. When we get our eb it might be flagged
4902		 * with EXTENT_BUFFER_STALE and another task running
4903		 * free_extent_buffer might have seen that flag set,
4904		 * eb->refs == 2, that the buffer isn't under IO (dirty and
4905		 * writeback flags not set) and it's still in the tree (flag
4906		 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
4907		 * of decrementing the extent buffer's reference count twice.
4908		 * So here we could race and increment the eb's reference count,
4909		 * clear its stale flag, mark it as dirty and drop our reference
4910		 * before the other task finishes executing free_extent_buffer,
4911		 * which would later result in an attempt to free an extent
4912		 * buffer that is dirty.
4913		 */
4914		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
4915			spin_lock(&eb->refs_lock);
4916			spin_unlock(&eb->refs_lock);
4917		}
4918		mark_extent_buffer_accessed(eb, NULL);
4919		return eb;
4920	}
4921	rcu_read_unlock();
4922
4923	return NULL;
4924}
4925
4926#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4927struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4928					       u64 start)
4929{
4930	struct extent_buffer *eb, *exists = NULL;
4931	int ret;
4932
4933	eb = find_extent_buffer(fs_info, start);
4934	if (eb)
4935		return eb;
4936	eb = alloc_dummy_extent_buffer(fs_info, start);
4937	if (!eb)
4938		return NULL;
4939	eb->fs_info = fs_info;
4940again:
4941	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4942	if (ret)
4943		goto free_eb;
4944	spin_lock(&fs_info->buffer_lock);
4945	ret = radix_tree_insert(&fs_info->buffer_radix,
4946				start >> PAGE_CACHE_SHIFT, eb);
4947	spin_unlock(&fs_info->buffer_lock);
4948	radix_tree_preload_end();
4949	if (ret == -EEXIST) {
4950		exists = find_extent_buffer(fs_info, start);
4951		if (exists)
4952			goto free_eb;
4953		else
4954			goto again;
4955	}
4956	check_buffer_tree_ref(eb);
4957	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4958
4959	/*
4960	 * We will free dummy extent buffers if they come into
4961	 * free_extent_buffer with a ref count of 2, but if we are using this we
4962	 * want the buffers to stay in memory until we're done with them, so
4963	 * bump the ref count again.
4964	 */
4965	atomic_inc(&eb->refs);
4966	return eb;
4967free_eb:
4968	btrfs_release_extent_buffer(eb);
4969	return exists;
4970}
4971#endif
4972
4973struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4974					  u64 start)
4975{
4976	unsigned long len = fs_info->tree_root->nodesize;
4977	unsigned long num_pages = num_extent_pages(start, len);
4978	unsigned long i;
4979	unsigned long index = start >> PAGE_CACHE_SHIFT;
4980	struct extent_buffer *eb;
4981	struct extent_buffer *exists = NULL;
4982	struct page *p;
4983	struct address_space *mapping = fs_info->btree_inode->i_mapping;
4984	int uptodate = 1;
4985	int ret;
4986
4987	eb = find_extent_buffer(fs_info, start);
4988	if (eb)
4989		return eb;
4990
4991	eb = __alloc_extent_buffer(fs_info, start, len);
4992	if (!eb)
4993		return NULL;
4994
4995	for (i = 0; i < num_pages; i++, index++) {
4996		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
4997		if (!p)
4998			goto free_eb;
4999
5000		spin_lock(&mapping->private_lock);
5001		if (PagePrivate(p)) {
5002			/*
5003			 * We could have already allocated an eb for this page
5004			 * and attached one so let's see if we can get a ref on
5005			 * the existing eb, and if we can we know it's good and
5006			 * we can just return that one, else we know we can just
5007			 * overwrite page->private.
5008			 */
5009			exists = (struct extent_buffer *)p->private;
5010			if (atomic_inc_not_zero(&exists->refs)) {
5011				spin_unlock(&mapping->private_lock);
5012				unlock_page(p);
5013				page_cache_release(p);
5014				mark_extent_buffer_accessed(exists, p);
5015				goto free_eb;
5016			}
5017			exists = NULL;
5018
5019			/*
5020			 * Do this so attach doesn't complain and we need to
5021			 * drop the ref the old guy had.
5022			 */
5023			ClearPagePrivate(p);
5024			WARN_ON(PageDirty(p));
5025			page_cache_release(p);
5026		}
5027		attach_extent_buffer_page(eb, p);
5028		spin_unlock(&mapping->private_lock);
5029		WARN_ON(PageDirty(p));
5030		eb->pages[i] = p;
5031		if (!PageUptodate(p))
5032			uptodate = 0;
5033
5034		/*
5035		 * see below about how we avoid a nasty race with release page
5036		 * and why we unlock later
5037		 */
5038	}
5039	if (uptodate)
5040		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5041again:
5042	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
5043	if (ret)
5044		goto free_eb;
5045
5046	spin_lock(&fs_info->buffer_lock);
5047	ret = radix_tree_insert(&fs_info->buffer_radix,
5048				start >> PAGE_CACHE_SHIFT, eb);
5049	spin_unlock(&fs_info->buffer_lock);
5050	radix_tree_preload_end();
5051	if (ret == -EEXIST) {
5052		exists = find_extent_buffer(fs_info, start);
5053		if (exists)
5054			goto free_eb;
5055		else
5056			goto again;
5057	}
5058	/* add one reference for the tree */
5059	check_buffer_tree_ref(eb);
5060	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5061
5062	/*
5063	 * there is a race where release page may have
5064	 * tried to find this extent buffer in the radix
5065	 * but failed.  It will tell the VM it is safe to
5066	 * reclaim the page, and it will clear the page private bit.
5067	 * We must make sure to set the page private bit properly
5068	 * after the extent buffer is in the radix tree so
5069	 * it doesn't get lost
5070	 */
5071	SetPageChecked(eb->pages[0]);
5072	for (i = 1; i < num_pages; i++) {
5073		p = eb->pages[i];
5074		ClearPageChecked(p);
5075		unlock_page(p);
5076	}
5077	unlock_page(eb->pages[0]);
5078	return eb;
5079
5080free_eb:
5081	WARN_ON(!atomic_dec_and_test(&eb->refs));
5082	for (i = 0; i < num_pages; i++) {
5083		if (eb->pages[i])
5084			unlock_page(eb->pages[i]);
5085	}
5086
5087	btrfs_release_extent_buffer(eb);
5088	return exists;
5089}
5090
5091static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5092{
5093	struct extent_buffer *eb =
5094			container_of(head, struct extent_buffer, rcu_head);
5095
5096	__free_extent_buffer(eb);
5097}
5098
5099/* Expects to have eb->refs_lock already held */
5100static int release_extent_buffer(struct extent_buffer *eb)
5101{
5102	WARN_ON(atomic_read(&eb->refs) == 0);
5103	if (atomic_dec_and_test(&eb->refs)) {
5104		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5105			struct btrfs_fs_info *fs_info = eb->fs_info;
5106
5107			spin_unlock(&eb->refs_lock);
5108
5109			spin_lock(&fs_info->buffer_lock);
5110			radix_tree_delete(&fs_info->buffer_radix,
5111					  eb->start >> PAGE_CACHE_SHIFT);
5112			spin_unlock(&fs_info->buffer_lock);
5113		} else {
5114			spin_unlock(&eb->refs_lock);
5115		}
5116
5117		/* Should be safe to release our pages at this point */
5118		btrfs_release_extent_buffer_page(eb);
5119#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5120		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
5121			__free_extent_buffer(eb);
5122			return 1;
5123		}
5124#endif
5125		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5126		return 1;
5127	}
5128	spin_unlock(&eb->refs_lock);
5129
5130	return 0;
5131}
5132
5133void free_extent_buffer(struct extent_buffer *eb)
5134{
5135	int refs;
5136	int old;
5137	if (!eb)
5138		return;
5139
5140	while (1) {
5141		refs = atomic_read(&eb->refs);
5142		if (refs <= 3)
5143			break;
5144		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5145		if (old == refs)
5146			return;
5147	}
5148
5149	spin_lock(&eb->refs_lock);
5150	if (atomic_read(&eb->refs) == 2 &&
5151	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
5152		atomic_dec(&eb->refs);
5153
5154	if (atomic_read(&eb->refs) == 2 &&
5155	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5156	    !extent_buffer_under_io(eb) &&
5157	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5158		atomic_dec(&eb->refs);
5159
5160	/*
5161	 * I know this is terrible, but it's temporary until we stop tracking
5162	 * the uptodate bits and such for the extent buffers.
5163	 */
5164	release_extent_buffer(eb);
5165}
5166
5167void free_extent_buffer_stale(struct extent_buffer *eb)
5168{
5169	if (!eb)
5170		return;
5171
5172	spin_lock(&eb->refs_lock);
5173	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5174
5175	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5176	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5177		atomic_dec(&eb->refs);
5178	release_extent_buffer(eb);
5179}
5180
5181void clear_extent_buffer_dirty(struct extent_buffer *eb)
5182{
5183	unsigned long i;
5184	unsigned long num_pages;
5185	struct page *page;
5186
5187	num_pages = num_extent_pages(eb->start, eb->len);
5188
5189	for (i = 0; i < num_pages; i++) {
5190		page = eb->pages[i];
5191		if (!PageDirty(page))
5192			continue;
5193
5194		lock_page(page);
5195		WARN_ON(!PagePrivate(page));
5196
5197		clear_page_dirty_for_io(page);
5198		spin_lock_irq(&page->mapping->tree_lock);
5199		if (!PageDirty(page)) {
5200			radix_tree_tag_clear(&page->mapping->page_tree,
5201						page_index(page),
5202						PAGECACHE_TAG_DIRTY);
5203		}
5204		spin_unlock_irq(&page->mapping->tree_lock);
5205		ClearPageError(page);
5206		unlock_page(page);
5207	}
5208	WARN_ON(atomic_read(&eb->refs) == 0);
5209}
5210
5211int set_extent_buffer_dirty(struct extent_buffer *eb)
5212{
5213	unsigned long i;
5214	unsigned long num_pages;
5215	int was_dirty = 0;
5216
5217	check_buffer_tree_ref(eb);
5218
5219	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5220
5221	num_pages = num_extent_pages(eb->start, eb->len);
5222	WARN_ON(atomic_read(&eb->refs) == 0);
5223	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5224
5225	for (i = 0; i < num_pages; i++)
5226		set_page_dirty(eb->pages[i]);
5227	return was_dirty;
5228}
5229
5230int clear_extent_buffer_uptodate(struct extent_buffer *eb)
5231{
5232	unsigned long i;
5233	struct page *page;
5234	unsigned long num_pages;
5235
5236	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5237	num_pages = num_extent_pages(eb->start, eb->len);
5238	for (i = 0; i < num_pages; i++) {
5239		page = eb->pages[i];
5240		if (page)
5241			ClearPageUptodate(page);
5242	}
5243	return 0;
5244}
5245
5246int set_extent_buffer_uptodate(struct extent_buffer *eb)
5247{
5248	unsigned long i;
5249	struct page *page;
5250	unsigned long num_pages;
5251
5252	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5253	num_pages = num_extent_pages(eb->start, eb->len);
5254	for (i = 0; i < num_pages; i++) {
5255		page = eb->pages[i];
5256		SetPageUptodate(page);
5257	}
5258	return 0;
5259}
5260
5261int extent_buffer_uptodate(struct extent_buffer *eb)
5262{
5263	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5264}
5265
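/*
 * Read the pages backing an extent buffer, starting at the page containing
 * @start.  With WAIT_NONE the pages are only read if they can be locked
 * without blocking; with WAIT_COMPLETE we also wait for the reads to
 * finish and return -EIO if any page ended up not uptodate.
 */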
5266int read_extent_buffer_pages(struct extent_io_tree *tree,
5267			     struct extent_buffer *eb, u64 start, int wait,
5268			     get_extent_t *get_extent, int mirror_num)
5269{
5270	unsigned long i;
5271	unsigned long start_i;
5272	struct page *page;
5273	int err;
5274	int ret = 0;
5275	int locked_pages = 0;
5276	int all_uptodate = 1;
5277	unsigned long num_pages;
5278	unsigned long num_reads = 0;
5279	struct bio *bio = NULL;
5280	unsigned long bio_flags = 0;
5281
5282	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5283		return 0;
5284
5285	if (start) {
5286		WARN_ON(start < eb->start);
5287		start_i = (start >> PAGE_CACHE_SHIFT) -
5288			(eb->start >> PAGE_CACHE_SHIFT);
5289	} else {
5290		start_i = 0;
5291	}
5292
5293	num_pages = num_extent_pages(eb->start, eb->len);
5294	for (i = start_i; i < num_pages; i++) {
5295		page = eb->pages[i];
5296		if (wait == WAIT_NONE) {
5297			if (!trylock_page(page))
5298				goto unlock_exit;
5299		} else {
5300			lock_page(page);
5301		}
5302		locked_pages++;
5303		if (!PageUptodate(page)) {
5304			num_reads++;
5305			all_uptodate = 0;
5306		}
5307	}
5308	if (all_uptodate) {
5309		if (start_i == 0)
5310			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5311		goto unlock_exit;
5312	}
5313
5314	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5315	eb->read_mirror = 0;
5316	atomic_set(&eb->io_pages, num_reads);
5317	for (i = start_i; i < num_pages; i++) {
5318		page = eb->pages[i];
5319		if (!PageUptodate(page)) {
5320			ClearPageError(page);
5321			err = __extent_read_full_page(tree, page,
5322						      get_extent, &bio,
5323						      mirror_num, &bio_flags,
5324						      READ | REQ_META);
5325			if (err)
5326				ret = err;
5327		} else {
5328			unlock_page(page);
5329		}
5330	}
5331
5332	if (bio) {
5333		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
5334				     bio_flags);
5335		if (err)
5336			return err;
5337	}
5338
5339	if (ret || wait != WAIT_COMPLETE)
5340		return ret;
5341
5342	for (i = start_i; i < num_pages; i++) {
5343		page = eb->pages[i];
5344		wait_on_page_locked(page);
5345		if (!PageUptodate(page))
5346			ret = -EIO;
5347	}
5348
5349	return ret;
5350
5351unlock_exit:
5352	i = start_i;
5353	while (locked_pages > 0) {
5354		page = eb->pages[i];
5355		i++;
5356		unlock_page(page);
5357		locked_pages--;
5358	}
5359	return ret;
5360}
5361
5362void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5363			unsigned long start,
5364			unsigned long len)
5365{
5366	size_t cur;
5367	size_t offset;
5368	struct page *page;
5369	char *kaddr;
5370	char *dst = (char *)dstv;
5371	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5372	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5373
5374	WARN_ON(start > eb->len);
5375	WARN_ON(start + len > eb->start + eb->len);
5376
5377	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5378
5379	while (len > 0) {
5380		page = eb->pages[i];
5381
5382		cur = min(len, (PAGE_CACHE_SIZE - offset));
5383		kaddr = page_address(page);
5384		memcpy(dst, kaddr + offset, cur);
5385
5386		dst += cur;
5387		len -= cur;
5388		offset = 0;
5389		i++;
5390	}
5391}
5392
5393int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5394			unsigned long start,
5395			unsigned long len)
5396{
5397	size_t cur;
5398	size_t offset;
5399	struct page *page;
5400	char *kaddr;
5401	char __user *dst = (char __user *)dstv;
5402	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5403	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5404	int ret = 0;
5405
5406	WARN_ON(start > eb->len);
5407	WARN_ON(start + len > eb->start + eb->len);
5408
5409	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5410
5411	while (len > 0) {
5412		page = eb->pages[i];
5413
5414		cur = min(len, (PAGE_CACHE_SIZE - offset));
5415		kaddr = page_address(page);
5416		if (copy_to_user(dst, kaddr + offset, cur)) {
5417			ret = -EFAULT;
5418			break;
5419		}
5420
5421		dst += cur;
5422		len -= cur;
5423		offset = 0;
5424		i++;
5425	}
5426
5427	return ret;
5428}
5429
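/*
 * Map a contiguous area of an extent buffer directly, without copying.
 * The requested [start, start + min_len) must fit within a single page;
 * callers needing more than *map_len bytes have to call this again with a
 * new start.  Returns -EINVAL if the area crosses a page boundary or runs
 * past the end of the buffer.
 */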
5430int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5431			       unsigned long min_len, char **map,
5432			       unsigned long *map_start,
5433			       unsigned long *map_len)
5434{
5435	size_t offset = start & (PAGE_CACHE_SIZE - 1);
5436	char *kaddr;
5437	struct page *p;
5438	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5439	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5440	unsigned long end_i = (start_offset + start + min_len - 1) >>
5441		PAGE_CACHE_SHIFT;
5442
5443	if (i != end_i)
5444		return -EINVAL;
5445
5446	if (i == 0) {
5447		offset = start_offset;
5448		*map_start = 0;
5449	} else {
5450		offset = 0;
5451		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
5452	}
5453
5454	if (start + min_len > eb->len) {
5455		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
5456		       "wanted %lu %lu\n",
5457		       eb->start, eb->len, start, min_len);
5458		return -EINVAL;
5459	}
5460
5461	p = eb->pages[i];
5462	kaddr = page_address(p);
5463	*map = kaddr + offset;
5464	*map_len = PAGE_CACHE_SIZE - offset;
5465	return 0;
5466}
5467
5468int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5469			  unsigned long start,
5470			  unsigned long len)
5471{
5472	size_t cur;
5473	size_t offset;
5474	struct page *page;
5475	char *kaddr;
5476	char *ptr = (char *)ptrv;
5477	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5478	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5479	int ret = 0;
5480
5481	WARN_ON(start > eb->len);
5482	WARN_ON(start + len > eb->start + eb->len);
5483
5484	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5485
5486	while (len > 0) {
5487		page = eb->pages[i];
5488
5489		cur = min(len, (PAGE_CACHE_SIZE - offset));
5490
5491		kaddr = page_address(page);
5492		ret = memcmp(ptr, kaddr + offset, cur);
5493		if (ret)
5494			break;
5495
5496		ptr += cur;
5497		len -= cur;
5498		offset = 0;
5499		i++;
5500	}
5501	return ret;
5502}
5503
5504void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5505			 unsigned long start, unsigned long len)
5506{
5507	size_t cur;
5508	size_t offset;
5509	struct page *page;
5510	char *kaddr;
5511	char *src = (char *)srcv;
5512	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5513	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5514
5515	WARN_ON(start > eb->len);
5516	WARN_ON(start + len > eb->start + eb->len);
5517
5518	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5519
5520	while (len > 0) {
5521		page = eb->pages[i];
5522		WARN_ON(!PageUptodate(page));
5523
5524		cur = min(len, PAGE_CACHE_SIZE - offset);
5525		kaddr = page_address(page);
5526		memcpy(kaddr + offset, src, cur);
5527
5528		src += cur;
5529		len -= cur;
5530		offset = 0;
5531		i++;
5532	}
5533}
5534
5535void memset_extent_buffer(struct extent_buffer *eb, char c,
5536			  unsigned long start, unsigned long len)
5537{
5538	size_t cur;
5539	size_t offset;
5540	struct page *page;
5541	char *kaddr;
5542	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5543	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5544
5545	WARN_ON(start > eb->len);
5546	WARN_ON(start + len > eb->start + eb->len);
5547
5548	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5549
5550	while (len > 0) {
5551		page = eb->pages[i];
5552		WARN_ON(!PageUptodate(page));
5553
5554		cur = min(len, PAGE_CACHE_SIZE - offset);
5555		kaddr = page_address(page);
5556		memset(kaddr + offset, c, cur);
5557
5558		len -= cur;
5559		offset = 0;
5560		i++;
5561	}
5562}
5563
5564void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5565			unsigned long dst_offset, unsigned long src_offset,
5566			unsigned long len)
5567{
5568	u64 dst_len = dst->len;
5569	size_t cur;
5570	size_t offset;
5571	struct page *page;
5572	char *kaddr;
5573	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5574	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
5575
5576	WARN_ON(src->len != dst_len);
5577
5578	offset = (start_offset + dst_offset) &
5579		(PAGE_CACHE_SIZE - 1);
5580
5581	while (len > 0) {
5582		page = dst->pages[i];
5583		WARN_ON(!PageUptodate(page));
5584
5585		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
5586
5587		kaddr = page_address(page);
5588		read_extent_buffer(src, kaddr + offset, src_offset, cur);
5589
5590		src_offset += cur;
5591		len -= cur;
5592		offset = 0;
5593		i++;
5594	}
5595}
5596
5597static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5598{
5599	unsigned long distance = (src > dst) ? src - dst : dst - src;
5600	return distance < len;
5601}
5602
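/*
 * Copy bytes between (possibly identical) extent buffer pages.  When the
 * source and destination are the same page and the areas overlap,
 * memmove() is used so the copy stays correct.
 */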
5603static void copy_pages(struct page *dst_page, struct page *src_page,
5604		       unsigned long dst_off, unsigned long src_off,
5605		       unsigned long len)
5606{
5607	char *dst_kaddr = page_address(dst_page);
5608	char *src_kaddr;
5609	int must_memmove = 0;
5610
5611	if (dst_page != src_page) {
5612		src_kaddr = page_address(src_page);
5613	} else {
5614		src_kaddr = dst_kaddr;
5615		if (areas_overlap(src_off, dst_off, len))
5616			must_memmove = 1;
5617	}
5618
5619	if (must_memmove)
5620		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5621	else
5622		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
5623}
5624
5625void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5626			   unsigned long src_offset, unsigned long len)
5627{
5628	size_t cur;
5629	size_t dst_off_in_page;
5630	size_t src_off_in_page;
5631	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5632	unsigned long dst_i;
5633	unsigned long src_i;
5634
5635	if (src_offset + len > dst->len) {
5636		btrfs_err(dst->fs_info,
5637			"memmove bogus src_offset %lu move "
5638		       "len %lu dst len %lu", src_offset, len, dst->len);
5639		BUG_ON(1);
5640	}
5641	if (dst_offset + len > dst->len) {
5642		btrfs_err(dst->fs_info,
5643			"memmove bogus dst_offset %lu move "
5644		       "len %lu dst len %lu", dst_offset, len, dst->len);
5645		BUG_ON(1);
5646	}
5647
5648	while (len > 0) {
5649		dst_off_in_page = (start_offset + dst_offset) &
5650			(PAGE_CACHE_SIZE - 1);
5651		src_off_in_page = (start_offset + src_offset) &
5652			(PAGE_CACHE_SIZE - 1);
5653
5654		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
5655		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
5656
5657		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
5658					       src_off_in_page));
5659		cur = min_t(unsigned long, cur,
5660			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5661
5662		copy_pages(dst->pages[dst_i], dst->pages[src_i],
5663			   dst_off_in_page, src_off_in_page, cur);
5664
5665		src_offset += cur;
5666		dst_offset += cur;
5667		len -= cur;
5668	}
5669}
5670
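/*
 * memmove() within a single extent buffer.  A forward move (dst before
 * src) is just memcpy_extent_buffer(); otherwise the copy is done
 * backwards, page by page, so overlapping ranges are handled correctly.
 */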
5671void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5672			   unsigned long src_offset, unsigned long len)
5673{
5674	size_t cur;
5675	size_t dst_off_in_page;
5676	size_t src_off_in_page;
5677	unsigned long dst_end = dst_offset + len - 1;
5678	unsigned long src_end = src_offset + len - 1;
5679	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5680	unsigned long dst_i;
5681	unsigned long src_i;
5682
5683	if (src_offset + len > dst->len) {
5684		btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
5685		       "len %lu len %lu", src_offset, len, dst->len);
5686		BUG_ON(1);
5687	}
5688	if (dst_offset + len > dst->len) {
5689		btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
5690		       "len %lu len %lu", dst_offset, len, dst->len);
5691		BUG_ON(1);
5692	}
5693	if (dst_offset < src_offset) {
5694		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5695		return;
5696	}
5697	while (len > 0) {
5698		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
5699		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
5700
5701		dst_off_in_page = (start_offset + dst_end) &
5702			(PAGE_CACHE_SIZE - 1);
5703		src_off_in_page = (start_offset + src_end) &
5704			(PAGE_CACHE_SIZE - 1);
5705
5706		cur = min_t(unsigned long, len, src_off_in_page + 1);
5707		cur = min(cur, dst_off_in_page + 1);
5708		copy_pages(dst->pages[dst_i], dst->pages[src_i],
5709			   dst_off_in_page - cur + 1,
5710			   src_off_in_page - cur + 1, cur);
5711
5712		dst_end -= cur;
5713		src_end -= cur;
5714		len -= cur;
5715	}
5716}
5717
5718int try_release_extent_buffer(struct page *page)
5719{
5720	struct extent_buffer *eb;
5721
5722	/*
5723	 * We need to make sure nobody is attaching this page to an eb right
5724	 * now.
5725	 */
5726	spin_lock(&page->mapping->private_lock);
5727	if (!PagePrivate(page)) {
5728		spin_unlock(&page->mapping->private_lock);
5729		return 1;
5730	}
5731
5732	eb = (struct extent_buffer *)page->private;
5733	BUG_ON(!eb);
5734
5735	/*
5736	 * This is a little awful but should be ok, we need to make sure that
5737	 * the eb doesn't disappear out from under us while we're looking at
5738	 * this page.
5739	 */
5740	spin_lock(&eb->refs_lock);
5741	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5742		spin_unlock(&eb->refs_lock);
5743		spin_unlock(&page->mapping->private_lock);
5744		return 0;
5745	}
5746	spin_unlock(&page->mapping->private_lock);
5747
5748	/*
5749	 * If tree ref isn't set then we know the ref on this eb is a real ref,
5750	 * so just return; this page will likely be freed soon anyway.
5751	 */
5752	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5753		spin_unlock(&eb->refs_lock);
5754		return 0;
5755	}
5756
5757	return release_extent_buffer(eb);
5758}
5759