1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/pagemap.h>
6#include <linux/page-flags.h>
7#include <linux/spinlock.h>
8#include <linux/blkdev.h>
9#include <linux/swap.h>
10#include <linux/writeback.h>
11#include <linux/pagevec.h>
12#include <linux/prefetch.h>
13#include <linux/cleancache.h>
14#include "extent_io.h"
15#include "extent_map.h"
16#include "ctree.h"
17#include "btrfs_inode.h"
18#include "volumes.h"
19#include "check-integrity.h"
20#include "locking.h"
21#include "rcu-string.h"
22#include "backref.h"
23
24static struct kmem_cache *extent_state_cache;
25static struct kmem_cache *extent_buffer_cache;
26static struct bio_set *btrfs_bioset;
27
28static inline bool extent_state_in_tree(const struct extent_state *state)
29{
30	return !RB_EMPTY_NODE(&state->rb_node);
31}
32
33#ifdef CONFIG_BTRFS_DEBUG
34static LIST_HEAD(buffers);
35static LIST_HEAD(states);
36
37static DEFINE_SPINLOCK(leak_lock);
38
39static inline
40void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
41{
42	unsigned long flags;
43
44	spin_lock_irqsave(&leak_lock, flags);
45	list_add(new, head);
46	spin_unlock_irqrestore(&leak_lock, flags);
47}
48
49static inline
50void btrfs_leak_debug_del(struct list_head *entry)
51{
52	unsigned long flags;
53
54	spin_lock_irqsave(&leak_lock, flags);
55	list_del(entry);
56	spin_unlock_irqrestore(&leak_lock, flags);
57}
58
59static inline
60void btrfs_leak_debug_check(void)
61{
62	struct extent_state *state;
63	struct extent_buffer *eb;
64
65	while (!list_empty(&states)) {
66		state = list_entry(states.next, struct extent_state, leak_list);
67		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
68		       state->start, state->end, state->state,
69		       extent_state_in_tree(state),
70		       atomic_read(&state->refs));
71		list_del(&state->leak_list);
72		kmem_cache_free(extent_state_cache, state);
73	}
74
75	while (!list_empty(&buffers)) {
76		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
77		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
78		       "refs %d\n",
79		       eb->start, eb->len, atomic_read(&eb->refs));
80		list_del(&eb->leak_list);
81		kmem_cache_free(extent_buffer_cache, eb);
82	}
83}
84
85#define btrfs_debug_check_extent_io_range(tree, start, end)		\
86	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
87static inline void __btrfs_debug_check_extent_io_range(const char *caller,
88		struct extent_io_tree *tree, u64 start, u64 end)
89{
90	struct inode *inode;
91	u64 isize;
92
93	if (!tree->mapping)
94		return;
95
96	inode = tree->mapping->host;
97	isize = i_size_read(inode);
98	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
99		printk_ratelimited(KERN_DEBUG
100		    "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
101				caller, btrfs_ino(inode), isize, start, end);
102	}
103}
104#else
105#define btrfs_leak_debug_add(new, head)	do {} while (0)
106#define btrfs_leak_debug_del(entry)	do {} while (0)
107#define btrfs_leak_debug_check()	do {} while (0)
108#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
109#endif
110
111#define BUFFER_LRU_MAX 64
112
113struct tree_entry {
114	u64 start;
115	u64 end;
116	struct rb_node rb_node;
117};
118
119struct extent_page_data {
120	struct bio *bio;
121	struct extent_io_tree *tree;
122	get_extent_t *get_extent;
123	unsigned long bio_flags;
124
125	/* tells writepage not to lock the state bits for this range
126	 * it still does the unlocking
127	 */
128	unsigned int extent_locked:1;
129
130	/* tells the submit_bio code to use a WRITE_SYNC */
131	unsigned int sync_io:1;
132};
133
134static noinline void flush_write_bio(void *data);
135static inline struct btrfs_fs_info *
136tree_fs_info(struct extent_io_tree *tree)
137{
138	if (!tree->mapping)
139		return NULL;
140	return btrfs_sb(tree->mapping->host->i_sb);
141}
142
143int __init extent_io_init(void)
144{
145	extent_state_cache = kmem_cache_create("btrfs_extent_state",
146			sizeof(struct extent_state), 0,
147			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
148	if (!extent_state_cache)
149		return -ENOMEM;
150
151	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
152			sizeof(struct extent_buffer), 0,
153			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
154	if (!extent_buffer_cache)
155		goto free_state_cache;
156
157	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
158				     offsetof(struct btrfs_io_bio, bio));
159	if (!btrfs_bioset)
160		goto free_buffer_cache;
161
162	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
163		goto free_bioset;
164
165	return 0;
166
167free_bioset:
168	bioset_free(btrfs_bioset);
169	btrfs_bioset = NULL;
170
171free_buffer_cache:
172	kmem_cache_destroy(extent_buffer_cache);
173	extent_buffer_cache = NULL;
174
175free_state_cache:
176	kmem_cache_destroy(extent_state_cache);
177	extent_state_cache = NULL;
178	return -ENOMEM;
179}
180
181void extent_io_exit(void)
182{
183	btrfs_leak_debug_check();
184
185	/*
186	 * Make sure all delayed rcu free are flushed before we
187	 * destroy caches.
188	 */
189	rcu_barrier();
190	if (extent_state_cache)
191		kmem_cache_destroy(extent_state_cache);
192	if (extent_buffer_cache)
193		kmem_cache_destroy(extent_buffer_cache);
194	if (btrfs_bioset)
195		bioset_free(btrfs_bioset);
196}
197
198void extent_io_tree_init(struct extent_io_tree *tree,
199			 struct address_space *mapping)
200{
201	tree->state = RB_ROOT;
202	tree->ops = NULL;
203	tree->dirty_bytes = 0;
204	spin_lock_init(&tree->lock);
205	tree->mapping = mapping;
206}
207
208static struct extent_state *alloc_extent_state(gfp_t mask)
209{
210	struct extent_state *state;
211
212	state = kmem_cache_alloc(extent_state_cache, mask);
213	if (!state)
214		return state;
215	state->state = 0;
216	state->private = 0;
217	RB_CLEAR_NODE(&state->rb_node);
218	btrfs_leak_debug_add(&state->leak_list, &states);
219	atomic_set(&state->refs, 1);
220	init_waitqueue_head(&state->wq);
221	trace_alloc_extent_state(state, mask, _RET_IP_);
222	return state;
223}
224
225void free_extent_state(struct extent_state *state)
226{
227	if (!state)
228		return;
229	if (atomic_dec_and_test(&state->refs)) {
230		WARN_ON(extent_state_in_tree(state));
231		btrfs_leak_debug_del(&state->leak_list);
232		trace_free_extent_state(state, _RET_IP_);
233		kmem_cache_free(extent_state_cache, state);
234	}
235}
236
237static struct rb_node *tree_insert(struct rb_root *root,
238				   struct rb_node *search_start,
239				   u64 offset,
240				   struct rb_node *node,
241				   struct rb_node ***p_in,
242				   struct rb_node **parent_in)
243{
244	struct rb_node **p;
245	struct rb_node *parent = NULL;
246	struct tree_entry *entry;
247
248	if (p_in && parent_in) {
249		p = *p_in;
250		parent = *parent_in;
251		goto do_insert;
252	}
253
254	p = search_start ? &search_start : &root->rb_node;
255	while (*p) {
256		parent = *p;
257		entry = rb_entry(parent, struct tree_entry, rb_node);
258
259		if (offset < entry->start)
260			p = &(*p)->rb_left;
261		else if (offset > entry->end)
262			p = &(*p)->rb_right;
263		else
264			return parent;
265	}
266
267do_insert:
268	rb_link_node(node, parent, p);
269	rb_insert_color(node, root);
270	return NULL;
271}
272
273static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
274				      struct rb_node **prev_ret,
275				      struct rb_node **next_ret,
276				      struct rb_node ***p_ret,
277				      struct rb_node **parent_ret)
278{
279	struct rb_root *root = &tree->state;
280	struct rb_node **n = &root->rb_node;
281	struct rb_node *prev = NULL;
282	struct rb_node *orig_prev = NULL;
283	struct tree_entry *entry;
284	struct tree_entry *prev_entry = NULL;
285
286	while (*n) {
287		prev = *n;
288		entry = rb_entry(prev, struct tree_entry, rb_node);
289		prev_entry = entry;
290
291		if (offset < entry->start)
292			n = &(*n)->rb_left;
293		else if (offset > entry->end)
294			n = &(*n)->rb_right;
295		else
296			return *n;
297	}
298
299	if (p_ret)
300		*p_ret = n;
301	if (parent_ret)
302		*parent_ret = prev;
303
304	if (prev_ret) {
305		orig_prev = prev;
306		while (prev && offset > prev_entry->end) {
307			prev = rb_next(prev);
308			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
309		}
310		*prev_ret = prev;
311		prev = orig_prev;
312	}
313
314	if (next_ret) {
315		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
316		while (prev && offset < prev_entry->start) {
317			prev = rb_prev(prev);
318			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
319		}
320		*next_ret = prev;
321	}
322	return NULL;
323}
324
325static inline struct rb_node *
326tree_search_for_insert(struct extent_io_tree *tree,
327		       u64 offset,
328		       struct rb_node ***p_ret,
329		       struct rb_node **parent_ret)
330{
331	struct rb_node *prev = NULL;
332	struct rb_node *ret;
333
334	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
335	if (!ret)
336		return prev;
337	return ret;
338}
339
340static inline struct rb_node *tree_search(struct extent_io_tree *tree,
341					  u64 offset)
342{
343	return tree_search_for_insert(tree, offset, NULL, NULL);
344}
345
346static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
347		     struct extent_state *other)
348{
349	if (tree->ops && tree->ops->merge_extent_hook)
350		tree->ops->merge_extent_hook(tree->mapping->host, new,
351					     other);
352}
353
354/*
355 * utility function to look for merge candidates inside a given range.
356 * Any extents with matching state are merged together into a single
357 * extent in the tree.  Extents with EXTENT_IO in their state field
358 * are not merged because the end_io handlers need to be able to do
359 * operations on them without sleeping (or doing allocations/splits).
360 *
361 * This should be called with the tree lock held.
362 */
363static void merge_state(struct extent_io_tree *tree,
364		        struct extent_state *state)
365{
366	struct extent_state *other;
367	struct rb_node *other_node;
368
369	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
370		return;
371
372	other_node = rb_prev(&state->rb_node);
373	if (other_node) {
374		other = rb_entry(other_node, struct extent_state, rb_node);
375		if (other->end == state->start - 1 &&
376		    other->state == state->state) {
377			merge_cb(tree, state, other);
378			state->start = other->start;
379			rb_erase(&other->rb_node, &tree->state);
380			RB_CLEAR_NODE(&other->rb_node);
381			free_extent_state(other);
382		}
383	}
384	other_node = rb_next(&state->rb_node);
385	if (other_node) {
386		other = rb_entry(other_node, struct extent_state, rb_node);
387		if (other->start == state->end + 1 &&
388		    other->state == state->state) {
389			merge_cb(tree, state, other);
390			state->end = other->end;
391			rb_erase(&other->rb_node, &tree->state);
392			RB_CLEAR_NODE(&other->rb_node);
393			free_extent_state(other);
394		}
395	}
396}
397
398static void set_state_cb(struct extent_io_tree *tree,
399			 struct extent_state *state, unsigned *bits)
400{
401	if (tree->ops && tree->ops->set_bit_hook)
402		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
403}
404
405static void clear_state_cb(struct extent_io_tree *tree,
406			   struct extent_state *state, unsigned *bits)
407{
408	if (tree->ops && tree->ops->clear_bit_hook)
409		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
410}
411
412static void set_state_bits(struct extent_io_tree *tree,
413			   struct extent_state *state, unsigned *bits);
414
415/*
416 * insert an extent_state struct into the tree.  'bits' are set on the
417 * struct before it is inserted.
418 *
419 * This may return -EEXIST if the extent is already there, in which case the
420 * state struct is freed.
421 *
422 * The tree lock is not taken internally.  This is a utility function and
423 * probably isn't what you want to call (see set/clear_extent_bit).
424 */
425static int insert_state(struct extent_io_tree *tree,
426			struct extent_state *state, u64 start, u64 end,
427			struct rb_node ***p,
428			struct rb_node **parent,
429			unsigned *bits)
430{
431	struct rb_node *node;
432
433	if (end < start)
434		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
435		       end, start);
436	state->start = start;
437	state->end = end;
438
439	set_state_bits(tree, state, bits);
440
441	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
442	if (node) {
443		struct extent_state *found;
444		found = rb_entry(node, struct extent_state, rb_node);
445		printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
446		       "%llu %llu\n",
447		       found->start, found->end, start, end);
448		return -EEXIST;
449	}
450	merge_state(tree, state);
451	return 0;
452}
453
454static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
455		     u64 split)
456{
457	if (tree->ops && tree->ops->split_extent_hook)
458		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
459}
460
461/*
462 * split a given extent state struct in two, inserting the preallocated
463 * struct 'prealloc' as the newly created second half.  'split' indicates an
464 * offset inside 'orig' where it should be split.
465 *
466 * Before calling,
467 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
468 * are two extent state structs in the tree:
469 * prealloc: [orig->start, split - 1]
470 * orig: [ split, orig->end ]
471 *
472 * The tree locks are not taken by this function. They need to be held
473 * by the caller.
474 */
475static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
476		       struct extent_state *prealloc, u64 split)
477{
478	struct rb_node *node;
479
480	split_cb(tree, orig, split);
481
482	prealloc->start = orig->start;
483	prealloc->end = split - 1;
484	prealloc->state = orig->state;
485	orig->start = split;
486
487	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
488			   &prealloc->rb_node, NULL, NULL);
489	if (node) {
490		free_extent_state(prealloc);
491		return -EEXIST;
492	}
493	return 0;
494}
495
496static struct extent_state *next_state(struct extent_state *state)
497{
498	struct rb_node *next = rb_next(&state->rb_node);
499	if (next)
500		return rb_entry(next, struct extent_state, rb_node);
501	else
502		return NULL;
503}
504
505/*
506 * utility function to clear some bits in an extent state struct.
507 * it will optionally wake up any one waiting on this state (wake == 1).
508 *
509 * If no bits are set on the state struct after clearing things, the
510 * struct is freed and removed from the tree
511 */
512static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
513					    struct extent_state *state,
514					    unsigned *bits, int wake)
515{
516	struct extent_state *next;
517	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
518
519	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
520		u64 range = state->end - state->start + 1;
521		WARN_ON(range > tree->dirty_bytes);
522		tree->dirty_bytes -= range;
523	}
524	clear_state_cb(tree, state, bits);
525	state->state &= ~bits_to_clear;
526	if (wake)
527		wake_up(&state->wq);
528	if (state->state == 0) {
529		next = next_state(state);
530		if (extent_state_in_tree(state)) {
531			rb_erase(&state->rb_node, &tree->state);
532			RB_CLEAR_NODE(&state->rb_node);
533			free_extent_state(state);
534		} else {
535			WARN_ON(1);
536		}
537	} else {
538		merge_state(tree, state);
539		next = next_state(state);
540	}
541	return next;
542}
543
544static struct extent_state *
545alloc_extent_state_atomic(struct extent_state *prealloc)
546{
547	if (!prealloc)
548		prealloc = alloc_extent_state(GFP_ATOMIC);
549
550	return prealloc;
551}
552
553static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
554{
555	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
556		    "Extent tree was modified by another "
557		    "thread while locked.");
558}
559
560/*
561 * clear some bits on a range in the tree.  This may require splitting
562 * or inserting elements in the tree, so the gfp mask is used to
563 * indicate which allocations or sleeping are allowed.
564 *
565 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
566 * the given range from the tree regardless of state (ie for truncate).
567 *
568 * the range [start, end] is inclusive.
569 *
570 * This takes the tree lock, and returns 0 on success and < 0 on error.
571 */
572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
573		     unsigned bits, int wake, int delete,
574		     struct extent_state **cached_state,
575		     gfp_t mask)
576{
577	struct extent_state *state;
578	struct extent_state *cached;
579	struct extent_state *prealloc = NULL;
580	struct rb_node *node;
581	u64 last_end;
582	int err;
583	int clear = 0;
584
585	btrfs_debug_check_extent_io_range(tree, start, end);
586
587	if (bits & EXTENT_DELALLOC)
588		bits |= EXTENT_NORESERVE;
589
590	if (delete)
591		bits |= ~EXTENT_CTLBITS;
592	bits |= EXTENT_FIRST_DELALLOC;
593
594	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
595		clear = 1;
596again:
597	if (!prealloc && (mask & __GFP_WAIT)) {
598		/*
599		 * Don't care for allocation failure here because we might end
600		 * up not needing the pre-allocated extent state at all, which
601		 * is the case if we only have in the tree extent states that
602		 * cover our input range and don't cover too any other range.
603		 * If we end up needing a new extent state we allocate it later.
604		 */
605		prealloc = alloc_extent_state(mask);
606	}
607
608	spin_lock(&tree->lock);
609	if (cached_state) {
610		cached = *cached_state;
611
612		if (clear) {
613			*cached_state = NULL;
614			cached_state = NULL;
615		}
616
617		if (cached && extent_state_in_tree(cached) &&
618		    cached->start <= start && cached->end > start) {
619			if (clear)
620				atomic_dec(&cached->refs);
621			state = cached;
622			goto hit_next;
623		}
624		if (clear)
625			free_extent_state(cached);
626	}
627	/*
628	 * this search will find the extents that end after
629	 * our range starts
630	 */
631	node = tree_search(tree, start);
632	if (!node)
633		goto out;
634	state = rb_entry(node, struct extent_state, rb_node);
635hit_next:
636	if (state->start > end)
637		goto out;
638	WARN_ON(state->end < start);
639	last_end = state->end;
640
641	/* the state doesn't have the wanted bits, go ahead */
642	if (!(state->state & bits)) {
643		state = next_state(state);
644		goto next;
645	}
646
647	/*
648	 *     | ---- desired range ---- |
649	 *  | state | or
650	 *  | ------------- state -------------- |
651	 *
652	 * We need to split the extent we found, and may flip
653	 * bits on second half.
654	 *
655	 * If the extent we found extends past our range, we
656	 * just split and search again.  It'll get split again
657	 * the next time though.
658	 *
659	 * If the extent we found is inside our range, we clear
660	 * the desired bit on it.
661	 */
662
663	if (state->start < start) {
664		prealloc = alloc_extent_state_atomic(prealloc);
665		BUG_ON(!prealloc);
666		err = split_state(tree, state, prealloc, start);
667		if (err)
668			extent_io_tree_panic(tree, err);
669
670		prealloc = NULL;
671		if (err)
672			goto out;
673		if (state->end <= end) {
674			state = clear_state_bit(tree, state, &bits, wake);
675			goto next;
676		}
677		goto search_again;
678	}
679	/*
680	 * | ---- desired range ---- |
681	 *                        | state |
682	 * We need to split the extent, and clear the bit
683	 * on the first half
684	 */
685	if (state->start <= end && state->end > end) {
686		prealloc = alloc_extent_state_atomic(prealloc);
687		BUG_ON(!prealloc);
688		err = split_state(tree, state, prealloc, end + 1);
689		if (err)
690			extent_io_tree_panic(tree, err);
691
692		if (wake)
693			wake_up(&state->wq);
694
695		clear_state_bit(tree, prealloc, &bits, wake);
696
697		prealloc = NULL;
698		goto out;
699	}
700
701	state = clear_state_bit(tree, state, &bits, wake);
702next:
703	if (last_end == (u64)-1)
704		goto out;
705	start = last_end + 1;
706	if (start <= end && state && !need_resched())
707		goto hit_next;
708	goto search_again;
709
710out:
711	spin_unlock(&tree->lock);
712	if (prealloc)
713		free_extent_state(prealloc);
714
715	return 0;
716
717search_again:
718	if (start > end)
719		goto out;
720	spin_unlock(&tree->lock);
721	if (mask & __GFP_WAIT)
722		cond_resched();
723	goto again;
724}
725
726static void wait_on_state(struct extent_io_tree *tree,
727			  struct extent_state *state)
728		__releases(tree->lock)
729		__acquires(tree->lock)
730{
731	DEFINE_WAIT(wait);
732	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
733	spin_unlock(&tree->lock);
734	schedule();
735	spin_lock(&tree->lock);
736	finish_wait(&state->wq, &wait);
737}
738
739/*
740 * waits for one or more bits to clear on a range in the state tree.
741 * The range [start, end] is inclusive.
742 * The tree lock is taken by this function
743 */
744static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
745			    unsigned long bits)
746{
747	struct extent_state *state;
748	struct rb_node *node;
749
750	btrfs_debug_check_extent_io_range(tree, start, end);
751
752	spin_lock(&tree->lock);
753again:
754	while (1) {
755		/*
756		 * this search will find all the extents that end after
757		 * our range starts
758		 */
759		node = tree_search(tree, start);
760process_node:
761		if (!node)
762			break;
763
764		state = rb_entry(node, struct extent_state, rb_node);
765
766		if (state->start > end)
767			goto out;
768
769		if (state->state & bits) {
770			start = state->start;
771			atomic_inc(&state->refs);
772			wait_on_state(tree, state);
773			free_extent_state(state);
774			goto again;
775		}
776		start = state->end + 1;
777
778		if (start > end)
779			break;
780
781		if (!cond_resched_lock(&tree->lock)) {
782			node = rb_next(node);
783			goto process_node;
784		}
785	}
786out:
787	spin_unlock(&tree->lock);
788}
789
790static void set_state_bits(struct extent_io_tree *tree,
791			   struct extent_state *state,
792			   unsigned *bits)
793{
794	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
795
796	set_state_cb(tree, state, bits);
797	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
798		u64 range = state->end - state->start + 1;
799		tree->dirty_bytes += range;
800	}
801	state->state |= bits_to_set;
802}
803
804static void cache_state_if_flags(struct extent_state *state,
805				 struct extent_state **cached_ptr,
806				 unsigned flags)
807{
808	if (cached_ptr && !(*cached_ptr)) {
809		if (!flags || (state->state & flags)) {
810			*cached_ptr = state;
811			atomic_inc(&state->refs);
812		}
813	}
814}
815
816static void cache_state(struct extent_state *state,
817			struct extent_state **cached_ptr)
818{
819	return cache_state_if_flags(state, cached_ptr,
820				    EXTENT_IOBITS | EXTENT_BOUNDARY);
821}
822
823/*
824 * set some bits on a range in the tree.  This may require allocations or
825 * sleeping, so the gfp mask is used to indicate what is allowed.
826 *
827 * If any of the exclusive bits are set, this will fail with -EEXIST if some
828 * part of the range already has the desired bits set.  The start of the
829 * existing range is returned in failed_start in this case.
830 *
831 * [start, end] is inclusive This takes the tree lock.
832 */
833
834static int __must_check
835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
836		 unsigned bits, unsigned exclusive_bits,
837		 u64 *failed_start, struct extent_state **cached_state,
838		 gfp_t mask)
839{
840	struct extent_state *state;
841	struct extent_state *prealloc = NULL;
842	struct rb_node *node;
843	struct rb_node **p;
844	struct rb_node *parent;
845	int err = 0;
846	u64 last_start;
847	u64 last_end;
848
849	btrfs_debug_check_extent_io_range(tree, start, end);
850
851	bits |= EXTENT_FIRST_DELALLOC;
852again:
853	if (!prealloc && (mask & __GFP_WAIT)) {
854		prealloc = alloc_extent_state(mask);
855		BUG_ON(!prealloc);
856	}
857
858	spin_lock(&tree->lock);
859	if (cached_state && *cached_state) {
860		state = *cached_state;
861		if (state->start <= start && state->end > start &&
862		    extent_state_in_tree(state)) {
863			node = &state->rb_node;
864			goto hit_next;
865		}
866	}
867	/*
868	 * this search will find all the extents that end after
869	 * our range starts.
870	 */
871	node = tree_search_for_insert(tree, start, &p, &parent);
872	if (!node) {
873		prealloc = alloc_extent_state_atomic(prealloc);
874		BUG_ON(!prealloc);
875		err = insert_state(tree, prealloc, start, end,
876				   &p, &parent, &bits);
877		if (err)
878			extent_io_tree_panic(tree, err);
879
880		cache_state(prealloc, cached_state);
881		prealloc = NULL;
882		goto out;
883	}
884	state = rb_entry(node, struct extent_state, rb_node);
885hit_next:
886	last_start = state->start;
887	last_end = state->end;
888
889	/*
890	 * | ---- desired range ---- |
891	 * | state |
892	 *
893	 * Just lock what we found and keep going
894	 */
895	if (state->start == start && state->end <= end) {
896		if (state->state & exclusive_bits) {
897			*failed_start = state->start;
898			err = -EEXIST;
899			goto out;
900		}
901
902		set_state_bits(tree, state, &bits);
903		cache_state(state, cached_state);
904		merge_state(tree, state);
905		if (last_end == (u64)-1)
906			goto out;
907		start = last_end + 1;
908		state = next_state(state);
909		if (start < end && state && state->start == start &&
910		    !need_resched())
911			goto hit_next;
912		goto search_again;
913	}
914
915	/*
916	 *     | ---- desired range ---- |
917	 * | state |
918	 *   or
919	 * | ------------- state -------------- |
920	 *
921	 * We need to split the extent we found, and may flip bits on
922	 * second half.
923	 *
924	 * If the extent we found extends past our
925	 * range, we just split and search again.  It'll get split
926	 * again the next time though.
927	 *
928	 * If the extent we found is inside our range, we set the
929	 * desired bit on it.
930	 */
931	if (state->start < start) {
932		if (state->state & exclusive_bits) {
933			*failed_start = start;
934			err = -EEXIST;
935			goto out;
936		}
937
938		prealloc = alloc_extent_state_atomic(prealloc);
939		BUG_ON(!prealloc);
940		err = split_state(tree, state, prealloc, start);
941		if (err)
942			extent_io_tree_panic(tree, err);
943
944		prealloc = NULL;
945		if (err)
946			goto out;
947		if (state->end <= end) {
948			set_state_bits(tree, state, &bits);
949			cache_state(state, cached_state);
950			merge_state(tree, state);
951			if (last_end == (u64)-1)
952				goto out;
953			start = last_end + 1;
954			state = next_state(state);
955			if (start < end && state && state->start == start &&
956			    !need_resched())
957				goto hit_next;
958		}
959		goto search_again;
960	}
961	/*
962	 * | ---- desired range ---- |
963	 *     | state | or               | state |
964	 *
965	 * There's a hole, we need to insert something in it and
966	 * ignore the extent we found.
967	 */
968	if (state->start > start) {
969		u64 this_end;
970		if (end < last_start)
971			this_end = end;
972		else
973			this_end = last_start - 1;
974
975		prealloc = alloc_extent_state_atomic(prealloc);
976		BUG_ON(!prealloc);
977
978		/*
979		 * Avoid to free 'prealloc' if it can be merged with
980		 * the later extent.
981		 */
982		err = insert_state(tree, prealloc, start, this_end,
983				   NULL, NULL, &bits);
984		if (err)
985			extent_io_tree_panic(tree, err);
986
987		cache_state(prealloc, cached_state);
988		prealloc = NULL;
989		start = this_end + 1;
990		goto search_again;
991	}
992	/*
993	 * | ---- desired range ---- |
994	 *                        | state |
995	 * We need to split the extent, and set the bit
996	 * on the first half
997	 */
998	if (state->start <= end && state->end > end) {
999		if (state->state & exclusive_bits) {
1000			*failed_start = start;
1001			err = -EEXIST;
1002			goto out;
1003		}
1004
1005		prealloc = alloc_extent_state_atomic(prealloc);
1006		BUG_ON(!prealloc);
1007		err = split_state(tree, state, prealloc, end + 1);
1008		if (err)
1009			extent_io_tree_panic(tree, err);
1010
1011		set_state_bits(tree, prealloc, &bits);
1012		cache_state(prealloc, cached_state);
1013		merge_state(tree, prealloc);
1014		prealloc = NULL;
1015		goto out;
1016	}
1017
1018	goto search_again;
1019
1020out:
1021	spin_unlock(&tree->lock);
1022	if (prealloc)
1023		free_extent_state(prealloc);
1024
1025	return err;
1026
1027search_again:
1028	if (start > end)
1029		goto out;
1030	spin_unlock(&tree->lock);
1031	if (mask & __GFP_WAIT)
1032		cond_resched();
1033	goto again;
1034}
1035
1036int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1037		   unsigned bits, u64 * failed_start,
1038		   struct extent_state **cached_state, gfp_t mask)
1039{
1040	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1041				cached_state, mask);
1042}
1043
1044
1045/**
1046 * convert_extent_bit - convert all bits in a given range from one bit to
1047 * 			another
1048 * @tree:	the io tree to search
1049 * @start:	the start offset in bytes
1050 * @end:	the end offset in bytes (inclusive)
1051 * @bits:	the bits to set in this range
1052 * @clear_bits:	the bits to clear in this range
1053 * @cached_state:	state that we're going to cache
1054 * @mask:	the allocation mask
1055 *
1056 * This will go through and set bits for the given range.  If any states exist
1057 * already in this range they are set with the given bit and cleared of the
1058 * clear_bits.  This is only meant to be used by things that are mergeable, ie
1059 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1060 * boundary bits like LOCK.
1061 */
1062int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1063		       unsigned bits, unsigned clear_bits,
1064		       struct extent_state **cached_state, gfp_t mask)
1065{
1066	struct extent_state *state;
1067	struct extent_state *prealloc = NULL;
1068	struct rb_node *node;
1069	struct rb_node **p;
1070	struct rb_node *parent;
1071	int err = 0;
1072	u64 last_start;
1073	u64 last_end;
1074	bool first_iteration = true;
1075
1076	btrfs_debug_check_extent_io_range(tree, start, end);
1077
1078again:
1079	if (!prealloc && (mask & __GFP_WAIT)) {
1080		/*
1081		 * Best effort, don't worry if extent state allocation fails
1082		 * here for the first iteration. We might have a cached state
1083		 * that matches exactly the target range, in which case no
1084		 * extent state allocations are needed. We'll only know this
1085		 * after locking the tree.
1086		 */
1087		prealloc = alloc_extent_state(mask);
1088		if (!prealloc && !first_iteration)
1089			return -ENOMEM;
1090	}
1091
1092	spin_lock(&tree->lock);
1093	if (cached_state && *cached_state) {
1094		state = *cached_state;
1095		if (state->start <= start && state->end > start &&
1096		    extent_state_in_tree(state)) {
1097			node = &state->rb_node;
1098			goto hit_next;
1099		}
1100	}
1101
1102	/*
1103	 * this search will find all the extents that end after
1104	 * our range starts.
1105	 */
1106	node = tree_search_for_insert(tree, start, &p, &parent);
1107	if (!node) {
1108		prealloc = alloc_extent_state_atomic(prealloc);
1109		if (!prealloc) {
1110			err = -ENOMEM;
1111			goto out;
1112		}
1113		err = insert_state(tree, prealloc, start, end,
1114				   &p, &parent, &bits);
1115		if (err)
1116			extent_io_tree_panic(tree, err);
1117		cache_state(prealloc, cached_state);
1118		prealloc = NULL;
1119		goto out;
1120	}
1121	state = rb_entry(node, struct extent_state, rb_node);
1122hit_next:
1123	last_start = state->start;
1124	last_end = state->end;
1125
1126	/*
1127	 * | ---- desired range ---- |
1128	 * | state |
1129	 *
1130	 * Just lock what we found and keep going
1131	 */
1132	if (state->start == start && state->end <= end) {
1133		set_state_bits(tree, state, &bits);
1134		cache_state(state, cached_state);
1135		state = clear_state_bit(tree, state, &clear_bits, 0);
1136		if (last_end == (u64)-1)
1137			goto out;
1138		start = last_end + 1;
1139		if (start < end && state && state->start == start &&
1140		    !need_resched())
1141			goto hit_next;
1142		goto search_again;
1143	}
1144
1145	/*
1146	 *     | ---- desired range ---- |
1147	 * | state |
1148	 *   or
1149	 * | ------------- state -------------- |
1150	 *
1151	 * We need to split the extent we found, and may flip bits on
1152	 * second half.
1153	 *
1154	 * If the extent we found extends past our
1155	 * range, we just split and search again.  It'll get split
1156	 * again the next time though.
1157	 *
1158	 * If the extent we found is inside our range, we set the
1159	 * desired bit on it.
1160	 */
1161	if (state->start < start) {
1162		prealloc = alloc_extent_state_atomic(prealloc);
1163		if (!prealloc) {
1164			err = -ENOMEM;
1165			goto out;
1166		}
1167		err = split_state(tree, state, prealloc, start);
1168		if (err)
1169			extent_io_tree_panic(tree, err);
1170		prealloc = NULL;
1171		if (err)
1172			goto out;
1173		if (state->end <= end) {
1174			set_state_bits(tree, state, &bits);
1175			cache_state(state, cached_state);
1176			state = clear_state_bit(tree, state, &clear_bits, 0);
1177			if (last_end == (u64)-1)
1178				goto out;
1179			start = last_end + 1;
1180			if (start < end && state && state->start == start &&
1181			    !need_resched())
1182				goto hit_next;
1183		}
1184		goto search_again;
1185	}
1186	/*
1187	 * | ---- desired range ---- |
1188	 *     | state | or               | state |
1189	 *
1190	 * There's a hole, we need to insert something in it and
1191	 * ignore the extent we found.
1192	 */
1193	if (state->start > start) {
1194		u64 this_end;
1195		if (end < last_start)
1196			this_end = end;
1197		else
1198			this_end = last_start - 1;
1199
1200		prealloc = alloc_extent_state_atomic(prealloc);
1201		if (!prealloc) {
1202			err = -ENOMEM;
1203			goto out;
1204		}
1205
1206		/*
1207		 * Avoid to free 'prealloc' if it can be merged with
1208		 * the later extent.
1209		 */
1210		err = insert_state(tree, prealloc, start, this_end,
1211				   NULL, NULL, &bits);
1212		if (err)
1213			extent_io_tree_panic(tree, err);
1214		cache_state(prealloc, cached_state);
1215		prealloc = NULL;
1216		start = this_end + 1;
1217		goto search_again;
1218	}
1219	/*
1220	 * | ---- desired range ---- |
1221	 *                        | state |
1222	 * We need to split the extent, and set the bit
1223	 * on the first half
1224	 */
1225	if (state->start <= end && state->end > end) {
1226		prealloc = alloc_extent_state_atomic(prealloc);
1227		if (!prealloc) {
1228			err = -ENOMEM;
1229			goto out;
1230		}
1231
1232		err = split_state(tree, state, prealloc, end + 1);
1233		if (err)
1234			extent_io_tree_panic(tree, err);
1235
1236		set_state_bits(tree, prealloc, &bits);
1237		cache_state(prealloc, cached_state);
1238		clear_state_bit(tree, prealloc, &clear_bits, 0);
1239		prealloc = NULL;
1240		goto out;
1241	}
1242
1243	goto search_again;
1244
1245out:
1246	spin_unlock(&tree->lock);
1247	if (prealloc)
1248		free_extent_state(prealloc);
1249
1250	return err;
1251
1252search_again:
1253	if (start > end)
1254		goto out;
1255	spin_unlock(&tree->lock);
1256	if (mask & __GFP_WAIT)
1257		cond_resched();
1258	first_iteration = false;
1259	goto again;
1260}
1261
1262/* wrappers around set/clear extent bit */
1263int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1264		     gfp_t mask)
1265{
1266	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1267			      NULL, mask);
1268}
1269
1270int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1271		    unsigned bits, gfp_t mask)
1272{
1273	return set_extent_bit(tree, start, end, bits, NULL,
1274			      NULL, mask);
1275}
1276
1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1278		      unsigned bits, gfp_t mask)
1279{
1280	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1281}
1282
1283int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1284			struct extent_state **cached_state, gfp_t mask)
1285{
1286	return set_extent_bit(tree, start, end,
1287			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1288			      NULL, cached_state, mask);
1289}
1290
1291int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1292		      struct extent_state **cached_state, gfp_t mask)
1293{
1294	return set_extent_bit(tree, start, end,
1295			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1296			      NULL, cached_state, mask);
1297}
1298
1299int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1300		       gfp_t mask)
1301{
1302	return clear_extent_bit(tree, start, end,
1303				EXTENT_DIRTY | EXTENT_DELALLOC |
1304				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1305}
1306
1307int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1308		     gfp_t mask)
1309{
1310	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1311			      NULL, mask);
1312}
1313
1314int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1315			struct extent_state **cached_state, gfp_t mask)
1316{
1317	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
1318			      cached_state, mask);
1319}
1320
1321int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1322			  struct extent_state **cached_state, gfp_t mask)
1323{
1324	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1325				cached_state, mask);
1326}
1327
1328/*
1329 * either insert or lock state struct between start and end use mask to tell
1330 * us if waiting is desired.
1331 */
1332int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1333		     unsigned bits, struct extent_state **cached_state)
1334{
1335	int err;
1336	u64 failed_start;
1337
1338	while (1) {
1339		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1340				       EXTENT_LOCKED, &failed_start,
1341				       cached_state, GFP_NOFS);
1342		if (err == -EEXIST) {
1343			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1344			start = failed_start;
1345		} else
1346			break;
1347		WARN_ON(start > end);
1348	}
1349	return err;
1350}
1351
1352int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1353{
1354	return lock_extent_bits(tree, start, end, 0, NULL);
1355}
1356
1357int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1358{
1359	int err;
1360	u64 failed_start;
1361
1362	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1363			       &failed_start, NULL, GFP_NOFS);
1364	if (err == -EEXIST) {
1365		if (failed_start > start)
1366			clear_extent_bit(tree, start, failed_start - 1,
1367					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1368		return 0;
1369	}
1370	return 1;
1371}
1372
1373int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1374			 struct extent_state **cached, gfp_t mask)
1375{
1376	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1377				mask);
1378}
1379
1380int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1381{
1382	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1383				GFP_NOFS);
1384}
1385
1386int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1387{
1388	unsigned long index = start >> PAGE_CACHE_SHIFT;
1389	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1390	struct page *page;
1391
1392	while (index <= end_index) {
1393		page = find_get_page(inode->i_mapping, index);
1394		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1395		clear_page_dirty_for_io(page);
1396		page_cache_release(page);
1397		index++;
1398	}
1399	return 0;
1400}
1401
1402int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1403{
1404	unsigned long index = start >> PAGE_CACHE_SHIFT;
1405	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1406	struct page *page;
1407
1408	while (index <= end_index) {
1409		page = find_get_page(inode->i_mapping, index);
1410		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1411		__set_page_dirty_nobuffers(page);
1412		account_page_redirty(page);
1413		page_cache_release(page);
1414		index++;
1415	}
1416	return 0;
1417}
1418
1419/*
1420 * helper function to set both pages and extents in the tree writeback
1421 */
1422static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1423{
1424	unsigned long index = start >> PAGE_CACHE_SHIFT;
1425	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1426	struct page *page;
1427
1428	while (index <= end_index) {
1429		page = find_get_page(tree->mapping, index);
1430		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1431		set_page_writeback(page);
1432		page_cache_release(page);
1433		index++;
1434	}
1435	return 0;
1436}
1437
1438/* find the first state struct with 'bits' set after 'start', and
1439 * return it.  tree->lock must be held.  NULL will returned if
1440 * nothing was found after 'start'
1441 */
1442static struct extent_state *
1443find_first_extent_bit_state(struct extent_io_tree *tree,
1444			    u64 start, unsigned bits)
1445{
1446	struct rb_node *node;
1447	struct extent_state *state;
1448
1449	/*
1450	 * this search will find all the extents that end after
1451	 * our range starts.
1452	 */
1453	node = tree_search(tree, start);
1454	if (!node)
1455		goto out;
1456
1457	while (1) {
1458		state = rb_entry(node, struct extent_state, rb_node);
1459		if (state->end >= start && (state->state & bits))
1460			return state;
1461
1462		node = rb_next(node);
1463		if (!node)
1464			break;
1465	}
1466out:
1467	return NULL;
1468}
1469
1470/*
1471 * find the first offset in the io tree with 'bits' set. zero is
1472 * returned if we find something, and *start_ret and *end_ret are
1473 * set to reflect the state struct that was found.
1474 *
1475 * If nothing was found, 1 is returned. If found something, return 0.
1476 */
1477int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1478			  u64 *start_ret, u64 *end_ret, unsigned bits,
1479			  struct extent_state **cached_state)
1480{
1481	struct extent_state *state;
1482	struct rb_node *n;
1483	int ret = 1;
1484
1485	spin_lock(&tree->lock);
1486	if (cached_state && *cached_state) {
1487		state = *cached_state;
1488		if (state->end == start - 1 && extent_state_in_tree(state)) {
1489			n = rb_next(&state->rb_node);
1490			while (n) {
1491				state = rb_entry(n, struct extent_state,
1492						 rb_node);
1493				if (state->state & bits)
1494					goto got_it;
1495				n = rb_next(n);
1496			}
1497			free_extent_state(*cached_state);
1498			*cached_state = NULL;
1499			goto out;
1500		}
1501		free_extent_state(*cached_state);
1502		*cached_state = NULL;
1503	}
1504
1505	state = find_first_extent_bit_state(tree, start, bits);
1506got_it:
1507	if (state) {
1508		cache_state_if_flags(state, cached_state, 0);
1509		*start_ret = state->start;
1510		*end_ret = state->end;
1511		ret = 0;
1512	}
1513out:
1514	spin_unlock(&tree->lock);
1515	return ret;
1516}
1517
1518/*
1519 * find a contiguous range of bytes in the file marked as delalloc, not
1520 * more than 'max_bytes'.  start and end are used to return the range,
1521 *
1522 * 1 is returned if we find something, 0 if nothing was in the tree
1523 */
1524static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1525					u64 *start, u64 *end, u64 max_bytes,
1526					struct extent_state **cached_state)
1527{
1528	struct rb_node *node;
1529	struct extent_state *state;
1530	u64 cur_start = *start;
1531	u64 found = 0;
1532	u64 total_bytes = 0;
1533
1534	spin_lock(&tree->lock);
1535
1536	/*
1537	 * this search will find all the extents that end after
1538	 * our range starts.
1539	 */
1540	node = tree_search(tree, cur_start);
1541	if (!node) {
1542		if (!found)
1543			*end = (u64)-1;
1544		goto out;
1545	}
1546
1547	while (1) {
1548		state = rb_entry(node, struct extent_state, rb_node);
1549		if (found && (state->start != cur_start ||
1550			      (state->state & EXTENT_BOUNDARY))) {
1551			goto out;
1552		}
1553		if (!(state->state & EXTENT_DELALLOC)) {
1554			if (!found)
1555				*end = state->end;
1556			goto out;
1557		}
1558		if (!found) {
1559			*start = state->start;
1560			*cached_state = state;
1561			atomic_inc(&state->refs);
1562		}
1563		found++;
1564		*end = state->end;
1565		cur_start = state->end + 1;
1566		node = rb_next(node);
1567		total_bytes += state->end - state->start + 1;
1568		if (total_bytes >= max_bytes)
1569			break;
1570		if (!node)
1571			break;
1572	}
1573out:
1574	spin_unlock(&tree->lock);
1575	return found;
1576}
1577
1578static noinline void __unlock_for_delalloc(struct inode *inode,
1579					   struct page *locked_page,
1580					   u64 start, u64 end)
1581{
1582	int ret;
1583	struct page *pages[16];
1584	unsigned long index = start >> PAGE_CACHE_SHIFT;
1585	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1586	unsigned long nr_pages = end_index - index + 1;
1587	int i;
1588
1589	if (index == locked_page->index && end_index == index)
1590		return;
1591
1592	while (nr_pages > 0) {
1593		ret = find_get_pages_contig(inode->i_mapping, index,
1594				     min_t(unsigned long, nr_pages,
1595				     ARRAY_SIZE(pages)), pages);
1596		for (i = 0; i < ret; i++) {
1597			if (pages[i] != locked_page)
1598				unlock_page(pages[i]);
1599			page_cache_release(pages[i]);
1600		}
1601		nr_pages -= ret;
1602		index += ret;
1603		cond_resched();
1604	}
1605}
1606
1607static noinline int lock_delalloc_pages(struct inode *inode,
1608					struct page *locked_page,
1609					u64 delalloc_start,
1610					u64 delalloc_end)
1611{
1612	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1613	unsigned long start_index = index;
1614	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1615	unsigned long pages_locked = 0;
1616	struct page *pages[16];
1617	unsigned long nrpages;
1618	int ret;
1619	int i;
1620
1621	/* the caller is responsible for locking the start index */
1622	if (index == locked_page->index && index == end_index)
1623		return 0;
1624
1625	/* skip the page at the start index */
1626	nrpages = end_index - index + 1;
1627	while (nrpages > 0) {
1628		ret = find_get_pages_contig(inode->i_mapping, index,
1629				     min_t(unsigned long,
1630				     nrpages, ARRAY_SIZE(pages)), pages);
1631		if (ret == 0) {
1632			ret = -EAGAIN;
1633			goto done;
1634		}
1635		/* now we have an array of pages, lock them all */
1636		for (i = 0; i < ret; i++) {
1637			/*
1638			 * the caller is taking responsibility for
1639			 * locked_page
1640			 */
1641			if (pages[i] != locked_page) {
1642				lock_page(pages[i]);
1643				if (!PageDirty(pages[i]) ||
1644				    pages[i]->mapping != inode->i_mapping) {
1645					ret = -EAGAIN;
1646					unlock_page(pages[i]);
1647					page_cache_release(pages[i]);
1648					goto done;
1649				}
1650			}
1651			page_cache_release(pages[i]);
1652			pages_locked++;
1653		}
1654		nrpages -= ret;
1655		index += ret;
1656		cond_resched();
1657	}
1658	ret = 0;
1659done:
1660	if (ret && pages_locked) {
1661		__unlock_for_delalloc(inode, locked_page,
1662			      delalloc_start,
1663			      ((u64)(start_index + pages_locked - 1)) <<
1664			      PAGE_CACHE_SHIFT);
1665	}
1666	return ret;
1667}
1668
1669/*
1670 * find a contiguous range of bytes in the file marked as delalloc, not
1671 * more than 'max_bytes'.  start and end are used to return the range,
1672 *
1673 * 1 is returned if we find something, 0 if nothing was in the tree
1674 */
1675STATIC u64 find_lock_delalloc_range(struct inode *inode,
1676				    struct extent_io_tree *tree,
1677				    struct page *locked_page, u64 *start,
1678				    u64 *end, u64 max_bytes)
1679{
1680	u64 delalloc_start;
1681	u64 delalloc_end;
1682	u64 found;
1683	struct extent_state *cached_state = NULL;
1684	int ret;
1685	int loops = 0;
1686
1687again:
1688	/* step one, find a bunch of delalloc bytes starting at start */
1689	delalloc_start = *start;
1690	delalloc_end = 0;
1691	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1692				    max_bytes, &cached_state);
1693	if (!found || delalloc_end <= *start) {
1694		*start = delalloc_start;
1695		*end = delalloc_end;
1696		free_extent_state(cached_state);
1697		return 0;
1698	}
1699
1700	/*
1701	 * start comes from the offset of locked_page.  We have to lock
1702	 * pages in order, so we can't process delalloc bytes before
1703	 * locked_page
1704	 */
1705	if (delalloc_start < *start)
1706		delalloc_start = *start;
1707
1708	/*
1709	 * make sure to limit the number of pages we try to lock down
1710	 */
1711	if (delalloc_end + 1 - delalloc_start > max_bytes)
1712		delalloc_end = delalloc_start + max_bytes - 1;
1713
1714	/* step two, lock all the pages after the page that has start */
1715	ret = lock_delalloc_pages(inode, locked_page,
1716				  delalloc_start, delalloc_end);
1717	if (ret == -EAGAIN) {
1718		/* some of the pages are gone, lets avoid looping by
1719		 * shortening the size of the delalloc range we're searching
1720		 */
1721		free_extent_state(cached_state);
1722		cached_state = NULL;
1723		if (!loops) {
1724			max_bytes = PAGE_CACHE_SIZE;
1725			loops = 1;
1726			goto again;
1727		} else {
1728			found = 0;
1729			goto out_failed;
1730		}
1731	}
1732	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1733
1734	/* step three, lock the state bits for the whole range */
1735	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1736
1737	/* then test to make sure it is all still delalloc */
1738	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1739			     EXTENT_DELALLOC, 1, cached_state);
1740	if (!ret) {
1741		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1742				     &cached_state, GFP_NOFS);
1743		__unlock_for_delalloc(inode, locked_page,
1744			      delalloc_start, delalloc_end);
1745		cond_resched();
1746		goto again;
1747	}
1748	free_extent_state(cached_state);
1749	*start = delalloc_start;
1750	*end = delalloc_end;
1751out_failed:
1752	return found;
1753}
1754
1755int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1756				 struct page *locked_page,
1757				 unsigned clear_bits,
1758				 unsigned long page_ops)
1759{
1760	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1761	int ret;
1762	struct page *pages[16];
1763	unsigned long index = start >> PAGE_CACHE_SHIFT;
1764	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1765	unsigned long nr_pages = end_index - index + 1;
1766	int i;
1767
1768	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1769	if (page_ops == 0)
1770		return 0;
1771
1772	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1773		mapping_set_error(inode->i_mapping, -EIO);
1774
1775	while (nr_pages > 0) {
1776		ret = find_get_pages_contig(inode->i_mapping, index,
1777				     min_t(unsigned long,
1778				     nr_pages, ARRAY_SIZE(pages)), pages);
1779		for (i = 0; i < ret; i++) {
1780
1781			if (page_ops & PAGE_SET_PRIVATE2)
1782				SetPagePrivate2(pages[i]);
1783
1784			if (pages[i] == locked_page) {
1785				page_cache_release(pages[i]);
1786				continue;
1787			}
1788			if (page_ops & PAGE_CLEAR_DIRTY)
1789				clear_page_dirty_for_io(pages[i]);
1790			if (page_ops & PAGE_SET_WRITEBACK)
1791				set_page_writeback(pages[i]);
1792			if (page_ops & PAGE_SET_ERROR)
1793				SetPageError(pages[i]);
1794			if (page_ops & PAGE_END_WRITEBACK)
1795				end_page_writeback(pages[i]);
1796			if (page_ops & PAGE_UNLOCK)
1797				unlock_page(pages[i]);
1798			page_cache_release(pages[i]);
1799		}
1800		nr_pages -= ret;
1801		index += ret;
1802		cond_resched();
1803	}
1804	return 0;
1805}
1806
1807/*
1808 * count the number of bytes in the tree that have a given bit(s)
1809 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1810 * cached.  The total number found is returned.
1811 */
1812u64 count_range_bits(struct extent_io_tree *tree,
1813		     u64 *start, u64 search_end, u64 max_bytes,
1814		     unsigned bits, int contig)
1815{
1816	struct rb_node *node;
1817	struct extent_state *state;
1818	u64 cur_start = *start;
1819	u64 total_bytes = 0;
1820	u64 last = 0;
1821	int found = 0;
1822
1823	if (WARN_ON(search_end <= cur_start))
1824		return 0;
1825
1826	spin_lock(&tree->lock);
1827	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1828		total_bytes = tree->dirty_bytes;
1829		goto out;
1830	}
1831	/*
1832	 * this search will find all the extents that end after
1833	 * our range starts.
1834	 */
1835	node = tree_search(tree, cur_start);
1836	if (!node)
1837		goto out;
1838
1839	while (1) {
1840		state = rb_entry(node, struct extent_state, rb_node);
1841		if (state->start > search_end)
1842			break;
1843		if (contig && found && state->start > last + 1)
1844			break;
1845		if (state->end >= cur_start && (state->state & bits) == bits) {
1846			total_bytes += min(search_end, state->end) + 1 -
1847				       max(cur_start, state->start);
1848			if (total_bytes >= max_bytes)
1849				break;
1850			if (!found) {
1851				*start = max(cur_start, state->start);
1852				found = 1;
1853			}
1854			last = state->end;
1855		} else if (contig && found) {
1856			break;
1857		}
1858		node = rb_next(node);
1859		if (!node)
1860			break;
1861	}
1862out:
1863	spin_unlock(&tree->lock);
1864	return total_bytes;
1865}
1866
1867/*
1868 * set the private field for a given byte offset in the tree.  If there isn't
1869 * an extent_state there already, this does nothing.
1870 */
1871static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1872{
1873	struct rb_node *node;
1874	struct extent_state *state;
1875	int ret = 0;
1876
1877	spin_lock(&tree->lock);
1878	/*
1879	 * this search will find all the extents that end after
1880	 * our range starts.
1881	 */
1882	node = tree_search(tree, start);
1883	if (!node) {
1884		ret = -ENOENT;
1885		goto out;
1886	}
1887	state = rb_entry(node, struct extent_state, rb_node);
1888	if (state->start != start) {
1889		ret = -ENOENT;
1890		goto out;
1891	}
1892	state->private = private;
1893out:
1894	spin_unlock(&tree->lock);
1895	return ret;
1896}
1897
1898int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1899{
1900	struct rb_node *node;
1901	struct extent_state *state;
1902	int ret = 0;
1903
1904	spin_lock(&tree->lock);
1905	/*
1906	 * this search will find all the extents that end after
1907	 * our range starts.
1908	 */
1909	node = tree_search(tree, start);
1910	if (!node) {
1911		ret = -ENOENT;
1912		goto out;
1913	}
1914	state = rb_entry(node, struct extent_state, rb_node);
1915	if (state->start != start) {
1916		ret = -ENOENT;
1917		goto out;
1918	}
1919	*private = state->private;
1920out:
1921	spin_unlock(&tree->lock);
1922	return ret;
1923}
1924
1925/*
1926 * searches a range in the state tree for a given mask.
1927 * If 'filled' == 1, this returns 1 only if every extent in the tree
1928 * has the bits set.  Otherwise, 1 is returned if any bit in the
1929 * range is found set.
1930 */
1931int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1932		   unsigned bits, int filled, struct extent_state *cached)
1933{
1934	struct extent_state *state = NULL;
1935	struct rb_node *node;
1936	int bitset = 0;
1937
1938	spin_lock(&tree->lock);
1939	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
1940	    cached->end > start)
1941		node = &cached->rb_node;
1942	else
1943		node = tree_search(tree, start);
1944	while (node && start <= end) {
1945		state = rb_entry(node, struct extent_state, rb_node);
1946
1947		if (filled && state->start > start) {
1948			bitset = 0;
1949			break;
1950		}
1951
1952		if (state->start > end)
1953			break;
1954
1955		if (state->state & bits) {
1956			bitset = 1;
1957			if (!filled)
1958				break;
1959		} else if (filled) {
1960			bitset = 0;
1961			break;
1962		}
1963
1964		if (state->end == (u64)-1)
1965			break;
1966
1967		start = state->end + 1;
1968		if (start > end)
1969			break;
1970		node = rb_next(node);
1971		if (!node) {
1972			if (filled)
1973				bitset = 0;
1974			break;
1975		}
1976	}
1977	spin_unlock(&tree->lock);
1978	return bitset;
1979}
1980
1981/*
1982 * helper function to set a given page up to date if all the
1983 * extents in the tree for that page are up to date
1984 */
1985static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1986{
1987	u64 start = page_offset(page);
1988	u64 end = start + PAGE_CACHE_SIZE - 1;
1989	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1990		SetPageUptodate(page);
1991}
1992
1993int free_io_failure(struct inode *inode, struct io_failure_record *rec)
1994{
1995	int ret;
1996	int err = 0;
1997	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1998
1999	set_state_private(failure_tree, rec->start, 0);
2000	ret = clear_extent_bits(failure_tree, rec->start,
2001				rec->start + rec->len - 1,
2002				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2003	if (ret)
2004		err = ret;
2005
2006	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
2007				rec->start + rec->len - 1,
2008				EXTENT_DAMAGED, GFP_NOFS);
2009	if (ret && !err)
2010		err = ret;
2011
2012	kfree(rec);
2013	return err;
2014}
2015
2016/*
2017 * this bypasses the standard btrfs submit functions deliberately, as
2018 * the standard behavior is to write all copies in a raid setup. here we only
2019 * want to write the one bad copy. so we do the mapping for ourselves and issue
2020 * submit_bio directly.
2021 * to avoid any synchronization issues, wait for the data after writing, which
2022 * actually prevents the read that triggered the error from finishing.
2023 * currently, there can be no more than two copies of every data bit. thus,
2024 * exactly one rewrite is required.
2025 */
2026int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2027		      struct page *page, unsigned int pg_offset, int mirror_num)
2028{
2029	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2030	struct bio *bio;
2031	struct btrfs_device *dev;
2032	u64 map_length = 0;
2033	u64 sector;
2034	struct btrfs_bio *bbio = NULL;
2035	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
2036	int ret;
2037
2038	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
2039	BUG_ON(!mirror_num);
2040
2041	/* we can't repair anything in raid56 yet */
2042	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
2043		return 0;
2044
2045	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2046	if (!bio)
2047		return -EIO;
2048	bio->bi_iter.bi_size = 0;
2049	map_length = length;
2050
2051	ret = btrfs_map_block(fs_info, WRITE, logical,
2052			      &map_length, &bbio, mirror_num);
2053	if (ret) {
2054		bio_put(bio);
2055		return -EIO;
2056	}
2057	BUG_ON(mirror_num != bbio->mirror_num);
2058	sector = bbio->stripes[mirror_num-1].physical >> 9;
2059	bio->bi_iter.bi_sector = sector;
2060	dev = bbio->stripes[mirror_num-1].dev;
2061	btrfs_put_bbio(bbio);
2062	if (!dev || !dev->bdev || !dev->writeable) {
2063		bio_put(bio);
2064		return -EIO;
2065	}
2066	bio->bi_bdev = dev->bdev;
2067	bio_add_page(bio, page, length, pg_offset);
2068
2069	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
2070		/* try to remap that extent elsewhere? */
2071		bio_put(bio);
2072		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2073		return -EIO;
2074	}
2075
2076	printk_ratelimited_in_rcu(KERN_INFO
2077				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
2078				  btrfs_ino(inode), start,
2079				  rcu_str_deref(dev->name), sector);
2080	bio_put(bio);
2081	return 0;
2082}
2083
2084int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2085			 int mirror_num)
2086{
2087	u64 start = eb->start;
2088	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2089	int ret = 0;
2090
2091	if (root->fs_info->sb->s_flags & MS_RDONLY)
2092		return -EROFS;
2093
2094	for (i = 0; i < num_pages; i++) {
2095		struct page *p = eb->pages[i];
2096
2097		ret = repair_io_failure(root->fs_info->btree_inode, start,
2098					PAGE_CACHE_SIZE, start, p,
2099					start - page_offset(p), mirror_num);
2100		if (ret)
2101			break;
2102		start += PAGE_CACHE_SIZE;
2103	}
2104
2105	return ret;
2106}
2107
2108/*
2109 * each time an IO finishes, we do a fast check in the IO failure tree
2110 * to see if we need to process or clean up an io_failure_record
2111 */
2112int clean_io_failure(struct inode *inode, u64 start, struct page *page,
2113		     unsigned int pg_offset)
2114{
2115	u64 private;
2116	u64 private_failure;
2117	struct io_failure_record *failrec;
2118	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2119	struct extent_state *state;
2120	int num_copies;
2121	int ret;
2122
2123	private = 0;
2124	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2125				(u64)-1, 1, EXTENT_DIRTY, 0);
2126	if (!ret)
2127		return 0;
2128
2129	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2130				&private_failure);
2131	if (ret)
2132		return 0;
2133
2134	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2135	BUG_ON(!failrec->this_mirror);
2136
2137	if (failrec->in_validation) {
2138		/* there was no real error, just free the record */
2139		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2140			 failrec->start);
2141		goto out;
2142	}
2143	if (fs_info->sb->s_flags & MS_RDONLY)
2144		goto out;
2145
2146	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2147	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2148					    failrec->start,
2149					    EXTENT_LOCKED);
2150	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2151
2152	if (state && state->start <= failrec->start &&
2153	    state->end >= failrec->start + failrec->len - 1) {
2154		num_copies = btrfs_num_copies(fs_info, failrec->logical,
2155					      failrec->len);
2156		if (num_copies > 1)  {
2157			repair_io_failure(inode, start, failrec->len,
2158					  failrec->logical, page,
2159					  pg_offset, failrec->failed_mirror);
2160		}
2161	}
2162
2163out:
2164	free_io_failure(inode, failrec);
2165
2166	return 0;
2167}
2168
2169/*
2170 * Can be called when
2171 * - hold extent lock
2172 * - under ordered extent
2173 * - the inode is freeing
2174 */
2175void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2176{
2177	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2178	struct io_failure_record *failrec;
2179	struct extent_state *state, *next;
2180
2181	if (RB_EMPTY_ROOT(&failure_tree->state))
2182		return;
2183
2184	spin_lock(&failure_tree->lock);
2185	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2186	while (state) {
2187		if (state->start > end)
2188			break;
2189
2190		ASSERT(state->end <= end);
2191
2192		next = next_state(state);
2193
2194		failrec = (struct io_failure_record *)(unsigned long)state->private;
2195		free_extent_state(state);
2196		kfree(failrec);
2197
2198		state = next;
2199	}
2200	spin_unlock(&failure_tree->lock);
2201}
2202
2203int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2204				struct io_failure_record **failrec_ret)
2205{
2206	struct io_failure_record *failrec;
2207	u64 private;
2208	struct extent_map *em;
2209	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2210	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2211	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2212	int ret;
2213	u64 logical;
2214
2215	ret = get_state_private(failure_tree, start, &private);
2216	if (ret) {
2217		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2218		if (!failrec)
2219			return -ENOMEM;
2220
2221		failrec->start = start;
2222		failrec->len = end - start + 1;
2223		failrec->this_mirror = 0;
2224		failrec->bio_flags = 0;
2225		failrec->in_validation = 0;
2226
2227		read_lock(&em_tree->lock);
2228		em = lookup_extent_mapping(em_tree, start, failrec->len);
2229		if (!em) {
2230			read_unlock(&em_tree->lock);
2231			kfree(failrec);
2232			return -EIO;
2233		}
2234
2235		if (em->start > start || em->start + em->len <= start) {
2236			free_extent_map(em);
2237			em = NULL;
2238		}
2239		read_unlock(&em_tree->lock);
2240		if (!em) {
2241			kfree(failrec);
2242			return -EIO;
2243		}
2244
2245		logical = start - em->start;
2246		logical = em->block_start + logical;
2247		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2248			logical = em->block_start;
2249			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2250			extent_set_compress_type(&failrec->bio_flags,
2251						 em->compress_type);
2252		}
2253
2254		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
2255			 logical, start, failrec->len);
2256
2257		failrec->logical = logical;
2258		free_extent_map(em);
2259
2260		/* set the bits in the private failure tree */
2261		ret = set_extent_bits(failure_tree, start, end,
2262					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2263		if (ret >= 0)
2264			ret = set_state_private(failure_tree, start,
2265						(u64)(unsigned long)failrec);
2266		/* set the bits in the inode's tree */
2267		if (ret >= 0)
2268			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2269						GFP_NOFS);
2270		if (ret < 0) {
2271			kfree(failrec);
2272			return ret;
2273		}
2274	} else {
2275		failrec = (struct io_failure_record *)(unsigned long)private;
2276		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
2277			 failrec->logical, failrec->start, failrec->len,
2278			 failrec->in_validation);
2279		/*
2280		 * when data can be on disk more than twice, add to failrec here
2281		 * (e.g. with a list for failed_mirror) to make
2282		 * clean_io_failure() clean all those errors at once.
2283		 */
2284	}
2285
2286	*failrec_ret = failrec;
2287
2288	return 0;
2289}
2290
2291int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
2292			   struct io_failure_record *failrec, int failed_mirror)
2293{
2294	int num_copies;
2295
2296	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2297				      failrec->logical, failrec->len);
2298	if (num_copies == 1) {
2299		/*
2300		 * we only have a single copy of the data, so don't bother with
2301		 * all the retry and error correction code that follows. no
2302		 * matter what the error is, it is very likely to persist.
2303		 */
2304		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
2305			 num_copies, failrec->this_mirror, failed_mirror);
2306		return 0;
2307	}
2308
2309	/*
2310	 * there are two premises:
2311	 *	a) deliver good data to the caller
2312	 *	b) correct the bad sectors on disk
2313	 */
2314	if (failed_bio->bi_vcnt > 1) {
2315		/*
2316		 * to fulfill b), we need to know the exact failing sectors, as
2317		 * we don't want to rewrite any more than the failed ones. thus,
2318		 * we need separate read requests for the failed bio
2319		 *
2320		 * if the following BUG_ON triggers, our validation request got
2321		 * merged. we need separate requests for our algorithm to work.
2322		 */
2323		BUG_ON(failrec->in_validation);
2324		failrec->in_validation = 1;
2325		failrec->this_mirror = failed_mirror;
2326	} else {
2327		/*
2328		 * we're ready to fulfill a) and b) alongside. get a good copy
2329		 * of the failed sector and if we succeed, we have setup
2330		 * everything for repair_io_failure to do the rest for us.
2331		 */
2332		if (failrec->in_validation) {
2333			BUG_ON(failrec->this_mirror != failed_mirror);
2334			failrec->in_validation = 0;
2335			failrec->this_mirror = 0;
2336		}
2337		failrec->failed_mirror = failed_mirror;
2338		failrec->this_mirror++;
2339		if (failrec->this_mirror == failed_mirror)
2340			failrec->this_mirror++;
2341	}
2342
2343	if (failrec->this_mirror > num_copies) {
2344		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
2345			 num_copies, failrec->this_mirror, failed_mirror);
2346		return 0;
2347	}
2348
2349	return 1;
2350}
2351
2352
2353struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2354				    struct io_failure_record *failrec,
2355				    struct page *page, int pg_offset, int icsum,
2356				    bio_end_io_t *endio_func, void *data)
2357{
2358	struct bio *bio;
2359	struct btrfs_io_bio *btrfs_failed_bio;
2360	struct btrfs_io_bio *btrfs_bio;
2361
2362	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
2363	if (!bio)
2364		return NULL;
2365
2366	bio->bi_end_io = endio_func;
2367	bio->bi_iter.bi_sector = failrec->logical >> 9;
2368	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2369	bio->bi_iter.bi_size = 0;
2370	bio->bi_private = data;
2371
2372	btrfs_failed_bio = btrfs_io_bio(failed_bio);
2373	if (btrfs_failed_bio->csum) {
2374		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2375		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2376
2377		btrfs_bio = btrfs_io_bio(bio);
2378		btrfs_bio->csum = btrfs_bio->csum_inline;
2379		icsum *= csum_size;
2380		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2381		       csum_size);
2382	}
2383
2384	bio_add_page(bio, page, failrec->len, pg_offset);
2385
2386	return bio;
2387}
2388
2389/*
2390 * this is a generic handler for readpage errors (default
2391 * readpage_io_failed_hook). if other copies exist, read those and write back
2392 * good data to the failed position. does not investigate in remapping the
2393 * failed extent elsewhere, hoping the device will be smart enough to do this as
2394 * needed
2395 */
2396
2397static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2398			      struct page *page, u64 start, u64 end,
2399			      int failed_mirror)
2400{
2401	struct io_failure_record *failrec;
2402	struct inode *inode = page->mapping->host;
2403	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2404	struct bio *bio;
2405	int read_mode;
2406	int ret;
2407
2408	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2409
2410	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2411	if (ret)
2412		return ret;
2413
2414	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
2415	if (!ret) {
2416		free_io_failure(inode, failrec);
2417		return -EIO;
2418	}
2419
2420	if (failed_bio->bi_vcnt > 1)
2421		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2422	else
2423		read_mode = READ_SYNC;
2424
2425	phy_offset >>= inode->i_sb->s_blocksize_bits;
2426	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2427				      start - page_offset(page),
2428				      (int)phy_offset, failed_bio->bi_end_io,
2429				      NULL);
2430	if (!bio) {
2431		free_io_failure(inode, failrec);
2432		return -EIO;
2433	}
2434
2435	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
2436		 read_mode, failrec->this_mirror, failrec->in_validation);
2437
2438	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2439					 failrec->this_mirror,
2440					 failrec->bio_flags, 0);
2441	if (ret) {
2442		free_io_failure(inode, failrec);
2443		bio_put(bio);
2444	}
2445
2446	return ret;
2447}
2448
2449/* lots and lots of room for performance fixes in the end_bio funcs */
2450
2451int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2452{
2453	int uptodate = (err == 0);
2454	struct extent_io_tree *tree;
2455	int ret = 0;
2456
2457	tree = &BTRFS_I(page->mapping->host)->io_tree;
2458
2459	if (tree->ops && tree->ops->writepage_end_io_hook) {
2460		ret = tree->ops->writepage_end_io_hook(page, start,
2461					       end, NULL, uptodate);
2462		if (ret)
2463			uptodate = 0;
2464	}
2465
2466	if (!uptodate) {
2467		ClearPageUptodate(page);
2468		SetPageError(page);
2469		ret = ret < 0 ? ret : -EIO;
2470		mapping_set_error(page->mapping, ret);
2471	}
2472	return 0;
2473}
2474
2475/*
2476 * after a writepage IO is done, we need to:
2477 * clear the uptodate bits on error
2478 * clear the writeback bits in the extent tree for this IO
2479 * end_page_writeback if the page has no more pending IO
2480 *
2481 * Scheduling is not allowed, so the extent state tree is expected
2482 * to have one and only one object corresponding to this IO.
2483 */
2484static void end_bio_extent_writepage(struct bio *bio, int err)
2485{
2486	struct bio_vec *bvec;
2487	u64 start;
2488	u64 end;
2489	int i;
2490
2491	bio_for_each_segment_all(bvec, bio, i) {
2492		struct page *page = bvec->bv_page;
2493
2494		/* We always issue full-page reads, but if some block
2495		 * in a page fails to read, blk_update_request() will
2496		 * advance bv_offset and adjust bv_len to compensate.
2497		 * Print a warning for nonzero offsets, and an error
2498		 * if they don't add up to a full page.  */
2499		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2500			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2501				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2502				   "partial page write in btrfs with offset %u and length %u",
2503					bvec->bv_offset, bvec->bv_len);
2504			else
2505				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2506				   "incomplete page write in btrfs with offset %u and "
2507				   "length %u",
2508					bvec->bv_offset, bvec->bv_len);
2509		}
2510
2511		start = page_offset(page);
2512		end = start + bvec->bv_offset + bvec->bv_len - 1;
2513
2514		if (end_extent_writepage(page, err, start, end))
2515			continue;
2516
2517		end_page_writeback(page);
2518	}
2519
2520	bio_put(bio);
2521}
2522
2523static void
2524endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2525			      int uptodate)
2526{
2527	struct extent_state *cached = NULL;
2528	u64 end = start + len - 1;
2529
2530	if (uptodate && tree->track_uptodate)
2531		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2532	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2533}
2534
2535/*
2536 * after a readpage IO is done, we need to:
2537 * clear the uptodate bits on error
2538 * set the uptodate bits if things worked
2539 * set the page up to date if all extents in the tree are uptodate
2540 * clear the lock bit in the extent tree
2541 * unlock the page if there are no other extents locked for it
2542 *
2543 * Scheduling is not allowed, so the extent state tree is expected
2544 * to have one and only one object corresponding to this IO.
2545 */
2546static void end_bio_extent_readpage(struct bio *bio, int err)
2547{
2548	struct bio_vec *bvec;
2549	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2550	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2551	struct extent_io_tree *tree;
2552	u64 offset = 0;
2553	u64 start;
2554	u64 end;
2555	u64 len;
2556	u64 extent_start = 0;
2557	u64 extent_len = 0;
2558	int mirror;
2559	int ret;
2560	int i;
2561
2562	if (err)
2563		uptodate = 0;
2564
2565	bio_for_each_segment_all(bvec, bio, i) {
2566		struct page *page = bvec->bv_page;
2567		struct inode *inode = page->mapping->host;
2568
2569		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2570			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
2571			 io_bio->mirror_num);
2572		tree = &BTRFS_I(inode)->io_tree;
2573
2574		/* We always issue full-page reads, but if some block
2575		 * in a page fails to read, blk_update_request() will
2576		 * advance bv_offset and adjust bv_len to compensate.
2577		 * Print a warning for nonzero offsets, and an error
2578		 * if they don't add up to a full page.  */
2579		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2580			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2581				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2582				   "partial page read in btrfs with offset %u and length %u",
2583					bvec->bv_offset, bvec->bv_len);
2584			else
2585				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2586				   "incomplete page read in btrfs with offset %u and "
2587				   "length %u",
2588					bvec->bv_offset, bvec->bv_len);
2589		}
2590
2591		start = page_offset(page);
2592		end = start + bvec->bv_offset + bvec->bv_len - 1;
2593		len = bvec->bv_len;
2594
2595		mirror = io_bio->mirror_num;
2596		if (likely(uptodate && tree->ops &&
2597			   tree->ops->readpage_end_io_hook)) {
2598			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2599							      page, start, end,
2600							      mirror);
2601			if (ret)
2602				uptodate = 0;
2603			else
2604				clean_io_failure(inode, start, page, 0);
2605		}
2606
2607		if (likely(uptodate))
2608			goto readpage_ok;
2609
2610		if (tree->ops && tree->ops->readpage_io_failed_hook) {
2611			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2612			if (!ret && !err &&
2613			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2614				uptodate = 1;
2615		} else {
2616			/*
2617			 * The generic bio_readpage_error handles errors the
2618			 * following way: If possible, new read requests are
2619			 * created and submitted and will end up in
2620			 * end_bio_extent_readpage as well (if we're lucky, not
2621			 * in the !uptodate case). In that case it returns 0 and
2622			 * we just go on with the next page in our bio. If it
2623			 * can't handle the error it will return -EIO and we
2624			 * remain responsible for that page.
2625			 */
2626			ret = bio_readpage_error(bio, offset, page, start, end,
2627						 mirror);
2628			if (ret == 0) {
2629				uptodate =
2630					test_bit(BIO_UPTODATE, &bio->bi_flags);
2631				if (err)
2632					uptodate = 0;
2633				offset += len;
2634				continue;
2635			}
2636		}
2637readpage_ok:
2638		if (likely(uptodate)) {
2639			loff_t i_size = i_size_read(inode);
2640			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2641			unsigned off;
2642
2643			/* Zero out the end if this page straddles i_size */
2644			off = i_size & (PAGE_CACHE_SIZE-1);
2645			if (page->index == end_index && off)
2646				zero_user_segment(page, off, PAGE_CACHE_SIZE);
2647			SetPageUptodate(page);
2648		} else {
2649			ClearPageUptodate(page);
2650			SetPageError(page);
2651		}
2652		unlock_page(page);
2653		offset += len;
2654
2655		if (unlikely(!uptodate)) {
2656			if (extent_len) {
2657				endio_readpage_release_extent(tree,
2658							      extent_start,
2659							      extent_len, 1);
2660				extent_start = 0;
2661				extent_len = 0;
2662			}
2663			endio_readpage_release_extent(tree, start,
2664						      end - start + 1, 0);
2665		} else if (!extent_len) {
2666			extent_start = start;
2667			extent_len = end + 1 - start;
2668		} else if (extent_start + extent_len == start) {
2669			extent_len += end + 1 - start;
2670		} else {
2671			endio_readpage_release_extent(tree, extent_start,
2672						      extent_len, uptodate);
2673			extent_start = start;
2674			extent_len = end + 1 - start;
2675		}
2676	}
2677
2678	if (extent_len)
2679		endio_readpage_release_extent(tree, extent_start, extent_len,
2680					      uptodate);
2681	if (io_bio->end_io)
2682		io_bio->end_io(io_bio, err);
2683	bio_put(bio);
2684}
2685
2686/*
2687 * this allocates from the btrfs_bioset.  We're returning a bio right now
2688 * but you can call btrfs_io_bio for the appropriate container_of magic
2689 */
2690struct bio *
2691btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2692		gfp_t gfp_flags)
2693{
2694	struct btrfs_io_bio *btrfs_bio;
2695	struct bio *bio;
2696
2697	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
2698
2699	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2700		while (!bio && (nr_vecs /= 2)) {
2701			bio = bio_alloc_bioset(gfp_flags,
2702					       nr_vecs, btrfs_bioset);
2703		}
2704	}
2705
2706	if (bio) {
2707		bio->bi_bdev = bdev;
2708		bio->bi_iter.bi_sector = first_sector;
2709		btrfs_bio = btrfs_io_bio(bio);
2710		btrfs_bio->csum = NULL;
2711		btrfs_bio->csum_allocated = NULL;
2712		btrfs_bio->end_io = NULL;
2713	}
2714	return bio;
2715}
2716
2717struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
2718{
2719	struct btrfs_io_bio *btrfs_bio;
2720	struct bio *new;
2721
2722	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
2723	if (new) {
2724		btrfs_bio = btrfs_io_bio(new);
2725		btrfs_bio->csum = NULL;
2726		btrfs_bio->csum_allocated = NULL;
2727		btrfs_bio->end_io = NULL;
2728	}
2729	return new;
2730}
2731
2732/* this also allocates from the btrfs_bioset */
2733struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
2734{
2735	struct btrfs_io_bio *btrfs_bio;
2736	struct bio *bio;
2737
2738	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
2739	if (bio) {
2740		btrfs_bio = btrfs_io_bio(bio);
2741		btrfs_bio->csum = NULL;
2742		btrfs_bio->csum_allocated = NULL;
2743		btrfs_bio->end_io = NULL;
2744	}
2745	return bio;
2746}
2747
2748
2749static int __must_check submit_one_bio(int rw, struct bio *bio,
2750				       int mirror_num, unsigned long bio_flags)
2751{
2752	int ret = 0;
2753	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2754	struct page *page = bvec->bv_page;
2755	struct extent_io_tree *tree = bio->bi_private;
2756	u64 start;
2757
2758	start = page_offset(page) + bvec->bv_offset;
2759
2760	bio->bi_private = NULL;
2761
2762	bio_get(bio);
2763
2764	if (tree->ops && tree->ops->submit_bio_hook)
2765		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2766					   mirror_num, bio_flags, start);
2767	else
2768		btrfsic_submit_bio(rw, bio);
2769
2770	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2771		ret = -EOPNOTSUPP;
2772	bio_put(bio);
2773	return ret;
2774}
2775
2776static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2777		     unsigned long offset, size_t size, struct bio *bio,
2778		     unsigned long bio_flags)
2779{
2780	int ret = 0;
2781	if (tree->ops && tree->ops->merge_bio_hook)
2782		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2783						bio_flags);
2784	BUG_ON(ret < 0);
2785	return ret;
2786
2787}
2788
2789static int submit_extent_page(int rw, struct extent_io_tree *tree,
2790			      struct page *page, sector_t sector,
2791			      size_t size, unsigned long offset,
2792			      struct block_device *bdev,
2793			      struct bio **bio_ret,
2794			      unsigned long max_pages,
2795			      bio_end_io_t end_io_func,
2796			      int mirror_num,
2797			      unsigned long prev_bio_flags,
2798			      unsigned long bio_flags,
2799			      bool force_bio_submit)
2800{
2801	int ret = 0;
2802	struct bio *bio;
2803	int nr;
2804	int contig = 0;
2805	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2806	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2807	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2808
2809	if (bio_ret && *bio_ret) {
2810		bio = *bio_ret;
2811		if (old_compressed)
2812			contig = bio->bi_iter.bi_sector == sector;
2813		else
2814			contig = bio_end_sector(bio) == sector;
2815
2816		if (prev_bio_flags != bio_flags || !contig ||
2817		    force_bio_submit ||
2818		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2819		    bio_add_page(bio, page, page_size, offset) < page_size) {
2820			ret = submit_one_bio(rw, bio, mirror_num,
2821					     prev_bio_flags);
2822			if (ret < 0) {
2823				*bio_ret = NULL;
2824				return ret;
2825			}
2826			bio = NULL;
2827		} else {
2828			return 0;
2829		}
2830	}
2831	if (this_compressed)
2832		nr = BIO_MAX_PAGES;
2833	else
2834		nr = bio_get_nr_vecs(bdev);
2835
2836	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2837	if (!bio)
2838		return -ENOMEM;
2839
2840	bio_add_page(bio, page, page_size, offset);
2841	bio->bi_end_io = end_io_func;
2842	bio->bi_private = tree;
2843
2844	if (bio_ret)
2845		*bio_ret = bio;
2846	else
2847		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2848
2849	return ret;
2850}
2851
2852static void attach_extent_buffer_page(struct extent_buffer *eb,
2853				      struct page *page)
2854{
2855	if (!PagePrivate(page)) {
2856		SetPagePrivate(page);
2857		page_cache_get(page);
2858		set_page_private(page, (unsigned long)eb);
2859	} else {
2860		WARN_ON(page->private != (unsigned long)eb);
2861	}
2862}
2863
2864void set_page_extent_mapped(struct page *page)
2865{
2866	if (!PagePrivate(page)) {
2867		SetPagePrivate(page);
2868		page_cache_get(page);
2869		set_page_private(page, EXTENT_PAGE_PRIVATE);
2870	}
2871}
2872
2873static struct extent_map *
2874__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2875		 u64 start, u64 len, get_extent_t *get_extent,
2876		 struct extent_map **em_cached)
2877{
2878	struct extent_map *em;
2879
2880	if (em_cached && *em_cached) {
2881		em = *em_cached;
2882		if (extent_map_in_tree(em) && start >= em->start &&
2883		    start < extent_map_end(em)) {
2884			atomic_inc(&em->refs);
2885			return em;
2886		}
2887
2888		free_extent_map(em);
2889		*em_cached = NULL;
2890	}
2891
2892	em = get_extent(inode, page, pg_offset, start, len, 0);
2893	if (em_cached && !IS_ERR_OR_NULL(em)) {
2894		BUG_ON(*em_cached);
2895		atomic_inc(&em->refs);
2896		*em_cached = em;
2897	}
2898	return em;
2899}
2900/*
2901 * basic readpage implementation.  Locked extent state structs are inserted
2902 * into the tree that are removed when the IO is done (by the end_io
2903 * handlers)
2904 * XXX JDM: This needs looking at to ensure proper page locking
2905 */
2906static int __do_readpage(struct extent_io_tree *tree,
2907			 struct page *page,
2908			 get_extent_t *get_extent,
2909			 struct extent_map **em_cached,
2910			 struct bio **bio, int mirror_num,
2911			 unsigned long *bio_flags, int rw,
2912			 u64 *prev_em_start)
2913{
2914	struct inode *inode = page->mapping->host;
2915	u64 start = page_offset(page);
2916	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2917	u64 end;
2918	u64 cur = start;
2919	u64 extent_offset;
2920	u64 last_byte = i_size_read(inode);
2921	u64 block_start;
2922	u64 cur_end;
2923	sector_t sector;
2924	struct extent_map *em;
2925	struct block_device *bdev;
2926	int ret;
2927	int nr = 0;
2928	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
2929	size_t pg_offset = 0;
2930	size_t iosize;
2931	size_t disk_io_size;
2932	size_t blocksize = inode->i_sb->s_blocksize;
2933	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
2934
2935	set_page_extent_mapped(page);
2936
2937	end = page_end;
2938	if (!PageUptodate(page)) {
2939		if (cleancache_get_page(page) == 0) {
2940			BUG_ON(blocksize != PAGE_SIZE);
2941			unlock_extent(tree, start, end);
2942			goto out;
2943		}
2944	}
2945
2946	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2947		char *userpage;
2948		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2949
2950		if (zero_offset) {
2951			iosize = PAGE_CACHE_SIZE - zero_offset;
2952			userpage = kmap_atomic(page);
2953			memset(userpage + zero_offset, 0, iosize);
2954			flush_dcache_page(page);
2955			kunmap_atomic(userpage);
2956		}
2957	}
2958	while (cur <= end) {
2959		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2960		bool force_bio_submit = false;
2961
2962		if (cur >= last_byte) {
2963			char *userpage;
2964			struct extent_state *cached = NULL;
2965
2966			iosize = PAGE_CACHE_SIZE - pg_offset;
2967			userpage = kmap_atomic(page);
2968			memset(userpage + pg_offset, 0, iosize);
2969			flush_dcache_page(page);
2970			kunmap_atomic(userpage);
2971			set_extent_uptodate(tree, cur, cur + iosize - 1,
2972					    &cached, GFP_NOFS);
2973			if (!parent_locked)
2974				unlock_extent_cached(tree, cur,
2975						     cur + iosize - 1,
2976						     &cached, GFP_NOFS);
2977			break;
2978		}
2979		em = __get_extent_map(inode, page, pg_offset, cur,
2980				      end - cur + 1, get_extent, em_cached);
2981		if (IS_ERR_OR_NULL(em)) {
2982			SetPageError(page);
2983			if (!parent_locked)
2984				unlock_extent(tree, cur, end);
2985			break;
2986		}
2987		extent_offset = cur - em->start;
2988		BUG_ON(extent_map_end(em) <= cur);
2989		BUG_ON(end < cur);
2990
2991		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2992			this_bio_flag |= EXTENT_BIO_COMPRESSED;
2993			extent_set_compress_type(&this_bio_flag,
2994						 em->compress_type);
2995		}
2996
2997		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2998		cur_end = min(extent_map_end(em) - 1, end);
2999		iosize = ALIGN(iosize, blocksize);
3000		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3001			disk_io_size = em->block_len;
3002			sector = em->block_start >> 9;
3003		} else {
3004			sector = (em->block_start + extent_offset) >> 9;
3005			disk_io_size = iosize;
3006		}
3007		bdev = em->bdev;
3008		block_start = em->block_start;
3009		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3010			block_start = EXTENT_MAP_HOLE;
3011
3012		/*
3013		 * If we have a file range that points to a compressed extent
3014		 * and it's followed by a consecutive file range that points to
3015		 * to the same compressed extent (possibly with a different
3016		 * offset and/or length, so it either points to the whole extent
3017		 * or only part of it), we must make sure we do not submit a
3018		 * single bio to populate the pages for the 2 ranges because
3019		 * this makes the compressed extent read zero out the pages
3020		 * belonging to the 2nd range. Imagine the following scenario:
3021		 *
3022		 *  File layout
3023		 *  [0 - 8K]                     [8K - 24K]
3024		 *    |                               |
3025		 *    |                               |
3026		 * points to extent X,         points to extent X,
3027		 * offset 4K, length of 8K     offset 0, length 16K
3028		 *
3029		 * [extent X, compressed length = 4K uncompressed length = 16K]
3030		 *
3031		 * If the bio to read the compressed extent covers both ranges,
3032		 * it will decompress extent X into the pages belonging to the
3033		 * first range and then it will stop, zeroing out the remaining
3034		 * pages that belong to the other range that points to extent X.
3035		 * So here we make sure we submit 2 bios, one for the first
3036		 * range and another one for the third range. Both will target
3037		 * the same physical extent from disk, but we can't currently
3038		 * make the compressed bio endio callback populate the pages
3039		 * for both ranges because each compressed bio is tightly
3040		 * coupled with a single extent map, and each range can have
3041		 * an extent map with a different offset value relative to the
3042		 * uncompressed data of our extent and different lengths. This
3043		 * is a corner case so we prioritize correctness over
3044		 * non-optimal behavior (submitting 2 bios for the same extent).
3045		 */
3046		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3047		    prev_em_start && *prev_em_start != (u64)-1 &&
3048		    *prev_em_start != em->orig_start)
3049			force_bio_submit = true;
3050
3051		if (prev_em_start)
3052			*prev_em_start = em->orig_start;
3053
3054		free_extent_map(em);
3055		em = NULL;
3056
3057		/* we've found a hole, just zero and go on */
3058		if (block_start == EXTENT_MAP_HOLE) {
3059			char *userpage;
3060			struct extent_state *cached = NULL;
3061
3062			userpage = kmap_atomic(page);
3063			memset(userpage + pg_offset, 0, iosize);
3064			flush_dcache_page(page);
3065			kunmap_atomic(userpage);
3066
3067			set_extent_uptodate(tree, cur, cur + iosize - 1,
3068					    &cached, GFP_NOFS);
3069			unlock_extent_cached(tree, cur, cur + iosize - 1,
3070			                     &cached, GFP_NOFS);
3071			cur = cur + iosize;
3072			pg_offset += iosize;
3073			continue;
3074		}
3075		/* the get_extent function already copied into the page */
3076		if (test_range_bit(tree, cur, cur_end,
3077				   EXTENT_UPTODATE, 1, NULL)) {
3078			check_page_uptodate(tree, page);
3079			if (!parent_locked)
3080				unlock_extent(tree, cur, cur + iosize - 1);
3081			cur = cur + iosize;
3082			pg_offset += iosize;
3083			continue;
3084		}
3085		/* we have an inline extent but it didn't get marked up
3086		 * to date.  Error out
3087		 */
3088		if (block_start == EXTENT_MAP_INLINE) {
3089			SetPageError(page);
3090			if (!parent_locked)
3091				unlock_extent(tree, cur, cur + iosize - 1);
3092			cur = cur + iosize;
3093			pg_offset += iosize;
3094			continue;
3095		}
3096
3097		pnr -= page->index;
3098		ret = submit_extent_page(rw, tree, page,
3099					 sector, disk_io_size, pg_offset,
3100					 bdev, bio, pnr,
3101					 end_bio_extent_readpage, mirror_num,
3102					 *bio_flags,
3103					 this_bio_flag,
3104					 force_bio_submit);
3105		if (!ret) {
3106			nr++;
3107			*bio_flags = this_bio_flag;
3108		} else {
3109			SetPageError(page);
3110			if (!parent_locked)
3111				unlock_extent(tree, cur, cur + iosize - 1);
3112		}
3113		cur = cur + iosize;
3114		pg_offset += iosize;
3115	}
3116out:
3117	if (!nr) {
3118		if (!PageError(page))
3119			SetPageUptodate(page);
3120		unlock_page(page);
3121	}
3122	return 0;
3123}
3124
3125static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
3126					     struct page *pages[], int nr_pages,
3127					     u64 start, u64 end,
3128					     get_extent_t *get_extent,
3129					     struct extent_map **em_cached,
3130					     struct bio **bio, int mirror_num,
3131					     unsigned long *bio_flags, int rw,
3132					     u64 *prev_em_start)
3133{
3134	struct inode *inode;
3135	struct btrfs_ordered_extent *ordered;
3136	int index;
3137
3138	inode = pages[0]->mapping->host;
3139	while (1) {
3140		lock_extent(tree, start, end);
3141		ordered = btrfs_lookup_ordered_range(inode, start,
3142						     end - start + 1);
3143		if (!ordered)
3144			break;
3145		unlock_extent(tree, start, end);
3146		btrfs_start_ordered_extent(inode, ordered, 1);
3147		btrfs_put_ordered_extent(ordered);
3148	}
3149
3150	for (index = 0; index < nr_pages; index++) {
3151		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
3152			      mirror_num, bio_flags, rw, prev_em_start);
3153		page_cache_release(pages[index]);
3154	}
3155}
3156
3157static void __extent_readpages(struct extent_io_tree *tree,
3158			       struct page *pages[],
3159			       int nr_pages, get_extent_t *get_extent,
3160			       struct extent_map **em_cached,
3161			       struct bio **bio, int mirror_num,
3162			       unsigned long *bio_flags, int rw,
3163			       u64 *prev_em_start)
3164{
3165	u64 start = 0;
3166	u64 end = 0;
3167	u64 page_start;
3168	int index;
3169	int first_index = 0;
3170
3171	for (index = 0; index < nr_pages; index++) {
3172		page_start = page_offset(pages[index]);
3173		if (!end) {
3174			start = page_start;
3175			end = start + PAGE_CACHE_SIZE - 1;
3176			first_index = index;
3177		} else if (end + 1 == page_start) {
3178			end += PAGE_CACHE_SIZE;
3179		} else {
3180			__do_contiguous_readpages(tree, &pages[first_index],
3181						  index - first_index, start,
3182						  end, get_extent, em_cached,
3183						  bio, mirror_num, bio_flags,
3184						  rw, prev_em_start);
3185			start = page_start;
3186			end = start + PAGE_CACHE_SIZE - 1;
3187			first_index = index;
3188		}
3189	}
3190
3191	if (end)
3192		__do_contiguous_readpages(tree, &pages[first_index],
3193					  index - first_index, start,
3194					  end, get_extent, em_cached, bio,
3195					  mirror_num, bio_flags, rw,
3196					  prev_em_start);
3197}
3198
3199static int __extent_read_full_page(struct extent_io_tree *tree,
3200				   struct page *page,
3201				   get_extent_t *get_extent,
3202				   struct bio **bio, int mirror_num,
3203				   unsigned long *bio_flags, int rw)
3204{
3205	struct inode *inode = page->mapping->host;
3206	struct btrfs_ordered_extent *ordered;
3207	u64 start = page_offset(page);
3208	u64 end = start + PAGE_CACHE_SIZE - 1;
3209	int ret;
3210
3211	while (1) {
3212		lock_extent(tree, start, end);
3213		ordered = btrfs_lookup_ordered_extent(inode, start);
3214		if (!ordered)
3215			break;
3216		unlock_extent(tree, start, end);
3217		btrfs_start_ordered_extent(inode, ordered, 1);
3218		btrfs_put_ordered_extent(ordered);
3219	}
3220
3221	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3222			    bio_flags, rw, NULL);
3223	return ret;
3224}
3225
3226int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3227			    get_extent_t *get_extent, int mirror_num)
3228{
3229	struct bio *bio = NULL;
3230	unsigned long bio_flags = 0;
3231	int ret;
3232
3233	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3234				      &bio_flags, READ);
3235	if (bio)
3236		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3237	return ret;
3238}
3239
3240int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
3241				 get_extent_t *get_extent, int mirror_num)
3242{
3243	struct bio *bio = NULL;
3244	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
3245	int ret;
3246
3247	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
3248			    &bio_flags, READ, NULL);
3249	if (bio)
3250		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
3251	return ret;
3252}
3253
3254static noinline void update_nr_written(struct page *page,
3255				      struct writeback_control *wbc,
3256				      unsigned long nr_written)
3257{
3258	wbc->nr_to_write -= nr_written;
3259	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
3260	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
3261		page->mapping->writeback_index = page->index + nr_written;
3262}
3263
3264/*
3265 * helper for __extent_writepage, doing all of the delayed allocation setup.
3266 *
3267 * This returns 1 if our fill_delalloc function did all the work required
3268 * to write the page (copy into inline extent).  In this case the IO has
3269 * been started and the page is already unlocked.
3270 *
3271 * This returns 0 if all went well (page still locked)
3272 * This returns < 0 if there were errors (page still locked)
3273 */
3274static noinline_for_stack int writepage_delalloc(struct inode *inode,
3275			      struct page *page, struct writeback_control *wbc,
3276			      struct extent_page_data *epd,
3277			      u64 delalloc_start,
3278			      unsigned long *nr_written)
3279{
3280	struct extent_io_tree *tree = epd->tree;
3281	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
3282	u64 nr_delalloc;
3283	u64 delalloc_to_write = 0;
3284	u64 delalloc_end = 0;
3285	int ret;
3286	int page_started = 0;
3287
3288	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3289		return 0;
3290
3291	while (delalloc_end < page_end) {
3292		nr_delalloc = find_lock_delalloc_range(inode, tree,
3293					       page,
3294					       &delalloc_start,
3295					       &delalloc_end,
3296					       BTRFS_MAX_EXTENT_SIZE);
3297		if (nr_delalloc == 0) {
3298			delalloc_start = delalloc_end + 1;
3299			continue;
3300		}
3301		ret = tree->ops->fill_delalloc(inode, page,
3302					       delalloc_start,
3303					       delalloc_end,
3304					       &page_started,
3305					       nr_written);
3306		/* File system has been set read-only */
3307		if (ret) {
3308			SetPageError(page);
3309			/* fill_delalloc should be return < 0 for error
3310			 * but just in case, we use > 0 here meaning the
3311			 * IO is started, so we don't want to return > 0
3312			 * unless things are going well.
3313			 */
3314			ret = ret < 0 ? ret : -EIO;
3315			goto done;
3316		}
3317		/*
3318		 * delalloc_end is already one less than the total
3319		 * length, so we don't subtract one from
3320		 * PAGE_CACHE_SIZE
3321		 */
3322		delalloc_to_write += (delalloc_end - delalloc_start +
3323				      PAGE_CACHE_SIZE) >>
3324				      PAGE_CACHE_SHIFT;
3325		delalloc_start = delalloc_end + 1;
3326	}
3327	if (wbc->nr_to_write < delalloc_to_write) {
3328		int thresh = 8192;
3329
3330		if (delalloc_to_write < thresh * 2)
3331			thresh = delalloc_to_write;
3332		wbc->nr_to_write = min_t(u64, delalloc_to_write,
3333					 thresh);
3334	}
3335
3336	/* did the fill delalloc function already unlock and start
3337	 * the IO?
3338	 */
3339	if (page_started) {
3340		/*
3341		 * we've unlocked the page, so we can't update
3342		 * the mapping's writeback index, just update
3343		 * nr_to_write.
3344		 */
3345		wbc->nr_to_write -= *nr_written;
3346		return 1;
3347	}
3348
3349	ret = 0;
3350
3351done:
3352	return ret;
3353}
3354
3355/*
3356 * helper for __extent_writepage.  This calls the writepage start hooks,
3357 * and does the loop to map the page into extents and bios.
3358 *
3359 * We return 1 if the IO is started and the page is unlocked,
3360 * 0 if all went well (page still locked)
3361 * < 0 if there were errors (page still locked)
3362 */
3363static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3364				 struct page *page,
3365				 struct writeback_control *wbc,
3366				 struct extent_page_data *epd,
3367				 loff_t i_size,
3368				 unsigned long nr_written,
3369				 int write_flags, int *nr_ret)
3370{
3371	struct extent_io_tree *tree = epd->tree;
3372	u64 start = page_offset(page);
3373	u64 page_end = start + PAGE_CACHE_SIZE - 1;
3374	u64 end;
3375	u64 cur = start;
3376	u64 extent_offset;
3377	u64 block_start;
3378	u64 iosize;
3379	sector_t sector;
3380	struct extent_state *cached_state = NULL;
3381	struct extent_map *em;
3382	struct block_device *bdev;
3383	size_t pg_offset = 0;
3384	size_t blocksize;
3385	int ret = 0;
3386	int nr = 0;
3387	bool compressed;
3388
3389	if (tree->ops && tree->ops->writepage_start_hook) {
3390		ret = tree->ops->writepage_start_hook(page, start,
3391						      page_end);
3392		if (ret) {
3393			/* Fixup worker will requeue */
3394			if (ret == -EBUSY)
3395				wbc->pages_skipped++;
3396			else
3397				redirty_page_for_writepage(wbc, page);
3398
3399			update_nr_written(page, wbc, nr_written);
3400			unlock_page(page);
3401			ret = 1;
3402			goto done_unlocked;
3403		}
3404	}
3405
3406	/*
3407	 * we don't want to touch the inode after unlocking the page,
3408	 * so we update the mapping writeback index now
3409	 */
3410	update_nr_written(page, wbc, nr_written + 1);
3411
3412	end = page_end;
3413	if (i_size <= start) {
3414		if (tree->ops && tree->ops->writepage_end_io_hook)
3415			tree->ops->writepage_end_io_hook(page, start,
3416							 page_end, NULL, 1);
3417		goto done;
3418	}
3419
3420	blocksize = inode->i_sb->s_blocksize;
3421
3422	while (cur <= end) {
3423		u64 em_end;
3424		if (cur >= i_size) {
3425			if (tree->ops && tree->ops->writepage_end_io_hook)
3426				tree->ops->writepage_end_io_hook(page, cur,
3427							 page_end, NULL, 1);
3428			break;
3429		}
3430		em = epd->get_extent(inode, page, pg_offset, cur,
3431				     end - cur + 1, 1);
3432		if (IS_ERR_OR_NULL(em)) {
3433			SetPageError(page);
3434			ret = PTR_ERR_OR_ZERO(em);
3435			break;
3436		}
3437
3438		extent_offset = cur - em->start;
3439		em_end = extent_map_end(em);
3440		BUG_ON(em_end <= cur);
3441		BUG_ON(end < cur);
3442		iosize = min(em_end - cur, end - cur + 1);
3443		iosize = ALIGN(iosize, blocksize);
3444		sector = (em->block_start + extent_offset) >> 9;
3445		bdev = em->bdev;
3446		block_start = em->block_start;
3447		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3448		free_extent_map(em);
3449		em = NULL;
3450
3451		/*
3452		 * compressed and inline extents are written through other
3453		 * paths in the FS
3454		 */
3455		if (compressed || block_start == EXTENT_MAP_HOLE ||
3456		    block_start == EXTENT_MAP_INLINE) {
3457			/*
3458			 * end_io notification does not happen here for
3459			 * compressed extents
3460			 */
3461			if (!compressed && tree->ops &&
3462			    tree->ops->writepage_end_io_hook)
3463				tree->ops->writepage_end_io_hook(page, cur,
3464							 cur + iosize - 1,
3465							 NULL, 1);
3466			else if (compressed) {
3467				/* we don't want to end_page_writeback on
3468				 * a compressed extent.  this happens
3469				 * elsewhere
3470				 */
3471				nr++;
3472			}
3473
3474			cur += iosize;
3475			pg_offset += iosize;
3476			continue;
3477		}
3478
3479		if (tree->ops && tree->ops->writepage_io_hook) {
3480			ret = tree->ops->writepage_io_hook(page, cur,
3481						cur + iosize - 1);
3482		} else {
3483			ret = 0;
3484		}
3485		if (ret) {
3486			SetPageError(page);
3487		} else {
3488			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
3489
3490			set_range_writeback(tree, cur, cur + iosize - 1);
3491			if (!PageWriteback(page)) {
3492				btrfs_err(BTRFS_I(inode)->root->fs_info,
3493					   "page %lu not writeback, cur %llu end %llu",
3494				       page->index, cur, end);
3495			}
3496
3497			ret = submit_extent_page(write_flags, tree, page,
3498						 sector, iosize, pg_offset,
3499						 bdev, &epd->bio, max_nr,
3500						 end_bio_extent_writepage,
3501						 0, 0, 0, false);
3502			if (ret)
3503				SetPageError(page);
3504		}
3505		cur = cur + iosize;
3506		pg_offset += iosize;
3507		nr++;
3508	}
3509done:
3510	*nr_ret = nr;
3511
3512done_unlocked:
3513
3514	/* drop our reference on any cached states */
3515	free_extent_state(cached_state);
3516	return ret;
3517}
3518
3519/*
3520 * the writepage semantics are similar to regular writepage.  extent
3521 * records are inserted to lock ranges in the tree, and as dirty areas
3522 * are found, they are marked writeback.  Then the lock bits are removed
3523 * and the end_io handler clears the writeback ranges
3524 */
3525static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3526			      void *data)
3527{
3528	struct inode *inode = page->mapping->host;
3529	struct extent_page_data *epd = data;
3530	u64 start = page_offset(page);
3531	u64 page_end = start + PAGE_CACHE_SIZE - 1;
3532	int ret;
3533	int nr = 0;
3534	size_t pg_offset = 0;
3535	loff_t i_size = i_size_read(inode);
3536	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
3537	int write_flags;
3538	unsigned long nr_written = 0;
3539
3540	if (wbc->sync_mode == WB_SYNC_ALL)
3541		write_flags = WRITE_SYNC;
3542	else
3543		write_flags = WRITE;
3544
3545	trace___extent_writepage(page, inode, wbc);
3546
3547	WARN_ON(!PageLocked(page));
3548
3549	ClearPageError(page);
3550
3551	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
3552	if (page->index > end_index ||
3553	   (page->index == end_index && !pg_offset)) {
3554		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
3555		unlock_page(page);
3556		return 0;
3557	}
3558
3559	if (page->index == end_index) {
3560		char *userpage;
3561
3562		userpage = kmap_atomic(page);
3563		memset(userpage + pg_offset, 0,
3564		       PAGE_CACHE_SIZE - pg_offset);
3565		kunmap_atomic(userpage);
3566		flush_dcache_page(page);
3567	}
3568
3569	pg_offset = 0;
3570
3571	set_page_extent_mapped(page);
3572
3573	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3574	if (ret == 1)
3575		goto done_unlocked;
3576	if (ret)
3577		goto done;
3578
3579	ret = __extent_writepage_io(inode, page, wbc, epd,
3580				    i_size, nr_written, write_flags, &nr);
3581	if (ret == 1)
3582		goto done_unlocked;
3583
3584done:
3585	if (nr == 0) {
3586		/* make sure the mapping tag for page dirty gets cleared */
3587		set_page_writeback(page);
3588		end_page_writeback(page);
3589	}
3590	if (PageError(page)) {
3591		ret = ret < 0 ? ret : -EIO;
3592		end_extent_writepage(page, ret, start, page_end);
3593	}
3594	unlock_page(page);
3595	return ret;
3596
3597done_unlocked:
3598	return 0;
3599}
3600
3601void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3602{
3603	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3604		       TASK_UNINTERRUPTIBLE);
3605}
3606
3607static noinline_for_stack int
3608lock_extent_buffer_for_io(struct extent_buffer *eb,
3609			  struct btrfs_fs_info *fs_info,
3610			  struct extent_page_data *epd)
3611{
3612	unsigned long i, num_pages;
3613	int flush = 0;
3614	int ret = 0;
3615
3616	if (!btrfs_try_tree_write_lock(eb)) {
3617		flush = 1;
3618		flush_write_bio(epd);
3619		btrfs_tree_lock(eb);
3620	}
3621
3622	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3623		btrfs_tree_unlock(eb);
3624		if (!epd->sync_io)
3625			return 0;
3626		if (!flush) {
3627			flush_write_bio(epd);
3628			flush = 1;
3629		}
3630		while (1) {
3631			wait_on_extent_buffer_writeback(eb);
3632			btrfs_tree_lock(eb);
3633			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3634				break;
3635			btrfs_tree_unlock(eb);
3636		}
3637	}
3638
3639	/*
3640	 * We need to do this to prevent races in people who check if the eb is
3641	 * under IO since we can end up having no IO bits set for a short period
3642	 * of time.
3643	 */
3644	spin_lock(&eb->refs_lock);
3645	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3646		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3647		spin_unlock(&eb->refs_lock);
3648		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3649		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
3650				     -eb->len,
3651				     fs_info->dirty_metadata_batch);
3652		ret = 1;
3653	} else {
3654		spin_unlock(&eb->refs_lock);
3655	}
3656
3657	btrfs_tree_unlock(eb);
3658
3659	if (!ret)
3660		return ret;
3661
3662	num_pages = num_extent_pages(eb->start, eb->len);
3663	for (i = 0; i < num_pages; i++) {
3664		struct page *p = eb->pages[i];
3665
3666		if (!trylock_page(p)) {
3667			if (!flush) {
3668				flush_write_bio(epd);
3669				flush = 1;
3670			}
3671			lock_page(p);
3672		}
3673	}
3674
3675	return ret;
3676}
3677
3678static void end_extent_buffer_writeback(struct extent_buffer *eb)
3679{
3680	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3681	smp_mb__after_atomic();
3682	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3683}
3684
3685static void set_btree_ioerr(struct page *page)
3686{
3687	struct extent_buffer *eb = (struct extent_buffer *)page->private;
3688	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
3689
3690	SetPageError(page);
3691	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3692		return;
3693
3694	/*
3695	 * If writeback for a btree extent that doesn't belong to a log tree
3696	 * failed, increment the counter transaction->eb_write_errors.
3697	 * We do this because while the transaction is running and before it's
3698	 * committing (when we call filemap_fdata[write|wait]_range against
3699	 * the btree inode), we might have
3700	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3701	 * returns an error or an error happens during writeback, when we're
3702	 * committing the transaction we wouldn't know about it, since the pages
3703	 * can be no longer dirty nor marked anymore for writeback (if a
3704	 * subsequent modification to the extent buffer didn't happen before the
3705	 * transaction commit), which makes filemap_fdata[write|wait]_range not
3706	 * able to find the pages tagged with SetPageError at transaction
3707	 * commit time. So if this happens we must abort the transaction,
3708	 * otherwise we commit a super block with btree roots that point to
3709	 * btree nodes/leafs whose content on disk is invalid - either garbage
3710	 * or the content of some node/leaf from a past generation that got
3711	 * cowed or deleted and is no longer valid.
3712	 *
3713	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3714	 * not be enough - we need to distinguish between log tree extents vs
3715	 * non-log tree extents, and the next filemap_fdatawait_range() call
3716	 * will catch and clear such errors in the mapping - and that call might
3717	 * be from a log sync and not from a transaction commit. Also, checking
3718	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3719	 * not done and would not be reliable - the eb might have been released
3720	 * from memory and reading it back again means that flag would not be
3721	 * set (since it's a runtime flag, not persisted on disk).
3722	 *
3723	 * Using the flags below in the btree inode also makes us achieve the
3724	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
3725	 * writeback for all dirty pages and before filemap_fdatawait_range()
3726	 * is called, the writeback for all dirty pages had already finished
3727	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
3728	 * filemap_fdatawait_range() would return success, as it could not know
3729	 * that writeback errors happened (the pages were no longer tagged for
3730	 * writeback).
3731	 */
3732	switch (eb->log_index) {
3733	case -1:
3734		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
3735		break;
3736	case 0:
3737		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
3738		break;
3739	case 1:
3740		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
3741		break;
3742	default:
3743		BUG(); /* unexpected, logic error */
3744	}
3745}
3746
3747static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3748{
3749	struct bio_vec *bvec;
3750	struct extent_buffer *eb;
3751	int i, done;
3752
3753	bio_for_each_segment_all(bvec, bio, i) {
3754		struct page *page = bvec->bv_page;
3755
3756		eb = (struct extent_buffer *)page->private;
3757		BUG_ON(!eb);
3758		done = atomic_dec_and_test(&eb->io_pages);
3759
3760		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3761			ClearPageUptodate(page);
3762			set_btree_ioerr(page);
3763		}
3764
3765		end_page_writeback(page);
3766
3767		if (!done)
3768			continue;
3769
3770		end_extent_buffer_writeback(eb);
3771	}
3772
3773	bio_put(bio);
3774}
3775
3776static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3777			struct btrfs_fs_info *fs_info,
3778			struct writeback_control *wbc,
3779			struct extent_page_data *epd)
3780{
3781	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3782	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3783	u64 offset = eb->start;
3784	unsigned long i, num_pages;
3785	unsigned long bio_flags = 0;
3786	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
3787	int ret = 0;
3788
3789	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3790	num_pages = num_extent_pages(eb->start, eb->len);
3791	atomic_set(&eb->io_pages, num_pages);
3792	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3793		bio_flags = EXTENT_BIO_TREE_LOG;
3794
3795	for (i = 0; i < num_pages; i++) {
3796		struct page *p = eb->pages[i];
3797
3798		clear_page_dirty_for_io(p);
3799		set_page_writeback(p);
3800		ret = submit_extent_page(rw, tree, p, offset >> 9,
3801					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3802					 -1, end_bio_extent_buffer_writepage,
3803					 0, epd->bio_flags, bio_flags, false);
3804		epd->bio_flags = bio_flags;
3805		if (ret) {
3806			set_btree_ioerr(p);
3807			end_page_writeback(p);
3808			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3809				end_extent_buffer_writeback(eb);
3810			ret = -EIO;
3811			break;
3812		}
3813		offset += PAGE_CACHE_SIZE;
3814		update_nr_written(p, wbc, 1);
3815		unlock_page(p);
3816	}
3817
3818	if (unlikely(ret)) {
3819		for (; i < num_pages; i++) {
3820			struct page *p = eb->pages[i];
3821			clear_page_dirty_for_io(p);
3822			unlock_page(p);
3823		}
3824	}
3825
3826	return ret;
3827}
3828
3829int btree_write_cache_pages(struct address_space *mapping,
3830				   struct writeback_control *wbc)
3831{
3832	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3833	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3834	struct extent_buffer *eb, *prev_eb = NULL;
3835	struct extent_page_data epd = {
3836		.bio = NULL,
3837		.tree = tree,
3838		.extent_locked = 0,
3839		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3840		.bio_flags = 0,
3841	};
3842	int ret = 0;
3843	int done = 0;
3844	int nr_to_write_done = 0;
3845	struct pagevec pvec;
3846	int nr_pages;
3847	pgoff_t index;
3848	pgoff_t end;		/* Inclusive */
3849	int scanned = 0;
3850	int tag;
3851
3852	pagevec_init(&pvec, 0);
3853	if (wbc->range_cyclic) {
3854		index = mapping->writeback_index; /* Start from prev offset */
3855		end = -1;
3856	} else {
3857		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3858		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3859		scanned = 1;
3860	}
3861	if (wbc->sync_mode == WB_SYNC_ALL)
3862		tag = PAGECACHE_TAG_TOWRITE;
3863	else
3864		tag = PAGECACHE_TAG_DIRTY;
3865retry:
3866	if (wbc->sync_mode == WB_SYNC_ALL)
3867		tag_pages_for_writeback(mapping, index, end);
3868	while (!done && !nr_to_write_done && (index <= end) &&
3869	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3870			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3871		unsigned i;
3872
3873		scanned = 1;
3874		for (i = 0; i < nr_pages; i++) {
3875			struct page *page = pvec.pages[i];
3876
3877			if (!PagePrivate(page))
3878				continue;
3879
3880			if (!wbc->range_cyclic && page->index > end) {
3881				done = 1;
3882				break;
3883			}
3884
3885			spin_lock(&mapping->private_lock);
3886			if (!PagePrivate(page)) {
3887				spin_unlock(&mapping->private_lock);
3888				continue;
3889			}
3890
3891			eb = (struct extent_buffer *)page->private;
3892
3893			/*
3894			 * Shouldn't happen and normally this would be a BUG_ON
3895			 * but no sense in crashing the users box for something
3896			 * we can survive anyway.
3897			 */
3898			if (WARN_ON(!eb)) {
3899				spin_unlock(&mapping->private_lock);
3900				continue;
3901			}
3902
3903			if (eb == prev_eb) {
3904				spin_unlock(&mapping->private_lock);
3905				continue;
3906			}
3907
3908			ret = atomic_inc_not_zero(&eb->refs);
3909			spin_unlock(&mapping->private_lock);
3910			if (!ret)
3911				continue;
3912
3913			prev_eb = eb;
3914			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3915			if (!ret) {
3916				free_extent_buffer(eb);
3917				continue;
3918			}
3919
3920			ret = write_one_eb(eb, fs_info, wbc, &epd);
3921			if (ret) {
3922				done = 1;
3923				free_extent_buffer(eb);
3924				break;
3925			}
3926			free_extent_buffer(eb);
3927
3928			/*
3929			 * the filesystem may choose to bump up nr_to_write.
3930			 * We have to make sure to honor the new nr_to_write
3931			 * at any time
3932			 */
3933			nr_to_write_done = wbc->nr_to_write <= 0;
3934		}
3935		pagevec_release(&pvec);
3936		cond_resched();
3937	}
3938	if (!scanned && !done) {
3939		/*
3940		 * We hit the last page and there is more work to be done: wrap
3941		 * back to the start of the file
3942		 */
3943		scanned = 1;
3944		index = 0;
3945		goto retry;
3946	}
3947	flush_write_bio(&epd);
3948	return ret;
3949}
3950
3951/**
3952 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3953 * @mapping: address space structure to write
3954 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3955 * @writepage: function called for each page
3956 * @data: data passed to writepage function
3957 *
3958 * If a page is already under I/O, write_cache_pages() skips it, even
3959 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3960 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3961 * and msync() need to guarantee that all the data which was dirty at the time
3962 * the call was made get new I/O started against them.  If wbc->sync_mode is
3963 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3964 * existing IO to complete.
3965 */
3966static int extent_write_cache_pages(struct extent_io_tree *tree,
3967			     struct address_space *mapping,
3968			     struct writeback_control *wbc,
3969			     writepage_t writepage, void *data,
3970			     void (*flush_fn)(void *))
3971{
3972	struct inode *inode = mapping->host;
3973	int ret = 0;
3974	int done = 0;
3975	int err = 0;
3976	int nr_to_write_done = 0;
3977	struct pagevec pvec;
3978	int nr_pages;
3979	pgoff_t index;
3980	pgoff_t end;		/* Inclusive */
3981	int scanned = 0;
3982	int tag;
3983
3984	/*
3985	 * We have to hold onto the inode so that ordered extents can do their
3986	 * work when the IO finishes.  The alternative to this is failing to add
3987	 * an ordered extent if the igrab() fails there and that is a huge pain
3988	 * to deal with, so instead just hold onto the inode throughout the
3989	 * writepages operation.  If it fails here we are freeing up the inode
3990	 * anyway and we'd rather not waste our time writing out stuff that is
3991	 * going to be truncated anyway.
3992	 */
3993	if (!igrab(inode))
3994		return 0;
3995
3996	pagevec_init(&pvec, 0);
3997	if (wbc->range_cyclic) {
3998		index = mapping->writeback_index; /* Start from prev offset */
3999		end = -1;
4000	} else {
4001		index = wbc->range_start >> PAGE_CACHE_SHIFT;
4002		end = wbc->range_end >> PAGE_CACHE_SHIFT;
4003		scanned = 1;
4004	}
4005	if (wbc->sync_mode == WB_SYNC_ALL)
4006		tag = PAGECACHE_TAG_TOWRITE;
4007	else
4008		tag = PAGECACHE_TAG_DIRTY;
4009retry:
4010	if (wbc->sync_mode == WB_SYNC_ALL)
4011		tag_pages_for_writeback(mapping, index, end);
4012	while (!done && !nr_to_write_done && (index <= end) &&
4013	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
4014			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
4015		unsigned i;
4016
4017		scanned = 1;
4018		for (i = 0; i < nr_pages; i++) {
4019			struct page *page = pvec.pages[i];
4020
4021			/*
4022			 * At this point we hold neither mapping->tree_lock nor
4023			 * lock on the page itself: the page may be truncated or
4024			 * invalidated (changing page->mapping to NULL), or even
4025			 * swizzled back from swapper_space to tmpfs file
4026			 * mapping
4027			 */
4028			if (!trylock_page(page)) {
4029				flush_fn(data);
4030				lock_page(page);
4031			}
4032
4033			if (unlikely(page->mapping != mapping)) {
4034				unlock_page(page);
4035				continue;
4036			}
4037
4038			if (!wbc->range_cyclic && page->index > end) {
4039				done = 1;
4040				unlock_page(page);
4041				continue;
4042			}
4043
4044			if (wbc->sync_mode != WB_SYNC_NONE) {
4045				if (PageWriteback(page))
4046					flush_fn(data);
4047				wait_on_page_writeback(page);
4048			}
4049
4050			if (PageWriteback(page) ||
4051			    !clear_page_dirty_for_io(page)) {
4052				unlock_page(page);
4053				continue;
4054			}
4055
4056			ret = (*writepage)(page, wbc, data);
4057
4058			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
4059				unlock_page(page);
4060				ret = 0;
4061			}
4062			if (!err && ret < 0)
4063				err = ret;
4064
4065			/*
4066			 * the filesystem may choose to bump up nr_to_write.
4067			 * We have to make sure to honor the new nr_to_write
4068			 * at any time
4069			 */
4070			nr_to_write_done = wbc->nr_to_write <= 0;
4071		}
4072		pagevec_release(&pvec);
4073		cond_resched();
4074	}
4075	if (!scanned && !done && !err) {
4076		/*
4077		 * We hit the last page and there is more work to be done: wrap
4078		 * back to the start of the file
4079		 */
4080		scanned = 1;
4081		index = 0;
4082		goto retry;
4083	}
4084	btrfs_add_delayed_iput(inode);
4085	return err;
4086}
4087
4088static void flush_epd_write_bio(struct extent_page_data *epd)
4089{
4090	if (epd->bio) {
4091		int rw = WRITE;
4092		int ret;
4093
4094		if (epd->sync_io)
4095			rw = WRITE_SYNC;
4096
4097		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
4098		BUG_ON(ret < 0); /* -ENOMEM */
4099		epd->bio = NULL;
4100	}
4101}
4102
4103static noinline void flush_write_bio(void *data)
4104{
4105	struct extent_page_data *epd = data;
4106	flush_epd_write_bio(epd);
4107}
4108
4109int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
4110			  get_extent_t *get_extent,
4111			  struct writeback_control *wbc)
4112{
4113	int ret;
4114	struct extent_page_data epd = {
4115		.bio = NULL,
4116		.tree = tree,
4117		.get_extent = get_extent,
4118		.extent_locked = 0,
4119		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
4120		.bio_flags = 0,
4121	};
4122
4123	ret = __extent_writepage(page, wbc, &epd);
4124
4125	flush_epd_write_bio(&epd);
4126	return ret;
4127}
4128
4129int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
4130			      u64 start, u64 end, get_extent_t *get_extent,
4131			      int mode)
4132{
4133	int ret = 0;
4134	struct address_space *mapping = inode->i_mapping;
4135	struct page *page;
4136	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
4137		PAGE_CACHE_SHIFT;
4138
4139	struct extent_page_data epd = {
4140		.bio = NULL,
4141		.tree = tree,
4142		.get_extent = get_extent,
4143		.extent_locked = 1,
4144		.sync_io = mode == WB_SYNC_ALL,
4145		.bio_flags = 0,
4146	};
4147	struct writeback_control wbc_writepages = {
4148		.sync_mode	= mode,
4149		.nr_to_write	= nr_pages * 2,
4150		.range_start	= start,
4151		.range_end	= end + 1,
4152	};
4153
4154	while (start <= end) {
4155		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
4156		if (clear_page_dirty_for_io(page))
4157			ret = __extent_writepage(page, &wbc_writepages, &epd);
4158		else {
4159			if (tree->ops && tree->ops->writepage_end_io_hook)
4160				tree->ops->writepage_end_io_hook(page, start,
4161						 start + PAGE_CACHE_SIZE - 1,
4162						 NULL, 1);
4163			unlock_page(page);
4164		}
4165		page_cache_release(page);
4166		start += PAGE_CACHE_SIZE;
4167	}
4168
4169	flush_epd_write_bio(&epd);
4170	return ret;
4171}
4172
4173int extent_writepages(struct extent_io_tree *tree,
4174		      struct address_space *mapping,
4175		      get_extent_t *get_extent,
4176		      struct writeback_control *wbc)
4177{
4178	int ret = 0;
4179	struct extent_page_data epd = {
4180		.bio = NULL,
4181		.tree = tree,
4182		.get_extent = get_extent,
4183		.extent_locked = 0,
4184		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
4185		.bio_flags = 0,
4186	};
4187
4188	ret = extent_write_cache_pages(tree, mapping, wbc,
4189				       __extent_writepage, &epd,
4190				       flush_write_bio);
4191	flush_epd_write_bio(&epd);
4192	return ret;
4193}
4194
4195int extent_readpages(struct extent_io_tree *tree,
4196		     struct address_space *mapping,
4197		     struct list_head *pages, unsigned nr_pages,
4198		     get_extent_t get_extent)
4199{
4200	struct bio *bio = NULL;
4201	unsigned page_idx;
4202	unsigned long bio_flags = 0;
4203	struct page *pagepool[16];
4204	struct page *page;
4205	struct extent_map *em_cached = NULL;
4206	int nr = 0;
4207	u64 prev_em_start = (u64)-1;
4208
4209	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
4210		page = list_entry(pages->prev, struct page, lru);
4211
4212		prefetchw(&page->flags);
4213		list_del(&page->lru);
4214		if (add_to_page_cache_lru(page, mapping,
4215					page->index, GFP_NOFS)) {
4216			page_cache_release(page);
4217			continue;
4218		}
4219
4220		pagepool[nr++] = page;
4221		if (nr < ARRAY_SIZE(pagepool))
4222			continue;
4223		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
4224				   &bio, 0, &bio_flags, READ, &prev_em_start);
4225		nr = 0;
4226	}
4227	if (nr)
4228		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
4229				   &bio, 0, &bio_flags, READ, &prev_em_start);
4230
4231	if (em_cached)
4232		free_extent_map(em_cached);
4233
4234	BUG_ON(!list_empty(pages));
4235	if (bio)
4236		return submit_one_bio(READ, bio, 0, bio_flags);
4237	return 0;
4238}
4239
4240/*
4241 * basic invalidatepage code, this waits on any locked or writeback
4242 * ranges corresponding to the page, and then deletes any extent state
4243 * records from the tree
4244 */
4245int extent_invalidatepage(struct extent_io_tree *tree,
4246			  struct page *page, unsigned long offset)
4247{
4248	struct extent_state *cached_state = NULL;
4249	u64 start = page_offset(page);
4250	u64 end = start + PAGE_CACHE_SIZE - 1;
4251	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4252
4253	start += ALIGN(offset, blocksize);
4254	if (start > end)
4255		return 0;
4256
4257	lock_extent_bits(tree, start, end, 0, &cached_state);
4258	wait_on_page_writeback(page);
4259	clear_extent_bit(tree, start, end,
4260			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261			 EXTENT_DO_ACCOUNTING,
4262			 1, 1, &cached_state, GFP_NOFS);
4263	return 0;
4264}
4265
4266/*
4267 * a helper for releasepage, this tests for areas of the page that
4268 * are locked or under IO and drops the related state bits if it is safe
4269 * to drop the page.
4270 */
4271static int try_release_extent_state(struct extent_map_tree *map,
4272				    struct extent_io_tree *tree,
4273				    struct page *page, gfp_t mask)
4274{
4275	u64 start = page_offset(page);
4276	u64 end = start + PAGE_CACHE_SIZE - 1;
4277	int ret = 1;
4278
4279	if (test_range_bit(tree, start, end,
4280			   EXTENT_IOBITS, 0, NULL))
4281		ret = 0;
4282	else {
4283		if ((mask & GFP_NOFS) == GFP_NOFS)
4284			mask = GFP_NOFS;
4285		/*
4286		 * at this point we can safely clear everything except the
4287		 * locked bit and the nodatasum bit
4288		 */
4289		ret = clear_extent_bit(tree, start, end,
4290				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4291				 0, 0, NULL, mask);
4292
4293		/* if clear_extent_bit failed for enomem reasons,
4294		 * we can't allow the release to continue.
4295		 */
4296		if (ret < 0)
4297			ret = 0;
4298		else
4299			ret = 1;
4300	}
4301	return ret;
4302}
4303
4304/*
4305 * a helper for releasepage.  As long as there are no locked extents
4306 * in the range corresponding to the page, both state records and extent
4307 * map records are removed
4308 */
4309int try_release_extent_mapping(struct extent_map_tree *map,
4310			       struct extent_io_tree *tree, struct page *page,
4311			       gfp_t mask)
4312{
4313	struct extent_map *em;
4314	u64 start = page_offset(page);
4315	u64 end = start + PAGE_CACHE_SIZE - 1;
4316
4317	if ((mask & __GFP_WAIT) &&
4318	    page->mapping->host->i_size > 16 * 1024 * 1024) {
4319		u64 len;
4320		while (start <= end) {
4321			len = end - start + 1;
4322			write_lock(&map->lock);
4323			em = lookup_extent_mapping(map, start, len);
4324			if (!em) {
4325				write_unlock(&map->lock);
4326				break;
4327			}
4328			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4329			    em->start != start) {
4330				write_unlock(&map->lock);
4331				free_extent_map(em);
4332				break;
4333			}
4334			if (!test_range_bit(tree, em->start,
4335					    extent_map_end(em) - 1,
4336					    EXTENT_LOCKED | EXTENT_WRITEBACK,
4337					    0, NULL)) {
4338				remove_extent_mapping(map, em);
4339				/* once for the rb tree */
4340				free_extent_map(em);
4341			}
4342			start = extent_map_end(em);
4343			write_unlock(&map->lock);
4344
4345			/* once for us */
4346			free_extent_map(em);
4347		}
4348	}
4349	return try_release_extent_state(map, tree, page, mask);
4350}
4351
4352/*
4353 * helper function for fiemap, which doesn't want to see any holes.
4354 * This maps until we find something past 'last'
4355 */
4356static struct extent_map *get_extent_skip_holes(struct inode *inode,
4357						u64 offset,
4358						u64 last,
4359						get_extent_t *get_extent)
4360{
4361	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
4362	struct extent_map *em;
4363	u64 len;
4364
4365	if (offset >= last)
4366		return NULL;
4367
4368	while (1) {
4369		len = last - offset;
4370		if (len == 0)
4371			break;
4372		len = ALIGN(len, sectorsize);
4373		em = get_extent(inode, NULL, 0, offset, len, 0);
4374		if (IS_ERR_OR_NULL(em))
4375			return em;
4376
4377		/* if this isn't a hole return it */
4378		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
4379		    em->block_start != EXTENT_MAP_HOLE) {
4380			return em;
4381		}
4382
4383		/* this is a hole, advance to the next extent */
4384		offset = extent_map_end(em);
4385		free_extent_map(em);
4386		if (offset >= last)
4387			break;
4388	}
4389	return NULL;
4390}
4391
4392int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4393		__u64 start, __u64 len, get_extent_t *get_extent)
4394{
4395	int ret = 0;
4396	u64 off = start;
4397	u64 max = start + len;
4398	u32 flags = 0;
4399	u32 found_type;
4400	u64 last;
4401	u64 last_for_get_extent = 0;
4402	u64 disko = 0;
4403	u64 isize = i_size_read(inode);
4404	struct btrfs_key found_key;
4405	struct extent_map *em = NULL;
4406	struct extent_state *cached_state = NULL;
4407	struct btrfs_path *path;
4408	struct btrfs_root *root = BTRFS_I(inode)->root;
4409	int end = 0;
4410	u64 em_start = 0;
4411	u64 em_len = 0;
4412	u64 em_end = 0;
4413
4414	if (len == 0)
4415		return -EINVAL;
4416
4417	path = btrfs_alloc_path();
4418	if (!path)
4419		return -ENOMEM;
4420	path->leave_spinning = 1;
4421
4422	start = round_down(start, BTRFS_I(inode)->root->sectorsize);
4423	len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
4424
4425	/*
4426	 * lookup the last file extent.  We're not using i_size here
4427	 * because there might be preallocation past i_size
4428	 */
4429	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4430				       0);
4431	if (ret < 0) {
4432		btrfs_free_path(path);
4433		return ret;
4434	}
4435	WARN_ON(!ret);
4436	path->slots[0]--;
4437	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4438	found_type = found_key.type;
4439
4440	/* No extents, but there might be delalloc bits */
4441	if (found_key.objectid != btrfs_ino(inode) ||
4442	    found_type != BTRFS_EXTENT_DATA_KEY) {
4443		/* have to trust i_size as the end */
4444		last = (u64)-1;
4445		last_for_get_extent = isize;
4446	} else {
4447		/*
4448		 * remember the start of the last extent.  There are a
4449		 * bunch of different factors that go into the length of the
4450		 * extent, so its much less complex to remember where it started
4451		 */
4452		last = found_key.offset;
4453		last_for_get_extent = last + 1;
4454	}
4455	btrfs_release_path(path);
4456
4457	/*
4458	 * we might have some extents allocated but more delalloc past those
4459	 * extents.  so, we trust isize unless the start of the last extent is
4460	 * beyond isize
4461	 */
4462	if (last < isize) {
4463		last = (u64)-1;
4464		last_for_get_extent = isize;
4465	}
4466
4467	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
4468			 &cached_state);
4469
4470	em = get_extent_skip_holes(inode, start, last_for_get_extent,
4471				   get_extent);
4472	if (!em)
4473		goto out;
4474	if (IS_ERR(em)) {
4475		ret = PTR_ERR(em);
4476		goto out;
4477	}
4478
4479	while (!end) {
4480		u64 offset_in_extent = 0;
4481
4482		/* break if the extent we found is outside the range */
4483		if (em->start >= max || extent_map_end(em) < off)
4484			break;
4485
4486		/*
4487		 * get_extent may return an extent that starts before our
4488		 * requested range.  We have to make sure the ranges
4489		 * we return to fiemap always move forward and don't
4490		 * overlap, so adjust the offsets here
4491		 */
4492		em_start = max(em->start, off);
4493
4494		/*
4495		 * record the offset from the start of the extent
4496		 * for adjusting the disk offset below.  Only do this if the
4497		 * extent isn't compressed since our in ram offset may be past
4498		 * what we have actually allocated on disk.
4499		 */
4500		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4501			offset_in_extent = em_start - em->start;
4502		em_end = extent_map_end(em);
4503		em_len = em_end - em_start;
4504		disko = 0;
4505		flags = 0;
4506
4507		/*
4508		 * bump off for our next call to get_extent
4509		 */
4510		off = extent_map_end(em);
4511		if (off >= max)
4512			end = 1;
4513
4514		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4515			end = 1;
4516			flags |= FIEMAP_EXTENT_LAST;
4517		} else if (em->block_start == EXTENT_MAP_INLINE) {
4518			flags |= (FIEMAP_EXTENT_DATA_INLINE |
4519				  FIEMAP_EXTENT_NOT_ALIGNED);
4520		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
4521			flags |= (FIEMAP_EXTENT_DELALLOC |
4522				  FIEMAP_EXTENT_UNKNOWN);
4523		} else if (fieinfo->fi_extents_max) {
4524			u64 bytenr = em->block_start -
4525				(em->start - em->orig_start);
4526
4527			disko = em->block_start + offset_in_extent;
4528
4529			/*
4530			 * As btrfs supports shared space, this information
4531			 * can be exported to userspace tools via
4532			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
4533			 * then we're just getting a count and we can skip the
4534			 * lookup stuff.
4535			 */
4536			ret = btrfs_check_shared(NULL, root->fs_info,
4537						 root->objectid,
4538						 btrfs_ino(inode), bytenr);
4539			if (ret < 0)
4540				goto out_free;
4541			if (ret)
4542				flags |= FIEMAP_EXTENT_SHARED;
4543			ret = 0;
4544		}
4545		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4546			flags |= FIEMAP_EXTENT_ENCODED;
4547
4548		free_extent_map(em);
4549		em = NULL;
4550		if ((em_start >= last) || em_len == (u64)-1 ||
4551		   (last == (u64)-1 && isize <= em_end)) {
4552			flags |= FIEMAP_EXTENT_LAST;
4553			end = 1;
4554		}
4555
4556		/* now scan forward to see if this is really the last extent. */
4557		em = get_extent_skip_holes(inode, off, last_for_get_extent,
4558					   get_extent);
4559		if (IS_ERR(em)) {
4560			ret = PTR_ERR(em);
4561			goto out;
4562		}
4563		if (!em) {
4564			flags |= FIEMAP_EXTENT_LAST;
4565			end = 1;
4566		}
4567		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4568					      em_len, flags);
4569		if (ret) {
4570			if (ret == 1)
4571				ret = 0;
4572			goto out_free;
4573		}
4574	}
4575out_free:
4576	free_extent_map(em);
4577out:
4578	btrfs_free_path(path);
4579	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4580			     &cached_state, GFP_NOFS);
4581	return ret;
4582}
4583
4584static void __free_extent_buffer(struct extent_buffer *eb)
4585{
4586	btrfs_leak_debug_del(&eb->leak_list);
4587	kmem_cache_free(extent_buffer_cache, eb);
4588}
4589
4590int extent_buffer_under_io(struct extent_buffer *eb)
4591{
4592	return (atomic_read(&eb->io_pages) ||
4593		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4594		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4595}
4596
4597/*
4598 * Helper for releasing extent buffer page.
4599 */
4600static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
4601{
4602	unsigned long index;
4603	struct page *page;
4604	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4605
4606	BUG_ON(extent_buffer_under_io(eb));
4607
4608	index = num_extent_pages(eb->start, eb->len);
4609	if (index == 0)
4610		return;
4611
4612	do {
4613		index--;
4614		page = eb->pages[index];
4615		if (!page)
4616			continue;
4617		if (mapped)
4618			spin_lock(&page->mapping->private_lock);
4619		/*
4620		 * We do this since we'll remove the pages after we've
4621		 * removed the eb from the radix tree, so we could race
4622		 * and have this page now attached to the new eb.  So
4623		 * only clear page_private if it's still connected to
4624		 * this eb.
4625		 */
4626		if (PagePrivate(page) &&
4627		    page->private == (unsigned long)eb) {
4628			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4629			BUG_ON(PageDirty(page));
4630			BUG_ON(PageWriteback(page));
4631			/*
4632			 * We need to make sure we haven't be attached
4633			 * to a new eb.
4634			 */
4635			ClearPagePrivate(page);
4636			set_page_private(page, 0);
4637			/* One for the page private */
4638			page_cache_release(page);
4639		}
4640
4641		if (mapped)
4642			spin_unlock(&page->mapping->private_lock);
4643
4644		/* One for when we alloced the page */
4645		page_cache_release(page);
4646	} while (index != 0);
4647}
4648
4649/*
4650 * Helper for releasing the extent buffer.
4651 */
4652static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4653{
4654	btrfs_release_extent_buffer_page(eb);
4655	__free_extent_buffer(eb);
4656}
4657
4658static struct extent_buffer *
4659__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4660		      unsigned long len)
4661{
4662	struct extent_buffer *eb = NULL;
4663
4664	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
4665	if (eb == NULL)
4666		return NULL;
4667	eb->start = start;
4668	eb->len = len;
4669	eb->fs_info = fs_info;
4670	eb->bflags = 0;
4671	rwlock_init(&eb->lock);
4672	atomic_set(&eb->write_locks, 0);
4673	atomic_set(&eb->read_locks, 0);
4674	atomic_set(&eb->blocking_readers, 0);
4675	atomic_set(&eb->blocking_writers, 0);
4676	atomic_set(&eb->spinning_readers, 0);
4677	atomic_set(&eb->spinning_writers, 0);
4678	eb->lock_nested = 0;
4679	init_waitqueue_head(&eb->write_lock_wq);
4680	init_waitqueue_head(&eb->read_lock_wq);
4681
4682	btrfs_leak_debug_add(&eb->leak_list, &buffers);
4683
4684	spin_lock_init(&eb->refs_lock);
4685	atomic_set(&eb->refs, 1);
4686	atomic_set(&eb->io_pages, 0);
4687
4688	/*
4689	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4690	 */
4691	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4692		> MAX_INLINE_EXTENT_BUFFER_SIZE);
4693	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4694
4695	return eb;
4696}
4697
4698struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4699{
4700	unsigned long i;
4701	struct page *p;
4702	struct extent_buffer *new;
4703	unsigned long num_pages = num_extent_pages(src->start, src->len);
4704
4705	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4706	if (new == NULL)
4707		return NULL;
4708
4709	for (i = 0; i < num_pages; i++) {
4710		p = alloc_page(GFP_NOFS);
4711		if (!p) {
4712			btrfs_release_extent_buffer(new);
4713			return NULL;
4714		}
4715		attach_extent_buffer_page(new, p);
4716		WARN_ON(PageDirty(p));
4717		SetPageUptodate(p);
4718		new->pages[i] = p;
4719	}
4720
4721	copy_extent_buffer(new, src, 0, 0, src->len);
4722	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4723	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4724
4725	return new;
4726}
4727
4728struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4729						u64 start)
4730{
4731	struct extent_buffer *eb;
4732	unsigned long len;
4733	unsigned long num_pages;
4734	unsigned long i;
4735
4736	if (!fs_info) {
4737		/*
4738		 * Called only from tests that don't always have a fs_info
4739		 * available, but we know that nodesize is 4096
4740		 */
4741		len = 4096;
4742	} else {
4743		len = fs_info->tree_root->nodesize;
4744	}
4745	num_pages = num_extent_pages(0, len);
4746
4747	eb = __alloc_extent_buffer(fs_info, start, len);
4748	if (!eb)
4749		return NULL;
4750
4751	for (i = 0; i < num_pages; i++) {
4752		eb->pages[i] = alloc_page(GFP_NOFS);
4753		if (!eb->pages[i])
4754			goto err;
4755	}
4756	set_extent_buffer_uptodate(eb);
4757	btrfs_set_header_nritems(eb, 0);
4758	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4759
4760	return eb;
4761err:
4762	for (; i > 0; i--)
4763		__free_page(eb->pages[i - 1]);
4764	__free_extent_buffer(eb);
4765	return NULL;
4766}
4767
4768static void check_buffer_tree_ref(struct extent_buffer *eb)
4769{
4770	int refs;
4771	/* the ref bit is tricky.  We have to make sure it is set
4772	 * if we have the buffer dirty.   Otherwise the
4773	 * code to free a buffer can end up dropping a dirty
4774	 * page
4775	 *
4776	 * Once the ref bit is set, it won't go away while the
4777	 * buffer is dirty or in writeback, and it also won't
4778	 * go away while we have the reference count on the
4779	 * eb bumped.
4780	 *
4781	 * We can't just set the ref bit without bumping the
4782	 * ref on the eb because free_extent_buffer might
4783	 * see the ref bit and try to clear it.  If this happens
4784	 * free_extent_buffer might end up dropping our original
4785	 * ref by mistake and freeing the page before we are able
4786	 * to add one more ref.
4787	 *
4788	 * So bump the ref count first, then set the bit.  If someone
4789	 * beat us to it, drop the ref we added.
4790	 */
4791	refs = atomic_read(&eb->refs);
4792	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4793		return;
4794
4795	spin_lock(&eb->refs_lock);
4796	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4797		atomic_inc(&eb->refs);
4798	spin_unlock(&eb->refs_lock);
4799}
4800
4801static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4802		struct page *accessed)
4803{
4804	unsigned long num_pages, i;
4805
4806	check_buffer_tree_ref(eb);
4807
4808	num_pages = num_extent_pages(eb->start, eb->len);
4809	for (i = 0; i < num_pages; i++) {
4810		struct page *p = eb->pages[i];
4811
4812		if (p != accessed)
4813			mark_page_accessed(p);
4814	}
4815}
4816
4817struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4818					 u64 start)
4819{
4820	struct extent_buffer *eb;
4821
4822	rcu_read_lock();
4823	eb = radix_tree_lookup(&fs_info->buffer_radix,
4824			       start >> PAGE_CACHE_SHIFT);
4825	if (eb && atomic_inc_not_zero(&eb->refs)) {
4826		rcu_read_unlock();
4827		/*
4828		 * Lock our eb's refs_lock to avoid races with
4829		 * free_extent_buffer. When we get our eb it might be flagged
4830		 * with EXTENT_BUFFER_STALE and another task running
4831		 * free_extent_buffer might have seen that flag set,
4832		 * eb->refs == 2, that the buffer isn't under IO (dirty and
4833		 * writeback flags not set) and it's still in the tree (flag
4834		 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
4835		 * of decrementing the extent buffer's reference count twice.
4836		 * So here we could race and increment the eb's reference count,
4837		 * clear its stale flag, mark it as dirty and drop our reference
4838		 * before the other task finishes executing free_extent_buffer,
4839		 * which would later result in an attempt to free an extent
4840		 * buffer that is dirty.
4841		 */
4842		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
4843			spin_lock(&eb->refs_lock);
4844			spin_unlock(&eb->refs_lock);
4845		}
4846		mark_extent_buffer_accessed(eb, NULL);
4847		return eb;
4848	}
4849	rcu_read_unlock();
4850
4851	return NULL;
4852}
4853
4854#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4855struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4856					       u64 start)
4857{
4858	struct extent_buffer *eb, *exists = NULL;
4859	int ret;
4860
4861	eb = find_extent_buffer(fs_info, start);
4862	if (eb)
4863		return eb;
4864	eb = alloc_dummy_extent_buffer(fs_info, start);
4865	if (!eb)
4866		return NULL;
4867	eb->fs_info = fs_info;
4868again:
4869	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4870	if (ret)
4871		goto free_eb;
4872	spin_lock(&fs_info->buffer_lock);
4873	ret = radix_tree_insert(&fs_info->buffer_radix,
4874				start >> PAGE_CACHE_SHIFT, eb);
4875	spin_unlock(&fs_info->buffer_lock);
4876	radix_tree_preload_end();
4877	if (ret == -EEXIST) {
4878		exists = find_extent_buffer(fs_info, start);
4879		if (exists)
4880			goto free_eb;
4881		else
4882			goto again;
4883	}
4884	check_buffer_tree_ref(eb);
4885	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4886
4887	/*
4888	 * We will free dummy extent buffer's if they come into
4889	 * free_extent_buffer with a ref count of 2, but if we are using this we
4890	 * want the buffers to stay in memory until we're done with them, so
4891	 * bump the ref count again.
4892	 */
4893	atomic_inc(&eb->refs);
4894	return eb;
4895free_eb:
4896	btrfs_release_extent_buffer(eb);
4897	return exists;
4898}
4899#endif
4900
4901struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4902					  u64 start)
4903{
4904	unsigned long len = fs_info->tree_root->nodesize;
4905	unsigned long num_pages = num_extent_pages(start, len);
4906	unsigned long i;
4907	unsigned long index = start >> PAGE_CACHE_SHIFT;
4908	struct extent_buffer *eb;
4909	struct extent_buffer *exists = NULL;
4910	struct page *p;
4911	struct address_space *mapping = fs_info->btree_inode->i_mapping;
4912	int uptodate = 1;
4913	int ret;
4914
4915	eb = find_extent_buffer(fs_info, start);
4916	if (eb)
4917		return eb;
4918
4919	eb = __alloc_extent_buffer(fs_info, start, len);
4920	if (!eb)
4921		return NULL;
4922
4923	for (i = 0; i < num_pages; i++, index++) {
4924		p = find_or_create_page(mapping, index, GFP_NOFS);
4925		if (!p)
4926			goto free_eb;
4927
4928		spin_lock(&mapping->private_lock);
4929		if (PagePrivate(p)) {
4930			/*
4931			 * We could have already allocated an eb for this page
4932			 * and attached one so lets see if we can get a ref on
4933			 * the existing eb, and if we can we know it's good and
4934			 * we can just return that one, else we know we can just
4935			 * overwrite page->private.
4936			 */
4937			exists = (struct extent_buffer *)p->private;
4938			if (atomic_inc_not_zero(&exists->refs)) {
4939				spin_unlock(&mapping->private_lock);
4940				unlock_page(p);
4941				page_cache_release(p);
4942				mark_extent_buffer_accessed(exists, p);
4943				goto free_eb;
4944			}
4945			exists = NULL;
4946
4947			/*
4948			 * Do this so attach doesn't complain and we need to
4949			 * drop the ref the old guy had.
4950			 */
4951			ClearPagePrivate(p);
4952			WARN_ON(PageDirty(p));
4953			page_cache_release(p);
4954		}
4955		attach_extent_buffer_page(eb, p);
4956		spin_unlock(&mapping->private_lock);
4957		WARN_ON(PageDirty(p));
4958		eb->pages[i] = p;
4959		if (!PageUptodate(p))
4960			uptodate = 0;
4961
4962		/*
4963		 * see below about how we avoid a nasty race with release page
4964		 * and why we unlock later
4965		 */
4966	}
4967	if (uptodate)
4968		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4969again:
4970	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4971	if (ret)
4972		goto free_eb;
4973
4974	spin_lock(&fs_info->buffer_lock);
4975	ret = radix_tree_insert(&fs_info->buffer_radix,
4976				start >> PAGE_CACHE_SHIFT, eb);
4977	spin_unlock(&fs_info->buffer_lock);
4978	radix_tree_preload_end();
4979	if (ret == -EEXIST) {
4980		exists = find_extent_buffer(fs_info, start);
4981		if (exists)
4982			goto free_eb;
4983		else
4984			goto again;
4985	}
4986	/* add one reference for the tree */
4987	check_buffer_tree_ref(eb);
4988	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4989
4990	/*
4991	 * there is a race where release page may have
4992	 * tried to find this extent buffer in the radix
4993	 * but failed.  It will tell the VM it is safe to
4994	 * reclaim the, and it will clear the page private bit.
4995	 * We must make sure to set the page private bit properly
4996	 * after the extent buffer is in the radix tree so
4997	 * it doesn't get lost
4998	 */
4999	SetPageChecked(eb->pages[0]);
5000	for (i = 1; i < num_pages; i++) {
5001		p = eb->pages[i];
5002		ClearPageChecked(p);
5003		unlock_page(p);
5004	}
5005	unlock_page(eb->pages[0]);
5006	return eb;
5007
5008free_eb:
5009	WARN_ON(!atomic_dec_and_test(&eb->refs));
5010	for (i = 0; i < num_pages; i++) {
5011		if (eb->pages[i])
5012			unlock_page(eb->pages[i]);
5013	}
5014
5015	btrfs_release_extent_buffer(eb);
5016	return exists;
5017}
5018
5019static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5020{
5021	struct extent_buffer *eb =
5022			container_of(head, struct extent_buffer, rcu_head);
5023
5024	__free_extent_buffer(eb);
5025}
5026
5027/* Expects to have eb->eb_lock already held */
5028static int release_extent_buffer(struct extent_buffer *eb)
5029{
5030	WARN_ON(atomic_read(&eb->refs) == 0);
5031	if (atomic_dec_and_test(&eb->refs)) {
5032		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5033			struct btrfs_fs_info *fs_info = eb->fs_info;
5034
5035			spin_unlock(&eb->refs_lock);
5036
5037			spin_lock(&fs_info->buffer_lock);
5038			radix_tree_delete(&fs_info->buffer_radix,
5039					  eb->start >> PAGE_CACHE_SHIFT);
5040			spin_unlock(&fs_info->buffer_lock);
5041		} else {
5042			spin_unlock(&eb->refs_lock);
5043		}
5044
5045		/* Should be safe to release our pages at this point */
5046		btrfs_release_extent_buffer_page(eb);
5047#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5048		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
5049			__free_extent_buffer(eb);
5050			return 1;
5051		}
5052#endif
5053		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5054		return 1;
5055	}
5056	spin_unlock(&eb->refs_lock);
5057
5058	return 0;
5059}
5060
5061void free_extent_buffer(struct extent_buffer *eb)
5062{
5063	int refs;
5064	int old;
5065	if (!eb)
5066		return;
5067
5068	while (1) {
5069		refs = atomic_read(&eb->refs);
5070		if (refs <= 3)
5071			break;
5072		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5073		if (old == refs)
5074			return;
5075	}
5076
5077	spin_lock(&eb->refs_lock);
5078	if (atomic_read(&eb->refs) == 2 &&
5079	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
5080		atomic_dec(&eb->refs);
5081
5082	if (atomic_read(&eb->refs) == 2 &&
5083	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5084	    !extent_buffer_under_io(eb) &&
5085	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5086		atomic_dec(&eb->refs);
5087
5088	/*
5089	 * I know this is terrible, but it's temporary until we stop tracking
5090	 * the uptodate bits and such for the extent buffers.
5091	 */
5092	release_extent_buffer(eb);
5093}
5094
5095void free_extent_buffer_stale(struct extent_buffer *eb)
5096{
5097	if (!eb)
5098		return;
5099
5100	spin_lock(&eb->refs_lock);
5101	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5102
5103	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5104	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5105		atomic_dec(&eb->refs);
5106	release_extent_buffer(eb);
5107}
5108
5109void clear_extent_buffer_dirty(struct extent_buffer *eb)
5110{
5111	unsigned long i;
5112	unsigned long num_pages;
5113	struct page *page;
5114
5115	num_pages = num_extent_pages(eb->start, eb->len);
5116
5117	for (i = 0; i < num_pages; i++) {
5118		page = eb->pages[i];
5119		if (!PageDirty(page))
5120			continue;
5121
5122		lock_page(page);
5123		WARN_ON(!PagePrivate(page));
5124
5125		clear_page_dirty_for_io(page);
5126		spin_lock_irq(&page->mapping->tree_lock);
5127		if (!PageDirty(page)) {
5128			radix_tree_tag_clear(&page->mapping->page_tree,
5129						page_index(page),
5130						PAGECACHE_TAG_DIRTY);
5131		}
5132		spin_unlock_irq(&page->mapping->tree_lock);
5133		ClearPageError(page);
5134		unlock_page(page);
5135	}
5136	WARN_ON(atomic_read(&eb->refs) == 0);
5137}
5138
5139int set_extent_buffer_dirty(struct extent_buffer *eb)
5140{
5141	unsigned long i;
5142	unsigned long num_pages;
5143	int was_dirty = 0;
5144
5145	check_buffer_tree_ref(eb);
5146
5147	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5148
5149	num_pages = num_extent_pages(eb->start, eb->len);
5150	WARN_ON(atomic_read(&eb->refs) == 0);
5151	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5152
5153	for (i = 0; i < num_pages; i++)
5154		set_page_dirty(eb->pages[i]);
5155	return was_dirty;
5156}
5157
5158int clear_extent_buffer_uptodate(struct extent_buffer *eb)
5159{
5160	unsigned long i;
5161	struct page *page;
5162	unsigned long num_pages;
5163
5164	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5165	num_pages = num_extent_pages(eb->start, eb->len);
5166	for (i = 0; i < num_pages; i++) {
5167		page = eb->pages[i];
5168		if (page)
5169			ClearPageUptodate(page);
5170	}
5171	return 0;
5172}
5173
5174int set_extent_buffer_uptodate(struct extent_buffer *eb)
5175{
5176	unsigned long i;
5177	struct page *page;
5178	unsigned long num_pages;
5179
5180	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5181	num_pages = num_extent_pages(eb->start, eb->len);
5182	for (i = 0; i < num_pages; i++) {
5183		page = eb->pages[i];
5184		SetPageUptodate(page);
5185	}
5186	return 0;
5187}
5188
5189int extent_buffer_uptodate(struct extent_buffer *eb)
5190{
5191	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5192}
5193
5194int read_extent_buffer_pages(struct extent_io_tree *tree,
5195			     struct extent_buffer *eb, u64 start, int wait,
5196			     get_extent_t *get_extent, int mirror_num)
5197{
5198	unsigned long i;
5199	unsigned long start_i;
5200	struct page *page;
5201	int err;
5202	int ret = 0;
5203	int locked_pages = 0;
5204	int all_uptodate = 1;
5205	unsigned long num_pages;
5206	unsigned long num_reads = 0;
5207	struct bio *bio = NULL;
5208	unsigned long bio_flags = 0;
5209
5210	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5211		return 0;
5212
5213	if (start) {
5214		WARN_ON(start < eb->start);
5215		start_i = (start >> PAGE_CACHE_SHIFT) -
5216			(eb->start >> PAGE_CACHE_SHIFT);
5217	} else {
5218		start_i = 0;
5219	}
5220
5221	num_pages = num_extent_pages(eb->start, eb->len);
5222	for (i = start_i; i < num_pages; i++) {
5223		page = eb->pages[i];
5224		if (wait == WAIT_NONE) {
5225			if (!trylock_page(page))
5226				goto unlock_exit;
5227		} else {
5228			lock_page(page);
5229		}
5230		locked_pages++;
5231		if (!PageUptodate(page)) {
5232			num_reads++;
5233			all_uptodate = 0;
5234		}
5235	}
5236	if (all_uptodate) {
5237		if (start_i == 0)
5238			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5239		goto unlock_exit;
5240	}
5241
5242	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5243	eb->read_mirror = 0;
5244	atomic_set(&eb->io_pages, num_reads);
5245	for (i = start_i; i < num_pages; i++) {
5246		page = eb->pages[i];
5247		if (!PageUptodate(page)) {
5248			ClearPageError(page);
5249			err = __extent_read_full_page(tree, page,
5250						      get_extent, &bio,
5251						      mirror_num, &bio_flags,
5252						      READ | REQ_META);
5253			if (err)
5254				ret = err;
5255		} else {
5256			unlock_page(page);
5257		}
5258	}
5259
5260	if (bio) {
5261		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
5262				     bio_flags);
5263		if (err)
5264			return err;
5265	}
5266
5267	if (ret || wait != WAIT_COMPLETE)
5268		return ret;
5269
5270	for (i = start_i; i < num_pages; i++) {
5271		page = eb->pages[i];
5272		wait_on_page_locked(page);
5273		if (!PageUptodate(page))
5274			ret = -EIO;
5275	}
5276
5277	return ret;
5278
5279unlock_exit:
5280	i = start_i;
5281	while (locked_pages > 0) {
5282		page = eb->pages[i];
5283		i++;
5284		unlock_page(page);
5285		locked_pages--;
5286	}
5287	return ret;
5288}
5289
5290void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5291			unsigned long start,
5292			unsigned long len)
5293{
5294	size_t cur;
5295	size_t offset;
5296	struct page *page;
5297	char *kaddr;
5298	char *dst = (char *)dstv;
5299	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5300	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5301
5302	WARN_ON(start > eb->len);
5303	WARN_ON(start + len > eb->start + eb->len);
5304
5305	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5306
5307	while (len > 0) {
5308		page = eb->pages[i];
5309
5310		cur = min(len, (PAGE_CACHE_SIZE - offset));
5311		kaddr = page_address(page);
5312		memcpy(dst, kaddr + offset, cur);
5313
5314		dst += cur;
5315		len -= cur;
5316		offset = 0;
5317		i++;
5318	}
5319}
5320
5321int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5322			unsigned long start,
5323			unsigned long len)
5324{
5325	size_t cur;
5326	size_t offset;
5327	struct page *page;
5328	char *kaddr;
5329	char __user *dst = (char __user *)dstv;
5330	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5331	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5332	int ret = 0;
5333
5334	WARN_ON(start > eb->len);
5335	WARN_ON(start + len > eb->start + eb->len);
5336
5337	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5338
5339	while (len > 0) {
5340		page = eb->pages[i];
5341
5342		cur = min(len, (PAGE_CACHE_SIZE - offset));
5343		kaddr = page_address(page);
5344		if (copy_to_user(dst, kaddr + offset, cur)) {
5345			ret = -EFAULT;
5346			break;
5347		}
5348
5349		dst += cur;
5350		len -= cur;
5351		offset = 0;
5352		i++;
5353	}
5354
5355	return ret;
5356}
5357
5358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5359			       unsigned long min_len, char **map,
5360			       unsigned long *map_start,
5361			       unsigned long *map_len)
5362{
5363	size_t offset = start & (PAGE_CACHE_SIZE - 1);
5364	char *kaddr;
5365	struct page *p;
5366	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5367	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5368	unsigned long end_i = (start_offset + start + min_len - 1) >>
5369		PAGE_CACHE_SHIFT;
5370
5371	if (i != end_i)
5372		return -EINVAL;
5373
5374	if (i == 0) {
5375		offset = start_offset;
5376		*map_start = 0;
5377	} else {
5378		offset = 0;
5379		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
5380	}
5381
5382	if (start + min_len > eb->len) {
5383		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
5384		       "wanted %lu %lu\n",
5385		       eb->start, eb->len, start, min_len);
5386		return -EINVAL;
5387	}
5388
5389	p = eb->pages[i];
5390	kaddr = page_address(p);
5391	*map = kaddr + offset;
5392	*map_len = PAGE_CACHE_SIZE - offset;
5393	return 0;
5394}
5395
5396int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5397			  unsigned long start,
5398			  unsigned long len)
5399{
5400	size_t cur;
5401	size_t offset;
5402	struct page *page;
5403	char *kaddr;
5404	char *ptr = (char *)ptrv;
5405	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5406	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5407	int ret = 0;
5408
5409	WARN_ON(start > eb->len);
5410	WARN_ON(start + len > eb->start + eb->len);
5411
5412	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5413
5414	while (len > 0) {
5415		page = eb->pages[i];
5416
5417		cur = min(len, (PAGE_CACHE_SIZE - offset));
5418
5419		kaddr = page_address(page);
5420		ret = memcmp(ptr, kaddr + offset, cur);
5421		if (ret)
5422			break;
5423
5424		ptr += cur;
5425		len -= cur;
5426		offset = 0;
5427		i++;
5428	}
5429	return ret;
5430}
5431
5432void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5433			 unsigned long start, unsigned long len)
5434{
5435	size_t cur;
5436	size_t offset;
5437	struct page *page;
5438	char *kaddr;
5439	char *src = (char *)srcv;
5440	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5441	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5442
5443	WARN_ON(start > eb->len);
5444	WARN_ON(start + len > eb->start + eb->len);
5445
5446	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5447
5448	while (len > 0) {
5449		page = eb->pages[i];
5450		WARN_ON(!PageUptodate(page));
5451
5452		cur = min(len, PAGE_CACHE_SIZE - offset);
5453		kaddr = page_address(page);
5454		memcpy(kaddr + offset, src, cur);
5455
5456		src += cur;
5457		len -= cur;
5458		offset = 0;
5459		i++;
5460	}
5461}
5462
5463void memset_extent_buffer(struct extent_buffer *eb, char c,
5464			  unsigned long start, unsigned long len)
5465{
5466	size_t cur;
5467	size_t offset;
5468	struct page *page;
5469	char *kaddr;
5470	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
5471	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
5472
5473	WARN_ON(start > eb->len);
5474	WARN_ON(start + len > eb->start + eb->len);
5475
5476	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5477
5478	while (len > 0) {
5479		page = eb->pages[i];
5480		WARN_ON(!PageUptodate(page));
5481
5482		cur = min(len, PAGE_CACHE_SIZE - offset);
5483		kaddr = page_address(page);
5484		memset(kaddr + offset, c, cur);
5485
5486		len -= cur;
5487		offset = 0;
5488		i++;
5489	}
5490}
5491
5492void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5493			unsigned long dst_offset, unsigned long src_offset,
5494			unsigned long len)
5495{
5496	u64 dst_len = dst->len;
5497	size_t cur;
5498	size_t offset;
5499	struct page *page;
5500	char *kaddr;
5501	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5502	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
5503
5504	WARN_ON(src->len != dst_len);
5505
5506	offset = (start_offset + dst_offset) &
5507		(PAGE_CACHE_SIZE - 1);
5508
5509	while (len > 0) {
5510		page = dst->pages[i];
5511		WARN_ON(!PageUptodate(page));
5512
5513		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
5514
5515		kaddr = page_address(page);
5516		read_extent_buffer(src, kaddr + offset, src_offset, cur);
5517
5518		src_offset += cur;
5519		len -= cur;
5520		offset = 0;
5521		i++;
5522	}
5523}
5524
5525static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5526{
5527	unsigned long distance = (src > dst) ? src - dst : dst - src;
5528	return distance < len;
5529}
5530
5531static void copy_pages(struct page *dst_page, struct page *src_page,
5532		       unsigned long dst_off, unsigned long src_off,
5533		       unsigned long len)
5534{
5535	char *dst_kaddr = page_address(dst_page);
5536	char *src_kaddr;
5537	int must_memmove = 0;
5538
5539	if (dst_page != src_page) {
5540		src_kaddr = page_address(src_page);
5541	} else {
5542		src_kaddr = dst_kaddr;
5543		if (areas_overlap(src_off, dst_off, len))
5544			must_memmove = 1;
5545	}
5546
5547	if (must_memmove)
5548		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5549	else
5550		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
5551}
5552
5553void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5554			   unsigned long src_offset, unsigned long len)
5555{
5556	size_t cur;
5557	size_t dst_off_in_page;
5558	size_t src_off_in_page;
5559	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5560	unsigned long dst_i;
5561	unsigned long src_i;
5562
5563	if (src_offset + len > dst->len) {
5564		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5565		       "len %lu dst len %lu\n", src_offset, len, dst->len);
5566		BUG_ON(1);
5567	}
5568	if (dst_offset + len > dst->len) {
5569		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5570		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
5571		BUG_ON(1);
5572	}
5573
5574	while (len > 0) {
5575		dst_off_in_page = (start_offset + dst_offset) &
5576			(PAGE_CACHE_SIZE - 1);
5577		src_off_in_page = (start_offset + src_offset) &
5578			(PAGE_CACHE_SIZE - 1);
5579
5580		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
5581		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
5582
5583		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
5584					       src_off_in_page));
5585		cur = min_t(unsigned long, cur,
5586			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5587
5588		copy_pages(dst->pages[dst_i], dst->pages[src_i],
5589			   dst_off_in_page, src_off_in_page, cur);
5590
5591		src_offset += cur;
5592		dst_offset += cur;
5593		len -= cur;
5594	}
5595}
5596
5597void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5598			   unsigned long src_offset, unsigned long len)
5599{
5600	size_t cur;
5601	size_t dst_off_in_page;
5602	size_t src_off_in_page;
5603	unsigned long dst_end = dst_offset + len - 1;
5604	unsigned long src_end = src_offset + len - 1;
5605	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5606	unsigned long dst_i;
5607	unsigned long src_i;
5608
5609	if (src_offset + len > dst->len) {
5610		printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5611		       "len %lu len %lu\n", src_offset, len, dst->len);
5612		BUG_ON(1);
5613	}
5614	if (dst_offset + len > dst->len) {
5615		printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5616		       "len %lu len %lu\n", dst_offset, len, dst->len);
5617		BUG_ON(1);
5618	}
5619	if (dst_offset < src_offset) {
5620		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5621		return;
5622	}
5623	while (len > 0) {
5624		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
5625		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
5626
5627		dst_off_in_page = (start_offset + dst_end) &
5628			(PAGE_CACHE_SIZE - 1);
5629		src_off_in_page = (start_offset + src_end) &
5630			(PAGE_CACHE_SIZE - 1);
5631
5632		cur = min_t(unsigned long, len, src_off_in_page + 1);
5633		cur = min(cur, dst_off_in_page + 1);
5634		copy_pages(dst->pages[dst_i], dst->pages[src_i],
5635			   dst_off_in_page - cur + 1,
5636			   src_off_in_page - cur + 1, cur);
5637
5638		dst_end -= cur;
5639		src_end -= cur;
5640		len -= cur;
5641	}
5642}
5643
5644int try_release_extent_buffer(struct page *page)
5645{
5646	struct extent_buffer *eb;
5647
5648	/*
5649	 * We need to make sure noboody is attaching this page to an eb right
5650	 * now.
5651	 */
5652	spin_lock(&page->mapping->private_lock);
5653	if (!PagePrivate(page)) {
5654		spin_unlock(&page->mapping->private_lock);
5655		return 1;
5656	}
5657
5658	eb = (struct extent_buffer *)page->private;
5659	BUG_ON(!eb);
5660
5661	/*
5662	 * This is a little awful but should be ok, we need to make sure that
5663	 * the eb doesn't disappear out from under us while we're looking at
5664	 * this page.
5665	 */
5666	spin_lock(&eb->refs_lock);
5667	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5668		spin_unlock(&eb->refs_lock);
5669		spin_unlock(&page->mapping->private_lock);
5670		return 0;
5671	}
5672	spin_unlock(&page->mapping->private_lock);
5673
5674	/*
5675	 * If tree ref isn't set then we know the ref on this eb is a real ref,
5676	 * so just return, this page will likely be freed soon anyway.
5677	 */
5678	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5679		spin_unlock(&eb->refs_lock);
5680		return 0;
5681	}
5682
5683	return release_extent_buffer(eb);
5684}
5685