1/*
2 * Copyright (C) 2007 Oracle.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/backing-dev.h>
26#include <linux/mpage.h>
27#include <linux/falloc.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/slab.h>
33#include <linux/btrfs.h>
34#include <linux/uio.h>
35#include "ctree.h"
36#include "disk-io.h"
37#include "transaction.h"
38#include "btrfs_inode.h"
39#include "print-tree.h"
40#include "tree-log.h"
41#include "locking.h"
42#include "volumes.h"
43#include "qgroup.h"
44
45static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * When auto defrag is enabled we queue up these defrag structs to
 * remember which inodes need defragging passes.
 */
51struct inode_defrag {
52	struct rb_node rb_node;
53	/* objectid */
54	u64 ino;
55	/*
56	 * transid where the defrag was added, we search for
57	 * extents newer than this
58	 */
59	u64 transid;
60
61	/* root objectid */
62	u64 root;
63
64	/* last offset we were able to defrag */
65	u64 last_offset;
66
67	/* if we've wrapped around back to zero once already */
68	int cycled;
69};
70
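/*
 * Defrag records are kept in an rbtree ordered by root objectid first and
 * then by inode number; btrfs_run_defrag_inodes() walks them in that order.
 */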
71static int __compare_inode_defrag(struct inode_defrag *defrag1,
72				  struct inode_defrag *defrag2)
73{
74	if (defrag1->root > defrag2->root)
75		return 1;
76	else if (defrag1->root < defrag2->root)
77		return -1;
78	else if (defrag1->ino > defrag2->ino)
79		return 1;
80	else if (defrag1->ino < defrag2->ino)
81		return -1;
82	else
83		return 0;
84}
85
/* insert a record for an inode into the defrag tree.  The lock
 * must be held already.
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered.
 *
 * If an existing record is found, -EEXIST is returned and the
 * caller is expected to free the defrag item passed in.
 */
95static int __btrfs_add_inode_defrag(struct inode *inode,
96				    struct inode_defrag *defrag)
97{
98	struct btrfs_root *root = BTRFS_I(inode)->root;
99	struct inode_defrag *entry;
100	struct rb_node **p;
101	struct rb_node *parent = NULL;
102	int ret;
103
104	p = &root->fs_info->defrag_inodes.rb_node;
105	while (*p) {
106		parent = *p;
107		entry = rb_entry(parent, struct inode_defrag, rb_node);
108
109		ret = __compare_inode_defrag(defrag, entry);
110		if (ret < 0)
111			p = &parent->rb_left;
112		else if (ret > 0)
113			p = &parent->rb_right;
114		else {
115			/* if we're reinserting an entry for
116			 * an old defrag run, make sure to
117			 * lower the transid of our existing record
118			 */
119			if (defrag->transid < entry->transid)
120				entry->transid = defrag->transid;
121			if (defrag->last_offset > entry->last_offset)
122				entry->last_offset = defrag->last_offset;
123			return -EEXIST;
124		}
125	}
126	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
127	rb_link_node(&defrag->rb_node, parent, p);
128	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
129	return 0;
130}
131
132static inline int __need_auto_defrag(struct btrfs_root *root)
133{
134	if (!btrfs_test_opt(root, AUTO_DEFRAG))
135		return 0;
136
137	if (btrfs_fs_closing(root->fs_info))
138		return 0;
139
140	return 1;
141}
142
143/*
144 * insert a defrag record for this inode if auto defrag is
145 * enabled
146 */
147int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148			   struct inode *inode)
149{
150	struct btrfs_root *root = BTRFS_I(inode)->root;
151	struct inode_defrag *defrag;
152	u64 transid;
153	int ret;
154
155	if (!__need_auto_defrag(root))
156		return 0;
157
158	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
159		return 0;
160
161	if (trans)
162		transid = trans->transid;
163	else
164		transid = BTRFS_I(inode)->root->last_trans;
165
166	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
167	if (!defrag)
168		return -ENOMEM;
169
170	defrag->ino = btrfs_ino(inode);
171	defrag->transid = transid;
172	defrag->root = root->root_key.objectid;
173
174	spin_lock(&root->fs_info->defrag_inodes_lock);
175	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and the inode is then evicted
		 * from memory, the re-read inode won't have the flag set.  In
		 * that case we may find an existing defrag record in the tree.
		 */
181		ret = __btrfs_add_inode_defrag(inode, defrag);
182		if (ret)
183			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
184	} else {
185		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
186	}
187	spin_unlock(&root->fs_info->defrag_inodes_lock);
188	return 0;
189}
190
191/*
192 * Requeue the defrag object. If there is a defrag object that points to
193 * the same inode in the tree, we will merge them together (by
194 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
195 */
196static void btrfs_requeue_inode_defrag(struct inode *inode,
197				       struct inode_defrag *defrag)
198{
199	struct btrfs_root *root = BTRFS_I(inode)->root;
200	int ret;
201
202	if (!__need_auto_defrag(root))
203		goto out;
204
	/*
	 * Here we don't check the IN_DEFRAG flag, because we need to merge
	 * the records together.
	 */
209	spin_lock(&root->fs_info->defrag_inodes_lock);
210	ret = __btrfs_add_inode_defrag(inode, defrag);
211	spin_unlock(&root->fs_info->defrag_inodes_lock);
212	if (ret)
213		goto out;
214	return;
215out:
216	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
217}
218
/*
 * Pick the defrag record for the requested inode; if it doesn't exist,
 * take the next one in the tree.  The returned record is removed from
 * the tree.
 */
223static struct inode_defrag *
224btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
225{
226	struct inode_defrag *entry = NULL;
227	struct inode_defrag tmp;
228	struct rb_node *p;
229	struct rb_node *parent = NULL;
230	int ret;
231
232	tmp.ino = ino;
233	tmp.root = root;
234
235	spin_lock(&fs_info->defrag_inodes_lock);
236	p = fs_info->defrag_inodes.rb_node;
237	while (p) {
238		parent = p;
239		entry = rb_entry(parent, struct inode_defrag, rb_node);
240
241		ret = __compare_inode_defrag(&tmp, entry);
242		if (ret < 0)
243			p = parent->rb_left;
244		else if (ret > 0)
245			p = parent->rb_right;
246		else
247			goto out;
248	}
249
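	/*
	 * No exact match.  If the record we stopped on sorts before the
	 * requested (root, ino), step forward so the record we return is at
	 * or after the requested position.
	 */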
250	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
251		parent = rb_next(parent);
252		if (parent)
253			entry = rb_entry(parent, struct inode_defrag, rb_node);
254		else
255			entry = NULL;
256	}
257out:
258	if (entry)
259		rb_erase(parent, &fs_info->defrag_inodes);
260	spin_unlock(&fs_info->defrag_inodes_lock);
261	return entry;
262}
263
264void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
265{
266	struct inode_defrag *defrag;
267	struct rb_node *node;
268
269	spin_lock(&fs_info->defrag_inodes_lock);
270	node = rb_first(&fs_info->defrag_inodes);
271	while (node) {
272		rb_erase(node, &fs_info->defrag_inodes);
273		defrag = rb_entry(node, struct inode_defrag, rb_node);
274		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
275
276		cond_resched_lock(&fs_info->defrag_inodes_lock);
277
278		node = rb_first(&fs_info->defrag_inodes);
279	}
280	spin_unlock(&fs_info->defrag_inodes_lock);
281}
282
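/*
 * Each pass defrags at most this many pages of one inode before its record
 * is requeued, which keeps one large file from monopolizing the auto defrag
 * loop.
 */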
283#define BTRFS_DEFRAG_BATCH	1024
284
285static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
286				    struct inode_defrag *defrag)
287{
288	struct btrfs_root *inode_root;
289	struct inode *inode;
290	struct btrfs_key key;
291	struct btrfs_ioctl_defrag_range_args range;
292	int num_defrag;
293	int index;
294	int ret;
295
296	/* get the inode */
297	key.objectid = defrag->root;
298	key.type = BTRFS_ROOT_ITEM_KEY;
299	key.offset = (u64)-1;
300
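	/*
	 * Hold the subvol srcu read lock so the subvolume root can't be
	 * freed (e.g. by a subvolume deletion) while we look it up and grab
	 * a reference on the inode.
	 */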
301	index = srcu_read_lock(&fs_info->subvol_srcu);
302
303	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
304	if (IS_ERR(inode_root)) {
305		ret = PTR_ERR(inode_root);
306		goto cleanup;
307	}
308
309	key.objectid = defrag->ino;
310	key.type = BTRFS_INODE_ITEM_KEY;
311	key.offset = 0;
312	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
313	if (IS_ERR(inode)) {
314		ret = PTR_ERR(inode);
315		goto cleanup;
316	}
317	srcu_read_unlock(&fs_info->subvol_srcu, index);
318
319	/* do a chunk of defrag */
320	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
321	memset(&range, 0, sizeof(range));
322	range.len = (u64)-1;
323	range.start = defrag->last_offset;
324
325	sb_start_write(fs_info->sb);
326	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
327				       BTRFS_DEFRAG_BATCH);
328	sb_end_write(fs_info->sb);
329	/*
330	 * if we filled the whole defrag batch, there
331	 * must be more work to do.  Queue this defrag
332	 * again
333	 */
334	if (num_defrag == BTRFS_DEFRAG_BATCH) {
335		defrag->last_offset = range.start;
336		btrfs_requeue_inode_defrag(inode, defrag);
337	} else if (defrag->last_offset && !defrag->cycled) {
338		/*
339		 * we didn't fill our defrag batch, but
340		 * we didn't start at zero.  Make sure we loop
341		 * around to the start of the file.
342		 */
343		defrag->last_offset = 0;
344		defrag->cycled = 1;
345		btrfs_requeue_inode_defrag(inode, defrag);
346	} else {
347		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
348	}
349
350	iput(inode);
351	return 0;
352cleanup:
353	srcu_read_unlock(&fs_info->subvol_srcu, index);
354	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
355	return ret;
356}
357
358/*
359 * run through the list of inodes in the FS that need
360 * defragging
361 */
362int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
363{
364	struct inode_defrag *defrag;
365	u64 first_ino = 0;
366	u64 root_objectid = 0;
367
368	atomic_inc(&fs_info->defrag_running);
369	while (1) {
370		/* Pause the auto defragger. */
371		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
372			     &fs_info->fs_state))
373			break;
374
375		if (!__need_auto_defrag(fs_info->tree_root))
376			break;
377
378		/* find an inode to defrag */
379		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
380						 first_ino);
381		if (!defrag) {
382			if (root_objectid || first_ino) {
383				root_objectid = 0;
384				first_ino = 0;
385				continue;
386			} else {
387				break;
388			}
389		}
390
391		first_ino = defrag->ino + 1;
392		root_objectid = defrag->root;
393
394		__btrfs_run_defrag_inode(fs_info, defrag);
395	}
396	atomic_dec(&fs_info->defrag_running);
397
398	/*
399	 * during unmount, we use the transaction_wait queue to
400	 * wait for the defragger to stop
401	 */
402	wake_up(&fs_info->transaction_wait);
403	return 0;
404}
405
/* simple helper to copy data from the iov into pages that have already
 * been prepared and faulted in by the caller.  This should go away and
 * be replaced with calls into generic code.
 */
409static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
410					 size_t write_bytes,
411					 struct page **prepared_pages,
412					 struct iov_iter *i)
413{
414	size_t copied = 0;
415	size_t total_copied = 0;
416	int pg = 0;
417	int offset = pos & (PAGE_CACHE_SIZE - 1);
418
419	while (write_bytes > 0) {
420		size_t count = min_t(size_t,
421				     PAGE_CACHE_SIZE - offset, write_bytes);
422		struct page *page = prepared_pages[pg];
423		/*
424		 * Copy data from userspace to the current page
425		 */
426		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
427
428		/* Flush processor's dcache for this page */
429		flush_dcache_page(page);
430
431		/*
432		 * if we get a partial write, we can end up with
433		 * partially up to date pages.  These add
434		 * a lot of complexity, so make sure they don't
435		 * happen by forcing this copy to be retried.
436		 *
437		 * The rest of the btrfs_file_write code will fall
438		 * back to page at a time copies after we return 0.
439		 */
440		if (!PageUptodate(page) && copied < count)
441			copied = 0;
442
443		iov_iter_advance(i, copied);
444		write_bytes -= copied;
445		total_copied += copied;
446
		/* Return to __btrfs_buffered_write to fault the page in */
448		if (unlikely(copied == 0))
449			break;
450
451		if (copied < PAGE_CACHE_SIZE - offset) {
452			offset += copied;
453		} else {
454			pg++;
455			offset = 0;
456		}
457	}
458	return total_copied;
459}
460
461/*
462 * unlocks pages after btrfs_file_write is done with them
463 */
464static void btrfs_drop_pages(struct page **pages, size_t num_pages)
465{
466	size_t i;
467	for (i = 0; i < num_pages; i++) {
		/* PageChecked is some magic around finding pages that have
		 * been modified without going through btrfs_set_page_dirty;
		 * clear it here.  There should be no need to mark the pages
		 * accessed, as prepare_pages should have already marked them
		 * accessed via find_or_create_page().
		 */
474		ClearPageChecked(pages[i]);
475		unlock_page(pages[i]);
476		page_cache_release(pages[i]);
477	}
478}
479
480/*
481 * after copy_from_user, pages need to be dirtied and we need to make
482 * sure holes are created between the current EOF and the start of
483 * any next extents (if required).
484 *
485 * this also makes the decision about creating an inline extent vs
486 * doing real data extents, marking pages dirty and delalloc as required.
487 */
488int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
489			     struct page **pages, size_t num_pages,
490			     loff_t pos, size_t write_bytes,
491			     struct extent_state **cached)
492{
493	int err = 0;
494	int i;
495	u64 num_bytes;
496	u64 start_pos;
497	u64 end_of_last_block;
498	u64 end_pos = pos + write_bytes;
499	loff_t isize = i_size_read(inode);
500
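	/*
	 * Delalloc accounting works on whole sectors, so round the start of
	 * the write down to a sector boundary and extend the length to cover
	 * full sectors.
	 */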
501	start_pos = pos & ~((u64)root->sectorsize - 1);
502	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
503
504	end_of_last_block = start_pos + num_bytes - 1;
505	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
506					cached);
507	if (err)
508		return err;
509
510	for (i = 0; i < num_pages; i++) {
511		struct page *p = pages[i];
512		SetPageUptodate(p);
513		ClearPageChecked(p);
514		set_page_dirty(p);
515	}
516
517	/*
518	 * we've only changed i_size in ram, and we haven't updated
519	 * the disk i_size.  There is no need to log the inode
520	 * at this time.
521	 */
522	if (end_pos > isize)
523		i_size_write(inode, end_pos);
524	return 0;
525}
526
527/*
528 * this drops all the extents in the cache that intersect the range
529 * [start, end].  Existing extents are split as required.
530 */
531void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
532			     int skip_pinned)
533{
534	struct extent_map *em;
535	struct extent_map *split = NULL;
536	struct extent_map *split2 = NULL;
537	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
538	u64 len = end - start + 1;
539	u64 gen;
540	int ret;
541	int testend = 1;
542	unsigned long flags;
543	int compressed = 0;
544	bool modified;
545
546	WARN_ON(end < start);
547	if (end == (u64)-1) {
548		len = (u64)-1;
549		testend = 0;
550	}
551	while (1) {
552		int no_splits = 0;
553
554		modified = false;
555		if (!split)
556			split = alloc_extent_map();
557		if (!split2)
558			split2 = alloc_extent_map();
559		if (!split || !split2)
560			no_splits = 1;
561
562		write_lock(&em_tree->lock);
563		em = lookup_extent_mapping(em_tree, start, len);
564		if (!em) {
565			write_unlock(&em_tree->lock);
566			break;
567		}
568		flags = em->flags;
569		gen = em->generation;
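		/*
		 * Pinned mappings still have ordered IO in flight against
		 * them; if the caller asked us to skip those, leave them in
		 * the tree and just move past them.
		 */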
570		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
571			if (testend && em->start + em->len >= start + len) {
572				free_extent_map(em);
573				write_unlock(&em_tree->lock);
574				break;
575			}
576			start = em->start + em->len;
577			if (testend)
578				len = start + len - (em->start + em->len);
579			free_extent_map(em);
580			write_unlock(&em_tree->lock);
581			continue;
582		}
583		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
584		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
585		clear_bit(EXTENT_FLAG_LOGGING, &flags);
586		modified = !list_empty(&em->list);
587		if (no_splits)
588			goto next;
589
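		/*
		 * The existing mapping starts before the range we're
		 * dropping: keep the front piece as a new map.
		 */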
590		if (em->start < start) {
591			split->start = em->start;
592			split->len = start - em->start;
593
594			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
595				split->orig_start = em->orig_start;
596				split->block_start = em->block_start;
597
598				if (compressed)
599					split->block_len = em->block_len;
600				else
601					split->block_len = split->len;
602				split->orig_block_len = max(split->block_len,
603						em->orig_block_len);
604				split->ram_bytes = em->ram_bytes;
605			} else {
606				split->orig_start = split->start;
607				split->block_len = 0;
608				split->block_start = em->block_start;
609				split->orig_block_len = 0;
610				split->ram_bytes = split->len;
611			}
612
613			split->generation = gen;
614			split->bdev = em->bdev;
615			split->flags = flags;
616			split->compress_type = em->compress_type;
617			replace_extent_mapping(em_tree, em, split, modified);
618			free_extent_map(split);
619			split = split2;
620			split2 = NULL;
621		}
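		/*
		 * The existing mapping extends past the end of the range
		 * we're dropping: keep the tail piece as a new map.
		 */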
622		if (testend && em->start + em->len > start + len) {
623			u64 diff = start + len - em->start;
624
625			split->start = start + len;
626			split->len = em->start + em->len - (start + len);
627			split->bdev = em->bdev;
628			split->flags = flags;
629			split->compress_type = em->compress_type;
630			split->generation = gen;
631
632			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
633				split->orig_block_len = max(em->block_len,
634						    em->orig_block_len);
635
636				split->ram_bytes = em->ram_bytes;
637				if (compressed) {
638					split->block_len = em->block_len;
639					split->block_start = em->block_start;
640					split->orig_start = em->orig_start;
641				} else {
642					split->block_len = split->len;
643					split->block_start = em->block_start
644						+ diff;
645					split->orig_start = em->orig_start;
646				}
647			} else {
648				split->ram_bytes = split->len;
649				split->orig_start = split->start;
650				split->block_len = 0;
651				split->block_start = em->block_start;
652				split->orig_block_len = 0;
653			}
654
655			if (extent_map_in_tree(em)) {
656				replace_extent_mapping(em_tree, em, split,
657						       modified);
658			} else {
659				ret = add_extent_mapping(em_tree, split,
660							 modified);
661				ASSERT(ret == 0); /* Logic error */
662			}
663			free_extent_map(split);
664			split = NULL;
665		}
666next:
667		if (extent_map_in_tree(em))
668			remove_extent_mapping(em_tree, em);
669		write_unlock(&em_tree->lock);
670
671		/* once for us */
672		free_extent_map(em);
		/* once for the tree */
674		free_extent_map(em);
675	}
676	if (split)
677		free_extent_map(split);
678	if (split2)
679		free_extent_map(split2);
680}
681
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
691int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
692			 struct btrfs_root *root, struct inode *inode,
693			 struct btrfs_path *path, u64 start, u64 end,
694			 u64 *drop_end, int drop_cache,
695			 int replace_extent,
696			 u32 extent_item_size,
697			 int *key_inserted)
698{
699	struct extent_buffer *leaf;
700	struct btrfs_file_extent_item *fi;
701	struct btrfs_key key;
702	struct btrfs_key new_key;
703	u64 ino = btrfs_ino(inode);
704	u64 search_start = start;
705	u64 disk_bytenr = 0;
706	u64 num_bytes = 0;
707	u64 extent_offset = 0;
708	u64 extent_end = 0;
709	int del_nr = 0;
710	int del_slot = 0;
711	int extent_type;
712	int recow;
713	int ret;
714	int modify_tree = -1;
715	int update_refs;
716	int found = 0;
717	int leafs_visited = 0;
718
719	if (drop_cache)
720		btrfs_drop_extent_cache(inode, start, end - 1, 0);
721
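	/*
	 * A range starting at or beyond the on-disk i_size usually has no
	 * extent items to drop, so start with a read-only search (no COW).
	 * If we do find something to modify, the loop below resets
	 * modify_tree to -1 and searches again.
	 */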
722	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
723		modify_tree = 0;
724
725	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
726		       root == root->fs_info->tree_root);
727	while (1) {
728		recow = 0;
729		ret = btrfs_lookup_file_extent(trans, root, path, ino,
730					       search_start, modify_tree);
731		if (ret < 0)
732			break;
733		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
734			leaf = path->nodes[0];
735			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
736			if (key.objectid == ino &&
737			    key.type == BTRFS_EXTENT_DATA_KEY)
738				path->slots[0]--;
739		}
740		ret = 0;
741		leafs_visited++;
742next_slot:
743		leaf = path->nodes[0];
744		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
745			BUG_ON(del_nr > 0);
746			ret = btrfs_next_leaf(root, path);
747			if (ret < 0)
748				break;
749			if (ret > 0) {
750				ret = 0;
751				break;
752			}
753			leafs_visited++;
754			leaf = path->nodes[0];
755			recow = 1;
756		}
757
758		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
759
760		if (key.objectid > ino)
761			break;
762		if (WARN_ON_ONCE(key.objectid < ino) ||
763		    key.type < BTRFS_EXTENT_DATA_KEY) {
764			ASSERT(del_nr == 0);
765			path->slots[0]++;
766			goto next_slot;
767		}
768		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
769			break;
770
771		fi = btrfs_item_ptr(leaf, path->slots[0],
772				    struct btrfs_file_extent_item);
773		extent_type = btrfs_file_extent_type(leaf, fi);
774
775		if (extent_type == BTRFS_FILE_EXTENT_REG ||
776		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
777			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
778			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
779			extent_offset = btrfs_file_extent_offset(leaf, fi);
780			extent_end = key.offset +
781				btrfs_file_extent_num_bytes(leaf, fi);
782		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
783			extent_end = key.offset +
784				btrfs_file_extent_inline_len(leaf,
785						     path->slots[0], fi);
786		} else {
787			/* can't happen */
788			BUG();
789		}
790
		/*
		 * Don't skip extent items representing 0 byte lengths.  They
		 * used to be created (bug) when punching holes hit an -ENOSPC
		 * condition.  So if we find one here, just ensure we delete it,
		 * otherwise we would insert a new file extent item with the
		 * same key (offset) as that 0 bytes length file extent item in
		 * the call to setup_items_for_insert() later in this function.
		 */
800		if (extent_end == key.offset && extent_end >= search_start)
801			goto delete_extent_item;
802
803		if (extent_end <= search_start) {
804			path->slots[0]++;
805			goto next_slot;
806		}
807
808		found = 1;
809		search_start = max(key.offset, start);
810		if (recow || !modify_tree) {
811			modify_tree = -1;
812			btrfs_release_path(path);
813			continue;
814		}
815
816		/*
817		 *     | - range to drop - |
818		 *  | -------- extent -------- |
819		 */
820		if (start > key.offset && end < extent_end) {
821			BUG_ON(del_nr > 0);
822			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
823				ret = -EOPNOTSUPP;
824				break;
825			}
826
827			memcpy(&new_key, &key, sizeof(new_key));
828			new_key.offset = start;
829			ret = btrfs_duplicate_item(trans, root, path,
830						   &new_key);
831			if (ret == -EAGAIN) {
832				btrfs_release_path(path);
833				continue;
834			}
835			if (ret < 0)
836				break;
837
838			leaf = path->nodes[0];
839			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
840					    struct btrfs_file_extent_item);
841			btrfs_set_file_extent_num_bytes(leaf, fi,
842							start - key.offset);
843
844			fi = btrfs_item_ptr(leaf, path->slots[0],
845					    struct btrfs_file_extent_item);
846
847			extent_offset += start - key.offset;
848			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
849			btrfs_set_file_extent_num_bytes(leaf, fi,
850							extent_end - start);
851			btrfs_mark_buffer_dirty(leaf);
852
853			if (update_refs && disk_bytenr > 0) {
854				ret = btrfs_inc_extent_ref(trans, root,
855						disk_bytenr, num_bytes, 0,
856						root->root_key.objectid,
857						new_key.objectid,
858						start - extent_offset, 1);
859				BUG_ON(ret); /* -ENOMEM */
860			}
861			key.offset = start;
862		}
863		/*
864		 *  | ---- range to drop ----- |
865		 *      | -------- extent -------- |
866		 */
867		if (start <= key.offset && end < extent_end) {
868			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
869				ret = -EOPNOTSUPP;
870				break;
871			}
872
873			memcpy(&new_key, &key, sizeof(new_key));
874			new_key.offset = end;
875			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
876
877			extent_offset += end - key.offset;
878			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
879			btrfs_set_file_extent_num_bytes(leaf, fi,
880							extent_end - end);
881			btrfs_mark_buffer_dirty(leaf);
882			if (update_refs && disk_bytenr > 0)
883				inode_sub_bytes(inode, end - key.offset);
884			break;
885		}
886
887		search_start = extent_end;
888		/*
889		 *       | ---- range to drop ----- |
890		 *  | -------- extent -------- |
891		 */
892		if (start > key.offset && end >= extent_end) {
893			BUG_ON(del_nr > 0);
894			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
895				ret = -EOPNOTSUPP;
896				break;
897			}
898
899			btrfs_set_file_extent_num_bytes(leaf, fi,
900							start - key.offset);
901			btrfs_mark_buffer_dirty(leaf);
902			if (update_refs && disk_bytenr > 0)
903				inode_sub_bytes(inode, extent_end - start);
904			if (end == extent_end)
905				break;
906
907			path->slots[0]++;
908			goto next_slot;
909		}
910
911		/*
912		 *  | ---- range to drop ----- |
913		 *    | ------ extent ------ |
914		 */
915		if (start <= key.offset && end >= extent_end) {
916delete_extent_item:
917			if (del_nr == 0) {
918				del_slot = path->slots[0];
919				del_nr = 1;
920			} else {
921				BUG_ON(del_slot + del_nr != path->slots[0]);
922				del_nr++;
923			}
924
925			if (update_refs &&
926			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
927				inode_sub_bytes(inode,
928						extent_end - key.offset);
929				extent_end = ALIGN(extent_end,
930						   root->sectorsize);
931			} else if (update_refs && disk_bytenr > 0) {
932				ret = btrfs_free_extent(trans, root,
933						disk_bytenr, num_bytes, 0,
934						root->root_key.objectid,
935						key.objectid, key.offset -
936						extent_offset, 0);
937				BUG_ON(ret); /* -ENOMEM */
938				inode_sub_bytes(inode,
939						extent_end - key.offset);
940			}
941
942			if (end == extent_end)
943				break;
944
945			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
946				path->slots[0]++;
947				goto next_slot;
948			}
949
950			ret = btrfs_del_items(trans, root, path, del_slot,
951					      del_nr);
952			if (ret) {
953				btrfs_abort_transaction(trans, root, ret);
954				break;
955			}
956
957			del_nr = 0;
958			del_slot = 0;
959
960			btrfs_release_path(path);
961			continue;
962		}
963
964		BUG_ON(1);
965	}
966
967	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to the first slot, so that after the
		 * delete, if items are moved off from our leaf to its
		 * immediate left or right neighbor leaves, we end up with a
		 * correct and adjusted path->slots[0] for our insertion (if
		 * replace_extent != 0).
		 */
974		path->slots[0] = del_slot;
975		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
976		if (ret)
977			btrfs_abort_transaction(trans, root, ret);
978	}
979
980	leaf = path->nodes[0];
981	/*
982	 * If btrfs_del_items() was called, it might have deleted a leaf, in
983	 * which case it unlocked our path, so check path->locks[0] matches a
984	 * write lock.
985	 */
986	if (!ret && replace_extent && leafs_visited == 1 &&
987	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
988	     path->locks[0] == BTRFS_WRITE_LOCK) &&
989	    btrfs_leaf_free_space(root, leaf) >=
990	    sizeof(struct btrfs_item) + extent_item_size) {
991
992		key.objectid = ino;
993		key.type = BTRFS_EXTENT_DATA_KEY;
994		key.offset = start;
995		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
996			struct btrfs_key slot_key;
997
998			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
999			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1000				path->slots[0]++;
1001		}
1002		setup_items_for_insert(root, path, &key,
1003				       &extent_item_size,
1004				       extent_item_size,
1005				       sizeof(struct btrfs_item) +
1006				       extent_item_size, 1);
1007		*key_inserted = 1;
1008	}
1009
1010	if (!replace_extent || !(*key_inserted))
1011		btrfs_release_path(path);
1012	if (drop_end)
1013		*drop_end = found ? min(end, extent_end) : end;
1014	return ret;
1015}
1016
1017int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1018		       struct btrfs_root *root, struct inode *inode, u64 start,
1019		       u64 end, int drop_cache)
1020{
1021	struct btrfs_path *path;
1022	int ret;
1023
1024	path = btrfs_alloc_path();
1025	if (!path)
1026		return -ENOMEM;
1027	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1028				   drop_cache, 0, 0, NULL);
1029	btrfs_free_path(path);
1030	return ret;
1031}
1032
1033static int extent_mergeable(struct extent_buffer *leaf, int slot,
1034			    u64 objectid, u64 bytenr, u64 orig_offset,
1035			    u64 *start, u64 *end)
1036{
1037	struct btrfs_file_extent_item *fi;
1038	struct btrfs_key key;
1039	u64 extent_end;
1040
1041	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1042		return 0;
1043
1044	btrfs_item_key_to_cpu(leaf, &key, slot);
1045	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1046		return 0;
1047
1048	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1049	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1050	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1051	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1052	    btrfs_file_extent_compression(leaf, fi) ||
1053	    btrfs_file_extent_encryption(leaf, fi) ||
1054	    btrfs_file_extent_other_encoding(leaf, fi))
1055		return 0;
1056
1057	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1058	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1059		return 0;
1060
1061	*start = key.offset;
1062	*end = extent_end;
1063	return 1;
1064}
1065
/*
 * Mark the extent in the range start - end as written.
 *
 * This changes the extent type from 'pre-allocated' to 'regular'.  If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
1073int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1074			      struct inode *inode, u64 start, u64 end)
1075{
1076	struct btrfs_root *root = BTRFS_I(inode)->root;
1077	struct extent_buffer *leaf;
1078	struct btrfs_path *path;
1079	struct btrfs_file_extent_item *fi;
1080	struct btrfs_key key;
1081	struct btrfs_key new_key;
1082	u64 bytenr;
1083	u64 num_bytes;
1084	u64 extent_end;
1085	u64 orig_offset;
1086	u64 other_start;
1087	u64 other_end;
1088	u64 split;
1089	int del_nr = 0;
1090	int del_slot = 0;
1091	int recow;
1092	int ret;
1093	u64 ino = btrfs_ino(inode);
1094
1095	path = btrfs_alloc_path();
1096	if (!path)
1097		return -ENOMEM;
1098again:
1099	recow = 0;
1100	split = start;
1101	key.objectid = ino;
1102	key.type = BTRFS_EXTENT_DATA_KEY;
1103	key.offset = split;
1104
1105	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1106	if (ret < 0)
1107		goto out;
1108	if (ret > 0 && path->slots[0] > 0)
1109		path->slots[0]--;
1110
1111	leaf = path->nodes[0];
1112	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1113	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
1114	fi = btrfs_item_ptr(leaf, path->slots[0],
1115			    struct btrfs_file_extent_item);
1116	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
1117	       BTRFS_FILE_EXTENT_PREALLOC);
1118	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1119	BUG_ON(key.offset > start || extent_end < end);
1120
1121	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1122	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1123	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1124	memcpy(&new_key, &key, sizeof(new_key));
1125
1126	if (start == key.offset && end < extent_end) {
1127		other_start = 0;
1128		other_end = start;
1129		if (extent_mergeable(leaf, path->slots[0] - 1,
1130				     ino, bytenr, orig_offset,
1131				     &other_start, &other_end)) {
1132			new_key.offset = end;
1133			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1134			fi = btrfs_item_ptr(leaf, path->slots[0],
1135					    struct btrfs_file_extent_item);
1136			btrfs_set_file_extent_generation(leaf, fi,
1137							 trans->transid);
1138			btrfs_set_file_extent_num_bytes(leaf, fi,
1139							extent_end - end);
1140			btrfs_set_file_extent_offset(leaf, fi,
1141						     end - orig_offset);
1142			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1143					    struct btrfs_file_extent_item);
1144			btrfs_set_file_extent_generation(leaf, fi,
1145							 trans->transid);
1146			btrfs_set_file_extent_num_bytes(leaf, fi,
1147							end - other_start);
1148			btrfs_mark_buffer_dirty(leaf);
1149			goto out;
1150		}
1151	}
1152
1153	if (start > key.offset && end == extent_end) {
1154		other_start = end;
1155		other_end = 0;
1156		if (extent_mergeable(leaf, path->slots[0] + 1,
1157				     ino, bytenr, orig_offset,
1158				     &other_start, &other_end)) {
1159			fi = btrfs_item_ptr(leaf, path->slots[0],
1160					    struct btrfs_file_extent_item);
1161			btrfs_set_file_extent_num_bytes(leaf, fi,
1162							start - key.offset);
1163			btrfs_set_file_extent_generation(leaf, fi,
1164							 trans->transid);
1165			path->slots[0]++;
1166			new_key.offset = start;
1167			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1168
1169			fi = btrfs_item_ptr(leaf, path->slots[0],
1170					    struct btrfs_file_extent_item);
1171			btrfs_set_file_extent_generation(leaf, fi,
1172							 trans->transid);
1173			btrfs_set_file_extent_num_bytes(leaf, fi,
1174							other_end - start);
1175			btrfs_set_file_extent_offset(leaf, fi,
1176						     start - orig_offset);
1177			btrfs_mark_buffer_dirty(leaf);
1178			goto out;
1179		}
1180	}
1181
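	/*
	 * Neither neighbour absorbed the written range, so split the prealloc
	 * item: each pass of this loop peels the part before 'start' or after
	 * 'end' into its own item (which stays prealloc and takes an extra
	 * reference on the extent), leaving the item for [start, end) to be
	 * converted to a regular extent below.
	 */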
1182	while (start > key.offset || end < extent_end) {
1183		if (key.offset == start)
1184			split = end;
1185
1186		new_key.offset = split;
1187		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1188		if (ret == -EAGAIN) {
1189			btrfs_release_path(path);
1190			goto again;
1191		}
1192		if (ret < 0) {
1193			btrfs_abort_transaction(trans, root, ret);
1194			goto out;
1195		}
1196
1197		leaf = path->nodes[0];
1198		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1199				    struct btrfs_file_extent_item);
1200		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1201		btrfs_set_file_extent_num_bytes(leaf, fi,
1202						split - key.offset);
1203
1204		fi = btrfs_item_ptr(leaf, path->slots[0],
1205				    struct btrfs_file_extent_item);
1206
1207		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1208		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1209		btrfs_set_file_extent_num_bytes(leaf, fi,
1210						extent_end - split);
1211		btrfs_mark_buffer_dirty(leaf);
1212
1213		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1214					   root->root_key.objectid,
1215					   ino, orig_offset, 1);
1216		BUG_ON(ret); /* -ENOMEM */
1217
1218		if (split == start) {
1219			key.offset = start;
1220		} else {
1221			BUG_ON(start != key.offset);
1222			path->slots[0]--;
1223			extent_end = end;
1224		}
1225		recow = 1;
1226	}
1227
1228	other_start = end;
1229	other_end = 0;
1230	if (extent_mergeable(leaf, path->slots[0] + 1,
1231			     ino, bytenr, orig_offset,
1232			     &other_start, &other_end)) {
1233		if (recow) {
1234			btrfs_release_path(path);
1235			goto again;
1236		}
1237		extent_end = other_end;
1238		del_slot = path->slots[0] + 1;
1239		del_nr++;
1240		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1241					0, root->root_key.objectid,
1242					ino, orig_offset, 0);
1243		BUG_ON(ret); /* -ENOMEM */
1244	}
1245	other_start = 0;
1246	other_end = start;
1247	if (extent_mergeable(leaf, path->slots[0] - 1,
1248			     ino, bytenr, orig_offset,
1249			     &other_start, &other_end)) {
1250		if (recow) {
1251			btrfs_release_path(path);
1252			goto again;
1253		}
1254		key.offset = other_start;
1255		del_slot = path->slots[0];
1256		del_nr++;
1257		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1258					0, root->root_key.objectid,
1259					ino, orig_offset, 0);
1260		BUG_ON(ret); /* -ENOMEM */
1261	}
1262	if (del_nr == 0) {
1263		fi = btrfs_item_ptr(leaf, path->slots[0],
1264			   struct btrfs_file_extent_item);
1265		btrfs_set_file_extent_type(leaf, fi,
1266					   BTRFS_FILE_EXTENT_REG);
1267		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1268		btrfs_mark_buffer_dirty(leaf);
1269	} else {
1270		fi = btrfs_item_ptr(leaf, del_slot - 1,
1271			   struct btrfs_file_extent_item);
1272		btrfs_set_file_extent_type(leaf, fi,
1273					   BTRFS_FILE_EXTENT_REG);
1274		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1275		btrfs_set_file_extent_num_bytes(leaf, fi,
1276						extent_end - key.offset);
1277		btrfs_mark_buffer_dirty(leaf);
1278
1279		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1280		if (ret < 0) {
1281			btrfs_abort_transaction(trans, root, ret);
1282			goto out;
1283		}
1284	}
1285out:
1286	btrfs_free_path(path);
1287	return 0;
1288}
1289
/*
 * on error we return an unlocked page and the error value;
 * on success we return a locked page and 0
 */
1294static int prepare_uptodate_page(struct page *page, u64 pos,
1295				 bool force_uptodate)
1296{
1297	int ret = 0;
1298
1299	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1300	    !PageUptodate(page)) {
1301		ret = btrfs_readpage(NULL, page);
1302		if (ret)
1303			return ret;
1304		lock_page(page);
1305		if (!PageUptodate(page)) {
1306			unlock_page(page);
1307			return -EIO;
1308		}
1309	}
1310	return 0;
1311}
1312
1313/*
1314 * this just gets pages into the page cache and locks them down.
1315 */
1316static noinline int prepare_pages(struct inode *inode, struct page **pages,
1317				  size_t num_pages, loff_t pos,
1318				  size_t write_bytes, bool force_uptodate)
1319{
1320	int i;
1321	unsigned long index = pos >> PAGE_CACHE_SHIFT;
1322	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1323	int err = 0;
1324	int faili;
1325
1326	for (i = 0; i < num_pages; i++) {
1327		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1328					       mask | __GFP_WRITE);
1329		if (!pages[i]) {
1330			faili = i - 1;
1331			err = -ENOMEM;
1332			goto fail;
1333		}
1334
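		/*
		 * Only the first and last page of the write can be partially
		 * covered, so those are the only ones that may need to be
		 * read in before we overwrite them.
		 */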
1335		if (i == 0)
1336			err = prepare_uptodate_page(pages[i], pos,
1337						    force_uptodate);
1338		if (i == num_pages - 1)
1339			err = prepare_uptodate_page(pages[i],
1340						    pos + write_bytes, false);
1341		if (err) {
1342			page_cache_release(pages[i]);
1343			faili = i - 1;
1344			goto fail;
1345		}
1346		wait_on_page_writeback(pages[i]);
1347	}
1348
1349	return 0;
1350fail:
1351	while (faili >= 0) {
1352		unlock_page(pages[faili]);
1353		page_cache_release(pages[faili]);
1354		faili--;
1355	}
1356	return err;
1357
1358}
1359
/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - the pages need to be re-prepared
 * any other value < 0 - something went wrong
 */
1370static noinline int
1371lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1372				size_t num_pages, loff_t pos,
1373				u64 *lockstart, u64 *lockend,
1374				struct extent_state **cached_state)
1375{
1376	u64 start_pos;
1377	u64 last_pos;
1378	int i;
1379	int ret = 0;
1380
1381	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1382	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
1383
1384	if (start_pos < inode->i_size) {
1385		struct btrfs_ordered_extent *ordered;
1386		lock_extent_bits(&BTRFS_I(inode)->io_tree,
1387				 start_pos, last_pos, 0, cached_state);
1388		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1389						     last_pos - start_pos + 1);
1390		if (ordered &&
1391		    ordered->file_offset + ordered->len > start_pos &&
1392		    ordered->file_offset <= last_pos) {
1393			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1394					     start_pos, last_pos,
1395					     cached_state, GFP_NOFS);
1396			for (i = 0; i < num_pages; i++) {
1397				unlock_page(pages[i]);
1398				page_cache_release(pages[i]);
1399			}
1400			btrfs_start_ordered_extent(inode, ordered, 1);
1401			btrfs_put_ordered_extent(ordered);
1402			return -EAGAIN;
1403		}
1404		if (ordered)
1405			btrfs_put_ordered_extent(ordered);
1406
1407		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1408				  last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1409				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1410				  0, 0, cached_state, GFP_NOFS);
1411		*lockstart = start_pos;
1412		*lockend = last_pos;
1413		ret = 1;
1414	}
1415
1416	for (i = 0; i < num_pages; i++) {
1417		if (clear_page_dirty_for_io(pages[i]))
1418			account_page_redirty(pages[i]);
1419		set_page_extent_mapped(pages[i]);
1420		WARN_ON(!PageLocked(pages[i]));
1421	}
1422
1423	return ret;
1424}
1425
1426static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1427				    size_t *write_bytes)
1428{
1429	struct btrfs_root *root = BTRFS_I(inode)->root;
1430	struct btrfs_ordered_extent *ordered;
1431	u64 lockstart, lockend;
1432	u64 num_bytes;
1433	int ret;
1434
1435	ret = btrfs_start_write_no_snapshoting(root);
1436	if (!ret)
1437		return -ENOSPC;
1438
1439	lockstart = round_down(pos, root->sectorsize);
1440	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1441
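	/*
	 * Lock the range and wait out any ordered extents covering it so
	 * that can_nocow_extent() sees the final on-disk extent layout.
	 */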
1442	while (1) {
1443		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1444		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1445						     lockend - lockstart + 1);
1446		if (!ordered) {
1447			break;
1448		}
1449		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1450		btrfs_start_ordered_extent(inode, ordered, 1);
1451		btrfs_put_ordered_extent(ordered);
1452	}
1453
1454	num_bytes = lockend - lockstart + 1;
1455	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1456	if (ret <= 0) {
1457		ret = 0;
1458		btrfs_end_write_no_snapshoting(root);
1459	} else {
1460		*write_bytes = min_t(size_t, *write_bytes ,
1461				     num_bytes - pos + lockstart);
1462	}
1463
1464	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1465
1466	return ret;
1467}
1468
1469static noinline ssize_t __btrfs_buffered_write(struct file *file,
1470					       struct iov_iter *i,
1471					       loff_t pos)
1472{
1473	struct inode *inode = file_inode(file);
1474	struct btrfs_root *root = BTRFS_I(inode)->root;
1475	struct page **pages = NULL;
1476	struct extent_state *cached_state = NULL;
1477	u64 release_bytes = 0;
1478	u64 lockstart;
1479	u64 lockend;
1480	unsigned long first_index;
1481	size_t num_written = 0;
1482	int nrptrs;
1483	int ret = 0;
1484	bool only_release_metadata = false;
1485	bool force_page_uptodate = false;
1486	bool need_unlock;
1487
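	/*
	 * Size the batch of pages copied per loop iteration: enough for the
	 * whole iov if possible, capped at one page worth of page pointers
	 * and at this task's dirty throttling allowance, but at least 8.
	 */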
1488	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1489			PAGE_CACHE_SIZE / (sizeof(struct page *)));
1490	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1491	nrptrs = max(nrptrs, 8);
1492	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1493	if (!pages)
1494		return -ENOMEM;
1495
1496	first_index = pos >> PAGE_CACHE_SHIFT;
1497
1498	while (iov_iter_count(i) > 0) {
1499		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1500		size_t write_bytes = min(iov_iter_count(i),
1501					 nrptrs * (size_t)PAGE_CACHE_SIZE -
1502					 offset);
1503		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1504						PAGE_CACHE_SIZE);
1505		size_t reserve_bytes;
1506		size_t dirty_pages;
1507		size_t copied;
1508
1509		WARN_ON(num_pages > nrptrs);
1510
1511		/*
1512		 * Fault pages before locking them in prepare_pages
1513		 * to avoid recursive lock
1514		 */
1515		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1516			ret = -EFAULT;
1517			break;
1518		}
1519
1520		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1521		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
1522		if (ret == -ENOSPC &&
1523		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1524					      BTRFS_INODE_PREALLOC))) {
1525			ret = check_can_nocow(inode, pos, &write_bytes);
1526			if (ret > 0) {
1527				only_release_metadata = true;
1528				/*
1529				 * our prealloc extent may be smaller than
1530				 * write_bytes, so scale down.
1531				 */
1532				num_pages = DIV_ROUND_UP(write_bytes + offset,
1533							 PAGE_CACHE_SIZE);
1534				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1535				ret = 0;
1536			} else {
1537				ret = -ENOSPC;
1538			}
1539		}
1540
1541		if (ret)
1542			break;
1543
1544		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1545		if (ret) {
1546			if (!only_release_metadata)
1547				btrfs_free_reserved_data_space(inode,
1548							       reserve_bytes);
1549			else
1550				btrfs_end_write_no_snapshoting(root);
1551			break;
1552		}
1553
1554		release_bytes = reserve_bytes;
1555		need_unlock = false;
1556again:
		/*
		 * This is going to set up the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop.
		 */
1562		ret = prepare_pages(inode, pages, num_pages,
1563				    pos, write_bytes,
1564				    force_page_uptodate);
1565		if (ret)
1566			break;
1567
1568		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1569						      pos, &lockstart, &lockend,
1570						      &cached_state);
1571		if (ret < 0) {
1572			if (ret == -EAGAIN)
1573				goto again;
1574			break;
1575		} else if (ret > 0) {
1576			need_unlock = true;
1577			ret = 0;
1578		}
1579
1580		copied = btrfs_copy_from_user(pos, num_pages,
1581					   write_bytes, pages, i);
1582
1583		/*
1584		 * if we have trouble faulting in the pages, fall
1585		 * back to one page at a time
1586		 */
1587		if (copied < write_bytes)
1588			nrptrs = 1;
1589
1590		if (copied == 0) {
1591			force_page_uptodate = true;
1592			dirty_pages = 0;
1593		} else {
1594			force_page_uptodate = false;
1595			dirty_pages = DIV_ROUND_UP(copied + offset,
1596						   PAGE_CACHE_SIZE);
1597		}
1598
1599		/*
		 * If we had a short copy we need to release the excess delalloc
1601		 * bytes we reserved.  We need to increment outstanding_extents
1602		 * because btrfs_delalloc_release_space will decrement it, but
1603		 * we still have an outstanding extent for the chunk we actually
1604		 * managed to copy.
1605		 */
1606		if (num_pages > dirty_pages) {
1607			release_bytes = (num_pages - dirty_pages) <<
1608				PAGE_CACHE_SHIFT;
1609			if (copied > 0) {
1610				spin_lock(&BTRFS_I(inode)->lock);
1611				BTRFS_I(inode)->outstanding_extents++;
1612				spin_unlock(&BTRFS_I(inode)->lock);
1613			}
1614			if (only_release_metadata)
1615				btrfs_delalloc_release_metadata(inode,
1616								release_bytes);
1617			else
1618				btrfs_delalloc_release_space(inode,
1619							     release_bytes);
1620		}
1621
1622		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1623
1624		if (copied > 0)
1625			ret = btrfs_dirty_pages(root, inode, pages,
1626						dirty_pages, pos, copied,
1627						NULL);
1628		if (need_unlock)
1629			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1630					     lockstart, lockend, &cached_state,
1631					     GFP_NOFS);
1632		if (ret) {
1633			btrfs_drop_pages(pages, num_pages);
1634			break;
1635		}
1636
1637		release_bytes = 0;
1638		if (only_release_metadata)
1639			btrfs_end_write_no_snapshoting(root);
1640
1641		if (only_release_metadata && copied > 0) {
1642			lockstart = round_down(pos, root->sectorsize);
1643			lockend = lockstart +
1644				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
1645
1646			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1647				       lockend, EXTENT_NORESERVE, NULL,
1648				       NULL, GFP_NOFS);
1649			only_release_metadata = false;
1650		}
1651
1652		btrfs_drop_pages(pages, num_pages);
1653
1654		cond_resched();
1655
1656		balance_dirty_pages_ratelimited(inode->i_mapping);
1657		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1658			btrfs_btree_balance_dirty(root);
1659
1660		pos += copied;
1661		num_written += copied;
1662	}
1663
1664	kfree(pages);
1665
1666	if (release_bytes) {
1667		if (only_release_metadata) {
1668			btrfs_end_write_no_snapshoting(root);
1669			btrfs_delalloc_release_metadata(inode, release_bytes);
1670		} else {
1671			btrfs_delalloc_release_space(inode, release_bytes);
1672		}
1673	}
1674
1675	return num_written ? num_written : ret;
1676}
1677
1678static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1679				    struct iov_iter *from,
1680				    loff_t pos)
1681{
1682	struct file *file = iocb->ki_filp;
1683	struct inode *inode = file_inode(file);
1684	ssize_t written;
1685	ssize_t written_buffered;
1686	loff_t endbyte;
1687	int err;
1688
1689	written = generic_file_direct_write(iocb, from, pos);
1690
1691	if (written < 0 || !iov_iter_count(from))
1692		return written;
1693
1694	pos += written;
1695	written_buffered = __btrfs_buffered_write(file, from, pos);
1696	if (written_buffered < 0) {
1697		err = written_buffered;
1698		goto out;
1699	}
1700	/*
1701	 * Ensure all data is persisted. We want the next direct IO read to be
1702	 * able to read what was just written.
1703	 */
1704	endbyte = pos + written_buffered - 1;
1705	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1706	if (err)
1707		goto out;
1708	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1709	if (err)
1710		goto out;
1711	written += written_buffered;
1712	iocb->ki_pos = pos + written_buffered;
1713	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1714				 endbyte >> PAGE_CACHE_SHIFT);
1715out:
1716	return written ? written : err;
1717}
1718
1719static void update_time_for_write(struct inode *inode)
1720{
1721	struct timespec now;
1722
1723	if (IS_NOCMTIME(inode))
1724		return;
1725
1726	now = current_fs_time(inode->i_sb);
1727	if (!timespec_equal(&inode->i_mtime, &now))
1728		inode->i_mtime = now;
1729
1730	if (!timespec_equal(&inode->i_ctime, &now))
1731		inode->i_ctime = now;
1732
1733	if (IS_I_VERSION(inode))
1734		inode_inc_iversion(inode);
1735}
1736
1737static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1738				    struct iov_iter *from)
1739{
1740	struct file *file = iocb->ki_filp;
1741	struct inode *inode = file_inode(file);
1742	struct btrfs_root *root = BTRFS_I(inode)->root;
1743	u64 start_pos;
1744	u64 end_pos;
1745	ssize_t num_written = 0;
1746	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1747	ssize_t err;
1748	loff_t pos;
1749	size_t count;
1750
1751	mutex_lock(&inode->i_mutex);
1752	err = generic_write_checks(iocb, from);
1753	if (err <= 0) {
1754		mutex_unlock(&inode->i_mutex);
1755		return err;
1756	}
1757
1758	current->backing_dev_info = inode_to_bdi(inode);
1759	err = file_remove_suid(file);
1760	if (err) {
1761		mutex_unlock(&inode->i_mutex);
1762		goto out;
1763	}
1764
	/*
	 * If BTRFS flips read-only due to an unexpected error
	 * (BTRFS_FS_STATE_ERROR gets set in fs_info->fs_state), we have to
	 * stop this write operation to ensure consistency, even though the
	 * file was opened as writable.
	 */
1771	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1772		mutex_unlock(&inode->i_mutex);
1773		err = -EROFS;
1774		goto out;
1775	}
1776
	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so any -ENOSPC is returned there.  We
	 * don't need to start yet another transaction to update the inode as we
	 * will update the inode when we finish writing whatever data we write.
	 */
1783	update_time_for_write(inode);
1784
1785	pos = iocb->ki_pos;
1786	count = iov_iter_count(from);
1787	start_pos = round_down(pos, root->sectorsize);
1788	if (start_pos > i_size_read(inode)) {
		/* Expand the hole to cover the written data, preventing an empty gap */
1790		end_pos = round_up(pos + count, root->sectorsize);
1791		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
1792		if (err) {
1793			mutex_unlock(&inode->i_mutex);
1794			goto out;
1795		}
1796	}
1797
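	/*
	 * For O_DSYNC/sync writes, raising sync_writers makes the bio
	 * submission path checksum and submit inline instead of punting to
	 * helper threads, which should keep the latency of the
	 * generic_write_sync() below down.
	 */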
1798	if (sync)
1799		atomic_inc(&BTRFS_I(inode)->sync_writers);
1800
1801	if (iocb->ki_flags & IOCB_DIRECT) {
1802		num_written = __btrfs_direct_write(iocb, from, pos);
1803	} else {
1804		num_written = __btrfs_buffered_write(file, from, pos);
1805		if (num_written > 0)
1806			iocb->ki_pos = pos + num_written;
1807	}
1808
1809	mutex_unlock(&inode->i_mutex);
1810
1811	/*
1812	 * We also have to set last_sub_trans to the current log transid,
1813	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
1815	 */
1816	spin_lock(&BTRFS_I(inode)->lock);
1817	BTRFS_I(inode)->last_sub_trans = root->log_transid;
1818	spin_unlock(&BTRFS_I(inode)->lock);
1819	if (num_written > 0) {
1820		err = generic_write_sync(file, pos, num_written);
1821		if (err < 0)
1822			num_written = err;
1823	}
1824
1825	if (sync)
1826		atomic_dec(&BTRFS_I(inode)->sync_writers);
1827out:
1828	current->backing_dev_info = NULL;
1829	return num_written ? num_written : err;
1830}
1831
1832int btrfs_release_file(struct inode *inode, struct file *filp)
1833{
1834	if (filp->private_data)
1835		btrfs_ioctl_trans_end(filp);
1836	/*
	 * ordered_data_close is set by setattr when we are about to truncate
1838	 * a file from a non-zero size to a zero size.  This tries to
1839	 * flush down new bytes that may have been written if the
1840	 * application were using truncate to replace a file in place.
1841	 */
1842	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1843			       &BTRFS_I(inode)->runtime_flags))
1844			filemap_flush(inode->i_mapping);
1845	return 0;
1846}
1847
1848static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1849{
1850	int ret;
1851
1852	atomic_inc(&BTRFS_I(inode)->sync_writers);
1853	ret = btrfs_fdatawrite_range(inode, start, end);
1854	atomic_dec(&BTRFS_I(inode)->sync_writers);
1855
1856	return ret;
1857}
1858
1859/*
1860 * fsync call for both files and directories.  This logs the inode into
1861 * the tree log instead of forcing full commits whenever possible.
1862 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
1865 *
1866 * It drops the inode mutex before doing the tree log commit.  This is an
1867 * important optimization for directories because holding the mutex prevents
1868 * new operations on the dir while we write to disk.
1869 */
1870int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1871{
1872	struct dentry *dentry = file->f_path.dentry;
1873	struct inode *inode = d_inode(dentry);
1874	struct btrfs_root *root = BTRFS_I(inode)->root;
1875	struct btrfs_trans_handle *trans;
1876	struct btrfs_log_ctx ctx;
1877	int ret = 0;
1878	bool full_sync = 0;
1879
1880	trace_btrfs_sync_file(file, datasync);
1881
	/*
	 * We write the dirty pages in the range and wait for them to complete
	 * outside of the ->i_mutex.  That way multiple tasks can flush dirty
	 * pages concurrently, which improves performance.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
1888	ret = start_ordered_ops(inode, start, end);
1889	if (ret)
1890		return ret;
1891
1892	mutex_lock(&inode->i_mutex);
1893	atomic_inc(&root->log_batch);
1894	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1895			     &BTRFS_I(inode)->runtime_flags);
1896	/*
	 * We might have had more pages made dirty after calling
1898	 * start_ordered_ops and before acquiring the inode's i_mutex.
1899	 */
1900	if (full_sync) {
1901		/*
1902		 * For a full sync, we need to make sure any ordered operations
1903		 * start and finish before we start logging the inode, so that
1904		 * all extents are persisted and the respective file extent
1905		 * items are in the fs/subvol btree.
1906		 */
1907		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1908	} else {
1909		/*
1910		 * Start any new ordered operations before starting to log the
1911		 * inode. We will wait for them to finish in btrfs_sync_log().
1912		 *
1913		 * Right before acquiring the inode's mutex, we might have new
1914		 * writes dirtying pages, which won't immediately start the
1915		 * respective ordered operations - that is done through the
1916		 * fill_delalloc callbacks invoked from the writepage and
1917		 * writepages address space operations. So make sure we start
1918		 * all ordered operations before starting to log our inode. Not
1919		 * doing this means that while logging the inode, writeback
1920		 * could start and invoke writepage/writepages, which would call
1921		 * the fill_delalloc callbacks (cow_file_range,
1922		 * submit_compressed_extents). These callbacks add first an
1923		 * extent map to the modified list of extents and then create
1924		 * the respective ordered operation, which means in
1925		 * tree-log.c:btrfs_log_inode() we might capture all existing
1926		 * ordered operations (with btrfs_get_logged_extents()) before
1927		 * the fill_delalloc callback adds its ordered operation, and by
1928		 * the time we visit the modified list of extent maps (with
1929		 * btrfs_log_changed_extents()), we see and process the extent
1930		 * map they created. We then use the extent map to construct a
1931		 * file extent item for logging without waiting for the
1932		 * respective ordered operation to finish - this file extent
1933		 * item points to a disk location that might not have yet been
1934		 * written to, containing random data - so after a crash a log
1935		 * replay will make our inode have file extent items that point
1936		 * to disk locations containing invalid data, as we returned
1937		 * success to userspace without waiting for the respective
1938		 * ordered operation to finish, because it wasn't captured by
1939		 * btrfs_get_logged_extents().
1940		 */
1941		ret = start_ordered_ops(inode, start, end);
1942	}
1943	if (ret) {
1944		mutex_unlock(&inode->i_mutex);
1945		goto out;
1946	}
1947	atomic_inc(&root->log_batch);
1948
1949	/*
1950	 * If the last transaction that changed this file was before the current
1951	 * transaction and we have the full sync flag set in our inode, we can
1952	 * bail out now without any syncing.
1953	 *
1954	 * Note that we can't bail out if the full sync flag isn't set. This is
1955	 * because when the full sync flag is set we start all ordered extents
1956	 * and wait for them to fully complete - when they complete they update
1957	 * the inode's last_trans field through:
1958	 *
1959	 *     btrfs_finish_ordered_io() ->
1960	 *         btrfs_update_inode_fallback() ->
1961	 *             btrfs_update_inode() ->
1962	 *                 btrfs_set_inode_last_trans()
1963	 *
1964	 * So we are sure that last_trans is up to date and can do this check to
1965	 * bail out safely. For the fast path, when the full sync flag is not
1966	 * set in our inode, we cannot do it because we start only our ordered
1967	 * extents and don't wait for them to complete (that is when
1968	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
1969	 * value might be less than or equal to fs_info->last_trans_committed,
1970	 * and setting a speculative last_trans for an inode when a buffered
1971	 * write is made (such as fs_info->generation + 1) would not
1972	 * be reliable since after setting the value and before fsync is called
1973	 * any number of transactions can start and commit (transaction kthread
1974	 * commits the current transaction periodically), and a transaction
1975	 * commit neither starts nor waits for ordered extents to complete.
1976	 */
1977	smp_mb();
1978	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
1979	    (full_sync && BTRFS_I(inode)->last_trans <=
1980	     root->fs_info->last_trans_committed)) {
1981		/*
1982		 * We've had everything committed since the last time we were
1983		 * modified so clear this flag in case it was set for whatever
1984		 * reason, it's no longer relevant.
1985		 */
1986		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1987			  &BTRFS_I(inode)->runtime_flags);
1988		mutex_unlock(&inode->i_mutex);
1989		goto out;
1990	}
1991
1992	/*
1993	 * OK, we haven't committed the transaction yet, let's do a commit
1994	 */
1995	if (file->private_data)
1996		btrfs_ioctl_trans_end(file);
1997
1998	/*
1999	 * We use start here because we will need to wait on the IO to complete
2000	 * in btrfs_sync_log, which could require joining a transaction (for
2001	 * example checking cross references in the nocow path).  If we use join
2002	 * here we could get into a situation where we're waiting on IO to
2003	 * happen that is blocked on a transaction trying to commit.  With start
2004	 * we inc the extwriter counter, so we wait for all extwriters to exit
2005	 * before we start blocking join'ers.  This comment is to keep somebody
2006	 * from thinking they are super smart and changing this to
2007	 * btrfs_join_transaction *cough*Josef*cough*.
2008	 */
2009	trans = btrfs_start_transaction(root, 0);
2010	if (IS_ERR(trans)) {
2011		ret = PTR_ERR(trans);
2012		mutex_unlock(&inode->i_mutex);
2013		goto out;
2014	}
2015	trans->sync = true;
2016
2017	btrfs_init_log_ctx(&ctx);
2018
2019	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
2020	if (ret < 0) {
2021		/* Fallthrough and commit/free transaction. */
2022		ret = 1;
2023	}
2024
2025	/* we've logged all the items and now have a consistent
2026	 * version of the file in the log.  It is possible that
2027	 * someone will come in and modify the file, but that's
2028	 * fine because the log is consistent on disk, and we
2029	 * have references to all of the file's extents
2030	 *
2031	 * It is possible that someone will come in and log the
2032	 * file again, but that will end up using the synchronization
2033	 * inside btrfs_sync_log to keep things safe.
2034	 */
2035	mutex_unlock(&inode->i_mutex);
2036
2037	/*
2038	 * If any of the ordered extents had an error, just return it to user
2039	 * space, so that the application knows some writes didn't succeed and
2040	 * can take proper action (e.g. retry). Blindly committing the
2041	 * transaction in this case would fool userspace into thinking everything
2042	 * was successful. And we also want to make sure our log doesn't contain
2043	 * file extent items pointing to extents that weren't fully written to -
2044	 * just like in the non fast fsync path, where we check for the ordered
2045	 * operation's error flag before writing to the log tree and return -EIO
2046	 * if any of them had this flag set (btrfs_wait_ordered_range) -
2047	 * therefore we need to check for errors in the ordered operations,
2048	 * which are indicated by ctx.io_err.
2049	 */
2050	if (ctx.io_err) {
2051		btrfs_end_transaction(trans, root);
2052		ret = ctx.io_err;
2053		goto out;
2054	}
2055
2056	if (ret != BTRFS_NO_LOG_SYNC) {
2057		if (!ret) {
2058			ret = btrfs_sync_log(trans, root, &ctx);
2059			if (!ret) {
2060				ret = btrfs_end_transaction(trans, root);
2061				goto out;
2062			}
2063		}
2064		if (!full_sync) {
2065			ret = btrfs_wait_ordered_range(inode, start,
2066						       end - start + 1);
2067			if (ret) {
2068				btrfs_end_transaction(trans, root);
2069				goto out;
2070			}
2071		}
2072		ret = btrfs_commit_transaction(trans, root);
2073	} else {
2074		ret = btrfs_end_transaction(trans, root);
2075	}
2076out:
2077	return ret > 0 ? -EIO : ret;
2078}
2079
2080static const struct vm_operations_struct btrfs_file_vm_ops = {
2081	.fault		= filemap_fault,
2082	.map_pages	= filemap_map_pages,
2083	.page_mkwrite	= btrfs_page_mkwrite,
2084};
2085
2086static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2087{
2088	struct address_space *mapping = filp->f_mapping;
2089
2090	if (!mapping->a_ops->readpage)
2091		return -ENOEXEC;
2092
2093	file_accessed(filp);
2094	vma->vm_ops = &btrfs_file_vm_ops;
2095
2096	return 0;
2097}
2098
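/*
 * Return 1 if the file extent item at @slot is a hole (a regular extent item
 * with a zero disk_bytenr) that is directly adjacent to the range
 * [start, end) - i.e. it either starts at @end or ends at @start - so a new
 * hole for that range can simply be merged into it.  Return 0 otherwise.
 */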
2099static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2100			  int slot, u64 start, u64 end)
2101{
2102	struct btrfs_file_extent_item *fi;
2103	struct btrfs_key key;
2104
2105	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2106		return 0;
2107
2108	btrfs_item_key_to_cpu(leaf, &key, slot);
2109	if (key.objectid != btrfs_ino(inode) ||
2110	    key.type != BTRFS_EXTENT_DATA_KEY)
2111		return 0;
2112
2113	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2114
2115	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2116		return 0;
2117
2118	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2119		return 0;
2120
2121	if (key.offset == end)
2122		return 1;
2123	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2124		return 1;
2125	return 0;
2126}
2127
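/*
 * Make sure the file extent items reflect a hole covering [offset, end):
 * the hole is merged into an adjacent hole extent item when possible,
 * otherwise an explicit hole extent item is inserted (this btree work is
 * skipped entirely on NO_HOLES filesystems).  The in-memory extent map tree
 * is updated to match; if that fails, the inode is flagged as needing a
 * full sync.
 */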
2128static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2129		      struct btrfs_path *path, u64 offset, u64 end)
2130{
2131	struct btrfs_root *root = BTRFS_I(inode)->root;
2132	struct extent_buffer *leaf;
2133	struct btrfs_file_extent_item *fi;
2134	struct extent_map *hole_em;
2135	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2136	struct btrfs_key key;
2137	int ret;
2138
2139	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2140		goto out;
2141
2142	key.objectid = btrfs_ino(inode);
2143	key.type = BTRFS_EXTENT_DATA_KEY;
2144	key.offset = offset;
2145
2146	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2147	if (ret < 0)
2148		return ret;
2149	BUG_ON(!ret);
2150
2151	leaf = path->nodes[0];
2152	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2153		u64 num_bytes;
2154
2155		path->slots[0]--;
2156		fi = btrfs_item_ptr(leaf, path->slots[0],
2157				    struct btrfs_file_extent_item);
2158		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2159			end - offset;
2160		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2161		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2162		btrfs_set_file_extent_offset(leaf, fi, 0);
2163		btrfs_mark_buffer_dirty(leaf);
2164		goto out;
2165	}
2166
2167	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2168		u64 num_bytes;
2169
2170		key.offset = offset;
2171		btrfs_set_item_key_safe(root->fs_info, path, &key);
2172		fi = btrfs_item_ptr(leaf, path->slots[0],
2173				    struct btrfs_file_extent_item);
2174		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2175			offset;
2176		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2177		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2178		btrfs_set_file_extent_offset(leaf, fi, 0);
2179		btrfs_mark_buffer_dirty(leaf);
2180		goto out;
2181	}
2182	btrfs_release_path(path);
2183
2184	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2185				       0, 0, end - offset, 0, end - offset,
2186				       0, 0, 0);
2187	if (ret)
2188		return ret;
2189
2190out:
2191	btrfs_release_path(path);
2192
2193	hole_em = alloc_extent_map();
2194	if (!hole_em) {
2195		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2196		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2197			&BTRFS_I(inode)->runtime_flags);
2198	} else {
2199		hole_em->start = offset;
2200		hole_em->len = end - offset;
2201		hole_em->ram_bytes = hole_em->len;
2202		hole_em->orig_start = offset;
2203
2204		hole_em->block_start = EXTENT_MAP_HOLE;
2205		hole_em->block_len = 0;
2206		hole_em->orig_block_len = 0;
2207		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2208		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2209		hole_em->generation = trans->transid;
2210
2211		do {
2212			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2213			write_lock(&em_tree->lock);
2214			ret = add_extent_mapping(em_tree, hole_em, 1);
2215			write_unlock(&em_tree->lock);
2216		} while (ret == -EEXIST);
2217		free_extent_map(hole_em);
2218		if (ret)
2219			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2220				&BTRFS_I(inode)->runtime_flags);
2221	}
2222
2223	return 0;
2224}
2225
2226/*
2227 * Find a hole extent on the given inode and change start/len to the end of
2228 * the hole extent (a hole/vacuum extent is one whose em->start <= start &&
2229 * em->start + em->len > start).
2230 * When a hole extent is found, return 1 and modify start/len accordingly.
2231 */
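/*
 * For example (illustrative numbers): with a hole extent map covering
 * [0, 64K) and *start = 4K, *len = 8K, the call returns 1 and leaves
 * *start = 64K and *len = 0, since the whole requested range sits inside
 * the hole.
 */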
2232static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2233{
2234	struct extent_map *em;
2235	int ret = 0;
2236
2237	em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
2238	if (IS_ERR_OR_NULL(em)) {
2239		if (!em)
2240			ret = -ENOMEM;
2241		else
2242			ret = PTR_ERR(em);
2243		return ret;
2244	}
2245
2246	/* Hole or vacuum extent (the latter only exists in no-holes mode) */
2247	if (em->block_start == EXTENT_MAP_HOLE) {
2248		ret = 1;
2249		*len = em->start + em->len > *start + *len ?
2250		       0 : *start + *len - em->start - em->len;
2251		*start = em->start + em->len;
2252	}
2253	free_extent_map(em);
2254	return ret;
2255}
2256
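/*
 * Punch a hole over the byte range [offset, offset + len) of the file.
 * Partial pages at either end are zeroed in place, ranges that are already
 * holes get skipped, and the covered extents are dropped a chunk at a time,
 * restarting the transaction on -ENOSPC with the help of a temporary block
 * reservation, before the freed range is finally recorded as a hole via
 * fill_holes().
 */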
2257static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2258{
2259	struct btrfs_root *root = BTRFS_I(inode)->root;
2260	struct extent_state *cached_state = NULL;
2261	struct btrfs_path *path;
2262	struct btrfs_block_rsv *rsv;
2263	struct btrfs_trans_handle *trans;
2264	u64 lockstart;
2265	u64 lockend;
2266	u64 tail_start;
2267	u64 tail_len;
2268	u64 orig_start = offset;
2269	u64 cur_offset;
2270	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2271	u64 drop_end;
2272	int ret = 0;
2273	int err = 0;
2274	int rsv_count;
2275	bool same_page;
2276	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2277	u64 ino_size;
2278	bool truncated_page = false;
2279	bool updated_inode = false;
2280
2281	ret = btrfs_wait_ordered_range(inode, offset, len);
2282	if (ret)
2283		return ret;
2284
2285	mutex_lock(&inode->i_mutex);
2286	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2287	ret = find_first_non_hole(inode, &offset, &len);
2288	if (ret < 0)
2289		goto out_only_mutex;
2290	if (ret && !len) {
2291		/* Already in a large hole */
2292		ret = 0;
2293		goto out_only_mutex;
2294	}
2295
2296	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2297	lockend = round_down(offset + len,
2298			     BTRFS_I(inode)->root->sectorsize) - 1;
2299	same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2300		    ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2301
2302	/*
2303	 * We don't need to truncate any page beyond the end of the file,
2304	 * because we are sure there is no data there.
2305	 */
2306	/*
2307	 * Only do this if the range stays within a single page and does not
2308	 * cover the entire page.
2309	 */
2310	if (same_page && len < PAGE_CACHE_SIZE) {
2311		if (offset < ino_size) {
2312			truncated_page = true;
2313			ret = btrfs_truncate_page(inode, offset, len, 0);
2314		} else {
2315			ret = 0;
2316		}
2317		goto out_only_mutex;
2318	}
2319
2320	/* zero back part of the first page */
2321	if (offset < ino_size) {
2322		truncated_page = true;
2323		ret = btrfs_truncate_page(inode, offset, 0, 0);
2324		if (ret) {
2325			mutex_unlock(&inode->i_mutex);
2326			return ret;
2327		}
2328	}
2329
2330	/* Check the aligned pages after the first unaligned page. If
2331	 * offset != orig_start, the first unaligned page and several
2332	 * following pages are already in holes, so the extra check can
2333	 * be skipped. */
2334	if (offset == orig_start) {
2335		/* after truncate page, check hole again */
2336		len = offset + len - lockstart;
2337		offset = lockstart;
2338		ret = find_first_non_hole(inode, &offset, &len);
2339		if (ret < 0)
2340			goto out_only_mutex;
2341		if (ret && !len) {
2342			ret = 0;
2343			goto out_only_mutex;
2344		}
2345		lockstart = offset;
2346	}
2347
2348	/* Check whether the unaligned tail part is in a hole */
2349	tail_start = lockend + 1;
2350	tail_len = offset + len - tail_start;
2351	if (tail_len) {
2352		ret = find_first_non_hole(inode, &tail_start, &tail_len);
2353		if (unlikely(ret < 0))
2354			goto out_only_mutex;
2355		if (!ret) {
2356			/* zero the front end of the last page */
2357			if (tail_start + tail_len < ino_size) {
2358				truncated_page = true;
2359				ret = btrfs_truncate_page(inode,
2360						tail_start + tail_len, 0, 1);
2361				if (ret)
2362					goto out_only_mutex;
2363			}
2364		}
2365	}
2366
2367	if (lockend < lockstart) {
2368		ret = 0;
2369		goto out_only_mutex;
2370	}
2371
2372	while (1) {
2373		struct btrfs_ordered_extent *ordered;
2374
2375		truncate_pagecache_range(inode, lockstart, lockend);
2376
2377		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2378				 0, &cached_state);
2379		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2380
2381		/*
2382		 * We need to make sure we have no ordered extents in this range
2383		 * and nobody raced in and read a page in this range. If we did,
2384		 * we need to try again.
2385		 */
2386		if ((!ordered ||
2387		    (ordered->file_offset + ordered->len <= lockstart ||
2388		     ordered->file_offset > lockend)) &&
2389		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2390			if (ordered)
2391				btrfs_put_ordered_extent(ordered);
2392			break;
2393		}
2394		if (ordered)
2395			btrfs_put_ordered_extent(ordered);
2396		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2397				     lockend, &cached_state, GFP_NOFS);
2398		ret = btrfs_wait_ordered_range(inode, lockstart,
2399					       lockend - lockstart + 1);
2400		if (ret) {
2401			mutex_unlock(&inode->i_mutex);
2402			return ret;
2403		}
2404	}
2405
2406	path = btrfs_alloc_path();
2407	if (!path) {
2408		ret = -ENOMEM;
2409		goto out;
2410	}
2411
2412	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2413	if (!rsv) {
2414		ret = -ENOMEM;
2415		goto out_free;
2416	}
2417	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2418	rsv->failfast = 1;
2419
2420	/*
2421	 * 1 - update the inode
2422	 * 1 - removing the extents in the range
2423	 * 1 - adding the hole extent if no_holes isn't set
2424	 */
2425	rsv_count = no_holes ? 2 : 3;
2426	trans = btrfs_start_transaction(root, rsv_count);
2427	if (IS_ERR(trans)) {
2428		err = PTR_ERR(trans);
2429		goto out_free;
2430	}
2431
2432	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2433				      min_size);
2434	BUG_ON(ret);
2435	trans->block_rsv = rsv;
2436
2437	cur_offset = lockstart;
2438	len = lockend - cur_offset;
2439	while (cur_offset < lockend) {
2440		ret = __btrfs_drop_extents(trans, root, inode, path,
2441					   cur_offset, lockend + 1,
2442					   &drop_end, 1, 0, 0, NULL);
2443		if (ret != -ENOSPC)
2444			break;
2445
2446		trans->block_rsv = &root->fs_info->trans_block_rsv;
2447
2448		if (cur_offset < ino_size) {
2449			ret = fill_holes(trans, inode, path, cur_offset,
2450					 drop_end);
2451			if (ret) {
2452				err = ret;
2453				break;
2454			}
2455		}
2456
2457		cur_offset = drop_end;
2458
2459		ret = btrfs_update_inode(trans, root, inode);
2460		if (ret) {
2461			err = ret;
2462			break;
2463		}
2464
2465		btrfs_end_transaction(trans, root);
2466		btrfs_btree_balance_dirty(root);
2467
2468		trans = btrfs_start_transaction(root, rsv_count);
2469		if (IS_ERR(trans)) {
2470			ret = PTR_ERR(trans);
2471			trans = NULL;
2472			break;
2473		}
2474
2475		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2476					      rsv, min_size);
2477		BUG_ON(ret);	/* shouldn't happen */
2478		trans->block_rsv = rsv;
2479
2480		ret = find_first_non_hole(inode, &cur_offset, &len);
2481		if (unlikely(ret < 0))
2482			break;
2483		if (ret && !len) {
2484			ret = 0;
2485			break;
2486		}
2487	}
2488
2489	if (ret) {
2490		err = ret;
2491		goto out_trans;
2492	}
2493
2494	trans->block_rsv = &root->fs_info->trans_block_rsv;
2495	/*
2496	 * Don't insert a hole file extent item if it's for a range beyond EOF
2497	 * (because it's useless) or if it represents a 0-byte range (when
2498	 * cur_offset == drop_end).
2499	 */
2500	if (cur_offset < ino_size && cur_offset < drop_end) {
2501		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2502		if (ret) {
2503			err = ret;
2504			goto out_trans;
2505		}
2506	}
2507
2508out_trans:
2509	if (!trans)
2510		goto out_free;
2511
2512	inode_inc_iversion(inode);
2513	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2514
2515	trans->block_rsv = &root->fs_info->trans_block_rsv;
2516	ret = btrfs_update_inode(trans, root, inode);
2517	updated_inode = true;
2518	btrfs_end_transaction(trans, root);
2519	btrfs_btree_balance_dirty(root);
2520out_free:
2521	btrfs_free_path(path);
2522	btrfs_free_block_rsv(root, rsv);
2523out:
2524	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2525			     &cached_state, GFP_NOFS);
2526out_only_mutex:
2527	if (!updated_inode && truncated_page && !ret && !err) {
2528		/*
2529		 * If we only end up zeroing part of a page, we still need to
2530		 * update the inode item, so that all the time fields are
2531		 * updated as well as the necessary btrfs inode in-memory fields
2532		 * needed to detect, at fsync time, whether the inode isn't yet in
2533		 * the log tree or is there but not up to date.
2534		 */
2535		trans = btrfs_start_transaction(root, 1);
2536		if (IS_ERR(trans)) {
2537			err = PTR_ERR(trans);
2538		} else {
2539			err = btrfs_update_inode(trans, root, inode);
2540			ret = btrfs_end_transaction(trans, root);
2541		}
2542	}
2543	mutex_unlock(&inode->i_mutex);
2544	if (ret && !err)
2545		err = ret;
2546	return err;
2547}
2548
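/*
 * Back end for the fallocate(2) system call.  Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are accepted; hole punching is handed off to
 * btrfs_punch_hole(), everything else preallocates extents for the holes in
 * the requested range.  A typical call from userspace looks like this
 * (illustrative):
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 4096, 8192);
 *
 * which punches an 8K hole at offset 4K without changing i_size.
 */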
2549static long btrfs_fallocate(struct file *file, int mode,
2550			    loff_t offset, loff_t len)
2551{
2552	struct inode *inode = file_inode(file);
2553	struct extent_state *cached_state = NULL;
2554	u64 cur_offset;
2555	u64 last_byte;
2556	u64 alloc_start;
2557	u64 alloc_end;
2558	u64 alloc_hint = 0;
2559	u64 locked_end;
2560	struct extent_map *em;
2561	int blocksize = BTRFS_I(inode)->root->sectorsize;
2562	int ret;
2563
2564	alloc_start = round_down(offset, blocksize);
2565	alloc_end = round_up(offset + len, blocksize);
2566
2567	/* Make sure we aren't being given some crap mode */
2568	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2569		return -EOPNOTSUPP;
2570
2571	if (mode & FALLOC_FL_PUNCH_HOLE)
2572		return btrfs_punch_hole(inode, offset, len);
2573
2574	/*
2575	 * Make sure we have enough space before we do the
2576	 * allocation.
2577	 */
2578	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
2579	if (ret)
2580		return ret;
2581
2582	mutex_lock(&inode->i_mutex);
2583	ret = inode_newsize_ok(inode, alloc_end);
2584	if (ret)
2585		goto out;
2586
2587	if (alloc_start > inode->i_size) {
2588		ret = btrfs_cont_expand(inode, i_size_read(inode),
2589					alloc_start);
2590		if (ret)
2591			goto out;
2592	} else {
2593		/*
2594		 * If we are fallocating from the end of the file onward we
2595		 * need to zero out the end of the page if i_size lands in the
2596		 * middle of a page.
2597		 */
2598		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2599		if (ret)
2600			goto out;
2601	}
2602
2603	/*
2604	 * wait for ordered IO before we have any locks.  We'll loop again
2605	 * below with the locks held.
2606	 */
2607	ret = btrfs_wait_ordered_range(inode, alloc_start,
2608				       alloc_end - alloc_start);
2609	if (ret)
2610		goto out;
2611
2612	locked_end = alloc_end - 1;
2613	while (1) {
2614		struct btrfs_ordered_extent *ordered;
2615
2616		/* the extent lock is ordered inside the running
2617		 * transaction
2618		 */
2619		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2620				 locked_end, 0, &cached_state);
2621		ordered = btrfs_lookup_first_ordered_extent(inode,
2622							    alloc_end - 1);
2623		if (ordered &&
2624		    ordered->file_offset + ordered->len > alloc_start &&
2625		    ordered->file_offset < alloc_end) {
2626			btrfs_put_ordered_extent(ordered);
2627			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2628					     alloc_start, locked_end,
2629					     &cached_state, GFP_NOFS);
2630			/*
2631			 * we can't wait on the range with the transaction
2632			 * running or with the extent lock held
2633			 */
2634			ret = btrfs_wait_ordered_range(inode, alloc_start,
2635						       alloc_end - alloc_start);
2636			if (ret)
2637				goto out;
2638		} else {
2639			if (ordered)
2640				btrfs_put_ordered_extent(ordered);
2641			break;
2642		}
2643	}
2644
2645	cur_offset = alloc_start;
2646	while (1) {
2647		u64 actual_end;
2648
2649		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2650				      alloc_end - cur_offset, 0);
2651		if (IS_ERR_OR_NULL(em)) {
2652			if (!em)
2653				ret = -ENOMEM;
2654			else
2655				ret = PTR_ERR(em);
2656			break;
2657		}
2658		last_byte = min(extent_map_end(em), alloc_end);
2659		actual_end = min_t(u64, extent_map_end(em), offset + len);
2660		last_byte = ALIGN(last_byte, blocksize);
2661
2662		if (em->block_start == EXTENT_MAP_HOLE ||
2663		    (cur_offset >= inode->i_size &&
2664		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2665			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2666							last_byte - cur_offset,
2667							1 << inode->i_blkbits,
2668							offset + len,
2669							&alloc_hint);
2670		} else if (actual_end > inode->i_size &&
2671			   !(mode & FALLOC_FL_KEEP_SIZE)) {
2672			struct btrfs_trans_handle *trans;
2673			struct btrfs_root *root = BTRFS_I(inode)->root;
2674
2675			/*
2676			 * We didn't need to allocate any more space, but we
2677			 * still extended the size of the file so we need to
2678			 * update i_size and the inode item.
2679			 */
2680			trans = btrfs_start_transaction(root, 1);
2681			if (IS_ERR(trans)) {
2682				ret = PTR_ERR(trans);
2683			} else {
2684				inode->i_ctime = CURRENT_TIME;
2685				i_size_write(inode, actual_end);
2686				btrfs_ordered_update_i_size(inode, actual_end,
2687							    NULL);
2688				ret = btrfs_update_inode(trans, root, inode);
2689				if (ret)
2690					btrfs_end_transaction(trans, root);
2691				else
2692					ret = btrfs_end_transaction(trans,
2693								    root);
2694			}
2695		}
2696		free_extent_map(em);
2697		if (ret < 0)
2698			break;
2699
2700		cur_offset = last_byte;
2701		if (cur_offset >= alloc_end) {
2702			ret = 0;
2703			break;
2704		}
2705	}
2706	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2707			     &cached_state, GFP_NOFS);
2708out:
2709	mutex_unlock(&inode->i_mutex);
2710	/* Let go of our reservation. */
2711	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2712	return ret;
2713}
2714
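/*
 * Helper for SEEK_DATA/SEEK_HOLE: walk the extent maps starting at *offset
 * (clamped to 0) and stop at the first hole/prealloc extent for SEEK_HOLE,
 * or at the first real data extent for SEEK_DATA, storing the result back
 * in *offset.  Returns -ENXIO when SEEK_DATA finds no data before EOF.
 */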
2715static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2716{
2717	struct btrfs_root *root = BTRFS_I(inode)->root;
2718	struct extent_map *em = NULL;
2719	struct extent_state *cached_state = NULL;
2720	u64 lockstart;
2721	u64 lockend;
2722	u64 start;
2723	u64 len;
2724	int ret = 0;
2725
2726	if (inode->i_size == 0)
2727		return -ENXIO;
2728
2729	/*
2730	 * *offset can be negative; in this case we start finding DATA/HOLE from
2731	 * the very start of the file.
2732	 */
2733	start = max_t(loff_t, 0, *offset);
2734
2735	lockstart = round_down(start, root->sectorsize);
2736	lockend = round_up(i_size_read(inode), root->sectorsize);
2737	if (lockend <= lockstart)
2738		lockend = lockstart + root->sectorsize;
2739	lockend--;
2740	len = lockend - lockstart + 1;
2741
2742	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2743			 &cached_state);
2744
2745	while (start < inode->i_size) {
2746		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2747		if (IS_ERR(em)) {
2748			ret = PTR_ERR(em);
2749			em = NULL;
2750			break;
2751		}
2752
2753		if (whence == SEEK_HOLE &&
2754		    (em->block_start == EXTENT_MAP_HOLE ||
2755		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2756			break;
2757		else if (whence == SEEK_DATA &&
2758			   (em->block_start != EXTENT_MAP_HOLE &&
2759			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2760			break;
2761
2762		start = em->start + em->len;
2763		free_extent_map(em);
2764		em = NULL;
2765		cond_resched();
2766	}
2767	free_extent_map(em);
2768	if (!ret) {
2769		if (whence == SEEK_DATA && start >= inode->i_size)
2770			ret = -ENXIO;
2771		else
2772			*offset = min_t(loff_t, start, inode->i_size);
2773	}
2774	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2775			     &cached_state, GFP_NOFS);
2776	return ret;
2777}
2778
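/*
 * llseek implementation.  SEEK_CUR and SEEK_END are handled by
 * generic_file_llseek(), SEEK_DATA and SEEK_HOLE walk the extent maps via
 * find_desired_extent(), and the final position is validated with
 * vfs_setpos().  For example, from userspace (illustrative):
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);
 *
 * returns the offset of the first data region at or after offset 0, or
 * fails with ENXIO if there is none.
 */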
2779static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2780{
2781	struct inode *inode = file->f_mapping->host;
2782	int ret;
2783
2784	mutex_lock(&inode->i_mutex);
2785	switch (whence) {
2786	case SEEK_END:
2787	case SEEK_CUR:
2788		offset = generic_file_llseek(file, offset, whence);
2789		goto out;
2790	case SEEK_DATA:
2791	case SEEK_HOLE:
2792		if (offset >= i_size_read(inode)) {
2793			mutex_unlock(&inode->i_mutex);
2794			return -ENXIO;
2795		}
2796
2797		ret = find_desired_extent(inode, &offset, whence);
2798		if (ret) {
2799			mutex_unlock(&inode->i_mutex);
2800			return ret;
2801		}
2802	}
2803
2804	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2805out:
2806	mutex_unlock(&inode->i_mutex);
2807	return offset;
2808}
2809
2810const struct file_operations btrfs_file_operations = {
2811	.llseek		= btrfs_file_llseek,
2812	.read_iter      = generic_file_read_iter,
2813	.splice_read	= generic_file_splice_read,
2814	.write_iter	= btrfs_file_write_iter,
2815	.mmap		= btrfs_file_mmap,
2816	.open		= generic_file_open,
2817	.release	= btrfs_release_file,
2818	.fsync		= btrfs_sync_file,
2819	.fallocate	= btrfs_fallocate,
2820	.unlocked_ioctl	= btrfs_ioctl,
2821#ifdef CONFIG_COMPAT
2822	.compat_ioctl	= btrfs_compat_ioctl,
2823#endif
2824};
2825
2826void btrfs_auto_defrag_exit(void)
2827{
2828	if (btrfs_inode_defrag_cachep)
2829		kmem_cache_destroy(btrfs_inode_defrag_cachep);
2830}
2831
2832int btrfs_auto_defrag_init(void)
2833{
2834	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2835					sizeof(struct inode_defrag), 0,
2836					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2837					NULL);
2838	if (!btrfs_inode_defrag_cachep)
2839		return -ENOMEM;
2840
2841	return 0;
2842}
2843
2844int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2845{
2846	int ret;
2847
2848	/*
2849	 * So with compression we will find and lock a dirty page and clear the
2850	 * first one as dirty, set up an async extent, and immediately return
2851	 * with the entire range locked but with nobody actually marked with
2852	 * writeback.  So we can't just filemap_write_and_wait_range() and
2853	 * expect it to work since it will just kick off a thread to do the
2854	 * actual work.  So we need to call filemap_fdatawrite_range _again_
2855	 * since it will wait on the page lock, which won't be unlocked until
2856	 * after the pages have been marked as writeback and so we're good to go
2857	 * from there.  We have to do this otherwise we'll miss the ordered
2858	 * extents and that results in badness.  Please Josef, do not think you
2859	 * know better and pull this out at some point in the future, it is
2860	 * right and you are wrong.
2861	 */
2862	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2863	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2864			     &BTRFS_I(inode)->runtime_flags))
2865		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2866
2867	return ret;
2868}
2869