/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/backing-dev.h>
26#include <linux/mpage.h>
27#include <linux/falloc.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/slab.h>
33#include <linux/btrfs.h>
34#include <linux/uio.h>
35#include "ctree.h"
36#include "disk-io.h"
37#include "transaction.h"
38#include "btrfs_inode.h"
39#include "print-tree.h"
40#include "tree-log.h"
41#include "locking.h"
42#include "volumes.h"
43#include "qgroup.h"
44
45static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * When auto defrag is enabled we queue up these defrag structs
 * to remember which inodes need defragging passes.
 */
51struct inode_defrag {
52	struct rb_node rb_node;
53	/* objectid */
54	u64 ino;
55	/*
56	 * transid where the defrag was added, we search for
57	 * extents newer than this
58	 */
59	u64 transid;
60
61	/* root objectid */
62	u64 root;
63
64	/* last offset we were able to defrag */
65	u64 last_offset;
66
67	/* if we've wrapped around back to zero once already */
68	int cycled;
69};
70
71static int __compare_inode_defrag(struct inode_defrag *defrag1,
72				  struct inode_defrag *defrag2)
73{
74	if (defrag1->root > defrag2->root)
75		return 1;
76	else if (defrag1->root < defrag2->root)
77		return -1;
78	else if (defrag1->ino > defrag2->ino)
79		return 1;
80	else if (defrag1->ino < defrag2->ino)
81		return -1;
82	else
83		return 0;
84}
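
/*
 * A rough sketch of the ordering the comparison above gives (numbers are
 * made up for illustration): records sort by root first, then by inode
 * number, so
 *
 *	(root 5, ino 257) < (root 5, ino 300) < (root 7, ino 256)
 *
 * and all defrag records for one subvolume end up grouped together and
 * are walked in inode number order by btrfs_run_defrag_inodes().
 */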
85
/* insert a record for an inode into the defrag tree.  The lock
 * must be held already.
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered.
 *
 * If an existing record is found, the defrag item you
 * pass in is freed.
 */
95static int __btrfs_add_inode_defrag(struct inode *inode,
96				    struct inode_defrag *defrag)
97{
98	struct btrfs_root *root = BTRFS_I(inode)->root;
99	struct inode_defrag *entry;
100	struct rb_node **p;
101	struct rb_node *parent = NULL;
102	int ret;
103
104	p = &root->fs_info->defrag_inodes.rb_node;
105	while (*p) {
106		parent = *p;
107		entry = rb_entry(parent, struct inode_defrag, rb_node);
108
109		ret = __compare_inode_defrag(defrag, entry);
110		if (ret < 0)
111			p = &parent->rb_left;
112		else if (ret > 0)
113			p = &parent->rb_right;
114		else {
115			/* if we're reinserting an entry for
116			 * an old defrag run, make sure to
117			 * lower the transid of our existing record
118			 */
119			if (defrag->transid < entry->transid)
120				entry->transid = defrag->transid;
121			if (defrag->last_offset > entry->last_offset)
122				entry->last_offset = defrag->last_offset;
123			return -EEXIST;
124		}
125	}
126	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
127	rb_link_node(&defrag->rb_node, parent, p);
128	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
129	return 0;
130}
131
132static inline int __need_auto_defrag(struct btrfs_root *root)
133{
134	if (!btrfs_test_opt(root, AUTO_DEFRAG))
135		return 0;
136
137	if (btrfs_fs_closing(root->fs_info))
138		return 0;
139
140	return 1;
141}
142
143/*
144 * insert a defrag record for this inode if auto defrag is
145 * enabled
146 */
147int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
148			   struct inode *inode)
149{
150	struct btrfs_root *root = BTRFS_I(inode)->root;
151	struct inode_defrag *defrag;
152	u64 transid;
153	int ret;
154
155	if (!__need_auto_defrag(root))
156		return 0;
157
158	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
159		return 0;
160
161	if (trans)
162		transid = trans->transid;
163	else
164		transid = BTRFS_I(inode)->root->last_trans;
165
166	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
167	if (!defrag)
168		return -ENOMEM;
169
170	defrag->ino = btrfs_ino(inode);
171	defrag->transid = transid;
172	defrag->root = root->root_key.objectid;
173
174	spin_lock(&root->fs_info->defrag_inodes_lock);
175	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then evict the inode from
		 * memory and re-read it, the new in-memory inode won't have
		 * the IN_DEFRAG flag set.  In that case we may find an
		 * existing record for it in the defrag tree.
		 */
181		ret = __btrfs_add_inode_defrag(inode, defrag);
182		if (ret)
183			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
184	} else {
185		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
186	}
187	spin_unlock(&root->fs_info->defrag_inodes_lock);
188	return 0;
189}
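
/*
 * Rough usage sketch (illustrative only, not lifted from a real caller):
 * a write path that notices small fragmented extents could queue the
 * inode with something like
 *
 *	if (inode_looks_fragmented)
 *		btrfs_add_inode_defrag(NULL, inode);
 *
 * where inode_looks_fragmented stands for whatever heuristic the caller
 * uses.  Passing a NULL transaction handle makes the record use the
 * root's last_trans as the "only defrag extents newer than this" cutoff.
 */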
190
191/*
192 * Requeue the defrag object. If there is a defrag object that points to
193 * the same inode in the tree, we will merge them together (by
194 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
195 */
196static void btrfs_requeue_inode_defrag(struct inode *inode,
197				       struct inode_defrag *defrag)
198{
199	struct btrfs_root *root = BTRFS_I(inode)->root;
200	int ret;
201
202	if (!__need_auto_defrag(root))
203		goto out;
204
	/*
	 * Here we don't check the IN_DEFRAG flag, because we need to merge
	 * the records together.
	 */
209	spin_lock(&root->fs_info->defrag_inodes_lock);
210	ret = __btrfs_add_inode_defrag(inode, defrag);
211	spin_unlock(&root->fs_info->defrag_inodes_lock);
212	if (ret)
213		goto out;
214	return;
215out:
216	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
217}
218
/*
 * Pick the defrag record for the inode we want to defrag; if there isn't
 * one, we will get the next record in the tree.
 */
223static struct inode_defrag *
224btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
225{
226	struct inode_defrag *entry = NULL;
227	struct inode_defrag tmp;
228	struct rb_node *p;
229	struct rb_node *parent = NULL;
230	int ret;
231
232	tmp.ino = ino;
233	tmp.root = root;
234
235	spin_lock(&fs_info->defrag_inodes_lock);
236	p = fs_info->defrag_inodes.rb_node;
237	while (p) {
238		parent = p;
239		entry = rb_entry(parent, struct inode_defrag, rb_node);
240
241		ret = __compare_inode_defrag(&tmp, entry);
242		if (ret < 0)
243			p = parent->rb_left;
244		else if (ret > 0)
245			p = parent->rb_right;
246		else
247			goto out;
248	}
249
250	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
251		parent = rb_next(parent);
252		if (parent)
253			entry = rb_entry(parent, struct inode_defrag, rb_node);
254		else
255			entry = NULL;
256	}
257out:
258	if (entry)
259		rb_erase(parent, &fs_info->defrag_inodes);
260	spin_unlock(&fs_info->defrag_inodes_lock);
261	return entry;
262}
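
/*
 * Example of the "get the next one" behaviour above (made-up numbers):
 * if the tree holds records for (root 5, ino 258) and (root 5, ino 300)
 * and we ask for (root 5, ino 260), no exact match exists, so the search
 * ends on one of the neighbouring records; if that record sorts before
 * (5, 260) the rb_next() step advances past it, and either way the
 * (5, 300) record is what gets erased from the tree and returned.
 */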
263
264void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
265{
266	struct inode_defrag *defrag;
267	struct rb_node *node;
268
269	spin_lock(&fs_info->defrag_inodes_lock);
270	node = rb_first(&fs_info->defrag_inodes);
271	while (node) {
272		rb_erase(node, &fs_info->defrag_inodes);
273		defrag = rb_entry(node, struct inode_defrag, rb_node);
274		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
275
276		cond_resched_lock(&fs_info->defrag_inodes_lock);
277
278		node = rb_first(&fs_info->defrag_inodes);
279	}
280	spin_unlock(&fs_info->defrag_inodes_lock);
281}
282
283#define BTRFS_DEFRAG_BATCH	1024
284
285static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
286				    struct inode_defrag *defrag)
287{
288	struct btrfs_root *inode_root;
289	struct inode *inode;
290	struct btrfs_key key;
291	struct btrfs_ioctl_defrag_range_args range;
292	int num_defrag;
293	int index;
294	int ret;
295
296	/* get the inode */
297	key.objectid = defrag->root;
298	key.type = BTRFS_ROOT_ITEM_KEY;
299	key.offset = (u64)-1;
300
301	index = srcu_read_lock(&fs_info->subvol_srcu);
302
303	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
304	if (IS_ERR(inode_root)) {
305		ret = PTR_ERR(inode_root);
306		goto cleanup;
307	}
308
309	key.objectid = defrag->ino;
310	key.type = BTRFS_INODE_ITEM_KEY;
311	key.offset = 0;
312	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
313	if (IS_ERR(inode)) {
314		ret = PTR_ERR(inode);
315		goto cleanup;
316	}
317	srcu_read_unlock(&fs_info->subvol_srcu, index);
318
319	/* do a chunk of defrag */
320	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
321	memset(&range, 0, sizeof(range));
322	range.len = (u64)-1;
323	range.start = defrag->last_offset;
324
325	sb_start_write(fs_info->sb);
326	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
327				       BTRFS_DEFRAG_BATCH);
328	sb_end_write(fs_info->sb);
329	/*
330	 * if we filled the whole defrag batch, there
331	 * must be more work to do.  Queue this defrag
332	 * again
333	 */
334	if (num_defrag == BTRFS_DEFRAG_BATCH) {
335		defrag->last_offset = range.start;
336		btrfs_requeue_inode_defrag(inode, defrag);
337	} else if (defrag->last_offset && !defrag->cycled) {
338		/*
339		 * we didn't fill our defrag batch, but
340		 * we didn't start at zero.  Make sure we loop
341		 * around to the start of the file.
342		 */
343		defrag->last_offset = 0;
344		defrag->cycled = 1;
345		btrfs_requeue_inode_defrag(inode, defrag);
346	} else {
347		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
348	}
349
350	iput(inode);
351	return 0;
352cleanup:
353	srcu_read_unlock(&fs_info->subvol_srcu, index);
354	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
355	return ret;
356}
357
358/*
359 * run through the list of inodes in the FS that need
360 * defragging
361 */
362int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
363{
364	struct inode_defrag *defrag;
365	u64 first_ino = 0;
366	u64 root_objectid = 0;
367
368	atomic_inc(&fs_info->defrag_running);
369	while (1) {
370		/* Pause the auto defragger. */
371		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
372			     &fs_info->fs_state))
373			break;
374
375		if (!__need_auto_defrag(fs_info->tree_root))
376			break;
377
378		/* find an inode to defrag */
379		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
380						 first_ino);
381		if (!defrag) {
382			if (root_objectid || first_ino) {
383				root_objectid = 0;
384				first_ino = 0;
385				continue;
386			} else {
387				break;
388			}
389		}
390
391		first_ino = defrag->ino + 1;
392		root_objectid = defrag->root;
393
394		__btrfs_run_defrag_inode(fs_info, defrag);
395	}
396	atomic_dec(&fs_info->defrag_running);
397
398	/*
399	 * during unmount, we use the transaction_wait queue to
400	 * wait for the defragger to stop
401	 */
402	wake_up(&fs_info->transaction_wait);
403	return 0;
404}
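
/*
 * The (root_objectid, first_ino) pair above acts as a cursor: after each
 * record is processed we continue searching just past it (ino + 1), and
 * when the search comes up empty with a non-zero cursor we reset it to
 * (0, 0) and make one more pass to pick up records queued behind it.
 */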
405
406/* simple helper to fault in pages and copy.  This should go away
407 * and be replaced with calls into generic code.
408 */
409static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
410					 size_t write_bytes,
411					 struct page **prepared_pages,
412					 struct iov_iter *i)
413{
414	size_t copied = 0;
415	size_t total_copied = 0;
416	int pg = 0;
417	int offset = pos & (PAGE_CACHE_SIZE - 1);
418
419	while (write_bytes > 0) {
420		size_t count = min_t(size_t,
421				     PAGE_CACHE_SIZE - offset, write_bytes);
422		struct page *page = prepared_pages[pg];
423		/*
424		 * Copy data from userspace to the current page
425		 */
426		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
427
428		/* Flush processor's dcache for this page */
429		flush_dcache_page(page);
430
431		/*
432		 * if we get a partial write, we can end up with
433		 * partially up to date pages.  These add
434		 * a lot of complexity, so make sure they don't
435		 * happen by forcing this copy to be retried.
436		 *
437		 * The rest of the btrfs_file_write code will fall
438		 * back to page at a time copies after we return 0.
439		 */
440		if (!PageUptodate(page) && copied < count)
441			copied = 0;
442
443		iov_iter_advance(i, copied);
444		write_bytes -= copied;
445		total_copied += copied;
446
447		/* Return to btrfs_file_write_iter to fault page */
448		if (unlikely(copied == 0))
449			break;
450
451		if (copied < PAGE_CACHE_SIZE - offset) {
452			offset += copied;
453		} else {
454			pg++;
455			offset = 0;
456		}
457	}
458	return total_copied;
459}
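
/*
 * A small worked example of the offset math above, assuming a 4K
 * PAGE_CACHE_SIZE: for pos = 6144 and write_bytes = 7168, offset starts
 * at 2048, so the loop copies 2048 bytes into the tail of page 0, then
 * 4096 bytes into page 1, then the remaining 1024 bytes into the head
 * of page 2.
 */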
460
461/*
462 * unlocks pages after btrfs_file_write is done with them
463 */
464static void btrfs_drop_pages(struct page **pages, size_t num_pages)
465{
466	size_t i;
467	for (i = 0; i < num_pages; i++) {
		/* PageChecked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here.  There should be no
		 * need to mark the pages accessed, as prepare_pages should
		 * have marked them accessed via find_or_create_page().
		 */
474		ClearPageChecked(pages[i]);
475		unlock_page(pages[i]);
476		page_cache_release(pages[i]);
477	}
478}
479
480/*
481 * after copy_from_user, pages need to be dirtied and we need to make
482 * sure holes are created between the current EOF and the start of
483 * any next extents (if required).
484 *
485 * this also makes the decision about creating an inline extent vs
486 * doing real data extents, marking pages dirty and delalloc as required.
487 */
488int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
489			     struct page **pages, size_t num_pages,
490			     loff_t pos, size_t write_bytes,
491			     struct extent_state **cached)
492{
493	int err = 0;
494	int i;
495	u64 num_bytes;
496	u64 start_pos;
497	u64 end_of_last_block;
498	u64 end_pos = pos + write_bytes;
499	loff_t isize = i_size_read(inode);
500
501	start_pos = pos & ~((u64)root->sectorsize - 1);
502	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
503
504	end_of_last_block = start_pos + num_bytes - 1;
505	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
506					cached);
507	if (err)
508		return err;
509
510	for (i = 0; i < num_pages; i++) {
511		struct page *p = pages[i];
512		SetPageUptodate(p);
513		ClearPageChecked(p);
514		set_page_dirty(p);
515	}
516
517	/*
518	 * we've only changed i_size in ram, and we haven't updated
519	 * the disk i_size.  There is no need to log the inode
520	 * at this time.
521	 */
522	if (end_pos > isize)
523		i_size_write(inode, end_pos);
524	return 0;
525}
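
/*
 * Worked example of the alignment above, assuming a 4K sectorsize:
 * pos = 5000 and write_bytes = 4000 give start_pos = 4096 and
 * num_bytes = ALIGN(4000 + 5000 - 4096, 4096) = 8192, so delalloc is
 * set on [4096, 12287], i.e. the two full sectors the write touches.
 */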
526
527/*
528 * this drops all the extents in the cache that intersect the range
529 * [start, end].  Existing extents are split as required.
530 */
531void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
532			     int skip_pinned)
533{
534	struct extent_map *em;
535	struct extent_map *split = NULL;
536	struct extent_map *split2 = NULL;
537	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
538	u64 len = end - start + 1;
539	u64 gen;
540	int ret;
541	int testend = 1;
542	unsigned long flags;
543	int compressed = 0;
544	bool modified;
545
546	WARN_ON(end < start);
547	if (end == (u64)-1) {
548		len = (u64)-1;
549		testend = 0;
550	}
551	while (1) {
552		int no_splits = 0;
553
554		modified = false;
555		if (!split)
556			split = alloc_extent_map();
557		if (!split2)
558			split2 = alloc_extent_map();
559		if (!split || !split2)
560			no_splits = 1;
561
562		write_lock(&em_tree->lock);
563		em = lookup_extent_mapping(em_tree, start, len);
564		if (!em) {
565			write_unlock(&em_tree->lock);
566			break;
567		}
568		flags = em->flags;
569		gen = em->generation;
570		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
571			if (testend && em->start + em->len >= start + len) {
572				free_extent_map(em);
573				write_unlock(&em_tree->lock);
574				break;
575			}
576			start = em->start + em->len;
577			if (testend)
578				len = start + len - (em->start + em->len);
579			free_extent_map(em);
580			write_unlock(&em_tree->lock);
581			continue;
582		}
583		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
584		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
585		clear_bit(EXTENT_FLAG_LOGGING, &flags);
586		modified = !list_empty(&em->list);
587		if (no_splits)
588			goto next;
589
590		if (em->start < start) {
591			split->start = em->start;
592			split->len = start - em->start;
593
594			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
595				split->orig_start = em->orig_start;
596				split->block_start = em->block_start;
597
598				if (compressed)
599					split->block_len = em->block_len;
600				else
601					split->block_len = split->len;
602				split->orig_block_len = max(split->block_len,
603						em->orig_block_len);
604				split->ram_bytes = em->ram_bytes;
605			} else {
606				split->orig_start = split->start;
607				split->block_len = 0;
608				split->block_start = em->block_start;
609				split->orig_block_len = 0;
610				split->ram_bytes = split->len;
611			}
612
613			split->generation = gen;
614			split->bdev = em->bdev;
615			split->flags = flags;
616			split->compress_type = em->compress_type;
617			replace_extent_mapping(em_tree, em, split, modified);
618			free_extent_map(split);
619			split = split2;
620			split2 = NULL;
621		}
622		if (testend && em->start + em->len > start + len) {
623			u64 diff = start + len - em->start;
624
625			split->start = start + len;
626			split->len = em->start + em->len - (start + len);
627			split->bdev = em->bdev;
628			split->flags = flags;
629			split->compress_type = em->compress_type;
630			split->generation = gen;
631
632			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
633				split->orig_block_len = max(em->block_len,
634						    em->orig_block_len);
635
636				split->ram_bytes = em->ram_bytes;
637				if (compressed) {
638					split->block_len = em->block_len;
639					split->block_start = em->block_start;
640					split->orig_start = em->orig_start;
641				} else {
642					split->block_len = split->len;
643					split->block_start = em->block_start
644						+ diff;
645					split->orig_start = em->orig_start;
646				}
647			} else {
648				split->ram_bytes = split->len;
649				split->orig_start = split->start;
650				split->block_len = 0;
651				split->block_start = em->block_start;
652				split->orig_block_len = 0;
653			}
654
655			if (extent_map_in_tree(em)) {
656				replace_extent_mapping(em_tree, em, split,
657						       modified);
658			} else {
659				ret = add_extent_mapping(em_tree, split,
660							 modified);
661				ASSERT(ret == 0); /* Logic error */
662			}
663			free_extent_map(split);
664			split = NULL;
665		}
666next:
667		if (extent_map_in_tree(em))
668			remove_extent_mapping(em_tree, em);
669		write_unlock(&em_tree->lock);
670
671		/* once for us */
672		free_extent_map(em);
		/* once for the tree */
674		free_extent_map(em);
675	}
676	if (split)
677		free_extent_map(split);
678	if (split2)
679		free_extent_map(split2);
680}
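
/*
 * Sketch of the splitting above with made-up numbers: for an extent map
 * covering [0, 128K) and a drop range of [32K, 96K), the first split
 * keeps [0, 32K), the second keeps [96K, 128K), and the original map is
 * removed; block_start/block_len are adjusted so each piece still points
 * at the right part of the on-disk extent (compressed extents keep the
 * full block_len since the compressed data must be referenced whole).
 */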
681
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
691int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
692			 struct btrfs_root *root, struct inode *inode,
693			 struct btrfs_path *path, u64 start, u64 end,
694			 u64 *drop_end, int drop_cache,
695			 int replace_extent,
696			 u32 extent_item_size,
697			 int *key_inserted)
698{
699	struct extent_buffer *leaf;
700	struct btrfs_file_extent_item *fi;
701	struct btrfs_key key;
702	struct btrfs_key new_key;
703	u64 ino = btrfs_ino(inode);
704	u64 search_start = start;
705	u64 disk_bytenr = 0;
706	u64 num_bytes = 0;
707	u64 extent_offset = 0;
708	u64 extent_end = 0;
709	int del_nr = 0;
710	int del_slot = 0;
711	int extent_type;
712	int recow;
713	int ret;
714	int modify_tree = -1;
715	int update_refs;
716	int found = 0;
717	int leafs_visited = 0;
718
719	if (drop_cache)
720		btrfs_drop_extent_cache(inode, start, end - 1, 0);
721
722	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
723		modify_tree = 0;
724
725	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
726		       root == root->fs_info->tree_root);
727	while (1) {
728		recow = 0;
729		ret = btrfs_lookup_file_extent(trans, root, path, ino,
730					       search_start, modify_tree);
731		if (ret < 0)
732			break;
733		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
734			leaf = path->nodes[0];
735			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
736			if (key.objectid == ino &&
737			    key.type == BTRFS_EXTENT_DATA_KEY)
738				path->slots[0]--;
739		}
740		ret = 0;
741		leafs_visited++;
742next_slot:
743		leaf = path->nodes[0];
744		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
745			BUG_ON(del_nr > 0);
746			ret = btrfs_next_leaf(root, path);
747			if (ret < 0)
748				break;
749			if (ret > 0) {
750				ret = 0;
751				break;
752			}
753			leafs_visited++;
754			leaf = path->nodes[0];
755			recow = 1;
756		}
757
758		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
759
760		if (key.objectid > ino)
761			break;
762		if (WARN_ON_ONCE(key.objectid < ino) ||
763		    key.type < BTRFS_EXTENT_DATA_KEY) {
764			ASSERT(del_nr == 0);
765			path->slots[0]++;
766			goto next_slot;
767		}
768		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
769			break;
770
771		fi = btrfs_item_ptr(leaf, path->slots[0],
772				    struct btrfs_file_extent_item);
773		extent_type = btrfs_file_extent_type(leaf, fi);
774
775		if (extent_type == BTRFS_FILE_EXTENT_REG ||
776		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
777			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
778			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
779			extent_offset = btrfs_file_extent_offset(leaf, fi);
780			extent_end = key.offset +
781				btrfs_file_extent_num_bytes(leaf, fi);
782		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
783			extent_end = key.offset +
784				btrfs_file_extent_inline_len(leaf,
785						     path->slots[0], fi);
786		} else {
787			/* can't happen */
788			BUG();
789		}
790
		/*
		 * Don't skip extent items representing 0 byte lengths.  They
		 * used to be created (due to a bug) when we hit an -ENOSPC
		 * condition while punching holes.  So if we find one here,
		 * just ensure we delete it, otherwise we would insert a new
		 * file extent item with the same key (offset) as that 0 byte
		 * length file extent item in the call to
		 * setup_items_for_insert() later in this function.
		 */
800		if (extent_end == key.offset && extent_end >= search_start)
801			goto delete_extent_item;
802
803		if (extent_end <= search_start) {
804			path->slots[0]++;
805			goto next_slot;
806		}
807
808		found = 1;
809		search_start = max(key.offset, start);
810		if (recow || !modify_tree) {
811			modify_tree = -1;
812			btrfs_release_path(path);
813			continue;
814		}
815
816		/*
817		 *     | - range to drop - |
818		 *  | -------- extent -------- |
819		 */
820		if (start > key.offset && end < extent_end) {
821			BUG_ON(del_nr > 0);
822			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
823				ret = -EOPNOTSUPP;
824				break;
825			}
826
827			memcpy(&new_key, &key, sizeof(new_key));
828			new_key.offset = start;
829			ret = btrfs_duplicate_item(trans, root, path,
830						   &new_key);
831			if (ret == -EAGAIN) {
832				btrfs_release_path(path);
833				continue;
834			}
835			if (ret < 0)
836				break;
837
838			leaf = path->nodes[0];
839			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
840					    struct btrfs_file_extent_item);
841			btrfs_set_file_extent_num_bytes(leaf, fi,
842							start - key.offset);
843
844			fi = btrfs_item_ptr(leaf, path->slots[0],
845					    struct btrfs_file_extent_item);
846
847			extent_offset += start - key.offset;
848			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
849			btrfs_set_file_extent_num_bytes(leaf, fi,
850							extent_end - start);
851			btrfs_mark_buffer_dirty(leaf);
852
853			if (update_refs && disk_bytenr > 0) {
854				ret = btrfs_inc_extent_ref(trans, root,
855						disk_bytenr, num_bytes, 0,
856						root->root_key.objectid,
857						new_key.objectid,
858						start - extent_offset);
859				BUG_ON(ret); /* -ENOMEM */
860			}
861			key.offset = start;
862		}
863		/*
864		 *  | ---- range to drop ----- |
865		 *      | -------- extent -------- |
866		 */
867		if (start <= key.offset && end < extent_end) {
868			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
869				ret = -EOPNOTSUPP;
870				break;
871			}
872
873			memcpy(&new_key, &key, sizeof(new_key));
874			new_key.offset = end;
875			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
876
877			extent_offset += end - key.offset;
878			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
879			btrfs_set_file_extent_num_bytes(leaf, fi,
880							extent_end - end);
881			btrfs_mark_buffer_dirty(leaf);
882			if (update_refs && disk_bytenr > 0)
883				inode_sub_bytes(inode, end - key.offset);
884			break;
885		}
886
887		search_start = extent_end;
888		/*
889		 *       | ---- range to drop ----- |
890		 *  | -------- extent -------- |
891		 */
892		if (start > key.offset && end >= extent_end) {
893			BUG_ON(del_nr > 0);
894			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
895				ret = -EOPNOTSUPP;
896				break;
897			}
898
899			btrfs_set_file_extent_num_bytes(leaf, fi,
900							start - key.offset);
901			btrfs_mark_buffer_dirty(leaf);
902			if (update_refs && disk_bytenr > 0)
903				inode_sub_bytes(inode, extent_end - start);
904			if (end == extent_end)
905				break;
906
907			path->slots[0]++;
908			goto next_slot;
909		}
910
911		/*
912		 *  | ---- range to drop ----- |
913		 *    | ------ extent ------ |
914		 */
915		if (start <= key.offset && end >= extent_end) {
916delete_extent_item:
917			if (del_nr == 0) {
918				del_slot = path->slots[0];
919				del_nr = 1;
920			} else {
921				BUG_ON(del_slot + del_nr != path->slots[0]);
922				del_nr++;
923			}
924
925			if (update_refs &&
926			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
927				inode_sub_bytes(inode,
928						extent_end - key.offset);
929				extent_end = ALIGN(extent_end,
930						   root->sectorsize);
931			} else if (update_refs && disk_bytenr > 0) {
932				ret = btrfs_free_extent(trans, root,
933						disk_bytenr, num_bytes, 0,
934						root->root_key.objectid,
935						key.objectid, key.offset -
936						extent_offset);
937				BUG_ON(ret); /* -ENOMEM */
938				inode_sub_bytes(inode,
939						extent_end - key.offset);
940			}
941
942			if (end == extent_end)
943				break;
944
945			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
946				path->slots[0]++;
947				goto next_slot;
948			}
949
950			ret = btrfs_del_items(trans, root, path, del_slot,
951					      del_nr);
952			if (ret) {
953				btrfs_abort_transaction(trans, root, ret);
954				break;
955			}
956
957			del_nr = 0;
958			del_slot = 0;
959
960			btrfs_release_path(path);
961			continue;
962		}
963
964		BUG_ON(1);
965	}
966
967	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to the first slot, so that after the
		 * delete, if items are moved off from our leaf to its
		 * immediate left or right neighbor leaves, we end up with a
		 * correct and adjusted path->slots[0] for our insertion (if
		 * replace_extent != 0).
		 */
974		path->slots[0] = del_slot;
975		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
976		if (ret)
977			btrfs_abort_transaction(trans, root, ret);
978	}
979
980	leaf = path->nodes[0];
981	/*
982	 * If btrfs_del_items() was called, it might have deleted a leaf, in
983	 * which case it unlocked our path, so check path->locks[0] matches a
984	 * write lock.
985	 */
986	if (!ret && replace_extent && leafs_visited == 1 &&
987	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
988	     path->locks[0] == BTRFS_WRITE_LOCK) &&
989	    btrfs_leaf_free_space(root, leaf) >=
990	    sizeof(struct btrfs_item) + extent_item_size) {
991
992		key.objectid = ino;
993		key.type = BTRFS_EXTENT_DATA_KEY;
994		key.offset = start;
995		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
996			struct btrfs_key slot_key;
997
998			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
999			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1000				path->slots[0]++;
1001		}
1002		setup_items_for_insert(root, path, &key,
1003				       &extent_item_size,
1004				       extent_item_size,
1005				       sizeof(struct btrfs_item) +
1006				       extent_item_size, 1);
1007		*key_inserted = 1;
1008	}
1009
1010	if (!replace_extent || !(*key_inserted))
1011		btrfs_release_path(path);
1012	if (drop_end)
1013		*drop_end = found ? min(end, extent_end) : end;
1014	return ret;
1015}
1016
1017int btrfs_drop_extents(struct btrfs_trans_handle *trans,
1018		       struct btrfs_root *root, struct inode *inode, u64 start,
1019		       u64 end, int drop_cache)
1020{
1021	struct btrfs_path *path;
1022	int ret;
1023
1024	path = btrfs_alloc_path();
1025	if (!path)
1026		return -ENOMEM;
1027	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1028				   drop_cache, 0, 0, NULL);
1029	btrfs_free_path(path);
1030	return ret;
1031}
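
/*
 * Typical call pattern for the wrapper above (a sketch, not lifted from
 * a real caller): drop whatever overlaps the range first, then insert
 * the replacement file extent item, e.g.
 *
 *	ret = btrfs_drop_extents(trans, root, inode, start, end, 1);
 *	if (!ret)
 *		ret = insert_replacement_extent(trans, inode, start);
 *
 * where insert_replacement_extent() is a hypothetical helper standing in
 * for whatever the caller does next.
 */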
1032
1033static int extent_mergeable(struct extent_buffer *leaf, int slot,
1034			    u64 objectid, u64 bytenr, u64 orig_offset,
1035			    u64 *start, u64 *end)
1036{
1037	struct btrfs_file_extent_item *fi;
1038	struct btrfs_key key;
1039	u64 extent_end;
1040
1041	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1042		return 0;
1043
1044	btrfs_item_key_to_cpu(leaf, &key, slot);
1045	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1046		return 0;
1047
1048	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1049	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1050	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1051	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1052	    btrfs_file_extent_compression(leaf, fi) ||
1053	    btrfs_file_extent_encryption(leaf, fi) ||
1054	    btrfs_file_extent_other_encoding(leaf, fi))
1055		return 0;
1056
1057	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1058	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1059		return 0;
1060
1061	*start = key.offset;
1062	*end = extent_end;
1063	return 1;
1064}
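
/*
 * Note on the *start / *end parameters above: when non-zero they act as
 * expected values (the neighbouring extent must start or end exactly
 * there), and on success they are updated to the neighbour's
 * [key.offset, extent_end) range so the caller can extend its own
 * extent over it.
 */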
1065
/*
 * Mark the extent in the range start - end as written.
 *
 * This changes the extent type from 'pre-allocated' to 'regular'.  If only
 * part of the extent is marked as written, the extent will be split into
 * two or three extents.
 */
1073int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1074			      struct inode *inode, u64 start, u64 end)
1075{
1076	struct btrfs_root *root = BTRFS_I(inode)->root;
1077	struct extent_buffer *leaf;
1078	struct btrfs_path *path;
1079	struct btrfs_file_extent_item *fi;
1080	struct btrfs_key key;
1081	struct btrfs_key new_key;
1082	u64 bytenr;
1083	u64 num_bytes;
1084	u64 extent_end;
1085	u64 orig_offset;
1086	u64 other_start;
1087	u64 other_end;
1088	u64 split;
1089	int del_nr = 0;
1090	int del_slot = 0;
1091	int recow;
1092	int ret;
1093	u64 ino = btrfs_ino(inode);
1094
1095	path = btrfs_alloc_path();
1096	if (!path)
1097		return -ENOMEM;
1098again:
1099	recow = 0;
1100	split = start;
1101	key.objectid = ino;
1102	key.type = BTRFS_EXTENT_DATA_KEY;
1103	key.offset = split;
1104
1105	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1106	if (ret < 0)
1107		goto out;
1108	if (ret > 0 && path->slots[0] > 0)
1109		path->slots[0]--;
1110
1111	leaf = path->nodes[0];
1112	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1113	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
1114	fi = btrfs_item_ptr(leaf, path->slots[0],
1115			    struct btrfs_file_extent_item);
1116	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
1117	       BTRFS_FILE_EXTENT_PREALLOC);
1118	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1119	BUG_ON(key.offset > start || extent_end < end);
1120
1121	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1122	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1123	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1124	memcpy(&new_key, &key, sizeof(new_key));
1125
1126	if (start == key.offset && end < extent_end) {
1127		other_start = 0;
1128		other_end = start;
1129		if (extent_mergeable(leaf, path->slots[0] - 1,
1130				     ino, bytenr, orig_offset,
1131				     &other_start, &other_end)) {
1132			new_key.offset = end;
1133			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1134			fi = btrfs_item_ptr(leaf, path->slots[0],
1135					    struct btrfs_file_extent_item);
1136			btrfs_set_file_extent_generation(leaf, fi,
1137							 trans->transid);
1138			btrfs_set_file_extent_num_bytes(leaf, fi,
1139							extent_end - end);
1140			btrfs_set_file_extent_offset(leaf, fi,
1141						     end - orig_offset);
1142			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1143					    struct btrfs_file_extent_item);
1144			btrfs_set_file_extent_generation(leaf, fi,
1145							 trans->transid);
1146			btrfs_set_file_extent_num_bytes(leaf, fi,
1147							end - other_start);
1148			btrfs_mark_buffer_dirty(leaf);
1149			goto out;
1150		}
1151	}
1152
1153	if (start > key.offset && end == extent_end) {
1154		other_start = end;
1155		other_end = 0;
1156		if (extent_mergeable(leaf, path->slots[0] + 1,
1157				     ino, bytenr, orig_offset,
1158				     &other_start, &other_end)) {
1159			fi = btrfs_item_ptr(leaf, path->slots[0],
1160					    struct btrfs_file_extent_item);
1161			btrfs_set_file_extent_num_bytes(leaf, fi,
1162							start - key.offset);
1163			btrfs_set_file_extent_generation(leaf, fi,
1164							 trans->transid);
1165			path->slots[0]++;
1166			new_key.offset = start;
1167			btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1168
1169			fi = btrfs_item_ptr(leaf, path->slots[0],
1170					    struct btrfs_file_extent_item);
1171			btrfs_set_file_extent_generation(leaf, fi,
1172							 trans->transid);
1173			btrfs_set_file_extent_num_bytes(leaf, fi,
1174							other_end - start);
1175			btrfs_set_file_extent_offset(leaf, fi,
1176						     start - orig_offset);
1177			btrfs_mark_buffer_dirty(leaf);
1178			goto out;
1179		}
1180	}
1181
1182	while (start > key.offset || end < extent_end) {
1183		if (key.offset == start)
1184			split = end;
1185
1186		new_key.offset = split;
1187		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1188		if (ret == -EAGAIN) {
1189			btrfs_release_path(path);
1190			goto again;
1191		}
1192		if (ret < 0) {
1193			btrfs_abort_transaction(trans, root, ret);
1194			goto out;
1195		}
1196
1197		leaf = path->nodes[0];
1198		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1199				    struct btrfs_file_extent_item);
1200		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1201		btrfs_set_file_extent_num_bytes(leaf, fi,
1202						split - key.offset);
1203
1204		fi = btrfs_item_ptr(leaf, path->slots[0],
1205				    struct btrfs_file_extent_item);
1206
1207		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1208		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1209		btrfs_set_file_extent_num_bytes(leaf, fi,
1210						extent_end - split);
1211		btrfs_mark_buffer_dirty(leaf);
1212
1213		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1214					   root->root_key.objectid,
1215					   ino, orig_offset);
1216		BUG_ON(ret); /* -ENOMEM */
1217
1218		if (split == start) {
1219			key.offset = start;
1220		} else {
1221			BUG_ON(start != key.offset);
1222			path->slots[0]--;
1223			extent_end = end;
1224		}
1225		recow = 1;
1226	}
1227
1228	other_start = end;
1229	other_end = 0;
1230	if (extent_mergeable(leaf, path->slots[0] + 1,
1231			     ino, bytenr, orig_offset,
1232			     &other_start, &other_end)) {
1233		if (recow) {
1234			btrfs_release_path(path);
1235			goto again;
1236		}
1237		extent_end = other_end;
1238		del_slot = path->slots[0] + 1;
1239		del_nr++;
1240		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1241					0, root->root_key.objectid,
1242					ino, orig_offset);
1243		BUG_ON(ret); /* -ENOMEM */
1244	}
1245	other_start = 0;
1246	other_end = start;
1247	if (extent_mergeable(leaf, path->slots[0] - 1,
1248			     ino, bytenr, orig_offset,
1249			     &other_start, &other_end)) {
1250		if (recow) {
1251			btrfs_release_path(path);
1252			goto again;
1253		}
1254		key.offset = other_start;
1255		del_slot = path->slots[0];
1256		del_nr++;
1257		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1258					0, root->root_key.objectid,
1259					ino, orig_offset);
1260		BUG_ON(ret); /* -ENOMEM */
1261	}
1262	if (del_nr == 0) {
1263		fi = btrfs_item_ptr(leaf, path->slots[0],
1264			   struct btrfs_file_extent_item);
1265		btrfs_set_file_extent_type(leaf, fi,
1266					   BTRFS_FILE_EXTENT_REG);
1267		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1268		btrfs_mark_buffer_dirty(leaf);
1269	} else {
1270		fi = btrfs_item_ptr(leaf, del_slot - 1,
1271			   struct btrfs_file_extent_item);
1272		btrfs_set_file_extent_type(leaf, fi,
1273					   BTRFS_FILE_EXTENT_REG);
1274		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1275		btrfs_set_file_extent_num_bytes(leaf, fi,
1276						extent_end - key.offset);
1277		btrfs_mark_buffer_dirty(leaf);
1278
1279		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1280		if (ret < 0) {
1281			btrfs_abort_transaction(trans, root, ret);
1282			goto out;
1283		}
1284	}
1285out:
1286	btrfs_free_path(path);
1287	return 0;
1288}
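
/*
 * Sketch of the split behaviour above with made-up offsets: writing
 * [64K, 96K) into a prealloc extent covering [0, 128K) leaves three file
 * extent items: [0, 64K) and [96K, 128K) stay prealloc, while [64K, 96K)
 * is flipped to BTRFS_FILE_EXTENT_REG; all three point into the same
 * on-disk extent via different offsets.
 */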
1289
/*
 * on error we return an unlocked page and the error value;
 * on success we return a locked page and 0
 */
1294static int prepare_uptodate_page(struct inode *inode,
1295				 struct page *page, u64 pos,
1296				 bool force_uptodate)
1297{
1298	int ret = 0;
1299
1300	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1301	    !PageUptodate(page)) {
1302		ret = btrfs_readpage(NULL, page);
1303		if (ret)
1304			return ret;
1305		lock_page(page);
1306		if (!PageUptodate(page)) {
1307			unlock_page(page);
1308			return -EIO;
1309		}
1310		if (page->mapping != inode->i_mapping) {
1311			unlock_page(page);
1312			return -EAGAIN;
1313		}
1314	}
1315	return 0;
1316}
1317
1318/*
1319 * this just gets pages into the page cache and locks them down.
1320 */
1321static noinline int prepare_pages(struct inode *inode, struct page **pages,
1322				  size_t num_pages, loff_t pos,
1323				  size_t write_bytes, bool force_uptodate)
1324{
1325	int i;
1326	unsigned long index = pos >> PAGE_CACHE_SHIFT;
1327	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1328	int err = 0;
1329	int faili;
1330
1331	for (i = 0; i < num_pages; i++) {
1332again:
1333		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1334					       mask | __GFP_WRITE);
1335		if (!pages[i]) {
1336			faili = i - 1;
1337			err = -ENOMEM;
1338			goto fail;
1339		}
1340
1341		if (i == 0)
1342			err = prepare_uptodate_page(inode, pages[i], pos,
1343						    force_uptodate);
1344		if (!err && i == num_pages - 1)
1345			err = prepare_uptodate_page(inode, pages[i],
1346						    pos + write_bytes, false);
1347		if (err) {
1348			page_cache_release(pages[i]);
1349			if (err == -EAGAIN) {
1350				err = 0;
1351				goto again;
1352			}
1353			faili = i - 1;
1354			goto fail;
1355		}
1356		wait_on_page_writeback(pages[i]);
1357	}
1358
1359	return 0;
1360fail:
1361	while (faili >= 0) {
1362		unlock_page(pages[faili]);
1363		page_cache_release(pages[faili]);
1364		faili--;
1365	}
1366	return err;
1367
1368}
1369
/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if needed.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need to re-prepare the pages
 * any other number < 0 - something went wrong
 */
1380static noinline int
1381lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
1382				size_t num_pages, loff_t pos,
1383				u64 *lockstart, u64 *lockend,
1384				struct extent_state **cached_state)
1385{
1386	u64 start_pos;
1387	u64 last_pos;
1388	int i;
1389	int ret = 0;
1390
1391	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1392	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
1393
1394	if (start_pos < inode->i_size) {
1395		struct btrfs_ordered_extent *ordered;
1396		lock_extent_bits(&BTRFS_I(inode)->io_tree,
1397				 start_pos, last_pos, 0, cached_state);
1398		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1399						     last_pos - start_pos + 1);
1400		if (ordered &&
1401		    ordered->file_offset + ordered->len > start_pos &&
1402		    ordered->file_offset <= last_pos) {
1403			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1404					     start_pos, last_pos,
1405					     cached_state, GFP_NOFS);
1406			for (i = 0; i < num_pages; i++) {
1407				unlock_page(pages[i]);
1408				page_cache_release(pages[i]);
1409			}
1410			btrfs_start_ordered_extent(inode, ordered, 1);
1411			btrfs_put_ordered_extent(ordered);
1412			return -EAGAIN;
1413		}
1414		if (ordered)
1415			btrfs_put_ordered_extent(ordered);
1416
1417		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
1418				  last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
1419				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1420				  0, 0, cached_state, GFP_NOFS);
1421		*lockstart = start_pos;
1422		*lockend = last_pos;
1423		ret = 1;
1424	}
1425
1426	for (i = 0; i < num_pages; i++) {
1427		if (clear_page_dirty_for_io(pages[i]))
1428			account_page_redirty(pages[i]);
1429		set_page_extent_mapped(pages[i]);
1430		WARN_ON(!PageLocked(pages[i]));
1431	}
1432
1433	return ret;
1434}
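
/*
 * How callers handle the return values above (mirroring
 * __btrfs_buffered_write() below): -EAGAIN means the pages were dropped
 * and must be prepared again, 1 means [*lockstart, *lockend] is now
 * locked and must be released with unlock_extent_cached() after the
 * copy, and 0 means nothing was locked and no unlock is needed.
 */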
1435
1436static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1437				    size_t *write_bytes)
1438{
1439	struct btrfs_root *root = BTRFS_I(inode)->root;
1440	struct btrfs_ordered_extent *ordered;
1441	u64 lockstart, lockend;
1442	u64 num_bytes;
1443	int ret;
1444
1445	ret = btrfs_start_write_no_snapshoting(root);
1446	if (!ret)
1447		return -ENOSPC;
1448
1449	lockstart = round_down(pos, root->sectorsize);
1450	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
1451
1452	while (1) {
1453		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1454		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1455						     lockend - lockstart + 1);
1456		if (!ordered) {
1457			break;
1458		}
1459		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1460		btrfs_start_ordered_extent(inode, ordered, 1);
1461		btrfs_put_ordered_extent(ordered);
1462	}
1463
1464	num_bytes = lockend - lockstart + 1;
1465	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1466	if (ret <= 0) {
1467		ret = 0;
1468		btrfs_end_write_no_snapshoting(root);
1469	} else {
1470		*write_bytes = min_t(size_t, *write_bytes ,
1471				     num_bytes - pos + lockstart);
1472	}
1473
1474	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1475
1476	return ret;
1477}
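
/*
 * Rough example of the clamping above: if the caller wants to write 1M
 * at pos but can_nocow_extent() reports that only 256K starting at
 * lockstart may be written in place, *write_bytes is clamped to
 * 256K - (pos - lockstart), so the nocow path never spills into a shared
 * or missing extent.
 */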
1478
1479static noinline ssize_t __btrfs_buffered_write(struct file *file,
1480					       struct iov_iter *i,
1481					       loff_t pos)
1482{
1483	struct inode *inode = file_inode(file);
1484	struct btrfs_root *root = BTRFS_I(inode)->root;
1485	struct page **pages = NULL;
1486	struct extent_state *cached_state = NULL;
1487	u64 release_bytes = 0;
1488	u64 lockstart;
1489	u64 lockend;
1490	size_t num_written = 0;
1491	int nrptrs;
1492	int ret = 0;
1493	bool only_release_metadata = false;
1494	bool force_page_uptodate = false;
1495	bool need_unlock;
1496
1497	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1498			PAGE_CACHE_SIZE / (sizeof(struct page *)));
1499	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1500	nrptrs = max(nrptrs, 8);
1501	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1502	if (!pages)
1503		return -ENOMEM;
1504
1505	while (iov_iter_count(i) > 0) {
1506		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1507		size_t write_bytes = min(iov_iter_count(i),
1508					 nrptrs * (size_t)PAGE_CACHE_SIZE -
1509					 offset);
1510		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1511						PAGE_CACHE_SIZE);
1512		size_t reserve_bytes;
1513		size_t dirty_pages;
1514		size_t copied;
1515
1516		WARN_ON(num_pages > nrptrs);
1517
		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid a recursive lock.
		 */
1522		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1523			ret = -EFAULT;
1524			break;
1525		}
1526
1527		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1528
1529		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1530					     BTRFS_INODE_PREALLOC)) {
1531			ret = check_can_nocow(inode, pos, &write_bytes);
1532			if (ret < 0)
1533				break;
1534			if (ret > 0) {
				/*
				 * For the nodatacow case, there is no need
				 * to reserve data space.
				 */
1539				only_release_metadata = true;
1540				/*
1541				 * our prealloc extent may be smaller than
1542				 * write_bytes, so scale down.
1543				 */
1544				num_pages = DIV_ROUND_UP(write_bytes + offset,
1545							 PAGE_CACHE_SIZE);
1546				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1547				goto reserve_metadata;
1548			}
1549		}
1550		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
1551		if (ret < 0)
1552			break;
1553
1554reserve_metadata:
1555		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1556		if (ret) {
1557			if (!only_release_metadata)
1558				btrfs_free_reserved_data_space(inode, pos,
1559							       write_bytes);
1560			else
1561				btrfs_end_write_no_snapshoting(root);
1562			break;
1563		}
1564
1565		release_bytes = reserve_bytes;
1566		need_unlock = false;
1567again:
		/*
		 * This is going to set up the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop.
		 */
1573		ret = prepare_pages(inode, pages, num_pages,
1574				    pos, write_bytes,
1575				    force_page_uptodate);
1576		if (ret)
1577			break;
1578
1579		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
1580						      pos, &lockstart, &lockend,
1581						      &cached_state);
1582		if (ret < 0) {
1583			if (ret == -EAGAIN)
1584				goto again;
1585			break;
1586		} else if (ret > 0) {
1587			need_unlock = true;
1588			ret = 0;
1589		}
1590
1591		copied = btrfs_copy_from_user(pos, num_pages,
1592					   write_bytes, pages, i);
1593
1594		/*
1595		 * if we have trouble faulting in the pages, fall
1596		 * back to one page at a time
1597		 */
1598		if (copied < write_bytes)
1599			nrptrs = 1;
1600
1601		if (copied == 0) {
1602			force_page_uptodate = true;
1603			dirty_pages = 0;
1604		} else {
1605			force_page_uptodate = false;
1606			dirty_pages = DIV_ROUND_UP(copied + offset,
1607						   PAGE_CACHE_SIZE);
1608		}
1609
		/*
		 * If we had a short copy we need to release the excess
		 * delalloc bytes we reserved.  We need to increment
		 * outstanding_extents because btrfs_delalloc_release_space
		 * will decrement it, but we still have an outstanding extent
		 * for the chunk we actually managed to copy.
		 */
1617		if (num_pages > dirty_pages) {
1618			release_bytes = (num_pages - dirty_pages) <<
1619				PAGE_CACHE_SHIFT;
1620			if (copied > 0) {
1621				spin_lock(&BTRFS_I(inode)->lock);
1622				BTRFS_I(inode)->outstanding_extents++;
1623				spin_unlock(&BTRFS_I(inode)->lock);
1624			}
1625			if (only_release_metadata) {
1626				btrfs_delalloc_release_metadata(inode,
1627								release_bytes);
1628			} else {
1629				u64 __pos;
1630
1631				__pos = round_down(pos, root->sectorsize) +
1632					(dirty_pages << PAGE_CACHE_SHIFT);
1633				btrfs_delalloc_release_space(inode, __pos,
1634							     release_bytes);
1635			}
1636		}
1637
1638		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1639
1640		if (copied > 0)
1641			ret = btrfs_dirty_pages(root, inode, pages,
1642						dirty_pages, pos, copied,
1643						NULL);
1644		if (need_unlock)
1645			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1646					     lockstart, lockend, &cached_state,
1647					     GFP_NOFS);
1648		if (ret) {
1649			btrfs_drop_pages(pages, num_pages);
1650			break;
1651		}
1652
1653		release_bytes = 0;
1654		if (only_release_metadata)
1655			btrfs_end_write_no_snapshoting(root);
1656
1657		if (only_release_metadata && copied > 0) {
1658			lockstart = round_down(pos, root->sectorsize);
1659			lockend = lockstart +
1660				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
1661
1662			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1663				       lockend, EXTENT_NORESERVE, NULL,
1664				       NULL, GFP_NOFS);
1665			only_release_metadata = false;
1666		}
1667
1668		btrfs_drop_pages(pages, num_pages);
1669
1670		cond_resched();
1671
1672		balance_dirty_pages_ratelimited(inode->i_mapping);
1673		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1674			btrfs_btree_balance_dirty(root);
1675
1676		pos += copied;
1677		num_written += copied;
1678	}
1679
1680	kfree(pages);
1681
1682	if (release_bytes) {
1683		if (only_release_metadata) {
1684			btrfs_end_write_no_snapshoting(root);
1685			btrfs_delalloc_release_metadata(inode, release_bytes);
1686		} else {
1687			btrfs_delalloc_release_space(inode, pos, release_bytes);
1688		}
1689	}
1690
1691	return num_written ? num_written : ret;
1692}
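
/*
 * Informal summary of the reservation lifecycle in the loop above: each
 * iteration reserves data space (skipped for nocow writes) plus metadata
 * for reserve_bytes, trims the reservation back after a short copy with
 * btrfs_delalloc_release_space() or btrfs_delalloc_release_metadata(),
 * and sets release_bytes to 0 once btrfs_dirty_pages() has turned the
 * copied range into delalloc; anything still tracked in release_bytes
 * when the loop exits is released at the end of the function.
 */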
1693
1694static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1695				    struct iov_iter *from,
1696				    loff_t pos)
1697{
1698	struct file *file = iocb->ki_filp;
1699	struct inode *inode = file_inode(file);
1700	ssize_t written;
1701	ssize_t written_buffered;
1702	loff_t endbyte;
1703	int err;
1704
1705	written = generic_file_direct_write(iocb, from, pos);
1706
1707	if (written < 0 || !iov_iter_count(from))
1708		return written;
1709
1710	pos += written;
1711	written_buffered = __btrfs_buffered_write(file, from, pos);
1712	if (written_buffered < 0) {
1713		err = written_buffered;
1714		goto out;
1715	}
1716	/*
1717	 * Ensure all data is persisted. We want the next direct IO read to be
1718	 * able to read what was just written.
1719	 */
1720	endbyte = pos + written_buffered - 1;
1721	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1722	if (err)
1723		goto out;
1724	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1725	if (err)
1726		goto out;
1727	written += written_buffered;
1728	iocb->ki_pos = pos + written_buffered;
1729	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1730				 endbyte >> PAGE_CACHE_SHIFT);
1731out:
1732	return written ? written : err;
1733}
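
/*
 * The buffered fallback above runs when generic_file_direct_write()
 * could not write everything that was asked for; the remainder goes
 * through the page cache and is then flushed, waited on and invalidated
 * so a later O_DIRECT read sees the data on disk rather than stale
 * pages.
 */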
1734
1735static void update_time_for_write(struct inode *inode)
1736{
1737	struct timespec now;
1738
1739	if (IS_NOCMTIME(inode))
1740		return;
1741
1742	now = current_fs_time(inode->i_sb);
1743	if (!timespec_equal(&inode->i_mtime, &now))
1744		inode->i_mtime = now;
1745
1746	if (!timespec_equal(&inode->i_ctime, &now))
1747		inode->i_ctime = now;
1748
1749	if (IS_I_VERSION(inode))
1750		inode_inc_iversion(inode);
1751}
1752
1753static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1754				    struct iov_iter *from)
1755{
1756	struct file *file = iocb->ki_filp;
1757	struct inode *inode = file_inode(file);
1758	struct btrfs_root *root = BTRFS_I(inode)->root;
1759	u64 start_pos;
1760	u64 end_pos;
1761	ssize_t num_written = 0;
1762	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1763	ssize_t err;
1764	loff_t pos;
1765	size_t count;
1766
1767	mutex_lock(&inode->i_mutex);
1768	err = generic_write_checks(iocb, from);
1769	if (err <= 0) {
1770		mutex_unlock(&inode->i_mutex);
1771		return err;
1772	}
1773
1774	current->backing_dev_info = inode_to_bdi(inode);
1775	err = file_remove_privs(file);
1776	if (err) {
1777		mutex_unlock(&inode->i_mutex);
1778		goto out;
1779	}
1780
	/*
	 * If BTRFS flips read-only due to a fatal error
	 * (fs_info->fs_state has BTRFS_FS_STATE_ERROR set),
	 * although we have opened the file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
1787	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1788		mutex_unlock(&inode->i_mutex);
1789		err = -EROFS;
1790		goto out;
1791	}
1792
	/*
	 * We reserve space for updating the inode when we reserve space for
	 * the extent we are going to write, so any ENOSPC failure happens
	 * there.  We don't need to start yet another transaction to update
	 * the inode, as we will update the inode when we finish writing
	 * whatever data we write.
	 */
1799	update_time_for_write(inode);
1800
1801	pos = iocb->ki_pos;
1802	count = iov_iter_count(from);
1803	start_pos = round_down(pos, root->sectorsize);
1804	if (start_pos > i_size_read(inode)) {
1805		/* Expand hole size to cover write data, preventing empty gap */
1806		end_pos = round_up(pos + count, root->sectorsize);
1807		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
1808		if (err) {
1809			mutex_unlock(&inode->i_mutex);
1810			goto out;
1811		}
1812	}
1813
1814	if (sync)
1815		atomic_inc(&BTRFS_I(inode)->sync_writers);
1816
1817	if (iocb->ki_flags & IOCB_DIRECT) {
1818		num_written = __btrfs_direct_write(iocb, from, pos);
1819	} else {
1820		num_written = __btrfs_buffered_write(file, from, pos);
1821		if (num_written > 0)
1822			iocb->ki_pos = pos + num_written;
1823	}
1824
1825	mutex_unlock(&inode->i_mutex);
1826
	/*
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
	 */
1832	spin_lock(&BTRFS_I(inode)->lock);
1833	BTRFS_I(inode)->last_sub_trans = root->log_transid;
1834	spin_unlock(&BTRFS_I(inode)->lock);
1835	if (num_written > 0) {
1836		err = generic_write_sync(file, pos, num_written);
1837		if (err < 0)
1838			num_written = err;
1839	}
1840
1841	if (sync)
1842		atomic_dec(&BTRFS_I(inode)->sync_writers);
1843out:
1844	current->backing_dev_info = NULL;
1845	return num_written ? num_written : err;
1846}
1847
1848int btrfs_release_file(struct inode *inode, struct file *filp)
1849{
1850	if (filp->private_data)
1851		btrfs_ioctl_trans_end(filp);
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
1858	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1859			       &BTRFS_I(inode)->runtime_flags))
1860			filemap_flush(inode->i_mapping);
1861	return 0;
1862}
1863
1864static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1865{
1866	int ret;
1867
1868	atomic_inc(&BTRFS_I(inode)->sync_writers);
1869	ret = btrfs_fdatawrite_range(inode, start, end);
1870	atomic_dec(&BTRFS_I(inode)->sync_writers);
1871
1872	return ret;
1873}
1874
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
1886int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1887{
1888	struct dentry *dentry = file_dentry(file);
1889	struct inode *inode = d_inode(dentry);
1890	struct btrfs_root *root = BTRFS_I(inode)->root;
1891	struct btrfs_trans_handle *trans;
1892	struct btrfs_log_ctx ctx;
1893	int ret = 0;
1894	bool full_sync = 0;
1895	u64 len;
1896
	/*
	 * The range length can be represented by u64, so we have to do the
	 * typecasts to avoid signed overflow if it's [0, LLONG_MAX],
	 * e.g. from fsync()
	 */
1901	len = (u64)end - (u64)start + 1;
1902	trace_btrfs_sync_file(file, datasync);
1903
	/*
	 * We write the dirty pages in the range and wait for them to complete
	 * outside of the ->i_mutex.  That way multiple tasks can flush dirty
	 * pages concurrently, which improves performance.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
1910	ret = start_ordered_ops(inode, start, end);
1911	if (ret)
1912		return ret;
1913
1914	mutex_lock(&inode->i_mutex);
1915	atomic_inc(&root->log_batch);
1916	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1917			     &BTRFS_I(inode)->runtime_flags);
1918	/*
1919	 * We might have had more pages made dirty after calling
1920	 * start_ordered_ops and before acquiring the inode's i_mutex.
1921	 */
1922	if (full_sync) {
1923		/*
1924		 * For a full sync, we need to make sure any ordered operations
1925		 * start and finish before we start logging the inode, so that
1926		 * all extents are persisted and the respective file extent
1927		 * items are in the fs/subvol btree.
1928		 */
1929		ret = btrfs_wait_ordered_range(inode, start, len);
1930	} else {
1931		/*
1932		 * Start any new ordered operations before starting to log the
1933		 * inode. We will wait for them to finish in btrfs_sync_log().
1934		 *
1935		 * Right before acquiring the inode's mutex, we might have new
1936		 * writes dirtying pages, which won't immediately start the
1937		 * respective ordered operations - that is done through the
1938		 * fill_delalloc callbacks invoked from the writepage and
1939		 * writepages address space operations. So make sure we start
1940		 * all ordered operations before starting to log our inode. Not
1941		 * doing this means that while logging the inode, writeback
1942		 * could start and invoke writepage/writepages, which would call
1943		 * the fill_delalloc callbacks (cow_file_range,
1944		 * submit_compressed_extents). These callbacks add first an
1945		 * extent map to the modified list of extents and then create
1946		 * the respective ordered operation, which means in
1947		 * tree-log.c:btrfs_log_inode() we might capture all existing
1948		 * ordered operations (with btrfs_get_logged_extents()) before
1949		 * the fill_delalloc callback adds its ordered operation, and by
1950		 * the time we visit the modified list of extent maps (with
1951		 * btrfs_log_changed_extents()), we see and process the extent
1952		 * map they created. We then use the extent map to construct a
1953		 * file extent item for logging without waiting for the
1954		 * respective ordered operation to finish - this file extent
1955		 * item points to a disk location that might not have yet been
1956		 * written to, containing random data - so after a crash a log
1957		 * replay will make our inode have file extent items that point
1958		 * to disk locations containing invalid data, as we returned
1959		 * success to userspace without waiting for the respective
1960		 * ordered operation to finish, because it wasn't captured by
1961		 * btrfs_get_logged_extents().
1962		 */
1963		ret = start_ordered_ops(inode, start, end);
1964	}
1965	if (ret) {
1966		mutex_unlock(&inode->i_mutex);
1967		goto out;
1968	}
1969	atomic_inc(&root->log_batch);
1970
1971	/*
1972	 * If the last transaction that changed this file was before the current
1973	 * transaction and we have the full sync flag set in our inode, we can
1974	 * bail out now without any syncing.
1975	 *
1976	 * Note that we can't bail out if the full sync flag isn't set. This is
1977	 * because when the full sync flag is set we start all ordered extents
1978	 * and wait for them to fully complete - when they complete they update
1979	 * the inode's last_trans field through:
1980	 *
1981	 *     btrfs_finish_ordered_io() ->
1982	 *         btrfs_update_inode_fallback() ->
1983	 *             btrfs_update_inode() ->
1984	 *                 btrfs_set_inode_last_trans()
1985	 *
1986	 * So we are sure that last_trans is up to date and can do this check to
1987	 * bail out safely. For the fast path, when the full sync flag is not
1988	 * set in our inode, we cannot do it because we start only our ordered
1989	 * extents and don't wait for them to complete (that is when
1990	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
1991	 * value might be less than or equal to fs_info->last_trans_committed,
1992	 * and setting a speculative last_trans for an inode when a buffered
1993	 * write is made (such as fs_info->generation + 1 for example) would not
1994	 * be reliable since after setting the value and before fsync is called
1995	 * any number of transactions can start and commit (transaction kthread
1996	 * commits the current transaction periodically), and a transaction
1997	 * commit neither starts nor waits for ordered extents to complete.
1998	 */
1999	smp_mb();
2000	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
2001	    (BTRFS_I(inode)->last_trans <=
2002	     root->fs_info->last_trans_committed &&
2003	     (full_sync ||
2004	      !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
2005		/*
2006		 * We've had everything committed since the last time we were
2007		 * modified, so clear this flag in case it was set for whatever
2008		 * reason; it's no longer relevant.
2009		 */
2010		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2011			  &BTRFS_I(inode)->runtime_flags);
2012		mutex_unlock(&inode->i_mutex);
2013		goto out;
2014	}
2015
2016	/*
2017	 * OK, we haven't committed the transaction yet, let's do a commit.
2018	 */
2019	if (file->private_data)
2020		btrfs_ioctl_trans_end(file);
2021
2022	/*
2023	 * We use start here because we will need to wait on the IO to complete
2024	 * in btrfs_sync_log, which could require joining a transaction (for
2025	 * example checking cross references in the nocow path).  If we use join
2026	 * here we could get into a situation where we're waiting on IO to
2027	 * happen that is blocked on a transaction trying to commit.  With start
2028	 * we inc the extwriter counter, so we wait for all extwriters to exit
2029	 * before we start blocking join'ers.  This comment is to keep somebody
2030	 * from thinking they are super smart and changing this to
2031	 * btrfs_join_transaction *cough*Josef*cough*.
2032	 */
2033	trans = btrfs_start_transaction(root, 0);
2034	if (IS_ERR(trans)) {
2035		ret = PTR_ERR(trans);
2036		mutex_unlock(&inode->i_mutex);
2037		goto out;
2038	}
2039	trans->sync = true;
2040
2041	btrfs_init_log_ctx(&ctx);
2042
2043	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
2044	if (ret < 0) {
2045		/* Fallthrough and commit/free transaction. */
2046		ret = 1;
2047	}
2048
2049	/* we've logged all the items and now have a consistent
2050	 * version of the file in the log.  It is possible that
2051	 * someone will come in and modify the file, but that's
2052	 * fine because the log is consistent on disk, and we
2053	 * have references to all of the file's extents
2054	 *
2055	 * It is possible that someone will come in and log the
2056	 * file again, but that will end up using the synchronization
2057	 * inside btrfs_sync_log to keep things safe.
2058	 */
2059	mutex_unlock(&inode->i_mutex);
2060
2061	/*
2062	 * If any of the ordered extents had an error, just return it to user
2063	 * space, so that the application knows some writes didn't succeed and
2064	 * can take proper action (e.g. retry). Blindly committing the
2065	 * transaction in this case would fool userspace into thinking everything
2066	 * was successful. And we also want to make sure our log doesn't contain
2067	 * file extent items pointing to extents that weren't fully written to -
2068	 * just like in the non-fast fsync path, where we check for the ordered
2069	 * operation's error flag before writing to the log tree and return -EIO
2070	 * if any of them had this flag set (btrfs_wait_ordered_range) -
2071	 * therefore we need to check for errors in the ordered operations,
2072	 * which are indicated by ctx.io_err.
2073	 */
2074	if (ctx.io_err) {
2075		btrfs_end_transaction(trans, root);
2076		ret = ctx.io_err;
2077		goto out;
2078	}
2079
2080	if (ret != BTRFS_NO_LOG_SYNC) {
2081		if (!ret) {
2082			ret = btrfs_sync_log(trans, root, &ctx);
2083			if (!ret) {
2084				ret = btrfs_end_transaction(trans, root);
2085				goto out;
2086			}
2087		}
2088		if (!full_sync) {
2089			ret = btrfs_wait_ordered_range(inode, start, len);
2090			if (ret) {
2091				btrfs_end_transaction(trans, root);
2092				goto out;
2093			}
2094		}
2095		ret = btrfs_commit_transaction(trans, root);
2096	} else {
2097		ret = btrfs_end_transaction(trans, root);
2098	}
2099out:
2100	return ret > 0 ? -EIO : ret;
2101}
2102
2103static const struct vm_operations_struct btrfs_file_vm_ops = {
2104	.fault		= filemap_fault,
2105	.map_pages	= filemap_map_pages,
2106	.page_mkwrite	= btrfs_page_mkwrite,
2107};
2108
2109static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2110{
2111	struct address_space *mapping = filp->f_mapping;
2112
2113	if (!mapping->a_ops->readpage)
2114		return -ENOEXEC;
2115
2116	file_accessed(filp);
2117	vma->vm_ops = &btrfs_file_vm_ops;
2118
2119	return 0;
2120}
2121
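/*
 * Check if the file extent item at the given slot is an existing hole (a
 * regular extent with a zero disk bytenr) contiguous with the range
 * [start, end), i.e. one that a new hole could be merged into.
 */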
2122static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
2123			  int slot, u64 start, u64 end)
2124{
2125	struct btrfs_file_extent_item *fi;
2126	struct btrfs_key key;
2127
2128	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2129		return 0;
2130
2131	btrfs_item_key_to_cpu(leaf, &key, slot);
2132	if (key.objectid != btrfs_ino(inode) ||
2133	    key.type != BTRFS_EXTENT_DATA_KEY)
2134		return 0;
2135
2136	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2137
2138	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2139		return 0;
2140
2141	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2142		return 0;
2143
2144	if (key.offset == end)
2145		return 1;
2146	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2147		return 1;
2148	return 0;
2149}
2150
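/*
 * Represent the range [offset, end) as a hole: unless the NO_HOLES feature is
 * enabled, extend an adjacent hole file extent item if possible or insert a
 * new one, and in all cases keep the extent map cache in sync.
 */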
2151static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2152		      struct btrfs_path *path, u64 offset, u64 end)
2153{
2154	struct btrfs_root *root = BTRFS_I(inode)->root;
2155	struct extent_buffer *leaf;
2156	struct btrfs_file_extent_item *fi;
2157	struct extent_map *hole_em;
2158	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2159	struct btrfs_key key;
2160	int ret;
2161
2162	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
2163		goto out;
2164
2165	key.objectid = btrfs_ino(inode);
2166	key.type = BTRFS_EXTENT_DATA_KEY;
2167	key.offset = offset;
2168
2169	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2170	if (ret < 0)
2171		return ret;
2172	BUG_ON(!ret);
2173
2174	leaf = path->nodes[0];
2175	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2176		u64 num_bytes;
2177
2178		path->slots[0]--;
2179		fi = btrfs_item_ptr(leaf, path->slots[0],
2180				    struct btrfs_file_extent_item);
2181		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2182			end - offset;
2183		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2184		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2185		btrfs_set_file_extent_offset(leaf, fi, 0);
2186		btrfs_mark_buffer_dirty(leaf);
2187		goto out;
2188	}
2189
2190	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2191		u64 num_bytes;
2192
2193		key.offset = offset;
2194		btrfs_set_item_key_safe(root->fs_info, path, &key);
2195		fi = btrfs_item_ptr(leaf, path->slots[0],
2196				    struct btrfs_file_extent_item);
2197		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2198			offset;
2199		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2200		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2201		btrfs_set_file_extent_offset(leaf, fi, 0);
2202		btrfs_mark_buffer_dirty(leaf);
2203		goto out;
2204	}
2205	btrfs_release_path(path);
2206
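	/* No adjacent hole extent to merge with, insert a new hole extent item. */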
2207	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
2208				       0, 0, end - offset, 0, end - offset,
2209				       0, 0, 0);
2210	if (ret)
2211		return ret;
2212
2213out:
2214	btrfs_release_path(path);
2215
2216	hole_em = alloc_extent_map();
2217	if (!hole_em) {
2218		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2219		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2220			&BTRFS_I(inode)->runtime_flags);
2221	} else {
2222		hole_em->start = offset;
2223		hole_em->len = end - offset;
2224		hole_em->ram_bytes = hole_em->len;
2225		hole_em->orig_start = offset;
2226
2227		hole_em->block_start = EXTENT_MAP_HOLE;
2228		hole_em->block_len = 0;
2229		hole_em->orig_block_len = 0;
2230		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
2231		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2232		hole_em->generation = trans->transid;
2233
2234		do {
2235			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2236			write_lock(&em_tree->lock);
2237			ret = add_extent_mapping(em_tree, hole_em, 1);
2238			write_unlock(&em_tree->lock);
2239		} while (ret == -EEXIST);
2240		free_extent_map(hole_em);
2241		if (ret)
2242			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2243				&BTRFS_I(inode)->runtime_flags);
2244	}
2245
2246	return 0;
2247}
2248
2249/*
2250 * Find a hole extent on a given inode and change start/len to the end of the
2251 * hole extent (a hole/vacuum extent whose em->start <= start &&
2252 *	   em->start + em->len > start).
2253 * When a hole extent is found, return 1 and modify start/len.
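 *
 * For example (illustrative values): with *start == 8K and *len == 8K, a
 * hole extent map covering [4K, 12K) advances *start to 12K and shrinks
 * *len to 4K.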
2254 */
2255static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2256{
2257	struct extent_map *em;
2258	int ret = 0;
2259
2260	em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
2261	if (IS_ERR_OR_NULL(em)) {
2262		if (!em)
2263			ret = -ENOMEM;
2264		else
2265			ret = PTR_ERR(em);
2266		return ret;
2267	}
2268
2269	/* Hole or vacuum extent (only exists in no-hole mode) */
2270	if (em->block_start == EXTENT_MAP_HOLE) {
2271		ret = 1;
2272		*len = em->start + em->len > *start + *len ?
2273		       0 : *start + *len - em->start - em->len;
2274		*start = em->start + em->len;
2275	}
2276	free_extent_map(em);
2277	return ret;
2278}
2279
2280static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2281{
2282	struct btrfs_root *root = BTRFS_I(inode)->root;
2283	struct extent_state *cached_state = NULL;
2284	struct btrfs_path *path;
2285	struct btrfs_block_rsv *rsv;
2286	struct btrfs_trans_handle *trans;
2287	u64 lockstart;
2288	u64 lockend;
2289	u64 tail_start;
2290	u64 tail_len;
2291	u64 orig_start = offset;
2292	u64 cur_offset;
2293	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
2294	u64 drop_end;
2295	int ret = 0;
2296	int err = 0;
2297	unsigned int rsv_count;
2298	bool same_page;
2299	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2300	u64 ino_size;
2301	bool truncated_page = false;
2302	bool updated_inode = false;
2303
2304	ret = btrfs_wait_ordered_range(inode, offset, len);
2305	if (ret)
2306		return ret;
2307
2308	mutex_lock(&inode->i_mutex);
2309	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2310	ret = find_first_non_hole(inode, &offset, &len);
2311	if (ret < 0)
2312		goto out_only_mutex;
2313	if (ret && !len) {
2314		/* Already in a large hole */
2315		ret = 0;
2316		goto out_only_mutex;
2317	}
2318
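	/*
	 * Only the sector-aligned middle of the range can be dropped via
	 * extent items; any unaligned head and tail are zeroed in place
	 * with btrfs_truncate_page() below.
	 */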
2319	lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2320	lockend = round_down(offset + len,
2321			     BTRFS_I(inode)->root->sectorsize) - 1;
2322	same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2323		    ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2324
2325	/*
2326	 * We needn't truncate any page which is beyond the end of the file
2327	 * because we are sure there is no data there.
2328	 *
2329	 * Only do this if the range is within a single page and we aren't
2330	 * covering the entire page.
2331	 */
2333	if (same_page && len < PAGE_CACHE_SIZE) {
2334		if (offset < ino_size) {
2335			truncated_page = true;
2336			ret = btrfs_truncate_page(inode, offset, len, 0);
2337		} else {
2338			ret = 0;
2339		}
2340		goto out_only_mutex;
2341	}
2342
2343	/* zero back part of the first page */
2344	if (offset < ino_size) {
2345		truncated_page = true;
2346		ret = btrfs_truncate_page(inode, offset, 0, 0);
2347		if (ret) {
2348			mutex_unlock(&inode->i_mutex);
2349			return ret;
2350		}
2351	}
2352
2353	/* Check the aligned pages after the first unaligned page. If
2354	 * offset != orig_start, the first unaligned page and several
2355	 * following pages are already holes, so the extra check can
2356	 * be skipped. */
2357	if (offset == orig_start) {
2358		/* after truncate page, check hole again */
2359		len = offset + len - lockstart;
2360		offset = lockstart;
2361		ret = find_first_non_hole(inode, &offset, &len);
2362		if (ret < 0)
2363			goto out_only_mutex;
2364		if (ret && !len) {
2365			ret = 0;
2366			goto out_only_mutex;
2367		}
2368		lockstart = offset;
2369	}
2370
2371	/* Check the tail unaligned part is in a hole */
2372	tail_start = lockend + 1;
2373	tail_len = offset + len - tail_start;
2374	if (tail_len) {
2375		ret = find_first_non_hole(inode, &tail_start, &tail_len);
2376		if (unlikely(ret < 0))
2377			goto out_only_mutex;
2378		if (!ret) {
2379			/* zero the front end of the last page */
2380			if (tail_start + tail_len < ino_size) {
2381				truncated_page = true;
2382				ret = btrfs_truncate_page(inode,
2383						tail_start + tail_len, 0, 1);
2384				if (ret)
2385					goto out_only_mutex;
2386			}
2387		}
2388	}
2389
2390	if (lockend < lockstart) {
2391		ret = 0;
2392		goto out_only_mutex;
2393	}
2394
2395	while (1) {
2396		struct btrfs_ordered_extent *ordered;
2397
2398		truncate_pagecache_range(inode, lockstart, lockend);
2399
2400		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2401				 0, &cached_state);
2402		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
2403
2404		/*
2405		 * We need to make sure we have no ordered extents in this range
2406		 * and that nobody raced in and read a page in this range; if
2407		 * either happened, we need to try again.
2408		 */
2409		if ((!ordered ||
2410		    (ordered->file_offset + ordered->len <= lockstart ||
2411		     ordered->file_offset > lockend)) &&
2412		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
2413			if (ordered)
2414				btrfs_put_ordered_extent(ordered);
2415			break;
2416		}
2417		if (ordered)
2418			btrfs_put_ordered_extent(ordered);
2419		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2420				     lockend, &cached_state, GFP_NOFS);
2421		ret = btrfs_wait_ordered_range(inode, lockstart,
2422					       lockend - lockstart + 1);
2423		if (ret) {
2424			mutex_unlock(&inode->i_mutex);
2425			return ret;
2426		}
2427	}
2428
2429	path = btrfs_alloc_path();
2430	if (!path) {
2431		ret = -ENOMEM;
2432		goto out;
2433	}
2434
2435	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2436	if (!rsv) {
2437		ret = -ENOMEM;
2438		goto out_free;
2439	}
2440	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
2441	rsv->failfast = 1;
2442
2443	/*
2444	 * 1 - update the inode
2445	 * 1 - removing the extents in the range
2446	 * 1 - adding the hole extent if no_holes isn't set
2447	 */
2448	rsv_count = no_holes ? 2 : 3;
2449	trans = btrfs_start_transaction(root, rsv_count);
2450	if (IS_ERR(trans)) {
2451		err = PTR_ERR(trans);
2452		goto out_free;
2453	}
2454
2455	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
2456				      min_size);
2457	BUG_ON(ret);
2458	trans->block_rsv = rsv;
2459
2460	cur_offset = lockstart;
2461	len = lockend - cur_offset;
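	/*
	 * Drop the extents in the locked range in chunks: when the
	 * transaction fills up (__btrfs_drop_extents returns -ENOSPC), fill
	 * the hole dropped so far, update the inode and continue with a
	 * fresh transaction.
	 */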
2462	while (cur_offset < lockend) {
2463		ret = __btrfs_drop_extents(trans, root, inode, path,
2464					   cur_offset, lockend + 1,
2465					   &drop_end, 1, 0, 0, NULL);
2466		if (ret != -ENOSPC)
2467			break;
2468
2469		trans->block_rsv = &root->fs_info->trans_block_rsv;
2470
2471		if (cur_offset < ino_size) {
2472			ret = fill_holes(trans, inode, path, cur_offset,
2473					 drop_end);
2474			if (ret) {
2475				err = ret;
2476				break;
2477			}
2478		}
2479
2480		cur_offset = drop_end;
2481
2482		ret = btrfs_update_inode(trans, root, inode);
2483		if (ret) {
2484			err = ret;
2485			break;
2486		}
2487
2488		btrfs_end_transaction(trans, root);
2489		btrfs_btree_balance_dirty(root);
2490
2491		trans = btrfs_start_transaction(root, rsv_count);
2492		if (IS_ERR(trans)) {
2493			ret = PTR_ERR(trans);
2494			trans = NULL;
2495			break;
2496		}
2497
2498		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
2499					      rsv, min_size);
2500		BUG_ON(ret);	/* shouldn't happen */
2501		trans->block_rsv = rsv;
2502
2503		ret = find_first_non_hole(inode, &cur_offset, &len);
2504		if (unlikely(ret < 0))
2505			break;
2506		if (ret && !len) {
2507			ret = 0;
2508			break;
2509		}
2510	}
2511
2512	if (ret) {
2513		err = ret;
2514		goto out_trans;
2515	}
2516
2517	trans->block_rsv = &root->fs_info->trans_block_rsv;
2518	/*
2519	 * If we are using the NO_HOLES feature we might already have had a
2520	 * hole that overlaps a part of the region [lockstart, lockend] and
2521	 * ends at (or beyond) lockend. Since we have no file extent items to
2522	 * represent holes, drop_end can be less than lockend and so we must
2523	 * make sure we have an extent map representing the existing hole (the
2524	 * call to __btrfs_drop_extents() might have dropped the existing extent
2525	 * map representing the existing hole), otherwise the fast fsync path
2526	 * will not record the existence of the hole region
2527	 * [existing_hole_start, lockend].
2528	 */
2529	if (drop_end <= lockend)
2530		drop_end = lockend + 1;
2531	/*
2532	 * Don't insert a file hole extent item if it's for a range beyond eof
2533	 * (because it's useless) or if it represents a zero-byte range (when
2534	 * cur_offset == drop_end).
2535	 */
2536	if (cur_offset < ino_size && cur_offset < drop_end) {
2537		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2538		if (ret) {
2539			err = ret;
2540			goto out_trans;
2541		}
2542	}
2543
2544out_trans:
2545	if (!trans)
2546		goto out_free;
2547
2548	inode_inc_iversion(inode);
2549	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2550
2551	trans->block_rsv = &root->fs_info->trans_block_rsv;
2552	ret = btrfs_update_inode(trans, root, inode);
2553	updated_inode = true;
2554	btrfs_end_transaction(trans, root);
2555	btrfs_btree_balance_dirty(root);
2556out_free:
2557	btrfs_free_path(path);
2558	btrfs_free_block_rsv(root, rsv);
2559out:
2560	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2561			     &cached_state, GFP_NOFS);
2562out_only_mutex:
2563	if (!updated_inode && truncated_page && !ret && !err) {
2564		/*
2565		 * If we only end up zeroing part of a page, we still need to
2566		 * update the inode item, so that all the time fields are
2567		 * updated, as well as the necessary in-memory btrfs inode
2568		 * fields used to detect, at fsync time, whether the inode isn't
2569		 * yet in the log tree or is there but not up to date.
2570		 */
2571		trans = btrfs_start_transaction(root, 1);
2572		if (IS_ERR(trans)) {
2573			err = PTR_ERR(trans);
2574		} else {
2575			err = btrfs_update_inode(trans, root, inode);
2576			ret = btrfs_end_transaction(trans, root);
2577		}
2578	}
2579	mutex_unlock(&inode->i_mutex);
2580	if (ret && !err)
2581		err = ret;
2582	return err;
2583}
2584
2585/* Helper structure to record which range is already reserved */
2586struct falloc_range {
2587	struct list_head list;
2588	u64 start;
2589	u64 len;
2590};
2591
2592/*
2593 * Helper function to add falloc range
2594 *
2595 * Caller should have locked the larger extent range containing
2596 * [start, start + len).
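 *
 * For example (illustrative values): if the last recorded range is [0, 4K)
 * and a new range starts at 4K with length 4K, the two are merged into a
 * single [0, 8K) range instead of adding a new list entry.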
2597 */
2598static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2599{
2600	struct falloc_range *prev = NULL;
2601	struct falloc_range *range = NULL;
2602
2603	if (list_empty(head))
2604		goto insert;
2605
2606	/*
2607	 * As fallocate iterates in bytenr order, we only need to check
2608	 * the last range.
2609	 */
2610	prev = list_entry(head->prev, struct falloc_range, list);
2611	if (prev->start + prev->len == start) {
2612		prev->len += len;
2613		return 0;
2614	}
2615insert:
2616	range = kmalloc(sizeof(*range), GFP_NOFS);
2617	if (!range)
2618		return -ENOMEM;
2619	range->start = start;
2620	range->len = len;
2621	list_add_tail(&range->list, head);
2622	return 0;
2623}
2624
2625static long btrfs_fallocate(struct file *file, int mode,
2626			    loff_t offset, loff_t len)
2627{
2628	struct inode *inode = file_inode(file);
2629	struct extent_state *cached_state = NULL;
2630	struct falloc_range *range;
2631	struct falloc_range *tmp;
2632	struct list_head reserve_list;
2633	u64 cur_offset;
2634	u64 last_byte;
2635	u64 alloc_start;
2636	u64 alloc_end;
2637	u64 alloc_hint = 0;
2638	u64 locked_end;
2639	u64 actual_end = 0;
2640	struct extent_map *em;
2641	int blocksize = BTRFS_I(inode)->root->sectorsize;
2642	int ret;
2643
2644	alloc_start = round_down(offset, blocksize);
2645	alloc_end = round_up(offset + len, blocksize);
2646
2647	/* Make sure we aren't being given some crap mode */
2648	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2649		return -EOPNOTSUPP;
2650
2651	if (mode & FALLOC_FL_PUNCH_HOLE)
2652		return btrfs_punch_hole(inode, offset, len);
2653
2654	/*
2655	 * Only trigger disk allocation here, don't trigger qgroup reserve.
2656	 *
2657	 * Qgroup space will be checked later.
2658	 */
2659	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
2660	if (ret < 0)
2661		return ret;
2662
2663	mutex_lock(&inode->i_mutex);
2664	ret = inode_newsize_ok(inode, alloc_end);
2665	if (ret)
2666		goto out;
2667
2668	/*
2669	 * TODO: Move these two operations after we have checked
2670	 * accurate reserved space, or fallocate can still fail but
2671	 * with the page truncated or the size expanded.
2672	 *
2673	 * But that's a minor problem and won't do much harm anyway.
2674	 */
2675	if (alloc_start > inode->i_size) {
2676		ret = btrfs_cont_expand(inode, i_size_read(inode),
2677					alloc_start);
2678		if (ret)
2679			goto out;
2680	} else if (offset + len > inode->i_size) {
2681		/*
2682		 * If we are fallocating from the end of the file onward we
2683		 * need to zero out the end of the page if i_size lands in the
2684		 * middle of a page.
2685		 */
2686		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2687		if (ret)
2688			goto out;
2689	}
2690
2691	/*
2692	 * wait for ordered IO before we have any locks.  We'll loop again
2693	 * below with the locks held.
2694	 */
2695	ret = btrfs_wait_ordered_range(inode, alloc_start,
2696				       alloc_end - alloc_start);
2697	if (ret)
2698		goto out;
2699
2700	locked_end = alloc_end - 1;
2701	while (1) {
2702		struct btrfs_ordered_extent *ordered;
2703
2704		/* the extent lock is ordered inside the running
2705		 * transaction
2706		 */
2707		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
2708				 locked_end, 0, &cached_state);
2709		ordered = btrfs_lookup_first_ordered_extent(inode,
2710							    alloc_end - 1);
2711		if (ordered &&
2712		    ordered->file_offset + ordered->len > alloc_start &&
2713		    ordered->file_offset < alloc_end) {
2714			btrfs_put_ordered_extent(ordered);
2715			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
2716					     alloc_start, locked_end,
2717					     &cached_state, GFP_NOFS);
2718			/*
2719			 * we can't wait on the range with the transaction
2720			 * running or with the extent lock held
2721			 */
2722			ret = btrfs_wait_ordered_range(inode, alloc_start,
2723						       alloc_end - alloc_start);
2724			if (ret)
2725				goto out;
2726		} else {
2727			if (ordered)
2728				btrfs_put_ordered_extent(ordered);
2729			break;
2730		}
2731	}
2732
2733	/* First, check if we exceed the qgroup limit */
2734	INIT_LIST_HEAD(&reserve_list);
2735	cur_offset = alloc_start;
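	/*
	 * Walk the extent maps of the range: every hole (and every extent
	 * beyond i_size that isn't preallocated) gets recorded in
	 * reserve_list and has qgroup data space reserved for it.
	 */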
2736	while (1) {
2737		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2738				      alloc_end - cur_offset, 0);
2739		if (IS_ERR_OR_NULL(em)) {
2740			if (!em)
2741				ret = -ENOMEM;
2742			else
2743				ret = PTR_ERR(em);
2744			break;
2745		}
2746		last_byte = min(extent_map_end(em), alloc_end);
2747		actual_end = min_t(u64, extent_map_end(em), offset + len);
2748		last_byte = ALIGN(last_byte, blocksize);
2749		if (em->block_start == EXTENT_MAP_HOLE ||
2750		    (cur_offset >= inode->i_size &&
2751		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2752			ret = add_falloc_range(&reserve_list, cur_offset,
2753					       last_byte - cur_offset);
2754			if (ret < 0) {
2755				free_extent_map(em);
2756				break;
2757			}
2758			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
2759					last_byte - cur_offset);
2760			if (ret < 0)
2761				break;
2762		}
2763		free_extent_map(em);
2764		cur_offset = last_byte;
2765		if (cur_offset >= alloc_end)
2766			break;
2767	}
2768
2769	/*
2770	 * If ret is still 0, we're OK to fallocate.
2771	 * Otherwise just clean up the list and exit.
2772	 */
2773	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
2774		if (!ret)
2775			ret = btrfs_prealloc_file_range(inode, mode,
2776					range->start,
2777					range->len, 1 << inode->i_blkbits,
2778					offset + len, &alloc_hint);
2779		list_del(&range->list);
2780		kfree(range);
2781	}
2782	if (ret < 0)
2783		goto out_unlock;
2784
2785	if (actual_end > inode->i_size &&
2786	    !(mode & FALLOC_FL_KEEP_SIZE)) {
2787		struct btrfs_trans_handle *trans;
2788		struct btrfs_root *root = BTRFS_I(inode)->root;
2789
2790		/*
2791		 * We didn't need to allocate any more space, but we
2792		 * still extended the size of the file so we need to
2793		 * update i_size and the inode item.
2794		 */
2795		trans = btrfs_start_transaction(root, 1);
2796		if (IS_ERR(trans)) {
2797			ret = PTR_ERR(trans);
2798		} else {
2799			inode->i_ctime = CURRENT_TIME;
2800			i_size_write(inode, actual_end);
2801			btrfs_ordered_update_i_size(inode, actual_end, NULL);
2802			ret = btrfs_update_inode(trans, root, inode);
2803			if (ret)
2804				btrfs_end_transaction(trans, root);
2805			else
2806				ret = btrfs_end_transaction(trans, root);
2807		}
2808	}
2809out_unlock:
2810	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2811			     &cached_state, GFP_NOFS);
2812out:
2813	/*
2814	 * As we waited on the extent range, the data_rsv_map must be empty
2815	 * in the range, as any written data range will have been released
2816	 * from it. A preallocated extent will also be released when its
2817	 * metadata is written.
2818	 * So this is purely a cleanup.
2819	 */
2820	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
2821	mutex_unlock(&inode->i_mutex);
2822	/* Let go of our reservation. */
2823	btrfs_free_reserved_data_space(inode, alloc_start,
2824				       alloc_end - alloc_start);
2825	return ret;
2826}
2827
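/*
 * Helper for SEEK_DATA/SEEK_HOLE: starting at *offset, walk the extent maps
 * and stop at the first hole or prealloc extent (SEEK_HOLE) or at the first
 * extent that is neither a hole nor prealloc (SEEK_DATA), then store the
 * resulting file offset back in *offset.
 */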
2828static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2829{
2830	struct btrfs_root *root = BTRFS_I(inode)->root;
2831	struct extent_map *em = NULL;
2832	struct extent_state *cached_state = NULL;
2833	u64 lockstart;
2834	u64 lockend;
2835	u64 start;
2836	u64 len;
2837	int ret = 0;
2838
2839	if (inode->i_size == 0)
2840		return -ENXIO;
2841
2842	/*
2843	 * *offset can be negative; in this case we start finding DATA/HOLE from
2844	 * the very start of the file.
2845	 */
2846	start = max_t(loff_t, 0, *offset);
2847
2848	lockstart = round_down(start, root->sectorsize);
2849	lockend = round_up(i_size_read(inode), root->sectorsize);
2850	if (lockend <= lockstart)
2851		lockend = lockstart + root->sectorsize;
2852	lockend--;
2853	len = lockend - lockstart + 1;
2854
2855	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2856			 &cached_state);
2857
2858	while (start < inode->i_size) {
2859		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
2860		if (IS_ERR(em)) {
2861			ret = PTR_ERR(em);
2862			em = NULL;
2863			break;
2864		}
2865
2866		if (whence == SEEK_HOLE &&
2867		    (em->block_start == EXTENT_MAP_HOLE ||
2868		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2869			break;
2870		else if (whence == SEEK_DATA &&
2871			   (em->block_start != EXTENT_MAP_HOLE &&
2872			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
2873			break;
2874
2875		start = em->start + em->len;
2876		free_extent_map(em);
2877		em = NULL;
2878		cond_resched();
2879	}
2880	free_extent_map(em);
2881	if (!ret) {
2882		if (whence == SEEK_DATA && start >= inode->i_size)
2883			ret = -ENXIO;
2884		else
2885			*offset = min_t(loff_t, start, inode->i_size);
2886	}
2887	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2888			     &cached_state, GFP_NOFS);
2889	return ret;
2890}
2891
2892static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2893{
2894	struct inode *inode = file->f_mapping->host;
2895	int ret;
2896
2897	mutex_lock(&inode->i_mutex);
2898	switch (whence) {
2899	case SEEK_END:
2900	case SEEK_CUR:
2901		offset = generic_file_llseek(file, offset, whence);
2902		goto out;
2903	case SEEK_DATA:
2904	case SEEK_HOLE:
2905		if (offset >= i_size_read(inode)) {
2906			mutex_unlock(&inode->i_mutex);
2907			return -ENXIO;
2908		}
2909
2910		ret = find_desired_extent(inode, &offset, whence);
2911		if (ret) {
2912			mutex_unlock(&inode->i_mutex);
2913			return ret;
2914		}
2915	}
2916
2917	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2918out:
2919	mutex_unlock(&inode->i_mutex);
2920	return offset;
2921}
2922
2923const struct file_operations btrfs_file_operations = {
2924	.llseek		= btrfs_file_llseek,
2925	.read_iter      = generic_file_read_iter,
2926	.splice_read	= generic_file_splice_read,
2927	.write_iter	= btrfs_file_write_iter,
2928	.mmap		= btrfs_file_mmap,
2929	.open		= generic_file_open,
2930	.release	= btrfs_release_file,
2931	.fsync		= btrfs_sync_file,
2932	.fallocate	= btrfs_fallocate,
2933	.unlocked_ioctl	= btrfs_ioctl,
2934#ifdef CONFIG_COMPAT
2935	.compat_ioctl	= btrfs_ioctl,
2936#endif
2937};
2938
2939void btrfs_auto_defrag_exit(void)
2940{
2941	if (btrfs_inode_defrag_cachep)
2942		kmem_cache_destroy(btrfs_inode_defrag_cachep);
2943}
2944
2945int btrfs_auto_defrag_init(void)
2946{
2947	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
2948					sizeof(struct inode_defrag), 0,
2949					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
2950					NULL);
2951	if (!btrfs_inode_defrag_cachep)
2952		return -ENOMEM;
2953
2954	return 0;
2955}
2956
2957int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2958{
2959	int ret;
2960
2961	/*
2962	 * So with compression we will find and lock a dirty page and clear the
2963	 * first one as dirty, set up an async extent, and immediately return
2964	 * with the entire range locked but with nobody actually marked with
2965	 * writeback.  So we can't just filemap_write_and_wait_range() and
2966	 * expect it to work since it will just kick off a thread to do the
2967	 * actual work.  So we need to call filemap_fdatawrite_range _again_
2968	 * since it will wait on the page lock, which won't be unlocked until
2969	 * after the pages have been marked as writeback and so we're good to go
2970	 * from there.  We have to do this, otherwise we'll miss the ordered
2971	 * extents and that results in badness.  Please Josef, do not think you
2972	 * know better and pull this out at some point in the future; it is
2973	 * right and you are wrong.
2974	 */
2975	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2976	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2977			     &BTRFS_I(inode)->runtime_flags))
2978		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2979
2980	return ret;
2981}
2982