1/*
2 * Copyright (C) 2008 Red Hat.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/pagemap.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/math64.h>
23#include <linux/ratelimit.h>
24#include "ctree.h"
25#include "free-space-cache.h"
26#include "transaction.h"
27#include "disk-io.h"
28#include "extent_io.h"
29#include "inode-map.h"
30#include "volumes.h"
31
32#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
33#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
34
35struct btrfs_trim_range {
36	u64 start;
37	u64 bytes;
38	struct list_head list;
39};
40
41static int link_free_space(struct btrfs_free_space_ctl *ctl,
42			   struct btrfs_free_space *info);
43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
44			      struct btrfs_free_space *info);
45
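/*
 * Find the free space header item keyed at 'offset' in the given root, read
 * the inode key stored in it and iget the corresponding inode.  __GFP_FS and
 * __GFP_HIGHMEM are cleared from the mapping's gfp mask so page cache
 * allocations for this inode can't recurse into the filesystem or end up in
 * highmem.
 */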
46static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
47					       struct btrfs_path *path,
48					       u64 offset)
49{
50	struct btrfs_key key;
51	struct btrfs_key location;
52	struct btrfs_disk_key disk_key;
53	struct btrfs_free_space_header *header;
54	struct extent_buffer *leaf;
55	struct inode *inode = NULL;
56	int ret;
57
58	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
59	key.offset = offset;
60	key.type = 0;
61
62	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
63	if (ret < 0)
64		return ERR_PTR(ret);
65	if (ret > 0) {
66		btrfs_release_path(path);
67		return ERR_PTR(-ENOENT);
68	}
69
70	leaf = path->nodes[0];
71	header = btrfs_item_ptr(leaf, path->slots[0],
72				struct btrfs_free_space_header);
73	btrfs_free_space_key(leaf, header, &disk_key);
74	btrfs_disk_key_to_cpu(&location, &disk_key);
75	btrfs_release_path(path);
76
77	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
78	if (!inode)
79		return ERR_PTR(-ENOENT);
80	if (IS_ERR(inode))
81		return inode;
82	if (is_bad_inode(inode)) {
83		iput(inode);
84		return ERR_PTR(-ENOENT);
85	}
86
87	mapping_set_gfp_mask(inode->i_mapping,
88			mapping_gfp_mask(inode->i_mapping) &
89			~(__GFP_FS | __GFP_HIGHMEM));
90
91	return inode;
92}
93
94struct inode *lookup_free_space_inode(struct btrfs_root *root,
95				      struct btrfs_block_group_cache
96				      *block_group, struct btrfs_path *path)
97{
98	struct inode *inode = NULL;
99	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
100
101	spin_lock(&block_group->lock);
102	if (block_group->inode)
103		inode = igrab(block_group->inode);
104	spin_unlock(&block_group->lock);
105	if (inode)
106		return inode;
107
108	inode = __lookup_free_space_inode(root, path,
109					  block_group->key.objectid);
110	if (IS_ERR(inode))
111		return inode;
112
113	spin_lock(&block_group->lock);
114	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
115		btrfs_info(root->fs_info,
116			"Old style space inode found, converting.");
117		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
118			BTRFS_INODE_NODATACOW;
119		block_group->disk_cache_state = BTRFS_DC_CLEAR;
120	}
121
122	if (!block_group->iref) {
123		block_group->inode = igrab(inode);
124		block_group->iref = 1;
125	}
126	spin_unlock(&block_group->lock);
127
128	return inode;
129}
130
131static int __create_free_space_inode(struct btrfs_root *root,
132				     struct btrfs_trans_handle *trans,
133				     struct btrfs_path *path,
134				     u64 ino, u64 offset)
135{
136	struct btrfs_key key;
137	struct btrfs_disk_key disk_key;
138	struct btrfs_free_space_header *header;
139	struct btrfs_inode_item *inode_item;
140	struct extent_buffer *leaf;
141	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
142	int ret;
143
144	ret = btrfs_insert_empty_inode(trans, root, path, ino);
145	if (ret)
146		return ret;
147
148	/* We inline crc's for the free disk space cache */
149	if (ino != BTRFS_FREE_INO_OBJECTID)
150		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
151
152	leaf = path->nodes[0];
153	inode_item = btrfs_item_ptr(leaf, path->slots[0],
154				    struct btrfs_inode_item);
155	btrfs_item_key(leaf, &disk_key, path->slots[0]);
156	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
157			     sizeof(*inode_item));
158	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
159	btrfs_set_inode_size(leaf, inode_item, 0);
160	btrfs_set_inode_nbytes(leaf, inode_item, 0);
161	btrfs_set_inode_uid(leaf, inode_item, 0);
162	btrfs_set_inode_gid(leaf, inode_item, 0);
163	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
164	btrfs_set_inode_flags(leaf, inode_item, flags);
165	btrfs_set_inode_nlink(leaf, inode_item, 1);
166	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
167	btrfs_set_inode_block_group(leaf, inode_item, offset);
168	btrfs_mark_buffer_dirty(leaf);
169	btrfs_release_path(path);
170
171	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
172	key.offset = offset;
173	key.type = 0;
174	ret = btrfs_insert_empty_item(trans, root, path, &key,
175				      sizeof(struct btrfs_free_space_header));
176	if (ret < 0) {
177		btrfs_release_path(path);
178		return ret;
179	}
180
181	leaf = path->nodes[0];
182	header = btrfs_item_ptr(leaf, path->slots[0],
183				struct btrfs_free_space_header);
184	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
185	btrfs_set_free_space_key(leaf, header, &disk_key);
186	btrfs_mark_buffer_dirty(leaf);
187	btrfs_release_path(path);
188
189	return 0;
190}
191
192int create_free_space_inode(struct btrfs_root *root,
193			    struct btrfs_trans_handle *trans,
194			    struct btrfs_block_group_cache *block_group,
195			    struct btrfs_path *path)
196{
197	int ret;
198	u64 ino;
199
200	ret = btrfs_find_free_objectid(root, &ino);
201	if (ret < 0)
202		return ret;
203
204	return __create_free_space_inode(root, trans, path, ino,
205					 block_group->key.objectid);
206}
207
208int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
209				       struct btrfs_block_rsv *rsv)
210{
211	u64 needed_bytes;
212	int ret;
213
214	/* 1 for slack space, 1 for updating the inode */
215	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
216		btrfs_calc_trans_metadata_size(root, 1);
217
218	spin_lock(&rsv->lock);
219	if (rsv->reserved < needed_bytes)
220		ret = -ENOSPC;
221	else
222		ret = 0;
223	spin_unlock(&rsv->lock);
224	return ret;
225}
226
227int btrfs_truncate_free_space_cache(struct btrfs_root *root,
228				    struct btrfs_trans_handle *trans,
229				    struct btrfs_block_group_cache *block_group,
230				    struct inode *inode)
231{
232	int ret = 0;
233	struct btrfs_path *path = btrfs_alloc_path();
234
235	if (!path) {
236		ret = -ENOMEM;
237		goto fail;
238	}
239
240	if (block_group) {
241		mutex_lock(&trans->transaction->cache_write_mutex);
242		if (!list_empty(&block_group->io_list)) {
243			list_del_init(&block_group->io_list);
244
245			btrfs_wait_cache_io(root, trans, block_group,
246					    &block_group->io_ctl, path,
247					    block_group->key.objectid);
248			btrfs_put_block_group(block_group);
249		}
250
		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
255		spin_lock(&block_group->lock);
256		block_group->disk_cache_state = BTRFS_DC_CLEAR;
257		spin_unlock(&block_group->lock);
258	}
259	btrfs_free_path(path);
260
261	btrfs_i_size_write(inode, 0);
262	truncate_pagecache(inode, 0);
263
264	/*
265	 * We don't need an orphan item because truncating the free space cache
266	 * will never be split across transactions.
267	 * We don't need to check for -EAGAIN because we're a free space
268	 * cache inode
269	 */
270	ret = btrfs_truncate_inode_items(trans, root, inode,
271					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret) {
		/* cache_write_mutex is only held when a block group was passed in */
		if (block_group)
			mutex_unlock(&trans->transaction->cache_write_mutex);
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
277
278	ret = btrfs_update_inode(trans, root, inode);
279
280	if (block_group)
281		mutex_unlock(&trans->transaction->cache_write_mutex);
282
283fail:
284	if (ret)
285		btrfs_abort_transaction(trans, root, ret);
286
287	return ret;
288}
289
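/* Kick off readahead across the whole free space cache file. */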
290static int readahead_cache(struct inode *inode)
291{
292	struct file_ra_state *ra;
293	unsigned long last_index;
294
295	ra = kzalloc(sizeof(*ra), GFP_NOFS);
296	if (!ra)
297		return -ENOMEM;
298
299	file_ra_state_init(ra, inode->i_mapping);
300	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
301
302	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
303
304	kfree(ra);
305
306	return 0;
307}
308
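/*
 * Set up the io_ctl used to read or write the cache file: allocate the page
 * pointer array and decide whether per-page crcs are stored (they are for
 * everything except the free ino cache inode).  Writes fail with -ENOSPC if
 * the crcs would not all fit in the first page.
 */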
309static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
310		       struct btrfs_root *root, int write)
311{
312	int num_pages;
313	int check_crcs = 0;
314
315	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
316
317	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
318		check_crcs = 1;
319
320	/* Make sure we can fit our crcs into the first page */
321	if (write && check_crcs &&
322	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
323		return -ENOSPC;
324
325	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
326
327	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
328	if (!io_ctl->pages)
329		return -ENOMEM;
330
331	io_ctl->num_pages = num_pages;
332	io_ctl->root = root;
333	io_ctl->check_crcs = check_crcs;
334	io_ctl->inode = inode;
335
336	return 0;
337}
338
339static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
340{
341	kfree(io_ctl->pages);
342	io_ctl->pages = NULL;
343}
344
345static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
346{
347	if (io_ctl->cur) {
348		io_ctl->cur = NULL;
349		io_ctl->orig = NULL;
350	}
351}
352
353static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
354{
355	ASSERT(io_ctl->index < io_ctl->num_pages);
356	io_ctl->page = io_ctl->pages[io_ctl->index++];
357	io_ctl->cur = page_address(io_ctl->page);
358	io_ctl->orig = io_ctl->cur;
359	io_ctl->size = PAGE_CACHE_SIZE;
360	if (clear)
361		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
362}
363
364static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
365{
366	int i;
367
368	io_ctl_unmap_page(io_ctl);
369
370	for (i = 0; i < io_ctl->num_pages; i++) {
371		if (io_ctl->pages[i]) {
372			ClearPageChecked(io_ctl->pages[i]);
373			unlock_page(io_ctl->pages[i]);
374			page_cache_release(io_ctl->pages[i]);
375		}
376	}
377}
378
379static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
380				int uptodate)
381{
382	struct page *page;
383	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
384	int i;
385
386	for (i = 0; i < io_ctl->num_pages; i++) {
387		page = find_or_create_page(inode->i_mapping, i, mask);
388		if (!page) {
389			io_ctl_drop_pages(io_ctl);
390			return -ENOMEM;
391		}
392		io_ctl->pages[i] = page;
393		if (uptodate && !PageUptodate(page)) {
394			btrfs_readpage(NULL, page);
395			lock_page(page);
396			if (!PageUptodate(page)) {
397				btrfs_err(BTRFS_I(inode)->root->fs_info,
398					   "error reading free space cache");
399				io_ctl_drop_pages(io_ctl);
400				return -EIO;
401			}
402		}
403	}
404
405	for (i = 0; i < io_ctl->num_pages; i++) {
406		clear_page_dirty_for_io(io_ctl->pages[i]);
407		set_page_extent_mapped(io_ctl->pages[i]);
408	}
409
410	return 0;
411}
412
413static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
414{
415	__le64 *val;
416
417	io_ctl_map_page(io_ctl, 1);
418
419	/*
420	 * Skip the csum areas.  If we don't check crcs then we just have a
421	 * 64bit chunk at the front of the first page.
422	 */
423	if (io_ctl->check_crcs) {
424		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
425		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
426	} else {
427		io_ctl->cur += sizeof(u64);
428		io_ctl->size -= sizeof(u64) * 2;
429	}
430
431	val = io_ctl->cur;
432	*val = cpu_to_le64(generation);
433	io_ctl->cur += sizeof(u64);
434}
435
436static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
437{
438	__le64 *gen;
439
440	/*
441	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
442	 * chunk at the front of the first page.
443	 */
444	if (io_ctl->check_crcs) {
445		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
446		io_ctl->size -= sizeof(u64) +
447			(sizeof(u32) * io_ctl->num_pages);
448	} else {
449		io_ctl->cur += sizeof(u64);
450		io_ctl->size -= sizeof(u64) * 2;
451	}
452
453	gen = io_ctl->cur;
454	if (le64_to_cpu(*gen) != generation) {
455		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
456				   "(%Lu) does not match inode (%Lu)\n", *gen,
457				   generation);
458		io_ctl_unmap_page(io_ctl);
459		return -EIO;
460	}
461	io_ctl->cur += sizeof(u64);
462	return 0;
463}
464
465static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
466{
467	u32 *tmp;
468	u32 crc = ~(u32)0;
469	unsigned offset = 0;
470
471	if (!io_ctl->check_crcs) {
472		io_ctl_unmap_page(io_ctl);
473		return;
474	}
475
476	if (index == 0)
477		offset = sizeof(u32) * io_ctl->num_pages;
478
479	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
480			      PAGE_CACHE_SIZE - offset);
481	btrfs_csum_final(crc, (char *)&crc);
482	io_ctl_unmap_page(io_ctl);
483	tmp = page_address(io_ctl->pages[0]);
484	tmp += index;
485	*tmp = crc;
486}
487
488static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
489{
490	u32 *tmp, val;
491	u32 crc = ~(u32)0;
492	unsigned offset = 0;
493
494	if (!io_ctl->check_crcs) {
495		io_ctl_map_page(io_ctl, 0);
496		return 0;
497	}
498
499	if (index == 0)
500		offset = sizeof(u32) * io_ctl->num_pages;
501
502	tmp = page_address(io_ctl->pages[0]);
503	tmp += index;
504	val = *tmp;
505
506	io_ctl_map_page(io_ctl, 0);
507	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
508			      PAGE_CACHE_SIZE - offset);
509	btrfs_csum_final(crc, (char *)&crc);
510	if (val != crc) {
511		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
512				   "space cache\n");
513		io_ctl_unmap_page(io_ctl);
514		return -EIO;
515	}
516
517	return 0;
518}
519
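/*
 * Append one extent or bitmap entry header to the current page, crc'ing the
 * finished page and mapping the next one when the current page fills up.
 */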
520static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
521			    void *bitmap)
522{
523	struct btrfs_free_space_entry *entry;
524
525	if (!io_ctl->cur)
526		return -ENOSPC;
527
528	entry = io_ctl->cur;
529	entry->offset = cpu_to_le64(offset);
530	entry->bytes = cpu_to_le64(bytes);
531	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
532		BTRFS_FREE_SPACE_EXTENT;
533	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
534	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
535
536	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
537		return 0;
538
539	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
540
541	/* No more pages to map */
542	if (io_ctl->index >= io_ctl->num_pages)
543		return 0;
544
545	/* map the next page */
546	io_ctl_map_page(io_ctl, 1);
547	return 0;
548}
549
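/*
 * Write one full page worth of bitmap data.  If the current page has already
 * been partially used for entries, finish it off and start the bitmap on a
 * fresh page.
 */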
550static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
551{
552	if (!io_ctl->cur)
553		return -ENOSPC;
554
555	/*
556	 * If we aren't at the start of the current page, unmap this one and
557	 * map the next one if there is any left.
558	 */
559	if (io_ctl->cur != io_ctl->orig) {
560		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
561		if (io_ctl->index >= io_ctl->num_pages)
562			return -ENOSPC;
563		io_ctl_map_page(io_ctl, 0);
564	}
565
566	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
567	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
568	if (io_ctl->index < io_ctl->num_pages)
569		io_ctl_map_page(io_ctl, 0);
570	return 0;
571}
572
573static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
574{
575	/*
576	 * If we're not on the boundary we know we've modified the page and we
577	 * need to crc the page.
578	 */
579	if (io_ctl->cur != io_ctl->orig)
580		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
581	else
582		io_ctl_unmap_page(io_ctl);
583
584	while (io_ctl->index < io_ctl->num_pages) {
585		io_ctl_map_page(io_ctl, 1);
586		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
587	}
588}
589
590static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
591			    struct btrfs_free_space *entry, u8 *type)
592{
593	struct btrfs_free_space_entry *e;
594	int ret;
595
596	if (!io_ctl->cur) {
597		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
598		if (ret)
599			return ret;
600	}
601
602	e = io_ctl->cur;
603	entry->offset = le64_to_cpu(e->offset);
604	entry->bytes = le64_to_cpu(e->bytes);
605	*type = e->type;
606	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
607	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
608
609	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
610		return 0;
611
612	io_ctl_unmap_page(io_ctl);
613
614	return 0;
615}
616
617static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
618			      struct btrfs_free_space *entry)
619{
620	int ret;
621
622	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
623	if (ret)
624		return ret;
625
626	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
627	io_ctl_unmap_page(io_ctl);
628
629	return 0;
630}
631
/*
 * Since we attach pinned extents after the fact, we can end up with contiguous
 * sections of free space that are split up into several entries.  This poses a
 * problem for tree logging, because log replay could have allocated across what
 * looks like two entries here, since the entries would have been merged when
 * the pinned extents were added back to the free space cache.  So run through
 * the space cache we just loaded and merge contiguous entries.  This keeps log
 * replay from blowing up and makes for nicer allocator behavior.
 */
641static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
642{
643	struct btrfs_free_space *e, *prev = NULL;
644	struct rb_node *n;
645
646again:
647	spin_lock(&ctl->tree_lock);
648	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
649		e = rb_entry(n, struct btrfs_free_space, offset_index);
650		if (!prev)
651			goto next;
652		if (e->bitmap || prev->bitmap)
653			goto next;
654		if (prev->offset + prev->bytes == e->offset) {
655			unlink_free_space(ctl, prev);
656			unlink_free_space(ctl, e);
657			prev->bytes += e->bytes;
658			kmem_cache_free(btrfs_free_space_cachep, e);
659			link_free_space(ctl, prev);
660			prev = NULL;
661			spin_unlock(&ctl->tree_lock);
662			goto again;
663		}
664next:
665		prev = e;
666	}
667	spin_unlock(&ctl->tree_lock);
668}
669
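/*
 * Read the on-disk cache file that belongs to 'offset' and populate 'ctl' with
 * its extent and bitmap entries.  Returns 1 if the cache was loaded, 0 if it
 * should simply be ignored and rebuilt, and a negative value on error.
 */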
670static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
671				   struct btrfs_free_space_ctl *ctl,
672				   struct btrfs_path *path, u64 offset)
673{
674	struct btrfs_free_space_header *header;
675	struct extent_buffer *leaf;
676	struct btrfs_io_ctl io_ctl;
677	struct btrfs_key key;
678	struct btrfs_free_space *e, *n;
679	LIST_HEAD(bitmaps);
680	u64 num_entries;
681	u64 num_bitmaps;
682	u64 generation;
683	u8 type;
684	int ret = 0;
685
686	/* Nothing in the space cache, goodbye */
687	if (!i_size_read(inode))
688		return 0;
689
690	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
691	key.offset = offset;
692	key.type = 0;
693
694	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
695	if (ret < 0)
696		return 0;
697	else if (ret > 0) {
698		btrfs_release_path(path);
699		return 0;
700	}
701
702	ret = -1;
703
704	leaf = path->nodes[0];
705	header = btrfs_item_ptr(leaf, path->slots[0],
706				struct btrfs_free_space_header);
707	num_entries = btrfs_free_space_entries(leaf, header);
708	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
709	generation = btrfs_free_space_generation(leaf, header);
710	btrfs_release_path(path);
711
712	if (!BTRFS_I(inode)->generation) {
713		btrfs_info(root->fs_info,
			   "the free space cache file (%llu) is invalid, skipping it",
715			   offset);
716		return 0;
717	}
718
719	if (BTRFS_I(inode)->generation != generation) {
720		btrfs_err(root->fs_info,
721			"free space inode generation (%llu) "
722			"did not match free space cache generation (%llu)",
723			BTRFS_I(inode)->generation, generation);
724		return 0;
725	}
726
727	if (!num_entries)
728		return 0;
729
730	ret = io_ctl_init(&io_ctl, inode, root, 0);
731	if (ret)
732		return ret;
733
734	ret = readahead_cache(inode);
735	if (ret)
736		goto out;
737
738	ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
739	if (ret)
740		goto out;
741
742	ret = io_ctl_check_crc(&io_ctl, 0);
743	if (ret)
744		goto free_cache;
745
746	ret = io_ctl_check_generation(&io_ctl, generation);
747	if (ret)
748		goto free_cache;
749
750	while (num_entries) {
751		e = kmem_cache_zalloc(btrfs_free_space_cachep,
752				      GFP_NOFS);
753		if (!e)
754			goto free_cache;
755
756		ret = io_ctl_read_entry(&io_ctl, e, &type);
757		if (ret) {
758			kmem_cache_free(btrfs_free_space_cachep, e);
759			goto free_cache;
760		}
761
762		if (!e->bytes) {
763			kmem_cache_free(btrfs_free_space_cachep, e);
764			goto free_cache;
765		}
766
767		if (type == BTRFS_FREE_SPACE_EXTENT) {
768			spin_lock(&ctl->tree_lock);
769			ret = link_free_space(ctl, e);
770			spin_unlock(&ctl->tree_lock);
771			if (ret) {
772				btrfs_err(root->fs_info,
773					"Duplicate entries in free space cache, dumping");
774				kmem_cache_free(btrfs_free_space_cachep, e);
775				goto free_cache;
776			}
777		} else {
778			ASSERT(num_bitmaps);
779			num_bitmaps--;
780			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
781			if (!e->bitmap) {
782				kmem_cache_free(
783					btrfs_free_space_cachep, e);
784				goto free_cache;
785			}
786			spin_lock(&ctl->tree_lock);
787			ret = link_free_space(ctl, e);
788			ctl->total_bitmaps++;
789			ctl->op->recalc_thresholds(ctl);
790			spin_unlock(&ctl->tree_lock);
791			if (ret) {
792				btrfs_err(root->fs_info,
793					"Duplicate entries in free space cache, dumping");
794				kmem_cache_free(btrfs_free_space_cachep, e);
795				goto free_cache;
796			}
797			list_add_tail(&e->list, &bitmaps);
798		}
799
800		num_entries--;
801	}
802
803	io_ctl_unmap_page(&io_ctl);
804
	/*
	 * The bitmap data is stored after all the entries, in the same order
	 * that the bitmap entries were added to the cache, so read it back in
	 * that order.
	 */
809	list_for_each_entry_safe(e, n, &bitmaps, list) {
810		list_del_init(&e->list);
811		ret = io_ctl_read_bitmap(&io_ctl, e);
812		if (ret)
813			goto free_cache;
814	}
815
816	io_ctl_drop_pages(&io_ctl);
817	merge_space_tree(ctl);
818	ret = 1;
819out:
820	io_ctl_free(&io_ctl);
821	return ret;
822free_cache:
823	io_ctl_drop_pages(&io_ctl);
824	__btrfs_remove_free_space_cache(ctl);
825	goto out;
826}
827
828int load_free_space_cache(struct btrfs_fs_info *fs_info,
829			  struct btrfs_block_group_cache *block_group)
830{
831	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
832	struct btrfs_root *root = fs_info->tree_root;
833	struct inode *inode;
834	struct btrfs_path *path;
835	int ret = 0;
836	bool matched;
837	u64 used = btrfs_block_group_used(&block_group->item);
838
839	/*
840	 * If this block group has been marked to be cleared for one reason or
841	 * another then we can't trust the on disk cache, so just return.
842	 */
843	spin_lock(&block_group->lock);
844	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
845		spin_unlock(&block_group->lock);
846		return 0;
847	}
848	spin_unlock(&block_group->lock);
849
850	path = btrfs_alloc_path();
851	if (!path)
852		return 0;
853	path->search_commit_root = 1;
854	path->skip_locking = 1;
855
856	inode = lookup_free_space_inode(root, block_group, path);
857	if (IS_ERR(inode)) {
858		btrfs_free_path(path);
859		return 0;
860	}
861
862	/* We may have converted the inode and made the cache invalid. */
863	spin_lock(&block_group->lock);
864	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
865		spin_unlock(&block_group->lock);
866		btrfs_free_path(path);
867		goto out;
868	}
869	spin_unlock(&block_group->lock);
870
871	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
872				      path, block_group->key.objectid);
873	btrfs_free_path(path);
874	if (ret <= 0)
875		goto out;
876
877	spin_lock(&ctl->tree_lock);
878	matched = (ctl->free_space == (block_group->key.offset - used -
879				       block_group->bytes_super));
880	spin_unlock(&ctl->tree_lock);
881
882	if (!matched) {
883		__btrfs_remove_free_space_cache(ctl);
884		btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
885			block_group->key.objectid);
886		ret = -1;
887	}
888out:
889	if (ret < 0) {
890		/* This cache is bogus, make sure it gets cleared */
891		spin_lock(&block_group->lock);
892		block_group->disk_cache_state = BTRFS_DC_CLEAR;
893		spin_unlock(&block_group->lock);
894		ret = 0;
895
		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
897			block_group->key.objectid);
898	}
899
900	iput(inode);
901	return ret;
902}
903
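/*
 * Write every extent and bitmap entry currently in the free space ctl (plus
 * any entries in this block group's cluster and any ranges currently being
 * trimmed) into the io_ctl pages.  Bitmap entries are queued on 'bitmap_list'
 * so their bitmap data can be written out afterwards.
 */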
904static noinline_for_stack
905int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
906			      struct btrfs_free_space_ctl *ctl,
907			      struct btrfs_block_group_cache *block_group,
908			      int *entries, int *bitmaps,
909			      struct list_head *bitmap_list)
910{
911	int ret;
912	struct btrfs_free_cluster *cluster = NULL;
913	struct btrfs_free_cluster *cluster_locked = NULL;
914	struct rb_node *node = rb_first(&ctl->free_space_offset);
915	struct btrfs_trim_range *trim_entry;
916
917	/* Get the cluster for this block_group if it exists */
918	if (block_group && !list_empty(&block_group->cluster_list)) {
919		cluster = list_entry(block_group->cluster_list.next,
920				     struct btrfs_free_cluster,
921				     block_group_list);
922	}
923
924	if (!node && cluster) {
925		cluster_locked = cluster;
926		spin_lock(&cluster_locked->lock);
927		node = rb_first(&cluster->root);
928		cluster = NULL;
929	}
930
931	/* Write out the extent entries */
932	while (node) {
933		struct btrfs_free_space *e;
934
935		e = rb_entry(node, struct btrfs_free_space, offset_index);
936		*entries += 1;
937
938		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
939				       e->bitmap);
940		if (ret)
941			goto fail;
942
943		if (e->bitmap) {
944			list_add_tail(&e->list, bitmap_list);
945			*bitmaps += 1;
946		}
947		node = rb_next(node);
948		if (!node && cluster) {
949			node = rb_first(&cluster->root);
950			cluster_locked = cluster;
951			spin_lock(&cluster_locked->lock);
952			cluster = NULL;
953		}
954	}
955	if (cluster_locked) {
956		spin_unlock(&cluster_locked->lock);
957		cluster_locked = NULL;
958	}
959
960	/*
961	 * Make sure we don't miss any range that was removed from our rbtree
962	 * because trimming is running. Otherwise after a umount+mount (or crash
963	 * after committing the transaction) we would leak free space and get
964	 * an inconsistent free space cache report from fsck.
965	 */
966	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
967		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
968				       trim_entry->bytes, NULL);
969		if (ret)
970			goto fail;
971		*entries += 1;
972	}
973
974	return 0;
975fail:
976	if (cluster_locked)
977		spin_unlock(&cluster_locked->lock);
978	return -ENOSPC;
979}
980
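/*
 * Find the free space header item for 'offset' and update its entry and
 * bitmap counts and its generation, so readers can tell whether the cache
 * file matches this transaction.
 */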
981static noinline_for_stack int
982update_cache_item(struct btrfs_trans_handle *trans,
983		  struct btrfs_root *root,
984		  struct inode *inode,
985		  struct btrfs_path *path, u64 offset,
986		  int entries, int bitmaps)
987{
988	struct btrfs_key key;
989	struct btrfs_free_space_header *header;
990	struct extent_buffer *leaf;
991	int ret;
992
993	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
994	key.offset = offset;
995	key.type = 0;
996
997	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
998	if (ret < 0) {
999		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1000				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1001				 GFP_NOFS);
1002		goto fail;
1003	}
1004	leaf = path->nodes[0];
1005	if (ret > 0) {
1006		struct btrfs_key found_key;
1007		ASSERT(path->slots[0]);
1008		path->slots[0]--;
1009		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1010		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1011		    found_key.offset != offset) {
1012			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1013					 inode->i_size - 1,
1014					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
1015					 NULL, GFP_NOFS);
1016			btrfs_release_path(path);
1017			goto fail;
1018		}
1019	}
1020
1021	BTRFS_I(inode)->generation = trans->transid;
1022	header = btrfs_item_ptr(leaf, path->slots[0],
1023				struct btrfs_free_space_header);
1024	btrfs_set_free_space_entries(leaf, header, entries);
1025	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1026	btrfs_set_free_space_generation(leaf, header, trans->transid);
1027	btrfs_mark_buffer_dirty(leaf);
1028	btrfs_release_path(path);
1029
1030	return 0;
1031
1032fail:
1033	return -1;
1034}
1035
1036static noinline_for_stack int
1037write_pinned_extent_entries(struct btrfs_root *root,
1038			    struct btrfs_block_group_cache *block_group,
1039			    struct btrfs_io_ctl *io_ctl,
1040			    int *entries)
1041{
1042	u64 start, extent_start, extent_end, len;
1043	struct extent_io_tree *unpin = NULL;
1044	int ret;
1045
1046	if (!block_group)
1047		return 0;
1048
	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space.
	 *
	 * We shouldn't have switched the pinned extents yet, so this is the
	 * right one.
	 */
1056	unpin = root->fs_info->pinned_extents;
1057
1058	start = block_group->key.objectid;
1059
1060	while (start < block_group->key.objectid + block_group->key.offset) {
1061		ret = find_first_extent_bit(unpin, start,
1062					    &extent_start, &extent_end,
1063					    EXTENT_DIRTY, NULL);
1064		if (ret)
1065			return 0;
1066
1067		/* This pinned extent is out of our range */
1068		if (extent_start >= block_group->key.objectid +
1069		    block_group->key.offset)
1070			return 0;
1071
1072		extent_start = max(extent_start, start);
1073		extent_end = min(block_group->key.objectid +
1074				 block_group->key.offset, extent_end + 1);
1075		len = extent_end - extent_start;
1076
1077		*entries += 1;
1078		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1079		if (ret)
1080			return -ENOSPC;
1081
1082		start = extent_end;
1083	}
1084
1085	return 0;
1086}
1087
1088static noinline_for_stack int
1089write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1090{
1091	struct list_head *pos, *n;
1092	int ret;
1093
1094	/* Write out the bitmaps */
1095	list_for_each_safe(pos, n, bitmap_list) {
1096		struct btrfs_free_space *entry =
1097			list_entry(pos, struct btrfs_free_space, list);
1098
1099		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1100		if (ret)
1101			return -ENOSPC;
1102		list_del_init(&entry->list);
1103	}
1104
1105	return 0;
1106}
1107
1108static int flush_dirty_cache(struct inode *inode)
1109{
1110	int ret;
1111
1112	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1113	if (ret)
1114		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1115				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1116				 GFP_NOFS);
1117
1118	return ret;
1119}
1120
1121static void noinline_for_stack
1122cleanup_bitmap_list(struct list_head *bitmap_list)
1123{
1124	struct list_head *pos, *n;
1125
1126	list_for_each_safe(pos, n, bitmap_list) {
1127		struct btrfs_free_space *entry =
1128			list_entry(pos, struct btrfs_free_space, list);
1129		list_del_init(&entry->list);
1130	}
1131}
1132
1133static void noinline_for_stack
1134cleanup_write_cache_enospc(struct inode *inode,
1135			   struct btrfs_io_ctl *io_ctl,
1136			   struct extent_state **cached_state,
1137			   struct list_head *bitmap_list)
1138{
1139	io_ctl_drop_pages(io_ctl);
1140	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1141			     i_size_read(inode) - 1, cached_state,
1142			     GFP_NOFS);
1143}
1144
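/*
 * Wait for the cache file's I/O to finish and then update the on-disk free
 * space header item so the cache file is considered valid for this
 * transaction.
 */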
1145int btrfs_wait_cache_io(struct btrfs_root *root,
1146			struct btrfs_trans_handle *trans,
1147			struct btrfs_block_group_cache *block_group,
1148			struct btrfs_io_ctl *io_ctl,
1149			struct btrfs_path *path, u64 offset)
1150{
1151	int ret;
1152	struct inode *inode = io_ctl->inode;
1153
1154	if (!inode)
1155		return 0;
1156
1157	if (block_group)
1158		root = root->fs_info->tree_root;
1159
1160	/* Flush the dirty pages in the cache file. */
1161	ret = flush_dirty_cache(inode);
1162	if (ret)
1163		goto out;
1164
1165	/* Update the cache item to tell everyone this cache file is valid. */
1166	ret = update_cache_item(trans, root, inode, path, offset,
1167				io_ctl->entries, io_ctl->bitmaps);
1168out:
1169	io_ctl_free(io_ctl);
1170	if (ret) {
1171		invalidate_inode_pages2(inode->i_mapping);
1172		BTRFS_I(inode)->generation = 0;
1173		if (block_group) {
1174#ifdef DEBUG
1175			btrfs_err(root->fs_info,
1176				"failed to write free space cache for block group %llu",
1177				block_group->key.objectid);
1178#endif
1179		}
1180	}
1181	btrfs_update_inode(trans, root, inode);
1182
1183	if (block_group) {
1184		/* the dirty list is protected by the dirty_bgs_lock */
1185		spin_lock(&trans->transaction->dirty_bgs_lock);
1186
1187		/* the disk_cache_state is protected by the block group lock */
1188		spin_lock(&block_group->lock);
1189
1190		/*
1191		 * only mark this as written if we didn't get put back on
1192		 * the dirty list while waiting for IO.   Otherwise our
1193		 * cache state won't be right, and we won't get written again
1194		 */
1195		if (!ret && list_empty(&block_group->dirty_list))
1196			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1197		else if (ret)
1198			block_group->disk_cache_state = BTRFS_DC_ERROR;
1199
1200		spin_unlock(&block_group->lock);
1201		spin_unlock(&trans->transaction->dirty_bgs_lock);
1202		io_ctl->inode = NULL;
1203		iput(inode);
1204	}
1205
1206	return ret;
1207
1208}
1209
/**
 * __btrfs_write_out_cache - write out cached info to an inode
 * @root - the root the inode belongs to
 * @inode - the inode of the free space cache file
 * @ctl - the free space cache we are going to write out
 * @block_group - the block_group for this cache if it belongs to a block_group
 * @io_ctl - the io_ctl struct used to track the pages for this write
 * @trans - the trans handle
 * @path - the path to use
 * @offset - the offset for the key we'll insert
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
1223static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1224				   struct btrfs_free_space_ctl *ctl,
1225				   struct btrfs_block_group_cache *block_group,
1226				   struct btrfs_io_ctl *io_ctl,
1227				   struct btrfs_trans_handle *trans,
1228				   struct btrfs_path *path, u64 offset)
1229{
1230	struct extent_state *cached_state = NULL;
1231	LIST_HEAD(bitmap_list);
1232	int entries = 0;
1233	int bitmaps = 0;
1234	int ret;
1235	int must_iput = 0;
1236
1237	if (!i_size_read(inode))
1238		return -EIO;
1239
1240	WARN_ON(io_ctl->pages);
1241	ret = io_ctl_init(io_ctl, inode, root, 1);
1242	if (ret)
1243		return ret;
1244
1245	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1246		down_write(&block_group->data_rwsem);
1247		spin_lock(&block_group->lock);
1248		if (block_group->delalloc_bytes) {
1249			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1250			spin_unlock(&block_group->lock);
1251			up_write(&block_group->data_rwsem);
1252			BTRFS_I(inode)->generation = 0;
1253			ret = 0;
1254			must_iput = 1;
1255			goto out;
1256		}
1257		spin_unlock(&block_group->lock);
1258	}
1259
1260	/* Lock all pages first so we can lock the extent safely. */
1261	ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1262	if (ret)
1263		goto out;
1264
1265	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1266			 0, &cached_state);
1267
1268	io_ctl_set_generation(io_ctl, trans->transid);
1269
1270	mutex_lock(&ctl->cache_writeout_mutex);
1271	/* Write out the extent entries in the free space cache */
1272	spin_lock(&ctl->tree_lock);
1273	ret = write_cache_extent_entries(io_ctl, ctl,
1274					 block_group, &entries, &bitmaps,
1275					 &bitmap_list);
1276	if (ret)
1277		goto out_nospc_locked;
1278
	/*
	 * Some spaces that are freed in the current transaction are pinned;
	 * they will be added into the free space cache after the transaction
	 * is committed, and we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed.
	 */
1287	ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
1288	if (ret)
1289		goto out_nospc_locked;
1290
1291	/*
1292	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1293	 * locked while doing it because a concurrent trim can be manipulating
1294	 * or freeing the bitmap.
1295	 */
1296	ret = write_bitmap_entries(io_ctl, &bitmap_list);
1297	spin_unlock(&ctl->tree_lock);
1298	mutex_unlock(&ctl->cache_writeout_mutex);
1299	if (ret)
1300		goto out_nospc;
1301
1302	/* Zero out the rest of the pages just to make sure */
1303	io_ctl_zero_remaining_pages(io_ctl);
1304
1305	/* Everything is written out, now we dirty the pages in the file. */
1306	ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1307				0, i_size_read(inode), &cached_state);
1308	if (ret)
1309		goto out_nospc;
1310
1311	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1312		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent; we will flush
	 * them out later.
	 */
1317	io_ctl_drop_pages(io_ctl);
1318
1319	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1320			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1321
	/*
	 * At this point the pages are under IO and we're happy.  The caller
	 * is responsible for waiting on them and updating the cache and the
	 * inode.
	 */
1327	io_ctl->entries = entries;
1328	io_ctl->bitmaps = bitmaps;
1329
1330	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1331	if (ret)
1332		goto out;
1333
1334	return 0;
1335
1336out:
1337	io_ctl->inode = NULL;
1338	io_ctl_free(io_ctl);
1339	if (ret) {
1340		invalidate_inode_pages2(inode->i_mapping);
1341		BTRFS_I(inode)->generation = 0;
1342	}
1343	btrfs_update_inode(trans, root, inode);
1344	if (must_iput)
1345		iput(inode);
1346	return ret;
1347
1348out_nospc_locked:
1349	cleanup_bitmap_list(&bitmap_list);
1350	spin_unlock(&ctl->tree_lock);
1351	mutex_unlock(&ctl->cache_writeout_mutex);
1352
1353out_nospc:
1354	cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
1355
1356	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1357		up_write(&block_group->data_rwsem);
1358
1359	goto out;
1360}
1361
1362int btrfs_write_out_cache(struct btrfs_root *root,
1363			  struct btrfs_trans_handle *trans,
1364			  struct btrfs_block_group_cache *block_group,
1365			  struct btrfs_path *path)
1366{
1367	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1368	struct inode *inode;
1369	int ret = 0;
1370
1371	root = root->fs_info->tree_root;
1372
1373	spin_lock(&block_group->lock);
1374	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1375		spin_unlock(&block_group->lock);
1376		return 0;
1377	}
1378	spin_unlock(&block_group->lock);
1379
1380	inode = lookup_free_space_inode(root, block_group, path);
1381	if (IS_ERR(inode))
1382		return 0;
1383
1384	ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
1385				      &block_group->io_ctl, trans,
1386				      path, block_group->key.objectid);
1387	if (ret) {
1388#ifdef DEBUG
1389		btrfs_err(root->fs_info,
1390			"failed to write free space cache for block group %llu",
1391			block_group->key.objectid);
1392#endif
1393		spin_lock(&block_group->lock);
1394		block_group->disk_cache_state = BTRFS_DC_ERROR;
1395		spin_unlock(&block_group->lock);
1396
1397		block_group->io_ctl.inode = NULL;
1398		iput(inode);
1399	}
1400
1401	/*
1402	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1403	 * to wait for IO and put the inode
1404	 */
1405
1406	return ret;
1407}
1408
1409static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1410					  u64 offset)
1411{
1412	ASSERT(offset >= bitmap_start);
1413	offset -= bitmap_start;
1414	return (unsigned long)(div_u64(offset, unit));
1415}
1416
1417static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1418{
1419	return (unsigned long)(div_u64(bytes, unit));
1420}
1421
1422static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1423				   u64 offset)
1424{
1425	u64 bitmap_start;
1426	u32 bytes_per_bitmap;
1427
1428	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1429	bitmap_start = offset - ctl->start;
1430	bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
1431	bitmap_start *= bytes_per_bitmap;
1432	bitmap_start += ctl->start;
1433
1434	return bitmap_start;
1435}
1436
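/*
 * Insert a free space entry into the offset-sorted rbtree.  An extent entry
 * and a bitmap entry may share an offset; the extent entry always sorts first.
 */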
1437static int tree_insert_offset(struct rb_root *root, u64 offset,
1438			      struct rb_node *node, int bitmap)
1439{
1440	struct rb_node **p = &root->rb_node;
1441	struct rb_node *parent = NULL;
1442	struct btrfs_free_space *info;
1443
1444	while (*p) {
1445		parent = *p;
1446		info = rb_entry(parent, struct btrfs_free_space, offset_index);
1447
1448		if (offset < info->offset) {
1449			p = &(*p)->rb_left;
1450		} else if (offset > info->offset) {
1451			p = &(*p)->rb_right;
1452		} else {
1453			/*
1454			 * we could have a bitmap entry and an extent entry
1455			 * share the same offset.  If this is the case, we want
1456			 * the extent entry to always be found first if we do a
1457			 * linear search through the tree, since we want to have
1458			 * the quickest allocation time, and allocating from an
1459			 * extent is faster than allocating from a bitmap.  So
1460			 * if we're inserting a bitmap and we find an entry at
1461			 * this offset, we want to go right, or after this entry
1462			 * logically.  If we are inserting an extent and we've
1463			 * found a bitmap, we want to go left, or before
1464			 * logically.
1465			 */
1466			if (bitmap) {
1467				if (info->bitmap) {
1468					WARN_ON_ONCE(1);
1469					return -EEXIST;
1470				}
1471				p = &(*p)->rb_right;
1472			} else {
1473				if (!info->bitmap) {
1474					WARN_ON_ONCE(1);
1475					return -EEXIST;
1476				}
1477				p = &(*p)->rb_left;
1478			}
1479		}
1480	}
1481
1482	rb_link_node(node, parent, p);
1483	rb_insert_color(node, root);
1484
1485	return 0;
1486}
1487
1488/*
1489 * searches the tree for the given offset.
1490 *
1491 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1492 * want a section that has at least bytes size and comes at or after the given
1493 * offset.
1494 */
1495static struct btrfs_free_space *
1496tree_search_offset(struct btrfs_free_space_ctl *ctl,
1497		   u64 offset, int bitmap_only, int fuzzy)
1498{
1499	struct rb_node *n = ctl->free_space_offset.rb_node;
1500	struct btrfs_free_space *entry, *prev = NULL;
1501
1502	/* find entry that is closest to the 'offset' */
1503	while (1) {
1504		if (!n) {
1505			entry = NULL;
1506			break;
1507		}
1508
1509		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1510		prev = entry;
1511
1512		if (offset < entry->offset)
1513			n = n->rb_left;
1514		else if (offset > entry->offset)
1515			n = n->rb_right;
1516		else
1517			break;
1518	}
1519
1520	if (bitmap_only) {
1521		if (!entry)
1522			return NULL;
1523		if (entry->bitmap)
1524			return entry;
1525
		/*
		 * A bitmap entry and an extent entry may share the same offset.
		 * In that case, the bitmap entry comes after the extent entry.
		 */
1530		n = rb_next(n);
1531		if (!n)
1532			return NULL;
1533		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1534		if (entry->offset != offset)
1535			return NULL;
1536
1537		WARN_ON(!entry->bitmap);
1538		return entry;
1539	} else if (entry) {
1540		if (entry->bitmap) {
1541			/*
1542			 * if previous extent entry covers the offset,
1543			 * we should return it instead of the bitmap entry
1544			 */
1545			n = rb_prev(&entry->offset_index);
1546			if (n) {
1547				prev = rb_entry(n, struct btrfs_free_space,
1548						offset_index);
1549				if (!prev->bitmap &&
1550				    prev->offset + prev->bytes > offset)
1551					entry = prev;
1552			}
1553		}
1554		return entry;
1555	}
1556
1557	if (!prev)
1558		return NULL;
1559
1560	/* find last entry before the 'offset' */
1561	entry = prev;
1562	if (entry->offset > offset) {
1563		n = rb_prev(&entry->offset_index);
1564		if (n) {
1565			entry = rb_entry(n, struct btrfs_free_space,
1566					offset_index);
1567			ASSERT(entry->offset <= offset);
1568		} else {
1569			if (fuzzy)
1570				return entry;
1571			else
1572				return NULL;
1573		}
1574	}
1575
1576	if (entry->bitmap) {
1577		n = rb_prev(&entry->offset_index);
1578		if (n) {
1579			prev = rb_entry(n, struct btrfs_free_space,
1580					offset_index);
1581			if (!prev->bitmap &&
1582			    prev->offset + prev->bytes > offset)
1583				return prev;
1584		}
1585		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1586			return entry;
1587	} else if (entry->offset + entry->bytes > offset)
1588		return entry;
1589
1590	if (!fuzzy)
1591		return NULL;
1592
1593	while (1) {
1594		if (entry->bitmap) {
1595			if (entry->offset + BITS_PER_BITMAP *
1596			    ctl->unit > offset)
1597				break;
1598		} else {
1599			if (entry->offset + entry->bytes > offset)
1600				break;
1601		}
1602
1603		n = rb_next(&entry->offset_index);
1604		if (!n)
1605			return NULL;
1606		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1607	}
1608	return entry;
1609}
1610
1611static inline void
1612__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1613		    struct btrfs_free_space *info)
1614{
1615	rb_erase(&info->offset_index, &ctl->free_space_offset);
1616	ctl->free_extents--;
1617}
1618
1619static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1620			      struct btrfs_free_space *info)
1621{
1622	__unlink_free_space(ctl, info);
1623	ctl->free_space -= info->bytes;
1624}
1625
1626static int link_free_space(struct btrfs_free_space_ctl *ctl,
1627			   struct btrfs_free_space *info)
1628{
1629	int ret = 0;
1630
1631	ASSERT(info->bytes || info->bitmap);
1632	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1633				 &info->offset_index, (info->bitmap != NULL));
1634	if (ret)
1635		return ret;
1636
1637	ctl->free_space += info->bytes;
1638	ctl->free_extents++;
1639	return ret;
1640}
1641
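/*
 * Recompute ctl->extents_thresh so that the memory used by extent entries
 * plus bitmaps stays within the MAX_CACHE_BYTES_PER_GIG budget for this
 * block group.
 */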
1642static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1643{
1644	struct btrfs_block_group_cache *block_group = ctl->private;
1645	u64 max_bytes;
1646	u64 bitmap_bytes;
1647	u64 extent_bytes;
1648	u64 size = block_group->key.offset;
1649	u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1650	u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
1651
1652	max_bitmaps = max_t(u32, max_bitmaps, 1);
1653
1654	ASSERT(ctl->total_bitmaps <= max_bitmaps);
1655
1656	/*
1657	 * The goal is to keep the total amount of memory used per 1gb of space
1658	 * at or below 32k, so we need to adjust how much memory we allow to be
1659	 * used by extent based free space tracking
1660	 */
1661	if (size < 1024 * 1024 * 1024)
1662		max_bytes = MAX_CACHE_BYTES_PER_GIG;
1663	else
1664		max_bytes = MAX_CACHE_BYTES_PER_GIG *
1665			div_u64(size, 1024 * 1024 * 1024);
1666
1667	/*
1668	 * we want to account for 1 more bitmap than what we have so we can make
1669	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1670	 * we add more bitmaps.
1671	 */
1672	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
1673
1674	if (bitmap_bytes >= max_bytes) {
1675		ctl->extents_thresh = 0;
1676		return;
1677	}
1678
1679	/*
1680	 * we want the extent entry threshold to always be at most 1/2 the max
1681	 * bytes we can have, or whatever is less than that.
1682	 */
1683	extent_bytes = max_bytes - bitmap_bytes;
1684	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1685
1686	ctl->extents_thresh =
1687		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1688}
1689
1690static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1691				       struct btrfs_free_space *info,
1692				       u64 offset, u64 bytes)
1693{
1694	unsigned long start, count;
1695
1696	start = offset_to_bit(info->offset, ctl->unit, offset);
1697	count = bytes_to_bits(bytes, ctl->unit);
1698	ASSERT(start + count <= BITS_PER_BITMAP);
1699
1700	bitmap_clear(info->bitmap, start, count);
1701
1702	info->bytes -= bytes;
1703}
1704
1705static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1706			      struct btrfs_free_space *info, u64 offset,
1707			      u64 bytes)
1708{
1709	__bitmap_clear_bits(ctl, info, offset, bytes);
1710	ctl->free_space -= bytes;
1711}
1712
1713static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1714			    struct btrfs_free_space *info, u64 offset,
1715			    u64 bytes)
1716{
1717	unsigned long start, count;
1718
1719	start = offset_to_bit(info->offset, ctl->unit, offset);
1720	count = bytes_to_bits(bytes, ctl->unit);
1721	ASSERT(start + count <= BITS_PER_BITMAP);
1722
1723	bitmap_set(info->bitmap, start, count);
1724
1725	info->bytes += bytes;
1726	ctl->free_space += bytes;
1727}
1728
/*
 * If we cannot find a suitable extent, use *bytes to record the size of
 * the largest free extent we did find.
 */
1733static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1734			 struct btrfs_free_space *bitmap_info, u64 *offset,
1735			 u64 *bytes)
1736{
1737	unsigned long found_bits = 0;
1738	unsigned long max_bits = 0;
1739	unsigned long bits, i;
1740	unsigned long next_zero;
1741	unsigned long extent_bits;
1742
1743	i = offset_to_bit(bitmap_info->offset, ctl->unit,
1744			  max_t(u64, *offset, bitmap_info->offset));
1745	bits = bytes_to_bits(*bytes, ctl->unit);
1746
1747	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1748		next_zero = find_next_zero_bit(bitmap_info->bitmap,
1749					       BITS_PER_BITMAP, i);
1750		extent_bits = next_zero - i;
1751		if (extent_bits >= bits) {
1752			found_bits = extent_bits;
1753			break;
1754		} else if (extent_bits > max_bits) {
1755			max_bits = extent_bits;
1756		}
1757		i = next_zero;
1758	}
1759
1760	if (found_bits) {
1761		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1762		*bytes = (u64)(found_bits) * ctl->unit;
1763		return 0;
1764	}
1765
1766	*bytes = (u64)(max_bits) * ctl->unit;
1767	return -1;
1768}
1769
1770/* Cache the size of the max extent in bytes */
1771static struct btrfs_free_space *
1772find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1773		unsigned long align, u64 *max_extent_size)
1774{
1775	struct btrfs_free_space *entry;
1776	struct rb_node *node;
1777	u64 tmp;
1778	u64 align_off;
1779	int ret;
1780
1781	if (!ctl->free_space_offset.rb_node)
1782		goto out;
1783
1784	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1785	if (!entry)
1786		goto out;
1787
1788	for (node = &entry->offset_index; node; node = rb_next(node)) {
1789		entry = rb_entry(node, struct btrfs_free_space, offset_index);
1790		if (entry->bytes < *bytes) {
1791			if (entry->bytes > *max_extent_size)
1792				*max_extent_size = entry->bytes;
1793			continue;
1794		}
1795
1796		/* make sure the space returned is big enough
1797		 * to match our requested alignment
1798		 */
1799		if (*bytes >= align) {
1800			tmp = entry->offset - ctl->start + align - 1;
1801			tmp = div64_u64(tmp, align);
1802			tmp = tmp * align + ctl->start;
1803			align_off = tmp - entry->offset;
1804		} else {
1805			align_off = 0;
1806			tmp = entry->offset;
1807		}
1808
1809		if (entry->bytes < *bytes + align_off) {
1810			if (entry->bytes > *max_extent_size)
1811				*max_extent_size = entry->bytes;
1812			continue;
1813		}
1814
1815		if (entry->bitmap) {
1816			u64 size = *bytes;
1817
1818			ret = search_bitmap(ctl, entry, &tmp, &size);
1819			if (!ret) {
1820				*offset = tmp;
1821				*bytes = size;
1822				return entry;
1823			} else if (size > *max_extent_size) {
1824				*max_extent_size = size;
1825			}
1826			continue;
1827		}
1828
1829		*offset = tmp;
1830		*bytes = entry->bytes - align_off;
1831		return entry;
1832	}
1833out:
1834	return NULL;
1835}
1836
1837static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1838			   struct btrfs_free_space *info, u64 offset)
1839{
1840	info->offset = offset_to_bitmap(ctl, offset);
1841	info->bytes = 0;
1842	INIT_LIST_HEAD(&info->list);
1843	link_free_space(ctl, info);
1844	ctl->total_bitmaps++;
1845
1846	ctl->op->recalc_thresholds(ctl);
1847}
1848
1849static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1850			struct btrfs_free_space *bitmap_info)
1851{
1852	unlink_free_space(ctl, bitmap_info);
1853	kfree(bitmap_info->bitmap);
1854	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1855	ctl->total_bitmaps--;
1856	ctl->op->recalc_thresholds(ctl);
1857}
1858
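/*
 * Clear 'bytes' starting at 'offset' from one or more bitmaps, moving on to
 * the next bitmap when the range spans more than one of them.
 */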
1859static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1860			      struct btrfs_free_space *bitmap_info,
1861			      u64 *offset, u64 *bytes)
1862{
1863	u64 end;
1864	u64 search_start, search_bytes;
1865	int ret;
1866
1867again:
1868	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1869
	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
1876	search_start = *offset;
1877	search_bytes = ctl->unit;
1878	search_bytes = min(search_bytes, end - search_start + 1);
1879	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
1880	if (ret < 0 || search_start != *offset)
1881		return -EINVAL;
1882
1883	/* We may have found more bits than what we need */
1884	search_bytes = min(search_bytes, *bytes);
1885
1886	/* Cannot clear past the end of the bitmap */
1887	search_bytes = min(search_bytes, end - search_start + 1);
1888
1889	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1890	*offset += search_bytes;
1891	*bytes -= search_bytes;
1892
1893	if (*bytes) {
1894		struct rb_node *next = rb_next(&bitmap_info->offset_index);
1895		if (!bitmap_info->bytes)
1896			free_bitmap(ctl, bitmap_info);
1897
1898		/*
1899		 * no entry after this bitmap, but we still have bytes to
1900		 * remove, so something has gone wrong.
1901		 */
1902		if (!next)
1903			return -EINVAL;
1904
1905		bitmap_info = rb_entry(next, struct btrfs_free_space,
1906				       offset_index);
1907
1908		/*
1909		 * if the next entry isn't a bitmap we need to return to let the
1910		 * extent stuff do its work.
1911		 */
1912		if (!bitmap_info->bitmap)
1913			return -EAGAIN;
1914
1915		/*
1916		 * Ok the next item is a bitmap, but it may not actually hold
1917		 * the information for the rest of this free space stuff, so
1918		 * look for it, and if we don't find it return so we can try
1919		 * everything over again.
1920		 */
1921		search_start = *offset;
1922		search_bytes = ctl->unit;
1923		ret = search_bitmap(ctl, bitmap_info, &search_start,
1924				    &search_bytes);
1925		if (ret < 0 || search_start != *offset)
1926			return -EAGAIN;
1927
1928		goto again;
1929	} else if (!bitmap_info->bytes)
1930		free_bitmap(ctl, bitmap_info);
1931
1932	return 0;
1933}
1934
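/*
 * Set as many of the requested bytes as fit in this bitmap, starting at
 * 'offset', and return how many bytes were actually added.
 */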
1935static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1936			       struct btrfs_free_space *info, u64 offset,
1937			       u64 bytes)
1938{
1939	u64 bytes_to_set = 0;
1940	u64 end;
1941
1942	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1943
1944	bytes_to_set = min(end - offset, bytes);
1945
1946	bitmap_set_bits(ctl, info, offset, bytes_to_set);
1947
1948	return bytes_to_set;
1949
1950}
1951
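/*
 * Decide whether a newly added chunk of free space should be tracked in a
 * bitmap instead of as an extent entry.
 */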
1952static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1953		      struct btrfs_free_space *info)
1954{
1955	struct btrfs_block_group_cache *block_group = ctl->private;
1956
1957	/*
1958	 * If we are below the extents threshold then we can add this as an
1959	 * extent, and don't have to deal with the bitmap
1960	 */
1961	if (ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents.  However, if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
1969		if (info->bytes <= block_group->sectorsize * 4) {
1970			if (ctl->free_extents * 2 <= ctl->extents_thresh)
1971				return false;
1972		} else {
1973			return false;
1974		}
1975	}
1976
1977	/*
1978	 * The original block groups from mkfs can be really small, like 8
1979	 * megabytes, so don't bother with a bitmap for those entries.  However
1980	 * some block groups can be smaller than what a bitmap would cover but
1981	 * are still large enough that they could overflow the 32k memory limit,
1982	 * so still allow those block groups to have a bitmap
1983	 * entry.
1984	 */
1985	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1986		return false;
1987
1988	return true;
1989}
1990
1991static struct btrfs_free_space_op free_space_op = {
1992	.recalc_thresholds	= recalculate_thresholds,
1993	.use_bitmap		= use_bitmap,
1994};
1995
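/*
 * Try to add the range described by @info to bitmap entries, checking the
 * cluster bitmap first and then the free space tree, and allocating a new
 * bitmap if nothing covers the range.  Returns 1 if the whole range ended
 * up in bitmaps, 0 if the caller should keep @info as an extent entry and
 * a negative errno on failure.
 */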
1996static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1997			      struct btrfs_free_space *info)
1998{
1999	struct btrfs_free_space *bitmap_info;
2000	struct btrfs_block_group_cache *block_group = NULL;
2001	int added = 0;
2002	u64 bytes, offset, bytes_added;
2003	int ret;
2004
2005	bytes = info->bytes;
2006	offset = info->offset;
2007
2008	if (!ctl->op->use_bitmap(ctl, info))
2009		return 0;
2010
2011	if (ctl->op == &free_space_op)
2012		block_group = ctl->private;
2013again:
2014	/*
2015	 * Since we link bitmaps right into the cluster we need to see if we
2016	 * have a cluster here, and if so and it has our bitmap we need to add
2017	 * the free space to that bitmap.
2018	 */
2019	if (block_group && !list_empty(&block_group->cluster_list)) {
2020		struct btrfs_free_cluster *cluster;
2021		struct rb_node *node;
2022		struct btrfs_free_space *entry;
2023
2024		cluster = list_entry(block_group->cluster_list.next,
2025				     struct btrfs_free_cluster,
2026				     block_group_list);
2027		spin_lock(&cluster->lock);
2028		node = rb_first(&cluster->root);
2029		if (!node) {
2030			spin_unlock(&cluster->lock);
2031			goto no_cluster_bitmap;
2032		}
2033
2034		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2035		if (!entry->bitmap) {
2036			spin_unlock(&cluster->lock);
2037			goto no_cluster_bitmap;
2038		}
2039
2040		if (entry->offset == offset_to_bitmap(ctl, offset)) {
2041			bytes_added = add_bytes_to_bitmap(ctl, entry,
2042							  offset, bytes);
2043			bytes -= bytes_added;
2044			offset += bytes_added;
2045		}
2046		spin_unlock(&cluster->lock);
2047		if (!bytes) {
2048			ret = 1;
2049			goto out;
2050		}
2051	}
2052
2053no_cluster_bitmap:
2054	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2055					 1, 0);
2056	if (!bitmap_info) {
2057		ASSERT(added == 0);
2058		goto new_bitmap;
2059	}
2060
2061	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2062	bytes -= bytes_added;
2063	offset += bytes_added;
2064	added = 0;
2065
2066	if (!bytes) {
2067		ret = 1;
2068		goto out;
2069	} else
2070		goto again;
2071
2072new_bitmap:
2073	if (info && info->bitmap) {
2074		add_new_bitmap(ctl, info, offset);
2075		added = 1;
2076		info = NULL;
2077		goto again;
2078	} else {
2079		spin_unlock(&ctl->tree_lock);
2080
2081		/* no pre-allocated info, allocate a new one */
2082		if (!info) {
2083			info = kmem_cache_zalloc(btrfs_free_space_cachep,
2084						 GFP_NOFS);
2085			if (!info) {
2086				spin_lock(&ctl->tree_lock);
2087				ret = -ENOMEM;
2088				goto out;
2089			}
2090		}
2091
2092		/* allocate the bitmap */
2093		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
2094		spin_lock(&ctl->tree_lock);
2095		if (!info->bitmap) {
2096			ret = -ENOMEM;
2097			goto out;
2098		}
2099		goto again;
2100	}
2101
2102out:
2103	if (info) {
2104		kfree(info->bitmap);
2106		kmem_cache_free(btrfs_free_space_cachep, info);
2107	}
2108
2109	return ret;
2110}
2111
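/*
 * Merge @info with any extent entries that are directly adjacent to it,
 * i.e. ending at info->offset or starting at info->offset + info->bytes.
 * Adjacent entries are unlinked and freed and @info grows to cover them.
 * Returns true if anything was merged.
 */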
2112static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2113			  struct btrfs_free_space *info, bool update_stat)
2114{
2115	struct btrfs_free_space *left_info;
2116	struct btrfs_free_space *right_info;
2117	bool merged = false;
2118	u64 offset = info->offset;
2119	u64 bytes = info->bytes;
2120
2121	/*
2122	 * First we want to see if there is free space adjacent to the range we
2123	 * are adding; if there is, remove that entry and extend this one to
2124	 * cover the entire range.
2125	 */
2126	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2127	if (right_info && rb_prev(&right_info->offset_index))
2128		left_info = rb_entry(rb_prev(&right_info->offset_index),
2129				     struct btrfs_free_space, offset_index);
2130	else
2131		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2132
2133	if (right_info && !right_info->bitmap) {
2134		if (update_stat)
2135			unlink_free_space(ctl, right_info);
2136		else
2137			__unlink_free_space(ctl, right_info);
2138		info->bytes += right_info->bytes;
2139		kmem_cache_free(btrfs_free_space_cachep, right_info);
2140		merged = true;
2141	}
2142
2143	if (left_info && !left_info->bitmap &&
2144	    left_info->offset + left_info->bytes == offset) {
2145		if (update_stat)
2146			unlink_free_space(ctl, left_info);
2147		else
2148			__unlink_free_space(ctl, left_info);
2149		info->offset = left_info->offset;
2150		info->bytes += left_info->bytes;
2151		kmem_cache_free(btrfs_free_space_cachep, left_info);
2152		merged = true;
2153	}
2154
2155	return merged;
2156}
2157
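/*
 * Try to grow @info to the right by stealing the run of set bits that
 * starts right after @info ends in the bitmap entry covering that byte.
 * Returns true if any bits were stolen.
 */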
2158static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2159				     struct btrfs_free_space *info,
2160				     bool update_stat)
2161{
2162	struct btrfs_free_space *bitmap;
2163	unsigned long i;
2164	unsigned long j;
2165	const u64 end = info->offset + info->bytes;
2166	const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2167	u64 bytes;
2168
2169	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2170	if (!bitmap)
2171		return false;
2172
2173	i = offset_to_bit(bitmap->offset, ctl->unit, end);
2174	j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2175	if (j == i)
2176		return false;
2177	bytes = (j - i) * ctl->unit;
2178	info->bytes += bytes;
2179
2180	if (update_stat)
2181		bitmap_clear_bits(ctl, bitmap, end, bytes);
2182	else
2183		__bitmap_clear_bits(ctl, bitmap, end, bytes);
2184
2185	if (!bitmap->bytes)
2186		free_bitmap(ctl, bitmap);
2187
2188	return true;
2189}
2190
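/*
 * Try to grow @info to the left by stealing the run of set bits that ends
 * right before @info starts in the bitmap entry covering that byte.
 * Returns true if any bits were stolen.
 */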
2191static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2192				       struct btrfs_free_space *info,
2193				       bool update_stat)
2194{
2195	struct btrfs_free_space *bitmap;
2196	u64 bitmap_offset;
2197	unsigned long i;
2198	unsigned long j;
2199	unsigned long prev_j;
2200	u64 bytes;
2201
2202	bitmap_offset = offset_to_bitmap(ctl, info->offset);
2203	/* If we're on a boundary, try the previous logical bitmap. */
2204	if (bitmap_offset == info->offset) {
2205		if (info->offset == 0)
2206			return false;
2207		bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2208	}
2209
2210	bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2211	if (!bitmap)
2212		return false;
2213
2214	i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2215	j = 0;
2216	prev_j = (unsigned long)-1;
2217	for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2218		if (j > i)
2219			break;
2220		prev_j = j;
2221	}
2222	if (prev_j == i)
2223		return false;
2224
2225	if (prev_j == (unsigned long)-1)
2226		bytes = (i + 1) * ctl->unit;
2227	else
2228		bytes = (i - prev_j) * ctl->unit;
2229
2230	info->offset -= bytes;
2231	info->bytes += bytes;
2232
2233	if (update_stat)
2234		bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2235	else
2236		__bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2237
2238	if (!bitmap->bytes)
2239		free_bitmap(ctl, bitmap);
2240
2241	return true;
2242}
2243
2244/*
2245 * We prefer always to allocate from extent entries, both for clustered and
2246 * non-clustered allocation requests. So when attempting to add a new extent
2247 * entry, try to see if there's adjacent free space in bitmap entries, and if
2248 * there is, migrate that space from the bitmaps to the extent.
2249 * Like this we get better chances of satisfying space allocation requests
2250 * because we attempt to satisfy them based on a single cache entry, and never
2251 * on 2 or more entries - even if the entries represent a contiguous free space
2252 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2253 * ends).
2254 */
2255static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2256			      struct btrfs_free_space *info,
2257			      bool update_stat)
2258{
2259	/*
2260	 * Only work with disconnected entries, as we can change their offset,
2261	 * and they must be extent entries.
2262	 */
2263	ASSERT(!info->bitmap);
2264	ASSERT(RB_EMPTY_NODE(&info->offset_index));
2265
2266	if (ctl->total_bitmaps > 0) {
2267		bool stole_end;
2268		bool stole_front = false;
2269
2270		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2271		if (ctl->total_bitmaps > 0)
2272			stole_front = steal_from_bitmap_to_front(ctl, info,
2273								 update_stat);
2274
2275		if (stole_end || stole_front)
2276			try_merge_free_space(ctl, info, update_stat);
2277	}
2278}
2279
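/*
 * Add a free space range to the in-memory cache.  The range is merged
 * with adjacent extent entries if possible, pushed into a bitmap when the
 * use_bitmap heuristic asks for it, and otherwise linked as a new extent
 * entry after stealing any adjacent free space out of bitmaps.
 */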
2280int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2281			   u64 offset, u64 bytes)
2282{
2283	struct btrfs_free_space *info;
2284	int ret = 0;
2285
2286	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2287	if (!info)
2288		return -ENOMEM;
2289
2290	info->offset = offset;
2291	info->bytes = bytes;
2292	RB_CLEAR_NODE(&info->offset_index);
2293
2294	spin_lock(&ctl->tree_lock);
2295
2296	if (try_merge_free_space(ctl, info, true))
2297		goto link;
2298
2299	/*
2300	 * There was no extent directly to the left or right of this new
2301	 * extent, so we know we're going to have to add a new extent entry;
2302	 * before we do that, see if we need to drop this into a bitmap instead.
2303	 */
2304	ret = insert_into_bitmap(ctl, info);
2305	if (ret < 0) {
2306		goto out;
2307	} else if (ret) {
2308		ret = 0;
2309		goto out;
2310	}
2311link:
2312	/*
2313	 * Only steal free space from adjacent bitmaps if we're sure we're not
2314	 * going to add the new free space to existing bitmap entries - because
2315	 * that would mean unnecessary work that would be reverted. Therefore
2316	 * attempt to steal space from bitmaps if we're adding an extent entry.
2317	 */
2318	steal_from_bitmap(ctl, info, true);
2319
2320	ret = link_free_space(ctl, info);
2321	if (ret)
2322		kmem_cache_free(btrfs_free_space_cachep, info);
2323out:
2324	spin_unlock(&ctl->tree_lock);
2325
2326	if (ret) {
2327		printk(KERN_CRIT "BTRFS: unable to add free space: %d\n", ret);
2328		ASSERT(ret != -EEXIST);
2329	}
2330
2331	return ret;
2332}
2333
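/*
 * Remove [offset, offset + bytes) from the free space cache.  The range
 * may span several extent and bitmap entries, so we loop, splitting or
 * shrinking extent entries and clearing bitmap bits until everything that
 * was asked for has been removed.
 */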
2334int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2335			    u64 offset, u64 bytes)
2336{
2337	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2338	struct btrfs_free_space *info;
2339	int ret;
2340	bool re_search = false;
2341
2342	spin_lock(&ctl->tree_lock);
2343
2344again:
2345	ret = 0;
2346	if (!bytes)
2347		goto out_lock;
2348
2349	info = tree_search_offset(ctl, offset, 0, 0);
2350	if (!info) {
2351		/*
2352		 * We didn't find an extent entry that matches the space we want
2353		 * to remove, so look for a bitmap entry instead.
2354		 */
2355		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2356					  1, 0);
2357		if (!info) {
2358			/*
2359			 * If we found a partial bit of our free space in a
2360			 * bitmap but then couldn't find the other part this may
2361			 * be a problem, so WARN about it.
2362			 */
2363			WARN_ON(re_search);
2364			goto out_lock;
2365		}
2366	}
2367
2368	re_search = false;
2369	if (!info->bitmap) {
2370		unlink_free_space(ctl, info);
2371		if (offset == info->offset) {
2372			u64 to_free = min(bytes, info->bytes);
2373
2374			info->bytes -= to_free;
2375			info->offset += to_free;
2376			if (info->bytes) {
2377				ret = link_free_space(ctl, info);
2378				WARN_ON(ret);
2379			} else {
2380				kmem_cache_free(btrfs_free_space_cachep, info);
2381			}
2382
2383			offset += to_free;
2384			bytes -= to_free;
2385			goto again;
2386		} else {
2387			u64 old_end = info->bytes + info->offset;
2388
2389			info->bytes = offset - info->offset;
2390			ret = link_free_space(ctl, info);
2391			WARN_ON(ret);
2392			if (ret)
2393				goto out_lock;
2394
2395			/* Not enough bytes in this entry to satisfy us */
2396			if (old_end < offset + bytes) {
2397				bytes -= old_end - offset;
2398				offset = old_end;
2399				goto again;
2400			} else if (old_end == offset + bytes) {
2401				/* all done */
2402				goto out_lock;
2403			}
2404			spin_unlock(&ctl->tree_lock);
2405
2406			ret = btrfs_add_free_space(block_group, offset + bytes,
2407						   old_end - (offset + bytes));
2408			WARN_ON(ret);
2409			goto out;
2410		}
2411	}
2412
2413	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2414	if (ret == -EAGAIN) {
2415		re_search = true;
2416		goto again;
2417	}
2418out_lock:
2419	spin_unlock(&ctl->tree_lock);
2420out:
2421	return ret;
2422}
2423
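/*
 * Debugging helper: dump every free space entry of a block group and
 * count how many of them are big enough to satisfy an allocation of
 * @bytes.
 */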
2424void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2425			   u64 bytes)
2426{
2427	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2428	struct btrfs_free_space *info;
2429	struct rb_node *n;
2430	int count = 0;
2431
2432	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2433		info = rb_entry(n, struct btrfs_free_space, offset_index);
2434		if (info->bytes >= bytes && !block_group->ro)
2435			count++;
2436		btrfs_crit(block_group->fs_info,
2437			   "entry offset %llu, bytes %llu, bitmap %s",
2438			   info->offset, info->bytes,
2439			   (info->bitmap) ? "yes" : "no");
2440	}
2441	btrfs_info(block_group->fs_info, "block group has cluster?: %s",
2442		   list_empty(&block_group->cluster_list) ? "no" : "yes");
2443	btrfs_info(block_group->fs_info,
2444		   "%d blocks of free space at or bigger than %llu bytes",
		   count, bytes);
2445}
2446
2447void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2448{
2449	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2450
2451	spin_lock_init(&ctl->tree_lock);
2452	ctl->unit = block_group->sectorsize;
2453	ctl->start = block_group->key.objectid;
2454	ctl->private = block_group;
2455	ctl->op = &free_space_op;
2456	INIT_LIST_HEAD(&ctl->trimming_ranges);
2457	mutex_init(&ctl->cache_writeout_mutex);
2458
2459	/*
2460	 * we only want to have 32k of ram per block group for keeping
2461	 * track of free space, and if we pass 1/2 of that we want to
2462	 * start converting things over to using bitmaps
2463	 */
2464	ctl->extents_thresh = ((1024 * 32) / 2) /
2465				sizeof(struct btrfs_free_space);
2466}
2467
2468/*
2469 * for a given cluster, put all of its extents back into the free
2470 * space cache.  If the block group passed doesn't match the block group
2471 * pointed to by the cluster, someone else raced in and freed the
2472 * cluster already.  In that case, we just return without changing anything
2473 */
2474static int
2475__btrfs_return_cluster_to_free_space(
2476			     struct btrfs_block_group_cache *block_group,
2477			     struct btrfs_free_cluster *cluster)
2478{
2479	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2480	struct btrfs_free_space *entry;
2481	struct rb_node *node;
2482
2483	spin_lock(&cluster->lock);
2484	if (cluster->block_group != block_group)
2485		goto out;
2486
2487	cluster->block_group = NULL;
2488	cluster->window_start = 0;
2489	list_del_init(&cluster->block_group_list);
2490
2491	node = rb_first(&cluster->root);
2492	while (node) {
2493		bool bitmap;
2494
2495		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2496		node = rb_next(&entry->offset_index);
2497		rb_erase(&entry->offset_index, &cluster->root);
2498		RB_CLEAR_NODE(&entry->offset_index);
2499
2500		bitmap = (entry->bitmap != NULL);
2501		if (!bitmap) {
2502			try_merge_free_space(ctl, entry, false);
2503			steal_from_bitmap(ctl, entry, false);
2504		}
2505		tree_insert_offset(&ctl->free_space_offset,
2506				   entry->offset, &entry->offset_index, bitmap);
2507	}
2508	cluster->root = RB_ROOT;
2509
2510out:
2511	spin_unlock(&cluster->lock);
2512	btrfs_put_block_group(block_group);
2513	return 0;
2514}
2515
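/*
 * Free every entry in the cache.  The caller must hold ctl->tree_lock;
 * the lock may be dropped and retaken via cond_resched_lock().
 */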
2516static void __btrfs_remove_free_space_cache_locked(
2517				struct btrfs_free_space_ctl *ctl)
2518{
2519	struct btrfs_free_space *info;
2520	struct rb_node *node;
2521
2522	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2523		info = rb_entry(node, struct btrfs_free_space, offset_index);
2524		if (!info->bitmap) {
2525			unlink_free_space(ctl, info);
2526			kmem_cache_free(btrfs_free_space_cachep, info);
2527		} else {
2528			free_bitmap(ctl, info);
2529		}
2530
2531		cond_resched_lock(&ctl->tree_lock);
2532	}
2533}
2534
2535void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2536{
2537	spin_lock(&ctl->tree_lock);
2538	__btrfs_remove_free_space_cache_locked(ctl);
2539	spin_unlock(&ctl->tree_lock);
2540}
2541
2542void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2543{
2544	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2545	struct btrfs_free_cluster *cluster;
2546	struct list_head *head;
2547
2548	spin_lock(&ctl->tree_lock);
2549	while ((head = block_group->cluster_list.next) !=
2550	       &block_group->cluster_list) {
2551		cluster = list_entry(head, struct btrfs_free_cluster,
2552				     block_group_list);
2553
2554		WARN_ON(cluster->block_group != block_group);
2555		__btrfs_return_cluster_to_free_space(block_group, cluster);
2556
2557		cond_resched_lock(&ctl->tree_lock);
2558	}
2559	__btrfs_remove_free_space_cache_locked(ctl);
2560	spin_unlock(&ctl->tree_lock);
2562}
2563
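/*
 * Find and remove @bytes (plus @empty_size) of free space from the block
 * group's cache, honouring the full stripe alignment.  Returns the start
 * of the allocated range, or 0 if nothing suitable was found; any space
 * skipped over for alignment is handed back to the cache.
 */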
2564u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2565			       u64 offset, u64 bytes, u64 empty_size,
2566			       u64 *max_extent_size)
2567{
2568	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2569	struct btrfs_free_space *entry = NULL;
2570	u64 bytes_search = bytes + empty_size;
2571	u64 ret = 0;
2572	u64 align_gap = 0;
2573	u64 align_gap_len = 0;
2574
2575	spin_lock(&ctl->tree_lock);
2576	entry = find_free_space(ctl, &offset, &bytes_search,
2577				block_group->full_stripe_len, max_extent_size);
2578	if (!entry)
2579		goto out;
2580
2581	ret = offset;
2582	if (entry->bitmap) {
2583		bitmap_clear_bits(ctl, entry, offset, bytes);
2584		if (!entry->bytes)
2585			free_bitmap(ctl, entry);
2586	} else {
2587		unlink_free_space(ctl, entry);
2588		align_gap_len = offset - entry->offset;
2589		align_gap = entry->offset;
2590
2591		entry->offset = offset + bytes;
2592		WARN_ON(entry->bytes < bytes + align_gap_len);
2593
2594		entry->bytes -= bytes + align_gap_len;
2595		if (!entry->bytes)
2596			kmem_cache_free(btrfs_free_space_cachep, entry);
2597		else
2598			link_free_space(ctl, entry);
2599	}
2600out:
2601	spin_unlock(&ctl->tree_lock);
2602
2603	if (align_gap_len)
2604		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
2605	return ret;
2606}
2607
2608/*
2609 * given a cluster, put all of its extents back into the free space
2610 * cache.  If a block group is passed, this function will only free
2611 * a cluster that belongs to the passed block group.
2612 *
2613 * Otherwise, it'll get a reference on the block group pointed to by the
2614 * cluster and remove the cluster from it.
2615 */
2616int btrfs_return_cluster_to_free_space(
2617			       struct btrfs_block_group_cache *block_group,
2618			       struct btrfs_free_cluster *cluster)
2619{
2620	struct btrfs_free_space_ctl *ctl;
2621	int ret;
2622
2623	/* first, get a safe pointer to the block group */
2624	spin_lock(&cluster->lock);
2625	if (!block_group) {
2626		block_group = cluster->block_group;
2627		if (!block_group) {
2628			spin_unlock(&cluster->lock);
2629			return 0;
2630		}
2631	} else if (cluster->block_group != block_group) {
2632		/* someone else has already freed it don't redo their work */
2633		spin_unlock(&cluster->lock);
2634		return 0;
2635	}
2636	atomic_inc(&block_group->count);
2637	spin_unlock(&cluster->lock);
2638
2639	ctl = block_group->free_space_ctl;
2640
2641	/* now return any extents the cluster had on it */
2642	spin_lock(&ctl->tree_lock);
2643	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2644	spin_unlock(&ctl->tree_lock);
2645
2646	/* finally drop our ref */
2647	btrfs_put_block_group(block_group);
2648	return ret;
2649}
2650
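/*
 * Allocate @bytes from a bitmap entry that belongs to a cluster.  Returns
 * the start of the allocation, or 0 if the bitmap has no large enough run
 * at or after @min_start (in which case *max_extent_size may be updated).
 */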
2651static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2652				   struct btrfs_free_cluster *cluster,
2653				   struct btrfs_free_space *entry,
2654				   u64 bytes, u64 min_start,
2655				   u64 *max_extent_size)
2656{
2657	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2658	int err;
2659	u64 search_start = min_start;
2660	u64 search_bytes = bytes;
2661	u64 ret = 0;
2662
2666	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
2667	if (err) {
2668		if (search_bytes > *max_extent_size)
2669			*max_extent_size = search_bytes;
2670		return 0;
2671	}
2672
2673	ret = search_start;
2674	__bitmap_clear_bits(ctl, entry, ret, bytes);
2675
2676	return ret;
2677}
2678
2679/*
2680 * given a cluster, try to allocate 'bytes' from it, returns 0
2681 * if it couldn't find anything suitably large, or a logical disk offset
2682 * if things worked out
2683 */
2684u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2685			     struct btrfs_free_cluster *cluster, u64 bytes,
2686			     u64 min_start, u64 *max_extent_size)
2687{
2688	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2689	struct btrfs_free_space *entry = NULL;
2690	struct rb_node *node;
2691	u64 ret = 0;
2692
2693	spin_lock(&cluster->lock);
2694	if (bytes > cluster->max_size)
2695		goto out;
2696
2697	if (cluster->block_group != block_group)
2698		goto out;
2699
2700	node = rb_first(&cluster->root);
2701	if (!node)
2702		goto out;
2703
2704	entry = rb_entry(node, struct btrfs_free_space, offset_index);
2705	while (1) {
2706		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
2707			*max_extent_size = entry->bytes;
2708
2709		if (entry->bytes < bytes ||
2710		    (!entry->bitmap && entry->offset < min_start)) {
2711			node = rb_next(&entry->offset_index);
2712			if (!node)
2713				break;
2714			entry = rb_entry(node, struct btrfs_free_space,
2715					 offset_index);
2716			continue;
2717		}
2718
2719		if (entry->bitmap) {
2720			ret = btrfs_alloc_from_bitmap(block_group,
2721						      cluster, entry, bytes,
2722						      cluster->window_start,
2723						      max_extent_size);
2724			if (ret == 0) {
2725				node = rb_next(&entry->offset_index);
2726				if (!node)
2727					break;
2728				entry = rb_entry(node, struct btrfs_free_space,
2729						 offset_index);
2730				continue;
2731			}
2732			cluster->window_start += bytes;
2733		} else {
2734			ret = entry->offset;
2735
2736			entry->offset += bytes;
2737			entry->bytes -= bytes;
2738		}
2739
2740		if (entry->bytes == 0)
2741			rb_erase(&entry->offset_index, &cluster->root);
2742		break;
2743	}
2744out:
2745	spin_unlock(&cluster->lock);
2746
2747	if (!ret)
2748		return 0;
2749
2750	spin_lock(&ctl->tree_lock);
2751
2752	ctl->free_space -= bytes;
2753	if (entry->bytes == 0) {
2754		ctl->free_extents--;
2755		if (entry->bitmap) {
2756			kfree(entry->bitmap);
2757			ctl->total_bitmaps--;
2758			ctl->op->recalc_thresholds(ctl);
2759		}
2760		kmem_cache_free(btrfs_free_space_cachep, entry);
2761	}
2762
2763	spin_unlock(&ctl->tree_lock);
2764
2765	return ret;
2766}
2767
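/*
 * Try to set up the cluster from a single bitmap entry: accumulate runs
 * of set bits of at least min_bytes until we have bytes in total and at
 * least one run of cont1_bytes, then move the bitmap out of the free
 * space tree and into the cluster's rbtree.
 */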
2768static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2769				struct btrfs_free_space *entry,
2770				struct btrfs_free_cluster *cluster,
2771				u64 offset, u64 bytes,
2772				u64 cont1_bytes, u64 min_bytes)
2773{
2774	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2775	unsigned long next_zero;
2776	unsigned long i;
2777	unsigned long want_bits;
2778	unsigned long min_bits;
2779	unsigned long found_bits;
2780	unsigned long start = 0;
2781	unsigned long total_found = 0;
2782	int ret;
2783
2784	i = offset_to_bit(entry->offset, ctl->unit,
2785			  max_t(u64, offset, entry->offset));
2786	want_bits = bytes_to_bits(bytes, ctl->unit);
2787	min_bits = bytes_to_bits(min_bytes, ctl->unit);
2788
2789again:
2790	found_bits = 0;
2791	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2792		next_zero = find_next_zero_bit(entry->bitmap,
2793					       BITS_PER_BITMAP, i);
2794		if (next_zero - i >= min_bits) {
2795			found_bits = next_zero - i;
2796			break;
2797		}
2798		i = next_zero;
2799	}
2800
2801	if (!found_bits)
2802		return -ENOSPC;
2803
2804	if (!total_found) {
2805		start = i;
2806		cluster->max_size = 0;
2807	}
2808
2809	total_found += found_bits;
2810
2811	if (cluster->max_size < found_bits * ctl->unit)
2812		cluster->max_size = found_bits * ctl->unit;
2813
2814	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2815		i = next_zero + 1;
2816		goto again;
2817	}
2818
2819	cluster->window_start = start * ctl->unit + entry->offset;
2820	rb_erase(&entry->offset_index, &ctl->free_space_offset);
2821	ret = tree_insert_offset(&cluster->root, entry->offset,
2822				 &entry->offset_index, 1);
2823	ASSERT(!ret); /* -EEXIST; Logic error */
2824
2825	trace_btrfs_setup_cluster(block_group, cluster,
2826				  total_found * ctl->unit, 1);
2827	return 0;
2828}
2829
2830/*
2831 * This searches the block group for just extents to fill the cluster with.
2832 * Try to find a cluster with at least bytes total bytes, at least one
2833 * extent of cont1_bytes, and other clusters of at least min_bytes.
2834 */
2835static noinline int
2836setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2837			struct btrfs_free_cluster *cluster,
2838			struct list_head *bitmaps, u64 offset, u64 bytes,
2839			u64 cont1_bytes, u64 min_bytes)
2840{
2841	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2842	struct btrfs_free_space *first = NULL;
2843	struct btrfs_free_space *entry = NULL;
2844	struct btrfs_free_space *last;
2845	struct rb_node *node;
2846	u64 window_free;
2847	u64 max_extent;
2848	u64 total_size = 0;
2849
2850	entry = tree_search_offset(ctl, offset, 0, 1);
2851	if (!entry)
2852		return -ENOSPC;
2853
2854	/*
2855	 * We don't want bitmaps, so just move along until we find a normal
2856	 * extent entry.
2857	 */
2858	while (entry->bitmap || entry->bytes < min_bytes) {
2859		if (entry->bitmap && list_empty(&entry->list))
2860			list_add_tail(&entry->list, bitmaps);
2861		node = rb_next(&entry->offset_index);
2862		if (!node)
2863			return -ENOSPC;
2864		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2865	}
2866
2867	window_free = entry->bytes;
2868	max_extent = entry->bytes;
2869	first = entry;
2870	last = entry;
2871
2872	for (node = rb_next(&entry->offset_index); node;
2873	     node = rb_next(&entry->offset_index)) {
2874		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2875
2876		if (entry->bitmap) {
2877			if (list_empty(&entry->list))
2878				list_add_tail(&entry->list, bitmaps);
2879			continue;
2880		}
2881
2882		if (entry->bytes < min_bytes)
2883			continue;
2884
2885		last = entry;
2886		window_free += entry->bytes;
2887		if (entry->bytes > max_extent)
2888			max_extent = entry->bytes;
2889	}
2890
2891	if (window_free < bytes || max_extent < cont1_bytes)
2892		return -ENOSPC;
2893
2894	cluster->window_start = first->offset;
2895
2896	node = &first->offset_index;
2897
2898	/*
2899	 * now we've found our entries, pull them out of the free space
2900	 * cache and put them into the cluster rbtree
2901	 */
2902	do {
2903		int ret;
2904
2905		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2906		node = rb_next(&entry->offset_index);
2907		if (entry->bitmap || entry->bytes < min_bytes)
2908			continue;
2909
2910		rb_erase(&entry->offset_index, &ctl->free_space_offset);
2911		ret = tree_insert_offset(&cluster->root, entry->offset,
2912					 &entry->offset_index, 0);
2913		total_size += entry->bytes;
2914		ASSERT(!ret); /* -EEXIST; Logic error */
2915	} while (node && entry != last);
2916
2917	cluster->max_size = max_extent;
2918	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2919	return 0;
2920}
2921
2922/*
2923 * This specifically looks for bitmaps that may work in the cluster, we assume
2924 * that we have already failed to find extents that will work.
2925 */
2926static noinline int
2927setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2928		     struct btrfs_free_cluster *cluster,
2929		     struct list_head *bitmaps, u64 offset, u64 bytes,
2930		     u64 cont1_bytes, u64 min_bytes)
2931{
2932	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2933	struct btrfs_free_space *entry;
2934	int ret = -ENOSPC;
2935	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2936
2937	if (ctl->total_bitmaps == 0)
2938		return -ENOSPC;
2939
2940	/*
2941	 * The bitmap that covers offset won't be in the list unless offset
2942	 * is just its start offset.
2943	 */
2944	entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2945	if (entry->offset != bitmap_offset) {
2946		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2947		if (entry && list_empty(&entry->list))
2948			list_add(&entry->list, bitmaps);
2949	}
2950
2951	list_for_each_entry(entry, bitmaps, list) {
2952		if (entry->bytes < bytes)
2953			continue;
2954		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2955					   bytes, cont1_bytes, min_bytes);
2956		if (!ret)
2957			return 0;
2958	}
2959
2960	/*
2961	 * The bitmaps list has all the bitmaps that record free space
2962	 * starting after offset, so no more search is required.
2963	 */
2964	return -ENOSPC;
2965}
2966
2967/*
2968 * here we try to find a cluster of blocks in a block group.  The goal
2969 * is to find at least bytes+empty_size.
2970 * We might not find them all in one contiguous area.
2971 *
2972 * returns zero and sets up cluster if things worked out, otherwise
2973 * it returns -ENOSPC.
2974 */
2975int btrfs_find_space_cluster(struct btrfs_root *root,
2976			     struct btrfs_block_group_cache *block_group,
2977			     struct btrfs_free_cluster *cluster,
2978			     u64 offset, u64 bytes, u64 empty_size)
2979{
2980	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2981	struct btrfs_free_space *entry, *tmp;
2982	LIST_HEAD(bitmaps);
2983	u64 min_bytes;
2984	u64 cont1_bytes;
2985	int ret;
2986
2987	/*
2988	 * Choose the minimum extent size we'll require for this
2989	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
2990	 * For metadata, allow allocations with smaller extents.  For
2991	 * data, keep it dense.
2992	 */
2993	if (btrfs_test_opt(root, SSD_SPREAD)) {
2994		cont1_bytes = min_bytes = bytes + empty_size;
2995	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2996		cont1_bytes = bytes;
2997		min_bytes = block_group->sectorsize;
2998	} else {
2999		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3000		min_bytes = block_group->sectorsize;
3001	}
3002
3003	spin_lock(&ctl->tree_lock);
3004
3005	/*
3006	 * If we know we don't have enough space to make a cluster don't even
3007	 * bother doing all the work to try and find one.
3008	 */
3009	if (ctl->free_space < bytes) {
3010		spin_unlock(&ctl->tree_lock);
3011		return -ENOSPC;
3012	}
3013
3014	spin_lock(&cluster->lock);
3015
3016	/* someone already found a cluster, hooray */
3017	if (cluster->block_group) {
3018		ret = 0;
3019		goto out;
3020	}
3021
3022	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3023				 min_bytes);
3024
3025	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3026				      bytes + empty_size,
3027				      cont1_bytes, min_bytes);
3028	if (ret)
3029		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3030					   offset, bytes + empty_size,
3031					   cont1_bytes, min_bytes);
3032
3033	/* Clear our temporary list */
3034	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3035		list_del_init(&entry->list);
3036
3037	if (!ret) {
3038		atomic_inc(&block_group->count);
3039		list_add_tail(&cluster->block_group_list,
3040			      &block_group->cluster_list);
3041		cluster->block_group = block_group;
3042	} else {
3043		trace_btrfs_failed_cluster_setup(block_group);
3044	}
3045out:
3046	spin_unlock(&cluster->lock);
3047	spin_unlock(&ctl->tree_lock);
3048
3049	return ret;
3050}
3051
3052/*
3053 * simple code to zero out a cluster
3054 */
3055void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3056{
3057	spin_lock_init(&cluster->lock);
3058	spin_lock_init(&cluster->refill_lock);
3059	cluster->root = RB_ROOT;
3060	cluster->max_size = 0;
3061	INIT_LIST_HEAD(&cluster->block_group_list);
3062	cluster->block_group = NULL;
3063}
3064
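/*
 * Issue the discard for [start, start + bytes) and then hand the reserved
 * range back to the free space cache.  The bytes are accounted as
 * reserved in the space_info while the discard is in flight.
 */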
3065static int do_trimming(struct btrfs_block_group_cache *block_group,
3066		       u64 *total_trimmed, u64 start, u64 bytes,
3067		       u64 reserved_start, u64 reserved_bytes,
3068		       struct btrfs_trim_range *trim_entry)
3069{
3070	struct btrfs_space_info *space_info = block_group->space_info;
3071	struct btrfs_fs_info *fs_info = block_group->fs_info;
3072	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3073	int ret;
3074	int update = 0;
3075	u64 trimmed = 0;
3076
3077	spin_lock(&space_info->lock);
3078	spin_lock(&block_group->lock);
3079	if (!block_group->ro) {
3080		block_group->reserved += reserved_bytes;
3081		space_info->bytes_reserved += reserved_bytes;
3082		update = 1;
3083	}
3084	spin_unlock(&block_group->lock);
3085	spin_unlock(&space_info->lock);
3086
3087	ret = btrfs_discard_extent(fs_info->extent_root,
3088				   start, bytes, &trimmed);
3089	if (!ret)
3090		*total_trimmed += trimmed;
3091
3092	mutex_lock(&ctl->cache_writeout_mutex);
3093	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
3094	list_del(&trim_entry->list);
3095	mutex_unlock(&ctl->cache_writeout_mutex);
3096
3097	if (update) {
3098		spin_lock(&space_info->lock);
3099		spin_lock(&block_group->lock);
3100		if (block_group->ro)
3101			space_info->bytes_readonly += reserved_bytes;
3102		block_group->reserved -= reserved_bytes;
3103		space_info->bytes_reserved -= reserved_bytes;
3104		spin_unlock(&space_info->lock);
3105		spin_unlock(&block_group->lock);
3106	}
3107
3108	return ret;
3109}
3110
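/*
 * Walk the extent entries of the block group and discard the pieces of
 * them that fall inside [start, end), skipping anything smaller than
 * @minlen.
 */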
3111static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3112			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3113{
3114	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3115	struct btrfs_free_space *entry;
3116	struct rb_node *node;
3117	int ret = 0;
3118	u64 extent_start;
3119	u64 extent_bytes;
3120	u64 bytes;
3121
3122	while (start < end) {
3123		struct btrfs_trim_range trim_entry;
3124
3125		mutex_lock(&ctl->cache_writeout_mutex);
3126		spin_lock(&ctl->tree_lock);
3127
3128		if (ctl->free_space < minlen) {
3129			spin_unlock(&ctl->tree_lock);
3130			mutex_unlock(&ctl->cache_writeout_mutex);
3131			break;
3132		}
3133
3134		entry = tree_search_offset(ctl, start, 0, 1);
3135		if (!entry) {
3136			spin_unlock(&ctl->tree_lock);
3137			mutex_unlock(&ctl->cache_writeout_mutex);
3138			break;
3139		}
3140
3141		/* skip bitmaps */
3142		while (entry->bitmap) {
3143			node = rb_next(&entry->offset_index);
3144			if (!node) {
3145				spin_unlock(&ctl->tree_lock);
3146				mutex_unlock(&ctl->cache_writeout_mutex);
3147				goto out;
3148			}
3149			entry = rb_entry(node, struct btrfs_free_space,
3150					 offset_index);
3151		}
3152
3153		if (entry->offset >= end) {
3154			spin_unlock(&ctl->tree_lock);
3155			mutex_unlock(&ctl->cache_writeout_mutex);
3156			break;
3157		}
3158
3159		extent_start = entry->offset;
3160		extent_bytes = entry->bytes;
3161		start = max(start, extent_start);
3162		bytes = min(extent_start + extent_bytes, end) - start;
3163		if (bytes < minlen) {
3164			spin_unlock(&ctl->tree_lock);
3165			mutex_unlock(&ctl->cache_writeout_mutex);
3166			goto next;
3167		}
3168
3169		unlink_free_space(ctl, entry);
3170		kmem_cache_free(btrfs_free_space_cachep, entry);
3171
3172		spin_unlock(&ctl->tree_lock);
3173		trim_entry.start = extent_start;
3174		trim_entry.bytes = extent_bytes;
3175		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3176		mutex_unlock(&ctl->cache_writeout_mutex);
3177
3178		ret = do_trimming(block_group, total_trimmed, start, bytes,
3179				  extent_start, extent_bytes, &trim_entry);
3180		if (ret)
3181			break;
3182next:
3183		start += bytes;
3184
3185		if (fatal_signal_pending(current)) {
3186			ret = -ERESTARTSYS;
3187			break;
3188		}
3189
3190		cond_resched();
3191	}
3192out:
3193	return ret;
3194}
3195
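/*
 * Walk the bitmap entries of the block group and discard the free runs
 * recorded in them that fall inside [start, end) and are at least
 * @minlen bytes long.
 */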
3196static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3197			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3198{
3199	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3200	struct btrfs_free_space *entry;
3201	int ret = 0;
3202	int ret2;
3203	u64 bytes;
3204	u64 offset = offset_to_bitmap(ctl, start);
3205
3206	while (offset < end) {
3207		bool next_bitmap = false;
3208		struct btrfs_trim_range trim_entry;
3209
3210		mutex_lock(&ctl->cache_writeout_mutex);
3211		spin_lock(&ctl->tree_lock);
3212
3213		if (ctl->free_space < minlen) {
3214			spin_unlock(&ctl->tree_lock);
3215			mutex_unlock(&ctl->cache_writeout_mutex);
3216			break;
3217		}
3218
3219		entry = tree_search_offset(ctl, offset, 1, 0);
3220		if (!entry) {
3221			spin_unlock(&ctl->tree_lock);
3222			mutex_unlock(&ctl->cache_writeout_mutex);
3223			next_bitmap = true;
3224			goto next;
3225		}
3226
3227		bytes = minlen;
3228		ret2 = search_bitmap(ctl, entry, &start, &bytes);
3229		if (ret2 || start >= end) {
3230			spin_unlock(&ctl->tree_lock);
3231			mutex_unlock(&ctl->cache_writeout_mutex);
3232			next_bitmap = true;
3233			goto next;
3234		}
3235
3236		bytes = min(bytes, end - start);
3237		if (bytes < minlen) {
3238			spin_unlock(&ctl->tree_lock);
3239			mutex_unlock(&ctl->cache_writeout_mutex);
3240			goto next;
3241		}
3242
3243		bitmap_clear_bits(ctl, entry, start, bytes);
3244		if (entry->bytes == 0)
3245			free_bitmap(ctl, entry);
3246
3247		spin_unlock(&ctl->tree_lock);
3248		trim_entry.start = start;
3249		trim_entry.bytes = bytes;
3250		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3251		mutex_unlock(&ctl->cache_writeout_mutex);
3252
3253		ret = do_trimming(block_group, total_trimmed, start, bytes,
3254				  start, bytes, &trim_entry);
3255		if (ret)
3256			break;
3257next:
3258		if (next_bitmap) {
3259			offset += BITS_PER_BITMAP * ctl->unit;
3260		} else {
3261			start += bytes;
3262			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
3263				offset += BITS_PER_BITMAP * ctl->unit;
3264		}
3265
3266		if (fatal_signal_pending(current)) {
3267			ret = -ERESTARTSYS;
3268			break;
3269		}
3270
3271		cond_resched();
3272	}
3273
3274	return ret;
3275}
3276
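/*
 * Discard all free space in the block group that falls inside
 * [start, end).  If the block group is removed while trimming is in
 * progress, the last task to finish also tears down the leftover free
 * space entries and the block group's extent mapping.
 */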
3277int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3278			   u64 *trimmed, u64 start, u64 end, u64 minlen)
3279{
3280	int ret;
3281
3282	*trimmed = 0;
3283
3284	spin_lock(&block_group->lock);
3285	if (block_group->removed) {
3286		spin_unlock(&block_group->lock);
3287		return 0;
3288	}
3289	atomic_inc(&block_group->trimming);
3290	spin_unlock(&block_group->lock);
3291
3292	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3293	if (ret)
3294		goto out;
3295
3296	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3297out:
3298	spin_lock(&block_group->lock);
3299	if (atomic_dec_and_test(&block_group->trimming) &&
3300	    block_group->removed) {
3301		struct extent_map_tree *em_tree;
3302		struct extent_map *em;
3303
3304		spin_unlock(&block_group->lock);
3305
3306		lock_chunks(block_group->fs_info->chunk_root);
3307		em_tree = &block_group->fs_info->mapping_tree.map_tree;
3308		write_lock(&em_tree->lock);
3309		em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3310					   1);
3311		BUG_ON(!em); /* logic error, can't happen */
3312		/*
3313		 * remove_extent_mapping() will delete us from the pinned_chunks
3314		 * list, which is protected by the chunk mutex.
3315		 */
3316		remove_extent_mapping(em_tree, em);
3317		write_unlock(&em_tree->lock);
3318		unlock_chunks(block_group->fs_info->chunk_root);
3319
3320		/* once for us and once for the tree */
3321		free_extent_map(em);
3322		free_extent_map(em);
3323
3324		/*
3325		 * We've left one free space entry and other tasks trimming
3326		 * this block group have each left one entry. Free them.
3327		 */
3328		__btrfs_remove_free_space_cache(block_group->free_space_ctl);
3329	} else {
3330		spin_unlock(&block_group->lock);
3331	}
3332
3333	return ret;
3334}
3335
3336/*
3337 * Find the left-most item in the cache tree, and then return the
3338 * smallest inode number in the item.
3339 *
3340 * Note: the returned inode number may not be the smallest one in
3341 * the tree, if the left-most item is a bitmap.
3342 */
3343u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3344{
3345	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3346	struct btrfs_free_space *entry = NULL;
3347	u64 ino = 0;
3348
3349	spin_lock(&ctl->tree_lock);
3350
3351	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3352		goto out;
3353
3354	entry = rb_entry(rb_first(&ctl->free_space_offset),
3355			 struct btrfs_free_space, offset_index);
3356
3357	if (!entry->bitmap) {
3358		ino = entry->offset;
3359
3360		unlink_free_space(ctl, entry);
3361		entry->offset++;
3362		entry->bytes--;
3363		if (!entry->bytes)
3364			kmem_cache_free(btrfs_free_space_cachep, entry);
3365		else
3366			link_free_space(ctl, entry);
3367	} else {
3368		u64 offset = 0;
3369		u64 count = 1;
3370		int ret;
3371
3372		ret = search_bitmap(ctl, entry, &offset, &count);
3373		/* Logic error; Should be empty if it can't find anything */
3374		ASSERT(!ret);
3375
3376		ino = offset;
3377		bitmap_clear_bits(ctl, entry, offset, 1);
3378		if (entry->bytes == 0)
3379			free_bitmap(ctl, entry);
3380	}
3381out:
3382	spin_unlock(&ctl->tree_lock);
3383
3384	return ino;
3385}
3386
3387struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3388				    struct btrfs_path *path)
3389{
3390	struct inode *inode = NULL;
3391
3392	spin_lock(&root->ino_cache_lock);
3393	if (root->ino_cache_inode)
3394		inode = igrab(root->ino_cache_inode);
3395	spin_unlock(&root->ino_cache_lock);
3396	if (inode)
3397		return inode;
3398
3399	inode = __lookup_free_space_inode(root, path, 0);
3400	if (IS_ERR(inode))
3401		return inode;
3402
3403	spin_lock(&root->ino_cache_lock);
3404	if (!btrfs_fs_closing(root->fs_info))
3405		root->ino_cache_inode = igrab(inode);
3406	spin_unlock(&root->ino_cache_lock);
3407
3408	return inode;
3409}
3410
3411int create_free_ino_inode(struct btrfs_root *root,
3412			  struct btrfs_trans_handle *trans,
3413			  struct btrfs_path *path)
3414{
3415	return __create_free_space_inode(root, trans, path,
3416					 BTRFS_FREE_INO_OBJECTID, 0);
3417}
3418
3419int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3420{
3421	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3422	struct btrfs_path *path;
3423	struct inode *inode;
3424	int ret = 0;
3425	u64 root_gen = btrfs_root_generation(&root->root_item);
3426
3427	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3428		return 0;
3429
3430	/*
3431	 * If we're unmounting then just return, since this does a search on the
3432	 * normal root and not the commit root and we could deadlock.
3433	 */
3434	if (btrfs_fs_closing(fs_info))
3435		return 0;
3436
3437	path = btrfs_alloc_path();
3438	if (!path)
3439		return 0;
3440
3441	inode = lookup_free_ino_inode(root, path);
3442	if (IS_ERR(inode))
3443		goto out;
3444
3445	if (root_gen != BTRFS_I(inode)->generation)
3446		goto out_put;
3447
3448	ret = __load_free_space_cache(root, inode, ctl, path, 0);
3449
3450	if (ret < 0)
3451		btrfs_err(fs_info,
3452			"failed to load free ino cache for root %llu",
3453			root->root_key.objectid);
3454out_put:
3455	iput(inode);
3456out:
3457	btrfs_free_path(path);
3458	return ret;
3459}
3460
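/*
 * Write the in-memory free inode number cache for @root back to its cache
 * inode.  On failure the metadata reservation is released here, unless
 * the writeback path already took ownership of it.
 */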
3461int btrfs_write_out_ino_cache(struct btrfs_root *root,
3462			      struct btrfs_trans_handle *trans,
3463			      struct btrfs_path *path,
3464			      struct inode *inode)
3465{
3466	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3467	int ret;
3468	struct btrfs_io_ctl io_ctl;
3469	bool release_metadata = true;
3470
3471	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3472		return 0;
3473
3474	memset(&io_ctl, 0, sizeof(io_ctl));
3475	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
3476				      trans, path, 0);
3477	if (!ret) {
3478		/*
3479		 * At this point writepages() didn't error out, so our metadata
3480		 * reservation is released when the writeback finishes, at
3481		 * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
3482		 * with or without an error.
3483		 */
3484		release_metadata = false;
3485		ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
3486	}
3487
3488	if (ret) {
3489		if (release_metadata)
3490			btrfs_delalloc_release_metadata(inode, inode->i_size);
3491#ifdef DEBUG
3492		btrfs_err(root->fs_info,
3493			"failed to write free ino cache for root %llu",
3494			root->root_key.objectid);
3495#endif
3496	}
3497
3498	return ret;
3499}
3500
3501#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3502/*
3503 * Use this if you need to make a bitmap or extent entry specifically; it
3504 * doesn't do any of the merging that add_free_space does.  This acts a lot
3505 * like how the free space cache loading works, so you can get really weird
3506 * configurations.
3507 */
3508int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3509			      u64 offset, u64 bytes, bool bitmap)
3510{
3511	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3512	struct btrfs_free_space *info = NULL, *bitmap_info;
3513	void *map = NULL;
3514	u64 bytes_added;
3515	int ret;
3516
3517again:
3518	if (!info) {
3519		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3520		if (!info)
3521			return -ENOMEM;
3522	}
3523
3524	if (!bitmap) {
3525		spin_lock(&ctl->tree_lock);
3526		info->offset = offset;
3527		info->bytes = bytes;
3528		ret = link_free_space(ctl, info);
3529		spin_unlock(&ctl->tree_lock);
3530		if (ret)
3531			kmem_cache_free(btrfs_free_space_cachep, info);
3532		return ret;
3533	}
3534
3535	if (!map) {
3536		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
3537		if (!map) {
3538			kmem_cache_free(btrfs_free_space_cachep, info);
3539			return -ENOMEM;
3540		}
3541	}
3542
3543	spin_lock(&ctl->tree_lock);
3544	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3545					 1, 0);
3546	if (!bitmap_info) {
3547		info->bitmap = map;
3548		map = NULL;
3549		add_new_bitmap(ctl, info, offset);
3550		bitmap_info = info;
3551		info = NULL;
3552	}
3553
3554	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3555	bytes -= bytes_added;
3556	offset += bytes_added;
3557	spin_unlock(&ctl->tree_lock);
3558
3559	if (bytes)
3560		goto again;
3561
3562	if (info)
3563		kmem_cache_free(btrfs_free_space_cachep, info);
3564	kfree(map);
3566	return 0;
3567}
3568
3569/*
3570 * Checks to see if the given range is in the free space cache.  This is really
3571 * just used to check the absence of space, so if there is free space in the
3572 * range at all we will return 1.
3573 */
3574int test_check_exists(struct btrfs_block_group_cache *cache,
3575		      u64 offset, u64 bytes)
3576{
3577	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3578	struct btrfs_free_space *info;
3579	int ret = 0;
3580
3581	spin_lock(&ctl->tree_lock);
3582	info = tree_search_offset(ctl, offset, 0, 0);
3583	if (!info) {
3584		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3585					  1, 0);
3586		if (!info)
3587			goto out;
3588	}
3589
3590have_info:
3591	if (info->bitmap) {
3592		u64 bit_off, bit_bytes;
3593		struct rb_node *n;
3594		struct btrfs_free_space *tmp;
3595
3596		bit_off = offset;
3597		bit_bytes = ctl->unit;
3598		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
3599		if (!ret) {
3600			if (bit_off == offset) {
3601				ret = 1;
3602				goto out;
3603			} else if (bit_off > offset &&
3604				   offset + bytes > bit_off) {
3605				ret = 1;
3606				goto out;
3607			}
3608		}
3609
3610		n = rb_prev(&info->offset_index);
3611		while (n) {
3612			tmp = rb_entry(n, struct btrfs_free_space,
3613				       offset_index);
3614			if (tmp->offset + tmp->bytes < offset)
3615				break;
3616			if (offset + bytes < tmp->offset) {
3617				n = rb_prev(&tmp->offset_index);
3618				continue;
3619			}
3620			info = tmp;
3621			goto have_info;
3622		}
3623
3624		n = rb_next(&info->offset_index);
3625		while (n) {
3626			tmp = rb_entry(n, struct btrfs_free_space,
3627				       offset_index);
3628			if (offset + bytes < tmp->offset)
3629				break;
3630			if (tmp->offset + tmp->bytes < offset) {
3631				n = rb_next(&tmp->offset_index);
3632				continue;
3633			}
3634			info = tmp;
3635			goto have_info;
3636		}
3637
3638		ret = 0;
3639		goto out;
3640	}
3641
3642	if (info->offset == offset) {
3643		ret = 1;
3644		goto out;
3645	}
3646
3647	if (offset > info->offset && offset < info->offset + info->bytes)
3648		ret = 1;
3649out:
3650	spin_unlock(&ctl->tree_lock);
3651	return ret;
3652}
3653#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3654