1/*
2 *  fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 *	Allison Henderson <achender@linux.vnet.ibm.com>
7 *	Hugh Dickins <hughd@google.com>
8 *	Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/list_sort.h>
13#include <linux/proc_fs.h>
14#include <linux/seq_file.h>
15#include "ext4.h"
16
17#include <trace/events/ext4.h>
18
19/*
20 * According to a previous discussion at the Ext4 Developer Workshop, we
21 * will introduce a new structure called io tree to track all extent
22 * status in order to solve some of the problems we have met
23 * (e.g. reservation space warning), and to provide extent-level locking.
24 * The delay extent tree is the first step to achieve this goal.  It was
25 * originally built by Yongqiang Yang.  At that time it was called a delay
26 * extent tree, whose goal was only to track delayed extents in memory to
27 * simplify the implementation of fiemap and bigalloc, and to introduce
28 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it was still called a
29 * delay extent tree in the first commit.  But to better describe what it
30 * does, it has been renamed to the extent status tree.
31 *
32 * Step1:
33 * Currently the first step has been done.  All delayed extents are
34 * tracked in the tree.  The tree records a delayed extent when a delayed
35 * allocation is issued, and updates it when the extent is written out or
36 * invalidated.  Therefore the implementations of fiemap and bigalloc
37 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
38 *
39 * The following comment describes the implementation of the extent
40 * status tree and future work.
41 *
42 * Step2:
43 * In this step all extent status is tracked by the extent status tree.
44 * Thus, we can first try to look up a block mapping in this tree before
45 * searching the extent tree.  Hence, the single extent cache can be removed
46 * because the extent status tree can do a better job.  Extents in the status
47 * tree are loaded on demand.  Therefore, the extent status tree may not
48 * contain all of the extents in a file.  Meanwhile we define a shrinker
49 * to reclaim memory from the extent status tree because a fragmented extent
50 * tree will make the status tree cost too much memory.  Written/unwritten/-
51 * hole extents in the tree will be reclaimed by this shrinker when we
52 * are under high memory pressure.  Delayed extents will not be
53 * reclaimed because fiemap, bigalloc, and seek_data/hole need them.
54 */
55
56/*
57 * Extent status tree implementation for ext4.
58 *
59 *
60 * ==========================================================================
61 * Extent status tree tracks all extent status.
62 *
63 * 1. Why do we need to implement an extent status tree?
64 *
65 * Without an extent status tree, ext4 identifies a delayed extent by looking
66 * up the page cache; this has several deficiencies: complicated, buggy,
67 * and inefficient code.
68 *
69 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know whether a
70 * block or a range of blocks belongs to a delayed extent.
71 *
72 * Let us have a look at how they work without an extent status tree.
73 *   --	FIEMAP
74 *	FIEMAP looks up page cache to identify delayed allocations from holes.
75 *
76 *   --	SEEK_HOLE/DATA
77 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
78 *
79 *   --	bigalloc
80 *	bigalloc looks up the page cache to figure out whether a block is
81 *	already under delayed allocation, in order to determine whether a
82 *	quota reservation is needed for the cluster.
83 *
84 *   --	writeout
85 *	Writeout looks up the whole page cache to see if a buffer is
86 *	mapped.  If there are not very many delayed buffers, then it is
87 *	time consuming.
88 *
89 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
90 * bigalloc and writeout can figure out whether a block or a range of
91 * blocks is under delayed allocation (i.e. belongs to a delayed extent)
92 * by searching the extent status tree.
93 *
94 *
95 * ==========================================================================
96 * 2. Ext4 extent status tree implementation
97 *
98 *   --	extent
99 *	An extent is a range of blocks which are contiguous logically and
100 *	physically.  Unlike an extent in the extent tree, this extent is
101 *	an in-memory struct; there is no corresponding on-disk data.  There
102 *	is no limit on the length of an extent, so an extent can contain as
103 *	many blocks as are contiguous logically and physically.
104 *
105 *   --	extent status tree
106 *	Every inode has an extent status tree and all allocated blocks
107 *	are added to the tree with their status.  The extents in the
108 *	tree are ordered by logical block number.
109 *
110 *   --	operations on an extent status tree
111 *	There are three important operations on an extent status tree: finding
112 *	the next extent, adding an extent (a range of blocks) and removing an extent.
113 *
114 *   --	race on an extent status tree
115 *	The extent status tree is protected by inode->i_es_lock.
116 *
117 *   --	memory consumption
118 *      A fragmented extent tree will make the extent status tree cost too
119 *      much memory.  Hence, we will reclaim written/unwritten/hole extents
120 *      from the tree under heavy memory pressure.
121 *
122 *
123 * ==========================================================================
124 * 3. Performance analysis
125 *
126 *   --	overhead
127 *	1. There is a cached extent for write access, so if writes are
128 *	not very random, adding space operations run in O(1) time.
129 *
130 *   --	gain
131 *	2. Code is much simpler, more readable, more maintainable and
132 *	more efficient.
133 *
134 *
135 * ==========================================================================
136 * 4. TODO list
137 *
138 *   -- Refactor delayed space reservation
139 *
140 *   -- Extent-level locking
141 */
142
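/*
 * A minimal usage sketch (illustrative only, compiled out with #if 0):
 * real callers such as ext4_map_blocks()/ext4_da_map_blocks() record a
 * mapping once it is known, serve later lookups from the in-memory tree,
 * and drop the range when it is invalidated.  The function name and the
 * block numbers below are made up for the example.
 */
#if 0
static void ext4_es_usage_sketch(struct inode *inode)
{
	struct extent_status es;

	/* Logical blocks [100, 109] map to physical blocks starting at 5000. */
	ext4_es_insert_extent(inode, 100, 10, 5000, EXTENT_STATUS_WRITTEN);

	/* A later lookup for block 105 is answered from the in-memory tree. */
	if (ext4_es_lookup_extent(inode, 105, &es))
		es_debug("found [%u/%u)\n", es.es_lblk, es.es_len);

	/* Forget the range, e.g. when the blocks are released. */
	ext4_es_remove_extent(inode, 100, 10);
}
#endif
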
143static struct kmem_cache *ext4_es_cachep;
144
145static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
146static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
147			      ext4_lblk_t end);
148static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
149static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
150		       struct ext4_inode_info *locked_ei);
151
152int __init ext4_init_es(void)
153{
154	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
155					   sizeof(struct extent_status),
156					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
157	if (ext4_es_cachep == NULL)
158		return -ENOMEM;
159	return 0;
160}
161
162void ext4_exit_es(void)
163{
164	if (ext4_es_cachep)
165		kmem_cache_destroy(ext4_es_cachep);
166}
167
168void ext4_es_init_tree(struct ext4_es_tree *tree)
169{
170	tree->root = RB_ROOT;
171	tree->cache_es = NULL;
172}
173
174#ifdef ES_DEBUG__
175static void ext4_es_print_tree(struct inode *inode)
176{
177	struct ext4_es_tree *tree;
178	struct rb_node *node;
179
180	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
181	tree = &EXT4_I(inode)->i_es_tree;
182	node = rb_first(&tree->root);
183	while (node) {
184		struct extent_status *es;
185		es = rb_entry(node, struct extent_status, rb_node);
186		printk(KERN_DEBUG " [%u/%u) %llu %x",
187		       es->es_lblk, es->es_len,
188		       ext4_es_pblock(es), ext4_es_status(es));
189		node = rb_next(node);
190	}
191	printk(KERN_DEBUG "\n");
192}
193#else
194#define ext4_es_print_tree(inode)
195#endif
196
197static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
198{
199	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
200	return es->es_lblk + es->es_len - 1;
201}
202
203/*
204 * search through the tree for a delayed extent with a given offset.  If
205 * it can't be found, try to find the next extent.
206 */
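/*
 * For example (illustrative numbers): with extents [0/5) and [10/5) in the
 * tree, a search for lblk 7 returns the [10/5) extent, and a search for
 * lblk 20 returns NULL because no extent exists at or after that offset.
 */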
207static struct extent_status *__es_tree_search(struct rb_root *root,
208					      ext4_lblk_t lblk)
209{
210	struct rb_node *node = root->rb_node;
211	struct extent_status *es = NULL;
212
213	while (node) {
214		es = rb_entry(node, struct extent_status, rb_node);
215		if (lblk < es->es_lblk)
216			node = node->rb_left;
217		else if (lblk > ext4_es_end(es))
218			node = node->rb_right;
219		else
220			return es;
221	}
222
223	if (es && lblk < es->es_lblk)
224		return es;
225
226	if (es && lblk > ext4_es_end(es)) {
227		node = rb_next(&es->rb_node);
228		return node ? rb_entry(node, struct extent_status, rb_node) :
229			      NULL;
230	}
231
232	return NULL;
233}
234
235/*
236 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
237 * @lblk if it exists, otherwise the next delayed extent after @lblk.
238 *
239 * @inode: the inode which owns delayed extents
240 * @lblk: the offset where we start to search
241 * @end: the offset where we stop to search
242 * @es: delayed extent that we found
243 */
244void ext4_es_find_delayed_extent_range(struct inode *inode,
245				 ext4_lblk_t lblk, ext4_lblk_t end,
246				 struct extent_status *es)
247{
248	struct ext4_es_tree *tree = NULL;
249	struct extent_status *es1 = NULL;
250	struct rb_node *node;
251
252	BUG_ON(es == NULL);
253	BUG_ON(end < lblk);
254	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
255
256	read_lock(&EXT4_I(inode)->i_es_lock);
257	tree = &EXT4_I(inode)->i_es_tree;
258
259	/* find extent in the cache first */
260	es->es_lblk = es->es_len = es->es_pblk = 0;
261	if (tree->cache_es) {
262		es1 = tree->cache_es;
263		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
264			es_debug("%u cached by [%u/%u) %llu %x\n",
265				 lblk, es1->es_lblk, es1->es_len,
266				 ext4_es_pblock(es1), ext4_es_status(es1));
267			goto out;
268		}
269	}
270
271	es1 = __es_tree_search(&tree->root, lblk);
272
273out:
274	if (es1 && !ext4_es_is_delayed(es1)) {
275		while ((node = rb_next(&es1->rb_node)) != NULL) {
276			es1 = rb_entry(node, struct extent_status, rb_node);
277			if (es1->es_lblk > end) {
278				es1 = NULL;
279				break;
280			}
281			if (ext4_es_is_delayed(es1))
282				break;
283		}
284	}
285
286	if (es1 && ext4_es_is_delayed(es1)) {
287		tree->cache_es = es1;
288		es->es_lblk = es1->es_lblk;
289		es->es_len = es1->es_len;
290		es->es_pblk = es1->es_pblk;
291	}
292
293	read_unlock(&EXT4_I(inode)->i_es_lock);
294
295	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
296}
297
298static void ext4_es_list_add(struct inode *inode)
299{
300	struct ext4_inode_info *ei = EXT4_I(inode);
301	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
302
303	if (!list_empty(&ei->i_es_list))
304		return;
305
306	spin_lock(&sbi->s_es_lock);
307	if (list_empty(&ei->i_es_list)) {
308		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
309		sbi->s_es_nr_inode++;
310	}
311	spin_unlock(&sbi->s_es_lock);
312}
313
314static void ext4_es_list_del(struct inode *inode)
315{
316	struct ext4_inode_info *ei = EXT4_I(inode);
317	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
318
319	spin_lock(&sbi->s_es_lock);
320	if (!list_empty(&ei->i_es_list)) {
321		list_del_init(&ei->i_es_list);
322		sbi->s_es_nr_inode--;
323		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
324	}
325	spin_unlock(&sbi->s_es_lock);
326}
327
328static struct extent_status *
329ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
330		     ext4_fsblk_t pblk)
331{
332	struct extent_status *es;
333	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
334	if (es == NULL)
335		return NULL;
336	es->es_lblk = lblk;
337	es->es_len = len;
338	es->es_pblk = pblk;
339
340	/*
341	 * We don't count delayed extents because we never try to reclaim them
342	 */
343	if (!ext4_es_is_delayed(es)) {
344		if (!EXT4_I(inode)->i_es_shk_nr++)
345			ext4_es_list_add(inode);
346		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
347					s_es_stats.es_stats_shk_cnt);
348	}
349
350	EXT4_I(inode)->i_es_all_nr++;
351	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
352
353	return es;
354}
355
356static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
357{
358	EXT4_I(inode)->i_es_all_nr--;
359	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
360
361	/* Decrease the shrink counter when this es is not delayed */
362	if (!ext4_es_is_delayed(es)) {
363		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
364		if (!--EXT4_I(inode)->i_es_shk_nr)
365			ext4_es_list_del(inode);
366		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
367					s_es_stats.es_stats_shk_cnt);
368	}
369
370	kmem_cache_free(ext4_es_cachep, es);
371}
372
373/*
374 * Check whether or not two extents can be merged.
375 * Conditions:
376 *  - logical block numbers are contiguous
377 *  - physical block numbers are contiguous
378 *  - status is equal
379 */
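/*
 * For example (illustrative numbers): a written extent [0/10) at pblk 100
 * and a written extent [10/5) at pblk 110 satisfy all of the conditions
 * above, so they can be merged into a single extent [0/15) at pblk 100.
 */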
380static int ext4_es_can_be_merged(struct extent_status *es1,
381				 struct extent_status *es2)
382{
383	if (ext4_es_type(es1) != ext4_es_type(es2))
384		return 0;
385
386	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
387		pr_warn("ES assertion failed when merging extents. "
388			"The sum of lengths of es1 (%d) and es2 (%d) "
389			"is bigger than allowed file size (%d)\n",
390			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
391		WARN_ON(1);
392		return 0;
393	}
394
395	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
396		return 0;
397
398	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
399	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
400		return 1;
401
402	if (ext4_es_is_hole(es1))
403		return 1;
404
405	/* we need to check that a delayed extent doesn't carry the unwritten status */
406	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
407		return 1;
408
409	return 0;
410}
411
412static struct extent_status *
413ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
414{
415	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
416	struct extent_status *es1;
417	struct rb_node *node;
418
419	node = rb_prev(&es->rb_node);
420	if (!node)
421		return es;
422
423	es1 = rb_entry(node, struct extent_status, rb_node);
424	if (ext4_es_can_be_merged(es1, es)) {
425		es1->es_len += es->es_len;
426		if (ext4_es_is_referenced(es))
427			ext4_es_set_referenced(es1);
428		rb_erase(&es->rb_node, &tree->root);
429		ext4_es_free_extent(inode, es);
430		es = es1;
431	}
432
433	return es;
434}
435
436static struct extent_status *
437ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
438{
439	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
440	struct extent_status *es1;
441	struct rb_node *node;
442
443	node = rb_next(&es->rb_node);
444	if (!node)
445		return es;
446
447	es1 = rb_entry(node, struct extent_status, rb_node);
448	if (ext4_es_can_be_merged(es, es1)) {
449		es->es_len += es1->es_len;
450		if (ext4_es_is_referenced(es1))
451			ext4_es_set_referenced(es);
452		rb_erase(node, &tree->root);
453		ext4_es_free_extent(inode, es1);
454	}
455
456	return es;
457}
458
459#ifdef ES_AGGRESSIVE_TEST
460#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */
461
462static void ext4_es_insert_extent_ext_check(struct inode *inode,
463					    struct extent_status *es)
464{
465	struct ext4_ext_path *path = NULL;
466	struct ext4_extent *ex;
467	ext4_lblk_t ee_block;
468	ext4_fsblk_t ee_start;
469	unsigned short ee_len;
470	int depth, ee_status, es_status;
471
472	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
473	if (IS_ERR(path))
474		return;
475
476	depth = ext_depth(inode);
477	ex = path[depth].p_ext;
478
479	if (ex) {
480
481		ee_block = le32_to_cpu(ex->ee_block);
482		ee_start = ext4_ext_pblock(ex);
483		ee_len = ext4_ext_get_actual_len(ex);
484
485		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
486		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
487
488		/*
489		 * Make sure ex and es do not overlap when we try to insert
490		 * a delayed/hole extent.
491		 */
492		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
493			if (in_range(es->es_lblk, ee_block, ee_len)) {
494				pr_warn("ES insert assertion failed for "
495					"inode: %lu we can find an extent "
496					"at block [%d/%d/%llu/%c], but we "
497					"want to add a delayed/hole extent "
498					"[%d/%d/%llu/%x]\n",
499					inode->i_ino, ee_block, ee_len,
500					ee_start, ee_status ? 'u' : 'w',
501					es->es_lblk, es->es_len,
502					ext4_es_pblock(es), ext4_es_status(es));
503			}
504			goto out;
505		}
506
507		/*
508		 * We don't check ee_block == es->es_lblk, etc. because es
509		 * might be a part of the whole extent, and vice versa.
510		 */
511		if (es->es_lblk < ee_block ||
512		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
513			pr_warn("ES insert assertion failed for inode: %lu "
514				"ex_status [%d/%d/%llu/%c] != "
515				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
516				ee_block, ee_len, ee_start,
517				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
518				ext4_es_pblock(es), es_status ? 'u' : 'w');
519			goto out;
520		}
521
522		if (ee_status ^ es_status) {
523			pr_warn("ES insert assertion failed for inode: %lu "
524				"ex_status [%d/%d/%llu/%c] != "
525				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
526				ee_block, ee_len, ee_start,
527				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
528				ext4_es_pblock(es), es_status ? 'u' : 'w');
529		}
530	} else {
531		/*
532		 * We can't find an extent on disk.  So we need to make sure
533		 * that we are not trying to add a written/unwritten extent.
534		 */
535		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
536			pr_warn("ES insert assertion failed for inode: %lu "
537				"can't find an extent at block %d but we want "
538				"to add a written/unwritten extent "
539				"[%d/%d/%llu/%x]\n", inode->i_ino,
540				es->es_lblk, es->es_lblk, es->es_len,
541				ext4_es_pblock(es), ext4_es_status(es));
542		}
543	}
544out:
545	ext4_ext_drop_refs(path);
546	kfree(path);
547}
548
549static void ext4_es_insert_extent_ind_check(struct inode *inode,
550					    struct extent_status *es)
551{
552	struct ext4_map_blocks map;
553	int retval;
554
555	/*
556	 * Here we call ext4_ind_map_blocks to look up a block mapping because
557	 * the 'Indirect' structure is defined in indirect.c, so we cannot
558	 * access the direct/indirect tree from outside it.  It would be too
559	 * ugly to define this function in the indirect.c file.
560	 */
561
562	map.m_lblk = es->es_lblk;
563	map.m_len = es->es_len;
564
565	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
566	if (retval > 0) {
567		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
568			/*
569			 * We want to add a delayed/hole extent but this
570			 * block has been allocated.
571			 */
572			pr_warn("ES insert assertion failed for inode: %lu "
573				"We can find blocks but we want to add a "
574				"delayed/hole extent [%d/%d/%llu/%x]\n",
575				inode->i_ino, es->es_lblk, es->es_len,
576				ext4_es_pblock(es), ext4_es_status(es));
577			return;
578		} else if (ext4_es_is_written(es)) {
579			if (retval != es->es_len) {
580				pr_warn("ES insert assertion failed for "
581					"inode: %lu retval %d != es_len %d\n",
582					inode->i_ino, retval, es->es_len);
583				return;
584			}
585			if (map.m_pblk != ext4_es_pblock(es)) {
586				pr_warn("ES insert assertion failed for "
587					"inode: %lu m_pblk %llu != "
588					"es_pblk %llu\n",
589					inode->i_ino, map.m_pblk,
590					ext4_es_pblock(es));
591				return;
592			}
593		} else {
594			/*
595			 * We don't need to check for unwritten extents because
596			 * an indirect-based file doesn't have them.
597			 */
598			BUG_ON(1);
599		}
600	} else if (retval == 0) {
601		if (ext4_es_is_written(es)) {
602			pr_warn("ES insert assertion failed for inode: %lu "
603				"We can't find the block but we want to add "
604				"a written extent [%d/%d/%llu/%x]\n",
605				inode->i_ino, es->es_lblk, es->es_len,
606				ext4_es_pblock(es), ext4_es_status(es));
607			return;
608		}
609	}
610}
611
612static inline void ext4_es_insert_extent_check(struct inode *inode,
613					       struct extent_status *es)
614{
615	/*
616	 * We don't need to worry about the race condition because
617	 * the caller holds i_data_sem.
618	 */
619	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
620	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
621		ext4_es_insert_extent_ext_check(inode, es);
622	else
623		ext4_es_insert_extent_ind_check(inode, es);
624}
625#else
626static inline void ext4_es_insert_extent_check(struct inode *inode,
627					       struct extent_status *es)
628{
629}
630#endif
631
632static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
633{
634	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
635	struct rb_node **p = &tree->root.rb_node;
636	struct rb_node *parent = NULL;
637	struct extent_status *es;
638
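	/*
	 * Walk down the tree; merge the new extent into an adjacent
	 * compatible extent if one is found, otherwise a new node is
	 * linked in below.
	 */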
639	while (*p) {
640		parent = *p;
641		es = rb_entry(parent, struct extent_status, rb_node);
642
643		if (newes->es_lblk < es->es_lblk) {
644			if (ext4_es_can_be_merged(newes, es)) {
645				/*
646				 * Here we can modify es_lblk directly
647				 * because it doesn't overlap.
648				 */
649				es->es_lblk = newes->es_lblk;
650				es->es_len += newes->es_len;
651				if (ext4_es_is_written(es) ||
652				    ext4_es_is_unwritten(es))
653					ext4_es_store_pblock(es,
654							     newes->es_pblk);
655				es = ext4_es_try_to_merge_left(inode, es);
656				goto out;
657			}
658			p = &(*p)->rb_left;
659		} else if (newes->es_lblk > ext4_es_end(es)) {
660			if (ext4_es_can_be_merged(es, newes)) {
661				es->es_len += newes->es_len;
662				es = ext4_es_try_to_merge_right(inode, es);
663				goto out;
664			}
665			p = &(*p)->rb_right;
666		} else {
667			BUG_ON(1);
668			return -EINVAL;
669		}
670	}
671
672	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
673				  newes->es_pblk);
674	if (!es)
675		return -ENOMEM;
676	rb_link_node(&es->rb_node, parent, p);
677	rb_insert_color(&es->rb_node, &tree->root);
678
679out:
680	tree->cache_es = es;
681	return 0;
682}
683
684/*
685 * ext4_es_insert_extent() adds information to an inode's extent
686 * status tree.
687 *
688 * Return 0 on success, error code on failure.
689 */
690int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
691			  ext4_lblk_t len, ext4_fsblk_t pblk,
692			  unsigned int status)
693{
694	struct extent_status newes;
695	ext4_lblk_t end = lblk + len - 1;
696	int err = 0;
697
698	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
699		 lblk, len, pblk, status, inode->i_ino);
700
701	if (!len)
702		return 0;
703
704	BUG_ON(end < lblk);
705
706	if ((status & EXTENT_STATUS_DELAYED) &&
707	    (status & EXTENT_STATUS_WRITTEN)) {
708		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
709				"delayed and written which can potentially "
710				"cause data loss.\n", lblk, len);
711		WARN_ON(1);
712	}
713
714	newes.es_lblk = lblk;
715	newes.es_len = len;
716	ext4_es_store_pblock_status(&newes, pblk, status);
717	trace_ext4_es_insert_extent(inode, &newes);
718
719	ext4_es_insert_extent_check(inode, &newes);
720
721	write_lock(&EXT4_I(inode)->i_es_lock);
722	err = __es_remove_extent(inode, lblk, end);
723	if (err != 0)
724		goto error;
725retry:
726	err = __es_insert_extent(inode, &newes);
727	if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
728					  128, EXT4_I(inode)))
729		goto retry;
730	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
731		err = 0;
732
733error:
734	write_unlock(&EXT4_I(inode)->i_es_lock);
735
736	ext4_es_print_tree(inode);
737
738	return err;
739}
740
741/*
742 * ext4_es_cache_extent() inserts information into the extent status
743 * tree if and only if there isn't information about the range in
744 * question already.
745 */
746void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
747			  ext4_lblk_t len, ext4_fsblk_t pblk,
748			  unsigned int status)
749{
750	struct extent_status *es;
751	struct extent_status newes;
752	ext4_lblk_t end = lblk + len - 1;
753
754	newes.es_lblk = lblk;
755	newes.es_len = len;
756	ext4_es_store_pblock_status(&newes, pblk, status);
757	trace_ext4_es_cache_extent(inode, &newes);
758
759	if (!len)
760		return;
761
762	BUG_ON(end < lblk);
763
764	write_lock(&EXT4_I(inode)->i_es_lock);
765
766	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
767	if (!es || es->es_lblk > end)
768		__es_insert_extent(inode, &newes);
769	write_unlock(&EXT4_I(inode)->i_es_lock);
770}
771
772/*
773 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
774 *
775 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
776 *
777 * Return: 1 if found, 0 if not.
778 */
779int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
780			  struct extent_status *es)
781{
782	struct ext4_es_tree *tree;
783	struct ext4_es_stats *stats;
784	struct extent_status *es1 = NULL;
785	struct rb_node *node;
786	int found = 0;
787
788	trace_ext4_es_lookup_extent_enter(inode, lblk);
789	es_debug("lookup extent in block %u\n", lblk);
790
791	tree = &EXT4_I(inode)->i_es_tree;
792	read_lock(&EXT4_I(inode)->i_es_lock);
793
794	/* find extent in the cache first */
795	es->es_lblk = es->es_len = es->es_pblk = 0;
796	if (tree->cache_es) {
797		es1 = tree->cache_es;
798		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
799			es_debug("%u cached by [%u/%u)\n",
800				 lblk, es1->es_lblk, es1->es_len);
801			found = 1;
802			goto out;
803		}
804	}
805
806	node = tree->root.rb_node;
807	while (node) {
808		es1 = rb_entry(node, struct extent_status, rb_node);
809		if (lblk < es1->es_lblk)
810			node = node->rb_left;
811		else if (lblk > ext4_es_end(es1))
812			node = node->rb_right;
813		else {
814			found = 1;
815			break;
816		}
817	}
818
819out:
820	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
821	if (found) {
822		BUG_ON(!es1);
823		es->es_lblk = es1->es_lblk;
824		es->es_len = es1->es_len;
825		es->es_pblk = es1->es_pblk;
826		if (!ext4_es_is_referenced(es1))
827			ext4_es_set_referenced(es1);
828		stats->es_stats_cache_hits++;
829	} else {
830		stats->es_stats_cache_misses++;
831	}
832
833	read_unlock(&EXT4_I(inode)->i_es_lock);
834
835	trace_ext4_es_lookup_extent_exit(inode, es, found);
836	return found;
837}
838
839static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
840			      ext4_lblk_t end)
841{
842	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
843	struct rb_node *node;
844	struct extent_status *es;
845	struct extent_status orig_es;
846	ext4_lblk_t len1, len2;
847	ext4_fsblk_t block;
848	int err;
849
850retry:
851	err = 0;
852	es = __es_tree_search(&tree->root, lblk);
853	if (!es)
854		goto out;
855	if (es->es_lblk > end)
856		goto out;
857
858	/* Simply invalidate cache_es. */
859	tree->cache_es = NULL;
860
861	orig_es.es_lblk = es->es_lblk;
862	orig_es.es_len = es->es_len;
863	orig_es.es_pblk = es->es_pblk;
864
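	/*
	 * The removed range [lblk, end] can leave up to two pieces of the
	 * found extent: len1 blocks in front of lblk and len2 blocks behind
	 * end.  For example (illustrative numbers): removing [5, 8] from a
	 * written extent [0/12) at pblk 1000 keeps [0/5) at pblk 1000 and
	 * re-inserts a new extent [9/3) at pblk 1009.
	 */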
865	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
866	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
867	if (len1 > 0)
868		es->es_len = len1;
869	if (len2 > 0) {
870		if (len1 > 0) {
871			struct extent_status newes;
872
873			newes.es_lblk = end + 1;
874			newes.es_len = len2;
875			block = 0x7FDEADBEEFULL;
876			if (ext4_es_is_written(&orig_es) ||
877			    ext4_es_is_unwritten(&orig_es))
878				block = ext4_es_pblock(&orig_es) +
879					orig_es.es_len - len2;
880			ext4_es_store_pblock_status(&newes, block,
881						    ext4_es_status(&orig_es));
882			err = __es_insert_extent(inode, &newes);
883			if (err) {
884				es->es_lblk = orig_es.es_lblk;
885				es->es_len = orig_es.es_len;
886				if ((err == -ENOMEM) &&
887				    __es_shrink(EXT4_SB(inode->i_sb),
888							128, EXT4_I(inode)))
889					goto retry;
890				goto out;
891			}
892		} else {
893			es->es_lblk = end + 1;
894			es->es_len = len2;
895			if (ext4_es_is_written(es) ||
896			    ext4_es_is_unwritten(es)) {
897				block = orig_es.es_pblk + orig_es.es_len - len2;
898				ext4_es_store_pblock(es, block);
899			}
900		}
901		goto out;
902	}
903
904	if (len1 > 0) {
905		node = rb_next(&es->rb_node);
906		if (node)
907			es = rb_entry(node, struct extent_status, rb_node);
908		else
909			es = NULL;
910	}
911
912	while (es && ext4_es_end(es) <= end) {
913		node = rb_next(&es->rb_node);
914		rb_erase(&es->rb_node, &tree->root);
915		ext4_es_free_extent(inode, es);
916		if (!node) {
917			es = NULL;
918			break;
919		}
920		es = rb_entry(node, struct extent_status, rb_node);
921	}
922
923	if (es && es->es_lblk < end + 1) {
924		ext4_lblk_t orig_len = es->es_len;
925
926		len1 = ext4_es_end(es) - end;
927		es->es_lblk = end + 1;
928		es->es_len = len1;
929		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
930			block = es->es_pblk + orig_len - len1;
931			ext4_es_store_pblock(es, block);
932		}
933	}
934
935out:
936	return err;
937}
938
939/*
940 * ext4_es_remove_extent() removes a block range from an extent status tree.
941 *
942 * Return 0 on success, error code on failure.
943 */
944int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
945			  ext4_lblk_t len)
946{
947	ext4_lblk_t end;
948	int err = 0;
949
950	trace_ext4_es_remove_extent(inode, lblk, len);
951	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
952		 lblk, len, inode->i_ino);
953
954	if (!len)
955		return err;
956
957	end = lblk + len - 1;
958	BUG_ON(end < lblk);
959
960	/*
961	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
962	 * so that we are sure __es_shrink() is done with the inode before it
963	 * is reclaimed.
964	 */
965	write_lock(&EXT4_I(inode)->i_es_lock);
966	err = __es_remove_extent(inode, lblk, end);
967	write_unlock(&EXT4_I(inode)->i_es_lock);
968	ext4_es_print_tree(inode);
969	return err;
970}
971
972static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
973		       struct ext4_inode_info *locked_ei)
974{
975	struct ext4_inode_info *ei;
976	struct ext4_es_stats *es_stats;
977	ktime_t start_time;
978	u64 scan_time;
979	int nr_to_walk;
980	int nr_shrunk = 0;
981	int retried = 0, nr_skipped = 0;
982
983	es_stats = &sbi->s_es_stats;
984	start_time = ktime_get();
985
986retry:
987	spin_lock(&sbi->s_es_lock);
988	nr_to_walk = sbi->s_es_nr_inode;
989	while (nr_to_walk-- > 0) {
990		if (list_empty(&sbi->s_es_list)) {
991			spin_unlock(&sbi->s_es_lock);
992			goto out;
993		}
994		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
995				      i_es_list);
996		/* Move the inode to the tail */
997		list_move_tail(&ei->i_es_list, &sbi->s_es_list);
998
999		/*
1000		 * Normally we try hard to avoid shrinking precached inodes,
1001		 * but we will as a last resort.
1002		 */
1003		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1004						EXT4_STATE_EXT_PRECACHED)) {
1005			nr_skipped++;
1006			continue;
1007		}
1008
1009		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1010			nr_skipped++;
1011			continue;
1012		}
1013		/*
1014		 * Now we hold i_es_lock which protects us from inode reclaim
1015		 * freeing the inode under us
1016		 */
1017		spin_unlock(&sbi->s_es_lock);
1018
1019		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1020		write_unlock(&ei->i_es_lock);
1021
1022		if (nr_to_scan <= 0)
1023			goto out;
1024		spin_lock(&sbi->s_es_lock);
1025	}
1026	spin_unlock(&sbi->s_es_lock);
1027
1028	/*
1029	 * If we skipped any inodes, and we weren't able to make any
1030	 * forward progress, try again to scan precached inodes.
1031	 */
1032	if ((nr_shrunk == 0) && nr_skipped && !retried) {
1033		retried++;
1034		goto retry;
1035	}
1036
1037	if (locked_ei && nr_shrunk == 0)
1038		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1039
1040out:
1041	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
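	/* Fold the new sample into the running averages, weighting old data 3:1. */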
1042	if (likely(es_stats->es_stats_scan_time))
1043		es_stats->es_stats_scan_time = (scan_time +
1044				es_stats->es_stats_scan_time*3) / 4;
1045	else
1046		es_stats->es_stats_scan_time = scan_time;
1047	if (scan_time > es_stats->es_stats_max_scan_time)
1048		es_stats->es_stats_max_scan_time = scan_time;
1049	if (likely(es_stats->es_stats_shrunk))
1050		es_stats->es_stats_shrunk = (nr_shrunk +
1051				es_stats->es_stats_shrunk*3) / 4;
1052	else
1053		es_stats->es_stats_shrunk = nr_shrunk;
1054
1055	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1056			     nr_skipped, retried);
1057	return nr_shrunk;
1058}
1059
1060static unsigned long ext4_es_count(struct shrinker *shrink,
1061				   struct shrink_control *sc)
1062{
1063	unsigned long nr;
1064	struct ext4_sb_info *sbi;
1065
1066	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1067	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1068	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1069	return nr;
1070}
1071
1072static unsigned long ext4_es_scan(struct shrinker *shrink,
1073				  struct shrink_control *sc)
1074{
1075	struct ext4_sb_info *sbi = container_of(shrink,
1076					struct ext4_sb_info, s_es_shrinker);
1077	int nr_to_scan = sc->nr_to_scan;
1078	int ret, nr_shrunk;
1079
1080	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1081	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1082
1083	if (!nr_to_scan)
1084		return ret;
1085
1086	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1087
1088	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1089	return nr_shrunk;
1090}
1091
1092static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
1093{
1094	return *pos ? NULL : SEQ_START_TOKEN;
1095}
1096
1097static void *
1098ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
1099{
1100	return NULL;
1101}
1102
1103static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1104{
1105	struct ext4_sb_info *sbi = seq->private;
1106	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1107	struct ext4_inode_info *ei, *max = NULL;
1108	unsigned int inode_cnt = 0;
1109
1110	if (v != SEQ_START_TOKEN)
1111		return 0;
1112
1113	/* here we just find an inode that has the max nr. of objects */
1114	spin_lock(&sbi->s_es_lock);
1115	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1116		inode_cnt++;
1117		if (max && max->i_es_all_nr < ei->i_es_all_nr)
1118			max = ei;
1119		else if (!max)
1120			max = ei;
1121	}
1122	spin_unlock(&sbi->s_es_lock);
1123
1124	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
1125		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1126		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1127	seq_printf(seq, "  %lu/%lu cache hits/misses\n",
1128		   es_stats->es_stats_cache_hits,
1129		   es_stats->es_stats_cache_misses);
1130	if (inode_cnt)
1131		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
1132
1133	seq_printf(seq, "average:\n  %llu us scan time\n",
1134	    div_u64(es_stats->es_stats_scan_time, 1000));
1135	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
1136	if (inode_cnt)
1137		seq_printf(seq,
1138		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
1139		    "  %llu us max scan time\n",
1140		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1141		    div_u64(es_stats->es_stats_max_scan_time, 1000));
1142
1143	return 0;
1144}
1145
1146static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
1147{
1148}
1149
1150static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
1151	.start = ext4_es_seq_shrinker_info_start,
1152	.next  = ext4_es_seq_shrinker_info_next,
1153	.stop  = ext4_es_seq_shrinker_info_stop,
1154	.show  = ext4_es_seq_shrinker_info_show,
1155};
1156
1157static int
1158ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
1159{
1160	int ret;
1161
1162	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
1163	if (!ret) {
1164		struct seq_file *m = file->private_data;
1165		m->private = PDE_DATA(inode);
1166	}
1167
1168	return ret;
1169}
1170
1171static int
1172ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
1173{
1174	return seq_release(inode, file);
1175}
1176
1177static const struct file_operations ext4_es_seq_shrinker_info_fops = {
1178	.owner		= THIS_MODULE,
1179	.open		= ext4_es_seq_shrinker_info_open,
1180	.read		= seq_read,
1181	.llseek		= seq_lseek,
1182	.release	= ext4_es_seq_shrinker_info_release,
1183};
1184
1185int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1186{
1187	int err;
1188
1189	/* Make sure we have enough bits for physical block number */
1190	BUILD_BUG_ON(ES_SHIFT < 48);
1191	INIT_LIST_HEAD(&sbi->s_es_list);
1192	sbi->s_es_nr_inode = 0;
1193	spin_lock_init(&sbi->s_es_lock);
1194	sbi->s_es_stats.es_stats_shrunk = 0;
1195	sbi->s_es_stats.es_stats_cache_hits = 0;
1196	sbi->s_es_stats.es_stats_cache_misses = 0;
1197	sbi->s_es_stats.es_stats_scan_time = 0;
1198	sbi->s_es_stats.es_stats_max_scan_time = 0;
1199	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1200	if (err)
1201		return err;
1202	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1203	if (err)
1204		goto err1;
1205
1206	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1207	sbi->s_es_shrinker.count_objects = ext4_es_count;
1208	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1209	err = register_shrinker(&sbi->s_es_shrinker);
1210	if (err)
1211		goto err2;
1212
1213	if (sbi->s_proc)
1214		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
1215				 &ext4_es_seq_shrinker_info_fops, sbi);
1216
1217	return 0;
1218
1219err2:
1220	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1221err1:
1222	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1223	return err;
1224}
1225
1226void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1227{
1228	if (sbi->s_proc)
1229		remove_proc_entry("es_shrinker_info", sbi->s_proc);
1230	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1231	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1232	unregister_shrinker(&sbi->s_es_shrinker);
1233}
1234
1235/*
1236 * Shrink extents in the given inode from ei->i_es_shrink_lblk till end. Scan at
1237 * most *nr_to_scan extents, update *nr_to_scan accordingly.
1238 *
1239 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1240 * Increment *nr_shrunk by the number of reclaimed extents. Also update
1241 * ei->i_es_shrink_lblk to where we should continue scanning.
1242 */
1243static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1244				 int *nr_to_scan, int *nr_shrunk)
1245{
1246	struct inode *inode = &ei->vfs_inode;
1247	struct ext4_es_tree *tree = &ei->i_es_tree;
1248	struct extent_status *es;
1249	struct rb_node *node;
1250
1251	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1252	if (!es)
1253		goto out_wrap;
1254	node = &es->rb_node;
1255	while (*nr_to_scan > 0) {
1256		if (es->es_lblk > end) {
1257			ei->i_es_shrink_lblk = end + 1;
1258			return 0;
1259		}
1260
1261		(*nr_to_scan)--;
1262		node = rb_next(&es->rb_node);
1263		/*
1264		 * We can't reclaim delayed extents from the status tree because
1265		 * fiemap, bigalloc, and seek_data/hole need to use them.
1266		 */
1267		if (ext4_es_is_delayed(es))
1268			goto next;
1269		if (ext4_es_is_referenced(es)) {
1270			ext4_es_clear_referenced(es);
1271			goto next;
1272		}
1273
1274		rb_erase(&es->rb_node, &tree->root);
1275		ext4_es_free_extent(inode, es);
1276		(*nr_shrunk)++;
1277next:
1278		if (!node)
1279			goto out_wrap;
1280		es = rb_entry(node, struct extent_status, rb_node);
1281	}
1282	ei->i_es_shrink_lblk = es->es_lblk;
1283	return 1;
1284out_wrap:
1285	ei->i_es_shrink_lblk = 0;
1286	return 0;
1287}
1288
1289static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1290{
1291	struct inode *inode = &ei->vfs_inode;
1292	int nr_shrunk = 0;
1293	ext4_lblk_t start = ei->i_es_shrink_lblk;
1294	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1295				      DEFAULT_RATELIMIT_BURST);
1296
1297	if (ei->i_es_shk_nr == 0)
1298		return 0;
1299
1300	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1301	    __ratelimit(&_rs))
1302		ext4_warning(inode->i_sb, "forced shrink of precached extents");
1303
1304	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1305	    start != 0)
1306		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1307
1308	ei->i_es_tree.cache_es = NULL;
1309	return nr_shrunk;
1310}
1311