1/*
2 * Copyright (C) 2011 STRATO.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include <linux/btrfs.h>
27
28#include "ctree.h"
29#include "transaction.h"
30#include "disk-io.h"
31#include "locking.h"
32#include "ulist.h"
33#include "backref.h"
34#include "extent_io.h"
35#include "qgroup.h"
36
37/* TODO XXX FIXME
38 *  - subvol delete -> delete when ref goes to 0? delete limits also?
39 *  - reorganize keys
40 *  - compressed
41 *  - sync
42 *  - copy also limits on subvol creation
43 *  - limit
44 *  - caches for ulists
45 *  - performance benchmarks
46 *  - check all ioctl parameters
47 */
48
49/*
50 * one struct for each qgroup, organized in fs_info->qgroup_tree.
51 */
52struct btrfs_qgroup {
53	u64 qgroupid;
54
55	/*
56	 * state
57	 */
58	u64 rfer;	/* referenced */
59	u64 rfer_cmpr;	/* referenced compressed */
60	u64 excl;	/* exclusive */
61	u64 excl_cmpr;	/* exclusive compressed */
62
63	/*
64	 * limits
65	 */
66	u64 lim_flags;	/* which limits are set */
67	u64 max_rfer;
68	u64 max_excl;
69	u64 rsv_rfer;
70	u64 rsv_excl;
71
72	/*
73	 * reservation tracking
74	 */
75	u64 reserved;
76
77	/*
78	 * lists
79	 */
80	struct list_head groups;  /* groups this group is member of */
81	struct list_head members; /* groups that are members of this group */
82	struct list_head dirty;   /* dirty groups */
83	struct rb_node node;	  /* tree of qgroups */
84
85	/*
86	 * temp variables for accounting operations
87	 */
88	u64 old_refcnt;
89	u64 new_refcnt;
90};
91
92/*
93 * glue structure to represent the relations between qgroups.
94 */
95struct btrfs_qgroup_list {
96	struct list_head next_group;	/* linked into member->groups */
97	struct list_head next_member;	/* linked into group->members */
98	struct btrfs_qgroup *group;
99	struct btrfs_qgroup *member;
100};
101
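/* helpers to stash qgroup pointers in the u64 aux field of a ulist node */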
102#define ptr_to_u64(x) ((u64)(uintptr_t)x)
103#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
104
105static int
106qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
107		   int init_flags);
108static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
109
110/* must be called with qgroup_ioctl_lock held */
111static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
112					   u64 qgroupid)
113{
114	struct rb_node *n = fs_info->qgroup_tree.rb_node;
115	struct btrfs_qgroup *qgroup;
116
117	while (n) {
118		qgroup = rb_entry(n, struct btrfs_qgroup, node);
119		if (qgroup->qgroupid < qgroupid)
120			n = n->rb_left;
121		else if (qgroup->qgroupid > qgroupid)
122			n = n->rb_right;
123		else
124			return qgroup;
125	}
126	return NULL;
127}
128
129/* must be called with qgroup_lock held */
130static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
131					  u64 qgroupid)
132{
133	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
134	struct rb_node *parent = NULL;
135	struct btrfs_qgroup *qgroup;
136
137	while (*p) {
138		parent = *p;
139		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
140
141		if (qgroup->qgroupid < qgroupid)
142			p = &(*p)->rb_left;
143		else if (qgroup->qgroupid > qgroupid)
144			p = &(*p)->rb_right;
145		else
146			return qgroup;
147	}
148
149	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
150	if (!qgroup)
151		return ERR_PTR(-ENOMEM);
152
153	qgroup->qgroupid = qgroupid;
154	INIT_LIST_HEAD(&qgroup->groups);
155	INIT_LIST_HEAD(&qgroup->members);
156	INIT_LIST_HEAD(&qgroup->dirty);
157
158	rb_link_node(&qgroup->node, parent, p);
159	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
160
161	return qgroup;
162}
163
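/*
 * Unlink a qgroup from all relation lists and from the dirty list, then free
 * it. Callers remove the qgroup from the rbtree before calling this.
 */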
164static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
165{
166	struct btrfs_qgroup_list *list;
167
168	list_del(&qgroup->dirty);
169	while (!list_empty(&qgroup->groups)) {
170		list = list_first_entry(&qgroup->groups,
171					struct btrfs_qgroup_list, next_group);
172		list_del(&list->next_group);
173		list_del(&list->next_member);
174		kfree(list);
175	}
176
177	while (!list_empty(&qgroup->members)) {
178		list = list_first_entry(&qgroup->members,
179					struct btrfs_qgroup_list, next_member);
180		list_del(&list->next_group);
181		list_del(&list->next_member);
182		kfree(list);
183	}
184	kfree(qgroup);
185}
186
187/* must be called with qgroup_lock held */
188static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
189{
190	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
191
192	if (!qgroup)
193		return -ENOENT;
194
195	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
196	__del_qgroup_rb(qgroup);
197	return 0;
198}
199
200/* must be called with qgroup_lock held */
201static int add_relation_rb(struct btrfs_fs_info *fs_info,
202			   u64 memberid, u64 parentid)
203{
204	struct btrfs_qgroup *member;
205	struct btrfs_qgroup *parent;
206	struct btrfs_qgroup_list *list;
207
208	member = find_qgroup_rb(fs_info, memberid);
209	parent = find_qgroup_rb(fs_info, parentid);
210	if (!member || !parent)
211		return -ENOENT;
212
213	list = kzalloc(sizeof(*list), GFP_ATOMIC);
214	if (!list)
215		return -ENOMEM;
216
217	list->group = parent;
218	list->member = member;
219	list_add_tail(&list->next_group, &member->groups);
220	list_add_tail(&list->next_member, &parent->members);
221
222	return 0;
223}
224
225/* must be called with qgroup_lock held */
226static int del_relation_rb(struct btrfs_fs_info *fs_info,
227			   u64 memberid, u64 parentid)
228{
229	struct btrfs_qgroup *member;
230	struct btrfs_qgroup *parent;
231	struct btrfs_qgroup_list *list;
232
233	member = find_qgroup_rb(fs_info, memberid);
234	parent = find_qgroup_rb(fs_info, parentid);
235	if (!member || !parent)
236		return -ENOENT;
237
238	list_for_each_entry(list, &member->groups, next_group) {
239		if (list->group == parent) {
240			list_del(&list->next_group);
241			list_del(&list->next_member);
242			kfree(list);
243			return 0;
244		}
245	}
246	return -ENOENT;
247}
248
249#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
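/*
 * Used by the qgroup sanity tests to verify that the in-memory counters of a
 * qgroup match the expected referenced/exclusive values.
 */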
250int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
251			       u64 rfer, u64 excl)
252{
253	struct btrfs_qgroup *qgroup;
254
255	qgroup = find_qgroup_rb(fs_info, qgroupid);
256	if (!qgroup)
257		return -EINVAL;
258	if (qgroup->rfer != rfer || qgroup->excl != excl)
259		return -EINVAL;
260	return 0;
261}
262#endif
263
264/*
265 * The full config is read in one go, only called from open_ctree()
266 * It doesn't use any locking, as at this point we're still single-threaded
267 */
268int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
269{
270	struct btrfs_key key;
271	struct btrfs_key found_key;
272	struct btrfs_root *quota_root = fs_info->quota_root;
273	struct btrfs_path *path = NULL;
274	struct extent_buffer *l;
275	int slot;
276	int ret = 0;
277	u64 flags = 0;
278	u64 rescan_progress = 0;
279
280	if (!fs_info->quota_enabled)
281		return 0;
282
283	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
284	if (!fs_info->qgroup_ulist) {
285		ret = -ENOMEM;
286		goto out;
287	}
288
289	path = btrfs_alloc_path();
290	if (!path) {
291		ret = -ENOMEM;
292		goto out;
293	}
294
295	/* default this to quota off, in case no status key is found */
296	fs_info->qgroup_flags = 0;
297
298	/*
299	 * pass 1: read status, all qgroup infos and limits
300	 */
301	key.objectid = 0;
302	key.type = 0;
303	key.offset = 0;
304	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
305	if (ret)
306		goto out;
307
308	while (1) {
309		struct btrfs_qgroup *qgroup;
310
311		slot = path->slots[0];
312		l = path->nodes[0];
313		btrfs_item_key_to_cpu(l, &found_key, slot);
314
315		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
316			struct btrfs_qgroup_status_item *ptr;
317
318			ptr = btrfs_item_ptr(l, slot,
319					     struct btrfs_qgroup_status_item);
320
321			if (btrfs_qgroup_status_version(l, ptr) !=
322			    BTRFS_QGROUP_STATUS_VERSION) {
323				btrfs_err(fs_info,
324				 "old qgroup version, quota disabled");
325				goto out;
326			}
327			if (btrfs_qgroup_status_generation(l, ptr) !=
328			    fs_info->generation) {
329				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
330				btrfs_err(fs_info,
331					"qgroup generation mismatch, "
332					"marked as inconsistent");
333			}
334			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
335									  ptr);
336			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
337			goto next1;
338		}
339
340		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
341		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
342			goto next1;
343
344		qgroup = find_qgroup_rb(fs_info, found_key.offset);
345		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
346		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
347			btrfs_err(fs_info, "inconsistent qgroup config");
348			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
349		}
350		if (!qgroup) {
351			qgroup = add_qgroup_rb(fs_info, found_key.offset);
352			if (IS_ERR(qgroup)) {
353				ret = PTR_ERR(qgroup);
354				goto out;
355			}
356		}
357		switch (found_key.type) {
358		case BTRFS_QGROUP_INFO_KEY: {
359			struct btrfs_qgroup_info_item *ptr;
360
361			ptr = btrfs_item_ptr(l, slot,
362					     struct btrfs_qgroup_info_item);
363			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
364			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
365			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
366			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
367			/* generation currently unused */
368			break;
369		}
370		case BTRFS_QGROUP_LIMIT_KEY: {
371			struct btrfs_qgroup_limit_item *ptr;
372
373			ptr = btrfs_item_ptr(l, slot,
374					     struct btrfs_qgroup_limit_item);
375			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
376			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
377			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
378			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
379			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
380			break;
381		}
382		}
383next1:
384		ret = btrfs_next_item(quota_root, path);
385		if (ret < 0)
386			goto out;
387		if (ret)
388			break;
389	}
390	btrfs_release_path(path);
391
392	/*
393	 * pass 2: read all qgroup relations
394	 */
395	key.objectid = 0;
396	key.type = BTRFS_QGROUP_RELATION_KEY;
397	key.offset = 0;
398	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
399	if (ret)
400		goto out;
401	while (1) {
402		slot = path->slots[0];
403		l = path->nodes[0];
404		btrfs_item_key_to_cpu(l, &found_key, slot);
405
406		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
407			goto next2;
408
409		if (found_key.objectid > found_key.offset) {
410			/* parent <- member, not needed to build config */
411			/* FIXME should we omit the key completely? */
412			goto next2;
413		}
414
415		ret = add_relation_rb(fs_info, found_key.objectid,
416				      found_key.offset);
417		if (ret == -ENOENT) {
418			btrfs_warn(fs_info,
419				"orphan qgroup relation 0x%llx->0x%llx",
420				found_key.objectid, found_key.offset);
421			ret = 0;	/* ignore the error */
422		}
423		if (ret)
424			goto out;
425next2:
426		ret = btrfs_next_item(quota_root, path);
427		if (ret < 0)
428			goto out;
429		if (ret)
430			break;
431	}
432out:
433	fs_info->qgroup_flags |= flags;
434	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
435		fs_info->quota_enabled = 0;
436		fs_info->pending_quota_state = 0;
437	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
438		   ret >= 0) {
439		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
440	}
441	btrfs_free_path(path);
442
443	if (ret < 0) {
444		ulist_free(fs_info->qgroup_ulist);
445		fs_info->qgroup_ulist = NULL;
446		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
447	}
448
449	return ret < 0 ? ret : 0;
450}
451
452/*
453 * This is called from close_ctree(), open_ctree() or btrfs_quota_disable().
454 * The first two are single-threaded paths. For the third one, quota_root has
455 * already been set to NULL under qgroup_lock, so it is safe to clean up the
456 * in-memory structures without holding qgroup_lock.
457 */
458void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
459{
460	struct rb_node *n;
461	struct btrfs_qgroup *qgroup;
462
463	while ((n = rb_first(&fs_info->qgroup_tree))) {
464		qgroup = rb_entry(n, struct btrfs_qgroup, node);
465		rb_erase(n, &fs_info->qgroup_tree);
466		__del_qgroup_rb(qgroup);
467	}
468	/*
469	 * we call btrfs_free_qgroup_config() when unmounting the
470	 * filesystem and when disabling quota, so we set qgroup_ulist
471	 * to NULL here to avoid a double free.
472	 */
473	ulist_free(fs_info->qgroup_ulist);
474	fs_info->qgroup_ulist = NULL;
475}
476
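/*
 * Add one directed relation item (src -> dst) to the quota tree. A full
 * relation between two qgroups is stored as two such items, one in each
 * direction.
 */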
477static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
478				    struct btrfs_root *quota_root,
479				    u64 src, u64 dst)
480{
481	int ret;
482	struct btrfs_path *path;
483	struct btrfs_key key;
484
485	path = btrfs_alloc_path();
486	if (!path)
487		return -ENOMEM;
488
489	key.objectid = src;
490	key.type = BTRFS_QGROUP_RELATION_KEY;
491	key.offset = dst;
492
493	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
494
495	btrfs_mark_buffer_dirty(path->nodes[0]);
496
497	btrfs_free_path(path);
498	return ret;
499}
500
501static int del_qgroup_relation_item(struct btrfs_trans_handle *trans,
502				    struct btrfs_root *quota_root,
503				    u64 src, u64 dst)
504{
505	int ret;
506	struct btrfs_path *path;
507	struct btrfs_key key;
508
509	path = btrfs_alloc_path();
510	if (!path)
511		return -ENOMEM;
512
513	key.objectid = src;
514	key.type = BTRFS_QGROUP_RELATION_KEY;
515	key.offset = dst;
516
517	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
518	if (ret < 0)
519		goto out;
520
521	if (ret > 0) {
522		ret = -ENOENT;
523		goto out;
524	}
525
526	ret = btrfs_del_item(trans, quota_root, path);
527out:
528	btrfs_free_path(path);
529	return ret;
530}
531
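/*
 * Create the on-disk items for a new qgroup: an info item for the usage
 * counters and a limit item for the configured limits, both zeroed.
 */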
532static int add_qgroup_item(struct btrfs_trans_handle *trans,
533			   struct btrfs_root *quota_root, u64 qgroupid)
534{
535	int ret;
536	struct btrfs_path *path;
537	struct btrfs_qgroup_info_item *qgroup_info;
538	struct btrfs_qgroup_limit_item *qgroup_limit;
539	struct extent_buffer *leaf;
540	struct btrfs_key key;
541
542	if (btrfs_test_is_dummy_root(quota_root))
543		return 0;
544
545	path = btrfs_alloc_path();
546	if (!path)
547		return -ENOMEM;
548
549	key.objectid = 0;
550	key.type = BTRFS_QGROUP_INFO_KEY;
551	key.offset = qgroupid;
552
553	/*
554	 * Avoid a transaction abort by catching -EEXIST here. In that
555	 * case, we proceed by re-initializing the existing structure
556	 * on disk.
557	 */
558
559	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
560				      sizeof(*qgroup_info));
561	if (ret && ret != -EEXIST)
562		goto out;
563
564	leaf = path->nodes[0];
565	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
566				 struct btrfs_qgroup_info_item);
567	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
568	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
569	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
570	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
571	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
572
573	btrfs_mark_buffer_dirty(leaf);
574
575	btrfs_release_path(path);
576
577	key.type = BTRFS_QGROUP_LIMIT_KEY;
578	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
579				      sizeof(*qgroup_limit));
580	if (ret && ret != -EEXIST)
581		goto out;
582
583	leaf = path->nodes[0];
584	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
585				  struct btrfs_qgroup_limit_item);
586	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
587	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
588	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
589	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
590	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
591
592	btrfs_mark_buffer_dirty(leaf);
593
594	ret = 0;
595out:
596	btrfs_free_path(path);
597	return ret;
598}
599
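/* Delete the info and limit items of a qgroup from the quota tree. */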
600static int del_qgroup_item(struct btrfs_trans_handle *trans,
601			   struct btrfs_root *quota_root, u64 qgroupid)
602{
603	int ret;
604	struct btrfs_path *path;
605	struct btrfs_key key;
606
607	path = btrfs_alloc_path();
608	if (!path)
609		return -ENOMEM;
610
611	key.objectid = 0;
612	key.type = BTRFS_QGROUP_INFO_KEY;
613	key.offset = qgroupid;
614	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
615	if (ret < 0)
616		goto out;
617
618	if (ret > 0) {
619		ret = -ENOENT;
620		goto out;
621	}
622
623	ret = btrfs_del_item(trans, quota_root, path);
624	if (ret)
625		goto out;
626
627	btrfs_release_path(path);
628
629	key.type = BTRFS_QGROUP_LIMIT_KEY;
630	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
631	if (ret < 0)
632		goto out;
633
634	if (ret > 0) {
635		ret = -ENOENT;
636		goto out;
637	}
638
639	ret = btrfs_del_item(trans, quota_root, path);
640
641out:
642	btrfs_free_path(path);
643	return ret;
644}
645
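/*
 * Write the in-memory limits of a qgroup back to its limit item in the quota
 * tree. Returns -ENOENT if the item does not exist.
 */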
646static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
647				    struct btrfs_root *root,
648				    struct btrfs_qgroup *qgroup)
649{
650	struct btrfs_path *path;
651	struct btrfs_key key;
652	struct extent_buffer *l;
653	struct btrfs_qgroup_limit_item *qgroup_limit;
654	int ret;
655	int slot;
656
657	key.objectid = 0;
658	key.type = BTRFS_QGROUP_LIMIT_KEY;
659	key.offset = qgroup->qgroupid;
660
661	path = btrfs_alloc_path();
662	if (!path)
663		return -ENOMEM;
664
665	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
666	if (ret > 0)
667		ret = -ENOENT;
668
669	if (ret)
670		goto out;
671
672	l = path->nodes[0];
673	slot = path->slots[0];
674	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
675	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
676	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
677	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
678	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
679	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
680
681	btrfs_mark_buffer_dirty(l);
682
683out:
684	btrfs_free_path(path);
685	return ret;
686}
687
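/*
 * Write the in-memory usage counters of a qgroup back to its info item in the
 * quota tree. Returns -ENOENT if the item does not exist.
 */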
688static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
689				   struct btrfs_root *root,
690				   struct btrfs_qgroup *qgroup)
691{
692	struct btrfs_path *path;
693	struct btrfs_key key;
694	struct extent_buffer *l;
695	struct btrfs_qgroup_info_item *qgroup_info;
696	int ret;
697	int slot;
698
699	if (btrfs_test_is_dummy_root(root))
700		return 0;
701
702	key.objectid = 0;
703	key.type = BTRFS_QGROUP_INFO_KEY;
704	key.offset = qgroup->qgroupid;
705
706	path = btrfs_alloc_path();
707	if (!path)
708		return -ENOMEM;
709
710	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
711	if (ret > 0)
712		ret = -ENOENT;
713
714	if (ret)
715		goto out;
716
717	l = path->nodes[0];
718	slot = path->slots[0];
719	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
720	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
721	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
722	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
723	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
724	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
725
726	btrfs_mark_buffer_dirty(l);
727
728out:
729	btrfs_free_path(path);
730	return ret;
731}
732
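/*
 * Write the current qgroup flags, generation and rescan progress into the
 * status item of the quota tree.
 */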
733static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
734				     struct btrfs_fs_info *fs_info,
735				    struct btrfs_root *root)
736{
737	struct btrfs_path *path;
738	struct btrfs_key key;
739	struct extent_buffer *l;
740	struct btrfs_qgroup_status_item *ptr;
741	int ret;
742	int slot;
743
744	key.objectid = 0;
745	key.type = BTRFS_QGROUP_STATUS_KEY;
746	key.offset = 0;
747
748	path = btrfs_alloc_path();
749	if (!path)
750		return -ENOMEM;
751
752	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
753	if (ret > 0)
754		ret = -ENOENT;
755
756	if (ret)
757		goto out;
758
759	l = path->nodes[0];
760	slot = path->slots[0];
761	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
762	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
763	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
764	btrfs_set_qgroup_status_rescan(l, ptr,
765				fs_info->qgroup_rescan_progress.objectid);
766
767	btrfs_mark_buffer_dirty(l);
768
769out:
770	btrfs_free_path(path);
771	return ret;
772}
773
774/*
775 * called with qgroup_lock held
776 */
777static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
778				  struct btrfs_root *root)
779{
780	struct btrfs_path *path;
781	struct btrfs_key key;
782	struct extent_buffer *leaf = NULL;
783	int ret;
784	int nr = 0;
785
786	path = btrfs_alloc_path();
787	if (!path)
788		return -ENOMEM;
789
790	path->leave_spinning = 1;
791
792	key.objectid = 0;
793	key.offset = 0;
794	key.type = 0;
795
796	while (1) {
797		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
798		if (ret < 0)
799			goto out;
800		leaf = path->nodes[0];
801		nr = btrfs_header_nritems(leaf);
802		if (!nr)
803			break;
804		/*
805		 * delete the leaf one by one
806		 * since the whole tree is going
807		 * to be deleted.
808		 */
809		path->slots[0] = 0;
810		ret = btrfs_del_items(trans, root, path, 0, nr);
811		if (ret)
812			goto out;
813
814		btrfs_release_path(path);
815	}
816	ret = 0;
817out:
818	root->fs_info->pending_quota_state = 0;
819	btrfs_free_path(path);
820	return ret;
821}
822
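/*
 * Enable quotas: create the quota tree with its status item and add a qgroup
 * item for every existing subvolume as well as for the top level fs tree.
 */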
823int btrfs_quota_enable(struct btrfs_trans_handle *trans,
824		       struct btrfs_fs_info *fs_info)
825{
826	struct btrfs_root *quota_root;
827	struct btrfs_root *tree_root = fs_info->tree_root;
828	struct btrfs_path *path = NULL;
829	struct btrfs_qgroup_status_item *ptr;
830	struct extent_buffer *leaf;
831	struct btrfs_key key;
832	struct btrfs_key found_key;
833	struct btrfs_qgroup *qgroup = NULL;
834	int ret = 0;
835	int slot;
836
837	mutex_lock(&fs_info->qgroup_ioctl_lock);
838	if (fs_info->quota_root) {
839		fs_info->pending_quota_state = 1;
840		goto out;
841	}
842
843	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
844	if (!fs_info->qgroup_ulist) {
845		ret = -ENOMEM;
846		goto out;
847	}
848
849	/*
850	 * initially create the quota tree
851	 */
852	quota_root = btrfs_create_tree(trans, fs_info,
853				       BTRFS_QUOTA_TREE_OBJECTID);
854	if (IS_ERR(quota_root)) {
855		ret =  PTR_ERR(quota_root);
856		goto out;
857	}
858
859	path = btrfs_alloc_path();
860	if (!path) {
861		ret = -ENOMEM;
862		goto out_free_root;
863	}
864
865	key.objectid = 0;
866	key.type = BTRFS_QGROUP_STATUS_KEY;
867	key.offset = 0;
868
869	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
870				      sizeof(*ptr));
871	if (ret)
872		goto out_free_path;
873
874	leaf = path->nodes[0];
875	ptr = btrfs_item_ptr(leaf, path->slots[0],
876				 struct btrfs_qgroup_status_item);
877	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
878	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
879	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
880				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
881	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
882	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
883
884	btrfs_mark_buffer_dirty(leaf);
885
886	key.objectid = 0;
887	key.type = BTRFS_ROOT_REF_KEY;
888	key.offset = 0;
889
890	btrfs_release_path(path);
891	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
892	if (ret > 0)
893		goto out_add_root;
894	if (ret < 0)
895		goto out_free_path;
896
897
898	while (1) {
899		slot = path->slots[0];
900		leaf = path->nodes[0];
901		btrfs_item_key_to_cpu(leaf, &found_key, slot);
902
903		if (found_key.type == BTRFS_ROOT_REF_KEY) {
904			ret = add_qgroup_item(trans, quota_root,
905					      found_key.offset);
906			if (ret)
907				goto out_free_path;
908
909			qgroup = add_qgroup_rb(fs_info, found_key.offset);
910			if (IS_ERR(qgroup)) {
911				ret = PTR_ERR(qgroup);
912				goto out_free_path;
913			}
914		}
915		ret = btrfs_next_item(tree_root, path);
916		if (ret < 0)
917			goto out_free_path;
918		if (ret)
919			break;
920	}
921
922out_add_root:
923	btrfs_release_path(path);
924	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
925	if (ret)
926		goto out_free_path;
927
928	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
929	if (IS_ERR(qgroup)) {
930		ret = PTR_ERR(qgroup);
931		goto out_free_path;
932	}
933	spin_lock(&fs_info->qgroup_lock);
934	fs_info->quota_root = quota_root;
935	fs_info->pending_quota_state = 1;
936	spin_unlock(&fs_info->qgroup_lock);
937out_free_path:
938	btrfs_free_path(path);
939out_free_root:
940	if (ret) {
941		free_extent_buffer(quota_root->node);
942		free_extent_buffer(quota_root->commit_root);
943		kfree(quota_root);
944	}
945out:
946	if (ret) {
947		ulist_free(fs_info->qgroup_ulist);
948		fs_info->qgroup_ulist = NULL;
949	}
950	mutex_unlock(&fs_info->qgroup_ioctl_lock);
951	return ret;
952}
953
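/*
 * Disable quotas: drop the in-memory config, empty the quota tree and delete
 * the quota root itself.
 */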
954int btrfs_quota_disable(struct btrfs_trans_handle *trans,
955			struct btrfs_fs_info *fs_info)
956{
957	struct btrfs_root *tree_root = fs_info->tree_root;
958	struct btrfs_root *quota_root;
959	int ret = 0;
960
961	mutex_lock(&fs_info->qgroup_ioctl_lock);
962	if (!fs_info->quota_root)
963		goto out;
964	spin_lock(&fs_info->qgroup_lock);
965	fs_info->quota_enabled = 0;
966	fs_info->pending_quota_state = 0;
967	quota_root = fs_info->quota_root;
968	fs_info->quota_root = NULL;
969	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
970	spin_unlock(&fs_info->qgroup_lock);
971
972	btrfs_free_qgroup_config(fs_info);
973
974	ret = btrfs_clean_quota_tree(trans, quota_root);
975	if (ret)
976		goto out;
977
978	ret = btrfs_del_root(trans, tree_root, &quota_root->root_key);
979	if (ret)
980		goto out;
981
982	list_del(&quota_root->dirty_list);
983
984	btrfs_tree_lock(quota_root->node);
985	clean_tree_block(trans, tree_root->fs_info, quota_root->node);
986	btrfs_tree_unlock(quota_root->node);
987	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
988
989	free_extent_buffer(quota_root->node);
990	free_extent_buffer(quota_root->commit_root);
991	kfree(quota_root);
992out:
993	mutex_unlock(&fs_info->qgroup_ioctl_lock);
994	return ret;
995}
996
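/*
 * Queue a qgroup on fs_info->dirty_qgroups (unless it is already queued) so
 * that its updated counters are written back to the quota tree later.
 */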
997static void qgroup_dirty(struct btrfs_fs_info *fs_info,
998			 struct btrfs_qgroup *qgroup)
999{
1000	if (list_empty(&qgroup->dirty))
1001		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1002}
1003
1004/*
1005 * The easy accounting, if we are adding/removing the only ref for an extent
1006 * then this qgroup and all of the parent qgroups get their reference and
1007 * exclusive counts adjusted.
1008 *
1009 * Caller should hold fs_info->qgroup_lock.
1010 */
1011static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1012				    struct ulist *tmp, u64 ref_root,
1013				    u64 num_bytes, int sign)
1014{
1015	struct btrfs_qgroup *qgroup;
1016	struct btrfs_qgroup_list *glist;
1017	struct ulist_node *unode;
1018	struct ulist_iterator uiter;
1019	int ret = 0;
1020
1021	qgroup = find_qgroup_rb(fs_info, ref_root);
1022	if (!qgroup)
1023		goto out;
1024
1025	qgroup->rfer += sign * num_bytes;
1026	qgroup->rfer_cmpr += sign * num_bytes;
1027
1028	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1029	qgroup->excl += sign * num_bytes;
1030	qgroup->excl_cmpr += sign * num_bytes;
1031	if (sign > 0)
1032		qgroup->reserved -= num_bytes;
1033
1034	qgroup_dirty(fs_info, qgroup);
1035
1036	/* Get all of the parent groups that contain this qgroup */
1037	list_for_each_entry(glist, &qgroup->groups, next_group) {
1038		ret = ulist_add(tmp, glist->group->qgroupid,
1039				ptr_to_u64(glist->group), GFP_ATOMIC);
1040		if (ret < 0)
1041			goto out;
1042	}
1043
1044	/* Iterate all of the parents and adjust their reference counts */
1045	ULIST_ITER_INIT(&uiter);
1046	while ((unode = ulist_next(tmp, &uiter))) {
1047		qgroup = u64_to_ptr(unode->aux);
1048		qgroup->rfer += sign * num_bytes;
1049		qgroup->rfer_cmpr += sign * num_bytes;
1050		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1051		qgroup->excl += sign * num_bytes;
1052		if (sign > 0)
1053			qgroup->reserved -= num_bytes;
1054		qgroup->excl_cmpr += sign * num_bytes;
1055		qgroup_dirty(fs_info, qgroup);
1056
1057		/* Add any parents of the parents */
1058		list_for_each_entry(glist, &qgroup->groups, next_group) {
1059			ret = ulist_add(tmp, glist->group->qgroupid,
1060					ptr_to_u64(glist->group), GFP_ATOMIC);
1061			if (ret < 0)
1062				goto out;
1063		}
1064	}
1065	ret = 0;
1066out:
1067	return ret;
1068}
1069
1070
1071/*
1072 * Quick path for updating qgroup with only excl refs.
1073 *
1074 * In that case, just updating all parent qgroups is enough.
1075 * Otherwise we need to do a full rescan.
1076 * Caller should also hold fs_info->qgroup_lock.
1077 *
1078 * Return 0 for a quick update, return >0 when a full rescan is needed
1079 * and the INCONSISTENT flag is set.
1080 * Return < 0 for other errors.
1081 */
1082static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1083				   struct ulist *tmp, u64 src, u64 dst,
1084				   int sign)
1085{
1086	struct btrfs_qgroup *qgroup;
1087	int ret = 1;
1088	int err = 0;
1089
1090	qgroup = find_qgroup_rb(fs_info, src);
1091	if (!qgroup)
1092		goto out;
1093	if (qgroup->excl == qgroup->rfer) {
1094		ret = 0;
1095		err = __qgroup_excl_accounting(fs_info, tmp, dst,
1096					       qgroup->excl, sign);
1097		if (err < 0) {
1098			ret = err;
1099			goto out;
1100		}
1101	}
1102out:
1103	if (ret)
1104		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1105	return ret;
1106}
1107
1108int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1109			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1110{
1111	struct btrfs_root *quota_root;
1112	struct btrfs_qgroup *parent;
1113	struct btrfs_qgroup *member;
1114	struct btrfs_qgroup_list *list;
1115	struct ulist *tmp;
1116	int ret = 0;
1117
1118	/* Check the level of src and dst first */
1119	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1120		return -EINVAL;
1121
1122	tmp = ulist_alloc(GFP_NOFS);
1123	if (!tmp)
1124		return -ENOMEM;
1125
1126	mutex_lock(&fs_info->qgroup_ioctl_lock);
1127	quota_root = fs_info->quota_root;
1128	if (!quota_root) {
1129		ret = -EINVAL;
1130		goto out;
1131	}
1132	member = find_qgroup_rb(fs_info, src);
1133	parent = find_qgroup_rb(fs_info, dst);
1134	if (!member || !parent) {
1135		ret = -EINVAL;
1136		goto out;
1137	}
1138
1139	/* first check if such a qgroup relation already exists */
1140	list_for_each_entry(list, &member->groups, next_group) {
1141		if (list->group == parent) {
1142			ret = -EEXIST;
1143			goto out;
1144		}
1145	}
1146
1147	ret = add_qgroup_relation_item(trans, quota_root, src, dst);
1148	if (ret)
1149		goto out;
1150
1151	ret = add_qgroup_relation_item(trans, quota_root, dst, src);
1152	if (ret) {
1153		del_qgroup_relation_item(trans, quota_root, src, dst);
1154		goto out;
1155	}
1156
1157	spin_lock(&fs_info->qgroup_lock);
1158	ret = add_relation_rb(quota_root->fs_info, src, dst);
1159	if (ret < 0) {
1160		spin_unlock(&fs_info->qgroup_lock);
1161		goto out;
1162	}
1163	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1164	spin_unlock(&fs_info->qgroup_lock);
1165out:
1166	mutex_unlock(&fs_info->qgroup_ioctl_lock);
1167	ulist_free(tmp);
1168	return ret;
1169}
1170
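/*
 * Remove a qgroup relation both on disk and in memory. Callers hold
 * qgroup_ioctl_lock.
 */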
1171int __del_qgroup_relation(struct btrfs_trans_handle *trans,
1172			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1173{
1174	struct btrfs_root *quota_root;
1175	struct btrfs_qgroup *parent;
1176	struct btrfs_qgroup *member;
1177	struct btrfs_qgroup_list *list;
1178	struct ulist *tmp;
1179	int ret = 0;
1180	int err;
1181
1182	tmp = ulist_alloc(GFP_NOFS);
1183	if (!tmp)
1184		return -ENOMEM;
1185
1186	quota_root = fs_info->quota_root;
1187	if (!quota_root) {
1188		ret = -EINVAL;
1189		goto out;
1190	}
1191
1192	member = find_qgroup_rb(fs_info, src);
1193	parent = find_qgroup_rb(fs_info, dst);
1194	if (!member || !parent) {
1195		ret = -EINVAL;
1196		goto out;
1197	}
1198
1199	/* first check if such a qgroup relation already exists */
1200	list_for_each_entry(list, &member->groups, next_group) {
1201		if (list->group == parent)
1202			goto exist;
1203	}
1204	ret = -ENOENT;
1205	goto out;
1206exist:
1207	ret = del_qgroup_relation_item(trans, quota_root, src, dst);
1208	err = del_qgroup_relation_item(trans, quota_root, dst, src);
1209	if (err && !ret)
1210		ret = err;
1211
1212	spin_lock(&fs_info->qgroup_lock);
1213	del_relation_rb(fs_info, src, dst);
1214	ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1215	spin_unlock(&fs_info->qgroup_lock);
1216out:
1217	ulist_free(tmp);
1218	return ret;
1219}
1220
1221int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
1222			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1223{
1224	int ret = 0;
1225
1226	mutex_lock(&fs_info->qgroup_ioctl_lock);
1227	ret = __del_qgroup_relation(trans, fs_info, src, dst);
1228	mutex_unlock(&fs_info->qgroup_ioctl_lock);
1229
1230	return ret;
1231}
1232
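/*
 * Create a new qgroup: insert its items into the quota tree and add it to the
 * in-memory rbtree.
 */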
1233int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
1234			struct btrfs_fs_info *fs_info, u64 qgroupid)
1235{
1236	struct btrfs_root *quota_root;
1237	struct btrfs_qgroup *qgroup;
1238	int ret = 0;
1239
1240	mutex_lock(&fs_info->qgroup_ioctl_lock);
1241	quota_root = fs_info->quota_root;
1242	if (!quota_root) {
1243		ret = -EINVAL;
1244		goto out;
1245	}
1246	qgroup = find_qgroup_rb(fs_info, qgroupid);
1247	if (qgroup) {
1248		ret = -EEXIST;
1249		goto out;
1250	}
1251
1252	ret = add_qgroup_item(trans, quota_root, qgroupid);
1253	if (ret)
1254		goto out;
1255
1256	spin_lock(&fs_info->qgroup_lock);
1257	qgroup = add_qgroup_rb(fs_info, qgroupid);
1258	spin_unlock(&fs_info->qgroup_lock);
1259
1260	if (IS_ERR(qgroup))
1261		ret = PTR_ERR(qgroup);
1262out:
1263	mutex_unlock(&fs_info->qgroup_ioctl_lock);
1264	return ret;
1265}
1266
1267int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1268			struct btrfs_fs_info *fs_info, u64 qgroupid)
1269{
1270	struct btrfs_root *quota_root;
1271	struct btrfs_qgroup *qgroup;
1272	struct btrfs_qgroup_list *list;
1273	int ret = 0;
1274
1275	mutex_lock(&fs_info->qgroup_ioctl_lock);
1276	quota_root = fs_info->quota_root;
1277	if (!quota_root) {
1278		ret = -EINVAL;
1279		goto out;
1280	}
1281
1282	qgroup = find_qgroup_rb(fs_info, qgroupid);
1283	if (!qgroup) {
1284		ret = -ENOENT;
1285		goto out;
1286	} else {
1287		/* check if there are no children of this qgroup */
1288		if (!list_empty(&qgroup->members)) {
1289			ret = -EBUSY;
1290			goto out;
1291		}
1292	}
1293	ret = del_qgroup_item(trans, quota_root, qgroupid);
1294
1295	while (!list_empty(&qgroup->groups)) {
1296		list = list_first_entry(&qgroup->groups,
1297					struct btrfs_qgroup_list, next_group);
1298		ret = __del_qgroup_relation(trans, fs_info,
1299					   qgroupid,
1300					   list->group->qgroupid);
1301		if (ret)
1302			goto out;
1303	}
1304
1305	spin_lock(&fs_info->qgroup_lock);
1306	del_qgroup_rb(quota_root->fs_info, qgroupid);
1307	spin_unlock(&fs_info->qgroup_lock);
1308out:
1309	mutex_unlock(&fs_info->qgroup_ioctl_lock);
1310	return ret;
1311}
1312
1313int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1314		       struct btrfs_fs_info *fs_info, u64 qgroupid,
1315		       struct btrfs_qgroup_limit *limit)
1316{
1317	struct btrfs_root *quota_root;
1318	struct btrfs_qgroup *qgroup;
1319	int ret = 0;
1320
1321	mutex_lock(&fs_info->qgroup_ioctl_lock);
1322	quota_root = fs_info->quota_root;
1323	if (!quota_root) {
1324		ret = -EINVAL;
1325		goto out;
1326	}
1327
1328	qgroup = find_qgroup_rb(fs_info, qgroupid);
1329	if (!qgroup) {
1330		ret = -ENOENT;
1331		goto out;
1332	}
1333
1334	spin_lock(&fs_info->qgroup_lock);
1335	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
1336		qgroup->max_rfer = limit->max_rfer;
1337	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
1338		qgroup->max_excl = limit->max_excl;
1339	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
1340		qgroup->rsv_rfer = limit->rsv_rfer;
1341	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
1342		qgroup->rsv_excl = limit->rsv_excl;
1343	qgroup->lim_flags |= limit->flags;
1344
1345	spin_unlock(&fs_info->qgroup_lock);
1346
1347	ret = update_qgroup_limit_item(trans, quota_root, qgroup);
1348	if (ret) {
1349		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1350		btrfs_info(fs_info, "unable to update quota limit for %llu",
1351		       qgroupid);
1352	}
1353
1354out:
1355	mutex_unlock(&fs_info->qgroup_ioctl_lock);
1356	return ret;
1357}
1358
1359static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1360			   struct btrfs_qgroup_operation *oper2)
1361{
1362	/*
1363	 * Ignore seq and type here, we're looking for any operation
1364	 * at all related to this extent on that root.
1365	 */
1366	if (oper1->bytenr < oper2->bytenr)
1367		return -1;
1368	if (oper1->bytenr > oper2->bytenr)
1369		return 1;
1370	if (oper1->ref_root < oper2->ref_root)
1371		return -1;
1372	if (oper1->ref_root > oper2->ref_root)
1373		return 1;
1374	return 0;
1375}
1376
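/*
 * Check whether any operation for the same bytenr/ref_root pair is already
 * queued in the operation tree. Returns -EEXIST if one is found, 0 otherwise.
 */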
1377static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1378			      struct btrfs_qgroup_operation *oper)
1379{
1380	struct rb_node *n;
1381	struct btrfs_qgroup_operation *cur;
1382	int cmp;
1383
1384	spin_lock(&fs_info->qgroup_op_lock);
1385	n = fs_info->qgroup_op_tree.rb_node;
1386	while (n) {
1387		cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1388		cmp = comp_oper_exist(cur, oper);
1389		if (cmp < 0) {
1390			n = n->rb_right;
1391		} else if (cmp) {
1392			n = n->rb_left;
1393		} else {
1394			spin_unlock(&fs_info->qgroup_op_lock);
1395			return -EEXIST;
1396		}
1397	}
1398	spin_unlock(&fs_info->qgroup_op_lock);
1399	return 0;
1400}
1401
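/*
 * Full comparison used to order operations in fs_info->qgroup_op_tree: sort
 * by bytenr, then ref_root, then seq, then type.
 */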
1402static int comp_oper(struct btrfs_qgroup_operation *oper1,
1403		     struct btrfs_qgroup_operation *oper2)
1404{
1405	if (oper1->bytenr < oper2->bytenr)
1406		return -1;
1407	if (oper1->bytenr > oper2->bytenr)
1408		return 1;
1409	if (oper1->ref_root < oper2->ref_root)
1410		return -1;
1411	if (oper1->ref_root > oper2->ref_root)
1412		return 1;
1413	if (oper1->seq < oper2->seq)
1414		return -1;
1415	if (oper1->seq > oper2->seq)
1416		return 1;
1417	if (oper1->type < oper2->type)
1418		return -1;
1419	if (oper1->type > oper2->type)
1420		return 1;
1421	return 0;
1422}
1423
1424static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
1425			      struct btrfs_qgroup_operation *oper)
1426{
1427	struct rb_node **p;
1428	struct rb_node *parent = NULL;
1429	struct btrfs_qgroup_operation *cur;
1430	int cmp;
1431
1432	spin_lock(&fs_info->qgroup_op_lock);
1433	p = &fs_info->qgroup_op_tree.rb_node;
1434	while (*p) {
1435		parent = *p;
1436		cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
1437		cmp = comp_oper(cur, oper);
1438		if (cmp < 0) {
1439			p = &(*p)->rb_right;
1440		} else if (cmp) {
1441			p = &(*p)->rb_left;
1442		} else {
1443			spin_unlock(&fs_info->qgroup_op_lock);
1444			return -EEXIST;
1445		}
1446	}
1447	rb_link_node(&oper->n, parent, p);
1448	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
1449	spin_unlock(&fs_info->qgroup_op_lock);
1450	return 0;
1451}
1452
1453/*
1454 * Record a quota operation for processing later on.
1455 * @trans: the transaction we are adding the delayed op to.
1456 * @fs_info: the fs_info for this fs.
1457 * @ref_root: the root of the reference we are acting on,
1458 * @bytenr: the bytenr we are acting on.
1459 * @num_bytes: the number of bytes in the reference.
1460 * @type: the type of operation this is.
1461 * @mod_seq: do we need to get a sequence number for looking up roots.
1462 *
1463 * We just add it to our trans qgroup_ref_list and carry on and process these
1464 * operations in order at some later point.  If the reference root isn't a fs
1465 * root then we don't bother with doing anything.
1466 *
1467 * MUST BE HOLDING THE REF LOCK.
1468 */
1469int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1470			    struct btrfs_fs_info *fs_info, u64 ref_root,
1471			    u64 bytenr, u64 num_bytes,
1472			    enum btrfs_qgroup_operation_type type, int mod_seq)
1473{
1474	struct btrfs_qgroup_operation *oper;
1475	int ret;
1476
1477	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
1478		return 0;
1479
1480	oper = kmalloc(sizeof(*oper), GFP_NOFS);
1481	if (!oper)
1482		return -ENOMEM;
1483
1484	oper->ref_root = ref_root;
1485	oper->bytenr = bytenr;
1486	oper->num_bytes = num_bytes;
1487	oper->type = type;
1488	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1489	INIT_LIST_HEAD(&oper->elem.list);
1490	oper->elem.seq = 0;
1491
1492	trace_btrfs_qgroup_record_ref(oper);
1493
1494	if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1495		/*
1496		 * If any operation for this bytenr/ref_root combo
1497		 * exists, then we know it's not exclusively owned and
1498		 * shouldn't be queued up.
1499		 *
1500		 * This also catches the case where we have a cloned
1501		 * extent that gets queued up multiple times during
1502		 * drop snapshot.
1503		 */
1504		if (qgroup_oper_exists(fs_info, oper)) {
1505			kfree(oper);
1506			return 0;
1507		}
1508	}
1509
1510	ret = insert_qgroup_oper(fs_info, oper);
1511	if (ret) {
1512		/* Shouldn't happen so have an assert for developers */
1513		ASSERT(0);
1514		kfree(oper);
1515		return ret;
1516	}
1517	list_add_tail(&oper->list, &trans->qgroup_ref_list);
1518
1519	if (mod_seq)
1520		btrfs_get_tree_mod_seq(fs_info, &oper->elem);
1521
1522	return 0;
1523}
1524
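/*
 * Account an ADD_EXCL or SUB_EXCL operation: the extent is owned by a single
 * root, so only that root's qgroup and its parent qgroups need their counters
 * adjusted.
 */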
1525static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1526				  struct btrfs_qgroup_operation *oper)
1527{
1528	struct ulist *tmp;
1529	int sign = 0;
1530	int ret = 0;
1531
1532	tmp = ulist_alloc(GFP_NOFS);
1533	if (!tmp)
1534		return -ENOMEM;
1535
1536	spin_lock(&fs_info->qgroup_lock);
1537	if (!fs_info->quota_root)
1538		goto out;
1539
1540	switch (oper->type) {
1541	case BTRFS_QGROUP_OPER_ADD_EXCL:
1542		sign = 1;
1543		break;
1544	case BTRFS_QGROUP_OPER_SUB_EXCL:
1545		sign = -1;
1546		break;
1547	default:
1548		ASSERT(0);
1549	}
1550	ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
1551				       oper->num_bytes, sign);
1552out:
1553	spin_unlock(&fs_info->qgroup_lock);
1554	ulist_free(tmp);
1555	return ret;
1556}
1557
1558/*
1559 * Walk all of the roots that pointed to our bytenr and adjust their refcnts
1560 * properly.
1561 */
1562static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
1563				  u64 root_to_skip, struct ulist *tmp,
1564				  struct ulist *roots, struct ulist *qgroups,
1565				  u64 seq, int *old_roots, int rescan)
1566{
1567	struct ulist_node *unode;
1568	struct ulist_iterator uiter;
1569	struct ulist_node *tmp_unode;
1570	struct ulist_iterator tmp_uiter;
1571	struct btrfs_qgroup *qg;
1572	int ret;
1573
1574	ULIST_ITER_INIT(&uiter);
1575	while ((unode = ulist_next(roots, &uiter))) {
1576		/* We don't count our current root here */
1577		if (unode->val == root_to_skip)
1578			continue;
1579		qg = find_qgroup_rb(fs_info, unode->val);
1580		if (!qg)
1581			continue;
1582		/*
1583		 * We could have a pending removal of this same ref so we may
1584		 * not have actually found our ref root when doing
1585		 * btrfs_find_all_roots, so we need to keep track of how many
1586		 * old roots we find in case we removed ours and added a
1587		 * different one at the same time.  I don't think this could
1588		 * happen in practice but that sort of thinking leads to pain
1589		 * and suffering and to the dark side.
1590		 */
1591		(*old_roots)++;
1592
1593		ulist_reinit(tmp);
1594		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1595				GFP_ATOMIC);
1596		if (ret < 0)
1597			return ret;
1598		ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
1599		if (ret < 0)
1600			return ret;
1601		ULIST_ITER_INIT(&tmp_uiter);
1602		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
1603			struct btrfs_qgroup_list *glist;
1604
1605			qg = u64_to_ptr(tmp_unode->aux);
1606			/*
1607			 * We use this sequence number to keep from having to
1608			 * run the whole list and 0 out the refcnt every time.
1609			 * We basically use the sequence as the known 0 count and
1610			 * then add 1 every time we see a qgroup.  This is how we
1611			 * get how many of the roots actually point up to the
1612			 * upper level qgroups in order to determine exclusive
1613			 * counts.
1614			 *
1615			 * For rescan we want to set old_refcnt to seq so our
1616			 * exclusive calculations end up correct.
1617			 */
1618			if (rescan)
1619				qg->old_refcnt = seq;
1620			else if (qg->old_refcnt < seq)
1621				qg->old_refcnt = seq + 1;
1622			else
1623				qg->old_refcnt++;
1624
1625			if (qg->new_refcnt < seq)
1626				qg->new_refcnt = seq + 1;
1627			else
1628				qg->new_refcnt++;
1629			list_for_each_entry(glist, &qg->groups, next_group) {
1630				ret = ulist_add(qgroups, glist->group->qgroupid,
1631						ptr_to_u64(glist->group),
1632						GFP_ATOMIC);
1633				if (ret < 0)
1634					return ret;
1635				ret = ulist_add(tmp, glist->group->qgroupid,
1636						ptr_to_u64(glist->group),
1637						GFP_ATOMIC);
1638				if (ret < 0)
1639					return ret;
1640			}
1641		}
1642	}
1643	return 0;
1644}
1645
1646/*
1647 * We need to walk forward in our operation tree and account for any roots that
1648 * were deleted after we made this operation.
1649 */
1650static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
1651				       struct btrfs_qgroup_operation *oper,
1652				       struct ulist *tmp,
1653				       struct ulist *qgroups, u64 seq,
1654				       int *old_roots)
1655{
1656	struct ulist_node *unode;
1657	struct ulist_iterator uiter;
1658	struct btrfs_qgroup *qg;
1659	struct btrfs_qgroup_operation *tmp_oper;
1660	struct rb_node *n;
1661	int ret;
1662
1663	ulist_reinit(tmp);
1664
1665	/*
1666	 * We only walk forward in the tree since we're only interested in
1667	 * removals that happened _after_  our operation.
1668	 */
1669	spin_lock(&fs_info->qgroup_op_lock);
1670	n = rb_next(&oper->n);
1671	spin_unlock(&fs_info->qgroup_op_lock);
1672	if (!n)
1673		return 0;
1674	tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1675	while (tmp_oper->bytenr == oper->bytenr) {
1676		/*
1677		 * If it's not a removal we don't care, additions work out
1678		 * properly with our refcnt tracking.
1679		 */
1680		if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
1681		    tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
1682			goto next;
1683		qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
1684		if (!qg)
1685			goto next;
1686		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
1687				GFP_ATOMIC);
1688		if (ret) {
1689			if (ret < 0)
1690				return ret;
1691			/*
1692			 * We only want to increase old_roots if this qgroup is
1693			 * not already in the list of qgroups.  If it is already
1694			 * there then that means it must have been re-added or
1695			 * the delete will be discarded because we had an
1696			 * existing ref that we haven't looked up yet.  In this
1697			 * case we don't want to increase old_roots.  So if ret
1698			 * == 1 then we know that this is the first time we've
1699			 * seen this qgroup and we can bump the old_roots.
1700			 */
1701			(*old_roots)++;
1702			ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
1703					GFP_ATOMIC);
1704			if (ret < 0)
1705				return ret;
1706		}
1707next:
1708		spin_lock(&fs_info->qgroup_op_lock);
1709		n = rb_next(&tmp_oper->n);
1710		spin_unlock(&fs_info->qgroup_op_lock);
1711		if (!n)
1712			break;
1713		tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
1714	}
1715
1716	/* Ok now process the qgroups we found */
1717	ULIST_ITER_INIT(&uiter);
1718	while ((unode = ulist_next(tmp, &uiter))) {
1719		struct btrfs_qgroup_list *glist;
1720
1721		qg = u64_to_ptr(unode->aux);
1722		if (qg->old_refcnt < seq)
1723			qg->old_refcnt = seq + 1;
1724		else
1725			qg->old_refcnt++;
1726		if (qg->new_refcnt < seq)
1727			qg->new_refcnt = seq + 1;
1728		else
1729			qg->new_refcnt++;
1730		list_for_each_entry(glist, &qg->groups, next_group) {
1731			ret = ulist_add(qgroups, glist->group->qgroupid,
1732					ptr_to_u64(glist->group), GFP_ATOMIC);
1733			if (ret < 0)
1734				return ret;
1735			ret = ulist_add(tmp, glist->group->qgroupid,
1736					ptr_to_u64(glist->group), GFP_ATOMIC);
1737			if (ret < 0)
1738				return ret;
1739		}
1740	}
1741	return 0;
1742}
1743
1744/* Add refcnt for the newly added reference. */
1745static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
1746				  struct btrfs_qgroup_operation *oper,
1747				  struct btrfs_qgroup *qgroup,
1748				  struct ulist *tmp, struct ulist *qgroups,
1749				  u64 seq)
1750{
1751	struct ulist_node *unode;
1752	struct ulist_iterator uiter;
1753	struct btrfs_qgroup *qg;
1754	int ret;
1755
1756	ulist_reinit(tmp);
1757	ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
1758			GFP_ATOMIC);
1759	if (ret < 0)
1760		return ret;
1761	ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
1762			GFP_ATOMIC);
1763	if (ret < 0)
1764		return ret;
1765	ULIST_ITER_INIT(&uiter);
1766	while ((unode = ulist_next(tmp, &uiter))) {
1767		struct btrfs_qgroup_list *glist;
1768
1769		qg = u64_to_ptr(unode->aux);
1770		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
1771			if (qg->new_refcnt < seq)
1772				qg->new_refcnt = seq + 1;
1773			else
1774				qg->new_refcnt++;
1775		} else {
1776			if (qg->old_refcnt < seq)
1777				qg->old_refcnt = seq + 1;
1778			else
1779				qg->old_refcnt++;
1780		}
1781		list_for_each_entry(glist, &qg->groups, next_group) {
1782			ret = ulist_add(tmp, glist->group->qgroupid,
1783					ptr_to_u64(glist->group), GFP_ATOMIC);
1784			if (ret < 0)
1785				return ret;
1786			ret = ulist_add(qgroups, glist->group->qgroupid,
1787					ptr_to_u64(glist->group), GFP_ATOMIC);
1788			if (ret < 0)
1789				return ret;
1790		}
1791	}
1792	return 0;
1793}
1794
1795/*
1796 * This adjusts the counters for all referenced qgroups if need be.
1797 */
1798static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
1799				  u64 root_to_skip, u64 num_bytes,
1800				  struct ulist *qgroups, u64 seq,
1801				  int old_roots, int new_roots, int rescan)
1802{
1803	struct ulist_node *unode;
1804	struct ulist_iterator uiter;
1805	struct btrfs_qgroup *qg;
1806	u64 cur_new_count, cur_old_count;
1807
1808	ULIST_ITER_INIT(&uiter);
1809	while ((unode = ulist_next(qgroups, &uiter))) {
1810		bool dirty = false;
1811
1812		qg = u64_to_ptr(unode->aux);
1813		/*
1814		 * Wasn't referenced before but is now, add to the reference
1815		 * counters.
1816		 */
1817		if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
1818			qg->rfer += num_bytes;
1819			qg->rfer_cmpr += num_bytes;
1820			dirty = true;
1821		}
1822
1823		/*
1824		 * Was referenced before but isn't now, subtract from the
1825		 * reference counters.
1826		 */
1827		if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
1828			qg->rfer -= num_bytes;
1829			qg->rfer_cmpr -= num_bytes;
1830			dirty = true;
1831		}
1832
1833		if (qg->old_refcnt < seq)
1834			cur_old_count = 0;
1835		else
1836			cur_old_count = qg->old_refcnt - seq;
1837		if (qg->new_refcnt < seq)
1838			cur_new_count = 0;
1839		else
1840			cur_new_count = qg->new_refcnt - seq;
1841
1842		/*
1843		 * If our refcount was the same as the roots previously but our
1844		 * new count isn't the same as the number of roots now then we
1845		 * went from having an exclusive reference on this range to not.
1846		 */
1847		if (old_roots && cur_old_count == old_roots &&
1848		    (cur_new_count != new_roots || new_roots == 0)) {
1849			WARN_ON(cur_new_count != new_roots && new_roots == 0);
1850			qg->excl -= num_bytes;
1851			qg->excl_cmpr -= num_bytes;
1852			dirty = true;
1853		}
1854
1855		/*
1856		 * If we didn't reference all the roots before but now we do we
1857		 * have an exclusive reference to this range.
1858		 */
1859		if ((!old_roots || (old_roots && cur_old_count != old_roots))
1860		    && cur_new_count == new_roots) {
1861			qg->excl += num_bytes;
1862			qg->excl_cmpr += num_bytes;
1863			dirty = true;
1864		}
1865
1866		if (dirty)
1867			qgroup_dirty(fs_info, qg);
1868	}
1869	return 0;
1870}
1871
1872/*
1873 * If we removed a data extent and there were other references for that bytenr
1874 * then we need to lookup all referenced roots to make sure we still don't
1875 * reference this bytenr.  If we do then we can just discard this operation.
1876 */
1877static int check_existing_refs(struct btrfs_trans_handle *trans,
1878			       struct btrfs_fs_info *fs_info,
1879			       struct btrfs_qgroup_operation *oper)
1880{
1881	struct ulist *roots = NULL;
1882	struct ulist_node *unode;
1883	struct ulist_iterator uiter;
1884	int ret = 0;
1885
1886	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1887				   oper->elem.seq, &roots);
1888	if (ret < 0)
1889		return ret;
1890	ret = 0;
1891
1892	ULIST_ITER_INIT(&uiter);
1893	while ((unode = ulist_next(roots, &uiter))) {
1894		if (unode->val == oper->ref_root) {
1895			ret = 1;
1896			break;
1897		}
1898	}
1899	ulist_free(roots);
1900	btrfs_put_tree_mod_seq(fs_info, &oper->elem);
1901
1902	return ret;
1903}
1904
1905/*
1906 * If we share a reference across multiple roots then we may need to adjust
1907 * various qgroups' referenced and exclusive counters.  The basic premise is this:
1908 *
1909 * 1) We have seq to represent a 0 count.  Instead of looping through all of the
1910 * qgroups and resetting their refcount to 0 we just constantly bump this
1911 * sequence number to act as the base reference count.  This means that if
1912 * anybody is equal to or below this sequence they were never referenced.  We
1913 * jack this sequence up by the number of roots we found each time in order to
1914 * make sure we don't have any overlap.
1915 *
1916 * 2) We first search all the roots that reference the area _except_ the root
1917 * we're acting on currently.  This makes up the old_refcnt of all the qgroups
1918 * before.
1919 *
1920 * 3) We walk all of the qgroups referenced by the root we are currently acting
1921 * on, and will either adjust old_refcnt in the case of a removal or the
1922 * new_refcnt in the case of an addition.
1923 *
1924 * 4) Finally we walk all the qgroups that are referenced by this range
1925 * including the root we are acting on currently.  We will adjust the counters
1926 * based on the number of roots we had and will have after this operation.
1927 *
1928 * Take this example as an illustration
1929 *
1930 *			[qgroup 1/0]
1931 *		     /         |          \
1932 *		[qg 0/0]   [qg 0/1]	[qg 0/2]
1933 *		   \          |            /
1934 *		  [	   extent	    ]
1935 *
1936 * Say we are adding a reference that is covered by qg 0/0.  The first step
1937 * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
1938 * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
1939 * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
1940 * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
1941 * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
1942 * reference and thus must add the size to the referenced bytes.  Everything
1943 * else is the same so nothing else changes.
1944 */
1945static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1946				    struct btrfs_fs_info *fs_info,
1947				    struct btrfs_qgroup_operation *oper)
1948{
1949	struct ulist *roots = NULL;
1950	struct ulist *qgroups, *tmp;
1951	struct btrfs_qgroup *qgroup;
1952	struct seq_list elem = SEQ_LIST_INIT(elem);
1953	u64 seq;
1954	int old_roots = 0;
1955	int new_roots = 0;
1956	int ret = 0;
1957
1958	if (oper->elem.seq) {
1959		ret = check_existing_refs(trans, fs_info, oper);
1960		if (ret < 0)
1961			return ret;
1962		if (ret)
1963			return 0;
1964	}
1965
1966	qgroups = ulist_alloc(GFP_NOFS);
1967	if (!qgroups)
1968		return -ENOMEM;
1969
1970	tmp = ulist_alloc(GFP_NOFS);
1971	if (!tmp) {
1972		ulist_free(qgroups);
1973		return -ENOMEM;
1974	}
1975
1976	btrfs_get_tree_mod_seq(fs_info, &elem);
1977	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
1978				   &roots);
1979	btrfs_put_tree_mod_seq(fs_info, &elem);
1980	if (ret < 0) {
1981		ulist_free(qgroups);
1982		ulist_free(tmp);
1983		return ret;
1984	}
1985	spin_lock(&fs_info->qgroup_lock);
1986	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
1987	if (!qgroup)
1988		goto out;
1989	seq = fs_info->qgroup_seq;
1990
1991	/*
1992	 * So roots is the list of all the roots currently pointing at the
1993	 * bytenr, including the ref we are adding if we are adding, or not if
1994	 * we are removing a ref.  So we pass in the ref_root to skip that root
1995	 * in our calculations.  We set both old_refcnt and new_refcnt because we
1996	 * don't know what everything looked like before, and it doesn't matter
1997	 * except...
1998	 */
1999	ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
2000				     seq, &old_roots, 0);
2001	if (ret < 0)
2002		goto out;
2003
2004	/*
2005	 * Now adjust the refcounts of the qgroups that care about this
2006	 * reference, either the old_count in the case of removal or new_count
2007	 * in the case of an addition.
2008	 */
2009	ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
2010				     seq);
2011	if (ret < 0)
2012		goto out;
2013
2014	/*
2015	 * ...in the case of removals.  If we had a removal before we got around
2016	 * to processing this operation then we need to find that operation and
2017	 * count its references as if they still existed so we don't end up
2018	 * skewing the exclusive counts.  Then when we actually process the
2019	 * delete everything will be consistent and we can account for whatever
2020	 * exclusive changes need to be made there.  We also have to pass in
2021	 * old_roots so we have an accurate count of the roots as it pertains to
2022	 * this operation's view of the world.
2023	 */
2024	ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
2025					  &old_roots);
2026	if (ret < 0)
2027		goto out;
2028
2029	/*
2030	 * When adding, bump the root count.  When removing, old_roots already
2031	 * gives new_roots and the old count must include the root being removed.
2032	 */
2033	if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
2034		new_roots = old_roots + 1;
2035	} else {
2036		new_roots = old_roots;
2037		old_roots++;
2038	}
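	/*
	 * Bump the global sequence past every refcount we may have handed out
	 * above (the old_roots other roots plus the root we are acting on) so
	 * the next operation starts from a fresh zero baseline.
	 */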
2039	fs_info->qgroup_seq += old_roots + 1;
2040
2042	/*
2043	 * Step 4: walk every qgroup collected above and adjust its referenced and
2044	 * exclusive counters based on old_roots vs new_roots (Arne's elegant scheme).
2045	 */
2046	qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
2047			       qgroups, seq, old_roots, new_roots, 0);
2048out:
2049	spin_unlock(&fs_info->qgroup_lock);
2050	ulist_free(qgroups);
2051	ulist_free(roots);
2052	ulist_free(tmp);
2053	return ret;
2054}
2055
2056/*
2057 * Process a reference to a shared subtree. This type of operation is
2058 * queued during snapshot removal when we encounter extents which are
2059 * shared between more than one root.
2060 */
2061static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
2062				     struct btrfs_fs_info *fs_info,
2063				     struct btrfs_qgroup_operation *oper)
2064{
2065	struct ulist *roots = NULL;
2066	struct ulist_node *unode;
2067	struct ulist_iterator uiter;
2068	struct btrfs_qgroup_list *glist;
2069	struct ulist *parents;
2070	int ret = 0;
2071	int err;
2072	struct btrfs_qgroup *qg;
2073	u64 root_obj = 0;
2074	struct seq_list elem = SEQ_LIST_INIT(elem);
2075
2076	parents = ulist_alloc(GFP_NOFS);
2077	if (!parents)
2078		return -ENOMEM;
2079
2080	btrfs_get_tree_mod_seq(fs_info, &elem);
2081	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
2082				   elem.seq, &roots);
2083	btrfs_put_tree_mod_seq(fs_info, &elem);
2084	if (ret < 0)
2085		goto out;
2086
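	/*
	 * There is only accounting to do here if exactly one root is left
	 * pointing at this extent; with more (or zero) remaining roots the
	 * extent does not become exclusive to anyone yet.
	 */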
2087	if (roots->nnodes != 1)
2088		goto out;
2089
2090	ULIST_ITER_INIT(&uiter);
2091	unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
2092	/*
2093	 * If we find our ref root then that means all refs
2094	 * this extent has to the root have not yet been
2095	 * deleted. In that case, we do nothing and let the
2096	 * last ref for this bytenr drive our update.
2097	 *
2098	 * This can happen for example if an extent is
2099	 * referenced multiple times in a snapshot (clone,
2100	 * etc). If we are in the middle of snapshot removal,
2101	 * queued updates for such an extent will find the
2102	 * root if we have not yet finished removing the
2103	 * snapshot.
2104	 */
2105	if (unode->val == oper->ref_root)
2106		goto out;
2107
2108	root_obj = unode->val;
2109	BUG_ON(!root_obj);
2110
2111	spin_lock(&fs_info->qgroup_lock);
2112	qg = find_qgroup_rb(fs_info, root_obj);
2113	if (!qg)
2114		goto out_unlock;
2115
2116	qg->excl += oper->num_bytes;
2117	qg->excl_cmpr += oper->num_bytes;
2118	qgroup_dirty(fs_info, qg);
2119
2120	/*
2121	 * Adjust counts for parent groups. First we find all
2122	 * parents, then in the 2nd loop we do the adjustment
2123	 * while adding parents of the parents to our ulist.
2124	 */
2125	list_for_each_entry(glist, &qg->groups, next_group) {
2126		err = ulist_add(parents, glist->group->qgroupid,
2127				ptr_to_u64(glist->group), GFP_ATOMIC);
2128		if (err < 0) {
2129			ret = err;
2130			goto out_unlock;
2131		}
2132	}
2133
2134	ULIST_ITER_INIT(&uiter);
2135	while ((unode = ulist_next(parents, &uiter))) {
2136		qg = u64_to_ptr(unode->aux);
2137		qg->excl += oper->num_bytes;
2138		qg->excl_cmpr += oper->num_bytes;
2139		qgroup_dirty(fs_info, qg);
2140
2141		/* Add any parents of the parents */
2142		list_for_each_entry(glist, &qg->groups, next_group) {
2143			err = ulist_add(parents, glist->group->qgroupid,
2144					ptr_to_u64(glist->group), GFP_ATOMIC);
2145			if (err < 0) {
2146				ret = err;
2147				goto out_unlock;
2148			}
2149		}
2150	}
2151
2152out_unlock:
2153	spin_unlock(&fs_info->qgroup_lock);
2154
2155out:
2156	ulist_free(roots);
2157	ulist_free(parents);
2158	return ret;
2159}
2160
2161/*
2162 * btrfs_qgroup_account is called for every queued qgroup operation, i.e. for
2163 * every ref that is added to or deleted from the fs.  First, all roots
2164 * referencing the extent are searched, and then the space is accounted among
2165 * the different qgroups by the accounting helpers dispatched below.
2166 */
2167static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
2168				struct btrfs_fs_info *fs_info,
2169				struct btrfs_qgroup_operation *oper)
2170{
2171	int ret = 0;
2172
2173	if (!fs_info->quota_enabled)
2174		return 0;
2175
2176	BUG_ON(!fs_info->quota_root);
2177
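	/*
	 * If a rescan is in progress and has not yet passed this bytenr, skip
	 * the live accounting here; the rescan worker will account this extent
	 * once it reaches it.
	 */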
2178	mutex_lock(&fs_info->qgroup_rescan_lock);
2179	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2180		if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
2181			mutex_unlock(&fs_info->qgroup_rescan_lock);
2182			return 0;
2183		}
2184	}
2185	mutex_unlock(&fs_info->qgroup_rescan_lock);
2186
2187	ASSERT(is_fstree(oper->ref_root));
2188
2189	trace_btrfs_qgroup_account(oper);
2190
2191	switch (oper->type) {
2192	case BTRFS_QGROUP_OPER_ADD_EXCL:
2193	case BTRFS_QGROUP_OPER_SUB_EXCL:
2194		ret = qgroup_excl_accounting(fs_info, oper);
2195		break;
2196	case BTRFS_QGROUP_OPER_ADD_SHARED:
2197	case BTRFS_QGROUP_OPER_SUB_SHARED:
2198		ret = qgroup_shared_accounting(trans, fs_info, oper);
2199		break;
2200	case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2201		ret = qgroup_subtree_accounting(trans, fs_info, oper);
2202		break;
2203	default:
2204		ASSERT(0);
2205	}
2206	return ret;
2207}
2208
2209/*
2210 * Needs to be called every time we run delayed refs, even if there is an
2211 * error, in order to clean up outstanding operations.
2212 */
2213int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
2214				    struct btrfs_fs_info *fs_info)
2215{
2216	struct btrfs_qgroup_operation *oper;
2217	int ret = 0;
2218
2219	while (!list_empty(&trans->qgroup_ref_list)) {
2220		oper = list_first_entry(&trans->qgroup_ref_list,
2221					struct btrfs_qgroup_operation, list);
2222		list_del_init(&oper->list);
2223		if (!ret || !trans->aborted)
2224			ret = btrfs_qgroup_account(trans, fs_info, oper);
2225		spin_lock(&fs_info->qgroup_op_lock);
2226		rb_erase(&oper->n, &fs_info->qgroup_op_tree);
2227		spin_unlock(&fs_info->qgroup_op_lock);
2228		btrfs_put_tree_mod_seq(fs_info, &oper->elem);
2229		kfree(oper);
2230	}
2231	return ret;
2232}
2233
2234/*
2235 * called from commit_transaction. Writes all changed qgroups to disk.
2236 */
2237int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
2238		      struct btrfs_fs_info *fs_info)
2239{
2240	struct btrfs_root *quota_root = fs_info->quota_root;
2241	int ret = 0;
2242	int start_rescan_worker = 0;
2243
2244	if (!quota_root)
2245		goto out;
2246
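	/*
	 * Quota is being switched on in this transaction; remember to kick off
	 * a rescan below so the counters start out consistent.
	 */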
2247	if (!fs_info->quota_enabled && fs_info->pending_quota_state)
2248		start_rescan_worker = 1;
2249
2250	fs_info->quota_enabled = fs_info->pending_quota_state;
2251
2252	spin_lock(&fs_info->qgroup_lock);
2253	while (!list_empty(&fs_info->dirty_qgroups)) {
2254		struct btrfs_qgroup *qgroup;
2255		qgroup = list_first_entry(&fs_info->dirty_qgroups,
2256					  struct btrfs_qgroup, dirty);
2257		list_del_init(&qgroup->dirty);
2258		spin_unlock(&fs_info->qgroup_lock);
2259		ret = update_qgroup_info_item(trans, quota_root, qgroup);
2260		if (ret)
2261			fs_info->qgroup_flags |=
2262					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2263		ret = update_qgroup_limit_item(trans, quota_root, qgroup);
2264		if (ret)
2265			fs_info->qgroup_flags |=
2266					BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2267		spin_lock(&fs_info->qgroup_lock);
2268	}
2269	if (fs_info->quota_enabled)
2270		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2271	else
2272		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2273	spin_unlock(&fs_info->qgroup_lock);
2274
2275	ret = update_qgroup_status_item(trans, fs_info, quota_root);
2276	if (ret)
2277		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2278
2279	if (!ret && start_rescan_worker) {
2280		ret = qgroup_rescan_init(fs_info, 0, 1);
2281		if (!ret) {
2282			qgroup_rescan_zero_tracking(fs_info);
2283			btrfs_queue_work(fs_info->qgroup_rescan_workers,
2284					 &fs_info->qgroup_rescan_work);
2285		}
2286		ret = 0;
2287	}
2288
2289out:
2291	return ret;
2292}
2293
2294/*
2295 * copy the accounting information between qgroups. This is necessary when a
2296 * snapshot or a subvolume is created
2297 */
2298int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2299			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
2300			 struct btrfs_qgroup_inherit *inherit)
2301{
2302	int ret = 0;
2303	int i;
2304	u64 *i_qgroups;
2305	struct btrfs_root *quota_root = fs_info->quota_root;
2306	struct btrfs_qgroup *srcgroup;
2307	struct btrfs_qgroup *dstgroup;
2308	u32 level_size = 0;
2309	u64 nums;
2310
2311	mutex_lock(&fs_info->qgroup_ioctl_lock);
2312	if (!fs_info->quota_enabled)
2313		goto out;
2314
2315	if (!quota_root) {
2316		ret = -EINVAL;
2317		goto out;
2318	}
2319
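	/*
	 * The inherit struct is followed by a flat array of u64 qgroup ids:
	 * first num_qgroups ids to add the new subvol to, then the
	 * num_ref_copies and num_excl_copies (src, dst) pairs, which is why
	 * those two counts are doubled below.
	 */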
2320	if (inherit) {
2321		i_qgroups = (u64 *)(inherit + 1);
2322		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2323		       2 * inherit->num_excl_copies;
2324		for (i = 0; i < nums; ++i) {
2325			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2326			if (!srcgroup) {
2327				ret = -EINVAL;
2328				goto out;
2329			}
2330
2331			if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
2332				ret = -EINVAL;
2333				goto out;
2334			}
2335			++i_qgroups;
2336		}
2337	}
2338
2339	/*
2340	 * create a tracking group for the subvol itself
2341	 */
2342	ret = add_qgroup_item(trans, quota_root, objectid);
2343	if (ret)
2344		goto out;
2345
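	/*
	 * For snapshots the source and the new subvol initially differ only in
	 * their root nodes, so one nodesize worth of metadata is what each
	 * side will hold exclusively (see the excl updates further down).
	 */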
2346	if (srcid) {
2347		struct btrfs_root *srcroot;
2348		struct btrfs_key srckey;
2349
2350		srckey.objectid = srcid;
2351		srckey.type = BTRFS_ROOT_ITEM_KEY;
2352		srckey.offset = (u64)-1;
2353		srcroot = btrfs_read_fs_root_no_name(fs_info, &srckey);
2354		if (IS_ERR(srcroot)) {
2355			ret = PTR_ERR(srcroot);
2356			goto out;
2357		}
2358
2359		rcu_read_lock();
2360		level_size = srcroot->nodesize;
2361		rcu_read_unlock();
2362	}
2363
2364	/*
2365	 * add qgroup to all inherited groups
2366	 */
2367	if (inherit) {
2368		i_qgroups = (u64 *)(inherit + 1);
2369		for (i = 0; i < inherit->num_qgroups; ++i) {
2370			ret = add_qgroup_relation_item(trans, quota_root,
2371						       objectid, *i_qgroups);
2372			if (ret)
2373				goto out;
2374			ret = add_qgroup_relation_item(trans, quota_root,
2375						       *i_qgroups, objectid);
2376			if (ret)
2377				goto out;
2378			++i_qgroups;
2379		}
2380	}
2381
2383	spin_lock(&fs_info->qgroup_lock);
2384
2385	dstgroup = add_qgroup_rb(fs_info, objectid);
2386	if (IS_ERR(dstgroup)) {
2387		ret = PTR_ERR(dstgroup);
2388		goto unlock;
2389	}
2390
2391	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2392		dstgroup->lim_flags = inherit->lim.flags;
2393		dstgroup->max_rfer = inherit->lim.max_rfer;
2394		dstgroup->max_excl = inherit->lim.max_excl;
2395		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2396		dstgroup->rsv_excl = inherit->lim.rsv_excl;
2397
2398		ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
2399		if (ret) {
2400			fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2401			btrfs_info(fs_info, "unable to update quota limit for %llu",
2402			       dstgroup->qgroupid);
2403			goto unlock;
2404		}
2405	}
2406
2407	if (srcid) {
2408		srcgroup = find_qgroup_rb(fs_info, srcid);
2409		if (!srcgroup)
2410			goto unlock;
2411
2412		/*
2413		 * We call inherit after we clone the root in order to make sure
2414		 * our counts don't go crazy, so at this point the only
2415		 * difference between the two roots should be the root node.
2416		 */
2417		dstgroup->rfer = srcgroup->rfer;
2418		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2419		dstgroup->excl = level_size;
2420		dstgroup->excl_cmpr = level_size;
2421		srcgroup->excl = level_size;
2422		srcgroup->excl_cmpr = level_size;
2423
2424		/* inherit the limit info */
2425		dstgroup->lim_flags = srcgroup->lim_flags;
2426		dstgroup->max_rfer = srcgroup->max_rfer;
2427		dstgroup->max_excl = srcgroup->max_excl;
2428		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2429		dstgroup->rsv_excl = srcgroup->rsv_excl;
2430
2431		qgroup_dirty(fs_info, dstgroup);
2432		qgroup_dirty(fs_info, srcgroup);
2433	}
2434
2435	if (!inherit)
2436		goto unlock;
2437
2438	i_qgroups = (u64 *)(inherit + 1);
2439	for (i = 0; i < inherit->num_qgroups; ++i) {
2440		ret = add_relation_rb(quota_root->fs_info, objectid,
2441				      *i_qgroups);
2442		if (ret)
2443			goto unlock;
2444		++i_qgroups;
2445	}
2446
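	/*
	 * The remaining inherit entries come in (src, dst) pairs: copy the
	 * referenced and then the exclusive counters from src to dst, adjusted
	 * by the size of the root node that differs between the trees.
	 */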
2447	for (i = 0; i < inherit->num_ref_copies; ++i) {
2448		struct btrfs_qgroup *src;
2449		struct btrfs_qgroup *dst;
2450
2451		src = find_qgroup_rb(fs_info, i_qgroups[0]);
2452		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2453
2454		if (!src || !dst) {
2455			ret = -EINVAL;
2456			goto unlock;
2457		}
2458
2459		dst->rfer = src->rfer - level_size;
2460		dst->rfer_cmpr = src->rfer_cmpr - level_size;
2461		i_qgroups += 2;
2462	}
2463	for (i = 0; i < inherit->num_excl_copies; ++i) {
2464		struct btrfs_qgroup *src;
2465		struct btrfs_qgroup *dst;
2466
2467		src = find_qgroup_rb(fs_info, i_qgroups[0]);
2468		dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2469
2470		if (!src || !dst) {
2471			ret = -EINVAL;
2472			goto unlock;
2473		}
2474
2475		dst->excl = src->excl + level_size;
2476		dst->excl_cmpr = src->excl_cmpr + level_size;
2477		i_qgroups += 2;
2478	}
2479
2480unlock:
2481	spin_unlock(&fs_info->qgroup_lock);
2482out:
2483	mutex_unlock(&fs_info->qgroup_ioctl_lock);
2484	return ret;
2485}
2486
2487int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
2488{
2489	struct btrfs_root *quota_root;
2490	struct btrfs_qgroup *qgroup;
2491	struct btrfs_fs_info *fs_info = root->fs_info;
2492	u64 ref_root = root->root_key.objectid;
2493	int ret = 0;
2494	struct ulist_node *unode;
2495	struct ulist_iterator uiter;
2496
2497	if (!is_fstree(ref_root))
2498		return 0;
2499
2500	if (num_bytes == 0)
2501		return 0;
2502
2503	spin_lock(&fs_info->qgroup_lock);
2504	quota_root = fs_info->quota_root;
2505	if (!quota_root)
2506		goto out;
2507
2508	qgroup = find_qgroup_rb(fs_info, ref_root);
2509	if (!qgroup)
2510		goto out;
2511
2512	/*
2513	 * In a first step, check this qgroup and all of its ancestors to see
2514	 * whether any limit would be exceeded.
2515	 */
2516	ulist_reinit(fs_info->qgroup_ulist);
2517	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2518			(uintptr_t)qgroup, GFP_ATOMIC);
2519	if (ret < 0)
2520		goto out;
2521	ULIST_ITER_INIT(&uiter);
2522	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2523		struct btrfs_qgroup *qg;
2524		struct btrfs_qgroup_list *glist;
2525
2526		qg = u64_to_ptr(unode->aux);
2527
2528		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2529		    qg->reserved + (s64)qg->rfer + num_bytes >
2530		    qg->max_rfer) {
2531			ret = -EDQUOT;
2532			goto out;
2533		}
2534
2535		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2536		    qg->reserved + (s64)qg->excl + num_bytes >
2537		    qg->max_excl) {
2538			ret = -EDQUOT;
2539			goto out;
2540		}
2541
2542		list_for_each_entry(glist, &qg->groups, next_group) {
2543			ret = ulist_add(fs_info->qgroup_ulist,
2544					glist->group->qgroupid,
2545					(uintptr_t)glist->group, GFP_ATOMIC);
2546			if (ret < 0)
2547				goto out;
2548		}
2549	}
2550	ret = 0;
2551	/*
2552	 * no limits exceeded, now record the reservation into all qgroups
2553	 */
2554	ULIST_ITER_INIT(&uiter);
2555	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2556		struct btrfs_qgroup *qg;
2557
2558		qg = u64_to_ptr(unode->aux);
2559
2560		qg->reserved += num_bytes;
2561	}
2562
2563out:
2564	spin_unlock(&fs_info->qgroup_lock);
2565	return ret;
2566}
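
/*
 * Illustrative usage sketch only (not a caller taken from this file), assuming
 * a hypothetical do_write() helper: a reservation made here is expected to be
 * released again with btrfs_qgroup_free() if the reserved space ends up not
 * being used, e.g.
 *
 *	ret = btrfs_qgroup_reserve(root, num_bytes);
 *	if (ret)	// -EDQUOT when a limit would be exceeded
 *		return ret;
 *	ret = do_write(root, num_bytes);
 *	if (ret)
 *		btrfs_qgroup_free(root, num_bytes);
 */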
2567
2568void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
2569{
2570	struct btrfs_root *quota_root;
2571	struct btrfs_qgroup *qgroup;
2572	struct btrfs_fs_info *fs_info = root->fs_info;
2573	struct ulist_node *unode;
2574	struct ulist_iterator uiter;
2575	u64 ref_root = root->root_key.objectid;
2576	int ret = 0;
2577
2578	if (!is_fstree(ref_root))
2579		return;
2580
2581	if (num_bytes == 0)
2582		return;
2583
2584	spin_lock(&fs_info->qgroup_lock);
2585
2586	quota_root = fs_info->quota_root;
2587	if (!quota_root)
2588		goto out;
2589
2590	qgroup = find_qgroup_rb(fs_info, ref_root);
2591	if (!qgroup)
2592		goto out;
2593
2594	ulist_reinit(fs_info->qgroup_ulist);
2595	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2596			(uintptr_t)qgroup, GFP_ATOMIC);
2597	if (ret < 0)
2598		goto out;
2599	ULIST_ITER_INIT(&uiter);
2600	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2601		struct btrfs_qgroup *qg;
2602		struct btrfs_qgroup_list *glist;
2603
2604		qg = u64_to_ptr(unode->aux);
2605
2606		qg->reserved -= num_bytes;
2607
2608		list_for_each_entry(glist, &qg->groups, next_group) {
2609			ret = ulist_add(fs_info->qgroup_ulist,
2610					glist->group->qgroupid,
2611					(uintptr_t)glist->group, GFP_ATOMIC);
2612			if (ret < 0)
2613				goto out;
2614		}
2615	}
2616
2617out:
2618	spin_unlock(&fs_info->qgroup_lock);
2619}
2620
2621void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2622{
2623	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
2624		return;
2625	btrfs_err(trans->root->fs_info,
2626		"qgroups not uptodate in trans handle %p:  list is%s empty, "
2627		"seq is %#x.%x",
2628		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
2629		(u32)(trans->delayed_ref_elem.seq >> 32),
2630		(u32)trans->delayed_ref_elem.seq);
2631	BUG();
2632}
2633
2634/*
2635 * returns < 0 on error, 0 when more leaves are to be scanned.
2636 * returns 1 when done.
2637 */
2638static int
2639qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2640		   struct btrfs_trans_handle *trans, struct ulist *qgroups,
2641		   struct ulist *tmp, struct extent_buffer *scratch_leaf)
2642{
2643	struct btrfs_key found;
2644	struct ulist *roots = NULL;
2645	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
2646	u64 num_bytes;
2647	u64 seq;
2648	int new_roots;
2649	int slot;
2650	int ret;
2651
2652	path->leave_spinning = 1;
2653	mutex_lock(&fs_info->qgroup_rescan_lock);
2654	ret = btrfs_search_slot_for_read(fs_info->extent_root,
2655					 &fs_info->qgroup_rescan_progress,
2656					 path, 1, 0);
2657
2658	pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
2659		 fs_info->qgroup_rescan_progress.objectid,
2660		 fs_info->qgroup_rescan_progress.type,
2661		 fs_info->qgroup_rescan_progress.offset, ret);
2662
2663	if (ret) {
2664		/*
2665		 * The rescan is about to end, we will not be scanning any
2666		 * further blocks. We cannot unset the RESCAN flag here, because
2667		 * we want to commit the transaction if everything went well.
2668		 * To make the live accounting work in this phase, we set our
2669		 * scan progress pointer such that every real extent objectid
2670		 * will be smaller.
2671		 */
2672		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
2673		btrfs_release_path(path);
2674		mutex_unlock(&fs_info->qgroup_rescan_lock);
2675		return ret;
2676	}
2677
2678	btrfs_item_key_to_cpu(path->nodes[0], &found,
2679			      btrfs_header_nritems(path->nodes[0]) - 1);
2680	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
2681
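	/*
	 * Copy the leaf into a private scratch buffer so the path and the
	 * rescan lock can be dropped while the items are accounted below.
	 */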
2682	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
2683	memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
2684	slot = path->slots[0];
2685	btrfs_release_path(path);
2686	mutex_unlock(&fs_info->qgroup_rescan_lock);
2687
2688	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
2689		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
2690		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
2691		    found.type != BTRFS_METADATA_ITEM_KEY)
2692			continue;
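		/*
		 * For metadata items the key offset holds the tree level, so
		 * the accounted size is one tree block; for data extent items
		 * the offset is the extent length in bytes.
		 */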
2693		if (found.type == BTRFS_METADATA_ITEM_KEY)
2694			num_bytes = fs_info->extent_root->nodesize;
2695		else
2696			num_bytes = found.offset;
2697
2698		ulist_reinit(qgroups);
2699		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
2700					   &roots);
2701		if (ret < 0)
2702			goto out;
2703		spin_lock(&fs_info->qgroup_lock);
2704		seq = fs_info->qgroup_seq;
2705		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
2706
2707		new_roots = 0;
2708		ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
2709					     seq, &new_roots, 1);
2710		if (ret < 0) {
2711			spin_unlock(&fs_info->qgroup_lock);
2712			ulist_free(roots);
2713			goto out;
2714		}
2715
2716		ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
2717					     seq, 0, new_roots, 1);
2718		if (ret < 0) {
2719			spin_unlock(&fs_info->qgroup_lock);
2720			ulist_free(roots);
2721			goto out;
2722		}
2723		spin_unlock(&fs_info->qgroup_lock);
2724		ulist_free(roots);
2725	}
2726out:
2727	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
2728
2729	return ret;
2730}
2731
2732static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2733{
2734	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2735						     qgroup_rescan_work);
2736	struct btrfs_path *path;
2737	struct btrfs_trans_handle *trans = NULL;
2738	struct ulist *tmp = NULL, *qgroups = NULL;
2739	struct extent_buffer *scratch_leaf = NULL;
2740	int err = -ENOMEM;
2741	int ret = 0;
2742
2743	path = btrfs_alloc_path();
2744	if (!path)
2745		goto out;
2746	qgroups = ulist_alloc(GFP_NOFS);
2747	if (!qgroups)
2748		goto out;
2749	tmp = ulist_alloc(GFP_NOFS);
2750	if (!tmp)
2751		goto out;
2752	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
2753	if (!scratch_leaf)
2754		goto out;
2755
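	/*
	 * Scan the extent tree one leaf per transaction until
	 * qgroup_rescan_leaf() reports completion (> 0) or an error (< 0).
	 */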
2756	err = 0;
2757	while (!err) {
2758		trans = btrfs_start_transaction(fs_info->fs_root, 0);
2759		if (IS_ERR(trans)) {
2760			err = PTR_ERR(trans);
2761			break;
2762		}
2763		if (!fs_info->quota_enabled) {
2764			err = -EINTR;
2765		} else {
2766			err = qgroup_rescan_leaf(fs_info, path, trans,
2767						 qgroups, tmp, scratch_leaf);
2768		}
2769		if (err > 0)
2770			btrfs_commit_transaction(trans, fs_info->fs_root);
2771		else
2772			btrfs_end_transaction(trans, fs_info->fs_root);
2773	}
2774
2775out:
2776	kfree(scratch_leaf);
2777	ulist_free(qgroups);
2778	ulist_free(tmp);
2779	btrfs_free_path(path);
2780
2781	mutex_lock(&fs_info->qgroup_rescan_lock);
2782	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2783
2784	if (err > 0 &&
2785	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
2786		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2787	} else if (err < 0) {
2788		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2789	}
2790	mutex_unlock(&fs_info->qgroup_rescan_lock);
2791
2792	/*
2793	 * only update status, since the previous part has already updated the
2794	 * qgroup info.
2795	 */
2796	trans = btrfs_start_transaction(fs_info->quota_root, 1);
2797	if (IS_ERR(trans)) {
2798		err = PTR_ERR(trans);
2799		btrfs_err(fs_info,
2800			  "failed to start transaction for status update: %d",
2801			  err);
2802		goto done;
2803	}
2804	ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
2805	if (ret < 0) {
2806		err = ret;
2807		btrfs_err(fs_info, "failed to update qgroup status: %d", err);
2808	}
2809	btrfs_end_transaction(trans, fs_info->quota_root);
2810
2811	if (err >= 0) {
2812		btrfs_info(fs_info, "qgroup scan completed%s",
2813			err > 0 ? " (inconsistency flag cleared)" : "");
2814	} else {
2815		btrfs_err(fs_info, "qgroup scan failed with %d", err);
2816	}
2817
2818done:
2819	complete_all(&fs_info->qgroup_rescan_completion);
2820}
2821
2822/*
2823 * Checks that (a) no rescan is already running and (b) quota is enabled when a
2824 * new rescan is requested, and initializes the rescan progress and work item.
2825 */
2826static int
2827qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2828		   int init_flags)
2829{
2830	int ret = 0;
2831
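	/*
	 * init_flags == 0 means we are resuming a rescan that is already
	 * recorded as in progress (e.g. when picking it back up after a
	 * mount); a non-zero value means a fresh rescan request, which has to
	 * mark the rescan as running below.
	 */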
2832	if (!init_flags &&
2833	    (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2834	     !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2835		ret = -EINVAL;
2836		goto err;
2837	}
2838
2839	mutex_lock(&fs_info->qgroup_rescan_lock);
2840	spin_lock(&fs_info->qgroup_lock);
2841
2842	if (init_flags) {
2843		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2844			ret = -EINPROGRESS;
2845		else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2846			ret = -EINVAL;
2847
2848		if (ret) {
2849			spin_unlock(&fs_info->qgroup_lock);
2850			mutex_unlock(&fs_info->qgroup_rescan_lock);
2851			goto err;
2852		}
2853		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2854	}
2855
2856	memset(&fs_info->qgroup_rescan_progress, 0,
2857		sizeof(fs_info->qgroup_rescan_progress));
2858	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2859
2860	spin_unlock(&fs_info->qgroup_lock);
2861	mutex_unlock(&fs_info->qgroup_rescan_lock);
2862
2863	init_completion(&fs_info->qgroup_rescan_completion);
2864
2865	memset(&fs_info->qgroup_rescan_work, 0,
2866	       sizeof(fs_info->qgroup_rescan_work));
2867	btrfs_init_work(&fs_info->qgroup_rescan_work,
2868			btrfs_qgroup_rescan_helper,
2869			btrfs_qgroup_rescan_worker, NULL, NULL);
2870
2871	if (ret) {
2872err:
2873		btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
2874		return ret;
2875	}
2876
2877	return 0;
2878}
2879
2880static void
2881qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2882{
2883	struct rb_node *n;
2884	struct btrfs_qgroup *qgroup;
2885
2886	spin_lock(&fs_info->qgroup_lock);
2887	/* clear all current qgroup tracking information */
2888	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2889		qgroup = rb_entry(n, struct btrfs_qgroup, node);
2890		qgroup->rfer = 0;
2891		qgroup->rfer_cmpr = 0;
2892		qgroup->excl = 0;
2893		qgroup->excl_cmpr = 0;
2894	}
2895	spin_unlock(&fs_info->qgroup_lock);
2896}
2897
2898int
2899btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2900{
2901	int ret = 0;
2902	struct btrfs_trans_handle *trans;
2903
2904	ret = qgroup_rescan_init(fs_info, 0, 1);
2905	if (ret)
2906		return ret;
2907
2908	/*
2909	 * We have set the rescan_progress to 0, which means no more
2910	 * delayed refs will be accounted by btrfs_qgroup_account.
2911	 * However, btrfs_qgroup_account may be right after its call
2912	 * to btrfs_find_all_roots, in which case it would still do the
2913	 * accounting.
2914	 * To solve this, we're committing the transaction, which will
2915	 * ensure we run all delayed refs and only after that, we are
2916	 * going to clear all tracking information for a clean start.
2917	 */
2918
2919	trans = btrfs_join_transaction(fs_info->fs_root);
2920	if (IS_ERR(trans)) {
2921		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2922		return PTR_ERR(trans);
2923	}
2924	ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2925	if (ret) {
2926		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2927		return ret;
2928	}
2929
2930	qgroup_rescan_zero_tracking(fs_info);
2931
2932	btrfs_queue_work(fs_info->qgroup_rescan_workers,
2933			 &fs_info->qgroup_rescan_work);
2934
2935	return 0;
2936}
2937
2938int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2939{
2940	int running;
2941	int ret = 0;
2942
2943	mutex_lock(&fs_info->qgroup_rescan_lock);
2944	spin_lock(&fs_info->qgroup_lock);
2945	running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2946	spin_unlock(&fs_info->qgroup_lock);
2947	mutex_unlock(&fs_info->qgroup_rescan_lock);
2948
2949	if (running)
2950		ret = wait_for_completion_interruptible(
2951					&fs_info->qgroup_rescan_completion);
2952
2953	return ret;
2954}
2955
2956/*
2957 * this is only called from open_ctree where we're still single threaded, thus
2958 * locking is omitted here.
2959 */
2960void
2961btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2962{
2963	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2964		btrfs_queue_work(fs_info->qgroup_rescan_workers,
2965				 &fs_info->qgroup_rescan_work);
2966}
2967