1/*
2 * Copyright (C) 2007 Oracle.  All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/seq_file.h>
28#include <linux/string.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/magic.h>
41#include <linux/slab.h>
42#include <linux/cleancache.h>
43#include <linux/ratelimit.h>
44#include <linux/btrfs.h>
45#include "delayed-inode.h"
46#include "ctree.h"
47#include "disk-io.h"
48#include "transaction.h"
49#include "btrfs_inode.h"
50#include "print-tree.h"
51#include "hash.h"
52#include "props.h"
53#include "xattr.h"
54#include "volumes.h"
55#include "export.h"
56#include "compression.h"
57#include "rcu-string.h"
58#include "dev-replace.h"
59#include "free-space-cache.h"
60#include "backref.h"
61#include "tests/btrfs-tests.h"
62
63#include "qgroup.h"
64#define CREATE_TRACE_POINTS
65#include <trace/events/btrfs.h>
66
/*
 * Forward declarations.  btrfs_super_ops is installed into the superblock
 * by btrfs_fill_super() below, before its initializer appears.
 */
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;

/* Remount handler, declared ahead of its definition. */
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
71
/*
 * Translate a negative errno value into a short description.
 *
 * Only a handful of codes are known; every other value (including 0 and
 * positive numbers) maps to "unknown".  The result is a string literal
 * and must not be modified or freed.
 */
static const char *btrfs_decode_error(int errno)
{
	switch (errno) {
	case -EIO:
		return "IO failure";
	case -ENOMEM:
		return "Out of memory";
	case -EROFS:
		return "Readonly filesystem";
	case -EEXIST:
		return "Object already exists";
	case -ENOSPC:
		return "No space left";
	case -ENOENT:
		return "No such entry";
	default:
		return "unknown";
	}
}
99
/*
 * Record that the filesystem has hit an error by setting
 * BTRFS_FS_STATE_ERROR in fs_info->fs_state.
 */
static void save_error_info(struct btrfs_fs_info *fs_info)
{
	/*
	 * today we only save the error info into ram.  Long term we'll
	 * also send it down to the disk
	 */
	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
}
108
109/* btrfs handle error by forcing the filesystem readonly */
110static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
111{
112	struct super_block *sb = fs_info->sb;
113
114	if (sb->s_flags & MS_RDONLY)
115		return;
116
117	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
118		sb->s_flags |= MS_RDONLY;
119		btrfs_info(fs_info, "forced readonly");
120		/*
121		 * Note that a running device replace operation is not
122		 * canceled here although there is no way to update
123		 * the progress. It would add the risk of a deadlock,
124		 * therefore the canceling is ommited. The only penalty
125		 * is that some I/O remains active until the procedure
126		 * completes. The next time when the filesystem is
127		 * mounted writeable again, the device replace
128		 * operation continues.
129		 */
130	}
131}
132
133#ifdef CONFIG_PRINTK
134/*
135 * __btrfs_std_error decodes expected errors from the caller and
136 * invokes the approciate error response.
137 */
138void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
139		       unsigned int line, int errno, const char *fmt, ...)
140{
141	struct super_block *sb = fs_info->sb;
142	const char *errstr;
143
144	/*
145	 * Special case: if the error is EROFS, and we're already
146	 * under MS_RDONLY, then it is safe here.
147	 */
148	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
149  		return;
150
151	errstr = btrfs_decode_error(errno);
152	if (fmt) {
153		struct va_format vaf;
154		va_list args;
155
156		va_start(args, fmt);
157		vaf.fmt = fmt;
158		vaf.va = &args;
159
160		printk(KERN_CRIT
161			"BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
162			sb->s_id, function, line, errno, errstr, &vaf);
163		va_end(args);
164	} else {
165		printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
166			sb->s_id, function, line, errno, errstr);
167	}
168
169	/* Don't go through full error handling during mount */
170	save_error_info(fs_info);
171	if (sb->s_flags & MS_BORN)
172		btrfs_handle_error(fs_info);
173}
174
175static const char * const logtypes[] = {
176	"emergency",
177	"alert",
178	"critical",
179	"error",
180	"warning",
181	"notice",
182	"info",
183	"debug",
184};
185
186void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
187{
188	struct super_block *sb = fs_info->sb;
189	char lvl[4];
190	struct va_format vaf;
191	va_list args;
192	const char *type = logtypes[4];
193	int kern_level;
194
195	va_start(args, fmt);
196
197	kern_level = printk_get_level(fmt);
198	if (kern_level) {
199		size_t size = printk_skip_level(fmt) - fmt;
200		memcpy(lvl, fmt,  size);
201		lvl[size] = '\0';
202		fmt += size;
203		type = logtypes[kern_level - '0'];
204	} else
205		*lvl = '\0';
206
207	vaf.fmt = fmt;
208	vaf.va = &args;
209
210	printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
211
212	va_end(args);
213}
214
215#else
216
217void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
218		       unsigned int line, int errno, const char *fmt, ...)
219{
220	struct super_block *sb = fs_info->sb;
221
222	/*
223	 * Special case: if the error is EROFS, and we're already
224	 * under MS_RDONLY, then it is safe here.
225	 */
226	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
227		return;
228
229	/* Don't go through full error handling during mount */
230	if (sb->s_flags & MS_BORN) {
231		save_error_info(fs_info);
232		btrfs_handle_error(fs_info);
233	}
234}
235#endif
236
237/*
238 * We only mark the transaction aborted and then set the file system read-only.
239 * This will prevent new transactions from starting or trying to join this
240 * one.
241 *
242 * This means that error recovery at the call site is limited to freeing
243 * any local memory allocations and passing the error code up without
244 * further cleanup. The transaction should complete as it normally would
245 * in the call path but will return -EIO.
246 *
247 * We'll complete the cleanup in btrfs_end_transaction and
248 * btrfs_commit_transaction.
249 */
250void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
251			       struct btrfs_root *root, const char *function,
252			       unsigned int line, int errno)
253{
254	/*
255	 * Report first abort since mount
256	 */
257	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
258				&root->fs_info->fs_state)) {
259		WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
260				errno);
261	}
262	trans->aborted = errno;
263	/* Nothing used. The other threads that have joined this
264	 * transaction may be able to continue. */
265	if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
266		const char *errstr;
267
268		errstr = btrfs_decode_error(errno);
269		btrfs_warn(root->fs_info,
270		           "%s:%d: Aborting unused transaction(%s).",
271		           function, line, errstr);
272		return;
273	}
274	ACCESS_ONCE(trans->transaction->aborted) = errno;
275	/* Wake up anybody who may be waiting on this transaction */
276	wake_up(&root->fs_info->transaction_wait);
277	wake_up(&root->fs_info->transaction_blocked_wait);
278	__btrfs_std_error(root->fs_info, function, line, errno, NULL);
279}
280/*
281 * __btrfs_panic decodes unexpected, fatal errors from the caller,
282 * issues an alert, and either panics or BUGs, depending on mount options.
283 */
284void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
285		   unsigned int line, int errno, const char *fmt, ...)
286{
287	char *s_id = "<unknown>";
288	const char *errstr;
289	struct va_format vaf = { .fmt = fmt };
290	va_list args;
291
292	if (fs_info)
293		s_id = fs_info->sb->s_id;
294
295	va_start(args, fmt);
296	vaf.va = &args;
297
298	errstr = btrfs_decode_error(errno);
299	if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
300		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
301			s_id, function, line, &vaf, errno, errstr);
302
303	btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
304		   function, line, &vaf, errno, errstr);
305	va_end(args);
306	/* Caller calls BUG() */
307}
308
309static void btrfs_put_super(struct super_block *sb)
310{
311	close_ctree(btrfs_sb(sb)->tree_root);
312}
313
/*
 * Mount option tokens.  The values are only used as identifiers by the
 * option match table and the parsers; Opt_err is the last entry.
 */
enum {
	Opt_degraded,
	Opt_subvol,
	Opt_subvolid,
	Opt_device,
	Opt_nodatasum,
	Opt_nodatacow,
	Opt_max_inline,
	Opt_alloc_start,
	Opt_nobarrier,
	Opt_ssd,
	Opt_nossd,
	Opt_ssd_spread,
	Opt_thread_pool,
	Opt_noacl,
	Opt_compress,
	Opt_compress_type,
	Opt_compress_force,
	Opt_compress_force_type,
	Opt_notreelog,
	Opt_ratio,
	Opt_flushoncommit,
	Opt_discard,
	Opt_space_cache,
	Opt_clear_cache,
	Opt_user_subvol_rm_allowed,
	Opt_enospc_debug,
	Opt_subvolrootid,
	Opt_defrag,
	Opt_inode_cache,
	Opt_no_space_cache,
	Opt_recovery,
	Opt_skip_balance,
	Opt_check_integrity,
	Opt_check_integrity_including_extent_data,
	Opt_check_integrity_print_mask,
	Opt_fatal_errors,
	Opt_rescan_uuid_tree,
	Opt_commit_interval,
	Opt_barrier,
	Opt_nodefrag,
	Opt_nodiscard,
	Opt_noenospc_debug,
	Opt_noflushoncommit,
	Opt_acl,
	Opt_datacow,
	Opt_datasum,
	Opt_treelog,
	Opt_noinode_cache,
	Opt_err,
};
330
331static match_table_t tokens = {
332	{Opt_degraded, "degraded"},
333	{Opt_subvol, "subvol=%s"},
334	{Opt_subvolid, "subvolid=%s"},
335	{Opt_device, "device=%s"},
336	{Opt_nodatasum, "nodatasum"},
337	{Opt_datasum, "datasum"},
338	{Opt_nodatacow, "nodatacow"},
339	{Opt_datacow, "datacow"},
340	{Opt_nobarrier, "nobarrier"},
341	{Opt_barrier, "barrier"},
342	{Opt_max_inline, "max_inline=%s"},
343	{Opt_alloc_start, "alloc_start=%s"},
344	{Opt_thread_pool, "thread_pool=%d"},
345	{Opt_compress, "compress"},
346	{Opt_compress_type, "compress=%s"},
347	{Opt_compress_force, "compress-force"},
348	{Opt_compress_force_type, "compress-force=%s"},
349	{Opt_ssd, "ssd"},
350	{Opt_ssd_spread, "ssd_spread"},
351	{Opt_nossd, "nossd"},
352	{Opt_acl, "acl"},
353	{Opt_noacl, "noacl"},
354	{Opt_notreelog, "notreelog"},
355	{Opt_treelog, "treelog"},
356	{Opt_flushoncommit, "flushoncommit"},
357	{Opt_noflushoncommit, "noflushoncommit"},
358	{Opt_ratio, "metadata_ratio=%d"},
359	{Opt_discard, "discard"},
360	{Opt_nodiscard, "nodiscard"},
361	{Opt_space_cache, "space_cache"},
362	{Opt_clear_cache, "clear_cache"},
363	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
364	{Opt_enospc_debug, "enospc_debug"},
365	{Opt_noenospc_debug, "noenospc_debug"},
366	{Opt_subvolrootid, "subvolrootid=%d"},
367	{Opt_defrag, "autodefrag"},
368	{Opt_nodefrag, "noautodefrag"},
369	{Opt_inode_cache, "inode_cache"},
370	{Opt_noinode_cache, "noinode_cache"},
371	{Opt_no_space_cache, "nospace_cache"},
372	{Opt_recovery, "recovery"},
373	{Opt_skip_balance, "skip_balance"},
374	{Opt_check_integrity, "check_int"},
375	{Opt_check_integrity_including_extent_data, "check_int_data"},
376	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
377	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
378	{Opt_fatal_errors, "fatal_errors=%s"},
379	{Opt_commit_interval, "commit=%d"},
380	{Opt_err, NULL},
381};
382
383/*
384 * Regular mount options parser.  Everything that is needed only when
385 * reading in a new superblock is parsed here.
386 * XXX JDM: This needs to be cleaned up for remount.
387 */
388int btrfs_parse_options(struct btrfs_root *root, char *options)
389{
390	struct btrfs_fs_info *info = root->fs_info;
391	substring_t args[MAX_OPT_ARGS];
392	char *p, *num, *orig = NULL;
393	u64 cache_gen;
394	int intarg;
395	int ret = 0;
396	char *compress_type;
397	bool compress_force = false;
398
399	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
400	if (cache_gen)
401		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
402
403	if (!options)
404		goto out;
405
406	/*
407	 * strsep changes the string, duplicate it because parse_options
408	 * gets called twice
409	 */
410	options = kstrdup(options, GFP_NOFS);
411	if (!options)
412		return -ENOMEM;
413
414	orig = options;
415
416	while ((p = strsep(&options, ",")) != NULL) {
417		int token;
418		if (!*p)
419			continue;
420
421		token = match_token(p, tokens, args);
422		switch (token) {
423		case Opt_degraded:
424			btrfs_info(root->fs_info, "allowing degraded mounts");
425			btrfs_set_opt(info->mount_opt, DEGRADED);
426			break;
427		case Opt_subvol:
428		case Opt_subvolid:
429		case Opt_subvolrootid:
430		case Opt_device:
431			/*
432			 * These are parsed by btrfs_parse_early_options
433			 * and can be happily ignored here.
434			 */
435			break;
436		case Opt_nodatasum:
437			btrfs_set_and_info(root, NODATASUM,
438					   "setting nodatasum");
439			break;
440		case Opt_datasum:
441			if (btrfs_test_opt(root, NODATASUM)) {
442				if (btrfs_test_opt(root, NODATACOW))
443					btrfs_info(root->fs_info, "setting datasum, datacow enabled");
444				else
445					btrfs_info(root->fs_info, "setting datasum");
446			}
447			btrfs_clear_opt(info->mount_opt, NODATACOW);
448			btrfs_clear_opt(info->mount_opt, NODATASUM);
449			break;
450		case Opt_nodatacow:
451			if (!btrfs_test_opt(root, NODATACOW)) {
452				if (!btrfs_test_opt(root, COMPRESS) ||
453				    !btrfs_test_opt(root, FORCE_COMPRESS)) {
454					btrfs_info(root->fs_info,
455						   "setting nodatacow, compression disabled");
456				} else {
457					btrfs_info(root->fs_info, "setting nodatacow");
458				}
459			}
460			btrfs_clear_opt(info->mount_opt, COMPRESS);
461			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
462			btrfs_set_opt(info->mount_opt, NODATACOW);
463			btrfs_set_opt(info->mount_opt, NODATASUM);
464			break;
465		case Opt_datacow:
466			btrfs_clear_and_info(root, NODATACOW,
467					     "setting datacow");
468			break;
469		case Opt_compress_force:
470		case Opt_compress_force_type:
471			compress_force = true;
472			/* Fallthrough */
473		case Opt_compress:
474		case Opt_compress_type:
475			if (token == Opt_compress ||
476			    token == Opt_compress_force ||
477			    strcmp(args[0].from, "zlib") == 0) {
478				compress_type = "zlib";
479				info->compress_type = BTRFS_COMPRESS_ZLIB;
480				btrfs_set_opt(info->mount_opt, COMPRESS);
481				btrfs_clear_opt(info->mount_opt, NODATACOW);
482				btrfs_clear_opt(info->mount_opt, NODATASUM);
483			} else if (strcmp(args[0].from, "lzo") == 0) {
484				compress_type = "lzo";
485				info->compress_type = BTRFS_COMPRESS_LZO;
486				btrfs_set_opt(info->mount_opt, COMPRESS);
487				btrfs_clear_opt(info->mount_opt, NODATACOW);
488				btrfs_clear_opt(info->mount_opt, NODATASUM);
489				btrfs_set_fs_incompat(info, COMPRESS_LZO);
490			} else if (strncmp(args[0].from, "no", 2) == 0) {
491				compress_type = "no";
492				btrfs_clear_opt(info->mount_opt, COMPRESS);
493				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
494				compress_force = false;
495			} else {
496				ret = -EINVAL;
497				goto out;
498			}
499
500			if (compress_force) {
501				btrfs_set_and_info(root, FORCE_COMPRESS,
502						   "force %s compression",
503						   compress_type);
504			} else {
505				if (!btrfs_test_opt(root, COMPRESS))
506					btrfs_info(root->fs_info,
507						   "btrfs: use %s compression",
508						   compress_type);
509				/*
510				 * If we remount from compress-force=xxx to
511				 * compress=xxx, we need clear FORCE_COMPRESS
512				 * flag, otherwise, there is no way for users
513				 * to disable forcible compression separately.
514				 */
515				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
516			}
517			break;
518		case Opt_ssd:
519			btrfs_set_and_info(root, SSD,
520					   "use ssd allocation scheme");
521			break;
522		case Opt_ssd_spread:
523			btrfs_set_and_info(root, SSD_SPREAD,
524					   "use spread ssd allocation scheme");
525			btrfs_set_opt(info->mount_opt, SSD);
526			break;
527		case Opt_nossd:
528			btrfs_set_and_info(root, NOSSD,
529					     "not using ssd allocation scheme");
530			btrfs_clear_opt(info->mount_opt, SSD);
531			break;
532		case Opt_barrier:
533			btrfs_clear_and_info(root, NOBARRIER,
534					     "turning on barriers");
535			break;
536		case Opt_nobarrier:
537			btrfs_set_and_info(root, NOBARRIER,
538					   "turning off barriers");
539			break;
540		case Opt_thread_pool:
541			ret = match_int(&args[0], &intarg);
542			if (ret) {
543				goto out;
544			} else if (intarg > 0) {
545				info->thread_pool_size = intarg;
546			} else {
547				ret = -EINVAL;
548				goto out;
549			}
550			break;
551		case Opt_max_inline:
552			num = match_strdup(&args[0]);
553			if (num) {
554				info->max_inline = memparse(num, NULL);
555				kfree(num);
556
557				if (info->max_inline) {
558					info->max_inline = min_t(u64,
559						info->max_inline,
560						root->sectorsize);
561				}
562				btrfs_info(root->fs_info, "max_inline at %llu",
563					info->max_inline);
564			} else {
565				ret = -ENOMEM;
566				goto out;
567			}
568			break;
569		case Opt_alloc_start:
570			num = match_strdup(&args[0]);
571			if (num) {
572				mutex_lock(&info->chunk_mutex);
573				info->alloc_start = memparse(num, NULL);
574				mutex_unlock(&info->chunk_mutex);
575				kfree(num);
576				btrfs_info(root->fs_info, "allocations start at %llu",
577					info->alloc_start);
578			} else {
579				ret = -ENOMEM;
580				goto out;
581			}
582			break;
583		case Opt_acl:
584#ifdef CONFIG_BTRFS_FS_POSIX_ACL
585			root->fs_info->sb->s_flags |= MS_POSIXACL;
586			break;
587#else
588			btrfs_err(root->fs_info,
589				"support for ACL not compiled in!");
590			ret = -EINVAL;
591			goto out;
592#endif
593		case Opt_noacl:
594			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
595			break;
596		case Opt_notreelog:
597			btrfs_set_and_info(root, NOTREELOG,
598					   "disabling tree log");
599			break;
600		case Opt_treelog:
601			btrfs_clear_and_info(root, NOTREELOG,
602					     "enabling tree log");
603			break;
604		case Opt_flushoncommit:
605			btrfs_set_and_info(root, FLUSHONCOMMIT,
606					   "turning on flush-on-commit");
607			break;
608		case Opt_noflushoncommit:
609			btrfs_clear_and_info(root, FLUSHONCOMMIT,
610					     "turning off flush-on-commit");
611			break;
612		case Opt_ratio:
613			ret = match_int(&args[0], &intarg);
614			if (ret) {
615				goto out;
616			} else if (intarg >= 0) {
617				info->metadata_ratio = intarg;
618				btrfs_info(root->fs_info, "metadata ratio %d",
619				       info->metadata_ratio);
620			} else {
621				ret = -EINVAL;
622				goto out;
623			}
624			break;
625		case Opt_discard:
626			btrfs_set_and_info(root, DISCARD,
627					   "turning on discard");
628			break;
629		case Opt_nodiscard:
630			btrfs_clear_and_info(root, DISCARD,
631					     "turning off discard");
632			break;
633		case Opt_space_cache:
634			btrfs_set_and_info(root, SPACE_CACHE,
635					   "enabling disk space caching");
636			break;
637		case Opt_rescan_uuid_tree:
638			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
639			break;
640		case Opt_no_space_cache:
641			btrfs_clear_and_info(root, SPACE_CACHE,
642					     "disabling disk space caching");
643			break;
644		case Opt_inode_cache:
645			btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
646					   "enabling inode map caching");
647			break;
648		case Opt_noinode_cache:
649			btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
650					     "disabling inode map caching");
651			break;
652		case Opt_clear_cache:
653			btrfs_set_and_info(root, CLEAR_CACHE,
654					   "force clearing of disk cache");
655			break;
656		case Opt_user_subvol_rm_allowed:
657			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
658			break;
659		case Opt_enospc_debug:
660			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
661			break;
662		case Opt_noenospc_debug:
663			btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
664			break;
665		case Opt_defrag:
666			btrfs_set_and_info(root, AUTO_DEFRAG,
667					   "enabling auto defrag");
668			break;
669		case Opt_nodefrag:
670			btrfs_clear_and_info(root, AUTO_DEFRAG,
671					     "disabling auto defrag");
672			break;
673		case Opt_recovery:
674			btrfs_info(root->fs_info, "enabling auto recovery");
675			btrfs_set_opt(info->mount_opt, RECOVERY);
676			break;
677		case Opt_skip_balance:
678			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
679			break;
680#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
681		case Opt_check_integrity_including_extent_data:
682			btrfs_info(root->fs_info,
683				   "enabling check integrity including extent data");
684			btrfs_set_opt(info->mount_opt,
685				      CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
686			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
687			break;
688		case Opt_check_integrity:
689			btrfs_info(root->fs_info, "enabling check integrity");
690			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
691			break;
692		case Opt_check_integrity_print_mask:
693			ret = match_int(&args[0], &intarg);
694			if (ret) {
695				goto out;
696			} else if (intarg >= 0) {
697				info->check_integrity_print_mask = intarg;
698				btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
699				       info->check_integrity_print_mask);
700			} else {
701				ret = -EINVAL;
702				goto out;
703			}
704			break;
705#else
706		case Opt_check_integrity_including_extent_data:
707		case Opt_check_integrity:
708		case Opt_check_integrity_print_mask:
709			btrfs_err(root->fs_info,
710				"support for check_integrity* not compiled in!");
711			ret = -EINVAL;
712			goto out;
713#endif
714		case Opt_fatal_errors:
715			if (strcmp(args[0].from, "panic") == 0)
716				btrfs_set_opt(info->mount_opt,
717					      PANIC_ON_FATAL_ERROR);
718			else if (strcmp(args[0].from, "bug") == 0)
719				btrfs_clear_opt(info->mount_opt,
720					      PANIC_ON_FATAL_ERROR);
721			else {
722				ret = -EINVAL;
723				goto out;
724			}
725			break;
726		case Opt_commit_interval:
727			intarg = 0;
728			ret = match_int(&args[0], &intarg);
729			if (ret < 0) {
730				btrfs_err(root->fs_info, "invalid commit interval");
731				ret = -EINVAL;
732				goto out;
733			}
734			if (intarg > 0) {
735				if (intarg > 300) {
736					btrfs_warn(root->fs_info, "excessive commit interval %d",
737							intarg);
738				}
739				info->commit_interval = intarg;
740			} else {
741				btrfs_info(root->fs_info, "using default commit interval %ds",
742				    BTRFS_DEFAULT_COMMIT_INTERVAL);
743				info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
744			}
745			break;
746		case Opt_err:
747			btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
748			ret = -EINVAL;
749			goto out;
750		default:
751			break;
752		}
753	}
754out:
755	if (!ret && btrfs_test_opt(root, SPACE_CACHE))
756		btrfs_info(root->fs_info, "disk space caching is enabled");
757	kfree(orig);
758	return ret;
759}
760
761/*
762 * Parse mount options that are required early in the mount process.
763 *
764 * All other options will be parsed on much later in the mount process and
765 * only when we need to allocate a new super block.
766 */
767static int btrfs_parse_early_options(const char *options, fmode_t flags,
768		void *holder, char **subvol_name, u64 *subvol_objectid,
769		struct btrfs_fs_devices **fs_devices)
770{
771	substring_t args[MAX_OPT_ARGS];
772	char *device_name, *opts, *orig, *p;
773	char *num = NULL;
774	int error = 0;
775
776	if (!options)
777		return 0;
778
779	/*
780	 * strsep changes the string, duplicate it because parse_options
781	 * gets called twice
782	 */
783	opts = kstrdup(options, GFP_KERNEL);
784	if (!opts)
785		return -ENOMEM;
786	orig = opts;
787
788	while ((p = strsep(&opts, ",")) != NULL) {
789		int token;
790		if (!*p)
791			continue;
792
793		token = match_token(p, tokens, args);
794		switch (token) {
795		case Opt_subvol:
796			kfree(*subvol_name);
797			*subvol_name = match_strdup(&args[0]);
798			if (!*subvol_name) {
799				error = -ENOMEM;
800				goto out;
801			}
802			break;
803		case Opt_subvolid:
804			num = match_strdup(&args[0]);
805			if (num) {
806				*subvol_objectid = memparse(num, NULL);
807				kfree(num);
808				/* we want the original fs_tree */
809				if (!*subvol_objectid)
810					*subvol_objectid =
811						BTRFS_FS_TREE_OBJECTID;
812			} else {
813				error = -EINVAL;
814				goto out;
815			}
816			break;
817		case Opt_subvolrootid:
818			printk(KERN_WARNING
819				"BTRFS: 'subvolrootid' mount option is deprecated and has "
820				"no effect\n");
821			break;
822		case Opt_device:
823			device_name = match_strdup(&args[0]);
824			if (!device_name) {
825				error = -ENOMEM;
826				goto out;
827			}
828			error = btrfs_scan_one_device(device_name,
829					flags, holder, fs_devices);
830			kfree(device_name);
831			if (error)
832				goto out;
833			break;
834		default:
835			break;
836		}
837	}
838
839out:
840	kfree(orig);
841	return error;
842}
843
844static struct dentry *get_default_root(struct super_block *sb,
845				       u64 subvol_objectid)
846{
847	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
848	struct btrfs_root *root = fs_info->tree_root;
849	struct btrfs_root *new_root;
850	struct btrfs_dir_item *di;
851	struct btrfs_path *path;
852	struct btrfs_key location;
853	struct inode *inode;
854	u64 dir_id;
855	int new = 0;
856
857	/*
858	 * We have a specific subvol we want to mount, just setup location and
859	 * go look up the root.
860	 */
861	if (subvol_objectid) {
862		location.objectid = subvol_objectid;
863		location.type = BTRFS_ROOT_ITEM_KEY;
864		location.offset = (u64)-1;
865		goto find_root;
866	}
867
868	path = btrfs_alloc_path();
869	if (!path)
870		return ERR_PTR(-ENOMEM);
871	path->leave_spinning = 1;
872
873	/*
874	 * Find the "default" dir item which points to the root item that we
875	 * will mount by default if we haven't been given a specific subvolume
876	 * to mount.
877	 */
878	dir_id = btrfs_super_root_dir(fs_info->super_copy);
879	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
880	if (IS_ERR(di)) {
881		btrfs_free_path(path);
882		return ERR_CAST(di);
883	}
884	if (!di) {
885		/*
886		 * Ok the default dir item isn't there.  This is weird since
887		 * it's always been there, but don't freak out, just try and
888		 * mount to root most subvolume.
889		 */
890		btrfs_free_path(path);
891		dir_id = BTRFS_FIRST_FREE_OBJECTID;
892		new_root = fs_info->fs_root;
893		goto setup_root;
894	}
895
896	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
897	btrfs_free_path(path);
898
899find_root:
900	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
901	if (IS_ERR(new_root))
902		return ERR_CAST(new_root);
903
904	if (!(sb->s_flags & MS_RDONLY)) {
905		int ret;
906		down_read(&fs_info->cleanup_work_sem);
907		ret = btrfs_orphan_cleanup(new_root);
908		up_read(&fs_info->cleanup_work_sem);
909		if (ret)
910			return ERR_PTR(ret);
911	}
912
913	dir_id = btrfs_root_dirid(&new_root->root_item);
914setup_root:
915	location.objectid = dir_id;
916	location.type = BTRFS_INODE_ITEM_KEY;
917	location.offset = 0;
918
919	inode = btrfs_iget(sb, &location, new_root, &new);
920	if (IS_ERR(inode))
921		return ERR_CAST(inode);
922
923	/*
924	 * If we're just mounting the root most subvol put the inode and return
925	 * a reference to the dentry.  We will have already gotten a reference
926	 * to the inode in btrfs_fill_super so we're good to go.
927	 */
928	if (!new && d_inode(sb->s_root) == inode) {
929		iput(inode);
930		return dget(sb->s_root);
931	}
932
933	return d_obtain_root(inode);
934}
935
936static int btrfs_fill_super(struct super_block *sb,
937			    struct btrfs_fs_devices *fs_devices,
938			    void *data, int silent)
939{
940	struct inode *inode;
941	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
942	struct btrfs_key key;
943	int err;
944
945	sb->s_maxbytes = MAX_LFS_FILESIZE;
946	sb->s_magic = BTRFS_SUPER_MAGIC;
947	sb->s_op = &btrfs_super_ops;
948	sb->s_d_op = &btrfs_dentry_operations;
949	sb->s_export_op = &btrfs_export_ops;
950	sb->s_xattr = btrfs_xattr_handlers;
951	sb->s_time_gran = 1;
952#ifdef CONFIG_BTRFS_FS_POSIX_ACL
953	sb->s_flags |= MS_POSIXACL;
954#endif
955	sb->s_flags |= MS_I_VERSION;
956	err = open_ctree(sb, fs_devices, (char *)data);
957	if (err) {
958		printk(KERN_ERR "BTRFS: open_ctree failed\n");
959		return err;
960	}
961
962	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
963	key.type = BTRFS_INODE_ITEM_KEY;
964	key.offset = 0;
965	inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
966	if (IS_ERR(inode)) {
967		err = PTR_ERR(inode);
968		goto fail_close;
969	}
970
971	sb->s_root = d_make_root(inode);
972	if (!sb->s_root) {
973		err = -ENOMEM;
974		goto fail_close;
975	}
976
977	save_mount_options(sb, data);
978	cleancache_init_fs(sb);
979	sb->s_flags |= MS_ACTIVE;
980	return 0;
981
982fail_close:
983	close_ctree(fs_info->tree_root);
984	return err;
985}
986
987int btrfs_sync_fs(struct super_block *sb, int wait)
988{
989	struct btrfs_trans_handle *trans;
990	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
991	struct btrfs_root *root = fs_info->tree_root;
992
993	trace_btrfs_sync_fs(wait);
994
995	if (!wait) {
996		filemap_flush(fs_info->btree_inode->i_mapping);
997		return 0;
998	}
999
1000	btrfs_wait_ordered_roots(fs_info, -1);
1001
1002	trans = btrfs_attach_transaction_barrier(root);
1003	if (IS_ERR(trans)) {
1004		/* no transaction, don't bother */
1005		if (PTR_ERR(trans) == -ENOENT) {
1006			/*
1007			 * Exit unless we have some pending changes
1008			 * that need to go through commit
1009			 */
1010			if (fs_info->pending_changes == 0)
1011				return 0;
1012			/*
1013			 * A non-blocking test if the fs is frozen. We must not
1014			 * start a new transaction here otherwise a deadlock
1015			 * happens. The pending operations are delayed to the
1016			 * next commit after thawing.
1017			 */
1018			if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
1019				__sb_end_write(sb, SB_FREEZE_WRITE);
1020			else
1021				return 0;
1022			trans = btrfs_start_transaction(root, 0);
1023		}
1024		if (IS_ERR(trans))
1025			return PTR_ERR(trans);
1026	}
1027	return btrfs_commit_transaction(trans, root);
1028}
1029
/*
 * ->show_options callback: append the currently active btrfs mount options
 * to @seq as a run of ",option" / ",option=value" strings.
 *
 * Flag-style options are printed when the corresponding mount flag is set.
 * Valued options (max_inline, alloc_start, thread_pool, metadata_ratio,
 * commit, ...) are only printed when they differ from the value they are
 * compared against below.
 *
 * Always returns 0.
 */
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
	struct btrfs_root *root = info->tree_root;
	char *compress_type;

	if (btrfs_test_opt(root, DEGRADED))
		seq_puts(seq, ",degraded");
	if (btrfs_test_opt(root, NODATASUM))
		seq_puts(seq, ",nodatasum");
	if (btrfs_test_opt(root, NODATACOW))
		seq_puts(seq, ",nodatacow");
	if (btrfs_test_opt(root, NOBARRIER))
		seq_puts(seq, ",nobarrier");
	if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
		seq_printf(seq, ",max_inline=%llu", info->max_inline);
	if (info->alloc_start != 0)
		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
	/* thread_pool is shown only when it differs from min(online cpus + 2, 8) */
	if (info->thread_pool_size !=  min_t(unsigned long,
					     num_online_cpus() + 2, 8))
		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
	if (btrfs_test_opt(root, COMPRESS)) {
		/* anything that is not zlib is reported as lzo */
		if (info->compress_type == BTRFS_COMPRESS_ZLIB)
			compress_type = "zlib";
		else
			compress_type = "lzo";
		if (btrfs_test_opt(root, FORCE_COMPRESS))
			seq_printf(seq, ",compress-force=%s", compress_type);
		else
			seq_printf(seq, ",compress=%s", compress_type);
	}
	if (btrfs_test_opt(root, NOSSD))
		seq_puts(seq, ",nossd");
	/* ssd_spread takes precedence over plain ssd in the output */
	if (btrfs_test_opt(root, SSD_SPREAD))
		seq_puts(seq, ",ssd_spread");
	else if (btrfs_test_opt(root, SSD))
		seq_puts(seq, ",ssd");
	if (btrfs_test_opt(root, NOTREELOG))
		seq_puts(seq, ",notreelog");
	if (btrfs_test_opt(root, FLUSHONCOMMIT))
		seq_puts(seq, ",flushoncommit");
	if (btrfs_test_opt(root, DISCARD))
		seq_puts(seq, ",discard");
	if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
		seq_puts(seq, ",noacl");
	/* the space cache state is always printed, one way or the other */
	if (btrfs_test_opt(root, SPACE_CACHE))
		seq_puts(seq, ",space_cache");
	else
		seq_puts(seq, ",nospace_cache");
	if (btrfs_test_opt(root, RESCAN_UUID_TREE))
		seq_puts(seq, ",rescan_uuid_tree");
	if (btrfs_test_opt(root, CLEAR_CACHE))
		seq_puts(seq, ",clear_cache");
	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
		seq_puts(seq, ",user_subvol_rm_allowed");
	if (btrfs_test_opt(root, ENOSPC_DEBUG))
		seq_puts(seq, ",enospc_debug");
	if (btrfs_test_opt(root, AUTO_DEFRAG))
		seq_puts(seq, ",autodefrag");
	if (btrfs_test_opt(root, INODE_MAP_CACHE))
		seq_puts(seq, ",inode_cache");
	if (btrfs_test_opt(root, SKIP_BALANCE))
		seq_puts(seq, ",skip_balance");
	if (btrfs_test_opt(root, RECOVERY))
		seq_puts(seq, ",recovery");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	/* check_int_data takes precedence over check_int in the output */
	if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
		seq_puts(seq, ",check_int_data");
	else if (btrfs_test_opt(root, CHECK_INTEGRITY))
		seq_puts(seq, ",check_int");
	if (info->check_integrity_print_mask)
		seq_printf(seq, ",check_int_print_mask=%d",
				info->check_integrity_print_mask);
#endif
	if (info->metadata_ratio)
		seq_printf(seq, ",metadata_ratio=%d",
				info->metadata_ratio);
	if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
		seq_puts(seq, ",fatal_errors=panic");
	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
		seq_printf(seq, ",commit=%d", info->commit_interval);
	return 0;
}
1113
1114static int btrfs_test_super(struct super_block *s, void *data)
1115{
1116	struct btrfs_fs_info *p = data;
1117	struct btrfs_fs_info *fs_info = btrfs_sb(s);
1118
1119	return fs_info->fs_devices == p->fs_devices;
1120}
1121
1122static int btrfs_set_super(struct super_block *s, void *data)
1123{
1124	int err = set_anon_super(s, data);
1125	if (!err)
1126		s->s_fs_info = data;
1127	return err;
1128}
1129
1130/*
1131 * subvolumes are identified by ino 256
1132 */
1133static inline int is_subvolume_inode(struct inode *inode)
1134{
1135	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
1136		return 1;
1137	return 0;
1138}
1139
1140/*
1141 * This will strip out the subvol=%s argument for an argument string and add
1142 * subvolid=0 to make sure we get the actual tree root for path walking to the
1143 * subvol we want.
1144 */
1145static char *setup_root_args(char *args)
1146{
1147	unsigned len = strlen(args) + 2 + 1;
1148	char *src, *dst, *buf;
1149
1150	/*
1151	 * We need the same args as before, but with this substitution:
1152	 * s!subvol=[^,]+!subvolid=0!
1153	 *
1154	 * Since the replacement string is up to 2 bytes longer than the
1155	 * original, allocate strlen(args) + 2 + 1 bytes.
1156	 */
1157
1158	src = strstr(args, "subvol=");
1159	/* This shouldn't happen, but just in case.. */
1160	if (!src)
1161		return NULL;
1162
1163	buf = dst = kmalloc(len, GFP_NOFS);
1164	if (!buf)
1165		return NULL;
1166
1167	/*
1168	 * If the subvol= arg is not at the start of the string,
1169	 * copy whatever precedes it into buf.
1170	 */
1171	if (src != args) {
1172		*src++ = '\0';
1173		strcpy(buf, args);
1174		dst += strlen(args);
1175	}
1176
1177	strcpy(dst, "subvolid=0");
1178	dst += strlen("subvolid=0");
1179
1180	/*
1181	 * If there is a "," after the original subvol=... string,
1182	 * copy that suffix into our buffer.  Otherwise, we're done.
1183	 */
1184	src = strchr(src, ',');
1185	if (src)
1186		strcpy(dst, src);
1187
1188	return buf;
1189}
1190
1191static struct dentry *mount_subvol(const char *subvol_name, int flags,
1192				   const char *device_name, char *data)
1193{
1194	struct dentry *root;
1195	struct vfsmount *mnt;
1196	char *newargs;
1197
1198	newargs = setup_root_args(data);
1199	if (!newargs)
1200		return ERR_PTR(-ENOMEM);
1201	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
1202			     newargs);
1203
1204	if (PTR_RET(mnt) == -EBUSY) {
1205		if (flags & MS_RDONLY) {
1206			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
1207					     newargs);
1208		} else {
1209			int r;
1210			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
1211					     newargs);
1212			if (IS_ERR(mnt)) {
1213				kfree(newargs);
1214				return ERR_CAST(mnt);
1215			}
1216
1217			r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
1218			if (r < 0) {
1219				/* FIXME: release vfsmount mnt ??*/
1220				kfree(newargs);
1221				return ERR_PTR(r);
1222			}
1223		}
1224	}
1225
1226	kfree(newargs);
1227
1228	if (IS_ERR(mnt))
1229		return ERR_CAST(mnt);
1230
1231	root = mount_subtree(mnt, subvol_name);
1232
1233	if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
1234		struct super_block *s = root->d_sb;
1235		dput(root);
1236		root = ERR_PTR(-EINVAL);
1237		deactivate_locked_super(s);
1238		printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
1239				subvol_name);
1240	}
1241
1242	return root;
1243}
1244
1245static int parse_security_options(char *orig_opts,
1246				  struct security_mnt_opts *sec_opts)
1247{
1248	char *secdata = NULL;
1249	int ret = 0;
1250
1251	secdata = alloc_secdata();
1252	if (!secdata)
1253		return -ENOMEM;
1254	ret = security_sb_copy_data(orig_opts, secdata);
1255	if (ret) {
1256		free_secdata(secdata);
1257		return ret;
1258	}
1259	ret = security_sb_parse_opts_str(secdata, sec_opts);
1260	free_secdata(secdata);
1261	return ret;
1262}
1263
1264static int setup_security_options(struct btrfs_fs_info *fs_info,
1265				  struct super_block *sb,
1266				  struct security_mnt_opts *sec_opts)
1267{
1268	int ret = 0;
1269
1270	/*
1271	 * Call security_sb_set_mnt_opts() to check whether new sec_opts
1272	 * is valid.
1273	 */
1274	ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
1275	if (ret)
1276		return ret;
1277
1278#ifdef CONFIG_SECURITY
1279	if (!fs_info->security_opts.num_mnt_opts) {
1280		/* first time security setup, copy sec_opts to fs_info */
1281		memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1282	} else {
1283		/*
1284		 * Since SELinux(the only one supports security_mnt_opts) does
1285		 * NOT support changing context during remount/mount same sb,
1286		 * This must be the same or part of the same security options,
1287		 * just free it.
1288		 */
1289		security_free_mnt_opts(sec_opts);
1290	}
1291#endif
1292	return ret;
1293}
1294
1295/*
1296 * Find a superblock for the given device / mount point.
1297 *
1298 * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
1299 *	  for multiple device setup.  Make sure to keep it in sync.
1300 */
1301static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1302		const char *device_name, void *data)
1303{
1304	struct block_device *bdev = NULL;
1305	struct super_block *s;
1306	struct dentry *root;
1307	struct btrfs_fs_devices *fs_devices = NULL;
1308	struct btrfs_fs_info *fs_info = NULL;
1309	struct security_mnt_opts new_sec_opts;
1310	fmode_t mode = FMODE_READ;
1311	char *subvol_name = NULL;
1312	u64 subvol_objectid = 0;
1313	int error = 0;
1314
1315	if (!(flags & MS_RDONLY))
1316		mode |= FMODE_WRITE;
1317
1318	error = btrfs_parse_early_options(data, mode, fs_type,
1319					  &subvol_name, &subvol_objectid,
1320					  &fs_devices);
1321	if (error) {
1322		kfree(subvol_name);
1323		return ERR_PTR(error);
1324	}
1325
1326	if (subvol_name) {
1327		root = mount_subvol(subvol_name, flags, device_name, data);
1328		kfree(subvol_name);
1329		return root;
1330	}
1331
1332	security_init_mnt_opts(&new_sec_opts);
1333	if (data) {
1334		error = parse_security_options(data, &new_sec_opts);
1335		if (error)
1336			return ERR_PTR(error);
1337	}
1338
1339	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
1340	if (error)
1341		goto error_sec_opts;
1342
1343	/*
1344	 * Setup a dummy root and fs_info for test/set super.  This is because
1345	 * we don't actually fill this stuff out until open_ctree, but we need
1346	 * it for searching for existing supers, so this lets us do that and
1347	 * then open_ctree will properly initialize everything later.
1348	 */
1349	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
1350	if (!fs_info) {
1351		error = -ENOMEM;
1352		goto error_sec_opts;
1353	}
1354
1355	fs_info->fs_devices = fs_devices;
1356
1357	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1358	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1359	security_init_mnt_opts(&fs_info->security_opts);
1360	if (!fs_info->super_copy || !fs_info->super_for_commit) {
1361		error = -ENOMEM;
1362		goto error_fs_info;
1363	}
1364
1365	error = btrfs_open_devices(fs_devices, mode, fs_type);
1366	if (error)
1367		goto error_fs_info;
1368
1369	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
1370		error = -EACCES;
1371		goto error_close_devices;
1372	}
1373
1374	bdev = fs_devices->latest_bdev;
1375	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC,
1376		 fs_info);
1377	if (IS_ERR(s)) {
1378		error = PTR_ERR(s);
1379		goto error_close_devices;
1380	}
1381
1382	if (s->s_root) {
1383		btrfs_close_devices(fs_devices);
1384		free_fs_info(fs_info);
1385		if ((flags ^ s->s_flags) & MS_RDONLY)
1386			error = -EBUSY;
1387	} else {
1388		char b[BDEVNAME_SIZE];
1389
1390		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
1391		btrfs_sb(s)->bdev_holder = fs_type;
1392		error = btrfs_fill_super(s, fs_devices, data,
1393					 flags & MS_SILENT ? 1 : 0);
1394	}
1395
1396	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
1397	if (IS_ERR(root)) {
1398		deactivate_locked_super(s);
1399		error = PTR_ERR(root);
1400		goto error_sec_opts;
1401	}
1402
1403	fs_info = btrfs_sb(s);
1404	error = setup_security_options(fs_info, s, &new_sec_opts);
1405	if (error) {
1406		dput(root);
1407		deactivate_locked_super(s);
1408		goto error_sec_opts;
1409	}
1410
1411	return root;
1412
1413error_close_devices:
1414	btrfs_close_devices(fs_devices);
1415error_fs_info:
1416	free_fs_info(fs_info);
1417error_sec_opts:
1418	security_free_mnt_opts(&new_sec_opts);
1419	return ERR_PTR(error);
1420}
1421
1422static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1423				     int new_pool_size, int old_pool_size)
1424{
1425	if (new_pool_size == old_pool_size)
1426		return;
1427
1428	fs_info->thread_pool_size = new_pool_size;
1429
1430	btrfs_info(fs_info, "resize thread pool %d -> %d",
1431	       old_pool_size, new_pool_size);
1432
1433	btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1434	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1435	btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1436	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1437	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1438	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1439	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1440				new_pool_size);
1441	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1442	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1443	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1444	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1445	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1446				new_pool_size);
1447}
1448
/*
 * Mark the filesystem as being remounted by setting
 * BTRFS_FS_STATE_REMOUNTING in fs_info->fs_state.  The bit is cleared again
 * by btrfs_remount_cleanup().
 */
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
{
	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
1453
1454static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
1455				       unsigned long old_opts, int flags)
1456{
1457	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1458	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1459	     (flags & MS_RDONLY))) {
1460		/* wait for any defraggers to finish */
1461		wait_event(fs_info->transaction_wait,
1462			   (atomic_read(&fs_info->defrag_running) == 0));
1463		if (flags & MS_RDONLY)
1464			sync_filesystem(fs_info->sb);
1465	}
1466}
1467
1468static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
1469					 unsigned long old_opts)
1470{
1471	/*
1472	 * We need cleanup all defragable inodes if the autodefragment is
1473	 * close or the fs is R/O.
1474	 */
1475	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1476	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1477	     (fs_info->sb->s_flags & MS_RDONLY))) {
1478		btrfs_cleanup_defrag_inodes(fs_info);
1479	}
1480
1481	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1482}
1483
/*
 * ->remount_fs callback.
 *
 * @sb:    super block being remounted
 * @flags: requested MS_* flags; only MS_RDONLY is examined here
 * @data:  new mount option string, may be NULL
 *
 * The current option state (mount_opt, compress_type, max_inline,
 * alloc_start, thread_pool_size, metadata_ratio and sb->s_flags) is saved
 * up front.  If anything fails, the "restore" path puts these values back
 * before returning the error.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *root = fs_info->tree_root;
	unsigned old_flags = sb->s_flags;
	unsigned long old_opts = fs_info->mount_opt;
	unsigned long old_compress_type = fs_info->compress_type;
	u64 old_max_inline = fs_info->max_inline;
	u64 old_alloc_start = fs_info->alloc_start;
	int old_thread_pool_size = fs_info->thread_pool_size;
	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
	int ret;

	sync_filesystem(sb);
	btrfs_remount_prepare(fs_info);

	if (data) {
		struct security_mnt_opts new_sec_opts;

		security_init_mnt_opts(&new_sec_opts);
		ret = parse_security_options(data, &new_sec_opts);
		if (ret)
			goto restore;
		ret = setup_security_options(fs_info, sb,
					     &new_sec_opts);
		if (ret) {
			security_free_mnt_opts(&new_sec_opts);
			goto restore;
		}
	}

	/* any option parsing failure is reported as -EINVAL */
	ret = btrfs_parse_options(root, data);
	if (ret) {
		ret = -EINVAL;
		goto restore;
	}

	btrfs_remount_begin(fs_info, old_opts, *flags);
	btrfs_resize_thread_pool(fs_info,
		fs_info->thread_pool_size, old_thread_pool_size);

	/* read-only state is not changing: only the options were updated */
	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
		goto out;

	if (*flags & MS_RDONLY) {
		/* read-write -> read-only */
		/*
		 * this also happens on 'umount -rf' or on shutdown, when
		 * the filesystem is busy.
		 */
		cancel_work_sync(&fs_info->async_reclaim_work);

		/* wait for the uuid_scan task to finish */
		down(&fs_info->uuid_tree_rescan_sem);
		/* avoid complains from lockdep et al. */
		up(&fs_info->uuid_tree_rescan_sem);

		/*
		 * MS_RDONLY is set before the commit below; the restore path
		 * deliberately keeps it set if the commit fails.
		 */
		sb->s_flags |= MS_RDONLY;

		btrfs_dev_replace_suspend_for_unmount(fs_info);
		btrfs_scrub_cancel(fs_info);
		btrfs_pause_balance(fs_info);

		ret = btrfs_commit_super(root);
		if (ret)
			goto restore;
	} else {
		/* read-only -> read-write */
		if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
			btrfs_err(fs_info,
				"Remounting read-write after error is not allowed");
			ret = -EINVAL;
			goto restore;
		}
		if (fs_info->fs_devices->rw_devices == 0) {
			ret = -EACCES;
			goto restore;
		}

		if (fs_info->fs_devices->missing_devices >
		     fs_info->num_tolerated_disk_barrier_failures &&
		    !(*flags & MS_RDONLY)) {
			btrfs_warn(fs_info,
				"too many missing devices, writeable remount is not allowed");
			ret = -EACCES;
			goto restore;
		}

		/* refuse to go read-write while the super block has a log root */
		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
			ret = -EINVAL;
			goto restore;
		}

		ret = btrfs_cleanup_fs_roots(fs_info);
		if (ret)
			goto restore;

		/* recover relocation */
		mutex_lock(&fs_info->cleaner_mutex);
		ret = btrfs_recover_relocation(root);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret)
			goto restore;

		ret = btrfs_resume_balance_async(fs_info);
		if (ret)
			goto restore;

		ret = btrfs_resume_dev_replace_async(fs_info);
		if (ret) {
			btrfs_warn(fs_info, "failed to resume dev_replace");
			goto restore;
		}

		if (!fs_info->uuid_root) {
			btrfs_info(fs_info, "creating UUID tree");
			ret = btrfs_create_uuid_tree(fs_info);
			if (ret) {
				btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
				goto restore;
			}
		}
		/* only now, with everything above done, drop MS_RDONLY */
		sb->s_flags &= ~MS_RDONLY;
	}
out:
	wake_up_process(fs_info->transaction_kthread);
	btrfs_remount_cleanup(fs_info, old_opts);
	return 0;

restore:
	/* We've hit an error - don't reset MS_RDONLY */
	if (sb->s_flags & MS_RDONLY)
		old_flags |= MS_RDONLY;
	sb->s_flags = old_flags;
	fs_info->mount_opt = old_opts;
	fs_info->compress_type = old_compress_type;
	fs_info->max_inline = old_max_inline;
	/* alloc_start is written back under chunk_mutex */
	mutex_lock(&fs_info->chunk_mutex);
	fs_info->alloc_start = old_alloc_start;
	mutex_unlock(&fs_info->chunk_mutex);
	btrfs_resize_thread_pool(fs_info,
		old_thread_pool_size, fs_info->thread_pool_size);
	fs_info->metadata_ratio = old_metadata_ratio;
	btrfs_remount_cleanup(fs_info, old_opts);
	return ret;
}
1628
1629/* Used to sort the devices by max_avail(descending sort) */
1630static int btrfs_cmp_device_free_bytes(const void *dev_info1,
1631				       const void *dev_info2)
1632{
1633	if (((struct btrfs_device_info *)dev_info1)->max_avail >
1634	    ((struct btrfs_device_info *)dev_info2)->max_avail)
1635		return -1;
1636	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
1637		 ((struct btrfs_device_info *)dev_info2)->max_avail)
1638		return 1;
1639	else
1640	return 0;
1641}
1642
1643/*
1644 * sort the devices by max_avail, in which max free extent size of each device
1645 * is stored.(Descending Sort)
1646 */
1647static inline void btrfs_descending_sort_devices(
1648					struct btrfs_device_info *devices,
1649					size_t nr_devices)
1650{
1651	sort(devices, nr_devices, sizeof(struct btrfs_device_info),
1652	     btrfs_cmp_device_free_bytes, NULL);
1653}
1654
1655/*
1656 * The helper to calc the free space on the devices that can be used to store
1657 * file data.
1658 */
1659static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1660{
1661	struct btrfs_fs_info *fs_info = root->fs_info;
1662	struct btrfs_device_info *devices_info;
1663	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1664	struct btrfs_device *device;
1665	u64 skip_space;
1666	u64 type;
1667	u64 avail_space;
1668	u64 used_space;
1669	u64 min_stripe_size;
1670	int min_stripes = 1, num_stripes = 1;
1671	int i = 0, nr_devices;
1672	int ret;
1673
1674	/*
1675	 * We aren't under the device list lock, so this is racey-ish, but good
1676	 * enough for our purposes.
1677	 */
1678	nr_devices = fs_info->fs_devices->open_devices;
1679	if (!nr_devices) {
1680		smp_mb();
1681		nr_devices = fs_info->fs_devices->open_devices;
1682		ASSERT(nr_devices);
1683		if (!nr_devices) {
1684			*free_bytes = 0;
1685			return 0;
1686		}
1687	}
1688
1689	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1690			       GFP_NOFS);
1691	if (!devices_info)
1692		return -ENOMEM;
1693
1694	/* calc min stripe number for data space alloction */
1695	type = btrfs_get_alloc_profile(root, 1);
1696	if (type & BTRFS_BLOCK_GROUP_RAID0) {
1697		min_stripes = 2;
1698		num_stripes = nr_devices;
1699	} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
1700		min_stripes = 2;
1701		num_stripes = 2;
1702	} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
1703		min_stripes = 4;
1704		num_stripes = 4;
1705	}
1706
1707	if (type & BTRFS_BLOCK_GROUP_DUP)
1708		min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1709	else
1710		min_stripe_size = BTRFS_STRIPE_LEN;
1711
1712	if (fs_info->alloc_start)
1713		mutex_lock(&fs_devices->device_list_mutex);
1714	rcu_read_lock();
1715	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1716		if (!device->in_fs_metadata || !device->bdev ||
1717		    device->is_tgtdev_for_dev_replace)
1718			continue;
1719
1720		if (i >= nr_devices)
1721			break;
1722
1723		avail_space = device->total_bytes - device->bytes_used;
1724
1725		/* align with stripe_len */
1726		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
1727		avail_space *= BTRFS_STRIPE_LEN;
1728
1729		/*
1730		 * In order to avoid overwritting the superblock on the drive,
1731		 * btrfs starts at an offset of at least 1MB when doing chunk
1732		 * allocation.
1733		 */
1734		skip_space = 1024 * 1024;
1735
1736		/* user can set the offset in fs_info->alloc_start. */
1737		if (fs_info->alloc_start &&
1738		    fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1739		    device->total_bytes) {
1740			rcu_read_unlock();
1741			skip_space = max(fs_info->alloc_start, skip_space);
1742
1743			/*
1744			 * btrfs can not use the free space in
1745			 * [0, skip_space - 1], we must subtract it from the
1746			 * total. In order to implement it, we account the used
1747			 * space in this range first.
1748			 */
1749			ret = btrfs_account_dev_extents_size(device, 0,
1750							     skip_space - 1,
1751							     &used_space);
1752			if (ret) {
1753				kfree(devices_info);
1754				mutex_unlock(&fs_devices->device_list_mutex);
1755				return ret;
1756			}
1757
1758			rcu_read_lock();
1759
1760			/* calc the free space in [0, skip_space - 1] */
1761			skip_space -= used_space;
1762		}
1763
1764		/*
1765		 * we can use the free space in [0, skip_space - 1], subtract
1766		 * it from the total.
1767		 */
1768		if (avail_space && avail_space >= skip_space)
1769			avail_space -= skip_space;
1770		else
1771			avail_space = 0;
1772
1773		if (avail_space < min_stripe_size)
1774			continue;
1775
1776		devices_info[i].dev = device;
1777		devices_info[i].max_avail = avail_space;
1778
1779		i++;
1780	}
1781	rcu_read_unlock();
1782	if (fs_info->alloc_start)
1783		mutex_unlock(&fs_devices->device_list_mutex);
1784
1785	nr_devices = i;
1786
1787	btrfs_descending_sort_devices(devices_info, nr_devices);
1788
1789	i = nr_devices - 1;
1790	avail_space = 0;
1791	while (nr_devices >= min_stripes) {
1792		if (num_stripes > nr_devices)
1793			num_stripes = nr_devices;
1794
1795		if (devices_info[i].max_avail >= min_stripe_size) {
1796			int j;
1797			u64 alloc_size;
1798
1799			avail_space += devices_info[i].max_avail * num_stripes;
1800			alloc_size = devices_info[i].max_avail;
1801			for (j = i + 1 - num_stripes; j <= i; j++)
1802				devices_info[j].max_avail -= alloc_size;
1803		}
1804		i--;
1805		nr_devices--;
1806	}
1807
1808	kfree(devices_info);
1809	*free_bytes = avail_space;
1810	return 0;
1811}
1812
1813/*
1814 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
1815 *
1816 * If there's a redundant raid level at DATA block groups, use the respective
1817 * multiplier to scale the sizes.
1818 *
1819 * Unused device space usage is based on simulating the chunk allocator
1820 * algorithm that respects the device sizes, order of allocations and the
1821 * 'alloc_start' value, this is a close approximation of the actual use but
1822 * there are other factors that may change the result (like a new metadata
1823 * chunk).
1824 *
1825 * If metadata is exhausted, f_bavail will be 0.
1826 *
1827 * FIXME: not accurate for mixed block groups, total and free/used are ok,
1828 * available appears slightly larger.
1829 */
1830static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1831{
1832	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1833	struct btrfs_super_block *disk_super = fs_info->super_copy;
1834	struct list_head *head = &fs_info->space_info;
1835	struct btrfs_space_info *found;
1836	u64 total_used = 0;
1837	u64 total_free_data = 0;
1838	u64 total_free_meta = 0;
1839	int bits = dentry->d_sb->s_blocksize_bits;
1840	__be32 *fsid = (__be32 *)fs_info->fsid;
1841	unsigned factor = 1;
1842	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1843	int ret;
1844	u64 thresh = 0;
1845
1846	/*
1847	 * holding chunk_muext to avoid allocating new chunks, holding
1848	 * device_list_mutex to avoid the device being removed
1849	 */
1850	rcu_read_lock();
1851	list_for_each_entry_rcu(found, head, list) {
1852		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1853			int i;
1854
1855			total_free_data += found->disk_total - found->disk_used;
1856			total_free_data -=
1857				btrfs_account_ro_block_groups_free_space(found);
1858
1859			for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1860				if (!list_empty(&found->block_groups[i])) {
1861					switch (i) {
1862					case BTRFS_RAID_DUP:
1863					case BTRFS_RAID_RAID1:
1864					case BTRFS_RAID_RAID10:
1865						factor = 2;
1866					}
1867				}
1868			}
1869		}
1870		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
1871			total_free_meta += found->disk_total - found->disk_used;
1872
1873		total_used += found->disk_used;
1874	}
1875
1876	rcu_read_unlock();
1877
1878	buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
1879	buf->f_blocks >>= bits;
1880	buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
1881
1882	/* Account global block reserve as used, it's in logical size already */
1883	spin_lock(&block_rsv->lock);
1884	buf->f_bfree -= block_rsv->size >> bits;
1885	spin_unlock(&block_rsv->lock);
1886
1887	buf->f_bavail = div_u64(total_free_data, factor);
1888	ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1889	if (ret)
1890		return ret;
1891	buf->f_bavail += div_u64(total_free_data, factor);
1892	buf->f_bavail = buf->f_bavail >> bits;
1893
1894	/*
1895	 * We calculate the remaining metadata space minus global reserve. If
1896	 * this is (supposedly) smaller than zero, there's no space. But this
1897	 * does not hold in practice, the exhausted state happens where's still
1898	 * some positive delta. So we apply some guesswork and compare the
1899	 * delta to a 4M threshold.  (Practically observed delta was ~2M.)
1900	 *
1901	 * We probably cannot calculate the exact threshold value because this
1902	 * depends on the internal reservations requested by various
1903	 * operations, so some operations that consume a few metadata will
1904	 * succeed even if the Avail is zero. But this is better than the other
1905	 * way around.
1906	 */
1907	thresh = 4 * 1024 * 1024;
1908
1909	if (total_free_meta - thresh < block_rsv->size)
1910		buf->f_bavail = 0;
1911
1912	buf->f_type = BTRFS_SUPER_MAGIC;
1913	buf->f_bsize = dentry->d_sb->s_blocksize;
1914	buf->f_namelen = BTRFS_NAME_LEN;
1915
1916	/* We treat it as constant endianness (it doesn't matter _which_)
1917	   because we want the fsid to come out the same whether mounted
1918	   on a big-endian or little-endian host */
1919	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
1920	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
1921	/* Mask in the root object ID too, to disambiguate subvols */
1922	buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
1923	buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
1924
1925	return 0;
1926}
1927
/*
 * ->kill_sb callback: tear down the anonymous super block and then free the
 * btrfs_fs_info that was attached to it.
 */
static void btrfs_kill_super(struct super_block *sb)
{
	/* look the fs_info up before the super block is torn down */
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	kill_anon_super(sb);
	free_fs_info(fs_info);
}
1934
/*
 * The "btrfs" filesystem type: mounts go through btrfs_mount() and super
 * blocks are destroyed by btrfs_kill_super().
 */
static struct file_system_type btrfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "btrfs",
	.mount		= btrfs_mount,
	.kill_sb	= btrfs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};
MODULE_ALIAS_FS("btrfs");
1943
/*
 * ->open handler of the btrfs control device (see btrfs_ctl_fops).  It only
 * resets file->private_data and always returns 0.
 */
static int btrfs_control_open(struct inode *inode, struct file *file)
{
	/*
	 * The control file's private_data is used to hold the
	 * transaction when it is started and is used to keep
	 * track of whether a transaction is already in progress.
	 */
	file->private_data = NULL;
	return 0;
}
1954
1955/*
1956 * used by btrfsctl to scan devices when no FS is mounted
1957 */
1958static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1959				unsigned long arg)
1960{
1961	struct btrfs_ioctl_vol_args *vol;
1962	struct btrfs_fs_devices *fs_devices;
1963	int ret = -ENOTTY;
1964
1965	if (!capable(CAP_SYS_ADMIN))
1966		return -EPERM;
1967
1968	vol = memdup_user((void __user *)arg, sizeof(*vol));
1969	if (IS_ERR(vol))
1970		return PTR_ERR(vol);
1971
1972	switch (cmd) {
1973	case BTRFS_IOC_SCAN_DEV:
1974		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1975					    &btrfs_fs_type, &fs_devices);
1976		break;
1977	case BTRFS_IOC_DEVICES_READY:
1978		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
1979					    &btrfs_fs_type, &fs_devices);
1980		if (ret)
1981			break;
1982		ret = !(fs_devices->num_devices == fs_devices->total_devices);
1983		break;
1984	}
1985
1986	kfree(vol);
1987	return ret;
1988}
1989
1990static int btrfs_freeze(struct super_block *sb)
1991{
1992	struct btrfs_trans_handle *trans;
1993	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
1994
1995	trans = btrfs_attach_transaction_barrier(root);
1996	if (IS_ERR(trans)) {
1997		/* no transaction, don't bother */
1998		if (PTR_ERR(trans) == -ENOENT)
1999			return 0;
2000		return PTR_ERR(trans);
2001	}
2002	return btrfs_commit_transaction(trans, root);
2003}
2004
2005static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
2006{
2007	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
2008	struct btrfs_fs_devices *cur_devices;
2009	struct btrfs_device *dev, *first_dev = NULL;
2010	struct list_head *head;
2011	struct rcu_string *name;
2012
2013	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2014	cur_devices = fs_info->fs_devices;
2015	while (cur_devices) {
2016		head = &cur_devices->devices;
2017		list_for_each_entry(dev, head, dev_list) {
2018			if (dev->missing)
2019				continue;
2020			if (!dev->name)
2021				continue;
2022			if (!first_dev || dev->devid < first_dev->devid)
2023				first_dev = dev;
2024		}
2025		cur_devices = cur_devices->seed;
2026	}
2027
2028	if (first_dev) {
2029		rcu_read_lock();
2030		name = rcu_dereference(first_dev->name);
2031		seq_escape(m, name->str, " \t\n\\");
2032		rcu_read_unlock();
2033	} else {
2034		WARN_ON(1);
2035	}
2036	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2037	return 0;
2038}
2039
/*
 * Super block operations for btrfs.  sync_fs, show_options, show_devname,
 * statfs, remount_fs and freeze_fs point at functions named btrfs_* in this
 * file; the inode callbacks and put_super are referenced by name only here.
 */
static const struct super_operations btrfs_super_ops = {
	.drop_inode	= btrfs_drop_inode,
	.evict_inode	= btrfs_evict_inode,
	.put_super	= btrfs_put_super,
	.sync_fs	= btrfs_sync_fs,
	.show_options	= btrfs_show_options,
	.show_devname	= btrfs_show_devname,
	.write_inode	= btrfs_write_inode,
	.alloc_inode	= btrfs_alloc_inode,
	.destroy_inode	= btrfs_destroy_inode,
	.statfs		= btrfs_statfs,
	.remount_fs	= btrfs_remount,
	.freeze_fs	= btrfs_freeze,
};
2054
/*
 * File operations of the btrfs control device.  The same handler serves
 * both the native and the compat ioctl entry points; seeking is a no-op.
 */
static const struct file_operations btrfs_ctl_fops = {
	.open = btrfs_control_open,
	.unlocked_ioctl	 = btrfs_control_ioctl,
	.compat_ioctl = btrfs_control_ioctl,
	.owner	 = THIS_MODULE,
	.llseek = noop_llseek,
};
2062
/*
 * The "btrfs-control" misc device, registered by btrfs_interface_init() and
 * driven by btrfs_ctl_fops.
 */
static struct miscdevice btrfs_misc = {
	.minor		= BTRFS_MINOR,
	.name		= "btrfs-control",
	.fops		= &btrfs_ctl_fops
};

/* module aliases for the control device, by misc minor and by device name */
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");
2071
/*
 * Register the btrfs control misc device.  Returns the result of
 * misc_register().
 */
static int btrfs_interface_init(void)
{
	return misc_register(&btrfs_misc);
}
2076
2077static void btrfs_interface_exit(void)
2078{
2079	if (misc_deregister(&btrfs_misc) < 0)
2080		printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
2081}
2082
/*
 * Log a single "Btrfs loaded" line.  The message is one string literal
 * assembled at compile time: a ", <feature>=on" fragment is appended for
 * each of the debug, assert and integrity-checker config options that is
 * enabled.
 */
static void btrfs_print_info(void)
{
	printk(KERN_INFO "Btrfs loaded"
#ifdef CONFIG_BTRFS_DEBUG
			", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
			", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			", integrity-checker=on"
#endif
			"\n");
}
2097
/*
 * Run the btrfs self tests in a fixed order, stopping at the first one
 * that fails.
 *
 * If setting up the test filesystem fails, that error is returned right
 * away and nothing is torn down.  Otherwise the test filesystem is always
 * destroyed before returning, and the return value is 0 or the result of
 * the first failing test.
 */
static int btrfs_run_sanity_tests(void)
{
	int ret;

	ret = btrfs_init_test_fs();
	if (ret)
		return ret;

	/* Each test only runs when everything before it succeeded. */
	ret = btrfs_test_free_space_cache();
	if (!ret)
		ret = btrfs_test_extent_buffer_operations();
	if (!ret)
		ret = btrfs_test_extent_io();
	if (!ret)
		ret = btrfs_test_inodes();
	if (!ret)
		ret = btrfs_test_qgroups();

	btrfs_destroy_test_fs();
	return ret;
}
2123
2124static int __init init_btrfs_fs(void)
2125{
2126	int err;
2127
2128	err = btrfs_hash_init();
2129	if (err)
2130		return err;
2131
2132	btrfs_props_init();
2133
2134	err = btrfs_init_sysfs();
2135	if (err)
2136		goto free_hash;
2137
2138	btrfs_init_compress();
2139
2140	err = btrfs_init_cachep();
2141	if (err)
2142		goto free_compress;
2143
2144	err = extent_io_init();
2145	if (err)
2146		goto free_cachep;
2147
2148	err = extent_map_init();
2149	if (err)
2150		goto free_extent_io;
2151
2152	err = ordered_data_init();
2153	if (err)
2154		goto free_extent_map;
2155
2156	err = btrfs_delayed_inode_init();
2157	if (err)
2158		goto free_ordered_data;
2159
2160	err = btrfs_auto_defrag_init();
2161	if (err)
2162		goto free_delayed_inode;
2163
2164	err = btrfs_delayed_ref_init();
2165	if (err)
2166		goto free_auto_defrag;
2167
2168	err = btrfs_prelim_ref_init();
2169	if (err)
2170		goto free_delayed_ref;
2171
2172	err = btrfs_end_io_wq_init();
2173	if (err)
2174		goto free_prelim_ref;
2175
2176	err = btrfs_interface_init();
2177	if (err)
2178		goto free_end_io_wq;
2179
2180	btrfs_init_lockdep();
2181
2182	btrfs_print_info();
2183
2184	err = btrfs_run_sanity_tests();
2185	if (err)
2186		goto unregister_ioctl;
2187
2188	err = register_filesystem(&btrfs_fs_type);
2189	if (err)
2190		goto unregister_ioctl;
2191
2192	return 0;
2193
2194unregister_ioctl:
2195	btrfs_interface_exit();
2196free_end_io_wq:
2197	btrfs_end_io_wq_exit();
2198free_prelim_ref:
2199	btrfs_prelim_ref_exit();
2200free_delayed_ref:
2201	btrfs_delayed_ref_exit();
2202free_auto_defrag:
2203	btrfs_auto_defrag_exit();
2204free_delayed_inode:
2205	btrfs_delayed_inode_exit();
2206free_ordered_data:
2207	ordered_data_exit();
2208free_extent_map:
2209	extent_map_exit();
2210free_extent_io:
2211	extent_io_exit();
2212free_cachep:
2213	btrfs_destroy_cachep();
2214free_compress:
2215	btrfs_exit_compress();
2216	btrfs_exit_sysfs();
2217free_hash:
2218	btrfs_hash_exit();
2219	return err;
2220}
2221
2222static void __exit exit_btrfs_fs(void)
2223{
2224	btrfs_destroy_cachep();
2225	btrfs_delayed_ref_exit();
2226	btrfs_auto_defrag_exit();
2227	btrfs_delayed_inode_exit();
2228	btrfs_prelim_ref_exit();
2229	ordered_data_exit();
2230	extent_map_exit();
2231	extent_io_exit();
2232	btrfs_interface_exit();
2233	btrfs_end_io_wq_exit();
2234	unregister_filesystem(&btrfs_fs_type);
2235	btrfs_exit_sysfs();
2236	btrfs_cleanup_fs_uuids();
2237	btrfs_exit_compress();
2238	btrfs_hash_exit();
2239}
2240
/*
 * Hook the module entry points into the kernel: init_btrfs_fs() is
 * registered through late_initcall() and exit_btrfs_fs() through
 * module_exit().
 * NOTE(review): why late_initcall() is used rather than module_init() is
 * not visible in this part of the file - confirm before changing it.
 */
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)

MODULE_LICENSE("GPL");
2245