1/*
2 *  linux/fs/ext4/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 *  from
10 *
11 *  linux/fs/minix/inode.c
12 *
13 *  Copyright (C) 1991, 1992  Linus Torvalds
14 *
15 *  Big-endian to little-endian byte-swapping/bitmaps by
16 *        David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/vmalloc.h>
24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/blkdev.h>
27#include <linux/parser.h>
28#include <linux/buffer_head.h>
29#include <linux/exportfs.h>
30#include <linux/vfs.h>
31#include <linux/random.h>
32#include <linux/mount.h>
33#include <linux/namei.h>
34#include <linux/quotaops.h>
35#include <linux/seq_file.h>
36#include <linux/proc_fs.h>
37#include <linux/ctype.h>
38#include <linux/log2.h>
39#include <linux/crc16.h>
40#include <linux/cleancache.h>
41#include <asm/uaccess.h>
42
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45
46#include "ext4.h"
47#include "ext4_extents.h"	/* Needed for trace points definition */
48#include "ext4_jbd2.h"
49#include "xattr.h"
50#include "acl.h"
51#include "mballoc.h"
52
53#define CREATE_TRACE_POINTS
54#include <trace/events/ext4.h>
55
56static struct proc_dir_entry *ext4_proc_root;
57static struct kset *ext4_kset;
58static struct ext4_lazy_init *ext4_li_info;
59static struct mutex ext4_li_mtx;
60static struct ext4_features *ext4_feat;
61static int ext4_mballoc_ready;
62
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64			     unsigned long journal_devnum);
65static int ext4_show_options(struct seq_file *seq, struct dentry *root);
66static int ext4_commit_super(struct super_block *sb, int sync);
67static void ext4_mark_recovery_complete(struct super_block *sb,
68					struct ext4_super_block *es);
69static void ext4_clear_journal_err(struct super_block *sb,
70				   struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77		       const char *dev_name, void *data);
78static inline int ext2_feature_set_ok(struct super_block *sb);
79static inline int ext3_feature_set_ok(struct super_block *sb);
80static int ext4_feature_set_ok(struct super_block *sb, int readonly);
81static void ext4_destroy_lazyinit_thread(void);
82static void ext4_unregister_li_request(struct super_block *sb);
83static void ext4_clear_request_list(void);
84static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88	.owner		= THIS_MODULE,
89	.name		= "ext2",
90	.mount		= ext4_mount,
91	.kill_sb	= kill_block_super,
92	.fs_flags	= FS_REQUIRES_DEV,
93};
94MODULE_ALIAS_FS("ext2");
95MODULE_ALIAS("ext2");
96#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
97#else
98#define IS_EXT2_SB(sb) (0)
99#endif
100
101
102#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
103static struct file_system_type ext3_fs_type = {
104	.owner		= THIS_MODULE,
105	.name		= "ext3",
106	.mount		= ext4_mount,
107	.kill_sb	= kill_block_super,
108	.fs_flags	= FS_REQUIRES_DEV,
109};
110MODULE_ALIAS_FS("ext3");
111MODULE_ALIAS("ext3");
112#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
113#else
114#define IS_EXT3_SB(sb) (0)
115#endif
116
117static int ext4_verify_csum_type(struct super_block *sb,
118				 struct ext4_super_block *es)
119{
120	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
121					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
122		return 1;
123
124	return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
125}
126
127static __le32 ext4_superblock_csum(struct super_block *sb,
128				   struct ext4_super_block *es)
129{
130	struct ext4_sb_info *sbi = EXT4_SB(sb);
131	int offset = offsetof(struct ext4_super_block, s_checksum);
132	__u32 csum;
133
134	csum = ext4_chksum(sbi, ~0, (char *)es, offset);
135
136	return cpu_to_le32(csum);
137}
138
139static int ext4_superblock_csum_verify(struct super_block *sb,
140				       struct ext4_super_block *es)
141{
142	if (!ext4_has_metadata_csum(sb))
143		return 1;
144
145	return es->s_checksum == ext4_superblock_csum(sb, es);
146}
147
148void ext4_superblock_csum_set(struct super_block *sb)
149{
150	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
151
152	if (!ext4_has_metadata_csum(sb))
153		return;
154
155	es->s_checksum = ext4_superblock_csum(sb, es);
156}
157
158void *ext4_kvmalloc(size_t size, gfp_t flags)
159{
160	void *ret;
161
162	ret = kmalloc(size, flags | __GFP_NOWARN);
163	if (!ret)
164		ret = __vmalloc(size, flags, PAGE_KERNEL);
165	return ret;
166}
167
168void *ext4_kvzalloc(size_t size, gfp_t flags)
169{
170	void *ret;
171
172	ret = kzalloc(size, flags | __GFP_NOWARN);
173	if (!ret)
174		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
175	return ret;
176}
177
178ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
179			       struct ext4_group_desc *bg)
180{
181	return le32_to_cpu(bg->bg_block_bitmap_lo) |
182		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
183		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
184}
185
186ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
187			       struct ext4_group_desc *bg)
188{
189	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
190		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
191		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
192}
193
194ext4_fsblk_t ext4_inode_table(struct super_block *sb,
195			      struct ext4_group_desc *bg)
196{
197	return le32_to_cpu(bg->bg_inode_table_lo) |
198		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
199		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
200}
201
202__u32 ext4_free_group_clusters(struct super_block *sb,
203			       struct ext4_group_desc *bg)
204{
205	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
206		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
207		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
208}
209
210__u32 ext4_free_inodes_count(struct super_block *sb,
211			      struct ext4_group_desc *bg)
212{
213	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
214		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
215		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
216}
217
218__u32 ext4_used_dirs_count(struct super_block *sb,
219			      struct ext4_group_desc *bg)
220{
221	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
222		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
223		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
224}
225
226__u32 ext4_itable_unused_count(struct super_block *sb,
227			      struct ext4_group_desc *bg)
228{
229	return le16_to_cpu(bg->bg_itable_unused_lo) |
230		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
231		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
232}
233
234void ext4_block_bitmap_set(struct super_block *sb,
235			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
236{
237	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
238	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
239		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
240}
241
242void ext4_inode_bitmap_set(struct super_block *sb,
243			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
244{
245	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
246	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
247		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
248}
249
250void ext4_inode_table_set(struct super_block *sb,
251			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
252{
253	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
254	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
255		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
256}
257
258void ext4_free_group_clusters_set(struct super_block *sb,
259				  struct ext4_group_desc *bg, __u32 count)
260{
261	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
262	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
263		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
264}
265
266void ext4_free_inodes_set(struct super_block *sb,
267			  struct ext4_group_desc *bg, __u32 count)
268{
269	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
270	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
271		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
272}
273
274void ext4_used_dirs_set(struct super_block *sb,
275			  struct ext4_group_desc *bg, __u32 count)
276{
277	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
278	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
279		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
280}
281
282void ext4_itable_unused_set(struct super_block *sb,
283			  struct ext4_group_desc *bg, __u32 count)
284{
285	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
286	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
287		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
288}
289
290
291static void __save_error_info(struct super_block *sb, const char *func,
292			    unsigned int line)
293{
294	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
295
296	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
297	if (bdev_read_only(sb->s_bdev))
298		return;
299	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
300	es->s_last_error_time = cpu_to_le32(get_seconds());
301	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
302	es->s_last_error_line = cpu_to_le32(line);
303	if (!es->s_first_error_time) {
304		es->s_first_error_time = es->s_last_error_time;
305		strncpy(es->s_first_error_func, func,
306			sizeof(es->s_first_error_func));
307		es->s_first_error_line = cpu_to_le32(line);
308		es->s_first_error_ino = es->s_last_error_ino;
309		es->s_first_error_block = es->s_last_error_block;
310	}
311	/*
312	 * Start the daily error reporting function if it hasn't been
313	 * started already
314	 */
315	if (!es->s_error_count)
316		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
317	le32_add_cpu(&es->s_error_count, 1);
318}
319
320static void save_error_info(struct super_block *sb, const char *func,
321			    unsigned int line)
322{
323	__save_error_info(sb, func, line);
324	ext4_commit_super(sb, 1);
325}
326
327/*
328 * The del_gendisk() function uninitializes the disk-specific data
329 * structures, including the bdi structure, without telling anyone
330 * else.  Once this happens, any attempt to call mark_buffer_dirty()
331 * (for example, by ext4_commit_super), will cause a kernel OOPS.
332 * This is a kludge to prevent these oops until we can put in a proper
333 * hook in del_gendisk() to inform the VFS and file system layers.
334 */
335static int block_device_ejected(struct super_block *sb)
336{
337	struct inode *bd_inode = sb->s_bdev->bd_inode;
338	struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
339
340	return bdi->dev == NULL;
341}
342
343static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
344{
345	struct super_block		*sb = journal->j_private;
346	struct ext4_sb_info		*sbi = EXT4_SB(sb);
347	int				error = is_journal_aborted(journal);
348	struct ext4_journal_cb_entry	*jce;
349
350	BUG_ON(txn->t_state == T_FINISHED);
351	spin_lock(&sbi->s_md_lock);
352	while (!list_empty(&txn->t_private_list)) {
353		jce = list_entry(txn->t_private_list.next,
354				 struct ext4_journal_cb_entry, jce_list);
355		list_del_init(&jce->jce_list);
356		spin_unlock(&sbi->s_md_lock);
357		jce->jce_func(sb, jce, error);
358		spin_lock(&sbi->s_md_lock);
359	}
360	spin_unlock(&sbi->s_md_lock);
361}
362
363/* Deal with the reporting of failure conditions on a filesystem such as
364 * inconsistencies detected or read IO failures.
365 *
366 * On ext2, we can store the error state of the filesystem in the
367 * superblock.  That is not possible on ext4, because we may have other
368 * write ordering constraints on the superblock which prevent us from
369 * writing it out straight away; and given that the journal is about to
370 * be aborted, we can't rely on the current, or future, transactions to
371 * write out the superblock safely.
372 *
373 * We'll just use the jbd2_journal_abort() error code to record an error in
374 * the journal instead.  On recovery, the journal will complain about
375 * that error until we've noted it down and cleared it.
376 */
377
378static void ext4_handle_error(struct super_block *sb)
379{
380	if (sb->s_flags & MS_RDONLY)
381		return;
382
383	if (!test_opt(sb, ERRORS_CONT)) {
384		journal_t *journal = EXT4_SB(sb)->s_journal;
385
386		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
387		if (journal)
388			jbd2_journal_abort(journal, -EIO);
389	}
390	if (test_opt(sb, ERRORS_RO)) {
391		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
392		/*
393		 * Make sure updated value of ->s_mount_flags will be visible
394		 * before ->s_flags update
395		 */
396		smp_wmb();
397		sb->s_flags |= MS_RDONLY;
398	}
399	if (test_opt(sb, ERRORS_PANIC)) {
400		if (EXT4_SB(sb)->s_journal &&
401		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
402			return;
403		panic("EXT4-fs (device %s): panic forced after error\n",
404			sb->s_id);
405	}
406}
407
408#define ext4_error_ratelimit(sb)					\
409		___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),	\
410			     "EXT4-fs error")
411
412void __ext4_error(struct super_block *sb, const char *function,
413		  unsigned int line, const char *fmt, ...)
414{
415	struct va_format vaf;
416	va_list args;
417
418	if (ext4_error_ratelimit(sb)) {
419		va_start(args, fmt);
420		vaf.fmt = fmt;
421		vaf.va = &args;
422		printk(KERN_CRIT
423		       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
424		       sb->s_id, function, line, current->comm, &vaf);
425		va_end(args);
426	}
427	save_error_info(sb, function, line);
428	ext4_handle_error(sb);
429}
430
431void __ext4_error_inode(struct inode *inode, const char *function,
432			unsigned int line, ext4_fsblk_t block,
433			const char *fmt, ...)
434{
435	va_list args;
436	struct va_format vaf;
437	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
438
439	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
440	es->s_last_error_block = cpu_to_le64(block);
441	if (ext4_error_ratelimit(inode->i_sb)) {
442		va_start(args, fmt);
443		vaf.fmt = fmt;
444		vaf.va = &args;
445		if (block)
446			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
447			       "inode #%lu: block %llu: comm %s: %pV\n",
448			       inode->i_sb->s_id, function, line, inode->i_ino,
449			       block, current->comm, &vaf);
450		else
451			printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
452			       "inode #%lu: comm %s: %pV\n",
453			       inode->i_sb->s_id, function, line, inode->i_ino,
454			       current->comm, &vaf);
455		va_end(args);
456	}
457	save_error_info(inode->i_sb, function, line);
458	ext4_handle_error(inode->i_sb);
459}
460
461void __ext4_error_file(struct file *file, const char *function,
462		       unsigned int line, ext4_fsblk_t block,
463		       const char *fmt, ...)
464{
465	va_list args;
466	struct va_format vaf;
467	struct ext4_super_block *es;
468	struct inode *inode = file_inode(file);
469	char pathname[80], *path;
470
471	es = EXT4_SB(inode->i_sb)->s_es;
472	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
473	if (ext4_error_ratelimit(inode->i_sb)) {
474		path = d_path(&(file->f_path), pathname, sizeof(pathname));
475		if (IS_ERR(path))
476			path = "(unknown)";
477		va_start(args, fmt);
478		vaf.fmt = fmt;
479		vaf.va = &args;
480		if (block)
481			printk(KERN_CRIT
482			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
483			       "block %llu: comm %s: path %s: %pV\n",
484			       inode->i_sb->s_id, function, line, inode->i_ino,
485			       block, current->comm, path, &vaf);
486		else
487			printk(KERN_CRIT
488			       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
489			       "comm %s: path %s: %pV\n",
490			       inode->i_sb->s_id, function, line, inode->i_ino,
491			       current->comm, path, &vaf);
492		va_end(args);
493	}
494	save_error_info(inode->i_sb, function, line);
495	ext4_handle_error(inode->i_sb);
496}
497
498const char *ext4_decode_error(struct super_block *sb, int errno,
499			      char nbuf[16])
500{
501	char *errstr = NULL;
502
503	switch (errno) {
504	case -EIO:
505		errstr = "IO failure";
506		break;
507	case -ENOMEM:
508		errstr = "Out of memory";
509		break;
510	case -EROFS:
511		if (!sb || (EXT4_SB(sb)->s_journal &&
512			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
513			errstr = "Journal has aborted";
514		else
515			errstr = "Readonly filesystem";
516		break;
517	default:
518		/* If the caller passed in an extra buffer for unknown
519		 * errors, textualise them now.  Else we just return
520		 * NULL. */
521		if (nbuf) {
522			/* Check for truncated error codes... */
523			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
524				errstr = nbuf;
525		}
526		break;
527	}
528
529	return errstr;
530}
531
532/* __ext4_std_error decodes expected errors from journaling functions
533 * automatically and invokes the appropriate error response.  */
534
535void __ext4_std_error(struct super_block *sb, const char *function,
536		      unsigned int line, int errno)
537{
538	char nbuf[16];
539	const char *errstr;
540
541	/* Special case: if the error is EROFS, and we're not already
542	 * inside a transaction, then there's really no point in logging
543	 * an error. */
544	if (errno == -EROFS && journal_current_handle() == NULL &&
545	    (sb->s_flags & MS_RDONLY))
546		return;
547
548	if (ext4_error_ratelimit(sb)) {
549		errstr = ext4_decode_error(sb, errno, nbuf);
550		printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
551		       sb->s_id, function, line, errstr);
552	}
553
554	save_error_info(sb, function, line);
555	ext4_handle_error(sb);
556}
557
558/*
559 * ext4_abort is a much stronger failure handler than ext4_error.  The
560 * abort function may be used to deal with unrecoverable failures such
561 * as journal IO errors or ENOMEM at a critical moment in log management.
562 *
563 * We unconditionally force the filesystem into an ABORT|READONLY state,
564 * unless the error response on the fs has been set to panic in which
565 * case we take the easy way out and panic immediately.
566 */
567
568void __ext4_abort(struct super_block *sb, const char *function,
569		unsigned int line, const char *fmt, ...)
570{
571	va_list args;
572
573	save_error_info(sb, function, line);
574	va_start(args, fmt);
575	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
576	       function, line);
577	vprintk(fmt, args);
578	printk("\n");
579	va_end(args);
580
581	if ((sb->s_flags & MS_RDONLY) == 0) {
582		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
583		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
584		/*
585		 * Make sure updated value of ->s_mount_flags will be visible
586		 * before ->s_flags update
587		 */
588		smp_wmb();
589		sb->s_flags |= MS_RDONLY;
590		if (EXT4_SB(sb)->s_journal)
591			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
592		save_error_info(sb, function, line);
593	}
594	if (test_opt(sb, ERRORS_PANIC)) {
595		if (EXT4_SB(sb)->s_journal &&
596		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
597			return;
598		panic("EXT4-fs panic from previous error\n");
599	}
600}
601
602void __ext4_msg(struct super_block *sb,
603		const char *prefix, const char *fmt, ...)
604{
605	struct va_format vaf;
606	va_list args;
607
608	if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
609		return;
610
611	va_start(args, fmt);
612	vaf.fmt = fmt;
613	vaf.va = &args;
614	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
615	va_end(args);
616}
617
618void __ext4_warning(struct super_block *sb, const char *function,
619		    unsigned int line, const char *fmt, ...)
620{
621	struct va_format vaf;
622	va_list args;
623
624	if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
625			  "EXT4-fs warning"))
626		return;
627
628	va_start(args, fmt);
629	vaf.fmt = fmt;
630	vaf.va = &args;
631	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
632	       sb->s_id, function, line, &vaf);
633	va_end(args);
634}
635
636void __ext4_grp_locked_error(const char *function, unsigned int line,
637			     struct super_block *sb, ext4_group_t grp,
638			     unsigned long ino, ext4_fsblk_t block,
639			     const char *fmt, ...)
640__releases(bitlock)
641__acquires(bitlock)
642{
643	struct va_format vaf;
644	va_list args;
645	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
646
647	es->s_last_error_ino = cpu_to_le32(ino);
648	es->s_last_error_block = cpu_to_le64(block);
649	__save_error_info(sb, function, line);
650
651	if (ext4_error_ratelimit(sb)) {
652		va_start(args, fmt);
653		vaf.fmt = fmt;
654		vaf.va = &args;
655		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
656		       sb->s_id, function, line, grp);
657		if (ino)
658			printk(KERN_CONT "inode %lu: ", ino);
659		if (block)
660			printk(KERN_CONT "block %llu:",
661			       (unsigned long long) block);
662		printk(KERN_CONT "%pV\n", &vaf);
663		va_end(args);
664	}
665
666	if (test_opt(sb, ERRORS_CONT)) {
667		ext4_commit_super(sb, 0);
668		return;
669	}
670
671	ext4_unlock_group(sb, grp);
672	ext4_handle_error(sb);
673	/*
674	 * We only get here in the ERRORS_RO case; relocking the group
675	 * may be dangerous, but nothing bad will happen since the
676	 * filesystem will have already been marked read/only and the
677	 * journal has been aborted.  We return 1 as a hint to callers
678	 * who might what to use the return value from
679	 * ext4_grp_locked_error() to distinguish between the
680	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
681	 * aggressively from the ext4 function in question, with a
682	 * more appropriate error code.
683	 */
684	ext4_lock_group(sb, grp);
685	return;
686}
687
688void ext4_update_dynamic_rev(struct super_block *sb)
689{
690	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
691
692	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
693		return;
694
695	ext4_warning(sb,
696		     "updating to rev %d because of new feature flag, "
697		     "running e2fsck is recommended",
698		     EXT4_DYNAMIC_REV);
699
700	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
701	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
702	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
703	/* leave es->s_feature_*compat flags alone */
704	/* es->s_uuid will be set by e2fsck if empty */
705
706	/*
707	 * The rest of the superblock fields should be zero, and if not it
708	 * means they are likely already in use, so leave them alone.  We
709	 * can leave it up to e2fsck to clean up any inconsistencies there.
710	 */
711}
712
713/*
714 * Open the external journal device
715 */
716static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
717{
718	struct block_device *bdev;
719	char b[BDEVNAME_SIZE];
720
721	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
722	if (IS_ERR(bdev))
723		goto fail;
724	return bdev;
725
726fail:
727	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
728			__bdevname(dev, b), PTR_ERR(bdev));
729	return NULL;
730}
731
732/*
733 * Release the journal device
734 */
735static void ext4_blkdev_put(struct block_device *bdev)
736{
737	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
738}
739
740static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
741{
742	struct block_device *bdev;
743	bdev = sbi->journal_bdev;
744	if (bdev) {
745		ext4_blkdev_put(bdev);
746		sbi->journal_bdev = NULL;
747	}
748}
749
750static inline struct inode *orphan_list_entry(struct list_head *l)
751{
752	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
753}
754
755static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
756{
757	struct list_head *l;
758
759	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
760		 le32_to_cpu(sbi->s_es->s_last_orphan));
761
762	printk(KERN_ERR "sb_info orphan list:\n");
763	list_for_each(l, &sbi->s_orphan) {
764		struct inode *inode = orphan_list_entry(l);
765		printk(KERN_ERR "  "
766		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
767		       inode->i_sb->s_id, inode->i_ino, inode,
768		       inode->i_mode, inode->i_nlink,
769		       NEXT_ORPHAN(inode));
770	}
771}
772
773static void ext4_put_super(struct super_block *sb)
774{
775	struct ext4_sb_info *sbi = EXT4_SB(sb);
776	struct ext4_super_block *es = sbi->s_es;
777	int i, err;
778
779	ext4_unregister_li_request(sb);
780	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
781
782	flush_workqueue(sbi->rsv_conversion_wq);
783	destroy_workqueue(sbi->rsv_conversion_wq);
784
785	if (sbi->s_journal) {
786		err = jbd2_journal_destroy(sbi->s_journal);
787		sbi->s_journal = NULL;
788		if (err < 0)
789			ext4_abort(sb, "Couldn't clean up the journal");
790	}
791
792	ext4_es_unregister_shrinker(sbi);
793	del_timer_sync(&sbi->s_err_report);
794	ext4_release_system_zone(sb);
795	ext4_mb_release(sb);
796	ext4_ext_release(sb);
797	ext4_xattr_put_super(sb);
798
799	if (!(sb->s_flags & MS_RDONLY)) {
800		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
801		es->s_state = cpu_to_le16(sbi->s_mount_state);
802	}
803	if (!(sb->s_flags & MS_RDONLY))
804		ext4_commit_super(sb, 1);
805
806	if (sbi->s_proc) {
807		remove_proc_entry("options", sbi->s_proc);
808		remove_proc_entry(sb->s_id, ext4_proc_root);
809	}
810	kobject_del(&sbi->s_kobj);
811
812	for (i = 0; i < sbi->s_gdb_count; i++)
813		brelse(sbi->s_group_desc[i]);
814	kvfree(sbi->s_group_desc);
815	kvfree(sbi->s_flex_groups);
816	percpu_counter_destroy(&sbi->s_freeclusters_counter);
817	percpu_counter_destroy(&sbi->s_freeinodes_counter);
818	percpu_counter_destroy(&sbi->s_dirs_counter);
819	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
820	brelse(sbi->s_sbh);
821#ifdef CONFIG_QUOTA
822	for (i = 0; i < EXT4_MAXQUOTAS; i++)
823		kfree(sbi->s_qf_names[i]);
824#endif
825
826	/* Debugging code just in case the in-memory inode orphan list
827	 * isn't empty.  The on-disk one can be non-empty if we've
828	 * detected an error and taken the fs readonly, but the
829	 * in-memory list had better be clean by this point. */
830	if (!list_empty(&sbi->s_orphan))
831		dump_orphan_list(sb, sbi);
832	J_ASSERT(list_empty(&sbi->s_orphan));
833
834	sync_blockdev(sb->s_bdev);
835	invalidate_bdev(sb->s_bdev);
836	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
837		/*
838		 * Invalidate the journal device's buffers.  We don't want them
839		 * floating about in memory - the physical journal device may
840		 * hotswapped, and it breaks the `ro-after' testing code.
841		 */
842		sync_blockdev(sbi->journal_bdev);
843		invalidate_bdev(sbi->journal_bdev);
844		ext4_blkdev_remove(sbi);
845	}
846	if (sbi->s_mb_cache) {
847		ext4_xattr_destroy_cache(sbi->s_mb_cache);
848		sbi->s_mb_cache = NULL;
849	}
850	if (sbi->s_mmp_tsk)
851		kthread_stop(sbi->s_mmp_tsk);
852	sb->s_fs_info = NULL;
853	/*
854	 * Now that we are completely done shutting down the
855	 * superblock, we need to actually destroy the kobject.
856	 */
857	kobject_put(&sbi->s_kobj);
858	wait_for_completion(&sbi->s_kobj_unregister);
859	if (sbi->s_chksum_driver)
860		crypto_free_shash(sbi->s_chksum_driver);
861	kfree(sbi->s_blockgroup_lock);
862	kfree(sbi);
863}
864
865static struct kmem_cache *ext4_inode_cachep;
866
867/*
868 * Called inside transaction, so use GFP_NOFS
869 */
870static struct inode *ext4_alloc_inode(struct super_block *sb)
871{
872	struct ext4_inode_info *ei;
873
874	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
875	if (!ei)
876		return NULL;
877
878	ei->vfs_inode.i_version = 1;
879	spin_lock_init(&ei->i_raw_lock);
880	INIT_LIST_HEAD(&ei->i_prealloc_list);
881	spin_lock_init(&ei->i_prealloc_lock);
882	ext4_es_init_tree(&ei->i_es_tree);
883	rwlock_init(&ei->i_es_lock);
884	INIT_LIST_HEAD(&ei->i_es_list);
885	ei->i_es_all_nr = 0;
886	ei->i_es_shk_nr = 0;
887	ei->i_es_shrink_lblk = 0;
888	ei->i_reserved_data_blocks = 0;
889	ei->i_reserved_meta_blocks = 0;
890	ei->i_allocated_meta_blocks = 0;
891	ei->i_da_metadata_calc_len = 0;
892	ei->i_da_metadata_calc_last_lblock = 0;
893	spin_lock_init(&(ei->i_block_reservation_lock));
894#ifdef CONFIG_QUOTA
895	ei->i_reserved_quota = 0;
896	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
897#endif
898	ei->jinode = NULL;
899	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
900	spin_lock_init(&ei->i_completed_io_lock);
901	ei->i_sync_tid = 0;
902	ei->i_datasync_tid = 0;
903	atomic_set(&ei->i_ioend_count, 0);
904	atomic_set(&ei->i_unwritten, 0);
905	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
906#ifdef CONFIG_EXT4_FS_ENCRYPTION
907	ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
908#endif
909
910	return &ei->vfs_inode;
911}
912
913static int ext4_drop_inode(struct inode *inode)
914{
915	int drop = generic_drop_inode(inode);
916
917	trace_ext4_drop_inode(inode, drop);
918	return drop;
919}
920
921static void ext4_i_callback(struct rcu_head *head)
922{
923	struct inode *inode = container_of(head, struct inode, i_rcu);
924	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
925}
926
927static void ext4_destroy_inode(struct inode *inode)
928{
929	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
930		ext4_msg(inode->i_sb, KERN_ERR,
931			 "Inode %lu (%p): orphan list check failed!",
932			 inode->i_ino, EXT4_I(inode));
933		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
934				EXT4_I(inode), sizeof(struct ext4_inode_info),
935				true);
936		dump_stack();
937	}
938	call_rcu(&inode->i_rcu, ext4_i_callback);
939}
940
941static void init_once(void *foo)
942{
943	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
944
945	INIT_LIST_HEAD(&ei->i_orphan);
946	init_rwsem(&ei->xattr_sem);
947	init_rwsem(&ei->i_data_sem);
948	init_rwsem(&ei->i_mmap_sem);
949	inode_init_once(&ei->vfs_inode);
950}
951
952static int __init init_inodecache(void)
953{
954	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
955					     sizeof(struct ext4_inode_info),
956					     0, (SLAB_RECLAIM_ACCOUNT|
957						SLAB_MEM_SPREAD),
958					     init_once);
959	if (ext4_inode_cachep == NULL)
960		return -ENOMEM;
961	return 0;
962}
963
964static void destroy_inodecache(void)
965{
966	/*
967	 * Make sure all delayed rcu free inodes are flushed before we
968	 * destroy cache.
969	 */
970	rcu_barrier();
971	kmem_cache_destroy(ext4_inode_cachep);
972}
973
974void ext4_clear_inode(struct inode *inode)
975{
976	invalidate_inode_buffers(inode);
977	clear_inode(inode);
978	dquot_drop(inode);
979	ext4_discard_preallocations(inode);
980	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
981	if (EXT4_I(inode)->jinode) {
982		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
983					       EXT4_I(inode)->jinode);
984		jbd2_free_inode(EXT4_I(inode)->jinode);
985		EXT4_I(inode)->jinode = NULL;
986	}
987}
988
989static struct inode *ext4_nfs_get_inode(struct super_block *sb,
990					u64 ino, u32 generation)
991{
992	struct inode *inode;
993
994	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
995		return ERR_PTR(-ESTALE);
996	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
997		return ERR_PTR(-ESTALE);
998
999	/* iget isn't really right if the inode is currently unallocated!!
1000	 *
1001	 * ext4_read_inode will return a bad_inode if the inode had been
1002	 * deleted, so we should be safe.
1003	 *
1004	 * Currently we don't know the generation for parent directory, so
1005	 * a generation of 0 means "accept any"
1006	 */
1007	inode = ext4_iget_normal(sb, ino);
1008	if (IS_ERR(inode))
1009		return ERR_CAST(inode);
1010	if (generation && inode->i_generation != generation) {
1011		iput(inode);
1012		return ERR_PTR(-ESTALE);
1013	}
1014
1015	return inode;
1016}
1017
1018static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1019					int fh_len, int fh_type)
1020{
1021	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1022				    ext4_nfs_get_inode);
1023}
1024
1025static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1026					int fh_len, int fh_type)
1027{
1028	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1029				    ext4_nfs_get_inode);
1030}
1031
1032/*
1033 * Try to release metadata pages (indirect blocks, directories) which are
1034 * mapped via the block device.  Since these pages could have journal heads
1035 * which would prevent try_to_free_buffers() from freeing them, we must use
1036 * jbd2 layer's try_to_free_buffers() function to release them.
1037 */
1038static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
1039				 gfp_t wait)
1040{
1041	journal_t *journal = EXT4_SB(sb)->s_journal;
1042
1043	WARN_ON(PageChecked(page));
1044	if (!page_has_buffers(page))
1045		return 0;
1046	if (journal)
1047		return jbd2_journal_try_to_free_buffers(journal, page,
1048							wait & ~__GFP_WAIT);
1049	return try_to_free_buffers(page);
1050}
1051
1052#ifdef CONFIG_QUOTA
1053#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
1054#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
1055
1056static int ext4_write_dquot(struct dquot *dquot);
1057static int ext4_acquire_dquot(struct dquot *dquot);
1058static int ext4_release_dquot(struct dquot *dquot);
1059static int ext4_mark_dquot_dirty(struct dquot *dquot);
1060static int ext4_write_info(struct super_block *sb, int type);
1061static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1062			 struct path *path);
1063static int ext4_quota_off(struct super_block *sb, int type);
1064static int ext4_quota_on_mount(struct super_block *sb, int type);
1065static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1066			       size_t len, loff_t off);
1067static ssize_t ext4_quota_write(struct super_block *sb, int type,
1068				const char *data, size_t len, loff_t off);
1069static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
1070			     unsigned int flags);
1071static int ext4_enable_quotas(struct super_block *sb);
1072
1073static struct dquot **ext4_get_dquots(struct inode *inode)
1074{
1075	return EXT4_I(inode)->i_dquot;
1076}
1077
1078static const struct dquot_operations ext4_quota_operations = {
1079	.get_reserved_space = ext4_get_reserved_space,
1080	.write_dquot	= ext4_write_dquot,
1081	.acquire_dquot	= ext4_acquire_dquot,
1082	.release_dquot	= ext4_release_dquot,
1083	.mark_dirty	= ext4_mark_dquot_dirty,
1084	.write_info	= ext4_write_info,
1085	.alloc_dquot	= dquot_alloc,
1086	.destroy_dquot	= dquot_destroy,
1087};
1088
1089static const struct quotactl_ops ext4_qctl_operations = {
1090	.quota_on	= ext4_quota_on,
1091	.quota_off	= ext4_quota_off,
1092	.quota_sync	= dquot_quota_sync,
1093	.get_state	= dquot_get_state,
1094	.set_info	= dquot_set_dqinfo,
1095	.get_dqblk	= dquot_get_dqblk,
1096	.set_dqblk	= dquot_set_dqblk
1097};
1098#endif
1099
1100static const struct super_operations ext4_sops = {
1101	.alloc_inode	= ext4_alloc_inode,
1102	.destroy_inode	= ext4_destroy_inode,
1103	.write_inode	= ext4_write_inode,
1104	.dirty_inode	= ext4_dirty_inode,
1105	.drop_inode	= ext4_drop_inode,
1106	.evict_inode	= ext4_evict_inode,
1107	.put_super	= ext4_put_super,
1108	.sync_fs	= ext4_sync_fs,
1109	.freeze_fs	= ext4_freeze,
1110	.unfreeze_fs	= ext4_unfreeze,
1111	.statfs		= ext4_statfs,
1112	.remount_fs	= ext4_remount,
1113	.show_options	= ext4_show_options,
1114#ifdef CONFIG_QUOTA
1115	.quota_read	= ext4_quota_read,
1116	.quota_write	= ext4_quota_write,
1117	.get_dquots	= ext4_get_dquots,
1118#endif
1119	.bdev_try_to_free_page = bdev_try_to_free_page,
1120};
1121
1122static const struct export_operations ext4_export_ops = {
1123	.fh_to_dentry = ext4_fh_to_dentry,
1124	.fh_to_parent = ext4_fh_to_parent,
1125	.get_parent = ext4_get_parent,
1126};
1127
1128enum {
1129	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
1130	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
1131	Opt_nouid32, Opt_debug, Opt_removed,
1132	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
1133	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
1134	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
1135	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
1136	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1137	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
1138	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1139	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1140	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1141	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
1142	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1143	Opt_lazytime, Opt_nolazytime,
1144	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1145	Opt_inode_readahead_blks, Opt_journal_ioprio,
1146	Opt_dioread_nolock, Opt_dioread_lock,
1147	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1148	Opt_max_dir_size_kb, Opt_nojournal_checksum,
1149};
1150
1151static const match_table_t tokens = {
1152	{Opt_bsd_df, "bsddf"},
1153	{Opt_minix_df, "minixdf"},
1154	{Opt_grpid, "grpid"},
1155	{Opt_grpid, "bsdgroups"},
1156	{Opt_nogrpid, "nogrpid"},
1157	{Opt_nogrpid, "sysvgroups"},
1158	{Opt_resgid, "resgid=%u"},
1159	{Opt_resuid, "resuid=%u"},
1160	{Opt_sb, "sb=%u"},
1161	{Opt_err_cont, "errors=continue"},
1162	{Opt_err_panic, "errors=panic"},
1163	{Opt_err_ro, "errors=remount-ro"},
1164	{Opt_nouid32, "nouid32"},
1165	{Opt_debug, "debug"},
1166	{Opt_removed, "oldalloc"},
1167	{Opt_removed, "orlov"},
1168	{Opt_user_xattr, "user_xattr"},
1169	{Opt_nouser_xattr, "nouser_xattr"},
1170	{Opt_acl, "acl"},
1171	{Opt_noacl, "noacl"},
1172	{Opt_noload, "norecovery"},
1173	{Opt_noload, "noload"},
1174	{Opt_removed, "nobh"},
1175	{Opt_removed, "bh"},
1176	{Opt_commit, "commit=%u"},
1177	{Opt_min_batch_time, "min_batch_time=%u"},
1178	{Opt_max_batch_time, "max_batch_time=%u"},
1179	{Opt_journal_dev, "journal_dev=%u"},
1180	{Opt_journal_path, "journal_path=%s"},
1181	{Opt_journal_checksum, "journal_checksum"},
1182	{Opt_nojournal_checksum, "nojournal_checksum"},
1183	{Opt_journal_async_commit, "journal_async_commit"},
1184	{Opt_abort, "abort"},
1185	{Opt_data_journal, "data=journal"},
1186	{Opt_data_ordered, "data=ordered"},
1187	{Opt_data_writeback, "data=writeback"},
1188	{Opt_data_err_abort, "data_err=abort"},
1189	{Opt_data_err_ignore, "data_err=ignore"},
1190	{Opt_offusrjquota, "usrjquota="},
1191	{Opt_usrjquota, "usrjquota=%s"},
1192	{Opt_offgrpjquota, "grpjquota="},
1193	{Opt_grpjquota, "grpjquota=%s"},
1194	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
1195	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
1196	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
1197	{Opt_grpquota, "grpquota"},
1198	{Opt_noquota, "noquota"},
1199	{Opt_quota, "quota"},
1200	{Opt_usrquota, "usrquota"},
1201	{Opt_barrier, "barrier=%u"},
1202	{Opt_barrier, "barrier"},
1203	{Opt_nobarrier, "nobarrier"},
1204	{Opt_i_version, "i_version"},
1205	{Opt_dax, "dax"},
1206	{Opt_stripe, "stripe=%u"},
1207	{Opt_delalloc, "delalloc"},
1208	{Opt_lazytime, "lazytime"},
1209	{Opt_nolazytime, "nolazytime"},
1210	{Opt_nodelalloc, "nodelalloc"},
1211	{Opt_removed, "mblk_io_submit"},
1212	{Opt_removed, "nomblk_io_submit"},
1213	{Opt_block_validity, "block_validity"},
1214	{Opt_noblock_validity, "noblock_validity"},
1215	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1216	{Opt_journal_ioprio, "journal_ioprio=%u"},
1217	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
1218	{Opt_auto_da_alloc, "auto_da_alloc"},
1219	{Opt_noauto_da_alloc, "noauto_da_alloc"},
1220	{Opt_dioread_nolock, "dioread_nolock"},
1221	{Opt_dioread_lock, "dioread_lock"},
1222	{Opt_discard, "discard"},
1223	{Opt_nodiscard, "nodiscard"},
1224	{Opt_init_itable, "init_itable=%u"},
1225	{Opt_init_itable, "init_itable"},
1226	{Opt_noinit_itable, "noinit_itable"},
1227	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1228	{Opt_test_dummy_encryption, "test_dummy_encryption"},
1229	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
1230	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
1231	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
1232	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1233	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
1234	{Opt_err, NULL},
1235};
1236
1237static ext4_fsblk_t get_sb_block(void **data)
1238{
1239	ext4_fsblk_t	sb_block;
1240	char		*options = (char *) *data;
1241
1242	if (!options || strncmp(options, "sb=", 3) != 0)
1243		return 1;	/* Default location */
1244
1245	options += 3;
1246	/* TODO: use simple_strtoll with >32bit ext4 */
1247	sb_block = simple_strtoul(options, &options, 0);
1248	if (*options && *options != ',') {
1249		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1250		       (char *) *data);
1251		return 1;
1252	}
1253	if (*options == ',')
1254		options++;
1255	*data = (void *) options;
1256
1257	return sb_block;
1258}
1259
1260#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1261static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
1262	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1263
1264#ifdef CONFIG_QUOTA
1265static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1266{
1267	struct ext4_sb_info *sbi = EXT4_SB(sb);
1268	char *qname;
1269	int ret = -1;
1270
1271	if (sb_any_quota_loaded(sb) &&
1272		!sbi->s_qf_names[qtype]) {
1273		ext4_msg(sb, KERN_ERR,
1274			"Cannot change journaled "
1275			"quota options when quota turned on");
1276		return -1;
1277	}
1278	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
1279		ext4_msg(sb, KERN_INFO, "Journaled quota options "
1280			 "ignored when QUOTA feature is enabled");
1281		return 1;
1282	}
1283	qname = match_strdup(args);
1284	if (!qname) {
1285		ext4_msg(sb, KERN_ERR,
1286			"Not enough memory for storing quotafile name");
1287		return -1;
1288	}
1289	if (sbi->s_qf_names[qtype]) {
1290		if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
1291			ret = 1;
1292		else
1293			ext4_msg(sb, KERN_ERR,
1294				 "%s quota file already specified",
1295				 QTYPE2NAME(qtype));
1296		goto errout;
1297	}
1298	if (strchr(qname, '/')) {
1299		ext4_msg(sb, KERN_ERR,
1300			"quotafile must be on filesystem root");
1301		goto errout;
1302	}
1303	sbi->s_qf_names[qtype] = qname;
1304	set_opt(sb, QUOTA);
1305	return 1;
1306errout:
1307	kfree(qname);
1308	return ret;
1309}
1310
1311static int clear_qf_name(struct super_block *sb, int qtype)
1312{
1313
1314	struct ext4_sb_info *sbi = EXT4_SB(sb);
1315
1316	if (sb_any_quota_loaded(sb) &&
1317		sbi->s_qf_names[qtype]) {
1318		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1319			" when quota turned on");
1320		return -1;
1321	}
1322	kfree(sbi->s_qf_names[qtype]);
1323	sbi->s_qf_names[qtype] = NULL;
1324	return 1;
1325}
1326#endif
1327
1328#define MOPT_SET	0x0001
1329#define MOPT_CLEAR	0x0002
1330#define MOPT_NOSUPPORT	0x0004
1331#define MOPT_EXPLICIT	0x0008
1332#define MOPT_CLEAR_ERR	0x0010
1333#define MOPT_GTE0	0x0020
1334#ifdef CONFIG_QUOTA
1335#define MOPT_Q		0
1336#define MOPT_QFMT	0x0040
1337#else
1338#define MOPT_Q		MOPT_NOSUPPORT
1339#define MOPT_QFMT	MOPT_NOSUPPORT
1340#endif
1341#define MOPT_DATAJ	0x0080
1342#define MOPT_NO_EXT2	0x0100
1343#define MOPT_NO_EXT3	0x0200
1344#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
1345#define MOPT_STRING	0x0400
1346
1347static const struct mount_opts {
1348	int	token;
1349	int	mount_opt;
1350	int	flags;
1351} ext4_mount_opts[] = {
1352	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1353	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1354	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1355	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1356	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1357	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1358	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
1359	 MOPT_EXT4_ONLY | MOPT_SET},
1360	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
1361	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1362	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1363	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1364	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
1365	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1366	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1367	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1368	{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1369	 MOPT_EXT4_ONLY | MOPT_CLEAR},
1370	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1371	 MOPT_EXT4_ONLY | MOPT_SET},
1372	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1373				    EXT4_MOUNT_JOURNAL_CHECKSUM),
1374	 MOPT_EXT4_ONLY | MOPT_SET},
1375	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
1376	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1377	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1378	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1379	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1380	 MOPT_NO_EXT2 | MOPT_SET},
1381	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1382	 MOPT_NO_EXT2 | MOPT_CLEAR},
1383	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1384	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1385	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1386	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1387	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1388	{Opt_commit, 0, MOPT_GTE0},
1389	{Opt_max_batch_time, 0, MOPT_GTE0},
1390	{Opt_min_batch_time, 0, MOPT_GTE0},
1391	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
1392	{Opt_init_itable, 0, MOPT_GTE0},
1393	{Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1394	{Opt_stripe, 0, MOPT_GTE0},
1395	{Opt_resuid, 0, MOPT_GTE0},
1396	{Opt_resgid, 0, MOPT_GTE0},
1397	{Opt_journal_dev, 0, MOPT_GTE0},
1398	{Opt_journal_path, 0, MOPT_STRING},
1399	{Opt_journal_ioprio, 0, MOPT_GTE0},
1400	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1401	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1402	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
1403	 MOPT_NO_EXT2 | MOPT_DATAJ},
1404	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1405	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1406#ifdef CONFIG_EXT4_FS_POSIX_ACL
1407	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1408	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1409#else
1410	{Opt_acl, 0, MOPT_NOSUPPORT},
1411	{Opt_noacl, 0, MOPT_NOSUPPORT},
1412#endif
1413	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1414	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1415	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1416	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1417							MOPT_SET | MOPT_Q},
1418	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1419							MOPT_SET | MOPT_Q},
1420	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1421		       EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
1422	{Opt_usrjquota, 0, MOPT_Q},
1423	{Opt_grpjquota, 0, MOPT_Q},
1424	{Opt_offusrjquota, 0, MOPT_Q},
1425	{Opt_offgrpjquota, 0, MOPT_Q},
1426	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1427	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1428	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1429	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
1430	{Opt_test_dummy_encryption, 0, MOPT_GTE0},
1431	{Opt_err, 0, 0}
1432};
1433
1434static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1435			    substring_t *args, unsigned long *journal_devnum,
1436			    unsigned int *journal_ioprio, int is_remount)
1437{
1438	struct ext4_sb_info *sbi = EXT4_SB(sb);
1439	const struct mount_opts *m;
1440	kuid_t uid;
1441	kgid_t gid;
1442	int arg = 0;
1443
1444#ifdef CONFIG_QUOTA
1445	if (token == Opt_usrjquota)
1446		return set_qf_name(sb, USRQUOTA, &args[0]);
1447	else if (token == Opt_grpjquota)
1448		return set_qf_name(sb, GRPQUOTA, &args[0]);
1449	else if (token == Opt_offusrjquota)
1450		return clear_qf_name(sb, USRQUOTA);
1451	else if (token == Opt_offgrpjquota)
1452		return clear_qf_name(sb, GRPQUOTA);
1453#endif
1454	switch (token) {
1455	case Opt_noacl:
1456	case Opt_nouser_xattr:
1457		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1458		break;
1459	case Opt_sb:
1460		return 1;	/* handled by get_sb_block() */
1461	case Opt_removed:
1462		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
1463		return 1;
1464	case Opt_abort:
1465		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1466		return 1;
1467	case Opt_i_version:
1468		sb->s_flags |= MS_I_VERSION;
1469		return 1;
1470	case Opt_lazytime:
1471		sb->s_flags |= MS_LAZYTIME;
1472		return 1;
1473	case Opt_nolazytime:
1474		sb->s_flags &= ~MS_LAZYTIME;
1475		return 1;
1476	}
1477
1478	for (m = ext4_mount_opts; m->token != Opt_err; m++)
1479		if (token == m->token)
1480			break;
1481
1482	if (m->token == Opt_err) {
1483		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1484			 "or missing value", opt);
1485		return -1;
1486	}
1487
1488	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
1489		ext4_msg(sb, KERN_ERR,
1490			 "Mount option \"%s\" incompatible with ext2", opt);
1491		return -1;
1492	}
1493	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
1494		ext4_msg(sb, KERN_ERR,
1495			 "Mount option \"%s\" incompatible with ext3", opt);
1496		return -1;
1497	}
1498
1499	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
1500		return -1;
1501	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1502		return -1;
1503	if (m->flags & MOPT_EXPLICIT)
1504		set_opt2(sb, EXPLICIT_DELALLOC);
1505	if (m->flags & MOPT_CLEAR_ERR)
1506		clear_opt(sb, ERRORS_MASK);
1507	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1508		ext4_msg(sb, KERN_ERR, "Cannot change quota "
1509			 "options when quota turned on");
1510		return -1;
1511	}
1512
1513	if (m->flags & MOPT_NOSUPPORT) {
1514		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1515	} else if (token == Opt_commit) {
1516		if (arg == 0)
1517			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1518		sbi->s_commit_interval = HZ * arg;
1519	} else if (token == Opt_max_batch_time) {
1520		sbi->s_max_batch_time = arg;
1521	} else if (token == Opt_min_batch_time) {
1522		sbi->s_min_batch_time = arg;
1523	} else if (token == Opt_inode_readahead_blks) {
1524		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
1525			ext4_msg(sb, KERN_ERR,
1526				 "EXT4-fs: inode_readahead_blks must be "
1527				 "0 or a power of 2 smaller than 2^31");
1528			return -1;
1529		}
1530		sbi->s_inode_readahead_blks = arg;
1531	} else if (token == Opt_init_itable) {
1532		set_opt(sb, INIT_INODE_TABLE);
1533		if (!args->from)
1534			arg = EXT4_DEF_LI_WAIT_MULT;
1535		sbi->s_li_wait_mult = arg;
1536	} else if (token == Opt_max_dir_size_kb) {
1537		sbi->s_max_dir_size_kb = arg;
1538	} else if (token == Opt_stripe) {
1539		sbi->s_stripe = arg;
1540	} else if (token == Opt_resuid) {
1541		uid = make_kuid(current_user_ns(), arg);
1542		if (!uid_valid(uid)) {
1543			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
1544			return -1;
1545		}
1546		sbi->s_resuid = uid;
1547	} else if (token == Opt_resgid) {
1548		gid = make_kgid(current_user_ns(), arg);
1549		if (!gid_valid(gid)) {
1550			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
1551			return -1;
1552		}
1553		sbi->s_resgid = gid;
1554	} else if (token == Opt_journal_dev) {
1555		if (is_remount) {
1556			ext4_msg(sb, KERN_ERR,
1557				 "Cannot specify journal on remount");
1558			return -1;
1559		}
1560		*journal_devnum = arg;
1561	} else if (token == Opt_journal_path) {
1562		char *journal_path;
1563		struct inode *journal_inode;
1564		struct path path;
1565		int error;
1566
1567		if (is_remount) {
1568			ext4_msg(sb, KERN_ERR,
1569				 "Cannot specify journal on remount");
1570			return -1;
1571		}
1572		journal_path = match_strdup(&args[0]);
1573		if (!journal_path) {
1574			ext4_msg(sb, KERN_ERR, "error: could not dup "
1575				"journal device string");
1576			return -1;
1577		}
1578
1579		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1580		if (error) {
1581			ext4_msg(sb, KERN_ERR, "error: could not find "
1582				"journal device path: error %d", error);
1583			kfree(journal_path);
1584			return -1;
1585		}
1586
1587		journal_inode = d_inode(path.dentry);
1588		if (!S_ISBLK(journal_inode->i_mode)) {
1589			ext4_msg(sb, KERN_ERR, "error: journal path %s "
1590				"is not a block device", journal_path);
1591			path_put(&path);
1592			kfree(journal_path);
1593			return -1;
1594		}
1595
1596		*journal_devnum = new_encode_dev(journal_inode->i_rdev);
1597		path_put(&path);
1598		kfree(journal_path);
1599	} else if (token == Opt_journal_ioprio) {
1600		if (arg > 7) {
1601			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
1602				 " (must be 0-7)");
1603			return -1;
1604		}
1605		*journal_ioprio =
1606			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1607	} else if (token == Opt_test_dummy_encryption) {
1608#ifdef CONFIG_EXT4_FS_ENCRYPTION
1609		sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
1610		ext4_msg(sb, KERN_WARNING,
1611			 "Test dummy encryption mode enabled");
1612#else
1613		ext4_msg(sb, KERN_WARNING,
1614			 "Test dummy encryption mount option ignored");
1615#endif
1616	} else if (m->flags & MOPT_DATAJ) {
1617		if (is_remount) {
1618			if (!sbi->s_journal)
1619				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
1620			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
1621				ext4_msg(sb, KERN_ERR,
1622					 "Cannot change data mode on remount");
1623				return -1;
1624			}
1625		} else {
1626			clear_opt(sb, DATA_FLAGS);
1627			sbi->s_mount_opt |= m->mount_opt;
1628		}
1629#ifdef CONFIG_QUOTA
1630	} else if (m->flags & MOPT_QFMT) {
1631		if (sb_any_quota_loaded(sb) &&
1632		    sbi->s_jquota_fmt != m->mount_opt) {
1633			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
1634				 "quota options when quota turned on");
1635			return -1;
1636		}
1637		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1638					       EXT4_FEATURE_RO_COMPAT_QUOTA)) {
1639			ext4_msg(sb, KERN_INFO,
1640				 "Quota format mount options ignored "
1641				 "when QUOTA feature is enabled");
1642			return 1;
1643		}
1644		sbi->s_jquota_fmt = m->mount_opt;
1645#endif
1646#ifndef CONFIG_FS_DAX
1647	} else if (token == Opt_dax) {
1648		ext4_msg(sb, KERN_INFO, "dax option not supported");
1649		return -1;
1650#endif
1651	} else {
1652		if (!args->from)
1653			arg = 1;
1654		if (m->flags & MOPT_CLEAR)
1655			arg = !arg;
1656		else if (unlikely(!(m->flags & MOPT_SET))) {
1657			ext4_msg(sb, KERN_WARNING,
1658				 "buggy handling of option %s", opt);
1659			WARN_ON(1);
1660			return -1;
1661		}
1662		if (arg != 0)
1663			sbi->s_mount_opt |= m->mount_opt;
1664		else
1665			sbi->s_mount_opt &= ~m->mount_opt;
1666	}
1667	return 1;
1668}
1669
1670static int parse_options(char *options, struct super_block *sb,
1671			 unsigned long *journal_devnum,
1672			 unsigned int *journal_ioprio,
1673			 int is_remount)
1674{
1675	struct ext4_sb_info *sbi = EXT4_SB(sb);
1676	char *p;
1677	substring_t args[MAX_OPT_ARGS];
1678	int token;
1679
1680	if (!options)
1681		return 1;
1682
1683	while ((p = strsep(&options, ",")) != NULL) {
1684		if (!*p)
1685			continue;
1686		/*
1687		 * Initialize args struct so we know whether arg was
1688		 * found; some options take optional arguments.
1689		 */
1690		args[0].to = args[0].from = NULL;
1691		token = match_token(p, tokens, args);
1692		if (handle_mount_opt(sb, p, token, args, journal_devnum,
1693				     journal_ioprio, is_remount) < 0)
1694			return 0;
1695	}
1696#ifdef CONFIG_QUOTA
1697	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
1698	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
1699		ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
1700			 "mount options ignored.");
1701		clear_opt(sb, USRQUOTA);
1702		clear_opt(sb, GRPQUOTA);
1703	} else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1704		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1705			clear_opt(sb, USRQUOTA);
1706
1707		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1708			clear_opt(sb, GRPQUOTA);
1709
1710		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1711			ext4_msg(sb, KERN_ERR, "old and new quota "
1712					"format mixing");
1713			return 0;
1714		}
1715
1716		if (!sbi->s_jquota_fmt) {
1717			ext4_msg(sb, KERN_ERR, "journaled quota format "
1718					"not specified");
1719			return 0;
1720		}
1721	}
1722#endif
1723	if (test_opt(sb, DIOREAD_NOLOCK)) {
1724		int blocksize =
1725			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
1726
1727		if (blocksize < PAGE_CACHE_SIZE) {
1728			ext4_msg(sb, KERN_ERR, "can't mount with "
1729				 "dioread_nolock if block size != PAGE_SIZE");
1730			return 0;
1731		}
1732	}
1733	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
1734	    test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
1735		ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
1736			 "in data=ordered mode");
1737		return 0;
1738	}
1739	return 1;
1740}
1741
1742static inline void ext4_show_quota_options(struct seq_file *seq,
1743					   struct super_block *sb)
1744{
1745#if defined(CONFIG_QUOTA)
1746	struct ext4_sb_info *sbi = EXT4_SB(sb);
1747
1748	if (sbi->s_jquota_fmt) {
1749		char *fmtname = "";
1750
1751		switch (sbi->s_jquota_fmt) {
1752		case QFMT_VFS_OLD:
1753			fmtname = "vfsold";
1754			break;
1755		case QFMT_VFS_V0:
1756			fmtname = "vfsv0";
1757			break;
1758		case QFMT_VFS_V1:
1759			fmtname = "vfsv1";
1760			break;
1761		}
1762		seq_printf(seq, ",jqfmt=%s", fmtname);
1763	}
1764
1765	if (sbi->s_qf_names[USRQUOTA])
1766		seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
1767
1768	if (sbi->s_qf_names[GRPQUOTA])
1769		seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
1770#endif
1771}
1772
1773static const char *token2str(int token)
1774{
1775	const struct match_token *t;
1776
1777	for (t = tokens; t->token != Opt_err; t++)
1778		if (t->token == token && !strchr(t->pattern, '='))
1779			break;
1780	return t->pattern;
1781}
1782
1783/*
1784 * Show an option if
1785 *  - it's set to a non-default value OR
1786 *  - if the per-sb default is different from the global default
1787 */
1788static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1789			      int nodefs)
1790{
1791	struct ext4_sb_info *sbi = EXT4_SB(sb);
1792	struct ext4_super_block *es = sbi->s_es;
1793	int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
1794	const struct mount_opts *m;
1795	char sep = nodefs ? '\n' : ',';
1796
1797#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
1798#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
1799
1800	if (sbi->s_sb_block != 1)
1801		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
1802
1803	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
1804		int want_set = m->flags & MOPT_SET;
1805		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
1806		    (m->flags & MOPT_CLEAR_ERR))
1807			continue;
1808		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
1809			continue; /* skip if same as the default */
1810		if ((want_set &&
1811		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
1812		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
1813			continue; /* select Opt_noFoo vs Opt_Foo */
1814		SEQ_OPTS_PRINT("%s", token2str(m->token));
1815	}
1816
1817	if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
1818	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
1819		SEQ_OPTS_PRINT("resuid=%u",
1820				from_kuid_munged(&init_user_ns, sbi->s_resuid));
1821	if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
1822	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
1823		SEQ_OPTS_PRINT("resgid=%u",
1824				from_kgid_munged(&init_user_ns, sbi->s_resgid));
1825	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
1826	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
1827		SEQ_OPTS_PUTS("errors=remount-ro");
1828	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1829		SEQ_OPTS_PUTS("errors=continue");
1830	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1831		SEQ_OPTS_PUTS("errors=panic");
1832	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
1833		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
1834	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
1835		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
1836	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
1837		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
1838	if (sb->s_flags & MS_I_VERSION)
1839		SEQ_OPTS_PUTS("i_version");
1840	if (nodefs || sbi->s_stripe)
1841		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
1842	if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
1843		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1844			SEQ_OPTS_PUTS("data=journal");
1845		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1846			SEQ_OPTS_PUTS("data=ordered");
1847		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1848			SEQ_OPTS_PUTS("data=writeback");
1849	}
1850	if (nodefs ||
1851	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1852		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
1853			       sbi->s_inode_readahead_blks);
1854
1855	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1856		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1857		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1858	if (nodefs || sbi->s_max_dir_size_kb)
1859		SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1860
1861	ext4_show_quota_options(seq, sb);
1862	return 0;
1863}
1864
1865static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1866{
1867	return _ext4_show_options(seq, root->d_sb, 0);
1868}
1869
1870static int options_seq_show(struct seq_file *seq, void *offset)
1871{
1872	struct super_block *sb = seq->private;
1873	int rc;
1874
1875	seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
1876	rc = _ext4_show_options(seq, sb, 1);
1877	seq_puts(seq, "\n");
1878	return rc;
1879}
1880
1881static int options_open_fs(struct inode *inode, struct file *file)
1882{
1883	return single_open(file, options_seq_show, PDE_DATA(inode));
1884}
1885
1886static const struct file_operations ext4_seq_options_fops = {
1887	.owner = THIS_MODULE,
1888	.open = options_open_fs,
1889	.read = seq_read,
1890	.llseek = seq_lseek,
1891	.release = single_release,
1892};
1893
1894static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1895			    int read_only)
1896{
1897	struct ext4_sb_info *sbi = EXT4_SB(sb);
1898	int res = 0;
1899
1900	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1901		ext4_msg(sb, KERN_ERR, "revision level too high, "
1902			 "forcing read-only mode");
1903		res = MS_RDONLY;
1904	}
1905	if (read_only)
1906		goto done;
1907	if (!(sbi->s_mount_state & EXT4_VALID_FS))
1908		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1909			 "running e2fsck is recommended");
1910	else if (sbi->s_mount_state & EXT4_ERROR_FS)
1911		ext4_msg(sb, KERN_WARNING,
1912			 "warning: mounting fs with errors, "
1913			 "running e2fsck is recommended");
1914	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1915		 le16_to_cpu(es->s_mnt_count) >=
1916		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1917		ext4_msg(sb, KERN_WARNING,
1918			 "warning: maximal mount count reached, "
1919			 "running e2fsck is recommended");
1920	else if (le32_to_cpu(es->s_checkinterval) &&
1921		(le32_to_cpu(es->s_lastcheck) +
1922			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1923		ext4_msg(sb, KERN_WARNING,
1924			 "warning: checktime reached, "
1925			 "running e2fsck is recommended");
1926	if (!sbi->s_journal)
1927		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1928	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1929		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1930	le16_add_cpu(&es->s_mnt_count, 1);
1931	es->s_mtime = cpu_to_le32(get_seconds());
1932	ext4_update_dynamic_rev(sb);
1933	if (sbi->s_journal)
1934		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1935
1936	ext4_commit_super(sb, 1);
1937done:
1938	if (test_opt(sb, DEBUG))
1939		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1940				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1941			sb->s_blocksize,
1942			sbi->s_groups_count,
1943			EXT4_BLOCKS_PER_GROUP(sb),
1944			EXT4_INODES_PER_GROUP(sb),
1945			sbi->s_mount_opt, sbi->s_mount_opt2);
1946
1947	cleancache_init_fs(sb);
1948	return res;
1949}
1950
1951int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1952{
1953	struct ext4_sb_info *sbi = EXT4_SB(sb);
1954	struct flex_groups *new_groups;
1955	int size;
1956
1957	if (!sbi->s_log_groups_per_flex)
1958		return 0;
1959
1960	size = ext4_flex_group(sbi, ngroup - 1) + 1;
1961	if (size <= sbi->s_flex_groups_allocated)
1962		return 0;
1963
1964	size = roundup_pow_of_two(size * sizeof(struct flex_groups));
1965	new_groups = ext4_kvzalloc(size, GFP_KERNEL);
1966	if (!new_groups) {
1967		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
1968			 size / (int) sizeof(struct flex_groups));
1969		return -ENOMEM;
1970	}
1971
1972	if (sbi->s_flex_groups) {
1973		memcpy(new_groups, sbi->s_flex_groups,
1974		       (sbi->s_flex_groups_allocated *
1975			sizeof(struct flex_groups)));
1976		kvfree(sbi->s_flex_groups);
1977	}
1978	sbi->s_flex_groups = new_groups;
1979	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
1980	return 0;
1981}
1982
1983static int ext4_fill_flex_info(struct super_block *sb)
1984{
1985	struct ext4_sb_info *sbi = EXT4_SB(sb);
1986	struct ext4_group_desc *gdp = NULL;
1987	ext4_group_t flex_group;
1988	int i, err;
1989
1990	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1991	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
1992		sbi->s_log_groups_per_flex = 0;
1993		return 1;
1994	}
1995
1996	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1997	if (err)
1998		goto failed;
1999
2000	for (i = 0; i < sbi->s_groups_count; i++) {
2001		gdp = ext4_get_group_desc(sb, i, NULL);
2002
2003		flex_group = ext4_flex_group(sbi, i);
2004		atomic_add(ext4_free_inodes_count(sb, gdp),
2005			   &sbi->s_flex_groups[flex_group].free_inodes);
2006		atomic64_add(ext4_free_group_clusters(sb, gdp),
2007			     &sbi->s_flex_groups[flex_group].free_clusters);
2008		atomic_add(ext4_used_dirs_count(sb, gdp),
2009			   &sbi->s_flex_groups[flex_group].used_dirs);
2010	}
2011
2012	return 1;
2013failed:
2014	return 0;
2015}
2016
2017static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
2018				   struct ext4_group_desc *gdp)
2019{
2020	int offset;
2021	__u16 crc = 0;
2022	__le32 le_group = cpu_to_le32(block_group);
2023
2024	if (ext4_has_metadata_csum(sbi->s_sb)) {
2025		/* Use new metadata_csum algorithm */
2026		__le16 save_csum;
2027		__u32 csum32;
2028
2029		save_csum = gdp->bg_checksum;
2030		gdp->bg_checksum = 0;
2031		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
2032				     sizeof(le_group));
2033		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
2034				     sbi->s_desc_size);
2035		gdp->bg_checksum = save_csum;
2036
2037		crc = csum32 & 0xFFFF;
2038		goto out;
2039	}
2040
2041	/* old crc16 code */
2042	if (!(sbi->s_es->s_feature_ro_compat &
2043	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
2044		return 0;
2045
2046	offset = offsetof(struct ext4_group_desc, bg_checksum);
2047
2048	crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2049	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2050	crc = crc16(crc, (__u8 *)gdp, offset);
2051	offset += sizeof(gdp->bg_checksum); /* skip checksum */
2052	/* for checksum of struct ext4_group_desc do the rest...*/
2053	if ((sbi->s_es->s_feature_incompat &
2054	     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
2055	    offset < le16_to_cpu(sbi->s_es->s_desc_size))
2056		crc = crc16(crc, (__u8 *)gdp + offset,
2057			    le16_to_cpu(sbi->s_es->s_desc_size) -
2058				offset);
2059
2060out:
2061	return cpu_to_le16(crc);
2062}
2063
2064int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2065				struct ext4_group_desc *gdp)
2066{
2067	if (ext4_has_group_desc_csum(sb) &&
2068	    (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
2069						      block_group, gdp)))
2070		return 0;
2071
2072	return 1;
2073}
2074
2075void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2076			      struct ext4_group_desc *gdp)
2077{
2078	if (!ext4_has_group_desc_csum(sb))
2079		return;
2080	gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
2081}
2082
2083/* Called at mount-time, super-block is locked */
2084static int ext4_check_descriptors(struct super_block *sb,
2085				  ext4_group_t *first_not_zeroed)
2086{
2087	struct ext4_sb_info *sbi = EXT4_SB(sb);
2088	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2089	ext4_fsblk_t last_block;
2090	ext4_fsblk_t block_bitmap;
2091	ext4_fsblk_t inode_bitmap;
2092	ext4_fsblk_t inode_table;
2093	int flexbg_flag = 0;
2094	ext4_group_t i, grp = sbi->s_groups_count;
2095
2096	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2097		flexbg_flag = 1;
2098
2099	ext4_debug("Checking group descriptors");
2100
2101	for (i = 0; i < sbi->s_groups_count; i++) {
2102		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2103
2104		if (i == sbi->s_groups_count - 1 || flexbg_flag)
2105			last_block = ext4_blocks_count(sbi->s_es) - 1;
2106		else
2107			last_block = first_block +
2108				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
2109
2110		if ((grp == sbi->s_groups_count) &&
2111		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2112			grp = i;
2113
2114		block_bitmap = ext4_block_bitmap(sb, gdp);
2115		if (block_bitmap < first_block || block_bitmap > last_block) {
2116			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2117			       "Block bitmap for group %u not in group "
2118			       "(block %llu)!", i, block_bitmap);
2119			return 0;
2120		}
2121		inode_bitmap = ext4_inode_bitmap(sb, gdp);
2122		if (inode_bitmap < first_block || inode_bitmap > last_block) {
2123			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2124			       "Inode bitmap for group %u not in group "
2125			       "(block %llu)!", i, inode_bitmap);
2126			return 0;
2127		}
2128		inode_table = ext4_inode_table(sb, gdp);
2129		if (inode_table < first_block ||
2130		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2131			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2132			       "Inode table for group %u not in group "
2133			       "(block %llu)!", i, inode_table);
2134			return 0;
2135		}
2136		ext4_lock_group(sb, i);
2137		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2138			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2139				 "Checksum for group %u failed (%u!=%u)",
2140				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
2141				     gdp)), le16_to_cpu(gdp->bg_checksum));
2142			if (!(sb->s_flags & MS_RDONLY)) {
2143				ext4_unlock_group(sb, i);
2144				return 0;
2145			}
2146		}
2147		ext4_unlock_group(sb, i);
2148		if (!flexbg_flag)
2149			first_block += EXT4_BLOCKS_PER_GROUP(sb);
2150	}
2151	if (NULL != first_not_zeroed)
2152		*first_not_zeroed = grp;
2153	return 1;
2154}
2155
2156/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2157 * the superblock) which were deleted from all directories, but held open by
2158 * a process at the time of a crash.  We walk the list and try to delete these
2159 * inodes at recovery time (only with a read-write filesystem).
2160 *
2161 * In order to keep the orphan inode chain consistent during traversal (in
2162 * case of crash during recovery), we link each inode into the superblock
2163 * orphan list_head and handle it the same way as an inode deletion during
2164 * normal operation (which journals the operations for us).
2165 *
2166 * We only do an iget() and an iput() on each inode, which is very safe if we
2167 * accidentally point at an in-use or already deleted inode.  The worst that
2168 * can happen in this case is that we get a "bit already cleared" message from
2169 * ext4_free_inode().  The only reason we would point at a wrong inode is if
2170 * e2fsck was run on this filesystem, and it must have already done the orphan
2171 * inode cleanup for us, so we can safely abort without any further action.
2172 */
2173static void ext4_orphan_cleanup(struct super_block *sb,
2174				struct ext4_super_block *es)
2175{
2176	unsigned int s_flags = sb->s_flags;
2177	int nr_orphans = 0, nr_truncates = 0;
2178#ifdef CONFIG_QUOTA
2179	int i;
2180#endif
2181	if (!es->s_last_orphan) {
2182		jbd_debug(4, "no orphan inodes to clean up\n");
2183		return;
2184	}
2185
2186	if (bdev_read_only(sb->s_bdev)) {
2187		ext4_msg(sb, KERN_ERR, "write access "
2188			"unavailable, skipping orphan cleanup");
2189		return;
2190	}
2191
2192	/* Check if feature set would not allow a r/w mount */
2193	if (!ext4_feature_set_ok(sb, 0)) {
2194		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2195			 "unknown ROCOMPAT features");
2196		return;
2197	}
2198
2199	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2200		/* don't clear list on RO mount w/ errors */
2201		if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
2202			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
2203				  "clearing orphan list.\n");
2204			es->s_last_orphan = 0;
2205		}
2206		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2207		return;
2208	}
2209
2210	if (s_flags & MS_RDONLY) {
2211		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
2212		sb->s_flags &= ~MS_RDONLY;
2213	}
2214#ifdef CONFIG_QUOTA
2215	/* Needed for iput() to work correctly and not trash data */
2216	sb->s_flags |= MS_ACTIVE;
2217	/* Turn on quotas so that they are updated correctly */
2218	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2219		if (EXT4_SB(sb)->s_qf_names[i]) {
2220			int ret = ext4_quota_on_mount(sb, i);
2221			if (ret < 0)
2222				ext4_msg(sb, KERN_ERR,
2223					"Cannot turn on journaled "
2224					"quota: error %d", ret);
2225		}
2226	}
2227#endif
2228
2229	while (es->s_last_orphan) {
2230		struct inode *inode;
2231
2232		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
2233		if (IS_ERR(inode)) {
2234			es->s_last_orphan = 0;
2235			break;
2236		}
2237
2238		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2239		dquot_initialize(inode);
2240		if (inode->i_nlink) {
2241			if (test_opt(sb, DEBUG))
2242				ext4_msg(sb, KERN_DEBUG,
2243					"%s: truncating inode %lu to %lld bytes",
2244					__func__, inode->i_ino, inode->i_size);
2245			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2246				  inode->i_ino, inode->i_size);
2247			mutex_lock(&inode->i_mutex);
2248			truncate_inode_pages(inode->i_mapping, inode->i_size);
2249			ext4_truncate(inode);
2250			mutex_unlock(&inode->i_mutex);
2251			nr_truncates++;
2252		} else {
2253			if (test_opt(sb, DEBUG))
2254				ext4_msg(sb, KERN_DEBUG,
2255					"%s: deleting unreferenced inode %lu",
2256					__func__, inode->i_ino);
2257			jbd_debug(2, "deleting unreferenced inode %lu\n",
2258				  inode->i_ino);
2259			nr_orphans++;
2260		}
2261		iput(inode);  /* The delete magic happens here! */
2262	}
2263
2264#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
2265
2266	if (nr_orphans)
2267		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
2268		       PLURAL(nr_orphans));
2269	if (nr_truncates)
2270		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
2271		       PLURAL(nr_truncates));
2272#ifdef CONFIG_QUOTA
2273	/* Turn quotas off */
2274	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2275		if (sb_dqopt(sb)->files[i])
2276			dquot_quota_off(sb, i);
2277	}
2278#endif
2279	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
2280}
2281
2282/*
2283 * Maximal extent format file size.
2284 * Resulting logical blkno at s_maxbytes must fit in our on-disk
2285 * extent format containers, within a sector_t, and within i_blocks
2286 * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2287 * so that won't be a limiting factor.
2288 *
2289 * However there is other limiting factor. We do store extents in the form
2290 * of starting block and length, hence the resulting length of the extent
2291 * covering maximum file size must fit into on-disk format containers as
2292 * well. Given that length is always by 1 unit bigger than max unit (because
2293 * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2294 *
2295 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2296 */
2297static loff_t ext4_max_size(int blkbits, int has_huge_files)
2298{
2299	loff_t res;
2300	loff_t upper_limit = MAX_LFS_FILESIZE;
2301
2302	/* small i_blocks in vfs inode? */
2303	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2304		/*
2305		 * CONFIG_LBDAF is not enabled implies the inode
2306		 * i_block represent total blocks in 512 bytes
2307		 * 32 == size of vfs inode i_blocks * 8
2308		 */
2309		upper_limit = (1LL << 32) - 1;
2310
2311		/* total blocks in file system block size */
2312		upper_limit >>= (blkbits - 9);
2313		upper_limit <<= blkbits;
2314	}
2315
2316	/*
2317	 * 32-bit extent-start container, ee_block. We lower the maxbytes
2318	 * by one fs block, so ee_len can cover the extent of maximum file
2319	 * size
2320	 */
2321	res = (1LL << 32) - 1;
2322	res <<= blkbits;
2323
2324	/* Sanity check against vm- & vfs- imposed limits */
2325	if (res > upper_limit)
2326		res = upper_limit;
2327
2328	return res;
2329}
2330
2331/*
2332 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
2333 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
2334 * We need to be 1 filesystem block less than the 2^48 sector limit.
2335 */
2336static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2337{
2338	loff_t res = EXT4_NDIR_BLOCKS;
2339	int meta_blocks;
2340	loff_t upper_limit;
2341	/* This is calculated to be the largest file size for a dense, block
2342	 * mapped file such that the file's total number of 512-byte sectors,
2343	 * including data and all indirect blocks, does not exceed (2^48 - 1).
2344	 *
2345	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
2346	 * number of 512-byte sectors of the file.
2347	 */
2348
2349	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2350		/*
2351		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
2352		 * the inode i_block field represents total file blocks in
2353		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
2354		 */
2355		upper_limit = (1LL << 32) - 1;
2356
2357		/* total blocks in file system block size */
2358		upper_limit >>= (bits - 9);
2359
2360	} else {
2361		/*
2362		 * We use 48 bit ext4_inode i_blocks
2363		 * With EXT4_HUGE_FILE_FL set the i_blocks
2364		 * represent total number of blocks in
2365		 * file system block size
2366		 */
2367		upper_limit = (1LL << 48) - 1;
2368
2369	}
2370
2371	/* indirect blocks */
2372	meta_blocks = 1;
2373	/* double indirect blocks */
2374	meta_blocks += 1 + (1LL << (bits-2));
2375	/* tripple indirect blocks */
2376	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
2377
2378	upper_limit -= meta_blocks;
2379	upper_limit <<= bits;
2380
2381	res += 1LL << (bits-2);
2382	res += 1LL << (2*(bits-2));
2383	res += 1LL << (3*(bits-2));
2384	res <<= bits;
2385	if (res > upper_limit)
2386		res = upper_limit;
2387
2388	if (res > MAX_LFS_FILESIZE)
2389		res = MAX_LFS_FILESIZE;
2390
2391	return res;
2392}
2393
2394static ext4_fsblk_t descriptor_loc(struct super_block *sb,
2395				   ext4_fsblk_t logical_sb_block, int nr)
2396{
2397	struct ext4_sb_info *sbi = EXT4_SB(sb);
2398	ext4_group_t bg, first_meta_bg;
2399	int has_super = 0;
2400
2401	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
2402
2403	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
2404	    nr < first_meta_bg)
2405		return logical_sb_block + nr + 1;
2406	bg = sbi->s_desc_per_block * nr;
2407	if (ext4_bg_has_super(sb, bg))
2408		has_super = 1;
2409
2410	/*
2411	 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
2412	 * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
2413	 * on modern mke2fs or blksize > 1k on older mke2fs) then we must
2414	 * compensate.
2415	 */
2416	if (sb->s_blocksize == 1024 && nr == 0 &&
2417	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
2418		has_super++;
2419
2420	return (has_super + ext4_group_first_block_no(sb, bg));
2421}
2422
2423/**
2424 * ext4_get_stripe_size: Get the stripe size.
2425 * @sbi: In memory super block info
2426 *
2427 * If we have specified it via mount option, then
2428 * use the mount option value. If the value specified at mount time is
2429 * greater than the blocks per group use the super block value.
2430 * If the super block value is greater than blocks per group return 0.
2431 * Allocator needs it be less than blocks per group.
2432 *
2433 */
2434static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2435{
2436	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2437	unsigned long stripe_width =
2438			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2439	int ret;
2440
2441	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2442		ret = sbi->s_stripe;
2443	else if (stripe_width <= sbi->s_blocks_per_group)
2444		ret = stripe_width;
2445	else if (stride <= sbi->s_blocks_per_group)
2446		ret = stride;
2447	else
2448		ret = 0;
2449
2450	/*
2451	 * If the stripe width is 1, this makes no sense and
2452	 * we set it to 0 to turn off stripe handling code.
2453	 */
2454	if (ret <= 1)
2455		ret = 0;
2456
2457	return ret;
2458}
2459
2460/* sysfs supprt */
2461
2462struct ext4_attr {
2463	struct attribute attr;
2464	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2465	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2466			 const char *, size_t);
2467	union {
2468		int offset;
2469		int deprecated_val;
2470	} u;
2471};
2472
2473static int parse_strtoull(const char *buf,
2474		unsigned long long max, unsigned long long *value)
2475{
2476	int ret;
2477
2478	ret = kstrtoull(skip_spaces(buf), 0, value);
2479	if (!ret && *value > max)
2480		ret = -EINVAL;
2481	return ret;
2482}
2483
2484static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2485					      struct ext4_sb_info *sbi,
2486					      char *buf)
2487{
2488	return snprintf(buf, PAGE_SIZE, "%llu\n",
2489		(s64) EXT4_C2B(sbi,
2490			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
2491}
2492
2493static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2494					 struct ext4_sb_info *sbi, char *buf)
2495{
2496	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2497
2498	if (!sb->s_bdev->bd_part)
2499		return snprintf(buf, PAGE_SIZE, "0\n");
2500	return snprintf(buf, PAGE_SIZE, "%lu\n",
2501			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2502			 sbi->s_sectors_written_start) >> 1);
2503}
2504
2505static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2506					  struct ext4_sb_info *sbi, char *buf)
2507{
2508	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2509
2510	if (!sb->s_bdev->bd_part)
2511		return snprintf(buf, PAGE_SIZE, "0\n");
2512	return snprintf(buf, PAGE_SIZE, "%llu\n",
2513			(unsigned long long)(sbi->s_kbytes_written +
2514			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2515			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2516}
2517
2518static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2519					  struct ext4_sb_info *sbi,
2520					  const char *buf, size_t count)
2521{
2522	unsigned long t;
2523	int ret;
2524
2525	ret = kstrtoul(skip_spaces(buf), 0, &t);
2526	if (ret)
2527		return ret;
2528
2529	if (t && (!is_power_of_2(t) || t > 0x40000000))
2530		return -EINVAL;
2531
2532	sbi->s_inode_readahead_blks = t;
2533	return count;
2534}
2535
2536static ssize_t sbi_ui_show(struct ext4_attr *a,
2537			   struct ext4_sb_info *sbi, char *buf)
2538{
2539	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2540
2541	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2542}
2543
2544static ssize_t sbi_ui_store(struct ext4_attr *a,
2545			    struct ext4_sb_info *sbi,
2546			    const char *buf, size_t count)
2547{
2548	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2549	unsigned long t;
2550	int ret;
2551
2552	ret = kstrtoul(skip_spaces(buf), 0, &t);
2553	if (ret)
2554		return ret;
2555	*ui = t;
2556	return count;
2557}
2558
2559static ssize_t es_ui_show(struct ext4_attr *a,
2560			   struct ext4_sb_info *sbi, char *buf)
2561{
2562
2563	unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
2564			   a->u.offset);
2565
2566	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2567}
2568
2569static ssize_t reserved_clusters_show(struct ext4_attr *a,
2570				  struct ext4_sb_info *sbi, char *buf)
2571{
2572	return snprintf(buf, PAGE_SIZE, "%llu\n",
2573		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
2574}
2575
2576static ssize_t reserved_clusters_store(struct ext4_attr *a,
2577				   struct ext4_sb_info *sbi,
2578				   const char *buf, size_t count)
2579{
2580	unsigned long long val;
2581	int ret;
2582
2583	if (parse_strtoull(buf, -1ULL, &val))
2584		return -EINVAL;
2585	ret = ext4_reserve_clusters(sbi, val);
2586
2587	return ret ? ret : count;
2588}
2589
2590static ssize_t trigger_test_error(struct ext4_attr *a,
2591				  struct ext4_sb_info *sbi,
2592				  const char *buf, size_t count)
2593{
2594	int len = count;
2595
2596	if (!capable(CAP_SYS_ADMIN))
2597		return -EPERM;
2598
2599	if (len && buf[len-1] == '\n')
2600		len--;
2601
2602	if (len)
2603		ext4_error(sbi->s_sb, "%.*s", len, buf);
2604	return count;
2605}
2606
2607static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2608				   struct ext4_sb_info *sbi, char *buf)
2609{
2610	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2611}
2612
2613#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2614static struct ext4_attr ext4_attr_##_name = {			\
2615	.attr = {.name = __stringify(_name), .mode = _mode },	\
2616	.show	= _show,					\
2617	.store	= _store,					\
2618	.u = {							\
2619		.offset = offsetof(struct ext4_sb_info, _elname),\
2620	},							\
2621}
2622
2623#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)		\
2624static struct ext4_attr ext4_attr_##_name = {				\
2625	.attr = {.name = __stringify(_name), .mode = _mode },		\
2626	.show	= _show,						\
2627	.store	= _store,						\
2628	.u = {								\
2629		.offset = offsetof(struct ext4_super_block, _elname),	\
2630	},								\
2631}
2632
2633#define EXT4_ATTR(name, mode, show, store) \
2634static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2635
2636#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2637#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2638#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2639
2640#define EXT4_RO_ATTR_ES_UI(name, elname)	\
2641	EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
2642#define EXT4_RW_ATTR_SBI_UI(name, elname)	\
2643	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2644
2645#define ATTR_LIST(name) &ext4_attr_##name.attr
2646#define EXT4_DEPRECATED_ATTR(_name, _val)	\
2647static struct ext4_attr ext4_attr_##_name = {			\
2648	.attr = {.name = __stringify(_name), .mode = 0444 },	\
2649	.show	= sbi_deprecated_show,				\
2650	.u = {							\
2651		.deprecated_val = _val,				\
2652	},							\
2653}
2654
2655EXT4_RO_ATTR(delayed_allocation_blocks);
2656EXT4_RO_ATTR(session_write_kbytes);
2657EXT4_RO_ATTR(lifetime_write_kbytes);
2658EXT4_RW_ATTR(reserved_clusters);
2659EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2660		 inode_readahead_blks_store, s_inode_readahead_blks);
2661EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
2662EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2663EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2664EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2665EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2666EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2667EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2668EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2669EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2670EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2671EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
2672EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
2673EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
2674EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
2675EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
2676EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
2677EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
2678EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
2679EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
2680
2681static struct attribute *ext4_attrs[] = {
2682	ATTR_LIST(delayed_allocation_blocks),
2683	ATTR_LIST(session_write_kbytes),
2684	ATTR_LIST(lifetime_write_kbytes),
2685	ATTR_LIST(reserved_clusters),
2686	ATTR_LIST(inode_readahead_blks),
2687	ATTR_LIST(inode_goal),
2688	ATTR_LIST(mb_stats),
2689	ATTR_LIST(mb_max_to_scan),
2690	ATTR_LIST(mb_min_to_scan),
2691	ATTR_LIST(mb_order2_req),
2692	ATTR_LIST(mb_stream_req),
2693	ATTR_LIST(mb_group_prealloc),
2694	ATTR_LIST(max_writeback_mb_bump),
2695	ATTR_LIST(extent_max_zeroout_kb),
2696	ATTR_LIST(trigger_fs_error),
2697	ATTR_LIST(err_ratelimit_interval_ms),
2698	ATTR_LIST(err_ratelimit_burst),
2699	ATTR_LIST(warning_ratelimit_interval_ms),
2700	ATTR_LIST(warning_ratelimit_burst),
2701	ATTR_LIST(msg_ratelimit_interval_ms),
2702	ATTR_LIST(msg_ratelimit_burst),
2703	ATTR_LIST(errors_count),
2704	ATTR_LIST(first_error_time),
2705	ATTR_LIST(last_error_time),
2706	NULL,
2707};
2708
2709/* Features this copy of ext4 supports */
2710EXT4_INFO_ATTR(lazy_itable_init);
2711EXT4_INFO_ATTR(batched_discard);
2712EXT4_INFO_ATTR(meta_bg_resize);
2713EXT4_INFO_ATTR(encryption);
2714
2715static struct attribute *ext4_feat_attrs[] = {
2716	ATTR_LIST(lazy_itable_init),
2717	ATTR_LIST(batched_discard),
2718	ATTR_LIST(meta_bg_resize),
2719	ATTR_LIST(encryption),
2720	NULL,
2721};
2722
2723static ssize_t ext4_attr_show(struct kobject *kobj,
2724			      struct attribute *attr, char *buf)
2725{
2726	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2727						s_kobj);
2728	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2729
2730	return a->show ? a->show(a, sbi, buf) : 0;
2731}
2732
2733static ssize_t ext4_attr_store(struct kobject *kobj,
2734			       struct attribute *attr,
2735			       const char *buf, size_t len)
2736{
2737	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2738						s_kobj);
2739	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2740
2741	return a->store ? a->store(a, sbi, buf, len) : 0;
2742}
2743
2744static void ext4_sb_release(struct kobject *kobj)
2745{
2746	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2747						s_kobj);
2748	complete(&sbi->s_kobj_unregister);
2749}
2750
2751static const struct sysfs_ops ext4_attr_ops = {
2752	.show	= ext4_attr_show,
2753	.store	= ext4_attr_store,
2754};
2755
2756static struct kobj_type ext4_ktype = {
2757	.default_attrs	= ext4_attrs,
2758	.sysfs_ops	= &ext4_attr_ops,
2759	.release	= ext4_sb_release,
2760};
2761
2762static void ext4_feat_release(struct kobject *kobj)
2763{
2764	complete(&ext4_feat->f_kobj_unregister);
2765}
2766
2767static ssize_t ext4_feat_show(struct kobject *kobj,
2768			      struct attribute *attr, char *buf)
2769{
2770	return snprintf(buf, PAGE_SIZE, "supported\n");
2771}
2772
2773/*
2774 * We can not use ext4_attr_show/store because it relies on the kobject
2775 * being embedded in the ext4_sb_info structure which is definitely not
2776 * true in this case.
2777 */
2778static const struct sysfs_ops ext4_feat_ops = {
2779	.show	= ext4_feat_show,
2780	.store	= NULL,
2781};
2782
2783static struct kobj_type ext4_feat_ktype = {
2784	.default_attrs	= ext4_feat_attrs,
2785	.sysfs_ops	= &ext4_feat_ops,
2786	.release	= ext4_feat_release,
2787};
2788
2789/*
2790 * Check whether this filesystem can be mounted based on
2791 * the features present and the RDONLY/RDWR mount requested.
2792 * Returns 1 if this filesystem can be mounted as requested,
2793 * 0 if it cannot be.
2794 */
2795static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2796{
2797	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2798		ext4_msg(sb, KERN_ERR,
2799			"Couldn't mount because of "
2800			"unsupported optional features (%x)",
2801			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2802			~EXT4_FEATURE_INCOMPAT_SUPP));
2803		return 0;
2804	}
2805
2806	if (readonly)
2807		return 1;
2808
2809	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
2810		ext4_msg(sb, KERN_INFO, "filesystem is read-only");
2811		sb->s_flags |= MS_RDONLY;
2812		return 1;
2813	}
2814
2815	/* Check that feature set is OK for a read-write mount */
2816	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2817		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2818			 "unsupported optional features (%x)",
2819			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2820				~EXT4_FEATURE_RO_COMPAT_SUPP));
2821		return 0;
2822	}
2823	/*
2824	 * Large file size enabled file system can only be mounted
2825	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2826	 */
2827	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2828		if (sizeof(blkcnt_t) < sizeof(u64)) {
2829			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2830				 "cannot be mounted RDWR without "
2831				 "CONFIG_LBDAF");
2832			return 0;
2833		}
2834	}
2835	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
2836	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2837		ext4_msg(sb, KERN_ERR,
2838			 "Can't support bigalloc feature without "
2839			 "extents feature\n");
2840		return 0;
2841	}
2842
2843#ifndef CONFIG_QUOTA
2844	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
2845	    !readonly) {
2846		ext4_msg(sb, KERN_ERR,
2847			 "Filesystem with quota feature cannot be mounted RDWR "
2848			 "without CONFIG_QUOTA");
2849		return 0;
2850	}
2851#endif  /* CONFIG_QUOTA */
2852	return 1;
2853}
2854
2855/*
2856 * This function is called once a day if we have errors logged
2857 * on the file system
2858 */
2859static void print_daily_error_info(unsigned long arg)
2860{
2861	struct super_block *sb = (struct super_block *) arg;
2862	struct ext4_sb_info *sbi;
2863	struct ext4_super_block *es;
2864
2865	sbi = EXT4_SB(sb);
2866	es = sbi->s_es;
2867
2868	if (es->s_error_count)
2869		/* fsck newer than v1.41.13 is needed to clean this condition. */
2870		ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
2871			 le32_to_cpu(es->s_error_count));
2872	if (es->s_first_error_time) {
2873		printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
2874		       sb->s_id, le32_to_cpu(es->s_first_error_time),
2875		       (int) sizeof(es->s_first_error_func),
2876		       es->s_first_error_func,
2877		       le32_to_cpu(es->s_first_error_line));
2878		if (es->s_first_error_ino)
2879			printk(": inode %u",
2880			       le32_to_cpu(es->s_first_error_ino));
2881		if (es->s_first_error_block)
2882			printk(": block %llu", (unsigned long long)
2883			       le64_to_cpu(es->s_first_error_block));
2884		printk("\n");
2885	}
2886	if (es->s_last_error_time) {
2887		printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
2888		       sb->s_id, le32_to_cpu(es->s_last_error_time),
2889		       (int) sizeof(es->s_last_error_func),
2890		       es->s_last_error_func,
2891		       le32_to_cpu(es->s_last_error_line));
2892		if (es->s_last_error_ino)
2893			printk(": inode %u",
2894			       le32_to_cpu(es->s_last_error_ino));
2895		if (es->s_last_error_block)
2896			printk(": block %llu", (unsigned long long)
2897			       le64_to_cpu(es->s_last_error_block));
2898		printk("\n");
2899	}
2900	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
2901}
2902
2903/* Find next suitable group and run ext4_init_inode_table */
2904static int ext4_run_li_request(struct ext4_li_request *elr)
2905{
2906	struct ext4_group_desc *gdp = NULL;
2907	ext4_group_t group, ngroups;
2908	struct super_block *sb;
2909	unsigned long timeout = 0;
2910	int ret = 0;
2911
2912	sb = elr->lr_super;
2913	ngroups = EXT4_SB(sb)->s_groups_count;
2914
2915	sb_start_write(sb);
2916	for (group = elr->lr_next_group; group < ngroups; group++) {
2917		gdp = ext4_get_group_desc(sb, group, NULL);
2918		if (!gdp) {
2919			ret = 1;
2920			break;
2921		}
2922
2923		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2924			break;
2925	}
2926
2927	if (group >= ngroups)
2928		ret = 1;
2929
2930	if (!ret) {
2931		timeout = jiffies;
2932		ret = ext4_init_inode_table(sb, group,
2933					    elr->lr_timeout ? 0 : 1);
2934		if (elr->lr_timeout == 0) {
2935			timeout = (jiffies - timeout) *
2936				  elr->lr_sbi->s_li_wait_mult;
2937			elr->lr_timeout = timeout;
2938		}
2939		elr->lr_next_sched = jiffies + elr->lr_timeout;
2940		elr->lr_next_group = group + 1;
2941	}
2942	sb_end_write(sb);
2943
2944	return ret;
2945}
2946
2947/*
2948 * Remove lr_request from the list_request and free the
2949 * request structure. Should be called with li_list_mtx held
2950 */
2951static void ext4_remove_li_request(struct ext4_li_request *elr)
2952{
2953	struct ext4_sb_info *sbi;
2954
2955	if (!elr)
2956		return;
2957
2958	sbi = elr->lr_sbi;
2959
2960	list_del(&elr->lr_request);
2961	sbi->s_li_request = NULL;
2962	kfree(elr);
2963}
2964
2965static void ext4_unregister_li_request(struct super_block *sb)
2966{
2967	mutex_lock(&ext4_li_mtx);
2968	if (!ext4_li_info) {
2969		mutex_unlock(&ext4_li_mtx);
2970		return;
2971	}
2972
2973	mutex_lock(&ext4_li_info->li_list_mtx);
2974	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2975	mutex_unlock(&ext4_li_info->li_list_mtx);
2976	mutex_unlock(&ext4_li_mtx);
2977}
2978
2979static struct task_struct *ext4_lazyinit_task;
2980
2981/*
2982 * This is the function where ext4lazyinit thread lives. It walks
2983 * through the request list searching for next scheduled filesystem.
2984 * When such a fs is found, run the lazy initialization request
2985 * (ext4_rn_li_request) and keep track of the time spend in this
2986 * function. Based on that time we compute next schedule time of
2987 * the request. When walking through the list is complete, compute
2988 * next waking time and put itself into sleep.
2989 */
2990static int ext4_lazyinit_thread(void *arg)
2991{
2992	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2993	struct list_head *pos, *n;
2994	struct ext4_li_request *elr;
2995	unsigned long next_wakeup, cur;
2996
2997	BUG_ON(NULL == eli);
2998
2999cont_thread:
3000	while (true) {
3001		next_wakeup = MAX_JIFFY_OFFSET;
3002
3003		mutex_lock(&eli->li_list_mtx);
3004		if (list_empty(&eli->li_request_list)) {
3005			mutex_unlock(&eli->li_list_mtx);
3006			goto exit_thread;
3007		}
3008
3009		list_for_each_safe(pos, n, &eli->li_request_list) {
3010			elr = list_entry(pos, struct ext4_li_request,
3011					 lr_request);
3012
3013			if (time_after_eq(jiffies, elr->lr_next_sched)) {
3014				if (ext4_run_li_request(elr) != 0) {
3015					/* error, remove the lazy_init job */
3016					ext4_remove_li_request(elr);
3017					continue;
3018				}
3019			}
3020
3021			if (time_before(elr->lr_next_sched, next_wakeup))
3022				next_wakeup = elr->lr_next_sched;
3023		}
3024		mutex_unlock(&eli->li_list_mtx);
3025
3026		try_to_freeze();
3027
3028		cur = jiffies;
3029		if ((time_after_eq(cur, next_wakeup)) ||
3030		    (MAX_JIFFY_OFFSET == next_wakeup)) {
3031			cond_resched();
3032			continue;
3033		}
3034
3035		schedule_timeout_interruptible(next_wakeup - cur);
3036
3037		if (kthread_should_stop()) {
3038			ext4_clear_request_list();
3039			goto exit_thread;
3040		}
3041	}
3042
3043exit_thread:
3044	/*
3045	 * It looks like the request list is empty, but we need
3046	 * to check it under the li_list_mtx lock, to prevent any
3047	 * additions into it, and of course we should lock ext4_li_mtx
3048	 * to atomically free the list and ext4_li_info, because at
3049	 * this point another ext4 filesystem could be registering
3050	 * new one.
3051	 */
3052	mutex_lock(&ext4_li_mtx);
3053	mutex_lock(&eli->li_list_mtx);
3054	if (!list_empty(&eli->li_request_list)) {
3055		mutex_unlock(&eli->li_list_mtx);
3056		mutex_unlock(&ext4_li_mtx);
3057		goto cont_thread;
3058	}
3059	mutex_unlock(&eli->li_list_mtx);
3060	kfree(ext4_li_info);
3061	ext4_li_info = NULL;
3062	mutex_unlock(&ext4_li_mtx);
3063
3064	return 0;
3065}
3066
3067static void ext4_clear_request_list(void)
3068{
3069	struct list_head *pos, *n;
3070	struct ext4_li_request *elr;
3071
3072	mutex_lock(&ext4_li_info->li_list_mtx);
3073	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3074		elr = list_entry(pos, struct ext4_li_request,
3075				 lr_request);
3076		ext4_remove_li_request(elr);
3077	}
3078	mutex_unlock(&ext4_li_info->li_list_mtx);
3079}
3080
3081static int ext4_run_lazyinit_thread(void)
3082{
3083	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3084					 ext4_li_info, "ext4lazyinit");
3085	if (IS_ERR(ext4_lazyinit_task)) {
3086		int err = PTR_ERR(ext4_lazyinit_task);
3087		ext4_clear_request_list();
3088		kfree(ext4_li_info);
3089		ext4_li_info = NULL;
3090		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3091				 "initialization thread\n",
3092				 err);
3093		return err;
3094	}
3095	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3096	return 0;
3097}
3098
3099/*
3100 * Check whether it make sense to run itable init. thread or not.
3101 * If there is at least one uninitialized inode table, return
3102 * corresponding group number, else the loop goes through all
3103 * groups and return total number of groups.
3104 */
3105static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3106{
3107	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3108	struct ext4_group_desc *gdp = NULL;
3109
3110	for (group = 0; group < ngroups; group++) {
3111		gdp = ext4_get_group_desc(sb, group, NULL);
3112		if (!gdp)
3113			continue;
3114
3115		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3116			break;
3117	}
3118
3119	return group;
3120}
3121
3122static int ext4_li_info_new(void)
3123{
3124	struct ext4_lazy_init *eli = NULL;
3125
3126	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3127	if (!eli)
3128		return -ENOMEM;
3129
3130	INIT_LIST_HEAD(&eli->li_request_list);
3131	mutex_init(&eli->li_list_mtx);
3132
3133	eli->li_state |= EXT4_LAZYINIT_QUIT;
3134
3135	ext4_li_info = eli;
3136
3137	return 0;
3138}
3139
3140static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3141					    ext4_group_t start)
3142{
3143	struct ext4_sb_info *sbi = EXT4_SB(sb);
3144	struct ext4_li_request *elr;
3145
3146	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3147	if (!elr)
3148		return NULL;
3149
3150	elr->lr_super = sb;
3151	elr->lr_sbi = sbi;
3152	elr->lr_next_group = start;
3153
3154	/*
3155	 * Randomize first schedule time of the request to
3156	 * spread the inode table initialization requests
3157	 * better.
3158	 */
3159	elr->lr_next_sched = jiffies + (prandom_u32() %
3160				(EXT4_DEF_LI_MAX_START_DELAY * HZ));
3161	return elr;
3162}
3163
3164int ext4_register_li_request(struct super_block *sb,
3165			     ext4_group_t first_not_zeroed)
3166{
3167	struct ext4_sb_info *sbi = EXT4_SB(sb);
3168	struct ext4_li_request *elr = NULL;
3169	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3170	int ret = 0;
3171
3172	mutex_lock(&ext4_li_mtx);
3173	if (sbi->s_li_request != NULL) {
3174		/*
3175		 * Reset timeout so it can be computed again, because
3176		 * s_li_wait_mult might have changed.
3177		 */
3178		sbi->s_li_request->lr_timeout = 0;
3179		goto out;
3180	}
3181
3182	if (first_not_zeroed == ngroups ||
3183	    (sb->s_flags & MS_RDONLY) ||
3184	    !test_opt(sb, INIT_INODE_TABLE))
3185		goto out;
3186
3187	elr = ext4_li_request_new(sb, first_not_zeroed);
3188	if (!elr) {
3189		ret = -ENOMEM;
3190		goto out;
3191	}
3192
3193	if (NULL == ext4_li_info) {
3194		ret = ext4_li_info_new();
3195		if (ret)
3196			goto out;
3197	}
3198
3199	mutex_lock(&ext4_li_info->li_list_mtx);
3200	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3201	mutex_unlock(&ext4_li_info->li_list_mtx);
3202
3203	sbi->s_li_request = elr;
3204	/*
3205	 * set elr to NULL here since it has been inserted to
3206	 * the request_list and the removal and free of it is
3207	 * handled by ext4_clear_request_list from now on.
3208	 */
3209	elr = NULL;
3210
3211	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3212		ret = ext4_run_lazyinit_thread();
3213		if (ret)
3214			goto out;
3215	}
3216out:
3217	mutex_unlock(&ext4_li_mtx);
3218	if (ret)
3219		kfree(elr);
3220	return ret;
3221}
3222
3223/*
3224 * We do not need to lock anything since this is called on
3225 * module unload.
3226 */
3227static void ext4_destroy_lazyinit_thread(void)
3228{
3229	/*
3230	 * If thread exited earlier
3231	 * there's nothing to be done.
3232	 */
3233	if (!ext4_li_info || !ext4_lazyinit_task)
3234		return;
3235
3236	kthread_stop(ext4_lazyinit_task);
3237}
3238
3239static int set_journal_csum_feature_set(struct super_block *sb)
3240{
3241	int ret = 1;
3242	int compat, incompat;
3243	struct ext4_sb_info *sbi = EXT4_SB(sb);
3244
3245	if (ext4_has_metadata_csum(sb)) {
3246		/* journal checksum v3 */
3247		compat = 0;
3248		incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3249	} else {
3250		/* journal checksum v1 */
3251		compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3252		incompat = 0;
3253	}
3254
3255	jbd2_journal_clear_features(sbi->s_journal,
3256			JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3257			JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3258			JBD2_FEATURE_INCOMPAT_CSUM_V2);
3259	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3260		ret = jbd2_journal_set_features(sbi->s_journal,
3261				compat, 0,
3262				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3263				incompat);
3264	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3265		ret = jbd2_journal_set_features(sbi->s_journal,
3266				compat, 0,
3267				incompat);
3268		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3269				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3270	} else {
3271		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3272				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3273	}
3274
3275	return ret;
3276}
3277
3278/*
3279 * Note: calculating the overhead so we can be compatible with
3280 * historical BSD practice is quite difficult in the face of
3281 * clusters/bigalloc.  This is because multiple metadata blocks from
3282 * different block group can end up in the same allocation cluster.
3283 * Calculating the exact overhead in the face of clustered allocation
3284 * requires either O(all block bitmaps) in memory or O(number of block
3285 * groups**2) in time.  We will still calculate the superblock for
3286 * older file systems --- and if we come across with a bigalloc file
3287 * system with zero in s_overhead_clusters the estimate will be close to
3288 * correct especially for very large cluster sizes --- but for newer
3289 * file systems, it's better to calculate this figure once at mkfs
3290 * time, and store it in the superblock.  If the superblock value is
3291 * present (even for non-bigalloc file systems), we will use it.
3292 */
3293static int count_overhead(struct super_block *sb, ext4_group_t grp,
3294			  char *buf)
3295{
3296	struct ext4_sb_info	*sbi = EXT4_SB(sb);
3297	struct ext4_group_desc	*gdp;
3298	ext4_fsblk_t		first_block, last_block, b;
3299	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
3300	int			s, j, count = 0;
3301
3302	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
3303		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3304			sbi->s_itb_per_group + 2);
3305
3306	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3307		(grp * EXT4_BLOCKS_PER_GROUP(sb));
3308	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3309	for (i = 0; i < ngroups; i++) {
3310		gdp = ext4_get_group_desc(sb, i, NULL);
3311		b = ext4_block_bitmap(sb, gdp);
3312		if (b >= first_block && b <= last_block) {
3313			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3314			count++;
3315		}
3316		b = ext4_inode_bitmap(sb, gdp);
3317		if (b >= first_block && b <= last_block) {
3318			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3319			count++;
3320		}
3321		b = ext4_inode_table(sb, gdp);
3322		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3323			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3324				int c = EXT4_B2C(sbi, b - first_block);
3325				ext4_set_bit(c, buf);
3326				count++;
3327			}
3328		if (i != grp)
3329			continue;
3330		s = 0;
3331		if (ext4_bg_has_super(sb, grp)) {
3332			ext4_set_bit(s++, buf);
3333			count++;
3334		}
3335		for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
3336			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3337			count++;
3338		}
3339	}
3340	if (!count)
3341		return 0;
3342	return EXT4_CLUSTERS_PER_GROUP(sb) -
3343		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3344}
3345
3346/*
3347 * Compute the overhead and stash it in sbi->s_overhead
3348 */
3349int ext4_calculate_overhead(struct super_block *sb)
3350{
3351	struct ext4_sb_info *sbi = EXT4_SB(sb);
3352	struct ext4_super_block *es = sbi->s_es;
3353	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3354	ext4_fsblk_t overhead = 0;
3355	char *buf = (char *) get_zeroed_page(GFP_NOFS);
3356
3357	if (!buf)
3358		return -ENOMEM;
3359
3360	/*
3361	 * Compute the overhead (FS structures).  This is constant
3362	 * for a given filesystem unless the number of block groups
3363	 * changes so we cache the previous value until it does.
3364	 */
3365
3366	/*
3367	 * All of the blocks before first_data_block are overhead
3368	 */
3369	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3370
3371	/*
3372	 * Add the overhead found in each block group
3373	 */
3374	for (i = 0; i < ngroups; i++) {
3375		int blks;
3376
3377		blks = count_overhead(sb, i, buf);
3378		overhead += blks;
3379		if (blks)
3380			memset(buf, 0, PAGE_SIZE);
3381		cond_resched();
3382	}
3383	/* Add the internal journal blocks as well */
3384	if (sbi->s_journal && !sbi->journal_bdev)
3385		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3386
3387	sbi->s_overhead = overhead;
3388	smp_wmb();
3389	free_page((unsigned long) buf);
3390	return 0;
3391}
3392
3393
3394static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
3395{
3396	ext4_fsblk_t resv_clusters;
3397
3398	/*
3399	 * There's no need to reserve anything when we aren't using extents.
3400	 * The space estimates are exact, there are no unwritten extents,
3401	 * hole punching doesn't need new metadata... This is needed especially
3402	 * to keep ext2/3 backward compatibility.
3403	 */
3404	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
3405		return 0;
3406	/*
3407	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
3408	 * This should cover the situations where we can not afford to run
3409	 * out of space like for example punch hole, or converting
3410	 * unwritten extents in delalloc path. In most cases such
3411	 * allocation would require 1, or 2 blocks, higher numbers are
3412	 * very rare.
3413	 */
3414	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
3415			EXT4_SB(sb)->s_cluster_bits;
3416
3417	do_div(resv_clusters, 50);
3418	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3419
3420	return resv_clusters;
3421}
3422
3423
3424static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
3425{
3426	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
3427				sbi->s_cluster_bits;
3428
3429	if (count >= clusters)
3430		return -EINVAL;
3431
3432	atomic64_set(&sbi->s_resv_clusters, count);
3433	return 0;
3434}
3435
3436static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3437{
3438	char *orig_data = kstrdup(data, GFP_KERNEL);
3439	struct buffer_head *bh;
3440	struct ext4_super_block *es = NULL;
3441	struct ext4_sb_info *sbi;
3442	ext4_fsblk_t block;
3443	ext4_fsblk_t sb_block = get_sb_block(&data);
3444	ext4_fsblk_t logical_sb_block;
3445	unsigned long offset = 0;
3446	unsigned long journal_devnum = 0;
3447	unsigned long def_mount_opts;
3448	struct inode *root;
3449	char *cp;
3450	const char *descr;
3451	int ret = -ENOMEM;
3452	int blocksize, clustersize;
3453	unsigned int db_count;
3454	unsigned int i;
3455	int needs_recovery, has_huge_files, has_bigalloc;
3456	__u64 blocks_count;
3457	int err = 0;
3458	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3459	ext4_group_t first_not_zeroed;
3460
3461	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3462	if (!sbi)
3463		goto out_free_orig;
3464
3465	sbi->s_blockgroup_lock =
3466		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3467	if (!sbi->s_blockgroup_lock) {
3468		kfree(sbi);
3469		goto out_free_orig;
3470	}
3471	sb->s_fs_info = sbi;
3472	sbi->s_sb = sb;
3473	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3474	sbi->s_sb_block = sb_block;
3475	if (sb->s_bdev->bd_part)
3476		sbi->s_sectors_written_start =
3477			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
3478#ifdef CONFIG_EXT4_FS_ENCRYPTION
3479	/* Modes of operations for file and directory encryption. */
3480	sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
3481	sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
3482#endif
3483
3484	/* Cleanup superblock name */
3485	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3486		*cp = '!';
3487
3488	/* -EINVAL is default */
3489	ret = -EINVAL;
3490	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3491	if (!blocksize) {
3492		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3493		goto out_fail;
3494	}
3495
3496	/*
3497	 * The ext4 superblock will not be buffer aligned for other than 1kB
3498	 * block sizes.  We need to calculate the offset from buffer start.
3499	 */
3500	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3501		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3502		offset = do_div(logical_sb_block, blocksize);
3503	} else {
3504		logical_sb_block = sb_block;
3505	}
3506
3507	if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
3508		ext4_msg(sb, KERN_ERR, "unable to read superblock");
3509		goto out_fail;
3510	}
3511	/*
3512	 * Note: s_es must be initialized as soon as possible because
3513	 *       some ext4 macro-instructions depend on its value
3514	 */
3515	es = (struct ext4_super_block *) (bh->b_data + offset);
3516	sbi->s_es = es;
3517	sb->s_magic = le16_to_cpu(es->s_magic);
3518	if (sb->s_magic != EXT4_SUPER_MAGIC)
3519		goto cantfind_ext4;
3520	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3521
3522	/* Warn if metadata_csum and gdt_csum are both set. */
3523	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3524				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
3525	    EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3526		ext4_warning(sb, "metadata_csum and uninit_bg are "
3527			     "redundant flags; please run fsck.");
3528
3529	/* Check for a known checksum algorithm */
3530	if (!ext4_verify_csum_type(sb, es)) {
3531		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3532			 "unknown checksum algorithm.");
3533		silent = 1;
3534		goto cantfind_ext4;
3535	}
3536
3537	/* Load the checksum driver */
3538	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3539				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3540		sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3541		if (IS_ERR(sbi->s_chksum_driver)) {
3542			ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3543			ret = PTR_ERR(sbi->s_chksum_driver);
3544			sbi->s_chksum_driver = NULL;
3545			goto failed_mount;
3546		}
3547	}
3548
3549	/* Check superblock checksum */
3550	if (!ext4_superblock_csum_verify(sb, es)) {
3551		ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3552			 "invalid superblock checksum.  Run e2fsck?");
3553		silent = 1;
3554		goto cantfind_ext4;
3555	}
3556
3557	/* Precompute checksum seed for all metadata */
3558	if (ext4_has_metadata_csum(sb))
3559		sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3560					       sizeof(es->s_uuid));
3561
3562	/* Set defaults before we parse the mount options */
3563	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3564	set_opt(sb, INIT_INODE_TABLE);
3565	if (def_mount_opts & EXT4_DEFM_DEBUG)
3566		set_opt(sb, DEBUG);
3567	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3568		set_opt(sb, GRPID);
3569	if (def_mount_opts & EXT4_DEFM_UID16)
3570		set_opt(sb, NO_UID32);
3571	/* xattr user namespace & acls are now defaulted on */
3572	set_opt(sb, XATTR_USER);
3573#ifdef CONFIG_EXT4_FS_POSIX_ACL
3574	set_opt(sb, POSIX_ACL);
3575#endif
3576	/* don't forget to enable journal_csum when metadata_csum is enabled. */
3577	if (ext4_has_metadata_csum(sb))
3578		set_opt(sb, JOURNAL_CHECKSUM);
3579
3580	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3581		set_opt(sb, JOURNAL_DATA);
3582	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3583		set_opt(sb, ORDERED_DATA);
3584	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3585		set_opt(sb, WRITEBACK_DATA);
3586
3587	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3588		set_opt(sb, ERRORS_PANIC);
3589	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3590		set_opt(sb, ERRORS_CONT);
3591	else
3592		set_opt(sb, ERRORS_RO);
3593	/* block_validity enabled by default; disable with noblock_validity */
3594	set_opt(sb, BLOCK_VALIDITY);
3595	if (def_mount_opts & EXT4_DEFM_DISCARD)
3596		set_opt(sb, DISCARD);
3597
3598	sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
3599	sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
3600	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
3601	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
3602	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3603
3604	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3605		set_opt(sb, BARRIER);
3606
3607	/*
3608	 * enable delayed allocation by default
3609	 * Use -o nodelalloc to turn it off
3610	 */
3611	if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3612	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3613		set_opt(sb, DELALLOC);
3614
3615	/*
3616	 * set default s_li_wait_mult for lazyinit, for the case there is
3617	 * no mount option specified.
3618	 */
3619	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3620
3621	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3622			   &journal_devnum, &journal_ioprio, 0)) {
3623		ext4_msg(sb, KERN_WARNING,
3624			 "failed to parse options in superblock: %s",
3625			 sbi->s_es->s_mount_opts);
3626	}
3627	sbi->s_def_mount_opt = sbi->s_mount_opt;
3628	if (!parse_options((char *) data, sb, &journal_devnum,
3629			   &journal_ioprio, 0))
3630		goto failed_mount;
3631
3632	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3633		printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
3634			    "with data=journal disables delayed "
3635			    "allocation and O_DIRECT support!\n");
3636		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
3637			ext4_msg(sb, KERN_ERR, "can't mount with "
3638				 "both data=journal and delalloc");
3639			goto failed_mount;
3640		}
3641		if (test_opt(sb, DIOREAD_NOLOCK)) {
3642			ext4_msg(sb, KERN_ERR, "can't mount with "
3643				 "both data=journal and dioread_nolock");
3644			goto failed_mount;
3645		}
3646		if (test_opt(sb, DAX)) {
3647			ext4_msg(sb, KERN_ERR, "can't mount with "
3648				 "both data=journal and dax");
3649			goto failed_mount;
3650		}
3651		if (test_opt(sb, DELALLOC))
3652			clear_opt(sb, DELALLOC);
3653	}
3654
3655	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3656		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3657
3658	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
3659	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
3660	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
3661	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
3662		ext4_msg(sb, KERN_WARNING,
3663		       "feature flags set on rev 0 fs, "
3664		       "running e2fsck is recommended");
3665
3666	if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3667		set_opt2(sb, HURD_COMPAT);
3668		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3669					      EXT4_FEATURE_INCOMPAT_64BIT)) {
3670			ext4_msg(sb, KERN_ERR,
3671				 "The Hurd can't support 64-bit file systems");
3672			goto failed_mount;
3673		}
3674	}
3675
3676	if (IS_EXT2_SB(sb)) {
3677		if (ext2_feature_set_ok(sb))
3678			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3679				 "using the ext4 subsystem");
3680		else {
3681			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3682				 "to feature incompatibilities");
3683			goto failed_mount;
3684		}
3685	}
3686
3687	if (IS_EXT3_SB(sb)) {
3688		if (ext3_feature_set_ok(sb))
3689			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3690				 "using the ext4 subsystem");
3691		else {
3692			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3693				 "to feature incompatibilities");
3694			goto failed_mount;
3695		}
3696	}
3697
3698	/*
3699	 * Check feature flags regardless of the revision level, since we
3700	 * previously didn't change the revision level when setting the flags,
3701	 * so there is a chance incompat flags are set on a rev 0 filesystem.
3702	 */
3703	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3704		goto failed_mount;
3705
3706	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3707	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3708	    blocksize > EXT4_MAX_BLOCK_SIZE) {
3709		ext4_msg(sb, KERN_ERR,
3710		       "Unsupported filesystem blocksize %d", blocksize);
3711		goto failed_mount;
3712	}
3713
3714	if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3715		if (blocksize != PAGE_SIZE) {
3716			ext4_msg(sb, KERN_ERR,
3717					"error: unsupported blocksize for dax");
3718			goto failed_mount;
3719		}
3720		if (!sb->s_bdev->bd_disk->fops->direct_access) {
3721			ext4_msg(sb, KERN_ERR,
3722					"error: device does not support dax");
3723			goto failed_mount;
3724		}
3725	}
3726
3727	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
3728	    es->s_encryption_level) {
3729		ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
3730			 es->s_encryption_level);
3731		goto failed_mount;
3732	}
3733
3734	if (sb->s_blocksize != blocksize) {
3735		/* Validate the filesystem blocksize */
3736		if (!sb_set_blocksize(sb, blocksize)) {
3737			ext4_msg(sb, KERN_ERR, "bad block size %d",
3738					blocksize);
3739			goto failed_mount;
3740		}
3741
3742		brelse(bh);
3743		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3744		offset = do_div(logical_sb_block, blocksize);
3745		bh = sb_bread_unmovable(sb, logical_sb_block);
3746		if (!bh) {
3747			ext4_msg(sb, KERN_ERR,
3748			       "Can't read superblock on 2nd try");
3749			goto failed_mount;
3750		}
3751		es = (struct ext4_super_block *)(bh->b_data + offset);
3752		sbi->s_es = es;
3753		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
3754			ext4_msg(sb, KERN_ERR,
3755			       "Magic mismatch, very weird!");
3756			goto failed_mount;
3757		}
3758	}
3759
3760	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3761				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3762	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
3763						      has_huge_files);
3764	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
3765
3766	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
3767		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
3768		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
3769	} else {
3770		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
3771		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
3772		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
3773		    (!is_power_of_2(sbi->s_inode_size)) ||
3774		    (sbi->s_inode_size > blocksize)) {
3775			ext4_msg(sb, KERN_ERR,
3776			       "unsupported inode size: %d",
3777			       sbi->s_inode_size);
3778			goto failed_mount;
3779		}
3780		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
3781			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
3782	}
3783
3784	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
3785	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
3786		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
3787		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
3788		    !is_power_of_2(sbi->s_desc_size)) {
3789			ext4_msg(sb, KERN_ERR,
3790			       "unsupported descriptor size %lu",
3791			       sbi->s_desc_size);
3792			goto failed_mount;
3793		}
3794	} else
3795		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
3796
3797	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
3798	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
3799	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
3800		goto cantfind_ext4;
3801
3802	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
3803	if (sbi->s_inodes_per_block == 0)
3804		goto cantfind_ext4;
3805	sbi->s_itb_per_group = sbi->s_inodes_per_group /
3806					sbi->s_inodes_per_block;
3807	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
3808	sbi->s_sbh = bh;
3809	sbi->s_mount_state = le16_to_cpu(es->s_state);
3810	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3811	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3812
3813	for (i = 0; i < 4; i++)
3814		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3815	sbi->s_def_hash_version = es->s_def_hash_version;
3816	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
3817		i = le32_to_cpu(es->s_flags);
3818		if (i & EXT2_FLAGS_UNSIGNED_HASH)
3819			sbi->s_hash_unsigned = 3;
3820		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
3821#ifdef __CHAR_UNSIGNED__
3822			if (!(sb->s_flags & MS_RDONLY))
3823				es->s_flags |=
3824					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
3825			sbi->s_hash_unsigned = 3;
3826#else
3827			if (!(sb->s_flags & MS_RDONLY))
3828				es->s_flags |=
3829					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3830#endif
3831		}
3832	}
3833
3834	/* Handle clustersize */
3835	clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
3836	has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3837				EXT4_FEATURE_RO_COMPAT_BIGALLOC);
3838	if (has_bigalloc) {
3839		if (clustersize < blocksize) {
3840			ext4_msg(sb, KERN_ERR,
3841				 "cluster size (%d) smaller than "
3842				 "block size (%d)", clustersize, blocksize);
3843			goto failed_mount;
3844		}
3845		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
3846			le32_to_cpu(es->s_log_block_size);
3847		sbi->s_clusters_per_group =
3848			le32_to_cpu(es->s_clusters_per_group);
3849		if (sbi->s_clusters_per_group > blocksize * 8) {
3850			ext4_msg(sb, KERN_ERR,
3851				 "#clusters per group too big: %lu",
3852				 sbi->s_clusters_per_group);
3853			goto failed_mount;
3854		}
3855		if (sbi->s_blocks_per_group !=
3856		    (sbi->s_clusters_per_group * (clustersize / blocksize))) {
3857			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
3858				 "clusters per group (%lu) inconsistent",
3859				 sbi->s_blocks_per_group,
3860				 sbi->s_clusters_per_group);
3861			goto failed_mount;
3862		}
3863	} else {
3864		if (clustersize != blocksize) {
3865			ext4_warning(sb, "fragment/cluster size (%d) != "
3866				     "block size (%d)", clustersize,
3867				     blocksize);
3868			clustersize = blocksize;
3869		}
3870		if (sbi->s_blocks_per_group > blocksize * 8) {
3871			ext4_msg(sb, KERN_ERR,
3872				 "#blocks per group too big: %lu",
3873				 sbi->s_blocks_per_group);
3874			goto failed_mount;
3875		}
3876		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
3877		sbi->s_cluster_bits = 0;
3878	}
3879	sbi->s_cluster_ratio = clustersize / blocksize;
3880
3881	if (sbi->s_inodes_per_group > blocksize * 8) {
3882		ext4_msg(sb, KERN_ERR,
3883		       "#inodes per group too big: %lu",
3884		       sbi->s_inodes_per_group);
3885		goto failed_mount;
3886	}
3887
3888	/* Do we have standard group size of clustersize * 8 blocks ? */
3889	if (sbi->s_blocks_per_group == clustersize << 3)
3890		set_opt2(sb, STD_GROUP_SIZE);
3891
3892	/*
3893	 * Test whether we have more sectors than will fit in sector_t,
3894	 * and whether the max offset is addressable by the page cache.
3895	 */
3896	err = generic_check_addressable(sb->s_blocksize_bits,
3897					ext4_blocks_count(es));
3898	if (err) {
3899		ext4_msg(sb, KERN_ERR, "filesystem"
3900			 " too large to mount safely on this system");
3901		if (sizeof(sector_t) < 8)
3902			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3903		goto failed_mount;
3904	}
3905
3906	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
3907		goto cantfind_ext4;
3908
3909	/* check blocks count against device size */
3910	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
3911	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
3912		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
3913		       "exceeds size of device (%llu blocks)",
3914		       ext4_blocks_count(es), blocks_count);
3915		goto failed_mount;
3916	}
3917
3918	/*
3919	 * It makes no sense for the first data block to be beyond the end
3920	 * of the filesystem.
3921	 */
3922	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
3923		ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
3924			 "block %u is beyond end of filesystem (%llu)",
3925			 le32_to_cpu(es->s_first_data_block),
3926			 ext4_blocks_count(es));
3927		goto failed_mount;
3928	}
3929	blocks_count = (ext4_blocks_count(es) -
3930			le32_to_cpu(es->s_first_data_block) +
3931			EXT4_BLOCKS_PER_GROUP(sb) - 1);
3932	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
3933	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
3934		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
3935		       "(block count %llu, first data block %u, "
3936		       "blocks per group %lu)", sbi->s_groups_count,
3937		       ext4_blocks_count(es),
3938		       le32_to_cpu(es->s_first_data_block),
3939		       EXT4_BLOCKS_PER_GROUP(sb));
3940		goto failed_mount;
3941	}
3942	sbi->s_groups_count = blocks_count;
3943	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
3944			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3945	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3946		   EXT4_DESC_PER_BLOCK(sb);
3947	sbi->s_group_desc = ext4_kvmalloc(db_count *
3948					  sizeof(struct buffer_head *),
3949					  GFP_KERNEL);
3950	if (sbi->s_group_desc == NULL) {
3951		ext4_msg(sb, KERN_ERR, "not enough memory");
3952		ret = -ENOMEM;
3953		goto failed_mount;
3954	}
3955
3956	if (ext4_proc_root)
3957		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3958
3959	if (sbi->s_proc)
3960		proc_create_data("options", S_IRUGO, sbi->s_proc,
3961				 &ext4_seq_options_fops, sb);
3962
3963	bgl_lock_init(sbi->s_blockgroup_lock);
3964
3965	for (i = 0; i < db_count; i++) {
3966		block = descriptor_loc(sb, logical_sb_block, i);
3967		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
3968		if (!sbi->s_group_desc[i]) {
3969			ext4_msg(sb, KERN_ERR,
3970			       "can't read group descriptor %d", i);
3971			db_count = i;
3972			goto failed_mount2;
3973		}
3974	}
3975	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
3976		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3977		goto failed_mount2;
3978	}
3979
3980	sbi->s_gdb_count = db_count;
3981	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3982	spin_lock_init(&sbi->s_next_gen_lock);
3983
3984	setup_timer(&sbi->s_err_report, print_daily_error_info,
3985		(unsigned long) sb);
3986
3987	/* Register extent status tree shrinker */
3988	if (ext4_es_register_shrinker(sbi))
3989		goto failed_mount3;
3990
3991	sbi->s_stripe = ext4_get_stripe_size(sbi);
3992	sbi->s_extent_max_zeroout_kb = 32;
3993
3994	/*
3995	 * set up enough so that it can read an inode
3996	 */
3997	sb->s_op = &ext4_sops;
3998	sb->s_export_op = &ext4_export_ops;
3999	sb->s_xattr = ext4_xattr_handlers;
4000#ifdef CONFIG_QUOTA
4001	sb->dq_op = &ext4_quota_operations;
4002	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
4003		sb->s_qcop = &dquot_quotactl_sysfile_ops;
4004	else
4005		sb->s_qcop = &ext4_qctl_operations;
4006	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
4007#endif
4008	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
4009
4010	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
4011	mutex_init(&sbi->s_orphan_lock);
4012
4013	sb->s_root = NULL;
4014
4015	needs_recovery = (es->s_last_orphan != 0 ||
4016			  EXT4_HAS_INCOMPAT_FEATURE(sb,
4017				    EXT4_FEATURE_INCOMPAT_RECOVER));
4018
4019	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
4020	    !(sb->s_flags & MS_RDONLY))
4021		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
4022			goto failed_mount3a;
4023
4024	/*
4025	 * The first inode we look at is the journal inode.  Don't try
4026	 * root first: it may be modified in the journal!
4027	 */
4028	if (!test_opt(sb, NOLOAD) &&
4029	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
4030		if (ext4_load_journal(sb, es, journal_devnum))
4031			goto failed_mount3a;
4032	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
4033	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
4034		ext4_msg(sb, KERN_ERR, "required journal recovery "
4035		       "suppressed and not mounted read-only");
4036		goto failed_mount_wq;
4037	} else {
4038		clear_opt(sb, DATA_FLAGS);
4039		sbi->s_journal = NULL;
4040		needs_recovery = 0;
4041		goto no_journal;
4042	}
4043
4044	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
4045	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4046				       JBD2_FEATURE_INCOMPAT_64BIT)) {
4047		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4048		goto failed_mount_wq;
4049	}
4050
4051	if (!set_journal_csum_feature_set(sb)) {
4052		ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4053			 "feature set");
4054		goto failed_mount_wq;
4055	}
4056
4057	/* We have now updated the journal if required, so we can
4058	 * validate the data journaling mode. */
4059	switch (test_opt(sb, DATA_FLAGS)) {
4060	case 0:
4061		/* No mode set, assume a default based on the journal
4062		 * capabilities: ORDERED_DATA if the journal can
4063		 * cope, else JOURNAL_DATA
4064		 */
4065		if (jbd2_journal_check_available_features
4066		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
4067			set_opt(sb, ORDERED_DATA);
4068		else
4069			set_opt(sb, JOURNAL_DATA);
4070		break;
4071
4072	case EXT4_MOUNT_ORDERED_DATA:
4073	case EXT4_MOUNT_WRITEBACK_DATA:
4074		if (!jbd2_journal_check_available_features
4075		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4076			ext4_msg(sb, KERN_ERR, "Journal does not support "
4077			       "requested data journaling mode");
4078			goto failed_mount_wq;
4079		}
4080	default:
4081		break;
4082	}
4083	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4084
4085	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4086
4087no_journal:
4088	if (ext4_mballoc_ready) {
4089		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4090		if (!sbi->s_mb_cache) {
4091			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4092			goto failed_mount_wq;
4093		}
4094	}
4095
4096	if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
4097	    !(sb->s_flags & MS_RDONLY) &&
4098	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
4099		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
4100		ext4_commit_super(sb, 1);
4101	}
4102
4103	/*
4104	 * Get the # of file system overhead blocks from the
4105	 * superblock if present.
4106	 */
4107	if (es->s_overhead_clusters)
4108		sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4109	else {
4110		err = ext4_calculate_overhead(sb);
4111		if (err)
4112			goto failed_mount_wq;
4113	}
4114
4115	/*
4116	 * The maximum number of concurrent works can be high and
4117	 * concurrency isn't really necessary.  Limit it to 1.
4118	 */
4119	EXT4_SB(sb)->rsv_conversion_wq =
4120		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4121	if (!EXT4_SB(sb)->rsv_conversion_wq) {
4122		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4123		ret = -ENOMEM;
4124		goto failed_mount4;
4125	}
4126
4127	/*
4128	 * The jbd2_journal_load will have done any necessary log recovery,
4129	 * so we can safely mount the rest of the filesystem now.
4130	 */
4131
4132	root = ext4_iget(sb, EXT4_ROOT_INO);
4133	if (IS_ERR(root)) {
4134		ext4_msg(sb, KERN_ERR, "get root inode failed");
4135		ret = PTR_ERR(root);
4136		root = NULL;
4137		goto failed_mount4;
4138	}
4139	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4140		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4141		iput(root);
4142		goto failed_mount4;
4143	}
4144	sb->s_root = d_make_root(root);
4145	if (!sb->s_root) {
4146		ext4_msg(sb, KERN_ERR, "get root dentry failed");
4147		ret = -ENOMEM;
4148		goto failed_mount4;
4149	}
4150
4151	if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
4152		sb->s_flags |= MS_RDONLY;
4153
4154	/* determine the minimum size of new large inodes, if present */
4155	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
4156		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4157						     EXT4_GOOD_OLD_INODE_SIZE;
4158		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4159				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
4160			if (sbi->s_want_extra_isize <
4161			    le16_to_cpu(es->s_want_extra_isize))
4162				sbi->s_want_extra_isize =
4163					le16_to_cpu(es->s_want_extra_isize);
4164			if (sbi->s_want_extra_isize <
4165			    le16_to_cpu(es->s_min_extra_isize))
4166				sbi->s_want_extra_isize =
4167					le16_to_cpu(es->s_min_extra_isize);
4168		}
4169	}
4170	/* Check if enough inode space is available */
4171	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
4172							sbi->s_inode_size) {
4173		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
4174						       EXT4_GOOD_OLD_INODE_SIZE;
4175		ext4_msg(sb, KERN_INFO, "required extra inode space not"
4176			 "available");
4177	}
4178
4179	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
4180	if (err) {
4181		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
4182			 "reserved pool", ext4_calculate_resv_clusters(sb));
4183		goto failed_mount4a;
4184	}
4185
4186	err = ext4_setup_system_zone(sb);
4187	if (err) {
4188		ext4_msg(sb, KERN_ERR, "failed to initialize system "
4189			 "zone (%d)", err);
4190		goto failed_mount4a;
4191	}
4192
4193	ext4_ext_init(sb);
4194	err = ext4_mb_init(sb);
4195	if (err) {
4196		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4197			 err);
4198		goto failed_mount5;
4199	}
4200
4201	block = ext4_count_free_clusters(sb);
4202	ext4_free_blocks_count_set(sbi->s_es,
4203				   EXT4_C2B(sbi, block));
4204	err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4205				  GFP_KERNEL);
4206	if (!err) {
4207		unsigned long freei = ext4_count_free_inodes(sb);
4208		sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4209		err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4210					  GFP_KERNEL);
4211	}
4212	if (!err)
4213		err = percpu_counter_init(&sbi->s_dirs_counter,
4214					  ext4_count_dirs(sb), GFP_KERNEL);
4215	if (!err)
4216		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4217					  GFP_KERNEL);
4218	if (err) {
4219		ext4_msg(sb, KERN_ERR, "insufficient memory");
4220		goto failed_mount6;
4221	}
4222
4223	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
4224		if (!ext4_fill_flex_info(sb)) {
4225			ext4_msg(sb, KERN_ERR,
4226			       "unable to initialize "
4227			       "flex_bg meta info!");
4228			goto failed_mount6;
4229		}
4230
4231	err = ext4_register_li_request(sb, first_not_zeroed);
4232	if (err)
4233		goto failed_mount6;
4234
4235	sbi->s_kobj.kset = ext4_kset;
4236	init_completion(&sbi->s_kobj_unregister);
4237	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
4238				   "%s", sb->s_id);
4239	if (err)
4240		goto failed_mount7;
4241
4242#ifdef CONFIG_QUOTA
4243	/* Enable quota usage during mount. */
4244	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
4245	    !(sb->s_flags & MS_RDONLY)) {
4246		err = ext4_enable_quotas(sb);
4247		if (err)
4248			goto failed_mount8;
4249	}
4250#endif  /* CONFIG_QUOTA */
4251
4252	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
4253	ext4_orphan_cleanup(sb, es);
4254	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
4255	if (needs_recovery) {
4256		ext4_msg(sb, KERN_INFO, "recovery complete");
4257		ext4_mark_recovery_complete(sb, es);
4258	}
4259	if (EXT4_SB(sb)->s_journal) {
4260		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
4261			descr = " journalled data mode";
4262		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
4263			descr = " ordered data mode";
4264		else
4265			descr = " writeback data mode";
4266	} else
4267		descr = "out journal";
4268
4269	if (test_opt(sb, DISCARD)) {
4270		struct request_queue *q = bdev_get_queue(sb->s_bdev);
4271		if (!blk_queue_discard(q))
4272			ext4_msg(sb, KERN_WARNING,
4273				 "mounting with \"discard\" option, but "
4274				 "the device does not support discard");
4275	}
4276
4277	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4278		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
4279		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
4280
4281	if (es->s_error_count)
4282		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
4283
4284	/* Enable message ratelimiting. Default is 10 messages per 5 secs. */
4285	ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
4286	ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
4287	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
4288
4289	kfree(orig_data);
4290	return 0;
4291
4292cantfind_ext4:
4293	if (!silent)
4294		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
4295	goto failed_mount;
4296
4297#ifdef CONFIG_QUOTA
4298failed_mount8:
4299	kobject_del(&sbi->s_kobj);
4300#endif
4301failed_mount7:
4302	ext4_unregister_li_request(sb);
4303failed_mount6:
4304	ext4_mb_release(sb);
4305	if (sbi->s_flex_groups)
4306		kvfree(sbi->s_flex_groups);
4307	percpu_counter_destroy(&sbi->s_freeclusters_counter);
4308	percpu_counter_destroy(&sbi->s_freeinodes_counter);
4309	percpu_counter_destroy(&sbi->s_dirs_counter);
4310	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4311failed_mount5:
4312	ext4_ext_release(sb);
4313	ext4_release_system_zone(sb);
4314failed_mount4a:
4315	dput(sb->s_root);
4316	sb->s_root = NULL;
4317failed_mount4:
4318	ext4_msg(sb, KERN_ERR, "mount failed");
4319	if (EXT4_SB(sb)->rsv_conversion_wq)
4320		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4321failed_mount_wq:
4322	if (sbi->s_journal) {
4323		jbd2_journal_destroy(sbi->s_journal);
4324		sbi->s_journal = NULL;
4325	}
4326failed_mount3a:
4327	ext4_es_unregister_shrinker(sbi);
4328failed_mount3:
4329	del_timer_sync(&sbi->s_err_report);
4330	if (sbi->s_mmp_tsk)
4331		kthread_stop(sbi->s_mmp_tsk);
4332failed_mount2:
4333	for (i = 0; i < db_count; i++)
4334		brelse(sbi->s_group_desc[i]);
4335	kvfree(sbi->s_group_desc);
4336failed_mount:
4337	if (sbi->s_chksum_driver)
4338		crypto_free_shash(sbi->s_chksum_driver);
4339	if (sbi->s_proc) {
4340		remove_proc_entry("options", sbi->s_proc);
4341		remove_proc_entry(sb->s_id, ext4_proc_root);
4342	}
4343#ifdef CONFIG_QUOTA
4344	for (i = 0; i < EXT4_MAXQUOTAS; i++)
4345		kfree(sbi->s_qf_names[i]);
4346#endif
4347	ext4_blkdev_remove(sbi);
4348	brelse(bh);
4349out_fail:
4350	sb->s_fs_info = NULL;
4351	kfree(sbi->s_blockgroup_lock);
4352	kfree(sbi);
4353out_free_orig:
4354	kfree(orig_data);
4355	return err ? err : ret;
4356}
4357
4358/*
4359 * Setup any per-fs journal parameters now.  We'll do this both on
4360 * initial mount, once the journal has been initialised but before we've
4361 * done any recovery; and again on any subsequent remount.
4362 */
4363static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
4364{
4365	struct ext4_sb_info *sbi = EXT4_SB(sb);
4366
4367	journal->j_commit_interval = sbi->s_commit_interval;
4368	journal->j_min_batch_time = sbi->s_min_batch_time;
4369	journal->j_max_batch_time = sbi->s_max_batch_time;
4370
4371	write_lock(&journal->j_state_lock);
4372	if (test_opt(sb, BARRIER))
4373		journal->j_flags |= JBD2_BARRIER;
4374	else
4375		journal->j_flags &= ~JBD2_BARRIER;
4376	if (test_opt(sb, DATA_ERR_ABORT))
4377		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
4378	else
4379		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
4380	write_unlock(&journal->j_state_lock);
4381}
4382
4383static journal_t *ext4_get_journal(struct super_block *sb,
4384				   unsigned int journal_inum)
4385{
4386	struct inode *journal_inode;
4387	journal_t *journal;
4388
4389	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4390
4391	/* First, test for the existence of a valid inode on disk.  Bad
4392	 * things happen if we iget() an unused inode, as the subsequent
4393	 * iput() will try to delete it. */
4394
4395	journal_inode = ext4_iget(sb, journal_inum);
4396	if (IS_ERR(journal_inode)) {
4397		ext4_msg(sb, KERN_ERR, "no journal found");
4398		return NULL;
4399	}
4400	if (!journal_inode->i_nlink) {
4401		make_bad_inode(journal_inode);
4402		iput(journal_inode);
4403		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
4404		return NULL;
4405	}
4406
4407	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
4408		  journal_inode, journal_inode->i_size);
4409	if (!S_ISREG(journal_inode->i_mode)) {
4410		ext4_msg(sb, KERN_ERR, "invalid journal inode");
4411		iput(journal_inode);
4412		return NULL;
4413	}
4414
4415	journal = jbd2_journal_init_inode(journal_inode);
4416	if (!journal) {
4417		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
4418		iput(journal_inode);
4419		return NULL;
4420	}
4421	journal->j_private = sb;
4422	ext4_init_journal_params(sb, journal);
4423	return journal;
4424}
4425
4426static journal_t *ext4_get_dev_journal(struct super_block *sb,
4427				       dev_t j_dev)
4428{
4429	struct buffer_head *bh;
4430	journal_t *journal;
4431	ext4_fsblk_t start;
4432	ext4_fsblk_t len;
4433	int hblock, blocksize;
4434	ext4_fsblk_t sb_block;
4435	unsigned long offset;
4436	struct ext4_super_block *es;
4437	struct block_device *bdev;
4438
4439	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4440
4441	bdev = ext4_blkdev_get(j_dev, sb);
4442	if (bdev == NULL)
4443		return NULL;
4444
4445	blocksize = sb->s_blocksize;
4446	hblock = bdev_logical_block_size(bdev);
4447	if (blocksize < hblock) {
4448		ext4_msg(sb, KERN_ERR,
4449			"blocksize too small for journal device");
4450		goto out_bdev;
4451	}
4452
4453	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
4454	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
4455	set_blocksize(bdev, blocksize);
4456	if (!(bh = __bread(bdev, sb_block, blocksize))) {
4457		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
4458		       "external journal");
4459		goto out_bdev;
4460	}
4461
4462	es = (struct ext4_super_block *) (bh->b_data + offset);
4463	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
4464	    !(le32_to_cpu(es->s_feature_incompat) &
4465	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
4466		ext4_msg(sb, KERN_ERR, "external journal has "
4467					"bad superblock");
4468		brelse(bh);
4469		goto out_bdev;
4470	}
4471
4472	if ((le32_to_cpu(es->s_feature_ro_compat) &
4473	     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
4474	    es->s_checksum != ext4_superblock_csum(sb, es)) {
4475		ext4_msg(sb, KERN_ERR, "external journal has "
4476				       "corrupt superblock");
4477		brelse(bh);
4478		goto out_bdev;
4479	}
4480
4481	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
4482		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
4483		brelse(bh);
4484		goto out_bdev;
4485	}
4486
4487	len = ext4_blocks_count(es);
4488	start = sb_block + 1;
4489	brelse(bh);	/* we're done with the superblock */
4490
4491	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
4492					start, len, blocksize);
4493	if (!journal) {
4494		ext4_msg(sb, KERN_ERR, "failed to create device journal");
4495		goto out_bdev;
4496	}
4497	journal->j_private = sb;
4498	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
4499	wait_on_buffer(journal->j_sb_buffer);
4500	if (!buffer_uptodate(journal->j_sb_buffer)) {
4501		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
4502		goto out_journal;
4503	}
4504	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
4505		ext4_msg(sb, KERN_ERR, "External journal has more than one "
4506					"user (unsupported) - %d",
4507			be32_to_cpu(journal->j_superblock->s_nr_users));
4508		goto out_journal;
4509	}
4510	EXT4_SB(sb)->journal_bdev = bdev;
4511	ext4_init_journal_params(sb, journal);
4512	return journal;
4513
4514out_journal:
4515	jbd2_journal_destroy(journal);
4516out_bdev:
4517	ext4_blkdev_put(bdev);
4518	return NULL;
4519}
4520
4521static int ext4_load_journal(struct super_block *sb,
4522			     struct ext4_super_block *es,
4523			     unsigned long journal_devnum)
4524{
4525	journal_t *journal;
4526	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
4527	dev_t journal_dev;
4528	int err = 0;
4529	int really_read_only;
4530
4531	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4532
4533	if (journal_devnum &&
4534	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4535		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
4536			"numbers have changed");
4537		journal_dev = new_decode_dev(journal_devnum);
4538	} else
4539		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
4540
4541	really_read_only = bdev_read_only(sb->s_bdev);
4542
4543	/*
4544	 * Are we loading a blank journal or performing recovery after a
4545	 * crash?  For recovery, we need to check in advance whether we
4546	 * can get read-write access to the device.
4547	 */
4548	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
4549		if (sb->s_flags & MS_RDONLY) {
4550			ext4_msg(sb, KERN_INFO, "INFO: recovery "
4551					"required on readonly filesystem");
4552			if (really_read_only) {
4553				ext4_msg(sb, KERN_ERR, "write access "
4554					"unavailable, cannot proceed");
4555				return -EROFS;
4556			}
4557			ext4_msg(sb, KERN_INFO, "write access will "
4558			       "be enabled during recovery");
4559		}
4560	}
4561
4562	if (journal_inum && journal_dev) {
4563		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
4564		       "and inode journals!");
4565		return -EINVAL;
4566	}
4567
4568	if (journal_inum) {
4569		if (!(journal = ext4_get_journal(sb, journal_inum)))
4570			return -EINVAL;
4571	} else {
4572		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
4573			return -EINVAL;
4574	}
4575
4576	if (!(journal->j_flags & JBD2_BARRIER))
4577		ext4_msg(sb, KERN_INFO, "barriers disabled");
4578
4579	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4580		err = jbd2_journal_wipe(journal, !really_read_only);
4581	if (!err) {
4582		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
4583		if (save)
4584			memcpy(save, ((char *) es) +
4585			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
4586		err = jbd2_journal_load(journal);
4587		if (save)
4588			memcpy(((char *) es) + EXT4_S_ERR_START,
4589			       save, EXT4_S_ERR_LEN);
4590		kfree(save);
4591	}
4592
4593	if (err) {
4594		ext4_msg(sb, KERN_ERR, "error loading journal");
4595		jbd2_journal_destroy(journal);
4596		return err;
4597	}
4598
4599	EXT4_SB(sb)->s_journal = journal;
4600	ext4_clear_journal_err(sb, es);
4601
4602	if (!really_read_only && journal_devnum &&
4603	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4604		es->s_journal_dev = cpu_to_le32(journal_devnum);
4605
4606		/* Make sure we flush the recovery flag to disk. */
4607		ext4_commit_super(sb, 1);
4608	}
4609
4610	return 0;
4611}
4612
4613static int ext4_commit_super(struct super_block *sb, int sync)
4614{
4615	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4616	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4617	int error = 0;
4618
4619	if (!sbh || block_device_ejected(sb))
4620		return error;
4621	if (buffer_write_io_error(sbh)) {
4622		/*
4623		 * Oh, dear.  A previous attempt to write the
4624		 * superblock failed.  This could happen because the
4625		 * USB device was yanked out.  Or it could happen to
4626		 * be a transient write error and maybe the block will
4627		 * be remapped.  Nothing we can do but to retry the
4628		 * write and hope for the best.
4629		 */
4630		ext4_msg(sb, KERN_ERR, "previous I/O error to "
4631		       "superblock detected");
4632		clear_buffer_write_io_error(sbh);
4633		set_buffer_uptodate(sbh);
4634	}
4635	/*
4636	 * If the file system is mounted read-only, don't update the
4637	 * superblock write time.  This avoids updating the superblock
4638	 * write time when we are mounting the root file system
4639	 * read/only but we need to replay the journal; at that point,
4640	 * for people who are east of GMT and who make their clock
4641	 * tick in localtime for Windows bug-for-bug compatibility,
4642	 * the clock is set in the future, and this will cause e2fsck
4643	 * to complain and force a full file system check.
4644	 */
4645	if (!(sb->s_flags & MS_RDONLY))
4646		es->s_wtime = cpu_to_le32(get_seconds());
4647	if (sb->s_bdev->bd_part)
4648		es->s_kbytes_written =
4649			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4650			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
4651			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
4652	else
4653		es->s_kbytes_written =
4654			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4655	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
4656		ext4_free_blocks_count_set(es,
4657			EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4658				&EXT4_SB(sb)->s_freeclusters_counter)));
4659	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
4660		es->s_free_inodes_count =
4661			cpu_to_le32(percpu_counter_sum_positive(
4662				&EXT4_SB(sb)->s_freeinodes_counter));
4663	BUFFER_TRACE(sbh, "marking dirty");
4664	ext4_superblock_csum_set(sb);
4665	mark_buffer_dirty(sbh);
4666	if (sync) {
4667		error = sync_dirty_buffer(sbh);
4668		if (error)
4669			return error;
4670
4671		error = buffer_write_io_error(sbh);
4672		if (error) {
4673			ext4_msg(sb, KERN_ERR, "I/O error while writing "
4674			       "superblock");
4675			clear_buffer_write_io_error(sbh);
4676			set_buffer_uptodate(sbh);
4677		}
4678	}
4679	return error;
4680}
4681
4682/*
4683 * Have we just finished recovery?  If so, and if we are mounting (or
4684 * remounting) the filesystem readonly, then we will end up with a
4685 * consistent fs on disk.  Record that fact.
4686 */
4687static void ext4_mark_recovery_complete(struct super_block *sb,
4688					struct ext4_super_block *es)
4689{
4690	journal_t *journal = EXT4_SB(sb)->s_journal;
4691
4692	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
4693		BUG_ON(journal != NULL);
4694		return;
4695	}
4696	jbd2_journal_lock_updates(journal);
4697	if (jbd2_journal_flush(journal) < 0)
4698		goto out;
4699
4700	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
4701	    sb->s_flags & MS_RDONLY) {
4702		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4703		ext4_commit_super(sb, 1);
4704	}
4705
4706out:
4707	jbd2_journal_unlock_updates(journal);
4708}
4709
4710/*
4711 * If we are mounting (or read-write remounting) a filesystem whose journal
4712 * has recorded an error from a previous lifetime, move that error to the
4713 * main filesystem now.
4714 */
4715static void ext4_clear_journal_err(struct super_block *sb,
4716				   struct ext4_super_block *es)
4717{
4718	journal_t *journal;
4719	int j_errno;
4720	const char *errstr;
4721
4722	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4723
4724	journal = EXT4_SB(sb)->s_journal;
4725
4726	/*
4727	 * Now check for any error status which may have been recorded in the
4728	 * journal by a prior ext4_error() or ext4_abort()
4729	 */
4730
4731	j_errno = jbd2_journal_errno(journal);
4732	if (j_errno) {
4733		char nbuf[16];
4734
4735		errstr = ext4_decode_error(sb, j_errno, nbuf);
4736		ext4_warning(sb, "Filesystem error recorded "
4737			     "from previous mount: %s", errstr);
4738		ext4_warning(sb, "Marking fs in need of filesystem check.");
4739
4740		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
4741		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
4742		ext4_commit_super(sb, 1);
4743
4744		jbd2_journal_clear_err(journal);
4745		jbd2_journal_update_sb_errno(journal);
4746	}
4747}
4748
4749/*
4750 * Force the running and committing transactions to commit,
4751 * and wait on the commit.
4752 */
4753int ext4_force_commit(struct super_block *sb)
4754{
4755	journal_t *journal;
4756
4757	if (sb->s_flags & MS_RDONLY)
4758		return 0;
4759
4760	journal = EXT4_SB(sb)->s_journal;
4761	return ext4_journal_force_commit(journal);
4762}
4763
4764static int ext4_sync_fs(struct super_block *sb, int wait)
4765{
4766	int ret = 0;
4767	tid_t target;
4768	bool needs_barrier = false;
4769	struct ext4_sb_info *sbi = EXT4_SB(sb);
4770
4771	trace_ext4_sync_fs(sb, wait);
4772	flush_workqueue(sbi->rsv_conversion_wq);
4773	/*
4774	 * Writeback quota in non-journalled quota case - journalled quota has
4775	 * no dirty dquots
4776	 */
4777	dquot_writeback_dquots(sb, -1);
4778	/*
4779	 * Data writeback is possible w/o journal transaction, so barrier must
4780	 * being sent at the end of the function. But we can skip it if
4781	 * transaction_commit will do it for us.
4782	 */
4783	if (sbi->s_journal) {
4784		target = jbd2_get_latest_transaction(sbi->s_journal);
4785		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4786		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4787			needs_barrier = true;
4788
4789		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4790			if (wait)
4791				ret = jbd2_log_wait_commit(sbi->s_journal,
4792							   target);
4793		}
4794	} else if (wait && test_opt(sb, BARRIER))
4795		needs_barrier = true;
4796	if (needs_barrier) {
4797		int err;
4798		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4799		if (!ret)
4800			ret = err;
4801	}
4802
4803	return ret;
4804}
4805
4806/*
4807 * LVM calls this function before a (read-only) snapshot is created.  This
4808 * gives us a chance to flush the journal completely and mark the fs clean.
4809 *
4810 * Note that only this function cannot bring a filesystem to be in a clean
4811 * state independently. It relies on upper layer to stop all data & metadata
4812 * modifications.
4813 */
4814static int ext4_freeze(struct super_block *sb)
4815{
4816	int error = 0;
4817	journal_t *journal;
4818
4819	if (sb->s_flags & MS_RDONLY)
4820		return 0;
4821
4822	journal = EXT4_SB(sb)->s_journal;
4823
4824	if (journal) {
4825		/* Now we set up the journal barrier. */
4826		jbd2_journal_lock_updates(journal);
4827
4828		/*
4829		 * Don't clear the needs_recovery flag if we failed to
4830		 * flush the journal.
4831		 */
4832		error = jbd2_journal_flush(journal);
4833		if (error < 0)
4834			goto out;
4835
4836		/* Journal blocked and flushed, clear needs_recovery flag. */
4837		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4838	}
4839
4840	error = ext4_commit_super(sb, 1);
4841out:
4842	if (journal)
4843		/* we rely on upper layer to stop further updates */
4844		jbd2_journal_unlock_updates(journal);
4845	return error;
4846}
4847
4848/*
4849 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
4850 * flag here, even though the filesystem is not technically dirty yet.
4851 */
4852static int ext4_unfreeze(struct super_block *sb)
4853{
4854	if (sb->s_flags & MS_RDONLY)
4855		return 0;
4856
4857	if (EXT4_SB(sb)->s_journal) {
4858		/* Reset the needs_recovery flag before the fs is unlocked. */
4859		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4860	}
4861
4862	ext4_commit_super(sb, 1);
4863	return 0;
4864}
4865
4866/*
4867 * Structure to save mount options for ext4_remount's benefit
4868 */
4869struct ext4_mount_options {
4870	unsigned long s_mount_opt;
4871	unsigned long s_mount_opt2;
4872	kuid_t s_resuid;
4873	kgid_t s_resgid;
4874	unsigned long s_commit_interval;
4875	u32 s_min_batch_time, s_max_batch_time;
4876#ifdef CONFIG_QUOTA
4877	int s_jquota_fmt;
4878	char *s_qf_names[EXT4_MAXQUOTAS];
4879#endif
4880};
4881
4882static int ext4_remount(struct super_block *sb, int *flags, char *data)
4883{
4884	struct ext4_super_block *es;
4885	struct ext4_sb_info *sbi = EXT4_SB(sb);
4886	unsigned long old_sb_flags;
4887	struct ext4_mount_options old_opts;
4888	int enable_quota = 0;
4889	ext4_group_t g;
4890	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4891	int err = 0;
4892#ifdef CONFIG_QUOTA
4893	int i, j;
4894#endif
4895	char *orig_data = kstrdup(data, GFP_KERNEL);
4896
4897	/* Store the original options */
4898	old_sb_flags = sb->s_flags;
4899	old_opts.s_mount_opt = sbi->s_mount_opt;
4900	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4901	old_opts.s_resuid = sbi->s_resuid;
4902	old_opts.s_resgid = sbi->s_resgid;
4903	old_opts.s_commit_interval = sbi->s_commit_interval;
4904	old_opts.s_min_batch_time = sbi->s_min_batch_time;
4905	old_opts.s_max_batch_time = sbi->s_max_batch_time;
4906#ifdef CONFIG_QUOTA
4907	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
4908	for (i = 0; i < EXT4_MAXQUOTAS; i++)
4909		if (sbi->s_qf_names[i]) {
4910			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
4911							 GFP_KERNEL);
4912			if (!old_opts.s_qf_names[i]) {
4913				for (j = 0; j < i; j++)
4914					kfree(old_opts.s_qf_names[j]);
4915				kfree(orig_data);
4916				return -ENOMEM;
4917			}
4918		} else
4919			old_opts.s_qf_names[i] = NULL;
4920#endif
4921	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
4922		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
4923
4924	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
4925		err = -EINVAL;
4926		goto restore_opts;
4927	}
4928
4929	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
4930	    test_opt(sb, JOURNAL_CHECKSUM)) {
4931		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
4932			 "during remount not supported; ignoring");
4933		sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
4934	}
4935
4936	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4937		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4938			ext4_msg(sb, KERN_ERR, "can't mount with "
4939				 "both data=journal and delalloc");
4940			err = -EINVAL;
4941			goto restore_opts;
4942		}
4943		if (test_opt(sb, DIOREAD_NOLOCK)) {
4944			ext4_msg(sb, KERN_ERR, "can't mount with "
4945				 "both data=journal and dioread_nolock");
4946			err = -EINVAL;
4947			goto restore_opts;
4948		}
4949		if (test_opt(sb, DAX)) {
4950			ext4_msg(sb, KERN_ERR, "can't mount with "
4951				 "both data=journal and dax");
4952			err = -EINVAL;
4953			goto restore_opts;
4954		}
4955	}
4956
4957	if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
4958		ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
4959			"dax flag with busy inodes while remounting");
4960		sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
4961	}
4962
4963	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4964		ext4_abort(sb, "Abort forced by user");
4965
4966	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
4967		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
4968
4969	es = sbi->s_es;
4970
4971	if (sbi->s_journal) {
4972		ext4_init_journal_params(sb, sbi->s_journal);
4973		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4974	}
4975
4976	if (*flags & MS_LAZYTIME)
4977		sb->s_flags |= MS_LAZYTIME;
4978
4979	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
4980		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4981			err = -EROFS;
4982			goto restore_opts;
4983		}
4984
4985		if (*flags & MS_RDONLY) {
4986			err = sync_filesystem(sb);
4987			if (err < 0)
4988				goto restore_opts;
4989			err = dquot_suspend(sb, -1);
4990			if (err < 0)
4991				goto restore_opts;
4992
4993			/*
4994			 * First of all, the unconditional stuff we have to do
4995			 * to disable replay of the journal when we next remount
4996			 */
4997			sb->s_flags |= MS_RDONLY;
4998
4999			/*
5000			 * OK, test if we are remounting a valid rw partition
5001			 * readonly, and if so set the rdonly flag and then
5002			 * mark the partition as valid again.
5003			 */
5004			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
5005			    (sbi->s_mount_state & EXT4_VALID_FS))
5006				es->s_state = cpu_to_le16(sbi->s_mount_state);
5007
5008			if (sbi->s_journal)
5009				ext4_mark_recovery_complete(sb, es);
5010		} else {
5011			/* Make sure we can mount this feature set readwrite */
5012			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
5013					EXT4_FEATURE_RO_COMPAT_READONLY) ||
5014			    !ext4_feature_set_ok(sb, 0)) {
5015				err = -EROFS;
5016				goto restore_opts;
5017			}
5018			/*
5019			 * Make sure the group descriptor checksums
5020			 * are sane.  If they aren't, refuse to remount r/w.
5021			 */
5022			for (g = 0; g < sbi->s_groups_count; g++) {
5023				struct ext4_group_desc *gdp =
5024					ext4_get_group_desc(sb, g, NULL);
5025
5026				if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
5027					ext4_msg(sb, KERN_ERR,
5028	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
5029		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
5030					       le16_to_cpu(gdp->bg_checksum));
5031					err = -EINVAL;
5032					goto restore_opts;
5033				}
5034			}
5035
5036			/*
5037			 * If we have an unprocessed orphan list hanging
5038			 * around from a previously readonly bdev mount,
5039			 * require a full umount/remount for now.
5040			 */
5041			if (es->s_last_orphan) {
5042				ext4_msg(sb, KERN_WARNING, "Couldn't "
5043				       "remount RDWR because of unprocessed "
5044				       "orphan inode list.  Please "
5045				       "umount/remount instead");
5046				err = -EINVAL;
5047				goto restore_opts;
5048			}
5049
5050			/*
5051			 * Mounting a RDONLY partition read-write, so reread
5052			 * and store the current valid flag.  (It may have
5053			 * been changed by e2fsck since we originally mounted
5054			 * the partition.)
5055			 */
5056			if (sbi->s_journal)
5057				ext4_clear_journal_err(sb, es);
5058			sbi->s_mount_state = le16_to_cpu(es->s_state);
5059			if (!ext4_setup_super(sb, es, 0))
5060				sb->s_flags &= ~MS_RDONLY;
5061			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
5062						     EXT4_FEATURE_INCOMPAT_MMP))
5063				if (ext4_multi_mount_protect(sb,
5064						le64_to_cpu(es->s_mmp_block))) {
5065					err = -EROFS;
5066					goto restore_opts;
5067				}
5068			enable_quota = 1;
5069		}
5070	}
5071
5072	/*
5073	 * Reinitialize lazy itable initialization thread based on
5074	 * current settings
5075	 */
5076	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
5077		ext4_unregister_li_request(sb);
5078	else {
5079		ext4_group_t first_not_zeroed;
5080		first_not_zeroed = ext4_has_uninit_itable(sb);
5081		ext4_register_li_request(sb, first_not_zeroed);
5082	}
5083
5084	ext4_setup_system_zone(sb);
5085	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
5086		ext4_commit_super(sb, 1);
5087
5088#ifdef CONFIG_QUOTA
5089	/* Release old quota file names */
5090	for (i = 0; i < EXT4_MAXQUOTAS; i++)
5091		kfree(old_opts.s_qf_names[i]);
5092	if (enable_quota) {
5093		if (sb_any_quota_suspended(sb))
5094			dquot_resume(sb, -1);
5095		else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
5096					EXT4_FEATURE_RO_COMPAT_QUOTA)) {
5097			err = ext4_enable_quotas(sb);
5098			if (err)
5099				goto restore_opts;
5100		}
5101	}
5102#endif
5103
5104	*flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
5105	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5106	kfree(orig_data);
5107	return 0;
5108
5109restore_opts:
5110	sb->s_flags = old_sb_flags;
5111	sbi->s_mount_opt = old_opts.s_mount_opt;
5112	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
5113	sbi->s_resuid = old_opts.s_resuid;
5114	sbi->s_resgid = old_opts.s_resgid;
5115	sbi->s_commit_interval = old_opts.s_commit_interval;
5116	sbi->s_min_batch_time = old_opts.s_min_batch_time;
5117	sbi->s_max_batch_time = old_opts.s_max_batch_time;
5118#ifdef CONFIG_QUOTA
5119	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
5120	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
5121		kfree(sbi->s_qf_names[i]);
5122		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
5123	}
5124#endif
5125	kfree(orig_data);
5126	return err;
5127}
5128
5129static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
5130{
5131	struct super_block *sb = dentry->d_sb;
5132	struct ext4_sb_info *sbi = EXT4_SB(sb);
5133	struct ext4_super_block *es = sbi->s_es;
5134	ext4_fsblk_t overhead = 0, resv_blocks;
5135	u64 fsid;
5136	s64 bfree;
5137	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
5138
5139	if (!test_opt(sb, MINIX_DF))
5140		overhead = sbi->s_overhead;
5141
5142	buf->f_type = EXT4_SUPER_MAGIC;
5143	buf->f_bsize = sb->s_blocksize;
5144	buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
5145	bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
5146		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
5147	/* prevent underflow in case that few free space is available */
5148	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
5149	buf->f_bavail = buf->f_bfree -
5150			(ext4_r_blocks_count(es) + resv_blocks);
5151	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
5152		buf->f_bavail = 0;
5153	buf->f_files = le32_to_cpu(es->s_inodes_count);
5154	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
5155	buf->f_namelen = EXT4_NAME_LEN;
5156	fsid = le64_to_cpup((void *)es->s_uuid) ^
5157	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
5158	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
5159	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
5160
5161	return 0;
5162}
5163
5164/* Helper function for writing quotas on sync - we need to start transaction
5165 * before quota file is locked for write. Otherwise the are possible deadlocks:
5166 * Process 1                         Process 2
5167 * ext4_create()                     quota_sync()
5168 *   jbd2_journal_start()                  write_dquot()
5169 *   dquot_initialize()                         down(dqio_mutex)
5170 *     down(dqio_mutex)                    jbd2_journal_start()
5171 *
5172 */
5173
5174#ifdef CONFIG_QUOTA
5175
5176static inline struct inode *dquot_to_inode(struct dquot *dquot)
5177{
5178	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
5179}
5180
5181static int ext4_write_dquot(struct dquot *dquot)
5182{
5183	int ret, err;
5184	handle_t *handle;
5185	struct inode *inode;
5186
5187	inode = dquot_to_inode(dquot);
5188	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5189				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
5190	if (IS_ERR(handle))
5191		return PTR_ERR(handle);
5192	ret = dquot_commit(dquot);
5193	err = ext4_journal_stop(handle);
5194	if (!ret)
5195		ret = err;
5196	return ret;
5197}
5198
5199static int ext4_acquire_dquot(struct dquot *dquot)
5200{
5201	int ret, err;
5202	handle_t *handle;
5203
5204	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5205				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
5206	if (IS_ERR(handle))
5207		return PTR_ERR(handle);
5208	ret = dquot_acquire(dquot);
5209	err = ext4_journal_stop(handle);
5210	if (!ret)
5211		ret = err;
5212	return ret;
5213}
5214
5215static int ext4_release_dquot(struct dquot *dquot)
5216{
5217	int ret, err;
5218	handle_t *handle;
5219
5220	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5221				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
5222	if (IS_ERR(handle)) {
5223		/* Release dquot anyway to avoid endless cycle in dqput() */
5224		dquot_release(dquot);
5225		return PTR_ERR(handle);
5226	}
5227	ret = dquot_release(dquot);
5228	err = ext4_journal_stop(handle);
5229	if (!ret)
5230		ret = err;
5231	return ret;
5232}
5233
5234static int ext4_mark_dquot_dirty(struct dquot *dquot)
5235{
5236	struct super_block *sb = dquot->dq_sb;
5237	struct ext4_sb_info *sbi = EXT4_SB(sb);
5238
5239	/* Are we journaling quotas? */
5240	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
5241	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
5242		dquot_mark_dquot_dirty(dquot);
5243		return ext4_write_dquot(dquot);
5244	} else {
5245		return dquot_mark_dquot_dirty(dquot);
5246	}
5247}
5248
5249static int ext4_write_info(struct super_block *sb, int type)
5250{
5251	int ret, err;
5252	handle_t *handle;
5253
5254	/* Data block + inode block */
5255	handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
5256	if (IS_ERR(handle))
5257		return PTR_ERR(handle);
5258	ret = dquot_commit_info(sb, type);
5259	err = ext4_journal_stop(handle);
5260	if (!ret)
5261		ret = err;
5262	return ret;
5263}
5264
5265/*
5266 * Turn on quotas during mount time - we need to find
5267 * the quota file and such...
5268 */
5269static int ext4_quota_on_mount(struct super_block *sb, int type)
5270{
5271	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
5272					EXT4_SB(sb)->s_jquota_fmt, type);
5273}
5274
5275static void lockdep_set_quota_inode(struct inode *inode, int subclass)
5276{
5277	struct ext4_inode_info *ei = EXT4_I(inode);
5278
5279	/* The first argument of lockdep_set_subclass has to be
5280	 * *exactly* the same as the argument to init_rwsem() --- in
5281	 * this case, in init_once() --- or lockdep gets unhappy
5282	 * because the name of the lock is set using the
5283	 * stringification of the argument to init_rwsem().
5284	 */
5285	(void) ei;	/* shut up clang warning if !CONFIG_LOCKDEP */
5286	lockdep_set_subclass(&ei->i_data_sem, subclass);
5287}
5288
5289/*
5290 * Standard function to be called on quota_on
5291 */
5292static int ext4_quota_on(struct super_block *sb, int type, int format_id,
5293			 struct path *path)
5294{
5295	int err;
5296
5297	if (!test_opt(sb, QUOTA))
5298		return -EINVAL;
5299
5300	/* Quotafile not on the same filesystem? */
5301	if (path->dentry->d_sb != sb)
5302		return -EXDEV;
5303	/* Journaling quota? */
5304	if (EXT4_SB(sb)->s_qf_names[type]) {
5305		/* Quotafile not in fs root? */
5306		if (path->dentry->d_parent != sb->s_root)
5307			ext4_msg(sb, KERN_WARNING,
5308				"Quota file not on filesystem root. "
5309				"Journaled quota will not work");
5310	}
5311
5312	/*
5313	 * When we journal data on quota file, we have to flush journal to see
5314	 * all updates to the file when we bypass pagecache...
5315	 */
5316	if (EXT4_SB(sb)->s_journal &&
5317	    ext4_should_journal_data(d_inode(path->dentry))) {
5318		/*
5319		 * We don't need to lock updates but journal_flush() could
5320		 * otherwise be livelocked...
5321		 */
5322		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
5323		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
5324		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
5325		if (err)
5326			return err;
5327	}
5328	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
5329	err = dquot_quota_on(sb, type, format_id, path);
5330	if (err)
5331		lockdep_set_quota_inode(path->dentry->d_inode,
5332					     I_DATA_SEM_NORMAL);
5333	return err;
5334}
5335
5336static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5337			     unsigned int flags)
5338{
5339	int err;
5340	struct inode *qf_inode;
5341	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5342		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5343		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5344	};
5345
5346	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
5347
5348	if (!qf_inums[type])
5349		return -EPERM;
5350
5351	qf_inode = ext4_iget(sb, qf_inums[type]);
5352	if (IS_ERR(qf_inode)) {
5353		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
5354		return PTR_ERR(qf_inode);
5355	}
5356
5357	/* Don't account quota for quota files to avoid recursion */
5358	qf_inode->i_flags |= S_NOQUOTA;
5359	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
5360	err = dquot_enable(qf_inode, type, format_id, flags);
5361	iput(qf_inode);
5362	if (err)
5363		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
5364
5365	return err;
5366}
5367
5368/* Enable usage tracking for all quota types. */
5369static int ext4_enable_quotas(struct super_block *sb)
5370{
5371	int type, err = 0;
5372	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5373		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5374		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5375	};
5376
5377	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
5378	for (type = 0; type < EXT4_MAXQUOTAS; type++) {
5379		if (qf_inums[type]) {
5380			err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
5381						DQUOT_USAGE_ENABLED);
5382			if (err) {
5383				ext4_warning(sb,
5384					"Failed to enable quota tracking "
5385					"(type=%d, err=%d). Please run "
5386					"e2fsck to fix.", type, err);
5387				return err;
5388			}
5389		}
5390	}
5391	return 0;
5392}
5393
5394static int ext4_quota_off(struct super_block *sb, int type)
5395{
5396	struct inode *inode = sb_dqopt(sb)->files[type];
5397	handle_t *handle;
5398
5399	/* Force all delayed allocation blocks to be allocated.
5400	 * Caller already holds s_umount sem */
5401	if (test_opt(sb, DELALLOC))
5402		sync_filesystem(sb);
5403
5404	if (!inode)
5405		goto out;
5406
5407	/* Update modification times of quota files when userspace can
5408	 * start looking at them */
5409	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
5410	if (IS_ERR(handle))
5411		goto out;
5412	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
5413	ext4_mark_inode_dirty(handle, inode);
5414	ext4_journal_stop(handle);
5415
5416out:
5417	return dquot_quota_off(sb, type);
5418}
5419
5420/* Read data from quotafile - avoid pagecache and such because we cannot afford
5421 * acquiring the locks... As quota files are never truncated and quota code
5422 * itself serializes the operations (and no one else should touch the files)
5423 * we don't have to be afraid of races */
5424static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
5425			       size_t len, loff_t off)
5426{
5427	struct inode *inode = sb_dqopt(sb)->files[type];
5428	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5429	int offset = off & (sb->s_blocksize - 1);
5430	int tocopy;
5431	size_t toread;
5432	struct buffer_head *bh;
5433	loff_t i_size = i_size_read(inode);
5434
5435	if (off > i_size)
5436		return 0;
5437	if (off+len > i_size)
5438		len = i_size-off;
5439	toread = len;
5440	while (toread > 0) {
5441		tocopy = sb->s_blocksize - offset < toread ?
5442				sb->s_blocksize - offset : toread;
5443		bh = ext4_bread(NULL, inode, blk, 0);
5444		if (IS_ERR(bh))
5445			return PTR_ERR(bh);
5446		if (!bh)	/* A hole? */
5447			memset(data, 0, tocopy);
5448		else
5449			memcpy(data, bh->b_data+offset, tocopy);
5450		brelse(bh);
5451		offset = 0;
5452		toread -= tocopy;
5453		data += tocopy;
5454		blk++;
5455	}
5456	return len;
5457}
5458
5459/* Write to quotafile (we know the transaction is already started and has
5460 * enough credits) */
5461static ssize_t ext4_quota_write(struct super_block *sb, int type,
5462				const char *data, size_t len, loff_t off)
5463{
5464	struct inode *inode = sb_dqopt(sb)->files[type];
5465	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5466	int err, offset = off & (sb->s_blocksize - 1);
5467	struct buffer_head *bh;
5468	handle_t *handle = journal_current_handle();
5469
5470	if (EXT4_SB(sb)->s_journal && !handle) {
5471		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5472			" cancelled because transaction is not started",
5473			(unsigned long long)off, (unsigned long long)len);
5474		return -EIO;
5475	}
5476	/*
5477	 * Since we account only one data block in transaction credits,
5478	 * then it is impossible to cross a block boundary.
5479	 */
5480	if (sb->s_blocksize - offset < len) {
5481		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
5482			" cancelled because not block aligned",
5483			(unsigned long long)off, (unsigned long long)len);
5484		return -EIO;
5485	}
5486
5487	bh = ext4_bread(handle, inode, blk, 1);
5488	if (IS_ERR(bh))
5489		return PTR_ERR(bh);
5490	if (!bh)
5491		goto out;
5492	BUFFER_TRACE(bh, "get write access");
5493	err = ext4_journal_get_write_access(handle, bh);
5494	if (err) {
5495		brelse(bh);
5496		return err;
5497	}
5498	lock_buffer(bh);
5499	memcpy(bh->b_data+offset, data, len);
5500	flush_dcache_page(bh->b_page);
5501	unlock_buffer(bh);
5502	err = ext4_handle_dirty_metadata(handle, NULL, bh);
5503	brelse(bh);
5504out:
5505	if (inode->i_size < off + len) {
5506		i_size_write(inode, off + len);
5507		EXT4_I(inode)->i_disksize = inode->i_size;
5508		ext4_mark_inode_dirty(handle, inode);
5509	}
5510	return len;
5511}
5512
5513#endif
5514
5515static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
5516		       const char *dev_name, void *data)
5517{
5518	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
5519}
5520
5521#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
5522static inline void register_as_ext2(void)
5523{
5524	int err = register_filesystem(&ext2_fs_type);
5525	if (err)
5526		printk(KERN_WARNING
5527		       "EXT4-fs: Unable to register as ext2 (%d)\n", err);
5528}
5529
5530static inline void unregister_as_ext2(void)
5531{
5532	unregister_filesystem(&ext2_fs_type);
5533}
5534
5535static inline int ext2_feature_set_ok(struct super_block *sb)
5536{
5537	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
5538		return 0;
5539	if (sb->s_flags & MS_RDONLY)
5540		return 1;
5541	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
5542		return 0;
5543	return 1;
5544}
5545#else
5546static inline void register_as_ext2(void) { }
5547static inline void unregister_as_ext2(void) { }
5548static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
5549#endif
5550
5551#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
5552static inline void register_as_ext3(void)
5553{
5554	int err = register_filesystem(&ext3_fs_type);
5555	if (err)
5556		printk(KERN_WARNING
5557		       "EXT4-fs: Unable to register as ext3 (%d)\n", err);
5558}
5559
5560static inline void unregister_as_ext3(void)
5561{
5562	unregister_filesystem(&ext3_fs_type);
5563}
5564
5565static inline int ext3_feature_set_ok(struct super_block *sb)
5566{
5567	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
5568		return 0;
5569	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
5570		return 0;
5571	if (sb->s_flags & MS_RDONLY)
5572		return 1;
5573	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
5574		return 0;
5575	return 1;
5576}
5577#else
5578static inline void register_as_ext3(void) { }
5579static inline void unregister_as_ext3(void) { }
5580static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
5581#endif
5582
5583static struct file_system_type ext4_fs_type = {
5584	.owner		= THIS_MODULE,
5585	.name		= "ext4",
5586	.mount		= ext4_mount,
5587	.kill_sb	= kill_block_super,
5588	.fs_flags	= FS_REQUIRES_DEV,
5589};
5590MODULE_ALIAS_FS("ext4");
5591
5592static int __init ext4_init_feat_adverts(void)
5593{
5594	struct ext4_features *ef;
5595	int ret = -ENOMEM;
5596
5597	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
5598	if (!ef)
5599		goto out;
5600
5601	ef->f_kobj.kset = ext4_kset;
5602	init_completion(&ef->f_kobj_unregister);
5603	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
5604				   "features");
5605	if (ret) {
5606		kfree(ef);
5607		goto out;
5608	}
5609
5610	ext4_feat = ef;
5611	ret = 0;
5612out:
5613	return ret;
5614}
5615
5616static void ext4_exit_feat_adverts(void)
5617{
5618	kobject_put(&ext4_feat->f_kobj);
5619	wait_for_completion(&ext4_feat->f_kobj_unregister);
5620	kfree(ext4_feat);
5621}
5622
5623/* Shared across all ext4 file systems */
5624wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5625struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
5626
5627static int __init ext4_init_fs(void)
5628{
5629	int i, err;
5630
5631	ext4_li_info = NULL;
5632	mutex_init(&ext4_li_mtx);
5633
5634	/* Build-time check for flags consistency */
5635	ext4_check_flag_values();
5636
5637	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
5638		mutex_init(&ext4__aio_mutex[i]);
5639		init_waitqueue_head(&ext4__ioend_wq[i]);
5640	}
5641
5642	err = ext4_init_es();
5643	if (err)
5644		return err;
5645
5646	err = ext4_init_pageio();
5647	if (err)
5648		goto out7;
5649
5650	err = ext4_init_system_zone();
5651	if (err)
5652		goto out6;
5653	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
5654	if (!ext4_kset) {
5655		err = -ENOMEM;
5656		goto out5;
5657	}
5658	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
5659
5660	err = ext4_init_feat_adverts();
5661	if (err)
5662		goto out4;
5663
5664	err = ext4_init_mballoc();
5665	if (err)
5666		goto out2;
5667	else
5668		ext4_mballoc_ready = 1;
5669	err = init_inodecache();
5670	if (err)
5671		goto out1;
5672	register_as_ext3();
5673	register_as_ext2();
5674	err = register_filesystem(&ext4_fs_type);
5675	if (err)
5676		goto out;
5677
5678	return 0;
5679out:
5680	unregister_as_ext2();
5681	unregister_as_ext3();
5682	destroy_inodecache();
5683out1:
5684	ext4_mballoc_ready = 0;
5685	ext4_exit_mballoc();
5686out2:
5687	ext4_exit_feat_adverts();
5688out4:
5689	if (ext4_proc_root)
5690		remove_proc_entry("fs/ext4", NULL);
5691	kset_unregister(ext4_kset);
5692out5:
5693	ext4_exit_system_zone();
5694out6:
5695	ext4_exit_pageio();
5696out7:
5697	ext4_exit_es();
5698
5699	return err;
5700}
5701
5702static void __exit ext4_exit_fs(void)
5703{
5704	ext4_destroy_lazyinit_thread();
5705	unregister_as_ext2();
5706	unregister_as_ext3();
5707	unregister_filesystem(&ext4_fs_type);
5708	destroy_inodecache();
5709	ext4_exit_mballoc();
5710	ext4_exit_feat_adverts();
5711	remove_proc_entry("fs/ext4", NULL);
5712	kset_unregister(ext4_kset);
5713	ext4_exit_system_zone();
5714	ext4_exit_pageio();
5715	ext4_exit_es();
5716}
5717
5718MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
5719MODULE_DESCRIPTION("Fourth Extended Filesystem");
5720MODULE_LICENSE("GPL");
5721module_init(ext4_init_fs)
5722module_exit(ext4_exit_fs)
5723